From 178b6b759074597777ce13438efb0e0ba625e429 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey
Date: Tue, 11 Jun 2024 01:28:01 -0700
Subject: [PATCH] 2.22.3-1

Rework core for NVIDIA Trusted Computing
 * Compress work structs so that they are shared between channels.
 * Utilize the full amount of kernel argument space permitted (4k) before resorting to work fifo.
 * Rework the task preprocessing phase.
 * Use a separate abortDevFlag which is kept in sync with abortFlag using cudaMemcpy operations.
 * Rename src/include/align.h to src/include/bitops.h.

Add lazy connection establishment for collective operations
 * Move buffer allocation and connection establishment to the first collective operation using that algorithm.
 * Accelerate init time and reduce memory usage.
 * Avoid allocating NVLS buffers if all calls are registered.
 * Compute algo/proto in ncclLaunchCollTasksInfo early on.
 * Connect peers in ncclCollPreconnectFunc if not connected already.
 * Also move shared buffer creation to the first send/recv call.

Accelerate intra-node NVLink detection
 * Make each rank only detect NVLinks attached to its GPU.
 * Fuse XMLs to reconstruct the full NVLink topology.

Add init profiling to report time spent in different init phases.
 * Report timings of bootstrap, allgather, search, connect, etc.
 * Add new "PROFILE" category for NCCL_DEBUG_SUBSYS.

Add support for PCI p2p on split PCI switches
 * Detect split PCI switches through a kernel module exposing switch information.
 * Update the topology XML and graph to add those inter-switch connections.

Add cost estimation API
 * Add a new ncclGroupEndSimulate primitive to return the estimated time a group would take.

Net/IB: Add separate traffic class for fifo messages
 * Add NCCL_IB_FIFO_TC to control the traffic class of fifo messages independently from NCCL_IB_TC.
   Merges PR #1194

Net/IB: Add support for IB router
 * Use flid instead of lid if subnets do not match.
 * Warn if flid is 0.

Optimizations and fixes for device network offload (unpack)
 * Double the default number of channels.
 * Cache netDeviceType.
 * Fix save/increment head logic to enable Tree support.

Support ncclGroupStart/End for ncclCommAbort/Destroy
 * Allow Abort/Destroy to be called within a group when managing multiple GPUs with a single process.

Improve Tuner API
 * Provide the plugin with the original cost table so that it can leave unknown or disabled algo/proto combinations untouched (a tuner sketch follows these notes).
 * Remove nvlsSupport and collnetSupport.

Do not print version to stdout when using a debug file
 * Also print version from all processes with INFO debug level.
   Fixes issue #1271

Fix clang warnings in NVTX headers
 * Update NVTX headers to the latest version.
   Fixes issue #1270

Disable port fusion in heterogeneous systems
 * Do not fuse ports if a mix of multi-port and single-port NICs is detected.

Fix NVLS graph search for dual NICs
 * Fix NVLS graph search when we have more than one NIC per GPU.

Fix crash with collnetDirect
 * Add separate graph search for collnetDirect, testing alltoall paths and working similarly to the NVLS search.

Fix hang when nodes have different CPU types
 * Add the CPU type to the rank peer info.
 * Align all ranks on the CPU type after the first allgather.
 * Only use the aligned CPU type for all tuning operations.
   Fixes issue #1136
   Fixes issue #1184

Fix performance of registered send/recv operations
 * Allow for single full size operations.
 * Add INFO to confirm the registration of send/recv buffers.

Move all sync ops to finalize stage
 * Ensure ncclCommDestroy is non-blocking if ncclCommFinalize has been called.

Improve error reporting during SHM segment creation

Improve support for various compilers
  Merges PR #1177
  Merges PR #1228

Allow net and tuner plugins to be statically linked
 * Search for ncclNet or ncclTuner symbols in the main binary.
   Merges PR #979

Clean up includes in the plugin examples
 * Harmonize err.h and common.h usage (a logger-usage sketch follows these notes).
 * Add a mixed plugin with both net and tuner.
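
The tuner interface bump to ncclTuner_v3_t replaces the collNetSupport/nvlsSupport inputs with the algo/proto cost table itself, which the plugin edits in place. Below is a minimal sketch of a v3 tuner built against the ext-tuner/example headers shipped in this patch. The file name, the sketch* function names, the "CostTableSketch" plugin name, the 64KiB threshold and the RING+LL128 preference are illustrative assumptions, not NCCL defaults; the struct, symbol name and getCollInfo signature come from the patch.

    /* tuner_sketch.c - sketch of an external tuner against the v3 interface. */
    #include <stddef.h>
    #include "tuner.h"   /* ext-tuner/example/nccl/tuner.h from this patch */

    static ncclResult_t sketchInit(size_t nRanks, size_t nNodes,
                                   ncclDebugLogger_t logFunction, void** context) {
      *context = NULL;
      return ncclSuccess;
    }

    static ncclResult_t sketchGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
                                          int numPipeOps, float** collCostTable,
                                          int numAlgo, int numProto, int* nChannels) {
      /* The cost table is pre-filled by NCCL core and indexed [algo][proto], as in
       * the example plugin; entries NCCL disabled are set to NCCL_ALGO_PROTO_IGNORE
       * and must be left untouched. */
      if (collType == ncclFuncAllReduce && nBytes <= 65536 &&
          NCCL_ALGO_RING < numAlgo && NCCL_PROTO_LL128 < numProto &&
          collCostTable[NCCL_ALGO_RING][NCCL_PROTO_LL128] != NCCL_ALGO_PROTO_IGNORE) {
        /* Make RING+LL128 the cheapest entry so NCCL core picks it. */
        collCostTable[NCCL_ALGO_RING][NCCL_PROTO_LL128] = 0.0;
      }
      /* Leaving *nChannels unset lets NCCL choose the channel count. */
      return ncclSuccess;
    }

    static ncclResult_t sketchDestroy(void* context) { return ncclSuccess; }

    const ncclTuner_v3_t ncclTunerPlugin_v3 = {
      .name = "CostTableSketch",
      .init = sketchInit,
      .getCollInfo = sketchGetCollInfo,
      .destroy = sketchDestroy,
    };

NCCL locates the plugin through the exported ncclTunerPlugin_v3 symbol (NCCL_TUNER_PLUGIN_SYMBOL); with the static-linking support added in this patch, that symbol may also be compiled directly into the main binary instead of a separate shared library.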
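
The common.h headers added under ext-net/example/nccl/ and ext-tuner/example/nccl/ now carry the shared ncclDebugLogger_t typedef and subsystem flags. A small sketch, with hypothetical names (logger_sketch.c, pluginSetupLogging, pluginLog, gLogger), of how a plugin might keep the logger handed to its init() and emit an INFO line tagged with the TUNING subsystem:

    /* logger_sketch.c - illustrative use of the shared ncclDebugLogger_t. */
    #include <stddef.h>
    #include "common.h"  /* ncclDebugLogger_t, ncclDebugLogLevel, ncclDebugLogSubSys */
    #include "err.h"     /* ncclResult_t */

    static ncclDebugLogger_t gLogger = NULL;

    /* Called from the plugin's init(), which receives the logger from NCCL core. */
    static ncclResult_t pluginSetupLogging(ncclDebugLogger_t logFunction) {
      gLogger = logFunction;
      return ncclSuccess;
    }

    /* Emit an INFO-level line tagged with the NCCL_TUNING subsystem. */
    static void pluginLog(const char* what, size_t bytes) {
      if (gLogger) gLogger(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
                           "tuner sketch: %s, nBytes=%zu", what, bytes);
    }

Such lines appear alongside NCCL's own output when NCCL_DEBUG=INFO is set and the corresponding subsystem is enabled via NCCL_DEBUG_SUBSYS.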

---
 ext-net/example/nccl/common.h | 15 +
 ext-net/example/nccl/err.h | 1 +
 ext-net/example/nccl/net.h | 6 +-
 ext-net/example/nccl/types.h | 4 +-
 ext-tuner/example/nccl/common.h | 15 +
 ext-tuner/example/nccl/err.h | 17 +
 ext-tuner/example/nccl/tuner.h | 43 +-
 ext-tuner/example/plugin.c | 13 +-
 makefiles/version.mk | 4 +-
 src/bootstrap.cc | 5 +-
 src/channel.cc | 8 +-
 src/collectives.cc | 63 +
 src/debug.cc | 47 +-
 src/device/all_gather.h | 117 +-
 src/device/all_reduce.h | 290 +-
 src/device/broadcast.h | 37 +-
 src/device/common.cu | 6 +-
 src/device/common.h | 404 ++-
 src/device/generate.py | 4 +-
 src/device/network/unpack/unpack.h | 14 +-
 src/device/network/unpack/unpack_defs.h | 2 +-
 src/device/prims_ll.h | 10 +-
 src/device/prims_ll128.h | 5 +-
 src/device/prims_simple.h | 58 +-
 src/device/reduce.h | 33 +-
 src/device/reduce_kernel.h | 43 +-
 src/device/reduce_scatter.h | 117 +-
 src/device/sendrecv.h | 210 +-
 src/enqueue.cc | 2619 +++++++++--------
 src/graph/connect.cc | 23 +-
 src/graph/paths.cc | 25 +-
 src/graph/search.cc | 72 +-
 src/graph/topo.cc | 140 +-
 src/graph/topo.h | 2 +-
 src/graph/tuning.cc | 32 +-
 src/graph/xml.cc | 138 +-
 src/graph/xml.h | 25 +-
 src/group.cc | 261 +-
 src/include/align.h | 47 -
 src/include/alloc.h | 125 +-
 src/include/bitops.h | 277 ++
 src/include/channel.h | 41 +-
 src/include/checks.h | 8 +-
 src/include/collectives.h | 8 +
 src/include/comm.h | 282 +-
 src/include/cudawrap.h | 4 +
 src/include/debug.h | 9 +-
 src/include/device.h | 273 +-
 src/include/enqueue.h | 1 +
 src/include/gdrwrap.h | 9 +-
 src/include/graph.h | 7 +-
 src/include/group.h | 15 +-
 src/include/info.h | 123 -
 src/include/nccl_common.h | 30 +-
 src/include/nccl_tuner.h | 56 +-
 src/include/net.h | 4 +-
 src/include/nvmlwrap.h | 38 +
 src/include/nvtx.h | 2 +-
 src/include/nvtx3/nvToolsExt.h | 96 +-
 src/include/nvtx3/nvToolsExtCounters.h | 335 +++
 src/include/nvtx3/nvToolsExtCuda.h | 4 +-
 src/include/nvtx3/nvToolsExtCudaRt.h | 4 +-
 src/include/nvtx3/nvToolsExtMem.h | 694 +++++
 src/include/nvtx3/nvToolsExtMemCudaRt.h | 150 +
 src/include/nvtx3/nvToolsExtOpenCL.h | 6 +-
 src/include/nvtx3/nvToolsExtPayload.h | 977 ++++--
 src/include/nvtx3/nvToolsExtPayloadHelper.h | 170 ++
 .../nvtx3/nvToolsExtSemanticsCounters.h | 88 +
 src/include/nvtx3/nvToolsExtSemanticsScope.h | 30 +
 src/include/nvtx3/nvToolsExtSync.h | 26 +-
 src/include/nvtx3/nvtx3.hpp | 21 +-
 .../nvtx3/nvtxDetail/nvtxExtHelperMacros.h | 31 +
 .../nvtxExtImpl.h | 50 +-
 .../nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h | 148 +
 .../nvtxDetail/nvtxExtImplMemCudaRt_v1.h | 74 +
 .../nvtx3/nvtxDetail/nvtxExtImplMem_v1.h | 133 +
 .../nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h | 155 +
 .../nvtxExtInit.h | 161 +-
 .../nvtxDetail/nvtxExtPayloadHelperInternal.h | 272 ++
 .../nvtxExtPayloadTypeInfo.h | 20 +-
 .../nvtxExtTypes.h | 0
 src/include/nvtx3/nvtxDetail/nvtxImpl.h | 21 +-
 src/include/nvtx3/nvtxDetail/nvtxInit.h | 6 +-
 src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h | 2 +-
 .../nvtxExtDetail/nvtxExtImplPayload_v1.h | 86 -
 src/include/p2p.h | 3 +
 src/include/proxy.h | 26 +-
 src/include/register.h | 5 +
 src/include/transport.h | 36 +-
 src/include/tuner.h | 5 +-
 src/include/utils.h | 87 +-
 src/init.cc | 905 +++---
 src/init_nvtx.cc | 13 +-
 src/misc/argcheck.cc | 2 -
 src/misc/cudawrap.cc | 8 +
 src/misc/gdrwrap.cc | 10 +-
 src/misc/ipcsocket.cc | 4 +-
 src/misc/nvmlwrap.cc | 41 +
 src/misc/param.cc | 2 +-
 src/misc/shmutils.cc | 34 +-
 src/misc/socket.cc | 10 +-
 src/misc/tuner.cc | 179 +-
 src/misc/utils.cc | 79 +-
 src/nccl.h.in | 26 +
 src/net.cc | 169 +-
 src/proxy.cc | 90 +-
 src/register.cc | 2 +
 src/transport.cc | 14 +-
 src/transport/coll_net.cc | 301 +-
 src/transport/generic.cc | 36 +
 src/transport/net.cc | 20 +-
 src/transport/net_ib.cc | 159 +-
 src/transport/nvls.cc | 596 ++--
 src/transport/p2p.cc | 11 +-
 src/transport/shm.cc | 1 +
 115 files changed, 8595 insertions(+), 4326 deletions(-)
 create mode 100644 ext-net/example/nccl/common.h
 create mode 100644 ext-tuner/example/nccl/common.h
 create mode 100644 ext-tuner/example/nccl/err.h
 delete mode 100644 src/include/align.h
 create mode 100644 src/include/bitops.h
 create mode 100644 src/include/nvtx3/nvToolsExtCounters.h
 create mode 100644 src/include/nvtx3/nvToolsExtMem.h
 create mode 100644 src/include/nvtx3/nvToolsExtMemCudaRt.h
 create mode 100644 src/include/nvtx3/nvToolsExtPayloadHelper.h
 create mode 100644 src/include/nvtx3/nvToolsExtSemanticsCounters.h
 create mode 100644 src/include/nvtx3/nvToolsExtSemanticsScope.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h
 rename src/include/nvtx3/{nvtxExtDetail => nvtxDetail}/nvtxExtImpl.h (79%)
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h
 rename src/include/nvtx3/{nvtxExtDetail => nvtxDetail}/nvtxExtInit.h (71%)
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h
 rename src/include/nvtx3/{nvtxExtDetail => nvtxDetail}/nvtxExtPayloadTypeInfo.h (90%)
 rename src/include/nvtx3/{nvtxExtDetail => nvtxDetail}/nvtxExtTypes.h (100%)
 delete mode 100644 src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h
 create mode 100644 src/transport/generic.cc
diff --git a/ext-net/example/nccl/common.h b/ext-net/example/nccl/common.h new file mode 100644 index 000000000..912925225 --- /dev/null +++ b/ext-net/example/nccl/common.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef COMMON_H_ +#define COMMON_H_ + +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#endif diff --git a/ext-net/example/nccl/err.h b/ext-net/example/nccl/err.h index 0a2267719..bb92e8354 100644 --- a/ext-net/example/nccl/err.h +++ b/ext-net/example/nccl/err.h @@ -11,6 +11,7 @@ typedef enum { ncclSuccess = 0, ncclSystemError = 2, ncclInternalError = 3, ncclInvalidArgument = 4, + ncclInvalidUsage = 5, ncclRemoteError = 6 } ncclResult_t; #endif diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h index 2f455c60f..2aea8c439 100644 --- a/ext-net/example/nccl/net.h +++ b/ext-net/example/nccl/net.h @@ -8,6 +8,7 @@ #include #include +#include "common.h" #include "err.h" #define NCCL_NET_HANDLE_MAXSIZE 128 @@ -19,11 +20,6 @@ // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; - -typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); - #include "net_v8.h" #include "net_v7.h" #include "net_v6.h" diff --git a/ext-net/example/nccl/types.h b/ext-net/example/nccl/types.h index 0a5d83788..f43fdc163 100644 --- a/ext-net/example/nccl/types.h +++ b/ext-net/example/nccl/types.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_ERR_H_ -#define NCCL_ERR_H_ +#ifndef NCCL_TYPES_H_ +#define NCCL_TYPES_H_ /* Data types */ typedef enum { ncclInt8 = 0, ncclChar = 0, diff --git a/ext-tuner/example/nccl/common.h b/ext-tuner/example/nccl/common.h new file mode 100644 index 000000000..912925225 --- /dev/null +++ b/ext-tuner/example/nccl/common.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef COMMON_H_ +#define COMMON_H_ + +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#endif diff --git a/ext-tuner/example/nccl/err.h b/ext-tuner/example/nccl/err.h new file mode 100644 index 000000000..bb92e8354 --- /dev/null +++ b/ext-tuner/example/nccl/err.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_ERR_H_ +#define NCCL_ERR_H_ + +/* Error type for plugins */ +typedef enum { ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclRemoteError = 6 } ncclResult_t; + +#endif diff --git a/ext-tuner/example/nccl/tuner.h b/ext-tuner/example/nccl/tuner.h index 57825b99c..a1f18d393 100644 --- a/ext-tuner/example/nccl/tuner.h +++ b/ext-tuner/example/nccl/tuner.h @@ -8,15 +8,24 @@ #ifndef NCCL_TUNER_H_ #define NCCL_TUNER_H_ -#include "nccl.h" +#include +#include -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys; - -typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); +#include "common.h" +#include "err.h" #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now -typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; +typedef enum { + ncclFuncBroadcast = 0, + ncclFuncReduce = 1, + ncclFuncAllGather = 2, + ncclFuncReduceScatter = 3, + ncclFuncAllReduce = 4, + ncclFuncSendRecv = 5, + ncclFuncSend = 6, + ncclFuncRecv = 7, + ncclNumFuncs = 8 +} ncclFunc_t; #define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* #define NCCL_ALGO_UNDEF -1 @@ -33,6 +42,8 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncRed #define NCCL_PROTO_LL128 1 #define NCCL_PROTO_SIMPLE 2 +#define NCCL_ALGO_PROTO_IGNORE -1.0 + // API to be implemented by external tuner typedef struct { // Name of the tuner @@ -52,31 +63,33 @@ typedef struct { // - context: tuner context object // - collType: collective type , e.g., allreduce, allgather… // - nBytes: collective size in bytes - // - collNetSupport: whether collnet supports this type - // - nvlsSupport: whether nvlink sharp supports this time // - numPipeOps: number of operations in the group + // - numAlgo: number of algorithms in collCostTable + // - numProto: number of protocols in collCostTable // // Outputs: - // - algorithm: selected algorithm to be used for the given collective - // - protocol: selected protocol to be used for the given collective // - nChannels: number of 
channels (hence SMs) to be used. // + // InOut: + // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. + // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). + // // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the // default tuning for the given collective. // Also, the plugin is allowed to not set any output, or set only the // algorithm and protocol, but not only the algorithm or only the protocol. // Unset fields will be set automatically by NCCL. ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int collNetSupport, int nvlsSupport, int numPipeOps, - int *algorithm, int *protocol, int* nChannels); + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int* nChannels); // Terminates the plugin and cleans up any resources that the plugin allocated. // context: tuner context object ncclResult_t (*destroy)(void* context); -} ncclTuner_v2_t; +} ncclTuner_v3_t; -typedef ncclTuner_v2_t ncclTuner_t; +typedef ncclTuner_v3_t ncclTuner_t; -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2" +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3" #endif diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c index 3c669433a..c3cf00dfd 100644 --- a/ext-tuner/example/plugin.c +++ b/ext-tuner/example/plugin.c @@ -11,14 +11,21 @@ __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; } __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes, - int collNetSupport, int nvlsSupport, int numPipeOps, - int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; } + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int* nChannels) { + // Update NCCL core generated cost table. 
Updated table will be evaluated by NCCL to pick the best algo/proto combo + if (collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { + collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; + } + *nChannels = 1; + return ncclSuccess; +} __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } #define PLUGIN_NAME "Example" -const ncclTuner_v2_t ncclTunerPlugin_v2 = { +const ncclTuner_v3_t ncclTunerPlugin_v3 = { .name = PLUGIN_NAME, .init = pluginInit, .getCollInfo = pluginGetCollInfo, diff --git a/makefiles/version.mk b/makefiles/version.mk index d4da30daf..9039cb7dd 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 21 -NCCL_PATCH := 5 +NCCL_MINOR := 22 +NCCL_PATCH := 3 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/bootstrap.cc b/src/bootstrap.cc index cff2df50d..a7d775440 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -201,7 +201,6 @@ ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFrom ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) { memset(handle, 0, sizeof(ncclBootstrapHandle)); - NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic))); const char* env = ncclGetEnv("NCCL_COMM_ID"); if (env) { @@ -210,7 +209,9 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) { WARN("Invalid NCCL_COMM_ID, please use format: : or []: or :"); return ncclInvalidArgument; } + handle->magic = NCCL_MAGIC; } else { + NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic))); memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress)); NCCLCHECK(bootstrapCreateRoot(handle, false)); } @@ -626,7 +627,7 @@ ncclResult_t bootstrapClose(void* commState) { struct bootstrapState* state = (struct bootstrapState*)commState; if (state->unexpectedConnections != NULL) { unexpectedFree(state); - if (__atomic_load_n(state->abortFlag, __ATOMIC_RELAXED) == 0) { + if (__atomic_load_n(state->abortFlag, __ATOMIC_ACQUIRE) == 0) { WARN("Unexpected connections are not empty"); return ncclInternalError; } diff --git a/src/channel.cc b/src/channel.cc index 52591e0e7..b3a8f29b5 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -7,16 +7,17 @@ #include "channel.h" #include "param.h" #include "gdrwrap.h" +#include "transport.h" ncclResult_t initChannel(struct ncclComm* comm, int channelId) { struct ncclChannel* channel = &comm->channels[channelId]; if (channel->id != -1) return ncclSuccess; int nRanks = comm->nRanks; - int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks; + int nvlsRanks = comm->localRanks; int nPeers = nRanks + 1 /* Collnet */ + nvlsRanks /* NVLS */; channel->id = channelId; - channel->workFifoSent = 0; + channel->workFifoProduced = 0; struct ncclSharedResources* sharedRes = comm->sharedRes; @@ -74,7 +75,8 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); - int nvlsRanks = comm->MNNVL ? 
comm->clique.size : comm->localRanks; + int nvlsRanks = comm->localRanks; + if (share) { channel->nvlsPeers = parent->channels[channelId].nvlsPeers; channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers; diff --git a/src/collectives.cc b/src/collectives.cc index 571134fca..e21807e04 100644 --- a/src/collectives.cc +++ b/src/collectives.cc @@ -9,6 +9,69 @@ #include "enqueue.h" #include "nccl.h" +const char* ncclFuncToString(ncclFunc_t fn) { + switch (fn) { + case ncclFuncAllGather: return "AllGather"; + case ncclFuncAllReduce: return "AllReduce"; + case ncclFuncBroadcast: return "Broadcast"; + case ncclFuncRecv: return "Recv"; + case ncclFuncReduce: return "Reduce"; + case ncclFuncReduceScatter: return "ReduceScatter"; + case ncclFuncSendRecv: return "SendRecv"; + case ncclFuncSend: return "Send"; + default: return "Invalid"; + } +} + +const char* ncclDevRedOpToString(ncclDevRedOp_t op) { + switch (op) { + case ncclDevSum: return "Sum"; + case ncclDevProd: return "Prod"; + case ncclDevMinMax: return "MinMax"; + case ncclDevPreMulSum: return "PreMulSum"; + case ncclDevSumPostDiv: return "SumPostDiv"; + default: return "Unknown"; + } +} + +const char* ncclDatatypeToString(ncclDataType_t type) { + switch (type) { + case ncclInt8: return "ncclInt8"; + case ncclInt32: return "ncclInt32"; + case ncclUint32: return "ncclUint32"; + case ncclInt64: return "ncclInt64"; + case ncclUint64: return "ncclUint64"; + case ncclFloat16: return "ncclFloat16"; + case ncclFloat32: return "ncclFloat32"; + case ncclFloat64: return "ncclFloat64"; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: return "ncclBfloat16"; +#endif + default: return "Unknown"; + } +} + +const char* ncclAlgoToString(int algo) { + switch (algo) { + case NCCL_ALGO_TREE: return "TREE"; + case NCCL_ALGO_RING: return "RING"; + case NCCL_ALGO_COLLNET_DIRECT: return "COLLNET_DIRECT"; + case NCCL_ALGO_COLLNET_CHAIN: return "COLLNET_CHAIN"; + case NCCL_ALGO_NVLS: return "NVLS"; + case NCCL_ALGO_NVLS_TREE: return "NVLS_TREE"; + default: return "Unknown"; + } +} + +const char* ncclProtoToString(int proto) { + switch (proto) { + case NCCL_PROTO_LL: return "LL"; + case NCCL_PROTO_LL128: return "LL128"; + case NCCL_PROTO_SIMPLE: return "SIMPLE"; + default: return "Unknown"; + } +} + NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, diff --git a/src/debug.cc b/src/debug.cc index 522999b44..dde8e8fcb 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -8,7 +8,10 @@ #include "nccl_net.h" #include #include +#include +#include #include +#include #include "param.h" int ncclDebugLevel = -1; @@ -16,14 +19,15 @@ static int pid = -1; static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; char ncclLastError[1024] = ""; // Global string for the last error in human readable form -uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV +static uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV FILE *ncclDebugFile = stdout; -pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; -std::chrono::steady_clock::time_point ncclEpoch; +static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; +static std::chrono::steady_clock::time_point ncclEpoch; +static bool ncclWarnSetDebugInfo = false; static __thread int tid = -1; -void ncclDebugInit() { +static void 
ncclDebugInit() { pthread_mutex_lock(&ncclDebugLock); if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; } const char* nccl_debug = ncclGetEnv("NCCL_DEBUG"); @@ -83,6 +87,8 @@ void ncclDebugInit() { mask = NCCL_BOOTSTRAP; } else if (strcasecmp(subsys, "REG") == 0) { mask = NCCL_REG; + } else if (strcasecmp(subsys, "PROFILE") == 0) { + mask = NCCL_PROFILE; } else if (strcasecmp(subsys, "ALL") == 0) { mask = NCCL_ALL; } @@ -94,6 +100,15 @@ void ncclDebugInit() { free(ncclDebugSubsys); } + const char* ncclWarnSetDebugInfoEnv = ncclGetEnv("NCCL_WARN_ENABLE_DEBUG_INFO"); + if (ncclWarnSetDebugInfoEnv != NULL && strlen(ncclWarnSetDebugInfoEnv) > 0) { + int64_t value; + errno = 0; + value = strtoll(ncclWarnSetDebugInfoEnv, NULL, 0); + if (!errno) + ncclWarnSetDebugInfo = value; + } + // Cache pid and hostname getHostName(hostname, 1024, '.'); pid = getpid(); @@ -143,8 +158,6 @@ void ncclDebugInit() { pthread_mutex_unlock(&ncclDebugLock); } -NCCL_PARAM(WarnSetDebugInfo, "WARN_ENABLE_DEBUG_INFO", 0); - /* Common logging function used by the INFO, WARN and TRACE macros * Also exported to the dynamically loadable Net transport modules so * they can share the debugging mechanisms and output files @@ -178,7 +191,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file if (level == NCCL_LOG_WARN) { len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line); - if (ncclParamWarnSetDebugInfo()) ncclDebugLevel = NCCL_LOG_INFO; + if (ncclWarnSetDebugInfo) ncclDebugLevel = NCCL_LOG_INFO; } else if (level == NCCL_LOG_INFO) { len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev); } else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) { @@ -190,17 +203,15 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file hostname, pid, tid, cudaDev, timestamp, filefunc, line); } - if (len) { - va_list vargs; - va_start(vargs, fmt); - len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); - va_end(vargs); - // vsnprintf may return len > sizeof(buffer) in the case of a truncated output. - // Rewind len so that we can replace the final \0 by \n - if (len > sizeof(buffer)) len = sizeof(buffer)-1; - buffer[len++] = '\n'; - fwrite(buffer, 1, len, ncclDebugFile); - } + va_list vargs; + va_start(vargs, fmt); + len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); + va_end(vargs); + // vsnprintf may return len > sizeof(buffer) in the case of a truncated output. 
+ // Rewind len so that we can replace the final \0 by \n + if (len > sizeof(buffer)) len = sizeof(buffer)-1; + buffer[len++] = '\n'; + if (len) fwrite(buffer, 1, len, ncclDebugFile); } NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); diff --git a/src/device/all_gather.h b/src/device/all_gather.h index 809e8ae12..8fe224848 100644 --- a/src/device/all_gather.h +++ b/src/device/all_gather.h @@ -10,30 +10,26 @@ namespace { template - __device__ __forceinline__ void runRing(ncclWorkElem *args) { - const int tid = threadIdx.x; - const int nthreads = (int)args->nWarps * WARP_SIZE; + __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; const int *ringRanks = ring->userRanks; const int nranks = ncclShmem.comm.nRanks; - const size_t chunkCount = args->chunkCount; - const size_t channelCount = args->workCount; - const size_t gridOffset = args->workOffset; - const size_t count = args->count; + size_t count, partOffset, partCount, chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount); size_t offset; size_t dataOffset; int nelem; int rankDest; - T *inputBuf = (T*)args->sendbuff; - T *outputBuf = (T*)args->recvbuff; + T *inputBuf = (T*)work->sendbuff; + T *outputBuf = (T*)work->recvbuff; Primitives, 1, Proto, 0> prims - (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg); + (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg); - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) { /////////////// begin AllGather steps /////////////// - nelem = min(chunkCount, channelCount - elemOffset); - dataOffset = gridOffset + elemOffset; + nelem = min(chunkCount, partCount - elemOffset); + dataOffset = partOffset + elemOffset; // step 0: push data to next GPU rankDest = ringRanks[0]; @@ -64,52 +60,50 @@ namespace { } template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { using Proto = ProtoSimple; - runRing(args); + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - const int tid = threadIdx.x; +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; - const ssize_t count = args->count; const ssize_t rank = ncclShmem.comm.rank; - const size_t chunkCount = args->chunkCount; - size_t gridOffset = args->workOffset; - size_t channelCount = args->workCount; + size_t count, gridOffset, channelCount; + size_t chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); size_t 
offset; int nelem; - const int nThreadsBcast = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE; - const int nThreadsGather = args->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast; + const int nThreadsBcast = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE; + const int nThreadsGather = work->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast; const int tidEndGather = nThreadsGather; const int tidEndBcast = tidEndGather + nThreadsBcast; - if (!args->regUsed) { + if (!work->regUsed) { if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, - args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + prims(tid, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); @@ -119,8 +113,8 @@ struct RunWorkElement; Primitives, /*Direct=*/0, Proto, 0> - prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL, - args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); + prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, work->sendbuff, NULL, + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); @@ -133,7 +127,7 @@ struct RunWorkElement; Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL, - args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); /* used as sync */ prims.scatter(0, 0, 0, 0, -1, 0); @@ -144,8 +138,8 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> - prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, args->sendbuff, NULL, - args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, args); + prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, work->sendbuff, NULL, + work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, work); /* used as sync */ prims.recv(0, 0); @@ -161,10 +155,10 @@ struct RunWorkElement -struct RunWorkElement { +struct RunWorkColl { template struct Scatterer { - struct ncclWorkElem* args; + struct ncclDevWorkColl* work; ssize_t chunkSize; ssize_t railGridOffset; @@ -179,13 +173,13 @@ struct RunWorkElementnHeads; - int bid = args->bid; - char* inbuf = (char*)args->sendbuff; - char* outbuf = (char*)args->recvbuff; - ssize_t sizePerRank = args->count*sizeof(T); + int part = ncclShmem.channelId - work->channelLo; + char* inbuf = (char*)work->sendbuff; + char* outbuf = (char*)work->recvbuff; + ssize_t sizePerRank = work->collnet.count*sizeof(T); bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*sizePerRank); - ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank); + ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank); ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank); int railAllSize = railAllEnd - railAllBeg; if (tid < nDsts) dstSizes[tid] = railAllSize; @@ -232,28 +226,27 @@ struct RunWorkElementnChannels; + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { + const int part = ncclShmem.channelId - work->channelLo; + const int nChannels = work->channelHi - work->channelLo + 1; struct 
ncclDirect* direct = &ncclShmem.channel.collnetDirect; int const &nNodes = ncclShmem.comm.nNodes; - ssize_t chunkSize = int(args->chunkCount); - ssize_t const &sizePerRank = args->count; - + ssize_t sizePerRank = work->collnet.count*sizeof(T); + size_t chunkSize = work->collnet.chunkCount; bool isMultiRail = (direct->nHeads > 1); int nWarps1 = 1; int nWarps2 = (isMultiRail ? 2 : 1); int nWarps3 = (isMultiRail ? 2 : 0); - float denom = float(args->nWarps)/float(nWarps1+nWarps2+nWarps3); + float denom = float(work->nWarps)/float(nWarps1+nWarps2+nWarps3); nWarps3 = int(denom*nWarps3); nWarps2 = int(denom*nWarps2); - nWarps1 = args->nWarps - (nWarps2+nWarps3); + nWarps1 = work->nWarps - (nWarps2+nWarps3); using Proto = ProtoSimple<1, 1>; int tn = nWarps1*WARP_SIZE; if (tid < tn) { - if (args->regUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == 0) { int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps); @@ -262,10 +255,10 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> - prims(tid, tn, nullptr, &direct->out, args->sendbuff, nullptr, + prims(tid, tn, nullptr, &direct->out, work->sendbuff, nullptr, /*redOpArg=*/0, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { - ssize_t railAllBeg = railGridOffset + args->bid * chunkSize; + ssize_t railAllBeg = railGridOffset + part * chunkSize; ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank); ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank; ssize_t railOneEnd = railOneBeg + sizePerRank; @@ -280,7 +273,7 @@ struct RunWorkElementregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == 0) { int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps); @@ -293,10 +286,10 @@ struct RunWorkElement scat; - scat.args = args; + scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.process(scat); + prims.template process(scat); } } return; @@ -311,10 +304,10 @@ struct RunWorkElement scat; - scat.args = args; + scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.process(scat); + prims.template process(scat); } return; } diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index 49f8dc65b..293138f4d 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -10,28 +10,27 @@ namespace { template - __device__ __forceinline__ void runRing(ncclWorkElem *args) { - const int tid = threadIdx.x; - const int nthreads = (int)args->nWarps * WARP_SIZE; + __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; int ringIx = ring->index; - ssize_t chunkCount = args->chunkCount; const int nranks = ncclShmem.comm.nRanks; + ssize_t gridOffset; + ssize_t channelCount; + ssize_t chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount); const ssize_t loopCount = nranks * chunkCount; ssize_t offset; - ssize_t gridOffset = args->workOffset; - ssize_t channelCount = args->workCount; int nelem; int chunk; Primitives, 1, Proto, 0> prims - (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); + 
(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t remCount = channelCount - elemOffset; ssize_t chunkOffset; - if (remCount < loopCount) chunkCount = args->lastChunkCount; + if (remCount < loopCount) chunkCount = alignUp(divUp(remCount, nranks), 16/sizeof(T)); auto modRanks = [&]__device__(int r)->int { return r - (r >= nranks ? nranks : 0); @@ -75,24 +74,24 @@ namespace { chunkOffset = chunk * chunkCount; offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); + prims.directRecv(offset, nelem); } } template - __device__ __forceinline__ void runTreeUpDown(ncclWorkElem *args) { - const int tid = threadIdx.x; - const int nthreads = (int)args->nWarps * WARP_SIZE; + __device__ __forceinline__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclTree *tree = &ncclShmem.channel.tree; - const size_t channelCount = args->workCount; - const size_t gridOffset = args->workOffset; - const size_t chunkCount = args->chunkCount; + size_t gridOffset; + size_t channelCount; + size_t chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); size_t offset; int nelem; { // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) Primitives, /*Direct=*/0, Proto, 0> prims - (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg); + (tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg); if (tree->up == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; @@ -118,7 +117,7 @@ namespace { { // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) Primitives, /*Direct=*/1, Proto, 0> prims - (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg); + (tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg); if (tree->up == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; @@ -144,16 +143,14 @@ namespace { } template - __device__ __forceinline__ void runTreeSplit(ncclWorkElem *args) { - const int tid = threadIdx.x; - const int nthreads = (int)args->nWarps * WARP_SIZE; + __device__ __forceinline__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclTree *tree = &ncclShmem.channel.tree; - const size_t chunkCount = args->chunkCount; - const size_t gridOffset = args->workOffset; - const size_t channelCount = args->workCount; + size_t gridOffset; + size_t channelCount; + size_t chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); size_t offset; int nelem; - int nthreadsSplit; if (Proto::Id == NCCL_PROTO_SIMPLE) { nthreadsSplit = nthreads/2; @@ -167,7 +164,7 @@ namespace { if (tree->up == -1) { // Reduce and broadcast. 
Max number of recv is 2, max number of send is 2 Primitives, /*Direct=*/1, Proto, 0> - prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg); + prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); @@ -184,7 +181,7 @@ namespace { * but the ctor above for tree roots would be DirectRecv=0 DirectSend=1. */ Primitives, /*Direct=*/1, Proto, 0> - prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth); + prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth); if (tree->down[0] == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; @@ -203,8 +200,8 @@ namespace { else { // Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local) Primitives, /*Direct=*/1, Proto, 0> - prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, - args->redOpArg, 1*Proto::MaxGroupWidth); + prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, work->sendbuff, work->recvbuff, + work->redOpArg, 1*Proto::MaxGroupWidth); if (tree->down[0] == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; @@ -224,34 +221,33 @@ namespace { } template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { using Proto = ProtoSimple; - runRing(args); + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { #if CUDART_VERSION >= 11020 && CUDART_VERSION < 11040 && __CUDA_ARCH__ >= 800 - runTreeUpDown>(args); + runTreeUpDown>(tid, nthreads, work); #else - runTreeSplit>(args); + runTreeSplit>(tid, nthreads, work); #endif } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { static constexpr int COLLNET_COPY_THREADS = 96; - const int tid = threadIdx.x; - const int bid = args->bid; - const int nChannels = args->nChannels; + const int bid = ncclShmem.channelId - work->channelLo; + const int nChannels = work->channelHi - work->channelLo + 1; struct ncclDirect* direct = &ncclShmem.channel.collnetDirect; - const ssize_t chunkSize = args->chunkCount; - const ssize_t size = args->count; + const ssize_t chunkSize = work->collnet.chunkCount; + const ssize_t size = work->collnet.count; const ssize_t loopSize = nChannels*direct->nHeads*chunkSize; const int hasUp = (direct->up[0] >= 0) ? 
1 : 0; @@ -259,7 +255,7 @@ struct RunWorkElementnWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast; + const int nThreadsReduce = work->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast; const int tidStartBcast = nThreadsGather; const int tidStartScatter = tidStartBcast + nThreadsBcast; const int tidStartReduce = tidStartScatter + nThreadsScatter; @@ -269,12 +265,12 @@ struct RunWorkElement= tidStartScatter && tid < tidStartReduce && hasUp) { // Scatter Primitives, /*Direct=*/1, Proto, 0> - prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff, - args->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, args); + prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, work->sendbuff, work->recvbuff, + work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; int nelem = min(direct->nHeads*chunkSize, size-offset); - if (args->regUsed) { + if (work->regUsed) { prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); } else { prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); @@ -284,12 +280,12 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff, - args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, args); + prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, work->sendbuff, work->recvbuff, + work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); - if (args->regUsed) { + if (work->regUsed) { prims.directRecvReduceSend(offset, nelem); } else { prims.recvReduceSend(offset, nelem); @@ -297,7 +293,7 @@ struct RunWorkElementregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == tidStartReduce) { int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps); @@ -305,8 +301,8 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> - prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff, - args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); + prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, work->sendbuff, work->recvbuff, + work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); @@ -317,8 +313,8 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff, - args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, args); + prims(tid, nThreadsGather, direct->up, NULL, work->sendbuff, work->recvbuff, + work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; int nelem = min(direct->nHeads*chunkSize, size-offset); @@ -328,15 +324,15 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff, - args->redOpArg, 
1*Proto::MaxGroupWidth, 0, 0, args); + prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, work->sendbuff, work->recvbuff, + work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true); } } else { - if (args->regUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == tidStartBcast) { int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps); @@ -345,8 +341,8 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> - prims(tid - tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff, - args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0); + prims(tid - tidStartBcast, nThreadsBcast, &direct->out, nullptr, work->sendbuff, work->recvbuff, + work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize; int nelem = min(chunkSize, size - offset); @@ -359,18 +355,16 @@ struct RunWorkElement -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - const int tid = threadIdx.x; +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; - ssize_t chunkSize = args->chunkCount; const bool hasOut = nvls->out != -1; const int nranks = ncclShmem.comm.nRanks; const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE; - const int bcastWarps = hasOut ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0; - const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5); - const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1; - const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1; + const int bcastWarps = hasOut ? (work->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0; + const int reduceWarps = work->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5); + const int scatterWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1; + const int gatherWarps = work->regUsed ? 
1 : (totalWarps - reduceWarps - bcastWarps) >> 1; const int nThreadsScatter = scatterWarps*WARP_SIZE; const int nThreadsGather = gatherWarps*WARP_SIZE; @@ -381,35 +375,37 @@ struct RunWorkElementoneNode) { + if (work->oneNode) { + ssize_t gridOffset, channelCount, chunkSize; + ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkSize); const ssize_t loopCount = nvls->nHeads * chunkSize; - const ssize_t channelCount = args->workCount; - const ssize_t gridOffset = args->workOffset; ssize_t offset; int nelem; + int remCount = channelCount%(nvls->nHeads*chunkSize); + int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T)); if (tid < tidEndScatter) { // Scatter using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, - args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { - if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize; offset = gridOffset + elemOffset; - nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset); + nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset); prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, - args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); + prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, + work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { - if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize; offset = gridOffset + elemOffset; - nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset); + nelem = work->regUsed ? 
0 : min(loopCount, channelCount - elemOffset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndReduce) { @@ -417,10 +413,10 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL, - args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args); + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t chunkOffset; - if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize; chunkOffset = elemOffset + nvls->headRank * chunkSize; offset = gridOffset + chunkOffset; nelem = min(chunkSize, channelCount - chunkOffset); @@ -428,30 +424,32 @@ struct RunWorkElementbid; - const ssize_t loopSize = args->nChannels * nvls->nHeads * chunkSize; - const ssize_t size = args->count; + const int bid = ncclShmem.channelId - work->channelLo; + const int nChannels = work->channelHi - work->channelLo + 1; + const ssize_t chunkSize = work->collnet.chunkCount; + const ssize_t loopSize = nChannels * nvls->nHeads * chunkSize; + const ssize_t size = work->collnet.count; if (tid < tidEndScatter) { // Scatter using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, - args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize; - int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset); + int nelem = work->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset); prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, - args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); + prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, + work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize; - int nelem = args->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset); + int nelem = work->regUsed ? 
0 :min(nvls->nHeads * chunkSize, size - offset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndReduce && nvls->headRank != -1) { @@ -460,7 +458,7 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL, - args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args); + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; int nelem = min(chunkSize, size - offset); @@ -471,7 +469,7 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL, - args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, args); + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; int nelem = min(chunkSize, size - offset); @@ -483,7 +481,7 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL, - args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args); + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; int nelem = min(chunkSize, size - offset); @@ -495,25 +493,25 @@ struct RunWorkElement -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - const int tid = threadIdx.x; +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; const int treeUp = nvls->treeUp; const int* treeDown = nvls->treeDown; - ssize_t chunkCount = args->chunkCount; + ssize_t gridOffset, channelCount, chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount); const ssize_t loopCount = nvls->nHeads * chunkCount; - const ssize_t channelCount = args->workCount; - const ssize_t gridOffset = args->workOffset; const int nranks = ncclShmem.comm.nRanks; const bool hasUp = treeUp != -1; const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE; - const int bcastWarps = hasUp ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 4) : 0; - const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5); - const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1; - const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1; + const int bcastWarps = hasUp ? (work->regUsed ? ((totalWarps - 2) >> 1) - 1 : 4) : 0; + const int reduceWarps = work->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5); + const int scatterWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1; + const int gatherWarps = work->regUsed ? 
1 : (totalWarps - reduceWarps - bcastWarps) >> 1; ssize_t offset; int nelem; + int remCount = channelCount%(nvls->nHeads*chunkCount); + int lastChunkCount = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T)); const int nThreadsScatter = scatterWarps*WARP_SIZE; const int nThreadsGather = gatherWarps*WARP_SIZE; @@ -528,24 +526,24 @@ struct RunWorkElement; Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, - args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { - if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount; offset = gridOffset + elemOffset; - nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset); + nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset); prims.scatter(offset, nelem, chunkCount, chunkCount, -1, 0); } } else if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, - args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); + prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, + work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { - if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount; offset = gridOffset + elemOffset; - nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset); + nelem = work->regUsed ? 
0 : min(loopCount, channelCount - elemOffset); prims.gather(offset, nelem, chunkCount, chunkCount, -1, 0); } } else if (tid < tidEndReduce && nvls->headRank != -1) { @@ -554,10 +552,10 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL, - args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args); + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t chunkOffset; - if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount; chunkOffset = elemOffset + nvls->headRank * chunkCount; offset = gridOffset + chunkOffset; nelem = min(chunkCount, channelCount - chunkOffset); @@ -568,10 +566,10 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL, - args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args); + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t chunkOffset; - if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount; chunkOffset = elemOffset + nvls->headRank * chunkCount; offset = gridOffset + chunkOffset; nelem = min(chunkCount, channelCount - chunkOffset); @@ -583,10 +581,10 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL, - args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args); + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t chunkOffset; - if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount; chunkOffset = elemOffset + nvls->headRank * chunkCount; offset = gridOffset + chunkOffset; nelem = min(chunkCount, channelCount - chunkOffset); @@ -597,17 +595,15 @@ struct RunWorkElement -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - const int tid = threadIdx.x; - const int nthreads = args->nWarps*WARP_SIZE; - const int bid = args->bid; - const int nChannels = args->nChannels; +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + const int bid = ncclShmem.channelId - work->channelLo; + const int nChannels = work->channelHi - work->channelLo + 1; ncclTree *tree = &ncclShmem.channel.collnetChain; - ssize_t chunkSize = args->chunkCount; + ssize_t chunkSize = work->collnet.chunkCount; const ssize_t loopSize = int(nChannels*chunkSize); const int nranks = ncclShmem.comm.nRanks; - const ssize_t size = args->count; + const ssize_t size = work->collnet.count; int nthreadsSplit = nthreads/2; if (nthreadsSplit >= 256) nthreadsSplit += 64; @@ -634,7 +630,7 @@ struct RunWorkElementregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (groupTid == 0) { int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, steps); @@ -642,8 +638,8 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, - args->redOpArg, group * 
Proto::MaxGroupWidth, connIndex, connIndex); + prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, + work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); @@ -652,8 +648,8 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, - args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); + prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, + work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); @@ -665,7 +661,7 @@ struct RunWorkElementregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (groupTid == 0) { int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, steps); @@ -673,8 +669,8 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, - args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); + prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, + work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); @@ -683,8 +679,8 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, - args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); + prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, + work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); @@ -693,8 +689,8 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, - args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); + prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, + work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); if (send == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); @@ -714,29 +710,29 @@ struct RunWorkElement -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runTreeSplit(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runTreeSplit(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct 
ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runTreeSplit(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runTreeSplit(tid, nthreads, work); } }; diff --git a/src/device/broadcast.h b/src/device/broadcast.h index 86d45e77e..7026adc3d 100644 --- a/src/device/broadcast.h +++ b/src/device/broadcast.h @@ -10,23 +10,22 @@ namespace { template - __device__ __forceinline__ void runRing(ncclWorkElem *args) { - const int tid = threadIdx.x; - const int nthreads = (int)args->nWarps * WARP_SIZE; + __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; const int rank = ring->userRanks[0]; const int nextRank = ring->userRanks[1]; - const int root = args->root; - const size_t chunkCount = args->chunkCount; - const size_t channelCount = args->workCount; - const size_t gridOffset = args->workOffset; + const int root = work->root; + size_t chunkCount; + size_t channelCount; + size_t gridOffset; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); size_t offset; int nelem; - T *inputBuf = (T*)args->sendbuff; - T *outputBuf = (T*)args->recvbuff; + T *inputBuf = (T*)work->sendbuff; + T *outputBuf = (T*)work->recvbuff; Primitives, 0, Proto, 0> - prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg); + prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; @@ -48,23 +47,23 @@ namespace { } template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { using Proto = ProtoSimple; - runRing(args); + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; diff --git a/src/device/common.cu b/src/device/common.cu index d1b6acd1b..a8b5ed571 100644 --- a/src/device/common.cu +++ b/src/device/common.cu @@ -14,11 +14,11 @@ __shared__ ncclShmemData ncclShmem; #endif struct RunWorkNop { - __device__ void run(ncclWork *w) {} + __device__ void run() {} }; -__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { - ncclKernelMain<-1, RunWorkNop>(comm, channelMask, workHead); +__global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { + ncclKernelMain<-1, RunWorkNop>(&args4K.args); } __device__ void ncclDevFunc_Nop() {} diff --git a/src/device/common.h b/src/device/common.h index d8581d3f4..5fa7be9ce 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -10,10 +10,19 @@ #include "collectives.h" #include "device.h" #include "op128.h" +#include "reduce_kernel.h" #include 
"network/unpack/unpack_defs.h" #define COLL_UNROLL (ncclCollUnroll()) +#if __CUDA_ARCH__ >= 700 +// __grid_constant__ appears to break cuda-gdb +//#define NCCL_GRID_CONSTANT __grid_constant__ +#define NCCL_GRID_CONSTANT +#else +#define NCCL_GRID_CONSTANT +#endif + typedef void(*ncclDevFuncPtr_t)(); extern __device__ ncclDevFuncPtr_t const ncclDevFuncTable[]; @@ -31,18 +40,28 @@ struct ncclShmemGroup { }; struct ncclShmemData { - struct ncclShmemGroup groups[NCCL_MAX_GROUPS]; - uint64_t redOpArgs[NCCL_MAX_ARITY+1]; + struct ncclDevKernelArgs args; int channelId; int aborted; alignas(16) struct ncclDevComm comm; alignas(16) struct ncclDevChannel channel; - alignas(16) struct ncclWork work; + + int batchIx, nextBatchIx; + enum ncclDevWorkType workType; + uint8_t directMode; + uint16_t funcId; + int nWorks; + int workSize; + uint32_t workConsumed; + struct ncclShmemGroup groups[NCCL_MAX_GROUPS]; + uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1]; + + alignas(16) char workStorage[1024]; + alignas(16) union { unpackShmem unpack; } devicePlugin; }; -static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned"); extern __shared__ ncclShmemData ncclShmem; #if __CUDA_ARCH__ >= 700 @@ -55,14 +74,62 @@ __device__ inline void* ncclScratchForWarp(int warp) { return (char*)ncclShmemPerWarp + warp*ncclShmemScratchWarpSize(); } -__device__ inline bool barrierReduceAny(int bit) { - uint32_t popc; - asm ("{" - ".reg .pred barr_pred;" - "setp.eq.u32 barr_pred, %1, 1;" - "bar.red.popc.u32 %0, 2, barr_pred;" - "}" : "=r"(popc) : "r"(bit)); - return popc != 0; +__device__ inline void barrier_sync(int name) { + #if 0 + asm volatile("barrier.sync %0;" :: "r"(name) : "memory"); + #else + asm volatile("barrier.sync.aligned %0;" :: "r"(name) : "memory"); + #endif +} +__device__ inline void barrier_sync(int name, int nThreads) { + #if 0 + asm volatile("barrier.sync %0, %1;" :: "r"(name), "r"(nThreads) : "memory"); + #else + asm volatile("barrier.sync.aligned %0, %1;" :: "r"(name), "r"(nThreads) : "memory"); + #endif +} +__device__ inline void barrier_sync_aligned(int name) { + asm volatile("barrier.sync.aligned %0;" :: "r"(name) : "memory"); +} +__device__ inline void barrier_sync_aligned(int name, int nThreads) { + asm volatile("barrier.sync.aligned %0, %1;" :: "r"(name), "r"(nThreads) : "memory"); +} + +__device__ inline bool barrier_red_or(bool vote, int name) { + int ans; + asm("{ .reg .pred p;" + " setp.ne.s32 p, %1, 0;" + " barrier.red.or.pred p, %2, p; " + " selp.s32 %0, 1, 0, p; }" + : "=r"(ans) : "r"((int)vote), "r"(name) : "memory"); + return bool(ans); +} +__device__ inline bool barrier_red_or(bool vote, int name, int nThreads) { + int ans; + asm("{ .reg .pred p;" + " setp.ne.s32 p, %1, 0;" + " barrier.red.or.pred p, %2, %3, p; " + " selp.s32 %0, 1, 0, p; }" + : "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory"); + return bool(ans); +} +__device__ inline bool barrier_red_or_aligned(bool vote, int name) { + int ans; + asm("{ .reg .pred p;" + " setp.ne.s32 p, %1, 0;" + " barrier.red.or.pred.aligned p, %2, p; " + " selp.s32 %0, 1, 0, p; }" + : "=r"(ans) : "r"((int)vote), "r"(name) : "memory"); + return bool(ans); +} +__device__ inline bool barrier_red_or_aligned(bool vote, int name, int nThreads) { + int ans; + asm("{ .reg .pred p;" + " setp.ne.s32 p, %1, 0;" + " barrier.red.or.pred.aligned p, %2, %3, p; " + " selp.s32 %0, 1, 0, p; }" + : "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory"); + return bool(ans); } // Copy 16-byte aligned data. 
You must call with at least `(bytes+15)/16` threads. @@ -71,158 +138,261 @@ inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int by if (offset < bytes) { uint64_t a=0, b=0; asm("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset)); - asm volatile("st.v2.u64 [%0],{%1,%2};" :: "l"((char*)dst + offset), "l"(a), "l"(b)); + uint32_t udst = (uint32_t)__cvta_generic_to_shared(dst); + asm volatile("st.shared.v2.u64 [%0],{%1,%2};" :: "r"(udst + offset), "l"(a), "l"(b)); + } +} + +// Must run with at least 64 threads +__device__ __forceinline__ void loadWorkBatchToShmem( + int tid, int tn, struct ncclDevKernelArgs const* args, int batchIx + ) { + int lane = tid%WARP_SIZE; + int workCursor = 0; // num works written in previous loop iterations. + while (true) { + struct ncclDevWorkBatch batch = ((struct ncclDevWorkBatch*)(args+1))[batchIx]; + + // fnsOfBitset[n] = index of n'th set bit in batch.offsetBitset. + // PTX has instruction "fns" (find n-th set) but it expands to a lot of SASS, + // since we know all lanes will be querying the same bitmask we can compute + // much faster using shared memory. + uint8_t* fnsOfBitset = (uint8_t*)ncclScratchForWarp(threadIdx.x/WARP_SIZE); + __syncwarp(); + if (uint32_t(batch.offsetBitset) & (1u<>32) & (1u<>32) & ((1u<>32)); // add high 32 bits + __syncwarp(); + + int workSize; + int nPacks; // total number of packs loaded, each pack is 16 bytes + int packInWork; // my pack index within work struct + int dstWork; // my work index in contiguous destination shmem + switch (batch.workType) { + case (int)ncclDevWorkTypeP2p: + workSize = sizeof(struct ncclDevWorkP2p); + nPacks = nWorks*(workSize/16); + packInWork = tid%(workSize/16); + dstWork = tid/(workSize/16); + break; + case (int)ncclDevWorkTypeColl: + workSize = sizeof(struct ncclDevWorkColl); + nPacks = nWorks*(workSize/16); + packInWork = tid%(workSize/16); + dstWork = tid/(workSize/16); + break; + case (int)ncclDevWorkTypeCollReg: + default: + workSize = sizeof(struct ncclDevWorkCollReg); + nPacks = nWorks*(workSize/16); + packInWork = tid%(workSize/16); + dstWork = tid/(workSize/16); + break; + } + if (tid == 0) { + ncclShmem.workSize = workSize; + ncclShmem.workConsumed = batch.offsetBase + (64-__clzll(batch.offsetBitset))*workSize; + } + // We deliberately replicate these div and mod calculations into the case + // blocks above so that they get constant divisor optimizations by the compiler. + // packInWork = tid%(workSize/16); + // dstWork = tid/(workSize/16); + + // We can only assume we have 64 threads, which means we can read at most 1024 bytes + // here which is the per batch maximum. + if (tid < nPacks) { + int srcWork = fnsOfBitset[dstWork]; // find n'th set bit in batch.offsetBitset + ulong2 tmp; + // The loads done in these two cases must be kept separate since we are + // relying on the compiler to use "ld.param" in the first one. The parameter + // space is not generically addressable, so any attempt to load through + // a pointer that *might* be parameter space backed will cause the + // compiler to spill the parameter struct (4K!) to each thread's local space + // before creating a pointer (to the spill) and decimate perf. + // + // An example of what not to do would be the following: + // + // if (condition) { + // // The compiler could spill parameter_variable to local space and take + // // the address of that, since when src is loaded below it could also + // // be global space. 
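
The "find n-th set bit" table built above by loadWorkBatchToShmem can be exercised in isolation. Below is a minimal sketch of the same trick, assuming a single warp and a fixed 64-bit mask: every lane tests its own bit and, when set, stores its bit position at index popc(bits below it). The kernel, mask value and array names are illustrative, not NCCL's.

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void fnsKernel(unsigned long long mask, unsigned char* fnsOfBitset, int* nSet) {
      int lane = threadIdx.x;                            // single warp: lanes 0..31
      unsigned int lo = (unsigned int)mask;
      unsigned int hi = (unsigned int)(mask >> 32);
      if (lo & (1u << lane))                             // position among low 32 bits
        fnsOfBitset[__popc(lo & ((1u << lane) - 1))] = lane;
      if (hi & (1u << lane))                             // offset by count of low set bits
        fnsOfBitset[__popc(lo) + __popc(hi & ((1u << lane) - 1))] = 32 + lane;
      if (lane == 0) *nSet = __popc(lo) + __popc(hi);
    }

    int main() {
      unsigned long long mask = 0x8000000100000005ull;   // bits 0, 2, 32, 63 set
      unsigned char* fns; int* n;
      cudaMallocManaged(&fns, 64);
      cudaMallocManaged(&n, sizeof(int));
      fnsKernel<<<1, 32>>>(mask, fns, n);
      cudaDeviceSynchronize();
      for (int i = 0; i < *n; i++) printf("fns[%d] = %d\n", i, (int)fns[i]);
      return 0;                                          // prints 0, 2, 32, 63
    }
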
+ // src = ¶meter_variable; + // } else { + // src = &global_variable; + // } + // memcpy(dst, src, n); + if (ncclShmem.args.workStorageType == ncclDevWorkStorageTypeArgs) { + char* src = (char*)args + (batch.offsetBase + srcWork*workSize + packInWork*16); + tmp = *(ulong2*)src; // becomes ld.param.v2.u64 + } else { + char* src = (char*)ncclShmem.args.workBuf + ((batch.offsetBase + srcWork*workSize + packInWork*16) & ncclShmem.args.workMask); + tmp = *(ulong2*)src; // becomes ld.v2.u64 + } + char* dst = ncclShmem.workStorage; + dst += (workCursor + dstWork)*workSize + packInWork*16; + *(ulong2*)dst = tmp; + } + workCursor += nWorks; + + if (batch.nextExtends) { + batchIx += batch.nextJump; + tid -= 64; // Rotate threads so we use the next two warps for next batch struct. + if (tid < 0) tid += tn; + } else { + if (tid == 0) { + ncclShmem.batchIx = batchIx; + ncclShmem.nextBatchIx = (batch.nextJump == 0) ? -1 : batchIx + batch.nextJump; + ncclShmem.workType = (enum ncclDevWorkType)batch.workType; + ncclShmem.nWorks = workCursor; + ncclShmem.funcId = batch.funcId; + } + break; + } } } template -struct RunWorkElement { - __device__ void run(ncclWorkElem*) { +struct RunWorkColl { + __device__ void run(int tid, int tn, struct ncclDevWorkColl* work) { // Put NOT IMPLEMENTED behavior here. } }; template -struct RunWork { +struct RunWorkBatch; + +// Specialized for P2p in sendrecv.h +template +struct RunWorkBatch; + +// Specialized here for non-P2p (Coll and CollReg) +template +struct RunWorkBatch { // This __forceinline__ is necessary. The compiler was inserting a function call // here from the LL ncclKernel. - __device__ __forceinline__ void run(ncclWork *w) { - int wid = threadIdx.x / WARP_SIZE; - ncclWorkElem* we = w->header.type == ncclWorkTypeRegColl ? &w->regElems[0].elem : &w->elems[0]; - int stride = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) : sizeof(ncclWorkElem); + __device__ __forceinline__ void run() { + int tid = threadIdx.x; + int tn = blockDim.x; + + if (RedOpArg::ArgUsed) { + int nWorks = ncclShmem.nWorks; + for (int w=tid; w < nWorks; w += tn) { + struct ncclDevWorkColl* work = (ncclDevWorkColl*)(ncclShmem.workStorage + w*ncclShmem.workSize); + if (work->redOpArgIsPtr) { + work->redOpArg = RedOpArg::loadArg(reinterpret_cast(work->redOpArg)); + } + } + __syncthreads(); + } + #pragma unroll 1 - while ((char*)we + stride <= (char*)(w+1) && we->isUsed) { - if (wid < we->nWarps) { - RunWorkElement().run(we); + for (int w=0; w < ncclShmem.nWorks; w++) { + struct ncclDevWorkColl* work = (struct ncclDevWorkColl*)(ncclShmem.workStorage + w*ncclShmem.workSize); + if (w != 0) { + struct ncclDevWorkColl* workPrev = (struct ncclDevWorkColl*)(ncclShmem.workStorage + (w-1)*ncclShmem.workSize); + if (work->nWarps != workPrev->nWarps) __syncthreads(); } - we = (ncclWorkElem*)((char*)we + stride); + int subtn = work->nWarps*WARP_SIZE; + if (tid < subtn) RunWorkColl().run(tid, subtn, work); } } }; -static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) { - if (we->isUsed && we->redOpArgIsPtr) { - /* redOpArg is a pointer to the scalar value, so we'll dereference it - * here so that redOpArg holds the bits of the scalar going forward. - * The tricky thing is we don't know its type T since that's encoded in - * the funcIndex. Because it would be difficult to get sizeof(T) from - * funcIndex, we'll cheat and just dereference the largest possible size - * given the alignment of the pointer. We might be reading in more bytes - * than we need but that's harmless. 
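
The parameter-space constraint described in the comment above can be illustrated with a self-contained kernel. This is only a sketch under the assumption that a large argument struct is passed by value; BigArgs, its size and the test values are made up. The point is solely where the loads sit relative to the branch.

    #include <cstdio>
    #include <cuda_runtime.h>

    struct BigArgs { unsigned long long vals[64]; };     // passed by value (512 bytes)

    __global__ void pickOne(BigArgs args, const unsigned long long* gvals,
                            int fromParams, int ix, unsigned long long* out) {
      unsigned long long v;
      if (fromParams) {
        v = args.vals[ix];   // load can stay in kernel-parameter space (ld.param)
      } else {
        v = gvals[ix];       // ordinary global-memory load
      }
      // Anti-pattern to avoid, per the comment above: selecting a pointer first,
      //   const unsigned long long* p = fromParams ? &args.vals[ix] : &gvals[ix];
      // can make the compiler spill the whole argument struct to local memory.
      if (threadIdx.x == 0) *out = v;
    }

    int main() {
      BigArgs a = {}; a.vals[3] = 42;
      unsigned long long *g, *out;
      cudaMallocManaged(&g, 64 * sizeof(unsigned long long));
      cudaMallocManaged(&out, 2 * sizeof(unsigned long long));
      g[3] = 7;
      pickOne<<<1, 32>>>(a, g, /*fromParams=*/1, 3, &out[0]);
      pickOne<<<1, 32>>>(a, g, /*fromParams=*/0, 3, &out[1]);
      cudaDeviceSynchronize();
      printf("%llu %llu\n", out[0], out[1]);             // 42 7
      return 0;
    }
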
- */ - if (we->redOpArg%2 != 0) - we->redOpArg = *reinterpret_cast(we->redOpArg); - else if (we->redOpArg%4 != 0) - we->redOpArg = *reinterpret_cast(we->redOpArg); - else if (we->redOpArg%8 != 0) - we->redOpArg = *reinterpret_cast(we->redOpArg); - else - we->redOpArg = *reinterpret_cast(we->redOpArg); - } -} - -template -__device__ void ncclKernelMain(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { +template +__device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* args) { int tid = threadIdx.x; + int tn = blockDim.x; + + // Copy kernel args to shmem and then only read those. Otherwise the compiler + // will end up putting the args into thread local stack which is very wasteful. + if (tid < sizeof(ncclDevKernelArgs)/sizeof(uint32_t)) { + ((uint32_t*)&ncclShmem.args)[tid] = ((uint32_t*)args)[tid]; + } // To map blockId to channelId, we need the n'th set bit of channelMask which // is the inverse of counting the number of set bits among the the first n. - if (tid < WARP_SIZE) { - int x = tid; - if (channelMask & (1ull<channelMask & (1ull<channelMask & ((1ull<channels[channelId]; - bytes = sizeof(ncclDevChannel); + copyToShmem16(tid, dst, src, bytes); + } break; + case 1: + { // Get address of channel without incurring indirect load from ncclDevComm::channels + void* dst = &ncclShmem.channel; + void* src = &((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId]; + int bytes = sizeof(ncclDevChannel); static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn."); - break; - case 2: - dst = &ncclShmem.work; - src = workHead + blockIdx.x; - bytes = sizeof(ncclWork); - static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn."); - break; - default: - bytes = 0; - break; - } - if (bytes) copyToShmem16(tid%WARP_SIZE, dst, src, bytes); + copyToShmem16(tid-WARP_SIZE, dst, src, bytes); + } break; + default: + { int subtid = tid - 2*WARP_SIZE; + int subtn = tn - 2*WARP_SIZE; + loadWorkBatchToShmem(subtid, subtn, args, /*batchIx=*/blockIdx.x); + } break; } __syncthreads(); // publish ncclShmem - while (true) { - // Notify host that all fifo reads are complete. 
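
The warp-specialized staging used by ncclKernelMain (one warp copies the comm, another the channel, the remaining warps the first work batch, then a single __syncthreads publishes shared memory) can be sketched on its own. Everything below is illustrative: CommLike and ChannelLike are stand-ins for the real structs, and copy16 only hints at what copyToShmem16 does.

    #include <cstdio>
    #include <cuda_runtime.h>

    struct alignas(16) CommLike    { int rank, nRanks; char pad[56]; };   // 64 B stand-in
    struct alignas(16) ChannelLike { int ring[16]; };                     // 64 B stand-in

    // Each warp copies one structure in 16-byte packs, in the spirit of copyToShmem16.
    __device__ void copy16(int lane, void* dst, const void* src, int bytes) {
      for (int off = lane * 16; off < bytes; off += 32 * 16)
        *(ulong2*)((char*)dst + off) = *(const ulong2*)((const char*)src + off);
    }

    __global__ void stage(const CommLike* comm, const ChannelLike* channels,
                          int channelId, int* out) {
      __shared__ CommLike    shComm;
      __shared__ ChannelLike shChannel;
      int tid = threadIdx.x;
      switch (tid / 32) {
      case 0:  copy16(tid % 32, &shComm,    comm,                 sizeof(CommLike));    break;
      case 1:  copy16(tid % 32, &shChannel, &channels[channelId], sizeof(ChannelLike)); break;
      default: /* further warps would stage the first work batch here */                break;
      }
      __syncthreads();   // publish the shared structures to every warp
      if (tid == 0) *out = shComm.nRanks + shChannel.ring[0];
    }

    int main() {
      CommLike* comm; ChannelLike* chans; int* out;
      cudaMallocManaged(&comm,  sizeof(CommLike));
      cudaMallocManaged(&chans, 4 * sizeof(ChannelLike));
      cudaMallocManaged(&out,   sizeof(int));
      comm->nRanks = 8; chans[2].ring[0] = 100;
      stage<<<1, 96>>>(comm, chans, /*channelId=*/2, out);
      cudaDeviceSynchronize();
      printf("%d\n", *out);    // 108
      return 0;
    }
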
- if (tid == 0 && ncclShmem.work.header.isLast && ncclShmem.work.header.inFifo) { - *ncclShmem.channel.workFifoDone = ncclShmem.work.header.doneAcks; - } - - __syncwarp(); - if (ncclShmem.work.header.type == ncclWorkTypeColl) { - if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]); - } else if (ncclShmem.work.header.type == ncclWorkTypeRegColl) { - if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem.work.regElems[tid].elem); - } - __syncthreads(); + if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) { + // ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads() + ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed; + } - if (0 <= SpecializedFnId && ncclShmem.work.header.funcIndex == (unsigned)SpecializedFnId) { - SpecializedRunWork().run(&ncclShmem.work); + while (true) { + if (0 <= SpecializedFnId && ncclShmem.funcId == (unsigned)SpecializedFnId) { + SpecializedRunWorkBatch().run(); } else { - ncclDevFuncTable[ncclShmem.work.header.funcIndex](); + ncclDevFuncTable[ncclShmem.funcId](); } - int workIxNext = ncclShmem.work.header.workNext; + if (ncclShmem.nextBatchIx == -1) break; + int batchIx = ncclShmem.nextBatchIx; __syncthreads(); - if (ncclShmem.work.header.isLast) break; - - copyToShmem16(tid, &ncclShmem.work, workHead + workIxNext, sizeof(ncclWork)); + loadWorkBatchToShmem(tid, tn, args, batchIx); - { // Check whether the last operation was aborted and make sure all threads exit - int aborted = tid == 0 ? *comm->abortFlag : 0; - if (barrierReduceAny(aborted)) // publish ncclShmem.work - break; + // Check whether the last operation was aborted and make sure all threads exit + bool aborted = false; + if (tid == 0) aborted = *ncclShmem.comm.abortFlag; + aborted = barrier_red_or_aligned(aborted, 0); // publish ncclShmem.work + if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) { + // ncclShmem.workConsumed written by loadWorkBatchToShmem before barrier_red_or() + ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed; } + if (aborted) break; } } -__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); +__global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); __device__ void ncclDevFunc_Nop(); #define DEFINE_ncclDevKernel(suffix, coll, redop, ty, algo, proto, specializedFnId) \ - __global__ void ncclDevKernel_##suffix(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \ - ncclKernelMain, algo, proto>>(comm, channelMask, workHead); \ + __global__ void ncclDevKernel_##suffix(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { \ + ncclKernelMain, algo, proto>>(&args4K.args); \ } #define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto) \ __device__ void ncclDevFunc_##suffix() { \ - RunWork, algo, proto>().run(&ncclShmem.work); \ + RunWorkBatch, algo, proto>().run(); \ } #endif diff --git a/src/device/generate.py b/src/device/generate.py index 43de85d61..d0feee10f 100755 --- a/src/device/generate.py +++ b/src/device/generate.py @@ -233,6 +233,8 @@ def validate(coll, redop, ty, algo, proto): out('#include "device.h"\n') out("\n") + out("extern int const ncclDevFuncIdCount = %d;\n" % len(primary_funcs)) + # The mapping from function rows to valid primary function ids. 
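
A toy version of the funcId dispatch shape used above: a __device__ table of function pointers serves the generic kernel, while a compile-time SpecializedFnId lets a per-function kernel call its one function directly so it can be inlined. funcA, funcB and the table are invented; only the dispatch structure mirrors the code.

    #include <cstdio>
    #include <cuda_runtime.h>

    typedef void (*FuncPtr)();

    __device__ void funcA() { if (threadIdx.x == 0) printf("funcA\n"); }
    __device__ void funcB() { if (threadIdx.x == 0) printf("funcB\n"); }
    __device__ FuncPtr const funcTable[] = { funcA, funcB };

    template<int SpecializedFnId>
    __global__ void dispatch(int funcId) {
      if (0 <= SpecializedFnId && funcId == SpecializedFnId) {
        // Direct, inlinable call for the specialized kernel.
        if (SpecializedFnId == 0) funcA(); else funcB();
      } else {
        funcTable[funcId]();   // generic indirect dispatch through the table
      }
    }

    int main() {
      dispatch<-1><<<1, 32>>>(1);   // generic kernel: indirect call -> funcB
      dispatch<0><<<1, 32>>>(0);    // specialized kernel: direct call -> funcA
      cudaDeviceSynchronize();
      return 0;
    }
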
out("extern int const ncclDevFuncRowToId[] = {\n") index = 0 @@ -251,7 +253,7 @@ def validate(coll, redop, ty, algo, proto): cudart, _ = required_cuda(*kfn) sym = paste("_", "ncclDevKernel", *kfn) if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart) - out("__global__ void %s(struct ncclDevComm*, uint64_t, struct ncclWork*);\n" % sym) + out("__global__ void %s(ncclDevKernelArgs4K const);\n" % sym) if cudart != 0: out("#endif\n") out("\n") diff --git a/src/device/network/unpack/unpack.h b/src/device/network/unpack/unpack.h index 3bc910047..b213fbe39 100644 --- a/src/device/network/unpack/unpack.h +++ b/src/device/network/unpack/unpack.h @@ -10,7 +10,7 @@ #include "unpack_defs.h" #include "op128.h" -#include "align.h" +#include "bitops.h" #include "device.h" #include "common.h" @@ -35,16 +35,16 @@ inline __device__ void ncclNetDeviceUnpackSetup(void* ohandle, const int group, struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle; ncclShmem.groups[group].devicePlugin.unpack.g_meta[index] = handle->meta; ncclShmem.devicePlugin.unpack.bounce_buf = handle->bounce_buf; - ncclShmem.groups[group].devicePlugin.unpack.head = handle->head; + ncclShmem.groups[group].devicePlugin.unpack.head[index] = handle->head; } -inline __device__ void ncclNetDeviceIncrementHead(const int group) { - ncclShmem.groups[group].devicePlugin.unpack.head++; +inline __device__ void ncclNetDeviceIncrementHead(const int group, const int index) { + ncclShmem.groups[group].devicePlugin.unpack.head[index]++; } -inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group) { +inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group, const int index) { struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle; - handle->head = ncclShmem.groups[group].devicePlugin.unpack.head; + handle->head = ncclShmem.groups[group].devicePlugin.unpack.head[index]; } template @@ -183,7 +183,7 @@ inline __device__ void ncclNetDeviceUnpack( // Pack data from the internal iovec to the supplied flat srcs buffer using all the threads // + Src is necessary in the case of accessing the user buffer directly ncclNetDeviceUnpackInner(tid, tidInBlock, nworkers, group /* in case they need to use split warps shared memory partitioning*/, - ix, ncclShmem.groups[group].srcs[ix + Src], workSize, ncclShmem.groups[group].devicePlugin.unpack.head); + ix, ncclShmem.groups[group].srcs[ix + Src], workSize, ncclShmem.groups[group].devicePlugin.unpack.head[ix]); } } diff --git a/src/device/network/unpack/unpack_defs.h b/src/device/network/unpack/unpack_defs.h index 9be1c5e42..ecbed01fe 100644 --- a/src/device/network/unpack/unpack_defs.h +++ b/src/device/network/unpack/unpack_defs.h @@ -54,7 +54,7 @@ struct unpackShmem { struct unpackGroupShmem { int unpackNetDeviceIndexMask; // We store a single unpackNetDeviceIndex because only one peer can be network recv - uint64_t head; + uint64_t head[NET_UNPACK_MAX_NPEERS]; struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // head of handle to index into meta for meta copy }; diff --git a/src/device/prims_ll.h b/src/device/prims_ll.h index 5f5969099..4a6f9e267 100644 --- a/src/device/prims_ll.h +++ b/src/device/prims_ll.h @@ -44,10 +44,11 @@ class Primitives: inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } inline __device__ void barrier() { - if (nthreads == WARP_SIZE) + if (nthreads == WARP_SIZE) { __syncwarp(); - else - asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group)); + } else { + 
barrier_sync(15-group, nthreads); + } } uint32_t abort = 0; @@ -323,7 +324,8 @@ class Primitives: __device__ Primitives( const int tid, const int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, - uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0 + uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, + bool userBufReg=false, int stepSize_=0 ): redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group), diff --git a/src/device/prims_ll128.h b/src/device/prims_ll128.h index 698eea68e..9c7169545 100644 --- a/src/device/prims_ll128.h +++ b/src/device/prims_ll128.h @@ -50,7 +50,7 @@ class Primitives: inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; } inline __device__ void barrier() { - asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group)); + barrier_sync(15-group, nthreads); } uint32_t abort = 0; @@ -364,7 +364,8 @@ class Primitives: __device__ Primitives( const int tid, const int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, - uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0 + uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclDevWorkColl* e = nullptr, + bool userBufReg=false, int stepSize_=0 ): redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), diff --git a/src/device/prims_simple.h b/src/device/prims_simple.h index 2431c2fdd..c02657038 100644 --- a/src/device/prims_simple.h +++ b/src/device/prims_simple.h @@ -23,7 +23,7 @@ class Primitives< ConnFifoEnabled = 0x100, DirectWrite = 0x200, DirectRead = 0x400, - ThreadsSynced = 0x800, + // 0x800 is free to use NvlsMinPolling = 0x1000, NetDeviceUnpack = 0x2000, AnyNetDeviceUnpack = 0x4000, @@ -44,53 +44,38 @@ class Primitives< uint64_t *connStepPtr; uint64_t connStepCache; // Cache last seen value of (*connStepPtr) int connStepSize; // Connection step size - void* mhandle; void* netDeviceHandle; // Don't use barrier 0 as it's used by the final sync __device__ void barrier() { - flags |= ThreadsSynced; if (nthreads == WARP_SIZE) __syncwarp(); else { int bar = 15-group; - asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nthreads) : "memory"); + barrier_sync(bar, nthreads); } } __device__ void subBarrier() { if (nworkers == WARP_SIZE) __syncwarp(); else { - int bar = (nworkers==nthreads ? 15 : 8) - group; - asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nworkers) : "memory"); + int bar = 15-group - (nworkers!=nthreads ? 1 : 0); + barrier_sync(bar, nworkers); } } __device__ bool barrierAny(int vote) { - flags |= ThreadsSynced; if (nthreads == WARP_SIZE) { return __any_sync(~0u, vote); } else { - int ans, bar = 15-group; - asm volatile( - "{ .reg .pred p;" - " setp.ne.s32 p, %1, 0;" - " bar.red.or.pred p, %2, %3, p; " - " selp.s32 %0, 1, 0, p; }" - : "=r"(ans) : "r"(vote), "r"(bar), "r"(nthreads) : "memory"); - return ans != 0; + int name = 15-group; + return barrier_red_or(vote, name, nthreads); } } __device__ bool subBarrierAny(int vote) { if (nworkers == WARP_SIZE) { return __any_sync(~0u, vote); } else { - int ans, bar = (nworkers==nthreads ? 
15 : 8) - group; - asm volatile( - "{ .reg .pred p;" - " setp.ne.s32 p, %1, 0;" - " bar.red.or.pred p, %2, %3, p; " - " selp.s32 %0, 1, 0, p; }" - : "=r"(ans) : "r"(vote), "r"(bar), "r"(nworkers) : "memory"); - return ans != 0; + int name = 15-group - (nworkers!=nthreads ? 1 : 0); + return barrier_red_or(vote, name, nworkers); } } @@ -164,8 +149,8 @@ class Primitives< else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } - if ((flags & (AnyNetDeviceUnpack)) && (flags & (Recv*RoleWaitRecv))) { - ncclNetDeviceIncrementHead(group); + if (flags & NetDeviceUnpack) { + ncclNetDeviceIncrementHead(group, index); } step += StepPerSlice; } @@ -436,7 +421,7 @@ class Primitives< } } - __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) { + __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) { if (flags & (RoleWaitRecv|RolePostRecv)) { auto *conn = &peer->recv[connIndex]; if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) { @@ -488,7 +473,7 @@ class Primitives< } } - __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) { + __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) { if (flags & (RoleWaitSend|RolePostSend)) { auto *conn = &peer->send[connIndex]; step = conn->step; @@ -538,13 +523,13 @@ class Primitives< __device__ Primitives( int tid, int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, - uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0 + uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,bool userBufReg=false, int stepSize_=0 ): tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group), stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) { // For send operations, we need an extra warp to overlap the threadfence and the copy - this->nworkers = nthreads - (MaxSend > 0 && nthreads-WARP_SIZE >= 64 ? WARP_SIZE : 0); + this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0); int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++; @@ -572,7 +557,7 @@ class Primitives< loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e); loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e); - if (p2p && p2p->reg) flags |= UserBufferMode; + if (userBufReg) flags |= UserBufferMode; if (barrierAny(flags & NetDeviceUnpack)) { flags |= AnyNetDeviceUnpack; @@ -584,13 +569,12 @@ class Primitives< } } - setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e); + setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e); } __device__ ~Primitives() { // Ensure ncclShmem.groups[].send/recvConns are available - if (!(flags & ThreadsSynced)) - barrier(); + barrier(); // Save steps for the next operation if (flags & (RolePostSend|RolePostRecv)) { auto *conns = (flags & RolePostSend) ? 
ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns; @@ -606,8 +590,8 @@ class Primitives< while (*ptr != -1) if (checkAbort(spins)) break; } - if ((flags & (AnyNetDeviceUnpack)) && (flags & (RoleWaitRecv))) { - ncclNetDeviceSaveHead(netDeviceHandle, group); + if (flags & NetDeviceUnpack) { + ncclNetDeviceSaveHead(netDeviceHandle, group, index); } // Make sure all threads are done writing back conn->step and done using @@ -615,7 +599,7 @@ class Primitives< barrier(); } - __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) { + __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* e) { if (tid==0) { ncclShmem.groups[group].userInput = (void*)inputBuf; ncclShmem.groups[group].userOutput = (void*)outputBuf; @@ -625,7 +609,7 @@ class Primitives< bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite)); bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched) bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead) || (flags == (flags|RoleWaitRecv|NvlsDirectRead)); // receiver accepts direct buffer - int regUsed = e != nullptr ? e->elem.regUsed : 0; + int regUsed = e != nullptr ? e->coll.regUsed : 0; if (Direct && recvProvider) { int spins = 0; diff --git a/src/device/reduce.h b/src/device/reduce.h index 43cae213b..91cdaeb25 100644 --- a/src/device/reduce.h +++ b/src/device/reduce.h @@ -10,22 +10,21 @@ namespace { template - __device__ __forceinline__ void runRing(ncclWorkElem *args) { - const int tid = threadIdx.x; - const int nthreads = (int)args->nWarps * WARP_SIZE; + __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; const int nranks = ncclShmem.comm.nRanks; const int rank = ncclShmem.comm.rank; const int prevRank = ring->userRanks[nranks-1]; - const int root = args->root; - const size_t chunkCount = args->chunkCount; - const size_t channelCount = args->workCount; - const size_t gridOffset = args->workOffset; + const int root = work->root; + size_t chunkCount; + size_t channelCount; + size_t gridOffset; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); size_t offset; int nelem; Primitives, 0, Proto, 0> - prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); + prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg); if (prevRank == root) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { @@ -52,23 +51,23 @@ namespace { } template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { using Proto = ProtoSimple; - runRing(args); + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct 
ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; diff --git a/src/device/reduce_kernel.h b/src/device/reduce_kernel.h index cbf774338..9e78da98a 100644 --- a/src/device/reduce_kernel.h +++ b/src/device/reduce_kernel.h @@ -37,6 +37,7 @@ template struct FuncSum { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; }; template struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; }; + template struct FuncMinMax { using EltType = T; @@ -47,9 +48,30 @@ struct FuncMinMax { isMinNotMax = (opArg&1)==0; } }; + template struct FuncPreMulSum; template struct FuncSumPostDiv; +//////////////////////////////////////////////////////////////////////////////// +// Trait class for handling the reduction argument. + +template +struct RedOpArg { // default case: no argument + static constexpr bool ArgUsed = false; + __device__ static uint64_t loadArg(void *ptr) { return 0; } +}; + +template +struct RedOpArg> { + static constexpr bool ArgUsed = true; + __device__ static uint64_t loadArg(void *ptr) { + union { uint64_t u64; T val; }; + u64 = 0; + val = *(T*)ptr; + return u64; + } +}; + //////////////////////////////////////////////////////////////////////////////// // Trait classes for reduction functions. Given a function (FuncSum, etc.) // and a number of elements in a pack, will reduce, preOp, or postOp a pack @@ -356,6 +378,17 @@ struct Apply_PostOp { //////////////////////////////////////////////////////////////////////////////// // FuncPreMulSum +template +struct RedOpArg> { + static constexpr bool ArgUsed = true; + __device__ static uint64_t loadArg(void *ptr) { + union { uint64_t u64; T val; }; + u64 = 0; + val = *(T*)ptr; + return u64; + } +}; + // General definition for all integral types, float, and double. template struct FuncPreMulSum { @@ -486,6 +519,14 @@ struct Apply_PreOp, /*EltPerPack=*/1> { //////////////////////////////////////////////////////////////////////////////// // FuncSumPostDiv +template +struct RedOpArg> { + static constexpr bool ArgUsed = true; + __device__ static uint64_t loadArg(void *ptr) { + return *(uint64_t*)ptr; + } +}; + template::value> struct FuncSumPostDiv_IntOnly; @@ -658,7 +699,7 @@ struct Apply_LoadMultimem { static constexpr bool IsFloat = IsFloatingPoint::value; static constexpr int BigPackSize = IsFloat && IsSum && sizeof(T) < 8 ? 16 : - IsFloat && IsSum ? 8 : + IsFloat && IsSum ? sizeof(T) : IsFloat && IsMinMax && sizeof(T)==2 ? 16 : !IsFloat && (IsSum||IsMinMax) && sizeof(T)>=4 ? 
sizeof(T) : /*multimem.ld_reduce not supported:*/ 0; diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h index d0b52494e..cf068ff55 100644 --- a/src/device/reduce_scatter.h +++ b/src/device/reduce_scatter.h @@ -10,23 +10,22 @@ namespace { template - __device__ __forceinline__ void runRing(ncclWorkElem *args) { - const int tid = threadIdx.x; - const uint32_t nthreads = (uint32_t)args->nWarps * WARP_SIZE; + __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; int const *ringRanks = ring->userRanks; - const size_t chunkCount = args->chunkCount; const int nranks = ncclShmem.comm.nRanks; - size_t channelCount = args->workCount; - size_t gridOffset = args->workOffset; + size_t count; + size_t gridOffset; + size_t channelCount; + size_t chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); size_t offset; size_t dataOffset; - size_t count = args->count; uint32_t nelem; int rankDest; Primitives, 0, Proto, 0> - prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); + prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { nelem = min(chunkCount, channelCount - elemOffset); @@ -54,56 +53,56 @@ namespace { } template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { using Proto = ProtoSimple; - runRing(args); + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - const int tid = threadIdx.x; +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; - const size_t chunkCount = args->chunkCount; - const size_t count = args->count; + size_t count; + size_t gridOffset; + size_t channelCount; + size_t chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); const int rank = ncclShmem.comm.rank; const int nranks = ncclShmem.comm.nRanks; - size_t gridOffset = args->workOffset; - size_t channelCount = args->workCount; size_t offset; int nelem; /* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync; * if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth * and the rest are allocated to scatter. */ - const int nThreadsReduce = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE); - const int nThreadsScatter = args->regUsed ? WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce); + const int nThreadsReduce = work->regUsed ? 
(NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE); + const int nThreadsScatter = work->regUsed ? WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce); const int tidEndScatter = nThreadsScatter; const int tidEndReduce = tidEndScatter + nThreadsReduce; - if (!args->regUsed) { + if (!work->regUsed) { if (tid < tidEndScatter) { // Scatter using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, - args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); @@ -113,8 +112,8 @@ struct RunWorkElement; Primitives, /*Direct=*/0, Proto, 0> - prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff, - args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); + prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, work->recvbuff, + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); @@ -127,7 +126,7 @@ struct RunWorkElement; Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL, - args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { prims.scatter(0, 0, 0, 0, -1, 0); } @@ -138,8 +137,8 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> - prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, args->recvbuff, - args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args); + prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, work->recvbuff, + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { size_t outOffset = gridOffset + elemOffset; size_t inpOffset = outOffset + rank * count; @@ -155,10 +154,10 @@ struct RunWorkElement -struct RunWorkElement { +struct RunWorkColl { template struct Scatterer { - struct ncclWorkElem* args; + struct ncclDevWorkColl* work; int chunkSize; ssize_t railGridOffset; @@ -173,11 +172,11 @@ struct RunWorkElementnHeads; - int bid = args->bid; - void* inbuf = (void*)args->sendbuff; - ssize_t sizePerRank = args->count; + int part = ncclShmem.channelId - work->channelLo; + void* inbuf = (void*)work->sendbuff; + ssize_t sizePerRank = work->collnet.count; - ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank); + ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank); ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank); int railAllSize = railAllEnd - railAllBeg; if (tid < nDsts) dstSizes[tid] = railAllSize; @@ -204,7 +203,7 @@ struct RunWorkElement - (tid, tn, args->redOpArg, &args->redOpArg, false, + (tid, tn, work->redOpArg, &work->redOpArg, false, /*nSrcs=*/1+nSrcs, [=]__device__(int s) { return s==0 ? 
(T*)inbuf + userOneBeg : (T*)srcPtrs[s-1] + railAllOffset; @@ -223,23 +222,23 @@ struct RunWorkElementnChannels; + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + const int part = ncclShmem.channelId - work->channelLo; + const int nChannels = work->channelHi - work->channelLo + 1; struct ncclDirect* direct = &ncclShmem.channel.collnetDirect; int const &nNodes = ncclShmem.comm.nNodes; - ssize_t chunkSize = int(args->chunkCount); - ssize_t sizePerRank = args->count; + ssize_t chunkSize = int(work->collnet.chunkCount); + ssize_t sizePerRank = work->collnet.count; if (direct->out == -1) __trap(); bool isMultiRail = (direct->nHeads > 1); int nWarps1 = (isMultiRail ? 2 : 0); int nWarps2 = (isMultiRail ? 2 : 1); int nWarps3 = 1; - float denom = float(args->nWarps)/float(nWarps1+nWarps2+nWarps3); + float denom = float(work->nWarps)/float(nWarps1+nWarps2+nWarps3); nWarps3 = int(denom*nWarps3); nWarps2 = int(denom*nWarps2); - nWarps1 = args->nWarps - (nWarps2+nWarps3); + nWarps1 = work->nWarps - (nWarps2+nWarps3); using Proto = ProtoSimple<1, 1>; @@ -248,13 +247,13 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> prims(tid, tn, nullptr, direct->heads+1, nullptr, nullptr, - args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1); + work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1); for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) { Scatterer scat; - scat.args = args; + scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.process(scat); + prims.template process(scat); } return; } @@ -262,7 +261,7 @@ struct RunWorkElementregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == 0) { int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps); @@ -272,13 +271,13 @@ struct RunWorkElement send to network Primitives, /*Direct=*/0, Proto, 0> prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr, - args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); + work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { Scatterer scat; - scat.args = args; + scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.process(scat); + prims.template process(scat); } } return; @@ -287,7 +286,7 @@ struct RunWorkElementregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == 0) { int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps); @@ -296,10 +295,10 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> - prims(tid, tn, &direct->out, nullptr, nullptr, args->recvbuff, - args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0); + prims(tid, tn, &direct->out, nullptr, nullptr, work->recvbuff, + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0); for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { - ssize_t railAllBeg = railGridOffset + args->bid * chunkSize; + ssize_t railAllBeg = railGridOffset + part * chunkSize; ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank); ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank; ssize_t railOneEnd = railOneBeg + sizePerRank; diff --git a/src/device/sendrecv.h 
b/src/device/sendrecv.h index 347ac78c5..7774202a1 100644 --- a/src/device/sendrecv.h +++ b/src/device/sendrecv.h @@ -9,83 +9,159 @@ #include "primitives.h" template -struct RunWork { +struct RunWorkBatch { + static_assert(sizeof(T)==1, "SendRecv only works on single byte types T."); + template - __device__ void runSend(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) { - void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); - ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); - if (args->peer == ncclShmem.comm.rank) { - struct ncclWorkElemP2p* recvArgs = args-1; - void* recvBuff = reinterpret_cast(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32); - if (buff != recvBuff) { - reduceCopy - (tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count); - } - } else { - int chunkSize = args->chunkSize/sizeof(T); - if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; - int const peer = args->peer; - Primitives, 1, Proto, 1> prims - (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T)); - size_t offset = 0; - do { - int nelem = min(size_t(chunkSize), count-offset); - prims.directSend(offset, offset, nelem); - offset += nelem; - } while(offset < count && args->reg == 0); - } + __device__ void runSend(int tid, int tn, int group, struct ncclDevWorkP2p* work) { + size_t bytes = work->sendBytes; + int chunkSize = u32fp8Decode(work->sendChunkSize_u32fp8); + Primitives, 1, Proto, 1> + prims(tid, tn, nullptr, &work->sendRank, work->sendAddr, nullptr, + /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, + /*userBufferMode=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize); + size_t cursor = 0; + do { + int n = min(size_t(chunkSize), bytes-cursor); + prims.directSend(cursor, cursor, n); + cursor += n; + } while (cursor < bytes && work->sendRegistered == 0); } template - __device__ void runRecv(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) { - if (args->peer != ncclShmem.comm.rank) { - void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); - ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); - int chunkSize = args->chunkSize/sizeof(T); - if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize - int const peer = args->peer; - Primitives, 1, Proto, 1> prims - (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T)); - size_t offset = 0; - do { - int nelem = min(size_t(chunkSize), count-offset); - prims.directRecv(offset, nelem); - offset += nelem; - } while(offset < count && args->reg == 0); - } + __device__ void runRecv(int tid, int tn, int group, struct ncclDevWorkP2p* work) { + size_t bytes = work->recvBytes; + int chunkSize = u32fp8Decode(work->recvChunkSize_u32fp8); + Primitives, 1, Proto, 1> + prims(tid, tn, &work->recvRank, nullptr, nullptr, work->recvAddr, + /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, + /*userBufferMode=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize); + size_t cursor = 0; + do { + int n = min(size_t(chunkSize), bytes-cursor); + prims.directRecv(cursor, n); + cursor += n; + } while (cursor < bytes && work->recvRegistered == 0); } - __device__ __forceinline__ void run(ncclWork *work) { - struct ncclWorkElemP2p* args = work->p2pElems; - int ngroups = args->ngroups; - int tid = 
threadIdx.x; - int wid = tid / WARP_SIZE; - // This has to work even for groups of 2.5 warps (which is 8 groups, and means 3 - // warps for send, 2 warps for recv). - // warpStarts were rounded thanks to int division, but for group number we need to round the other way around - // So we mirror wid then mirror again the group. - #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE) - uint8_t group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS; - args += group; - tid -= args->warpStart * WARP_SIZE; - int nthreads = args->nWarps * WARP_SIZE; - - if (args->p2pType == ncclWorkP2pTypeUnused) return; - if (tid >= nthreads || args->peer == -1) return; - - // Select Proto here - // This is to allow the same kernel to run multiple primitives on different warps (thread groups) - if ((group%2) == 0) { - if (args->proto == NCCL_PROTO_LL) { - runRecv(tid, nthreads, group, args); + __device__ __forceinline__ void run() { + const int tid = threadIdx.x; + const int tn = blockDim.x; + const int wid = tid/WARP_SIZE; + const int nWarps = tn/WARP_SIZE; + const int lane = tid%WARP_SIZE; + + struct Shared { + uint32_t workSendMask; // bitmasks of which work indices have send/recv + uint32_t workRecvMask; + }; + Shared* shared = (Shared*)ncclScratchForWarp(0); + + struct ncclDevWorkP2p* works = (ncclDevWorkP2p*)ncclShmem.workStorage; + int nWorks = ncclShmem.nWorks; + + if (wid == 0) { + // Modify the memory range of each work[] to reflect this channel's + // partition of the work. Since integer divides are very heavy it's + // best to do them all in one warp. + int workIx = lane%16; + int isSend = lane < 16 ? 0 : 1; + bool hasWork = false; + if (workIx < nWorks) { + struct ncclDevWorkP2p* work = &works[workIx]; + size_t bytes = isSend ? work->sendBytes : work->recvBytes; + int nParts = isSend ? work->nSendChannels : work->nRecvChannels; + int part = ncclP2pChannelToPart(work->nP2pChannels, work->channelBase, ncclShmem.channelId); + hasWork = (part < nParts); + if (nParts != 0) { + size_t partBeg, partEnd; + ncclP2pPartBounds(nParts, part, bytes, &partBeg, &partEnd); + (isSend ? work->sendAddr : work->recvAddr) = (char*)(isSend ? work->sendAddr : work->recvAddr) + partBeg; + (isSend ? work->sendBytes : work->recvBytes) = partEnd - partBeg; + } + } + uint32_t mask = __ballot_sync(~0u, hasWork); + if (lane == 0) { + shared->workSendMask = mask>>16; + shared->workRecvMask = mask & 0xffff; + } + } + + // The fastest way to compute a warp uniform division x/y in [0,32) is to + // use each lane to guess a solution and count the ones that don't exceed + // the numerator: + // __popc(__ballot_sync(~0u, y*(lane+1) <= x)) + // That takes 1/3 the time of standard division and about 3/4 the time of + // approximate floating point division: + // __float2int_rd(__fdividef(float(x),float(y))). + + // nWarpPerWork = nWarps/nWorks + int nWarpPerWork = __popc(__ballot_sync(~0u, nWorks*(lane+1) <= nWarps)); + int nRecvWarpPerWork = nWarpPerWork<=4 ? nWarpPerWork/2 : (nWarpPerWork-1)/2; + int nSendWarpPerWork = nWarpPerWork<=4 ? nRecvWarpPerWork : nRecvWarpPerWork+1; + // This might reduce nWarpPerWork which is probably desirable. It is better + // to have a balanced number of reading and writing threads even if that + // leaves warps unused. 
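The ballot trick above is worth a concrete illustration. A minimal standalone sketch (editorial, not from the patch), assuming all 32 lanes are active, x and y are warp-uniform and non-negative, and the quotient is known to be below 32:

__device__ __forceinline__ int warpUniformDivLt32(int x, int y) {
  int lane = threadIdx.x % 32;
  // Lane i tests candidate quotient i+1; the number of lanes that pass is x/y.
  return __popc(__ballot_sync(~0u, y*(lane+1) <= x));
}
// Example: x=10, y=3 -> lanes 0..2 pass (3, 6, 9 <= 10), lane 3 fails (12 > 10), result 3.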
+ nWarpPerWork = nSendWarpPerWork + nRecvWarpPerWork; + // The work index this warp belongs to: workIx = wid/nWarpPerWork + int workIx = __popc(__ballot_sync(~0u, (lane+1)*nWarpPerWork <= wid)); + + __syncthreads(); // Wait for works[] and shared->* to be updated by warp=0 + + uint32_t workSendMask = shared->workSendMask; + uint32_t workRecvMask = shared->workRecvMask; + + __syncthreads(); // release scratch space used by shared->* + if (nWorks <= workIx) return; + + // Thread range for whole work (send & recv combined) + int subtid = tid - workIx*nWarpPerWork*WARP_SIZE; + int subtn = nWarpPerWork*WARP_SIZE; + + // A send primtive of sufficient size requires 2 cuda barrier ids. + constexpr int nSendWarpsForExtraGroup = NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE/WARP_SIZE; + // Count up all group ids used below this workIx: + int group, extra; + // Each recv gets one group id: + group = __popc(workRecvMask & ((1<= nSendWarpsForExtraGroup) ? 1 : 0; + group += __popc((workSendMask & workRecvMask) & ((1<= nSendWarpsForExtraGroup) ? 1 : 0; + group += __popc((workSendMask & ~workRecvMask) & ((1<>workIx); + bool hasRecv = 1 & (workRecvMask>>workIx); + bool isCopy = work->sendRank == ncclShmem.comm.rank; + bool isSend = !hasRecv || (hasSend && subtid < nSendWarpPerWork*WARP_SIZE); + + if (!isCopy && hasSend && hasRecv) { + // Translate thread ids to reflect just this send or recv as opposed to whole work. + if (isSend) { + subtn = nSendWarpPerWork*WARP_SIZE; + } else { + subtid -= nSendWarpPerWork*WARP_SIZE; + subtn = nRecvWarpPerWork*WARP_SIZE; + group += 1 + (nSendWarpPerWork >= nSendWarpsForExtraGroup ? 1 : 0); + } + } + + if (isCopy) { + reduceCopy + (subtid, subtn, 0, nullptr, false, 1, &work->sendAddr, 1, &work->recvAddr, (ssize_t)work->sendBytes); + } else if (isSend) { + if (work->sendProtoLL) { + runSend(subtid, subtn, group, work); } else { - runRecv>(tid, nthreads, group, args); + runSend>(subtid, subtn, group, work); } } else { - if (args->proto == NCCL_PROTO_LL) { - runSend(tid, nthreads, group, args); + if (work->recvProtoLL) { + runRecv(subtid, subtn, group, work); } else { - runSend>(tid, nthreads, group, args); + runRecv>(subtid, subtn, group, work); } } } diff --git a/src/enqueue.cc b/src/enqueue.cc index af57f1be4..0e07e3f25 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -12,25 +12,12 @@ #include "channel.h" #include "cudawrap.h" #include "transport.h" -#include + #include // std::memcpy #include // PRIx64 NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0); -static ncclResult_t initCollWorkElem(struct ncclInfo* collInfo, struct ncclWorkElem* work); -static ncclResult_t setCollWorkElem(uint64_t workCount, uint64_t workOffset, size_t lastChunkCount, struct ncclWorkElem* work); -static ncclResult_t initCollWorkElemReg(struct ncclComm* comm, struct ncclWorkElem* work, struct ncclChannel* channel, ncclRegBufferType regBufType, void* regBufSend[], void* regBufRecv[], struct ncclWorkElemReg* workElemReg); -static ncclResult_t computeCollChunkInfo(struct ncclInfo* collInfo, size_t nBytes, int nChannels); -static ncclResult_t initCollProxyOp(struct ncclInfo* collInfo, int channelId, uint64_t opCount, uint32_t nsteps, struct ncclProxyOp* proxyOp); -static ncclResult_t getTunerInfo(struct ncclInfo* collInfo, int collNetSupport, int nvlsSupport, int numPipeOps); -static ncclResult_t topoGetAlgoInfo(struct ncclInfo* collInfo, int collNetSupport, int nvlsSupport, int numPipeOps); -static ncclResult_t getChannnelThreadInfo(struct ncclInfo* collInfo); -static ncclResult_t 
computeCollWorkFunc(struct ncclInfo* collInfo); -static ncclResult_t getPatternInfo(struct ncclInfo* collInfo); -static ncclResult_t getLoopInfo(struct ncclInfo* collInfo); -static ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetSupport); - // Returns maximum kernel stack size of all CUDA kernels ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { ncclResult_t result = ncclSuccess; @@ -64,114 +51,30 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { return result; } -/*****************************************************************************/ -/* Launch system : synchronization and CUDA kernel launch */ -/*****************************************************************************/ +//////////////////////////////////////////////////////////////////////////////// +// Data movement metrics. -static void appendWorkElemColl( - struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, - int funcIndex, struct ncclWorkElem const *elem) { - struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; - struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); - if (q && funcIndex == q->work.header.funcIndex - && elem->nWarps == q->work.elems[0].nWarps - && chan->nWorkElem < NCCL_MAX_WORK_ELEMENTS - && ncclWorkTypeColl == q->work.header.type) { - int e = chan->nWorkElem++; - q->work.elems[e] = *elem; // C++ struct assignment - return; - } - q = ncclMemoryStackAlloc(&comm->memScoped); - q->work.header.type = ncclWorkTypeColl; - q->work.header.funcIndex = funcIndex; - q->work.elems[0] = *elem; // C++ struct assignment - chan->nWorkElem = 1; - chan->nWork += 1; - ncclIntruQueueEnqueue(&chan->workQueue, q); +static inline int ncclFuncTrafficPerByte(ncclFunc_t func, int nRanks) { + switch (func) { + case ncclFuncAllReduce: return 2; + case ncclFuncAllGather: return nRanks; + case ncclFuncReduceScatter: return nRanks; + default: return 1; + } } - -static void appendWorkElemColl( - struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, - int funcIndex, struct ncclWorkElemReg const *elem) { - struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; - struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); - if (q && funcIndex == q->work.header.funcIndex - && elem->elem.nWarps == q->work.regElems[0].elem.nWarps - && chan->nWorkElem < NCCL_MAX_WORK_ELEMENTS_REG - && ncclWorkTypeRegColl == q->work.header.type) { - int e = chan->nWorkElem++; - q->work.regElems[e] = *elem; // C++ struct assignment - q->work.regElems[e].elem.isUsed = 1; - return; - } - q = ncclMemoryStackAlloc(&comm->memScoped); - q->work.header.type = ncclWorkTypeRegColl; - q->work.header.funcIndex = funcIndex; - q->work.regElems[0] = *elem; // C++ struct assignment - q->work.regElems[0].elem.isUsed = 1; - chan->nWorkElem = 1; - chan->nWork += 1; - ncclIntruQueueEnqueue(&chan->workQueue, q); +static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count) { + return func == ncclFuncReduceScatter ? 
nRanks*count : count; } - -static void finishWorkP2p(struct ncclWork* work) { - int nElem = 0; - for (int e=0; e < NCCL_MAX_WORK_ELEMENTS_P2P; e++) { - if (work->p2pElems[e].p2pType != ncclWorkP2pTypeUnused) - nElem = e+1; - } - int nGroup = 1; - while (nGroup < nElem) nGroup *= 2; - int nWarp = 1; - while (nWarp*nGroup <= (NCCL_MAX_NTHREADS/WARP_SIZE)/2) nWarp *= 2; - for (int i=0; i < nGroup; i++) { - work->p2pElems[i].ngroups = nGroup; - work->p2pElems[i].warpStart = i*(NCCL_MAX_NTHREADS/WARP_SIZE)/nGroup; - int extraWarp = nWarp >= 2 ? i%2 : 0; - work->p2pElems[i].nWarps = nWarp + extraWarp; - } +static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) { + return func == ncclFuncAllGather ? nRanks*count : count; } - -static void finishWork(struct ncclWork* work) { - if (work->header.type == ncclWorkTypeP2p) { - finishWorkP2p(work); - } +static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) { + return func == ncclFuncAllGather || func == ncclFuncReduceScatter ? nRanks*count : count; } -static void appendWorkElemP2p( - struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, - struct ncclWorkElemP2p const *elem, bool fuseOk - ) { - int funcIndex = ncclDevFuncId_P2p(); - struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; - struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); - if (q && funcIndex == q->work.header.funcIndex) { - if (!fuseOk) goto NewWork; - if (chan->p2pTailElem[elem->p2pType-1] < NCCL_MAX_WORK_ELEMENTS_P2P) { - for (int e = -2 + chan->p2pTailElem[elem->p2pType-1]; e >= 0; e -= 2) { - // Can't have multiple elements of the same ncclWork communicate with the - // same peer otherwise they would attempt to use that connection concurrently. - if (q->work.p2pElems[e].peer == elem->peer) - goto NewWork; - } - int e = chan->p2pTailElem[elem->p2pType-1]; - q->work.p2pElems[e] = *elem; // C++ struct assignment - chan->p2pTailElem[elem->p2pType-1] += 2; - return; - } - NewWork: - finishWorkP2p(&q->work); - } - q = ncclMemoryStackAlloc(&comm->memScoped); - q->work.header.type = ncclWorkTypeP2p; - q->work.header.funcIndex = ncclDevFuncId_P2p(); - chan->p2pTailElem[ncclWorkP2pTypeRecv-1] = 0; - chan->p2pTailElem[ncclWorkP2pTypeSend-1] = 1; - q->work.p2pElems[chan->p2pTailElem[elem->p2pType-1]] = *elem; // C++ struct assignment - chan->p2pTailElem[elem->p2pType-1] += 2; - chan->nWork += 1; - ncclIntruQueueEnqueue(&chan->workQueue, q); -} +/*****************************************************************************/ +/* Launch system : synchronization and CUDA kernel launch */ +/*****************************************************************************/ static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclProxyOp* op) { bool needed = true; @@ -179,459 +82,212 @@ static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelP if (needed) { struct ncclProxyOp* q = ncclMemoryPoolAlloc(&comm->memPool_ncclProxyOp, &comm->memPermanent); *q = *op; // C++ struct assignment - ncclIntruQueueEnqueue(&plan->channels[op->channelId].proxyOpQueue, q); + ncclIntruQueueEnqueue(&comm->planner.wipPlan.channels[op->channelId].proxyOpQueue, q); } return ncclSuccess; } -static ncclResult_t computeCollSteps(struct ncclInfo* collInfo, size_t workCount, uint32_t* steps) { - struct ncclComm* comm = collInfo->comm; - if (collInfo->coll == ncclFuncAllReduce) { - if (collInfo->algorithm == NCCL_ALGO_RING) - *steps = DIVUP(workCount, comm->nRanks * 
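To make the data-movement helpers above concrete, here is a small editorial example (not from the patch; datatype, count and nRanks stand for the obvious inputs) for an AllGather where count is the per-rank element count:

size_t elementSize = ncclTypeSize(datatype);                                            // e.g. 4 for ncclFloat
size_t sendBytes = elementSize*ncclFuncSendCount(ncclFuncAllGather, nRanks, count);     // count elements sent
size_t recvBytes = elementSize*ncclFuncRecvCount(ncclFuncAllGather, nRanks, count);     // nRanks*count elements received
size_t traffic   = elementSize*count*ncclFuncTrafficPerByte(ncclFuncAllGather, nRanks); // nRanks bytes of traffic per input byte
// For ncclFuncAllReduce the same computation yields traffic == 2*count*elementSize.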
collInfo->chunkCount) * (comm->nRanks - 1) * 2 * collInfo->chunkSteps; - else if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT) - *steps = DIVUP(workCount, comm->channels[0].collnetDirect.nHeads * collInfo->chunkCount) * collInfo->chunkSteps; - else if (collInfo->algorithm == NCCL_ALGO_NVLS || collInfo->algorithm == NCCL_ALGO_NVLS_TREE) - *steps = DIVUP(workCount, comm->channels[0].nvls.nHeads * collInfo->chunkCount) * collInfo->chunkSteps; - else - *steps = DIVUP(workCount, collInfo->chunkCount) * collInfo->chunkSteps; - } else if (collInfo->coll == ncclFuncReduceScatter) { - if (collInfo->algorithm == NCCL_ALGO_RING) - *steps = DIVUP(workCount, collInfo->chunkCount) * (comm->nRanks - 1) * collInfo->chunkSteps; - else - *steps = DIVUP(workCount, collInfo->chunkCount) * collInfo->chunkSteps; - } else if (collInfo->coll == ncclFuncAllGather) { - if (collInfo->algorithm == NCCL_ALGO_RING) - *steps = DIVUP(workCount, collInfo->chunkCount) * (comm->nRanks - 1) * collInfo->chunkSteps; - else - *steps = DIVUP(workCount, collInfo->chunkCount) * collInfo->chunkSteps; - } else { - *steps = DIVUP(workCount, collInfo->chunkCount) * collInfo->chunkSteps; - } - return ncclSuccess; -} - -static ncclResult_t computeCollAlignCount(struct ncclInfo* collInfo, size_t* alignCount) { - if (collInfo->protocol == NCCL_PROTO_SIMPLE) { - *alignCount = NCCL_SIMPLE_ALIGNMENT / ncclTypeSize(collInfo->datatype); - } else if (collInfo->protocol == NCCL_PROTO_LL128) { - *alignCount = NCCL_LL128_ALIGNMENT_PER_WARP / ncclTypeSize(collInfo->datatype) * (collInfo->nThreads / WARP_SIZE); - } else { - *alignCount = NCCL_LL_ALIGNMENT_PER_THREAD / ncclTypeSize(collInfo->datatype) * collInfo->nThreads; - } - return ncclSuccess; -} - -static ncclResult_t computeCollLastChunkInfo(struct ncclInfo* collInfo, size_t workCount, size_t alignCount, size_t* lastChunkCount) { - struct ncclComm* comm = collInfo->comm; - - if (collInfo->coll == ncclFuncAllReduce) { - if (collInfo->algorithm == NCCL_ALGO_RING) { - size_t remCount = workCount % (comm->nRanks * collInfo->chunkCount); - *lastChunkCount = DIVUP(DIVUP(remCount, comm->nRanks), alignCount) * alignCount; - } else if (collInfo->algorithm == NCCL_ALGO_NVLS || collInfo->algorithm == NCCL_ALGO_NVLS_TREE) { - size_t remCount = workCount % (comm->channels[0].nvls.nHeads * collInfo->chunkCount); - *lastChunkCount = DIVUP(DIVUP(remCount, comm->channels[0].nvls.nHeads), alignCount) * alignCount; - } else if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT) { - size_t remCount = workCount % (comm->channels[0].collnetDirect.nHeads * collInfo->chunkCount); - *lastChunkCount = DIVUP(DIVUP(remCount, comm->channels[0].collnetDirect.nHeads), alignCount) * alignCount; - } else { - *lastChunkCount = collInfo->chunkCount; - } - } else { - *lastChunkCount = collInfo->chunkCount; - } - return ncclSuccess; -} - -static ncclResult_t getCollnetLoopInfo(struct ncclInfo* collInfo, int* nstepsPerLoop, int* nchunksPerLoop) { - switch (collInfo->pattern) { - case ncclPatternCollnetChain: - *nstepsPerLoop = *nchunksPerLoop = 1; break; - case ncclPatternNvls: - *nstepsPerLoop = 1; *nchunksPerLoop = collInfo->comm->channels[0].nvls.nHeads; break; - case ncclPatternCollnetDirect: - *nstepsPerLoop = 1; *nchunksPerLoop = collInfo->comm->channels[0].collnetDirect.nHeads; break; - default: - WARN("Unknown collnet pattern %d", collInfo->pattern); - return ncclInternalError; - } - return ncclSuccess; -} - -static ncclResult_t addCollnetCollToPlan( - struct ncclComm* comm, struct ncclKernelPlan* plan, int 
usableChannels, - struct ncclInfo* collInfo, int* nWorkBudget - ) { - ncclResult_t ret = ncclSuccess; - struct ncclKernelPlan::Channel *chans = plan->channels; - struct ncclWorkElem workElem; - uint64_t opCount = uint64_t(plan->collOpCount++) << 1 | 0; - ncclRegBufferType regBufType = collInfo->regBufType; - int nChannels = std::min(collInfo->nChannels, usableChannels); - size_t countPerChannel = DIVUP(collInfo->count, nChannels); - uint32_t typeSize = ncclTypeSize(collInfo->datatype); - int steps, nchunksPerLoop, nstepsPerLoop, nLoop; - - NCCLCHECK(computeCollChunkInfo(collInfo, collInfo->nBytes, collInfo->nChannels)); - NCCLCHECKGOTO(initCollWorkElem(collInfo, &workElem), ret, fail); - workElem.nChannels = nChannels; - - NCCLCHECKGOTO(getCollnetLoopInfo(collInfo, &nstepsPerLoop, &nchunksPerLoop), ret, fail); - nLoop = (int)DIVUP(collInfo->nBytes, (size_t)nChannels * nchunksPerLoop * collInfo->chunkSize); - steps = nstepsPerLoop * nLoop * collInfo->chunkSteps; - - for (int bid = 0; bid < nChannels; bid++) { - workElem.bid = bid; - // Add work elem - *nWorkBudget += chans[bid].nWork; - if (regBufType == NCCL_REGULAR_BUFFER) { - appendWorkElemColl(comm, plan, bid, collInfo->workFuncIndex, &workElem); - } else { - struct ncclWorkElemReg workElemReg; - NCCLCHECKGOTO(initCollWorkElemReg(comm, &workElem, &comm->channels[bid], regBufType, collInfo->regBufSend, collInfo->regBufRecv, &workElemReg), ret, fail); - appendWorkElemColl(comm, plan, bid, collInfo->workFuncIndex, &workElemReg); - } - *nWorkBudget -= chans[bid].nWork; // subtract delta of chans[c].nWork - - // Add proxy task. Empty collectives do not make it to the proxy thread - // since they don't imply synchronization for the user like p2p. - if (collInfo->nBytes != 0) { - struct ncclProxyOp proxyOp; - NCCLCHECKGOTO(initCollProxyOp(collInfo, bid, opCount, steps, &proxyOp), ret, fail); - NCCLCHECKGOTO(addProxyOpIfNeeded(comm, plan, &proxyOp), ret, fail); - } - - chans[bid].collBytes += countPerChannel * typeSize; - } - - plan->threadPerBlock = std::max(plan->threadPerBlock, collInfo->nThreads); - if (!plan->kernelSpecialized) { - plan->kernelFn = ncclDevKernelForFunc[collInfo->workFuncIndex]; - plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[collInfo->workFuncIndex]; - } - - if (comm->rank == 0) { - TRACE(NCCL_COLL, "collnetColl enqueue coll %s(%s, %s, %s, %s), nChannels %d, count %ld (nbytes %ld), usableChannel %d, chunkCount %d, funcIndex %d, nThreads %d", collInfo->opName, ncclOpToString(collInfo->op), ncclDatatypeToString(collInfo->datatype), ncclAlgoToString(collInfo->algorithm), ncclProtoToString(collInfo->protocol), collInfo->nChannels, collInfo->count, collInfo->workBytes, usableChannels, collInfo->chunkCount, collInfo->workFuncIndex, collInfo->nThreads); - } - -exit: - return ret; -fail: - goto exit; -} - -static ncclResult_t addTunedCollToPlan( - struct ncclComm* comm, struct ncclKernelPlan* plan, int usableChannels, - struct ncclInfo* collInfo, int* nWorkBudget +static void addWorkBatchToPlan( + struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, + enum ncclDevWorkType workType, int devFuncId, uint32_t workOffset, + int p2pRound = -1 ) { - ncclResult_t ret = ncclSuccess; - struct ncclKernelPlan::Channel *chans = plan->channels; - struct ncclWorkElem workElem; - uint64_t opCount = uint64_t(plan->collOpCount++) << 1 | 0; - uint64_t workCount; - uint64_t workOffset = 0; - uint32_t typeSize = ncclTypeSize(collInfo->datatype); - ncclRegBufferType regBufType = collInfo->regBufType; - size_t alignCount, 
lastChunkCount; - int least[/*nBid*/MAXCHANNELS]; - int maxIndexInLeast; - size_t maxBytesInLeast; - int nChannels = std::min(collInfo->nChannels, usableChannels); - int rnChannels = 0; - size_t countPerChannels; - size_t remCount = collInfo->count; - - NCCLCHECKGOTO(computeCollAlignCount(collInfo, &alignCount), ret, fail); - countPerChannels = DIVUP(DIVUP(collInfo->count, nChannels), alignCount) * alignCount; - nChannels = DIVUP(collInfo->count, countPerChannels); - NCCLCHECKGOTO(computeCollChunkInfo(collInfo, collInfo->nBytes, nChannels), ret, fail); - NCCLCHECKGOTO(initCollWorkElem(collInfo, &workElem), ret, fail); - - // Choose the `nBid` least loaded channels to do the work. This ensures - // all bids go to different channels in case they need to synchronize. - least[0] = 0; - maxIndexInLeast = 0; - maxBytesInLeast = chans[0].collBytes; - // Initialize least[] such that the first nBid channels are accounted for. - for (int b = 1; b < nChannels; b++) { - least[b] = b; - if (maxBytesInLeast < chans[b].collBytes) { - maxIndexInLeast = b; - maxBytesInLeast = chans[b].collBytes; - } - } - // Sort in the rest of the channels. If a channel has less work than the max - // member of least[], replace that member and compute the new max. We only - // sort channels when coll algo is not collnet. - for (int c = nChannels; c < usableChannels; c++) { - if (chans[c].collBytes < maxBytesInLeast) { - least[maxIndexInLeast] = c; - maxBytesInLeast = chans[least[0]].collBytes; - maxIndexInLeast = 0; - for (int b = 1; b < nChannels; b++) { - if (maxBytesInLeast < chans[least[b]].collBytes) { - maxIndexInLeast = b; - maxBytesInLeast = chans[least[b]].collBytes; - } + ncclKernelPlanner::WipPlan::Channel* chan = &comm->planner.wipPlan.channels[channelId]; + size_t workSize = ncclDevWorkSize(workType); + // Conditions causing us to create a new blank batch. + bool newBatch = (chan->workBatchQueue.tail == nullptr); + struct ncclDevWorkBatch* batch = nullptr; + if (!newBatch) { + batch = &chan->workBatchQueue.tail->batch; + // All of the conditions that prevent us from appending to current batch. + newBatch |= batch->workType != (uint8_t)workType; + newBatch |= batch->funcId != devFuncId; + // The following ensure the device can handle a batch this large. They have to + // account for all extension batches being fused together which is why + // wipBatch.workBytes and wipBatch.nP2ps aren't reset to 0 for a new extension + // batch further down. 
+ newBatch |= NCCL_MAX_DEV_WORK_BATCH_BYTES < chan->wipBatch.workBytes + workSize; + if (workType == ncclDevWorkTypeP2p) { + newBatch |= chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH; + for (int i=0; i < chan->wipBatch.nP2ps; i++) { + newBatch |= p2pRound == chan->wipBatch.p2pRounds[i]; } } } - - for (int bid = 0; bid < nChannels && remCount > 0; bid++) { - int c = least[bid]; - - workCount = std::min(countPerChannels, remCount); - NCCLCHECKGOTO(computeCollLastChunkInfo(collInfo, workCount, alignCount, &lastChunkCount), ret, fail); - NCCLCHECKGOTO(setCollWorkElem(workCount, workOffset, lastChunkCount, &workElem), ret, fail); - - // Add work elem - *nWorkBudget += chans[c].nWork; - if (regBufType == NCCL_REGULAR_BUFFER) { - appendWorkElemColl(comm, plan, c, collInfo->workFuncIndex, &workElem); - } else { - struct ncclWorkElemReg workElemReg; - NCCLCHECKGOTO(initCollWorkElemReg(comm, &workElem, &comm->channels[c], regBufType, collInfo->regBufSend, collInfo->regBufRecv, &workElemReg), ret, fail); - appendWorkElemColl(comm, plan, c, collInfo->workFuncIndex, &workElemReg); + // Conditions causing us to create an extension batch (prev->nextExtends=1) + uint32_t offset = newBatch ? 0 : (workOffset - batch->offsetBase); + bool extendBatch = 63*workSize < offset; + extendBatch |= 0 != offset%workSize; + if (newBatch || extendBatch) { + if (!newBatch) batch->nextExtends = extendBatch; // Extending the previous batch. + struct ncclWorkBatchList* batchNode = ncclMemoryStackAlloc(&comm->memScoped); + ncclIntruQueueEnqueue(&chan->workBatchQueue, batchNode); + batch = &batchNode->batch; + batch->nextExtends = 0; + batch->workType = (uint32_t)workType; + batch->funcId = devFuncId; + batch->offsetBase = workOffset; + batch->offsetBitset = 0; + offset = 0; + if (newBatch) { + // Since extension batches are fused together on the device, and these values + // account for constraints on the fused batch, we only reset the values on + // a new batch + chan->wipBatch.workBytes = 0; + chan->wipBatch.nP2ps = 0; + // We don't count extension batches since this is used to derive a proxyOpCount, + // and we want all ops which are fused together to have the same value. + chan->nWorkBatchesP2p += (workType == ncclDevWorkTypeP2p ? 1 : 0); }
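For readers following the batch encoding being assembled here, an editorial sketch (not from the patch) of what the (offsetBase, offsetBitset) pair means once the batch is complete: bit k set indicates that one work struct of this batch sits k*workSize bytes past offsetBase in the plan's work storage.

size_t workSize = ncclDevWorkSize(ncclDevWorkTypeColl); // every work in a batch has the same type/size
uint64_t bits = batch->offsetBitset;
while (bits != 0) {
  int k = __builtin_ctzll(bits);                        // index of the lowest set bit
  bits &= bits - 1;                                     // clear it
  uint32_t byteOffset = batch->offsetBase + k*workSize; // one work struct of the batch lives here
}
// Only 64 slots fit in the bitset, which is why an offset beyond 63*workSize
// (or one that is not a multiple of workSize) forces a new or extension batch.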
- if (collInfo->nBytes != 0) { - uint32_t steps; - struct ncclProxyOp proxyOp; - NCCLCHECKGOTO(computeCollSteps(collInfo, workCount, &steps), ret, fail); - NCCLCHECKGOTO(initCollProxyOp(collInfo, c, opCount, steps, &proxyOp), ret, fail); - NCCLCHECKGOTO(addProxyOpIfNeeded(comm, plan, &proxyOp), ret, fail); - } - - remCount -= workCount; - chans[c].collBytes += workCount * typeSize; - workOffset += workCount; - rnChannels++; - } - - plan->threadPerBlock = std::max(plan->threadPerBlock, collInfo->nThreads); - if (!plan->kernelSpecialized) { - plan->kernelFn = ncclDevKernelForFunc[collInfo->workFuncIndex]; - plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[collInfo->workFuncIndex]; + plan->nWorkBatches += 1; } - - if (comm->rank == 0) { - TRACE(NCCL_COLL, "tunedColl enqueue coll %s(%s, %s, %s, %s), nChannels %d, count %ld (nbytes %ld), usableChannel %d, chunkCount %d, lastChunkCount %ld, funcIndex %d, nThreads %d", collInfo->opName, ncclOpToString(collInfo->op), ncclDatatypeToString(collInfo->datatype), ncclAlgoToString(collInfo->algorithm), ncclProtoToString(collInfo->protocol), rnChannels, collInfo->count, collInfo->workBytes, usableChannels, collInfo->chunkCount, lastChunkCount, collInfo->workFuncIndex, collInfo->nThreads); + batch->offsetBitset |= 1ull<<(offset/workSize); + chan->wipBatch.workBytes += workSize; + if (workType == ncclDevWorkTypeP2p) { + // We need to ensure that a single batch doesn't have multiple p2p's + // of the same round since they would use the same connections. + chan->wipBatch.p2pRounds[chan->wipBatch.nP2ps++] = p2pRound; } - -exit: - return ret; -fail: - goto exit; } -static ncclResult_t addCBDCollToPlan( - struct ncclComm* comm, struct ncclKernelPlan* plan, int usableChannels, - struct ncclInfo* collInfo, int* nWorkBudget - ) { - ncclResult_t ret = ncclSuccess; - struct ncclKernelPlan::Channel *chans = plan->channels; - size_t enqBytes; - uint64_t opCount = uint64_t(plan->collOpCount++) << 1 | 0; - size_t typeSize = ncclTypeSize(collInfo->datatype); - size_t workBytesTotal = collInfo->count * typeSize; - size_t workCountTotal = collInfo->count; - struct ncclWorkElem workElem; - size_t workOffset = 0; - size_t workCount; - ncclRegBufferType regBufType = collInfo->regBufType; - size_t alignCount; - size_t lastChunkCount; - int rnChannel = 0; - - NCCLCHECKGOTO(computeCollChunkInfo(collInfo, collInfo->aggnBytes, collInfo->nChannels), ret, fail); - NCCLCHECKGOTO(computeCollAlignCount(collInfo, &alignCount), ret, fail); - NCCLCHECKGOTO(initCollWorkElem(collInfo, &workElem), ret, fail); - for (int c = 0; c < usableChannels; c++) { - if (plan->maxBytesPerChannel <= chans[c].collBytes) continue; - if (workBytesTotal == 0) break; - enqBytes = std::min(plan->maxBytesPerChannel - chans[c].collBytes, workBytesTotal); - workCount = std::min(DIVUP(DIVUP(enqBytes, typeSize), alignCount) * alignCount, workCountTotal); - enqBytes = workCount * typeSize; - - NCCLCHECKGOTO(computeCollLastChunkInfo(collInfo, workCount, alignCount, &lastChunkCount), ret, fail); - NCCLCHECKGOTO(setCollWorkElem(workCount, workOffset, lastChunkCount, &workElem), ret, fail); - - // Add work elem - *nWorkBudget += chans[c].nWork; - if (regBufType == NCCL_REGULAR_BUFFER) { - appendWorkElemColl(comm, plan, c, collInfo->workFuncIndex, &workElem); - } else { - struct ncclWorkElemReg workElemReg; - NCCLCHECKGOTO(initCollWorkElemReg(comm, &workElem, &comm->channels[c], regBufType, collInfo->regBufSend, collInfo->regBufRecv, &workElemReg), ret, fail); - appendWorkElemColl(comm, plan, c, 
collInfo->workFuncIndex, &workElemReg); - } - *nWorkBudget -= chans[c].nWork; // subtract delta of chans[c].nWork - - // Add proxy task. Empty collectives do not make it to the proxy thread - // since they don't imply synchronization for the user like p2p. - if (collInfo->nBytes != 0) { - uint32_t steps; - struct ncclProxyOp proxyOp; - NCCLCHECKGOTO(computeCollSteps(collInfo, workCount, &steps), ret, fail); - NCCLCHECKGOTO(initCollProxyOp(collInfo, c, opCount, steps, &proxyOp), ret, fail); - NCCLCHECKGOTO(addProxyOpIfNeeded(comm, plan, &proxyOp), ret, fail); - } - - workBytesTotal -= enqBytes; - workCountTotal -= workCount; - chans[c].collBytes += enqBytes; - workOffset += workCount; - rnChannel++; - } - - plan->threadPerBlock = std::max(plan->threadPerBlock, collInfo->nThreads); - if (!plan->kernelSpecialized) { - plan->kernelFn = ncclDevKernelForFunc[collInfo->workFuncIndex]; - plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[collInfo->workFuncIndex]; - } - - if (comm->rank == 0) { - TRACE(NCCL_COLL, "CBDColl enqueue coll %s(%s, %s, %s, %s), nChannels %d, count %ld (nbytes %ld), usableChannel %d, maxBytesPerChannel %ld, chunkCount %d, lastChunkCount %ld, funcIndex %d, nThreads %d", collInfo->opName, ncclOpToString(collInfo->op), ncclDatatypeToString(collInfo->datatype), ncclAlgoToString(collInfo->algorithm), ncclProtoToString(collInfo->protocol), rnChannel, collInfo->count, collInfo->workBytes, usableChannels, plan->maxBytesPerChannel, collInfo->chunkCount, lastChunkCount, collInfo->workFuncIndex, collInfo->nThreads); +static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) { + ncclKernelPlanner::WipPlan::Channel* wipChannels = comm->planner.wipPlan.channels; + size_t workBytes = plan->workBytes; + size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch); + + plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MIN_NTHREADS); + + // If we can fit everything into the kernel args we do so. + if (sizeof(ncclDevKernelArgs) + batchBytes + workBytes <= comm->workArgsBytes) { + plan->workStorageType = ncclDevWorkStorageTypeArgs; + } + plan->kernelArgsSize = sizeof(struct ncclDevKernelArgs) + batchBytes; + plan->kernelArgsSize += (plan->workStorageType == ncclDevWorkStorageTypeArgs) ? workBytes : 0; + plan->kernelArgsSize = alignUp(plan->kernelArgsSize, 16); + plan->kernelArgs = (struct ncclDevKernelArgs*)ncclMemoryStackAlloc(&comm->memScoped, plan->kernelArgsSize, /*align=*/16); + plan->kernelArgs->comm = comm->devComm; + plan->kernelArgs->channelMask = plan->channelMask; + plan->kernelArgs->workStorageType = plan->workStorageType; + + // Put batches into the kernel arguments. The first batch for each channel + // must be located at batchZero[blockIdx.x]. To achieve this we round robin + // over the channels in ascending order until they're exhausted. + uint64_t hasBatchMask = plan->channelMask; + struct ncclDevWorkBatch* batchPrev[MAXCHANNELS] = {}; // {0...} + struct ncclDevWorkBatch* batchZero = (struct ncclDevWorkBatch*)(plan->kernelArgs+1); + int batchIx = 0; + while (hasBatchMask != 0) { + uint64_t tmpMask = hasBatchMask; // channels with a batch for this round. 
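A small editorial illustration of the round-robin layout described above (not from the patch): suppose only channels 2 and 5 have batches, with channel 2 contributing {A, B} and channel 5 contributing {C}.

// Round robin over ascending channels: round 1 places A (channel 2) then C (channel 5),
// round 2 places B (channel 2). Resulting layout:
//   batchZero[] = { A, C, B }
// Block 0 (channel 2) starts at batchZero[0], block 1 (channel 5) at batchZero[1],
// as required. A.nextJump is set to 2 because B sits two slots after A, while C is
// never given a successor and so remains channel 5's last batch.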
+ do { + int c = popFirstOneBit(&tmpMask); + if (!ncclIntruQueueEmpty(&wipChannels[c].workBatchQueue)) { + struct ncclWorkBatchList* batchNode = ncclIntruQueueDequeue(&wipChannels[c].workBatchQueue); + if (batchPrev[c] != nullptr) { + batchPrev[c]->nextJump = int(&batchZero[batchIx] - batchPrev[c]); + } + batchPrev[c] = &batchZero[batchIx]; + batchZero[batchIx++] = batchNode->batch; + } + if (ncclIntruQueueEmpty(&wipChannels[c].workBatchQueue)) { + hasBatchMask ^= 1ull<= 1 upon entry. -static ncclResult_t addP2pToPlan( - struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, - bool isSendNotRecv, int peer, int chunk, void *addr, size_t bytes, bool fuseOk - ) { - struct ncclInfo info = { - isSendNotRecv ? ncclFuncSend : ncclFuncRecv, - isSendNotRecv ? "Send" : "Recv", - nullptr, addr, bytes, ncclInt8, ncclSum, peer, comm, (cudaStream_t)0, - /*Args*/1, 1 - }; - - int channelId; - NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, info.coll, &channelId)); - info.channelId = channelId; - - // 1 is connIndex - struct ncclConnInfo* conn = isSendNotRecv ? - &comm->channels[channelId].peers[peer]->send[1].conn : &comm->channels[channelId].peers[peer]->recv[1].conn; - info.protocol = ((conn->buffs[NCCL_PROTO_LL] != nullptr) && bytes <= ncclParamP2pLLThreshold()) ? NCCL_PROTO_LL : NCCL_PROTO_SIMPLE; - - int reg = 0; - if (info.protocol == NCCL_PROTO_SIMPLE) { - struct ncclReg* regRecord; - NCCLCHECK(ncclRegFind(comm, addr, bytes, ®Record)); - reg = regRecord && regRecord->nDevs ? 1 : 0; - } - - struct ncclProxyOp proxyOp = {}; - // May tune chunksize and set proxyOp.reg=0 if not using the network. - NCCLCHECK(ncclProxyComputeP2p(&info, &proxyOp, reg)); - - struct ncclWorkElemP2p elem = {0}; - elem.proto = info.protocol; - elem.peer = peer; - elem.nWarps = NCCL_MAX_NTHREADS/WARP_SIZE; - elem.reg = proxyOp.reg; - elem.p2pType = isSendNotRecv ? ncclWorkP2pTypeSend : ncclWorkP2pTypeRecv; - elem.buffLo32 = uint32_t(reinterpret_cast(addr)); - elem.buffHi32 = reinterpret_cast(addr)>>32; - elem.countLo32 = uint32_t(bytes); - elem.countHi32 = bytes>>32; - elem.chunkSize = info.chunkSize; // computed by ncclProxyComputeP2p - - *nWorkBudget += plan->channels[channelId].nWork; - appendWorkElemP2p(comm, plan, channelId, &elem, fuseOk); - *nWorkBudget -= plan->channels[channelId].nWork; - - // Calculate the opCount after appendWorkElemP2p since it will always return - // with channel->nWork equal to one plus the work index this p2p settled in. - proxyOp.opCount = uint64_t(plan->channels[channelId].nWork)<<1 | 1; - NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); - return ncclSuccess; -} - -static void finishPlan(struct ncclKernelPlan* plan) { + // Merge-sort per-channel proxy-op lists by opCount when merging them into plan->proxyOpQueue + // Phase 1: scan first op of each channel, store opCount in headIds[c]. + uint64_t headIds[MAXCHANNELS]; + int nHeads = 0; int channelUbound = 0; - int channelCount = 0; - uint64_t channelMask = 0; - bool hasProxyOps = false; for (int c=0; c < MAXCHANNELS; c++) { - struct ncclWorkList* tail = ncclIntruQueueTail(&plan->channels[c].workQueue); - if (tail != nullptr) { - channelUbound = c+1; - channelCount += 1; - channelMask |= 1ull<work.header.isLast = 1; - finishWork(&tail->work); + struct ncclProxyOp* op = ncclIntruQueueHead(&wipChannels[c].proxyOpQueue); + headIds[c] = op ? 
op->opCount : uint64_t(-1); + if (op) nHeads += 1; + if (op) plan->hasProxyOps = true; + if (op) channelUbound = c+1; + } + // Phase 2: Dequeue from planner->channels[c], enqueue in merged order to plan + while (nHeads != 0) { + int c = -1; + uint64_t minId = uint64_t(-1); + // Find channel with least proxy-op id. We store the heads[c]->opCount in + // headIds[c] to remove indirect loads from this loop. + for (int c1=0; c1 < channelUbound; c1++) { + uint64_t id = headIds[c1]; + id = (id>>1 | id<<63); // Move tag bit to order collectives before p2p's + if (id < minId) { c = c1; minId = id; } } - hasProxyOps |= !ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue); + struct ncclProxyOp* op = ncclIntruQueueDequeue(&wipChannels[c].proxyOpQueue); + struct ncclProxyOp* opNext = ncclIntruQueueHead(&wipChannels[c].proxyOpQueue); + headIds[c] = opNext ? opNext->opCount : uint64_t(-1); + nHeads -= opNext ? 0 : 1; + ncclIntruQueueEnqueue(&plan->proxyOpQueue, op); } - plan->channelUbound = channelUbound; - plan->channelCount = channelCount; - plan->channelMask = channelMask; - plan->hasProxyOps = hasProxyOps; - plan->threadPerBlock = std::max(plan->threadPerBlock, 3*WARP_SIZE); } int64_t ncclParamLocalRegister(); NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 1); +struct ncclIpcCleanupCallback { + struct ncclCommCallback base; + void* ptr; +}; +static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) { + struct ncclIpcCleanupCallback* me = (struct ncclIpcCleanupCallback*)cb; + CUDACHECKIGNORE(cudaIpcCloseMemHandle(me->ptr)); + free(me); + return ncclSuccess; +} + static ncclResult_t registerIntraNodeBuffers( - struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclInfo* info + struct ncclComm* comm, struct ncclTaskColl* info, + void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], + void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], + struct ncclIntruQueue* cleanupQueue, + bool* regNeedConnect ) { ncclResult_t result = ncclSuccess; info->regBufType = NCCL_REGULAR_BUFFER; + *regNeedConnect = true; #if CUDART_VERSION >= 11030 if ((info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) && comm->nvlsRegSupport) { bool regBufUsed = false; const void *sendbuff = info->sendbuff; void *recvbuff = info->recvbuff; - - if (info->coll == ncclFuncAllGather) - sendbuff = NULL; - else if (info->coll == ncclFuncReduceScatter) - recvbuff = NULL; + if (info->func == ncclFuncAllGather) sendbuff = NULL; + if (info->func == ncclFuncReduceScatter) recvbuff = NULL; + size_t elementSize = ncclTypeSize(info->datatype); + size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); + size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); /* first try local registration. 
*/ if (ncclParamLocalRegister()) { - ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, info->sendbuffSize, info->recvbuffSize, ®BufUsed, info->regBufSend, info->regBufRecv); + ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, ®BufUsed, outRegBufSend, outRegBufRecv); } - if (regBufUsed == false && plan->persistent && ncclParamGraphRegister()) { - ncclNvlsGraphRegisterBuffer(comm, plan, sendbuff, recvbuff, info->sendbuffSize, info->recvbuffSize, ®BufUsed, info->regBufSend, info->regBufRecv); + if (regBufUsed == false && comm->planner.persistent && ncclParamGraphRegister()) { + ncclNvlsGraphRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, ®BufUsed, outRegBufSend, outRegBufRecv, cleanupQueue, &info->nCleanupQueueElts); } if (regBufUsed) { + *regNeedConnect = false; /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to * saturate bandwidth. */ if (comm->nNodes == 1) { - if (info->coll == ncclFuncReduceScatter) - info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); + if (info->func == ncclFuncReduceScatter) + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); else - info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); } else { - info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6)); + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6)); } - info->regBufType = NCCL_NVLS_REG_BUFFER; } } else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other comm->intraRanks < comm->localRanks && // only with inter-process & intra-node peers - plan->persistent && 0) { + comm->planner.persistent && 0) { /* Disable CollnetDirect registration since it does not support cuMem* allocated memory. */ int localRank = comm->localRank; cudaPointerAttributes sattr, rattr; @@ -663,51 +319,56 @@ static ncclResult_t registerIntraNodeBuffers( // Open handles locally for (int i=0; i < comm->localRanks; i++) { if (i == localRank) { // Skip self - info->regBufSend[i] = nullptr; - info->regBufRecv[i] = nullptr; + outRegBufSend[i] = nullptr; + outRegBufRecv[i] = nullptr; } else { for (int sr=0; sr < 2; sr++) { // Get base address of mapping void* base; CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess)); // Get real buffer address by adding offset in the mapping - (sr == 0 ? info->regBufSend : info->regBufRecv)[i] = (char*)base + handles[i].offset[sr]; + (sr == 0 ? 
outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr]; // Enqueue reminder to close memory handle - struct ncclPointerList* q = ncclMemoryPoolAlloc(&comm->memPool_ncclPointerList, &comm->memPermanent); - q->ptr = base; - ncclIntruQueueEnqueue(&plan->ipcMemQueue, q); + struct ncclIpcCleanupCallback* cb = (struct ncclIpcCleanupCallback*)malloc(sizeof(struct ncclIpcCleanupCallback)); + cb->base.fn = cleanupIpc; + cb->ptr = base; + ncclIntruQueueEnqueue(cleanupQueue, &cb->base); + info->nCleanupQueueElts += 1; } } } info->regBufType = NCCL_IPC_REG_BUFFER; - } else if ((info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && comm->collNetRegSupport && info->opFull.op != ncclDevPreMulSum && info->opFull.op != ncclDevSumPostDiv) { + } else if ((info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && comm->collNetRegSupport && info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv) { + size_t elementSize = ncclTypeSize(info->datatype); + size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); + size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); int sendRegBufFlag = 0; int recvRegBufFlag = 0; void *sendHandle, *recvHandle; if (ncclParamLocalRegister()) { - ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, info->sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle); + ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle); info->sendMhandle = sendHandle; if (sendRegBufFlag) { - ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, info->recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle); + ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle); info->recvMhandle = recvHandle; } } - if ((sendRegBufFlag == 0 || recvRegBufFlag == 0) && plan->persistent && ncclParamGraphRegister()) { - ncclCollnetGraphRegisterBuffer(comm, plan, info->sendbuff, info->sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle); + if ((sendRegBufFlag == 0 || recvRegBufFlag == 0) && comm->planner.persistent && ncclParamGraphRegister()) { + ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); info->sendMhandle = sendHandle; if (sendRegBufFlag) { - ncclCollnetGraphRegisterBuffer(comm, plan, info->recvbuff, info->recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle); + ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); info->recvMhandle = recvHandle; } } if (sendRegBufFlag && recvRegBufFlag) { - info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 1)); + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 1)); info->regBufType = NCCL_COLLNET_REG_BUFFER; if (sendRegBufFlag == 1 && recvRegBufFlag == 1) { - INFO(NCCL_REG, "rank %d successfully registered collNet sendbuff %p (handle %p), sendbuff size %ld, recvbuff %p (handle %p), recvbuff size %ld", comm->rank, info->sendbuff, sendHandle, info->sendbuffSize, info->recvbuff, recvHandle, info->recvbuffSize); + INFO(NCCL_REG, "rank %d successfully registered collNet sendbuff %p (handle %p), sendbuff size %ld, recvbuff %p (handle %p), recvbuff size %ld", comm->rank, info->sendbuff, sendHandle, sendbuffSize, info->recvbuff, 
recvHandle, recvbuffSize); } } } @@ -716,177 +377,623 @@ static ncclResult_t registerIntraNodeBuffers( return result; } -static ncclResult_t getCBDCollnChannel(struct ncclKernelPlan* plan, struct ncclInfo* collInfo, int usableChannels) { - size_t firstEnqBytes; - size_t workBytesTotal = collInfo->workBytes; - struct ncclKernelPlan::Channel *chans = plan->channels; - int typeSize = ncclTypeSize(collInfo->datatype); - size_t maxCount = DIVUP(plan->maxBytesPerChannel, typeSize); +static ncclResult_t getCollNetSupport(struct ncclComm* comm, struct ncclTaskColl* task, int* collNetSupport); +static ncclResult_t getAlgoInfo( + struct ncclComm* comm, struct ncclTaskColl* task, + int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo = NULL +); +static ncclResult_t calcCollChunking( + struct ncclComm* comm, struct ncclTaskColl* task, int nChannels, size_t nBytes, + /*outputs*/uint32_t* outChunkSize, uint32_t* outDirectFlags, struct ncclProxyOp* proxyOp +); + +struct ncclKernelPlanBudget { + ssize_t inArgsBytes; // Space available within kernel args struct + ssize_t outArgsBytes; // Space available outside of args struct (fifo or persistent buf) +}; + +static bool testBudget( + struct ncclKernelPlanBudget* budget, int nWorkBatches, ssize_t workBytes + ) { + ssize_t batchBytes = nWorkBatches*sizeof(struct ncclDevWorkBatch); + bool ok = false; + ok |= (batchBytes + workBytes <= budget->inArgsBytes); + ok |= (batchBytes <= budget->inArgsBytes) && (workBytes <= budget->outArgsBytes); + return ok; +} + +// Called once per ncclGroup to organize the user submitted tasks in +// comm->planner so that they can be peeled off into plans. +ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo) { + struct ncclKernelPlanner* planner = &comm->planner; + // Tasks from the sorter come out ordered size descending. + struct ncclTaskColl* task = ncclTaskCollSorterDequeueAll(&planner->collSorter); + // Tasks are assembled by (fn,op,ty) size ascending. + struct ncclTaskColl* tasksByFnOpTy[ncclNumFuncs*ncclNumDevRedOps*ncclNumTypes]; + memset(tasksByFnOpTy, 0, sizeof(tasksByFnOpTy)); + int fnOpTyIndices[ncclNumFuncs*ncclNumDevRedOps*ncclNumTypes]; + int fnOpTyCount = 0; + + // Walk the size sorted tasks, binning them by (fn,op,ty). + while (task != nullptr) { + struct ncclTaskColl* next = task->next; + int index = ((int)task->func*ncclNumDevRedOps + (int)task->opDev.op)*ncclNumTypes + (int)task->datatype; + // Add to set of (fn,op,ty) indices on first occurrence + if (tasksByFnOpTy[index] == nullptr) fnOpTyIndices[fnOpTyCount++] = index; + // Add to LIFO for this (fn,op,ty) + task->next = tasksByFnOpTy[index]; + tasksByFnOpTy[index] = task; + // Next task + task = next; + } + + // Walk (fn,op,ty) bins, compute algo and proto etc. Then bin them by their + // scheduling constraints (collnet x nvls). + struct ncclIntruQueue collBins[2][2] = {}; + for (int cursor=0; cursor < fnOpTyCount; cursor++) { + struct ncclTaskColl* aggBeg = tasksByFnOpTy[fnOpTyIndices[cursor]]; + int collNetSupport = 0; + NCCLCHECK(getCollNetSupport(comm, aggBeg, &collNetSupport)); + int nvlsSupport = comm->nvlsSupport && (ncclNvlsSupported(aggBeg->opDev.op, aggBeg->datatype) || aggBeg->func == ncclFuncAllGather); + // Crudely estimate number of tasks per channel. This is using the wrong number + // of channels for NVLS algos, but knowing the algo requires having this value, + // so either be crude our iterate until fixed point, we chose the former. 
+ int nTasksPerChannel = divUp(comm->planner.nTasksColl, comm->nChannels); + do { + struct ncclTaskColl* aggEnd = aggBeg->next; + struct ncclTaskColl agg = *aggBeg; + // We aggregate operations that are within 4X size of each other. + while (aggEnd != nullptr && aggEnd->trafficBytes < 4*aggBeg->trafficBytes) { + agg.count += aggEnd->count; + agg.trafficBytes += aggEnd->trafficBytes; + aggEnd = aggEnd->next; + } - if (workBytesTotal == 0) { - collInfo->nChannels = 1; - goto exit; + NCCLCHECK(getAlgoInfo(comm, &agg, collNetSupport, nvlsSupport, nTasksPerChannel, simInfo)); + agg.devFuncId = ncclDevFuncId(agg.func, agg.opDev.op, agg.datatype, agg.algorithm, agg.protocol); + + int isCollnet=0, isNvls=0; + switch (agg.algorithm) { + case NCCL_ALGO_NVLS: + case NCCL_ALGO_NVLS_TREE: + isNvls = 1; + isCollnet = agg.algorithm == NCCL_ALGO_NVLS && comm->nNodes > 1; + break; + case NCCL_ALGO_COLLNET_CHAIN: + case NCCL_ALGO_COLLNET_DIRECT: + isCollnet = 1; + break; + } + // Update the aggregated tasks with the computed values. + do { + struct ncclTaskColl* next = aggBeg->next; + aggBeg->algorithm = agg.algorithm; + aggBeg->protocol = agg.protocol; + aggBeg->nMaxChannels = agg.nMaxChannels; + aggBeg->nWarps = agg.nWarps; + aggBeg->devFuncId = agg.devFuncId; + aggBeg->isCollnet = isCollnet; + aggBeg->isNvls = isNvls; + ncclIntruQueueEnqueue(&collBins[isCollnet][isNvls], aggBeg); + aggBeg = next; + } while (aggBeg != aggEnd); + } while (aggBeg != nullptr); + } + + // Concatenate `collBins[*][*]` together into final list `planner->collTaskQueue`. + // Collnet is the outer dimension since that affects how we divide over the + // channels. + for (int isCollnet=0; isCollnet <= 1; isCollnet++) { + for (int isNvls=0; isNvls <= 1; isNvls++) { + ncclIntruQueueTransfer(&planner->collTaskQueue, &collBins[isCollnet][isNvls]); + } } - for (int c = 0; c < usableChannels; c++) { - if (plan->maxBytesPerChannel <= chans[c].collBytes) continue; - firstEnqBytes = std::min(plan->maxBytesPerChannel - chans[c].collBytes, workBytesTotal); - firstEnqBytes = DIVUP(firstEnqBytes, typeSize) * typeSize; - collInfo->nChannels = 1 + DIVUP((workBytesTotal - firstEnqBytes) / typeSize, maxCount); - break; + // Walk tasks again to: + // 1. Possibly register buffers. + // 2. Build ncclDevWorkColl structs. + // 3. Bin the work structs according to the number of valid channels they + // may be assigned to {collnet, nvls, standard} + task = ncclIntruQueueHead(&planner->collTaskQueue); + while (task != nullptr) { + // Build a ncclDevWorkColl[Reg?] struct for each task. 
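As an editorial example of the 4X aggregation rule above (not from the patch): if a (fn,op,ty) bin holds tasks with traffic 1 KB, 2 KB, 3 KB and 16 KB in ascending order, aggregation starts at the 1 KB task and keeps absorbing tasks whose traffic is below 4 x 1 KB, so {1 KB, 2 KB, 3 KB} share a single getAlgoInfo() decision (and hence algorithm, protocol and devFuncId), while the 16 KB task begins a new aggregate.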
+ void* regBufSend[NCCL_MAX_LOCAL_RANKS]; + void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; + bool regNeedConnect = true; + registerIntraNodeBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, ®NeedConnect); + + if (comm->runtimeConn && comm->initAlgoChannels[task->algorithm] == false) { + if (task->algorithm == NCCL_ALGO_NVLS_TREE && comm->initAlgoChannels[NCCL_ALGO_NVLS] == false && regNeedConnect == true) { + comm->initAlgoChannels[NCCL_ALGO_NVLS] = true; + algoNeedConnect[NCCL_ALGO_NVLS] = true; + } + if (task->algorithm != NCCL_ALGO_NVLS || regNeedConnect == true) { + comm->initAlgoChannels[task->algorithm] = true; + algoNeedConnect[task->algorithm] = true; + *needConnect = true; + } + } + + struct ncclDevWorkColl devWork = {}; + devWork.sendbuff = (void*)task->sendbuff; + devWork.recvbuff = (void*)task->recvbuff; + devWork.root = task->root; + devWork.nWarps = task->nWarps; + devWork.redOpArg = task->opDev.scalarArg; + devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr; + devWork.oneNode = (comm->nNodes == 1); + devWork.regUsed = task->regBufType; + + struct ncclWorkList* workNode; + switch (task->regBufType) { + case NCCL_REGULAR_BUFFER: + case NCCL_COLLNET_REG_BUFFER: + { workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + workNode->workType = ncclDevWorkTypeColl; + workNode->size = sizeof(struct ncclDevWorkColl); + memcpy((void*)(workNode+1), (void*)&devWork, workNode->size); + } break; + case NCCL_IPC_REG_BUFFER: + { struct ncclDevWorkCollReg workReg = {}; + workReg.coll = devWork; + struct ncclChannel *channel0 = &comm->channels[0]; + for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { + int peer = channel0->collnetDirect.down[i]; + if (peer == -1) break; + int j = comm->rankToLocalRank[peer]; // Get intra-node slot + workReg.dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer + workReg.dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer + } + for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { + int peer = channel0->collnetDirect.up[i]; + if (peer == -1) break; + int j = comm->rankToLocalRank[peer]; + // Output buffer of root peer + workReg.upOutputs[i] = regBufRecv[j]; + } + workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + workNode->workType = ncclDevWorkTypeCollReg; + workNode->size = sizeof(struct ncclDevWorkCollReg); + memcpy((void*)(workNode+1), (void*)&workReg, workNode->size); + } break; + case NCCL_NVLS_REG_BUFFER: + { struct ncclDevWorkCollReg workReg = {}; + workReg.coll = devWork; // C++ struct assignment + /* NVLS only has one send and recv buffer registered */ + workReg.dnInputs[0] = regBufSend[0]; + workReg.dnOutputs[0] = regBufRecv[0]; + workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + workNode->workType = ncclDevWorkTypeCollReg; + workNode->size = sizeof(struct ncclDevWorkCollReg); + memcpy((void*)(workNode+1), (void*)&workReg, workNode->size); + } break; + default: + /* impossible value */ + WARN("Invalid regBufType %d", task->regBufType); + return ncclInvalidArgument; + } + + ncclIntruQueueEnqueue(&planner->collWorkQueue, workNode); + task = task->next; } -exit: return ncclSuccess; } static ncclResult_t scheduleCollTasksToPlan( - struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget + struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclKernelPlanBudget* budget ) { - struct ncclTasks* tasks = &comm->tasks; - size_t totalCBDBytes = tasks->workBytesTotal; - struct ncclInfo* collInfo; - - if (!ncclIntruQueueEmpty(&tasks->collQueue)) { - int usableChannels = 0, 
accChannels = 0; - - tasks->usableChannels = 1; - while (!ncclIntruQueueEmpty(&tasks->collQueue)) { - collInfo = ncclIntruQueueDequeue(&tasks->collQueue); - if (collInfo->count == 0) continue; - if (collInfo->algorithm == NCCL_ALGO_UNDEF) { - struct ncclInfo* aggInfo = ncclMemoryStackAlloc(&comm->memScoped); - struct ncclInfo* nextInfo = collInfo->next; - int nvlsSupport; - int collNetSupport; - - memcpy(aggInfo, collInfo, sizeof(struct ncclInfo)); - while (nextInfo) { - if (nextInfo->coll == aggInfo->coll && nextInfo->opFull.op == aggInfo->opFull.op && nextInfo->datatype == aggInfo->datatype) { - aggInfo->count += nextInfo->count; - nextInfo = nextInfo->next; - } else { - break; - } - } + struct ncclKernelPlanner* planner = &comm->planner; + // Estimate number of tasks that will fit in this plan. + int nPlanColls = 0; + size_t trafficBytes[2*2] = {0, 0, 0, 0}; // [collnet][nvls] + int nChannels[2*2] = {0, 0, 0, 0}; // [collnet][nvls] + int const nMaxChannels[2*2] = {comm->nChannels, comm->nvlsChannels, // [collnet][nvls] + comm->nChannels, comm->nvlsChannels}; + do { + size_t workBytes = 0; + struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue); + struct ncclWorkList* workNode = ncclIntruQueueHead(&planner->collWorkQueue); + while (task != nullptr) { + int nBatches = divUp(nPlanColls, 4); // Rough guess: 4 colls per batch. + if (!testBudget(budget, nBatches, workBytes + workNode->size)) goto plan_full; + + nPlanColls += 1; + workBytes += workNode->size; + int kind = 2*task->isCollnet + task->isNvls; + trafficBytes[kind] += task->trafficBytes; + nChannels[kind] += task->nMaxChannels; + nChannels[kind] = std::min(nChannels[kind], nMaxChannels[kind]); + task = task->next; + workNode = workNode->next; + } + plan_full:; + } while (0); + + int kindPrev = -1; + constexpr size_t MinTrafficPerChannel = 512; + size_t trafficPerChannel = 0; + int channelId = 0; + size_t currentTraffic = 0; + while (nPlanColls!=0 && !ncclIntruQueueEmpty(&planner->collTaskQueue)) { + struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue); + struct ncclWorkList* workNode = ncclIntruQueueHead(&planner->collWorkQueue); + struct ncclDevWorkColl* devWork = (struct ncclDevWorkColl*)(workNode+1); + size_t elementSize = ncclTypeSize(task->datatype); + + int kind = 2*task->isCollnet + task->isNvls; + if (kind != kindPrev) { + trafficPerChannel = std::max(MinTrafficPerChannel, trafficBytes[kind]/nChannels[kind]); + kindPrev = kind; + channelId = 0; + currentTraffic = 0; + } - nvlsSupport = comm->nvlsSupport && ncclNvlsSupported(aggInfo->opFull.op, aggInfo->datatype); - NCCLCHECK(getCollNetSupport(aggInfo, &collNetSupport)); - NCCLCHECK(ncclInfoSetDerived(aggInfo, comm->nRanks)); - NCCLCHECK(getTunerInfo(aggInfo, collNetSupport, nvlsSupport, 1)); - NCCLCHECK(topoGetAlgoInfo(aggInfo, collNetSupport, nvlsSupport, 1)); - NCCLCHECK(getChannnelThreadInfo(aggInfo)); - NCCLCHECK(computeCollWorkFunc(aggInfo)); - NCCLCHECK(getPatternInfo(aggInfo)); - - // Try to assign algo and proto to all possible collectives - nextInfo = collInfo; - while (nextInfo) { - if (nextInfo->coll == aggInfo->coll && nextInfo->opFull.op == aggInfo->opFull.op && nextInfo->datatype == aggInfo->datatype) { - NCCLCHECK(ncclInfoSetDerived(nextInfo, comm->nRanks)); - NCCLCHECK(getTunerInfo(nextInfo, collNetSupport, nvlsSupport, 1)); - nextInfo->algorithm = aggInfo->algorithm; - nextInfo->protocol = aggInfo->protocol; - nextInfo->nThreads = aggInfo->nThreads; - nextInfo->pattern = aggInfo->pattern; - nextInfo->workFuncIndex = 
aggInfo->workFuncIndex; - nextInfo->aggnBytes = aggInfo->nBytes; - - NCCLCHECK(getChannnelThreadInfo(nextInfo)); - // if possible, start registration - registerIntraNodeBuffers(comm, plan, nextInfo); - // accumulate channels - accChannels += nextInfo->nChannels; - nextInfo = nextInfo->next; - } else { - break; - } - } - } // end of aggInfo + if (task->isCollnet) { + int nChannels = task->nMaxChannels; + // Ensure room for worst case of one new batch per channel + if (!testBudget(budget, plan->nWorkBatches + nChannels, plan->workBytes + workNode->size)) { + return ncclSuccess; + } + + size_t globalBytesPerElement = elementSize*ncclFuncMaxSendRecvCount(task->func, comm->nRanks, 1); + struct ncclProxyOp proxyOp; + uint32_t chunkSize, directFlags=0; + NCCLCHECK(calcCollChunking(comm, task, nChannels, globalBytesPerElement*task->count, &chunkSize, &directFlags, &proxyOp)); + devWork->channelLo = 0; + devWork->channelHi = nChannels-1; + devWork->collnet.count = task->count; + devWork->collnet.chunkCount = chunkSize/ncclTypeSize(task->datatype); + devWork->direct = directFlags; + + uint64_t proxyOpId = uint64_t(plan->collOpCount++)<<1 | 0; + for (int c=devWork->channelLo; c <= (int)devWork->channelHi; c++) { + proxyOp.channelId = c; + proxyOp.opCount = proxyOpId; + addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); + NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); + } + } else { // not task->isCollnet + constexpr size_t cellSize = 16; + int elementsPerCell = cellSize/elementSize; + size_t cells = divUp(task->count*elementSize, cellSize); + int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks); + size_t trafficPerElement = elementSize*trafficPerByte; + size_t trafficPerCell = cellSize*trafficPerByte; + size_t cellsPerChannel = std::min(cells, divUp(trafficPerChannel, trafficPerCell)); + size_t cellsLo; + if (channelId+1 == nMaxChannels[kind]) { // On last channel everything goes to "lo" + cellsLo = cells; + } else { + cellsLo = std::min(cells, (trafficPerChannel-currentTraffic)/trafficPerCell); + } + int nMidChannels = (cells-cellsLo)/cellsPerChannel; + size_t cellsHi = (cells-cellsLo)%cellsPerChannel; + int nChannels = (cellsLo!=0 ? 1 : 0) + nMidChannels + (cellsHi!=0 ? 1 : 0); + if (nMaxChannels[kind] < channelId + nChannels) { // Overflowed available channels + nMidChannels = nMaxChannels[kind] - channelId - 2; + cellsPerChannel = (cells-cellsLo)/(nMidChannels+1); + cellsHi = cellsPerChannel + (cells-cellsLo)%(nMidChannels+1); + } + if (cellsHi == 0 && nMidChannels != 0) { + cellsHi = cellsPerChannel; + nMidChannels -= 1; + } + if (cellsLo == 0) { // Least channel skipped. Make the next channel the new least. + channelId += 1; + if (nMidChannels == 0) { cellsLo = cellsHi; cellsHi = 0; } + else { cellsLo = cellsPerChannel; nMidChannels -= 1; } + } + size_t countMid = nMidChannels!=0 ? cellsPerChannel*elementsPerCell : 0; + size_t countLo = cellsLo*elementsPerCell; + size_t countHi = cellsHi*elementsPerCell; + (countHi != 0 ? countHi : countLo) -= cells*elementsPerCell - task->count; + + nChannels = (countLo!=0 ? 1 : 0) + nMidChannels + (cellsHi!=0 ? 
1 : 0); + // Ensure room for worst case of one new batch per channel + if (!testBudget(budget, plan->nWorkBatches + nChannels, plan->workBytes + workNode->size)) { + return ncclSuccess; + } - if (collInfo->algorithm == NCCL_ALGO_NVLS || collInfo->algorithm == NCCL_ALGO_NVLS_TREE) { - usableChannels = std::max(usableChannels, comm->nvlsChannels); + devWork->channelLo = channelId; + devWork->channelHi = channelId + nChannels-1; + devWork->cbd.countLo = countLo; + devWork->cbd.countMid = countMid; + devWork->cbd.countHi = countHi; + + // calcCollChunking() uses global bytes instead of traffic which differs + // in that allreduce isn't multiplied by 2. + size_t globalBytesPerElement = elementSize*ncclFuncMaxSendRecvCount(task->func, comm->nRanks, 1); + struct ncclProxyOp proxyOpLo, proxyOpMid, proxyOpHi; + + uint32_t chunkSize, directFlags=0; + size_t grainSize = ncclProtoGrainSize(task->protocol); + if (countLo != 0) { + NCCLCHECK(calcCollChunking(comm, task, /*nChannels=*/1, globalBytesPerElement*countLo, &chunkSize, &directFlags, &proxyOpLo)); + devWork->cbd.chunkGrainsLo = chunkSize/grainSize; + } + if (countHi != 0) { + NCCLCHECK(calcCollChunking(comm, task, /*nChannels=*/1, globalBytesPerElement*countHi, &chunkSize, &directFlags, &proxyOpHi)); + devWork->cbd.chunkGrainsHi = chunkSize/grainSize; + } + if (nMidChannels != 0) { + NCCLCHECK(calcCollChunking(comm, task, /*nChannels=*/1, globalBytesPerElement*countMid, &chunkSize, &directFlags, &proxyOpMid)); + devWork->cbd.chunkGrainsMid = chunkSize/grainSize; + } + devWork->direct = directFlags; + + // Update the current channel and vacant traffic budget. + if (countHi != 0) { + channelId += nChannels-1; + currentTraffic = countHi*trafficPerElement; + } else if (nMidChannels != 0) { + channelId += nChannels; + currentTraffic = 0; } else { - usableChannels = std::max(usableChannels, comm->collChannels); + currentTraffic += countLo*trafficPerElement; + } + + if (currentTraffic >= trafficPerChannel && channelId+1 != nMaxChannels[kind]) { + channelId += 1; + currentTraffic = 0; + } + + uint64_t proxyOpId = uint64_t(plan->collOpCount++)<<1 | 0; + for (int c=devWork->channelLo; c <= (int)devWork->channelHi; c++) { + struct ncclProxyOp* proxyOp; + if (c == (int)devWork->channelLo) { + proxyOp = &proxyOpLo; + } else if (c == (int)devWork->channelHi) { + proxyOp = &proxyOpHi; + } else { + proxyOp = &proxyOpMid; + } + proxyOp->channelId = c; + proxyOp->opCount = proxyOpId; + addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); + NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp)); } + } + + plan->channelMask |= (2ull<channelHi) - (1ull<channelLo); + plan->threadPerBlock = std::max(plan->threadPerBlock, task->nWarps*WARP_SIZE); + if (!plan->kernelSpecialized) { + plan->kernelFn = ncclDevKernelForFunc[task->devFuncId]; + plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[task->devFuncId]; + } - if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT || collInfo->algorithm == NCCL_ALGO_COLLNET_CHAIN || (collInfo->algorithm == NCCL_ALGO_NVLS && comm->nNodes > 1)) { - // substract collective which needs to be executed separately - totalCBDBytes -= collInfo->workBytes; - tasks->workBytesTotal -= collInfo->workBytes; - ncclIntruQueueEnqueue(&tasks->collnetQueue, collInfo); - } else if (collInfo->userTuned) { - // substract collective which needs to be executed separately - totalCBDBytes -= collInfo->workBytes; - tasks->workBytesTotal -= collInfo->workBytes; - ncclIntruQueueEnqueue(&tasks->collTunedQueue, collInfo); + 
if (comm->rank == 0) { + if (task->isCollnet) { + TRACE(NCCL_COLL, "Collective %s(%s, %s, %s, %s) count=%ld devFuncId=%d channel{Lo..Hi}={%d..%d} count=%ld chunkCount=%d", + ncclFuncToString(task->func), ncclDevRedOpToString(task->opDev.op), + ncclDatatypeToString(task->datatype), ncclAlgoToString(task->algorithm), + ncclProtoToString(task->protocol), + (long)task->count, task->devFuncId, devWork->channelLo, devWork->channelHi, + (long)devWork->collnet.count, devWork->collnet.chunkCount); } else { - ncclIntruQueueEnqueue(&tasks->collCBDQueue, collInfo); + TRACE(NCCL_COLL, "Collective %s(%s, %s, %s, %s) count=%ld devFuncId=%d channel{Lo..Hi}={%d..%d} count{Lo,Mid,Hi}={%ld,%ld,%ld} chunkBytes{Lo,Mid,Hi}={%d,%d,%d}", + ncclFuncToString(task->func), ncclDevRedOpToString(task->opDev.op), + ncclDatatypeToString(task->datatype), ncclAlgoToString(task->algorithm), + ncclProtoToString(task->protocol), + (long)task->count, task->devFuncId, devWork->channelLo, devWork->channelHi, + (long)devWork->cbd.countLo, (long)devWork->cbd.countMid, (long)devWork->cbd.countHi, + int(devWork->cbd.chunkGrainsLo*ncclProtoGrainSize(task->protocol)), + int(devWork->cbd.chunkGrainsMid*ncclProtoGrainSize(task->protocol)), + int(devWork->cbd.chunkGrainsHi*ncclProtoGrainSize(task->protocol))); } } - tasks->usableChannels = std::min(usableChannels, accChannels); + for (int i=0; i < task->nCleanupQueueElts; i++) { + ncclIntruQueueEnqueue(&plan->cleanupQueue, ncclIntruQueueDequeue(&planner->collCleanupQueue)); + } + ncclIntruQueueDequeue(&planner->collTaskQueue); + ncclIntruQueueDequeue(&planner->collWorkQueue); + nPlanColls -= 1; + planner->nTasksColl -= 1; + ncclIntruQueueEnqueue(&plan->workQueue, workNode); + plan->workBytes += workNode->size; } + return ncclSuccess; +} - /* Calculate maxBytesPerChannel for CBD colls and it should be 16 bytes aligned - * Note: it it not hard upper bound for maxBytes, we can relax it if any optimization - * is needed */ - plan->maxBytesPerChannel = DIVUP(DIVUP(totalCBDBytes, tasks->usableChannels), NCCL_BYTES_ALIGNMENT) * NCCL_BYTES_ALIGNMENT; - // First enqueue CBD colls - while (!ncclIntruQueueEmpty(&tasks->collCBDQueue)) { - // Get nChannels and peek whether the budget allows before we enqueue - collInfo = ncclIntruQueueHead(&tasks->collCBDQueue); - collInfo->nChannels = DIVUP(collInfo->workBytes * tasks->usableChannels, totalCBDBytes); - // Haven't got nChannels info yet, relax the budget boundary a bit. - if (*nWorkBudget < collInfo->nChannels) return ncclSuccess; +NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 16384); +NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0); - collInfo = ncclIntruQueueDequeue(&tasks->collCBDQueue); - NCCLCHECK(addCBDCollToPlan(comm, plan, tasks->usableChannels, collInfo, nWorkBudget)); - tasks->nTasksColl -= 1; - tasks->workBytesTotal -= collInfo->count * ncclTypeSize(collInfo->datatype); +// Put p2p op in plan assuming there is sizeof(ncclDevWorkBatch) in batch budget +// and sizeof(ncclDevWorkP2p) in work budget. "sendRank" and "recvRank" must +// match the corresponding values for this round of the p2p schedule (no -1's). +// No-op's are encoded with a -1 size. 
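The -1 convention above is what lets a single work struct describe a round where only one direction is active: a missing direction becomes bytes = -1 (later turned into zero channels), while a genuine zero-byte send/recv keeps bytes = 0 and still costs a one-step synchronization. A minimal sketch of that normalization, using a hypothetical helper name (the scheduler below open-codes the same logic):

// Illustrative only: normalize an optional p2p task into the (addr, bytes)
// form addP2pToPlan() expects. 'task == nullptr' means "no op this round".
static inline void normalizeP2pTask(struct ncclTaskP2p* task, void** addr, ssize_t* bytes) {
  *addr  = task ? task->buff : nullptr;
  *bytes = task ? (ssize_t)task->bytes : -1;  // -1: skip this direction, 0: zero-byte sync
}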
+static ncclResult_t addP2pToPlan( + struct ncclComm* comm, struct ncclKernelPlan* plan, + int nChannelsMin, int nChannelsMax, int p2pRound, + int sendRank, void* sendAddr, ssize_t sendBytes, + int recvRank, void* recvAddr, ssize_t recvBytes + ) { + constexpr int connIndex = 1; + bool selfSend = (sendRank == comm->rank); + // recv: dir=0, send: dir=1 + void* addrs[2] = {recvAddr, sendAddr}; + ssize_t bytes[2] = {recvBytes, sendBytes}; + bool protoLL[2] = {!selfSend, !selfSend}; + bool network[2] = {false, false}; + bool proxySameProcess[2] = {true, true}; + uint8_t base = ncclP2pChannelBaseForRound(comm, p2pRound); + if (!selfSend) { + for (int part=0; part < nChannelsMax; part++) { + int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); + struct ncclChannelPeer** channelPeers = comm->channels[channelId].peers; + for (int dir=0; dir <= 1; dir++) { + int peerRank = dir ? sendRank : recvRank; + struct ncclConnector* conn = dir ? &channelPeers[peerRank]->send[connIndex] + : &channelPeers[peerRank]->recv[connIndex]; + protoLL[dir] &= conn->conn.buffs[NCCL_PROTO_LL] != nullptr; + network[dir] |= conn->transportComm == (dir ? &netTransport.send : &netTransport.recv); + proxySameProcess[dir] &= conn->proxyConn.sameProcess; + } + } } - // Then enqueue collnet colls - while (!ncclIntruQueueEmpty(&tasks->collnetQueue)) { - collInfo = ncclIntruQueueHead(&tasks->collnetQueue); - if (*nWorkBudget < collInfo->nChannels) return ncclSuccess; + ssize_t thresholdLL = nChannelsMax*ncclParamP2pLLThreshold(); + ssize_t paramChunkSize = ncclParamChunkSize(); + // Arrays indexed by dir where recv=0, send=1: + int nChannels[2]; + int protocol[2]; + int stepSize[2]; + int chunkSize[2]; + int chunkDataSize[2]; + int chunkDataSize_u32fp8[2]; + bool registered[2]; + + for (int dir=0; dir < 2; dir++) { // 0=recv, 1=send + if (bytes[dir] != -1) protoLL[dir] &= bytes[dir] <= thresholdLL; + protocol[dir] = protoLL[dir] ? NCCL_PROTO_LL : NCCL_PROTO_SIMPLE; + + stepSize[dir] = comm->buffSizes[protocol[dir]]/NCCL_STEPS; + if (protocol[dir] == NCCL_PROTO_SIMPLE) stepSize[dir] = comm->p2pChunkSize; + chunkSize[dir] = stepSize[dir]; + if (paramChunkSize != 0) { + chunkSize[dir] = paramChunkSize; + } else if (network[dir]) { + // Tune chunk size for the network + if (protocol[dir] == NCCL_PROTO_SIMPLE && bytes[dir] < stepSize[dir]) chunkSize[dir] /= 4; + else if (bytes[dir] < 8*stepSize[dir]) chunkSize[dir] /= 2; + } - collInfo = ncclIntruQueueDequeue(&tasks->collnetQueue); - NCCLCHECK(addCollnetCollToPlan(comm, plan, tasks->usableChannels, collInfo, nWorkBudget)); - tasks->nTasksColl -= 1; + chunkDataSize[dir] = chunkSize[dir]; + if (protocol[dir] == NCCL_PROTO_LL) chunkDataSize[dir] /= 2; + chunkDataSize_u32fp8[dir] = u32fp8Encode(chunkDataSize[dir]); + chunkDataSize[dir] = u32fp8Decode(chunkDataSize_u32fp8[dir]); + chunkSize[dir] = chunkDataSize[dir]; + if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2; + + registered[dir] = false; + if (bytes[dir] > 0 && network[dir] && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) { + struct ncclReg* regRecord; + NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], ®Record)); + registered[dir] = (regRecord && regRecord->nDevs); + } + + if (bytes[dir] == -1) nChannels[dir] = 0; + else if (bytes[dir] == 0) nChannels[dir] = 1; + else { + ssize_t minPartSize = comm->nNodes > 1 ? stepSize[dir]/2 : stepSize[dir]/8; + ssize_t maxPartSize = comm->nNodes > 1 ? 
stepSize[dir] : stepSize[dir]*32; + nChannels[dir] = std::min(nChannelsMin, divUp(bytes[dir], minPartSize)); + size_t partSize = std::max(minPartSize, divUp(bytes[dir], nChannels[dir])); + while (partSize > maxPartSize && nChannels[dir] <= nChannelsMax/2) { + nChannels[dir] *= 2; + partSize = divUp(bytes[dir], nChannels[dir]); + } + } } - // Finally enqueue user-tuned colls - while (!ncclIntruQueueEmpty(&tasks->collTunedQueue)) { - collInfo = ncclIntruQueueHead(&tasks->collTunedQueue); - if (*nWorkBudget < collInfo->nChannels) return ncclSuccess; + struct ncclWorkList* workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + workNode->workType = ncclDevWorkTypeP2p; + workNode->size = sizeof(struct ncclDevWorkP2p); + ncclIntruQueueEnqueue(&plan->workQueue, workNode); + uint32_t workOffset = plan->workBytes; + plan->workBytes += sizeof(struct ncclDevWorkP2p); + + struct ncclDevWorkP2p* work = (struct ncclDevWorkP2p*)(workNode+1); + work->nP2pChannels = comm->p2pnChannels; + work->channelBase = base; + work->nSendChannels = nChannels[1]; + work->sendProtoLL = protoLL[1]; + work->sendRegistered = registered[1]; + work->sendChunkSize_u32fp8 = chunkDataSize_u32fp8[1]; + work->sendRank = sendRank; + work->sendAddr = sendAddr; + work->sendBytes = sendBytes==-1 ? 0 : sendBytes; + work->nRecvChannels = nChannels[0]; + work->recvProtoLL = protoLL[0]; + work->recvRegistered = registered[0]; + work->recvChunkSize_u32fp8 = chunkDataSize_u32fp8[0]; + work->recvRank = recvRank; + work->recvAddr = recvAddr; + work->recvBytes = recvBytes==-1 ? 0 : recvBytes; + + struct ncclProxyOp proxyOps[2] = {}; + int nProxyOps = selfSend ? 0 : 2; + for (int dir=0; dir < nProxyOps; dir++) { + struct ncclProxyOp* op = &proxyOps[dir]; + op->root = dir ? sendRank : recvRank; + op->sliceSteps = 1; + op->chunkSteps = 1; + op->dtype = ncclInt8; + op->redOp = ncclSum; + op->protocol = protocol[dir]; + op->pattern = dir ? ncclPatternSend : ncclPatternRecv; + op->chunkSize = chunkSize[dir]; + op->reg = registered[dir]; + // The following are modified per channel part in addWorkToChannels(): + // op->buffer, op->nbytes, op->nsteps = ...; + } + + nChannelsMax = std::max(nChannels[0], nChannels[1]); + for (int part=0; part < nChannelsMax; part++) { + int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); + plan->channelMask |= uint64_t(1)<nSendChannels : work->nRecvChannels; + void* addr = dir ? work->sendAddr : work->recvAddr; + size_t bytes = dir ? work->sendBytes : work->recvBytes; + + proxyOps[dir].recvbuff = nullptr; + if (nParts <= part) { + proxyOps[dir].nsteps = 0; + } else if (bytes == 0) { + proxyOps[dir].nsteps = 1; + proxyOps[dir].nbytes = 0; + } else { + size_t chunkDataSize = u32fp8Decode(dir ? 
work->sendChunkSize_u32fp8 : work->recvChunkSize_u32fp8); + size_t partBeg, partEnd; + ncclP2pPartBounds(nParts, part, bytes, &partBeg, &partEnd); + if (proxyOps[dir].reg) { + proxyOps[dir].nsteps = 1; + proxyOps[dir].recvbuff = (uint8_t*)addr+partBeg; + proxyOps[dir].nbytes = partEnd-partBeg; + } else { + proxyOps[dir].nsteps = divUp(partEnd-partBeg, chunkDataSize); + proxyOps[dir].nbytes = std::min(partEnd-partBeg, chunkDataSize); + } + if (proxyOps[dir].protocol == NCCL_PROTO_LL) { + proxyOps[dir].nbytes *= 2; + proxyOps[dir].nbytes = roundUp(proxyOps[dir].nbytes, sizeof(union ncclLLFifoLine)); + } + } - collInfo = ncclIntruQueueDequeue(&tasks->collTunedQueue); - NCCLCHECK(addTunedCollToPlan(comm, plan, tasks->usableChannels, collInfo, nWorkBudget)); - tasks->nTasksColl -= 1; + if (proxyOps[dir].nsteps != 0) { + // Calculate the opCount after adding batch since then the batch count will + // equal one plus the batch index this p2p settled in. + proxyOps[dir].channelId = channelId; + proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1; + NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir])); + } + } } return ncclSuccess; } -static size_t calcP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) { +static int calcP2pChannelCount(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) { size_t size = std::max(minSize, divUp(totalSize, minChannels)); int nChannels = minChannels; while (size > maxSize && nChannels <= maxChannels/2) { nChannels *= 2; size = divUp(totalSize, nChannels); } - return alignUp(size, minSize); + return nChannels; } static ncclResult_t scheduleP2pTasksToPlan( - struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget + struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclKernelPlanBudget* budget ) { - struct ncclTasks* tasks = &comm->tasks; int nRanks = comm->nRanks; - struct ncclTasks::Peer* peers = tasks->peers; - int const *sendOrder = tasks->p2pSendOrder; - int const *recvOrder = tasks->p2pRecvOrder; + struct ncclKernelPlanner::Peer* peers = comm->planner.peers; plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MAX_NTHREADS); if (!plan->kernelSpecialized) { @@ -895,264 +1002,213 @@ static ncclResult_t scheduleP2pTasksToPlan( } // Compute how much to split operations - // Natural step size matching buffer steps. - ssize_t stepSize = comm->p2pChunkSize; // Try to use all channels int nChannelsMax = comm->p2pnChannelsPerPeer; int nChannelsMin = nChannelsMax; // Try to use all channels, but one channel per operation. while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2; - bool fuseOk = false; - // We can perform 8 send/recv per round per CTA. Make sure we jump between fused blocks at node boundaries. - while (tasks->nTasksP2p != 0) { - for (int i=0; i < tasks->p2pOrderSteps; i++) { - int sendPeer = sendOrder[i]; - int recvPeer = recvOrder[i]; - struct ncclTaskP2p* send = sendPeer != -1 ? ncclIntruQueueHead(&peers[sendPeer].sendQueue) : NULL; - struct ncclTaskP2p* recv = recvPeer != -1 ? 
ncclIntruQueueHead(&peers[recvPeer].recvQueue) : NULL; - if (sendPeer == comm->rank) { - if (recvPeer != comm->rank) { - WARN("Sendrecv plan not aligned for self"); - return ncclInternalError; - } - if (send && recv == nullptr) { + while (comm->planner.nTasksP2p != 0) { + for (int round=0; round < nRanks; round++) { + int sendRank = comm->p2pSchedule[round].sendRank; + int recvRank = comm->p2pSchedule[round].recvRank; + struct ncclTaskP2p* send = ncclIntruQueueHead(&peers[sendRank].sendQueue); + struct ncclTaskP2p* recv = ncclIntruQueueHead(&peers[recvRank].recvQueue); + if (send == nullptr && recv == nullptr) continue; + + if (sendRank == comm->rank) { + if (send != nullptr && recv == nullptr) { WARN("Trying to send to self without a matching recv"); return ncclInvalidUsage; } - if (send == nullptr && recv) { + if (send == nullptr && recv != nullptr) { WARN("Trying to recv to self without a matching send"); return ncclInvalidUsage; } } - if (send != nullptr || recv != nullptr) { - char* recvPtr = recv ? (char*)recv->buff : nullptr; - char* sendPtr = send ? (char*)send->buff : nullptr; - ssize_t recvBytes = recv ? recv->bytes : 0; - ssize_t sendBytes = send ? send->bytes : 0; - ssize_t minSize = comm->nNodes > 1 ? stepSize/2 : stepSize/8; - ssize_t maxSize = comm->nNodes > 1 ? stepSize : stepSize*32; - ssize_t recvChunkBytesMax = calcP2pChunkSize(recvBytes, nChannelsMin, nChannelsMax, minSize, maxSize); - ssize_t sendChunkBytesMax = calcP2pChunkSize(sendBytes, nChannelsMin, nChannelsMax, minSize, maxSize); - // Zero size send/recv are syncs, encode here with -1. - recvBytes = recv && recvBytes == 0 ? -1 : recvBytes; - sendBytes = send && sendBytes == 0 ? -1 : sendBytes; - // Advance to current chunk. Syncs will always have chunk=0 so no effect on the -1. - if (recv) recvPtr += recv->chunk*recvChunkBytesMax; - if (recv) recvBytes -= recv->chunk*recvChunkBytesMax; - if (send) sendPtr += send->chunk*sendChunkBytesMax; - if (send) sendBytes -= send->chunk*sendChunkBytesMax; - - do { - if ((i % (NCCL_MAX_WORK_ELEMENTS_P2P/2)) == 0) fuseOk = false; - ssize_t recvChunkBytes = std::min(recvBytes, recvChunkBytesMax); // -1 preserved - ssize_t sendChunkBytes = std::min(sendBytes, sendChunkBytesMax); - if (recvChunkBytes != 0) { - if (recvChunkBytes == -1) recvChunkBytes = 0; - if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget - NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes, fuseOk)); - fuseOk = true; - recvPtr += recvChunkBytes; - recvBytes -= recvChunkBytes; - recv->chunk += 1; - if (recvBytes <= 0) { - recvBytes = 0; // in case still -1 - ncclIntruQueueDequeue(&peers[recvPeer].recvQueue); - tasks->nTasksP2p -= 1; - } - } - if (sendChunkBytes != 0) { - if (sendChunkBytes == -1) sendChunkBytes = 0; - if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget - NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/true, sendPeer, send->chunk, sendPtr, sendChunkBytes, fuseOk)); - fuseOk = true; - sendPtr += sendChunkBytes; - sendBytes -= sendChunkBytes; - send->chunk += 1; - if (sendBytes <= 0) { - sendBytes = 0; // in case still -1 - ncclIntruQueueDequeue(&peers[sendPeer].sendQueue); - tasks->nTasksP2p -= 1; - } - } - } while (sendBytes != 0 || recvBytes != 0); + ssize_t sendBytes = send ? send->bytes : -1; + ssize_t recvBytes = recv ? recv->bytes : -1; + void* sendBuff = send ? send->buff : nullptr; + void* recvBuff = recv ? 
recv->buff : nullptr; + + if (sendRank == comm->rank && send->buff == recv->buff) { + // Skip send to self in-place (we don't need to support this). + ncclIntruQueueDequeue(&peers[sendRank].sendQueue); + ncclIntruQueueDequeue(&peers[recvRank].recvQueue); + comm->planner.nTasksP2p -= 2; + } else { + // Ensure room for worst case of one new batch per channel. + if (!testBudget(budget, plan->nWorkBatches+nChannelsMax, plan->workBytes + sizeof(struct ncclDevWorkP2p))) { + return ncclSuccess; + } + NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes)); + if (send != nullptr) { + ncclIntruQueueDequeue(&peers[sendRank].sendQueue); + comm->planner.nTasksP2p -= 1; + } + if (recv != nullptr) { + ncclIntruQueueDequeue(&peers[recvRank].recvQueue); + comm->planner.nTasksP2p -= 1; + } } } } return ncclSuccess; } -// Comparison of monotonic rolling counters. -static inline bool rollingLess32(uint32_t a, uint32_t b) { - constexpr uint32_t PositiveMax = uint32_t(-1)>>1; - return a-b > PositiveMax; -} -static inline uint32_t rollingMin32(uint32_t a, uint32_t b) { - constexpr uint32_t PositiveMax = uint32_t(-1)>>1; - return (b-a <= PositiveMax) ? a : b; -} - -// Spin until its safe to increase comm->workFifoSent to desiredSent. -static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredSent) { - if (__builtin_expect(rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent), false)) { - while (1) { - // We have to poll for notifications from device. - uint32_t* doneLive = comm->workFifoDone; - uint32_t ackd[MAXCHANNELS]; - for (int c=0; c < MAXCHANNELS; c++) { - ackd[c] = __atomic_load_n(&doneLive[c], __ATOMIC_RELAXED); - } - // Compiler-only fence to prevent fusion of loops to encourage dense loads. - __atomic_signal_fence(__ATOMIC_SEQ_CST); - - uint32_t ackdAll = comm->workFifoSent; - for (int c=0; c < MAXCHANNELS; c++) { - // ackdAll is min over all non-quiesced channels - if (ackd[c] != comm->channels[c].workFifoSent) - ackdAll = rollingMin32(ackdAll, ackd[c]); +// Spin until its safe to increase comm->workFifoProduced to desiredProduced. +static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduced) { + bool hasRoom = (desiredProduced - comm->workFifoConsumedLeast) <= comm->workFifoBytes; + if (hasRoom) return; + while (true) { + // We have to poll for notifications from device. + uint32_t* consumedLive = comm->workFifoConsumed; + uint32_t consumed[MAXCHANNELS]; + for (int c=0; c < MAXCHANNELS; c++) { + consumed[c] = __atomic_load_n(&consumedLive[c], __ATOMIC_RELAXED); + } + // Compiler-only fence to prevent fusion of loops to encourage dense loads. + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + uint32_t produced = comm->workFifoProduced; + uint32_t consumedLeast = produced; + for (int c=0; c < MAXCHANNELS; c++) { + // consumedLeast is min over all non-quiesced channels + if (consumed[c] != comm->channels[c].workFifoProduced) { + if ((produced - consumedLeast) < (produced - consumed[c])) { + consumedLeast = consumed[c]; + } } + } - // Compiler only fence to prevent fusion of loops to encourage dense stores. - __atomic_signal_fence(__ATOMIC_SEQ_CST); + // Compiler only fence to prevent fusion of loops to encourage dense stores. + __atomic_signal_fence(__ATOMIC_SEQ_CST); - for (int c=0; c < MAXCHANNELS; c++) { - // Advance counter on quiesced channels so they don't lag behind - // too far where they could get lost in 32-bit wraparound. 
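The reworked waitWorkFifoAvailable() in this hunk drops the old rollingLess32/rollingMin32 helpers and relies directly on unsigned 32-bit subtraction, which stays correct across cursor wraparound as long as producer and consumer never drift apart by more than the fifo size. A self-contained sketch of that invariant; the names here are illustrative, not the comm fields:

#include <stdint.h>
// Room check with monotonically increasing 32-bit byte cursors. The unsigned
// difference is the number of bytes still in flight, computed modulo 2^32,
// which is exact whenever (produced - consumed) never exceeds fifoBytes.
static inline bool fifoHasRoom(uint32_t produced, uint32_t consumedLeast,
                               uint32_t fifoBytes, uint32_t wantBytes) {
  return (produced + wantBytes) - consumedLeast <= fifoBytes;
}
// Example across wraparound: produced = 0xFFFFFF00, consumedLeast = 0xFFFFFE00,
// fifoBytes = 0x1000 -> in-flight = 0x100 <= 0x1000, so there is room even
// though both cursors are about to wrap past zero.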
- if (ackd[c] == comm->channels[c].workFifoSent) { - comm->channels[c].workFifoSent = ackdAll; - __atomic_store_n(&doneLive[c], ackdAll, __ATOMIC_RELAXED); - } + for (int c=0; c < MAXCHANNELS; c++) { + // Advance counter on quiesced channels so they don't lag behind + // too far where they could get lost in 32-bit wraparound. + if (consumed[c] == comm->channels[c].workFifoProduced) { + comm->channels[c].workFifoProduced = consumedLeast; + __atomic_store_n(&consumedLive[c], consumedLeast, __ATOMIC_RELAXED); } - comm->workFifoAckdMin = ackdAll; - - // See if that was enough. - if (!rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent)) break; - sched_yield(); } + comm->workFifoConsumedLeast = consumedLeast; + + hasRoom = (desiredProduced - comm->workFifoConsumedLeast) <= comm->workFifoBytes; + if (hasRoom) break; + sched_yield(); } } static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) { - bool persistent = plan->persistent; - int channelUbound = plan->channelUbound; - int nWork = 0; - for (int c=0; c < channelUbound; c++) nWork += plan->channels[c].nWork; - - struct ncclWork* workHeap; - if (!persistent) { - workHeap = comm->workFifoHeap; - } else { - workHeap = ncclMemoryStackAlloc(&comm->memScoped, nWork); - } - uint32_t ixMask = persistent ? ~uint32_t(0) : comm->workFifoDepth-1; - uint32_t ixSent; - if (persistent) { - ixSent = 0; - } else { - ixSent = comm->workFifoSent; - // First work for a channel has to be at workHeap+blockIdx.x which means - // we cannot tolerate fifo wraparound. So round up to the wrap boundary - // if not doing so would incur crossing it. - if (((ixSent + plan->channelCount-1) & ixMask) < (ixSent & ixMask)) { - ixSent = (ixSent + ixMask) & ~ixMask; - // Need to update workFifoSent so waitWorkFifoAvailable() knows we've - // skipped those elements. Consider if all the channels report quiesced, - // this way the skipped slots will be considered consumed as well. - comm->workFifoSent = ixSent; - } - waitWorkFifoAvailable(comm, ixSent + nWork); - } - uint32_t ixHead = ixSent; - ixSent += plan->channelCount; - int channelsWithWork = 0; // number of channels below `c` with work structs. - for (int c=0; c < channelUbound; c++) { - struct ncclWorkList* q = ncclIntruQueueHead(&plan->channels[c].workQueue); - // Offset of first work equals number of channels below with work. - uint32_t ix = ixHead + channelsWithWork; - channelsWithWork += q != nullptr ? 1 : 0; - while (q != nullptr) { - if (q->next != nullptr) { - q->work.header.workNext = int32_t(ixSent & ixMask) - int32_t(ixHead & ixMask); - } else { - q->work.header.inFifo = !persistent ? 1 : 0; - // Tell channel to ack us back ix+1 indicating that all slots up to and - // including ix have been consumed. 
- q->work.header.doneAcks = ix+1; - comm->channels[c].workFifoSent = ix+1; - } - workHeap[ix & ixMask] = q->work; // C++ struct assignment - q = q->next; - if (q != nullptr) ix = ixSent++; + size_t workBytes = plan->workBytes; + size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch); + void* fifoBuf; + uint32_t fifoCursor, fifoMask; + + switch (plan->workStorageType) { + case ncclDevWorkStorageTypeArgs: + plan->kernelArgs->workBuf = nullptr; + fifoBuf = (void*)plan->kernelArgs; + fifoCursor = sizeof(ncclDevKernelArgs) + batchBytes; + fifoMask = ~0u; + break; + case ncclDevWorkStorageTypeFifo: + fifoBuf = comm->workFifoBuf; + fifoCursor = comm->workFifoProduced; + fifoMask = comm->workFifoBytes-1; + waitWorkFifoAvailable(comm, fifoCursor + workBytes); + plan->kernelArgs->workBuf = comm->workFifoBufDev; + break; + case ncclDevWorkStorageTypePersistent: + ncclMemoryStackPush(&comm->memScoped); + fifoBuf = ncclMemoryStackAlloc(&comm->memScoped, workBytes, /*align=*/16); + fifoCursor = 0; + fifoMask = ~0u; + break; + default: + return ncclInternalError; + } + plan->kernelArgs->workMask = fifoMask; + + // Batches were placed after kernelArgs by finishPlan(). Only thing left to + // do is translate the work offset from zero based (in plan) to: + // ncclDevWorkStorageTypeArgs: offset from beginning of kernel args + // ncclDevWorkStorageTypeFifo: offset from base of fifo + // ncclDevWorkStorageTypePersistent: no translation since our dedicated buffer will also begin at zero. + struct ncclDevWorkBatch* batchZero = (struct ncclDevWorkBatch*)(plan->kernelArgs+1); + for (int b=0; b < plan->nWorkBatches; b++) { + batchZero[b].offsetBase += fifoCursor; + } + + // Write the channel-shared work structs. + struct ncclWorkList* workNode = ncclIntruQueueHead(&plan->workQueue); + while (workNode != nullptr) { + char* dst = (char*)fifoBuf; + char* src = (char*)(workNode+1); + for (int n = workNode->size; n != 0; n -= 16) { + memcpy( + __builtin_assume_aligned(dst + (fifoCursor & fifoMask), 16), + __builtin_assume_aligned(src, 16), + 16 + ); + fifoCursor += 16; + src += 16; } + workNode = workNode->next; } - if (!persistent) { - comm->workFifoSent = ixSent; - if (comm->workFifoHeapGdrHandle != nullptr) wc_store_fence(); - plan->workHead = &comm->devWorkFifoHeap[ixHead & ixMask]; - } else { - NCCLCHECK(ncclCudaMalloc(&plan->workHead, nWork)); - NCCLCHECK(ncclCudaMemcpy(plan->workHead, workHeap, nWork)); + switch (plan->workStorageType) { + case ncclDevWorkStorageTypeFifo: + comm->workFifoProduced = fifoCursor; + if (comm->workFifoBufGdrHandle != nullptr) wc_store_fence(); + break; + case ncclDevWorkStorageTypePersistent: + NCCLCHECK(ncclCudaMalloc(&plan->workBufPersistent, workBytes)); + plan->kernelArgs->workBuf = plan->workBufPersistent; + NCCLCHECK(ncclCudaMemcpy(plan->workBufPersistent, fifoBuf, workBytes)); + ncclMemoryStackPop(&comm->memScoped); + break; + default: break; } return ncclSuccess; } static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* plan) { uint64_t collOpCount = comm->sharedRes->collOpCount; + uint64_t p2pOpBump[MAXCHANNELS] = {/*0...*/}; // Advance comm's collOpCount by number of colls in this plan. comm->sharedRes->collOpCount += plan->collOpCount; - uint64_t p2pOpBump[MAXCHANNELS]; - struct ncclProxyOp* heads[MAXCHANNELS]; - uint64_t headIds[MAXCHANNELS]; - int nHeads = 0; - for (int c=0; c < plan->channelUbound; c++) { - p2pOpBump[c] = 0; - heads[c] = ncclIntruQueueHead(&plan->channels[c].proxyOpQueue); - nHeads += (heads[c] != nullptr) ? 
1 : 0; - headIds[c] = (heads[c] != nullptr) ? heads[c]->opCount : uint64_t(-1); - } - - while (nHeads != 0) { - int minChan = -1; - uint64_t minId = uint64_t(-1); - // We store the heads[c]->opCount in headIds[c] specifically to remove indirect - // loads from this loop which speeds it up considerably. - for (int c=0; c < plan->channelUbound; c++) { - uint64_t id = headIds[c]; - id = (id>>1 | id<<63); // Move tag bit to order collectives before p2p's - if (id < minId) { minChan = c; minId = id; } - } - - struct ncclProxyOp* q = heads[minChan]; - uint64_t oldId = headIds[minChan]; // same as q->opCount - // Advance heads[c] - heads[minChan] = q->enqNext; - if (q->enqNext == nullptr) nHeads -= 1; - headIds[minChan] = (q->enqNext != nullptr) ? q->enqNext->opCount : uint64_t(-1); - + struct ncclProxyOp* op = ncclIntruQueueHead(&plan->proxyOpQueue); + while (op != nullptr) { + uint64_t oldId = op->opCount; // Ignoring the bottom tag bit, opCount's are zero-based within plan so // translate them to the tip of the comm's history. if (oldId & 1) { // p2p // opCount is monotonic increasing within a plan's channel so just // remember last value to compute max. - p2pOpBump[minChan] = (oldId>>1) + 1; // +1 to ensure next plan doesn't collide - q->opCount = (comm->sharedRes->p2pOpCount[minChan]<<1) + oldId; + p2pOpBump[op->channelId] = (oldId>>1) + 1; // +1 to ensure next plan doesn't collide + op->opCount = (comm->sharedRes->p2pOpCount[op->channelId]<<1) + oldId; } else { // coll - q->opCount = (collOpCount<<1) + oldId; + op->opCount = (collOpCount<<1) + oldId; } - NCCLCHECK(ncclProxySaveOp(comm, q, nullptr)); - q->opCount = oldId; // Restore for next uploadProxyOps() + NCCLCHECK(ncclProxySaveOp(comm, op, nullptr)); + op->opCount = oldId; // Restore for next uploadProxyOps() + + struct ncclProxyOp* opNext = op->enqNext; if (!plan->persistent) { // Non-persistent kernels upload ops only once so can be free'd here. - ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q); + ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, op); } + op = opNext; } - for (int c=0; c < plan->channelUbound; c++) { - // Erase proxyOpQueue since all ops were free'd back to mempool. - if (!plan->persistent) ncclIntruQueueConstruct(&plan->channels[c].proxyOpQueue); + // Erase proxyOpQueue since all ops were free'd back to mempool. + if (!plan->persistent) ncclIntruQueueConstruct(&plan->proxyOpQueue); + + for (int c=0; c < MAXCHANNELS; c++) { // Advance channel's p2pOpCount by number of p2p's in this plan channel. 
comm->sharedRes->p2pOpCount[c] += p2pOpBump[c]; } @@ -1182,33 +1238,20 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim` if (plan->persistent) { comm->persistentRefs -= 1; - NCCLCHECK(ncclCudaFree(plan->workHead)); - for (int c=0; c < plan->channelUbound; c++) { - struct ncclProxyOp* q = ncclIntruQueueHead(&plan->channels[c].proxyOpQueue); - while (q != nullptr) { - struct ncclProxyOp* q1 = q->enqNext; - ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q); - q = q1; - } - } - while (!ncclIntruQueueEmpty(&plan->ipcMemQueue)) { - struct ncclPointerList* q = ncclIntruQueueDequeue(&plan->ipcMemQueue); - CUDACHECKIGNORE(cudaIpcCloseMemHandle(q->ptr)); - ncclMemoryPoolFree(&comm->memPool_ncclPointerList, q); - } - /* free mcHandle */ - while (!ncclIntruQueueEmpty(&plan->nvlsMcHandleQueue)) { - struct ncclNvlsMcHandleList* obj = ncclIntruQueueDequeue(&plan->nvlsMcHandleQueue); - NCCLCHECK(ncclNvlsDeregBuffer(&obj->mcHandle, obj->ptr, obj->dev, obj->size)); - INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)obj->ptr, obj->dev, obj->size); - ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, obj); + NCCLCHECK(ncclCudaFree(plan->workBufPersistent)); + struct ncclProxyOp* q = ncclIntruQueueHead(&plan->proxyOpQueue); + while (q != nullptr) { + struct ncclProxyOp* q1 = q->enqNext; + ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q); + q = q1; } - while (!ncclIntruQueueEmpty(&plan->collnetHandleQueue)) { - struct ncclCollnetHandleList* obj = ncclIntruQueueDequeue(&plan->collnetHandleQueue); - NCCLCHECK(ncclCollnetDeregBuffer(comm, obj->proxyconn, obj->collnetHandle)); - INFO(NCCL_REG, "rank %d - deregistered collnet buffer handle %p, size %ld, buff %p", comm->rank, obj->collnetHandle, obj->size, obj->buffer); - ncclMemoryPoolFree(&comm->memPool_ncclCollnetHandleList, obj); + ncclResult_t result = ncclSuccess; + while (!ncclIntruQueueEmpty(&plan->cleanupQueue)) { + struct ncclCommCallback* cb = ncclIntruQueueDequeue(&plan->cleanupQueue); + ncclResult_t res1 = cb->fn(comm, cb); // Expect to reclaim memory of cb + if (res1 != ncclSuccess) result = res1; } + NCCLCHECK(result); } ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); return ncclSuccess; @@ -1226,57 +1269,54 @@ static void persistentDestructor(void* plans_) { ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { ncclResult_t result = ncclSuccess; - struct ncclTasks* tasks = &comm->tasks; - bool persistent = ncclCudaGraphValid(tasks->capturingGraph); + struct ncclKernelPlanner* planner = &comm->planner; + bool persistent = ncclCudaGraphValid(planner->capturingGraph); + planner->persistent = persistent; int nPlans = 0; // Poll for callbacks sent to us from other threads. Typically these free // resources from to our memory pools. NCCLCHECK(ncclCommPollCallbacks(comm, /*waitSome=*/false)); - // We already have one frame present which holds all of our tasks (which we - // are about to schedule). Now push an additional frame for allocating - // work structs (see appendWorkElem() variants all use scoped allocation). 
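reclaimPlan() above now drains a generic cleanupQueue of ncclCommCallback nodes instead of hard-coding IPC, NVLS and collnet deregistration; each node is expected to release its resource and reclaim its own storage when invoked. A sketch of what one such callback could look like, with a hypothetical node type and malloc/free lifetime that are not taken from this patch:

// Illustrative cleanup node: closes a cudaIpc mapping when the owning plan is
// reclaimed. The ncclCommCallback must be the first member because the queue
// hands back plain ncclCommCallback pointers.
struct ipcCleanupNode {
  struct ncclCommCallback base;
  void* ipcPtr;   // pointer previously returned by cudaIpcOpenMemHandle()
};
static ncclResult_t ipcCleanupFn(struct ncclComm* comm, struct ncclCommCallback* cb) {
  struct ipcCleanupNode* node = (struct ipcCleanupNode*)cb;
  CUDACHECKIGNORE(cudaIpcCloseMemHandle(node->ipcPtr));
  free(node);  // "Expect to reclaim memory of cb": the callback frees its own node
  return ncclSuccess;
}
// Registration side (when the buffer was set up), assuming a malloc'd node:
//   node->base.fn = ipcCleanupFn;
//   ncclIntruQueueEnqueue(&plan->cleanupQueue, &node->base);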
- ncclMemoryStackPush(&comm->memScoped); - - if (tasks->nTasksColl + tasks->nTasksP2p != 0) { + if (planner->nTasksColl + planner->nTasksP2p != 0) { do { + memset(&planner->wipPlan, 0, sizeof(planner->wipPlan)); + struct ncclKernelPlan* plan = ncclMemoryPoolAlloc(&comm->memPool_ncclKernelPlan, &comm->memPermanent); - ncclIntruQueueEnqueue(&comm->planQueue, plan); - nPlans += 1; plan->comm = comm; plan->reclaimer.fn = reclaimPlan; plan->persistent = persistent; + // uploadWork() promotes ncclDevWorkStorageType[Fifo|Buf]->Args if the work can fit. + plan->workStorageType = persistent ? ncclDevWorkStorageTypePersistent + : ncclDevWorkStorageTypeFifo; + struct ncclKernelPlanBudget budget; + budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs); // Non-persistent kernels fill up at most half of our fifo per kernel. - int nWorkBudget = plan->persistent ? INT_MAX : comm->workFifoDepth/2; - int nWorkBudgetOld = nWorkBudget; + budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2; // Drain coll tasks first. This is essential since we partition tasks based // on the work budget and p2p work isn't collective. If we were to drain p2p // first, the place where we cut the kernel could vary by rank which would // cause the "shortest channel first" channel picker to have divergent results. - if (tasks->nTasksColl != 0) { - NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &nWorkBudget), result, failure); + if (planner->nTasksColl != 0) { + NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure); } // And only drain p2p tasks once colls are depleted. - if (tasks->nTasksColl == 0 && tasks->nTasksP2p != 0) { - NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &nWorkBudget), result, failure); + if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) { + NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure); } - if (nWorkBudget == nWorkBudgetOld) { - // We weren't able to fit any tasks into our budget which means now we're - // stuck in an infinite loop. We defer this check until here, instead of - // doing it in comm init, to permit testing with insanely shallow queues - // for cases where that's expected to still work (e.g. few channels). - WARN("'NCCL_WORK_FIFO_DEPTH=%d' is too small. Minimum value is %d", comm->workFifoDepth, 2*MAXCHANNELS); - result = ncclInvalidUsage; - goto failure; + finishPlan(comm, plan); + if (plan->workBytes != 0) { + ncclIntruQueueEnqueue(&planner->planQueue, plan); + nPlans += 1; } - finishPlan(plan); - } while (tasks->nTasksColl + tasks->nTasksP2p != 0); + } while (planner->nTasksColl + planner->nTasksP2p != 0); + + struct ncclKernelPlan* planHead = ncclIntruQueueHead(&planner->planQueue); + planner->unlaunchedPlansHead = planHead; - struct ncclKernelPlan* planHead = ncclIntruQueueHead(&comm->planQueue); - comm->unlaunchedPlansHead = planHead; + if (nPlans == 0) return ncclSuccess; // Semantically we want these dependencies for the kernels launched: // 1. Launch host task on hostStream. @@ -1292,15 +1332,15 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { // 7. userStream[1...] each waits on deviceStream // The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires // at least one of the two streams to be strong-stream. 
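The numbered dependency list above is easiest to picture with plain CUDA events; NCCL's ncclStrongStream helpers implement the same shape while also handling graph capture, so the following is only an approximation of the idea, not the actual mechanism:

// Rough sketch of the fan-in half using ordinary CUDA events.
// userStreams[0] is the launch stream; deviceStream stands in for
// comm->sharedRes->deviceStream.
static ncclResult_t fanInExample(cudaStream_t* userStreams, int nUserStreams, cudaStream_t deviceStream) {
  cudaEvent_t ev;
  CUDACHECK(cudaEventCreateWithFlags(&ev, cudaEventDisableTiming));
  for (int i = 1; i < nUserStreams; i++) {        // extra user streams -> deviceStream
    CUDACHECK(cudaEventRecord(ev, userStreams[i]));
    CUDACHECK(cudaStreamWaitEvent(deviceStream, ev, 0));
  }
  CUDACHECK(cudaEventRecord(ev, deviceStream));   // deviceStream -> launch stream
  CUDACHECK(cudaStreamWaitEvent(userStreams[0], ev, 0));
  CUDACHECK(cudaEventDestroy(ev));                // safe: destruction is deferred past pending waits
  // Kernels then launch on userStreams[0]; ncclLaunchFinish() reverses the
  // pattern so the remaining user streams wait on the launched work (fan-out).
  return ncclSuccess;
}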
- cudaStream_t launchStream = tasks->streams->stream; - NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, failure); + cudaStream_t launchStream = planner->streams->stream; + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream), result, failure); // Create dependency for device stream on user streams. First from extra user // streams to deviceStream. Then deviceStream to first user stream. - for (struct ncclCudaStreamList* l=tasks->streams->next; l != nullptr; l = l->next) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure); + for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { + NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure); } - NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure); if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking) { // We have to launch host tasks to push proxy args. We are careful to only @@ -1310,28 +1350,24 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { if (plan->hasProxyOps) { if (!acquired) { acquired = true; - NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); } - NCCLCHECKGOTO(ncclStrongStreamLaunchHost(tasks->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure); + NCCLCHECKGOTO(ncclStrongStreamLaunchHost(planner->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure); } } if (acquired) { // Make to-be-launched kernels dependent on just-launched host stream tasks. 
- NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure); - NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); } } if (persistent) { comm->persistentRefs += nPlans; - NCCLCHECKGOTO(ncclCudaGraphAddDestructor(tasks->capturingGraph, persistentDestructor, (void*)planHead), result, failure); + NCCLCHECKGOTO(ncclCudaGraphAddDestructor(planner->capturingGraph, persistentDestructor, (void*)planHead), result, failure); } } - - if (false) { - failure: - ncclMemoryStackPop(&comm->memScoped); // deallocate ncclWork's - } +failure: return result; } @@ -1349,13 +1385,21 @@ NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote); #endif ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) { - struct ncclTasks* tasks = &comm->tasks; - void *fn = plan->kernelFn; - cudaStream_t launchStream = tasks->streams->stream; - dim3 grid = {(unsigned)plan->channelCount, 1, 1}; + struct ncclKernelPlanner* planner = &comm->planner; + int nChannels = countOneBits(plan->channelMask); + void* sym = plan->kernelFn; + dim3 grid = {(unsigned)nChannels, 1, 1}; dim3 block = {(unsigned)plan->threadPerBlock, 1, 1}; - size_t smem = ncclShmemDynamicSize(comm->cudaArch); - void *args[3] = {&comm->devComm, &plan->channelMask, &plan->workHead}; + int smem = ncclShmemDynamicSize(comm->cudaArch); + cudaStream_t launchStream = planner->streams->stream; + void* extra[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, plan->kernelArgs, + CU_LAUNCH_PARAM_BUFFER_SIZE, &plan->kernelArgsSize, + CU_LAUNCH_PARAM_END + }; + + CUfunction fn; + CUDACHECK(cudaGetFuncBySymbol(&fn, sym)); #if CUDART_VERSION >= 11080 int driverVersion; @@ -1364,8 +1408,8 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan int compCap = comm->compCap; unsigned int clusterSize = (compCap == 90) ? 
comm->config.cgaClusterSize : 0; - cudaLaunchConfig_t launchConfig = {0}; - cudaLaunchAttribute launchAttrs[3]; + CUlaunchConfig launchConfig = {0}; + CUlaunchAttribute launchAttrs[3]; int attrs = 0; /* Cooperative Group Array (CGA) * On sm90 and later we have an extra level of hierarchy where we @@ -1380,31 +1424,37 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan if (clusterSize) { // Grid dimension must be divisible by clusterSize if (grid.x % clusterSize) clusterSize = 1; - launchAttrs[attrs].id = cudaLaunchAttributeClusterDimension; - launchAttrs[attrs++].val.clusterDim = {clusterSize, 1, 1}; - launchAttrs[attrs].id = cudaLaunchAttributeClusterSchedulingPolicyPreference; - launchAttrs[attrs++].val.clusterSchedulingPolicyPreference = cudaClusterSchedulingPolicySpread; + launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; + launchAttrs[attrs++].value.clusterDim = {clusterSize, 1, 1}; + launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE; + launchAttrs[attrs++].value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD; } #if CUDART_VERSION >= 12000 if (compCap >= 90 && driverVersion >= 12000) { // Set the NCCL Mem Sync domain on CUDA 12.0 and later (sm90) - launchAttrs[attrs].id = cudaLaunchAttributeMemSyncDomain; - launchAttrs[attrs++].val.memSyncDomain = (cudaLaunchMemSyncDomain) ncclParamMemSyncDomain(); + launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN; + launchAttrs[attrs++].value.memSyncDomain = (CUlaunchMemSyncDomain) ncclParamMemSyncDomain(); } #endif - launchConfig.gridDim = grid; - launchConfig.blockDim = block; - launchConfig.dynamicSmemBytes = smem; + launchConfig.gridDimX = grid.x; + launchConfig.gridDimY = grid.y; + launchConfig.gridDimZ = grid.z; + launchConfig.blockDimX = block.x; + launchConfig.blockDimY = block.y; + launchConfig.blockDimZ = block.z; + launchConfig.sharedMemBytes = smem; launchConfig.attrs = launchAttrs; launchConfig.numAttrs = attrs; - launchConfig.stream = launchStream; + launchConfig.hStream = launchStream; - CUDACHECK(cudaLaunchKernelExC(&launchConfig, fn, args)); + //CUDACHECK(cudaLaunchKernelExC(&launchConfig, fnAddr, args)); + CUCHECK(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra)); return ncclSuccess; } #endif // Standard kernel launch - CUDACHECK(cudaLaunchKernel(fn, grid, block, args, smem, launchStream)); + CUCHECK(cuLaunchKernel(fn, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra)); + //CUDACHECK(cudaLaunchKernel(fnAddr, grid, block, args, smem, launchStream)); return ncclSuccess; } @@ -1426,35 +1476,30 @@ ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKern ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { ncclResult_t result = ncclSuccess; - struct ncclTasks* tasks = &comm->tasks; - tasks->workBytesTotal = 0; // Just in case subtraction during scheduleCollTasksToPlan() doesn't get to 0 - - // Deallocate ncclWork's. This frame exists so long as ncclLaunchPrepare - // succeeded, and if it ncclLaunchPrepare didn't succeed we wouldn't be here. - ncclMemoryStackPop(&comm->memScoped); + struct ncclKernelPlanner* planner = &comm->planner; - if (!ncclIntruQueueEmpty(&comm->planQueue)) { + if (!ncclIntruQueueEmpty(&planner->planQueue)) { // Reset queue to empty without destroying plans since those will be sent // back to us for reclaiming via callbackQueue. 
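The launch path earlier in this hunk switches to the CUDA driver API so the whole ncclDevKernelArgs block can be handed over through the 'extra' parameter buffer rather than as individual kernel parameters. A standalone sketch of that mechanism with a hypothetical argument block, not NCCL's:

// Launch a kernel whose parameters are packed into one contiguous block and
// passed via CU_LAUNCH_PARAM_BUFFER_POINTER / _SIZE. kernelParams must be NULL
// whenever 'extra' is used.
static ncclResult_t launchWithArgBlock(CUfunction fn, unsigned gridX, unsigned blockX,
                                       unsigned smemBytes, cudaStream_t stream,
                                       void* argBlock, size_t argBytes) {
  void* extra[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, argBlock,
    CU_LAUNCH_PARAM_BUFFER_SIZE,    &argBytes,
    CU_LAUNCH_PARAM_END
  };
  CUCHECK(cuLaunchKernel(fn, gridX, 1, 1, blockX, 1, 1, smemBytes, stream,
                         /*kernelParams=*/NULL, extra));
  return ncclSuccess;
}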
- ncclIntruQueueConstruct(&comm->planQueue); - cudaStream_t launchStream = tasks->streams->stream; // First user stream gets launch + ncclIntruQueueConstruct(&planner->planQueue); + cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch // Create dependency for deviceStream on launchStream. We know that deviceStream // hasn't been modified since launchStream waited on it (in ncclLaunchPrepare), // so we can say that launchStream subsumes it. - NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1); + NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1); resume1: // Create dependency for other user streams (skip launch stream) on deviceStream. // Again, the user streams haven't been touched since deviceStream waited on them // so we can say they are subsumed by deviceStream. - struct ncclCudaStreamList* sl = tasks->streams->next; - tasks->streams = nullptr; // Reset comm->tasks.streams to empty. + struct ncclCudaStreamList* sl = planner->streams->next; + planner->streams = nullptr; // Reset comm->planner.streams to empty. while (sl != nullptr) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2); + NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2); resume2: sl = sl->next; } // Release device stream as acquired in ncclLaunchPrepare() - NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, resume3); + NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream), result, resume3); resume3:; } return result; @@ -1464,15 +1509,20 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { /* Enqueueing system : computation of kernel and proxy operations parameters */ /*****************************************************************************/ -static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetSupport) { +static inline ncclResult_t getCollNetSupport( + struct ncclComm* comm, struct ncclTaskColl* info, int* collNetSupport + ) { // Translate ncclAvg and PreMulSum - ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op; - *collNetSupport = info->comm->collNetSupport; - switch (info->coll) { + ncclRedOp_t netOp = info->opHost; + if (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv) { + netOp = ncclSum; + } + *collNetSupport = comm->collNetSupport; + switch (info->func) { case ncclFuncAllReduce: case ncclFuncReduce: case ncclFuncReduceScatter: - *collNetSupport &= info->comm->collNetSupportMatrix[netOp][info->datatype]; + *collNetSupport &= comm->collNetSupportMatrix[netOp][info->datatype]; break; default: break; @@ -1480,339 +1530,329 @@ static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNet return ncclSuccess; } +static void initCollCostTable(float** collCostTable) { + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) { + for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) { + table[a][p] = NCCL_ALGO_PROTO_IGNORE; + } + } +} + // numPipeOps: number of pipelined ops. Can be greater than 1 in aggregation mode. 
Used to adjust latency. -static ncclResult_t topoGetAlgoInfo(struct ncclInfo* collInfo, int collNetSupport, int nvlsSupport, int numPipeOps) { - struct ncclComm* comm = collInfo->comm; +static ncclResult_t updateCollCostTable( + struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes, + int collNetSupport, int nvlsSupport, int numPipeOps, + float** collCostTable, int* backupAlgo, int* backupProto, float* backupTime + ) { + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + if (comm->nRanks == 1) { - collInfo->algorithm = NCCL_ALGO_RING; - collInfo->protocol = NCCL_PROTO_SIMPLE; - } - else if (collInfo->algorithm == NCCL_ALGO_UNDEF || collInfo->protocol == NCCL_PROTO_UNDEF) { - float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete. - float backupMinTime = 3600000000.0; - bool backup = false; - int backupAlgo = NCCL_ALGO_UNDEF; // back up algo and proto if no algo/proto is picked up. - int backupProto = NCCL_PROTO_UNDEF; - // Find algorithm / protocol. - collInfo->algorithm = -1; - collInfo->protocol = -1; - int nAlgos = NCCL_NUM_ALGORITHMS; - for (int a=0; anNodes > 1) continue; - /* now we only support single-node NVLS allgather and reducescatter */ - if (a == NCCL_ALGO_NVLS && (collInfo->coll == ncclFuncAllGather || collInfo->coll == ncclFuncReduceScatter) && comm->nNodes > 1) continue; - - for (int p=0; p= 0 && time < minTime) { - collInfo->algorithm = a; - collInfo->protocol = p; - minTime = time; - } - } else { - if (time >= 0 && time < backupMinTime) { - backupAlgo = a; - backupProto = p; - backupMinTime = time; - } - } - } - } + table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; + return ncclSuccess; + } - if (collInfo->algorithm == NCCL_ALGO_UNDEF || collInfo->protocol == NCCL_PROTO_UNDEF) { - if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) { - WARN("Error : no algorithm/protocol available"); - return ncclInternalError; + for (int a=0; afunc != ncclFuncAllGather) continue; + if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue; + /* now we only support single-node NVLS allgather and reducescatter */ + if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue; + for (int p=0; pfunc, a, p, nBytes, numPipeOps, &time, &backup)); + if (!backup) { + table[a][p] = time; + } else { + if (time >= 0.0 && time < *backupTime) { + *backupAlgo = a; + *backupProto = p; + *backupTime = time; + } } - collInfo->algorithm = backupAlgo; - collInfo->protocol = backupProto; } - if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", collInfo->nBytes, collInfo->algorithm, collInfo->protocol, minTime); - TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", collInfo->nBytes, collInfo->algorithm, collInfo->protocol, minTime); } return ncclSuccess; } -// Use the default topo-based tuner if tuner plugin is not successful. -// Call the plugin first. Let it set algo+proto, and/or nChannels. -// Then, topoGetAlgoInfo will set algo/proto if not set, then nChannels and nThreads based on algo/proto. -// Finally, nChannels will be overriden by the plugin setting. 
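For reference, the collCostTable handed to the tuner plugin is a dense [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] array of predicted times: updateCollCostTable() above fills the combinations it considers valid, topoGetAlgoInfo() below scans for the minimum, and entries left at the NCCL_ALGO_PROTO_IGNORE sentinel never win the comparison, which is how a plugin can veto combinations without touching the rest. A simplified sketch of the selection step, not the exact code:

// Pick the cheapest (algorithm, protocol) pair from the cost table. Entries
// never filled, or disabled by the tuner, fail the bounds test and are skipped.
static void pickCheapest(float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS],
                         int* algo, int* proto) {
  float best = 3600000000.0f;  // same "one hour" sentinel used by topoGetAlgoInfo()
  *algo = NCCL_ALGO_UNDEF;
  *proto = NCCL_PROTO_UNDEF;
  for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) {
    for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) {
      float t = table[a][p];
      if (t >= 0.0f && t < best) { best = t; *algo = a; *proto = p; }
    }
  }
}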
-static ncclResult_t getTunerInfo(struct ncclInfo* collInfo, int collNetSupport, int nvlsSupport, int numPipeOps) { - collInfo->algorithm = NCCL_ALGO_UNDEF; - collInfo->protocol = NCCL_PROTO_UNDEF; - collInfo->nChannels = 0; - if (collInfo->comm->tuner != NULL) { - NCCLCHECK(collInfo->comm->tuner->getCollInfo( - collInfo->comm->tunerContext, collInfo->coll, collInfo->nBytes, - collNetSupport, nvlsSupport, numPipeOps, - &collInfo->algorithm, &collInfo->protocol, &collInfo->nChannels)); - } - - /* We only honor nChannels decision when user sets the nChannels by tuner plugin or the coll picks - * collnet algorithm. For other cases, we need to decide nChannels based on the maxBytesPerChannel */ - if (collInfo->nChannels != 0) - collInfo->userTuned = true; - else - collInfo->userTuned = false; - return ncclSuccess; -} - -/* Compute nChannels and nThreads. */ -static ncclResult_t getChannnelThreadInfo(struct ncclInfo* collInfo) { - struct ncclComm *comm = collInfo->comm; - int nc = comm->collChannels; - int nt = comm->maxThreads[collInfo->algorithm][collInfo->protocol]; - int threadThreshold = comm->threadThresholds[collInfo->algorithm][collInfo->protocol]; - - if (collInfo->nChannels == 0) { - /* not preset by users */ - if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT) { - // CollNet channel tuning - int ncSwitch = 16; - bool flag = true; - while (ncSwitch >= 1 && flag) { - while ((flag = collInfo->nBytes < nc * nt * collInfo->comm->channels[0].collnetDirect.nHeads * threadThreshold) && nc > ncSwitch) { - if (nc == ncSwitch + ncSwitch / 2) threadThreshold /= 2; - nc--; - } - ncSwitch /= 2; - } - } else if (collInfo->algorithm == NCCL_ALGO_NVLS || collInfo->algorithm == NCCL_ALGO_NVLS_TREE) { - // NVLS should not need more than 16 channels to get peak BW. 
- nc = comm->nvlsChannels; - } else { - // Ring/Tree channel tuning - while (collInfo->nBytes < nc * nt * threadThreshold) { - if (nc >= 2) nc--; - else break; +static ncclResult_t topoGetAlgoInfo( + struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes, + float** collCostTable, int backupAlgo, int backupProto, float backupTime, ncclSimInfo_t* simInfo + ) { + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + + float minTime = 3600000000.0; + int algorithm = info->algorithm = NCCL_ALGO_UNDEF; + int protocol = info->protocol = NCCL_PROTO_UNDEF; + for (int a=0; a= 0.0 && table[a][p] < minTime) { + algorithm = a; + protocol = p; + minTime = table[a][p]; } } - collInfo->nChannels = nc; - } else { - nc = collInfo->nChannels; } - if (collInfo->nThreads == 0) { - if (collInfo->algorithm != NCCL_ALGO_NVLS && collInfo->algorithm != NCCL_ALGO_NVLS_TREE && - collInfo->algorithm != NCCL_ALGO_COLLNET_DIRECT) { - while (collInfo->nBytes < nc * nt * threadThreshold) { - if (nt % 128 == 0) nt /= 2; - else break; + info->algorithm = algorithm; + info->protocol = protocol; + float time = minTime; + + if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) { + if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) { + WARN("Error : no algorithm/protocol available"); + return ncclInternalError; + } + info->algorithm = backupAlgo; + info->protocol = backupProto; + time = backupTime; + } + if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time); + if (simInfo) simInfo->estimatedTime = time; + TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time); + + int nc = comm->nChannels; + int nt = comm->maxThreads[info->algorithm][info->protocol]; + int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol]; + if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { + // CollNet channel tuning + int ncSwitch = 16; + bool flag = true; + while (ncSwitch >= 1 && flag) { + while ((flag = nBytes < nc*nt*comm->channels[0].collnetDirect.nHeads*threadThreshold) && nc > ncSwitch) { + if (nc == ncSwitch+ncSwitch/2) threadThreshold /= 2; + nc--; } + ncSwitch /= 2; } - - if (collInfo->protocol == NCCL_PROTO_SIMPLE) { - if (collInfo->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync - // More threads or sync warps needed due to split thread model - if (collInfo->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE; + } else if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) { + // NVLS should not need more than 16 channels to get peak BW. + nc = comm->nvlsChannels; + } else { + // Ring/Tree channel tuning + while (nBytes < nc * nt * threadThreshold) { + if (nc >= 2) nc--; + else break; } - nt = nt / WARP_SIZE < 3 ? 3 * WARP_SIZE : nt; - collInfo->nThreads = nt; } - return ncclSuccess; -} - -static ncclResult_t getPatternInfo(struct ncclInfo* collInfo) { - switch (collInfo->coll) { - case ncclFuncBroadcast: - collInfo->pattern = collInfo->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; break; - case ncclFuncReduce: - collInfo->pattern = collInfo->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break; - case ncclFuncReduceScatter: - case ncclFuncAllGather: - collInfo->pattern = - collInfo->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls : - collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT ? 
ncclPatternCollnetDirect : - ncclPatternRing; break; - case ncclFuncAllReduce: - collInfo->pattern = - collInfo->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls : - collInfo->algorithm == NCCL_ALGO_NVLS_TREE ? ncclPatternNvlsTree : - collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : - collInfo->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain : - collInfo->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : - ncclPatternRingTwice; break; - default: - WARN("Unknown pattern for collective %d algorithm %d", collInfo->coll, collInfo->algorithm); - return ncclInternalError; + if (info->algorithm != NCCL_ALGO_NVLS && info->algorithm != NCCL_ALGO_NVLS_TREE && + info->algorithm != NCCL_ALGO_COLLNET_DIRECT) { + while (nBytes < nc * nt * threadThreshold) { + if (nt % 128 == 0) nt /= 2; + else break; + } } - return ncclSuccess; -} - -static ncclResult_t computeCollWorkFunc(struct ncclInfo* collInfo) { - collInfo->workFuncIndex = ncclDevFuncId(collInfo->coll, collInfo->opFull.op, collInfo->datatype, collInfo->algorithm, collInfo->protocol); - return ncclSuccess; -} - -static ncclResult_t initCollWorkElem(struct ncclInfo* collInfo, struct ncclWorkElem* work) { - work->sendbuff = collInfo->sendbuff; - work->recvbuff = collInfo->recvbuff; - work->root = collInfo->root; - work->count = collInfo->count; - work->nWarps = collInfo->nThreads / WARP_SIZE; - work->redOpArg = collInfo->opFull.scalarArg; - work->redOpArgIsPtr = collInfo->opFull.scalarArgIsPtr; - work->chunkCount = collInfo->chunkCount; - work->regUsed = 0; - work->isUsed = 1; - - if (collInfo->comm->nNodes == 1) - work->oneNode = 1; - else - work->oneNode = 0; - if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT) { - // Set direct direction for broadcast-gather (read or write) - work->direct = (collInfo->nBytes / collInfo->nChannels <= 1024 * 1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ; - } else { - work->direct = 0; + if (info->protocol == NCCL_PROTO_SIMPLE) { + if (info->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync + // More threads or sync warps needed due to split thread model + if (info->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE; } + nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt; + if (info->algorithm == NCCL_ALGO_TREE) nt = NCCL_MAX_NTHREADS; // Tree now uses all threads always. 
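/* Editor's note (illustrative numbers, not part of the patch): for the
 * ring/tree branch above, assume nc = 32 channels, nt = 512 threads and
 * threadThreshold = 64 for a 256 KiB collective. The channel loop decrements
 * nc until nBytes >= nc*nt*threadThreshold, i.e. down to nc = 8, so every
 * remaining channel keeps at least nt*threadThreshold = 32 KiB of work; the
 * thread loop then leaves nt at 512 because 8*512*64 equals the 256 KiB
 * payload. */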
+ info->nMaxChannels = nc; + info->nWarps = nt/WARP_SIZE; return ncclSuccess; } -static ncclResult_t setCollWorkElem(uint64_t workCount, uint64_t workOffset, size_t lastChunkCount, struct ncclWorkElem* work) { - work->workCount = workCount; - work->workOffset = workOffset; - work->lastChunkCount = lastChunkCount; - return ncclSuccess; -} - -static ncclResult_t initCollWorkElemReg(struct ncclComm* comm, struct ncclWorkElem* work, struct ncclChannel* channel, ncclRegBufferType regBufType, void* regBufSend[], void* regBufRecv[], struct ncclWorkElemReg* workElemReg) { - if (regBufType == NCCL_IPC_REG_BUFFER) { - workElemReg->elem = *work; - workElemReg->elem.regUsed = NCCL_IPC_REG_BUFFER; - for (int i = 0; i < NCCL_MAX_DIRECT_ARITY; i++) { - int peer = channel->collnetDirect.down[i]; - if (peer == -1) break; - int j = comm->rankToLocalRank[peer]; // Get intra-node slot - workElemReg->dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer - workElemReg->dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer - } - for (int i = 0; i < NCCL_MAX_DIRECT_ARITY; i++) { - int peer = channel->collnetDirect.up[i]; - if (peer == -1) break; - int j = comm->rankToLocalRank[peer]; - // Output buffer of root peer - workElemReg->upOutputs[i] = regBufRecv[j]; - } - } else if (regBufType == NCCL_NVLS_REG_BUFFER) { - workElemReg->elem = *work; - workElemReg->elem.regUsed = NCCL_NVLS_REG_BUFFER; - /* NVLS only has one send and recv buffer registered */ - workElemReg->dnInputs[0] = regBufSend[0]; - workElemReg->dnOutputs[0] = regBufRecv[0]; - } else if (regBufType == NCCL_COLLNET_REG_BUFFER) { - workElemReg->elem = *work; - workElemReg->elem.regUsed = NCCL_COLLNET_REG_BUFFER; - } else { - /* impossible value */ - WARN("Invalid regBufType %d\n", regBufType); - return ncclInvalidArgument; - } +// Use the default topo-based tuner if tuner plugin is not successful. +// Call the plugin first. Let it set algo+proto, and/or nChannels. +// Then, topoGetAlgoInfo will set algo/proto if not set, then nChannels and nThreads based on algo/proto. +// Finally, nChannels will be overriden by the plugin setting. +static ncclResult_t getAlgoInfo( + struct ncclComm* comm, struct ncclTaskColl* info, + int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo/* = NULL*/ + ) { + size_t nBytes = ncclTypeSize(info->datatype)*ncclFuncMaxSendRecvCount(info->func, comm->nRanks, info->count); + info->algorithm = NCCL_ALGO_UNDEF; + info->protocol = NCCL_PROTO_UNDEF; + int nMaxChannels = 0; + int backupAlgo = NCCL_ALGO_UNDEF; + int backupProto = NCCL_PROTO_UNDEF; + float backupTime = 3600000000.0; + float collCostTable[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + initCollCostTable((float **)collCostTable); + NCCLCHECK(updateCollCostTable(comm, info, nBytes, collNetSupport, nvlsSupport, numPipeOps, (float **)collCostTable, &backupAlgo, &backupProto, &backupTime)); + if (comm->tuner != NULL) { + NCCLCHECK(comm->tuner->getCollInfo( + comm->tunerContext, info->func, nBytes, + numPipeOps, (float **)collCostTable, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + &nMaxChannels)); + } + NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, backupAlgo, backupProto, backupTime, simInfo)); + info->nMaxChannels = nMaxChannels == 0 ? 
info->nMaxChannels : nMaxChannels; return ncclSuccess; } NCCL_PARAM(NvlsTreeMaxChunkSize, "NVLSTREE_MAX_CHUNKSIZE", -2); -static ncclResult_t computeCollChunkInfo(struct ncclInfo* collInfo, size_t nBytes, int nChannels) { - int stepSize = collInfo->comm->buffSizes[collInfo->protocol] / NCCL_STEPS; - int chunkSteps = (collInfo->protocol == NCCL_PROTO_SIMPLE && collInfo->algorithm == NCCL_ALGO_RING) ? collInfo->chunkSteps : 1; - int sliceSteps = (collInfo->protocol == NCCL_PROTO_SIMPLE && collInfo->algorithm == NCCL_ALGO_RING) ? collInfo->sliceSteps : 1; - int chunkSize = stepSize * chunkSteps; +static ncclResult_t calcCollChunking( + struct ncclComm* comm, struct ncclTaskColl* info, int nChannels, size_t nBytes, + /*outputs*/uint32_t* outChunkSize, uint32_t* outDirectFlags, struct ncclProxyOp* proxyOp + ) { + ncclPattern_t pattern; + size_t grainSize = ncclProtoGrainSize(info->protocol); + + switch (info->func) { + case ncclFuncBroadcast: + pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; + break; + case ncclFuncReduce: + pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; + break; + case ncclFuncReduceScatter: + case ncclFuncAllGather: + pattern = + info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls : + info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : + ncclPatternRing; + break; + case ncclFuncAllReduce: + pattern = + info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls : + info->algorithm == NCCL_ALGO_NVLS_TREE ? ncclPatternNvlsTree : + info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : + info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain : + info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : + ncclPatternRingTwice; + break; + default: + WARN("Unknown pattern for collective %d algorithm %d", info->func, info->algorithm); + return ncclInternalError; + } + + int nstepsPerLoop, nchunksPerLoop; + switch (pattern) { + case ncclPatternTreeUp: + case ncclPatternTreeDown: + case ncclPatternTreeUpDown: + case ncclPatternPipelineFrom: + case ncclPatternPipelineTo: + case ncclPatternCollnetChain: + nstepsPerLoop = nchunksPerLoop = 1; + break; + case ncclPatternNvls: + nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads; + break; + case ncclPatternCollnetDirect: + nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].collnetDirect.nHeads; + break; + case ncclPatternRing: + nstepsPerLoop = comm->nRanks-1; nchunksPerLoop = comm->nRanks; + break; + case ncclPatternRingTwice: + nstepsPerLoop = 2*(comm->nRanks-1); nchunksPerLoop = comm->nRanks; + break; + case ncclPatternNvlsTree: + nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads; + break; + default: + WARN("Unknown pattern %d", pattern); + return ncclInternalError; + } - if (collInfo->protocol == NCCL_PROTO_LL) chunkSize /= 2; - if (collInfo->protocol == NCCL_PROTO_LL128) chunkSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS; + int stepSize = comm->buffSizes[info->protocol]/NCCL_STEPS; + int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1; + int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? 
info->sliceSteps : 1; + int chunkSize = stepSize*chunkSteps; + if (info->protocol == NCCL_PROTO_LL) chunkSize /= 2; + if (info->protocol == NCCL_PROTO_LL128) chunkSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS; - if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT) { + if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { // Optimize chunkSize / nSteps - while (nBytes / (nChannels * collInfo->comm->channels[0].collnetDirect.nHeads * chunkSize) < collInfo->comm->channels[0].collnetDirect.depth * 64 && chunkSize > 131072) chunkSize /= 2; - while (nBytes / (nChannels * collInfo->comm->channels[0].collnetDirect.nHeads * chunkSize) < collInfo->comm->channels[0].collnetDirect.depth * 8 && chunkSize > 65536) chunkSize /= 2; - while (nBytes / (nChannels * collInfo->comm->channels[0].collnetDirect.nHeads * chunkSize) < collInfo->comm->channels[0].collnetDirect.depth * 8 && chunkSize > 32768) chunkSize /= 2; - } else if (collInfo->algorithm == NCCL_ALGO_COLLNET_CHAIN) { - stepSize = collInfo->comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS; + while (nBytes / (nChannels * comm->channels[0].collnetDirect.nHeads * chunkSize) < comm->channels[0].collnetDirect.depth * 64 && chunkSize > 131072) chunkSize /= 2; + while (nBytes / (nChannels * comm->channels[0].collnetDirect.nHeads * chunkSize) < comm->channels[0].collnetDirect.depth * 8 && chunkSize > 65536) chunkSize /= 2; + while (nBytes / (nChannels * comm->channels[0].collnetDirect.nHeads * chunkSize) < comm->channels[0].collnetDirect.depth * 8 && chunkSize > 32768) chunkSize /= 2; + } else if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) { + stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS; chunkSize = std::min(256 * 1024, stepSize * chunkSteps); - while (nBytes / (nChannels * chunkSize) < collInfo->comm->channels[0].collnetChain.depth * 64 && chunkSize > 131072) chunkSize /= 2; - while (nBytes / (nChannels * chunkSize) < collInfo->comm->channels[0].collnetChain.depth * 8 && chunkSize > 65536) chunkSize /= 2; - while (nBytes / (nChannels * chunkSize) < collInfo->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2; - } else if (collInfo->algorithm == NCCL_ALGO_NVLS) { - int maxChunkSize = collInfo->comm->nvlsChunkSize; - if (collInfo->comm->nNodes > 1 && collInfo->comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768; + while (nBytes / (nChannels * chunkSize) < comm->channels[0].collnetChain.depth * 64 && chunkSize > 131072) chunkSize /= 2; + while (nBytes / (nChannels * chunkSize) < comm->channels[0].collnetChain.depth * 8 && chunkSize > 65536) chunkSize /= 2; + while (nBytes / (nChannels * chunkSize) < comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2; + } else if (info->algorithm == NCCL_ALGO_NVLS) { + int maxChunkSize = comm->nvlsChunkSize; + if (comm->nNodes > 1 && comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768; if (chunkSize > maxChunkSize) chunkSize = maxChunkSize; // Use uint64_t so that concurrentOps*chunkSize*X does not overflow - uint64_t concurrentOps = nChannels * collInfo->comm->channels[0].nvls.nHeads; + uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads; if ((nBytes < (64 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; if ((nBytes < (8 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768; if ((nBytes < (2 * (concurrentOps * chunkSize))) && (chunkSize > 16384)) chunkSize = 16384; - } else if (collInfo->algorithm 
== NCCL_ALGO_NVLS_TREE) { + } else if (info->algorithm == NCCL_ALGO_NVLS_TREE) { // Use uint64_t so that concurrentOps*chunkSize*X does not overflow - uint64_t concurrentOps = nChannels * collInfo->comm->channels[0].nvls.nHeads; - chunkSize = collInfo->comm->nvlsChunkSize; + uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads; + chunkSize = comm->nvlsChunkSize; int maxChunkSize = (int)ncclParamNvlsTreeMaxChunkSize(); - if (maxChunkSize == -2) maxChunkSize = collInfo->comm->nNodes >= 4 ? 65536 : chunkSize; + if (maxChunkSize == -2) maxChunkSize = comm->nNodes >= 4 ? 65536 : chunkSize; chunkSize = std::min(chunkSize, maxChunkSize); if ((nBytes < (32 * (concurrentOps * chunkSize))) && (chunkSize > 262144)) chunkSize = 262144; if ((nBytes < (16 * (concurrentOps * chunkSize))) && (chunkSize > 131072)) chunkSize = 131072; if ((nBytes < (4 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; if ((nBytes < (1 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768; - } else if (collInfo->algorithm == NCCL_ALGO_TREE && collInfo->protocol == NCCL_PROTO_LL128) { - int nNodes = collInfo->comm->nNodes; - float ppn = collInfo->comm->nRanks / (float)nNodes; + } else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) { + int nNodes = comm->nNodes; + float ppn = comm->nRanks / (float)nNodes; float nstepsLL128 = 1+log2i(nNodes) + 0.1*ppn; while (nBytes / (nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2; while (nBytes / (nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2; } - collInfo->chunkSize = chunkSize; - collInfo->chunkCount = chunkSize / ncclTypeSize(collInfo->datatype); - collInfo->chunkSteps = chunkSteps; - collInfo->sliceSteps = sliceSteps; - collInfo->stepSize = stepSize; - return ncclSuccess; -} - -static ncclResult_t initCollProxyOp(struct ncclInfo* collInfo, int channelId, uint64_t opCount, uint32_t nsteps, struct ncclProxyOp* proxyOp) { - proxyOp->nsteps = nsteps; - proxyOp->sliceSteps = collInfo->sliceSteps; - proxyOp->chunkSteps = collInfo->chunkSteps; - proxyOp->chunkSize = collInfo->chunkSize; - proxyOp->protocol = collInfo->protocol; - proxyOp->dtype = collInfo->datatype; - // Network sees avg as sum - proxyOp->redOp = collInfo->opFull.op == ncclDevPreMulSum || collInfo->opFull.op == ncclDevSumPostDiv ? ncclSum : collInfo->opFull.proxyOp; - proxyOp->pattern = collInfo->pattern; - proxyOp->coll = collInfo->coll; - proxyOp->root = collInfo->root; + // Compute directFlags of work struct. + if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { + // Set direct direction for broadcast-gather (read or write) + *outDirectFlags = (nBytes/nChannels <= 1024*1024) ? 
NCCL_DIRECT_WRITE : NCCL_DIRECT_READ; + } else { + *outDirectFlags = 0; + } + + // Compute nSteps for proxies + //if (comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->func, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol); + chunkSize = chunkSize / grainSize * grainSize; // align chunkSize to multiple grainSize + int nLoops = (int)DIVUP(nBytes, size_t(nChannels)*nchunksPerLoop*chunkSize); + memset(proxyOp, 0, sizeof(*proxyOp)); + proxyOp->nsteps = nstepsPerLoop * nLoops * chunkSteps; + proxyOp->sliceSteps = sliceSteps; + proxyOp->chunkSteps = chunkSteps; + proxyOp->chunkSize = chunkSize; + proxyOp->protocol = info->protocol; + proxyOp->dtype = info->datatype; + if (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv) { + proxyOp->redOp = ncclSum; // Network sees avg as sum + } else { + proxyOp->redOp = info->opHost; + } + proxyOp->pattern = pattern; + proxyOp->coll = info->func; + proxyOp->root = info->root; // This is used by P2P to reduce the receive buffer size. We don't use it in collectives // because some protocols need to transmit more than the total size, plus they sometimes // round up - proxyOp->nbytes = collInfo->stepSize * proxyOp->sliceSteps; - if (collInfo->regBufType == NCCL_COLLNET_REG_BUFFER) { + proxyOp->nbytes = stepSize*sliceSteps; + + if (info->regBufType == NCCL_COLLNET_REG_BUFFER) { proxyOp->reg = 1; - proxyOp->nsteps = DIVUP(collInfo->nBytes, NCCL_MAX_COLLNET_SIZE); - proxyOp->sendMhandle = collInfo->sendMhandle; - proxyOp->recvMhandle = collInfo->recvMhandle; - proxyOp->sendbuff = (uint8_t*)collInfo->sendbuff; - proxyOp->recvbuff = (uint8_t*)collInfo->recvbuff; - proxyOp->nbytes = collInfo->nBytes; + proxyOp->nsteps = DIVUP(nBytes, NCCL_MAX_COLLNET_SIZE); + proxyOp->sendMhandle = info->sendMhandle; + proxyOp->recvMhandle = info->recvMhandle; + proxyOp->sendbuff = (uint8_t*)info->sendbuff; + proxyOp->recvbuff = (uint8_t*)info->recvbuff; + proxyOp->nbytes = nBytes; } else { proxyOp->reg = 0; } - proxyOp->channelId = channelId; - proxyOp->opCount = opCount; - - if (collInfo->pattern == ncclPatternCollnetDirect) { - proxyOp->specifics.collnetDirect.nNodes = collInfo->comm->nNodes; - proxyOp->specifics.collnetDirect.node = collInfo->comm->node; - if (collInfo->coll == ncclFuncAllGather || collInfo->coll == ncclFuncReduceScatter) { - proxyOp->specifics.collnetDirect.sizePerRank = collInfo->count * ncclTypeSize(collInfo->datatype); + if (pattern == ncclPatternCollnetDirect) { + proxyOp->specifics.collnetDirect.nNodes = comm->nNodes; + proxyOp->specifics.collnetDirect.node = comm->node; + if (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) { + proxyOp->specifics.collnetDirect.sizePerRank = info->count*ncclTypeSize(info->datatype); } } + + *outChunkSize = chunkSize; return ncclSuccess; } @@ -1894,26 +1934,12 @@ static ncclResult_t hostToDevRedOp( return ncclSuccess; } -static int collCmp(struct ncclInfo *a, struct ncclInfo *b) { - if (a->coll > b->coll) - return 1; - else if (a->coll == b->coll && a->datatype > b->datatype) - return 1; - else if (a->coll == b->coll && a->datatype == b->datatype && a->opFull.op > b->opFull.op) - return 1; - else if (a->coll == b->coll && a->datatype == b->datatype && a->opFull.op == b->opFull.op && a->count > b->count) - return 1; - else - return -1; -} - -// Converts `info` to a task and adds it to `comm->tasks`. The exception is with +// Converts `info` to a task and adds it to `comm->planner`. 
The exception is with // single rank communicators, collectives are issued as `ncclMemcpyAsync`s and // thus don't need a task. static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { - ncclTasks *tasks = &comm->tasks; + struct ncclKernelPlanner *planner = &comm->planner; - if (info->count == 0 && info->coll != ncclFuncSend && info->coll != ncclFuncRecv) return ncclSuccess; if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { int peer = info->root; ssize_t nBytes = info->count*ncclTypeSize(info->datatype); @@ -1924,21 +1950,23 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { struct ncclTaskP2p* p2p = ncclMemoryStackAlloc(&comm->memScoped); p2p->buff = (void*)info->recvbuff; p2p->bytes = nBytes; - p2p->chunk = 0; ncclIntruQueueEnqueue( - isSendNotRecv ? &tasks->peers[peer].sendQueue : &tasks->peers[peer].recvQueue, + isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue, p2p); - tasks->nTasksP2p += 1; + planner->nTasksP2p += 1; // Mark channels that need pre-connect if (comm->rank != peer) { - int channelBaseId; - NCCLCHECK(ncclChannelComputeBase(comm, peer, info->coll, &channelBaseId)); - if (!(isSendNotRecv ? tasks->peers[peer].sendSeen : tasks->peers[peer].recvSeen)) { - (isSendNotRecv ? tasks->peers[peer].sendSeen : tasks->peers[peer].recvSeen) = true; + if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) { + (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true; + int round = 0; + while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank + : comm->p2pSchedule[round].recvRank)) { + round += 1; + } + uint8_t base = ncclP2pChannelBaseForRound(comm, round); for (int c=0; c < comm->p2pnChannelsPerPeer; c++) { - int channelId; - NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId)); + int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c); if (isSendNotRecv) { if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector comm->connectSend[peer] |= (1UL<count == 0) return ncclSuccess; + // Copy reduction op state from op handle into info struct here since the // op handle may be destroyed before ncclGroupEnd(). - NCCLCHECK(hostToDevRedOp(&info->opFull, info->op, info->datatype, comm)); + struct ncclDevRedOpFull opDev; + NCCLCHECK(hostToDevRedOp(&opDev, info->op, info->datatype, comm)); if (comm->nRanks == 1) { - NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, info->opFull, info->datatype, info->stream)); + NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, opDev, info->datatype, info->stream)); return ncclSuccess; } else { // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. 
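/* Editor's note on the send/recv branch above: the peer's position in
 * comm->p2pSchedule determines the connection round, ncclP2pChannelBaseForRound()
 * turns that round into a base channel, and ncclP2pChannelForPart() spreads the
 * p2pnChannelsPerPeer parts over the p2p channels. This replaces the old
 * ncclChannelComputeBase()/ncclChannelComputeFromBase() pair and relies on
 * comm->p2pnChannels being a power of two (see the pow2Up()/pow2Down() change
 * in src/graph/paths.cc later in this patch). */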
ncclGroupCommJoin(info->comm); - struct ncclInfo* t = ncclMemoryStackAlloc(&comm->memScoped); - info->nChannels = 0; - info->nThreads = 0; - info->algorithm = NCCL_ALGO_UNDEF; - info->protocol = NCCL_PROTO_UNDEF; - info->userTuned = false; - memcpy(t, info, sizeof(struct ncclInfo)); - ncclIntruQueueSortEnqueue(&tasks->collQueue, t, collCmp); - tasks->workBytesTotal += info->count * ncclTypeSize(info->datatype); - tasks->nTasksColl += 1; + struct ncclTaskColl* t = ncclMemoryStackAlloc(&comm->memScoped); + t->func = info->coll; + t->sendbuff = info->sendbuff; + t->recvbuff = info->recvbuff; + t->count = info->count; + t->root = info->root; + t->datatype = info->datatype; + size_t elementSize = ncclTypeSize(t->datatype); + if (t->func == ncclFuncAllGather || t->func == ncclFuncBroadcast) { + t->count *= elementSize; + t->datatype = ncclInt8; + elementSize = 1; + } + t->trafficBytes = t->count*elementSize*ncclFuncTrafficPerByte(t->func, comm->nRanks); + t->opHost = info->op; + t->opDev = opDev; // C++ struct assignment + t->chunkSteps = info->chunkSteps; + t->sliceSteps = info->sliceSteps; + + planner->nTasksColl += 1; + ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes); } } - if (info->stream != tasks->streamRecent || tasks->streams == nullptr) { - tasks->streamRecent = info->stream; - struct ncclCudaStreamList* l = tasks->streams; + if (info->stream != planner->streamRecent || planner->streams == nullptr) { + planner->streamRecent = info->stream; + struct ncclCudaStreamList* l = planner->streams; while (true) { if (l == nullptr) { // Got to the end, this must be a new stream. struct ncclCudaGraph graph; NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream)) - if (tasks->streams != nullptr && !ncclCudaGraphSame(tasks->capturingGraph, graph)) { + if (planner->streams != nullptr && !ncclCudaGraphSame(planner->capturingGraph, graph)) { WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph."); return ncclInvalidUsage; } - tasks->capturingGraph = graph; // C++ struct assignment + planner->capturingGraph = graph; // C++ struct assignment // Add stream to list l = ncclMemoryStackAlloc(&comm->memScoped); l->stream = info->stream; - l->next = tasks->streams; - tasks->streams = l; + l->next = planner->streams; + planner->streams = l; break; } if (l->stream == info->stream) @@ -2019,10 +2062,10 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { } NCCLCHECKGOTO(ArgsCheck(info), ret, fail); - INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", + INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zu datatype %d op %d root %d comm %p [nranks=%d] stream %p", info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); - TRACE_CALL("nccl%s(%" PRIx64 ",%" PRIx64 ",%zi,%d,%d,%d,%p,%p)", info->opName, reinterpret_cast(info->sendbuff), reinterpret_cast(info->recvbuff), info->count, info->datatype, info->op, info->root, info->comm, info->stream); + TRACE_CALL("nccl%s(%" PRIx64 ",%" PRIx64 ",%zu,%d,%d,%d,%p,%p)", info->opName, reinterpret_cast(info->sendbuff), reinterpret_cast(info->recvbuff), info->count, info->datatype, info->op, info->root, info->comm, info->stream); NCCLCHECKGOTO(taskAppend(info->comm, info), ret, fail); diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 90687bb6a..b1b99d4e3 100644 --- 
a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -5,7 +5,9 @@ ************************************************************************/ #include "comm.h" +#include "device.h" #include "graph.h" +#include "transport.h" #include "trees.h" #include "rings.h" #include "topo.h" @@ -84,6 +86,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0]; } } + memcpy(comm->nvlsHeads, topoRanks->nvlsHeads, sizeof(int) * topoRanks->nvlsHeadNum); return ncclSuccess; } @@ -188,7 +191,7 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* for (int c=0; cnChannels; c++) { struct ncclChannel* channel = comm->channels+c; char line[1024]; - sprintf(line, "CollNet channel %d rank %d ", c, rank); + sprintf(line, "CollNetDirect channel %d rank %d ", c, rank); int nDown = 0; for (int i=0; i MAXCHANNELS) maxNchannels = MAXCHANNELS; if (maxNchannels < 1) { WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels); @@ -363,6 +370,8 @@ void exchangeValues(int* v0, int* v1) { *v0 = tmp; } +NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1); + ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) { // Gather data from all ranks int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads; @@ -444,13 +453,13 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa // Setup CollNet if (comm->collNetSupport == 1) { - struct ncclTopoGraph* collNetGraph = graphs[NCCL_ALGO_COLLNET_DIRECT]; + struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN]; // Add more channels to saturate intra-node bandwidth, except the 1 PPN case - if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) { + if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) { int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2); nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext); } - NCCLCHECK(connectCollNet(comm, collNetGraph)); + NCCLCHECK(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT])); } // Use 4 compute channels per search channel to reach peak BW on <8 PPN @@ -458,6 +467,12 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); } + // Double the number of channels when using unpack networking (greater than 1 node) + // We won't automatically double past 16 channels, users can specify 32 if they want + if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) { + nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); + } + // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS. // We permit combining max, then min, to only use the first channels, then duplicate them. 
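/* Editor's note: the UnpackDoubleNChannels parameter introduced a few lines
 * above gates the channel doubling for unpack-capable networks; following the
 * usual NCCL_PARAM naming it is presumably read from the
 * NCCL_UNPACK_DOUBLE_NCHANNELS environment variable, so setting it to 0 keeps
 * the original channel count. */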
if (comm->sharedRes->owner != comm) { diff --git a/src/graph/paths.cc b/src/graph/paths.cc index e033c5b45..1380d2449 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -10,6 +10,8 @@ #include "comm.h" #include "net.h" #include "channel.h" +#include "transport.h" +#include "device.h" // Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths @@ -732,12 +734,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp NCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 1); NCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS); - -static int nextPow2(int v) { - int pow2 = 1; - while (pow2 < v) pow2 <<= 1; - return pow2; -} +extern int64_t ncclParamWorkArgsBytes(); ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { /* here we already honor comm->max/minCTAs for p2pnChannels. */ @@ -759,19 +756,17 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { } } - // Round to next pow2 nChannelsPerPeer and nChannels - comm->p2pnChannelsPerPeer = nextPow2(minChannels); - comm->p2pnChannels = nextPow2(comm->p2pnChannels); + // Make nChannelsPerPeer and nChannels powers of 2. This is relied on when + // mapping p2p peers to channels. + comm->p2pnChannelsPerPeer = pow2Up(minChannels); + comm->p2pnChannels = pow2Up(comm->p2pnChannels); + + comm->p2pnChannels = std::min(comm->p2pnChannels, pow2Down(ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes()))); + comm->p2pnChannelsPerPeer = std::min(comm->p2pnChannelsPerPeer, comm->p2pnChannels); // Init channels that weren't used so far for (int c=comm->nChannels; cp2pnChannels; c++) NCCLCHECK(initChannel(comm, c)); - // We want to spread channels used when there aren't many and progressively - // fill the whole space of nChannels. To do so we mirror the bits in the - // nChannels space. - for (int c=0; cp2pnChannels; c++) { - comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels); - } return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index c7b4d96ae..7f16cb769 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -8,6 +8,7 @@ #include "core.h" #include "graph.h" #include "topo.h" +#include "transport.h" #include "xml.h" #include @@ -51,6 +52,15 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) { return ncclSuccess; } +ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm) { + // We assume there is at least one CPU and that the CPUs have the same + // architecture and vendor. + const struct ncclTopoNodeSet* cpus = &comm->topo->nodes[CPU]; + comm->cpuArch = cpus->nodes[0].cpu.arch; + comm->cpuVendor = cpus->nodes[0].cpu.vendor; + return ncclSuccess; +} + static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, int type, struct ncclTopoLink** revLink) { for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = node2->links+l; @@ -104,7 +114,7 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod } // Try to go from node type1/index1 to no type2/index2. mult indicates whether we are counting the bandwidth (1) or undoing (-1). 
-static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct ncclTopoNode** node) { +static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, float mult, struct ncclTopoNode** node) { // First handle easy cases *node = system->nodes[type2].nodes+index2; if (type1 == -1) return ncclSuccess; @@ -334,6 +344,42 @@ ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopo return ncclSuccess; } +ncclResult_t ncclTopoSearchTryCollnetDirect(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) { + int fwdg = 0; + int bwdg = 0; + struct ncclTopoNode* gpu = NULL; + float mul = 1.0 / (float)(system->nodes[GPU].count - 1); + do { + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, fwdg, mul, &gpu)); + } while (gpu && ++fwdg < system->nodes[GPU].count); + + if (gpu != NULL) { + do { + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, bwdg, GPU, g, mul, &gpu)); + } while (gpu && ++bwdg < system->nodes[GPU].count); + if (gpu != NULL) { + // Both directions worked. Now we already have head, so pop the all other intra ranks. + int step = 1; + for (int index = 0; index < ngpus; ++index) { + if (index != g) { + graph->intra[graph->nChannels * ngpus + step] = system->nodes[GPU].nodes[index].gpu.rank; + step++; + } + } + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time)); + } + while (bwdg) { + bwdg--; + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, bwdg, GPU, g, -mul, &gpu)); + } + } + while (fwdg) { + fwdg--; + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, fwdg, -mul, &gpu)); + } + return ncclSuccess; +} + ncclResult_t ncclTopoSearchTryNvls(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) { struct ncclTopoNode* nvs; struct ncclTopoNode* gpu; @@ -514,6 +560,8 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo } } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time)); + } else if (graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) { + NCCLCHECK(ncclTopoSearchTryCollnetDirect(system, graph, saveGraph, g, ngpus, time)); } else if (step < system->nodes[GPU].count-1) { // Go to next GPU int next[NCCL_TOPO_MAX_NODES]; @@ -552,9 +600,10 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); int netCount; + int graphFound = 0; NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount)); for (int i=0; ipattern == NCCL_TOPO_PATTERN_NVLS && i>0) continue; + if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) continue; int n = nets[(graph->nChannels+i)%netCount]; struct ncclTopoNode* net = system->nodes[NET].nodes+n; if (graph->collNet && net->net.collSupport == 0) continue; @@ -571,12 +620,22 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } - if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { + if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) { // NVLS search only tries to find NIC:GPU combinations to compute the heads. 
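/* Editor's note on ncclTopoSearchTryCollnetDirect() above: a candidate head
 * GPU g must reach every other GPU and be reachable from every other GPU,
 * with each path charged a 1/(ngpus-1) share of the bandwidth
 * (mul = 1.0 / (system->nodes[GPU].count - 1)), i.e. an alltoall-style
 * feasibility check. The net search below then handles
 * NCCL_TOPO_PATTERN_COLLNET_DIRECT like the NVLS pattern: one NIC/GPU head per
 * channel, stopping once a graph has been found and skipping heads that are
 * already in use (see the duplicate check that follows). */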
if (graph->nChannels < netCount) { int gpu; + int duplicate = 0; NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu)); + // check whether there is duplicate head when one GPU connects with multiple NICs + for (int gc = 0; gc < graph->nChannels; gc++) { + if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) { + duplicate = 1; + break; + } + } + if (duplicate) continue; if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu)); + graphFound = 1; } } else { if (graph->nChannels > 0) { @@ -891,8 +950,9 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph int ccMin; NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL)); if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess; - // NVLS search must have ngpus heads at most. - if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) graph->maxChannels = system->nodes[GPU].count; + // NVLS and COLLNET_DIRECT search must have ngpus heads at most. + if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) + graph->maxChannels = system->nodes[GPU].count; if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; @@ -1104,7 +1164,7 @@ ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, i exit: return ret; fail: - WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank); + WARN("Could not find NIC for rank %d in NVLS graph", comm->rank); goto exit; } diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 30304582f..d6af9282e 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -11,6 +11,7 @@ #include "nvmlwrap.h" #include "net.h" #include "coll_net.h" +#include "transport.h" #include #include #include "xml.h" @@ -51,7 +52,12 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode* return ncclSuccess; } for (int l=0; lnlinks; l++) { - if (node->links[l].type == LINK_PCI) NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu)); + // Go up the PCI tree to find the CPU. Follow only PCI switches. 
+ if (node->links[l].type == LINK_PCI + && (node->links[l].remNode->type == PCI + || node->links[l].remNode->type == CPU)) { + NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu)); + } if (*cpu != NULL) return ncclSuccess; } return ncclSuccess; @@ -109,11 +115,6 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo n->type = type; n->id = id; if (type == GPU) { - // Create link to itself (used in some corner cases) - n->nlinks=1; - n->links[0].type = LINK_LOC; - n->links[0].remNode = n; - n->links[0].bw = LOC_BW; n->gpu.dev = NCCL_TOPO_UNDEF; n->gpu.rank = NCCL_TOPO_UNDEF; n->gpu.cudaCompCap = NCCL_TOPO_UNDEF; @@ -279,8 +280,10 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = node->links+l; - if (link->type == LINK_LOC) continue; - if (link->type != LINK_PCI || link->remNode != prevNode) { + if (link->type == LINK_LOC) { + sprintf(line+offset, "+ %s[%2.1f] - %s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], link->remNode->id); + INFO(NCCL_GRAPH, "%s", line); + } else if (link->type != LINK_PCI || link->remNode != prevNode) { sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw); int nextOffset = strlen(line); if (link->type == LINK_PCI) { @@ -443,7 +446,9 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s for (int s=0; snSubs; s++) { struct ncclXmlNode* xmlSubPci = xmlPci->subs[s]; - NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId)); + if (strcmp(xmlSubPci->name, "pcilink") != 0) { // PCI links will be added later + NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId)); + } } } @@ -579,6 +584,38 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* return ncclSuccess; } +ncclResult_t ncclTopoAddPciLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) { + if (strcmp(node->name, "pcilink") == 0) { + struct ncclTopoNode* pci = NULL; + int64_t pBusId; + NCCLCHECK(busIdToInt64(parentBusId, &pBusId)); + pBusId = NCCL_TOPO_ID(systemId, pBusId); + NCCLCHECK(ncclTopoGetNode(system, &pci, PCI, pBusId)); + if (pci == NULL) { + WARN("Add PCI Link error : could not find PCI SW %lx", pBusId); + return ncclInternalError; + } + struct ncclTopoNode* remote = NULL; + const char* target; + NCCLCHECK(xmlGetAttrStr(node, "target", &target)); + int64_t busId; + NCCLCHECK(busIdToInt64(target, &busId)); + NCCLCHECK(ncclTopoGetNode(system, &remote, PCI, NCCL_TOPO_ID(systemId, busId))); + if (remote) NCCLCHECK(ncclTopoConnectNodes(pci, remote, LINK_LOC, LOC_BW)); + } else { + if (strcmp(node->name, "cpu") == 0) { + NCCLCHECK(ncclGetSystemId(system, node, &systemId)); + } + const char* busId; + NCCLCHECK(xmlGetAttr(node, "busid", &busId)); + for (int s=0; snSubs; s++) { + NCCLCHECK(ncclTopoAddPciLinks(node->subs[s], system, busId ? 
busId : parentBusId, systemId)); + } + } + return ncclSuccess; +} + + ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) { if (strcmp(node->name, "c2c") == 0) { struct ncclTopoNode* gpu = NULL; @@ -626,6 +663,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0)); NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0)); + NCCLCHECK(ncclTopoAddPciLinks(topNode, *topoSystem, NULL, 0)); NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem)); NCCLCHECK(ncclTopoConnectCpus(*topoSystem)); @@ -668,6 +706,18 @@ static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrN return ncclSuccess; } +ncclResult_t ncclTopoRefreshBcmP2pLinks(void) { + //refresh the switch topology by reading the link below + FILE *fp = fopen("/sys/kernel/pci_switch_link/refresh_switch_toplogy", "r"); + if (fp != NULL) { + int tmp; + size_t r = fread(&tmp, sizeof(tmp), 1, fp); + if (r != 1) + INFO(NCCL_GRAPH, "Failed to read refresh_switch_toplogy"); + fclose(fp); + } + return ncclSuccess; +} ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) { struct ncclXml* xml; @@ -687,18 +737,17 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION)); } - // Auto-detect GPUs if needed - for (int r=0; rnRanks; r++) { - if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) { - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId)); - struct ncclXmlNode* node; - NCCLCHECK(ncclTopoFillGpu(xml, busId, &node)); - if (node == NULL) continue; - NCCLCHECK(xmlSetAttrInt(node, "keep", 1)); - NCCLCHECK(xmlSetAttrInt(node, "rank", r)); - NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport)); - } + NCCLCHECK(ncclTopoRefreshBcmP2pLinks()); + + // Detect only the GPU managed by this process. We'll get any others through XML fusion. + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + NCCLCHECK(int64ToBusId(comm->peerInfo[comm->rank].busId, busId)); + struct ncclXmlNode* node; + NCCLCHECK(ncclTopoFillGpu(xml, busId, &node)); + if (node) { + NCCLCHECK(xmlSetAttrInt(node, "keep", 1)); + NCCLCHECK(xmlSetAttrInt(node, "rank", comm->rank)); + NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport)); } // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, // so we start with collnet so that it has precedence. @@ -728,6 +777,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy for (int n=0; nncclNet->getProperties(n, &props)); + comm->netDeviceType = props.netDeviceType; struct ncclXmlNode* netNode; NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode)); NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1)); @@ -745,24 +795,46 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy // Remove XML branches which don't have a node with keep="1" (typically when importing a topology) NCCLCHECK(ncclTopoTrimXml(xml)); + // XML topo fusion. 
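/* Editor's note: with the change above, each rank now fills the XML only for
 * its own GPU; the per-rank XMLs are then exchanged with
 * bootstrapIntraNodeAllGather() and merged through ncclTopoFuseXml() to
 * reconstruct the full node topology. For MNNVL the same path fuses the XMLs
 * of the whole clique, which is why the destination XML is reallocated with
 * room for nLocalRanks*NCCL_TOPO_XML_MAX_NODES nodes in that case. */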
+ int* localRanks; + int localRank = -1, nLocalRanks = 0; if (comm->MNNVL) { // MNNVL clique support - char* mem; - NCCLCHECK(ncclCalloc(&mem, comm->clique.size * xmlMemSize(NCCL_TOPO_XML_MAX_NODES))); - struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*comm->cliqueRank); - memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)); - NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1)); - NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->clique.ranks, comm->cliqueRank, comm->clique.size, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES))); - struct ncclXml* cliqueXml; - NCCLCHECK(xmlAlloc(&cliqueXml, comm->clique.size*NCCL_TOPO_XML_MAX_NODES)); - for (int i = 0; i < comm->clique.size; i++) { - struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i); - NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0)); - NCCLCHECK(ncclTopoFuseXml(cliqueXml, peerXml)); + nLocalRanks = comm->clique.size; + localRank = comm->cliqueRank; + localRanks = comm->clique.ranks; + } else { + // Intra-node fusion. Much of the comm is not initialized yet at this point so we need to do our own calculations. + NCCLCHECK(ncclCalloc(&localRanks, comm->nRanks)); + for (int i = 0; i < comm->nRanks; i++) { + if (comm->peerInfo[i].hostHash == comm->peerInfo[comm->rank].hostHash) { + if (i == comm->rank) + localRank = nLocalRanks; + localRanks[nLocalRanks++] = i; + } } + } + char* mem; + NCCLCHECK(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES))); + struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank); + memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)); + NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1)); + NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES))); + if (comm->MNNVL) { + // Ensure that we have enough room when fusing topos from multiple nodes. free(xml); - xml = cliqueXml; + NCCLCHECK(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES)); + } else { + // In the intra-node case there's no need to enlarge the topo xml. 
+ xml->maxIndex = 0; + free(localRanks); + } + for (int i = 0; i < nLocalRanks; i++) { + struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i); + NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0)); + NCCLCHECK(ncclTopoFuseXml(xml, peerXml)); } + free(mem); xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE"); if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) { diff --git a/src/graph/topo.h b/src/graph/topo.h index 548747913..6613f3271 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -218,7 +218,7 @@ static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, int64_t id return ncclSuccess; } } - WARN("Could not find NET with id %lx\n", id); + WARN("Could not find NET with id %lx", id); return ncclInternalError; } diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index d8f0b6e44..f9d814a25 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -110,11 +110,9 @@ NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2); static float getNetOverhead(struct ncclComm* comm) { if (ncclParamNetOverhead() != -2) return ncclParamNetOverhead() * .001; - int cpuArch, cpuVendor, cpuModel; - NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); - if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0; - if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0; - else return 1.0; + if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_X86 && comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0; + if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_X86 && comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0; + return 1.0; } ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) { @@ -317,6 +315,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom } if (pEnable == 0) comm->bandwidths[c][a][p] = 0; if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0; + if (a == NCCL_ALGO_RING && pEnable == 0) comm->ringbdw[c][p] = 0; } for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) { @@ -415,15 +414,15 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = { { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .6, .7, .8, .7, .7, .8, .9, .9 } }; -ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup) { - float bw = info->comm->bandwidths[info->coll][algorithm][protocol]; - float lat = info->comm->latencies[info->coll][algorithm][protocol]; +ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup) { + float bw = comm->bandwidths[coll][algorithm][protocol]; + float lat = comm->latencies[coll][algorithm][protocol]; if (backup) { *backup = false; if (algorithm == NCCL_ALGO_RING && bw == 0.0f) { /* try back up RING algorithm */ - bw = info->comm->ringbdw[info->coll][protocol]; + bw = comm->ringbdw[coll][protocol]; *backup = true; } } @@ -431,15 +430,14 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto if (bw == 0) { *time = -1.0; return ncclSuccess; } - int logSize = log2i(info->nBytes>>6); - if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize]; - if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels; - if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1 - && info->coll == 
ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) { - lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring + int logSize = log2i(nBytes>>6); + if (algorithm == NCCL_ALGO_TREE && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize]; + if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && comm->nNodes > 1 + && coll == ncclFuncAllReduce && nBytes/(comm->nChannels*comm->nRanks) >= 64) { + lat *= comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring } // Tree pipelining saves latency in aggregation cases - int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS); - *time = lat * latCount + (info->nBytes) / (1000 * bw); + int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_DEV_WORK_BATCH_COLLS); + *time = lat * latCount + nBytes / (1000 * bw); return ncclSuccess; } diff --git a/src/graph/xml.cc b/src/graph/xml.cc index b145d34ef..c2c6a1c81 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -272,56 +272,34 @@ ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml) return ncclSuccess; } +static ncclResult_t xmlTopoFuseXmlRecursive(struct ncclXml* dst, struct ncclXmlNode* dstParent, struct ncclXmlNode* srcParent) { + for (int i = 0; i < srcParent->nSubs; i++) { + struct ncclXmlNode* srcNode = srcParent->subs[i]; + struct ncclXmlNode* dstNode; + NCCLCHECK(xmlFindNode(dstParent, srcNode, &dstNode)); + if (dstNode == NULL) { + NCCLCHECK(xmlAddTree(dst, dstParent, srcNode)); + } else { + NCCLCHECK(xmlTopoFuseXmlRecursive(dst, dstNode, srcNode)); + } + } + return ncclSuccess; +} + ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src) { - struct ncclXmlNode* topNode; - NCCLCHECK(xmlFindTag(dst, "system", &topNode)); + struct ncclXmlNode* topNodeDst; + NCCLCHECK(xmlFindTag(dst, "system", &topNodeDst)); - if (topNode == NULL) { + if (topNodeDst == NULL) { xmlAddTree(dst, NULL, src->nodes); return ncclSuccess; } - // Fuse the CPUs with the first XML - struct ncclXmlNode* srcCpu; - NCCLCHECK(xmlFindTag(src, "cpu", &srcCpu)); - while (srcCpu) { - const char* srcNumaId; - const char* srcHostHash; - NCCLCHECK(xmlGetAttr(srcCpu, "numaid", &srcNumaId)); - if (srcNumaId == NULL) { - WARN("TopoFuseXmls : could not find CPU numa ID."); - return ncclInternalError; - } - xmlGetAttr(srcCpu, "host_hash", &srcHostHash); - if (srcHostHash == NULL) - srcHostHash = "0"; - - // Search through the destination for a duplicate. Note that - // this makes the complexity of this whole function O(n^2), but n - // is expected to be small. 
- struct ncclXmlNode* dstCpu; - NCCLCHECK(xmlFindTag(dst, "cpu", &dstCpu)); - while (dstCpu) { - const char* dstNumaId; - const char* dstHostHash; - NCCLCHECK(xmlGetAttr(dstCpu, "numaid", &dstNumaId)); - if (dstNumaId == NULL) { - WARN("TopoFuseXmls : could not find CPU numa ID."); - return ncclInternalError; - } - xmlGetAttr(dstCpu, "host_hash", &dstHostHash); - if (dstHostHash == NULL) - dstHostHash = "0"; - if (strcmp(srcNumaId, dstNumaId) == 0 && strcmp(srcHostHash, dstHostHash) == 0) - break; + struct ncclXmlNode* topNodeSrc; + NCCLCHECK(xmlFindTag(src, "system", &topNodeSrc)); + + NCCLCHECK(xmlTopoFuseXmlRecursive(dst, topNodeDst, topNodeSrc)); - NCCLCHECK(xmlFindNextTag(dst, "cpu", dstCpu, &dstCpu)); - } - // Only add the CPU if no duplicate was found - if (dstCpu == NULL) - NCCLCHECK(xmlAddTree(dst, topNode, srcCpu)); - NCCLCHECK(xmlFindNextTag(src, "cpu", srcCpu, &srcCpu)); - } return ncclSuccess; } @@ -335,6 +313,11 @@ ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclX return ncclSuccess; } +ncclResult_t ncclTopoXmlLoadPciLink(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); + return ncclSuccess; +} + ncclResult_t ncclTopoXmlLoadC2c(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); return ncclSuccess; @@ -357,8 +340,8 @@ ncclResult_t ncclTopoXmlLoadNic(FILE* file, struct ncclXml* xml, struct ncclXmlN } ncclResult_t ncclTopoXmlLoadPci(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { - struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic} }; - NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 3)); + struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic}, { "pcilink", ncclTopoXmlLoadPciLink} }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 4)); return ncclSuccess; } @@ -423,6 +406,28 @@ static ncclResult_t getPciPath(const char* busId, char** path) { return ncclSuccess; } +#include +static ncclResult_t getBcmLinks(const char* busId, int* nlinks, char** peers) { + *nlinks = 0; + *peers = NULL; + char dirPath[] = "/sys/kernel/pci_switch_link/virtual_switch_links/0000:00:00.0"; + memcpylower(dirPath+sizeof("/sys/kernel/pci_switch_link/virtual_switch_links/")-1, busId, BUSID_SIZE-1); + DIR *dir = opendir(dirPath); + if (dir) { + struct dirent* file; + while ((file = readdir(dir)) != NULL) { + if (strlen(file->d_name) != BUSID_SIZE-1) continue; + char* path; + if (getPciPath(file->d_name, &path) == ncclSystemError) continue; + free(path); + NCCLCHECK(ncclRealloc(peers, (*nlinks)*BUSID_SIZE, ((*nlinks)+1)*BUSID_SIZE)); + memcpy((*peers)+BUSID_SIZE*(*nlinks)++, file->d_name, BUSID_SIZE); + } + closedir(dir); + } + return ncclSuccess; +} + ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) { char filePath[PATH_MAX]; sprintf(filePath, "%s/%s", path, fileName); @@ -541,10 +546,11 @@ ncclResult_t ncclTopoGetPciNode(struct ncclXml* xml, const char* busId, struct n // There can be trailing chars. 
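/* Editor's note: the "pcilink" handler and getBcmLinks() above tie the
 * split-switch support together. getBcmLinks() lists peer switch bus IDs from
 * /sys/kernel/pci_switch_link/virtual_switch_links/<busid>, and the resulting
 * topology XML can then carry entries of roughly this shape (bus IDs made up
 * for illustration):
 *
 *   <pci busid="0000:17:00.0" vendor="0x1000" ...>
 *     <pcilink target="0000:31:00.0"/>
 *   </pci>
 *
 * ncclTopoAddPciLinks() (earlier in this patch) converts each such entry into
 * a LINK_LOC connection between the two PCI switch nodes. */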
int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')); } int checkBDFFormat(char* bdf) { - if (bdf[4] != ':' || bdf[7] != ':' || bdf[10] != '.') return 0; - if (isHex(bdf[0]) == 0 || isHex(bdf[1] == 0) || isHex(bdf[2] == 0) || isHex(bdf[3] == 0) || - isHex(bdf[5] == 0) || isHex(bdf[6] == 0) || isHex(bdf[8] == 0) || isHex(bdf[9] == 0) || - isHex(bdf[11] == 0)) return 0; + if (strlen(bdf) != 12) return 0; + if ((bdf[4] != ':') || (bdf[7] != ':') || (bdf[10] != '.')) return 0; + if ((isHex(bdf[0]) == 0) || (isHex(bdf[1]) == 0) || (isHex(bdf[2]) == 0) || (isHex(bdf[3]) == 0) || + (isHex(bdf[5]) == 0) || (isHex(bdf[6]) == 0) || (isHex(bdf[8]) == 0) || (isHex(bdf[9]) == 0) || + (isHex(bdf[11]) == 0)) return 0; return 1; } @@ -608,6 +614,24 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* NCCLCHECK(xmlSetAttr(pciNode, "link_width", "")); } } + + const char* vendor; + NCCLCHECK(xmlGetAttr(pciNode, "vendor", &vendor)); + if (vendor != NULL && strcmp(vendor, "0x1000") == 0) { // BCM switch, look for P2P connections + int nlinks; + char* peers; + NCCLCHECK(getBcmLinks(busId, &nlinks, &peers)); + for (int l=0; lparent; if (parent == NULL) { if (path) { @@ -911,25 +935,33 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha return ncclSuccess; } -ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node) { +ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node, int* keep) { const char* str; NCCLCHECK(xmlGetAttr(node, "keep", &str)); if (str && strcmp(str, "1") == 0) { NCCLCHECK(xmlUnsetAttr(node, "keep")); + *keep = 1; } else { // Copy nSubs and subs as they could change as we trim recursively. struct ncclXmlNode* subs[MAX_SUBS]; int nSubs = node->nSubs; memcpy(subs, node->subs, node->nSubs*sizeof(struct ncclXmlNode*)); + *keep = 0; for (int s=0; sname, "pci") == 0 || strcmp(node->name, "cpu") == 0)) { + NCCLCHECK(xmlRemoveNode(node)); } - if (node->nSubs == 0) NCCLCHECK(xmlRemoveNode(node)); } return ncclSuccess; } ncclResult_t ncclTopoTrimXml(struct ncclXml* xml) { - NCCLCHECK(ncclTopoTrimXmlRec(xml->nodes)); + int keep = 0; + NCCLCHECK(ncclTopoTrimXmlRec(xml->nodes, &keep)); return ncclSuccess; } diff --git a/src/graph/xml.h b/src/graph/xml.h index 9090ecc0f..0ee56790b 100644 --- a/src/graph/xml.h +++ b/src/graph/xml.h @@ -55,7 +55,7 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha /* Remove unneeded parts */ ncclResult_t ncclTopoTrimXml(struct ncclXml* xml); -/* Fuse multiple system XMLs into one, skipping duplicate CPUs */ +/* Fuse multiple system XMLs into one, skipping duplicate entries */ ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src); /* Relocate pointers in XML to (de-)serialize the structure */ ncclResult_t ncclTopoConvertXml(struct ncclXml* xml, uintptr_t base, int exp); @@ -172,6 +172,29 @@ static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagName, struc return ncclSuccess; } +static ncclResult_t xmlFindNode(struct ncclXmlNode* parentNode, struct ncclXmlNode* searchNode, struct ncclXmlNode** node) { + *node = NULL; + // Search for the node at the current level only. + for (int i=0; inSubs; i++) { + struct ncclXmlNode* n = parentNode->subs[i]; + if (strcmp(n->name, searchNode->name) == 0 && n->type == searchNode->type && n->nAttrs == searchNode->nAttrs) { + int a; + // Ensure that all the attributes are the same. 
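The checkBDFFormat() rewrite above fixes an operator-precedence bug: expressions like isHex(bdf[1] == 0) fed the result of the comparison into isHex(), so only the first hex digit was actually validated. It also adds a strict 12-character length check. A quick sanity check of the fixed validator (assumes the isHex() and checkBDFFormat() definitions above are in scope):

#include <assert.h>

int main(void) {
  assert(checkBDFFormat((char*)"0000:3b:00.0") == 1);   /* well-formed dddd:bb:dd.f */
  assert(checkBDFFormat((char*)"0000:3b:00.")  == 0);   /* 11 characters: too short */
  assert(checkBDFFormat((char*)"0000:3g:00.0") == 0);   /* 'g' is not a hex digit */
  return 0;
}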
+ for (a=0; anAttrs; a++) { + const char* val; + NCCLCHECK(xmlGetAttr(n, searchNode->attrs[a].key, &val)); + if (!val || strcmp(val, searchNode->attrs[a].value)) + break; + } + if (a == searchNode->nAttrs) { + *node = n; + return ncclSuccess; + } + } + } + return ncclSuccess; +} + static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, const char* value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); diff --git a/src/group.cc b/src/group.cc index eb45e31ac..7158b45c2 100644 --- a/src/group.cc +++ b/src/group.cc @@ -10,6 +10,7 @@ #include "transport.h" #include "channel.h" #include +#include "bootstrap.h" __thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting __thread ncclResult_t ncclGroupError = ncclSuccess; @@ -31,6 +32,7 @@ ncclResult_t ncclAsyncLaunch( ) { ncclResult_t ret = ncclSuccess; + job->destroyFlag = comm->destroyFlag; if (ncclGroupDepth == 0) { ret = func(job); if (ret != ncclSuccess && undo) undo(job); @@ -40,11 +42,15 @@ ncclResult_t ncclAsyncLaunch( job->undo = undo; job->destructor = destructor; job->abortFlag = comm->abortFlag; + job->abortFlagDev = comm->abortFlagDev; job->childAbortFlag = comm->childAbortFlag; + job->childAbortFlagDev = comm->childAbortFlagDev; job->state = ncclGroupJobRunning; job->comm = comm; /* check if there are blocking and nonblocking comms at the same time in group. */ - if (ncclGroupBlocking == -1) { + if (comm->destroyFlag) { + ncclGroupBlocking = 1; + } else if (ncclGroupBlocking == -1) { /* first met communicator */ ncclGroupBlocking = comm->config.blocking; } else if (ncclGroupBlocking != comm->config.blocking) { @@ -98,11 +104,23 @@ ncclResult_t ncclGroupEnd() { return ret; } +NCCL_API(ncclResult_t, ncclGroupSimulateEnd, ncclSimInfo_t* simInfo); +ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo) { + ncclResult_t ret = ncclSuccess; + NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCLCHECKGOTO(ncclGroupEndInternal(simInfo), ret, exit); + TRACE_CALL("ncclGroupSimulateEnd()"); +exit: + return ret; +} + struct ncclPreconnectJob { struct ncclAsyncJob base; struct ncclComm* comm; + bool* algoNeedConnect; }; -ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) { + +ncclResult_t ncclP2PPreconnectFunc(struct ncclAsyncJob* job_) { struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_; struct ncclComm* comm = job->comm; CUDACHECK(cudaSetDevice(comm->cudaDev)); @@ -111,6 +129,57 @@ ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) { return ncclSuccess; } +ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) { + struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_; + struct ncclComm* comm = job->comm; + ncclResult_t ret = ncclSuccess; + + CUDACHECK(cudaSetDevice(comm->cudaDev)); + if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + for (int i = 0; i < NCCL_NUM_ALGORITHMS; ++i) { + if (job->algoNeedConnect[i]) { + switch (i) { + case NCCL_ALGO_RING: { + NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail); + break; + } + case NCCL_ALGO_TREE: { + NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail); + break; + } + case NCCL_ALGO_NVLS: { + /* If we are using NVLS_TREE algo, we must mark NVLS algo to set up + * NVLS intra-node buffer */ + NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail); + break; + } + case NCCL_ALGO_NVLS_TREE: { + NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail); + break; + } + case NCCL_ALGO_COLLNET_CHAIN: { + NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail); + break; + } 
+ case NCCL_ALGO_COLLNET_DIRECT: { + NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail); + break; + } + default: { + ret = ncclInternalError; + goto fail; + } + } + } + } + +exit: + free(job->algoNeedConnect); + return ret; +fail: + goto exit; +} + static ncclResult_t doLaunches(struct ncclComm* head) { ncclResult_t result = ncclSuccess; struct ncclComm* cliqueComm0 = head->intraComm0; @@ -124,7 +193,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) { struct ncclComm* comm = cliqueHead; bool capturingYes = false, capturingNo = false; do { - (ncclCudaGraphValid(comm->tasks.capturingGraph) ? capturingYes : capturingNo) = true; + (ncclCudaGraphValid(comm->planner.capturingGraph) ? capturingYes : capturingNo) = true; CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure); if (useBarrier) ncclCommIntraBarrierIn(comm, 1); @@ -150,19 +219,19 @@ static ncclResult_t doLaunches(struct ncclComm* head) { // Barrier reduction result tells us if this was the final round. moreRounds = 0 != ncclCommIntraBarrierOut(comm); } else { - moreRounds |= comm->unlaunchedPlansHead != nullptr; + moreRounds |= comm->planner.unlaunchedPlansHead != nullptr; } if (moreRounds) { // Pop next unlaunched kernel - struct ncclKernelPlan* plan = comm->unlaunchedPlansHead; + struct ncclKernelPlan* plan = comm->planner.unlaunchedPlansHead; if (plan != nullptr) { - comm->unlaunchedPlansHead = plan->next; + comm->planner.unlaunchedPlansHead = plan->next; CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure); NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure); } // Barrier reduction input indicates if we require further rounds. - if (useBarrier) ncclCommIntraBarrierIn(comm, comm->unlaunchedPlansHead != nullptr ? 1 : 0); + if (useBarrier) ncclCommIntraBarrierIn(comm, comm->planner.unlaunchedPlansHead != nullptr ? 1 : 0); if (plan != nullptr) { NCCLCHECKGOTO(ncclLaunchKernelAfter_NoCuda(comm, plan), result, failure); } @@ -210,37 +279,29 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g // is needed. comm->preconnectNext = reinterpret_cast(0x1); for (int i = 0; i < comm->nRanks; i++) { - comm->tasks.peers[i].sendSeen = false; - comm->tasks.peers[i].recvSeen = false; comm->connectSend[i] = 0UL; comm->connectRecv[i] = 0UL; } - comm->unlaunchedPlansHead = nullptr; // Reclaim abandoned kernel plan memory. Note ncclWork structs were already // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`. - while (!ncclIntruQueueEmpty(&comm->planQueue)) { - struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue); + while (!ncclIntruQueueEmpty(&comm->planner.planQueue)) { + struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planner.planQueue); // Persistent plans will be reclaimed via the callbackQueue when the // graph drops its UserObject reference. 
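Stepping back to the ncclGroupSimulateEnd() entry point added earlier in this group.cc diff: it drives the group through task preparation only and copies the estimate back into the caller's ncclSimInfo_t (note the !simInfo guards around preconnect and doLaunches further down, and the 0x74685283 magic check in ncclGroupEndInternal). A hedged usage sketch; the helper below and the estimatedTime field name are assumptions based on this release's public header, not verbatim NCCL sample code:

#include <nccl.h>
#include <stdio.h>

static ncclResult_t estimateGroupTime(ncclComm_t comm, cudaStream_t stream,
                                      const void* sendbuff, void* recvbuff, size_t count) {
  ncclSimInfo_t sim = NCCL_SIM_INFO_INITIALIZER;  /* sets size and the expected magic */
  ncclResult_t res;
  if ((res = ncclGroupStart()) != ncclSuccess) return res;
  if ((res = ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum,
                           comm, stream)) != ncclSuccess) return res;
  if ((res = ncclGroupSimulateEnd(&sim)) != ncclSuccess) return res;  /* plans, does not launch */
  printf("estimated group time: %f\n", sim.estimatedTime);            /* assumed field name */
  return ncclSuccess;
}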
if (!plan->persistent) { - for (int c = 0; c < MAXCHANNELS; c++) { - while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) { - struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue); - ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); - } + while (!ncclIntruQueueEmpty(&plan->proxyOpQueue)) { + struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->proxyOpQueue); + ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); } ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); } } - // Reset comm->tasks to empty. - comm->tasks.nTasksColl = 0; - comm->tasks.nTasksP2p = 0; - comm->tasks.workBytesTotal = 0; - comm->tasks.streams = nullptr; - ncclIntruQueueConstruct(&comm->tasks.collQueue); - for (int i = 0; i < comm->nRanks; i++) { - ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue); - ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue); + + { // Reset comm->planner to empty. + ncclKernelPlanner::Peer* tmp = comm->planner.peers; + memset(&comm->planner, 0, sizeof(comm->planner)); + comm->planner.peers = tmp; + memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0])); } if (!comm->config.blocking) @@ -260,37 +321,10 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g return; } -static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) { - int savedDev; +static ncclResult_t asyncJobLaunch(struct ncclIntruQueue *asyncJobsMain, volatile bool *groupAbortFlag) { ncclResult_t ret = ncclSuccess; bool jobsDone = false; bool errorJobAbortFlag = false; - struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_; - struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr; - struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr; - struct ncclIntruQueue *asyncJobsMain = gjob->asyncJobsPtr; - volatile bool *groupAbortFlag = gjob->abortFlagPtr; - - CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail); - - if (groupCommPreconnectHeadMain != nullptr) { - struct ncclComm* comm = groupCommPreconnectHeadMain; - do { - struct ncclPreconnectJob* job; - NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); - job->base.func = ncclPreconnectFunc; - job->base.undo = nullptr; - job->base.destructor = free; - job->base.state = ncclGroupJobRunning; - job->base.abortFlag = comm->abortFlag; - job->comm = comm; - ncclIntruQueueEnqueue(asyncJobsMain, &job->base); - - struct ncclComm* next = comm->preconnectNext; - comm->preconnectNext = reinterpret_cast(0x1); - comm = next; - } while (comm != nullptr); - } if (!ncclIntruQueueEmpty(asyncJobsMain)) { struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain); @@ -321,9 +355,13 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) { assert(state == ncclGroupJobJoined); } - if (__atomic_load_n(groupAbortFlag, __ATOMIC_RELAXED) || errorJobAbortFlag == true) { - __atomic_store_n(job->abortFlag, 1, __ATOMIC_RELAXED); - if (job->childAbortFlag) __atomic_store_n(job->childAbortFlag, 1, __ATOMIC_RELAXED); + if (!job->destroyFlag && (__atomic_load_n(groupAbortFlag, __ATOMIC_ACQUIRE) || errorJobAbortFlag == true)) { + __atomic_store_n(job->abortFlag, 1, __ATOMIC_RELEASE); + __atomic_store_n(job->abortFlagDev, 1, __ATOMIC_RELEASE); + if (job->childAbortFlag) { + __atomic_store_n(job->childAbortFlag, 1, __ATOMIC_RELEASE); + __atomic_store_n(job->childAbortFlagDev, 1, __ATOMIC_RELEASE); + } } job = job->next; @@ -335,17 +373,86 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) { if (ret != ncclSuccess) goto fail; } - if (groupCommHeadMain != 
nullptr) { - NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail); - } - while (!ncclIntruQueueEmpty(asyncJobsMain)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain); - if (job->comm && !job->comm->config.blocking) + if (!job->destroyFlag && job->comm && !job->comm->config.blocking) (void) ncclCommSetAsyncError(job->comm, ret); if (job->destructor) job->destructor((void*)job); } +exit: + return ret; +fail: + goto exit; +} + +static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInfo = NULL) { + int savedDev; + ncclResult_t ret = ncclSuccess; + struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_; + struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr; + struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr; + struct ncclIntruQueue *asyncJobsMain = gjob->asyncJobsPtr; + bool *groupAbortFlag = gjob->abortFlagPtr; + + CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail); + + if (!simInfo && groupCommPreconnectHeadMain != nullptr) { + struct ncclComm* comm = groupCommPreconnectHeadMain; + do { + struct ncclPreconnectJob* job; + NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); + job->base.func = ncclP2PPreconnectFunc; + job->base.undo = nullptr; + job->base.destructor = free; + job->base.state = ncclGroupJobRunning; + job->base.abortFlag = comm->abortFlag; + job->base.abortFlagDev = comm->abortFlagDev; + job->comm = comm; + ncclIntruQueueEnqueue(asyncJobsMain, &job->base); + + struct ncclComm* next = comm->preconnectNext; + comm->preconnectNext = reinterpret_cast(0x1); + comm = next; + } while (comm != nullptr); + } + + NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail); + + /* Connect channels at runtime if cumem is supported */ + if (groupCommHeadMain != nullptr) { + struct ncclComm* comm = groupCommHeadMain; + + do { + bool needConnect = false; + bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; + memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS); + + NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail); + + if (comm->cuMemSupport && needConnect) { + struct ncclPreconnectJob* job; + NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); + job->base.func = ncclCollPreconnectFunc; + job->base.undo = nullptr; + job->base.destructor = free; + job->base.state = ncclGroupJobRunning; + job->base.abortFlag = comm->abortFlag; + job->comm = comm; + NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail); + memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); + ncclIntruQueueEnqueue(asyncJobsMain, &job->base); + } + comm = comm->groupNext; + } while (comm); + + NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail); + } + + if ((!simInfo) && (groupCommHeadMain != nullptr)) { + NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail); + } + while (groupCommHeadMain != nullptr) { struct ncclComm* comm = groupCommHeadMain; struct ncclComm* next = comm->groupNext; @@ -365,8 +472,17 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) { goto exit; } -ncclResult_t ncclGroupEndInternal() { +static ncclResult_t groupLaunchNonBlocking(struct ncclAsyncJob *job_) { + return groupLaunch(job_ /* estimatedTime = NULL */); +} + +ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) { ncclResult_t ret = ncclSuccess; + ncclSimInfo_t internalSimInfo = NCCL_SIM_INFO_INITIALIZER; + ncclSimInfo_t* internalSimInfoPtr = NULL; + size_t realSize = 0; + + internalSimInfo.magic = 0; if (ncclGroupDepth == 0) { 
WARN("ncclGroupEnd: not in a group call."); @@ -378,6 +494,18 @@ ncclResult_t ncclGroupEndInternal() { if ((ret = ncclGroupError) != ncclSuccess) goto fail; + if (simInfo) { + memcpy((void*)&realSize, (void*)&simInfo->size, sizeof(size_t)); + realSize = realSize > sizeof(ncclSimInfo_t) ? sizeof(ncclSimInfo_t) : realSize; + memcpy((void*)&internalSimInfo, (void*)simInfo, realSize); + if (internalSimInfo.magic != 0x74685283) { + WARN("ncclSimInfo_t argument not initialized via NCCL_SIM_INFO_INITIALIZER"); + ret = ncclInvalidArgument; + goto fail; + } + internalSimInfoPtr = &internalSimInfo; + } + if (ncclGroupCommHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs) || ncclGroupCommPreconnectHead != nullptr) { ncclGroupJobMain.groupCommHeadPtr = &ncclGroupCommHead; ncclGroupJobMain.groupCommPreconnectHeadPtr = &ncclGroupCommPreconnectHead; @@ -410,12 +538,13 @@ ncclResult_t ncclGroupEndInternal() { } while (comm); } - ncclGroupJobMainPtr->base.func = groupLaunch; + ncclGroupJobMainPtr->base.func = groupLaunchNonBlocking; SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail); ret = ncclInProgress; } else { /* blocking group */ - NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base), ret, fail); + NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base, internalSimInfoPtr), ret, fail); + if (simInfo) memcpy((void*)simInfo, (void*)internalSimInfoPtr, realSize); groupResetJobState(ncclGroupJobMainPtr); } } @@ -438,7 +567,7 @@ ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) { ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) { if (groupJob && groupJob->initialized) { - __atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELAXED); + __atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELEASE); NCCLCHECK(ncclGroupJobComplete(groupJob)); } return ncclSuccess; diff --git a/src/include/align.h b/src/include/align.h deleted file mode 100644 index 2a71dd1bc..000000000 --- a/src/include/align.h +++ /dev/null @@ -1,47 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_ALIGN_H_ -#define NCCL_ALIGN_H_ - -#define DIVUP(x, y) \ - (((x)+(y)-1)/(y)) - -#define ROUNDUP(x, y) \ - (DIVUP((x), (y))*(y)) - -#define ALIGN_POWER(x, y) \ - ((x) > (y) ? 
ROUNDUP(x, y) : ((y)/((y)/(x)))) - -#define ALIGN_SIZE(size, align) \ - size = ((size + (align) - 1) / (align)) * (align); - -#if !__CUDA_ARCH__ - #ifndef __host__ - #define __host__ - #endif - #ifndef __device__ - #define __device__ - #endif -#endif - -template -__host__ __device__ constexpr Z divUp(X x, Y y) { - return (x+y-1)/y; -} - -template -__host__ __device__ constexpr Z roundUp(X x, Y y) { - return (x+y-1) - (x+y-1)%y; -} - -// assumes second argument is a power of 2 -template -__host__ __device__ constexpr Z alignUp(X x, int a) { - return (x+a-1) & Z(-a); -} - -#endif diff --git a/src/include/alloc.h b/src/include/alloc.h index aa522ea1a..71d0777cc 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -9,7 +9,7 @@ #include "nccl.h" #include "checks.h" -#include "align.h" +#include "bitops.h" #include "utils.h" #include "p2p.h" #include @@ -19,18 +19,25 @@ uint64_t clockNano(); // from utils.h with which we have a circular dependency +template +constexpr size_t ncclSizeOfT() { return sizeof(T); } +template<> +constexpr size_t ncclSizeOfT() { return 1; } + template ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish); - memset(*ptr, 0, nelem*sizeof(T)); + if (nelem > 0) { + CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*ncclSizeOfT(), cudaHostAllocMapped), result, finish); + memset(*ptr, 0, nelem*ncclSizeOfT()); + } finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA host alloc %ld bytes", nelem*ncclSizeOfT()); + INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), *ptr); return result; } #define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -42,14 +49,18 @@ inline ncclResult_t ncclCudaHostFree(void* ptr) { template ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { - void* p = malloc(nelem*sizeof(T)); - if (p == NULL) { - WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); - return ncclSystemError; + if (nelem > 0) { + void* p = malloc(nelem*ncclSizeOfT()); + if (p == NULL) { + WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT()); + return ncclSystemError; + } + //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), p); + memset(p, 0, nelem*ncclSizeOfT()); + *ptr = (T*)p; + } else { + *ptr = NULL; } - //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p); - memset(p, 0, nelem*sizeof(T)); - *ptr = (T*)p; return ncclSuccess; } #define ncclCalloc(...) 
ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -60,16 +71,16 @@ ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { if (nelem == oldNelem) return ncclSuccess; T* oldp = *ptr; - T* p = (T*)malloc(nelem*sizeof(T)); + T* p = (T*)malloc(nelem*ncclSizeOfT()); if (p == NULL) { - WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); + WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT()); return ncclSystemError; } - memcpy(p, oldp, oldNelem*sizeof(T)); + memcpy(p, oldp, oldNelem*ncclSizeOfT()); free(oldp); - memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T)); + memset(p+oldNelem, 0, (nelem-oldNelem)*ncclSizeOfT()); *ptr = (T*)p; - INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr); + INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*ncclSizeOfT(), nelem*ncclSizeOfT(), *ptr); return ncclSuccess; } @@ -111,7 +122,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); if (handlep) *handlep = handle; - TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle); + TRACE(NCCL_ALLOC, "CuMem Alloc Size %zu pointer %p handle %llx", size, *ptr, handle); return result; } @@ -123,7 +134,7 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) { CUCHECK(cuMemRetainAllocationHandle(&handle, ptr)); CUCHECK(cuMemRelease(handle)); CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle); + TRACE(NCCL_ALLOC, "CuMem Free Size %zu pointer %p handle 0x%llx", size, ptr, handle); CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size)); CUCHECK(cuMemRelease(handle)); CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); @@ -151,15 +162,17 @@ ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, in cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); - } else { - CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + if (nelem > 0) { + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); + } else { + CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); + } } finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA malloc %ld bytes", nelem*ncclSizeOfT()); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), *ptr); return result; } #define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -170,21 +183,23 @@ ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, in cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - // Need a side stream so as not to interfere with graph capture. 
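The ncclSizeOfT() helper added at the top of this alloc.h diff exists because these allocators can be instantiated with T = void, and sizeof(void) is ill-formed in standard C++ (GCC only accepts it as an extension); routing every size computation through a helper that treats void as one byte keeps the byte arithmetic valid everywhere. A minimal self-contained restatement of the idea, using a local name rather than claiming NCCL's exact declaration:

#include <cstddef>

// sizeof(T) for concrete types, one byte for void so byte-count math still works.
template <typename T>
constexpr std::size_t sizeOfT() { return sizeof(T); }
template <>
constexpr std::size_t sizeOfT<void>() { return 1; }

static_assert(sizeOfT<int>() == sizeof(int), "unchanged for concrete types");
static_assert(sizeOfT<void>() == 1, "void treated as raw bytes");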
- cudaStream_t stream; - CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); - } else { - CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + if (nelem > 0) { + // Need a side stream so as not to interfere with graph capture. + cudaStream_t stream; + CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); + } else { + CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); + } + CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT(), stream), result, finish); + CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); + CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); } - CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); - CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); - CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc %ld bytes", nelem*ncclSizeOfT()); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), *ptr); return result; } #define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -195,16 +210,18 @@ ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); - } else { - CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + if (nelem > 0) { + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); + } else { + CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); + } + CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT(), stream), result, finish); } - CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc async %ld bytes", nelem*ncclSizeOfT()); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), *ptr); return result; } #define ncclCudaCallocAsync(...) 
ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -230,7 +247,7 @@ ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stre ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*sizeof(T), cudaMemcpyDefault, stream), result, finish); + CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*ncclSizeOfT(), cudaMemcpyDefault, stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); return result; @@ -256,13 +273,17 @@ ncclResult_t ncclCudaFree(T* ptr) { // allocated on separate pages as those pages will be marked DONTFORK // and if they are shared, that could cause a crash in a child process inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) { - size_t page_size = sysconf(_SC_PAGESIZE); - void* p; - int size_aligned = ROUNDUP(size, page_size); - int ret = posix_memalign(&p, page_size, size_aligned); - if (ret != 0) return ncclSystemError; - memset(p, 0, size); - *ptr = p; + if (size > 0) { + size_t page_size = sysconf(_SC_PAGESIZE); + void* p; + int size_aligned = ROUNDUP(size, page_size); + int ret = posix_memalign(&p, page_size, size_aligned); + if (ret != 0) return ncclSystemError; + memset(p, 0, size); + *ptr = p; + } else { + *ptr = NULL; + } INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr); return ncclSuccess; } diff --git a/src/include/bitops.h b/src/include/bitops.h new file mode 100644 index 000000000..95620cbe3 --- /dev/null +++ b/src/include/bitops.h @@ -0,0 +1,277 @@ +/************************************************************************* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_BITOPS_H_ +#define NCCL_BITOPS_H_ + +#include + +#if !__NVCC__ + #ifndef __host__ + #define __host__ + #endif + #ifndef __device__ + #define __device__ + #endif +#endif + +#define DIVUP(x, y) \ + (((x)+(y)-1)/(y)) + +#define ROUNDUP(x, y) \ + (DIVUP((x), (y))*(y)) + +#define ALIGN_POWER(x, y) \ + ((x) > (y) ? 
ROUNDUP(x, y) : ((y)/((y)/(x)))) + +#define ALIGN_SIZE(size, align) \ + size = ((size + (align) - 1) / (align)) * (align); + +template +__host__ __device__ constexpr Z divUp(X x, Y y) { + return (x+y-1)/y; +} + +template +__host__ __device__ constexpr Z roundUp(X x, Y y) { + return (x+y-1) - (x+y-1)%y; +} +template +__host__ __device__ constexpr Z roundDown(X x, Y y) { + return x - x%y; +} + +// assumes second argument is a power of 2 +template +__host__ __device__ constexpr Z alignUp(X x, int a) { + return (x + a-1) & Z(-a); +} +// assumes second argument is a power of 2 +template +__host__ __device__ constexpr Z alignDown(X x, int a) { + return x & Z(-a); +} + +template +inline __host__ __device__ int countOneBits(Int x) { +#if __CUDA_ARCH__ + if (sizeof(Int) <= sizeof(unsigned int)) { + return __popc((unsigned int)x); + } else if (sizeof(Int) <= sizeof(unsigned long long)) { + return __popcll((unsigned long long)x); + } else { + static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); + return -1; + } +#else + if (sizeof(Int) <= sizeof(unsigned int)) { + return __builtin_popcount((unsigned int)x); + } else if (sizeof(Int) <= sizeof(unsigned long)) { + return __builtin_popcountl((unsigned long)x); + } else if (sizeof(Int) <= sizeof(unsigned long long)) { + return __builtin_popcountll((unsigned long long)x); + } else { + static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); + return -1; + } +#endif +} + +// Returns index of first one bit or returns -1 if mask is zero. +template +inline __host__ __device__ int firstOneBit(Int mask) { + int i; +#if __CUDA_ARCH__ + if (sizeof(Int) <= sizeof(int)) { + i = __ffs((int)mask); + } else if (sizeof(Int) <= sizeof(long long)) { + i = __ffsll((long long)mask); + } else { + static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); + } +#else + if (sizeof(Int) <= sizeof(int)) { + i = __builtin_ffs((int)mask); + } else if (sizeof(Int) <= sizeof(long)) { + i = __builtin_ffsl((long)mask); + } else if (sizeof(Int) <= sizeof(long long)) { + i = __builtin_ffsll((long long)mask); + } else { + static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); + } +#endif + return i-1; +} + +template +inline __host__ __device__ int popFirstOneBit(Int* mask) { + Int tmp = *mask; + *mask &= *mask-1; + return firstOneBit(tmp); +} + +template +inline __host__ __device__ int log2Down(Int x) { + int w, n; +#if __CUDA_ARCH__ + if (sizeof(Int) <= sizeof(int)) { + w = 8*sizeof(int); + n = __clz((int)x); + } else if (sizeof(Int) <= sizeof(long long)) { + w = 8*sizeof(long long); + n = __clzll((long long)x); + } else { + static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); + } +#else + if (x == 0) { + return -1; + } else if (sizeof(Int) <= sizeof(unsigned int)) { + w = 8*sizeof(unsigned int); + n = __builtin_clz((unsigned int)x); + } else if (sizeof(Int) <= sizeof(unsigned long)) { + w = 8*sizeof(unsigned long); + n = __builtin_clzl((unsigned long)x); + } else if (sizeof(Int) <= sizeof(unsigned long long)) { + w = 8*sizeof(unsigned long long); + n = __builtin_clzll((unsigned long long)x); + } else { + static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); + } +#endif + return (w-1)-n; +} + +template +inline __host__ __device__ int log2Up(Int x) { + int w, n; + if (x != 0) x -= 1; +#if __CUDA_ARCH__ + if (sizeof(Int) <= sizeof(int)) { + w = 8*sizeof(int); + n = __clz((int)x); + } else if (sizeof(Int) <= sizeof(long long)) { 
+ w = 8*sizeof(long long); + n = __clzll((long long)x); + } else { + static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); + } +#else + if (x == 0) { + return 0; + } else if (sizeof(Int) <= sizeof(unsigned int)) { + w = 8*sizeof(unsigned int); + n = __builtin_clz((unsigned int)x); + } else if (sizeof(Int) <= sizeof(unsigned long)) { + w = 8*sizeof(unsigned long); + n = __builtin_clzl((unsigned long)x); + } else if (sizeof(Int) <= sizeof(unsigned long long)) { + w = 8*sizeof(unsigned long long); + n = __builtin_clzll((unsigned long long)x); + } else { + static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); + } +#endif + return w-n; +} + +template +inline __host__ __device__ Int pow2Up(Int x) { + return Int(1)< +inline __host__ __device__ Int pow2Down(Int x) { + return Int(1)< +inline __host__ UInt reverseSubBits(UInt x) { + if (nSubBits >= 16 && 8*sizeof(UInt) == nSubBits) { + switch (8*sizeof(UInt)) { + case 16: x = __builtin_bswap16(x); break; + case 32: x = __builtin_bswap32(x); break; + case 64: x = __builtin_bswap64(x); break; + default: static_assert(8*sizeof(UInt) <= 64, "Unsupported integer type."); + } + return reverseSubBits(x); + } else if (nSubBits == 1) { + return x; + } else { + UInt m = UInt(-1)/((UInt(1)<<(nSubBits/2))+1); + x = (x & m)<<(nSubBits/2) | (x & ~m)>>(nSubBits/2); + return reverseSubBits(x); + } +} + +template struct ncclToUnsigned; +template<> struct ncclToUnsigned { using type = unsigned char; }; +template<> struct ncclToUnsigned { using type = unsigned char; }; +template<> struct ncclToUnsigned { using type = unsigned char; }; +template<> struct ncclToUnsigned { using type = unsigned short; }; +template<> struct ncclToUnsigned { using type = unsigned short; }; +template<> struct ncclToUnsigned { using type = unsigned int; }; +template<> struct ncclToUnsigned { using type = unsigned int; }; +template<> struct ncclToUnsigned { using type = unsigned long; }; +template<> struct ncclToUnsigned { using type = unsigned long; }; +template<> struct ncclToUnsigned { using type = unsigned long long; }; +template<> struct ncclToUnsigned { using type = unsigned long long; }; + +// Reverse the bottom nBits bits of x. The top bits will be overwritten with 0's. +template +inline __host__ __device__ Int reverseBits(Int x, int nBits) { + using UInt = typename ncclToUnsigned::type; + union { UInt ux; Int sx; }; + sx = x; + #if __CUDA_ARCH__ + if (sizeof(Int) <= sizeof(unsigned int)) { + ux = __brev(ux); + } else if (sizeof(Int) <= sizeof(unsigned long long)) { + ux = __brevll(ux); + } else { + static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer type."); + } + #else + ux = reverseSubBits(ux); + #endif + ux = nBits==0 ? 0 : ux>>(8*sizeof(UInt)-nBits); + return sx; +} + +//////////////////////////////////////////////////////////////////////////////// +// Custom 8 bit floating point format for approximating 32 bit uints. This format +// has nearly the full range of uint32_t except it only keeps the top 3 bits +// beneath the leading 1 bit and thus has a max value of 0xf0000000. + +inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) { + int log2x; + #if __CUDA_ARCH__ + log2x = 31-__clz(x|1); + #else + log2x = 31-__builtin_clz(x|1); + #endif + uint32_t mantissa = x>>(log2x >= bitsPerPow2 ? log2x-bitsPerPow2 : 0) & ((1u<= bitsPerPow2 ? 
log2x-(bitsPerPow2-1) : 0; + return exponent<>bitsPerPow2; + uint32_t mantissa = (x & ((1u< ncclResult_t initChannel(struct ncclComm* comm, int channelid); ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); -static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) { - int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2; - int peerNode = comm->rankToNode[peer]; - int peerIndex = comm->rankToLocalRank[peer]; - int nsteps = comm->maxLocalRanks; - int rankIndex = comm->rankToLocalRank[comm->rank]; - int step, delta; - if (coll == ncclFuncSend) { - step = (nsteps + peerIndex - rankIndex)%nsteps; - delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes; - } else if (coll == ncclFuncRecv) { - step = (nsteps + rankIndex - peerIndex)%nsteps; - delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes; + +inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) { + if (comm->nNodes > 1) { + int nodeDelta = p2pRound/comm->maxLocalRanks; + int localDelta = p2pRound%comm->maxLocalRanks; + int base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH); + base += localDelta/NCCL_MAX_DEV_WORK_P2P_PER_BATCH; + return base & 0xff; } else { - return ncclInternalError; + return p2pRound & 0xff; } - *channelBase = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step; - return ncclSuccess; -} - -static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) { - //*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels; - *channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels; - return ncclSuccess; -} - -static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) { - int base; - NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base)); - NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId)); - return ncclSuccess; } #endif diff --git a/src/include/checks.h b/src/include/checks.h index c9fd16176..89355c3da 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -123,23 +123,23 @@ } while (0); #define NCCLWAIT(call, cond, abortFlagPtr) do { \ - volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ + uint32_t* tmpAbortFlag = (abortFlagPtr); \ ncclResult_t RES = call; \ if (RES != ncclSuccess && RES != ncclInProgress) { \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ return ncclInternalError; \ } \ - if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \ + if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECK(*tmpAbortFlag, 0); \ } while (!(cond)); #define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \ - volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ + uint32_t* tmpAbortFlag = (abortFlagPtr); \ RES = call; \ if (RES != ncclSuccess && RES != ncclInProgress) { \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ goto label; \ } \ - if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \ + if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \ } while (!(cond)); #define NCCLCHECKTHREAD(a, args) do { \ diff --git a/src/include/collectives.h 
b/src/include/collectives.h index 888df728f..fb7af3bff 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -8,6 +8,8 @@ #define NCCL_COLLECTIVES_H_ #include "nccl.h" +#include "nccl_common.h" +#include "device.h" // CHUNKSIZE must be a multiple of SLICESIZE #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) @@ -22,6 +24,12 @@ #define REDUCE_CHUNKSTEPS 1 #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above +const char* ncclFuncToString(ncclFunc_t op); +const char* ncclDevRedOpToString(ncclDevRedOp_t op); +const char* ncclDatatypeToString(ncclDataType_t type); +const char* ncclAlgoToString(int algo); +const char* ncclProtoToString(int proto); + inline int ncclTypeSize(ncclDataType_t type) { switch (type) { case ncclInt8: diff --git a/src/include/comm.h b/src/include/comm.h index 0ba913ada..0cc0a8911 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -7,7 +7,7 @@ #ifndef NCCL_COMM_H_ #define NCCL_COMM_H_ -#include "transport.h" +//#include "transport.h" #include "p2p.h" #include "collectives.h" #include "nccl_tuner.h" @@ -15,6 +15,7 @@ #include "strongstream.h" #include "nccl_net.h" #include "register.h" +#include "graph.h" #if CUDART_VERSION < 9000 struct cudaLaunchParams { @@ -144,7 +145,7 @@ struct ncclChannel { struct ncclNvls nvls; int id; // index of this channel - uint32_t workFifoSent; // last used work index+1 + uint32_t workFifoProduced; // +1 successor of last used work fifo byte /* comm split sharable resources */ struct ncclChannelPeer* collnetPeers; @@ -153,22 +154,15 @@ struct ncclChannel { struct ncclDevChannelPeer* nvlsDevPeers; }; -struct ncclWorkList { - struct ncclWorkList* next; - struct ncclWork work; -}; - -struct ncclPointerList { - struct ncclPointerList* next; - void *ptr; +struct ncclWorkBatchList { + struct ncclWorkBatchList* next; + struct ncclDevWorkBatch batch; }; - -struct ncclNvlsMcHandleList { - struct ncclNvlsMcHandleList *next; - CUmemGenericAllocationHandle mcHandle; - CUdeviceptr ptr; - int dev; - size_t size; +struct alignas(16) ncclWorkList { + struct ncclWorkList* next; + enum ncclDevWorkType workType; + int size; // Size of struct following this node + // ncclDevWorkColl, ncclDevWorkColLReg, ncclDevWorkP2p[]... }; struct ncclCollnetHandleList { @@ -188,33 +182,190 @@ struct ncclKernelPlan { struct ncclKernelPlan* next; bool persistent; // aka captured in a graph + enum ncclDevWorkStorageType workStorageType; bool kernelSpecialized; void *kernelFn; - int channelUbound; // only channels c < channelUbound are present - int channelCount; // number of channels present - uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask) + struct ncclDevKernelArgs* kernelArgs; + size_t kernelArgsSize; + uint64_t channelMask; // bitset of which channels are present bool hasProxyOps; // does any channel have a non-empty proxyOpQueue int threadPerBlock; - // workHeap fields are null until uploadWorkFifo() or preparePersistentKernel() - struct ncclWork* workHead; - int collOpCount; // zero based for this plan + int collOpCount; // Number of collectives in this plan. + int nWorkBatches; // Number of work batches. + size_t workBytes; // Sum size of all work (in the fifo) in bytes. 
+ struct ncclIntruQueue workQueue; + struct ncclIntruQueue cleanupQueue; + void* workBufPersistent; - struct ncclIntruQueue ipcMemQueue; - struct ncclIntruQueue nvlsMcHandleQueue; - struct ncclIntruQueue collnetHandleQueue; + struct ncclIntruQueue proxyOpQueue; +}; - struct Channel { - int nWork; - union { - int nWorkElem; // used for coll and reg coll - int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1 - }; - size_t collBytes; - struct ncclIntruQueue workQueue; - struct ncclIntruQueue proxyOpQueue; - } channels[MAXCHANNELS]; - size_t maxBytesPerChannel; +//////////////////////////////////////////////////////////////////////////////// + +struct ncclTaskColl { + struct ncclTaskColl* next; + ncclFunc_t func; + void const* sendbuff; + void* recvbuff; + size_t count; + int root; + ncclDataType_t datatype; + ncclRedOp_t opHost; + struct ncclDevRedOpFull opDev; + int chunkSteps, sliceSteps; + // Computed later: + size_t trafficBytes; + int32_t nMaxChannels:8; + int32_t nWarps:8; + int32_t algorithm:8, protocol:8; + uint32_t isCollnet:1, isNvls:1; + uint32_t devFuncId:30; + enum ncclRegBufferType regBufType; + // number of elements in planner->ipcMemQueue associated with this collective + int nCleanupQueueElts; + + void* sendMhandle; + void* recvMhandle; +}; +struct ncclTaskP2p { + struct ncclTaskP2p* next; + void* buff; + size_t bytes; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Roughly sorts ncclTaskColl's by their size descending. This structure is +// self-referential, meaning that pointers it contains internally may point +// into the structure itself. This means that it is NOT memcpy-moveable: + +struct ncclTaskCollSorter { + static constexpr int UnitLog2 = 10; // 1K + static constexpr size_t UnitSize = 1<>UnitLog2, BitsPerPow2); + bin = BinCount-1 - bin; // descending bin + + if (me->bins[bin] == nullptr) { + if (me->binEdge <= bin) { + me->binEdge = bin+1; + me->bins[bin] = me->tail ? &me->tail->next : &me->head; + me->tail = x; + } else { + // Find successor non-empty bin after this one. + int succ = bin+1; + while (me->bins[succ] == nullptr) succ++; + // What was our successor's head's previous is now our head's previous. + me->bins[bin] = me->bins[succ]; + // The first node we insert is our tail, so that becomes our successor's + // head's new previous. + me->bins[succ] = &x->next; + } + } + // Push a new head for this bin. + x->next = *me->bins[bin]; + *me->bins[bin] = x; +} + +inline bool ncclTaskCollSorterEmpty(struct ncclTaskCollSorter* me) { + return me->head == nullptr; +} + +// Reset sorter and return sorted linked list of its coll tasks. 
+inline struct ncclTaskColl* ncclTaskCollSorterDequeueAll(struct ncclTaskCollSorter* me) { + struct ncclTaskColl* head = me->head; + if (head != nullptr) memset(me, 0, sizeof(*me)); + return head; +} + +//////////////////////////////////////////////////////////////////////////////// + +struct ncclCudaStreamList { + struct ncclCudaStreamList *next; + cudaStream_t stream; +}; + +struct ncclKernelPlanner { + ////////////////////////////////////////////////////////////////////////////// + // State for accumulating tasks between ncclGroupStart/End() + ////////////////////////////////////////////////////////////////////////////// + + struct Peer { + bool sendSeen, recvSeen; + struct ncclIntruQueue sendQueue; + struct ncclIntruQueue recvQueue; + }; + struct ncclTaskCollSorter collSorter; + struct Peer* peers/*[nRanks]*/; + int nTasksColl, nTasksP2p; + bool persistent; + + // The list of user streams aggregated over all tasks present. + struct ncclCudaStreamList* streams; + // The most recent user stream. Ignored if streams==nullptr + cudaStream_t streamRecent; + // The graph capturing all user streams or invalid if none. Thus we restrict the + // user that all streams must be captured in the same graph or not captured + // at all. Technically we could probably relax this, but that would mean + // collecting a different `ncclTasks` per graph and one for non-graph. + struct ncclCudaGraph capturingGraph; + + ////////////////////////////////////////////////////////////////////////////// + // Lists of tasks to be assembled into plans. + ////////////////////////////////////////////////////////////////////////////// + + struct ncclIntruQueue collTaskQueue; + struct ncclIntruQueue collWorkQueue; + struct ncclIntruQueue collCleanupQueue; + + ////////////////////////////////////////////////////////////////////////////// + // State for building current (Work-In-Progress) plan: + ////////////////////////////////////////////////////////////////////////////// + + struct WipPlan { + struct Channel { + struct { + int workBytes; // Sum size of work metadata referenced by this batch. + int nP2ps; // Number of p2p works in this batch + int p2pRounds[NCCL_MAX_DEV_WORK_P2P_PER_BATCH]; // which rounds are present in this batch. + } wipBatch; // work-in-progress batch which will be next tail of workBatchQueue + int nWorkBatchesP2p; // number of p2p batches for this channel. + struct ncclIntruQueue workBatchQueue; + struct ncclIntruQueue proxyOpQueue; + } channels[MAXCHANNELS]; + } wipPlan; + + ////////////////////////////////////////////////////////////////////////////// + // State for launching built plans: + ////////////////////////////////////////////////////////////////////////////// + + // List of kernel plans built form tasks. + struct ncclIntruQueue planQueue; + // First of the unlaunched kernels in `planQueue` + struct ncclKernelPlan* unlaunchedPlansHead; }; #define NCCL_MAGIC 0x0280028002800280 // Nickel atomic number is 28. @@ -233,12 +384,18 @@ struct ncclComm { struct ncclPeerInfo* peerInfo; struct ncclTopoSystem* topo; + int netPluginLoaded; ncclNet_t* ncclNet; + ncclNetDeviceType netDeviceType; ncclCollNet_t* ncclCollNet; void* bootstrap; // Bitmasks for ncclTransportP2pSetup uint64_t* connectSend; uint64_t* connectRecv; + struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS]; + bool initAlgoChannels[NCCL_NUM_ALGORITHMS]; + bool runtimeConn; // if dynamic connection is supported + int cuMemSupport; uint64_t magic; // Magic number for all network communication. 
Not a security key -- only goal is to detect mismatches. @@ -253,6 +410,9 @@ struct ncclComm { cpu_set_t cpuAffinity; // CPU affinity of the GPU int cudaArch; // matches __CUDA_ARCH__ of device + int cpuArch; // architecture - As defined in src/include/graph.h, e.g. x86/arm/ppc/mixed + int cpuVendor; // vendor - As defined in src/include/graph.h + int node; int nNodes; int localRank; @@ -278,10 +438,11 @@ struct ncclComm { int nChannels; // connection nChannels int collChannels; // enqueue nChannels int nvlsChannels; // enqueue nChannels + // all nvls heads stored to check if we can splitShare + int nvlsHeads[MAXCHANNELS]; // Channels (per peer) for p2p int p2pnChannels; int p2pnChannelsPerPeer; - int p2pChannels[MAXCHANNELS]; // Should this comm allocate LL buffers for network P2P connections? bool allocP2pNetLLBuffers; @@ -303,23 +464,28 @@ struct ncclComm { ncclResult_t asyncResult; // Flag to ask NCCL kernels to abort - volatile uint32_t *abortFlag; - volatile uint32_t *childAbortFlag; - uint32_t *abortFlagRefCount; + uint32_t* abortFlag; + uint32_t* abortFlagDev; + int* abortFlagRefCount; + uint32_t* childAbortFlag; + uint32_t* childAbortFlagDev; + uint32_t destroyFlag; // Device side of the communicator (for cudaFree's) struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm - // Operation pool. - int workFifoDepth; // size of workFifoHeap[], power of 2 - struct ncclWork* workFifoHeap; - struct ncclWork* devWorkFifoHeap; - void* workFifoHeapGdrHandle; + uint32_t workArgsBytes; // max size of kernel args + uint32_t workFifoBytes; // size of workFifoBuf, power of 2 + void* workFifoBuf; + void* workFifoBufDev; + void* workFifoBufGdrHandle; - // Work completion notificaion - uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory - uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot. - uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels. + // Monotonic number of bytes (mod 1<<32) consumed per channel. In cudaHost memory. + uint32_t* workFifoConsumed/*[MAXCHANNELS]*/; + // Last observed value of: min(workFifoConsumed[c] for c < MAXCHANNELS) + uint32_t workFifoConsumedLeast; + // Monotonic number of bytes (mod 1<<32) sent to fifo. + uint32_t workFifoProduced; // Intra-process sync struct ncclComm* intraComm0; // leader of intra-process comms (self possible) @@ -337,7 +503,7 @@ struct ncclComm { // Whether this communicator uses collNet int collNetSupport; bool collNetRegSupport; - uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes]; + uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes]; int intraHighestTransportType; int* collNetHeads; int collNetHeadsNum; @@ -355,16 +521,16 @@ struct ncclComm { // pools backed by comm->memPermanent struct ncclMemoryPool memPool_ncclProxyOp; struct ncclMemoryPool memPool_ncclKernelPlan; - struct ncclMemoryPool memPool_ncclPointerList; - struct ncclMemoryPool memPool_ncclNvlsHandleList; - struct ncclMemoryPool memPool_ncclCollnetHandleList; + // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when // this comm is not yet in a group. struct ncclComm* groupNext; // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. 
struct ncclComm* preconnectNext; int persistentRefs; // number of persistent plan-lists capturing this comm - struct ncclTasks tasks; + struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule; + + struct ncclKernelPlanner planner; // user-created reduction ops int userRedOpCapacity, userRedOpFreeHead; @@ -373,11 +539,6 @@ struct ncclComm { // Queue of things for the main thread to do struct ncclIntruQueueMpsc callbackQueue; - // List of kernel plans built form tasks. - struct ncclIntruQueue planQueue; - // First of the unlaunched kernels in `planQueue` - struct ncclKernelPlan* unlaunchedPlansHead; - ncclConfig_t config; // initState is to more conveniently reclaim resources when errors happen. ncclResult_t initState; @@ -389,6 +550,7 @@ struct ncclComm { struct ncclGroupJob *groupJob; // Tuning plugin + int tunerPluginLoaded; ncclTuner_t* tuner; void *tunerContext; // buffer registration cache diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h index b4eb5c312..fd7b0310e 100644 --- a/src/include/cudawrap.h +++ b/src/include/cudawrap.h @@ -80,6 +80,10 @@ DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent); DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent); DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice); DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute); +DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel); +#if CUDART_VERSION >= 11080 +DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx); +#endif // cuMem API support DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve); DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree); diff --git a/src/include/debug.h b/src/include/debug.h index eb5189058..491ac3e12 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -10,21 +10,14 @@ #include "nccl.h" #include "nccl_common.h" #include -#include -#include -#include -#include #include // Conform to pthread and NVTX standard #define NCCL_THREAD_NAMELEN 16 extern int ncclDebugLevel; -extern uint64_t ncclDebugMask; -extern pthread_mutex_t ncclDebugLock; extern FILE *ncclDebugFile; -extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); @@ -32,13 +25,13 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file extern thread_local int ncclDebugNoWarn; extern char ncclLastError[]; +#define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) #define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__) #ifdef ENABLE_TRACE #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) -extern std::chrono::steady_clock::time_point ncclEpoch; #else #define TRACE(...) 
#endif diff --git a/src/include/device.h b/src/include/device.h index 50f841bfc..76a909f7a 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -9,8 +9,10 @@ #include "nccl.h" #include "nccl_common.h" -#include "align.h" +#include "bitops.h" +#include #include +#include extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS]; @@ -21,6 +23,12 @@ extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS]; #define NCCL_MAX_OPS 2048 #define NCCL_STEPS 8 +#ifdef __CUDA_ARCH__ + #define NCCL_CUDA_ARCH __CUDA_ARCH__ +#else + #define NCCL_CUDA_ARCH 0 +#endif + #include "net_device.h" enum ncclDevRedOp_t { @@ -52,8 +60,11 @@ union ncclLLFifoLine { #define WARP_SIZE 32 #define MAXCHANNELS 32 +#define NCCL_MAX_LOCAL_RANKS 64 #define NCCL_MAX_NTHREADS 640 +#define NCCL_MIN_NTHREADS (4*WARP_SIZE) #define NCCL_SIMPLE_MAX_NTHREADS 512 +#define NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE (3*WARP_SIZE) #define NCCL_LL_MAX_NTHREADS 512 #define NCCL_LL_LINES_PER_THREAD 8 #ifdef TEST_LL_CLEANUP @@ -84,6 +95,9 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK #define NCCL_IPC_READ 0x10 #define NCCL_NVLS_MIN_POLL 0x20 +// Number of named barriers supported by CUDA +#define NCCL_MAX_GROUPS 16 + #define NCCL_MAX_COLLNET_SIZE (1L << 29) enum ncclRegBufferType { @@ -196,112 +210,155 @@ struct ncclChannelPeer { struct ncclDevComm; -/* ncclWork is to be a power of two, currently 8x64 bytes, */ -/* to make sure reads to host from the CUDA kernel are aligned. */ -/* Make sure to adjust padding at the end of ncclWorkElem. */ -#define NCCL_WORK_SIZE 512 - -enum ncclWorkType : uint8_t { - ncclWorkTypeUnused=0, - ncclWorkTypeColl=1, - ncclWorkTypeP2p=2, - ncclWorkTypeRegColl=3 -}; -enum ncclWorkP2PType : uint8_t { - ncclWorkP2pTypeUnused=0, - ncclWorkP2pTypeSend, - ncclWorkP2pTypeRecv +struct alignas(16) ncclDevWorkP2p { + void *sendAddr, *recvAddr; + size_t sendBytes, recvBytes; + int sendRank, recvRank; + // From the part index, nP2pChannels, and channelBase the device code can + // calculate which part of the transfer a channel is responsible for. + uint8_t nP2pChannels; // Always equal to comm->p2pnChannels + uint8_t channelBase; // Channel owning first part. + // Zero channels indicates no work in that direction. + uint8_t nSendChannels, nRecvChannels; + // Chunk size stored in 8 bits via u32fp8Encode/Decode. + uint8_t sendChunkSize_u32fp8, recvChunkSize_u32fp8; + + uint8_t sendProtoLL:1, recvProtoLL:1; + uint8_t sendRegistered:1, recvRegistered:1; }; -struct ncclWorkHeader { - union { - int32_t workNext; // when isLast=0: Offset from kernel argument workHead - uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back. - }; - uint16_t funcIndex; - uint8_t isLast:1; // last work for this kernel - uint8_t inFifo:1; // is this work in the fifo - enum ncclWorkType type; -}; +// Compute the subset of the data transfer corresponding to the given part index. 
+inline __host__ __device__ void ncclP2pPartBounds(int nParts, int part, size_t bytes, size_t* partBeg, size_t* partEnd) { + size_t partBytes = alignUp(divUp(bytes, nParts), 4<<10); + #if __CUDA_ARCH__ + *partBeg = min((part+0)*partBytes, bytes); + *partEnd = min((part+1)*partBytes, bytes); + #else + *partBeg = std::min((part+0)*partBytes, bytes); + *partEnd = std::min((part+1)*partBytes, bytes); + #endif +} -struct ncclWorkElem { - union { - uint8_t flagBits; - struct { - uint8_t isUsed:1, redOpArgIsPtr:1, oneNode:1; - }; - }; - uint8_t regUsed; - uint8_t nWarps; - uint8_t direct; - uint32_t root; - const void *sendbuff; - void *recvbuff; +// implemented in channel.h +inline __host__ uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound); - size_t count; - uint64_t redOpArg; - uint64_t chunkCount:25, workCount:39; +// ncclP2pChannelToPart and ncclP2pChannelForPart are inverses. The device code +// uses ncclP2pChannelToPart to determine which part "this" channel is responsible for. +inline __host__ int ncclP2pChannelForPart(int nP2pChannels, int base, int part) { + // Only works because nP2pChannels is pow2 + int nChannelsLog2 = countOneBits(nP2pChannels-1); + int delta = reverseBits(part, nChannelsLog2); + return (base + delta) & (nP2pChannels-1); +} +inline __device__ int ncclP2pChannelToPart(int nP2pChannels, int base, int channel) { + // Only works because nP2pChannels is pow2 + int nChannelsLog2 = countOneBits(nP2pChannels-1); + int delta = (channel-base) & (nP2pChannels-1); + return reverseBits(delta, nChannelsLog2); +} + +struct alignas(16) ncclDevWorkColl { + // Running on channels [channelLo..channelHi], hi is inclusive. + // nChannels == (channelHi - channelLo) + 1 + uint32_t channelLo:8, channelHi:8; + uint32_t nWarps:8; + uint32_t redOpArgIsPtr:1, regUsed:2, oneNode:1, direct:4; + uint32_t root; + void* recvbuff; + void* sendbuff; union { + // Continuous-byte-distribution scheduling. The lo and hi channels are of + // different size than the channels in the middle. struct { - uint64_t lastChunkCount:25; - uint64_t workOffset:39; - }; + size_t countLo, countMid, countHi; + // Chunk counts where units are ncclProtoGrainSize(protocol) bytes + uint64_t chunkGrainsLo:21, chunkGrainsMid:21, chunkGrainsHi:21; + } cbd; + // Collnet scheduling. All channels divide work evenly. struct { - uint64_t bid:32; - uint64_t nChannels:32; - }; + size_t count; // Total size, not divided per channel. + uint32_t chunkCount; + } collnet; }; + uint64_t redOpArg; }; -#define NCCL_MAX_WORK_ELEMENTS ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem)) -static_assert(NCCL_MAX_WORK_ELEMENTS == 9, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 9"); - -struct ncclWorkElemP2p { - int peer : 30; - int proto : 2; - - enum ncclWorkP2PType p2pType; - uint8_t reg:1; - uint8_t nWarps:5; - uint8_t warpStart; - uint8_t ngroups; - // Important not to use any fields with greater than 4-byte alignment since - // we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if - // there were 8-byte fields. 
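// Illustrative check, not part of this patch: ncclP2pChannelForPart and
// ncclP2pChannelToPart above are inverses for any power-of-two nP2pChannels.
// sketchPopcount/sketchReverseBits below are local stand-ins for the bitops.h
// countOneBits/reverseBits helpers; the real implementations may differ.
#include <cassert>
static int sketchPopcount(unsigned v) { int n = 0; while (v) { n += v & 1; v >>= 1; } return n; }
static int sketchReverseBits(int x, int nBits) {
  int r = 0;
  for (int i = 0; i < nBits; i++) r |= ((x >> i) & 1) << (nBits - 1 - i);
  return r;
}
static void checkP2pChannelMapping(int nP2pChannels /*pow2*/, int base) {
  int nLog2 = sketchPopcount(nP2pChannels - 1);
  for (int part = 0; part < nP2pChannels; part++) {
    int channel = (base + sketchReverseBits(part, nLog2)) & (nP2pChannels - 1); // ...ForPart
    int delta = (channel - base) & (nP2pChannels - 1);
    assert(sketchReverseBits(delta, nLog2) == part);                            // ...ToPart
  }
}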
- //void* buff; - uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32; - //size_t count; - uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32; - int chunkSize; -}; -static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) >= 16, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 16"); -#define NCCL_MAX_WORK_ELEMENTS_P2P 16 +__host__ __device__ constexpr int ncclProtoGrainSize(int proto) { + return proto == NCCL_PROTO_LL ? 16 : + proto == NCCL_PROTO_LL128 ? WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS*NCCL_LL128_DATAELEMS*sizeof(uint64_t) : + proto == NCCL_PROTO_SIMPLE ? 512 : + -1; +} + +template +__host__ __device__ inline void ncclCollCbdPart( + struct ncclDevWorkColl* work, uint32_t channelId, int proto, int eltSize, + Int* count, Int* partOffset, Int* partCount, Int* chunkCount + ) { + int eltPerGrain = ncclProtoGrainSize(proto)/eltSize; + int nMidChannels = work->channelHi - work->channelLo - 1; + // We can assum that nMidChannels<0 implies countMid==0, which let's us assume + // that countMid*nMidChannels == 0. + if (count != nullptr) { + *count = work->cbd.countLo + work->cbd.countMid*nMidChannels + work->cbd.countHi; + } + if (channelId == work->channelLo) { + *partOffset = 0; + *partCount = work->cbd.countLo; + *chunkCount = work->cbd.chunkGrainsLo*eltPerGrain; + } else if (channelId == work->channelHi) { + *partOffset = work->cbd.countLo + nMidChannels*work->cbd.countMid; + *partCount = work->cbd.countHi; + *chunkCount = work->cbd.chunkGrainsHi*eltPerGrain; + } else { + int mid = channelId - work->channelLo - 1; + *partOffset = work->cbd.countLo + mid*work->cbd.countMid; + *partCount = work->cbd.countMid; + *chunkCount = work->cbd.chunkGrainsMid*eltPerGrain; + } +} -struct ncclWorkElemReg { - struct ncclWorkElem elem; +struct alignas(16) ncclDevWorkCollReg { + struct ncclDevWorkColl coll; void* dnInputs[NCCL_MAX_DIRECT_ARITY+1]; void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1]; void* upOutputs[NCCL_MAX_DIRECT_ARITY+1]; }; -#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg)) -static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 2, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 2"); +enum ncclDevWorkType: uint8_t { + ncclDevWorkTypeP2p, + ncclDevWorkTypeColl, + ncclDevWorkTypeCollReg +}; -// Number of named barriers supported by CUDA -#define NCCL_MAX_GROUPS 16 +constexpr size_t ncclDevWorkSize(enum ncclDevWorkType type) { + return type == ncclDevWorkTypeP2p ? sizeof(ncclDevWorkP2p) : + type == ncclDevWorkTypeColl ? sizeof(ncclDevWorkColl) : sizeof(ncclDevWorkCollReg); +} -struct ncclWork { - struct ncclWorkHeader header; +#define NCCL_MAX_DEV_WORK_BATCH_BYTES 1024 +#define NCCL_MAX_DEV_WORK_BATCH_COLLS (NCCL_MAX_DEV_WORK_BATCH_BYTES/sizeof(ncclDevWorkColl)) +#define NCCL_MAX_DEV_WORK_P2P_PER_BATCH 8 +struct alignas(16) ncclDevWorkBatch { union { - char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)]; - struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS]; - struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P]; - struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG]; + struct { + // nextExtends: should next one be merged into this one. 
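// Worked example, not part of this patch: the continuous-byte-distribution (CBD)
// layout consumed by ncclCollCbdPart above. The lo and hi channels carry countLo
// and countHi elements, every middle channel carries countMid, so the per-channel
// pieces tile the total exactly. Assumes at least two channels (channelHi > channelLo);
// the helper name is hypothetical.
#include <cassert>
#include <cstddef>
static void cbdSanityCheck(int channelLo, int channelHi,
                           size_t countLo, size_t countMid, size_t countHi) {
  int nMidChannels = channelHi - channelLo - 1;
  size_t total = countLo + (size_t)nMidChannels*countMid + countHi;
  size_t covered = 0;
  for (int c = channelLo; c <= channelHi; c++) {
    if (c == channelLo)      covered += countLo;
    else if (c == channelHi) covered += countHi;
    else                     covered += countMid;
  }
  assert(covered == total);   // every element lands on exactly one channel
}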
+ // nextJump=0: end of this channel's batch list + // nextJump>0: batches[thisIndex+nextJump] is next batch in this list + uint32_t nextJump:14, nextExtends:1; + uint32_t workType:2, funcId:15; + }; + // Unioning bitfields with underlying type hints compiler to emit the best + // SASS LD/ST accesses. + uint32_t flags; }; + // Rolling offset in fifo where this batch's work structs begin + uint32_t offsetBase; + // Set of relative offsets from offsetBase for this channel's subset of the batch: + // For each bit index i in offsetMask, find work at fifo offset: offsetBase + i*sizeof(WorkStructType) + uint64_t offsetBitset; }; -static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE"); -static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0"); struct ncclDevChannelPeer { // Stripped version of ncclChannelPeer where we only keep the ncclConnInfo @@ -328,9 +385,8 @@ struct ncclDevComm { int buffSizes[NCCL_NUM_PROTOCOLS]; int p2pChunkSize; - // Operation list for aggregation - int workFifoDepth; - struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory + // Work fifo return credits + uint32_t* workConsumed/*[MAXCHANNELS]*/; int* collNetDenseToUserRank; @@ -346,11 +402,37 @@ struct alignas(16) ncclDevCommAndChannels { struct ncclDevChannel channels[MAXCHANNELS]; }; -#ifdef __CUDA_ARCH__ - #define NCCL_CUDA_ARCH __CUDA_ARCH__ -#else - #define NCCL_CUDA_ARCH 0 -#endif +enum ncclDevWorkStorageType: uint8_t { + ncclDevWorkStorageTypeArgs=0, + ncclDevWorkStorageTypeFifo=1, + ncclDevWorkStorageTypePersistent=2 +}; + +struct alignas(16) ncclDevKernelArgs { + struct ncclDevComm* comm; + uint64_t channelMask; + enum ncclDevWorkStorageType workStorageType; + uint32_t workMask; + void* workBuf; + // A channel's first batch is at `blockIdx.x`. Use `nextJump` to follow rest of list. + // struct ncclDevWorkBatch batches[]; +}; + +__host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { + //return (cudaArch < 700 || cudaDriver < 12010) ? 4<<10 : (32<<10)-4; + return 4<<10; +} + +template +struct alignas(16) ncclDevKernelArgsStorage { + union { + struct ncclDevKernelArgs args; + ulong2 storage[capacity/sizeof(ulong2)]; + }; +}; + +typedef ncclDevKernelArgsStorage<(4<<10)> ncclDevKernelArgs4K; +//typedef ncclDevKernelArgsStorage<(32<<10)-4> ncclDevKernelArgs31K; template __host__ __device__ constexpr T min_constexpr(T a) { return a; } @@ -366,6 +448,10 @@ __host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) { return max_constexpr((a > b ? a : b), c...); } +constexpr int ncclDevMaxChannelsForArgsBytes(size_t argsBytes) { + return min_constexpr(MAXCHANNELS, (argsBytes - sizeof(struct ncclDevKernelArgs))/sizeof(struct ncclDevWorkBatch)); +} + // Calculate the unroll factor given: // * bytePerPack: number of bytes accessed per instruction // * insns: max permissible unroll value @@ -412,6 +498,7 @@ extern int const ncclDevKernelCount; extern void* const ncclDevKernelList[/*ncclDevKernelCount*/]; // Table of most specialized kernel function to run given func index. 
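// Illustrative sketch, not part of this patch: how a channel can walk one
// ncclDevWorkBatch defined above. Each set bit i in offsetBitset names a work
// struct at byte offset offsetBase + i*sizeof(work struct) in the fifo/args
// space. The ffs-style loop and helper name are generic illustrations (using a
// GCC/Clang builtin), not the kernel's actual code.
#include <cstdint>
static void visitBatchWorks(uint64_t offsetBitset, uint32_t offsetBase,
                            uint32_t workSize /* ncclDevWorkSize(type) */,
                            void (*visit)(uint32_t fifoOffset, void* ctx), void* ctx) {
  uint64_t bits = offsetBitset;
  while (bits != 0) {
    int i = __builtin_ctzll(bits);                    // index of lowest set bit
    visit(offsetBase + (uint32_t)i*workSize, ctx);
    bits &= bits - 1;                                 // clear that bit
  }
}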
+extern int const ncclDevFuncIdCount; extern int const ncclDevFuncRowToId[]; extern void* const ncclDevKernelForFunc[/*funcIndex*/]; extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/]; diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 8ab59607d..1bb5a604f 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -24,5 +24,6 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan); ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); ncclResult_t ncclLaunchFinish(struct ncclComm* comm); +ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo); #endif // End include guard diff --git a/src/include/gdrwrap.h b/src/include/gdrwrap.h index a64674cc5..705f866ea 100644 --- a/src/include/gdrwrap.h +++ b/src/include/gdrwrap.h @@ -8,6 +8,7 @@ #define NCCL_GDRWRAP_H_ #include "nccl.h" +#include "alloc.h" #include // for standard [u]intX_t types #include #include @@ -194,7 +195,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** char *devMem; void *gdrMap; - mapSize = sizeof(T)*nelem; + mapSize = ncclSizeOfT()*nelem; // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE ALIGN_SIZE(mapSize, GPU_PAGE_SIZE); @@ -203,7 +204,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK; size_t align = alignedAddr - (uint64_t)devMem; - //TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize); + //TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zu size %zu", alignedAddr, devMem, align, mapSize); NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh)); NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize)); @@ -226,7 +227,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** *ptr = (T *)((char *)gdrMap+off); if (devPtr) *devPtr = (T *)(devMem+off+align); - TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p", + TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zu at %p", md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr); return ncclSuccess; @@ -235,7 +236,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** template static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) { gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; - NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T))); + NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*ncclSizeOfT())); return ncclSuccess; } diff --git a/src/include/graph.h b/src/include/graph.h index 08cfba4fd..0271b52d1 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -29,6 +29,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); int ncclTopoPathAllNVLink(struct ncclTopoSystem* system); +ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); // Query topology ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, 
int64_t* id, int* dev, int* proxyRank); @@ -46,9 +47,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #define NCCL_TOPO_CPU_ARCH_X86 1 #define NCCL_TOPO_CPU_ARCH_POWER 2 #define NCCL_TOPO_CPU_ARCH_ARM 3 +#define NCCL_TOPO_CPU_ARCH_MIXED 4 #define NCCL_TOPO_CPU_VENDOR_INTEL 1 #define NCCL_TOPO_CPU_VENDOR_AMD 2 #define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 +#define NCCL_TOPO_CPU_VENDOR_MIXED 4 #define NCCL_TOPO_CPU_TYPE_BDW 1 #define NCCL_TOPO_CPU_TYPE_SKL 2 #define NCCL_TOPO_CPU_TYPE_YONGFENG 1 @@ -70,6 +73,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system); #define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU #define NCCL_TOPO_PATTERN_RING 4 // Ring #define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree +#define NCCL_TOPO_PATTERN_COLLNET_DIRECT 6 // Collnet Direct struct ncclTopoGraph { // Input / output int id; // ring : 0, tree : 1, collnet : 2 @@ -113,7 +117,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent); ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs); -#include "info.h" -ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup = NULL); +ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup=nullptr); #endif diff --git a/src/include/group.h b/src/include/group.h index 72251147f..91bc19068 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -35,9 +35,12 @@ struct ncclAsyncJob { void(*undo)(struct ncclAsyncJob*); void(*destructor)(void*); ncclGroupJobState_t state; - volatile uint32_t *abortFlag; /* point to comm abortFlag */ - volatile uint32_t *childAbortFlag; /* point to child abortFlag */ + uint32_t* abortFlag; /* point to comm abortFlag */ + uint32_t* abortFlagDev; /* point to comm abortFlagDev */ + uint32_t* childAbortFlag; /* point to child abortFlag */ + uint32_t* childAbortFlagDev; /* point to child abortFlagDev */ ncclComm_t comm; + int destroyFlag; }; ncclResult_t ncclAsyncLaunch( @@ -52,14 +55,14 @@ struct ncclGroupJob { struct ncclComm **groupCommHeadPtr; struct ncclComm **groupCommPreconnectHeadPtr; ncclResult_t *groupErrorPtr; - volatile bool *abortFlagPtr; + bool *abortFlagPtr; int *groupBlockingPtr; struct ncclIntruQueue *asyncJobsPtr; bool initialized; }; ncclResult_t ncclGroupStartInternal(); -ncclResult_t ncclGroupEndInternal(); +ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo = NULL); ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job); //////////////////////////////////////////////////////////////////////////////// @@ -114,6 +117,10 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) { // Comms gets a new memory stack scope upon joining. Each task batched for // this comm is allocated there. 
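// Illustrative sketch, not part of this patch: one plausible way the paired
// abortFlag/abortFlagDev pointers introduced above can be kept in sync, by
// mirroring the host value to the device copy with an explicit async copy.
// Assumes abortFlag points at pinned host memory; the helper name is made up
// and the actual NCCL abort paths are more involved than shown here.
#include <cuda_runtime.h>
#include <cstdint>
static cudaError_t mirrorAbortFlag(uint32_t* abortFlag,      /* host-side flag   */
                                   uint32_t* abortFlagDev,   /* device-side copy */
                                   cudaStream_t stream) {
  *abortFlag = 1;
  return cudaMemcpyAsync(abortFlagDev, abortFlag, sizeof(uint32_t),
                         cudaMemcpyHostToDevice, stream);
}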
ncclMemoryStackPush(&comm->memScoped); + // Initialize planner + ncclKernelPlanner::Peer* tmp = comm->planner.peers; + memset(&comm->planner, 0, sizeof(comm->planner)); + comm->planner.peers = tmp; } ncclGroupBlocking = comm->config.blocking; diff --git a/src/include/info.h b/src/include/info.h index 0d53b9a21..3cabae866 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -8,28 +8,9 @@ #define NCCL_INFO_H_ #include "nccl.h" -#include "device.h" #include "collectives.h" #include "core.h" #include "utils.h" -#include "strongstream.h" -#define NCCL_MAX_LOCAL_RANKS 64 - -typedef enum : uint8_t { - ncclPatternRing, - ncclPatternRingTwice, - ncclPatternPipelineFrom, - ncclPatternPipelineTo, - ncclPatternTreeUp, - ncclPatternTreeDown, - ncclPatternTreeUpDown, - ncclPatternCollnetChain, - ncclPatternCollnetDirect, - ncclPatternNvls, - ncclPatternNvlsTree, - ncclPatternSend, - ncclPatternRecv -} ncclPattern_t; // Used to pass NCCL call information between functions struct ncclInfo { @@ -47,110 +28,6 @@ struct ncclInfo { // Algorithm details int chunkSteps; int sliceSteps; - // Computed later - ncclDevRedOpFull opFull; - ncclPattern_t pattern; - size_t nBytes; - size_t aggnBytes; - size_t workBytes; - size_t sendbuffSize; - size_t recvbuffSize; - int stepSize; - int chunkCount; - int chunkSize; - int channelId; - int workFuncIndex; - ncclRegBufferType regBufType; - void* regBufSend[NCCL_MAX_LOCAL_RANKS]; - void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; - // collnet buffer reg handles - void* sendMhandle; - void* recvMhandle; - // Need to initialize - int nThreads; - int nChannels; - int algorithm; - int protocol; - bool userTuned; - struct ncclInfo *next; -}; - -inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) { - info->nBytes = info->workBytes = info->count * ncclTypeSize(info->datatype); - if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) { - info->count = info->workBytes; - info->datatype = ncclInt8; - } - if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank - - /* compute buffer size for NVLS buffer registration */ - if (info->coll == ncclFuncAllGather) { - info->sendbuffSize = info->workBytes; - info->recvbuffSize = info->sendbuffSize * nRanks; - } else if (info->coll == ncclFuncReduceScatter) { - info->recvbuffSize = info->workBytes; - info->sendbuffSize = info->recvbuffSize * nRanks; - } else { - info->sendbuffSize = info->recvbuffSize = info->workBytes; - } - return ncclSuccess; -} - -struct ncclTaskColl { - struct ncclTaskColl* next; - ncclFunc_t func; - void const* sendbuff; - void* recvbuff; - size_t count; - int root; - ncclDataType_t datatype; - ncclDevRedOpFull op; - int chunkSteps, sliceSteps; - struct ncclInfo info; -}; -struct ncclTaskP2p { - ncclTaskP2p *next; - void *buff; - size_t bytes; - // Stateful chunk index. If a p2p gets "cut" over two plans this keeps track - // of where it left off. 
- int chunk; -}; - -struct ncclCudaStreamList { - struct ncclCudaStreamList *next; - cudaStream_t stream; -}; -struct ncclTasks { - struct Peer { - bool sendSeen, recvSeen; - struct ncclIntruQueue sendQueue; - struct ncclIntruQueue recvQueue; - }; - struct ncclIntruQueue collQueue; - // Queue for user-tuned executed collectives - struct ncclIntruQueue collTunedQueue; - // Queue for continuous bytes distribution (CBD) collectives - struct ncclIntruQueue collCBDQueue; - // Queue for collnet - struct ncclIntruQueue collnetQueue; - size_t workBytesTotal; - int usableChannels; - bool sorted; - struct Peer* peers/*[nRanks]*/; - int *p2pSendOrder, *p2pRecvOrder; - int p2pOrderSteps; - int nTasksColl, nTasksP2p; - - // The list of user streams aggregated over all tasks present. - struct ncclCudaStreamList* streams; - // The most recent user stream. Ignored if streams==nullptr - cudaStream_t streamRecent; - // The graph capturing all user streams or invalid if none. Thus we restrict the - // user that all streams must be captured in the same graph or not captured - // at all. Technically we could probably relax this, but that would mean - // collecting a different `ncclTasks` per graph and one for non-graph. - struct ncclCudaGraph capturingGraph; }; #endif diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h index 5796eb9fb..a0fb3a55f 100644 --- a/src/include/nccl_common.h +++ b/src/include/nccl_common.h @@ -7,8 +7,33 @@ #ifndef NCCL_DEBUG_H_ #define NCCL_DEBUG_H_ -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; +typedef enum { + NCCL_LOG_NONE = 0, + NCCL_LOG_VERSION = 1, + NCCL_LOG_WARN = 2, + NCCL_LOG_INFO = 3, + NCCL_LOG_ABORT = 4, + NCCL_LOG_TRACE = 5 +} ncclDebugLogLevel; + +typedef enum { + NCCL_INIT = 0x1, + NCCL_COLL = 0x2, + NCCL_P2P = 0x4, + NCCL_SHM = 0x8, + NCCL_NET = 0x10, + NCCL_GRAPH = 0x20, + NCCL_TUNING = 0x40, + NCCL_ENV = 0x80, + NCCL_ALLOC = 0x100, + NCCL_CALL = 0x200, + NCCL_PROXY = 0x400, + NCCL_NVLS = 0x800, + NCCL_BOOTSTRAP = 0x1000, + NCCL_REG = 0x2000, + NCCL_PROFILE = 0x4000, + NCCL_ALL = ~0 +} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); @@ -40,4 +65,5 @@ typedef enum { #define NCCL_PROTO_LL128 1 #define NCCL_PROTO_SIMPLE 2 +#define NCCL_ALGO_PROTO_IGNORE -1.0 #endif diff --git a/src/include/nccl_tuner.h b/src/include/nccl_tuner.h index 1917e2815..5cd02149f 100644 --- a/src/include/nccl_tuner.h +++ b/src/include/nccl_tuner.h @@ -11,6 +11,54 @@ #include "nccl.h" #include "nccl_common.h" +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; + + // Initializes tuner states. + // Inputs: + // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. + // - nNodes: number of nodes in current communicator. + // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. + // Outputs: + // - context: tuner context object + ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); + + // Gets info (algo, protocol, number of ctas and threads) for a given collective. 
+ // Inputs: + // - context: tuner context object + // - collType: collective type , e.g., allreduce, allgather… + // - nBytes: collective size in bytes + // - numPipeOps: number of operations in the group + // - numAlgo: number of algorithms in collCostTable + // - numProto: number of protocols in collCostTable + // + // Outputs: + // - nChannels: number of channels (hence SMs) to be used. + // + // InOut: + // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. + // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). + // + // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the + // default tuning for the given collective. + // Also, the plugin is allowed to not set any output, or set only the + // algorithm and protocol, but not only the algorithm or only the protocol. + // Unset fields will be set automatically by NCCL. + ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int* nChannels); + + // Terminates the plugin and cleans up any resources that the plugin allocated. + // context: tuner context object + ncclResult_t (*destroy)(void* context); +} ncclTuner_v3_t; + +typedef ncclTuner_v3_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3" + // API to be implemented by external tuner typedef struct { // Name of the tuner @@ -36,7 +84,7 @@ typedef struct { // // Outputs: // - algorithm: selected algorithm to be used for the given collective - // - protocol: selected protocol to be used for the given collective + // - protocol: selected protocol to be used for the give collective // - nChannels: number of channels (hence SMs) to be used. // // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the @@ -46,15 +94,11 @@ typedef struct { // Unset fields will be set automatically by NCCL. ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, int collNetSupport, int nvlsSupport, int numPipeOps, - int *algorithm, int *protocol, int* nChannels); + int* algorithm, int* protocol, int* nChannels); // Terminates the plugin and cleans up any resources that the plugin allocated. // context: tuner context object ncclResult_t (*destroy)(void* context); } ncclTuner_v2_t; -typedef ncclTuner_v2_t ncclTuner_t; - -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2" - #endif diff --git a/src/include/net.h b/src/include/net.h index b5df58968..d1926ccd8 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -14,8 +14,10 @@ typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; -ncclResult_t ncclNetPluginInit(); +ncclResult_t ncclNetPluginLoad(struct ncclComm* comm); +ncclResult_t ncclNetPluginUnload(struct ncclComm* comm); ncclResult_t ncclNetInit(struct ncclComm* comm); +ncclResult_t ncclNetFinalize(struct ncclComm* comm); int ncclNetVersion(struct ncclComm* comm); // Test whether the current GPU support GPU Direct RDMA. 
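// Minimal sketch, not part of this patch: a getCollInfo callback for the
// ncclTuner_v3_t interface above. It assumes the cost table behind collCostTable is
// a flat, row-major [numAlgo][numProto] array of float costs; entries equal to
// NCCL_ALGO_PROTO_IGNORE must be left untouched. The function name and the
// "favor the cheapest entry" policy are purely illustrative.
static ncclResult_t exampleGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
                                       int numPipeOps, float** collCostTable,
                                       int numAlgo, int numProto, int* nChannels) {
  float* table = (float*)collCostTable;
  int bestAlgo = -1, bestProto = -1; float bestTime = 0.0f;
  for (int a = 0; a < numAlgo; a++) {
    for (int p = 0; p < numProto; p++) {
      float t = table[a*numProto + p];
      if (t == NCCL_ALGO_PROTO_IGNORE) continue;       // disabled combination: skip
      if (bestAlgo < 0 || t < bestTime) { bestAlgo = a; bestProto = p; bestTime = t; }
    }
  }
  if (bestAlgo >= 0) table[bestAlgo*numProto + bestProto] = 0.0f;  // favor this combo
  // Leave *nChannels alone so NCCL picks the channel count automatically.
  (void)context; (void)collType; (void)nBytes; (void)numPipeOps; (void)nChannels;
  return ncclSuccess;
}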
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h index bad0b7937..7dee7d4ae 100644 --- a/src/include/nvmlwrap.h +++ b/src/include/nvmlwrap.h @@ -253,6 +253,38 @@ typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t; */ #define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2) +/** + * Confidential Compute Feature Status values + */ +#define NVML_CC_SYSTEM_FEATURE_DISABLED 0 +#define NVML_CC_SYSTEM_FEATURE_ENABLED 1 + +typedef struct nvmlConfComputeSystemState_st { + unsigned int environment; + unsigned int ccFeature; + unsigned int devToolsMode; +} nvmlConfComputeSystemState_t; + +/** + * Confidential Compute Multigpu mode values + */ +#define NVML_CC_SYSTEM_MULTIGPU_NONE 0 +#define NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE 1 + +/** + * Confidential Compute System settings + */ +typedef struct { + unsigned int version; + unsigned int environment; + unsigned int ccFeature; + unsigned int devToolsMode; + unsigned int multiGpuMode; +} nvmlSystemConfComputeSettings_v1_t; + +typedef nvmlSystemConfComputeSettings_v1_t nvmlSystemConfComputeSettings_t; +#define nvmlSystemConfComputeSettings_v1 NVML_STRUCT_VERSION(SystemConfComputeSettings, 1) + /* End of nvml.h */ #endif // NCCL_NVML_DIRECT @@ -268,6 +300,11 @@ extern int ncclNvmlDeviceCount; extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices]; extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices]; +struct ncclNvmlCCStatus { + bool CCEnabled; + bool multiGpuCCEnabled; +}; + // All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly. // Outsiders need only call it if they want to inspect the ncclNvml global // tables above. @@ -283,5 +320,6 @@ ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* ma ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus); ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo); +ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status); #endif // End include guard diff --git a/src/include/nvtx.h b/src/include/nvtx.h index ab32ef27f..3bdfec59d 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -63,7 +63,7 @@ class payload_schema { nullptr, NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, - nullptr, 0, 0, 0}; + nullptr, 0, 0, 0, 0, nullptr}; }; // Create NVTX push/pop range with parameters diff --git a/src/include/nvtx3/nvToolsExt.h b/src/include/nvtx3/nvToolsExt.h index 10938385d..1ba00bef7 100644 --- a/src/include/nvtx3/nvToolsExt.h +++ b/src/include/nvtx3/nvToolsExt.h @@ -25,9 +25,9 @@ * * \section INITIALIZATION_SECTION Initialization * - * Typically the tool's library that plugs into NVTX is indirectly - * loaded via enviromental properties that are platform specific. - * For some platform or special cases, the user may be required + * Typically the tool's library that plugs into NVTX is indirectly + * loaded via enviromental properties that are platform specific. + * For some platform or special cases, the user may be required * to instead explicity initialize instead though. This can also * be helpful to control when the API loads a tool's library instead * of what would typically be the first function call to emit info. 
@@ -37,16 +37,16 @@ * * Markers and ranges are used to describe events at a specific time (markers) * or over a time span (ranges) during the execution of the application - * respectively. + * respectively. * * \subsection MARKERS Markers - * + * * Markers denote specific moments in time. - * - * + * + * * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on * how to specify the domain. - * + * * \subsection THREAD_RANGES Thread Ranges * * Thread ranges denote nested time ranges. Nesting is maintained per thread @@ -59,9 +59,9 @@ * * \subsection PROCESS_RANGES Process Ranges * - * Process ranges denote a time span that can expose arbitrary concurrency, as + * Process ranges denote a time span that can expose arbitrary concurrency, as * opposed to thread ranges that only support nesting. In addition the range - * start event can happen on a different thread than the end marker. For the + * start event can happen on a different thread than the end marker. For the * correlation of a start/end pair an unique correlation ID is used that is * returned from the start API call and needs to be passed into the end API * call. @@ -87,15 +87,15 @@ * * The function ::nvtxDomainCreateA or ::nvtxDomainCreateW is used to create * a named domain. - * + * * Each domain maintains its own * - categories * - thread range stacks * - registered strings * - * The function ::nvtxDomainDestroy marks the end of the domain. Destroying - * a domain unregisters and destroys all objects associated with it such as - * registered strings, resource objects, named categories, and started ranges. + * The function ::nvtxDomainDestroy marks the end of the domain. Destroying + * a domain unregisters and destroys all objects associated with it such as + * registered strings, resource objects, named categories, and started ranges. * * \section RESOURCE_NAMING Resource Naming * @@ -105,41 +105,41 @@ * The functions can be called multiple times during the execution of an * application, however, in that case it is implementation dependent which * name will be reported by the tool. - * + * * \subsection CATEGORY_NAMING Category Naming * - * Some function in this library support associating an integer category - * to enable filtering and sorting. The category naming functions allow - * the application to associate a user friendly name with the integer - * category. Support for domains have been added in NVTX_VERSION_2 to - * avoid collisions when domains are developed independantly. + * Some function in this library support associating an integer category + * to enable filtering and sorting. The category naming functions allow + * the application to associate a user friendly name with the integer + * category. Support for domains have been added in NVTX_VERSION_2 to + * avoid collisions when domains are developed independantly. * * \subsection RESOURCE_OBJECTS Resource Objects * - * Resource objects are a generic mechanism for attaching data to an application - * resource. The identifier field makes the association to a pointer or handle, - * while the type field helps provide deeper understanding of the identifier as + * Resource objects are a generic mechanism for attaching data to an application + * resource. The identifier field makes the association to a pointer or handle, + * while the type field helps provide deeper understanding of the identifier as * well as enabling differentiation in cases where handles generated by different * APIs may collide. 
The resource object may also have an associated message to - * associate with the application resource, enabling further annotation of this + * associate with the application resource, enabling further annotation of this * object and how it is used. - * + * * The resource object was introduced in NVTX_VERSION_2 to supersede existing naming * functions and allow the application resource identified by those functions to be * associated to a domain. The other naming functions are still supported for backward * compatibility but will be associated only to the default domain. * * \subsection RESOURCE_NAMING_OS Resource Naming - * - * Some operating system resources creation APIs do not support providing a user friendly - * name, such as some OS thread creation APIs. This API support resource naming though - * both through resource objects and functions following the pattern - * nvtxName[RESOURCE_TYPE][A|W](identifier, name). Resource objects introduced in NVTX_VERSION 2 + * + * Some operating system resources creation APIs do not support providing a user friendly + * name, such as some OS thread creation APIs. This API support resource naming though + * both through resource objects and functions following the pattern + * nvtxName[RESOURCE_TYPE][A|W](identifier, name). Resource objects introduced in NVTX_VERSION 2 * supersede the other functions with a a more general method of assigning names to OS resources, - * along with associating them to domains too. The older nvtxName* functions are only associated + * along with associating them to domains too. The older nvtxName* functions are only associated * with the default domain. * \section EXTENSIONS Optional Extensions - * Optional extensions will either appear within the existing sections the extend or appear + * Optional extensions will either appear within the existing sections the extend or appear * in the "Related Pages" when they introduce new concepts. */ @@ -159,7 +159,11 @@ #define NVTX_INLINE_STATIC __inline static #else /*defined(__GNUC__)*/ #define NVTX_API +#if defined(__cplusplus) || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) #define NVTX_INLINE_STATIC inline static +#else +#define NVTX_INLINE_STATIC __inline__ static +#endif #endif /* Platform */ #if defined(NVTX_NO_IMPL) @@ -212,7 +216,7 @@ extern "C" { #endif /* __cplusplus */ -/** +/** * Result Codes */ @@ -281,12 +285,12 @@ typedef enum nvtxColorType_t * ------------------------------------------------------------------------- */ typedef enum nvtxMessageType_t { - NVTX_MESSAGE_UNKNOWN = 0, /**< Message payload is unused. */ + NVTX_MESSAGE_UNKNOWN = 0, /**< Message attribute is unused. */ NVTX_MESSAGE_TYPE_ASCII = 1, /**< A character sequence is used as payload. */ NVTX_MESSAGE_TYPE_UNICODE = 2, /**< A wide character sequence is used as payload. */ /* NVTX_VERSION_2 */ NVTX_MESSAGE_TYPE_REGISTERED = 3, /**< A unique string handle that was registered - with \ref nvtxDomainRegisterStringA() or + with \ref nvtxDomainRegisterStringA() or \ref nvtxDomainRegisterStringW(). */ } nvtxMessageType_t; @@ -338,7 +342,7 @@ NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved); * ------------------------------------------------------------------------- */ typedef enum nvtxPayloadType_t { - NVTX_PAYLOAD_UNKNOWN = 0, /**< Color payload is unused. */ + NVTX_PAYLOAD_UNKNOWN = 0, /**< Payload attribute is unused. */ NVTX_PAYLOAD_TYPE_UNSIGNED_INT64 = 1, /**< A 64 bit unsigned integer value is used as payload. 
*/ NVTX_PAYLOAD_TYPE_INT64 = 2, /**< A 64 bit signed integer value is used as payload. */ NVTX_PAYLOAD_TYPE_DOUBLE = 3, /**< A 64 bit floating point value is used as payload. */ @@ -714,10 +718,10 @@ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message); /* ------------------------------------------------------------------------- */ /** \brief Ends a process range. * -* \param domain - The domain +* \param domain - The domain * \param id - The correlation ID returned from a nvtxRangeStart call. * -* \remarks This function is offered completeness but is an alias for ::nvtxRangeEnd. +* \remarks This function is offered completeness but is an alias for ::nvtxRangeEnd. * It does not need a domain param since that is associated iwth the range ID at ::nvtxDomainRangeStartEx * * \par Example: @@ -929,10 +933,10 @@ NVTX_DECLSPEC int NVTX_API nvtxRangePop(void); /* ------------------------------------------------------------------------- */ /** \cond SHOW_HIDDEN -* \brief Resource typing helpers. +* \brief Resource typing helpers. * -* Classes are used to make it easy to create a series of resource types -* per API without collisions +* Classes are used to make it easy to create a series of resource types +* per API without collisions */ #define NVTX_RESOURCE_MAKE_TYPE(CLASS, INDEX) ((((uint32_t)(NVTX_RESOURCE_CLASS_ ## CLASS))<<16)|((uint32_t)(INDEX))) #define NVTX_RESOURCE_CLASS_GENERIC 1 @@ -1062,7 +1066,7 @@ typedef struct nvtxResourceAttributes_v0 int32_t identifierType; /* values from enums following the pattern nvtxResource[name]Type_t */ /** - * \brief Identifier for the resource. + * \brief Identifier for the resource. * \anchor RESOURCE_IDENTIFIER_FIELD * * An identifier may be a pointer or a handle to an OS or middleware API object. @@ -1093,7 +1097,7 @@ typedef struct nvtxResourceAttributes_v0 typedef struct nvtxResourceAttributes_v0 nvtxResourceAttributes_t; -/* \cond SHOW_HIDDEN +/* \cond SHOW_HIDDEN * \version \NVTX_VERSION_2 */ #define NVTX_RESOURCE_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxResourceAttributes_v0) ) ) @@ -1106,7 +1110,7 @@ typedef struct nvtxResourceHandle* nvtxResourceHandle_t; /** \brief Create a resource object to track and associate data with OS and middleware objects * * Allows users to associate an API handle or pointer with a user-provided name. -* +* * * \param domain - Domain to own the resource object * \param attribs - Attributes to be associated with the resource @@ -1240,7 +1244,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* * POSIX pthread_t type returned by pthread_self() may not comply with these * expectations. Please use OS-specific thread ID instead of pthread_t. * - * The thread name is associated to the default domain. To support domains + * The thread name is associated to the default domain. To support domains * use resource objects via ::nvtxDomainResourceCreate. * * \param threadId - The ID of the thread to name. 
@@ -1457,7 +1461,7 @@ NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain); } /* extern "C" */ #endif /* __cplusplus */ -#define NVTX_IMPL_GUARD /* Ensure other headers cannot included directly */ +#define NVTX_IMPL_GUARD /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxTypes.h" diff --git a/src/include/nvtx3/nvToolsExtCounters.h b/src/include/nvtx3/nvToolsExtCounters.h new file mode 100644 index 000000000..00e2b7f8f --- /dev/null +++ b/src/include/nvtx3/nvToolsExtCounters.h @@ -0,0 +1,335 @@ +/** + * The NVTX counters extension is intended to collect counter values of various + * sources. It uses the NVTX payload extension to specify the data layout a + * counter group. + * + * A counter group is a set of counters that are collected together (at the same + * time). Counters are always registered as a group. Hence, a single counter is + * represented by a group with one counter. + * + * A sample refers to all values for a given timestamp. These values must + * include counter values and may include multiple instances of a counter group. + * + * The NVTX domain handle is the first argument to all counter collect + * functions. 0/NULL/nullptr represents the default domain (no domain). + */ + +#include "nvToolsExtPayload.h" + +#ifndef NVTOOLSEXT_COUNTERS_H +#define NVTOOLSEXT_COUNTERS_H + +/** + * \brief The compatibility ID is used for versioning of this extension. + */ +#ifndef NVTX_EXT_COUNTERS_COMPATID +#define NVTX_EXT_COUNTERS_COMPATID 0x0101 +#endif + +/** + * \brief The module ID identifies the payload extension. It has to be unique + * among the extension modules. + */ +#ifndef NVTX_EXT_COUNTERS_MODULEID +#define NVTX_EXT_COUNTERS_MODULEID 4 +#endif + + +/** Identifies an invalid scope and indicates an error if returned by `nvtxScopeRegister`. */ +#define NVTX_SCOPE_NONE 0 /* no scope */ + +#define NVTX_SCOPE_ROOT 1 + +#define NVTX_SCOPE_CURRENT_HW_MACHINE 2 /* Node/machine name, Device? */ +#define NVTX_SCOPE_CURRENT_HW_SOCKET 3 +#define NVTX_SCOPE_CURRENT_HW_CPU 4 +#define NVTX_SCOPE_CURRENT_HW_CPU_LOGICAL 5 +/* Innermost HW execution context at registration time */ +#define NVTX_SCOPE_CURRENT_HW_INNERMOST 6 + +/* Virtualized hardware, virtual machines, OS (if you don't know any better) */ +#define NVTX_SCOPE_CURRENT_HYPERVISOR 7 +#define NVTX_SCOPE_CURRENT_VM 8 +#define NVTX_SCOPE_CURRENT_KERNEL 9 +#define NVTX_SCOPE_CURRENT_CONTAINER 10 +#define NVTX_SCOPE_CURRENT_OS 11 + +/* Software scopes */ +#define NVTX_SCOPE_CURRENT_SW_PROCESS 12 /* Process scope */ +#define NVTX_SCOPE_CURRENT_SW_THREAD 13 /* Thread scope */ +#define NVTX_SCOPE_CURRENT_SW_FIBER 14 +/* Innermost SW execution context at registration time */ +#define NVTX_SCOPE_CURRENT_SW_INNERMOST 15 + +/** Static (user-provided) scope IDs (feed forward) */ +#define NVTX_SCOPE_ID_STATIC_START (1 << 24) + +/** Dynamically (tool) generated scope IDs */ +#define NVTX_SCOPE_ID_DYNAMIC_START 4294967296 /* 1 << 32 */ + + +/** Identifier of the semantic extension for counters. */ +#define NVTX_SEMANTIC_ID_COUNTERS_V1 5 + +/*** Flags to augment the counter value. ***/ +#define NVTX_COUNTERS_FLAG_NONE 0 + +/** + * Convert the fixed point value to a normalized floating point. + * Use the sign/unsign from the underlying type this flag is applied to. + * Unsigned [0f : 1f] or signed [-1f : 1f] + */ +#define NVTX_COUNTERS_FLAG_NORM (1 << 1) + +/** + * Tools should apply scale and limits when graphing, ideally in a "soft" way to + * to see when limits are exceeded. 
+ */ +#define NVTX_COUNTERS_FLAG_LIMIT_MIN (1 << 2) +#define NVTX_COUNTERS_FLAG_LIMIT_MAX (1 << 3) +#define NVTX_COUNTERS_FLAG_LIMITS \ + (NVTX_COUNTERS_FLAG_LIMIT_MIN | NVTX_COUNTERS_FLAG_LIMIT_MAX) + +/** Counter time scope **/ +#define NVTX_COUNTERS_FLAG_TIME_POINT (1 << 5) +#define NVTX_COUNTERS_FLAG_TIME_SINCE_LAST (2 << 5) +#define NVTX_COUNTERS_FLAG_TIME_UNTIL_NEXT (3 << 5) +#define NVTX_COUNTERS_FLAG_TIME_SINCE_START (4 << 5) + +/** Counter value type **/ +#define NVTX_COUNTERS_FLAG_VALUE_ABSOLUTE (1 << 10) +#define NVTX_COUNTERS_FLAG_VALUE_DELTA (2 << 10) // delta to previous counter sample + +/** Counter visualization hints **/ +#define NVTX_COUNTERS_FLAG_INTERPOLATE (1 << 14) + +/** Datatypes for limits union (value of `limitType`). */ +#define NVTX_COUNTERS_LIMIT_I64 0 +#define NVTX_COUNTERS_LIMIT_U64 1 +#define NVTX_COUNTERS_LIMIT_F64 2 + +/** Reasons for the missing sample value. */ +#define NVTX_COUNTERS_SAMPLE_ZERO 0 +#define NVTX_COUNTERS_SAMPLE_UNCHANGED 1 +#define NVTX_COUNTERS_SAMPLE_UNAVAILABLE 2 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** + * \brief Specify additional properties of a counter or counter group. + */ +typedef struct nvtxSemanticsCounter_v1 +{ + /** Header of the semantic extension (with identifier, version, etc.). */ + struct nvtxSemanticsHeader_v1 header; + + /** + * Flag if normalization, scale limits, etc. should be applied to counter + * values. + */ + uint64_t flags; + + /** Unit of the counter value (case insensitive) */ + const char* unit; + + /** Should be 1 if not used. */ + uint64_t unitScaleNumerator; + + /** Should be 1 if not used. */ + uint64_t unitScaleDenominator; + + /** Determines the used union member. Use defines `NVTX_COUNTERS_LIMIT_*`. */ + int64_t limitType; + + /** Soft graph limit. */ + union limits_t { + int64_t i64[2]; + uint64_t u64[2]; + double d[2]; + } limits; +} nvtxSemanticsCounter_t; + +typedef struct nvtxCountersAttr_v1 +{ + size_t structSize; + + /** + * A schema ID referring to the data layout of the counter group or a + * predefined NVTX payloads number type. + */ + uint64_t schemaId; + + /** Name of the counter group. */ + const char* name; + + /** Identifier of the scope of the counters. */ + uint64_t scopeId; + + /** + * (Optional) Specify additional semantics for a counter (group). The + * semantics provided are applied to the all counters in a group. If the + * semantics should only refer to a single counter in a group, the semantics + * field of the payload entry has to be used. Accepted semantics are + * `nvtxSemanticsCounter_t` and `nvtxSemanticsTime_t`. + */ + const nvtxSemanticsHeader_t* semantics; +} nvtxCountersAttr_t; + +/* Forward declaration of opaque counter group registration structure */ +struct nvtxCountersRegistration_st; +typedef struct nvtxCountersRegistration_st nvtxCountersRegistration; + +/* \brief Counters Handle Structure. +* \anchor COUNTERS_HANDLE_STRUCTURE +* +* This structure is opaque to the user and is used as a handle to reference a counter group. +* This type is returned from tools when using the NVTX API to create a counters group. +*/ +typedef nvtxCountersRegistration* nvtxCountersHandle_t; + +typedef struct nvtxCountersBatch_v1 +{ + /** Handle to attributes (data layout, scope, etc.) of a counter (group). */ + nvtxCountersHandle_t hCounter; + + /** Array of counter samples. */ + const void* counters; + + /** Size of the `counters` array (in bytes). */ + size_t cntArrSize; + + /** Array of timestamps or reference-time plus delta pair. 
`NULL` is used, if + timestamps are part of the counter (group) layout.) */ + const void* timestamps; + + /** Size of the `timestamps` array or definition (in bytes). */ + size_t tsSize; +} nvtxCountersBatch_t; + +/** + * \brief Register a counter group. + * + * @param hDomain NVTX domain handle. + * @param attr Pointer to the attributes of the counter (group). + * + * @return Counter handle identifying a counter or counter (group). + * The counter handle is unique within the NVTX domain. + */ +NVTX_DECLSPEC nvtxCountersHandle_t NVTX_API nvtxCountersRegister( + nvtxDomainHandle_t hDomain, + const nvtxCountersAttr_t* attr); + +/** + * \brief Sample one integer counter by value immediately (the NVTX tool determines the timestamp). + * + * @param hDomain handle of the NVTX domain. + * @param hCounter handle of the NVTX counter (group). + * @param value 64-bit integer counter value. + */ +NVTX_DECLSPEC void NVTX_API nvtxCountersSampleInt64( + nvtxDomainHandle_t hDomain, + nvtxCountersHandle_t hCounter, + int64_t value); + +/** + * \brief Sample one floating point counter by value immediately (the NVTX tool determines the timestamp). + * + * @param hDomain handle of the NVTX domain. + * @param hCounter handle of the NVTX counter (group). + * @param value 64-bit floating-point counter value. + */ +NVTX_DECLSPEC void NVTX_API nvtxCountersSampleFloat64( + nvtxDomainHandle_t hDomain, + nvtxCountersHandle_t hCounter, + double value); + +/** + * \brief Sample a counter group by reference immediately (the NVTX tool determines the timestamp). + * + * @param hDomain handle of the NVTX domain. + * @param hCounter handle of the NVTX counter (group). + * @param counters pointer to one or more counter values. + * @param size size of the counter value(s) in bytes. + */ +NVTX_DECLSPEC void NVTX_API nvtxCountersSample( + nvtxDomainHandle_t hDomain, + nvtxCountersHandle_t hCounter, + void* values, + size_t size); + +/** + * \brief Sample without value. + * + * @param hDomain handle of the NVTX domain. + * @param hCounter handle of the NVTX counter (group). + * @param reason reason for the missing sample value. + */ +NVTX_DECLSPEC void NVTX_API nvtxCountersSampleNoValue( + nvtxDomainHandle_t hDomain, + nvtxCountersHandle_t hCounter, + uint8_t reason); + +/** + * \brief Submit a batch of counters in the given domain. + * Timestamps are part of the counter sample data. + * + * The size of a data sampling point is defined by the `staticSize` field of the + * payload schema. An NVTX tool can assume that the counter samples are stored + * as an array with each entry being `staticSize` bytes. + * + * @param hDomain handle of the NVTX domain + * @param hCounter handle of the counter group (includes counter data decoding schema) + * @param counters blob containing counter data and timestamps + * @param size size of the counter data blob in bytes + */ +NVTX_DECLSPEC void NVTX_API nvtxCountersSubmitBatch( + nvtxDomainHandle_t hDomain, + nvtxCountersHandle_t hCounter, + const void* counters, + size_t size); + +/** + * \brief Submit a batch of counters in the given domain. + * Timestamps are separated from the counter data. + * + * @param hDomain handle of the NVTX domain + * @param counterBatch Pointer to the counter data to be submitted. 
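// Usage sketch, not part of this patch: registering a one-value counter group and
// sampling it with the immediate-sample entry points declared above. Assumes the
// domain is created with nvtxDomainCreateA and that NVTX_PAYLOAD_ENTRY_TYPE_INT64
// from nvToolsExtPayload.h is a valid "predefined number type" schemaId; the
// function and counter names are made up, and all calls are no-ops without a tool.
static void exampleCounterUsage(void) {
  nvtxDomainHandle_t domain = nvtxDomainCreateA("example.domain");

  nvtxCountersAttr_t attr = {};
  attr.structSize = sizeof(attr);
  attr.schemaId   = NVTX_PAYLOAD_ENTRY_TYPE_INT64;   // one 64-bit integer per sample
  attr.name       = "queued work bytes";
  attr.scopeId    = NVTX_SCOPE_CURRENT_SW_THREAD;    // per-thread counter
  attr.semantics  = NULL;                            // no extra counter semantics

  nvtxCountersHandle_t hCounter = nvtxCountersRegister(domain, &attr);
  nvtxCountersSampleInt64(domain, hCounter, 4096);   // the tool supplies the timestamp
}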
+ */ +NVTX_DECLSPEC void NVTX_API nvtxCountersSubmitBatchEx( + nvtxDomainHandle_t hDomain, + const nvtxCountersBatch_t* counterBatch); + + +#define NVTX3EXT_CBID_nvtxCountersRegister 0 +#define NVTX3EXT_CBID_nvtxCountersSampleInt64 1 +#define NVTX3EXT_CBID_nvtxCountersSampleFloat64 2 +#define NVTX3EXT_CBID_nvtxCountersSample 3 +#define NVTX3EXT_CBID_nvtxCountersSampleNoValue 4 +#define NVTX3EXT_CBID_nvtxCountersSubmitBatch 5 +#define NVTX3EXT_CBID_nvtxCountersSubmitBatchEx 6 + +#ifdef __GNUC__ +#pragma GCC visibility push(internal) +#endif + +#define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot be included directly */ +#include "nvtxDetail/nvtxExtTypes.h" +#undef NVTX_EXT_TYPES_GUARD + +#ifndef NVTX_NO_IMPL +#define NVTX_EXT_IMPL_COUNTERS_GUARD /* Ensure other headers cannot be included directly */ +#include "nvtxDetail/nvtxExtImplCounters_v1.h" +#undef NVTX_EXT_IMPL_COUNTERS_GUARD +#endif /*NVTX_NO_IMPL*/ + +#ifdef __GNUC__ +#pragma GCC visibility pop +#endif + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* NVTOOLSEXT_COUNTERS_H */ \ No newline at end of file diff --git a/src/include/nvtx3/nvToolsExtCuda.h b/src/include/nvtx3/nvToolsExtCuda.h index b1b80ad67..de9aa9d48 100644 --- a/src/include/nvtx3/nvToolsExtCuda.h +++ b/src/include/nvtx3/nvToolsExtCuda.h @@ -30,7 +30,7 @@ extern "C" { */ /* ------------------------------------------------------------------------- */ -/* \cond SHOW_HIDDEN +/* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ @@ -133,7 +133,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name) #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL -#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot included directly */ +#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxImplCuda_v3.h" #undef NVTX_IMPL_GUARD_CUDA #endif /*NVTX_NO_IMPL*/ diff --git a/src/include/nvtx3/nvToolsExtCudaRt.h b/src/include/nvtx3/nvToolsExtCudaRt.h index 1e19958ec..6a85da816 100644 --- a/src/include/nvtx3/nvToolsExtCudaRt.h +++ b/src/include/nvtx3/nvToolsExtCudaRt.h @@ -31,7 +31,7 @@ extern "C" { */ /* ------------------------------------------------------------------------- */ -/* \cond SHOW_HIDDEN +/* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ @@ -109,7 +109,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL -#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */ +#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxImplCudaRt_v3.h" #undef NVTX_IMPL_GUARD_CUDART #endif /*NVTX_NO_IMPL*/ diff --git a/src/include/nvtx3/nvToolsExtMem.h b/src/include/nvtx3/nvToolsExtMem.h new file mode 100644 index 000000000..3b3406e35 --- /dev/null +++ b/src/include/nvtx3/nvToolsExtMem.h @@ -0,0 +1,694 @@ +/* +* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. 
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#include "nvToolsExt.h" + +#ifndef NVTOOLSEXTV3_MEM_V1 +#define NVTOOLSEXTV3_MEM_V1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#define NVTX_EXT_MODULEID_MEM 1 + +/* \cond SHOW_HIDDEN + * \brief A compatibility ID value used in structures and initialization to + * identify version differences. + */ +#define NVTX_EXT_COMPATID_MEM 0x0102 + +/* \cond SHOW_HIDDEN + * \brief This value is returned by functions that return `nvtxMemHeapHandle_t`, + * if a tool is not attached. + */ +#define NVTX_MEM_HEAP_HANDLE_NO_TOOL ((nvtxMemHeapHandle_t)(intptr_t)-1) + +/* \cond SHOW_HIDDEN + * \brief This value is returned by functions that return `nvtxMemRegionHandle_t` + * if a tool is not attached. + */ +#define NVTX_MEM_REGION_HANDLE_NO_TOOL ((nvtxMemRegionHandle_t)(intptr_t)-1) + +/* \cond SHOW_HIDDEN + * \brief This value is returned by functions that return `nvtxMemPermissionsHandle_t` + * if a tool is not attached. + */ +#define NVTX_MEM_PERMISSIONS_HANDLE_NO_TOOL ((nvtxMemPermissionsHandle_t)-1) + + +/* \cond SHOW_HIDDEN + * \brief This should not be used and is considered an error but defined to + * detect an accidental use of zero or NULL. + */ +#define NVTX_MEM_HEAP_USAGE_UNKNOWN 0x0 + + +/* \cond SHOW_HIDDEN + * \brief This should not be used and is considered an error but defined to + * detect an accidental use of zero or NULL. + */ +#define NVTX_MEM_TYPE_UNKNOWN 0x0 + + +/* ------------------------------------------------------------------------- */ +/** \defgroup MEMORY Memory + * See page \ref PAGE_MEMORY. + * @{ + */ + +/** + * \brief To indicate the full process virtual address space as a heap for + * functions where a nvtxMemHeapHandle_t is accepted. + * + * The heap by default is always read-write-execute permissions without creating regions. + * Regions created in this heap have read-write access by default but not execute. + */ +#define NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE ((nvtxMemHeapHandle_t)0) + +/** \brief This heap is a sub-allocator. + * + * Heap created with this usage should not be accessed by the user until regions are registered. + * Regions from a heap with this usage have read-write access by default but not execute. + */ +#define NVTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR 0x1 + +/** + * \brief This is a heap of memory that has an explicit layout. + * + * The layout could be static or dynamic (calculated). This often represents an algorithm's + * structures that are packed together. By default this heap is assumed to be accessible for + * scopes where the memory is naturally accessible by hardware. Regions may be use to further + * annotate or restrict access. A tool may have an option to be more strict, but special + * consideration must be made for `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`. + * + * The behavior of this usage is similar to NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE but + * a tool can use it to track special behaviors and reservation. + * + * Memory in a heap with this usage has read-write permissions by default but not execute without + * creating regions. Regions created in this heap have the same default permission access. + */ +#define NVTX_MEM_HEAP_USAGE_TYPE_LAYOUT 0x2 + + +/** + * \brief Standard process userspace virtual addresses for linear allocations. + * + * APIs that map into this space, such as CUDA UVA should use this type. 
+ * + * Relevant functions: cudaMalloc, cudaMallocManaged, cudaHostAlloc, cudaMallocHost + * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is supported + * + * nvtxMemHeapRegister receives a heapDesc of type nvtxMemVirtualRangeDesc_t + */ +#define NVTX_MEM_TYPE_VIRTUAL_ADDRESS 0x1 + + +/** + * \brief To indicate you are modifying permissions to the process-wide + * full virtual address space. + * + * This is a companion object to `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`. + */ +#define NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE ((nvtxMemPermissionsHandle_t)0) + +#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_NONE 0x0 +#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_READ 0x1 +#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE 0x2 +#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_ATOMIC 0x4 + + +/* \cond SHOW_HIDDEN + * \brief Forward declaration of opaque memory heap structure. + */ +struct nvtxMemHeap_v1; +typedef struct nvtxMemHeap_v1 nvtxMemHeap_t; + +/** \brief A handle returned by a tool to represent a memory heap. */ +typedef nvtxMemHeap_t* nvtxMemHeapHandle_t; + +/* \cond SHOW_HIDDEN + * \brief Forward declaration of opaque memory heap structure. + */ +struct nvtxMemRegion_v1; +typedef struct nvtxMemRegion_v1 nvtxMemRegion_t; + +/** \brief A handle returned by a tool to represent a memory region. */ +typedef nvtxMemRegion_t* nvtxMemRegionHandle_t; + +/** \brief A reference to a memory region (by pointer or handle). + * Which member of the union will be determined by a type or flag field outside. + */ +typedef union nvtxMemRegionRef_t +{ + void const* pointer; + nvtxMemRegionHandle_t handle; +} nvtxMemRegionRef_t; + +/* \cond SHOW_HIDDEN + * \brief Forward declaration of opaque memory permissions structure + */ +struct nvtxMemPermissions_v1; +typedef struct nvtxMemPermissions_v1 nvtxMemPermissions_t; + +/** \brief A handle returned by a tool to represent a memory permissions mask. */ +typedef nvtxMemPermissions_t* nvtxMemPermissionsHandle_t; + + +typedef struct nvtxMemVirtualRangeDesc_v1 +{ + size_t size; + void const* ptr; +} nvtxMemVirtualRangeDesc_v1 ; +typedef nvtxMemVirtualRangeDesc_v1 nvtxMemVirtualRangeDesc_t; + + +/** \brief structure to describe a heap in process virtual memory. */ +typedef struct nvtxMemHeapDesc_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. */ + uint32_t reserved0; + + /** \brief Usage characteristics of the heap + * + * Usage characteristics help tools like memcheckers, santiizer, + * as well as other debugging and profiling tools to determine some + * special behaviors they should apply to the heap and it's regions. + * The value follows the convention NVTX_MEM_HEAP_USAGE_* + * + * Default Value is 0, which is invalid. + */ + uint32_t usage; + + /** \brief Memory type characteristics of the heap + * + * The 'type' indicates how to interpret the ptr field of the heapDesc. + * This is intended to support many additional types of memory, beyond + * standard process virtual memory, such as API specific memory only + * addressed by handles or multi-dimensional memory requiring more complex + * descriptions to handle features like strides, tiling, or interlace. + * + * The values conforms to NVTX_MEM_TYPE_* + * + * The value in the field 'type' identifies the descriptor type that will + * be in the field 'typeSpecificDesc'. 'typeSpecificDesc' is void* because + * it is extensible. 
Example usage is if type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS, + * then typeSpecificDesc points to a nvtxMemVirtualRangeDesc_t. + * + * Default Value is 0, which is invalid. + */ + uint32_t type; + + /** \brief size of the heap memory descriptor pointed to by typeSpecificDesc + * + * Default Value is 0 which is invalid. + */ + size_t typeSpecificDescSize; + + /** \brief Pointer to the heap memory descriptor + * + * The value in the field 'type' identifies the descriptor type that will + * be in the field 'typeSpecificDesc'. 'typeSpecificDesc' is void* because + * it is extensible. Example usage is if type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS, + * then typeSpecificDesc points to a nvtxMemVirtualRangeDesc_t. + * + * Default Value is 0, which is invalid. + */ + void const* typeSpecificDesc; + + /** \brief ID of the category the event is assigned to. + * + * A category is a user-controlled ID that can be used to group + * events. The tool may use category IDs to improve filtering or + * enable grouping of events in the same category. The functions + * \ref ::nvtxNameCategoryA or \ref ::nvtxNameCategoryW can be used + * to name a category. + * + * Default Value is 0. + */ + uint32_t category; + + /** \brief Message type specified in this attribute structure. + * + * Defines the message format of the attribute structure's \ref MESSAGE_FIELD + * "message" field. + * + * Default Value is `NVTX_MESSAGE_UNKNOWN`. + */ + uint32_t messageType; /* nvtxMessageType_t */ + + /** \brief Message assigned to this attribute structure. \anchor MESSAGE_FIELD + * + * The text message that is attached to an event. + */ + nvtxMessageValue_t message; + +} nvtxMemHeapDesc_v1 ; +typedef nvtxMemHeapDesc_v1 nvtxMemHeapDesc_t; + +/** + * \brief Create a memory heap to represent a object or range of memory that will be further + * sub-divided into regions. + * + * The handle used to addrss the heap will depend on the heap's type. Where the heap is virtual + * memory accessible, the addrss of the heap's memory itself is it's handle. This will likewise + * be returned from the function. + * + * For more advanced types, where the heap is not virtual memory accessible the tools may be + * responsible for returning a void const * that that uniquely identifies the object. Please see + * the description of each heap type for more details on whether this is expected to be a uniquely + * generated by the tool or otherwise. + */ +NVTX_DECLSPEC nvtxMemHeapHandle_t NVTX_API nvtxMemHeapRegister( + nvtxDomainHandle_t domain, + nvtxMemHeapDesc_t const* desc); + + /** \brief Destroy a memory heap. */ +NVTX_DECLSPEC void NVTX_API nvtxMemHeapUnregister( + nvtxDomainHandle_t domain, + nvtxMemHeapHandle_t heap);/* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported */ + +/** + * \brief Reset the memory heap wipes out any changes, as if it were a fresh heap. + * + * This includes invalidating all regions and their handles. + */ +NVTX_DECLSPEC void NVTX_API nvtxMemHeapReset( + nvtxDomainHandle_t domain, + nvtxMemHeapHandle_t heap); /* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is supported */ + +/** + * \brief Register a region of memory inside of a heap. + * + * The heap refers the the heap within which the region resides. This can be from + * `nvtxMemHeapRegister`, `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`, or one provided + * from other extension API. + * + * The regionType arg will define which type is used in regionDescArray. + * The most commonly used type is `NVTX_MEM_TYPE_VIRTUAL_ADDRESS`. 
+ * In this case regionDescElements is an array of `nvtxMemVirtualRangeDesc_t`. + * + * The regionCount arg is how many element are in regionDescArray and regionHandleArrayOut. + * + * The regionHandleArrayOut arg points to an array where the tool will provide region handles. If + * a pointer is provided, it is expected to have regionCount elements. This pointer can be NULL if + * regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. In this case, the user can use the pointer to the + * virtual memory to reference the region in other related functions which accept nvtMemRegionRef_t. + */ +typedef struct nvtxMemRegionsRegisterBatch_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. */ + + uint32_t regionType; /* NVTX_MEM_TYPE_* */ + + nvtxMemHeapHandle_t heap; + + size_t regionCount; + size_t regionDescElementSize; + void const* regionDescElements; /* This will also become the handle for this region. */ + nvtxMemRegionHandle_t* regionHandleElementsOut; /* This will also become the handle for this region. */ + +} nvtxMemRegionsRegisterBatch_v1; +typedef nvtxMemRegionsRegisterBatch_v1 nvtxMemRegionsRegisterBatch_t; + + /** \brief Register a region of memory inside of a heap of linear process virtual memory + */ +NVTX_DECLSPEC void NVTX_API nvtxMemRegionsRegister( + nvtxDomainHandle_t domain, + nvtxMemRegionsRegisterBatch_t const* desc); + + + +/** + * \brief Register a region of memory inside of a heap. + * + * The heap refers the the heap within which the region resides. + * This can be from nvtxMemHeapRegister, NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE, or + * one provided from other extension API. + * + * The regionType arg will define which type is used in regionDescArray. + * The most commonly used type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. + * + * The regionCount arg is how many element are in regionDescArray and regionHandleArrayOut. + * + * The regionHandleArrayOut arg points to an array where the tool will provide region handles. If + * a pointer if provided, it is expected to have regionCount elements. This pointer can be NULL if + * regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. In this case, the user can use the pointer to the + * virtual memory to reference the region in other related functions which accept nvtMemRegionRef_t. + */ +typedef struct nvtxMemRegionsResizeBatch_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. */ + + uint32_t regionType; /* NVTX_MEM_TYPE_* */ + + size_t regionDescCount; + size_t regionDescElementSize; + void const* regionDescElements; /* This will also become the handle for this region. */ + +} nvtxMemRegionsResizeBatch_v1; +typedef nvtxMemRegionsResizeBatch_v1 nvtxMemRegionsResizeBatch_t; + + /** \brief Register a region of memory inside of a heap of linear process virtual memory + */ +NVTX_DECLSPEC void NVTX_API nvtxMemRegionsResize( + nvtxDomainHandle_t domain, + nvtxMemRegionsResizeBatch_t const* desc); + + +#define NVTX_MEM_REGION_REF_TYPE_UNKNOWN 0x0 +#define NVTX_MEM_REGION_REF_TYPE_POINTER 0x1 +#define NVTX_MEM_REGION_REF_TYPE_HANDLE 0x2 + +/** + * \brief Register a region of memory inside of a heap. + * + * The heap refers the the heap within which the region resides. + * This can be from nvtxMemHeapRegister, `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`, or + * one provided from other extension API. + * + * The regionType arg will define which type is used in `regionDescArray`. + * The most commonly used type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. 
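For orientation (not part of the patch itself), here is a minimal sketch of how the heap and region registration declared above could be used. It assumes the added header is reachable as "nvtx3/nvToolsExtMem.h", that `domain` comes from the core nvtxDomainCreateA, and that NVTX_MESSAGE_TYPE_ASCII / nvtxMessageValue_t come from the core nvToolsExt.h; the function and variable names (annotatePool, pool, regionPtr) are illustrative only.

#include <stddef.h>
#include "nvtx3/nvToolsExtMem.h"

/* Sketch: annotate a malloc'ed pool as a sub-allocator heap and register
 * one region (an allocation handed out by the pool) inside it. */
static nvtxMemHeapHandle_t annotatePool(nvtxDomainHandle_t domain,
                                        void* pool, size_t poolSize,
                                        void* regionPtr, size_t regionSize)
{
    nvtxMemVirtualRangeDesc_t heapRange = {0};
    heapRange.ptr  = pool;
    heapRange.size = poolSize;

    nvtxMemHeapDesc_t heapDesc = {0};
    heapDesc.extCompatID          = NVTX_EXT_COMPATID_MEM;
    heapDesc.structSize           = sizeof(heapDesc);
    heapDesc.usage                = NVTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR;
    heapDesc.type                 = NVTX_MEM_TYPE_VIRTUAL_ADDRESS;
    heapDesc.typeSpecificDescSize = sizeof(heapRange);
    heapDesc.typeSpecificDesc     = &heapRange;
    heapDesc.messageType          = NVTX_MESSAGE_TYPE_ASCII; /* from core nvToolsExt.h */
    heapDesc.message.ascii        = "my allocator pool";

    nvtxMemHeapHandle_t heap = nvtxMemHeapRegister(domain, &heapDesc);

    nvtxMemVirtualRangeDesc_t regionRange = {0};
    regionRange.ptr  = regionPtr;
    regionRange.size = regionSize;

    nvtxMemRegionsRegisterBatch_t batch = {0};
    batch.extCompatID             = NVTX_EXT_COMPATID_MEM;
    batch.structSize              = sizeof(batch);
    batch.regionType              = NVTX_MEM_TYPE_VIRTUAL_ADDRESS;
    batch.heap                    = heap;
    batch.regionCount             = 1;
    batch.regionDescElementSize   = sizeof(regionRange);
    batch.regionDescElements      = &regionRange;
    batch.regionHandleElementsOut = NULL; /* region is referenced by its pointer */

    nvtxMemRegionsRegister(domain, &batch);
    return heap;
}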
+ * + * The regionCount arg is how many element are in regionDescArray and regionHandleArrayOut. + * + * The regionHandleArrayOut arg points to an array where the tool will provide region handles. + * If a pointer if provided, it is expected to have regionCount elements. + * This pointer can be NULL if regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. In this case, + * the user can use the pointer to the virtual memory to reference the region in other + * related functions which accept a nvtMemRegionRef_t. + */ +typedef struct nvtxMemRegionsUnregisterBatch_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. */ + + uint32_t refType; /* NVTX_MEM_REGION_REF_TYPE_* */ + + size_t refCount; /* count of elements in refArray */ + size_t refElementSize; + nvtxMemRegionRef_t const* refElements; /* This will also become the handle for this region. */ + +} nvtxMemRegionsUnregisterBatch_v1; +typedef nvtxMemRegionsUnregisterBatch_v1 nvtxMemRegionsUnregisterBatch_t; + +/** + * \brief Unregistration for regions of process virtual memory + * + * This is not necessary if the nvtx heap destroy function has been called that + * contains this object. + */ +NVTX_DECLSPEC void NVTX_API nvtxMemRegionsUnregister( + nvtxDomainHandle_t domain, + nvtxMemRegionsUnregisterBatch_t const* desc); + +typedef struct nvtxMemRegionNameDesc_v1 +{ + uint32_t regionRefType; /* NVTX_MEM_REGION_REF_TYPE_* */ + uint32_t nameType; /* nvtxMessageType_t */ + + nvtxMemRegionRef_t region; + nvtxMessageValue_t name; + + uint32_t category; + uint32_t reserved0; +} nvtxMemRegionNameDesc_v1; +typedef nvtxMemRegionNameDesc_v1 nvtxMemRegionNameDesc_t; + + +typedef struct nvtxMemRegionsNameBatch_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. */ + + uint32_t reserved0; + + size_t regionCount; + size_t regionElementSize; + nvtxMemRegionNameDesc_t const* regionElements; + size_t reserved1; +} nvtxMemRegionsNameBatch_v1 ; +typedef nvtxMemRegionsNameBatch_v1 nvtxMemRegionsNameBatch_t; + + + /** \brief Name or rename a region. */ +NVTX_DECLSPEC void NVTX_API nvtxMemRegionsName( + nvtxDomainHandle_t domain, + nvtxMemRegionsNameBatch_t const* desc); + +/** \brief There are no permissions for this memory. */ +#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_NONE 0x0 + +/** \brief The memory is readable. */ +#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_READ 0x1 + +/** \brief The memory is writable. */ +#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_WRITE 0x2 + +/** \brief The memory is for atomic RW. */ +#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_ATOMIC 0x4 + +/** + * \brief The memory access permissions are reset for a region. + * + * This is as if never set, rather than documented defaults. As as result any flags + * indicating how unspecified regions are handle will affect this area. + * + * This should not be used with READ, WRITE, nor ATOMIC, as those flags would have no effect. + */ +#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_RESET 0x8 + + +typedef struct nvtxMemPermissionsAssignRegionDesc_v1 +{ + uint32_t flags; /* NVTX_MEM_PERMISSIONS_REGION_FLAGS_* */ + uint32_t regionRefType; /* NVTX_MEM_REGION_REF_TYPE_* */ + nvtxMemRegionRef_t region; + +} nvtxMemPermissionsAssignRegionDesc_v1 ; +typedef nvtxMemPermissionsAssignRegionDesc_v1 nvtxMemPermissionsAssignRegionDesc_t; + + +typedef struct nvtxMemPermissionsAssignBatch_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. 
*/ + + uint32_t reserved0; + + nvtxMemPermissionsHandle_t permissions; + + size_t regionCount; + size_t regionElementSize; + nvtxMemPermissionsAssignRegionDesc_t const* regionElements; + + size_t reserved1; +} nvtxMemPermissionsAssignBatch_v1 ; +typedef nvtxMemPermissionsAssignBatch_v1 nvtxMemPermissionsAssignBatch_t; + + + /** \brief Change the permissions of a region of process virtual memory. */ +NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsAssign( + nvtxDomainHandle_t domain, + nvtxMemPermissionsAssignBatch_t const* desc); + + +/** + * \brief Create a permissions object for fine grain thread-local control in + * multi-threading scenarios + * + * Unlike the global permissions object (NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE), a new + * permissions object is empty. There are no regions registered to it, so more memory is accessible + * if bound(bind) without calls to nvtxMemPermissionsSetAccess* first. The permissions are not + * active until nvtxMemPermissionsBind. See `nvtxMemPermissionsBind` for more details. + * + * Use the flags NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_* to control how the regions in + * this permission object will interact with global permissions when bound. You may choose to + * either replace global memory regions setting or overlay on top of them. The most common uses are + * as follows: + * * To limit tools to validate writing exclusively specified in this object but inherit all + * global read access regions use `NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE` + * * To limit tools to validate both read & write permissions exclusively specified in this + * object use NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_READ + * & NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE + * + * Also see `nvtxMemPermissionsBind` & `nvtxMemPermissionsSetAccess*`. + */ +NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemPermissionsCreate( + nvtxDomainHandle_t domain, + int32_t creationflags); /* NVTX_MEM_PERMISSIONS_CREATE_FLAGS_* */ + +/** + * \brief Destroy the permissions object. + * + * If bound(bind), destroy will also unbind it. + */ +NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsDestroy( + nvtxDomainHandle_t domain, + nvtxMemPermissionsHandle_t permissionsHandle); /* only supported on objects from nvtxMemPermissionsCreate */ + +/** \brief Reset the permissions object back to its created state. */ +NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsReset( + nvtxDomainHandle_t domain, + nvtxMemPermissionsHandle_t permissionsHandle); +/* NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE and other special handles are supported */ + + +#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_NONE 0x0 + + /** \brief Upon binding, with the thread, exclude parent scope write regions instead of overlaying on top of them. + * + * EX A developer may chose to first prevent all writes except the ones specified to avoid + * OOB writes, since there are typically less regions written to than read from. + **/ +#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_WRITE 0x2 + + /** \brief Upon binding, with the thread, exclude parent scope read regions instead of overlaying on top of them. + * + * EX After eliminating any errors when applying strict writes, a developer may then choose to + * annotate and enforce strict reads behaviors in segments of code. + **/ +#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_READ 0x1 + + /** \brief Upon binding, with the thread, exclude parent scope atomic RW regions instead of overlaying on top of them. 
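A minimal sketch of the create/assign pair declared above, under the assumption that `domain` and `buf` are illustrative names and that `buf` refers to a region previously registered by pointer (NVTX_MEM_TYPE_VIRTUAL_ADDRESS):

#include "nvtx3/nvToolsExtMem.h"

/* Sketch: create a permissions object that keeps global read access but
 * drops global write access, then mark one region as read-only. The
 * permissions only take effect once bound (see nvtxMemPermissionsBind). */
static nvtxMemPermissionsHandle_t makeReadOnly(nvtxDomainHandle_t domain,
                                               const void* buf)
{
    nvtxMemPermissionsHandle_t perm = nvtxMemPermissionsCreate(
        domain, NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE);

    nvtxMemPermissionsAssignRegionDesc_t regionPerm = {0};
    regionPerm.flags          = NVTX_MEM_PERMISSIONS_REGION_FLAGS_READ;
    regionPerm.regionRefType  = NVTX_MEM_REGION_REF_TYPE_POINTER;
    regionPerm.region.pointer = buf;

    nvtxMemPermissionsAssignBatch_t batch = {0};
    batch.extCompatID       = NVTX_EXT_COMPATID_MEM;
    batch.structSize        = sizeof(batch);
    batch.permissions       = perm;
    batch.regionCount       = 1;
    batch.regionElementSize = sizeof(regionPerm);
    batch.regionElements    = &regionPerm;

    nvtxMemPermissionsAssign(domain, &batch);
    return perm;
}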
+ * + * EX After eliminating any errors from read and write, a developer may chose to ensure + * that atomics are in their own region, removing standard read/write, and replacing with + * this strict atomic only access. This way they know that conventional reads or writes + * will not cause unepected issues. + **/ +#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_ATOMIC 0x4 + + +#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_UNKNOWN 0x0 + + /** \brief Bind to thread scope. In this case, tools should validate that local thread's + * execution is honoring the permissions as well as the state of NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE + * at the time of binding. If this is not bound then NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE should be + * used to validate the memory. + * + * Not all tools will support every scope, such a GPU sanitizer. + **/ +#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD 0x1 + +/** + * \brief Bind to CUDA stream scope. + * + * In this case, work enqueued to a CUDA stream should be validated by the tool, + * when it executes, that it respect the permission of the permission at the point + * of binding, as well as the appropriate nvtxMemCudaGetDevicePermissions at the + * time of binding. If this is not bound then nvtxMemCudaGetDevicePermissions at + * the time of stream enqueue should be used to validate the memory. + * + * This could apply to work done either on the GPU like a kernel launch or to + * CPU based callbacks like cudaStreamAddCallback if the tools supports it. + * + * Binding is applies locally to a CPU thread so that if N CPU threads are enqueing + * work to the same stream (like the default stream) that there cannot be a race + * condition between thread binding vs launching their work. IE users should + * expect the permissions bound in the thread to be honored by the proceeding + * work (launches, copies, etc) invoked from in the CPU thread until unbound. + */ +#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM 0x2 + + +/** + * \brief Bind the permissions object into a particular scope on the caller thread + * + * Permissions do not take affect until binding. Binding permissions is a thread local + * activity that overrides global behaviors. This is to avoid multi-threaded race conditions, + * + * The scope dictates what type of processing it applies to, and when in some cases. + * EX1: NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD applies to CPU code accessing memory while bound. + * EX2: NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM applies to CUDA streams, and the permissions + * must be recorded and applied when the work in the stream dequeues to executes. In this case + * it could be GPU or CPU, if the tool support both. + * + * Bind can be called again on the same object and thread to take any updates to the + * specified permission object or the inherited properties. + * + * Bind flags support changing how the binding process inherits region access control. + * In the case of thread scope this is NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE and from CUDA_STREAM + * this is nvtxMemCudaGetDevicePermissions. Choosing stricter modes allows the user to + * further reduce the access with less work, since memory by default, behaves as natural + * until the NVTX annotations instructs a tool to treat it anther way. See strict flags + * for more details. 
+ * + * Also see nvtxMemPermissionsUnbind + */ +NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsBind( + nvtxDomainHandle_t domain, + nvtxMemPermissionsHandle_t permissions, /* special object like NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE are not supported */ + uint32_t bindScope, /* NVTX_MEM_PERMISSIONS_BIND_SCOPE_* */ + uint32_t bindFlags); /* NVTX_MEM_PERMISSIONS_BIND_FLAGS_* */ + +/** + * \brief Unbind the permissions object bound to the caller thread. + * + * Upon unbind, the thread local permissions for a scope are restored to the default + * behavior defined by the scope. + */ +NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsUnbind( + nvtxDomainHandle_t domain, + uint32_t bindScope); + +/** @} */ /*END defgroup*/ + +typedef enum NvtxExtMemCallbackId +{ + /* CBID 0 is invalid */ + NVTX3EXT_CBID_nvtxMemHeapRegister = 1, + NVTX3EXT_CBID_nvtxMemHeapUnregister = 2, + NVTX3EXT_CBID_nvtxMemHeapReset = 3, + NVTX3EXT_CBID_nvtxMemRegionsRegister = 4, + NVTX3EXT_CBID_nvtxMemRegionsResize = 5, + NVTX3EXT_CBID_nvtxMemRegionsUnregister = 6, + NVTX3EXT_CBID_nvtxMemRegionsName = 7, + NVTX3EXT_CBID_nvtxMemPermissionsAssign = 8, + NVTX3EXT_CBID_nvtxMemPermissionsCreate = 9, + NVTX3EXT_CBID_nvtxMemPermissionsDestroy = 10, + NVTX3EXT_CBID_nvtxMemPermissionsReset = 11, + NVTX3EXT_CBID_nvtxMemPermissionsBind = 12, + NVTX3EXT_CBID_nvtxMemPermissionsUnbind = 13, + + /* 14-16 in nvtExtImplMemCudaRt1.h */ + NVTX3EXT_CBID_nvtxMemCudaGetProcessWidePermissions = 14, + NVTX3EXT_CBID_nvtxMemCudaGetDeviceWidePermissions = 15, + NVTX3EXT_CBID_nvtxMemCudaSetPeerAccess = 16, + + NVTX3EXT_CBID_MEM_FN_NUM = 17 +} NvtxExtMemCallbackId; + +#ifdef __GNUC__ +#pragma GCC visibility push(internal) +#endif + +/* Extension types are required for the implementation and the NVTX handler. */ +#define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot be included directly */ +#include "nvtxDetail/nvtxExtTypes.h" +#undef NVTX_EXT_TYPES_GUARD + +#ifndef NVTX_NO_IMPL +/* Ensure other headers cannot be included directly */ +#define NVTX_EXT_IMPL_MEM_GUARD +#include "nvtxDetail/nvtxExtImplMem_v1.h" +#undef NVTX_EXT_IMPL_MEM_GUARD +#endif /*NVTX_NO_IMPL*/ + +#ifdef __GNUC__ +#pragma GCC visibility pop +#endif + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* NVTOOLSEXTV3_MEM_V1 */ diff --git a/src/include/nvtx3/nvToolsExtMemCudaRt.h b/src/include/nvtx3/nvToolsExtMemCudaRt.h new file mode 100644 index 000000000..2b374bff9 --- /dev/null +++ b/src/include/nvtx3/nvToolsExtMemCudaRt.h @@ -0,0 +1,150 @@ +/* +* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ +#ifndef NVTOOLSEXTV3_MEM_CUDART_V1 +#define NVTOOLSEXTV3_MEM_CUDART_V1 + +#include "nvToolsExtMem.h" + +#include "cuda.h" +#include "cuda_runtime.h" + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + + +/** \brief The memory is from a CUDA runtime array. 
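Putting the bind/unbind calls from nvToolsExtMem.h together with the thread scope described above, a minimal sketch (illustrative function name; `perm` is assumed to come from nvtxMemPermissionsCreate, since the process-wide handle is not allowed here):

#include "nvtx3/nvToolsExtMem.h"

/* Sketch: bind a permissions object to the calling CPU thread so an attached
 * tool can validate accesses made from this thread, then unbind when done. */
static void checkSection(nvtxDomainHandle_t domain,
                         nvtxMemPermissionsHandle_t perm)
{
    nvtxMemPermissionsBind(domain, perm,
                           NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD,
                           NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_WRITE);

    /* ... code whose memory accesses should honor `perm` runs here ... */

    nvtxMemPermissionsUnbind(domain, NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD);
}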
+ * + * Relevant functions: cudaMallocArray, cudaMalloc3DArray + * Also cudaArray_t from other types such as cudaMipmappedArray_t + * + * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported + * + * nvtxMemHeapRegister receives a heapDesc of type cudaArray_t because the description can be retrieved by tools through cudaArrayGetInfo() + * nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCudaArrayRangeDesc_t + */ +#define NVTX_MEM_TYPE_CUDA_ARRAY 0x11 + +/** \brief structure to describe memory in a CUDA array object + */ +typedef struct nvtxMemCudaArrayRangeDesc_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. */ + uint32_t reserved0; + cudaArray_t src; + size_t offset[3]; + size_t extent[3]; +} nvtxMemCudaArrayRangeDesc_v1; +typedef nvtxMemCudaArrayRangeDesc_v1 nvtxMemCudaArrayRangeDesc_t; + + +/** \brief The memory is from a CUDA device array. + * + * Relevant functions: cuArrayCreate, cuArray3DCreate + * Also CUarray from other types such as CUmipmappedArray + * + * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported + * + * nvtxMemHeapRegister receives a heapDesc of type cudaArray_t because the description can be retrieved by tools through cudaArrayGetInfo() + * nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCuArrayRangeDesc_t + */ +#define NVTX_MEM_TYPE_CU_ARRAY 0x12 + +/** \brief structure to describe memory in a CUDA array object + */ +typedef struct nvtxMemCuArrayRangeDesc_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. */ + uint32_t reserved0; + CUarray src; + size_t offset[3]; + size_t extent[3]; +} nvtxMemCuArrayRangeDesc_v1; +typedef nvtxMemCuArrayRangeDesc_v1 nvtxMemCuArrayRangeDesc_t; + +/* Reserving 0x2-0xF for more common types */ + +#define NVTX_MEM_CUDA_PEER_ALL_DEVICES -1 + +/** \brief Get the permission object that represent the CUDA runtime device + * or cuda driver context + * + * This object will allow developers to adjust permissions applied to work executed + * on the GPU. It may be inherited or overridden by permissions object bound + * with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags. + * + * Ex. change the peer to peer access permissions between devices in entirety + * or punch through special holes + * + * By default, all memory is accessible that naturally would be to a CUDA kernel until + * modified otherwise by nvtxMemCudaSetPeerAccess or changing regions. + * + * This object should also represent the CUDA driver API level context. +*/ +NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetProcessWidePermissions( + nvtxDomainHandle_t domain); + +/** \brief Get the permission object that represent the CUDA runtime device + * or cuda driver context + * + * This object will allow developers to adjust permissions applied to work executed + * on the GPU. It may be inherited or overridden by permissions object bound + * with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags. + * + * Ex. change the peer to peer access permissions between devices in entirety + * or punch through special holes + * + * By default, all memory is accessible that naturally would be to a CUDA kernel until + * modified otherwise by nvtxMemCudaSetPeerAccess or changing regions. + * + * This object should also represent the CUDA driver API level context. 
+*/ +NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetDeviceWidePermissions( + nvtxDomainHandle_t domain, + int device); + +/** \brief Change the default behavior for all memory mapped in from a particular device. + * + * While typically all memory defaults to readable and writable, users may desire to limit + * access to reduced default permissions such as read-only and a per-device basis. + * + * Regions can used to further override smaller windows of memory. + * + * devicePeer can be NVTX_MEM_CUDA_PEER_ALL_DEVICES + * +*/ +NVTX_DECLSPEC void NVTX_API nvtxMemCudaSetPeerAccess( + nvtxDomainHandle_t domain, + nvtxMemPermissionsHandle_t permissions, + int devicePeer, /* device number such as from cudaGetDevice() or NVTX_MEM_CUDA_PEER_ALL_DEVICES */ + uint32_t flags); /* NVTX_MEM_PERMISSIONS_REGION_FLAGS_* */ + +/** @} */ /*END defgroup*/ + +#ifdef __GNUC__ +#pragma GCC visibility push(internal) +#endif + +#ifndef NVTX_NO_IMPL +#define NVTX_EXT_IMPL_MEM_CUDART_GUARD /* Ensure other headers cannot be included directly */ +#include "nvtxDetail/nvtxExtImplMemCudaRt_v1.h" +#undef NVTX_EXT_IMPL_MEM_CUDART_GUARD +#endif /*NVTX_NO_IMPL*/ + +#ifdef __GNUC__ +#pragma GCC visibility pop +#endif + + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* NVTOOLSEXTV3_MEM_CUDART_V1 */ diff --git a/src/include/nvtx3/nvToolsExtOpenCL.h b/src/include/nvtx3/nvToolsExtOpenCL.h index a7b8a19b0..7b40b4115 100644 --- a/src/include/nvtx3/nvToolsExtOpenCL.h +++ b/src/include/nvtx3/nvToolsExtOpenCL.h @@ -30,11 +30,11 @@ extern "C" { */ /* ------------------------------------------------------------------------- */ -/* \cond SHOW_HIDDEN +/* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ -#define NVTX_RESOURCE_CLASS_OPENCL 6 +#define NVTX_RESOURCE_CLASS_OPENCL 6 /** \endcond */ /* ------------------------------------------------------------------------- */ @@ -183,7 +183,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name) #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL -#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot included directly */ +#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxImplOpenCL_v3.h" #undef NVTX_IMPL_GUARD_OPENCL #endif /*NVTX_NO_IMPL*/ diff --git a/src/include/nvtx3/nvToolsExtPayload.h b/src/include/nvtx3/nvToolsExtPayload.h index a46c833e2..c775738b1 100644 --- a/src/include/nvtx3/nvToolsExtPayload.h +++ b/src/include/nvtx3/nvToolsExtPayload.h @@ -1,5 +1,5 @@ /* -* Copyright 2021-2022 NVIDIA Corporation. All rights reserved. +* Copyright 2021-2024 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. @@ -8,34 +8,41 @@ #include "nvToolsExt.h" -#ifndef NVTOOLSEXT_PAYLOAD_H -#define NVTOOLSEXT_PAYLOAD_H +/* Optionally include helper macros. */ +/* #include "nvToolsExtPayloadHelper.h" */ -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ +/** + * If needed, semantic extension headers can be included after this header. + */ /** - * \brief A compatibility ID value used in initialization to identify version - * differences. + * \brief The compatibility ID is used for versioning of this extension. 
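Tying together the device-wide permission query and the peer-access control from nvToolsExtMemCudaRt.h above, a minimal sketch (illustrative names; `device` is a CUDA runtime device ordinal, and the call only changes what an attached tool validates, not real CUDA peer access):

#include "nvtx3/nvToolsExtMemCudaRt.h"

/* Sketch: restrict the default access that kernels running on `device` have
 * to memory mapped in from all peer devices to read-only. */
static void restrictPeerWrites(nvtxDomainHandle_t domain, int device)
{
    nvtxMemPermissionsHandle_t devPerm =
        nvtxMemCudaGetDeviceWidePermissions(domain, device);

    nvtxMemCudaSetPeerAccess(domain, devPerm,
                             NVTX_MEM_CUDA_PEER_ALL_DEVICES,
                             NVTX_MEM_PERMISSIONS_REGION_FLAGS_READ);
}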
*/ -#define NVTX_EXT_COMPATID_PAYLOAD 0x0103 +#ifndef NVTX_EXT_PAYLOAD_COMPATID +#define NVTX_EXT_PAYLOAD_COMPATID 0x0103 +#endif /** - * \brief This module ID identifies the payload extension. It has to be unique + * \brief The module ID identifies the payload extension. It has to be unique * among the extension modules. */ -#define NVTX_EXT_MODULEID_PAYLOAD 2 +#ifndef NVTX_EXT_PAYLOAD_MODULEID +#define NVTX_EXT_PAYLOAD_MODULEID 2 +#endif /** - * \brief Additional values for the enum @ref nvtxPayloadType_t + * \brief Additional value for the enum @ref nvtxPayloadType_t */ -#define NVTX_PAYLOAD_TYPE_BINARY ((int32_t)0xDFBD0009) - +#ifndef NVTX_PAYLOAD_TYPE_EXT +#define NVTX_PAYLOAD_TYPE_EXT ((int32_t)0xDFBD0009) +#endif /** --------------------------------------------------------------------------- - * Payload schema entry flags. + * Payload schema entry flags. Used for @ref nvtxPayloadSchemaEntry_t::flags. * ------------------------------------------------------------------------- */ +#ifndef NVTX_PAYLOAD_ENTRY_FLAGS_V1 +#define NVTX_PAYLOAD_ENTRY_FLAGS_V1 + #define NVTX_PAYLOAD_ENTRY_FLAG_UNUSED 0 /** @@ -56,37 +63,79 @@ extern "C" { /** * The value is an array with fixed length, set with the field `arrayLength`. */ -#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE (1 << 4) +#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE (1 << 4) /** * The value is a zero-/null-terminated array. */ -#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED (2 << 4) +#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED (2 << 4) /** * \brief A single or multi-dimensional array of variable length. * - * The field `arrayLength` contains the index of the schema entry that holds the - * length(s). If the other field points to a scalar entry then this will be the - * 1D array. If the other field points to a FIXED_SIZE array, then the number of - * dimensions is defined with the registration of the scheme. If the other field - * is ZERO_TERMINATED, the array the dimensions can be determined at runtime. + * The field `arrayOrUnionDetail` contains the index of the schema entry that + * holds the length(s). If the length entry is a scalar, then this entry is a 1D + * array. If the length entry is a fixed-size array, then the number of + * dimensions is defined with the registration of the schema. If the length + * entry is a zero-terminated array, then the array of the dimensions can be + * determined at runtime. + * For multidimensional arrays, values are stored in row-major order, with rows + * being stored consecutively in contiguous memory. The size of the entry (in + * bytes) is the product of the dimensions multiplied with size of the array + * element. */ -#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX (3 << 4) +#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX (3 << 4) /** + * \brief A single or multi-dimensional array of variable length, where the + * dimensions are stored in a different payload (index) of the same event. + * + * This enables an existing address to an array to be directly passed, while the + * dimensions are defined in a separate payload (with only one payload entry). + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_PAYLOAD_INDEX (4 << 4) + +/** + * \brief The value or data that is pointed to by this payload entry value shall + * be copied by the NVTX handler. + * * A tool may not support deep copy and just ignore this flag. * See @ref NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY for more details. 
*/ -#define NVTX_PAYLOAD_ENTRY_FLAG_DEEP_COPY (1 << 9) +#define NVTX_PAYLOAD_ENTRY_FLAG_DEEP_COPY (1 << 8) + +/** + * Notifies the NVTX handler to hide this entry in case of visualization. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_HIDE (1 << 9) + +/** + * The entry specifies the event message. Any string type can be used. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE (1 << 10) /** - * The entry specifies the message in a deferred event. The entry type can be - * any string type. The flag is ignored for schemas that are not flagged with - * `NVTX_PAYLOAD_SCHEMA_FLAG_RANGE*` or `NVTX_PAYLOAD_SCHEMA_FLAG_MARK`. + * \brief The entry contains an event timestamp. + * + * The time source might be provided via the entry semantics field. In most + * cases, the timestamp (entry) type is @ref NVTX_PAYLOAD_ENTRY_TYPE_UINT64. */ -#define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE (1 << 10) +#define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP (2 << 10) +/** + * These flags specify the NVTX event type to which an entry refers. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN (1 << 12) +#define NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END (2 << 12) +#define NVTX_PAYLOAD_ENTRY_FLAG_MARK (3 << 12) +#define NVTX_PAYLOAD_ENTRY_FLAG_COUNTER (4 << 12) + +#endif /* NVTX_PAYLOAD_ENTRY_FLAGS_V1 */ +/** --------------------------------------------------------------------------- + * END: Payload schema entry flags. + * ------------------------------------------------------------------------- */ + +/** \todo: Keep this in the header? */ /** * @note The ‘array’ flags assume that the array is embedded. Otherwise, * @ref NVTX_PAYLOAD_ENTRY_FLAG_POINTER has to be additionally specified. Some @@ -103,11 +152,14 @@ extern "C" { NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED | \ NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX) +#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_TYPE(F) \ + (F & NVTX_PAYLOAD_ENTRY_FLAG_IS_ARRAY) +/** \todo end */ + + /** --------------------------------------------------------------------------- * Types of entries in a payload schema. - * ------------------------------------------------------------------------- */ - -/** + * * @note Several of the predefined types contain the size (in bits) in their * names. For some data types the size (in bytes) is not fixed and may differ * for different platforms/operating systems/compilers. To provide portability, @@ -116,9 +168,11 @@ extern "C" { * is passed to the NVTX extension initialization function * @ref InitializeInjectionNvtxExtension via the `extInfo` field of * @ref nvtxExtModuleInfo_t. - */ + * ------------------------------------------------------------------------- */ +#ifndef NVTX_PAYLOAD_ENTRY_TYPES_V1 +#define NVTX_PAYLOAD_ENTRY_TYPES_V1 -#define NVTX_PAYLOAD_ENTRY_TYPE_INVALID 0 +#define NVTX_PAYLOAD_ENTRY_TYPE_INVALID 0 /** * Basic integer types. @@ -147,14 +201,14 @@ extern "C" { #define NVTX_PAYLOAD_ENTRY_TYPE_UINT64 18 /** - * C floating point types + * Floating point types */ #define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT 19 #define NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE 20 #define NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE 21 /** - * Size type (`size_t`) + * Size type (`size_t` in C). */ #define NVTX_PAYLOAD_ENTRY_TYPE_SIZE 22 @@ -179,7 +233,7 @@ extern "C" { /** * Store raw 8-bit binary data. As with `char`, 1-byte alignment is assumed. - * Typically a tool will display this as hex or binary. + * Typically, a tool will display this as hex or binary. 
*/ #define NVTX_PAYLOAD_ENTRY_TYPE_BYTE 32 @@ -201,36 +255,37 @@ extern "C" { #define NVTX_PAYLOAD_ENTRY_TYPE_TF32 52 /** - * These types are normalized numbers stored in integers. UNORMs represent 0.0 - * to 1.0 and SNORMs represent -1.0 to 1.0. The number after represents the - * number of integer bits. Alignment is take from equivalent types INT# matching - * to SNORM# and UINT# matching to UNORM#. + * Data types are as defined by NVTXv3 core. */ -#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM8 61 -#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM8 62 -#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM16 63 -#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM16 64 -#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM32 65 -#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM32 66 -#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM64 67 -#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM64 68 +#define NVTX_PAYLOAD_ENTRY_TYPE_CATEGORY 68 /* uint32_t */ +#define NVTX_PAYLOAD_ENTRY_TYPE_COLOR_ARGB 69 /* uint32_t */ /** - * String types. - * - * If `arrayOrUnionDetail` is greater than `0`, the entry is a fixed-size string - * with the provided length. + * The scope of events or counters (see `nvtxScopeRegister`). + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_SCOPE_ID 70 /* uint64_t */ + +/** + * Thread ID as scope. + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT32 73 +#define NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT64 74 + +/** + * \brief String types. * - * `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE` is ignored for string types. It - * just specifies once more that the entry is a fixed-size string. + * If no flags are set for the entry and `arrayOrUnionDetail > 0`, the entry is + * assumed to be a fixed-size string with the given length, embedded in the payload. + * `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE` is redundant for fixed-size strings. * - * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED` indicates a - * zero-terminated string. If `arrayOrUnionDetail` is greater than `0`, a zero- - * terminated array of fixed-size strings is assumed. + * \todo(Revise the following paragraph.) + * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED` specifies a + * zero-terminated string. If `arrayOrUnionDetail > 0`, the entry is handled as + * a zero-terminated array of fixed-size strings. * - * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX` specifies the - * entry index of the entry which contains the string length. It is not possible - * to describe a variable length array of strings. + * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX` specifies a + * variable-length string with the length given in the entry specified by the + * field `arrayOrUnionDetail`. */ #define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING 75 /* `char*`, system LOCALE */ #define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF8 76 @@ -238,93 +293,194 @@ extern "C" { #define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF32 78 /** - * @ref nvtxStringHandle_t returned by @ref nvtxDomainRegisterString + * The entry value is of type @ref nvtxStringHandle_t returned by + * @ref nvtxDomainRegisterString. */ #define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_REGISTERED_STRING_HANDLE 80 -/** - * Entry types to be used in deferred events. Data types are as defined by - * NVTXv3 core: category -> uint32_t, color -> uint32_t, color type -> int32_t. - */ -#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_CATEGORY 90 -#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_COLORTYPE 91 -#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_COLOR 92 - /** * This type marks the union selector member (entry index) in schemas used by - * a union with internal internal selector. 
+ * a union with internal selector. * See @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR. */ #define NVTX_PAYLOAD_ENTRY_TYPE_UNION_SELECTOR 100 /** - * Timestamp types occupy the range from 128 to 255 + * \brief Predefined schema ID for payload data that is referenced in another payload. + * + * This schema ID can be used in @ref nvtxPayloadData_t::schema_id to indicate that the + * payload is a blob of memory which other payload entries may point into. + * A tool will not expose this payload directly. + * + * This schema ID cannot be used as schema entry type! */ -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP64 128 /* data type is uint64_t */ +#define NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED 1022 + +/** + * \brief Predefined schema ID for raw payload data. + * + * This schema ID can be used in @ref nvtxPayloadData_t::schema_id to indicate + * that the payload is a blob, which can be shown with an arbitrary data viewer. + * This schema ID cannot be used as schema entry type! + */ +#define NVTX_TYPE_PAYLOAD_SCHEMA_RAW 1023 + +/** + * \deprecated: Remove for official release! + * In the initial version of this header custom schema IDs started + * here. Unless predefined types require more than 16 bits we can keep this + * value to preserve backwards compatibility. The value is not used as first + * ID for custom schemas any more, but in the analysis every entry type >= this + * value is assumed to be a custom schema. + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_CUSTOM_BASE 65536 + +/* Custom (static) schema IDs. */ +#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START (1 << 24) + +/* Dynamic schema IDs (generated by the tool) start here. */ +#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START 4294967296 /* 1 << 32 */ + +#endif /* NVTX_PAYLOAD_ENTRY_TYPES_V1 */ +/** --------------------------------------------------------------------------- + * END: Payload schema entry types. + * ------------------------------------------------------------------------- */ + + +#ifndef NVTX_PAYLOAD_SCHEMA_TYPES_V1 +#define NVTX_PAYLOAD_SCHEMA_TYPES_V1 /** - * CPU timestamp sources. - * \todo All 64 bits? + * \brief The payload schema type. + * + * A schema can be either of the following types. It is set with + * @ref nvtxPayloadSchemaAttr_t::type. 
*/ -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_TSC 129 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_TSC_NONVIRTUALIZED 130 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_REALTIME 131 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_REALTIME_COARSE 132 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC 133 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC_RAW 134 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC_COARSE 135 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_BOOTTIME 136 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_PROCESS_CPUTIME_ID 137 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_THREAD_CPUTIME_ID 138 +#define NVTX_PAYLOAD_SCHEMA_TYPE_INVALID 0 +#define NVTX_PAYLOAD_SCHEMA_TYPE_STATIC 1 +#define NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC 2 +#define NVTX_PAYLOAD_SCHEMA_TYPE_UNION 3 +#define NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR 4 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_QPC 160 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_GSTAFT 161 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_GSTAFTP 162 +#endif /* NVTX_PAYLOAD_SCHEMA_TYPES_V1 */ -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_TIME 163 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_CLOCK 164 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_TIMESPEC_GET 165 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_STEADY_CLOCK 166 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_HIGH_RESOLUTION_CLOCK 167 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_SYSTEM_CLOCK 168 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_UTC_CLOCK 169 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_TAI_CLOCK 170 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_GPS_CLOCK 171 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_FILE_CLOCK 172 +#ifndef NVTX_PAYLOAD_SCHEMA_FLAGS_V1 +#define NVTX_PAYLOAD_SCHEMA_FLAGS_V1 /** - * \brief GPU timestamp sources. + * \brief Flags for static and dynamic schemas. + * + * The schema flags are used with @ref nvtxPayloadSchemaAttr_t::flags. */ -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_GLOBALTIMER 192 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_SM_CLOCK 193 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_SM_CLOCK64 194 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_CUPTI 195 +#define NVTX_PAYLOAD_SCHEMA_FLAG_NONE 0 /** - * The timestamp was provided by the NVTX handler’s timestamp routine. + * This flag indicates that a schema and the corresponding payloads can + * contain fields which require a deep copy. */ -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_TOOL_PROVIDED 224 +#define NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY (1 << 1) /** - * This predefined schema ID can be used in `nvtxPayloadData_t` to indicate that - * the payload is a blob of memory which other payload entries may point into. - * A tool will not expose this payload directly. + * This flag indicates that a schema and the corresponding payload can be + * referenced by another payload of the same event. If the schema is not + * intended to be visualized directly, it is possible use + * @ref NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED instead. */ -#define NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED 1022 +#define NVTX_PAYLOAD_SCHEMA_FLAG_REFERENCED (1 << 2) /** - * This predefined schema ID can be used in `nvtxPayloadData_t` to indicate that - * the payload is a blob which can be shown with an arbitrary data viewer. + * The schema defines a counter group. An NVTX handler can expect that the schema + * contains entries with counter semantics. 
*/ -#define NVTX_TYPE_PAYLOAD_SCHEMA_RAW 1023 +#define NVTX_PAYLOAD_SCHEMA_FLAG_COUNTER_GROUP (1 << 3) -/* Custom (static) schema IDs. */ -#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START (1 << 24) -/* Dynamic schema IDs (generated by the tool) start here. */ -#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START 4294967296 // 1 << 32 +#endif /* NVTX_PAYLOAD_SCHEMA_FLAGS_V1 */ + + +#ifndef NVTX_PAYLOAD_SCHEMA_ATTRS_V1 +#define NVTX_PAYLOAD_SCHEMA_ATTRS_V1 + +/** + * The values allow the valid fields in @ref nvtxPayloadSchemaAttr_t to be + * specified via setting the field `fieldMask`. + */ +#define NVTX_PAYLOAD_SCHEMA_ATTR_NAME (1 << 1) +#define NVTX_PAYLOAD_SCHEMA_ATTR_TYPE (1 << 2) +#define NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS (1 << 3) +#define NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES (1 << 4) +#define NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES (1 << 5) +#define NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE (1 << 6) +#define NVTX_PAYLOAD_SCHEMA_ATTR_ALIGNMENT (1 << 7) +#define NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID (1 << 8) +#define NVTX_PAYLOAD_SCHEMA_ATTR_EXTENSION (1 << 9) + +#endif /* NVTX_PAYLOAD_SCHEMA_ATTRS_V1 */ + + +#ifndef NVTX_PAYLOAD_ENUM_ATTRS_V1 +#define NVTX_PAYLOAD_ENUM_ATTRS_V1 + +/** + * The values are used to set the field `fieldMask` and specify which fields in + * @ref nvtxPayloadEnumAttr_t are set. + */ +#define NVTX_PAYLOAD_ENUM_ATTR_NAME (1 << 1) +#define NVTX_PAYLOAD_ENUM_ATTR_ENTRIES (1 << 2) +#define NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES (1 << 3) +#define NVTX_PAYLOAD_ENUM_ATTR_SIZE (1 << 4) +#define NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID (1 << 5) +#define NVTX_PAYLOAD_ENUM_ATTR_EXTENSION (1 << 6) +#endif /* NVTX_PAYLOAD_ENUM_ATTRS_V1 */ + +/** + * An NVTX scope specifies the execution scope or source of events or counters. + */ +#ifndef NVTX_SCOPES_V1 +#define NVTX_SCOPES_V1 + +/** Identifies an invalid scope and indicates an error if returned by `nvtxScopeRegister`. */ +#define NVTX_SCOPE_NONE 0 /* no scope */ + +#define NVTX_SCOPE_ROOT 1 + +#define NVTX_SCOPE_CURRENT_HW_MACHINE 2 /* Node/machine name */ +#define NVTX_SCOPE_CURRENT_HW_SOCKET 3 +#define NVTX_SCOPE_CURRENT_HW_CPU_PHYSICAL 4 /* Physical CPU core */ +#define NVTX_SCOPE_CURRENT_HW_CPU_LOGICAL 5 /* Logical CPU core */ +/* Innermost HW execution context at registration time */ +#define NVTX_SCOPE_CURRENT_HW_INNERMOST 15 + +/* Virtualized hardware, virtual machines, OS (if you don't know any better) +\todo: Need to be more precise what information is expected for each of these scopes. */ +#define NVTX_SCOPE_CURRENT_HYPERVISOR 16 +#define NVTX_SCOPE_CURRENT_VM 17 +#define NVTX_SCOPE_CURRENT_KERNEL 18 +#define NVTX_SCOPE_CURRENT_CONTAINER 19 +#define NVTX_SCOPE_CURRENT_OS 20 + +/* Software scopes */ +#define NVTX_SCOPE_CURRENT_SW_PROCESS 21 /* Process scope */ +#define NVTX_SCOPE_CURRENT_SW_THREAD 22 /* Thread scope */ +/* Innermost SW execution context at registration time */ +#define NVTX_SCOPE_CURRENT_SW_INNERMOST 31 + +/** Static (user-provided) scope IDs (feed forward) */ +#define NVTX_SCOPE_ID_STATIC_START (1 << 24) + +/** Dynamically (tool) generated scope IDs */ +#define NVTX_SCOPE_ID_DYNAMIC_START 4294967296 /* 1 << 32 */ + +#endif /* NVTX_SCOPES_V1 */ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#ifndef NVTX_PAYLOAD_TYPEDEFS_V1 +#define NVTX_PAYLOAD_TYPEDEFS_V1 /** * \brief Size and alignment information for predefined payload entry types. @@ -333,12 +489,64 @@ extern "C" { * array for the predefined types is passed via nvtxExtModuleInfo_t to the NVTX * client/handler. 
The type (ID) is used as index into this array. */ -typedef struct nvtxPayloadEntryTypeInfo_t +typedef struct nvtxPayloadEntryTypeInfo_v1 { uint16_t size; uint16_t align; } nvtxPayloadEntryTypeInfo_t; +/** + * \brief Binary payload data, size and decoding information. + * + * An array of type `nvtxPayloadData_t` is passed to the NVTX event attached to + * an NVTX event via the `payload.ullvalue` field of NVTX event attributes. + * + * The `schemaId` be a predefined schema entry type (`NVTX_PAYLOAD_ENTRY_TYPE*`), + * a schema ID (statically specified or dynamically created) or one of + * `NVTX_PAYLOAD_TYPE_REFERENCED` or `NVTX_PAYLOAD_TYPE_RAW`. + * + * Setting the size of a payload to `MAX_SIZE` can be useful to reduce the + * overhead of NVTX instrumentation, when no NVTX handler is attached. However, + * a tool might not be able to detect the size of a payload and thus skip it. + * A reasonable use case is a payload that represents a null-terminated + * C string, where the NVTX handler can call `strlen()`. + */ +typedef struct nvtxPayloadData_v1 +{ + /** + * The schema ID, which defines the layout of the binary data. + */ + uint64_t schemaId; + + /** + * Size of the payload (blob) in bytes. `SIZE_MAX` (`-1`) indicates the tool + * that it should figure out the size, which might not be possible. + */ + size_t size; + + /** + * Pointer to the binary payload data. + */ + const void* payload; +} nvtxPayloadData_t; + + +/** + * \brief Header of the payload entry's semantic field. + * + * If the semantic field of the payload schema entry is set, the first four + * fields (header) are defined with this type. A tool can iterate through the + * extensions and check, if it supports (can handle) it. + */ +typedef struct nvtxSemanticsHeader_v1 +{ + uint32_t structSize; /** Size of semantic extension struct. */ + uint16_t semanticId; + uint16_t version; + const struct nvtxSemanticsHeader_v1* next; /** linked list */ + /* Additional fields are defined by the specific semantic extension. */ +} nvtxSemanticsHeader_t; + /** * \brief Entry in a schema. * @@ -349,12 +557,12 @@ typedef struct nvtxPayloadEntryTypeInfo_t * and the offset is determined based on self-alignment rules. * * Example schema: - * nvtxPayloadSchemaEntry_t desc[] = { + * nvtxPayloadSchemaEntry_t schema[] = { * {0, NVTX_EXT_PAYLOAD_TYPE_UINT8, "one byte"}, * {0, NVTX_EXT_PAYLOAD_TYPE_INT32, "four bytes"} * }; */ -typedef struct nvtxPayloadSchemaEntry_t +typedef struct nvtxPayloadSchemaEntry_v1 { /** * \brief Flags to augment the basic type. @@ -365,37 +573,39 @@ typedef struct nvtxPayloadSchemaEntry_t uint64_t flags; /** - * \brief Predefined payload schema entry type or ID of a registered payload - * schema. + * \brief Predefined payload schema entry type or custom schema ID. + * + * Predefined types are `NVTX_PAYLOAD_ENTRY_TYPE_*`. Passing a schema ID + * enables nesting of schemas. */ uint64_t type; /** - * \brief Name of the payload entry. (Optional) + * \brief Name or label of the payload entry. (Optional) * - * Providing a name is useful to give a meaning to the associated value. + * A meaningful name or label can help organizing and interpreting the data. */ const char* name; /** * \brief Description of the payload entry. (Optional) + * + * A more detail description of the data that is stored with this entry. */ const char* description; /** - * \brief String or array length or union selector for union types. + * \brief String length, array length or member selector for union types. 
* - * If @ref type is a C string type, this defines the length of the string. + * If @ref type is a C string type, this field specifies the string length. * - * If @ref flags specify that the entry is an array, this field defines the - * length of the array. See `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_*` for more - * details. + * If @ref flags specify that the entry is an array, this field specifies + * the array length. See `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_*` for more details. * - * If @ref type implies that the entry is a union with schema type - * @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION (external selection of the union - * member), this field contains the index (starting with 0) to an entry of - * integer type in the same schema. The associated field contains the - * selected union member. + * If @ref type is a union with schema type @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION + * (external selection of the union member), this field contains the index + * (starting with 0) to an entry of integral type in the same schema. The + * associated field value specifies the selected union member. * * @note An array of schema type @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION is not * supported. @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR can @@ -407,176 +617,53 @@ typedef struct nvtxPayloadSchemaEntry_t * \brief Offset in the binary payload data (in bytes). * * This field specifies the byte offset from the base address of the actual - * binary data (blob) to the data of this entry. + * binary data (blob) to the start address of the data of this entry. + * + * It is recommended (but not required) to provide the offset it. Otherwise, + * the NVTX handler will determine the offset from natural alignment rules. + * In some cases, e.g. dynamic schema layouts, the offset cannot be set and + * has to be determined based on the data of prior entries. * - * This is an optional field, but it is recommended to specify this field to - * avoid issues in the automatic detection of the offset by a tool/handler. + * Setting the offset can also be used to skip entries during payload parsing. */ uint64_t offset; /** - * Semantics are not yet defined. + * \brief Additional semantics of the payload entry. + * + * The field points to the first element in a linked list, which enables + * multiple semantic extensions. */ - void* semantics; + const nvtxSemanticsHeader_t* semantics; /** - * Reserved for future use. Do not use it! + * \brief Reserved for future use. Do not use it! */ - void* reserved; + const void* reserved; } nvtxPayloadSchemaEntry_t; -/** - * \brief Binary payload data, size and decoding information. - * - * An array of nvtxPayloadData_t is passed to the NVTX event attribute payload - * member. To attach a single payload the macro @ref NVTX_EXT_PAYLOAD_SET_ATTR - * can be used. - */ -typedef struct nvtxPayloadData_t -{ - /** - * The schema ID, which defines the layout of the binary data. - */ - uint64_t schemaId; - - /** - * Size of the binary payload (blob) in bytes. - */ - size_t size; - - /** - * Pointer to the binary payload data. 
- */ - const void* payload; -} nvtxPayloadData_t; - -/* Helper macros for safe double-cast of pointer to uint64_t value */ -#ifndef NVTX_POINTER_AS_PAYLOAD_ULLVALUE -# ifdef __cplusplus -# define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) \ - static_cast(reinterpret_cast(p)) -# else -#define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) ((uint64_t)(uintptr_t)p) -# endif -#endif - - -#define NVTX_PAYLOAD_CONCAT2(a,b) a##b -#define NVTX_PAYLOAD_CONCAT(a,b) NVTX_PAYLOAD_CONCAT2(a,b) -#define NVTX_DATA_VAR NVTX_PAYLOAD_CONCAT(nvtxDFDB,__LINE__) - -/** - * \brief Helper macro to attach a single payload to an NVTX event attribute. - * - * @note The NVTX push, start or mark operation must not be in the same or a - * nested scope. - */ -#define NVTX_PAYLOAD_EVTATTR_SET(EVTATTR, SCHEMA_ID, PAYLOAD_ADDR, SIZE) \ - nvtxPayloadData_t NVTX_DATA_VAR[] = {{SCHEMA_ID, SIZE, PAYLOAD_ADDR}}; \ - (EVTATTR).payload.ullValue = \ - NVTX_POINTER_AS_PAYLOAD_ULLVALUE(NVTX_DATA_VAR); \ - (EVTATTR).payloadType = NVTX_PAYLOAD_TYPE_BINARY; \ - (EVTATTR).reserved0 = 1; - -/** - * \brief Helper macro to attach multiple payloads to an NVTX event attribute. - * - * The payload data array (`nvtxPayloadData_t`) is passed as first argument to - * this macro. - */ -#define NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE(EVTATTR, PAYLOADS) \ - (EVTATTR).payloadType = NVTX_PAYLOAD_TYPE_BINARY; \ - (EVTATTR).reserved0 = sizeof(PAYLOADS)/sizeof(nvtxPayloadData_t); \ - (EVTATTR).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(PAYLOADS); - /** - * \brief The payload schema type. - * - * A schema can be either of these types. + * \brief Header of the schema attribute extension field. */ -enum nvtxPayloadSchemaType +typedef struct nvtxPayloadSchemaExtension_v1 { - NVTX_PAYLOAD_SCHEMA_TYPE_INVALID = 0, - - NVTX_PAYLOAD_SCHEMA_TYPE_STATIC = 1, - NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC = 2, - - NVTX_PAYLOAD_SCHEMA_TYPE_UNION = 3, - NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR = 4 -}; - -/** - * \brief Flags for static and dynamic schemas. - */ -enum nvtxPayloadSchemaFlags -{ - NVTX_PAYLOAD_SCHEMA_FLAG_NONE = 0, - - /** - * This flag indicates that a schema and the corresponding payloads can - * contain fields which require a deep copy. - */ - NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY = (1 << 1), - - /** - * This flag indicates that a schema and the corresponding payloads can - * be referenced by another payload of the same event. - */ - NVTX_PAYLOAD_SCHEMA_FLAG_REFERENCED = (1 << 2), - - /** - * The schema describes a deferred event/marker. Such a schema requires one - * timestamp entry and one string entry with the flag - * `NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE`. Category and color can be - * optionally specified with the respective entry types. The deferred event - * can contain a binary payload itself by using a custom schema ID as type - * its schema description. Multiple occurrences of the same event can be - * described by specifying an array timestamps. - */ - NVTX_PAYLOAD_SCHEMA_FLAG_DEFERRED_EVENT = (1 << 3), - /** - * The schema describes a deferred event/marker. Such a schema requires - * one start timestamp, one end timestamp and one string entry with the flag - * `NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE`. Category and color can be - * optionally specified with the respective entry types. The deferred range - * can contain a binary payload itself by using a custom schema ID as type - * its schema description. 
- * - * Timestamps can be provided in different ways: - * - A single range has two timestamp entries with the first (smaller entry - * index) being used as the start/push timestamp. - * - If the range schema contains one array of timestamps, the tool assumes - * that the array contains alternating start and end timestamps. - * - If two timestamp arrays are specified the first entry (with the - * smaller entry index) is assumed to contain the start timestamps. Both - * arrays have to be of the same size. - */ - NVTX_PAYLOAD_SCHEMA_FLAG_DEFERRED_RANGE = (2 << 3) -}; + uint32_t structSize; /** Size of schema extension struct. */ + uint16_t schemaExtId; + uint16_t version; + const struct nvtxPayloadSchemaExtension_v1* next; /** linked list */ + /* Additional fields are defined by the specific schema extension. */ +} nvtxPayloadSchemaExtension_t; /** - * The values allow the valid fields in @ref nvtxPayloadSchemaAttr_t to be - * specified via setting the field `fieldMask`. + * \brief NVTX payload schema attributes. */ -#define NVTX_PAYLOAD_SCHEMA_ATTR_NAME (1 << 1) -#define NVTX_PAYLOAD_SCHEMA_ATTR_TYPE (1 << 2) -#define NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS (1 << 3) -#define NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES (1 << 4) -#define NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES (1 << 5) -#define NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE (1 << 6) -#define NVTX_PAYLOAD_SCHEMA_ATTR_ALIGNMENT (1 << 7) -#define NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID (1 << 8) - -/** - * NVTX payload schema attributes. - */ -typedef struct nvtxPayloadSchemaAttr_t +typedef struct nvtxPayloadSchemaAttr_v1 { /** - * \brief Mask of valid fields in this structure. + * \brief Mask of valid fields in this struct. * - * The values from `enum nvtxPayloadSchemaAttributes` have to be used. + * Use the `NVTX_PAYLOAD_SCHEMA_ATTR_*` defines. */ uint64_t fieldMask; @@ -588,14 +675,14 @@ typedef struct nvtxPayloadSchemaAttr_t /** * \brief Payload schema type. (Mandatory) \anchor PAYLOAD_TYPE_FIELD * - * A value from `enum nvtxPayloadSchemaType` has to be used. + * Use the `NVTX_PAYLOAD_SCHEMA_TYPE_*` defines. */ uint64_t type; /** * \brief Payload schema flags. (Optional) * - * Flags defined in `enum nvtxPayloadSchemaFlags` can be used to set + * Flags defined by `NVTX_PAYLOAD_SCHEMA_FLAG_*` can be used to set * additional properties of the schema. */ uint64_t flags; @@ -638,26 +725,23 @@ typedef struct nvtxPayloadSchemaAttr_t >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START */ uint64_t schemaId; + + /* Flexible extension for schema attributes. */ + void* extension; } nvtxPayloadSchemaAttr_t; /** - * \brief Register a payload schema. + * \brief This type is used to describe an enumeration. * - * @param domain NVTX domain handle. - * @param attr NVTX payload schema attributes. - */ -NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadSchemaRegister( - nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr); - -/** - * \brief Enumeration entry. + * Since the value of an enum entry might not be meaningful for the analysis + * and/or visualization, a tool can show the name of enum entry instead. * - * Since the value of an enum entry might not be meaningful for the analysis, - * a tool can show the name of enum entry instead. + * An array of this struct is passed to @ref nvtxPayloadEnumAttr_t::entries to be + * finally registered via @ref nvtxPayloadEnumRegister with the NVTX handler. * * @note EXPERIMENTAL */ -typedef struct nvtxPayloadEnum_t +typedef struct nvtxPayloadEnum_v1 { /** * Name of the enum value. 
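/*
 * Editorial sketch (not part of the upstream header): one plausible way to fill
 * the schema attributes above for a simple static struct and register it.
 * `exampleDomain` is a hypothetical domain handle; the entry-type constants and
 * the `entries`/`numEntries`/`payloadStaticSize` field names are assumed from
 * the `NVTX_PAYLOAD_SCHEMA_ATTR_*` masks and the `NVTX_PAYLOAD_ENTRY_TYPE_*`
 * naming referenced in this header. Requires <stddef.h> for offsetof().
 * \code{.c}
 * typedef struct { uint32_t id; double value; } example_t;
 *
 * nvtxPayloadSchemaEntry_t exampleEntries[] = {
 *     {0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "id",    NULL, 0, offsetof(example_t, id)},
 *     {0, NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE, "value", NULL, 0, offsetof(example_t, value)}
 * };
 *
 * nvtxPayloadSchemaAttr_t attr = {0};
 * attr.fieldMask = NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES
 *                | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES
 *                | NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE;
 * attr.type = NVTX_PAYLOAD_SCHEMA_TYPE_STATIC;
 * attr.entries = exampleEntries;
 * attr.numEntries = sizeof(exampleEntries) / sizeof(exampleEntries[0]);
 * attr.payloadStaticSize = sizeof(example_t);
 *
 * uint64_t exampleSchemaId = nvtxPayloadSchemaRegister(exampleDomain, &attr);
 * \endcode
 */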
@@ -671,28 +755,20 @@ typedef struct nvtxPayloadEnum_t /** * Indicates that this entry sets a specific set of bits, which can be used - * to easily define bitsets. + * to define bitsets. */ int8_t isFlag; } nvtxPayloadEnum_t; /** - * The values are used to set the field `fieldMask` and specify which fields in - * `nvtxPayloadEnumAttr_t` are set. - */ -#define NVTX_PAYLOAD_ENUM_ATTR_NAME (1 << 1) -#define NVTX_PAYLOAD_ENUM_ATTR_ENTRIES (1 << 2) -#define NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES (1 << 3) -#define NVTX_PAYLOAD_ENUM_ATTR_SIZE (1 << 4) -#define NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID (1 << 5) - -/** - * NVTX payload enumeration type attributes. + * \brief NVTX payload enumeration type attributes. + * + * A pointer to this struct is passed to @ref nvtxPayloadEnumRegister. */ -typedef struct nvtxPayloadEnumAttr_t { +typedef struct nvtxPayloadEnumAttr_v1 +{ /** - * Mask of valid fields in this struct. - * The values from `enum nvtxPayloadSchemaAttributes` have to be used. + * Mask of valid fields in this struct. See `NVTX_PAYLOAD_ENUM_ATTR_*`. */ uint64_t fieldMask; @@ -722,17 +798,168 @@ typedef struct nvtxPayloadEnumAttr_t { * < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START */ uint64_t schemaId; + + /* Flexible extension for enumeration attributes. */ + void* extension; } nvtxPayloadEnumAttr_t; +typedef struct nvtxScopeAttr_v1 +{ + size_t structSize; + + /** Path delimited by '/' characters, relative to parentScope. Leading + slashes are ignored. Nodes in the path may use name[key] syntax to indicate + an array of sibling nodes, which may be combined with other non-array nodes + or different arrays at the same scope. Node names should be UTF8 printable + characters, excluding '/', '[', and ']' characters which have special + meaning here. An empty C string "" and `NULL` are valid inputs and treated + equivalently. */ + const char* path; + + uint64_t parentScope; + + /** The static scope ID must be unique within the domain, + >= NVTX_EVENT_SCOPE_ID_STATIC_START, and + < NVTX_EVENT_SCOPE_ID_DYNAMIC_START. */ + uint64_t scopeId; +} nvtxScopeAttr_t; + + +#endif /* NVTX_PAYLOAD_TYPEDEFS_V1 */ + +#ifndef NVTX_PAYLOAD_API_FUNCTIONS_V1 +#define NVTX_PAYLOAD_API_FUNCTIONS_V1 + +/** + * \brief Register a payload schema. + * + * @param domain NVTX domain handle. + * @param attr NVTX payload schema attributes. + */ +NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadSchemaRegister( + nvtxDomainHandle_t domain, + const nvtxPayloadSchemaAttr_t* attr); + /** * \brief Register an enumeration type with the payload extension. * * @param domain NVTX domain handle * @param attr NVTX payload enumeration type attributes. */ -NVTX_DECLSPEC uint64_t nvtxPayloadEnumRegister(nvtxDomainHandle_t domain, +NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadEnumRegister( + nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr); +/** + * \brief Register a scope. + * + * @param domain NVTX domain handle (0 for default domain) + * @param attr Scope attributes. + * + * @return an identifier for the scope. If the operation was not successful, + * `NVTX_SCOPE_NONE` is returned. + */ +NVTX_DECLSPEC uint64_t NVTX_API nvtxScopeRegister( + nvtxDomainHandle_t domain, + const nvtxScopeAttr_t* attr); + +/** + * \brief Marks an instantaneous event in the application with the attributes + * being passed via the extended payload. + * + * An NVTX handler can assume that the payload contains the event message. + * Otherwise, it might ignore the event. 
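 *
 * Editorial usage sketch: `exampleDomain` and `exampleSchemaId` are hypothetical
 * handles obtained from nvtxDomainCreateA() and nvtxPayloadSchemaRegister(), and
 * the registered schema is assumed to contain an entry carrying the event message.
 * \code{.c}
 * example_t ev = {42u, 3.14};  // layout described by exampleSchemaId
 * nvtxPayloadData_t data[] = {{exampleSchemaId, sizeof(ev), &ev}};
 * nvtxMarkPayload(exampleDomain, data, 1);
 * \endcode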
+ * + * @param domain NVTX domain handle + * @param payloadData pointer to an array of structured payloads. + * @param count number of payload BLOBs. + */ +NVTX_DECLSPEC void NVTX_API nvtxMarkPayload( + nvtxDomainHandle_t domain, + const nvtxPayloadData_t* payloadData, + size_t count); + +/** + * \brief Begin a nested thread range with the attributes being passed via the + * payload. + * + * @param domain NVTX domain handle + * @param payloadData pointer to an array of structured payloads. + * @param count number of payload BLOBs. + * + * @return The level of the range being ended. If an error occurs a negative + * value is returned on the current thread. + */ +NVTX_DECLSPEC int NVTX_API nvtxRangePushPayload( + nvtxDomainHandle_t domain, + const nvtxPayloadData_t* payloadData, + size_t count); + +/** + * \brief End a nested thread range with an additional custom payload. + * + * NVTX event attributes passed to this function (via the payloads) overwrite + * event attributes (message and color) that have been set in the push event. + * Other payload entries extend the data of the range. + * + * @param domain NVTX domain handle + * @param payloadData pointer to an array of structured payloads. + * @param count number of payload BLOBs. + * + * @return The level of the range being ended. If an error occurs a negative + * value is returned on the current thread. + */ +NVTX_DECLSPEC int NVTX_API nvtxRangePopPayload( + nvtxDomainHandle_t domain, + const nvtxPayloadData_t* payloadData, + size_t count); + +/** + * \brief Start a thread range with attributes passed via the extended payload. + * + * @param domain NVTX domain handle + * @param payloadData pointer to an array of structured payloads. + * @param count number of payload BLOBs. + * + * @return The level of the range being ended. If an error occurs a negative + * value is returned on the current thread. + */ +NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartPayload( + nvtxDomainHandle_t domain, + const nvtxPayloadData_t* payloadData, + size_t count); + +/** + * \brief End a thread range and pass a custom payload. + * + * NVTX event attributes passed to this function (via the payloads) overwrite + * event attributes (message and color) that have been set in the start event. + * Other payload entries extend the data of the range. + * + * @param domain NVTX domain handle + * @param id The correlation ID returned from a NVTX range start call. + * @param payloadData pointer to an array of structured payloads. + * @param count number of payload BLOBs. + */ +NVTX_DECLSPEC void NVTX_API nvtxRangeEndPayload( + nvtxDomainHandle_t domain, + nvtxRangeId_t id, + const nvtxPayloadData_t* payloadData, + size_t count); + +/** + * @brief Checks if an NVTX domain is enabled (unofficial and may not work) + * + * @param domain NVTX domain handle + * @return 0 if the domain is not enabled. + */ +NVTX_DECLSPEC uint8_t NVTX_API nvtxDomainIsEnabled( + nvtxDomainHandle_t domain); + +#endif /* NVTX_PAYLOAD_API_FUNCTIONS_V1 */ + +#ifndef NVTX_PAYLOAD_CALLBACK_ID_V1 +#define NVTX_PAYLOAD_CALLBACK_ID_V1 /** * \brief Callback Ids of API functions in the payload extension. 
* @@ -740,30 +967,130 @@ NVTX_DECLSPEC uint64_t nvtxPayloadEnumRegister(nvtxDomainHandle_t domain, * InitializeInjectionNvtxExtension(nvtxExtModuleInfo_t* moduleInfo) is * executed, a handler routine 'handlenvtxPayloadRegisterSchema' can be * registered as follows: + * \code{.c} * moduleInfo->segments->slots[NVTX3EXT_CBID_nvtxPayloadSchemaRegister] = - * (intptr_t)handlenvtxPayloadRegisterSchema; + * (intptr_t)YourPayloadRegisterSchemaHandlerFn; + * \endcode */ -typedef enum NvtxExtPayloadCallbackId -{ - NVTX3EXT_CBID_nvtxPayloadSchemaRegister = 0, - NVTX3EXT_CBID_nvtxPayloadEnumRegister = 1, - NVTX3EXT_CBID_PAYLOAD_FN_NUM = 2 -} NvtxExtPayloadCallbackId; +#define NVTX3EXT_CBID_nvtxPayloadSchemaRegister 0 +#define NVTX3EXT_CBID_nvtxPayloadEnumRegister 1 +#define NVTX3EXT_CBID_nvtxMarkPayload 2 +#define NVTX3EXT_CBID_nvtxRangePushPayload 3 +#define NVTX3EXT_CBID_nvtxRangePopPayload 4 +#define NVTX3EXT_CBID_nvtxRangeStartPayload 5 +#define NVTX3EXT_CBID_nvtxRangeEndPayload 6 +#define NVTX3EXT_CBID_nvtxDomainIsEnabled 7 +#define NVTX3EXT_CBID_nvtxScopeRegister 12 +#endif /* NVTX_PAYLOAD_CALLBACK_ID_V1 */ + +/*** Helper utilities ***/ + +/** \brief Helper macro for safe double-cast of pointer to uint64_t value. */ +#ifndef NVTX_POINTER_AS_PAYLOAD_ULLVALUE +# ifdef __cplusplus +# define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) \ + static_cast(reinterpret_cast(p)) +# else +#define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) ((uint64_t)(uintptr_t)p) +# endif +#endif + +#ifndef NVTX_PAYLOAD_EVTATTR_SET_DATA +/** + * \brief Helper macro to attach a single payload to an NVTX event attribute. + * + * @param evtAttr NVTX event attribute (variable name) + * @param pldata_addr Adress of `nvtxPayloadData_t` variable. + * @param schema_id NVTX binary payload schema ID. + * @param pl_addr Address of the (actual) payload. + * @param sz size of the (actual) payload. + */ +#define NVTX_PAYLOAD_EVTATTR_SET_DATA(evtAttr, pldata_addr, schema_id, pl_addr, sz) \ + (pldata_addr)->schemaId = schema_id; \ + (pldata_addr)->size = sz; \ + (pldata_addr)->payload = pl_addr; \ + (evtAttr).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(pldata_addr); \ + (evtAttr).payloadType = NVTX_PAYLOAD_TYPE_EXT; \ + (evtAttr).reserved0 = 1; +#endif /* NVTX_PAYLOAD_EVTATTR_SET_DATA */ + +#ifndef NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE +/** + * \brief Helper macro to attach multiple payloads to an NVTX event attribute. + * + * @param evtAttr NVTX event attribute (variable name) + * @param pldata Payload data array (of type `nvtxPayloadData_t`) + */ +#define NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE(evtAttr, pldata) \ + (evtAttr).payloadType = NVTX_PAYLOAD_TYPE_EXT; \ + (evtAttr).reserved0 = sizeof(pldata)/sizeof(nvtxPayloadData_t); \ + (evtAttr).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(pldata); +#endif /* NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE */ + +#ifndef NVTX_PAYLOAD_EVTATTR_SET +/* + * Do not use this macro directly! It is a helper to attach a single payload to + * an NVTX event attribute. + * @warning The NVTX push, start or mark operation must not be in an outer scope. + */ +#define NVTX_PAYLOAD_EVTATTR_SET(evtAttr, schema_id, pl_addr, sz) \ + nvtxPayloadData_t _NVTX_PAYLOAD_DATA_VAR[] = \ + {{schema_id, sz, pl_addr}}; \ + (evtAttr)->payload.ullValue = \ + NVTX_POINTER_AS_PAYLOAD_ULLVALUE(_NVTX_PAYLOAD_DATA_VAR); \ + (evtAttr)->payloadType = NVTX_PAYLOAD_TYPE_EXT; \ + (evtAttr)->reserved0 = 1; +#endif /* NVTX_PAYLOAD_EVTATTR_SET */ + +#ifndef nvtxPayloadRangePush +/** + * \brief Helper macro to push a range with extended payload. 
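 *
 * Editorial usage sketch (`exampleDomain`, `exampleSchemaId` and `example_t` are
 * hypothetical); a matching pop, e.g. nvtxRangePopPayload() or
 * nvtxDomainRangePop(), is expected to end the range:
 * \code{.c}
 * nvtxEventAttributes_t attr = {0};
 * attr.version = NVTX_VERSION;
 * attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
 * example_t pl = {42u, 3.14};
 * nvtxPayloadRangePush(exampleDomain, &attr, exampleSchemaId, &pl, sizeof(pl));
 * \endcode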
+ * + * @param domain NVTX domain handle (0 for default domain) + * @param evtAttr pointer to NVTX event attribute. + * @param schemaId NVTX payload schema ID + * @param plAddr Pointer to the binary data (actual payload) + * @param size Size of the binary payload data in bytes. + */ +#define nvtxPayloadRangePush(domain, evtAttr, schemaId, plAddr, size) \ +do { \ + NVTX_PAYLOAD_EVTATTR_SET(evtAttr, schemaId, plAddr, size) \ + nvtxDomainRangePushEx(domain, evtAttr); \ +} while (0) +#endif /* nvtxPayloadRangePush */ + +#ifndef nvtxPayloadMark +/** + * \brief Helper macro to set a marker with extended payload. + * + * @param domain NVTX domain handle (0 for default domain) + * @param evtAttr pointer to NVTX event attribute. + * @param schemaId NVTX payload schema ID + * @param plAddr Pointer to the binary data (actual payload) + * @param size Size of the binary payload data in bytes. + */ +#define nvtxPayloadMark(domain, evtAttr, schemaId, plAddr, size) \ +do { \ + NVTX_PAYLOAD_EVTATTR_SET(evtAttr, schemaId, plAddr, size) \ + nvtxDomainMarkEx(domain, evtAttr); \ +} while (0) +#endif /* nvtxPayloadMark */ + #ifdef __GNUC__ #pragma GCC visibility push(internal) #endif -#define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot include directly */ -#include "nvtxExtDetail/nvtxExtTypes.h" +/* Extension types are required for the implementation and the NVTX handler. */ +#define NVTX_EXT_TYPES_GUARD +#include "nvtxDetail/nvtxExtTypes.h" #undef NVTX_EXT_TYPES_GUARD #ifndef NVTX_NO_IMPL -#define NVTX_EXT_IMPL_PAYLOAD_GUARD /* Ensure other headers cannot included directly */ -#include "nvtxExtDetail/nvtxExtPayloadTypeInfo.h" -#include "nvtxExtDetail/nvtxExtImplPayload_v1.h" +#define NVTX_EXT_IMPL_PAYLOAD_GUARD +#include "nvtxDetail/nvtxExtImplPayload_v1.h" #undef NVTX_EXT_IMPL_PAYLOAD_GUARD -#endif /*NVTX_NO_IMPL*/ +#endif /* NVTX_NO_IMPL */ #ifdef __GNUC__ #pragma GCC visibility pop @@ -772,5 +1099,3 @@ typedef enum NvtxExtPayloadCallbackId #ifdef __cplusplus } #endif /* __cplusplus */ - -#endif /* NVTOOLSEXT_PAYLOAD_H */ diff --git a/src/include/nvtx3/nvToolsExtPayloadHelper.h b/src/include/nvtx3/nvToolsExtPayloadHelper.h new file mode 100644 index 000000000..304d5d6a5 --- /dev/null +++ b/src/include/nvtx3/nvToolsExtPayloadHelper.h @@ -0,0 +1,170 @@ +/* +* Copyright 2023 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#include "nvtxDetail/nvtxExtPayloadHelperInternal.h" + + +/* This is just an empty marker (for readability), which can be omitted. */ +/* TODO: Fix issue with trailing comma at end of entry list. */ +#define NVTX_PAYLOAD_ENTRIES + + +/** + * Use this macro for payload entries that are defined by a schema (nested + * payload schema). + */ +#define NVTX_PAYLOAD_NESTED(schemaId) _NVTX_PAYLOAD_NESTED(schemaId) + + +/** + * \brief Define a payload schema for an existing C `struct` definition. + * + * This macro does + * 1) create schema description (array of schema entries). + * 2) set the schema attributes for a static data layout. + * + * It can be used in static code or within a function context. 
+ * + * Example: + * NVTX_DEFINE_SCHEMA_FOR_STRUCT(your_struct, "SchemaName", + * NVTX_PAYLOAD_ENTRIES( + * (index, TYPE_INT, "integer value"), + * (dpfloat, TYPE_DOUBLE, "fp64 value"), + * (text, TYPE_CSTRING, "text", NULL, 24) + * ) + * ) + * + * It is required to at least provide the struct name and the payload entries. + * The first two fields (member name and NVTX entry type) of each payload entry + * are required. + * + * The optional parameters are only allowed to be passed in the predefined order. + * Hence, `payload_flags` requires `payload_schema` to be given and + * `prefix` requires `payload_flags` and `payload_schema` to be given. + * The payload entries are always the last parameter. A maximum of 16 schema + * entries is supported. + * + * It is recommended to use `NVTX_PAYLOAD_SCHEMA_REGISTER` to register the schema. + * + * @param struct_id The name of the struct. + * @param schema_name (Optional 1) name of the payload schema. Default is `NULL`. + * @param prefix (Optional 2) prefix before the schema and attributes variables, + * e.g. `static const`. Leave this empty, if no prefix is desired. + * @param schema_flags (Optional 2) flags to augment the payload schema. + * Default is `NVTX_PAYLOAD_SCHEMA_FLAG_NONE`. + * @param schema_id (Optional 4) User-defined payload schema ID. + * @param entries (Mandatory) Payload schema entries. This is always the last + * parameter to the macro. + */ +#define NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, ...) \ + _NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, __VA_ARGS__) + + +/** + * \brief Define a C struct together with a matching schema. + * + * This macro does + * 1) define the payload type (typedef struct). + * 2) create schema description (array of schema entries). + * 3) set the schema attributes for a static data layout. + * + * The macro can be used in static code or within a function context. + * + * It defines the schema attributes in `struct_id##Attr`. Thus, it is recommended + * to use `NVTX_PAYLOAD_SCHEMA_REGISTER(domain, struct_id)` to register the schema. + * + * Example: + * NVTX_DEFINE_STRUCT_WITH_SCHEMA(your_struct_name, "Your schema name", + * NVTX_PAYLOAD_ENTRIES( + * (int, index, TYPE_INT, "integer value"), + * (double, dpfloat, TYPE_DOUBLE, "fp64 value"), + * (const char, (text, 24), TYPE_CSTRING, "text", NULL, 24) + * ) + * ) + * + * The first three fields (C type, member, entry type) of each entry are required. + * A fixed-size array or string requires a special notation with the member + * name and the size separated by comma and put into brackets (see last entry + * in the example). + * + * The optional parameters are positional (only allowed to be passed in the + * predefined order). A maximum of 16 schema entries is supported. + * + * @param struct_id The name of the struct. + * @param schema_name (Optional 1) name of the payload schema. Default is `NULL`. + * @param prefix (Optional 2) prefix before the schema and attributes variables, + * e.g. `static const`. Leave this empty, if no prefix is desired. + * @param schema_flags (Optional 3) flags to augment the payload schema. + * Default is `NVTX_PAYLOAD_SCHEMA_FLAG_NONE`. + * @param schema_id (Optional 4) User-defined payload schema ID. + * @param entries (Mandatory) The schema entries. This is always the last + * parameter to the macro. + */ +#define NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, ...) \ + _NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, __VA_ARGS__) + +/** + * \brief Initialize and register the NVTX binary payload schema. 
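 *
 * As described below, this forwards to `NVTX_DEFINE_STRUCT_WITH_SCHEMA` and then
 * registers the schema. A hypothetical use (entry-type shorthands follow the
 * notation of the examples above; `exampleDomain` is an assumed domain handle):
 * \code{.c}
 * NVTX_DEFINE_STRUCT_WITH_SCHEMA_AND_REGISTER(exampleDomain, example_struct,
 *     "Example schema",
 *     NVTX_PAYLOAD_ENTRIES(
 *         (int,    index,   TYPE_INT,    "integer value"),
 *         (double, dpfloat, TYPE_DOUBLE, "fp64 value")
 *     )
 * )
 * \endcode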
+ * + * This does essentially the same as `NVTX_DEFINE_STRUCT_WITH_SCHEMA`, but in + * addition the schema is registered. The schema ID will be defined as follows: + * `const uint64_t struct_id##_schemaId`. + * + * @param domain The NVTX domain handle (0 for default domain). + * All other parameters are similar to `NVTX_DEFINE_STRUCT_WITH_SCHEMA`. + */ +#define NVTX_DEFINE_STRUCT_WITH_SCHEMA_AND_REGISTER(domain, struct_id, ...) \ + _NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, __VA_ARGS__) \ + const uint64_t struct_id##_schemaId = nvtxPayloadSchemaRegister(domain, &struct_id##Attr); + +/** + * \brief Define payload schema for an existing `struct` and register the schema. + * + * This does essentially the same as `NVTX_PAYLOAD_STATIC_SCHEMA_DEFINE`, but in + * addition, the schema is registered and `uint64_t struct_id##_schemaId` set. + * + * @param domain The NVTX domain handle (0 for default domain). + * All other parameters are similar to `NVTX_PAYLOAD_STATIC_SCHEMA_DEFINE`. + */ +#define NVTX_DEFINE_SCHEMA_FOR_STRUCT_AND_REGISTER(domain, struct_id, ...) \ + _NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, __VA_ARGS__) \ + const uint64_t struct_id##_schemaId = nvtxPayloadSchemaRegister(domain, &struct_id##Attr); + +/** + * \brief Create a type definition for the given struct ID and members. + * + * This is a convenience macro. A normal `typedef` can be used instead. + * + * Example usage: + * NVTX_DEFINE_STRUCT(your_struct, + * (double, fp64), + * (uint8_t, u8), + * (float, fp32[3]) + * ) + * + * @param struct_id The name of the struct. + * @param members The members of the struct. + */ +#define NVTX_DEFINE_STRUCT(struct_id, ...) \ + _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, __VA_ARGS__) + +/** + * \brief Register an NVTX binary payload schema. + * + * This is a convenience macro, which takes the same `struct_id` that has been + * used in other helper macros. Instead, `nvtxPayloadSchemaRegister` can also be + * used, but `&struct_id##Attr` has to be passed. + * + * @param domain The NVTX domain handle (0 for default domain). + * @param struct_id The name of the struct. + * + * @return NVTX schema ID + */ +#define NVTX_PAYLOAD_SCHEMA_REGISTER(domain, struct_id) \ + nvtxPayloadSchemaRegister(domain, &struct_id##Attr); + diff --git a/src/include/nvtx3/nvToolsExtSemanticsCounters.h b/src/include/nvtx3/nvToolsExtSemanticsCounters.h new file mode 100644 index 000000000..f97624a07 --- /dev/null +++ b/src/include/nvtx3/nvToolsExtSemanticsCounters.h @@ -0,0 +1,88 @@ +/* +* Copyright 2024 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +/** + * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand. + */ + +#ifndef NVTX_SEMANTIC_ID_COUNTERS_V1 +#define NVTX_SEMANTIC_ID_COUNTERS_V1 2 + +/** + * Flags to extend the semantics of counters. + */ +#define NVTX_COUNTERS_FLAGS_NONE 0 + +/** + * Convert the fixed point value to a normalized floating point value. + * Unsigned [0f : 1f] or signed [-1f : 1f] is determined by the underlying type + * this flag is applied to. + */ +#define NVTX_COUNTERS_FLAG_NORMALIZE (1 << 1) + +/** + * Visual tools should apply scale and limits when graphing. 
+ */ +#define NVTX_COUNTERS_FLAG_LIMIT_MIN (1 << 2) +#define NVTX_COUNTERS_FLAG_LIMIT_MAX (1 << 3) +#define NVTX_COUNTERS_FLAG_LIMITS \ + (NVTX_COUNTERS_FLAG_LIMIT_MIN | NVTX_COUNTERS_FLAG_LIMIT_MAX) + +/** + * Counter time scopes. + */ +#define NVTX_COUNTERS_FLAG_TIMESCOPE_POINT (1 << 5) +#define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_LAST (2 << 5) +#define NVTX_COUNTERS_FLAG_TIMESCOPE_UNTIL_NEXT (3 << 5) +#define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_START (4 << 5) + +/** + * Counter value types. + */ +#define NVTX_COUNTERS_FLAG_VALUETYPE_ABSOLUTE (1 << 10) +/** Delta to previous value of same counter type. */ +#define NVTX_COUNTERS_FLAG_VALUETYPE_DELTA (2 << 10) + +/** + * Datatypes for the `limits` union. + */ +#define NVTX_COUNTERS_LIMIT_I64 0 +#define NVTX_COUNTERS_LIMIT_U64 1 +#define NVTX_COUNTERS_LIMIT_F64 2 + +/** + *\brief Specify counter semantics. + */ +typedef struct nvtxSemanticsCounter_v1 { + /** Header of the semantic extensions (with identifier, version, etc.). */ + struct nvtxSemanticsHeader_v1 header; + + /** Flags to provide more context about the counter value. */ + uint64_t flags; + + /** Unit of the counter value (case-insensitive). */ + const char* unit; + + /** Should be 1 if not used. */ + uint64_t unitScaleNumerator; + + /** Should be 1 if not used. */ + uint64_t unitScaleDenominator; + + /** Determines the used union member. Use defines `NVTX_COUNTER_LIMIT_*`. */ + int64_t limitType; + + /** Graph limits {minimum, maximum}. */ + union limits_t { + int64_t i64[2]; + uint64_t u64[2]; + double d[2]; + } limits; +} nvtxSemanticsCounter_t; + +#endif /* NVTX_SEMANTIC_ID_COUNTERS_V1 */ \ No newline at end of file diff --git a/src/include/nvtx3/nvToolsExtSemanticsScope.h b/src/include/nvtx3/nvToolsExtSemanticsScope.h new file mode 100644 index 000000000..eed6f3095 --- /dev/null +++ b/src/include/nvtx3/nvToolsExtSemanticsScope.h @@ -0,0 +1,30 @@ +/* +* Copyright 2024 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +/** + * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand. + */ + +#ifndef NVTX_SEMANTIC_ID_SCOPE_V1 +#define NVTX_SEMANTIC_ID_SCOPE_V1 1 + +/** + * \brief Specify the NVTX scope for a payload entry. + * + * This allows the scope to be set for a specific value or counter in a payload. + * The scope must be known at schema registration time. + */ +typedef struct nvtxSemanticsScope_v1 +{ + struct nvtxSemanticsHeader_v1 header; + + /** Specifies the scope of a payload entry, e.g. a counter or timestamp. */ + uint64_t scopeId; +} nvtxSemanticsScope_t; + +#endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */ \ No newline at end of file diff --git a/src/include/nvtx3/nvToolsExtSync.h b/src/include/nvtx3/nvToolsExtSync.h index 113fcd191..6578516d4 100644 --- a/src/include/nvtx3/nvToolsExtSync.h +++ b/src/include/nvtx3/nvToolsExtSync.h @@ -15,23 +15,23 @@ extern "C" { #endif /* __cplusplus */ -/* \cond SHOW_HIDDEN +/* \cond SHOW_HIDDEN * \version \NVTX_VERSION_2 */ #define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) ) /** \endcond */ -/** +/** * \page PAGE_SYNCHRONIZATION Synchronization * * This section covers a subset of the API that allow users to track additional -* synchronization details of their application. 
Naming OS synchronization primitives -* may allow users to better understand the data collected by traced synchronization +* synchronization details of their application. Naming OS synchronization primitives +* may allow users to better understand the data collected by traced synchronization * APIs. Additionally, a user defined synchronization object can allow the users to * to tell the tools when the user is building their own synchronization system * that do not rely on the OS to provide behaviors and instead use techniques like -* atomic operations and spinlocks. +* atomic operations and spinlocks. * * See module \ref SYNCHRONIZATION for details. * @@ -59,7 +59,7 @@ extern "C" { * * bool Lock() { * nvtxDomainSyncUserAcquireStart(hSync); -* bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic +* bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic * if (acquired) { * nvtxDomainSyncUserAcquireSuccess(hSync); @@ -76,12 +76,12 @@ extern "C" { * } * }; * \endcode -* +* * \version \NVTX_VERSION_2 */ /* ------------------------------------------------------------------------- */ -/* \cond SHOW_HIDDEN +/* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ @@ -154,8 +154,8 @@ typedef struct nvtxSyncUser* nvtxSyncUser_t; /** \brief User Defined Synchronization Object Attributes Structure. * \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE * -* This structure is used to describe the attributes of a user defined synchronization -* object. The layout of the structure is defined by a specific version of the tools +* This structure is used to describe the attributes of a user defined synchronization +* object. The layout of the structure is defined by a specific version of the tools * extension library and can change between different versions of the Tools Extension * library. * @@ -259,7 +259,7 @@ typedef struct nvtxSyncUserAttributes_v0 typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t; /* ------------------------------------------------------------------------- */ -/** \brief Create a user defined synchronization object +/** \brief Create a user defined synchronization object * This is used to track non-OS synchronization working with spinlocks and atomics * * \param domain - Domain to own the resource @@ -317,7 +317,7 @@ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle /* ------------------------------------------------------------------------- */ /** \brief Signal to tools of failure in acquiring a user defined synchronization object * This should be called after \ref nvtxDomainSyncUserAcquireStart -* +* * \param handle - A handle to the object to operate on. * * \sa @@ -374,7 +374,7 @@ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle); #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL -#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot included directly */ +#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxImplSync_v3.h" #undef NVTX_IMPL_GUARD_SYNC #endif /*NVTX_NO_IMPL*/ diff --git a/src/include/nvtx3/nvtx3.hpp b/src/include/nvtx3/nvtx3.hpp index 8c62acd46..a2f46c37f 100644 --- a/src/include/nvtx3/nvtx3.hpp +++ b/src/include/nvtx3/nvtx3.hpp @@ -12,6 +12,11 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. + * + * Licensed under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ /* Temporary helper #defines, #undef'ed at end of header */ @@ -1937,9 +1942,9 @@ class event_attributes { 0, // color value NVTX_PAYLOAD_UNKNOWN, // payload type 0, // reserved 4B - 0, // payload value (union) + {0}, // payload value (union) NVTX_MESSAGE_UNKNOWN, // message type - 0 // message value (union) + {0} // message value (union) } { } @@ -2003,20 +2008,20 @@ class event_attributes { attributes_.messageType = m.get_type(); } - /** - * @brief Variadic constructor where the first argument is a binary payload. + /** + * @brief Variadic constructor where the first argument is an extended payload. * - * Sets the value of the `EventAttribute`s message based on `m` and forwards + * Sets the `ullValue` of the `EventAttribute`s payload and forwards * the remaining variadic parameter pack to the next constructor. * */ template - NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(nvtxPayloadData_t const* bpl, Args const&... args) noexcept + NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(nvtxPayloadData_t const* p, Args const&... args) noexcept : event_attributes(args...) { - attributes_.payloadType = NVTX_PAYLOAD_TYPE_BINARY; + attributes_.payloadType = NVTX_PAYLOAD_TYPE_EXT; attributes_.reserved0 = 1; // NCCL uses only a single binary payload per event. - attributes_.payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(bpl); + attributes_.payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p); } ~event_attributes() = default; diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h b/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h new file mode 100644 index 000000000..00fc81768 --- /dev/null +++ b/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h @@ -0,0 +1,31 @@ +/* +* Copyright 2023 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_HELPER_MACROS_H +#define NVTX_EXT_HELPER_MACROS_H + +/* Combine tokens */ +#define _NVTX_EXT_CONCAT(a, b) a##b +#define NVTX_EXT_CONCAT(a, b) _NVTX_EXT_CONCAT(a, b) + +/* Resolves to the number of arguments passed. */ +#define NVTX_EXT_NUM_ARGS(...) \ + NVTX_EXT_SELECTA16(__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, throwaway) +#define NVTX_EXT_SELECTA16(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, ...) a16 + +/* Cast argument(s) to void to prevent unused variable warnings. */ +#define _NVTX_EXT_VOIDIFY1(a1) (void)a1; +#define _NVTX_EXT_VOIDIFY2(a1, a2) (void)a1; (void)a2; +#define _NVTX_EXT_VOIDIFY3(a1, a2, a3) (void)a1; (void)a2; (void)a3; +#define _NVTX_EXT_VOIDIFY4(a1, a2, a3, a4) (void)a1; (void)a2; (void)a3; (void)a4; + +/* Mark function arguments as unused. */ +#define NVTX_EXT_HELPER_UNUSED_ARGS(...) 
\ + NVTX_EXT_CONCAT(_NVTX_EXT_VOIDIFY, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) + +#endif /* NVTX_EXT_HELPER_MACROS_H */ \ No newline at end of file diff --git a/src/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h b/src/include/nvtx3/nvtxDetail/nvtxExtImpl.h similarity index 79% rename from src/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h rename to src/include/nvtx3/nvtxDetail/nvtxExtImpl.h index 5e4277805..79bb0c1c5 100644 --- a/src/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtImpl.h @@ -14,7 +14,12 @@ #define NVTX_EXT_IMPL_H /* ---- Include required platform headers ---- */ -#if defined(_WIN32) +#include +#include +#include +#include + +#if defined(_WIN32) #include @@ -22,27 +27,19 @@ #include #if defined(__ANDROID__) -#include +#include #endif #if defined(__linux__) || defined(__CYGWIN__) #include #endif +#include #include #include #include -#include -#include -#include -#include #include - -#include -#include #include -#include -#include #endif @@ -66,26 +63,35 @@ #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ - -// #ifdef __GNUC__ -// #pragma GCC visibility push(hidden) -// #endif - +/* +#ifdef __GNUC__ +#pragma GCC visibility push(hidden) +#endif +*/ #define NVTX_EXTENSION_FRESH 0 #define NVTX_EXTENSION_DISABLED 1 #define NVTX_EXTENSION_STARTING 2 #define NVTX_EXTENSION_LOADED 3 -NVTX_LINKONCE_DEFINE_GLOBAL NvtxExtInitializeInjectionFunc_t NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = (NvtxExtInitializeInjectionFunc_t)0; +/* Function slots are local to each extension */ +typedef struct nvtxExtGlobals1_t +{ + NvtxExtInitializeInjectionFunc_t injectionFnPtr; +} nvtxExtGlobals1_t; + +NVTX_LINKONCE_DEFINE_GLOBAL nvtxExtGlobals1_t NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1) = +{ + (NvtxExtInitializeInjectionFunc_t)0 +}; #define NVTX_EXT_INIT_GUARD #include "nvtxExtInit.h" #undef NVTX_EXT_INIT_GUARD - -// #ifdef __GNUC__ -// #pragma GCC visibility pop -// #endif - +/* +#ifdef __GNUC__ +#pragma GCC visibility pop +#endif +*/ #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h b/src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h new file mode 100644 index 000000000..0f6ff9667 --- /dev/null +++ b/src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h @@ -0,0 +1,148 @@ +/* +* Copyright 2023-2024 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_IMPL_COUNTERS_GUARD +#error Never include this file directly -- it is automatically included by nvToolsExtCounters.h (except when NVTX_NO_IMPL is defined). +#endif + +#define NVTX_EXT_IMPL_GUARD +#include "nvtxExtImpl.h" +#undef NVTX_EXT_IMPL_GUARD + +#ifndef NVTX_EXT_IMPL_COUNTERS_V1 +#define NVTX_EXT_IMPL_COUNTERS_V1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* Macros to create versioned symbols. 
*/ +#define NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \ + NAME##_v##VERSION##_bpl##COMPATID +#define NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \ + NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) +#define NVTX_EXT_COUNTERS_VERSIONED_ID(NAME) \ + NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COUNTERS_COMPATID) + +#ifdef NVTX_DISABLE + +#include "nvtxExtHelperMacros.h" + +#define NVTX_EXT_COUNTERS_IMPL_FN_V1(ret_val, fn_name, signature, arg_names) \ +ret_val fn_name signature { \ + NVTX_EXT_HELPER_UNUSED_ARGS arg_names \ + return ((ret_val)(intptr_t)-1); \ +} + +#else /* NVTX_DISABLE */ + +/* + * Function slots for the counters extension. First entry is the module state, + * initialized to `0` (`NVTX_EXTENSION_FRESH`). + */ +#define NVTX_EXT_COUNTERS_SLOT_COUNT 63 +NVTX_LINKONCE_DEFINE_GLOBAL intptr_t +NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX_EXT_COUNTERS_SLOT_COUNT + 1] + = {0}; + +/* Avoid warnings about missing prototype. */ +NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)(void); +NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)() +{ + intptr_t* fnSlots = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots) + 1; + nvtxExtModuleSegment_t segment = { + 0, /* unused (only one segment) */ + NVTX_EXT_COUNTERS_SLOT_COUNT, + fnSlots + }; + + nvtxExtModuleInfo_t module = { + NVTX_VERSION, sizeof(nvtxExtModuleInfo_t), + NVTX_EXT_COUNTERS_MODULEID, NVTX_EXT_COUNTERS_COMPATID, + 1, &segment, /* number of segments, segments */ + NULL, /* no export function needed */ + /* bake type sizes and alignment information into program binary */ + NULL + }; + + NVTX_INFO( "%s\n", __FUNCTION__ ); + + NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module, + NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)); +} + +#define NVTX_EXT_COUNTERS_IMPL_FN_V1(ret_type, fn_name, signature, arg_names) \ +typedef ret_type (*fn_name##_impl_fntype)signature; \ + NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \ + intptr_t slot = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED) { \ + if (slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } else { \ + NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)(); \ + /* Re-read function slot after extension initialization. */ \ + slot = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } \ + } \ + } \ + NVTX_EXT_FN_RETURN_INVALID(ret_type) \ +} + +#endif /*NVTX_DISABLE*/ + +/* Non-void functions. */ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1); + +NVTX_EXT_COUNTERS_IMPL_FN_V1(nvtxCountersHandle_t, nvtxCountersRegister, + (nvtxDomainHandle_t domain, const nvtxCountersAttr_t* attr), + (domain, attr)) + +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: Non-void functions. */ + +/* void functions. 
*/ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) +#define return + +NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleInt64, + (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, int64_t value), + (domain, hCounter, value)) + +NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleFloat64, + (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, double value), + (domain, hCounter, value)) + +NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSample, + (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, void* values, size_t size), + (domain, hCounter, values, size)) + +NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleNoValue, + (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, uint8_t reason), + (domain, hCounter, reason)) + +NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSubmitBatch, + (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounters, + const void* counters, size_t size), (domain, hCounters, counters, size)) + +NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSubmitBatchEx, + (nvtxDomainHandle_t domain, const nvtxCountersBatch_t* countersBatch), + (domain, countersBatch)) + +#undef return +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: void functions. */ + +/* Keep NVTX_EXT_COUNTERS_IMPL_FN_V1 defined for a future version of this extension. */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* NVTX_EXT_IMPL_COUNTERS_V1 */ \ No newline at end of file diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h b/src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h new file mode 100644 index 000000000..5a5286df3 --- /dev/null +++ b/src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h @@ -0,0 +1,74 @@ +/* +* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_IMPL_MEM_CUDART_GUARD +#error Never include this file directly -- it is automatically included by nvToolsExtMemCudaRt.h (except when NVTX_NO_IMPL is defined). +#endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#ifdef NVTX_DISABLE + +#include "nvtxExtHelperMacros.h" + +#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \ +ret_val fn_name signature { \ + NVTX_EXT_HELPER_UNUSED_ARGS arg_names \ + return ((ret_val)(intptr_t)-1); \ +} + +#else /* NVTX_DISABLE */ + +#define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) \ +typedef ret_type ( * fn_name##_impl_fntype )signature; \ + NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \ + intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED) { \ + if (slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } else { \ + NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \ + /* Re-read function slot after extension initialization. */ \ + slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } \ + } \ + } \ + NVTX_EXT_FN_RETURN_INVALID(ret_type) \ +} + +#endif /*NVTX_DISABLE*/ + +/* Non-void functions. 
*/ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1); + +NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetProcessWidePermissions, (nvtxDomainHandle_t domain), (domain)) + +NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetDeviceWidePermissions, (nvtxDomainHandle_t domain, int device), (domain, device)) + +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: Non-void functions. */ + +/* void functions. */ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) +#define return + +NVTX_EXT_FN_IMPL(void, nvtxMemCudaSetPeerAccess, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, int devicePeer, uint32_t flags), (domain, permissions, devicePeer, flags)) + +#undef return +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: void functions. */ + +#undef NVTX_EXT_FN_IMPL + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h b/src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h new file mode 100644 index 000000000..7e316d379 --- /dev/null +++ b/src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h @@ -0,0 +1,133 @@ +/* +* Copyright 2009-2020,2023 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_IMPL_MEM_GUARD +#error Never include this file directly -- it is automatically included by nvToolsExtMem.h (except when NVTX_NO_IMPL is defined). +#endif + +#define NVTX_EXT_IMPL_GUARD +#include "nvtxExtImpl.h" +#undef NVTX_EXT_IMPL_GUARD + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#define NVTXMEM_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) NAME##_v##VERSION##_mem##COMPATID +#define NVTXMEM_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) NVTXMEM_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) +#define NVTX_EXT_MEM_VERSIONED_ID(NAME) NVTXMEM_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_MEM) + +#ifdef NVTX_DISABLE + +#include "nvtxExtHelperMacros.h" + +#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \ +ret_val fn_name signature { \ + NVTX_EXT_HELPER_UNUSED_ARGS arg_names \ + return ((ret_val)(intptr_t)-1); \ +} + +#else /* NVTX_DISABLE */ + +/* + * Function slots for the memory extension. First entry is the module + * state, initialized to `0` (`NVTX_EXTENSION_FRESH`). 
+ */ +NVTX_LINKONCE_DEFINE_GLOBAL intptr_t +NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_MEM_FN_NUM + 2] + = {0}; + +NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)() +{ + intptr_t* fnSlots = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots) + 1; + nvtxExtModuleSegment_t segment = { + 0, /* unused (only one segment) */ + NVTX3EXT_CBID_MEM_FN_NUM, + fnSlots + }; + + nvtxExtModuleInfo_t module = { + NVTX_VERSION, sizeof(nvtxExtModuleInfo_t), + NVTX_EXT_MODULEID_MEM, NVTX_EXT_COMPATID_MEM, + 1, &segment, + NULL, /* no export function needed */ + NULL + }; + + NVTX_INFO( "%s\n", __FUNCTION__ ); + + NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module, + NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)); +} + +#define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) \ +typedef ret_type ( * fn_name##_impl_fntype )signature; \ + NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \ + intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED) { \ + if (slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } else { \ + NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \ + /* Re-read function slot after extension initialization. */ \ + slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } \ + } \ + } \ + NVTX_EXT_FN_RETURN_INVALID(ret_type) \ +} + +#endif /*NVTX_DISABLE*/ + +/* Non-void functions. */ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1); + +NVTX_EXT_FN_IMPL(nvtxMemHeapHandle_t, nvtxMemHeapRegister, (nvtxDomainHandle_t domain, nvtxMemHeapDesc_t const* desc), (domain, desc)) + +NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemPermissionsCreate, (nvtxDomainHandle_t domain, int32_t creationflags), (domain, creationflags)) + +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: Non-void functions. */ + +/* void functions. 
*/ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) +#define return + +NVTX_EXT_FN_IMPL(void, nvtxMemHeapUnregister, (nvtxDomainHandle_t domain, nvtxMemHeapHandle_t heap), (domain, heap)) + +NVTX_EXT_FN_IMPL(void, nvtxMemHeapReset, (nvtxDomainHandle_t domain, nvtxMemHeapHandle_t heap), (domain, heap)) + +NVTX_EXT_FN_IMPL(void, nvtxMemRegionsRegister, (nvtxDomainHandle_t domain, nvtxMemRegionsRegisterBatch_t const* desc), (domain, desc)) + +NVTX_EXT_FN_IMPL(void, nvtxMemRegionsResize, (nvtxDomainHandle_t domain,nvtxMemRegionsResizeBatch_t const* desc), (domain, desc)) + +NVTX_EXT_FN_IMPL(void, nvtxMemRegionsUnregister, (nvtxDomainHandle_t domain,nvtxMemRegionsUnregisterBatch_t const* desc), (domain, desc)) + +NVTX_EXT_FN_IMPL(void, nvtxMemRegionsName, (nvtxDomainHandle_t domain,nvtxMemRegionsNameBatch_t const* desc), (domain, desc)) + +NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsAssign, (nvtxDomainHandle_t domain,nvtxMemPermissionsAssignBatch_t const* desc), (domain, desc)) + +NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsDestroy, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions), (domain, permissions)) + +NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsReset, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions), (domain, permissions)) + +NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsBind, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, uint32_t bindScope, uint32_t bindFlags), (domain, permissions, bindScope, bindFlags)) + +NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsUnbind, (nvtxDomainHandle_t domain, uint32_t bindScope), (domain, bindScope)) + +#undef return +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: void functions. */ + +#undef NVTX_EXT_FN_IMPL + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h b/src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h new file mode 100644 index 000000000..8f9c79961 --- /dev/null +++ b/src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h @@ -0,0 +1,155 @@ +/* +* Copyright 2021-2023 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD +#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined). +#endif + +#define NVTX_EXT_IMPL_GUARD +#include "nvtxExtImpl.h" +#undef NVTX_EXT_IMPL_GUARD + +#ifndef NVTX_EXT_IMPL_PAYLOAD_V1 +#define NVTX_EXT_IMPL_PAYLOAD_V1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* Macros to create versioned symbols. */ +#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \ + NAME##_v##VERSION##_bpl##COMPATID +#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \ + NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) +#define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \ + NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_PAYLOAD_COMPATID) + +#ifdef NVTX_DISABLE + +#include "nvtxExtHelperMacros.h" + +#define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_val, fn_name, signature, arg_names) \ +ret_val fn_name signature { \ + NVTX_EXT_HELPER_UNUSED_ARGS arg_names \ + return ((ret_val)(intptr_t)-1); \ +} + +#else /* NVTX_DISABLE */ + +#include "nvtxExtPayloadTypeInfo.h" + +/* + * Function slots for the payload extension. 
First entry is the module state, + * initialized to `0` (`NVTX_EXTENSION_FRESH`). + */ +#define NVTX_EXT_PAYLOAD_SLOT_COUNT 63 +NVTX_LINKONCE_DEFINE_GLOBAL intptr_t +NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX_EXT_PAYLOAD_SLOT_COUNT + 1] + = {0}; + +/* Avoid warnings about missing prototype. */ +NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(void); +NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)() +{ + intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1; + nvtxExtModuleSegment_t segment = { + 0, /* unused (only one segment) */ + NVTX_EXT_PAYLOAD_SLOT_COUNT, + fnSlots + }; + + nvtxExtModuleInfo_t module = { + NVTX_VERSION, sizeof(nvtxExtModuleInfo_t), + NVTX_EXT_PAYLOAD_MODULEID, NVTX_EXT_PAYLOAD_COMPATID, + 1, &segment, /* number of segments, segments */ + NULL, /* no export function needed */ + /* bake type sizes and alignment information into program binary */ + &(NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo)) + }; + + NVTX_INFO( "%s\n", __FUNCTION__ ); + + NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module, + NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)); +} + +#define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_type, fn_name, signature, arg_names) \ +typedef ret_type (*fn_name##_impl_fntype)signature; \ + NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \ + intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED) { \ + if (slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } else { \ + NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \ + /* Re-read function slot after extension initialization. */ \ + slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } \ + } \ + } \ + NVTX_EXT_FN_RETURN_INVALID(ret_type) \ +} + +#endif /*NVTX_DISABLE*/ + +/* Non-void functions. */ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1); + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadSchemaRegister, + (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr), + (domain, attr)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadEnumRegister, + (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr), + (domain, attr)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePushPayload, + (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count), + (domain, payloadData, count)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePopPayload, + (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count), + (domain, payloadData, count)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(nvtxRangeId_t, nvtxRangeStartPayload, + (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count), + (domain, payloadData, count)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint8_t, nvtxDomainIsEnabled, (nvtxDomainHandle_t domain), (domain)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxScopeRegister, (nvtxDomainHandle_t domain, + const nvtxScopeAttr_t* attr), (domain, attr)) + +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: Non-void functions. */ + +/* void functions. 
*/ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) +#define return + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(void, nvtxMarkPayload, (nvtxDomainHandle_t domain, + const nvtxPayloadData_t* payloadData, size_t count), (domain, payloadData, count)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(void, nvtxRangeEndPayload, (nvtxDomainHandle_t domain, + nvtxRangeId_t id, const nvtxPayloadData_t* payloadData, size_t count), + (domain, id, payloadData, count)) + +#undef return +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: void functions. */ + +/* Keep NVTX_EXT_PAYLOAD_IMPL_FN_V1 defined for a future version of this extension. */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* NVTX_EXT_IMPL_PAYLOAD_V1 */ + diff --git a/src/include/nvtx3/nvtxExtDetail/nvtxExtInit.h b/src/include/nvtx3/nvtxDetail/nvtxExtInit.h similarity index 71% rename from src/include/nvtx3/nvtxExtDetail/nvtxExtInit.h rename to src/include/nvtx3/nvtxDetail/nvtxExtInit.h index 724c217a5..abb993e2d 100644 --- a/src/include/nvtx3/nvtxExtDetail/nvtxExtInit.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtInit.h @@ -1,5 +1,5 @@ /* -* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. +* Copyright 2009-2023 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. @@ -22,7 +22,7 @@ extern "C" { #define NVTX_PATHCHAR wchar_t #define NVTX_STR(x) L##x #define NVTX_GETENV _wgetenv -#define NVTX_BUFSIZE MAX_PATH +#define NVTX_BUFSIZE 16384 #define NVTX_DLLHANDLE HMODULE #define NVTX_DLLOPEN(x) LoadLibraryW(x) #define NVTX_DLLFUNC GetProcAddress @@ -39,14 +39,14 @@ extern "C" { #define NVTX_PATHCHAR char #define NVTX_STR(x) x #define NVTX_GETENV getenv -#define NVTX_BUFSIZE PATH_MAX +#define NVTX_BUFSIZE 16384 #define NVTX_DLLHANDLE void* #define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY) #define NVTX_DLLFUNC dlsym #define NVTX_DLLCLOSE dlclose #define NVTX_YIELD() sched_yield() #define NVTX_MEMBAR() __sync_synchronize() -/* Ensure full memory barrier for atomics, to match Windows functions */ +/* Ensure full memory barrier for atomics, to match Windows functions. */ #define NVTX_ATOMIC_WRITE_32(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value) #define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand) #define NVTX_ATOMIC_WRITE_PTR(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value) @@ -63,7 +63,7 @@ extern "C" { #define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0 #endif -/* Define this to 1 for platforms that support environment variables */ +/* Define this to 1 for platforms that support environment variables. */ /* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */ /* Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */ #define NVTX_SUPPORT_ENV_VARS 1 @@ -72,16 +72,16 @@ extern "C" { #define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1 /* Injection libraries implementing InitializeInjectionNvtxExtension may be statically linked, -* and this will override any dynamic injection. Useful for platforms where dynamic -* injection is not available. Since weak symbols not explicitly marked extern are -* guaranteed to be initialized to zero if no definitions are found by the linker, the -* dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0. */ + * which will override any dynamic injection. 
This is useful for platforms, where dynamic + * injection is not available. Since weak symbols, not explicitly marked extern, are + * guaranteed to be initialized to zero, if no definitions are found by the linker, the + * dynamic injection process proceeds normally, if pfnInitializeInjectionNvtx2 is 0. */ #if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__) #define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1 /* To statically inject an NVTX library, define InitializeInjectionNvtxExtension_fnptr as a normal -* symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension (which -* does not need to be named "InitializeInjectionNvtxExtension" as is necessary in a dynamic -* injection library. */ + * symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension, which + * does not need to be named "InitializeInjectionNvtxExtension" as it is necessary in a dynamic + * injection library. */ __attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxExtension_fnptr; #else #define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0 @@ -89,35 +89,37 @@ __attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxEx -/* This function tries to find or load an NVTX injection library and get the -* address of its InitializeInjectionExtension function. If such a function pointer -* is found, it is called, and passed the address of this NVTX instance's -* nvtxGetExportTable function, so the injection can attach to this instance. -* If the initialization fails for any reason, any dynamic library loaded will -* be freed, and all NVTX implementation functions will be set to no-ops. If -* initialization succeeds, NVTX functions not attached to the tool will be set -* to no-ops. This is implemented as one function instead of several small -* functions to minimize the number of weak symbols the linker must resolve. -* Order of search is: -* - Pre-injected library exporting InitializeInjectionNvtxExtension -* - Loadable library exporting InitializeInjectionNvtxExtension -* - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64) -* - On Android, libNvtxInjection??.so within the package (?? is 32 or 64) -* - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr -*/ -NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr); -NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr) +/* This function tries to find or load an NVTX injection library and get the address of its + * `InitializeInjectionExtension` function. If such a function pointer is found, it is called and + * passed the address of this NVTX instance's `nvtxGetExportTable` function, so that the injection + * can attach to this instance. + * If the initialization fails for any reason, any dynamic library loaded will be freed, and all + * NVTX implementation functions will be set to no-ops. If the initialization succeeds, NVTX + * functions that are not attached to the tool will be set to no-ops. This is implemented as one + * function instead of several small functions to minimize the number of weak symbols the linker + * must resolve. The order of search is: + * 1) Pre-injected library exporting InitializeInjectionNvtxExtension + * 2) Loadable library exporting InitializeInjectionNvtxExtension + * - Path specified by env var NVTX_INJECTION??_PATH (?? 
is 32 or 64) + * - On Android, libNvtxInjection??.so within the package (?? is 32 or 64) + * 3) Statically-linked injection library defining InitializeInjectionNvtx2_fnptr + */ +NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)( + NvtxExtInitializeInjectionFunc_t* out_init_fnptr); +NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)( + NvtxExtInitializeInjectionFunc_t* out_init_fnptr) { const char* const initFuncName = "InitializeInjectionNvtxExtension"; NvtxExtInitializeInjectionFunc_t init_fnptr = (NvtxExtInitializeInjectionFunc_t)0; NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0; - if(out_init_fnptr){ + if (out_init_fnptr) + { *out_init_fnptr = (NvtxExtInitializeInjectionFunc_t)0; } #if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY - /* Use POSIX global symbol chain to query for init function from any module */ + /* Use POSIX global symbol chain to query for init function from any module. */ init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(0, initFuncName); #endif @@ -127,7 +129,7 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection { #if NVTX_SUPPORT_ENV_VARS /* If env var NVTX_INJECTION64_PATH is set, it should contain the path - * to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */ + to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */ const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4) ? NVTX_STR("NVTX_INJECTION32_PATH") : NVTX_STR("NVTX_INJECTION64_PATH"); @@ -135,12 +137,12 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE]; const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0; - /* Refer to this variable explicitly in case all references to it are #if'ed out */ + /* Refer to this variable explicitly in case all references to it are #if'ed out. */ (void)injectionLibraryPathBuf; #if NVTX_SUPPORT_ENV_VARS /* Disable the warning for getenv & _wgetenv -- this usage is safe because - * these functions are not called again before using the returned value. */ + these functions are not called again before using the returned value. */ #if defined(_MSC_VER) #pragma warning( push ) #pragma warning( disable : 4996 ) @@ -188,7 +190,8 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection pkgName[bytesRead] = 0; - /* String can contain colon as a process separator. In this case the package name is before the colon. */ + /* String can contain colon as a process separator. In this case the + package name is before the colon. */ pos = 0; while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0') { @@ -223,8 +226,8 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection } #endif - /* At this point, injectionLibraryPath is specified if a dynamic - * injection library was specified by a tool. */ + /* At this point, `injectionLibraryPath` is specified if a dynamic + injection library was specified by a tool. */ if (injectionLibraryPath) { /* Load the injection library */ @@ -236,7 +239,7 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection } else { - /* Attempt to get the injection library's entry-point */ + /* Attempt to get the injection library's entry-point. 
*/ init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName); if (!init_fnptr) { @@ -252,8 +255,8 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection #if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY if (!init_fnptr) { - /* Check weakly-defined function pointer. A statically-linked injection can define this as - * a normal symbol and it will take precedence over a dynamic injection. */ + /* Check weakly-defined function pointer. A statically-linked injection can define + this as a normal symbol and it will take precedence over a dynamic injection. */ if (InitializeInjectionNvtxExtension_fnptr) { init_fnptr = InitializeInjectionNvtxExtension_fnptr; @@ -261,13 +264,13 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection } #endif - if(out_init_fnptr){ + if (out_init_fnptr) + { *out_init_fnptr = init_fnptr; } - /* At this point, if init_fnptr is not set, then no tool has specified - * an NVTX injection library -- return non-success result so all NVTX - * API functions will be set to no-ops. */ + /* At this point, if `init_fnptr` is not set, no tool has specified an NVTX injection library. + Non-success result is returned, so that all NVTX API functions will be set to no-ops. */ if (!init_fnptr) { return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE; @@ -276,16 +279,19 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection return NVTX_SUCCESS; } +/* Avoid warnings about missing prototypes. */ +NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) ( + nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState); NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) ( - nvtxExtModuleInfo_t* moduleInfo, - intptr_t* moduleState - ) + nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState) { intptr_t old; NVTX_INFO( "%s\n", __FUNCTION__ ); - if( *moduleState == NVTX_EXTENSION_LOADED) { + if (*moduleState == NVTX_EXTENSION_LOADED) + { + NVTX_INFO("Module loaded\n"); return; } @@ -296,45 +302,55 @@ NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) ( NVTX_EXTENSION_FRESH); if (old == NVTX_EXTENSION_FRESH) { - NvtxExtInitializeInjectionFunc_t init_fnptr = NVTX_VERSIONED_IDENTIFIER(injectionFnPtr); + NvtxExtInitializeInjectionFunc_t init_fnptr = + NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr; int entryPointStatus = 0; int forceAllToNoops = 0; + size_t s; - /* Load & initialize injection library -- it will assign the function pointers */ - if(init_fnptr == 0){ + /* Load and initialize injection library, which will assign the function pointers. */ + if (init_fnptr == 0) + { int result = 0; - /* try to load vanilla NVTX first*/ + /* Try to load vanilla NVTX first. */ nvtxInitialize(0); result = NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(&init_fnptr); - /*at this point init_fnptr will be either 0 or a real function*/ + /* At this point `init_fnptr` will be either 0 or a real function. */ - if(result == NVTX_SUCCESS) { - NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = init_fnptr; + if (result == NVTX_SUCCESS) + { + NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr = init_fnptr; } - else { + else + { NVTX_ERR("Failed to load injection library\n"); } } - if(init_fnptr != 0) { - /* Invoke injection library's initialization function. If it returns - * 0 (failure) and a dynamic injection was loaded, unload it. */ + if (init_fnptr != 0) + { + /* Invoke injection library's initialization function. 
If it returns + 0 (failure) and a dynamic injection was loaded, unload it. */ entryPointStatus = init_fnptr(moduleInfo); - if (entryPointStatus == 0) { + if (entryPointStatus == 0) + { NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n"); } } - /* Clean up any functions that are still uninitialized so that they are skipped. - * Set all to null if injection init function failed as well. - */ + /* Clean up any functions that are still uninitialized so that they are + skipped. Set all to null if injection init function failed as well. */ forceAllToNoops = (init_fnptr == 0) || (entryPointStatus == 0); - for(size_t s = 0; s < moduleInfo->segmentsCount; ++s){ - nvtxExtModuleSegment_t* segment = moduleInfo->segments+s; - for(size_t i = 0; i < segment->slotCount; ++i){ - if(forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH)){ + for (s = 0; s < moduleInfo->segmentsCount; ++s) + { + nvtxExtModuleSegment_t* segment = moduleInfo->segments + s; + size_t i; + for (i = 0; i < segment->slotCount; ++i) + { + if (forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH)) + { segment->functionSlots[i] = NVTX_EXTENSION_DISABLED; } } @@ -342,12 +358,11 @@ NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) ( NVTX_MEMBAR(); - /* Signal that initialization has finished, so now the assigned function pointers will be used */ - NVTX_ATOMIC_WRITE_PTR( - moduleState, - NVTX_EXTENSION_LOADED); + /* Signal that initialization has finished and the assigned function + pointers will be used. */ + NVTX_ATOMIC_WRITE_PTR(moduleState, NVTX_EXTENSION_LOADED); } - else /* Spin-wait until initialization has finished */ + else /* Spin-wait until initialization has finished. */ { NVTX_MEMBAR(); while (*moduleState != NVTX_EXTENSION_LOADED) diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h new file mode 100644 index 000000000..71e30bc37 --- /dev/null +++ b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h @@ -0,0 +1,272 @@ +/* +* Copyright 2023 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H +#define NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H + +/* General helper macros */ +#include "nvtxExtHelperMacros.h" + +/* Get variable name with line number (almost unique per file). */ +#define _NVTX_PAYLOAD_DATA_VAR NVTX_EXT_CONCAT(nvtxDFDB,__LINE__) + +/* Create real arguments from just pasting tokens next to each other. */ +#define _NVTX_PAYLOAD_PASS_THROUGH(...) __VA_ARGS__ + +/* Avoid prefixing `NVTX_PAYLOAD_ENTRY_` for nested payloads. */ +#define NVTX_PAYLOAD_ENTRY_THROWAWAY +#define _NVTX_PAYLOAD_NESTED(id) THROWAWAY id + +/* + * Create the NVTX binary payloads schema attributes. + * + * @param struct_id The name of the struct. + * @param schema_name The name of the schema. + * @param schema_flags Additional schema flags + * @param mask_add Fields to be added to the mask. + * @param num_entries The number schema entries. 
+ */ +#define NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, mask_add, num_entries) \ + nvtxPayloadSchemaAttr_t struct_id##Attr = { \ + /*.fieldMask = */NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | mask_add \ + NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | \ + NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | \ + NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE, \ + /*.name = */schema_name, \ + /*.type = */NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, \ + /*.flags = */schema_flags, \ + /*.entries = */struct_id##Schema, /*.numEntries = */num_entries, \ + /*.payloadStaticSize = */sizeof(struct_id), \ + /*.packAlign = */0, /*.schemaId = */schema_id}; + + +/*****************************************************************/ +/*** Helper for `NVTX_DEFINE_SCHEMA_FOR_STRUCT[_AND_REGISTER]` ***/ + +/* First part of schema entry for different number of arguments. */ +#define _NVTX_PAYLOAD_SCHEMA_EF2(member, etype) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, NULL, NULL, 0, +#define _NVTX_PAYLOAD_SCHEMA_EF3(member, etype, name) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, name, NULL, 0, +#define _NVTX_PAYLOAD_SCHEMA_EF4(member, etype, name, desc) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, 0, +#define _NVTX_PAYLOAD_SCHEMA_EF5(member, etype, name, desc, arraylen) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen, +#define _NVTX_PAYLOAD_SCHEMA_EF6(member, etype, name, desc, arraylen, flags) \ + NVTX_PAYLOAD_ENTRY_FLAG_##flags, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen, + +#define _NVTX_PAYLOAD_SCHEMA_ENTRY_FRONT(...) \ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_SCHEMA_EF, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) + +/* Second part of schema entry (append struct member). + (At least two arguments are passed (`member` and `etype`). */ +#define _NVTX_PAYLOAD_SCHEMA_ENTRY_END(member, ...) member + +/* Resolve to schema entry. `entry` is `(ctype, name, ...)`. */ +#define _NVTX_PAYLOAD_SCHEMA_ENTRY(struct_id, entry) \ + {_NVTX_PAYLOAD_SCHEMA_ENTRY_FRONT entry \ + offsetof(struct_id, _NVTX_PAYLOAD_SCHEMA_ENTRY_END entry)}, + +/* Handle up to 16 schema entries. */ +#define _NVTX_PAYLOAD_SME1(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) +#define _NVTX_PAYLOAD_SME2(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME1(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME3(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME2(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME4(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME3(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME5(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME4(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME6(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME5(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME7(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME6(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME8(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME7(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME9(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME8(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME10(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME9(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME11(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME10(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME12(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME11(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME13(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME12(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME14(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME13(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME15(s,e1,...) 
_NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME14(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME16(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME15(s,__VA_ARGS__) + +#define _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, ...) \ + nvtxPayloadSchemaEntry_t struct_id##Schema[] = { \ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_SME, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(struct_id, __VA_ARGS__) \ + {0, 0} \ + }; + +/* + * Handle optional parameters for `NVTX_DEFINE_SCHEMA_FOR_STRUCT[_AND_REGISTER]`. + */ +#define _NVTX_DEFINE_S4S_6(struct_id, schema_name, prefix, schema_flags, schema_id, entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, \ + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID |,\ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) +#define _NVTX_DEFINE_S4S_5(struct_id, schema_name, prefix, schema_flags, entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, 0, \ + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS |, \ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) +#define _NVTX_DEFINE_S4S_4(struct_id, schema_name, prefix, entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, \ + NVTX_PAYLOAD_SCHEMA_ATTR_NAME |, \ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) +#define _NVTX_DEFINE_S4S_3(struct_id, schema_name, entries) \ + _NVTX_DEFINE_S4S_4(struct_id, schema_name, /*prefix*/, entries) +#define _NVTX_DEFINE_S4S_2(struct_id, entries) \ + _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, NULL, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, ,\ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) + +#define _NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, ...) \ + NVTX_EXT_CONCAT(_NVTX_DEFINE_S4S_, \ + NVTX_EXT_NUM_ARGS(struct_id, __VA_ARGS__))(struct_id, __VA_ARGS__) + +/*** END: Helper for `NVTX_PAYLOAD_STATIC_SCHEMA_{DEFINE,SETUP}` ***/ + + +/******************************************************************/ +/*** Helper for `NVTX_DEFINE_STRUCT_WITH_SCHEMA[_AND_REGISTER]` ***/ + +/* Extract struct member for fixed-size arrays. */ +#define _NVTX_PAYLOAD_STRUCT_ARR_MEM1(name) name +#define _NVTX_PAYLOAD_STRUCT_ARR_MEM2(name, count) name[count] + +/* Extract type and member name and handle special case of fixed-size array. */ +#define _NVTX_PAYLOAD_STRUCT_E2(type, member) type member; +#define _NVTX_PAYLOAD_STRUCT_E3(type, member, etype) type member; +#define _NVTX_PAYLOAD_STRUCT_E4(type, member, etype, name) type member; +#define _NVTX_PAYLOAD_STRUCT_E5(type, member, etype, name, desc) type member; +#define _NVTX_PAYLOAD_STRUCT_E6(type, member, etype, name, desc, arraylen) \ + type NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT_ARR_MEM, NVTX_EXT_NUM_ARGS member) member; +#define _NVTX_PAYLOAD_STRUCT_E7(type, member, etype, name, desc, arraylen, flags) \ + _NVTX_PAYLOAD_STRUCT_E6(type, member, etype, name, desc, arraylen) + +/* Handle different number of arguments per struct entry. */ +#define _NVTX_PAYLOAD_STRUCT_ENTRY_(...) \ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT_E, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) + +/* Handle up to 16 struct members. 
*/ +#define _NVTX_PAYLOAD_STRUCT_ENTRY(entry) _NVTX_PAYLOAD_STRUCT_ENTRY_ entry +#define _NVTX_PAYLOAD_STRUCT1(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) +#define _NVTX_PAYLOAD_STRUCT2(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT1(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT3(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT2(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT4(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT3(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT5(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT4(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT6(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT5(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT7(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT6(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT8(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT7(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT9(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT8(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT10(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT9(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT11(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT10(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT12(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT11(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT13(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT12(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT14(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT13(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT15(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT14(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT16(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT15(__VA_ARGS__) + +/* Generate the typedef. */ +#define _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, ...) \ + typedef struct { \ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) \ + } struct_id; + +/* Generate first part of the schema entry. */ +#define _NVTX_PAYLOAD_INIT_SCHEMA_N3(type, memberId, etype) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, NULL, NULL, 0, +#define _NVTX_PAYLOAD_INIT_SCHEMA_N4(type, memberId, etype, name) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, name, NULL, 0, +#define _NVTX_PAYLOAD_INIT_SCHEMA_N5(type, memberId, etype, name, desc) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, 0, +#define _NVTX_PAYLOAD_INIT_SCHEMA_N6(type, memberId, etype, name, desc, arraylen) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen, +#define _NVTX_PAYLOAD_INIT_SCHEMA_N7(type, memberId, etype, name, desc, arraylen, flags) \ + NVTX_PAYLOAD_ENTRY_FLAG_##flags, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen, + +#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_FRONT(...) \ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SCHEMA_N, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) + +#define _NVTX_PAYLOAD_ARRAY_MEMBER1(name) name +#define _NVTX_PAYLOAD_ARRAY_MEMBER2(name, count) name + +/* Resolve to last part of schema entry (append struct member). */ +#define _NVTX_PAYLOAD_INIT_SCHEMA_NX3(type, memberId, ...) memberId +#define _NVTX_PAYLOAD_INIT_SCHEMA_NX4(type, memberId, ...) memberId +#define _NVTX_PAYLOAD_INIT_SCHEMA_NX5(type, memberId, ...) memberId +#define _NVTX_PAYLOAD_INIT_SCHEMA_NX6(type, memberId, ...) \ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_ARRAY_MEMBER, NVTX_EXT_NUM_ARGS memberId) memberId +#define _NVTX_PAYLOAD_INIT_SCHEMA_NX7(type, memberId, ...) \ + _NVTX_PAYLOAD_INIT_SCHEMA_NX6(type, memberId, __VA_ARGS__) + +#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_END(...) 
\ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SCHEMA_NX, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) + +/* Resolve to schema entry. `entry` is `(ctype, name, ...)`. */ +#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(struct_id, entry) \ + {_NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_FRONT entry \ + offsetof(struct_id, _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_END entry)}, + +/* Handle up to 16 schema entries. */ +#define _NVTX_PAYLOAD_INIT_SME1(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) +#define _NVTX_PAYLOAD_INIT_SME2(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME1(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME3(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME2(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME4(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME3(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME5(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME4(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME6(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME5(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME7(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME6(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME8(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME7(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME9(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME8(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME10(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME9(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME11(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME10(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME12(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME11(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME13(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME12(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME14(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME13(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME15(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME14(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME16(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME15(s, __VA_ARGS__) + +#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, ...) \ + nvtxPayloadSchemaEntry_t struct_id##Schema[] = { \ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SME, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(struct_id, __VA_ARGS__) \ + {0, 0} \ + }; + +/* + * Handle optional parameters for `NVTX_DEFINE_STRUCT_WITH_SCHEMA[_AND_REGISTER]`. 
+ */ +#define _NVTX_DEFINE_SWS_6(struct_id, schema_name, prefix, schema_flags, schema_id, entries) \ + _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, \ + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS | \ + NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID |, \ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) +#define _NVTX_DEFINE_SWS_5(struct_id, schema_name, prefix, schema_flags, entries) \ + _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, 0, \ + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS |, \ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) +#define _NVTX_DEFINE_SWS_4(struct_id, schema_name, prefix, entries) \ + _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, \ + NVTX_PAYLOAD_SCHEMA_ATTR_NAME |, \ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) +#define _NVTX_DEFINE_SWS_3(struct_id, schema_name, entries) \ + _NVTX_DEFINE_SWS_4(struct_id, schema_name, /* no prefix */, entries) +#define _NVTX_DEFINE_SWS_2(struct_id, entries) \ + _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, NULL, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, , \ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) + +#define _NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, ...) \ + NVTX_EXT_CONCAT(_NVTX_DEFINE_SWS_, \ + NVTX_EXT_NUM_ARGS(struct_id, __VA_ARGS__))(struct_id, __VA_ARGS__) + +/*** END: Helper for `NVTX_PAYLOAD_STATIC_SCHEMA_{INIT,CREATE}` */ + +#endif /* NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H */ \ No newline at end of file diff --git a/src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h similarity index 90% rename from src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h rename to src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h index 7c166bd34..6a30e6633 100644 --- a/src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h @@ -10,14 +10,14 @@ #error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined). #endif -typedef void* pointer_type; +typedef void* nvtx_payload_pointer_type; #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) #include #include #endif -/* `alignof` is available as of C11 or C++11 */ +/* `alignof` is available as of C11 or C++11. */ #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L) #define nvtx_alignof(type) alignof(type) @@ -54,7 +54,7 @@ MKTYPEDEF(double); MKTYPEDEF2(long double, longdouble); MKTYPEDEF(size_t); -MKTYPEDEF(pointer_type); +MKTYPEDEF(nvtx_payload_pointer_type); MKTYPEDEF(wchar_t); @@ -85,8 +85,16 @@ MKTYPEDEF(wchar_t); /* * Helper array to get the alignment for each predefined C/C++ language type. 
* The order of entries must match the values in`enum nvtxPayloadSchemaEntryType`. + * + * In C++, `const` variables use internal linkage by default, but we need it to + * be public (extern) since weak declarations must be public. */ -const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] = +NVTX_LINKONCE_DEFINE_GLOBAL +#ifdef __cplusplus +extern +#endif +const nvtxPayloadEntryTypeInfo_t +NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo)[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] = { /* The first entry contains this array's length and the size of each entry in this array. */ {NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE, sizeof(nvtxPayloadEntryTypeInfo_t)}, @@ -119,7 +127,7 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_ /* NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE */ {sizeof(long double), nvtx_alignof2(long double, longdouble)}, /* NVTX_PAYLOAD_ENTRY_TYPE_SIZE */ {sizeof(size_t), nvtx_alignof(size_t)}, - /* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(pointer_type), nvtx_alignof(pointer_type)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(nvtx_payload_pointer_type), nvtx_alignof(nvtx_payload_pointer_type)}, /*** Special character types ***/ /* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)}, @@ -140,4 +148,4 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_ }; #undef nvtx_alignof -#undef nvtx_alignof2 +#undef nvtx_alignof2 \ No newline at end of file diff --git a/src/include/nvtx3/nvtxExtDetail/nvtxExtTypes.h b/src/include/nvtx3/nvtxDetail/nvtxExtTypes.h similarity index 100% rename from src/include/nvtx3/nvtxExtDetail/nvtxExtTypes.h rename to src/include/nvtx3/nvtxDetail/nvtxExtTypes.h diff --git a/src/include/nvtx3/nvtxDetail/nvtxImpl.h b/src/include/nvtx3/nvtxDetail/nvtxImpl.h index 590ce9024..5ffc4abb4 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxImpl.h +++ b/src/include/nvtx3/nvtxDetail/nvtxImpl.h @@ -10,37 +10,34 @@ #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 
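The type-info table updated above records one {size, alignment} pair per predefined payload entry type, so a tool decoding a binary payload does not need to know the producer's compiler settings. A minimal, purely illustrative C11 snippet (not part of the patch) showing what one such pair encodes:

/* Prints the values an entry such as NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE records. */
#include <stdalign.h>
#include <stdio.h>

int main(void) {
    printf("double: size=%zu, align=%zu\n", sizeof(double), alignof(double));
    return 0;
}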
#endif +#include +#include +#include +#include + /* ---- Include required platform headers ---- */ -#if defined(_WIN32) +#if defined(_WIN32) -#include +#include #else #include #if defined(__ANDROID__) -#include +#include #endif #if defined(__linux__) || defined(__CYGWIN__) #include #endif +#include #include #include #include -#include -#include -#include -#include #include - -#include -#include #include -#include -#include #endif diff --git a/src/include/nvtx3/nvtxDetail/nvtxInit.h b/src/include/nvtx3/nvtxDetail/nvtxInit.h index 43cad7010..03568f149 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxInit.h +++ b/src/include/nvtx3/nvtxDetail/nvtxInit.h @@ -14,11 +14,11 @@ /* Prefer macros over inline functions to reduce symbol resolution at link time */ -#if defined(_WIN32) +#if defined(_WIN32) #define NVTX_PATHCHAR wchar_t #define NVTX_STR(x) L##x #define NVTX_GETENV _wgetenv -#define NVTX_BUFSIZE MAX_PATH +#define NVTX_BUFSIZE 16384 #define NVTX_DLLHANDLE HMODULE #define NVTX_DLLOPEN(x) LoadLibraryW(x) #define NVTX_DLLFUNC GetProcAddress @@ -31,7 +31,7 @@ #define NVTX_PATHCHAR char #define NVTX_STR(x) x #define NVTX_GETENV getenv -#define NVTX_BUFSIZE PATH_MAX +#define NVTX_BUFSIZE 16384 #define NVTX_DLLHANDLE void* #define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY) #define NVTX_DLLFUNC dlsym diff --git a/src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h b/src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h index 57661c754..28e765581 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h +++ b/src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h @@ -23,7 +23,7 @@ * In some situations it is desirable to declare a variable without initializing * it, refer to it in code or other variables' initializers, and then initialize * it later. Similarly, functions can be prototyped, have their address taken, - * and then have their body defined later. In such cases, use the FWDDECL macros + * and then have their body defined later. In such cases, use the FWDDECL macros * when forward-declaring LINKONCE global variables without initializers and * function prototypes, and then use the DEFINE macros when later defining them. * Although in many cases the FWDDECL macro is equivalent to the DEFINE macro, diff --git a/src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h b/src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h deleted file mode 100644 index 4663fda82..000000000 --- a/src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h +++ /dev/null @@ -1,86 +0,0 @@ -/* -* Copyright 2021 NVIDIA Corporation. All rights reserved. -* -* Licensed under the Apache License v2.0 with LLVM Exceptions. -* See https://llvm.org/LICENSE.txt for license information. -* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -*/ - -#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD -#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined). -#endif - -#define NVTX_EXT_IMPL_GUARD -#include "nvtxExtImpl.h" -#undef NVTX_EXT_IMPL_GUARD - -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - -#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \ - NAME##_v##VERSION##_mem##COMPATID -#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \ - NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) -#define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \ - NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_PAYLOAD) - -/* - * Function slots for the binary payload extension. 
First entry is the module - * state, initialized to `0` (`NVTX_EXTENSION_FRESH`). - */ -NVTX_LINKONCE_DEFINE_GLOBAL intptr_t -NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_PAYLOAD_FN_NUM + 1] - = {0}; - -NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)() -{ - intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1; - nvtxExtModuleSegment_t segment = { - 0, // unused (only one segment) - NVTX3EXT_CBID_PAYLOAD_FN_NUM, - fnSlots - }; - - nvtxExtModuleInfo_t module = { - NVTX_VERSION, sizeof(nvtxExtModuleInfo_t), - NVTX_EXT_MODULEID_PAYLOAD, NVTX_EXT_COMPATID_PAYLOAD, - 1, &segment, // number of segments, segments - NULL, // no export function needed - // bake type sizes and alignment information into program binary - &nvtxExtPayloadTypeInfo - }; - - NVTX_INFO( "%s\n", __FUNCTION__ ); - - NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module, - NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)); -} - -#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \ -typedef ret_val ( * fn_name##_impl_fntype )signature; \ -NVTX_LINKONCE_DEFINE_FUNCTION ret_val fn_name signature { \ - intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ - if (slot != NVTX_EXTENSION_DISABLED) { \ - if (slot) { \ - return (*(fn_name##_impl_fntype)slot) arg_names; \ - } else { \ - NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \ - slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ - if (slot != NVTX_EXTENSION_DISABLED && slot) { \ - return (*(fn_name##_impl_fntype)slot) arg_names; \ - } \ - } \ - } \ - return ((ret_val)(intptr_t)-1); \ -} - -NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadSchemaRegister, (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr), (domain, attr)) - -NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadEnumRegister, (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr), (domain, attr)) - -#undef NVTX_EXT_FN_IMPL - -#ifdef __cplusplus -} /* extern "C" */ -#endif /* __cplusplus */ \ No newline at end of file diff --git a/src/include/p2p.h b/src/include/p2p.h index 9a3dbdb3b..5c73a6cf1 100644 --- a/src/include/p2p.h +++ b/src/include/p2p.h @@ -10,6 +10,9 @@ #define NCCL_P2P_H_ #include +#include + +#include "core.h" #if CUDART_VERSION < 12030 // MNNVL: FABRIC handle support lifted from CUDA 12.3 diff --git a/src/include/proxy.h b/src/include/proxy.h index cb1c3b200..eab6930fe 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -16,13 +16,29 @@ #include "shm.h" #include "p2p.h" +typedef enum : uint8_t { + ncclPatternRing, + ncclPatternRingTwice, + ncclPatternPipelineFrom, + ncclPatternPipelineTo, + ncclPatternTreeUp, + ncclPatternTreeDown, + ncclPatternTreeUpDown, + ncclPatternCollnetChain, + ncclPatternCollnetDirect, + ncclPatternNvls, + ncclPatternNvlsTree, + ncclPatternSend, + ncclPatternRecv +} ncclPattern_t; + enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; struct ncclProxyArgs; typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*); #define NCCL_PROXY_MAX_SUBS MAXCHANNELS -static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements"); +static_assert(2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH <= MAXCHANNELS, "Not enough sub space for max work elements"); union ncclProxyOpSpecifics { struct { @@ -124,8 +140,9 @@ struct ncclProxyArgs { // ProxyOps are used to communicate between main thread and service thread // Make 
sure we have enough to store two full rounds of operations on all channels. -// Otherwise we'd be unable to post half of them to free new elements. -#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P) +// Otherwise we'd be unable to post half of them to free new elements. Each +// p2p work contains a send and recv proxy op hence the 2x before it. +#define MAX_OPS_PER_PEER (2*MAXCHANNELS*2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH) struct ncclProxyOpsPool { struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS]; @@ -243,7 +260,7 @@ struct ncclProxyState { bool dmaBufSupport; ncclNet_t* ncclNet; ncclCollNet_t* ncclCollNet; - volatile uint32_t* abortFlag; + uint32_t* abortFlag; // Service threads pthread_t thread; pthread_t threadUDS; @@ -301,7 +318,6 @@ enum proxyMode { }; ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire); -ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp, int reg); ncclResult_t ncclProxyStart(struct ncclComm* comm); ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS); ncclResult_t ncclProxyCreate(struct ncclComm* comm); diff --git a/src/include/register.h b/src/include/register.h index d23e0da3e..9f7c83faa 100644 --- a/src/include/register.h +++ b/src/include/register.h @@ -1,6 +1,11 @@ #ifndef NCCL_REGISTER_H_ #define NCCL_REGISTER_H_ +#include "device.h" + +#include +#include + enum { NET_REG_COMPLETE = 0x01, NVLS_REG_COMPLETE = 0x02, diff --git a/src/include/transport.h b/src/include/transport.h index 1671db0e2..07fbb3ec4 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -13,12 +13,14 @@ #include "core.h" #define NTRANSPORTS 4 +#define TRANSPORT_UNDEFINED -1 #define TRANSPORT_P2P 0 #define TRANSPORT_SHM 1 #define TRANSPORT_NET 2 #define TRANSPORT_COLLNET 3 #include "proxy.h" +#include "comm.h" extern struct ncclTransport p2pTransport; extern struct ncclTransport shmTransport; @@ -45,6 +47,7 @@ struct ncclPeerInfo { int cudaCompCap; // MNNVL support nvmlGpuFabricInfoV_t fabricInfo; + int cuMemSupport; }; #define CONNECT_SIZE 128 @@ -57,17 +60,21 @@ struct ncclConnect { #define NVLS_HANDLE_SIZE 64 struct ncclNvlsSharedRes { int refCount; - CUmulticastObjectProp properties; + bool inited; + CUmulticastObjectProp bufProp; + CUmulticastObjectProp signalProp; CUmemAccessDesc accessDesc; int dev; - size_t size; - size_t granularity; - CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer + size_t buffSize; + size_t creditSize; + CUmemGenericAllocationHandle mcBuffHandle; // Multicast handle for NVLS buffer + CUmemGenericAllocationHandle mcCreditHandle; // Multicast handle for NVLS credit buffer char* mcBuff; // Multicast NVLS buffer address - CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer + char* mcCredit; // Multicast NVLS credit address + CUmemGenericAllocationHandle ucBuffHandle; // Unicast Handle for NVLS buffer + CUmemGenericAllocationHandle ucCreditHandle; // Unicast Handle for NVLS credit buffer char* ucBuff; // Unicast NVLS buffer address - char shareableHandle[NVLS_HANDLE_SIZE]; - size_t ucGran; + char* ucCredit; // Unicast NVLS credit address int nChannels; struct ncclShmemCollBuff nvlsShmem; void *nvlsShmemHandle; @@ -84,6 +91,7 @@ struct ncclCollNetSharedRes { void* resources; int nChannels; size_t buffSize; + int intraHighestTransportType; }; struct ncclTransportComm { @@ -111,7 +119,9 @@ ncclResult_t 
ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* ncclResult_t ncclNvlsInit(struct ncclComm* comm); ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent); -ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); +ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm); +ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm); +ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts); ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size); ncclResult_t ncclNvlsFree(struct ncclComm* comm); @@ -121,6 +131,14 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail); ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm); ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufUsed, void** outHandle); -ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, struct ncclKernelPlan *plan, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle); +ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts); ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyconn, void* handle); + +ncclResult_t ncclTransportRingConnect(struct ncclComm* comm); +ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm); + +ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* graphs[]); +ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm); +ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm); + #endif diff --git a/src/include/tuner.h b/src/include/tuner.h index df8f5f21c..285f87e81 100644 --- a/src/include/tuner.h +++ b/src/include/tuner.h @@ -9,14 +9,15 @@ #define NCCL_INT_TUNER_H_ #include "nccl_tuner.h" +#include "comm.h" // Tuning plugin to override NCCL's default algorithm/protocol tuning. // Attempts to load NCCL tuner from environmental variable. // Returns ncclSuccess if the correct tuner symbol has been found and // successully loaded. Otherwise returns an error and also logs the error. -ncclResult_t ncclTunerPluginLoad(ncclTuner_t** tuner); +ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm); // Cleans up NCCL tuner plugin. 
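With the change above, the tuner plugin is loaded and unloaded per communicator instead of through a standalone ncclTuner_t** handle. A hedged sketch of call sites under the new signatures; the wrapper names are invented for illustration, and only ncclTunerPluginLoad, ncclTunerPluginUnload and NCCLCHECK come from this patch:

/* Assumes the tuner.h and comm.h headers shown above are included. */
static ncclResult_t exampleTunerAttach(struct ncclComm* comm) {
  /* Per the comment above, an error is returned and logged if loading fails. */
  NCCLCHECK(ncclTunerPluginLoad(comm));
  return ncclSuccess;
}

static ncclResult_t exampleTunerDetach(struct ncclComm* comm) {
  /* Cleans up the tuner plugin state attached to this communicator. */
  NCCLCHECK(ncclTunerPluginUnload(comm));
  return ncclSuccess;
}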
-ncclResult_t ncclTunerPluginUnload(ncclTuner_t** tuner); +ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm); #endif diff --git a/src/include/utils.h b/src/include/utils.h index cfc009861..abecf2257 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -9,12 +9,14 @@ #include "nccl.h" #include "alloc.h" +#include "bitops.h" #include "checks.h" #include #include #include #include #include +#include int ncclCudaCompCap(); @@ -30,11 +32,6 @@ uint64_t getHostHash(); uint64_t getPidHash(); ncclResult_t getRandomData(void* buffer, size_t bytes); -const char* ncclOpToString(ncclRedOp_t op); -const char* ncclDatatypeToString(ncclDataType_t type); -const char* ncclAlgoToString(int algo); -const char* ncclProtoToString(int proto); - struct netIf { char prefix[64]; int port; @@ -44,9 +41,7 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList); bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); static long log2i(long n) { - long l = 0; - while (n>>=1) l++; - return l; + return log2Down(n); } inline uint64_t clockNano() { @@ -96,8 +91,11 @@ void ncclMemoryStackConstruct(struct ncclMemoryStack* me); void ncclMemoryStackDestruct(struct ncclMemoryStack* me); void ncclMemoryStackPush(struct ncclMemoryStack* me); void ncclMemoryStackPop(struct ncclMemoryStack* me); +void* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t size, size_t align); template T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1); +template +inline Header* ncclMemoryStackAllocInlineArray(struct ncclMemoryStack* me, size_t nElt); //////////////////////////////////////////////////////////////////////////////// /* ncclMemoryPool: A free-list of same-sized allocations. It is an invalid for @@ -140,11 +138,14 @@ T* ncclIntruQueueHead(ncclIntruQueue *me); template void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x); template +void ncclIntruQueueEnqueueFront(ncclIntruQueue *me, T *x); +template T* ncclIntruQueueDequeue(ncclIntruQueue *me); template T* ncclIntruQueueTryDequeue(ncclIntruQueue *me); template -void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *memPool); +void ncclIntruQueueTransfer(ncclIntruQueue *dst, ncclIntruQueue *src); + //////////////////////////////////////////////////////////////////////////////// /* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex" @@ -233,6 +234,12 @@ inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, return obj; } +inline void* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t size, size_t align) { + void *obj = ncclMemoryStack::allocate(me, size, align); + memset(obj, 0, size); + return obj; +} + template inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) { void *obj = ncclMemoryStack::allocate(me, n*sizeof(T), alignof(T)); @@ -240,6 +247,17 @@ inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) { return (T*)obj; } +template +inline Header* ncclMemoryStackAllocInlineArray(struct ncclMemoryStack* me, size_t nElt) { + size_t size = sizeof(Header); + size = (size + alignof(Element)-1) & -alignof(Element); + size += nElt*sizeof(Element); + size_t align = alignof(Header) < alignof(Element) ? 
alignof(Element) : alignof(Header); + void *obj = ncclMemoryStack::allocate(me, size, align); + memset(obj, 0, size); + return (Header*)obj; +} + inline void ncclMemoryStackPush(struct ncclMemoryStack* me) { using Frame = ncclMemoryStack::Frame; Frame tmp = me->topFrame; @@ -343,6 +361,13 @@ inline void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x) { me->tail = x; } +template +inline void ncclIntruQueueEnqueueFront(ncclIntruQueue *me, T *x) { + if (me->head == nullptr) me->tail = x; + x->*next = me->head; + me->head = x; +} + template inline T* ncclIntruQueueDequeue(ncclIntruQueue *me) { T *ans = me->head; @@ -388,45 +413,11 @@ inline T* ncclIntruQueueTryDequeue(ncclIntruQueue *me) { } template -void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *pool) { - T *head = me->head; - me->head = nullptr; - me->tail = nullptr; - while (head != nullptr) { - T *tmp = head->*next; - ncclMemoryPoolFree(pool, tmp); - head = tmp; - } -} - -/* cmp function determines the sequence of objects in the queue. If cmp returns value >= 0, it means a > b, - * and we should put a before b; otherwise, b should be put ahead of a. */ -template -inline void ncclIntruQueueSortEnqueue(ncclIntruQueue *me, T *x, int (*cmp)(T *a, T *b)) { - T *cur = me->head; - T *prev = NULL; - - if (cur == NULL) { - x->*next = nullptr; - me->tail = me->head = x; - } else { - while (cur) { - if (cmp(cur, x) > 0) { - prev = cur; - cur = cur->next; - } else { - break; - } - } - - x->*next = cur; - if (prev) { - prev->*next = x; - if (cur == NULL) me->tail = x; - } else { - me->head = x; - } - } +void ncclIntruQueueTransfer(ncclIntruQueue *dst, ncclIntruQueue *src) { + (dst->tail ? dst->tail->next : dst->head) = src->head; + if (src->tail) dst->tail = src->tail; + src->head = nullptr; + src->tail = nullptr; } //////////////////////////////////////////////////////////////////////////////// diff --git a/src/init.cc b/src/init.cc index cecb9bc05..16e02d49c 100644 --- a/src/init.cc +++ b/src/init.cc @@ -44,6 +44,7 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); NCCL_PARAM(CommBlocking, "COMM_BLOCKING", NCCL_CONFIG_UNDEF_INT); +NCCL_PARAM(RuntimeConnect, "RUNTIME_CONNECT", 1); static ncclResult_t commReclaim(ncclComm_t comm); @@ -71,24 +72,22 @@ ncclResult_t initGdrCopy() { return ncclSuccess; } -pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; -static bool initialized = false; +static ncclResult_t initResult = ncclSuccess; +static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; + +static void initOnceFunc() { + initEnv(); + initGdrCopy(); + // Always initialize bootstrap network + NCCLCHECKGOTO(bootstrapNetInit(), initResult, exit); + + initNvtxRegisteredEnums(); +exit:; +} static ncclResult_t ncclInit() { - if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess; - pthread_mutex_lock(&initLock); - if (!initialized) { - initEnv(); - initGdrCopy(); - // Always initialize bootstrap network - NCCLCHECK(bootstrapNetInit()); - NCCLCHECK(ncclNetPluginInit()); - - initNvtxRegisteredEnums(); - __atomic_store_n(&initialized, true, __ATOMIC_RELEASE); - } - pthread_mutex_unlock(&initLock); - return ncclSuccess; + pthread_once(&initOnceControl, initOnceFunc); + return initResult; } NCCL_API(ncclResult_t, ncclGetVersion, int* version); @@ -172,6 +171,7 @@ void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle) { } static ncclResult_t commFree(ncclComm_t comm) { + int abort = 0; /* commFree() should not involve any sync among ranks. 
*/ if (comm == NULL) return ncclSuccess; @@ -234,8 +234,10 @@ static ncclResult_t commFree(ncclComm_t comm) { ncclMemoryStackDestruct(&comm->memScoped); ncclMemoryStackDestruct(&comm->memPermanent); + abort = *comm->abortFlag; if (ncclAtomicRefCountDecrement(comm->abortFlagRefCount) == 0) { - NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag)); + free(comm->abortFlag); + NCCLCHECK(ncclCudaHostFree((void*)comm->abortFlagDev)); free(comm->abortFlagRefCount); } free((void*)comm->config.netName); @@ -245,7 +247,11 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(ncclRegCleanup(comm)); + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, abort ? "Abort" : "Destroy"); + commPoison(comm); // poison comm before free to avoid comm reuse. + NCCLCHECK(ncclNetFinalize(comm)); + NCCLCHECK(ncclNetPluginUnload(comm)); free(comm); return ncclSuccess; @@ -254,7 +260,9 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCL_PARAM(DisableGraphHelper, "GRAPH_HELPER_DISABLE", 0); // GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); -NCCL_PARAM(WorkFifoDepth, "WORK_FIFO_DEPTH", 64<<10); +#define NCCL_WORK_FIFO_BYTES_DEFAULT (1<<20) +NCCL_PARAM(WorkFifoBytes, "WORK_FIFO_BYTES", NCCL_WORK_FIFO_BYTES_DEFAULT); +NCCL_PARAM(WorkArgsBytes, "WORK_ARGS_BYTES", INT64_MAX); enum ncclLaunchMode ncclParamLaunchMode; NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 1); @@ -281,7 +289,7 @@ static ncclResult_t dmaBufSupported(struct ncclComm* comm) { ncclResult_t ncclCommEnsureReady(ncclComm_t comm) { /* comm must be ready, or error will be reported */ ncclResult_t ret = ncclSuccess; - if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) { + if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { ncclGroupJobAbort(comm->groupJob); } else { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); @@ -318,6 +326,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in comm->rank = rank; comm->nRanks = ndev; + NCCLCHECK(ncclNetPluginLoad(comm)); NCCLCHECK(ncclNetInit(comm)); INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name); @@ -349,9 +358,6 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan); ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp); - ncclMemoryPoolConstruct(&comm->memPool_ncclPointerList); - ncclMemoryPoolConstruct(&comm->memPool_ncclNvlsHandleList); - ncclMemoryPoolConstruct(&comm->memPool_ncclCollnetHandleList); comm->groupNext = reinterpret_cast(0x1); comm->preconnectNext = reinterpret_cast(0x1); @@ -397,6 +403,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { int nRanks = comm->nRanks; struct ncclDevCommAndChannels tmpCommAndChans; struct ncclDevCommAndChannels *devCommAndChans = NULL; + struct ncclNvmlCCStatus ccStatus; NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail); NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); @@ -406,37 +413,51 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { tmpCommAndChans.comm.nRanks = nRanks; tmpCommAndChans.comm.node = comm->node; tmpCommAndChans.comm.nNodes = comm->nNodes; - tmpCommAndChans.comm.abortFlag = comm->abortFlag; + tmpCommAndChans.comm.abortFlag = comm->abortFlagDev; for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) { tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p]; 
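The check in ncclCommEnsureReady above was strengthened from a relaxed to an acquire load of the abort flag. A small, self-contained C11 illustration of the pairing this relies on; the names and the release-side store are assumptions for the example, not NCCL code:

/* An acquire load of a flag pairs with a release store by the thread that
 * raises it, so state written before the abort is visible to the observer. */
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint32_t exampleAbortFlag;

void exampleRaiseAbort(void) {
  /* Writes made before this store are published together with the flag. */
  atomic_store_explicit(&exampleAbortFlag, 1u, memory_order_release);
}

int exampleAborted(void) {
  /* Sees the flag and everything published before the matching store. */
  return atomic_load_explicit(&exampleAbortFlag, memory_order_acquire) != 0;
}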
} tmpCommAndChans.comm.p2pChunkSize = comm->p2pChunkSize; tmpCommAndChans.comm.channels = &devCommAndChans->channels[0]; - comm->workFifoDepth = ncclParamWorkFifoDepth(); - if (0 != (comm->workFifoDepth & (comm->workFifoDepth-1))) { - WARN("NCCL_WORK_FIFO_DEPTH=%d is being ignored because it is not a power of 2.", comm->workFifoDepth); - comm->workFifoDepth = 64<<10; + comm->workArgsBytes = std::min(ncclParamWorkArgsBytes(), ncclMaxKernelArgsSize(comm->cudaArch)); + + memset(&ccStatus, 0, sizeof(ccStatus)); + if (ncclNvmlGetCCStatus(&ccStatus) == ncclSuccess && ccStatus.CCEnabled) { + comm->workFifoBytes = 0; + if (ccStatus.multiGpuCCEnabled == false && comm->rank == 0) { + WARN("CC On, Multi-GPU CC Off (No inter-GPU communication protection)"); + } + } else { + comm->workFifoBytes = ncclParamWorkFifoBytes(); + if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) { + WARN("NCCL_WORK_FIFO_BYTES=%d is being ignored because it is not a power of 2.", comm->workFifoBytes); + comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; + } + comm->workFifoBytes = std::min(comm->workFifoBytes, 1u<<30); + } + + if (comm->rank == 0) { + INFO(NCCL_INIT, "CC %s, Multi-GPU CC %s, workFifoBytes %d", ccStatus.CCEnabled ? "On" : "Off", ccStatus.multiGpuCCEnabled ? "On" : "Off", comm->workFifoBytes); } - tmpCommAndChans.comm.workFifoDepth = comm->workFifoDepth; if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { - // The workFifoHeap lives in GDR mapped CUDA memory. - NCCLCHECKGOTO(ncclGdrCudaCalloc(&comm->workFifoHeap, &comm->devWorkFifoHeap, comm->workFifoDepth, &comm->workFifoHeapGdrHandle), ret, fail); - ncclCommPushCudaGdrFree(comm, comm->workFifoHeapGdrHandle); + // The workFifoBuf lives in GDR mapped CUDA memory. + NCCLCHECKGOTO(ncclGdrCudaCalloc(&comm->workFifoBuf, &comm->workFifoBufDev, comm->workFifoBytes, &comm->workFifoBufGdrHandle), ret, fail); + ncclCommPushCudaGdrFree(comm, comm->workFifoBufGdrHandle); } else { - // The workFifoHeap lives in cudaHost memory. - comm->workFifoHeapGdrHandle = nullptr; - NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoHeap, comm->workFifoDepth), ret, fail); - ncclCommPushCudaHostFree(comm, comm->workFifoHeap); - comm->devWorkFifoHeap = comm->workFifoHeap; + // The workFifoBuf lives in cudaHost memory. 
+ comm->workFifoBufGdrHandle = nullptr; + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoBuf, comm->workFifoBytes), ret, fail); + ncclCommPushCudaHostFree(comm, comm->workFifoBuf); + comm->workFifoBufDev = comm->workFifoBuf; } - tmpCommAndChans.comm.workFifoHeap = comm->devWorkFifoHeap; - NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoDone, MAXCHANNELS), ret, fail); - ncclCommPushCudaHostFree(comm, comm->workFifoDone); - comm->workFifoSent = 0; - comm->workFifoAckdMin = 0; + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoConsumed, MAXCHANNELS), ret, fail); + ncclCommPushCudaHostFree(comm, comm->workFifoConsumed); + comm->workFifoProduced = 0; + comm->workFifoConsumedLeast = 0; + tmpCommAndChans.comm.workConsumed = comm->workFifoConsumed; if (comm->collNetDenseToUserRank != nullptr) { NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); @@ -452,7 +473,6 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { tmpCommAndChans.channels[c].collnetChain = comm->channels[c].collnetChain; tmpCommAndChans.channels[c].collnetDirect = comm->channels[c].collnetDirect; tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls; - tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c]; if (comm->channels[c].ring.userRanks != nullptr) { NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); @@ -471,13 +491,10 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { // Pre-process the string so that running "strings" on the lib can quickly reveal the version. #define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." 
STR(CUDA_MINOR) static void showVersion() { - static int shown = 0; - if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) { - printf("%s\n", VERSION_STRING); - fflush(stdout); - if (ncclDebugFile != stdout) - INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files - shown = 1; + if (ncclDebugLevel == NCCL_LOG_VERSION || ncclDebugLevel == NCCL_LOG_WARN) { + VERSION("%s", VERSION_STRING); + } else { + INFO(NCCL_ALL,"%s", VERSION_STRING); } } @@ -487,6 +504,7 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->nvmlDev = comm->nvmlDev; info->hostHash=getHostHash()+commHash; info->pidHash=getPidHash()+commHash; + info->cuMemSupport = ncclCuMemEnable(); // Get the device MAJOR:MINOR of /dev/shm so we can use that // information to decide whether we can use SHM for inter-process @@ -584,244 +602,6 @@ NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2); NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1); NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0); -static ncclResult_t collNetInitRailRankMap(ncclComm_t comm) { - int rank = comm->rank; - uint64_t nonHeadMask = (1ull << comm->localRanks) - 1; - - comm->collNetDenseToUserRank = ncclMemoryStackAlloc(&comm->memPermanent, comm->nRanks); - comm->collNetUserToDenseRank = ncclMemoryStackAlloc(&comm->memPermanent, comm->nRanks); - // initialize collNetUserToDenseRank[rank] - comm->collNetUserToDenseRank[rank] = -1; - for (int h = 0; h < comm->collNetHeadsNum; h++) { - nonHeadMask ^= 1ull << comm->rankToLocalRank[comm->collNetHeads[h]]; - if (comm->collNetHeads[h] == rank) { comm->collNetUserToDenseRank[rank] = h; break; } - } - if (comm->collNetUserToDenseRank[rank] == -1) { - comm->collNetUserToDenseRank[rank] = __builtin_popcountll(nonHeadMask & ((1ull << comm->localRank) - 1)); - } - comm->collNetUserToDenseRank[rank] += comm->node * comm->localRanks; - - NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->collNetUserToDenseRank, sizeof(int))); - for (int r = 0; r < comm->nRanks; r++) { - comm->collNetDenseToUserRank[comm->collNetUserToDenseRank[r]] = r; - } - return ncclSuccess; -} - -static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* collNetGraph) { - ncclResult_t ret = ncclSuccess; - int rank = comm->rank; - int collNetSetupFail = 0; - int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_P2P }; - // Find all head ranks - int nHeadsUnique = 0; - int* headsUnique = NULL; - int highestTransportType0, highestTransportType1; - char line[1024]; - bool share; - - struct collnetShareInfo { - int headPosition; - int isMaster; - }; - struct collnetShareInfo* infos = NULL; - - NCCLCHECKGOTO(ncclCalloc(&headsUnique, collNetGraph->nChannels), ret, fail); - { uint64_t mask = 0; - // Head GPU index is always 0 - for (int c = 0; c < collNetGraph->nChannels; c++) { - int head = collNetGraph->intra[c * comm->localRanks + 0]; - assert(comm->rankToNode[head] == comm->node); - uint64_t mask0 = mask; - mask |= 1ull<rankToLocalRank[head]; - if (mask != mask0) headsUnique[nHeadsUnique++] = head; - } - } - - comm->collNetHeads = headsUnique; - comm->collNetHeadsNum = nHeadsUnique; - if (parent && parent->collNetSupport && parent->config.splitShare && parent->nNodes == comm->nNodes) { - NCCLCHECKGOTO(ncclCalloc(&infos, comm->nRanks), ret, fail); - /* check whether child can share collnet resources of parent. 
Since parent builds each collnet communicator - * based on heads with the same head position in each node, as long as the collnet heads of child comm - * can match parent's heads, we can let child communicator share parent's collnet resources. */ - for (int h = 0; h < nHeadsUnique; ++h) { - int prev = INT_MIN; - struct collnetShareInfo* myinfo; - - share = true; - myinfo = infos + comm->rank; - memset(myinfo, 0, sizeof(struct collnetShareInfo)); - /* find the child head position in parent collnet heads. */ - if (headsUnique[h] == comm->rank) { - myinfo->headPosition = -1; - myinfo->isMaster = 1; - for (int th = 0; th < parent->collNetHeadsNum; ++th) - if (parent->topParentRanks[parent->collNetHeads[th]] == comm->topParentRanks[comm->rank]) { - myinfo->headPosition = th; - break; - } - } - - NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, infos, sizeof(struct collnetShareInfo)), ret, fail); - for (int i = 0; i < comm->nRanks; ++i) { - if (infos[i].isMaster) { - if (prev == INT_MIN) - prev = infos[i].headPosition; - - if (infos[i].headPosition == -1 || prev != infos[i].headPosition) { - share = false; - break; - } - } - } - - if (share) { - if (myinfo->isMaster) { - comm->collNetSharedRes = parent->collNetSharedRes; - for (int c = 0; c < comm->nChannels; ++c) - NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, true), ret, fail); - } - - NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail); - } else { - /* TODO: CX-6 and CX-7 both do not support multiple sharp resources per process, if child comm cannot - * share the sharp resource from parent, we cannot use sharp in this case. This restriction might be - * lifted by sharp plugin/IB hardware in the future. */ - collNetSetupFail = 1; - if (comm->rank == 0) { - WARN("Child comms (nRanks %d) fails to share parent comms (nRanks %d) sharp resources", comm->nRanks, parent->nRanks); - } - goto fail; - } - } - share = true; - } else { - /* this allocated buffer will be freed on proxy side */ - NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1)); - comm->collNetSharedRes->nChannels = comm->nChannels; - comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE]; - - NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail); - - for (int c = 0; c < comm->nChannels; c++) { - struct ncclChannel* channel = comm->channels + c; - NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, false), ret, fail); - for (int h = 0; h < nHeadsUnique; h++) { - const int head = headsUnique[h]; - ncclConnect connect; - collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv, &connect); - if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend, &connect); - } - // Verify CollNet setup across ranks after trying the first channel - if (c == 0) { - NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); - } - } - share = false; - } - - if (share) { - memcpy(comm->collNetSupportMatrix, parent->collNetSupportMatrix, sizeof(comm->collNetSupportMatrix)); - } else { - do { - /* Initialize all entries in collNetSupportMatrix[redop][type]. Since some - ranks don't connect to sharp we enable a (redop,type) if any rank claims - support. 
*/ - const ncclRedOp_t redops[] = {ncclSum, ncclProd, ncclMin, ncclMax}; - uint8_t(*matrix)[4][ncclNumTypes]; - bool isHead = false; - matrix = nullptr; - NCCLCHECKGOTO(ncclCalloc(&matrix, comm->nRanks), ret, matrix_end); - for (int h = 0; h < nHeadsUnique; h++) isHead |= (headsUnique[h] == comm->rank); - if (isHead) { - for (int ty=0; ty < ncclNumTypes; ty++) { - for (int i=0; i < 4; i++) { - int support = 0; - NCCLCHECKGOTO(collNetReduceSupport(comm, (ncclDataType_t)ty, redops[i], &support), ret, matrix_end); - // bit 0 = not supported, bit 1 = supported - matrix[rank][redops[i]][ty] = 1<<(support ? 1 : 0); - } - } - } - NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, matrix, sizeof(*matrix)), ret, matrix_end); - for (int ty=0; ty < ncclNumTypes; ty++) { - for (int i=0; i < 4; i++) { - int op = redops[i]; - uint8_t accum = 0; - for (int r=0; r < comm->nRanks; r++) accum |= matrix[r][op][ty]; - // We support (redop, type) if some rank supports it and no rank doesn't support it - comm->collNetSupportMatrix[op][ty] = (accum == (1<<1)); - } - } - matrix_end: - free(matrix); - if (ret != ncclSuccess) goto fail; - } while (0); - } - - // Verify CollNet setup across ranks after trying all channels - NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); - TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); - - line[0] = '\0'; - for (int c = 0; c < comm->nChannels; c++) { - struct ncclTree* chain = &comm->channels[c].collnetChain; - snprintf(line + strlen(line), 1023 - strlen(line), " [%d] %d->%d->%d", - c, chain->down[0], rank, chain->up); - } - line[1023] = '\0'; - - INFO(NCCL_INIT, "Collnet Chains %s", line); - // Connect Collnet + chain - for (int c = 0; c < comm->nChannels; c++) { - struct ncclChannel* channel = comm->channels + c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 0), ret, fail); - for (int c = 0; c < comm->nChannels; c++) { - struct ncclChannel* channel = comm->channels + c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 1), ret, fail); - INFO(NCCL_INIT, "Connected collnet + chain"); - - // Connect intra-node CollNet + Direct - for (int c = 0; c < comm->nChannels; c++) { - struct ncclChannel* channelRecv = comm->channels + c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 0, &highestTransportType0), ret, fail); - - for (int c = 0; c < comm->nChannels; c++) { - struct ncclChannel* channelSend = comm->channels + c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 1, &highestTransportType1), ret, fail); - - // Exchange highest intra-node transport type among ranks - // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer - comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? 
highestTransportType0 : highestTransportType1; - if (share) { - comm->intraHighestTransportType = std::max(comm->intraHighestTransportType, parent->intraHighestTransportType); - } - NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail); - for (int i = 0; i < comm->localRanks; i++) { - if (highestTypes[i] > comm->intraHighestTransportType) - comm->intraHighestTransportType = highestTypes[i]; - } - - INFO(NCCL_INIT, "rank %d Connected CollNet", rank); - -exit: - free(infos); - return ret; -fail: - ncclTransportCollNetFree(comm); - comm->collNetSupport = 0; - goto exit; -} - // MNNVL: Flag to indicate whether to enable Multi-Node NVLink NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2); @@ -890,7 +670,16 @@ static int checkMNNVL(struct ncclComm* comm) { } #endif -static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent = NULL) { +#define TIMER_INIT_TOTAL 0 +#define TIMER_INIT_KERNELS 1 +#define TIMER_INIT_BOOTSTRAP 2 +#define TIMER_INIT_ALLGATHER 3 +#define TIMER_INIT_TOPO 4 +#define TIMER_INIT_GRAPHS 5 +#define TIMER_INIT_CONNECT 6 +#define TIMERS_INIT_COUNT 7 + +static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) { // We use 2 AllGathers // 1. { peerInfo, comm, compCap} // 2. { nChannels, graphInfo, topoRanks } @@ -899,11 +688,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p int nranks = comm->nRanks; int nNodes = 1; cpu_set_t affinitySave; - struct ncclTopoGraph ringGraph; - struct ncclTopoGraph treeGraph; - struct ncclTopoGraph collNetGraph; - struct ncclTopoGraph nvlsGraph; - struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph }; + struct ncclTopoGraph* ringGraph = &comm->graphs[NCCL_ALGO_RING]; + struct ncclTopoGraph* treeGraph = &comm->graphs[NCCL_ALGO_TREE]; + struct ncclTopoGraph* collNetChainGraph = &comm->graphs[NCCL_ALGO_COLLNET_CHAIN]; + struct ncclTopoGraph* collNetDirectGraph = &comm->graphs[NCCL_ALGO_COLLNET_DIRECT]; + struct ncclTopoGraph* nvlsGraph = &comm->graphs[NCCL_ALGO_NVLS]; + struct ncclTopoGraph* graphs[] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph }; struct graphInfo { int pattern; @@ -919,6 +709,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p struct allGatherInfo { struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS]; struct ncclTopoRanks topoRanks; + int cpuArch; + int cpuVendor; }; int nChannelsOrig; @@ -932,13 +724,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p int *topParentLocalRanks = NULL; int tpProxyRank; + timers[TIMER_INIT_ALLGATHER] = clockNano(); // AllGather1 - begin NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail); + comm->cuMemSupport = 1; for (int i = 0; i < nranks; i++) { if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++; + if (!comm->peerInfo[i].cuMemSupport) comm->cuMemSupport = 0; if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) { WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA 
device %lx", rank, i, comm->peerInfo[rank].busId); ret = ncclInvalidUsage; @@ -946,6 +741,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } } // AllGather1 - end + timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER]; // MNNVL support if (nNodes > 1 && !checkMNNVL(comm) && ncclParamMNNVLEnable() == 1) { @@ -1008,6 +804,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p comm->intraBarrierGate = 0; } while(0); + timers[TIMER_INIT_TOPO] = clockNano(); // Topo detection / System graph creation NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail); // Compute paths between GPUs and NICs @@ -1018,8 +815,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail); // Init search NCCLCHECKGOTO(ncclTopoSearchInit(comm->topo), ret, fail); + // Decide on comm's CPU architecture. + NCCLCHECKGOTO(ncclTopoComputeCommCPU(comm), ret, fail); // Print final topology NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail); + timers[TIMER_INIT_TOPO] = clockNano() - timers[TIMER_INIT_TOPO]; // Set Affinity to a CPU local the our GPU, so that all memory we allocate // on the host is local. @@ -1043,51 +843,66 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // Determine local Nvls support NCCLCHECK(ncclNvlsInit(comm)); + timers[TIMER_INIT_GRAPHS] = clockNano(); // Get rings and trees - memset(&ringGraph, 0, sizeof(struct ncclTopoGraph)); - ringGraph.id = 0; - ringGraph.pattern = NCCL_TOPO_PATTERN_RING; - ringGraph.minChannels = 1; - ringGraph.maxChannels = MAXCHANNELS/2; - NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &ringGraph), ret, fail); - NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &ringGraph), ret, fail); - - memset(&treeGraph, 0, sizeof(struct ncclTopoGraph)); - treeGraph.id = 1; - treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE; - treeGraph.minChannels = ringGraph.nChannels; - treeGraph.maxChannels = ringGraph.nChannels; - NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &treeGraph), ret, fail); - NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &treeGraph), ret, fail); - - memset(&collNetGraph, 0, sizeof(struct ncclTopoGraph)); - collNetGraph.id = 2; - collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE; - collNetGraph.collNet = 1; - collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels; + memset(ringGraph, 0, sizeof(struct ncclTopoGraph)); + ringGraph->id = 0; + ringGraph->pattern = NCCL_TOPO_PATTERN_RING; + ringGraph->minChannels = 1; + ringGraph->maxChannels = MAXCHANNELS/2; + NCCLCHECKGOTO(ncclTopoCompute(comm->topo, ringGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, ringGraph), ret, fail); + + memset(treeGraph, 0, sizeof(struct ncclTopoGraph)); + treeGraph->id = 1; + treeGraph->pattern = NCCL_TOPO_PATTERN_BALANCED_TREE; + treeGraph->minChannels = ringGraph->nChannels; + treeGraph->maxChannels = ringGraph->nChannels; + NCCLCHECKGOTO(ncclTopoCompute(comm->topo, treeGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, treeGraph), ret, fail); + + memset(collNetChainGraph, 0, sizeof(struct ncclTopoGraph)); + collNetChainGraph->id = 2; + collNetChainGraph->pattern = NCCL_TOPO_PATTERN_TREE; + collNetChainGraph->collNet = 1; + collNetChainGraph->minChannels = ringGraph->nChannels; + collNetChainGraph->maxChannels = ringGraph->nChannels; + + memset(collNetDirectGraph, 0, sizeof(struct ncclTopoGraph)); + collNetDirectGraph->id = 2; + 
collNetDirectGraph->pattern = NCCL_TOPO_PATTERN_COLLNET_DIRECT; + collNetDirectGraph->collNet = 1; + collNetDirectGraph->minChannels = 1; + collNetDirectGraph->maxChannels = MAXCHANNELS; if (comm->collNetSupport) { - NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail); - NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetChainGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetChainGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetDirectGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetDirectGraph), ret, fail); } - memset(&nvlsGraph, 0, sizeof(struct ncclTopoGraph)); - nvlsGraph.id = 3; - nvlsGraph.pattern = NCCL_TOPO_PATTERN_NVLS; - nvlsGraph.minChannels = 1; - nvlsGraph.maxChannels = MAXCHANNELS; + memset(nvlsGraph, 0, sizeof(struct ncclTopoGraph)); + nvlsGraph->id = 3; + nvlsGraph->pattern = NCCL_TOPO_PATTERN_NVLS; + nvlsGraph->minChannels = 1; + nvlsGraph->maxChannels = MAXCHANNELS; if (comm->nvlsSupport) { - NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &nvlsGraph), ret, fail); - NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &nvlsGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoCompute(comm->topo, nvlsGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, nvlsGraph), ret, fail); } + timers[TIMER_INIT_GRAPHS] = clockNano() - timers[TIMER_INIT_GRAPHS]; // Initialize num P2P LL buffers for this communicator comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1; if (comm->rank == ncclParamGraphDumpFileRank()) { - struct ncclTopoGraph* dumpGraphs[4] = { &ringGraph, &treeGraph, &collNetGraph, &nvlsGraph }; - NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 4, dumpGraphs), ret, fail); + struct ncclTopoGraph* dumpGraphs[5] = { ringGraph, treeGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph }; + NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 5, dumpGraphs), ret, fail); } + // Because timers[[TIMER_INIT_ALLGATHER] already contains the timing of the first allgather, + // we temporarily store the start time of the subsequent one in an as-of-yet unused CONNECT timer. + timers[TIMER_INIT_CONNECT] = clockNano(); // AllGather3 - begin NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail); @@ -1102,7 +917,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic; } - comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); + allGather3Data[rank].cpuArch = comm->cpuArch; + allGather3Data[rank].cpuVendor = comm->cpuVendor; + + comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels); NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail); @@ -1122,7 +940,28 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern; } comm->rankToNode[r] = node; + + if (comm->cpuArch != allGather3Data[r].cpuArch && + comm->cpuArch != NCCL_TOPO_CPU_ARCH_MIXED) { + comm->cpuArch = NCCL_TOPO_CPU_ARCH_MIXED; + } + if (comm->cpuVendor != allGather3Data[r].cpuVendor && + comm->cpuVendor != NCCL_TOPO_CPU_VENDOR_MIXED) { + comm->cpuVendor = NCCL_TOPO_CPU_VENDOR_MIXED; + } + } + + // Alert the user to the presence of mixed CPUs. In the past this has caused + // locks in some collective routines. 
This may help debug issues in the future. + if (rank==0) { + if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_MIXED) { + INFO(NCCL_GRAPH, "CPUs with mixed architecture were detected."); + } + if (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) { + INFO(NCCL_GRAPH, "CPUs with mixed vendors were detected."); + } } + // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks, comm->nNodes), ret, fail); NCCLCHECKGOTO(ncclCalloc(&comm->rankToLocalRank, comm->nRanks), ret, fail); @@ -1178,7 +1017,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0; if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0; - comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); + comm->nChannels = treeGraph->nChannels = ringGraph->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels); if (comm->nChannels < nChannelsOrig) { // We started duplicating channels during Preset(), so we need to move the // duplicated channels since we have removed some. @@ -1209,6 +1048,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail); NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail); // AllGather3 - end + timers[TIMER_INIT_ALLGATHER] += clockNano() - timers[TIMER_INIT_CONNECT]; TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels); @@ -1252,133 +1092,146 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } else { NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail); } + + timers[TIMER_INIT_CONNECT] = clockNano(); + do { // Build p2p schedule + int node = comm->node; + int nNodes = comm->nNodes; + int nRanks = comm->nRanks; + int local = comm->localRank; + int nLocals = comm->maxLocalRanks; + struct ncclNodeRanks* nodeRanks = comm->nodeRanks; + bool flat = false; + for (int node = 0; node < nNodes; node++) { + if (nodeRanks[node].localRanks != nLocals) { + flat = true; + nNodes = 1; node = 0; + nLocals = nRanks; local = rank; + break; + } + } + int nNodesPow2 = pow2Up(nNodes); + int nLocalsPow2 = pow2Up(nLocals); + comm->p2pSchedule = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); + comm->planner.peers = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); + uint32_t nodeRound = 0; + uint32_t nodeDelta = 0; + int round = 0; + // When enumerating peer deltas we use the quadratic formula (x*x+x)/2 mod N. + // Since that formula only produces valid permutations when N is a pow of 2, + // we let N = pow2Up(n) and filter out results greater-eq to n. + // Example sequence for 16 ranks: 0, 1, 3, 6, 10, 15, 5, 12, 4, 13, 7, 2, 14, 11, 9, 8 + do { + if (nodeDelta < nNodes) { // Filter nonsensical node deltas + int sendNode = (node + nodeDelta) % nNodes; + int recvNode = (node - nodeDelta + nNodes) % nNodes; + uint32_t localRound = 0; + uint32_t localDelta = 0; + do { + if (localDelta < nLocals) { // Filter nonsensical node-local deltas + int sendLocal = (local + localDelta) % nLocals; + int recvLocal = (local - localDelta + nLocals) % nLocals; + comm->p2pSchedule[round].sendRank = flat ? sendLocal : nodeRanks[sendNode].localRankToRank[sendLocal]; + comm->p2pSchedule[round].recvRank = flat ? 
recvLocal : nodeRanks[recvNode].localRankToRank[recvLocal]; + round += 1; + } + localRound += 1; + localDelta = (localDelta + localRound) & (nLocalsPow2 - 1); // Quadratic update + } while (localRound != nLocalsPow2); + } + nodeRound += 1; + nodeDelta = (nodeDelta + nodeRound) & (nNodesPow2 - 1); // Quadratic update + } while (nodeRound != nNodesPow2); - // Connect with prev/next for each ring - for (int c=0; cnChannels; c++) { - struct ncclChannel* channel = comm->channels+c; - NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail); - if (comm->nRanks == 1) continue; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, fail); - INFO(NCCL_INIT, "Connected all rings"); + if (round != nRanks) { + WARN("P2p schedule creation has bugs."); + ret = ncclInternalError; + goto fail; + } + } while (0); - // Connect Trees - for (int c=0; cnChannels; c++) { - struct ncclChannel* channel = comm->channels+c; - if (comm->nRanks == 1) continue; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail); - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, fail); - INFO(NCCL_INIT, "Connected all trees"); - - // Setup NVLS - NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); - // And NVLS trees if needed - if (comm->nvlsSupport && comm->nNodes > 1) { + comm->runtimeConn = comm->cuMemSupport && ncclParamRuntimeConnect(); + if (comm->runtimeConn) { for (int c=0; cnChannels; c++) { - struct ncclChannel* channel = comm->channels+c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail); - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail); + NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &nvlsGraph, 0), ret, fail); - INFO(NCCL_INIT, "Connected NVLS tree"); - } + // Setup NVLS + NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); + // Check if we can setup CollNet + if (comm->collNetSupport > 0) ncclCollNetSetup(comm, parent, graphs); + } else { + for (int c=0; cnChannels; c++) { + NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail); + } + NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail); - // Check if we can setup CollNet - if (comm->collNetSupport > 0) collNetTrySetup(comm, parent, &collNetGraph); + // Connect Trees + NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail); - TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); + // Setup NVLS + NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); + NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail); - // Compute time models for algorithm and protocol combinations - NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail); + // And NVLS trees if needed + NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail); - INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); + // 
Check if we can setup CollNet + if (comm->collNetSupport > 0) { + ncclCollNetSetup(comm, parent, graphs); + NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail); + NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail); + } - do { // Setup p2p structures in comm->tasks - struct ncclTasks* tasks = &comm->tasks; - int node = comm->node; - int nNodes = comm->nNodes; - struct ncclNodeRanks *nodeRanks = comm->nodeRanks; - int localRank = comm->localRank; - // We want to fuse along node boundaries. Make sure nsteps is a multiple or divides 8. - int steps = ALIGN_POWER(comm->maxLocalRanks, NCCL_MAX_WORK_ELEMENTS_P2P/2); - tasks->p2pOrderSteps = comm->nNodes * steps; - tasks->peers = ncclMemoryStackAlloc(&comm->memPermanent, tasks->p2pOrderSteps); - tasks->p2pSendOrder = ncclMemoryStackAlloc(&comm->memPermanent, tasks->p2pOrderSteps); - tasks->p2pRecvOrder = ncclMemoryStackAlloc(&comm->memPermanent, tasks->p2pOrderSteps); - int i=0; - // schedule delta 0, +1, -1, +2, -2, ... - // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even. - for (int d=0; d <= nNodes/4; d++) { - int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes }; - int index = 0; - int delta = deltas[index]; - sched_delta: - int recvNode = (node+nNodes-delta)%nNodes; - int sendNode = (node+delta)%nNodes; - for (int step=0; step < steps; step++) { - int recvIndex = (localRank-step+steps)%steps; - int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1; - tasks->p2pRecvOrder[i] = recvRank; - int sendIndex = (localRank+step)%steps; - int sendRank = sendIndex < nodeRanks[sendNode].localRanks ? nodeRanks[sendNode].localRankToRank[sendIndex] : -1; - tasks->p2pSendOrder[i] = sendRank; - i++; - } - index++; - if (index == 1 && deltas[1] == deltas[0]) index++; - if (index == 2 && deltas[2] == deltas[0]) index++; - if (index == 3 && deltas[3] == deltas[2]) index++; - if (index == 3 && deltas[3] == deltas[1]) index++; - if (index < 4) { - delta = deltas[index]; - goto sched_delta; + // Connect to local net proxy + tpProxyRank = comm->topParentRanks[comm->rank]; + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); + + // Then to remote ones when using PXN + if (ncclPxnDisable(comm) == 0) { + int nranks; + NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail); + for (int r=0; rtopParentRanks[pxnPeers[r]]; + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); } } - assert(i == tasks->p2pOrderSteps); - } while (0); - if (ncclParamNvbPreconnect()) { - // Connect p2p when using NVB path - int nvbNpeers; - NCCLCHECKGOTO(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers), ret, fail); - for (int r=0; rp2pnChannelsPerPeer; c++) { - NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId), ret, fail); - if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { - comm->connectSend[peer] |= (1UL<p2pnChannelsPerPeer; c++) { - NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId), ret, fail); - if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) { - comm->connectRecv[peer] |= (1UL<topo, comm->rank, 
&nvbNpeers, &nvbPeers), ret, fail); + for (int r=0; rp2pSchedule[sendRound].sendRank != peer) sendRound++; + while (comm->p2pSchedule[recvRound].recvRank != peer) recvRound++; + uint8_t sendBase = ncclP2pChannelBaseForRound(comm, sendRound); + uint8_t recvBase = ncclP2pChannelBaseForRound(comm, recvRound); + for (int c=0; cp2pnChannelsPerPeer; c++) { + int channelId; + channelId = ncclP2pChannelForPart(comm->p2pnChannels, sendBase, c); + if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { + comm->connectSend[peer] |= (1UL<p2pnChannels, recvBase, c); + if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) { + comm->connectRecv[peer] |= (1UL<topParentRanks[comm->rank]; - NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); - NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); + TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); - // Then to remote ones when using PXN - if (ncclPxnDisable(comm) == 0) { - int nranks; - NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail); - for (int r=0; rtopParentRanks[pxnPeers[r]]; - NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); - NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); - } - } + // Compute time models for algorithm and protocol combinations + NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail); + + INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); if (comm->intraRank == 0) { // Load ncclParamLaunchMode const char* str = ncclGetEnv("NCCL_LAUNCH_MODE"); @@ -1399,6 +1252,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock. NCCLCHECKGOTO(devCommSetup(comm), ret, fail); + timers[TIMER_INIT_CONNECT] = clockNano() - timers[TIMER_INIT_CONNECT]; /* Local intra-node barrier */ NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); @@ -1412,7 +1266,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p /* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be * properly cleaned up. 
*/ - if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess) ncclProxyShmUnlink(comm); + if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm); free(allTopoRanks); free(nodesTreePatterns); free(nodesFirstRank); @@ -1507,20 +1361,25 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { int cudaDev = job->cudaDev; int* parentRanks = NULL; int cudaArch; + uint64_t timers[TIMERS_INIT_COUNT]; + timers[TIMER_INIT_TOTAL] = clockNano(); CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail); cudaArch = 100*archMajor + 10*archMinor; + timers[TIMER_INIT_KERNELS] = clockNano(); NCCLCHECK(ncclInitKernelsForDevice(cudaArch, &maxLocalSizeBytes)); // Set the maximum kernel stack size of all kernels to avoid // a CUDA memory reconfig on load (c.f. NVSHMEM issue) if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) { - TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zi", maxLocalSizeBytes); + TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zu", maxLocalSizeBytes); CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, maxLocalSizeBytes)); } + timers[TIMER_INIT_KERNELS] = clockNano() - timers[TIMER_INIT_KERNELS]; + timers[TIMER_INIT_BOOTSTRAP] = clockNano(); if (job->parent) { NCCLCHECKGOTO(ncclCalloc(&parentRanks, job->parent->nRanks), res, fail); NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail); @@ -1533,6 +1392,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail); NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail); } + timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP]; comm->cudaArch = cudaArch; comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); @@ -1545,15 +1405,16 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId)); } - NCCLCHECKGOTO(initTransportsRank(comm, job->parent), res, fail); + NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail); - NCCLCHECKGOTO(ncclTunerPluginLoad(&comm->tuner), res, fail); + NCCLCHECKGOTO(ncclTunerPluginLoad(comm), res, fail); if (comm->tuner) { NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog, &comm->tunerContext)); } // update communicator state comm->initState = ncclSuccess; + timers[TIMER_INIT_TOTAL] = clockNano() - timers[TIMER_INIT_TOTAL]; // Trace this call for replay tool if (job->parent) { @@ -1573,6 +1434,9 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId)); } + INFO(NCCL_INIT|NCCL_PROFILE,"Init timings: rank %d nranks %d total %.2f (kernels %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, connections %.2f, rest %.2f)", comm->rank, comm->nRanks, timers[TIMER_INIT_TOTAL]/1e9, + timers[TIMER_INIT_KERNELS]/1e9, 
timers[TIMER_INIT_BOOTSTRAP]/1e9, timers[TIMER_INIT_ALLGATHER]/1e9, timers[TIMER_INIT_TOPO]/1e9, timers[TIMER_INIT_GRAPHS]/1e9, timers[TIMER_INIT_CONNECT]/1e9, + (timers[TIMER_INIT_TOTAL]-timers[TIMER_INIT_KERNELS]-timers[TIMER_INIT_BOOTSTRAP]-timers[TIMER_INIT_ALLGATHER]-timers[TIMER_INIT_TOPO]-timers[TIMER_INIT_GRAPHS]-timers[TIMER_INIT_CONNECT])/1e9); exit: if (job->newcomm) { /* assign it to user pointer. */ @@ -1658,7 +1522,7 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { } if (comm->config.splitShare != 1 && comm->config.splitShare != 0) { - WARN("splitShare %d is not a valid value 0/1, set it to 0\n", comm->config.splitShare); + WARN("splitShare %d is not a valid value 0/1, set it to 0", comm->config.splitShare); comm->config.splitShare = 0; } @@ -1679,6 +1543,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { ncclConfig_t *internalConfigPtr; size_t realSize; + internalConfig.magic = 0; internalConfigPtr = &internalConfig; if (config) { memcpy((void*)&realSize, (void*)config, sizeof(size_t)); @@ -1767,8 +1632,10 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni } NCCLCHECKGOTO(ncclInit(), res, fail); - if (myrank == 0) showVersion(); - + if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel != NCCL_LOG_NONE && myrank == 0)) { + static pthread_once_t once = PTHREAD_ONCE_INIT; + pthread_once(&once, showVersion); + } // Make sure the CUDA runtime is initialized. CUDACHECKGOTO(cudaFree(NULL), res, fail); @@ -1781,9 +1648,10 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni } NCCLCHECKGOTO(ncclCalloc(&comm, 1), res, fail); + NCCLCHECKGOTO(ncclCalloc(&comm->abortFlag, 1), res, fail); + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->abortFlagDev, 1), res, fail); + NCCLCHECKGOTO(ncclCalloc(&comm->abortFlagRefCount, 1), res, fail); comm->startMagic = comm->endMagic = NCCL_MAGIC; // Used to detect comm corruption. - NCCLCHECKGOTO(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1), res, fail); - NCCLCHECKGOTO(ncclCalloc((uint32_t**)&comm->abortFlagRefCount, 1), res, fail); *comm->abortFlagRefCount = 1; NCCLCHECKGOTO(parseCommConfig(comm, config), res, fail); /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */ @@ -1802,8 +1670,9 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni return ncclGroupErrCheck(res); fail: if (comm) { - if (comm->abortFlag) ncclCudaHostFree((void *)comm->abortFlag); - if (comm->abortFlagRefCount) free(comm->abortFlagRefCount); + free(comm->abortFlag); + if (comm->abortFlagDev) ncclCudaHostFree((void*)comm->abortFlagDev); + free(comm->abortFlagRefCount); free(comm); } if (newcomm) *newcomm = NULL; @@ -1951,18 +1820,21 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { if (comm->initState == ncclSuccess) { NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->hostStream), ret, fail); NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), ret, fail); + NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); + // And keep polling until all graphs referencing us die. + while (comm->persistentRefs != 0) { + NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail); + } } - NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); - // And keep polling until all graphs referencing us die. 
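// Illustrative sketch (not part of the patch): the abort-flag layout used in this file,
// shown standalone. A plain host-heap flag is paired with a pinned, device-visible flag
// and a shared reference count, so several communicators (for example after a split) can
// point at the same trio and only the last owner releases the memory. The abort is set
// with release stores and checked with acquire loads. All names here are hypothetical.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdlib>
struct AbortFlags {
  uint32_t* host;      // read by host code with __atomic_load_n(..., __ATOMIC_ACQUIRE)
  uint32_t* dev;       // pinned mapped memory, polled by device code
  int* refCount;       // number of communicators sharing this pair
};
static bool abortFlagsCreate(AbortFlags* f) {
  f->host = (uint32_t*)calloc(1, sizeof(uint32_t));
  f->refCount = (int*)calloc(1, sizeof(int));
  if (!f->host || !f->refCount) return false;
  if (cudaHostAlloc((void**)&f->dev, sizeof(uint32_t), cudaHostAllocMapped) != cudaSuccess) return false;
  *f->dev = 0;
  *f->refCount = 1;
  return true;
}
static void abortFlagsTrigger(AbortFlags* f) {   // request an abort: set both sides
  __atomic_store_n(f->host, 1u, __ATOMIC_RELEASE);
  __atomic_store_n(f->dev, 1u, __ATOMIC_RELEASE);
}
static void abortFlagsRelease(AbortFlags* f) {   // drop one owner; the last one frees
  if (__atomic_sub_fetch(f->refCount, 1, __ATOMIC_ACQ_REL) == 0) {
    free(f->host);
    cudaFreeHost(f->dev);
    free(f->refCount);
  }
}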
- while (comm->persistentRefs != 0) { - NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail); + + if ((ret = ncclProxyStop(comm)) != ncclSuccess) { + WARN("ncclProxyStop: comm %p (rank = %d) destroys proxy resource error %d", comm, comm->rank, ret); } if (savedDevice != commDevice) { CUDACHECKGOTO(cudaSetDevice(savedDevice), ret, fail); } - comm->finalizeCalled = true; exit: return ret; fail: @@ -1980,7 +1852,7 @@ static ncclResult_t commCleanup(ncclComm_t comm) { if (comm->tuner != NULL) { NCCLCHECK(comm->tuner->destroy(comm->tunerContext)); - NCCLCHECK(ncclTunerPluginUnload(&comm->tuner)); + NCCLCHECK(ncclTunerPluginUnload(comm)); } NCCLCHECK(commFree(comm)); @@ -1992,31 +1864,11 @@ static ncclResult_t commCleanup(ncclComm_t comm) { return ncclSuccess; } -static ncclResult_t commFinalize(ncclComm_t comm, bool userCalled) { - ncclResult_t ret = ncclSuccess; - struct ncclCommFinalizeAsyncJob *job = NULL; - - /* launch async thread to finalize comm. */ - NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); - job->comm = comm; - - if (userCalled) { - NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commDestroySync, NULL, free, comm), ret, fail); - } else { - NCCLCHECKGOTO(commDestroySync(&job->base), ret, fail); - free(job); - } - -exit: - return ncclGroupErrCheck(ret); -fail: - goto exit; -} - NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm); ncclResult_t ncclCommFinalize(ncclComm_t comm) { NVTX3_FUNC_RANGE_IN(nccl_domain); ncclResult_t ret = ncclSuccess; + struct ncclCommFinalizeAsyncJob *job = NULL; NCCLCHECK(ncclGroupStartInternal()); if (comm == NULL) goto exit; @@ -2030,8 +1882,11 @@ ncclResult_t ncclCommFinalize(ncclComm_t comm) { goto fail; } - /* finalize comm. */ - ret = commFinalize(comm, true); + comm->finalizeCalled = true; + /* launch async thread to finalize comm. */ + NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); + job->comm = comm; + NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commDestroySync, NULL, free, comm), ret, fail); exit: ncclGroupErrCheck(ret); @@ -2043,21 +1898,14 @@ ncclResult_t ncclCommFinalize(ncclComm_t comm) { goto exit; } -static ncclResult_t commReclaim(ncclComm_t comm) { +static ncclResult_t commReclaim(struct ncclAsyncJob* job_) { + struct ncclCommFinalizeAsyncJob* job = (struct ncclCommFinalizeAsyncJob*) job_; + ncclComm_t comm = job->comm; ncclResult_t ret = ncclSuccess; - ncclResult_t state; - int curRank; /* Debug info */ - - NCCLCHECKGOTO(ncclCommGetAsyncError(comm, &state), ret, fail); - TRACE(NCCL_INIT, "commReclaim: reclaim comm %p rank %d state %d", comm, comm->rank, state); - if (state == ncclSuccess && __atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 0 && comm->finalizeCalled == false) { - /* user does not call ncclCommFinalize and this is a normal comm destroy. ncclCommDestroy - * should be nonblocking until last call of ncclCommDestroy. */ - NCCLCHECKGOTO(commFinalize(comm, false), ret, fail); - } if (comm->intraComm0 != NULL) { int curRankCnt; + int curRank; /* Debug info */ int intraRanks = comm->intraRanks; ncclComm_t intracomm0 = comm->intraComm0; int *finalizeRankCnt = &intracomm0->finalizeRankCnt; @@ -2080,30 +1928,7 @@ static ncclResult_t commReclaim(ncclComm_t comm) { job.comm = curIntraComm; /* every comm aborts, commDestroySync should not be blocked. */ if ((ret = commDestroySync((struct ncclAsyncJob*) &job)) != ncclSuccess) - WARN("commReclaim: comm %p (rank = %d) in abort, error %d", curIntraComm, curRank, ret); - } - } - - /* ncclProxyStop() loop must be put after commDestroySync() loop. 
Namely, you cannot do: - * while(...) { - * commDestroySync(...); - * ncclProxyStop(...); - * } - * Considering one process multi-gpu case, we must guarantee all kernels are complete before - * we free proxy resources; otherwise, we will face invalid memory issues where proxy connection - * and related intermediate memory from one rank are freed but other ranks are still using it. - * This is not a problem for multi-process case, since intermediate memory is opened by CUDA IPC - * or mmap where memory free is guarded by CUDA driver and operating system, so we will not have - * invalid memory access issue. */ - nextIntraComm = intracomm0; - while (nextIntraComm) { - curIntraComm = nextIntraComm; - curRank = curIntraComm->rank; - nextIntraComm = nextIntraComm->intraNext; - - /* free intraprocess proxy resources. */ - if ((ret = ncclProxyStop(curIntraComm)) != ncclSuccess) { - WARN("commReclaim: comm %p (rank = %d) destroys proxy resource error %d", curIntraComm, curRank, ret); + WARN("commReclaim: comm %p (rank = %d) in commDestroySync, error %d", curIntraComm, curRank, ret); } } @@ -2121,10 +1946,7 @@ static ncclResult_t commReclaim(ncclComm_t comm) { } } -exit: return ret; -fail: - goto exit; } NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm); @@ -2135,25 +1957,31 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { } int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; + struct ncclCommFinalizeAsyncJob *job = NULL; + ncclResult_t res = ncclSuccess; NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; NVTX3_FUNC_WITH_PARAMS(CommDestroy, CommInitRankSchema, payload) - int64_t busId = comm->busId; - TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId); + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); // Try and prevent a double free of the comm struct (user error) if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) { WARN("comm %p has already been destroyed", comm); return ncclInvalidArgument; } + comm->destroyFlag = 1; /* init thread must be joined before we destroy the comm. 
*/ NCCLCHECK(ncclCommEnsureReady(comm)); + NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); + job->comm = comm; + NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail); - NCCLCHECK(commReclaim(comm)); - INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Destroy COMPLETE", comm, rank, nranks, cudaDev, busId); - - return ncclSuccess; +exit: + return res; +fail: + free(job); + goto exit; } NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); @@ -2163,29 +1991,36 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { return ncclSuccess; } - volatile uint32_t* childAbortFlag; int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; + struct ncclCommFinalizeAsyncJob *job = NULL; + ncclResult_t res = ncclSuccess; NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload) - int64_t busId = comm->busId; - TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId); + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); // Ask anything that might still be running on the device to quit - childAbortFlag = __atomic_load_n(&comm->childAbortFlag, __ATOMIC_ACQUIRE); - if (childAbortFlag != NULL) { - __atomic_store_n(childAbortFlag, 1, __ATOMIC_RELAXED); + if (comm->childAbortFlag != nullptr) { + __atomic_store_n(comm->childAbortFlag, 1, __ATOMIC_RELEASE); + __atomic_store_n(comm->childAbortFlagDev, 1, __ATOMIC_RELEASE); } - __atomic_store_n(comm->abortFlag, 1, __ATOMIC_RELAXED); + __atomic_store_n(comm->abortFlag, 1, __ATOMIC_RELEASE); + __atomic_store_n(comm->abortFlagDev, 1, __ATOMIC_RELEASE); + comm->destroyFlag = 1; /* init thread must be joined before we destroy the comm, * and we should ignore the init error here. */ ncclCommEnsureReady(comm); - (void) commReclaim(comm); - INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Abort COMPLETE", comm, rank, nranks, cudaDev, busId); + NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); + job->comm = comm; + NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail); +exit: return ncclSuccess; +fail: + free(job); + goto exit; } NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config); @@ -2208,14 +2043,17 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc childComm->startMagic = childComm->endMagic = NCCL_MAGIC; if (comm->config.splitShare) { childComm->abortFlag = comm->abortFlag; + childComm->abortFlagDev = comm->abortFlagDev; childComm->abortFlagRefCount = comm->abortFlagRefCount; comm->childAbortFlag = NULL; ncclAtomicRefCountIncrement(comm->abortFlagRefCount); } else { - NCCLCHECKGOTO(ncclCudaHostCalloc((uint32_t**)&childComm->abortFlag, 1), res, fail); - NCCLCHECKGOTO(ncclCalloc((uint32_t**)&childComm->abortFlagRefCount, 1), res, fail); + NCCLCHECKGOTO(ncclCalloc(&childComm->abortFlag, 1), res, fail); + NCCLCHECKGOTO(ncclCudaHostCalloc(&childComm->abortFlagDev, 1), res, fail); + NCCLCHECKGOTO(ncclCalloc(&childComm->abortFlagRefCount, 1), res, fail); /* temporarily used to abort everything during child comm init. 
*/ comm->childAbortFlag = childComm->abortFlag; + comm->childAbortFlagDev = childComm->abortFlagDev; *childComm->abortFlagRefCount = 1; } if (config == NULL) { @@ -2244,8 +2082,9 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc fail: if (childComm) { if (comm && !comm->config.splitShare) { - if (childComm->abortFlag) ncclCudaHostFree((void*)childComm->abortFlag); - if (childComm->abortFlagRefCount) free(childComm->abortFlagRefCount); + free(childComm->abortFlag); + if (childComm->abortFlagDev) ncclCudaHostFree(childComm->abortFlagDev); + free(childComm->abortFlagRefCount); } free(childComm); } diff --git a/src/init_nvtx.cc b/src/init_nvtx.cc index 44face681..1cb1277d2 100644 --- a/src/init_nvtx.cc +++ b/src/init_nvtx.cc @@ -2,11 +2,11 @@ #include "nvtx.h" static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = { - {"Sum", ncclSum}, - {"Product", ncclProd}, - {"Max", ncclMax}, - {"Min", ncclMin}, - {"Avg", ncclAvg} + {"Sum", ncclSum, 0}, + {"Product", ncclProd, 0}, + {"Max", ncclMax, 0}, + {"Min", ncclMin, 0}, + {"Avg", ncclAvg, 0} }; // Must be called before the first call to any reduction operation. @@ -19,7 +19,8 @@ void initNvtxRegisteredEnums() { .entries = NvtxEnumRedSchema, .numEntries = std::extent::value, .sizeOfEnum = sizeof(ncclRedOp_t), - .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP + .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP, + .extension = nullptr }; nvtxPayloadEnumRegister(nvtx3::domain::get(), &eAttr); diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc index 59023ae79..6ed5db27a 100644 --- a/src/misc/argcheck.cc +++ b/src/misc/argcheck.cc @@ -52,8 +52,6 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) { WARN("%s : invalid type %d", info->opName, info->datatype); return ncclInvalidArgument; } - // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars. 
- NCCLCHECK(ncclInfoSetDerived(info, info->comm->nRanks)); if (info->op < 0 || ncclMaxRedOp < info->op) { WARN("%s : invalid reduction operation %d", info->opName, info->op); diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc index 6f5badfd8..d44c06355 100644 --- a/src/misc/cudawrap.cc +++ b/src/misc/cudawrap.cc @@ -59,6 +59,10 @@ DECLARE_CUDA_PFN(cuGetErrorString); DECLARE_CUDA_PFN(cuGetErrorName); /* enqueue.cc */ DECLARE_CUDA_PFN(cuMemGetAddressRange); +DECLARE_CUDA_PFN(cuLaunchKernel); +#if CUDA_VERSION >= 11080 +DECLARE_CUDA_PFN(cuLaunchKernelEx); +#endif /* proxy.cc */ DECLARE_CUDA_PFN(cuCtxCreate); DECLARE_CUDA_PFN(cuCtxDestroy); @@ -137,6 +141,10 @@ static ncclResult_t cudaPfnFuncLoader(void) { LOAD_SYM(cuCtxGetCurrent, 1); LOAD_SYM(cuCtxSetCurrent, 1); LOAD_SYM(cuCtxGetDevice, 1); + LOAD_SYM(cuLaunchKernel, 1); +#if CUDA_VERSION >= 11080 + LOAD_SYM(cuLaunchKernelEx, 1); +#endif /* cuMem API support */ LOAD_SYM(cuMemAddressReserve, 1); LOAD_SYM(cuMemAddressFree, 1); diff --git a/src/misc/gdrwrap.cc b/src/misc/gdrwrap.cc index 4729efe2e..3b46759c6 100644 --- a/src/misc/gdrwrap.cc +++ b/src/misc/gdrwrap.cc @@ -130,7 +130,7 @@ ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint6 int ret; GDRLOCKCALL(gdr_internal_pin_buffer(g, addr, size, p2p_token, va_space, handle), ret); if (ret != 0) { - WARN("gdr_pin_buffer(addr %lx, size %zi) failed: %d", addr, size, ret); + WARN("gdr_pin_buffer(addr %lx, size %zu) failed: %d", addr, size, ret); return ncclSystemError; } return ncclSuccess; @@ -172,7 +172,7 @@ ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) { int ret; GDRLOCKCALL(gdr_internal_map(g, handle, va, size), ret); if (ret != 0) { - WARN("gdr_map(handle %lx, size %zi) failed: %d", handle.h, size, ret); + WARN("gdr_map(handle %lx, size %zu) failed: %d", handle.h, size, ret); return ncclSystemError; } return ncclSuccess; @@ -186,7 +186,7 @@ ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) { int ret; GDRLOCKCALL(gdr_internal_unmap(g, handle, va, size), ret); if (ret != 0) { - WARN("gdr_unmap(handle %lx, va %p, size %zi) failed: %d", handle.h, va, size, ret); + WARN("gdr_unmap(handle %lx, va %p, size %zu) failed: %d", handle.h, va, size, ret); return ncclSystemError; } return ncclSuccess; @@ -218,7 +218,7 @@ ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const vo int ret; GDRLOCKCALL(gdr_internal_copy_to_mapping(handle, map_d_ptr, h_ptr, size), ret); if (ret != 0) { - WARN("gdr_copy_to_mapping(handle %lx, map_d_ptr %p, h_ptr %p, size %zi) failed: %d", handle.h, map_d_ptr, h_ptr, size, ret); + WARN("gdr_copy_to_mapping(handle %lx, map_d_ptr %p, h_ptr %p, size %zu) failed: %d", handle.h, map_d_ptr, h_ptr, size, ret); return ncclSystemError; } return ncclSuccess; @@ -232,7 +232,7 @@ ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void int ret; GDRLOCKCALL(gdr_internal_copy_from_mapping(handle, h_ptr, map_d_ptr, size), ret); if (ret != 0) { - WARN("gdr_copy_from_mapping(handle %lx, h_ptr %p, map_d_ptr %p, size %zi) failed: %d", handle.h, h_ptr, map_d_ptr, size, ret); + WARN("gdr_copy_from_mapping(handle %lx, h_ptr %p, map_d_ptr %p, size %zu) failed: %d", handle.h, h_ptr, map_d_ptr, size, ret); return ncclSystemError; } return ncclSuccess; diff --git a/src/misc/ipcsocket.cc b/src/misc/ipcsocket.cc index fc7fd4b66..db61b3149 100644 --- a/src/misc/ipcsocket.cc +++ b/src/misc/ipcsocket.cc @@ -132,7 +132,7 @@ ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket 
*handle, void *hdr, int hdrLen, WARN("UDS: Receiving data over socket failed : %d", errno); return ncclSystemError; } - if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError; + if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError; } if (recvFd != NULL) { @@ -221,7 +221,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno); return ncclSystemError; } - if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError; + if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError; } return ncclSuccess; diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc index 76c989e76..a2b0be0df 100644 --- a/src/misc/nvmlwrap.cc +++ b/src/misc/nvmlwrap.cc @@ -41,11 +41,19 @@ namespace { NCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values)) // MNNVL support NCCL_NVML_FN(nvmlDeviceGetGpuFabricInfoV, nvmlReturn_t, (nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo)) + // CC support + NCCL_NVML_FN(nvmlSystemGetConfComputeState, nvmlReturn_t, (nvmlConfComputeSystemState_t *state)); + NCCL_NVML_FN(nvmlSystemGetConfComputeSettings, nvmlReturn_t, (nvmlSystemConfComputeSettings_t *setting)); std::mutex lock; // NVML has had some thread safety bugs bool initialized = false; thread_local bool threadInitialized = false; ncclResult_t initResult; + + union nvmlCCInfoInternal { + nvmlConfComputeSystemState_t settingV12020; + nvmlSystemConfComputeSettings_t settingV12040; + }; } ncclResult_t ncclNvmlEnsureInitialized() { @@ -87,6 +95,9 @@ ncclResult_t ncclNvmlEnsureInitialized() { {(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"}, // MNNVL support {(void**)&pfn_nvmlDeviceGetGpuFabricInfoV, "nvmlDeviceGetGpuFabricInfoV"}, + // CC support + {(void**)&pfn_nvmlSystemGetConfComputeState, "nvmlSystemGetConfComputeState"}, + {(void**)&pfn_nvmlSystemGetConfComputeSettings, "nvmlSystemGetConfComputeSettings"} }; for(Symbol sym: symbols) { *sym.ppfn = dlsym(libhandle, sym.name); @@ -282,3 +293,33 @@ ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricI NVMLTRY(nvmlDeviceGetGpuFabricInfoV, device, gpuFabricInfo); return ncclSuccess; } + +ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + std::lock_guard locked(lock); + nvmlCCInfoInternal ccInfo; + if (pfn_nvmlSystemGetConfComputeSettings != NULL) { + ccInfo.settingV12040.version = nvmlSystemConfComputeSettings_v1; + NVMLTRY(nvmlSystemGetConfComputeSettings, &ccInfo.settingV12040); + if (ccInfo.settingV12040.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED) + status->CCEnabled = true; + else + status->CCEnabled = false; + + if (ccInfo.settingV12040.multiGpuMode == NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE) + status->multiGpuCCEnabled = true; + else + status->multiGpuCCEnabled = false; + } else if (pfn_nvmlSystemGetConfComputeState != NULL) { + NVMLTRY(nvmlSystemGetConfComputeState, &ccInfo.settingV12020); + if (ccInfo.settingV12020.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED) + status->CCEnabled = true; + else + status->CCEnabled = false; + status->multiGpuCCEnabled = false; + } else { + status->CCEnabled = false; + status->multiGpuCCEnabled = false; + } + return ncclSuccess; +} diff --git a/src/misc/param.cc 
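A caller-side sketch of the new Confidential Computing query added above (only ncclNvmlGetCCStatus() and the two fields it fills are taken from the code; the surrounding usage is hypothetical):

    struct ncclNvmlCCStatus ccStatus = {};
    if (ncclNvmlGetCCStatus(&ccStatus) == ncclSuccess) {
      // CCEnabled: system-wide Confidential Computing is enabled.
      // multiGpuCCEnabled: protected multi-GPU PCIe mode is also enabled.
      if (ccStatus.CCEnabled)        { /* take the CC-aware path */ }
      if (ccStatus.multiGpuCCEnabled) { /* multi-GPU protected PCIe */ }
    }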
b/src/misc/param.cc index e0b6ab821..2248be980 100644 --- a/src/misc/param.cc +++ b/src/misc/param.cc @@ -84,4 +84,4 @@ const char *ncclGetEnv(const char *name) { static pthread_once_t once = PTHREAD_ONCE_INIT; pthread_once(&once, initEnv); return getenv(name); -} \ No newline at end of file +} diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc index 04f7c10be..a48164337 100644 --- a/src/misc/shmutils.cc +++ b/src/misc/shmutils.cc @@ -63,13 +63,28 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de * goes down to 0, unlink should be called in order to delete shared memory file. */ if (shmPath[0] == '\0') { sprintf(shmPath, "/dev/shm/nccl-XXXXXX"); + retry_mkstemp: fd = mkstemp(shmPath); + if (fd < 0) { + if (errno == EINTR) { + INFO(NCCL_ALL, "mkstemp: Failed to create %s, error: %s (%d) - retrying", shmPath, strerror(errno), errno); + goto retry_mkstemp; + } + WARN("Error: failed to create shared memory file %p, error %s (%d)", shmPath, strerror(errno), errno); + ret = ncclSystemError; + goto fail; + } } else { SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail); } + retry_fallocate: if (fallocate(fd, 0, 0, realShmSize) != 0) { - WARN("Error: failed to extend %s to %ld bytes", shmPath, realShmSize); + if (errno == EINTR) { + INFO(NCCL_ALL, "fallocate: Failed to extend %s to %ld bytes, error: %s (%d) - retrying", shmPath, realShmSize, strerror(errno), errno); + goto retry_fallocate; + } + WARN("Error: failed to extend %s to %ld bytes, error: %s (%d)", shmPath, realShmSize, strerror(errno), errno); ret = ncclSystemError; goto fail; } @@ -80,7 +95,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de hptr = (char*)mmap(NULL, realShmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (hptr == MAP_FAILED) { - WARN("Could not map %s size %zi, error: %s", shmPath, realShmSize, strerror(errno)); + WARN("Error: Could not map %s size %zu, error: %s (%d)", shmPath, realShmSize, strerror(errno), errno); ret = ncclSystemError; hptr = NULL; goto fail; @@ -93,7 +108,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de if (remref == 0) { /* the last peer has completed attachment, it should unlink the shm mem file. */ if (unlink(shmPath) != 0) { - WARN("unlink shared memory %s failed, error: %s", shmPath, strerror(errno)); + INFO(NCCL_ALLOC, "unlink shared memory %s failed, error: %s (%d)", shmPath, strerror(errno), errno); } } } @@ -110,7 +125,8 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de *handle = (ncclShmHandle_t)tmphandle; return ret; fail: - WARN("Error while %s shared memory segment %s (size %ld)", create ? "creating" : "attaching to", shmPath, shmSize); + WARN("Error while %s shared memory segment %s (size %ld), error: %s (%d)", create ? 
"creating" : "attaching to", + shmPath, shmSize, strerror(errno), errno); if (tmphandle) { shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle); ncclShmClose((ncclShmHandle_t)tmphandle); @@ -129,7 +145,7 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle) { close(tmphandle->fd); if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) { if (unlink(tmphandle->shmPath) != 0) { - WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno)); + WARN("unlink shared memory %s failed, error: %s (%d)", tmphandle->shmPath, strerror(errno), errno); ret = ncclSystemError; } } @@ -139,7 +155,7 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle) { if (tmphandle->shmPtr) { if (tmphandle->devShmPtr) CUDACHECK(cudaHostUnregister(tmphandle->shmPtr)); if (munmap(tmphandle->shmPtr, tmphandle->realShmSize) != 0) { - WARN("munmap of shared memory %p size %ld failed, error: %s", tmphandle->shmPtr, tmphandle->realShmSize, strerror(errno)); + WARN("munmap of shared memory %p size %ld failed, error: %s (%d)", tmphandle->shmPtr, tmphandle->realShmSize, strerror(errno), errno); ret = ncclSystemError; } } @@ -152,9 +168,9 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) { ncclResult_t ret = ncclSuccess; struct shmHandleInternal* tmphandle = (struct shmHandleInternal*)handle; if (tmphandle) { - if (tmphandle->shmPath != NULL) { + if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) { if (unlink(tmphandle->shmPath) != 0) { - WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno)); + WARN("unlink shared memory %s failed, error: %s (%d)", tmphandle->shmPath, strerror(errno), errno); ret = ncclSystemError; } free(tmphandle->shmPath); @@ -184,7 +200,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff uint64_t t0 = clockNano(); while(__atomic_load_n(shmem->cnt[curRound], __ATOMIC_ACQUIRE) != comm->localRanks + 1) { if (clockNano() - t0 >= 5 * 1000) sched_yield(); - if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 1) { + if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE) == 1) { ret = ncclInternalError; goto exit; } diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 6e9fb0790..9ade0e41d 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -34,7 +34,7 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr } } (*offset) += bytes; - if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED)) { + if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) { INFO(NCCL_NET, "socketProgressOpt: abort called"); return ncclInternalError; } @@ -620,12 +620,12 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { do { NCCLCHECK(socketProgressState(sock)); } while (sock->asyncFlag == 0 && - (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED) == 0) && + (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE) == 0) && (sock->state == ncclSocketStateConnecting || sock->state == ncclSocketStateConnectPolling || sock->state == ncclSocketStateConnected)); - if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError; + if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError; switch (sock->state) { case ncclSocketStateConnecting: @@ -667,11 +667,11 @@ ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* 
listen do { NCCLCHECKGOTO(socketProgressState(sock), ret, exit); } while (sock->asyncFlag == 0 && - (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED) == 0) && + (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE) == 0) && (sock->state == ncclSocketStateAccepting || sock->state == ncclSocketStateAccepted)); - if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError; + if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError; switch (sock->state) { case ncclSocketStateAccepting: diff --git a/src/misc/tuner.cc b/src/misc/tuner.cc index ae6ade32f..608062bcc 100644 --- a/src/misc/tuner.cc +++ b/src/misc/tuner.cc @@ -9,117 +9,150 @@ #include #include +#include "checks.h" #include "debug.h" -#include "nccl_tuner.h" +#include "tuner.h" pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; static int tunerPluginRefCount; static void* tunerPluginLib = nullptr; -ncclTuner_t* tunerSymbol = nullptr; +static ncclTuner_v3_t* tunerSymbol = nullptr; +static ncclTuner_v2_t* ncclTuner_v2 = nullptr; +static ncclTuner_v3_t ncclTuner_v2_as_v3; + +static int hasNvlsSupport(float** collCostTable) { + // Requirements for support of different algorithms: + // + // - NVLS intra-node: nvlsSupport + // - NVLS intra+inter-node: collNetSupport + // - NVLSTree intra-node: always disabled + // - NVLSTree inter-node: nvlsSupport + // - Collnet* inter-node: collNetSupport + // + // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1 + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0; +} + +static int hasCollNetSupport(float** collCostTable) { + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 
0 : 1; +} + +static ncclResult_t ncclTuner_v2_as_v3_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int* nChannels) { + int algorithm = NCCL_ALGO_UNDEF; + int protocol = NCCL_PROTO_UNDEF; + int nvlsSupport = hasNvlsSupport(collCostTable); + int collNetSupport = hasCollNetSupport(collCostTable); + NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels)); + // set time to 0 below to make sure this algorithm/protocol is selected later on + if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) { + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0; + } + return ncclSuccess; +} -static void* tryOpenDynamicLib(const char* name) { +static ncclResult_t ncclTuner_v2_as_v3_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { + NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logFunction, context)); + ncclTuner_v2_as_v3.name = ncclTuner_v2->name; + ncclTuner_v2_as_v3.getCollInfo = ncclTuner_v2_as_v3_getCollInfo; + ncclTuner_v2_as_v3.destroy = ncclTuner_v2->destroy; + return ncclSuccess; +} + +#define MAX_STR_LEN 255 + +static void* tryOpenLib(const char* name, int* err, char* errStr) { + *err = 0; if (nullptr == name || strlen(name) == 0) { return nullptr; } + + if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { + name = nullptr; + } + void *handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL); if (nullptr == handle) { - if (ENOENT == errno) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: No plugin found (%s)", name); + strncpy(errStr, dlerror(), MAX_STR_LEN); + errStr[MAX_STR_LEN] = '\0'; + if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { + *err = ENOENT; } } return handle; } -static void summarizeOpenTunerPluginLibErrors(char* pluginNames) { - const char *separator = " "; - int len = strlen(pluginNames); - // remove tail separator - pluginNames[len - 1] = '\0'; - - // remove last plugin name - while (len > 0 && pluginNames[--len] != *separator); - if (len > 0) { - pluginNames[len] = '\0'; - } - - // distinguish between one load attempt and multiple attempts - if (strstr(pluginNames, separator)) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Most recent plugin load returned %d : %s. 
All attempts to load '%s' also failed.", errno, dlerror(), pluginNames); - } else { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin load returned %d : %s : when loading %s", errno, dlerror(), pluginNames); +static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { + if (openErr == ENOENT) { + snprintf(nameList, *nameListLen, " %s", name); + nameList += strlen(name) + 1; + *nameListLen -= strlen(name) + 1; + return nameList; } + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: %s", openErrStr); + return nameList; } -static void* openTunerPluginLib(void) { +static void* openTunerPluginLib(char* couldNotFindNames, int len) { + int openErr; void *pluginLib; - -#define MAX_PLUGIN_LOAD 4 - - int len; - char tunerPluginLibNameTried[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; - char *ptr = tunerPluginLibNameTried; char tunerPluginLibName[PATH_MAX]; + char openErrStr[MAX_STR_LEN + 1] = { 0 }; const char *envTunerPluginName = getenv("NCCL_TUNER_PLUGIN"); if (envTunerPluginName && strlen(envTunerPluginName)) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: NCCL_TUNER_PLUGIN set to %s", envTunerPluginName); snprintf(tunerPluginLibName, PATH_MAX, "%s", envTunerPluginName); - pluginLib = tryOpenDynamicLib(tunerPluginLibName); + pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner-%s.so", envTunerPluginName); - pluginLib = tryOpenDynamicLib(tunerPluginLibName); + pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); } else { snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner.so"); - pluginLib = tryOpenDynamicLib(tunerPluginLibName); + pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); } const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); if (envNetPluginName && strlen(envNetPluginName)) { // Users are allowed to pack tuner into the net plugin snprintf(tunerPluginLibName, PATH_MAX, "%s", envNetPluginName); - pluginLib = tryOpenDynamicLib(tunerPluginLibName); + pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); - pluginLib = tryOpenDynamicLib(tunerPluginLibName); + pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { 
INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); } else { snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net.so"); - pluginLib = tryOpenDynamicLib(tunerPluginLibName); + pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); } - summarizeOpenTunerPluginLibErrors(ptr); - tunerPluginLibName[0] = '\0'; return nullptr; } @@ -130,10 +163,14 @@ enum { tunerPluginLoadSuccess = 1, }; -ncclResult_t ncclTunerPluginLoad(ncclTuner_t** tuner) { +#define MAX_PLUGIN_LOAD 4 + +static int status = tunerPluginLoadReady; + +ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { // Initialize to nullptr by default if plugin tuner cannot be loaded. - *tuner = nullptr; - static int status = tunerPluginLoadReady; + char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; + comm->tuner = nullptr; if (tunerPluginLoadFailed == status) { return ncclSuccess; } @@ -144,28 +181,41 @@ ncclResult_t ncclTunerPluginLoad(ncclTuner_t** tuner) { } if (tunerPluginLoadSuccess == status) { - *tuner = tunerSymbol; + comm->tuner = tunerSymbol; ++tunerPluginRefCount; goto exit; } - tunerPluginLib = openTunerPluginLib(); + tunerPluginLib = openTunerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); if (nullptr == tunerPluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using internal tuner plugin."); + if (strlen(couldNotFindNames)) { + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Could not find:%s. 
Using internal tuner plugin.", couldNotFindNames); + } else { + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using internal tuner plugin."); + } goto fail; } - tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL); + tunerSymbol = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3"); if (tunerSymbol == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find " NCCL_TUNER_PLUGIN_SYMBOL ", using internal tuner instead."); - dlclose(tunerPluginLib); - goto fail; + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); + ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2"); + if (ncclTuner_v2 == nullptr) { + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); + dlclose(tunerPluginLib); + goto fail; + } else { + ncclTuner_v2_as_v3.init = ncclTuner_v2_as_v3_init; + ncclTuner_v2_as_v3.name = ncclTuner_v2->name; + tunerSymbol = &ncclTuner_v2_as_v3; + } } INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", tunerSymbol->name); - *tuner = tunerSymbol; + comm->tuner = tunerSymbol; ++tunerPluginRefCount; status = tunerPluginLoadSuccess; + comm->tunerPluginLoaded = 1; exit: pthread_mutex_unlock(&tunerPluginLock); @@ -176,15 +226,16 @@ ncclResult_t ncclTunerPluginLoad(ncclTuner_t** tuner) { goto exit; } -ncclResult_t ncclTunerPluginUnload(ncclTuner_t** tuner) { - if (*tuner == nullptr) return ncclSuccess; +ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { pthread_mutex_lock(&tunerPluginLock); - if (0 == (--tunerPluginRefCount)) { + if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) { INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); dlclose(tunerPluginLib); tunerPluginLib = nullptr; tunerSymbol = nullptr; - *tuner = nullptr; + comm->tuner = nullptr; + status = tunerPluginLoadReady; + comm->tunerPluginLoaded = 0; } pthread_mutex_unlock(&tunerPluginLock); return ncclSuccess; diff --git a/src/misc/utils.cc b/src/misc/utils.cc index 74d5b6d24..12504bc99 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -93,7 +93,8 @@ uint64_t getHostHash(void) { if ((hostId = ncclGetEnv("NCCL_HOSTID")) != NULL) { INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId); - strncpy(hostHash, hostId, sizeof(hostHash)); + strncpy(hostHash, hostId, sizeof(hostHash)-1); + hostHash[sizeof(hostHash)-1] = '\0'; } else { FILE *file = fopen(HOSTID_FILE, "r"); if (file != NULL) { @@ -291,79 +292,3 @@ void ncclMemoryStackDestruct(struct ncclMemoryStack* me) { h = h1; } } - -const char* ncclOpToString(ncclRedOp_t op) { - switch (op) { - case ncclSum: - return "ncclSum"; - case ncclProd: - return "ncclProd"; - case ncclMax: - return "ncclMax"; - case ncclMin: - return "ncclMin"; - case ncclAvg: - return "ncclAvg"; - default: - return "Unknown"; - } -} - -const char* ncclDatatypeToString(ncclDataType_t type) { - switch (type) { - case ncclInt8: // ncclChar - return "ncclInt8"; - case ncclInt32: // ncclInt - return "ncclInt32"; - case ncclUint32: - return "ncclUint32"; - case ncclInt64: - return "ncclInt64"; - case ncclUint64: - return "ncclUint64"; - case ncclFloat16: // ncclHalf - return "ncclFloat16"; - case ncclFloat32: // ncclFloat - return "ncclFloat32"; - case ncclFloat64: // ncclDouble - return "ncclFloat64"; -#if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: - return "ncclBfloat16"; -#endif - default: - return "Unknown"; - } -} - -const char* ncclAlgoToString(int algo) { - switch (algo) { - 
case NCCL_ALGO_TREE: - return "TREE"; - case NCCL_ALGO_RING: - return "RING"; - case NCCL_ALGO_COLLNET_DIRECT: - return "COLLNET_DIRECT"; - case NCCL_ALGO_COLLNET_CHAIN: - return "COLLNET_CHAIN"; - case NCCL_ALGO_NVLS: - return "NVLS"; - case NCCL_ALGO_NVLS_TREE: - return "NVLS_TREE"; - default: - return "Unknown"; - } -} - -const char* ncclProtoToString(int proto) { - switch (proto) { - case NCCL_PROTO_LL: - return "LL"; - case NCCL_PROTO_LL128: - return "LL128"; - case NCCL_PROTO_SIMPLE: - return "SIMPLE"; - default: - return "Unknown"; - } -} diff --git a/src/nccl.h.in b/src/nccl.h.in index 3cf619dcf..9efdf9fc1 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -47,6 +47,7 @@ typedef enum { ncclSuccess = 0, #define NCCL_CONFIG_UNDEF_INT INT_MIN #define NCCL_CONFIG_UNDEF_PTR NULL #define NCCL_SPLIT_NOCOLOR -1 +#define NCCL_UNDEF_FLOAT -1.0f /* Communicator configuration. Users can assign value to attributes to specify the * behavior of a communicator. */ @@ -78,6 +79,23 @@ typedef struct ncclConfig_v21700 { NCCL_CONFIG_UNDEF_INT /* splitShare */ \ } +/* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. */ +typedef struct ncclSimInfo_v22200 { + size_t size; + unsigned int magic; + unsigned int version; + float estimatedTime; +} ncclSimInfo_t; + +/* NCCL_SIM_INFO_INITIALIZER must be assigned to initialize simInfo structure when it is created. + * Not initialized simInfo will result in NCCL error. */ +#define NCCL_SIM_INFO_INITIALIZER { \ + sizeof(ncclSimInfo_t), /* size */ \ + 0x74685283, /* magic */ \ + NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ + NCCL_UNDEF_FLOAT /* estimated time */ \ +} + /* NCCL malloc and free function for all types of NCCL optimizations * (e.g. user buffer registration). The actual allocated size might * be larger than requested due to granularity requirement. */ @@ -432,6 +450,14 @@ ncclResult_t pncclGroupStart(); ncclResult_t ncclGroupEnd(); ncclResult_t pncclGroupEnd(); +/* + * Group Simulate End + * + * Simulate a ncclGroupEnd() call and return NCCL's simulation info in a struct. 
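+ *
+ * Minimal usage sketch (comm, sendbuff, recvbuff, count and stream are assumed
+ * to already exist; ncclGroupSimulateEnd() is called in place of ncclGroupEnd()):
+ *
+ *   ncclSimInfo_t simInfo = NCCL_SIM_INFO_INITIALIZER;
+ *   ncclGroupStart();
+ *   ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm, stream);
+ *   ncclGroupSimulateEnd(&simInfo);  // simInfo.estimatedTime now holds NCCL's estimate
+ *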
+ */ +ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo); +ncclResult_t pncclGroupSimulateEnd(ncclSimInfo_t* simInfo); + #ifdef __cplusplus } // end extern "C" #endif diff --git a/src/net.cc b/src/net.cc index e978a1854..0f5d336ea 100644 --- a/src/net.cc +++ b/src/net.cc @@ -48,7 +48,7 @@ static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8 } static ncclResult_t ncclNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1<<31) return ncclInternalError; + if (size >= 1UL<<31) return ncclInternalError; return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle); } @@ -95,7 +95,7 @@ static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8 } static ncclResult_t ncclNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1<<31) return ncclInternalError; + if (size >= 1UL<<31) return ncclInternalError; return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); } @@ -150,7 +150,7 @@ static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8 } static ncclResult_t ncclNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1<<31) return ncclInternalError; + if (size >= 1UL<<31) return ncclInternalError; return ncclNet_v5->regMr(comm, data, (int) size, type, mhandle); } @@ -207,7 +207,7 @@ static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetPropertie } static ncclResult_t ncclCollNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1<<31) return ncclInternalError; + if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v5->regMr(comm, data, (int) size, type, mhandle); } @@ -254,7 +254,7 @@ static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetPropertie } static ncclResult_t ncclCollNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1<<31) return ncclInternalError; + if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle); } @@ -301,7 +301,7 @@ static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetPropertie } static ncclResult_t ncclCollNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1<<31) return ncclInternalError; + if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); } @@ -339,90 +339,109 @@ enum ncclNetState { enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; -static void* tryOpenDynamicLib(char* name) { +#define MAX_STR_LEN 255 + +static void* tryOpenLib(char* name, int* err, char* errStr) { + *err = 0; if (nullptr == name || strlen(name) == 0) { return nullptr; } + + if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { + name = nullptr; + } + void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL); if (nullptr == handle) { - if (ENOENT == errno) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: No plugin found (%s)", name); - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin load returned %d : %s when loading %s", errno, dlerror(), name); + strncpy(errStr, dlerror(), MAX_STR_LEN); + errStr[MAX_STR_LEN] = '\0'; + if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { + *err = ENOENT; } } return 
handle; } -static void summarizeOpenNetPluginErrors(char* pluginNames) { - const char *separator = " "; - int len = strlen(pluginNames); - // remove tail separator - pluginNames[len - 1] = '\0'; - - // remove last plugin name - while (len > 0 && pluginNames[--len] != *separator); - if (len > 0) { - pluginNames[len] = '\0'; - } - - // distinguish between one load attempt and multiple attempts - if (strstr(pluginNames, separator)) { - INFO(NCCL_ENV|NCCL_TUNING, "NET/Plugin: Most recent plugin load returned %d : %s. All attempts to load '%s' also failed.", errno, dlerror(), pluginNames); - } else { - INFO(NCCL_ENV|NCCL_TUNING, "NET/Plugin: Plugin load returned %d : %s : when loading %s", errno, dlerror(), pluginNames); +static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { + if (openErr == ENOENT) { + snprintf(nameList, *nameListLen, " %s", name); + nameList += strlen(name) + 1; + *nameListLen -= strlen(name) + 1; + return nameList; } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: %s", openErrStr); + return nameList; } -static void* openNetPluginLib(void) { +static void* openNetPluginLib(char* couldNotFindNames, int len) { + int openErr; void *pluginLib; - -#define MAX_PLUGIN_LOAD 2 - - int len; - char netPluginLibNameTried[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; - char *ptr = netPluginLibNameTried; char netPluginLibName[PATH_MAX]; + char openErrStr[MAX_STR_LEN + 1] = { 0 }; const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); if (envNetPluginName && strlen(envNetPluginName)) { snprintf(netPluginLibName, PATH_MAX, "%s", envNetPluginName); - pluginLib = tryOpenDynamicLib(netPluginLibName); + pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", netPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); snprintf(netPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); - pluginLib = tryOpenDynamicLib(netPluginLibName); + pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", netPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); } else { snprintf(netPluginLibName, PATH_MAX, "libnccl-net.so"); - pluginLib = tryOpenDynamicLib(netPluginLibName); + pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); if (pluginLib) { return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", netPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); } - summarizeOpenNetPluginErrors(ptr); - return nullptr; } -ncclResult_t ncclNetPluginInit() { - void* netPluginLib = openNetPluginLib(); - if (netPluginLib == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Using internal network plugin."); +static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER; +static int netPluginRefCount; +static void* netPluginLib; + +enum { + netPluginLoadFailed = -1, + netPluginLoadReady = 0, + netPluginLoadSuccess = 1, +}; + +static int netPluginStatus = netPluginLoadReady; + +#define MAX_PLUGIN_LOAD 2 + +ncclResult_t 
ncclNetPluginLoad(struct ncclComm* comm) { + char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; + if (netPluginLoadFailed == netPluginStatus) { return ncclSuccess; } + pthread_mutex_lock(&netPluginLock); + if (netPluginLoadSuccess == netPluginStatus) { + ++netPluginRefCount; + goto exit; + } + + netPluginLib = openNetPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); + if (netPluginLib == nullptr) { + if (strlen(couldNotFindNames)) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Could not find:%s. Using internal network plugin.", couldNotFindNames); + } else { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Using internal network plugin."); + } + goto fail; + } + ncclNets[0] = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8"); if (ncclNets[0] == nullptr) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol."); @@ -436,8 +455,7 @@ ncclResult_t ncclNetPluginInit() { ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); if (ncclNet_v5 == nullptr) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported."); - if (netPluginLib != nullptr) dlclose(netPluginLib); - return ncclSuccess; + goto fail; } else { ncclNets[0] = &ncclNet_v5_as_v8; ncclNet_v5_as_v8.init = ncclNet_v5_as_v8_init; @@ -476,21 +494,52 @@ ncclResult_t ncclNetPluginInit() { ncclCollNets[0] = &ncclCollNet_v5_as_v8; ncclCollNet_v5_as_v8.init = ncclCollNet_v5_as_v8_init; ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name); } } else { ncclCollNets[0] = &ncclCollNet_v6_as_v8; ncclCollNet_v6_as_v8.init = ncclCollNet_v6_as_v8_init; ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v6)", ncclCollNets[0]->name); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name); } } else { ncclCollNets[0] = &ncclCollNet_v7_as_v8; ncclCollNet_v7_as_v8.init = ncclCollNet_v7_as_v8_init; ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v7)", ncclCollNets[0]->name); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name); } } + + ++netPluginRefCount; + netPluginStatus = netPluginLoadSuccess; + comm->netPluginLoaded = 1; + +exit: + pthread_mutex_unlock(&netPluginLock); + return ncclSuccess; +fail: + if (netPluginLib) dlclose(netPluginLib); + netPluginStatus = netPluginLoadFailed; + goto exit; +} + +ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { + pthread_mutex_lock(&netPluginLock); + if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) { + if (ncclNets[0]) { + INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name); + } + if (ncclCollNets[0]) { + INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name); + } + dlclose(netPluginLib); + netPluginLib = nullptr; + ncclNets[0] = nullptr; + ncclCollNets[0] = nullptr; + netPluginStatus = netPluginLoadReady; + comm->netPluginLoaded = 0; + } + pthread_mutex_unlock(&netPluginLock); return ncclSuccess; } @@ -515,8 +564,6 @@ ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, in return ncclInternalError; } - INFO(NCCL_INIT, "Using non-device net plugin version %d", - props.netDeviceVersion); return ncclSuccess; } @@ -582,6 
+629,12 @@ ncclResult_t ncclNetInit(struct ncclComm* comm) { return ncclSuccess; } +ncclResult_t ncclNetFinalize(struct ncclComm* comm) { + comm->ncclNet = nullptr; + comm->ncclCollNet = nullptr; + return ncclSuccess; +} + ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { constexpr int GPU_BUF_SIZE = 2*1024*1024; #if CUDART_VERSION >= 11030 @@ -623,7 +676,7 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { while (!connected) { // If we're aborting now, skip to cleanup - if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) { + if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { goto cleanup2; } diff --git a/src/proxy.cc b/src/proxy.cc index 955c415ec..eef71a565 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -12,6 +12,7 @@ #include "profiler.h" #define ENABLE_TIMER 0 #include "timer.h" +#include "transport.h" #include #include @@ -596,67 +597,6 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool return ncclSuccess; } -NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0); - -ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op, int reg) { - memset(op, 0, sizeof(struct ncclProxyOp)); - int channelId = info->channelId; - struct ncclChannel* channel = info->comm->channels+channelId; - op->channelId = channelId; - op->sliceSteps = 1; - op->chunkSteps = 1; - op->dtype = info->datatype; - op->protocol = info->protocol; - - int stepSize = info->comm->buffSizes[op->protocol]/NCCL_STEPS; - - if (op->protocol == NCCL_PROTO_SIMPLE) stepSize = info->comm->p2pChunkSize; - info->chunkSize = stepSize; - op->root = info->root; - - struct ncclChannelPeer* peer = channel->peers[op->root]; - if (info->coll == ncclFuncSend) { - op->pattern = ncclPatternSend; - if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) { - // Tune chunk size for the network - if (info->protocol == NCCL_PROTO_SIMPLE && info->count < stepSize) info->chunkSize /= 4; - else if (info->count < 8*stepSize) info->chunkSize /= 2; - if (info->protocol == NCCL_PROTO_SIMPLE && peer->send[1].proxyConn.sameProcess) op->reg = reg; - } - } else if (info->coll == ncclFuncRecv) { - op->pattern = ncclPatternRecv; - if (op->root != info->comm->rank && peer->recv[1].transportComm == &netTransport.recv) { - // Tune chunk size for the network - if (info->protocol == NCCL_PROTO_SIMPLE && info->count < stepSize) info->chunkSize /= 4; - else if (info->count < 8*stepSize) info->chunkSize /= 2; - if (info->protocol == NCCL_PROTO_SIMPLE && peer->recv[1].proxyConn.sameProcess) op->reg = reg; - } - } else { - WARN("P2p operation is neither send or recv"); - return ncclInternalError; - } - if (ncclParamChunkSize() != 0) { - info->chunkSize = ncclParamChunkSize(); - } - op->recvbuff = op->reg ? 
(uint8_t*)info->recvbuff : NULL; - op->chunkSize = info->chunkSize; - op->nbytes = info->count; - - // Compute nSteps for proxies - int chunkEffectiveSize = op->chunkSize; - if (op->protocol == NCCL_PROTO_LL) { - chunkEffectiveSize /= 2; - op->nbytes *= 2; - op->nbytes = DIVUP(op->nbytes, sizeof(union ncclLLFifoLine)) * sizeof(union ncclLLFifoLine); - } - - if (!op->reg) op->nbytes = std::min(op->nbytes, (ssize_t)info->chunkSize); - op->nsteps = DIVUP(info->count, chunkEffectiveSize); - if (op->nsteps == 0 || op->reg) op->nsteps = 1; - - return ncclSuccess; -} - static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) { struct ncclProxyArgs* freeOp = *opPtr; struct ncclProxyArgs* next = freeOp->next; @@ -870,7 +810,8 @@ void* ncclProxyProgress(void *proxyState_) { * frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. */ int proxyOpAppendCounter = 0; struct ncclProxyArgs profArgs; // Only used for profiling purposes - while ((state->stop == 0 || (state->stop == 1 && state->active)) && *proxyState->abortFlag == 0) { + while ((state->stop == 0 || (state->stop == 1 && state->active)) && + __atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) == 0) { int idle = 1; ncclResult_t ret = progressOps(proxyState, state, state->active, &idle); if (ret != ncclSuccess) { @@ -1075,7 +1016,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in strncpy(poolPath+sizeof("/dev/shm/nccl-")-1, resp.devShmPath, sizeof("XXXXXX")-1); struct ncclProxyOps* proxyOps = sharedProxyState->proxyOps + proxyConn->tpLocalRank; if (proxyOps->pool == NULL) { - NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0, &proxyOps->handle)); + NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle)); proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1; } } @@ -1172,7 +1113,7 @@ ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId) { struct ncclProxyState* sharedProxyState = comm->proxyState; // Receive the connection pointer from the Proxy - if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) { + if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { WARN("Comm %p is in abort state", comm); return ncclInternalError; } @@ -1254,7 +1195,7 @@ static ncclResult_t proxyProgressInit(struct ncclProxyState* proxyState) { char shmPath[sizeof("/dev/shm/nccl-XXXXXX")]; shmPath[0] = '\0'; - NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, proxyState->tpLocalnRanks + 1, &state->handle)); + NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, proxyState->tpLocalnRanks, &state->handle)); // Init pool pool->nextOps = -1; @@ -1403,7 +1344,7 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP (*asyncOpCount)--; return ncclSuccess; - } else if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED) != 0) { + } else if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) { return ncclInternalError; } @@ -1491,7 +1432,7 @@ void* ncclProxyService(void* _args) { /* Even if local comm aborts, we cannot let proxy thread exit if we still have peer * connections. Need to wait until all other related comms call abort and safely exit * together, or we could face segmentation fault. 
*/ - if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED) != 0) stop = 1; + if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) stop = 1; /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */ int ret; do { @@ -1721,14 +1662,13 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) { } if (sharedProxyState->peerAddresses) { - if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 0) { - struct ncclSocket sock; - int type = ncclProxyMsgStop; - NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag)); - NCCLCHECK(ncclSocketConnect(&sock)); - NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int))); - NCCLCHECK(ncclSocketClose(&sock)); + struct ncclSocket sock; + int type = ncclProxyMsgStop; + ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag); + if (ncclSocketConnect(&sock) == ncclSuccess) { + ncclSocketSend(&sock, &type, sizeof(int)); } + ncclSocketClose(&sock); } if (sharedProxyState->peerSocks) { @@ -1746,7 +1686,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) { } } int type = ncclProxyMsgClose; - if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 0) NCCLCHECK(ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int))); + ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int)); NCCLCHECK(ncclSocketClose(sharedProxyState->peerSocks + i)); } } diff --git a/src/register.cc b/src/register.cc index c46899294..90d429fe4 100644 --- a/src/register.cc +++ b/src/register.cc @@ -9,6 +9,7 @@ #include "comm.h" #include "net.h" #include "register.h" +#include "transport.h" ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) { struct ncclRegCache* cache = &comm->regCache; @@ -79,6 +80,7 @@ ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, str } } end: + INFO(NCCL_INIT, "Register ptr %p size %ld on %d net devices", addr, size, reg->nDevs); ncclDebugNoWarn = 0; if (ret != ncclSuccess) NCCLCHECK(ncclNetDeregister(comm, reg)); return ret; diff --git a/src/transport.cc b/src/transport.cc index 710285680..5df47065b 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -9,6 +9,7 @@ #include "bootstrap.h" #define ENABLE_TIMER 0 #include "timer.h" +#include "transport.h" struct ncclTransport* ncclTransports[NTRANSPORTS] = { &p2pTransport, @@ -72,7 +73,7 @@ NCCL_PARAM(ReportConnectProgress, "REPORT_CONNECT_PROGRESS", 0); ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) { // Stream used during transport setup; need for P2P pre-connect + CUDA Graph ncclResult_t ret = ncclSuccess; - int highestType = TRANSPORT_P2P; // track highest transport type + int highestType = TRANSPORT_UNDEFINED; // track highest transport type struct ncclConnect** data; // Store intermediate send/recvData structs for connect struct ncclConnect** recvData; // Points to entries inside data for given recv connection within a channel struct ncclConnect** sendData; // Points to entries inside data for given send connection within a channel @@ -215,13 +216,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* } } - if (timeReported) { + { struct timeval now; gettimeofday(&now, NULL); float elapsed = (now.tv_sec - timeStart.tv_sec)*1.0 + (now.tv_usec-timeStart.tv_usec)*1e-6; - printf("\rP2p 
connect done in %d:%02d \n", - ((int)elapsed)/60, ((int)elapsed)%60); - fflush(stdout); + if (elapsed > 1.0) INFO(NCCL_PROFILE, "timings: rank %d nranks %d P2p connect done in %.2f", comm->rank, comm->nRanks, elapsed); + if (timeReported) { + printf("\rP2p connect done in %d:%02d \n", + ((int)elapsed)/60, ((int)elapsed)%60); + fflush(stdout); + } } /* We need to sync ranks here since some ranks might run too fast after connection setup diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index abe051822..ae1fe0fb5 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -9,7 +9,10 @@ #include "graph.h" #include "proxy.h" #include "gdrwrap.h" +#include "transport.h" #include "assert.h" +#include "bootstrap.h" +#include "channel.h" int64_t ncclParamGdrCopySyncEnable(); int64_t ncclParamGdrCopyFlushEnable(); @@ -1052,7 +1055,23 @@ ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* u goto exit; } -ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, struct ncclKernelPlan *plan, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle) { +struct ncclCollnetCleanupCallback { + struct ncclCommCallback base; + struct ncclProxyConnector* proxyConn; + void* buffer; + size_t size; + void* mhandle; +}; + +static ncclResult_t cleanupCollnet(struct ncclComm* comm, struct ncclCommCallback* cb) { + struct ncclCollnetCleanupCallback* obj = (struct ncclCollnetCleanupCallback*)cb; + NCCLCHECK(ncclCollnetDeregBuffer(comm, obj->proxyConn, obj->mhandle)); + INFO(NCCL_REG, "rank %d - deregistered collnet buffer handle %p, size %ld, buff %p", comm->rank, obj->mhandle, obj->size, obj->buffer); + free(obj); + return ncclSuccess; +} + +ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts) { ncclResult_t ret = ncclSuccess; void* handle = NULL; struct ncclRegCache* cache = &comm->regCache; @@ -1060,18 +1079,20 @@ ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, struct ncclKe uintptr_t addr = (uintptr_t)userbuff & -pageSize; size_t size = DIVUP((uintptr_t)userbuff - addr + buffSize, pageSize) * pageSize; collnetRegInfo info = {addr, size}; - struct ncclCollnetHandleList* record = NULL; + struct ncclCollnetCleanupCallback* record = NULL; struct ncclProxyConnector* proxyConn = (type == collNetRecv) ? 
&comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn; *outRegBufFlag = 0; NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail); - record = ncclMemoryPoolAlloc(&comm->memPool_ncclCollnetHandleList, &comm->memPermanent); - record->proxyconn = proxyConn; - record->buffer = userbuff; + record = (struct ncclCollnetCleanupCallback*)malloc(sizeof(struct ncclCollnetCleanupCallback)); + record->base.fn = cleanupCollnet; + record->proxyConn = proxyConn; + record->buffer = (void*)userbuff; record->size = buffSize; - *outHandle = record->collnetHandle = handle; + *outHandle = record->mhandle = handle; *outRegBufFlag = 1; - ncclIntruQueueEnqueue(&plan->collnetHandleQueue, record); + ncclIntruQueueEnqueue(cleanupQueue, &record->base); + *nCleanupQueueElts += 1; exit: return ret; @@ -1140,3 +1161,269 @@ struct ncclTransport collNetTransport = { { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer } }; + +ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm) { + ncclResult_t ret = ncclSuccess; + char line[1024]; + + if (comm->collNetSupport == 0) goto exit; + // Connect Collnet + chain + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_CHAIN], 0), ret, fail); + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_CHAIN], 1), ret, fail); + + line[0] = '\0'; + for (int c = 0; c < comm->nChannels; c++) { + struct ncclTree* chain = &comm->channels[c].collnetChain; + snprintf(line + strlen(line), 1023 - strlen(line), " [%d] %d->%d->%d", + c, chain->down[0], comm->rank, chain->up); + } + line[1023] = '\0'; + + INFO(NCCL_INIT, "Connected Collnet Chains %s", line); + +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) { + ncclResult_t ret = ncclSuccess; + int highestTransportType0 = TRANSPORT_UNDEFINED, highestTransportType1 = TRANSPORT_UNDEFINED; + + if (comm->collNetSupport == 0) goto exit; + + // Connect intra-node CollNet + Direct + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channelRecv = comm->channels + c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 0, &highestTransportType0), ret, fail); + + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channelSend = comm->channels + c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, 
&comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1, &highestTransportType1), ret, fail); + + // Exchange highest intra-node transport type among ranks + // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer + if (highestTransportType0 != TRANSPORT_UNDEFINED && highestTransportType1 != TRANSPORT_UNDEFINED) { + int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_UNDEFINED }; + + comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1; + NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail); + for (int i = 0; i < comm->localRanks; i++) { + if (highestTypes[i] > comm->intraHighestTransportType) + comm->intraHighestTransportType = highestTypes[i]; + } + if (comm->collNetSharedRes->intraHighestTransportType < comm->intraHighestTransportType) + comm->collNetSharedRes->intraHighestTransportType = comm->intraHighestTransportType; + } else if (comm->intraHighestTransportType == TRANSPORT_UNDEFINED) { + // reuse previous shared intraHighestTransportType + comm->intraHighestTransportType = comm->collNetSharedRes->intraHighestTransportType; + } + INFO(NCCL_INIT, "rank %d Connected CollNet", comm->rank); + +exit: + return ret; +fail: + goto exit; +} + +static ncclResult_t collNetInitRailRankMap(ncclComm_t comm) { + int rank = comm->rank; + uint64_t nonHeadMask = (1ull << comm->localRanks) - 1; + + comm->collNetDenseToUserRank = ncclMemoryStackAlloc(&comm->memPermanent, comm->nRanks); + comm->collNetUserToDenseRank = ncclMemoryStackAlloc(&comm->memPermanent, comm->nRanks); + // initialize collNetUserToDenseRank[rank] + comm->collNetUserToDenseRank[rank] = -1; + for (int h = 0; h < comm->collNetHeadsNum; h++) { + nonHeadMask ^= 1ull << comm->rankToLocalRank[comm->collNetHeads[h]]; + if (comm->collNetHeads[h] == rank) { comm->collNetUserToDenseRank[rank] = h; break; } + } + if (comm->collNetUserToDenseRank[rank] == -1) { + comm->collNetUserToDenseRank[rank] = __builtin_popcountll(nonHeadMask & ((1ull << comm->localRank) - 1)); + } + comm->collNetUserToDenseRank[rank] += comm->node * comm->localRanks; + + NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->collNetUserToDenseRank, sizeof(int))); + for (int r = 0; r < comm->nRanks; r++) { + comm->collNetDenseToUserRank[comm->collNetUserToDenseRank[r]] = r; + } + return ncclSuccess; +} + +ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* graphs[]) { + ncclResult_t ret = ncclSuccess; + int rank = comm->rank; + int collNetSetupFail = 0; + // Find all head ranks + int nHeadsUnique = 0; + int* headsUnique = NULL; + bool share; + struct ncclTopoGraph* directGraph = graphs[NCCL_ALGO_COLLNET_DIRECT]; + + struct collnetShareInfo { + int headPosition; + int isMaster; + }; + struct collnetShareInfo* infos = NULL; + + NCCLCHECKGOTO(ncclCalloc(&headsUnique, directGraph->nChannels), ret, fail); + { uint64_t mask = 0; + // Head GPU index is always 0 + for (int c = 0; c < directGraph->nChannels; c++) { + int head = directGraph->intra[c * comm->localRanks + 0]; + assert(comm->rankToNode[head] == comm->node); + uint64_t mask0 = mask; + mask |= 1ull<rankToLocalRank[head]; + if (mask != mask0) headsUnique[nHeadsUnique++] = head; + } + } + + comm->collNetHeads = headsUnique; + comm->collNetHeadsNum = nHeadsUnique; + if (parent && parent->collNetSupport && 
parent->nNodes == comm->nNodes) { + if (!parent->config.splitShare) { + collNetSetupFail = 1; + goto fail; + } + NCCLCHECKGOTO(ncclCalloc(&infos, comm->nRanks), ret, fail); + /* check whether child can share collnet resources of parent. Since parent builds each collnet communicator + * based on heads with the same head position in each node, as long as the collnet heads of child comm + * can match parent's heads, we can let child communicator share parent's collnet resources. */ + for (int h = 0; h < nHeadsUnique; ++h) { + int prev = INT_MIN; + struct collnetShareInfo* myinfo; + + share = true; + myinfo = infos + comm->rank; + memset(myinfo, 0, sizeof(struct collnetShareInfo)); + /* find the child head position in parent collnet heads. */ + if (headsUnique[h] == comm->rank) { + myinfo->headPosition = -1; + myinfo->isMaster = 1; + for (int th = 0; th < parent->collNetHeadsNum; ++th) + if (parent->topParentRanks[parent->collNetHeads[th]] == comm->topParentRanks[comm->rank]) { + myinfo->headPosition = th; + break; + } + } + + NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, infos, sizeof(struct collnetShareInfo)), ret, fail); + for (int i = 0; i < comm->nRanks; ++i) { + if (infos[i].isMaster) { + if (prev == INT_MIN) + prev = infos[i].headPosition; + + if (infos[i].headPosition == -1 || prev != infos[i].headPosition) { + share = false; + break; + } + } + } + + if (share) { + if (myinfo->isMaster) { + comm->collNetSharedRes = parent->collNetSharedRes; + for (int c = 0; c < comm->nChannels; ++c) + NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, true), ret, fail); + } + + NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail); + } else { + /* TODO: CX-6 and CX-7 both do not support multiple sharp resources per process, if child comm cannot + * share the sharp resource from parent, we cannot use sharp in this case. This restriction might be + * lifted by sharp plugin/IB hardware in the future. */ + collNetSetupFail = 1; + if (comm->rank == 0) { + WARN("Child comms (nRanks %d) fails to share parent comms (nRanks %d) sharp resources", comm->nRanks, parent->nRanks); + } + goto fail; + } + } + share = true; + } else { + /* this allocated buffer will be freed on proxy side */ + NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1)); + comm->collNetSharedRes->nChannels = comm->nChannels; + comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE]; + + NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail); + + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, false), ret, fail); + for (int h = 0; h < nHeadsUnique; h++) { + const int head = headsUnique[h]; + ncclConnect connect; + collNetSetupFail |= ncclTransportCollNetSetup(comm, directGraph, channel, head, head, h, collNetRecv, &connect); + if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, directGraph, channel, head, head, h, collNetSend, &connect); + } + // Verify CollNet setup across ranks after trying the first channel + if (c == 0) { + NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); + } + } + share = false; + } + + if (share) { + memcpy(comm->collNetSupportMatrix, parent->collNetSupportMatrix, sizeof(comm->collNetSupportMatrix)); + } else { + do { + /* Initialize all entries in collNetSupportMatrix[redop][type]. Since some + ranks don't connect to sharp we enable a (redop,type) if any rank claims + support. 
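+         Concretely: each head rank publishes 1<<1 for a (redop,type) it supports
+         and 1<<0 for one it does not, while non-head ranks contribute 0. After
+         OR-ing the bytes gathered from all ranks, accum == (1<<1) exactly when at
+         least one head reported support and no head reported lack of support,
+         which is the condition used to fill collNetSupportMatrix below.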
*/ + uint8_t(*matrix)[4][ncclNumTypes]; + bool isHead = false; + matrix = nullptr; + NCCLCHECKGOTO(ncclCalloc(&matrix, comm->nRanks), ret, matrix_end); + for (int h = 0; h < nHeadsUnique; h++) isHead |= (headsUnique[h] == comm->rank); + if (isHead) { + for (int ty=0; ty < ncclNumTypes; ty++) { + for (int op=0; op < 4; op++) { + int support = 0; + NCCLCHECKGOTO(collNetReduceSupport(comm, (ncclDataType_t)ty, (ncclRedOp_t)op, &support), ret, matrix_end); + // bit 0 = not supported, bit 1 = supported + matrix[rank][op][ty] = 1<<(support ? 1 : 0); + } + } + } + NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, matrix, sizeof(*matrix)), ret, matrix_end); + for (int ty=0; ty < ncclNumTypes; ty++) { + for (int op=0; op < 4; op++) { + uint8_t accum = 0; + for (int r=0; r < comm->nRanks; r++) accum |= matrix[r][op][ty]; + // We support (redop, type) if some rank supports it and no rank doesn't support it + comm->collNetSupportMatrix[op][ty] = (accum == (1<<1)); + } + } + matrix_end: + free(matrix); + if (ret != ncclSuccess) goto fail; + } while (0); + } + + // Verify CollNet setup across ranks after trying all channels + NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); + TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); + +exit: + free(infos); + return ret; +fail: + ncclTransportCollNetFree(comm); + comm->collNetSupport = 0; + goto exit; +} diff --git a/src/transport/generic.cc b/src/transport/generic.cc new file mode 100644 index 000000000..a0efaab5c --- /dev/null +++ b/src/transport/generic.cc @@ -0,0 +1,36 @@ +#include "comm.h" +#include "transport.h" + +ncclResult_t ncclTransportRingConnect(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + if (comm && comm->nRanks > 1) { + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_RING], 0), ret, fail); + INFO(NCCL_INIT, "Connected all rings"); + } +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + if (comm && comm->nRanks > 1) { + // Connect Trees + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail); + INFO(NCCL_INIT, "Connected all trees"); + } +exit: + return ret; +fail: + goto exit; +} diff --git a/src/transport/net.cc b/src/transport/net.cc index cc388211c..d5a585d42 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -13,6 +13,7 @@ #include "shm.h" #include "p2p.h" #include "profiler.h" +#include "transport.h" static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large"); @@ -238,6 +239,8 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph } static ncclResult_t netMapShm(struct connectMapMem* mem) { + mem->cpuPtr = NULL; + mem->gpuPtr = NULL; NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, -1, &mem->attachHandle)); return ncclSuccess; } @@ -303,8 +306,12 @@ static ncclResult_t 
sendConnect(struct ncclComm* comm, struct ncclConnect* conne } ncclResult_t ret; - NCCLCHECK(ret = ncclPollProxyResponse(comm, &send->proxyConn, map, opId)); - if (ret == ncclInProgress) { + ret = ncclPollProxyResponse(comm, &send->proxyConn, map, opId); + if (ret != ncclSuccess) { + if (ret != ncclInProgress) { + free(map); + send->transportResources = NULL; + } return ret; } INFO(NCCL_PROXY, "sendConnect ncclPollProxyResponse opId=%p", opId); @@ -323,6 +330,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) { if (!map->sameProcess) NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM)); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { + map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr = NULL; NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank, map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, @@ -332,6 +340,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne if (map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size) { void** sharedDevMemPtr = comm->proxyState->sharedDevMems + send->proxyConn.tpLocalRank; if (*sharedDevMemPtr == NULL) { + map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = NULL; NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank, map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size, &map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipcDesc, @@ -403,7 +412,11 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne ncclResult_t ret; NCCLCHECK(ret = ncclPollProxyResponse(comm, &recv->proxyConn, map, opId)); - if (ret == ncclInProgress) { + if (ret != ncclSuccess) { + if (ret != ncclInProgress) { + free(map); + recv->transportResources = NULL; + } return ret; } INFO(NCCL_PROXY, "recvConnect ncclPollProxyResponse opId=%p", opId); @@ -1264,7 +1277,6 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ptrs[subCount] = localBuff+buffSlot*stepSize; sizes[subCount] = stepSize*args->sliceSteps; } - sizes[subCount] = stepSize*args->sliceSteps; if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; tags[subCount] = resources->tpRemoteRank; mhandles[subCount] = sub->mhandle; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 551ca61fd..be8a8a37b 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -78,6 +78,7 @@ pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; static int ncclIbRelaxedOrderingEnabled = 0; NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", -1); +NCCL_PARAM(IbRoutableFlidIbGidIndex, "IB_ROUTABLE_FLID_GID_INDEX", 1); NCCL_PARAM(IbRoceVersionNum, "IB_ROCE_VERSION_NUM", 2); NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 18); NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7); @@ -88,6 +89,7 @@ NCCL_PARAM(IbTc, "IB_TC", 0); NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2); +NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", 0); pthread_t ncclIbAsyncThread; static void* ncclIbAsyncThreadMain(void* args) { @@ -289,7 +291,38 @@ static ncclResult_t ncclUpdateGidIndex(struct ibv_context* context, uint8_t port return ncclSuccess; } -static ncclResult_t ncclIbGetGidIndex(struct ibv_context *context, uint8_t portNum, int gidTblLen, int *gidIndex) { +// GID Format +// global: | 64b - subnet-prefix | 64b - EUI | +// raw : | 10b fixed | 22b 0 | 16b FLID | 16b subnet-prefix | 64b - EUI | +static uint16_t ncclIbExtractLocalSubnetPrefix(uint64_t subnet_prefix) +{ + 
return (be64toh(subnet_prefix) & 0xffff); +} + +static int ncclIbExtractFlid (union ibv_gid *gid) +{ + return ntohs(*((uint16_t*)((uintptr_t)(gid->raw) + 4))); +} + +static ncclResult_t ncclIbGetGidIndex(struct ibv_context *context, uint8_t portNum, struct ibv_port_attr* portAttr, int *gidIndex) { + int gidTblLen = portAttr->gid_tbl_len; + + //for IB, choose GID Index that will have routable FLID if present + if (portAttr->link_layer == IBV_LINK_LAYER_INFINIBAND) { + union ibv_gid gid; + int routableGidIndex = ncclParamIbRoutableFlidIbGidIndex(); + if (routableGidIndex < gidTblLen) { + NCCLCHECK(wrap_ibv_query_gid(context, portNum, routableGidIndex, &gid)); + if (ncclIbExtractFlid(&gid) != 0) { + *gidIndex = routableGidIndex; + return ncclSuccess; + } + } + *gidIndex = 0; + return ncclSuccess; + } + + //for ROCE *gidIndex = ncclParamIbGidIndex(); if (*gidIndex >= 0) { return ncclSuccess; @@ -420,6 +453,10 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) { ret = ncclInternalError; goto fail; } + // Should NCCL merge multi-port devices into one? + int mergeNics; + mergeNics = ncclParamIbMergeNics(); +build_ib_list: for (int d=0; dlink_layer == IBV_LINK_LAYER_ETHERNET) { qpAttr.ah_attr.is_global = 1; - qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->spn; - qpAttr.ah_attr.grh.dgid.global.interface_id = info->iid; + qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->gid.global.subnet_prefix; + qpAttr.ah_attr.grh.dgid.global.interface_id = info->gid.global.interface_id; qpAttr.ah_attr.grh.flow_label = 0; - qpAttr.ah_attr.grh.sgid_index = sGidIndex; + qpAttr.ah_attr.grh.sgid_index = sGidInfo->localGidIndex; qpAttr.ah_attr.grh.hop_limit = 255; - qpAttr.ah_attr.grh.traffic_class = ncclParamIbTc(); + if(ncclParamIbFifoTc() && override_tc) { + qpAttr.ah_attr.grh.traffic_class = ncclParamIbFifoTc(); + } else { + qpAttr.ah_attr.grh.traffic_class = ncclParamIbTc(); + } } else { - qpAttr.ah_attr.is_global = 0; - qpAttr.ah_attr.dlid = info->lid; + //pick lid if subnet prefixs are same, FLID if they are not + if (ncclIbExtractLocalSubnetPrefix(sGidInfo->localGid.global.subnet_prefix) == + ncclIbExtractLocalSubnetPrefix(info->gid.global.subnet_prefix)) { + qpAttr.ah_attr.is_global = 0; + qpAttr.ah_attr.dlid = info->lid; + } else { + uint16_t flid = ncclIbExtractFlid(&info->gid); + if (flid == 0) { + WARN("Warning: remote FLID configured as zero even when endpoints are on different subnets, using dlid as fallback"); + qpAttr.ah_attr.dlid = info->lid; + } else { + qpAttr.ah_attr.dlid = ncclIbExtractFlid(&info->gid); + } + qpAttr.ah_attr.is_global = 1; + qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->gid.global.subnet_prefix; + qpAttr.ah_attr.grh.dgid.global.interface_id = info->gid.global.interface_id; + qpAttr.ah_attr.grh.sgid_index = sGidInfo->localGidIndex; + qpAttr.ah_attr.grh.hop_limit = 255; + } } qpAttr.ah_attr.sl = ncclParamIbSl(); qpAttr.ah_attr.src_path_bits = 0; @@ -1041,22 +1115,22 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet NCCLCHECK(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); devInfo->fifoRkey = commDev->fifoMr->rkey; - // RoCE support + // Pack local GID info devInfo->link_layer = commDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer; - if (devInfo->link_layer == IBV_LINK_LAYER_ETHERNET) { - 
NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, ibDev->portAttr.gid_tbl_len, &commDev->base.gidInfo.localGidIndex)); - NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid)); - devInfo->spn = commDev->base.gidInfo.localGid.global.subnet_prefix; - devInfo->iid = commDev->base.gidInfo.localGid.global.interface_id; - } + NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &commDev->base.gidInfo.localGidIndex)); + NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid)); + devInfo->gid.global.subnet_prefix = commDev->base.gidInfo.localGid.global.subnet_prefix; + devInfo->gid.global.interface_id = commDev->base.gidInfo.localGid.global.interface_id; + // info logging if (devInfo->link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB for (int q = 0; q < comm->base.nqps; q++) { // Print just the QPs for this dev if (comm->base.qps[q].devIndex == i) - INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d LID %d fifoRkey=0x%x fifoLkey=0x%x", + INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d LID %d subnet-prefix %lu FLID %d fifoRkey=0x%x fifoLkey=0x%x", comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", - dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid, devInfo->fifoRkey, commDev->fifoMr->lkey); + dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid, + devInfo->gid.global.subnet_prefix, ncclIbExtractFlid(&devInfo->gid), devInfo->fifoRkey, commDev->fifoMr->lkey); } } else { // RoCE for (int q = 0; q < comm->base.nqps; q++) { @@ -1065,7 +1139,7 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x", comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, (int64_t)commDev->base.gidInfo.localGidIndex, - devInfo->spn, devInfo->iid, devInfo->fifoRkey, commDev->fifoMr->lkey); + devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey); } } } @@ -1114,8 +1188,8 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet // Copy remDevInfo for things like remGidInfo, remFifoAddr, etc. 
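/* Editor's note: with the spn/iid pair replaced by a full GID in ncclIbDevInfo,
 * the loop below copies the remote GID verbatim, and ncclIbRtrQp (earlier in this
 * file) can derive both the destination GRH fields and, when the two endpoints sit
 * on different subnets, the remote FLID from that same structure. */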
for (int i = 0; i < remMeta.ndevs; i++) { comm->base.remDevs[i] = remMeta.devs[i]; - comm->base.remDevs[i].remoteGid.global.interface_id = comm->base.remDevs[i].iid; - comm->base.remDevs[i].remoteGid.global.subnet_prefix = comm->base.remDevs[i].spn; + comm->base.remDevs[i].remoteGid.global.interface_id = comm->base.remDevs[i].gid.global.interface_id; + comm->base.remDevs[i].remoteGid.global.subnet_prefix = comm->base.remDevs[i].gid.global.subnet_prefix; // Retain remote sizes fifo info and prepare RDMA ops comm->remSizesFifo.rkeys[i] = remMeta.devs[i].fifoRkey; @@ -1135,13 +1209,12 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet comm->base.qps[q].remDevIdx = remQpInfo->devIndex; int devIndex = comm->base.qps[q].devIndex; ncclIbSendCommDev* commDev = comm->devs + devIndex; - uint8_t gidIndex = commDev->base.gidInfo.localGidIndex; struct ibv_qp* qp = comm->base.qps[q].qp; - if (remQpInfo->ece_supported && remQpInfo->ece_supported) + if (remQpInfo->ece_supported) NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported)); - NCCLCHECK(ncclIbRtrQp(qp, gidIndex, remQpInfo->qpn, remDevInfo)); + NCCLCHECK(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false)); NCCLCHECK(ncclIbRtsQp(qp)); } @@ -1237,15 +1310,15 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle ibDevN = mergedDev->devs[i]; NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &rCommDev->base)); ibDev = ncclIbDevs + ibDevN; - NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, ibDev->portAttr.gid_tbl_len, &rCommDev->base.gidInfo.localGidIndex)); + NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex)); NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid)); } // Copy remDevInfo for things like remGidInfo, remFifoAddr, etc. for (int i = 0; i < remMeta.ndevs; i++) { rComm->base.remDevs[i] = remMeta.devs[i]; - rComm->base.remDevs[i].remoteGid.global.interface_id = rComm->base.remDevs[i].iid; - rComm->base.remDevs[i].remoteGid.global.subnet_prefix = rComm->base.remDevs[i].spn; + rComm->base.remDevs[i].remoteGid.global.interface_id = rComm->base.remDevs[i].gid.global.interface_id; + rComm->base.remDevs[i].remoteGid.global.subnet_prefix = rComm->base.remDevs[i].gid.global.subnet_prefix; } // Stripe QP creation across merged devs @@ -1270,14 +1343,15 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle // Set the ece (enhanced connection establishment) on this QP before RTR if (remMeta.qpInfo[q].ece_supported) { NCCLCHECK(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported)); - + // Query the reduced ece for this QP (matching enhancements between the requestor and the responder) // Store this in our own qpInfo for returning to the requestor if (meta.qpInfo[q].ece_supported) NCCLCHECK(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported)); } - NCCLCHECK(ncclIbRtrQp(qp->qp, rCommDev->base.gidInfo.localGidIndex, remMeta.qpInfo[q].qpn, remDevInfo)); + bool override_tc = (q == 0) ? 
true : false; + NCCLCHECK(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc)); NCCLCHECK(ncclIbRtsQp(qp->qp)); } @@ -1307,10 +1381,10 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle devInfo.lid = ibDev->portAttr.lid; devInfo.link_layer = ibDev->portAttr.link_layer; devInfo.ib_port = ibDev->portNum; - devInfo.spn = rCommDev->base.gidInfo.localGid.global.subnet_prefix; - devInfo.iid = rCommDev->base.gidInfo.localGid.global.interface_id; + devInfo.gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix; + devInfo.gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id; devInfo.mtu = ibDev->portAttr.active_mtu; - NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, rCommDev->base.gidInfo.localGidIndex, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo)); + NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false)); NCCLCHECK(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp)); } @@ -1318,8 +1392,8 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle meta.devs[i].lid = ibDev->portAttr.lid; meta.devs[i].link_layer = rCommDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer; meta.devs[i].ib_port = ibDev->portNum; - meta.devs[i].spn = rCommDev->base.gidInfo.localGid.global.subnet_prefix; - meta.devs[i].iid = rCommDev->base.gidInfo.localGid.global.interface_id; + meta.devs[i].gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix; + meta.devs[i].gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id; // Adjust the MTU remMeta.devs[i].mtu = (enum ibv_mtu) std::min(remMeta.devs[i].mtu, ibDev->portAttr.active_mtu); @@ -1906,9 +1980,10 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { } char line[SOCKET_NAME_MAXLEN+1]; - WARN("NET/IB : Got completion from peer %s with status=%d opcode=%d len=%d vendor err %d (%s)%s%s%s%s", + char *hcaName = r->devBases[i]->pd->context->device->name; + WARN("NET/IB: Got completion from peer %s with status=%d opcode=%d len=%d vendor err %d (%s)%s%s%s%s hca %s", ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type], - localGidStr ? " localGid ":"", localGidString, remoteGidStr ? " remoteGids":"", remoteGidString); + localGidStr ? " localGid ":"", localGidString, remoteGidStr ? 
" remoteGids":"", remoteGidString, hcaName); return ncclRemoteError; } @@ -1918,7 +1993,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; - TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%d wr_id=%d r=%p type=%d events={%d,%d}, i=%d", + TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%d wr_id=%ld r=%p type=%d events={%d,%d}, i=%d", ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], i); #endif if (req->type == NCCL_NET_IB_REQ_SEND) { diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index 0dd7c52ff..61d5946c4 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -12,6 +12,7 @@ #include "proxy.h" #include "enqueue.h" #include "register.h" +#include "transport.h" #if CUDART_VERSION >= 12010 @@ -46,36 +47,13 @@ struct ncclTransport nvlsTransport = { { NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL } }; -ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int dev, size_t size) { - CUmulticastObjectProp* prop = &resources->properties; - memset(prop, 0, sizeof(*prop)); - prop->size = size; - prop->numDevices = comm->MNNVL ? comm->clique.size : comm->localRanks; - prop->handleTypes = ncclCuMemHandleType; - prop->flags = 0; - - // Could be changed to CU_MULTICAST_GRANULARITY_MINIMUM when 3418538 resolved - CUCHECK(cuMulticastGetGranularity(&resources->granularity, prop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - - ALIGN_SIZE(size, resources->granularity); - prop->size = resources->size = size; - - memset(&resources->accessDesc, 0, sizeof(resources->accessDesc)); - resources->accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - resources->accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - resources->accessDesc.location.id = dev; - resources->dev = dev; - - return ncclSuccess; -} - ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, int rank, unsigned int nranks, CUmemGenericAllocationHandle *mcHandle, char *shareableHandle) { CUmemAllocationHandleType type = ncclCuMemHandleType; size_t size = prop->size; // Create a Multicast group - INFO(NCCL_NVLS, "NVLS Creating Multicast group nranks %d size %zi on rank %d", nranks, size, rank); + INFO(NCCL_NVLS, "NVLS Creating Multicast group nranks %d size %zu on rank %d", nranks, size, rank); CUCHECK(cuMulticastCreate(mcHandle, prop)); if (type == CU_MEM_HANDLE_TYPE_FABRIC) { @@ -86,14 +64,8 @@ ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, memcpy(shareableHandle, mcHandle, sizeof(CUmemGenericAllocationHandle)); } - INFO(NCCL_NVLS, "NVLS Created Multicast group %llx nranks %d size %zi on rank %d", *mcHandle, nranks, size, rank); - - return ncclSuccess; -} + INFO(NCCL_NVLS, "NVLS Created Multicast group %llx nranks %d size %zu on rank %d", *mcHandle, nranks, size, rank); -ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { - INFO(NCCL_NVLS, "NVLS group %llx adding dev %d", resources->mcHandle, resources->dev); - CUCHECK(cuMulticastAddDevice(resources->mcHandle, resources->dev)); return ncclSuccess; } @@ -123,53 +95,12 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int return ncclSuccess; } -ncclResult_t nvlsGroupDisconnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { - return ncclSuccess; -} - -ncclResult_t nvlsGroupBindMem(struct 
ncclComm *comm, struct ncclNvlsSharedRes* resources) { - size_t size = resources->size; - size_t granularity; - CUdeviceptr ptr = 0; - CUmemAllocationProp prop; - - memset(&prop, 0, sizeof(prop)); - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = resources->dev; - prop.requestedHandleTypes = ncclCuMemHandleType; - CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); - resources->ucGran = granularity; - - // Map a VA for UC memory - CUCHECK(cuMemAddressReserve(&ptr, size, granularity, 0U, 0)); - - // Alloc local physical mem for this NVLS group - CUCHECK(cuMemCreate(&resources->ucHandle, size, &prop, 0)); - CUCHECK(cuMemMap(ptr, size, 0, resources->ucHandle, 0)); - CUCHECK(cuMemSetAccess(ptr, size, &resources->accessDesc, 1)); - CUDACHECK(cudaMemset((void*)ptr, 0, size)); - resources->ucBuff = (char*)ptr; - INFO(NCCL_NVLS, "NVLS Mapped UC at %p size %zi", resources->ucBuff, size); - - // Bind physical memory to the Multicast group - // NB: It will block until all ranks have been added to the Group - INFO(NCCL_NVLS, "NVLS Bind mem %p UC handle 0x%llx MC handle 0x%llx size %zi", (void*)ptr, resources->ucHandle, resources->mcHandle, size); - CUCHECK(cuMulticastBindMem(resources->mcHandle, 0/*mcOffset*/, resources->ucHandle, 0/*memOffset*/, size, 0/*flags*/)); - - return ncclSuccess; -} - -ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { - int dev = resources->dev; - size_t size = resources->size; - INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zi dev %d", resources->mcHandle, size, dev); +ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAllocationHandle* mcHandle) { + int dev = comm->cudaDev; + INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zu dev %d", *mcHandle, size, dev); // Unbind physical memory from group for the given device - CUCHECK(cuMulticastUnbind(resources->mcHandle, dev, 0/*mcOffset*/, size)); - - // Release the MC group resources - NCCLCHECK(nvlsGroupDisconnect(comm, resources)); + CUCHECK(cuMulticastUnbind(*mcHandle, dev, 0/*mcOffset*/, size)); return ncclSuccess; } @@ -182,43 +113,18 @@ ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdevi return ncclSuccess; } -ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { - size_t size = resources->size; - CUdeviceptr ptr = 0; - - // Create a VA for the NVLS - CUCHECK(cuMemAddressReserve(&ptr, size, resources->granularity, 0U, 0)); - // Map the VA locally - CUCHECK(cuMemMap(ptr, size, 0, resources->mcHandle, 0)); - resources->mcBuff = (char*)ptr; - INFO(NCCL_NVLS, "NVLS Mapped MC buffer at %p size %zi", resources->mcBuff, size); - - // Having completed the BindMem we can now call SetAccess - // NB: It will block until all ranks have bound to the Group - CUCHECK(cuMemSetAccess((CUdeviceptr)resources->mcBuff, size, &resources->accessDesc, 1)); - - return ncclSuccess; -} - -ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { - size_t size; - CUdeviceptr ptr; - INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)", - resources->ucHandle, resources->ucBuff, resources->mcHandle, resources->mcBuff); +ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr, CUmemGenericAllocationHandle* ucHandle, void* mcptr, CUmemGenericAllocationHandle* mcHandle) { + INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC 
handle 0x%llx(%p)", *ucHandle, ucptr, *mcHandle, mcptr); // Release the UC memory and mapping - ptr = (CUdeviceptr)resources->ucBuff; - size = resources->size; - CUCHECK(cuMemUnmap(ptr, size)); - CUCHECK(cuMemAddressFree(ptr, size)); - CUCHECK(cuMemRelease(resources->ucHandle)); + CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size)); + CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size)); + CUCHECK(cuMemRelease(*ucHandle)); // Release the MC memory and mapping - ptr = (CUdeviceptr)resources->mcBuff; - size = resources->size; - CUCHECK(cuMemUnmap(ptr, size)); - CUCHECK(cuMemAddressFree(ptr, size)); - CUCHECK(cuMemRelease(resources->mcHandle)); + CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size)); + CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size)); + CUCHECK(cuMemRelease(*mcHandle)); return ncclSuccess; } @@ -260,84 +166,222 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { return ncclSuccess; } -ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { - if (comm->nvlsSupport == 0 || comm->nvlsChannels == 0) return ncclSuccess; +ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + if (comm && comm->nvlsSupport && comm->nNodes > 1) { + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_NVLS], 0), ret, fail); + INFO(NCCL_INIT, "Connected NVLS tree"); + } +exit: + return ret; +fail: + goto exit; +} - int nHeads = comm->channels[0].nvls.nHeads; - int headRank = comm->channels[0].nvls.headRank; +static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularity_flags mcOption, const CUmemAccessDesc* desc, size_t* sizePtr, CUmemGenericAllocationHandle* ucHandle, CUmemGenericAllocationHandle* mcHandle, void** ucptr, void** mcptr) { + char shareableHandle[NVLS_HANDLE_SIZE]; + CUmulticastObjectProp mcprop; + CUmemAllocationProp ucprop; + ncclResult_t ret = ncclSuccess; + size_t size = *sizePtr; + size_t originSize = size; + size_t ucgran, mcgran; + + memset(&mcprop, 0, sizeof(CUmulticastObjectProp)); + mcprop.numDevices = comm->localRanks; + mcprop.handleTypes = ncclCuMemHandleType; + mcprop.flags = 0; + mcprop.size = size; + CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, mcOption), ret, fail); + ALIGN_SIZE(size, mcgran); + *sizePtr = mcprop.size = size; + + if (comm->localRank == 0) { + NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + } else { + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], mcHandle), ret, fail); + } + + CUCHECKGOTO(cuMulticastAddDevice(*mcHandle, comm->cudaDev), ret, fail); + + memset(&ucprop, 0, sizeof(CUmemAllocationProp)); + ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + ucprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + ucprop.location.id = comm->cudaDev; + 
ucprop.requestedHandleTypes = ncclCuMemHandleType; + CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); + // Map a VA for UC memory + CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ucptr, size, ucgran, 0U, 0), ret, fail); + + // Alloc local physical mem for this NVLS group + CUCHECKGOTO(cuMemCreate(ucHandle, size, &ucprop, 0), ret, fail); + CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, size, 0, *ucHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, size, desc, 1), ret, fail); + CUDACHECKGOTO(cudaMemset(*ucptr, 0, size), ret, fail); + + // Bind physical memory to the Multicast group + // NB: It will block until all ranks have been added to the Group + CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, size, 0/*flags*/), ret, fail); + + // Map mc virtual address + CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)mcptr, size, mcgran, 0U, 0), ret, fail); + CUCHECKGOTO(cuMemMap((CUdeviceptr)*mcptr, size, 0, *mcHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*mcptr, size, desc, 1), ret, fail); + INFO(NCCL_NVLS, "NVLS rank %d (dev %d) alloc done, ucptr %p ucgran %ld mcptr %p mcgran %ld size %ld (%ld)", comm->rank, comm->cudaDev, *ucptr, ucgran, *mcptr, mcgran, size, originSize); + +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { + int nHeads = -1; + int headRank = -1; + ncclResult_t res = ncclSuccess; + int nvlsStepSize = -1; + size_t buffSize = 0; + size_t nvlsPerRankSize = 0; + size_t nvlsTotalSize = 0; + struct ncclNvlsSharedRes* resources = NULL; + int nChannels = -1; + + if (comm->nvlsSupport == 0 || comm->nvlsResources->inited) return ncclSuccess; + // initialize after checking comm->nvlsSupport + nHeads = comm->channels[0].nvls.nHeads; + headRank = comm->channels[0].nvls.headRank; + resources = comm->nvlsResources; + nChannels = comm->nvlsResources->nChannels; + nvlsStepSize = comm->nvlsChunkSize; + buffSize = nvlsStepSize * NCCL_STEPS; + nvlsPerRankSize = nChannels * 2 * buffSize; + nvlsTotalSize = nvlsPerRankSize * nHeads; + + INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zu nvlsPerRankSize %zu nvlsTotalSize %zu", + comm, headRank, nHeads, buffSize, nvlsPerRankSize, nvlsTotalSize); + + NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_RECOMMENDED, &resources->accessDesc, &nvlsTotalSize, &resources->ucBuffHandle, &resources->mcBuffHandle, (void**)&resources->ucBuff, (void**)&resources->mcBuff), res, fail); + resources->buffSize = nvlsTotalSize; + + NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail); + for (int h = 0; h < nHeads; h++) { + int nvlsPeer = comm->nRanks + 1 + h; + for (int c = 0; c < nChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + struct ncclChannelPeer* peer = channel->peers[nvlsPeer]; + + // Reduce UC -> MC + peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = resources->ucBuff + (h * 2 * nChannels + c) * buffSize; + peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = resources->mcBuff + (h * 2 * nChannels + c) * buffSize; + + // Broadcast MC -> UC + peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = resources->ucBuff + ((h * 2 + 1) * nChannels + c) * buffSize; + peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * buffSize; + + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), 
cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + } + } + + NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail); + // For now, the barrier is a must that guarantees all buffers are mc-mapped before accessing peer's buffer + NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, fail); + comm->nvlsResources->inited = true; + +exit: + return res; +fail: + comm->nvlsResources->inited = false; + goto exit; +} + +ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { + ncclResult_t res = ncclSuccess; + size_t typeSize; char shmPath[sizeof("/dev/shm/nccl-XXXXXX")]; uintptr_t *nvlsShmem = NULL; - size_t typeSize; + bool nvlsShare = parent && parent->nvlsSupport && parent->config.splitShare; + int nHeads = comm->channels[0].nvls.nHeads; - CUdevice dev; - CUCHECK(cuCtxGetDevice(&dev)); + if (comm->nvlsSupport == 0 || comm->nvlsChannels == 0) return ncclSuccess; - ncclResult_t res = ncclSuccess; - bool nvlsShare = true; - if (parent && parent->nvlsSupport && parent->config.splitShare && parent->localRanks == comm->localRanks) + if (nvlsShare && parent->channels[0].nvls.nHeads == nHeads) { + for (int ch = 0; ch < nHeads; ++ch) { + bool find = false; + for (int h = 0; h < parent->channels[0].nvls.nHeads; ++h) { + if (comm->nvlsHeads[ch] == parent->nvlsHeads[h]) { + // find the head + find = true; + break; + } + } + if (find == false) { + nvlsShare = false; + goto setup; + } + } nvlsShare = true; - else + } else { nvlsShare = false; + } +setup: + comm->nvlsChunkSize = ncclParamNvlsChunkSize(); if (nvlsShare) { /* reuse NVLS resources */ comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels); for (int c = 0; c < comm->nChannels; c++) { - NCCLCHECKGOTO(initNvlsChannel(comm, c, parent, true), res, cleanup); + NCCLCHECKGOTO(initNvlsChannel(comm, c, parent, true), res, fail); } comm->nvlsResources = parent->nvlsResources; ncclAtomicRefCountIncrement(&parent->nvlsResources->refCount); } else { + struct ncclNvlsSharedRes* resources = NULL; + int nHeads = comm->channels[0].nvls.nHeads; int nChannels = comm->nChannels; - struct ncclNvlsSharedRes* resources; - - NCCLCHECK(ncclCalloc(&resources, 1)); - comm->nvlsResources = resources; - resources->refCount = 1; - - if (parent && parent->config.splitShare) { + size_t memSize = 16; + size_t creditSize = nChannels * 2 * memSize * nHeads; + int nvlsStepSize = comm->nvlsChunkSize; + + NCCLCHECKGOTO(ncclCalloc(&comm->nvlsResources, 1), res, fail); + comm->nvlsResources->inited = false; + comm->nvlsResources->refCount = 1; + comm->nvlsResources->nChannels = comm->nvlsChannels; + 
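/* Editor's note: only a small "credit" region is allocated at setup time:
 * creditSize = nChannels * 2 * memSize * nHeads, with memSize = 16 bytes per
 * head/channel/direction slot holding the head counter at offset 0 and the tail
 * counter at memSize/2. The full NVLS data buffers (buffSize = nvlsChunkSize *
 * NCCL_STEPS per slot) are not allocated here; they are created on demand by
 * ncclNvlsBufferSetup above, which also sets nvlsResources->inited. */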
resources = comm->nvlsResources; + + if (parent && parent->nvlsSupport && parent->config.splitShare) { /* ranks on other nodes might share the NVLS resources, we need to cap nvlsChannels * to make sure nvlsChannels match for each rank. */ comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels); } + comm->nvlsResources->nChannels = comm->nvlsChannels; - resources->nChannels = comm->nvlsChannels; - for (int c = 0; c < nChannels; c++) { - NCCLCHECK(initNvlsChannel(comm, c, parent, false)); + for (int c = 0; c < comm->nChannels; c++) { + NCCLCHECKGOTO(initNvlsChannel(comm, c, NULL, false), res, fail); } - int nvlsStepSize = comm->nvlsChunkSize = ncclParamNvlsChunkSize(); - size_t buffSize = nvlsStepSize * NCCL_STEPS; - size_t memSize = NVLS_MEM_ALIGN_SIZE; - size_t nvlsPerRankSize = nChannels * 2 * (buffSize + memSize); - size_t nvlsTotalSize = nvlsPerRankSize * nHeads; - - INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zi memSize %zi nvlsPerRankSize %zi nvlsTotalSize %zi", - comm, headRank, nHeads, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize); - - char* shareableHandle = resources->shareableHandle; - NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, nvlsTotalSize), res, cleanup); - if (comm->localRank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, &resources->properties, comm->localRank, comm->localRanks, &resources->mcHandle, shareableHandle), res, cleanup); - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup); - } else { - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup); - NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &resources->mcHandle), res, cleanup); - } + memset(&resources->accessDesc, 0, sizeof(resources->accessDesc)); + resources->accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + resources->accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + resources->accessDesc.location.id = comm->cudaDev; + resources->dev = comm->cudaDev; - NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup); - NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup); - if (comm->localRanks > 1) { - // Local intra-node barrier to ensure everyone has bound their memory to the group - NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup); - } - if (comm->MNNVL) { - // MNNVL: Clique wide barrier to ensure everyone has bound their memory to the group - NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->clique.ranks, comm->cliqueRank, comm->clique.size, comm->clique.ranks[0]), res, cleanup); - } - NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup); + NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_MINIMUM, &resources->accessDesc, &creditSize, &resources->ucCreditHandle, &resources->mcCreditHandle, (void**)&resources->ucCredit, (void**)&resources->mcCredit), res, fail); + resources->creditSize = creditSize; + // Set up head and tail only for now + NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail); for (int h = 0; h < nHeads; h++) { int nvlsPeer = comm->nRanks + 1 + h; for (int c = 0; c < nChannels; c++) { @@ -346,77 +390,72 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, 
struct ncclComm* parent) { struct ncclChannelPeer* peer = channel->peers[nvlsPeer]; // Reduce UC -> MC - mem = resources->ucBuff + (h * 2 * nChannels + c) * (buffSize + memSize); + mem = resources->ucCredit + (h * 2 * nChannels + c) * memSize; peer->send[1].transportComm = &nvlsTransport.send; - peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem; - peer->send[1].conn.head = (uint64_t*)(mem + buffSize); - peer->send[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); + peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = NULL; + peer->send[1].conn.head = (uint64_t*)mem; + peer->send[1].conn.tail = (uint64_t*)(mem + memSize / 2); peer->send[1].conn.stepSize = nvlsStepSize; - mem = resources->mcBuff + (h * 2 * nChannels + c) * (buffSize + memSize); + mem = resources->mcCredit + (h * 2 * nChannels + c) * memSize; peer->recv[0].transportComm = &nvlsTransport.recv; - peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem; - peer->recv[0].conn.head = (uint64_t*)(mem + buffSize); - peer->recv[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); + peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = NULL; + peer->recv[0].conn.head = (uint64_t*)mem; + peer->recv[0].conn.tail = (uint64_t*)(mem + memSize / 2); peer->recv[0].conn.stepSize = nvlsStepSize; peer->recv[0].conn.flags |= NCCL_NVLS_MIN_POLL; // Broadcast MC -> UC - mem = resources->ucBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize); + mem = resources->ucCredit + ((h * 2 + 1) * nChannels + c) * memSize; peer->recv[1].transportComm = &nvlsTransport.recv; - peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem; - peer->recv[1].conn.head = (uint64_t*)(mem + buffSize); - peer->recv[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); + peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = NULL; + peer->recv[1].conn.head = (uint64_t*)mem; + peer->recv[1].conn.tail = (uint64_t*)(mem + memSize / 2); peer->recv[1].conn.stepSize = nvlsStepSize; - mem = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize); + mem = resources->mcCredit + ((h * 2 + 1) * nChannels + c) * memSize; peer->send[0].transportComm = &nvlsTransport.send; - peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem; - peer->send[0].conn.head = (uint64_t*)(mem + buffSize); - peer->send[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); + peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = NULL; + peer->send[0].conn.head = (uint64_t*)mem; + peer->send[0].conn.tail = (uint64_t*)(mem + memSize / 2); peer->send[0].conn.stepSize = nvlsStepSize; peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL; - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); - - /*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p", - nvlsPeer, c, - resources->mcBuff + 
(h*2*nChannels+c)*(buffSize+memSize), - resources->mcBuff + ((h*2+1)*nChannels+c)*(buffSize+memSize), - resources->ucBuff + (h*2*nChannels+c)*(buffSize+memSize), - resources->ucBuff + ((h*2+1)*nChannels+c)*(buffSize+memSize));*/ + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); } } + NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail); } // MNNVL does not support NVLS buffer registration - if (comm->MNNVL) return res; + if (!comm->MNNVL && comm->nvlsResources->nvlsShmemHandle == NULL) { + /* create shared memory for fast NVLS buffer registration */ + typeSize = sizeof(struct localRegData) << 1; - /* create shared memory for fast NVLS buffer registration */ - typeSize = sizeof(struct localRegData) << 1; - - if (comm->localRank == 0) { - shmPath[0] = '\0'; - NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, cleanup); - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, cleanup); - } else { - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, cleanup); - NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, cleanup); + if (comm->localRank == 0) { + shmPath[0] = '\0'; + NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail); + } else { + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail); + NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, fail); + } + /* need 2 pools and a shared counter for shmem-based collectives */ + comm->nvlsResources->nvlsShmem.cnt[0] = (size_t*)nvlsShmem; + comm->nvlsResources->nvlsShmem.ptr[0] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[0] + sizeof(size_t)); + comm->nvlsResources->nvlsShmem.cnt[1] = (size_t*)((char*)comm->nvlsResources->nvlsShmem.ptr[0] 
+ typeSize * comm->localRanks); + comm->nvlsResources->nvlsShmem.ptr[1] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[1] + sizeof(size_t)); + comm->nvlsResources->nvlsShmem.round = 0; + comm->nvlsResources->nvlsShmem.maxTypeSize = typeSize; } - /* need 2 pools and a shared counter for shmem-based collectives */ - comm->nvlsResources->nvlsShmem.cnt[0] = (size_t*)nvlsShmem; - comm->nvlsResources->nvlsShmem.ptr[0] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[0] + sizeof(size_t)); - comm->nvlsResources->nvlsShmem.cnt[1] = (size_t*)((char*)comm->nvlsResources->nvlsShmem.ptr[0] + typeSize * comm->localRanks); - comm->nvlsResources->nvlsShmem.ptr[1] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[1] + sizeof(size_t)); - comm->nvlsResources->nvlsShmem.round = 0; - comm->nvlsResources->nvlsShmem.maxTypeSize = typeSize; +exit: return res; - -cleanup: +fail: comm->nvlsSupport = 0; - return res; + goto exit; } ncclResult_t ncclNvlsFree(struct ncclComm* comm) { @@ -424,9 +463,18 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) { if (resources == NULL) return ncclSuccess; if (ncclAtomicRefCountDecrement(&resources->refCount) == 0) { - NCCLCHECK(ncclShmClose(resources->nvlsShmemHandle)); - NCCLCHECK(nvlsGroupUnbind(comm, resources)); - NCCLCHECK(nvlsGroupUnmapMem(comm, resources)); + if (!comm->MNNVL && resources->nvlsShmemHandle) + NCCLCHECK(ncclShmClose(resources->nvlsShmemHandle)); + + if (resources->ucCredit && resources->mcCredit) { + NCCLCHECK(nvlsGroupUnbind(comm, resources->creditSize, &resources->mcCreditHandle)); + NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditSize, resources->ucCredit, &resources->ucCreditHandle, resources->mcCredit, &resources->mcCreditHandle)); + } + + if (comm->nvlsResources->inited) { + NCCLCHECK(nvlsGroupUnbind(comm, resources->buffSize, &resources->mcBuffHandle)); + NCCLCHECK(nvlsGroupUnmapMem(comm, resources->buffSize, resources->ucBuff, &resources->ucBuffHandle, resources->mcBuff, &resources->mcBuffHandle)); + } free(resources); comm->nvlsResources = NULL; } @@ -437,14 +485,15 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t ncclResult_t ret = ncclSuccess; struct ncclReg *regRecord = NULL; CUdeviceptr regPtr = 0; - CUmulticastObjectProp prop; + CUmulticastObjectProp mcprop; + CUmemAllocationProp ucprop; char shareableHandle[NVLS_HANDLE_SIZE]; CUmemGenericAllocationHandle mcHandle; - size_t granularity; size_t minSize = SIZE_MAX; bool localRegBufUsed = false; struct localRegData* regData = NULL; cudaPointerAttributes attr; + size_t ucgran, mcgran; NCCLCHECKGOTO(ncclCalloc(®Data, comm->localRanks), ret, fail); @@ -454,17 +503,28 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t CUDACHECK(cudaPointerGetAttributes(&attr, (void*)regRecord->addr)); if (attr.type == cudaMemoryTypeDevice) { size_t regSize = regRecord->pages * comm->regCache.pageSize; - prop = comm->nvlsResources->properties; - prop.size = regSize; - CUCHECK(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + memset(&mcprop, 0, sizeof(CUmulticastObjectProp)); + mcprop.numDevices = comm->localRanks; + mcprop.handleTypes = ncclCuMemHandleType; + mcprop.flags = 0; + mcprop.size = regSize; + CUCHECK(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + + memset(&ucprop, 0, sizeof(CUmemAllocationProp)); + ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + ucprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + ucprop.location.id = comm->cudaDev; + 
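/* Editor's note: user-buffer registration is only attempted when the buffer lines
 * up with both granularities queried here: the base address must be a multiple of
 * the UC allocation granularity (ucgran) and the registered size a multiple of the
 * multicast granularity (mcgran); see the NVLS_REG_POSSIBLE check below. Buffers
 * that do not meet these alignment requirements are simply not NVLS-registered. */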
ucprop.requestedHandleTypes = ncclCuMemHandleType; + CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); + CUCHECK(cuMemGetAddressRange((CUdeviceptr*)®Record->baseAddr, ®Record->baseSize, (CUdeviceptr)regRecord->addr)); - if (regSize % granularity == 0) { + if (regSize % mcgran == 0) { regRecord->regSize = regSize; } else { regRecord->regSize = regRecord->baseSize - (regRecord->addr - regRecord->baseAddr); } - if (regRecord->addr % comm->nvlsResources->ucGran == 0 && regRecord->regSize % granularity == 0) { + if (regRecord->addr % ucgran == 0 && regRecord->regSize % mcgran == 0) { regRecord->state |= NVLS_REG_POSSIBLE; memcpy(®Data[comm->localRank].reg, regRecord, sizeof(struct ncclReg)); regData[comm->localRank].offset = userBuff - regRecord->addr; @@ -489,11 +549,10 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t } /* start registration */ - memcpy(&prop, &comm->nvlsResources->properties, sizeof(CUmulticastObjectProp)); - prop.size = minSize; - CUCHECKGOTO(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); + mcprop.size = minSize; + CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); if (comm->localRank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, &prop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); + NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); } else { NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); @@ -504,7 +563,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail); // Create a VA for the NVLS - CUCHECKGOTO(cuMemAddressReserve(®Ptr, minSize, granularity, 0U, 0), ret, fail); + CUCHECKGOTO(cuMemAddressReserve(®Ptr, minSize, mcgran, 0U, 0), ret, fail); // Map the VA locally CUCHECKGOTO(cuMemMap(regPtr, minSize, 0, mcHandle, 0), ret, fail); CUCHECKGOTO(cuMemSetAccess(regPtr, minSize, &comm->nvlsResources->accessDesc, 1), ret, fail); @@ -639,14 +698,35 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send goto exit; } -ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { +struct ncclNvlsCleanupCallback { + struct ncclCommCallback base; + CUmemGenericAllocationHandle mcHandle; + CUdeviceptr ptr; + int dev; + size_t size; +}; + +static ncclResult_t cleanupNvls(struct ncclComm* comm, struct ncclCommCallback* cb) { + struct ncclNvlsCleanupCallback* obj = (struct ncclNvlsCleanupCallback*)cb; + NCCLCHECK(ncclNvlsDeregBuffer(&obj->mcHandle, obj->ptr, obj->dev, obj->size)); + INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)obj->ptr, obj->dev, obj->size); + free(obj); + return ncclSuccess; +} + +ncclResult_t ncclNvlsGraphRegisterBuffer( + struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t 
recvbuffSize,
+  bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv,
+  struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueEltsAdded
+  ) {
   ncclResult_t ret = ncclSuccess;
   bool localRegBufUsed = false;
-  struct ncclNvlsMcHandleList* sendRecord = NULL;
-  struct ncclNvlsMcHandleList* recvRecord = NULL;
+  struct ncclNvlsCleanupCallback* sendRecord = NULL;
+  struct ncclNvlsCleanupCallback* recvRecord = NULL;
   CUdeviceptr regSendPtr = 0;
   CUdeviceptr regRecvPtr = 0;
-  CUmulticastObjectProp prop;
+  CUmulticastObjectProp mcprop;
+  CUmemAllocationProp ucprop;
   char shareableHandle[NVLS_HANDLE_SIZE];
   CUmemGenericAllocationHandle sendMcHandle, recvMcHandle;
   size_t sendGran = 0, recvGran = 0;
@@ -656,6 +736,7 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
   const void *baseRecv = NULL;
   size_t baseSendSize = 1;
   size_t baseRecvSize = 1;
+  size_t ucgran;
 
   *outRegBufUsed = false;
   NCCLCHECKGOTO(ncclCalloc(&regBufFlags, comm->localRanks), ret, fail);
@@ -669,16 +750,27 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
   if (recvbuff != NULL)
     CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff), ret, fail);
 
-  localRegBufUsed = ((uint64_t)baseSend % comm->nvlsResources->ucGran != 0 || (uint64_t)baseRecv % comm->nvlsResources->ucGran != 0) ? false : true;
+  memset(&ucprop, 0, sizeof(CUmemAllocationProp));
+  ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  ucprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  ucprop.location.id = comm->cudaDev;
+  ucprop.requestedHandleTypes = ncclCuMemHandleType;
+  CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
+
+  localRegBufUsed = ((uint64_t)baseSend % ucgran != 0 || (uint64_t)baseRecv % ucgran != 0) ? false : true;
   regBufFlags[comm->localRank] = localRegBufUsed;
   NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regBufFlags, sizeof(bool)), ret, fail);
   for (int i = 0; i < comm->localRanks; ++i)
     if (regBufFlags[i] == false) goto fail;
 
-  memcpy(&prop, &comm->nvlsResources->properties, sizeof(CUmulticastObjectProp));
+  memset(&mcprop, 0, sizeof(CUmulticastObjectProp));
+  mcprop.numDevices = comm->localRanks;
+  mcprop.handleTypes = ncclCuMemHandleType;
+  mcprop.flags = 0;
+
   if (sendbuff != NULL) {
-    prop.size = baseSendSize;
-    CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
+    mcprop.size = baseSendSize;
+    CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
 
     /* check send buffer offset and size */
     rdata[comm->localRank].offset = (uintptr_t)sendbuff - (uintptr_t)baseSend;
@@ -691,11 +783,11 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
     }
 
     if (baseSendSize % sendGran != 0) goto fail;
-    prop.size = baseSendSize;
+    mcprop.size = baseSendSize;
 
     /* register sendbuff */
     if (comm->localRank == 0) {
-      NCCLCHECKGOTO(nvlsGroupCreate(comm, &prop, comm->localRank, comm->localRanks, &sendMcHandle, shareableHandle), ret, fail);
+      NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &sendMcHandle, shareableHandle), ret, fail);
       NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
     } else {
       NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
@@ -711,7 +803,8 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
       CUCHECKGOTO(cuMemMap(regSendPtr, baseSendSize, 0, sendMcHandle, 0), ret, fail);
       CUCHECKGOTO(cuMemSetAccess(regSendPtr, baseSendSize, &comm->nvlsResources->accessDesc, 1), ret, fail);
 
-      sendRecord = ncclMemoryPoolAlloc(&comm->memPool_ncclNvlsHandleList, &comm->memPermanent);
+      sendRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback));
+      sendRecord->base.fn = cleanupNvls;
       sendRecord->mcHandle = sendMcHandle;
       sendRecord->ptr = regSendPtr;
       sendRecord->dev = comm->nvlsResources->dev;
@@ -719,8 +812,8 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
   }
 
   if (recvbuff != NULL) {
-    prop.size = baseRecvSize;
-    CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
+    mcprop.size = baseRecvSize;
+    CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
 
     rdata[comm->localRank].offset = (uintptr_t)recvbuff - (uintptr_t)baseRecv;
     rdata[comm->localRank].size = baseRecvSize;
@@ -732,9 +825,9 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
     }
 
     if (baseRecvSize % recvGran != 0) goto fail;
-    prop.size = baseRecvSize;
+    mcprop.size = baseRecvSize;
     if (comm->localRank == 0) {
-      NCCLCHECKGOTO(nvlsGroupCreate(comm, &prop, comm->localRank, comm->localRanks, &recvMcHandle, shareableHandle), ret, fail);
+      NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &recvMcHandle, shareableHandle), ret, fail);
       NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
     } else {
       NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
@@ -750,7 +843,8 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
       CUCHECKGOTO(cuMemMap(regRecvPtr, baseRecvSize, 0, recvMcHandle, 0), ret, fail);
       CUCHECKGOTO(cuMemSetAccess(regRecvPtr, baseRecvSize, &comm->nvlsResources->accessDesc, 1), ret, fail);
 
-      recvRecord = ncclMemoryPoolAlloc(&comm->memPool_ncclNvlsHandleList, &comm->memPermanent);
+      recvRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback));
+      recvRecord->base.fn = cleanupNvls;
       recvRecord->mcHandle = recvMcHandle;
       recvRecord->ptr = regRecvPtr;
       recvRecord->dev = comm->nvlsResources->dev;
@@ -764,22 +858,24 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
   if (localRegBufUsed == false) {
     if (sendRecord) {
       ncclNvlsDeregBuffer(&sendRecord->mcHandle, sendRecord->ptr, sendRecord->dev, sendRecord->size);
-      ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, sendRecord);
+      free(sendRecord);
     }
 
     if (recvRecord) {
       ncclNvlsDeregBuffer(&recvRecord->mcHandle, recvRecord->ptr, recvRecord->dev, recvRecord->size);
-      ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, recvRecord);
+      free(recvRecord);
    }
   } else {
     if (sendRecord) {
       *outRegBufSend = (void*)((uintptr_t)regSendPtr + (uintptr_t)sendbuff - (uintptr_t)baseSend);
-      ncclIntruQueueEnqueue(&plan->nvlsMcHandleQueue, sendRecord);
+      ncclIntruQueueEnqueue(cleanupQueue, &sendRecord->base);
+      *nCleanupQueueEltsAdded += 1;
     }
 
     if (recvRecord) {
       *outRegBufRecv = (void*)((uintptr_t)regRecvPtr + (uintptr_t)recvbuff - (uintptr_t)baseRecv);
-      ncclIntruQueueEnqueue(&plan->nvlsMcHandleQueue, recvRecord);
+      ncclIntruQueueEnqueue(cleanupQueue, &recvRecord->base);
+      *nCleanupQueueEltsAdded += 1;
     }
 
     INFO(NCCL_NVLS, "rank %d successfully graph-registered sendbuff %p, recvbuff %p, sendbuff size %ld (register size %ld, sendGran %ld), recvbuff size %ld (register size %ld, recvGran %ld), reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, baseSendSize, sendGran, recvbuffSize, baseRecvSize, recvGran, (void*)regSendPtr, (void*)regRecvPtr);
@@ -806,6 +902,10 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
   return ncclSuccess;
 }
 
+ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) {
+  return ncclSuccess;
+}
+
 ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
   return ncclSuccess;
 }
@@ -814,7 +914,15 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) {
+ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm) {
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNvlsGraphRegisterBuffer(
+  struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize,
+  bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv,
+  struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueEltsAdded
+  ) {
   *outRegBufUsed = false;
   return ncclSuccess;
 }
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index 17a5d69ee..90a714b40 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -9,6 +9,7 @@
 #include "utils.h"
 #include "shm.h"
#include "p2p.h" +#include "transport.h" enum p2pType { P2P_DIRECT, P2P_INTERMEDIATE, P2P_IPC, P2P_CUMEM }; @@ -223,7 +224,7 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, v CUDACHECK(res); } } - INFO(NCCL_P2P|NCCL_ALLOC, "Allocated shareable buffer %p size %zi ipcDesc %p", *ptr, size, ipcDesc); + INFO(NCCL_P2P|NCCL_ALLOC, "Allocated shareable buffer %p size %zu ipcDesc %p", *ptr, size, ipcDesc); return ncclSuccess; } @@ -256,7 +257,7 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz CUCHECK(cuMemAddressReserve(&dptr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0)); CUCHECK(cuMemMap(dptr, size, /* offset */ 0, handle, /* flags */ 0)); - TRACE(NCCL_P2P, "Imported shareable buffer size %zi handle 0x%llx dptr %p", size, handle, (void*)dptr); + TRACE(NCCL_P2P, "Imported shareable buffer size %zu handle 0x%llx dptr %p", size, handle, (void*)dptr); // Allow access by the local GPU CUmemAccessDesc accessDesc = {}; @@ -264,7 +265,7 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz accessDesc.location.id = comm->cudaDev; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; CUCHECK(cuMemSetAccess(dptr, size, &accessDesc, 1)); - TRACE(NCCL_P2P, "Set Access for %p size %zi on dev %d", (void*)dptr, size, accessDesc.location.id); + TRACE(NCCL_P2P, "Set Access for %p size %zu on dev %d", (void*)dptr, size, accessDesc.location.id); *devMemPtr = (void *)dptr; #else @@ -275,7 +276,7 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz CUDACHECK(cudaIpcOpenMemHandle(devMemPtr, ipcDesc->devIpc, cudaIpcMemLazyEnablePeerAccess)); } - INFO(NCCL_P2P, "Imported shareable buffer device %d size %zi ptr %p", comm->cudaDev, size, *devMemPtr); + INFO(NCCL_P2P, "Imported shareable buffer device %d size %zu ptr %p", comm->cudaDev, size, *devMemPtr); return ncclSuccess; } @@ -318,7 +319,7 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = myInfo->cudaDev; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - INFO(NCCL_P2P, "Set Access for buffer %p size %zi on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev); + INFO(NCCL_P2P, "Set Access for buffer %p size %zu on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev); CUCHECK(cuMemSetAccess((CUdeviceptr) p2pBuff->directPtr, p2pBuff->size, &accessDesc, 1)); } #endif diff --git a/src/transport/shm.cc b/src/transport/shm.cc index 9f2f2fc26..7fc6251b6 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -6,6 +6,7 @@ #include "comm.h" #include "shm.h" +#include "transport.h" struct shmConnectInfo { char shmName[7];