Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[CUDAX] Remove launch overloads taking dimensions and make everything except make_hierarchy return kernel_config #2979

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions cudax/examples/simple_p2p.cu
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,11 @@ namespace cudax = cuda::experimental;

struct simple_kernel
{
template <typename Dimensions>
__device__ void operator()(Dimensions dims, ::cuda::std::span<const float> src, ::cuda::std::span<float> dst)
template <typename Configuration>
__device__ void operator()(Configuration config, ::cuda::std::span<const float> src, ::cuda::std::span<float> dst)
{
// Just a dummy kernel, doing enough for us to verify that everything worked
const auto idx = dims.rank(cudax::thread);
const auto idx = config.dims.rank(cudax::thread);
dst[idx] = src[idx] * 2.0f;
}
};
Expand Down Expand Up @@ -131,15 +131,15 @@ void test_cross_device_access_from_kernel(
dev1_stream.wait(dev0_stream);

// Kernel launch configuration
auto dims = cudax::distribute<512>(dev0_buffer.size());
auto config = cudax::distribute<512>(dev0_buffer.size());

// Run kernel on GPU 1, reading input from the GPU 0 buffer, writing output to the GPU 1 buffer
printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
"GPU%d...\n",
dev1.get(),
dev0.get(),
dev1.get());
cudax::launch(dev1_stream, dims, simple_kernel{}, dev0_buffer, dev1_buffer);
cudax::launch(dev1_stream, config, simple_kernel{}, dev0_buffer, dev1_buffer);
dev0_stream.wait(dev1_stream);

// Run kernel on GPU 0, reading input from the GPU 1 buffer, writing output to the GPU 0 buffer
Expand All @@ -148,7 +148,7 @@ void test_cross_device_access_from_kernel(
dev0.get(),
dev1.get(),
dev0.get());
cudax::launch(dev0_stream, dims, simple_kernel{}, dev1_buffer, dev0_buffer);
cudax::launch(dev0_stream, config, simple_kernel{}, dev1_buffer, dev0_buffer);

// Copy data back to host and verify
printf("Copy data back to host from GPU%d and verify results...\n", dev0.get());
Expand Down
7 changes: 4 additions & 3 deletions cudax/examples/vector_add.cu
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,12 @@ try

// Define the kernel launch parameters
constexpr int threadsPerBlock = 256;
auto dims = cudax::distribute<threadsPerBlock>(numElements);
auto config = cudax::distribute<threadsPerBlock>(numElements);

// Launch the vectorAdd kernel
printf("CUDA kernel launch with %d blocks of %d threads\n", dims.count(cudax::block, cudax::grid), threadsPerBlock);
cudax::launch(stream, dims, vectorAdd, in(A), in(B), out(C));
printf(
"CUDA kernel launch with %d blocks of %d threads\n", config.dims.count(cudax::block, cudax::grid), threadsPerBlock);
cudax::launch(stream, config, vectorAdd, in(A), in(B), out(C));

printf("waiting for the stream to finish\n");
stream.wait();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -860,44 +860,6 @@ constexpr auto make_hierarchy(L1 l1, Levels... ls) noexcept
return detail::__make_hierarchy_fragment<thread_level>()(detail::__as_level(l1), detail::__as_level(ls)...);
}

// operator& could be dropped eventually, but it's convenient for in-line construction
// TODO accept forwarding references
template <typename LUnit, typename LNew, typename... Levels>
_CUDAX_API constexpr auto operator&(const hierarchy_dimensions_fragment<LUnit, Levels...>& ls, LNew lnew) noexcept
{
  auto lvl = detail::__as_level(lnew);
  using AddedLevel  = decltype(lvl);
  using TopLevel    = __level_type_of<::cuda::std::__type_index_c<0, Levels...>>;
  using BottomLevel = __level_type_of<::cuda::std::__type_index_c<sizeof...(Levels) - 1, Levels...>>;

  if constexpr (detail::can_rhs_stack_on_lhs<TopLevel, __level_type_of<AddedLevel>>)
  {
    // The added level sits above the current top of the hierarchy: prepend it.
    return hierarchy_dimensions_fragment<LUnit, AddedLevel, Levels...>(
      ::cuda::std::tuple_cat(::cuda::std::make_tuple(lvl), ls.levels));
  }
  else
  {
    // Otherwise it must fit below the current bottom; any other ordering is rejected at compile time.
    static_assert(detail::can_rhs_stack_on_lhs<__level_type_of<AddedLevel>, BottomLevel>,
                  "Not supported order of levels in hierarchy");
    using NewUnit = detail::__default_unit_below<__level_type_of<AddedLevel>>;
    return hierarchy_dimensions_fragment<NewUnit, Levels..., AddedLevel>(
      ::cuda::std::tuple_cat(ls.levels, ::cuda::std::make_tuple(lvl)));
  }
}

// Commutative overload: a bare level may appear on the left-hand side of `&` as well.
template <typename L1, typename LUnit, typename... Levels>
_CUDAX_API constexpr auto operator&(L1 l1, const hierarchy_dimensions_fragment<LUnit, Levels...>& ls) noexcept
{
  // Delegate to the (hierarchy, level) overload; placement is decided there.
  return operator&(ls, l1);
}

// Combine two bare levels into a hierarchy: wrap the first into a single-level
// hierarchy, then append the second through the hierarchy/level overload.
template <typename L1, typename Dims1, typename L2, typename Dims2>
_CUDAX_API constexpr auto
operator&(const level_dimensions<L1, Dims1>& l1, const level_dimensions<L2, Dims2>& l2) noexcept
{
  auto single_level = hierarchy_dimensions<level_dimensions<L1, Dims1>>(l1);
  return single_level & l2;
}

/**
* @brief Add a level to a hierarchy
*
Expand All @@ -921,35 +883,26 @@ operator&(const level_dimensions<L1, Dims1>& l1, const level_dimensions<L2, Dims
* @par
*/
template <typename NewLevel, typename Unit, typename... Levels>
constexpr auto hierarchy_add_level(const hierarchy_dimensions_fragment<Unit, Levels...>& hierarchy, NewLevel level)
constexpr auto hierarchy_add_level(const hierarchy_dimensions_fragment<Unit, Levels...>& hierarchy, NewLevel lnew)
{
return hierarchy & level;
}
auto new_level = detail::__as_level(lnew);
using AddedLevel = decltype(new_level);
using top_level = __level_type_of<::cuda::std::__type_index_c<0, Levels...>>;
using bottom_level = __level_type_of<::cuda::std::__type_index_c<sizeof...(Levels) - 1, Levels...>>;

/**
* @brief A shorthand for creating a hierarchy of CUDA threads by evenly
* distributing elements among blocks and threads.
*
* @par Snippet
* @code
* #include <cudax/hierarchy_dimensions.cuh>
* using namespace cuda::experimental;
*
* constexpr int threadsPerBlock = 256;
* auto dims = distribute<threadsPerBlock>(numElements);
*
* // Equivalent to:
* constexpr int threadsPerBlock = 256;
* int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
* auto dims = make_hierarchy(grid_dims(blocksPerGrid), block_dims<threadsPerBlock>());
* @endcode
*/
template <int _ThreadsPerBlock>
constexpr auto distribute(int numElements) noexcept
{
int blocksPerGrid = (numElements + _ThreadsPerBlock - 1) / _ThreadsPerBlock;
return ::cuda::experimental::make_hierarchy(
::cuda::experimental::grid_dims(blocksPerGrid), ::cuda::experimental::block_dims<_ThreadsPerBlock>());
if constexpr (detail::can_rhs_stack_on_lhs<top_level, __level_type_of<AddedLevel>>)
{
return hierarchy_dimensions_fragment<Unit, AddedLevel, Levels...>(
::cuda::std::tuple_cat(::cuda::std::make_tuple(new_level), hierarchy.levels));
}
else
{
static_assert(detail::can_rhs_stack_on_lhs<__level_type_of<AddedLevel>, bottom_level>,
"Not supported order of levels in hierarchy");
using NewUnit = detail::__default_unit_below<__level_type_of<AddedLevel>>;
return hierarchy_dimensions_fragment<NewUnit, Levels..., AddedLevel>(
::cuda::std::tuple_cat(hierarchy.levels, ::cuda::std::make_tuple(new_level)));
}
}

} // namespace cuda::experimental
Expand Down
47 changes: 47 additions & 0 deletions cudax/include/cuda/experimental/__launch/configuration.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,28 @@ struct kernel_config
}
};

// operator& could be dropped eventually, but it's convenient for in-line construction
template <typename Dimensions, typename... Options, typename NewLevel>
_CUDAX_HOST_API constexpr auto
operator&(const kernel_config<Dimensions, Options...>& config, const NewLevel& new_level) noexcept
{
  // Grow the thread hierarchy by one level; the configured options are carried over untouched.
  auto extended_dims = hierarchy_add_level(config.dims, new_level);
  return kernel_config(extended_dims, config.options);
}

// Symmetric overload so a level can stand on either side of `&`.
template <typename NewLevel, typename Dimensions, typename... Options>
_CUDAX_HOST_API constexpr auto
operator&(const NewLevel& new_level, const kernel_config<Dimensions, Options...>& config) noexcept
{
  return kernel_config(hierarchy_add_level(config.dims, new_level), config.options);
}

// Combining two bare levels yields a kernel configuration with no options set.
template <typename L1, typename Dims1, typename L2, typename Dims2>
_CUDAX_HOST_API constexpr auto
operator&(const level_dimensions<L1, Dims1>& l1, const level_dimensions<L2, Dims2>& l2) noexcept
{
  auto dims = make_hierarchy_fragment(l1, l2);
  return kernel_config(dims);
}

template <typename Dimensions,
typename... Options,
typename Option,
Expand Down Expand Up @@ -423,6 +445,31 @@ make_config(const hierarchy_dimensions_fragment<BottomUnit, Levels...>& dims, co
return kernel_config<hierarchy_dimensions_fragment<BottomUnit, Levels...>, Opts...>(dims, opts...);
}

/**
 * @brief A shorthand for creating a kernel configuration with a hierarchy of CUDA threads evenly
 * distributing elements among blocks and threads.
 *
 * @par Snippet
 * @code
 * #include <cudax/hierarchy_dimensions.cuh>
 * using namespace cuda::experimental;
 *
 * constexpr int threadsPerBlock = 256;
 * auto dims = distribute<threadsPerBlock>(numElements);
 *
 * // Equivalent to:
 * constexpr int threadsPerBlock = 256;
 * int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
 * auto dims = make_hierarchy(grid_dims(blocksPerGrid), block_dims<threadsPerBlock>());
 * @endcode
 *
 * @param numElements Number of elements to distribute; expected to be non-negative.
 */
template <int _ThreadsPerBlock>
constexpr auto distribute(int numElements) noexcept
{
  static_assert(_ThreadsPerBlock > 0, "Threads per block must be positive");
  // Ceiling division written without the `+ _ThreadsPerBlock - 1` bias, so that values of
  // numElements near INT_MAX cannot overflow the intermediate sum (signed overflow is UB).
  const int blocksPerGrid = numElements / _ThreadsPerBlock + static_cast<int>(numElements % _ThreadsPerBlock != 0);
  return make_config(make_hierarchy(grid_dims(blocksPerGrid), block_dims<_ThreadsPerBlock>()));
}

template <typename... Args>
_CCCL_NODISCARD constexpr auto make_config(const Args&... args)
{
Expand Down
Loading
Loading