Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[CUDAX] Remove launch overloads taking dimensions and make everything except make_hierarchy return kernel_config #2979

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions cudax/examples/simple_p2p.cu
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,11 @@ namespace cudax = cuda::experimental;

struct simple_kernel
{
template <typename Dimensions>
__device__ void operator()(Dimensions dims, ::cuda::std::span<const float> src, ::cuda::std::span<float> dst)
template <typename Configuration>
__device__ void operator()(Configuration config, ::cuda::std::span<const float> src, ::cuda::std::span<float> dst)
{
// Just a dummy kernel, doing enough for us to verify that everything worked
const auto idx = dims.rank(cudax::thread);
const auto idx = config.dims.rank(cudax::thread);
dst[idx] = src[idx] * 2.0f;
}
};
Expand Down Expand Up @@ -131,15 +131,15 @@ void test_cross_device_access_from_kernel(
dev1_stream.wait(dev0_stream);

// Kernel launch configuration
auto dims = cudax::distribute<512>(dev0_buffer.size());
auto config = cudax::distribute<512>(dev0_buffer.size());

// Run kernel on GPU 1, reading input from the GPU 0 buffer, writing output to the GPU 1 buffer
printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
"GPU%d...\n",
dev1.get(),
dev0.get(),
dev1.get());
cudax::launch(dev1_stream, dims, simple_kernel{}, dev0_buffer, dev1_buffer);
cudax::launch(dev1_stream, config, simple_kernel{}, dev0_buffer, dev1_buffer);
dev0_stream.wait(dev1_stream);

// Run kernel on GPU 0, reading input from the GPU 1 buffer, writing output to the GPU 0 buffer
Expand All @@ -148,7 +148,7 @@ void test_cross_device_access_from_kernel(
dev0.get(),
dev1.get(),
dev0.get());
cudax::launch(dev0_stream, dims, simple_kernel{}, dev1_buffer, dev0_buffer);
cudax::launch(dev0_stream, config, simple_kernel{}, dev1_buffer, dev0_buffer);

// Copy data back to host and verify
printf("Copy data back to host from GPU%d and verify results...\n", dev0.get());
Expand Down
7 changes: 4 additions & 3 deletions cudax/examples/vector_add.cu
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,12 @@ try

// Define the kernel launch parameters
constexpr int threadsPerBlock = 256;
auto dims = cudax::distribute<threadsPerBlock>(numElements);
auto config = cudax::distribute<threadsPerBlock>(numElements);

// Launch the vectorAdd kernel
printf("CUDA kernel launch with %d blocks of %d threads\n", dims.count(cudax::block, cudax::grid), threadsPerBlock);
cudax::launch(stream, dims, vectorAdd, in(A), in(B), out(C));
printf(
"CUDA kernel launch with %d blocks of %d threads\n", config.dims.count(cudax::block, cudax::grid), threadsPerBlock);
cudax::launch(stream, config, vectorAdd, in(A), in(B), out(C));

printf("waiting for the stream to finish\n");
stream.wait();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -860,44 +860,6 @@ constexpr auto make_hierarchy(L1 l1, Levels... ls) noexcept
return detail::__make_hierarchy_fragment<thread_level>()(detail::__as_level(l1), detail::__as_level(ls)...);
}

// operator& could be dropped eventually, but it's convenient for in-line construction
// TODO accept forwarding references
template <typename LUnit, typename LNew, typename... Levels>
_CUDAX_API constexpr auto operator&(const hierarchy_dimensions_fragment<LUnit, Levels...>& ls, LNew lnew) noexcept
{
  auto lvl = detail::__as_level(lnew);
  using AddedLevel  = decltype(lvl);
  using TopLevel    = __level_type_of<::cuda::std::__type_index_c<0, Levels...>>;
  using BottomLevel = __level_type_of<::cuda::std::__type_index_c<sizeof...(Levels) - 1, Levels...>>;

  if constexpr (detail::can_rhs_stack_on_lhs<TopLevel, __level_type_of<AddedLevel>>)
  {
    // The added level sits above the current top of the hierarchy: prepend it.
    return hierarchy_dimensions_fragment<LUnit, AddedLevel, Levels...>(
      ::cuda::std::tuple_cat(::cuda::std::make_tuple(lvl), ls.levels));
  }
  else
  {
    // Otherwise it must fit below the current bottom; any other ordering is rejected at compile time.
    static_assert(detail::can_rhs_stack_on_lhs<__level_type_of<AddedLevel>, BottomLevel>,
                  "Not supported order of levels in hierarchy");
    using NewUnit = detail::__default_unit_below<__level_type_of<AddedLevel>>;
    return hierarchy_dimensions_fragment<NewUnit, Levels..., AddedLevel>(
      ::cuda::std::tuple_cat(ls.levels, ::cuda::std::make_tuple(lvl)));
  }
}

// Commutative overload: a bare level may appear on the left-hand side of `&` as well.
template <typename L1, typename LUnit, typename... Levels>
_CUDAX_API constexpr auto operator&(L1 l1, const hierarchy_dimensions_fragment<LUnit, Levels...>& ls) noexcept
{
  // Delegate to the (hierarchy, level) overload; placement is decided there.
  return operator&(ls, l1);
}

// Combine two bare levels into a hierarchy: wrap the first into a single-level
// hierarchy, then append the second through the hierarchy/level overload.
template <typename L1, typename Dims1, typename L2, typename Dims2>
_CUDAX_API constexpr auto
operator&(const level_dimensions<L1, Dims1>& l1, const level_dimensions<L2, Dims2>& l2) noexcept
{
  auto single_level = hierarchy_dimensions<level_dimensions<L1, Dims1>>(l1);
  return single_level & l2;
}

/**
* @brief Add a level to a hierarchy
*
Expand All @@ -921,35 +883,26 @@ operator&(const level_dimensions<L1, Dims1>& l1, const level_dimensions<L2, Dims
* @par
*/
template <typename NewLevel, typename Unit, typename... Levels>
constexpr auto hierarchy_add_level(const hierarchy_dimensions_fragment<Unit, Levels...>& hierarchy, NewLevel level)
constexpr auto hierarchy_add_level(const hierarchy_dimensions_fragment<Unit, Levels...>& hierarchy, NewLevel lnew)
{
return hierarchy & level;
}
auto new_level = detail::__as_level(lnew);
using AddedLevel = decltype(new_level);
using top_level = __level_type_of<::cuda::std::__type_index_c<0, Levels...>>;
using bottom_level = __level_type_of<::cuda::std::__type_index_c<sizeof...(Levels) - 1, Levels...>>;

/**
* @brief A shorthand for creating a hierarchy of CUDA threads by evenly
* distributing elements among blocks and threads.
*
* @par Snippet
* @code
* #include <cudax/hierarchy_dimensions.cuh>
* using namespace cuda::experimental;
*
* constexpr int threadsPerBlock = 256;
* auto dims = distribute<threadsPerBlock>(numElements);
*
* // Equivalent to:
* constexpr int threadsPerBlock = 256;
* int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
* auto dims = make_hierarchy(grid_dims(blocksPerGrid), block_dims<threadsPerBlock>());
* @endcode
*/
template <int _ThreadsPerBlock>
constexpr auto distribute(int numElements) noexcept
{
int blocksPerGrid = (numElements + _ThreadsPerBlock - 1) / _ThreadsPerBlock;
return ::cuda::experimental::make_hierarchy(
::cuda::experimental::grid_dims(blocksPerGrid), ::cuda::experimental::block_dims<_ThreadsPerBlock>());
if constexpr (detail::can_rhs_stack_on_lhs<top_level, __level_type_of<AddedLevel>>)
{
return hierarchy_dimensions_fragment<Unit, AddedLevel, Levels...>(
::cuda::std::tuple_cat(::cuda::std::make_tuple(new_level), hierarchy.levels));
}
else
{
static_assert(detail::can_rhs_stack_on_lhs<__level_type_of<AddedLevel>, bottom_level>,
"Not supported order of levels in hierarchy");
using NewUnit = detail::__default_unit_below<__level_type_of<AddedLevel>>;
return hierarchy_dimensions_fragment<NewUnit, Levels..., AddedLevel>(
::cuda::std::tuple_cat(hierarchy.levels, ::cuda::std::make_tuple(new_level)));
}
}

} // namespace cuda::experimental
Expand Down
47 changes: 47 additions & 0 deletions cudax/include/cuda/experimental/__launch/configuration.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,28 @@ struct kernel_config
}
};

// operator& could be dropped eventually, but it's convenient for in-line construction
template <typename Dimensions, typename... Options, typename NewLevel>
_CUDAX_HOST_API constexpr auto
operator&(const kernel_config<Dimensions, Options...>& config, const NewLevel& new_level) noexcept
{
  // Grow the thread hierarchy by one level; the configured options are carried over untouched.
  auto extended_dims = hierarchy_add_level(config.dims, new_level);
  return kernel_config(extended_dims, config.options);
}

// Symmetric overload so a level can stand on either side of `&`.
template <typename NewLevel, typename Dimensions, typename... Options>
_CUDAX_HOST_API constexpr auto
operator&(const NewLevel& new_level, const kernel_config<Dimensions, Options...>& config) noexcept
{
  return kernel_config(hierarchy_add_level(config.dims, new_level), config.options);
}

// Combining two bare levels yields a kernel configuration with no options set.
template <typename L1, typename Dims1, typename L2, typename Dims2>
_CUDAX_HOST_API constexpr auto
operator&(const level_dimensions<L1, Dims1>& l1, const level_dimensions<L2, Dims2>& l2) noexcept
{
  auto dims = make_hierarchy_fragment(l1, l2);
  return kernel_config(dims);
}

template <typename Dimensions,
typename... Options,
typename Option,
Expand Down Expand Up @@ -423,6 +445,31 @@ make_config(const hierarchy_dimensions_fragment<BottomUnit, Levels...>& dims, co
return kernel_config<hierarchy_dimensions_fragment<BottomUnit, Levels...>, Opts...>(dims, opts...);
}

/**
 * @brief A shorthand for creating a kernel configuration with a hierarchy of CUDA threads evenly
 * distributing elements among blocks and threads.
 *
 * @par Snippet
 * @code
 * #include <cudax/hierarchy_dimensions.cuh>
 * using namespace cuda::experimental;
 *
 * constexpr int threadsPerBlock = 256;
 * auto dims = distribute<threadsPerBlock>(numElements);
 *
 * // Equivalent to:
 * constexpr int threadsPerBlock = 256;
 * int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
 * auto dims = make_hierarchy(grid_dims(blocksPerGrid), block_dims<threadsPerBlock>());
 * @endcode
 *
 * @param numElements Number of elements to distribute; expected to be non-negative.
 */
template <int _ThreadsPerBlock>
constexpr auto distribute(int numElements) noexcept
{
  static_assert(_ThreadsPerBlock > 0, "Threads per block must be positive");
  // Ceiling division written without the `+ _ThreadsPerBlock - 1` bias, so that values of
  // numElements near INT_MAX cannot overflow the intermediate sum (signed overflow is UB).
  const int blocksPerGrid = numElements / _ThreadsPerBlock + static_cast<int>(numElements % _ThreadsPerBlock != 0);
  return make_config(make_hierarchy(grid_dims(blocksPerGrid), block_dims<_ThreadsPerBlock>()));
}

template <typename... Args>
_CCCL_NODISCARD constexpr auto make_config(const Args&... args)
{
Expand Down
Loading
Loading