Merge branch 'main' into linalg-accessors
fbusato authored Dec 2, 2024
2 parents a39d27a + b3fe77f commit 9699ce5
Showing 48 changed files with 764 additions and 232 deletions.
3 changes: 1 addition & 2 deletions .clang-format
@@ -88,8 +88,7 @@ EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: Always
FixNamespaceComments: true
IfMacros: [
'_CCCL_IF_CONSTEXPR',
'_CCCL_ELSE_IF_CONSTEXPR',
'_CCCL_IF_CONSTEXPR'
]
IndentWrappedFunctionNames: false
IncludeBlocks: Regroup
1 change: 1 addition & 0 deletions CMakePresets.json
@@ -23,6 +23,7 @@
"CCCL_ENABLE_EXAMPLES": false,
"CCCL_ENABLE_C": false,
"CCCL_SUPPRESS_ICC_DEPRECATION_WARNING": true,
"CCCL_SUPPRESS_MSVC2017_DEPRECATION_WARNING": true,
"libcudacxx_ENABLE_INSTALL_RULES": true,
"CUB_ENABLE_INSTALL_RULES": true,
"Thrust_ENABLE_INSTALL_RULES": true,
10 changes: 8 additions & 2 deletions README.md
@@ -256,11 +256,17 @@ Unless otherwise specified, CCCL supports all the same operating systems as the

### Host Compilers

Unless otherwise specified, CCCL supports all the same host compilers as the CUDA Toolkit, which are documented here:
Unless otherwise specified, CCCL supports the same host compilers as the latest CUDA Toolkit, which are documented here:
- [Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#host-compiler-support-policy)
- [Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html#system-requirements)

In the spirit of "You only support what you test", see our [CI Overview](https://github.com/NVIDIA/cccl/blob/main/ci-overview.md) for more information on exactly what we test.
When using older CUDA Toolkits, we likewise support only the host compilers of the latest CUDA Toolkit,
but at a minimum we support the most recent host compiler of each older CUDA Toolkit that is still supported.

We may retain support for additional compilers and will accept reasonable patches for them from the community,
but we will not invest significant time in triaging or fixing issues for older compilers.

In the spirit of "You only support what you test", see our [CI Overview](https://github.com/NVIDIA/cccl/blob/main/ci-overview.md) for more information on exactly what we test.

### C++ Dialects
- C++11 (Deprecated in Thrust/CUB, to be removed in next major version)
5 changes: 5 additions & 0 deletions cmake/CCCLBuildCompilerTargets.cmake
@@ -24,6 +24,7 @@ option(CCCL_ENABLE_EXCEPTIONS "Enable exceptions within CCCL libraries." ON)
option(CCCL_ENABLE_RTTI "Enable RTTI within CCCL libraries." ON)
option(CCCL_ENABLE_WERROR "Treat warnings as errors for CCCL targets." ON)
option(CCCL_SUPPRESS_ICC_DEPRECATION_WARNING "Suppress Intel Compiler deprecation warnings" OFF)
option(CCCL_SUPPRESS_MSVC2017_DEPRECATION_WARNING "Suppress Visual Studio 2017 deprecation warnings" OFF)

function(cccl_build_compiler_interface interface_target cuda_compile_options cxx_compile_options compile_defs)
add_library(${interface_target} INTERFACE)
@@ -72,6 +73,10 @@ function(cccl_build_compiler_targets)
list(APPEND cxx_compile_definitions "CCCL_SUPPRESS_ICC_DEPRECATION_WARNING")
endif()

if (CCCL_SUPPRESS_MSVC2017_DEPRECATION_WARNING)
list(APPEND cxx_compile_definitions "CCCL_SUPPRESS_MSVC2017_DEPRECATION_WARNING")
endif()

if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
list(APPEND cuda_compile_options "--use-local-env")
list(APPEND cxx_compile_options "/bigobj")
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/babelstream1.cu
@@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void mul(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/babelstream2.cu
@@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void add(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/babelstream3.cu
@@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void nstream(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
15 changes: 15 additions & 0 deletions cub/benchmarks/bench/transform/common.h
@@ -1,7 +1,22 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

// keep checks at the top so compilation of discarded variants fails really fast
#include <cub/device/dispatch/dispatch_transform.cuh>
#if !TUNE_BASE && TUNE_ALGORITHM == 1
# if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
# error "When tuning, this benchmark does not support being compiled for multiple architectures"
# endif
# if (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif
# ifndef _CUB_HAS_TRANSFORM_UBLKCP
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include <cub/util_namespace.cuh>

#include <cuda/std/type_traits>
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/complex_cmp.cu
@@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

// This benchmark tests overlapping memory regions for reading and is compute intensive

template <typename OffsetT>
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/fib.cu
@@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

// This benchmark is compute intensive with diverging threads

template <class IndexT, class OutputT>
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/heavy.cu
@@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

// This benchmark uses a LOT of registers and is compute intensive.

template <int N>
13 changes: 11 additions & 2 deletions cub/cub/detail/fast_modulo_division.cuh
@@ -109,8 +109,17 @@ multiply_extract_higher_bits(T value, R multiplier)
{
static_assert(supported_integral<T>::value, "unsupported type");
static_assert(supported_integral<R>::value, "unsupported type");
_CCCL_ASSERT(value >= 0, "value must be non-negative");
_CCCL_ASSERT(multiplier >= 0, "multiplier must be non-negative");
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_ICC(186) // pointless comparison of unsigned integer with zero
_CCCL_IF_CONSTEXPR (_CCCL_TRAIT(::cuda::std::is_signed, T))
{
_CCCL_ASSERT(value >= 0, "value must be non-negative");
}
_CCCL_IF_CONSTEXPR (_CCCL_TRAIT(::cuda::std::is_signed, R))
{
_CCCL_ASSERT(multiplier >= 0, "multiplier must be non-negative");
}
_CCCL_DIAG_POP
static constexpr int NumBits = sizeof(DivisorType) * CHAR_BIT;
using unsigned_t = unsigned_implicit_prom_t<DivisorType>;
using larger_t = larger_unsigned_type_t<DivisorType>;
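
For reference, the guard pattern introduced above (asserting non-negativity only when the type is actually signed) can be written in plain standard C++. The following is an editor's sketch that uses std::is_signed_v and assert in place of the CCCL macros (_CCCL_IF_CONSTEXPR, _CCCL_TRAIT, _CCCL_ASSERT); it is not part of the commit:

#include <cassert>
#include <type_traits>

// Emit the non-negativity check only for signed types: for an unsigned T the
// comparison `value >= 0` is always true, and compilers such as ICC
// (warning 186) diagnose it as a pointless comparison.
template <typename T>
void check_non_negative(T value)
{
  if constexpr (std::is_signed_v<T>)
  {
    assert(value >= 0 && "value must be non-negative");
  }
  (void) value; // avoids an unused-parameter warning for unsigned T or NDEBUG builds
}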
3 changes: 3 additions & 0 deletions cub/test/CMakeLists.txt
@@ -308,6 +308,9 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id)
add_test(NAME ${test_target} COMMAND "$<TARGET_FILE:${test_target}>")
endif()
endif() # Not catch2 test

# Ensure that we test with assertions enabled
target_compile_definitions(${test_target} PRIVATE CCCL_ENABLE_ASSERTIONS)
endfunction()

# Sets out_var to launch id if the label contains launch variants
1 change: 1 addition & 0 deletions cudax/cmake/cudaxBuildCompilerTargets.cmake
@@ -47,6 +47,7 @@ function(cudax_build_compiler_targets)
if("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
# stf heavily uses host device lambdas which break on clang due to a warning about the implicitly
# deleted copy constructor
# TODO(bgruber): remove this when NVBug 4980157 is resolved
append_option_if_available("-Wno-deprecated-copy" cxx_compile_options)
endif()

5 changes: 5 additions & 0 deletions cudax/examples/stf/linear_algebra/07-cholesky.cu
@@ -385,6 +385,8 @@ void PDNRM2_HOST(matrix<double>* A, double* result)

void PDPOTRF(matrix<double>& A)
{
auto guard = ctx.dot_section("PDPOTRF");

#ifdef HAVE_DOT
reserved::dot::set_current_color("yellow");
#endif
@@ -505,6 +507,7 @@ void PDTRSM(cublasSideMode_t side,

void PDPOTRS(matrix<double>& A, class matrix<double>& B, cublasFillMode_t uplo)
{
auto guard = ctx.dot_section("PDPOTRS");
#ifdef HAVE_DOT
reserved::dot::set_current_color("green");
#endif
@@ -656,12 +659,14 @@ int main(int argc, char** argv)
return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row);
};

ctx.dot_push_section("fillA");
if (check_result)
{
Aref.fill(hilbert);
}

A.fill(hilbert);
ctx.dot_pop_section();

/* Right-hand side */
matrix<double> B_potrs(N, 1, NB, 1, false, "B");
(file header not captured in this page; the hunks below modify the managed_memory_resource header)
@@ -159,7 +159,7 @@ public:
}
#endif // _CCCL_STD_VER <= 2017

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
# if _CCCL_STD_VER >= 2020
//! @brief Equality comparison between a \c managed_memory_resource and another resource
//! @param __rhs The resource to compare to
@@ -237,7 +237,7 @@ public:
friend constexpr void get_property(managed_memory_resource const&, mr::device_accessible) noexcept {}
//! @brief Enables the \c host_accessible property
friend constexpr void get_property(managed_memory_resource const&, mr::host_accessible) noexcept {}
#endif // DOXYGEN_SHOULD_SKIP_THIS
#endif // _CCCL_DOXYGEN_INVOKED

//! @brief Checks whether the passed in alignment is valid
static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept
(file header not captured in this page; the hunks below modify the pinned_memory_resource header)
@@ -160,7 +160,7 @@ public:
}
#endif // _CCCL_STD_VER <= 2017

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
# if _CCCL_STD_VER >= 2020
//! @brief Equality comparison between a \c pinned_memory_resource and another resource
//! @param __rhs The resource to compare to
@@ -239,7 +239,7 @@ public:
friend constexpr void get_property(pinned_memory_resource const&, device_accessible) noexcept {}
//! @brief Enables the \c host_accessible property
friend constexpr void get_property(pinned_memory_resource const&, host_accessible) noexcept {}
#endif // DOXYGEN_SHOULD_SKIP_THIS
#endif // _CCCL_DOXYGEN_INVOKED

//! @brief Checks whether the passed in alignment is valid
static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept
15 changes: 15 additions & 0 deletions cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
@@ -879,6 +879,21 @@ public:
reserved::per_ctx_dot::set_parent_ctx(parent_ctx.get_dot(), get_dot());
}

void dot_push_section(::std::string symbol) const
{
reserved::dot::section::push(mv(symbol));
}

void dot_pop_section() const
{
reserved::dot::section::pop();
}

auto dot_section(::std::string symbol) const
{
return reserved::dot::section::guard(mv(symbol));
}

auto get_phase() const
{
return pimpl->get_phase();
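
To make the intent of the new helpers concrete, here is a brief editor's usage sketch. The surrounding context type and the section names are hypothetical placeholders; only dot_section, dot_push_section, and dot_pop_section come from the diff above:

// Groups work into named sections of the generated DOT graph.
template <typename Ctx>
void annotated_phases(Ctx& ctx)
{
  // RAII form: the section stays open until `guard` goes out of scope.
  auto guard = ctx.dot_section("solve");

  // Explicit form: push/pop must be balanced by the caller.
  ctx.dot_push_section("factorize");
  // ... submit the tasks that should be grouped under "factorize" ...
  ctx.dot_pop_section();
}

The 07-cholesky.cu changes above use both forms: dot_section guards in PDPOTRF and PDPOTRS, and an explicit push/pop pair around the fillA step.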
(remaining changed files not shown in this capture)
