Merge branch 'main' into linalg-accessors
fbusato authored Dec 2, 2024
2 parents a39d27a + b3fe77f commit 9699ce5
Showing 48 changed files with 764 additions and 232 deletions.
3 changes: 1 addition & 2 deletions .clang-format
@@ -88,8 +88,7 @@ EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: Always
FixNamespaceComments: true
IfMacros: [
'_CCCL_IF_CONSTEXPR',
'_CCCL_ELSE_IF_CONSTEXPR',
'_CCCL_IF_CONSTEXPR'
]
IndentWrappedFunctionNames: false
IncludeBlocks: Regroup
1 change: 1 addition & 0 deletions CMakePresets.json
@@ -23,6 +23,7 @@
"CCCL_ENABLE_EXAMPLES": false,
"CCCL_ENABLE_C": false,
"CCCL_SUPPRESS_ICC_DEPRECATION_WARNING": true,
"CCCL_SUPPRESS_MSVC2017_DEPRECATION_WARNING": true,
"libcudacxx_ENABLE_INSTALL_RULES": true,
"CUB_ENABLE_INSTALL_RULES": true,
"Thrust_ENABLE_INSTALL_RULES": true,
10 changes: 8 additions & 2 deletions README.md
@@ -256,11 +256,17 @@ Unless otherwise specified, CCCL supports all the same operating systems as the

### Host Compilers

Unless otherwise specified, CCCL supports all the same host compilers as the CUDA Toolkit, which are documented here:
Unless otherwise specified, CCCL supports the same host compilers as the latest CUDA Toolkit, which are documented here:
- [Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#host-compiler-support-policy)
- [Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html#system-requirements)

In the spirit of "You only support what you test", see our [CI Overview](https://github.com/NVIDIA/cccl/blob/main/ci-overview.md) for more information on exactly what we test.
When using older CUDA Toolkits, we likewise support only the host compilers of the latest CUDA Toolkit,
but at a minimum we support the most recent host compiler of each older CUDA Toolkit that is still supported.

We may retain support for additional compilers and will accept reasonable patches for them from the community,
but we will not invest significant time in triaging or fixing issues for older compilers.

In the spirit of "You only support what you test", see our [CI Overview](https://github.com/NVIDIA/cccl/blob/main/ci-overview.md) for more information on exactly what we test.

### C++ Dialects
- C++11 (Deprecated in Thrust/CUB, to be removed in next major version)
5 changes: 5 additions & 0 deletions cmake/CCCLBuildCompilerTargets.cmake
@@ -24,6 +24,7 @@ option(CCCL_ENABLE_EXCEPTIONS "Enable exceptions within CCCL libraries." ON)
option(CCCL_ENABLE_RTTI "Enable RTTI within CCCL libraries." ON)
option(CCCL_ENABLE_WERROR "Treat warnings as errors for CCCL targets." ON)
option(CCCL_SUPPRESS_ICC_DEPRECATION_WARNING "Suppress Intel Compiler deprecation warnings" OFF)
option(CCCL_SUPPRESS_MSVC2017_DEPRECATION_WARNING "Suppress Visual Studio 2017 deprecation warnings" OFF)

function(cccl_build_compiler_interface interface_target cuda_compile_options cxx_compile_options compile_defs)
add_library(${interface_target} INTERFACE)
@@ -72,6 +73,10 @@ function(cccl_build_compiler_targets)
list(APPEND cxx_compile_definitions "CCCL_SUPPRESS_ICC_DEPRECATION_WARNING")
endif()

if (CCCL_SUPPRESS_MSVC2017_DEPRECATION_WARNING)
list(APPEND cxx_compile_definitions "CCCL_SUPPRESS_MSVC2017_DEPRECATION_WARNING")
endif()

if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
list(APPEND cuda_compile_options "--use-local-env")
list(APPEND cxx_compile_options "/bigobj")
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/babelstream1.cu
@@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void mul(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/babelstream2.cu
@@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void add(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/babelstream3.cu
@@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void nstream(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
15 changes: 15 additions & 0 deletions cub/benchmarks/bench/transform/common.h
@@ -1,7 +1,22 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

// keep checks at the top so compilation of discarded variants fails really fast
#include <cub/device/dispatch/dispatch_transform.cuh>
#if !TUNE_BASE && TUNE_ALGORITHM == 1
# if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
# error "When tuning, this benchmark does not support being compiled for multiple architectures"
# endif
# if (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif
# ifndef _CUB_HAS_TRANSFORM_UBLKCP
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include <cub/util_namespace.cuh>

#include <cuda/std/type_traits>
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/complex_cmp.cu
@@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

// This benchmark tests overlapping memory regions for reading and is compute intensive

template <typename OffsetT>
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/fib.cu
@@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

// This benchmark is compute intensive with diverging threads

template <class IndexT, class OutputT>
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/heavy.cu
@@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

// This benchmark uses a LOT of registers and is compute intensive.

template <int N>
13 changes: 11 additions & 2 deletions cub/cub/detail/fast_modulo_division.cuh
@@ -109,8 +109,17 @@ multiply_extract_higher_bits(T value, R multiplier)
{
static_assert(supported_integral<T>::value, "unsupported type");
static_assert(supported_integral<R>::value, "unsupported type");
_CCCL_ASSERT(value >= 0, "value must be non-negative");
_CCCL_ASSERT(multiplier >= 0, "multiplier must be non-negative");
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_ICC(186) // pointless comparison of unsigned integer with zero
_CCCL_IF_CONSTEXPR (_CCCL_TRAIT(::cuda::std::is_signed, T))
{
_CCCL_ASSERT(value >= 0, "value must be non-negative");
}
_CCCL_IF_CONSTEXPR (_CCCL_TRAIT(::cuda::std::is_signed, R))
{
_CCCL_ASSERT(multiplier >= 0, "multiplier must be non-negative");
}
_CCCL_DIAG_POP
static constexpr int NumBits = sizeof(DivisorType) * CHAR_BIT;
using unsigned_t = unsigned_implicit_prom_t<DivisorType>;
using larger_t = larger_unsigned_type_t<DivisorType>;
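
For reference, the guard pattern introduced above (asserting non-negativity only when the type is actually signed) can be written in plain standard C++. The following is an editor's sketch that uses std::is_signed_v and assert in place of the CCCL macros (_CCCL_IF_CONSTEXPR, _CCCL_TRAIT, _CCCL_ASSERT); it is not part of the commit:

#include <cassert>
#include <type_traits>

// Emit the non-negativity check only for signed types: for an unsigned T the
// comparison `value >= 0` is always true, and compilers such as ICC
// (warning 186) diagnose it as a pointless comparison.
template <typename T>
void check_non_negative(T value)
{
  if constexpr (std::is_signed_v<T>)
  {
    assert(value >= 0 && "value must be non-negative");
  }
  (void) value; // avoids an unused-parameter warning for unsigned T or NDEBUG builds
}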
3 changes: 3 additions & 0 deletions cub/test/CMakeLists.txt
@@ -308,6 +308,9 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id)
add_test(NAME ${test_target} COMMAND "$<TARGET_FILE:${test_target}>")
endif()
endif() # Not catch2 test

# Ensure that we test with assertions enabled
target_compile_definitions(${test_target} PRIVATE CCCL_ENABLE_ASSERTIONS)
endfunction()

# Sets out_var to launch id if the label contains launch variants
1 change: 1 addition & 0 deletions cudax/cmake/cudaxBuildCompilerTargets.cmake
@@ -47,6 +47,7 @@ function(cudax_build_compiler_targets)
if("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
# stf heavily uses host device lambdas which break on clang due to a warning about the implicitly
# deleted copy constructor
# TODO(bgruber): remove this when NVBug 4980157 is resolved
append_option_if_available("-Wno-deprecated-copy" cxx_compile_options)
endif()

5 changes: 5 additions & 0 deletions cudax/examples/stf/linear_algebra/07-cholesky.cu
@@ -385,6 +385,8 @@ void PDNRM2_HOST(matrix<double>* A, double* result)

void PDPOTRF(matrix<double>& A)
{
auto guard = ctx.dot_section("PDPOTRF");

#ifdef HAVE_DOT
reserved::dot::set_current_color("yellow");
#endif
@@ -505,6 +507,7 @@ void PDTRSM(cublasSideMode_t side,

void PDPOTRS(matrix<double>& A, class matrix<double>& B, cublasFillMode_t uplo)
{
auto guard = ctx.dot_section("PDPOTRS");
#ifdef HAVE_DOT
reserved::dot::set_current_color("green");
#endif
@@ -656,12 +659,14 @@ int main(int argc, char** argv)
return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row);
};

ctx.dot_push_section("fillA");
if (check_result)
{
Aref.fill(hilbert);
}

A.fill(hilbert);
ctx.dot_pop_section();

/* Right-hand side */
matrix<double> B_potrs(N, 1, NB, 1, false, "B");
(file header not captured in this page; the hunks below modify the managed_memory_resource header)
@@ -159,7 +159,7 @@ public:
}
#endif // _CCCL_STD_VER <= 2017

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
# if _CCCL_STD_VER >= 2020
//! @brief Equality comparison between a \c managed_memory_resource and another resource
//! @param __rhs The resource to compare to
@@ -237,7 +237,7 @@ public:
friend constexpr void get_property(managed_memory_resource const&, mr::device_accessible) noexcept {}
//! @brief Enables the \c host_accessible property
friend constexpr void get_property(managed_memory_resource const&, mr::host_accessible) noexcept {}
#endif // DOXYGEN_SHOULD_SKIP_THIS
#endif // _CCCL_DOXYGEN_INVOKED

//! @brief Checks whether the passed in alignment is valid
static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept
(file header not captured in this page; the hunks below modify the pinned_memory_resource header)
@@ -160,7 +160,7 @@ public:
}
#endif // _CCCL_STD_VER <= 2017

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
# if _CCCL_STD_VER >= 2020
//! @brief Equality comparison between a \c pinned_memory_resource and another resource
//! @param __rhs The resource to compare to
@@ -239,7 +239,7 @@ public:
friend constexpr void get_property(pinned_memory_resource const&, device_accessible) noexcept {}
//! @brief Enables the \c host_accessible property
friend constexpr void get_property(pinned_memory_resource const&, host_accessible) noexcept {}
#endif // DOXYGEN_SHOULD_SKIP_THIS
#endif // _CCCL_DOXYGEN_INVOKED

//! @brief Checks whether the passed in alignment is valid
static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept
15 changes: 15 additions & 0 deletions cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
@@ -879,6 +879,21 @@ public:
reserved::per_ctx_dot::set_parent_ctx(parent_ctx.get_dot(), get_dot());
}

void dot_push_section(::std::string symbol) const
{
reserved::dot::section::push(mv(symbol));
}

void dot_pop_section() const
{
reserved::dot::section::pop();
}

auto dot_section(::std::string symbol) const
{
return reserved::dot::section::guard(mv(symbol));
}

auto get_phase() const
{
return pimpl->get_phase();
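
To make the intent of the new helpers concrete, here is a brief editor's usage sketch. The surrounding context type and the section names are hypothetical placeholders; only dot_section, dot_push_section, and dot_pop_section come from the diff above:

// Groups work into named sections of the generated DOT graph.
template <typename Ctx>
void annotated_phases(Ctx& ctx)
{
  // RAII form: the section stays open until `guard` goes out of scope.
  auto guard = ctx.dot_section("solve");

  // Explicit form: push/pop must be balanced by the caller.
  ctx.dot_push_section("factorize");
  // ... submit the tasks that should be grouped under "factorize" ...
  ctx.dot_pop_section();
}

The 07-cholesky.cu changes above use both forms: dot_section guards in PDPOTRF and PDPOTRS, and an explicit push/pop pair around the fillA step.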
(remaining changed files not shown in this capture)
