Skip to content

Commit

Permalink
Allow NVRTC to compile more of CUB (#3951)
Browse files Browse the repository at this point in the history
* Allow NVRTC to compile more of CUB
  • Loading branch information
bernhardmgruber authored Feb 27, 2025
1 parent e3d7a81 commit ac268c3
Show file tree
Hide file tree
Showing 24 changed files with 103 additions and 135 deletions.
3 changes: 1 addition & 2 deletions cub/cub/agent/agent_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,9 @@
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include <cuda/std/cstdint>
#include <cuda/std/type_traits>

#include <cstdint>

CUB_NAMESPACE_BEGIN

namespace detail
Expand Down
2 changes: 0 additions & 2 deletions cub/cub/agent/agent_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,6 @@

#include <cuda/std/type_traits>

#include <iterator>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down
2 changes: 0 additions & 2 deletions cub/cub/agent/agent_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@

#include <cuda/std/type_traits>

#include <iterator>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down
2 changes: 0 additions & 2 deletions cub/cub/agent/agent_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@
#include <cuda/ptx>
#include <cuda/std/type_traits>

#include <iterator>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down
2 changes: 0 additions & 2 deletions cub/cub/agent/agent_scan_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@

#include <cuda/std/type_traits>

#include <iterator>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down
2 changes: 0 additions & 2 deletions cub/cub/agent/agent_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@

#include <cuda/std/type_traits>

#include <iterator>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down
3 changes: 0 additions & 3 deletions cub/cub/agent/agent_three_way_partition.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,6 @@

#include <cuda/std/type_traits>

#include <iterator>
#include <type_traits>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down
5 changes: 2 additions & 3 deletions cub/cub/block/block_run_length_decode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,8 @@

#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h>

#include <limits>
#include <type_traits>
#include <cuda/std/limits>
#include <cuda/std/type_traits>

CUB_NAMESPACE_BEGIN

Expand Down
2 changes: 0 additions & 2 deletions cub/cub/detail/choose_offset.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@
#include <cuda/std/iterator>
#include <cuda/std/type_traits>

#include <cstdint>

CUB_NAMESPACE_BEGIN

namespace detail
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/detail/fast_modulo_division.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
#include <cub/util_type.cuh> // _CCCL_HAS_INT128()

#include <cuda/cmath> // cuda::std::ceil_div
#include <cuda/std/bit> // std::has_single_bit
#include <cuda/std/bit> // cuda::std::has_single_bit
#include <cuda/std/climits> // CHAR_BIT
#include <cuda/std/cstdint> // uint64_t
#include <cuda/std/limits> // numeric_limits
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/detail/mdspan_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@

# include <cub/detail/fast_modulo_division.cuh> // fast_div_mod

# include <cuda/std/array> // std::array
# include <cuda/std/array> // cuda::std::array
# include <cuda/std/cstddef> // size_t
# include <cuda/std/mdspan>
# include <cuda/std/type_traits> // make_unsigned_t
Expand Down
3 changes: 2 additions & 1 deletion cub/cub/detail/nvtx.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@
// * C++14 is available for cuda::std::optional
// * NVTX3 uses module as an identifier, which trips up NVHPC
#if _CCCL_HAS_INCLUDE(<nvtx3/nvToolsExt.h>) && !defined(CCCL_DISABLE_NVTX) && !defined(NVTX_DISABLE) \
&& (!_CCCL_COMPILER(NVHPC) || _CCCL_STD_VER == 2017)
&& (!_CCCL_COMPILER(NVHPC) || _CCCL_STD_VER == 2017) \
&& !_CCCL_COMPILER(NVRTC)
// Include our NVTX3 C++ wrapper if not available from the CTK
// TODO(bgruber): replace by a check for the first CTK version shipping the header
# if _CCCL_HAS_INCLUDE(<nvtx3/nvtx3.hpp>)
Expand Down
1 change: 0 additions & 1 deletion cub/cub/device/device_for.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@

#include <thrust/detail/raw_reference_cast.h>
#include <thrust/distance.h>
#include <thrust/system/cuda/detail/core/util.h>
#include <thrust/type_traits/is_contiguous_iterator.h>
#include <thrust/type_traits/unwrap_contiguous_iterator.h>

Expand Down
2 changes: 0 additions & 2 deletions cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,6 @@

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <iterator>

CUB_NAMESPACE_BEGIN

namespace detail::adjacent_difference
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@

# include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

# include <cuda/std/__type_traits/integral_constant.h> // std::integral_constant
# include <cuda/std/__utility/integer_sequence.h> // std::index_sequence
# include <cuda/std/array> // std::array
# include <cuda/std/__type_traits/integral_constant.h> // cuda::std::integral_constant
# include <cuda/std/__utility/integer_sequence.h> // cuda::std::index_sequence
# include <cuda/std/array> // cuda::std::array
# include <cuda/std/cstddef> // size_t

# define _CUB_RETURN_IF_ERROR(STATUS) \
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,9 @@ struct DispatchMergeSort
* Merge sort supports large types, which can lead to excessive shared memory size requirements. In these cases,
* merge sort allocates virtual shared memory that resides in global memory.
*/
const std::size_t block_sort_smem_size = num_tiles * vsmem_helper.BlockSortVSMemPerBlock();
const std::size_t merge_smem_size = num_tiles * vsmem_helper.MergeVSMemPerBlock();
const std::size_t virtual_shared_memory_size = (::cuda::std::max)(block_sort_smem_size, merge_smem_size);
const ::cuda::std::size_t block_sort_smem_size = num_tiles * vsmem_helper.BlockSortVSMemPerBlock();
const ::cuda::std::size_t merge_smem_size = num_tiles * vsmem_helper.MergeVSMemPerBlock();
const ::cuda::std::size_t virtual_shared_memory_size = (::cuda::std::max)(block_sort_smem_size, merge_smem_size);

void* allocations[4] = {nullptr, nullptr, nullptr, nullptr};
size_t allocation_sizes[4] = {
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -631,7 +631,7 @@ struct DeviceSegmentedReduceKernelSource
InitT,
AccumT>)

CUB_RUNTIME_FUNCTION static constexpr std::size_t AccumSize()
CUB_RUNTIME_FUNCTION static constexpr ::cuda::std::size_t AccumSize()
{
return sizeof(AccumT);
}
Expand Down
8 changes: 6 additions & 2 deletions cub/cub/iterator/cache_modified_output_iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@

#include <thrust/iterator/iterator_facade.h>

#include <iosfwd>
#if !_CCCL_COMPILER(NVRTC)
# include <iosfwd>
#endif // !_CCCL_COMPILER(NVRTC)

CUB_NAMESPACE_BEGIN

Expand Down Expand Up @@ -155,7 +157,7 @@ public:
*/
template <typename QualifiedValueType>
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE CacheModifiedOutputIterator(QualifiedValueType* ptr)
: ptr(const_cast<typename std::remove_cv<QualifiedValueType>::type*>(ptr))
: ptr(const_cast<::cuda::std::remove_cv_t<QualifiedValueType>*>(ptr))
{}

/// Postfix increment
Expand Down Expand Up @@ -236,11 +238,13 @@ public:
return (ptr != rhs.ptr);
}

#if !_CCCL_COMPILER(NVRTC)
/// ostream operator
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
return os;
}
#endif // !_CCCL_COMPILER(NVRTC)
};

CUB_NAMESPACE_END
10 changes: 8 additions & 2 deletions cub/cub/iterator/tex_obj_input_iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,12 @@

#include <thrust/iterator/iterator_facade.h>

#include <ostream>

#include <nv/target>

#if !_CCCL_COMPILER(NVRTC)
# include <ostream>
#endif // !_CCCL_COMPILER(NVRTC)

CUB_NAMESPACE_BEGIN

/**
Expand Down Expand Up @@ -153,6 +155,7 @@ public:
, tex_obj(0)
{}

#if !_CCCL_COMPILER(NVRTC)
/**
* @brief Use this iterator to bind @p ptr with a texture reference
*
Expand Down Expand Up @@ -189,6 +192,7 @@ public:
{
return CubDebug(cudaDestroyTextureObject(tex_obj));
}
#endif // !_CCCL_COMPILER(NVRTC)

/// Postfix increment
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++(int)
Expand Down Expand Up @@ -281,13 +285,15 @@ public:
return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj));
}

#if !_CCCL_COMPILER(NVRTC)
/// ostream operator
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
os << "cub::TexObjInputIterator( ptr=" << itr.ptr << ", offset=" << itr.tex_offset << ", tex_obj=" << itr.tex_obj
<< " )";
return os;
}
#endif // !_CCCL_COMPILER(NVRTC)

private:
// This is hoisted out of operator* because #pragma can't be used inside of
Expand Down
2 changes: 0 additions & 2 deletions cub/cub/thread/thread_search.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@
#include <cub/util_namespace.cuh>
#include <cub/util_type.cuh>

#include <iterator>

#include <nv/target>

CUB_NAMESPACE_BEGIN
Expand Down
59 changes: 53 additions & 6 deletions cub/test/catch2_test_nvrtc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,27 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]")
nvrtcProgram prog{};

const char* src = R"asdf(
#include <cub/warp/warp_reduce.cuh>
#include <cub/warp/warp_scan.cuh>
#include <cub/warp/warp_exchange.cuh>
#include <cub/warp/warp_load.cuh>
#include <cub/warp/warp_store.cuh>
#include <cub/warp/warp_merge_sort.cuh>
#include <cub/agent/agent_adjacent_difference.cuh>
#include <cub/agent/agent_batch_memcpy.cuh>
#include <cub/agent/agent_for.cuh>
#include <cub/agent/agent_histogram.cuh>
#include <cub/agent/agent_merge.cuh>
#include <cub/agent/agent_merge_sort.cuh>
#include <cub/agent/agent_radix_sort_downsweep.cuh>
#include <cub/agent/agent_radix_sort_histogram.cuh>
#include <cub/agent/agent_radix_sort_onesweep.cuh>
#include <cub/agent/agent_radix_sort_upsweep.cuh>
#include <cub/agent/agent_reduce_by_key.cuh>
#include <cub/agent/agent_reduce.cuh>
#include <cub/agent/agent_rle.cuh>
#include <cub/agent/agent_scan_by_key.cuh>
#include <cub/agent/agent_scan.cuh>
#include <cub/agent/agent_segmented_radix_sort.cuh>
#include <cub/agent/agent_select_if.cuh>
#include <cub/agent/agent_sub_warp_merge_sort.cuh>
#include <cub/agent/agent_three_way_partition.cuh>
#include <cub/agent/agent_unique_by_key.cuh>
#include <cub/agent/single_pass_scan_operators.cuh>
#include <cub/block/block_adjacent_difference.cuh>
#include <cub/block/block_discontinuity.cuh>
#include <cub/block/block_exchange.cuh>
Expand All @@ -53,8 +68,13 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]")
#include <cub/block/block_merge_sort.cuh>
#include <cub/block/block_radix_rank.cuh>
#include <cub/block/block_radix_sort.cuh>
#include <cub/block/block_raking_layout.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_run_length_decode.cuh>
#include <cub/block/block_scan.cuh>
#include <cub/block/block_shuffle.cuh>
#include <cub/block/block_store.cuh>
#include <cub/block/radix_rank_sort_operations.cuh>
#include <cub/device/dispatch/kernels/reduce.cuh>
#include <cub/device/dispatch/kernels/for_each.cuh>
#include <cub/device/dispatch/kernels/scan.cuh>
Expand All @@ -63,7 +83,34 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]")
#include <cub/device/dispatch/kernels/radix_sort.cuh>
#include <cub/device/dispatch/kernels/unique_by_key.cuh>
#include <cub/device/dispatch/kernels/transform.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/iterator/cache_modified_output_iterator.cuh>
#include <cub/iterator/tex_obj_input_iterator.cuh>
#include <cub/thread/thread_load.cuh>
#include <cub/thread/thread_operators.cuh>
#include <cub/thread/thread_reduce.cuh>
#include <cub/thread/thread_scan.cuh>
#include <cub/thread/thread_sort.cuh>
#include <cub/thread/thread_store.cuh>
#include <cub/warp/warp_reduce.cuh>
#include <cub/warp/warp_scan.cuh>
#include <cub/warp/warp_exchange.cuh>
#include <cub/warp/warp_load.cuh>
#include <cub/warp/warp_store.cuh>
#include <cub/warp/warp_merge_sort.cuh>
#include <cub/util_arch.cuh>
#include <cub/util_cpp_dialect.cuh>
#include <cub/util_debug.cuh>
#include <cub/util_device.cuh>
#include <cub/util_macro.cuh>
#include <cub/util_math.cuh>
#include <cub/util_namespace.cuh>
#include <cub/util_policy_wrapper_t.cuh>
#include <cub/util_ptx.cuh>
#include <cub/util_temporary_storage.cuh>
#include <cub/util_type.cuh>
#include <cub/util_vsmem.cuh>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
Expand Down
Loading

0 comments on commit ac268c3

Please sign in to comment.