diff --git a/cub/cub/agent/agent_batch_memcpy.cuh b/cub/cub/agent/agent_batch_memcpy.cuh index f3df08f84d3..c000b56cd42 100644 --- a/cub/cub/agent/agent_batch_memcpy.cuh +++ b/cub/cub/agent/agent_batch_memcpy.cuh @@ -52,10 +52,9 @@ #include #include +#include #include -#include - CUB_NAMESPACE_BEGIN namespace detail diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index 2b5463432da..9bb7fb5d08a 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -51,8 +51,6 @@ #include -#include - CUB_NAMESPACE_BEGIN /****************************************************************************** diff --git a/cub/cub/agent/agent_reduce_by_key.cuh b/cub/cub/agent/agent_reduce_by_key.cuh index 6852877ec17..dbd7fd842ce 100644 --- a/cub/cub/agent/agent_reduce_by_key.cuh +++ b/cub/cub/agent/agent_reduce_by_key.cuh @@ -52,8 +52,6 @@ #include -#include - CUB_NAMESPACE_BEGIN /****************************************************************************** diff --git a/cub/cub/agent/agent_rle.cuh b/cub/cub/agent/agent_rle.cuh index fefda393aff..09b10cfe717 100644 --- a/cub/cub/agent/agent_rle.cuh +++ b/cub/cub/agent/agent_rle.cuh @@ -56,8 +56,6 @@ #include #include -#include - CUB_NAMESPACE_BEGIN /****************************************************************************** diff --git a/cub/cub/agent/agent_scan_by_key.cuh b/cub/cub/agent/agent_scan_by_key.cuh index 1ac367742b2..6042c845f0a 100644 --- a/cub/cub/agent/agent_scan_by_key.cuh +++ b/cub/cub/agent/agent_scan_by_key.cuh @@ -52,8 +52,6 @@ #include -#include - CUB_NAMESPACE_BEGIN /****************************************************************************** diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index a142e274d2c..4b7a7687c80 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -56,8 +56,6 @@ #include -#include - CUB_NAMESPACE_BEGIN /****************************************************************************** diff --git a/cub/cub/agent/agent_three_way_partition.cuh b/cub/cub/agent/agent_three_way_partition.cuh index 843167526d4..6ef3d1d6395 100644 --- a/cub/cub/agent/agent_three_way_partition.cuh +++ b/cub/cub/agent/agent_three_way_partition.cuh @@ -47,9 +47,6 @@ #include -#include -#include - CUB_NAMESPACE_BEGIN /****************************************************************************** diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index 467d9141dc3..e11113f54e4 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -46,9 +46,8 @@ #include #include - -#include -#include +#include +#include CUB_NAMESPACE_BEGIN diff --git a/cub/cub/detail/choose_offset.cuh b/cub/cub/detail/choose_offset.cuh index 3943baaf6d5..1bcf3aed587 100644 --- a/cub/cub/detail/choose_offset.cuh +++ b/cub/cub/detail/choose_offset.cuh @@ -41,8 +41,6 @@ #include #include -#include - CUB_NAMESPACE_BEGIN namespace detail diff --git a/cub/cub/detail/fast_modulo_division.cuh b/cub/cub/detail/fast_modulo_division.cuh index d84db4feef9..75075b29fe3 100644 --- a/cub/cub/detail/fast_modulo_division.cuh +++ b/cub/cub/detail/fast_modulo_division.cuh @@ -41,7 +41,7 @@ #include // _CCCL_HAS_INT128() #include // cuda::std::ceil_div -#include // std::has_single_bit +#include // cuda::std::has_single_bit #include // CHAR_BIT #include // uint64_t #include // numeric_limits diff --git a/cub/cub/detail/mdspan_utils.cuh b/cub/cub/detail/mdspan_utils.cuh index 81d97414b2c..eaa4186b871 100644 --- a/cub/cub/detail/mdspan_utils.cuh +++ b/cub/cub/detail/mdspan_utils.cuh @@ -41,7 +41,7 @@ # include // fast_div_mod -# include // std::array +# include // cuda::std::array # include // size_t # include # include // make_unsigned_t diff --git a/cub/cub/detail/nvtx.cuh b/cub/cub/detail/nvtx.cuh index cd67ef80696..576454ce2ba 100644 --- a/cub/cub/detail/nvtx.cuh +++ b/cub/cub/detail/nvtx.cuh @@ -48,7 +48,8 @@ // * C++14 is available for cuda::std::optional // * NVTX3 uses module as an identifier, which trips up NVHPC #if _CCCL_HAS_INCLUDE() && !defined(CCCL_DISABLE_NVTX) && !defined(NVTX_DISABLE) \ - && (!_CCCL_COMPILER(NVHPC) || _CCCL_STD_VER == 2017) + && (!_CCCL_COMPILER(NVHPC) || _CCCL_STD_VER == 2017) \ + && !_CCCL_COMPILER(NVRTC) // Include our NVTX3 C++ wrapper if not available from the CTK // TODO(bgruber): replace by a check for the first CTK version shipping the header # if _CCCL_HAS_INCLUDE() diff --git a/cub/cub/device/device_for.cuh b/cub/cub/device/device_for.cuh index a0e15dbdd16..f0ebc07609a 100644 --- a/cub/cub/device/device_for.cuh +++ b/cub/cub/device/device_for.cuh @@ -44,7 +44,6 @@ #include #include -#include #include #include diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index f74fc17ab94..31d85d7b140 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -48,8 +48,6 @@ #include -#include - CUB_NAMESPACE_BEGIN namespace detail::adjacent_difference diff --git a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh index b2c4ab775d4..4fbd04a9bfd 100644 --- a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh +++ b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh @@ -47,9 +47,9 @@ # include -# include // std::integral_constant -# include // std::index_sequence -# include // std::array +# include // cuda::std::integral_constant +# include // cuda::std::index_sequence +# include // cuda::std::array # include // size_t # define _CUB_RETURN_IF_ERROR(STATUS) \ diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 6cd0c75a3fa..a359e95dbcc 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -234,9 +234,9 @@ struct DispatchMergeSort * Merge sort supports large types, which can lead to excessive shared memory size requirements. In these cases, * merge sort allocates virtual shared memory that resides in global memory. */ - const std::size_t block_sort_smem_size = num_tiles * vsmem_helper.BlockSortVSMemPerBlock(); - const std::size_t merge_smem_size = num_tiles * vsmem_helper.MergeVSMemPerBlock(); - const std::size_t virtual_shared_memory_size = (::cuda::std::max)(block_sort_smem_size, merge_smem_size); + const ::cuda::std::size_t block_sort_smem_size = num_tiles * vsmem_helper.BlockSortVSMemPerBlock(); + const ::cuda::std::size_t merge_smem_size = num_tiles * vsmem_helper.MergeVSMemPerBlock(); + const ::cuda::std::size_t virtual_shared_memory_size = (::cuda::std::max)(block_sort_smem_size, merge_smem_size); void* allocations[4] = {nullptr, nullptr, nullptr, nullptr}; size_t allocation_sizes[4] = { diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 8ac66edb6b0..fc69e735b2c 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -631,7 +631,7 @@ struct DeviceSegmentedReduceKernelSource InitT, AccumT>) - CUB_RUNTIME_FUNCTION static constexpr std::size_t AccumSize() + CUB_RUNTIME_FUNCTION static constexpr ::cuda::std::size_t AccumSize() { return sizeof(AccumT); } diff --git a/cub/cub/iterator/cache_modified_output_iterator.cuh b/cub/cub/iterator/cache_modified_output_iterator.cuh index e447c27a9c2..fa4f3137b03 100644 --- a/cub/cub/iterator/cache_modified_output_iterator.cuh +++ b/cub/cub/iterator/cache_modified_output_iterator.cuh @@ -48,7 +48,9 @@ #include -#include +#if !_CCCL_COMPILER(NVRTC) +# include +#endif // !_CCCL_COMPILER(NVRTC) CUB_NAMESPACE_BEGIN @@ -155,7 +157,7 @@ public: */ template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE CacheModifiedOutputIterator(QualifiedValueType* ptr) - : ptr(const_cast::type*>(ptr)) + : ptr(const_cast<::cuda::std::remove_cv_t*>(ptr)) {} /// Postfix increment @@ -236,11 +238,13 @@ public: return (ptr != rhs.ptr); } +#if !_CCCL_COMPILER(NVRTC) /// ostream operator friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { return os; } +#endif // !_CCCL_COMPILER(NVRTC) }; CUB_NAMESPACE_END diff --git a/cub/cub/iterator/tex_obj_input_iterator.cuh b/cub/cub/iterator/tex_obj_input_iterator.cuh index 4be625c6fcc..e128097ee6e 100644 --- a/cub/cub/iterator/tex_obj_input_iterator.cuh +++ b/cub/cub/iterator/tex_obj_input_iterator.cuh @@ -49,10 +49,12 @@ #include -#include - #include +#if !_CCCL_COMPILER(NVRTC) +# include +#endif // !_CCCL_COMPILER(NVRTC) + CUB_NAMESPACE_BEGIN /** @@ -153,6 +155,7 @@ public: , tex_obj(0) {} +#if !_CCCL_COMPILER(NVRTC) /** * @brief Use this iterator to bind @p ptr with a texture reference * @@ -189,6 +192,7 @@ public: { return CubDebug(cudaDestroyTextureObject(tex_obj)); } +#endif // !_CCCL_COMPILER(NVRTC) /// Postfix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++(int) @@ -281,6 +285,7 @@ public: return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj)); } +#if !_CCCL_COMPILER(NVRTC) /// ostream operator friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { @@ -288,6 +293,7 @@ public: << " )"; return os; } +#endif // !_CCCL_COMPILER(NVRTC) private: // This is hoisted out of operator* because #pragma can't be used inside of diff --git a/cub/cub/thread/thread_search.cuh b/cub/cub/thread/thread_search.cuh index dfd2e07f2c7..ba14853e900 100644 --- a/cub/cub/thread/thread_search.cuh +++ b/cub/cub/thread/thread_search.cuh @@ -46,8 +46,6 @@ #include #include -#include - #include CUB_NAMESPACE_BEGIN diff --git a/cub/test/catch2_test_nvrtc.cu b/cub/test/catch2_test_nvrtc.cu index 9c9fd4144eb..c2d74a76658 100644 --- a/cub/test/catch2_test_nvrtc.cu +++ b/cub/test/catch2_test_nvrtc.cu @@ -38,12 +38,27 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]") nvrtcProgram prog{}; const char* src = R"asdf( - #include - #include - #include - #include - #include - #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include #include #include #include @@ -53,8 +68,13 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]") #include #include #include + #include #include + #include #include + #include + #include + #include #include #include #include @@ -63,7 +83,34 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]") #include #include #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include #include + #include + #include + #include + #include + #include + #include + #include + #include #include #include diff --git a/thrust/thrust/system/cuda/detail/core/util.h b/thrust/thrust/system/cuda/detail/core/util.h index 8bedfd5d818..c62ab01f271 100644 --- a/thrust/thrust/system/cuda/detail/core/util.h +++ b/thrust/thrust/system/cuda/detail/core/util.h @@ -46,7 +46,6 @@ #include #include #include -#include #include #include @@ -200,13 +199,13 @@ struct specialize_plan : specialize_plan_msvc10_war::type::type template struct temp_storage_size { - static constexpr std::size_t value = 0; + static constexpr ::cuda::std::size_t value = 0; }; template struct temp_storage_size> { - static constexpr std::size_t value = sizeof(typename Agent::TempStorage); + static constexpr ::cuda::std::size_t value = sizeof(typename Agent::TempStorage); }; // check whether all Agents requires < MAX_SHMEM shared memory @@ -245,6 +244,7 @@ struct has_enough_shmem : has_enough_shmem_impl ///////////////////////// ///////////////////////// +#if !_CCCL_COMPILER(NVRTC) // AgentPlan structure and helpers // -------------------------------- @@ -343,87 +343,6 @@ THRUST_RUNTIME_FUNCTION typename get_plan::type get_agent_plan(int ptx_ve (return get_agent_plan_impl::get(ptx_version);)); } -// XXX keep this dead-code for now as a gentle reminder -// that kernel launch which reats plan values is the most robust -// mechanism to extract sm-specific tuning parameters -// TODO: since we are unable to afford kernel launch + cudaMemcpy ON EVERY -// algorithm invocation, we need to design a good caching strategy -// such that when the algorithm is called multiple times, only the -// first invocation will invoke kernel launch + cudaMemcpy, but -// the subsequent invocations, will just read cached values from host mem -// If launched from device, this is just a device-function call -// no caching is required. -// ---------------------------------------------------------------------------- -// if we don't know ptx version, we can call kernel -// to retrieve AgentPlan from device code. Slower, but guaranteed to work -// ----------------------------------------------------------------------- -#if 0 - template - void __global__ get_agent_plan_kernel(AgentPlan *plan); - - static _CCCL_DEVICE AgentPlan agent_plan_device; - - template - AgentPlan _CCCL_DEVICE get_agent_plan_dev() - { - AgentPlan plan; - plan.block_threads = Agent::ptx_plan::BLOCK_THREADS; - plan.items_per_thread = Agent::ptx_plan::ITEMS_PER_THREAD; - plan.items_per_tile = Agent::ptx_plan::ITEMS_PER_TILE; - plan.shared_memory_size = temp_storage_size::value; - return plan; - } - - template - AgentPlan _CCCL_HOST_DEVICE _CCCL_FORCEINLINE - xget_agent_plan_impl(F f, cudaStream_t s, void* d_ptr) - { - AgentPlan plan; -# ifdef __CUDA_ARCH__ - plan = get_agent_plan_dev(); -# else - static std::mutex mutex; - bool lock = false; - if (d_ptr == 0) - { - lock = true; - cudaGetSymbolAddress(&d_ptr, agent_plan_device); - } - if (lock) - mutex.lock(); - f<<<1,1,0,s>>>((AgentPlan*)d_ptr); - cudaMemcpyAsync((void*)&plan, - d_ptr, - sizeof(AgentPlan), - cudaMemcpyDeviceToHost, - s); - if (lock) - mutex.unlock(); - cudaStreamSynchronize(s); -# endif - return plan; - } - - template - AgentPlan THRUST_RUNTIME_FUNCTION - get_agent_plan(cudaStream_t s = 0, void *ptr = 0) - { - return xget_agent_plan_impl(get_agent_plan_kernel, - s, - ptr); - } - - template - void __global__ get_agent_plan_kernel(AgentPlan *plan) - { - *plan = get_agent_plan_dev(); - } -#endif - -///////////////////////// -///////////////////////// -///////////////////////// - THRUST_RUNTIME_FUNCTION inline int get_sm_count() { int dev_id; @@ -469,6 +388,7 @@ THRUST_RUNTIME_FUNCTION inline size_t vshmem_size(size_t shmem_per_block, size_t return 0; } } +#endif // !_CCCL_COMPILER(NVRTC) template struct get_arch; @@ -528,6 +448,7 @@ class cuda_optional } }; +#if !_CCCL_COMPILER(NVRTC) THRUST_RUNTIME_FUNCTION inline int get_ptx_version() { int ptx_version = 0; @@ -581,6 +502,7 @@ THRUST_RUNTIME_FUNCTION inline int get_ptx_version() return ptx_version; } +#endif // !_CCCL_COMPILER(NVRTC) #define _CUDA_CUB_RET_IF_FAIL(e) \ { \ @@ -671,6 +593,7 @@ struct uninitialized_array } }; +#if !_CCCL_COMPILER(NVRTC) namespace host { inline cuda_optional get_max_shared_memory_per_block() @@ -699,6 +622,7 @@ THRUST_RUNTIME_FUNCTION cudaError_t alias_storage( { return cub::detail::AliasTemporaries(storage_ptr, storage_size, allocations, allocation_sizes); } +#endif // !_CCCL_COMPILER(NVRTC) } // namespace detail } // namespace core diff --git a/thrust/thrust/system/cuda/detail/util.h b/thrust/thrust/system/cuda/detail/util.h index ffac3f5ffc7..4d251f271cc 100644 --- a/thrust/thrust/system/cuda/detail/util.h +++ b/thrust/thrust/system/cuda/detail/util.h @@ -42,14 +42,16 @@ #include #include -#include -#include - -#include -#include #include +#if !_CCCL_COMPILER(NVRTC) +# include +# include + +# include +#endif // !_CCCL_COMPILER(NVRTC) + THRUST_NAMESPACE_BEGIN namespace cuda_cub { @@ -132,6 +134,7 @@ _CCCL_HOST_DEVICE cudaError_t synchronize_optional(Policy& policy) return synchronize_stream_optional(derived_cast(policy)); } +#if !_CCCL_COMPILER(NVRTC) template THRUST_HOST_FUNCTION cudaError_t trivial_copy_from_device(Type* dst, Type const* src, size_t count, cudaStream_t stream) { @@ -176,6 +179,7 @@ trivial_copy_device_to_device(Policy& policy, Type* dst, Type const* src, size_t cuda_cub::synchronize_optional(policy); return status; } +#endif // !_CCCL_COMPILER(NVRTC) _CCCL_HOST_DEVICE inline void throw_on_error(cudaError_t status) { diff --git a/thrust/thrust/system/error_code.h b/thrust/thrust/system/error_code.h index 1573f8a3a82..c434d788985 100644 --- a/thrust/thrust/system/error_code.h +++ b/thrust/thrust/system/error_code.h @@ -33,7 +33,9 @@ #include #include -#include +#if !_CCCL_COMPILER(NVRTC) +# include +#endif // !_CCCL_COMPILER(NVRTC) THRUST_NAMESPACE_BEGIN @@ -329,10 +331,12 @@ inline error_code make_error_code(errc::errc_t e); */ inline bool operator<(const error_code& lhs, const error_code& rhs); +#if !_CCCL_COMPILER(NVRTC) /*! Effects: os << ec.category().name() << ':' << ec.value(). */ template std::basic_ostream& operator<<(std::basic_ostream& os, const error_code& ec); +#endif // !_CCCL_COMPILER(NVRTC) // [19.5.3] class error_condition