diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh index eacda52673d..629993c3cea 100644 --- a/cub/cub/device/device_histogram.cuh +++ b/cub/cub/device/device_histogram.cuh @@ -45,6 +45,8 @@ #include #include +#include + #include #include @@ -195,10 +197,10 @@ struct DeviceHistogram d_temp_storage, temp_storage_bytes, d_samples, - &d_histogram, - &num_levels, - &lower_level, - &upper_level, + ::cuda::std::array{{d_histogram}}, + ::cuda::std::array{{num_levels}}, + ::cuda::std::array{{lower_level}}, + ::cuda::std::array{{upper_level}}, num_samples, static_cast(1), sizeof(SampleT) * num_samples, @@ -346,10 +348,10 @@ struct DeviceHistogram d_temp_storage, temp_storage_bytes, d_samples, - &d_histogram, - &num_levels, - &lower_level, - &upper_level, + ::cuda::std::array{{d_histogram}}, + ::cuda::std::array{{num_levels}}, + ::cuda::std::array{{lower_level}}, + ::cuda::std::array{{upper_level}}, num_row_samples, num_rows, row_stride_bytes, @@ -501,10 +503,10 @@ struct DeviceHistogram void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], - const int num_levels[NUM_ACTIVE_CHANNELS], - const LevelT lower_level[NUM_ACTIVE_CHANNELS], - const LevelT upper_level[NUM_ACTIVE_CHANNELS], + ::cuda::std::array d_histogram, + ::cuda::std::array num_levels, + ::cuda::std::array lower_level, + ::cuda::std::array upper_level, OffsetT num_pixels, cudaStream_t stream = 0) { @@ -688,10 +690,10 @@ struct DeviceHistogram void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], - const int num_levels[NUM_ACTIVE_CHANNELS], - const LevelT lower_level[NUM_ACTIVE_CHANNELS], - const LevelT upper_level[NUM_ACTIVE_CHANNELS], + ::cuda::std::array d_histogram, + ::cuda::std::array num_levels, + ::cuda::std::array lower_level, + ::cuda::std::array upper_level, OffsetT num_row_pixels, OffsetT num_rows, size_t row_stride_bytes, @@ -855,9 +857,9 @@ struct DeviceHistogram d_temp_storage, temp_storage_bytes, d_samples, - &d_histogram, - &num_levels, - &d_levels, + ::cuda::std::array{{d_histogram}}, + ::cuda::std::array{{num_levels}}, + ::cuda::std::array{{d_levels}}, num_samples, (OffsetT) 1, (size_t) (sizeof(SampleT) * num_samples), @@ -993,9 +995,9 @@ struct DeviceHistogram d_temp_storage, temp_storage_bytes, d_samples, - &d_histogram, - &num_levels, - &d_levels, + ::cuda::std::array{{d_histogram}}, + ::cuda::std::array{{num_levels}}, + ::cuda::std::array{{d_levels}}, num_row_samples, num_rows, row_stride_bytes, @@ -1138,9 +1140,9 @@ struct DeviceHistogram void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], - const int num_levels[NUM_ACTIVE_CHANNELS], - const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], + ::cuda::std::array d_histogram, + ::cuda::std::array num_levels, + ::cuda::std::array d_levels, OffsetT num_pixels, cudaStream_t stream = 0) { @@ -1315,9 +1317,9 @@ struct DeviceHistogram void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], - const int num_levels[NUM_ACTIVE_CHANNELS], - const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], + ::cuda::std::array d_histogram, + ::cuda::std::array num_levels, + ::cuda::std::array d_levels, OffsetT num_row_pixels, OffsetT num_rows, size_t row_stride_bytes, diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index d5a894c3310..77a5a2cc580 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -272,46 +272,17 @@ struct dispatch_histogram void* d_temp_storage; size_t& temp_storage_bytes; SampleIteratorT d_samples; - CounterT** d_output_histograms; - const int* num_privatized_levels; - PrivatizedDecodeOpT* privatized_decode_op; - const int* num_output_levels; - OutputDecodeOpT* output_decode_op; + ::cuda::std::array d_output_histograms; + ::cuda::std::array num_privatized_levels; + ::cuda::std::array privatized_decode_op; + ::cuda::std::array num_output_levels; + ::cuda::std::array output_decode_op; int max_num_output_bins; OffsetT num_row_pixels; OffsetT num_rows; OffsetT row_stride_samples; cudaStream_t stream; - CUB_RUNTIME_FUNCTION dispatch_histogram( - void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], - const int num_privatized_levels[NUM_ACTIVE_CHANNELS], - PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], - const int num_output_levels[NUM_ACTIVE_CHANNELS], - OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], - int max_num_output_bins, - OffsetT num_row_pixels, - OffsetT num_rows, - OffsetT row_stride_samples, - cudaStream_t stream) - : d_temp_storage(d_temp_storage) - , temp_storage_bytes(temp_storage_bytes) - , d_samples(d_samples) - , d_output_histograms(d_output_histograms) - , num_privatized_levels(num_privatized_levels) - , privatized_decode_op(privatized_decode_op) - , num_output_levels(num_output_levels) - , output_decode_op(output_decode_op) - , max_num_output_bins(max_num_output_bins) - , num_row_pixels(num_row_pixels) - , num_rows(num_rows) - , row_stride_samples(row_stride_samples) - , stream(stream) - {} - template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t Invoke(DeviceHistogramInitKernelT histogram_init_kernel, DeviceHistogramSweepKernelT histogram_sweep_kernel) @@ -402,33 +373,21 @@ struct dispatch_histogram GridQueue tile_queue(allocations[NUM_ALLOCATIONS - 1]); // Wrap arrays so we can pass them by-value to the kernel - ::cuda::std::array d_output_histograms_wrapper; ::cuda::std::array d_privatized_histograms_wrapper; - ::cuda::std::array privatized_decode_op_wrapper; - ::cuda::std::array output_decode_op_wrapper; ::cuda::std::array num_privatized_bins_wrapper; ::cuda::std::array num_output_bins_wrapper; auto* typedAllocations = reinterpret_cast(allocations); - ::cuda::std::copy( - d_output_histograms, d_output_histograms + NUM_ACTIVE_CHANNELS, d_output_histograms_wrapper.begin()); ::cuda::std::copy( typedAllocations, typedAllocations + NUM_ACTIVE_CHANNELS, d_privatized_histograms_wrapper.begin()); - // TODO(bgruber): we can probably skip copying the function objects when they are empty - ::cuda::std::copy( - privatized_decode_op, privatized_decode_op + NUM_ACTIVE_CHANNELS, privatized_decode_op_wrapper.begin()); - ::cuda::std::copy(output_decode_op, output_decode_op + NUM_ACTIVE_CHANNELS, output_decode_op_wrapper.begin()); auto minus_one = ::cuda::proclaim_return_type([](int levels) { return levels - 1; }); ::cuda::std::transform( - num_privatized_levels, - num_privatized_levels + NUM_ACTIVE_CHANNELS, - num_privatized_bins_wrapper.begin(), - minus_one); + num_privatized_levels.begin(), num_privatized_levels.end(), num_privatized_bins_wrapper.begin(), minus_one); ::cuda::std::transform( - num_output_levels, num_output_levels + NUM_ACTIVE_CHANNELS, num_output_bins_wrapper.begin(), minus_one); + num_output_levels.begin(), num_output_levels.end(), num_output_bins_wrapper.begin(), minus_one); int histogram_init_block_threads = 256; int histogram_init_grid_dims = @@ -445,7 +404,7 @@ struct dispatch_histogram // Invoke histogram_init_kernel THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( histogram_init_grid_dims, histogram_init_block_threads, 0, stream) - .doit(histogram_init_kernel, num_output_bins_wrapper, d_output_histograms_wrapper, tile_queue); + .doit(histogram_init_kernel, num_output_bins_wrapper, d_output_histograms, tile_queue); // Return if empty problem if ((blocks_per_row == 0) || (blocks_per_col == 0)) @@ -472,10 +431,10 @@ struct dispatch_histogram d_samples, num_output_bins_wrapper, num_privatized_bins_wrapper, - d_output_histograms_wrapper, + d_output_histograms, d_privatized_histograms_wrapper, - output_decode_op_wrapper, - privatized_decode_op_wrapper, + output_decode_op, + privatized_decode_op, num_row_pixels, num_rows, row_stride_samples, @@ -912,9 +871,9 @@ public: void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], - const int num_output_levels[NUM_ACTIVE_CHANNELS], - const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], + ::cuda::std::array d_output_histograms, + ::cuda::std::array num_output_levels, + ::cuda::std::array d_levels, OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, @@ -946,8 +905,8 @@ public: // Use the pass-thru transform op for converting privatized bins to output bins using OutputDecodeOpT = PassThruTransform; - PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]{}; - OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]{}; + ::cuda::std::array privatized_decode_op{}; + ::cuda::std::array output_decode_op{}; int max_levels = num_output_levels[0]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) @@ -976,7 +935,7 @@ public: OutputDecodeOpT, OffsetT, MaxPolicyT> - dispatch( + dispatch{ d_temp_storage, temp_storage_bytes, d_samples, @@ -989,7 +948,7 @@ public: num_row_pixels, num_rows, row_stride_samples, - stream); + stream}; error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) @@ -1012,7 +971,7 @@ public: OutputDecodeOpT, OffsetT, MaxPolicyT> - dispatch( + dispatch{ d_temp_storage, temp_storage_bytes, d_samples, @@ -1025,7 +984,7 @@ public: num_row_pixels, num_rows, row_stride_samples, - stream); + stream}; error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) @@ -1089,9 +1048,9 @@ public: void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], - const int num_output_levels[NUM_ACTIVE_CHANNELS], - const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], + ::cuda::std::array d_output_histograms, + ::cuda::std::array num_output_levels, + ::cuda::std::array d_levels, OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, @@ -1122,9 +1081,9 @@ public: // Use the search transform op for converting privatized bins to output bins using OutputDecodeOpT = SearchTransform; - int num_privatized_levels[NUM_ACTIVE_CHANNELS]; - PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]{}; - OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]{}; + ::cuda::std::array num_privatized_levels; + ::cuda::std::array privatized_decode_op{}; + ::cuda::std::array output_decode_op{}; int max_levels = num_output_levels[0]; // Maximum number of levels in any channel for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) @@ -1151,7 +1110,7 @@ public: OutputDecodeOpT, OffsetT, MaxPolicyT> - dispatch( + dispatch{ d_temp_storage, temp_storage_bytes, d_samples, @@ -1164,7 +1123,7 @@ public: num_row_pixels, num_rows, row_stride_samples, - stream); + stream}; error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) @@ -1228,10 +1187,10 @@ public: void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], - const int num_output_levels[NUM_ACTIVE_CHANNELS], - const LevelT lower_level[NUM_ACTIVE_CHANNELS], - const LevelT upper_level[NUM_ACTIVE_CHANNELS], + ::cuda::std::array d_output_histograms, + ::cuda::std::array num_output_levels, + ::cuda::std::array lower_level, + ::cuda::std::array upper_level, OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, @@ -1262,8 +1221,8 @@ public: // Use the pass-thru transform op for converting privatized bins to output bins using OutputDecodeOpT = PassThruTransform; - PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]{}; - OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]{}; + ::cuda::std::array privatized_decode_op{}; + ::cuda::std::array output_decode_op{}; int max_levels = num_output_levels[0]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) @@ -1304,7 +1263,7 @@ public: OutputDecodeOpT, OffsetT, MaxPolicyT> - dispatch( + dispatch{ d_temp_storage, temp_storage_bytes, d_samples, @@ -1317,7 +1276,7 @@ public: num_row_pixels, num_rows, row_stride_samples, - stream); + stream}; error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) @@ -1340,7 +1299,7 @@ public: OutputDecodeOpT, OffsetT, MaxPolicyT> - dispatch( + dispatch{ d_temp_storage, temp_storage_bytes, d_samples, @@ -1353,7 +1312,7 @@ public: num_row_pixels, num_rows, row_stride_samples, - stream); + stream}; error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) @@ -1419,10 +1378,10 @@ public: void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], - const int num_output_levels[NUM_ACTIVE_CHANNELS], - const LevelT lower_level[NUM_ACTIVE_CHANNELS], - const LevelT upper_level[NUM_ACTIVE_CHANNELS], + ::cuda::std::array d_output_histograms, + ::cuda::std::array num_output_levels, + ::cuda::std::array lower_level, + ::cuda::std::array upper_level, OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, @@ -1453,9 +1412,9 @@ public: // Use the scale transform op for converting privatized bins to output bins using OutputDecodeOpT = ScaleTransform; - int num_privatized_levels[NUM_ACTIVE_CHANNELS]; - PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]{}; - OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]{}; + ::cuda::std::array num_privatized_levels; + ::cuda::std::array privatized_decode_op{}; + ::cuda::std::array output_decode_op{}; int max_levels = num_output_levels[0]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) @@ -1483,7 +1442,7 @@ public: OutputDecodeOpT, OffsetT, MaxPolicyT> - dispatch( + dispatch{ d_temp_storage, temp_storage_bytes, d_samples, @@ -1496,7 +1455,7 @@ public: num_row_pixels, num_rows, row_stride_samples, - stream); + stream}; error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) diff --git a/cub/test/catch2_test_device_histogram.cu b/cub/test/catch2_test_device_histogram.cu index 6f9ada49d37..1ee1d80f0c9 100644 --- a/cub/test/catch2_test_device_histogram.cu +++ b/cub/test/catch2_test_device_histogram.cu @@ -82,19 +82,40 @@ auto cast_if_half_pointer(const half_t* p) -> const __half* } #endif // TEST_HALF_T() -template -using caller_vector = c2h:: -#if TEST_LAUNCH == 1 - device_vector; -#else - host_vector; -#endif +template +auto cast_if_half(array a) +{ + return a; +} + +#if TEST_HALF_T() +template +auto cast_if_half(array a) +{ + array<__half, N> r; + for (size_t i = 0; i < N; i++) + { + r[i] = static_cast<__half>(a[i]); + } + return r; +} +#endif // TEST_HALF_T() + +template +auto to_array_of_ptrs(array, N>& in) +{ + array())), N> r; + for (size_t i = 0; i < N; i++) + { + r[i] = cast_if_half_pointer(thrust::raw_pointer_cast(in[i].data())); + } + return r; +} template -auto to_caller_vector_of_ptrs(array, N>& in) - -> caller_vector()))> +auto to_array_of_const_ptrs(array, N>& in) { - c2h::host_vector()))> r(N); + array())), N> r; for (size_t i = 0; i < N; i++) { r[i] = cast_if_half_pointer(thrust::raw_pointer_cast(in[i].data())); @@ -272,7 +293,7 @@ void test_even_and_range(LevelT max_level, int max_level_count, OffsetT width, O { // Setup levels const auto levels = setup_bin_levels_for_even(num_levels, max_level, max_level_count); - const auto& lower_level = levels[0]; // TODO(bgruber): use structured bindings in C++17 + const auto& lower_level = levels[0]; // TODO(bgruber): use structured bindings in C++20 (lambda capture below) const auto& upper_level = levels[1]; CAPTURE(lower_level, upper_level); @@ -319,26 +340,23 @@ void test_even_and_range(LevelT max_level, int max_level_count, OffsetT width, O { histogram_even( sample_ptr, - cast_if_half_pointer(thrust::raw_pointer_cast(d_histogram[0].data())), + to_array_of_ptrs(d_histogram)[0], num_levels[0], - cast_if_half_pointer(lower_level.data())[0], - cast_if_half_pointer(upper_level.data())[0], + cast_if_half(lower_level)[0], + cast_if_half(upper_level)[0], width, height, row_pitch); } else { - auto d_histogram_ptrs = to_caller_vector_of_ptrs(d_histogram); - const auto d_num_levels = caller_vector(num_levels.begin(), num_levels.end()); - const auto d_lower_level = caller_vector(lower_level.begin(), lower_level.end()); - const auto d_upper_level = caller_vector(upper_level.begin(), upper_level.end()); + auto d_histogram_ptrs = to_array_of_ptrs(d_histogram); multi_histogram_even( sample_ptr, - cast_if_half_pointer(thrust::raw_pointer_cast(d_histogram_ptrs.data())), - thrust::raw_pointer_cast(d_num_levels.data()), - cast_if_half_pointer(thrust::raw_pointer_cast(d_lower_level.data())), - cast_if_half_pointer(thrust::raw_pointer_cast(d_upper_level.data())), + d_histogram_ptrs, + num_levels, + cast_if_half(lower_level), + cast_if_half(upper_level), width, height, row_pitch); @@ -375,26 +393,19 @@ void test_even_and_range(LevelT max_level, int max_level_count, OffsetT width, O { histogram_range( sample_ptr, - cast_if_half_pointer(thrust::raw_pointer_cast(d_histogram[0].data())), + to_array_of_ptrs(d_histogram)[0], num_levels[0], - cast_if_half_pointer(thrust::raw_pointer_cast(d_levels[0].data())), + to_array_of_const_ptrs(d_levels)[0], width, height, row_pitch); } else { - auto d_histogram_ptrs = to_caller_vector_of_ptrs(d_histogram); - const auto d_num_levels = caller_vector(num_levels.begin(), num_levels.end()); - const auto level_ptrs = to_caller_vector_of_ptrs(d_levels); + auto d_histogram_ptrs = to_array_of_ptrs(d_histogram); + auto level_ptrs = to_array_of_const_ptrs(d_levels); multi_histogram_range( - sample_ptr, - cast_if_half_pointer(thrust::raw_pointer_cast(d_histogram_ptrs.data())), - thrust::raw_pointer_cast(d_num_levels.data()), - cast_if_half_pointer(thrust::raw_pointer_cast(level_ptrs.data())), - width, - height, - row_pitch); + sample_ptr, d_histogram_ptrs, num_levels, level_ptrs, width, height, row_pitch); } } for (size_t c = 0; c < ActiveChannels; ++c) @@ -493,8 +504,8 @@ C2H_TEST("DeviceHistogram::HistogramEven sample iterator", "[histogram_even][dev const auto total_values = (width + padding) * channels * height; const auto num_levels = array{11, 3, 2}; - const auto lower_level = caller_vector{0, -10, cs::numeric_limits::lowest()}; - const auto upper_level = caller_vector{total_values, 10, cs::numeric_limits::max()}; + const auto lower_level = array{0, -10, cs::numeric_limits::lowest()}; + const auto upper_level = array{total_values, 10, cs::numeric_limits::max()}; auto sample_iterator = thrust::counting_iterator(0); @@ -510,14 +521,7 @@ C2H_TEST("DeviceHistogram::HistogramEven sample iterator", "[histogram_even][dev } multi_histogram_even( - sample_iterator, - thrust::raw_pointer_cast(to_caller_vector_of_ptrs(d_histogram).data()), - thrust::raw_pointer_cast(caller_vector(num_levels.begin(), num_levels.end()).data()), - thrust::raw_pointer_cast(lower_level.data()), - thrust::raw_pointer_cast(upper_level.data()), - width, - height, - row_pitch); + sample_iterator, to_array_of_ptrs(d_histogram), num_levels, lower_level, upper_level, width, height, row_pitch); CHECK(d_histogram[0] == c2h::host_vector(10, (width * height) / 10)); CHECK(d_histogram[1] == c2h::host_vector{0, 3});