From cb6fe7c777fa586c0ad6e76fb2fdfd81953d2308 Mon Sep 17 00:00:00 2001
From: tsuki <12711693+enp1s0@users.noreply.github.com>
Date: Wed, 26 Feb 2025 02:42:04 +0900
Subject: [PATCH] [BUG] Fix illegal memory access in linalg::reduction (#2592)

An illegal memory access occurs when calling `mean` on a large matrix,
specifically when `extent(0) * extent(1)` exceeds `INT_MAX`. This PR fixes it.

Although I think it is safer to use `uint64_t` or `size_t` rather than
`IdxType` as the index type in the kernel, I follow the
[coalescedReduction kernels](https://github.com/rapidsai/raft/blob/branch-25.04/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh).

Authors:
  - tsuki (https://github.com/enp1s0)

Approvers:
  - Micka (https://github.com/lowener)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2592
---
 .../raft/linalg/detail/strided_reduction.cuh | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh
index 567dc6220e..eba3c67d0d 100644
--- a/cpp/include/raft/linalg/detail/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh
@@ -35,19 +35,19 @@ namespace detail {
 // Note that the compensation will only be performed 'per-block' for performance
 // reasons and therefore not be equivalent to a sequential compensation.
 
-template <typename Type, typename MainLambda>
+template <typename Type, typename IdxType, typename MainLambda>
 RAFT_KERNEL stridedSummationKernel(
-  Type* out, const Type* data, int D, int N, Type init, MainLambda main_op)
+  Type* out, const Type* data, IdxType D, IdxType N, Type init, MainLambda main_op)
 {
   // Thread reduction
-  Type thread_sum = Type(init);
-  Type thread_c   = Type(0);
-  int colStart    = blockIdx.x * blockDim.x + threadIdx.x;
+  Type thread_sum  = Type(init);
+  Type thread_c    = Type(0);
+  IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x;
   if (colStart < D) {
-    int rowStart = blockIdx.y * blockDim.y + threadIdx.y;
-    int stride   = blockDim.y * gridDim.y;
-    for (int j = rowStart; j < N; j += stride) {
-      int idx = colStart + j * D;
+    IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y;
+    IdxType stride   = blockDim.y * gridDim.y;
+    for (IdxType j = rowStart; j < N; j += stride) {
+      auto idx = colStart + j * D;
 
       // KahanBabushkaNeumaierSum
       const Type cur_value = main_op(data[idx], j);
@@ -97,8 +97,8 @@ template <typename InType,
           typename ReduceLambda>
 RAFT_KERNEL stridedReductionKernel(OutType* dots,
                                    const InType* data,
-                                   int D,
-                                   int N,
+                                   IdxType D,
+                                   IdxType N,
                                    OutType init,
                                    MainLambda main_op,
                                    ReduceLambda reduce_op)
@@ -167,7 +167,7 @@ void stridedReduction(OutType* dots,
                   raft::min((IdxType)MaxBlocksDimY, raft::ceildiv(N, (IdxType)MinRowsPerBlk)));
     const size_t shmemSize = sizeof(OutType) * Block.x * 2;
-    stridedSummationKernel<InType, MainLambda>
+    stridedSummationKernel<InType, IdxType, MainLambda>
      <<<grid, Block, shmemSize, stream>>>(dots, data, D, N, init, main_op);
   } else {
     // Arbitrary numbers for now, probably need to tune
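
As a concrete illustration of the failure mode this patch addresses, here is a
minimal standalone CUDA sketch. It is hypothetical code, not part of RAFT or
this patch: the kernel `sum_cols` and its launch shape are made up, and the
patch's per-block compensated (Kahan) summation is replaced by a plain
`atomicAdd` for brevity. It shows how a 32-bit index type makes
`colStart + j * D` wrap past `INT_MAX` into an out-of-bounds offset, while
instantiating the same kernel with a 64-bit index type keeps the offset valid:

```cuda
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical stand-in for the patched kernel's addressing logic.
// With IdxType = int, `colStart + j * D` overflows once j * D passes
// INT_MAX; with IdxType = int64_t the offset stays in range.
template <typename Type, typename IdxType>
__global__ void sum_cols(Type* out, const Type* data, IdxType D, IdxType N)
{
  IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x;
  if (colStart < D) {
    Type acc         = Type(0);
    IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y;
    IdxType stride   = blockDim.y * gridDim.y;
    for (IdxType j = rowStart; j < N; j += stride) {
      IdxType idx = colStart + j * D;  // the expression that overflowed in the bug
      acc += data[idx];
    }
    // Simplified: the real kernel performs a compensated per-block reduction.
    atomicAdd(&out[colStart], acc);
  }
}

int main()
{
  // D * N = 65536 * 40000 = 2,621,440,000 > INT_MAX (~10.5 GB of floats,
  // so this repro needs a large-memory GPU).
  const int64_t D = 65536, N = 40000;
  float *data, *out;
  cudaMalloc(&data, sizeof(float) * D * N);
  cudaMalloc(&out, sizeof(float) * D);
  cudaMemset(data, 0, sizeof(float) * D * N);
  cudaMemset(out, 0, sizeof(float) * D);

  const dim3 block(32, 8);
  const dim3 grid(static_cast<unsigned>((D + block.x - 1) / block.x), 64);
  // 32-bit indexing reproduces the illegal memory access:
  // sum_cols<float, int><<<grid, block>>>(out, data, (int)D, (int)N);
  // 64-bit indexing is safe:
  sum_cols<float, int64_t><<<grid, block>>>(out, data, D, N);

  printf("status: %s\n", cudaGetErrorString(cudaDeviceSynchronize()));
  cudaFree(data);
  cudaFree(out);
  return 0;
}
```

Running the commented-out `int` instantiation under `compute-sanitizer` should
flag out-of-bounds global reads, the same class of error the PR description
reports for `mean`; the `int64_t` instantiation mirrors the fix of threading
`IdxType` through the kernel in place of hard-coded `int`.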