From addb059975478375a422d32e9bec30e1aeb16527 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 9 Dec 2023 05:59:20 +0100 Subject: [PATCH] Removing code that explicitly compares equality of rmm memory resources (#2047) Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/raft/pull/2047 --- .../raft/cluster/detail/kmeans_balanced.cuh | 8 +-- .../detail/device_memory_resource.hpp | 58 ----------------- .../raft/matrix/detail/select_k-inl.cuh | 1 + .../raft/matrix/detail/select_radix.cuh | 14 +---- .../raft/matrix/detail/select_warpsort.cuh | 4 +- .../neighbors/detail/cagra/cagra_build.cuh | 6 +- .../neighbors/detail/cagra/cagra_search.cuh | 2 - .../neighbors/detail/ivf_flat_search-inl.cuh | 7 +-- .../raft/neighbors/detail/ivf_pq_build.cuh | 62 ++++--------------- .../raft/neighbors/detail/ivf_pq_search.cuh | 2 - .../raft_internal/neighbors/naive_knn.cuh | 3 +- 11 files changed, 22 insertions(+), 145 deletions(-) delete mode 100644 cpp/include/raft/core/resource/detail/device_memory_resource.hpp diff --git a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh index 593d7d8fa9..3b5d3ff02a 100644 --- a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh @@ -44,6 +44,7 @@ #include #include +#include #include #include #include @@ -970,16 +971,11 @@ void build_hierarchical(const raft::resources& handle, IdxT n_mesoclusters = std::min(n_clusters, static_cast(std::sqrt(n_clusters) + 0.5)); RAFT_LOG_DEBUG("build_hierarchical: n_mesoclusters: %u", n_mesoclusters); + // TODO: Remove the explicit managed memory- we shouldn't be creating this on the user's behalf. rmm::mr::managed_memory_resource managed_memory; rmm::mr::device_memory_resource* device_memory = resource::get_workspace_resource(handle); auto [max_minibatch_size, mem_per_row] = calc_minibatch_size(n_clusters, n_rows, dim, params.metric, std::is_same_v); - auto pool_guard = - raft::get_pool_memory_resource(device_memory, mem_per_row * size_t(max_minibatch_size)); - if (pool_guard) { - RAFT_LOG_DEBUG("build_hierarchical: using pool memory resource with initial size %zu bytes", - mem_per_row * size_t(max_minibatch_size)); - } // Precompute the L2 norm of the dataset if relevant. const MathT* dataset_norm = nullptr; diff --git a/cpp/include/raft/core/resource/detail/device_memory_resource.hpp b/cpp/include/raft/core/resource/detail/device_memory_resource.hpp deleted file mode 100644 index 9d3f13689d..0000000000 --- a/cpp/include/raft/core/resource/detail/device_memory_resource.hpp +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include - -#include -#include -#include - -namespace raft::resource::detail { - -/** - * Warn a user of the calling algorithm if they use the default non-pooled memory allocator, - * as it may hurt the performance. - * - * This helper function is designed to produce the warning once for a given `user_name`. - * - * @param[in] res - * @param[in] user_name the name of the algorithm or any other identification. - * - */ -inline void warn_non_pool_workspace(resources const& res, std::string user_name) -{ - // Detect if the plain cuda memory resource is used for the workspace - if (rmm::mr::cuda_memory_resource{}.is_equal(*get_workspace_resource(res)->get_upstream())) { - static std::set notified_names{}; - static std::mutex mutex{}; - std::lock_guard guard(mutex); - auto [it, inserted] = notified_names.insert(std::move(user_name)); - if (inserted) { - RAFT_LOG_WARN( - "[%s] the default cuda resource is used for the raft workspace allocations. This may lead " - "to a significant slowdown for this algorithm. Consider using the default pool resource " - "(`raft::resource::set_workspace_to_pool_resource`) or set your own resource explicitly " - "(`raft::resource::set_workspace_resource`).", - it->c_str()); - } - } -} - -} // namespace raft::resource::detail diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh index 9024975734..63aeff2f1c 100644 --- a/cpp/include/raft/matrix/detail/select_k-inl.cuh +++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh @@ -244,6 +244,7 @@ void select_k(raft::resources const& handle, common::nvtx::range fun_scope( "matrix::select_k(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k); + if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } auto stream = raft::resource::get_cuda_stream(handle); auto algo = choose_select_k_algorithm(batch_size, len, k); diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh index fa12005df2..4245be42d6 100644 --- a/cpp/include/raft/matrix/detail/select_radix.cuh +++ b/cpp/include/raft/matrix/detail/select_radix.cuh @@ -835,6 +835,8 @@ void radix_topk(const T* in, static_assert(calc_num_passes() > 1); constexpr int num_buckets = calc_num_buckets(); + if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } + auto kernel = radix_kernel; const size_t max_chunk_size = calc_chunk_size(batch_size, len, sm_cnt, kernel, false); @@ -843,15 +845,7 @@ void radix_topk(const T* in, } const IdxT buf_len = calc_buf_len(len); - size_t req_aux = max_chunk_size * (sizeof(Counter) + num_buckets * sizeof(IdxT)); size_t req_buf = max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)); - size_t mem_req = req_aux + req_buf + 256 * 6; // might need extra memory for alignment - - auto pool_guard = raft::get_pool_memory_resource(mr, mem_req); - if (pool_guard) { - RAFT_LOG_DEBUG("radix::select_k: using pool memory resource with initial size %zu bytes", - mem_req); - } rmm::device_uvector> counters(max_chunk_size, stream, mr); rmm::device_uvector histograms(max_chunk_size * num_buckets, stream, mr); @@ -1120,10 +1114,6 @@ void radix_topk_one_block(const T* in, const size_t max_chunk_size = calc_chunk_size(batch_size, len, sm_cnt, kernel, true); - auto pool_guard = - raft::get_pool_memory_resource(mr, max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT))); - if (pool_guard) { RAFT_LOG_DEBUG("radix::select_k: using pool memory resource"); } - rmm::device_uvector bufs( max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)), stream, mr); diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh index 0ee87de4f7..018eea2306 100644 --- a/cpp/include/raft/matrix/detail/select_warpsort.cuh +++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh @@ -988,9 +988,7 @@ void select_k_(int num_of_block, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = nullptr) { - auto pool_guard = raft::get_pool_memory_resource( - mr, num_of_block * k * batch_size * 2 * std::max(sizeof(T), sizeof(IdxT))); - if (pool_guard) { RAFT_LOG_DEBUG("warpsort::select_k: using pool memory resource"); } + if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } rmm::device_uvector tmp_val(num_of_block * k * batch_size, stream, mr); rmm::device_uvector tmp_idx(num_of_block * k * batch_size, stream, mr); diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh index ddaf77a22f..812cca5b3b 100644 --- a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -48,7 +47,6 @@ void build_knn_graph(raft::resources const& res, std::optional build_params = std::nullopt, std::optional search_params = std::nullopt) { - resource::detail::warn_non_pool_workspace(res, "raft::neighbors::cagra::build"); RAFT_EXPECTS(!build_params || build_params->metric == distance::DistanceType::L2Expanded, "Currently only L2Expanded metric is supported"); @@ -125,9 +123,7 @@ void build_knn_graph(raft::resources const& res, bool first = true; const auto start_clock = std::chrono::system_clock::now(); - rmm::mr::device_memory_resource* device_memory = nullptr; - auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); - if (pool_guard) { RAFT_LOG_DEBUG("ivf_pq using pool memory resource"); } + rmm::mr::device_memory_resource* device_memory = raft::resource::get_workspace_resource(res); raft::spatial::knn::detail::utils::batch_load_iterator vec_batches( dataset.data_handle(), diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh index 81e714dc4e..23a966d41f 100644 --- a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -110,7 +109,6 @@ void search_main(raft::resources const& res, raft::device_matrix_view distances, CagraSampleFilterT sample_filter = CagraSampleFilterT()) { - resource::detail::warn_non_pool_workspace(res, "raft::neighbors::cagra::search"); RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n", static_cast(index.dataset().extent(0)), static_cast(index.dataset().extent(1))); diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh index c0f856103a..09c58602a4 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh @@ -222,6 +222,7 @@ inline void search(raft::resources const& handle, common::nvtx::range fun_scope( "ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim()); + if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } RAFT_EXPECTS(params.n_probes > 0, "n_probes (number of clusters to probe in the search) must be positive."); auto n_probes = std::min(params.n_probes, index.n_lists()); @@ -233,12 +234,6 @@ inline void search(raft::resources const& handle, raft::div_rounding_up_safe( kExpectedWsSize, 16ull * uint64_t{n_probes} * k + 4ull * index.dim())); - auto pool_guard = raft::get_pool_memory_resource(mr, max_queries * n_probes * k * 16); - if (pool_guard) { - RAFT_LOG_DEBUG("ivf_flat::search: using pool memory resource with initial size %zu bytes", - n_queries * n_probes * k * 16ull); - } - for (uint32_t offset_q = 0; offset_q < n_queries; offset_q += max_queries) { uint32_t queries_batch = min(max_queries, n_queries - offset_q); diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index e57133fc23..6adc4d583c 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include @@ -29,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -48,11 +46,10 @@ #include #include +#include #include #include #include -#include -#include #include #include @@ -1559,7 +1556,6 @@ void extend(raft::resources const& handle, common::nvtx::range fun_scope( "ivf_pq::extend(%zu, %u)", size_t(n_rows), index->dim()); - resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::extend"); auto stream = resource::get_cuda_stream(handle); const auto n_clusters = index->n_lists(); @@ -1569,13 +1565,7 @@ void extend(raft::resources const& handle, static_assert(std::is_same_v || std::is_same_v || std::is_same_v, "Unsupported data type"); - rmm::mr::device_memory_resource* device_memory = nullptr; - auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); - if (pool_guard) { RAFT_LOG_DEBUG("ivf_pq::extend: using pool memory resource"); } - - rmm::mr::managed_memory_resource managed_memory_upstream; - rmm::mr::pool_memory_resource managed_memory( - &managed_memory_upstream, 1024 * 1024); + rmm::mr::device_memory_resource* device_memory = raft::resource::get_workspace_resource(handle); // The spec defines how the clusters look like auto spec = list_spec{ @@ -1593,17 +1583,9 @@ void extend(raft::resources const& handle, size_t free_mem, total_mem; RAFT_CUDA_TRY(cudaMemGetInfo(&free_mem, &total_mem)); - // Decide on an approximate threshold when we'd better start saving device memory by using - // managed allocations for large device buffers - rmm::mr::device_memory_resource* labels_mr = device_memory; - rmm::mr::device_memory_resource* batches_mr = device_memory; - if (n_rows * (index->dim() * sizeof(T) + index->pq_dim() + sizeof(IdxT) + sizeof(uint32_t)) > - free_mem) { - labels_mr = &managed_memory; - } // Allocate a buffer for the new labels (classifying the new data) - rmm::device_uvector new_data_labels(n_rows, stream, labels_mr); - if (labels_mr == device_memory) { free_mem -= sizeof(uint32_t) * n_rows; } + rmm::device_uvector new_data_labels(n_rows, stream, device_memory); + free_mem -= sizeof(uint32_t) * n_rows; // Calculate the batch size for the input data if it's not accessible directly from the device constexpr size_t kReasonableMaxBatchSize = 65536; @@ -1632,19 +1614,13 @@ void extend(raft::resources const& handle, while (size_factor * max_batch_size > free_mem && max_batch_size > 128) { max_batch_size >>= 1; } - if (size_factor * max_batch_size > free_mem) { - // if that still doesn't fit, resort to the UVM - batches_mr = &managed_memory; - max_batch_size = kReasonableMaxBatchSize; - } else { - // If we're keeping the batches in device memory, update the available mem tracker. - free_mem -= size_factor * max_batch_size; - } + // If we're keeping the batches in device memory, update the available mem tracker. + free_mem -= size_factor * max_batch_size; } // Predict the cluster labels for the new data, in batches if necessary utils::batch_load_iterator vec_batches( - new_vectors, n_rows, index->dim(), max_batch_size, stream, batches_mr); + new_vectors, n_rows, index->dim(), max_batch_size, stream, device_memory); // Release the placeholder memory, because we don't intend to allocate any more long-living // temporary buffers before we allocate the index data. // This memory could potentially speed up UVM accesses, if any. @@ -1717,7 +1693,7 @@ void extend(raft::resources const& handle, // By this point, the index state is updated and valid except it doesn't contain the new data // Fill the extended index with the new data (possibly, in batches) utils::batch_load_iterator idx_batches( - new_indices, n_rows, 1, max_batch_size, stream, batches_mr); + new_indices, n_rows, 1, max_batch_size, stream, device_memory); for (const auto& vec_batch : vec_batches) { const auto& idx_batch = *idx_batches++; process_and_fill_codes(handle, @@ -1728,7 +1704,7 @@ void extend(raft::resources const& handle, : std::variant(IdxT(idx_batch.offset())), new_data_labels.data() + vec_batch.offset(), IdxT(vec_batch.size()), - batches_mr); + device_memory); } } @@ -1758,7 +1734,6 @@ auto build(raft::resources const& handle, { common::nvtx::range fun_scope( "ivf_pq::build(%zu, %u)", size_t(n_rows), dim); - resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::build"); static_assert(std::is_same_v || std::is_same_v || std::is_same_v, "Unsupported data type"); @@ -1782,21 +1757,10 @@ auto build(raft::resources const& handle, auto* device_memory = resource::get_workspace_resource(handle); rmm::mr::managed_memory_resource managed_memory_upstream; - rmm::mr::pool_memory_resource managed_memory( - &managed_memory_upstream, 1024 * 1024); - - // If the trainset is small enough to comfortably fit into device memory, put it there. - // Otherwise, use the managed memory. - constexpr size_t kTolerableRatio = 4; - rmm::mr::device_memory_resource* big_memory_resource = &managed_memory; - if (sizeof(float) * n_rows_train * index.dim() * kTolerableRatio < - resource::get_workspace_free_bytes(handle)) { - big_memory_resource = device_memory; - } // Besides just sampling, we transform the input dataset into floats to make it easier // to use gemm operations from cublas. - rmm::device_uvector trainset(n_rows_train * index.dim(), stream, big_memory_resource); + rmm::device_uvector trainset(n_rows_train * index.dim(), stream, device_memory); // TODO: a proper sampling if constexpr (std::is_same_v) { RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(), @@ -1865,7 +1829,7 @@ auto build(raft::resources const& handle, handle, kmeans_params, trainset_const_view, centers_view, utils::mapping{}); // Trainset labels are needed for training PQ codebooks - rmm::device_uvector labels(n_rows_train, stream, big_memory_resource); + rmm::device_uvector labels(n_rows_train, stream, device_memory); auto centers_const_view = raft::make_device_matrix_view( cluster_centers, index.n_lists(), index.dim()); auto labels_view = raft::make_device_vector_view(labels.data(), n_rows_train); @@ -1894,7 +1858,7 @@ auto build(raft::resources const& handle, trainset.data(), labels.data(), params.kmeans_n_iters, - &managed_memory); + &managed_memory_upstream); break; case codebook_gen::PER_CLUSTER: train_per_cluster(handle, @@ -1903,7 +1867,7 @@ auto build(raft::resources const& handle, trainset.data(), labels.data(), params.kmeans_n_iters, - &managed_memory); + &managed_memory_upstream); break; default: RAFT_FAIL("Unreachable code"); } diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index 7f5b316d41..557a1be668 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -747,7 +746,6 @@ inline void search(raft::resources const& handle, params.n_probes, k, index.dim()); - resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::search"); RAFT_EXPECTS( params.internal_distance_dtype == CUDA_R_16F || params.internal_distance_dtype == CUDA_R_32F, diff --git a/cpp/internal/raft_internal/neighbors/naive_knn.cuh b/cpp/internal/raft_internal/neighbors/naive_knn.cuh index 64a81da01e..594fff0ba0 100644 --- a/cpp/internal/raft_internal/neighbors/naive_knn.cuh +++ b/cpp/internal/raft_internal/neighbors/naive_knn.cuh @@ -90,8 +90,7 @@ void naive_knn(raft::resources const& handle, uint32_t k, raft::distance::DistanceType type) { - rmm::mr::device_memory_resource* mr = nullptr; - auto pool_guard = raft::get_pool_memory_resource(mr, 1024 * 1024); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); auto stream = raft::resource::get_cuda_stream(handle); dim3 block_dim(16, 32, 1);