From 01edf5a66819b30f8fd2409732f074df01ecdb1a Mon Sep 17 00:00:00 2001
From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
Date: Wed, 26 Feb 2025 18:12:14 -0500
Subject: [PATCH] Revert "Add OOM fail reason, attempted allocation size to
 exception messages" (#1843)

Reverts rapidsai/rmm#1827

This appears to be causing downstream issues in cuDF; reverting for now
until we determine the root cause and fix it.

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Rong Ou (https://github.com/rongou)

URL: https://github.com/rapidsai/rmm/pull/1843
---
 .../multi_stream_allocations_bench.cu         |  6 ++-
 .../utilities/simulated_memory_resource.hpp   |  5 +-
 include/rmm/detail/error.hpp                  | 39 +++++++-------
 .../rmm/mr/device/arena_memory_resource.hpp   |  4 +-
 .../cuda_async_view_memory_resource.hpp       |  3 +-
 .../rmm/mr/device/cuda_memory_resource.hpp    |  2 +-
 .../detail/stream_ordered_memory_resource.hpp |  3 +-
 .../mr/device/limiting_resource_adaptor.hpp   |  5 +-
 .../rmm/mr/device/managed_memory_resource.hpp |  2 +-
 .../rmm/mr/device/pool_memory_resource.hpp    | 51 ++++++++-----------
 .../rmm/mr/device/system_memory_resource.hpp  |  5 +-
 .../rmm/mr/host/pinned_memory_resource.hpp    |  3 +-
 .../rmm/mr/pinned_host_memory_resource.hpp    |  2 +-
 13 files changed, 53 insertions(+), 77 deletions(-)

diff --git a/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu b/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu
index 8ada45262..b313f1f05 100644
--- a/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu
+++ b/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu
@@ -138,7 +138,9 @@ MRFactoryFunc get_mr_factory(std::string const& resource_name)
   if (resource_name == "arena") { return &make_arena; }
   if (resource_name == "binning") { return &make_binning; }
 
-  RMM_FAIL("Invalid memory_resource name: " + resource_name);
+  std::cout << "Error: invalid memory_resource name: " << resource_name << std::endl;
+
+  RMM_FAIL();
 }
 
 void declare_benchmark(std::string const& name)
@@ -173,7 +175,7 @@ void declare_benchmark(std::string const& name)
     return;
   }
 
-  RMM_FAIL("Invalid memory_resource name: " + name);
+  std::cout << "Error: invalid memory_resource name: " << name << std::endl;
 }
 
 // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
diff --git a/benchmarks/utilities/simulated_memory_resource.hpp b/benchmarks/utilities/simulated_memory_resource.hpp
index 4ca243e1c..73e2a4d37 100644
--- a/benchmarks/utilities/simulated_memory_resource.hpp
+++ b/benchmarks/utilities/simulated_memory_resource.hpp
@@ -65,10 +65,7 @@ class simulated_memory_resource final : public device_memory_resource {
   void* do_allocate(std::size_t bytes, cuda_stream_view) override
   {
     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
-    RMM_EXPECTS(begin_ + bytes <= end_,
-                "Simulated memory size exceeded (failed to allocate " +
-                  rmm::detail::format_bytes(bytes) + ")",
-                rmm::bad_alloc);
+    RMM_EXPECTS(begin_ + bytes <= end_, "Simulated memory size exceeded", rmm::bad_alloc);
     auto* ptr = static_cast<char*>(begin_);
     begin_ += bytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
     return ptr;
diff --git a/include/rmm/detail/error.hpp b/include/rmm/detail/error.hpp
index 5bf50a89b..3a05d69eb 100644
--- a/include/rmm/detail/error.hpp
+++ b/include/rmm/detail/error.hpp
@@ -55,11 +55,11 @@
   GET_RMM_EXPECTS_MACRO(__VA_ARGS__, RMM_EXPECTS_3, RMM_EXPECTS_2) \
   (__VA_ARGS__)
 #define GET_RMM_EXPECTS_MACRO(_1, _2, _3, NAME, ...) NAME
-#define RMM_EXPECTS_3(_condition, _reason, _exception_type)                             \
-  (!!(_condition)) ? static_cast<void>(0)                                               \
-                   : throw _exception_type /*NOLINT(bugprone-macro-parentheses)*/       \
-  {                                                                                     \
-    std::string("RMM failure at: " __FILE__ ":" RMM_STRINGIFY(__LINE__) ": ") + _reason \
+#define RMM_EXPECTS_3(_condition, _reason, _exception_type)                       \
+  (!!(_condition)) ? static_cast<void>(0)                                         \
+                   : throw _exception_type /*NOLINT(bugprone-macro-parentheses)*/ \
+  {                                                                               \
+    "RMM failure at: " __FILE__ ":" RMM_STRINGIFY(__LINE__) ": " _reason          \
   }
 #define RMM_EXPECTS_2(_condition, _reason) RMM_EXPECTS_3(_condition, _reason, rmm::logic_error)
 
@@ -79,10 +79,9 @@
   GET_RMM_FAIL_MACRO(__VA_ARGS__, RMM_FAIL_2, RMM_FAIL_1) \
   (__VA_ARGS__)
 #define GET_RMM_FAIL_MACRO(_1, _2, NAME, ...) NAME
-#define RMM_FAIL_2(_what, _exception_type)                                                          \
-  /*NOLINTNEXTLINE(bugprone-macro-parentheses)*/                                                    \
-  throw _exception_type{std::string{"RMM failure at:" __FILE__ ":" RMM_STRINGIFY(__LINE__) ": "} + \
-                        _what};
+#define RMM_FAIL_2(_what, _exception_type)                                                  \
+  /*NOLINTNEXTLINE(bugprone-macro-parentheses)*/                                            \
+  throw _exception_type{"RMM failure at:" __FILE__ ":" RMM_STRINGIFY(__LINE__) ": " _what};
 #define RMM_FAIL_1(_what) RMM_FAIL_2(_what, rmm::logic_error)
 
 /**
@@ -133,18 +132,16 @@
  * Defaults to throwing rmm::bad_alloc, but when `cudaErrorMemoryAllocation` is returned,
  * rmm::out_of_memory is thrown instead.
  */
-#define RMM_CUDA_TRY_ALLOC(_call, num_bytes)                                            \
-  do {                                                                                  \
-    cudaError_t const error = (_call);                                                  \
-    if (cudaSuccess != error) {                                                         \
-      cudaGetLastError();                                                               \
-      auto const msg = std::string{"CUDA error (failed to allocate "} +                 \
-                       std::to_string(num_bytes) + " bytes) at: " + __FILE__ + ":" +    \
-                       RMM_STRINGIFY(__LINE__) + ": " + cudaGetErrorName(error) + " " + \
-                       cudaGetErrorString(error);                                       \
-      if (cudaErrorMemoryAllocation == error) { throw rmm::out_of_memory{msg}; }        \
-      throw rmm::bad_alloc{msg};                                                        \
-    }                                                                                   \
+#define RMM_CUDA_TRY_ALLOC(_call)                                                                  \
+  do {                                                                                             \
+    cudaError_t const error = (_call);                                                             \
+    if (cudaSuccess != error) {                                                                    \
+      cudaGetLastError();                                                                          \
+      auto const msg = std::string{"CUDA error at: "} + __FILE__ + ":" + RMM_STRINGIFY(__LINE__) + \
+                       ": " + cudaGetErrorName(error) + " " + cudaGetErrorString(error);           \
+      if (cudaErrorMemoryAllocation == error) { throw rmm::out_of_memory{msg}; }                   \
+      throw rmm::bad_alloc{msg};                                                                   \
+    }                                                                                              \
   } while (0)
 
 /**
diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp
index 6aaa659ec..f8e4e16cb 100644
--- a/include/rmm/mr/device/arena_memory_resource.hpp
+++ b/include/rmm/mr/device/arena_memory_resource.hpp
@@ -166,9 +166,7 @@ class arena_memory_resource final : public device_memory_resource {
     void* pointer = arena.allocate(bytes);
     if (pointer == nullptr) {
       if (dump_log_on_failure_) { dump_memory_log(bytes); }
-      auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
-                       rmm::detail::format_bytes(bytes) + "): No room in arena.";
-      RMM_FAIL(msg.c_str(), rmm::out_of_memory);
+      RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);
     }
     return pointer;
   }
diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
index 92aea2072..a4ae74394 100644
--- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
+++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
@@ -98,8 +98,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
   {
     void* ptr{nullptr};
     if (bytes > 0) {
-      RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()),
-                         bytes);
+      RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()));
     }
     return ptr;
   }
diff --git a/include/rmm/mr/device/cuda_memory_resource.hpp b/include/rmm/mr/device/cuda_memory_resource.hpp
index e4afbf711..522145d93 100644
--- a/include/rmm/mr/device/cuda_memory_resource.hpp
+++ b/include/rmm/mr/device/cuda_memory_resource.hpp
@@ -59,7 +59,7 @@ class cuda_memory_resource final : public device_memory_resource {
   void* do_allocate(std::size_t bytes, [[maybe_unused]] cuda_stream_view stream) override
   {
     void* ptr{nullptr};
-    RMM_CUDA_TRY_ALLOC(cudaMalloc(&ptr, bytes), bytes);
+    RMM_CUDA_TRY_ALLOC(cudaMalloc(&ptr, bytes));
     return ptr;
   }
 
diff --git a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
index 5b1476d37..ad676bfd2 100644
--- a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
+++ b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
@@ -214,8 +214,7 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
 
     size = rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT);
     RMM_EXPECTS(size <= this->underlying().get_maximum_allocation_size(),
-                std::string("Maximum allocation size exceeded (failed to allocate ") +
-                  rmm::detail::format_bytes(size) + ")",
+                "Maximum allocation size exceeded",
                 rmm::out_of_memory);
     auto const block = this->underlying().get_block(size, stream_event);
 
diff --git a/include/rmm/mr/device/limiting_resource_adaptor.hpp b/include/rmm/mr/device/limiting_resource_adaptor.hpp
index d551fbdb3..96d730191 100644
--- a/include/rmm/mr/device/limiting_resource_adaptor.hpp
+++ b/include/rmm/mr/device/limiting_resource_adaptor.hpp
@@ -18,7 +18,6 @@
 #include
 #include
 #include
-#include <rmm/detail/format.hpp>
 #include
 #include
 #include
@@ -151,9 +150,7 @@ class limiting_resource_adaptor final : public device_memory_resource {
     }
 
     allocated_bytes_ -= proposed_size;
-    auto const msg = std::string("Exceeded memory limit (failed to allocate ") +
-                     rmm::detail::format_bytes(bytes) + ")";
-    RMM_FAIL(msg.c_str(), rmm::out_of_memory);
+    RMM_FAIL("Exceeded memory limit", rmm::out_of_memory);
   }
 
   /**
diff --git a/include/rmm/mr/device/managed_memory_resource.hpp b/include/rmm/mr/device/managed_memory_resource.hpp
index 3ade7fb53..2bb807a31 100644
--- a/include/rmm/mr/device/managed_memory_resource.hpp
+++ b/include/rmm/mr/device/managed_memory_resource.hpp
@@ -63,7 +63,7 @@ class managed_memory_resource final : public device_memory_resource {
     if (bytes == 0) { return nullptr; }
 
     void* ptr{nullptr};
-    RMM_CUDA_TRY_ALLOC(cudaMallocManaged(&ptr, bytes), bytes);
+    RMM_CUDA_TRY_ALLOC(cudaMallocManaged(&ptr, bytes));
     return ptr;
   }
 
diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp
index 46920da15..dee0471b9 100644
--- a/include/rmm/mr/device/pool_memory_resource.hpp
+++ b/include/rmm/mr/device/pool_memory_resource.hpp
@@ -252,24 +252,21 @@ class pool_memory_resource final
    */
   block_type try_to_expand(std::size_t try_size, std::size_t min_size, cuda_stream_view stream)
   {
-    while (true) {
-      try {
-        auto block = block_from_upstream(try_size, stream);
-        current_pool_size_ += block.size();
-        return block;
-      } catch (std::exception const& e) {
-        if (try_size <= min_size) {
-          RMM_LOG_ERROR("[A][Stream %s][Upstream %zuB][FAILURE maximum pool size exceeded: %s]",
-                        rmm::detail::format_stream(stream),
-                        try_size,
-                        e.what());
-          auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
-                           rmm::detail::format_bytes(try_size) + std::string("): ") + e.what();
-          RMM_FAIL(msg.c_str(), rmm::out_of_memory);
-        }
+    while (try_size >= min_size) {
+      auto block = block_from_upstream(try_size, stream);
+      if (block.has_value()) {
+        current_pool_size_ += block.value().size();
+        return block.value();
+      }
+      if (try_size == min_size) {
+        break;  // only try `size` once
       }
       try_size = std::max(min_size, try_size / 2);
     }
+    RMM_LOG_ERROR("[A][Stream %s][Upstream %zuB][FAILURE maximum pool size exceeded]",
+                  rmm::detail::format_stream(stream),
+                  min_size);
+    RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);
   }
 
   /**
@@ -310,18 +307,6 @@ class pool_memory_resource final
     // limit each time. If it is not set, grow exponentially, e.g. by doubling the pool size each
     // time. Upon failure, attempt to back off exponentially, e.g. by half the attempted size,
     // until either success or the attempt is less than the requested size.
-
-    if (maximum_pool_size_.has_value()) {
-      auto const max_size = maximum_pool_size_.value();
-      if (size > max_size) {
-        auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
-                         rmm::detail::format_bytes(size) +
-                         std::string("): Request larger than capacity (") +
-                         rmm::detail::format_bytes(max_size) + std::string(")");
-        RMM_FAIL(msg.c_str(), rmm::out_of_memory);
-      }
-    }
-
     return try_to_expand(size_to_grow(size), size, stream);
   }
 
@@ -354,17 +339,21 @@ class pool_memory_resource final
    *
    * @param size The size in bytes to allocate from the upstream resource
    * @param stream The stream on which the memory is to be used.
-   * @throws if call to allocate_async() throws
    * @return block_type The allocated block
    */
-  block_type block_from_upstream(std::size_t size, cuda_stream_view stream)
+  std::optional<block_type> block_from_upstream(std::size_t size, cuda_stream_view stream)
   {
     RMM_LOG_DEBUG("[A][Stream %s][Upstream %zuB]", rmm::detail::format_stream(stream), size);
 
     if (size == 0) { return {}; }
 
-    void* ptr = get_upstream_resource().allocate_async(size, stream);
-    return *upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first;
+    try {
+      void* ptr = get_upstream_resource().allocate_async(size, stream);
+      return std::optional<block_type>{
+        *upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first};
+    } catch (std::exception const& e) {
+      return std::nullopt;
+    }
   }
 
diff --git a/include/rmm/mr/device/system_memory_resource.hpp b/include/rmm/mr/device/system_memory_resource.hpp
index 34c16f719..666a7a9c4 100644
--- a/include/rmm/mr/device/system_memory_resource.hpp
+++ b/include/rmm/mr/device/system_memory_resource.hpp
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-#include <rmm/detail/format.hpp>
 #include
 #include
 
@@ -104,9 +103,7 @@ class system_memory_resource final : public device_memory_resource {
       return rmm::detail::aligned_host_allocate(
         bytes, CUDA_ALLOCATION_ALIGNMENT, [](std::size_t size) { return ::operator new(size); });
     } catch (std::bad_alloc const& e) {
-      auto const msg = std::string("Failed to allocate ") + rmm::detail::format_bytes(bytes) +
-                       std::string("of memory: ") + e.what();
-      RMM_FAIL(msg.c_str(), rmm::out_of_memory);
+      RMM_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory);
     }
   }
 
diff --git a/include/rmm/mr/host/pinned_memory_resource.hpp b/include/rmm/mr/host/pinned_memory_resource.hpp
index f75a8f7be..cf746f5ac 100644
--- a/include/rmm/mr/host/pinned_memory_resource.hpp
+++ b/include/rmm/mr/host/pinned_memory_resource.hpp
@@ -122,7 +122,8 @@ class pinned_memory_resource final : public host_memory_resource {
 
     return rmm::detail::aligned_host_allocate(bytes, alignment, [](std::size_t size) {
       void* ptr{nullptr};
-      RMM_CUDA_TRY_ALLOC(cudaMallocHost(&ptr, size), size);
+      auto status = cudaMallocHost(&ptr, size);
+      if (cudaSuccess != status) { throw std::bad_alloc{}; }
       return ptr;
     });
   }
diff --git a/include/rmm/mr/pinned_host_memory_resource.hpp b/include/rmm/mr/pinned_host_memory_resource.hpp
index 3f5878171..b5689b3ed 100644
--- a/include/rmm/mr/pinned_host_memory_resource.hpp
+++ b/include/rmm/mr/pinned_host_memory_resource.hpp
@@ -72,7 +72,7 @@ class pinned_host_memory_resource {
 
     return rmm::detail::aligned_host_allocate(bytes, alignment, [](std::size_t size) {
       void* ptr{nullptr};
-      RMM_CUDA_TRY_ALLOC(cudaHostAlloc(&ptr, size, cudaHostAllocDefault), size);
+      RMM_CUDA_TRY_ALLOC(cudaHostAlloc(&ptr, size, cudaHostAllocDefault));
       return ptr;
     });
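
A minimal sketch, not part of the patch: `MY_STRINGIFY` and `EXPECTS_LITERAL` below are hypothetical stand-ins for RMM_STRINGIFY and the post-revert RMM_EXPECTS_3 in include/rmm/detail/error.hpp. The revert returns RMM_EXPECTS/RMM_FAIL to building their messages by adjacent string-literal concatenation at compile time rather than std::string concatenation at run time, which is why the call sites above drop their formatted, format_bytes-based reasons in favor of plain literals.

// Hypothetical demonstration macros; any C++17 compiler should accept this.
#include <stdexcept>

#define MY_STRINGIFY_DETAIL(x) #x
#define MY_STRINGIFY(x) MY_STRINGIFY_DETAIL(x)

// Mirrors the post-revert literal-pasting form: `_reason` must be a string
// literal, because "a" "b" concatenation happens during compilation.
#define EXPECTS_LITERAL(_condition, _reason)                        \
  (!!(_condition)) ? static_cast<void>(0)                           \
                   : throw std::logic_error                         \
  {                                                                 \
    "failure at: " __FILE__ ":" MY_STRINGIFY(__LINE__) ": " _reason \
  }

int main()
{
  EXPECTS_LITERAL(2 + 2 == 4, "arithmetic broke");  // true condition: no throw
  // EXPECTS_LITERAL(false, std::string{"oops"});   // would not compile: a
  // std::string cannot participate in adjacent string-literal concatenation
  return 0;
}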
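
Likewise a sketch only, assuming a hypothetical try_allocate() in place of the real block_from_upstream(): the reverted try_to_expand() above replaces its try/catch retry loop with the std::optional-based back-off shown here, halving the attempted size after each failed upstream allocation and trying the minimum size exactly once before the caller reports "Maximum pool size exceeded".

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <optional>

// Hypothetical stand-in for block_from_upstream(): succeeds only up to a cap.
std::optional<std::size_t> try_allocate(std::size_t size)
{
  constexpr std::size_t cap = std::size_t{1} << 20;  // pretend 1 MiB remains upstream
  if (size > cap) { return std::nullopt; }
  return size;
}

// Back-off loop mirroring the post-revert control flow of try_to_expand().
std::optional<std::size_t> expand(std::size_t try_size, std::size_t min_size)
{
  while (try_size >= min_size) {
    if (auto block = try_allocate(try_size)) { return block; }
    if (try_size == min_size) { break; }  // only try the minimum size once
    try_size = std::max(min_size, try_size / 2);
  }
  return std::nullopt;  // the real code logs and throws rmm::out_of_memory here
}

int main()
{
  // An 8 MiB initial attempt backs off (4 MiB, 2 MiB, 1 MiB) until it fits.
  auto const block = expand(std::size_t{8} << 20, std::size_t{1} << 20);
  std::cout << (block.has_value() ? "expanded" : "out of memory") << '\n';
  return 0;
}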