Skip to content

Commit

Permalink
Add OOM fail reason, attempted allocation size to exception messages (#1827)

Browse files Browse the repository at this point in the history

This PR addresses the following issues:
1. Closes [Issue 1791](#1791): It forwards the details of why an OOM exception occurred in `try_to_expand()`, and does a more thorough job of forwarding failure details in other locations as well. A test has been added to cover this case explicitly.
2. Closes [Issue 1134](#1134): It adds the size of attempted allocations to the OOM and bad_alloc exceptions. 

Notes:
1. These code paths are already exercised by the numerous `EXPECT_THROW()` macros in the tests, and one more test was explicitly added.
2. The modified loop in `try_to_expand()` in `pool_memory_resource.hpp` attempts to allocate for the case where `try_size` is less than `min_size`, whereas before it just immediately errored. If this isn't the behavior we want I can change it. Previously it was just erroring that we didn't have enough memory, which isn't right either.

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Alessandro Bellina (https://github.com/abellina)
  - Bradley Dice (https://github.com/bdice)

URL: #1827
  • Loading branch information
pmattione-nvidia authored Feb 26, 2025
1 parent f26cc7e commit d1a5f1b
Show file tree
Hide file tree
Showing 13 changed files with 82 additions and 58 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
* Copyright (c) 2021-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -138,9 +138,7 @@ MRFactoryFunc get_mr_factory(std::string const& resource_name)
if (resource_name == "arena") { return &make_arena; }
if (resource_name == "binning") { return &make_binning; }

std::cout << "Error: invalid memory_resource name: " << resource_name << std::endl;

RMM_FAIL();
RMM_FAIL("Invalid memory_resource name: " + resource_name);
}

void declare_benchmark(std::string const& name)
Expand Down Expand Up @@ -175,7 +173,7 @@ void declare_benchmark(std::string const& name)
return;
}

std::cout << "Error: invalid memory_resource name: " << name << std::endl;
RMM_FAIL("Invalid memory_resource name: " + name);
}

// NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
Expand Down
7 changes: 5 additions & 2 deletions benchmarks/utilities/simulated_memory_resource.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -65,7 +65,10 @@ class simulated_memory_resource final : public device_memory_resource {
void* do_allocate(std::size_t bytes, cuda_stream_view) override
{
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
RMM_EXPECTS(begin_ + bytes <= end_, "Simulated memory size exceeded", rmm::bad_alloc);
RMM_EXPECTS(begin_ + bytes <= end_,
"Simulated memory size exceeded (failed to allocate " +
rmm::detail::format_bytes(bytes) + ")",
rmm::bad_alloc);
auto* ptr = static_cast<void*>(begin_);
begin_ += bytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
return ptr;
Expand Down
41 changes: 22 additions & 19 deletions include/rmm/detail/error.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -55,11 +55,11 @@
GET_RMM_EXPECTS_MACRO(__VA_ARGS__, RMM_EXPECTS_3, RMM_EXPECTS_2) \
(__VA_ARGS__)
#define GET_RMM_EXPECTS_MACRO(_1, _2, _3, NAME, ...) NAME
// Three-argument form of RMM_EXPECTS: if `_condition` is falsy, throws `_exception_type`
// with a message that prefixes `_reason` with the file and line of the failure.
// The prefix is built as a std::string and joined with operator+, so `_reason` may be
// either a string literal or a std::string (e.g. one that embeds the attempted
// allocation size).
#define RMM_EXPECTS_3(_condition, _reason, _exception_type)                                \
  (!!(_condition)) ? static_cast<void>(0)                                                  \
                   : throw _exception_type /*NOLINT(bugprone-macro-parentheses)*/          \
  {                                                                                        \
    std::string("RMM failure at: " __FILE__ ":" RMM_STRINGIFY(__LINE__) ": ") + _reason    \
  }
// Two-argument form: defaults the exception type to rmm::logic_error.
#define RMM_EXPECTS_2(_condition, _reason) RMM_EXPECTS_3(_condition, _reason, rmm::logic_error)

Expand All @@ -79,9 +79,10 @@
GET_RMM_FAIL_MACRO(__VA_ARGS__, RMM_FAIL_2, RMM_FAIL_1) \
(__VA_ARGS__)
#define GET_RMM_FAIL_MACRO(_1, _2, NAME, ...) NAME
// Two-argument form of RMM_FAIL: unconditionally throws `_exception_type`, prefixing
// `_what` with the file and line of the failure. Built as a std::string so `_what`
// may be a literal or a std::string.
// NOTE: a space follows "at:" so the message format matches RMM_EXPECTS_3
// ("RMM failure at: <file>:<line>: <what>"); the previous text ran the file name
// into the colon.
#define RMM_FAIL_2(_what, _exception_type)                                                  \
  /*NOLINTNEXTLINE(bugprone-macro-parentheses)*/                                            \
  throw _exception_type{std::string{"RMM failure at: " __FILE__ ":" RMM_STRINGIFY(          \
                          __LINE__) ": "} +                                                 \
                        _what};
// One-argument form: defaults the exception type to rmm::logic_error.
#define RMM_FAIL_1(_what) RMM_FAIL_2(_what, rmm::logic_error)

/**
Expand Down Expand Up @@ -132,16 +133,18 @@
* Defaults to throwing rmm::bad_alloc, but when `cudaErrorMemoryAllocation` is returned,
* rmm::out_of_memory is thrown instead.
*/
// Error-checking wrapper for CUDA allocation calls.
// Evaluates `_call`; on failure, clears CUDA's sticky error state and throws an
// exception whose message includes the attempted allocation size (`num_bytes`), the
// call site, and the CUDA error name/description. cudaErrorMemoryAllocation maps to
// rmm::out_of_memory; every other failure maps to rmm::bad_alloc.
#define RMM_CUDA_TRY_ALLOC(_call, num_bytes)                                                  \
  do {                                                                                        \
    cudaError_t const error = (_call);                                                        \
    if (cudaSuccess != error) {                                                               \
      cudaGetLastError(); /* reset the per-thread CUDA error before throwing */               \
      auto const msg = std::string{"CUDA error (failed to allocate "} +                       \
                       std::to_string(num_bytes) + " bytes) at: " + __FILE__ + ":" +          \
                       RMM_STRINGIFY(__LINE__) + ": " + cudaGetErrorName(error) + " " +       \
                       cudaGetErrorString(error);                                             \
      if (cudaErrorMemoryAllocation == error) { throw rmm::out_of_memory{msg}; }              \
      throw rmm::bad_alloc{msg};                                                              \
    }                                                                                         \
  } while (0)

/**
Expand Down
4 changes: 3 additions & 1 deletion include/rmm/mr/device/arena_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,9 @@ class arena_memory_resource final : public device_memory_resource {
void* pointer = arena.allocate(bytes);
if (pointer == nullptr) {
if (dump_log_on_failure_) { dump_memory_log(bytes); }
RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);
auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
rmm::detail::format_bytes(bytes) + "): No room in arena.";
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
}
return pointer;
}
Expand Down
3 changes: 2 additions & 1 deletion include/rmm/mr/device/cuda_async_view_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
{
void* ptr{nullptr};
if (bytes > 0) {
RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()));
RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()),
bytes);
}
return ptr;
}
Expand Down
4 changes: 2 additions & 2 deletions include/rmm/mr/device/cuda_memory_resource.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -59,7 +59,7 @@ class cuda_memory_resource final : public device_memory_resource {
/**
 * @brief Allocates memory of size at least `bytes` using cudaMalloc.
 *
 * The attempted allocation size is forwarded to RMM_CUDA_TRY_ALLOC so it appears in
 * the exception message on failure.
 *
 * @throws rmm::out_of_memory if the CUDA error is cudaErrorMemoryAllocation
 * @throws rmm::bad_alloc for any other CUDA allocation failure
 *
 * @param bytes The size in bytes of the allocation
 * @param stream Unused by this resource
 * @return void* Pointer to the newly allocated memory
 */
void* do_allocate(std::size_t bytes, [[maybe_unused]] cuda_stream_view stream) override
{
  void* ptr{nullptr};
  RMM_CUDA_TRY_ALLOC(cudaMalloc(&ptr, bytes), bytes);
  return ptr;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,8 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_

size = rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT);
RMM_EXPECTS(size <= this->underlying().get_maximum_allocation_size(),
"Maximum allocation size exceeded",
std::string("Maximum allocation size exceeded (failed to allocate ") +
rmm::detail::format_bytes(size) + ")",
rmm::out_of_memory);
auto const block = this->underlying().get_block(size, stream_event);

Expand Down
5 changes: 4 additions & 1 deletion include/rmm/mr/device/limiting_resource_adaptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <rmm/aligned.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/detail/format.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>
Expand Down Expand Up @@ -150,7 +151,9 @@ class limiting_resource_adaptor final : public device_memory_resource {
}

allocated_bytes_ -= proposed_size;
RMM_FAIL("Exceeded memory limit", rmm::out_of_memory);
auto const msg = std::string("Exceeded memory limit (failed to allocate ") +
rmm::detail::format_bytes(bytes) + ")";
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
}

/**
Expand Down
4 changes: 2 additions & 2 deletions include/rmm/mr/device/managed_memory_resource.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -63,7 +63,7 @@ class managed_memory_resource final : public device_memory_resource {
if (bytes == 0) { return nullptr; }

void* ptr{nullptr};
RMM_CUDA_TRY_ALLOC(cudaMallocManaged(&ptr, bytes));
RMM_CUDA_TRY_ALLOC(cudaMallocManaged(&ptr, bytes), bytes);
return ptr;
}

Expand Down
51 changes: 31 additions & 20 deletions include/rmm/mr/device/pool_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,21 +252,24 @@ class pool_memory_resource final
*/
/**
 * @brief Try to expand the pool by allocating a block from upstream.
 *
 * Attempts to allocate `try_size` bytes; on upstream failure, backs off
 * exponentially (halving the request, clamped to `min_size`) and retries. If the
 * attempt at `min_size` also fails, throws rmm::out_of_memory with the attempted
 * size and the upstream failure reason included in the message.
 *
 * @param try_size The initial size to attempt to allocate
 * @param min_size The minimum acceptable allocation size
 * @param stream The stream on which the memory is to be used
 * @return block_type The allocated block
 */
block_type try_to_expand(std::size_t try_size, std::size_t min_size, cuda_stream_view stream)
{
  while (true) {
    try {
      auto block = block_from_upstream(try_size, stream);
      current_pool_size_ += block.size();
      return block;
    } catch (std::exception const& e) {
      if (try_size <= min_size) {
        RMM_LOG_ERROR("[A][Stream %s][Upstream %zuB][FAILURE maximum pool size exceeded: %s]",
                      rmm::detail::format_stream(stream),
                      try_size,
                      e.what());
        auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
                         rmm::detail::format_bytes(try_size) + "): " + e.what();
        RMM_FAIL(msg.c_str(), rmm::out_of_memory);
      }
    }
    // Back off exponentially, but never request less than the minimum required size.
    try_size = std::max(min_size, try_size / 2);
  }
}

/**
Expand Down Expand Up @@ -307,6 +310,18 @@ class pool_memory_resource final
// limit each time. If it is not set, grow exponentially, e.g. by doubling the pool size each
// time. Upon failure, attempt to back off exponentially, e.g. by half the attempted size,
// until either success or the attempt is less than the requested size.

if (maximum_pool_size_.has_value()) {
auto const max_size = maximum_pool_size_.value();
if (size > max_size) {
auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
rmm::detail::format_bytes(size) +
std::string("): Request larger than capacity (") +
rmm::detail::format_bytes(max_size) + std::string(")");
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
}
}

return try_to_expand(size_to_grow(size), size, stream);
}

Expand Down Expand Up @@ -339,21 +354,17 @@ class pool_memory_resource final
*
* @param size The size in bytes to allocate from the upstream resource
* @param stream The stream on which the memory is to be used.
* @throws if call to allocate_async() throws
* @return block_type The allocated block
*/
std::optional<block_type> block_from_upstream(std::size_t size, cuda_stream_view stream)
block_type block_from_upstream(std::size_t size, cuda_stream_view stream)
{
RMM_LOG_DEBUG("[A][Stream %s][Upstream %zuB]", rmm::detail::format_stream(stream), size);

if (size == 0) { return {}; }

try {
void* ptr = get_upstream_resource().allocate_async(size, stream);
return std::optional<block_type>{
*upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first};
} catch (std::exception const& e) {
return std::nullopt;
}
void* ptr = get_upstream_resource().allocate_async(size, stream);
return *upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first;
}

/**
Expand Down
5 changes: 4 additions & 1 deletion include/rmm/mr/device/system_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/detail/format.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <cstddef>
Expand Down Expand Up @@ -103,7 +104,9 @@ class system_memory_resource final : public device_memory_resource {
return rmm::detail::aligned_host_allocate(
bytes, CUDA_ALLOCATION_ALIGNMENT, [](std::size_t size) { return ::operator new(size); });
} catch (std::bad_alloc const& e) {
RMM_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory);
auto const msg = std::string("Failed to allocate ") + rmm::detail::format_bytes(bytes) +
std::string("of memory: ") + e.what();
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
}
}

Expand Down
3 changes: 1 addition & 2 deletions include/rmm/mr/host/pinned_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,7 @@ class pinned_memory_resource final : public host_memory_resource {

return rmm::detail::aligned_host_allocate(bytes, alignment, [](std::size_t size) {
void* ptr{nullptr};
auto status = cudaMallocHost(&ptr, size);
if (cudaSuccess != status) { throw std::bad_alloc{}; }
RMM_CUDA_TRY_ALLOC(cudaMallocHost(&ptr, size), size);
return ptr;
});
}
Expand Down
2 changes: 1 addition & 1 deletion include/rmm/mr/pinned_host_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class pinned_host_memory_resource {

return rmm::detail::aligned_host_allocate(bytes, alignment, [](std::size_t size) {
void* ptr{nullptr};
RMM_CUDA_TRY_ALLOC(cudaHostAlloc(&ptr, size, cudaHostAllocDefault));
RMM_CUDA_TRY_ALLOC(cudaHostAlloc(&ptr, size, cudaHostAllocDefault), size);
return ptr;
});
}
Expand Down

0 comments on commit d1a5f1b

Please sign in to comment.