Commit

Revert "Add OOM fail reason, attempted allocation size to exception messages" (#1843)

Reverts #1827

This appears to be causing downstream issues in cuDF; reverting for now until we determine the root cause and fix it.

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Rong Ou (https://github.com/rongou)

URL: #1843
pmattione-nvidia authored Feb 26, 2025
1 parent 81ea864 commit 01edf5a
Showing 13 changed files with 53 additions and 77 deletions.
@@ -138,7 +138,9 @@ MRFactoryFunc get_mr_factory(std::string const& resource_name)
if (resource_name == "arena") { return &make_arena; }
if (resource_name == "binning") { return &make_binning; }

RMM_FAIL("Invalid memory_resource name: " + resource_name);
std::cout << "Error: invalid memory_resource name: " << resource_name << std::endl;

RMM_FAIL();
}

void declare_benchmark(std::string const& name)
@@ -173,7 +175,7 @@ void declare_benchmark(std::string const& name)
return;
}

RMM_FAIL("Invalid memory_resource name: " + name);
std::cout << "Error: invalid memory_resource name: " << name << std::endl;
}

// NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
5 changes: 1 addition & 4 deletions benchmarks/utilities/simulated_memory_resource.hpp
@@ -65,10 +65,7 @@ class simulated_memory_resource final : public device_memory_resource {
void* do_allocate(std::size_t bytes, cuda_stream_view) override
{
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
RMM_EXPECTS(begin_ + bytes <= end_,
"Simulated memory size exceeded (failed to allocate " +
rmm::detail::format_bytes(bytes) + ")",
rmm::bad_alloc);
RMM_EXPECTS(begin_ + bytes <= end_, "Simulated memory size exceeded", rmm::bad_alloc);
auto* ptr = static_cast<void*>(begin_);
begin_ += bytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
return ptr;
39 changes: 18 additions & 21 deletions include/rmm/detail/error.hpp
@@ -55,11 +55,11 @@
GET_RMM_EXPECTS_MACRO(__VA_ARGS__, RMM_EXPECTS_3, RMM_EXPECTS_2) \
(__VA_ARGS__)
#define GET_RMM_EXPECTS_MACRO(_1, _2, _3, NAME, ...) NAME
#define RMM_EXPECTS_3(_condition, _reason, _exception_type) \
(!!(_condition)) ? static_cast<void>(0) \
: throw _exception_type /*NOLINT(bugprone-macro-parentheses)*/ \
{ \
std::string("RMM failure at: " __FILE__ ":" RMM_STRINGIFY(__LINE__) ": ") + _reason \
#define RMM_EXPECTS_3(_condition, _reason, _exception_type) \
(!!(_condition)) ? static_cast<void>(0) \
: throw _exception_type /*NOLINT(bugprone-macro-parentheses)*/ \
{ \
"RMM failure at: " __FILE__ ":" RMM_STRINGIFY(__LINE__) ": " _reason \
}
#define RMM_EXPECTS_2(_condition, _reason) RMM_EXPECTS_3(_condition, _reason, rmm::logic_error)

@@ -79,10 +79,9 @@
GET_RMM_FAIL_MACRO(__VA_ARGS__, RMM_FAIL_2, RMM_FAIL_1) \
(__VA_ARGS__)
#define GET_RMM_FAIL_MACRO(_1, _2, NAME, ...) NAME
#define RMM_FAIL_2(_what, _exception_type) \
/*NOLINTNEXTLINE(bugprone-macro-parentheses)*/ \
throw _exception_type{std::string{"RMM failure at:" __FILE__ ":" RMM_STRINGIFY(__LINE__) ": "} + \
_what};
#define RMM_FAIL_2(_what, _exception_type) \
/*NOLINTNEXTLINE(bugprone-macro-parentheses)*/ \
throw _exception_type{"RMM failure at:" __FILE__ ":" RMM_STRINGIFY(__LINE__) ": " _what};
#define RMM_FAIL_1(_what) RMM_FAIL_2(_what, rmm::logic_error)

/**
@@ -133,18 +132,16 @@
* Defaults to throwing rmm::bad_alloc, but when `cudaErrorMemoryAllocation` is returned,
* rmm::out_of_memory is thrown instead.
*/
#define RMM_CUDA_TRY_ALLOC(_call, num_bytes) \
do { \
cudaError_t const error = (_call); \
if (cudaSuccess != error) { \
cudaGetLastError(); \
auto const msg = std::string{"CUDA error (failed to allocate "} + \
std::to_string(num_bytes) + " bytes) at: " + __FILE__ + ":" + \
RMM_STRINGIFY(__LINE__) + ": " + cudaGetErrorName(error) + " " + \
cudaGetErrorString(error); \
if (cudaErrorMemoryAllocation == error) { throw rmm::out_of_memory{msg}; } \
throw rmm::bad_alloc{msg}; \
} \
#define RMM_CUDA_TRY_ALLOC(_call) \
do { \
cudaError_t const error = (_call); \
if (cudaSuccess != error) { \
cudaGetLastError(); \
auto const msg = std::string{"CUDA error at: "} + __FILE__ + ":" + RMM_STRINGIFY(__LINE__) + \
": " + cudaGetErrorName(error) + " " + cudaGetErrorString(error); \
if (cudaErrorMemoryAllocation == error) { throw rmm::out_of_memory{msg}; } \
throw rmm::bad_alloc{msg}; \
} \
} while (0)

/**
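As a side note on how the reverted macros read at a call site, here is a minimal sketch; it is illustrative only, and the includes, helper function, and sizes are assumptions rather than part of this diff. After the revert, the reason passed to RMM_EXPECTS/RMM_FAIL is joined to __FILE__ and __LINE__ by adjacent string-literal concatenation instead of std::string appends, and RMM_CUDA_TRY_ALLOC takes only the CUDA call, so thrown messages no longer carry the attempted allocation size; cudaErrorMemoryAllocation still maps to rmm::out_of_memory and other CUDA errors to rmm::bad_alloc.

#include <rmm/detail/error.hpp>
#include <rmm/error.hpp>

#include <cuda_runtime_api.h>

#include <cstddef>
#include <iostream>

void* checked_device_alloc(std::size_t bytes, std::size_t max_bytes)
{
  // Post-revert: the reason begins with a string literal and is concatenated into
  // "RMM failure at: <file>:<line>: ..." at compile time.
  RMM_EXPECTS(bytes <= max_bytes, "Maximum allocation size exceeded", rmm::out_of_memory);

  void* ptr{nullptr};
  // Post-revert: single-argument form; the byte count is no longer part of the message.
  RMM_CUDA_TRY_ALLOC(cudaMalloc(&ptr, bytes));
  return ptr;
}

int main()
{
  try {
    void* ptr = checked_device_alloc(std::size_t{1} << 20, std::size_t{1} << 30);
    cudaFree(ptr);
  } catch (rmm::out_of_memory const& e) {  // thrown on cudaErrorMemoryAllocation
    std::cerr << "out of memory: " << e.what() << '\n';
  } catch (rmm::bad_alloc const& e) {      // thrown on any other CUDA error
    std::cerr << "allocation failed: " << e.what() << '\n';
  }
  return 0;
}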
4 changes: 1 addition & 3 deletions include/rmm/mr/device/arena_memory_resource.hpp
@@ -166,9 +166,7 @@ class arena_memory_resource final : public device_memory_resource {
void* pointer = arena.allocate(bytes);
if (pointer == nullptr) {
if (dump_log_on_failure_) { dump_memory_log(bytes); }
auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
rmm::detail::format_bytes(bytes) + "): No room in arena.";
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);
}
return pointer;
}
3 changes: 1 addition & 2 deletions include/rmm/mr/device/cuda_async_view_memory_resource.hpp
@@ -98,8 +98,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
{
void* ptr{nullptr};
if (bytes > 0) {
RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()),
bytes);
RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()));
}
return ptr;
}
2 changes: 1 addition & 1 deletion include/rmm/mr/device/cuda_memory_resource.hpp
@@ -59,7 +59,7 @@ class cuda_memory_resource final : public device_memory_resource {
void* do_allocate(std::size_t bytes, [[maybe_unused]] cuda_stream_view stream) override
{
void* ptr{nullptr};
RMM_CUDA_TRY_ALLOC(cudaMalloc(&ptr, bytes), bytes);
RMM_CUDA_TRY_ALLOC(cudaMalloc(&ptr, bytes));
return ptr;
}

@@ -214,8 +214,7 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_

size = rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT);
RMM_EXPECTS(size <= this->underlying().get_maximum_allocation_size(),
std::string("Maximum allocation size exceeded (failed to allocate ") +
rmm::detail::format_bytes(size) + ")",
"Maximum allocation size exceeded",
rmm::out_of_memory);
auto const block = this->underlying().get_block(size, stream_event);

5 changes: 1 addition & 4 deletions include/rmm/mr/device/limiting_resource_adaptor.hpp
@@ -18,7 +18,6 @@
#include <rmm/aligned.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/detail/format.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>
@@ -151,9 +150,7 @@
}

allocated_bytes_ -= proposed_size;
auto const msg = std::string("Exceeded memory limit (failed to allocate ") +
rmm::detail::format_bytes(bytes) + ")";
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
RMM_FAIL("Exceeded memory limit", rmm::out_of_memory);
}

/**
2 changes: 1 addition & 1 deletion include/rmm/mr/device/managed_memory_resource.hpp
@@ -63,7 +63,7 @@ class managed_memory_resource final : public device_memory_resource {
if (bytes == 0) { return nullptr; }

void* ptr{nullptr};
RMM_CUDA_TRY_ALLOC(cudaMallocManaged(&ptr, bytes), bytes);
RMM_CUDA_TRY_ALLOC(cudaMallocManaged(&ptr, bytes));
return ptr;
}

51 changes: 20 additions & 31 deletions include/rmm/mr/device/pool_memory_resource.hpp
@@ -252,24 +252,21 @@ class pool_memory_resource final
*/
block_type try_to_expand(std::size_t try_size, std::size_t min_size, cuda_stream_view stream)
{
while (true) {
try {
auto block = block_from_upstream(try_size, stream);
current_pool_size_ += block.size();
return block;
} catch (std::exception const& e) {
if (try_size <= min_size) {
RMM_LOG_ERROR("[A][Stream %s][Upstream %zuB][FAILURE maximum pool size exceeded: %s]",
rmm::detail::format_stream(stream),
try_size,
e.what());
auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
rmm::detail::format_bytes(try_size) + std::string("): ") + e.what();
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
}
while (try_size >= min_size) {
auto block = block_from_upstream(try_size, stream);
if (block.has_value()) {
current_pool_size_ += block.value().size();
return block.value();
}
if (try_size == min_size) {
break; // only try `size` once
}
try_size = std::max(min_size, try_size / 2);
}
RMM_LOG_ERROR("[A][Stream %s][Upstream %zuB][FAILURE maximum pool size exceeded]",
rmm::detail::format_stream(stream),
min_size);
RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);
}

/**
@@ -310,18 +307,6 @@
// limit each time. If it is not set, grow exponentially, e.g. by doubling the pool size each
// time. Upon failure, attempt to back off exponentially, e.g. by half the attempted size,
// until either success or the attempt is less than the requested size.

if (maximum_pool_size_.has_value()) {
auto const max_size = maximum_pool_size_.value();
if (size > max_size) {
auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
rmm::detail::format_bytes(size) +
std::string("): Request larger than capacity (") +
rmm::detail::format_bytes(max_size) + std::string(")");
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
}
}

return try_to_expand(size_to_grow(size), size, stream);
}

@@ -354,17 +339,21 @@
*
* @param size The size in bytes to allocate from the upstream resource
* @param stream The stream on which the memory is to be used.
* @throws if call to allocate_async() throws
* @return block_type The allocated block
*/
block_type block_from_upstream(std::size_t size, cuda_stream_view stream)
std::optional<block_type> block_from_upstream(std::size_t size, cuda_stream_view stream)
{
RMM_LOG_DEBUG("[A][Stream %s][Upstream %zuB]", rmm::detail::format_stream(stream), size);

if (size == 0) { return {}; }

void* ptr = get_upstream_resource().allocate_async(size, stream);
return *upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first;
try {
void* ptr = get_upstream_resource().allocate_async(size, stream);
return std::optional<block_type>{
*upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first};
} catch (std::exception const& e) {
return std::nullopt;
}
}

/**
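The comment in maybe_expand_pool above describes the reverted growth strategy: try size_to_grow(size) first, then back off by halving until the requested size itself has been tried once. Below is a minimal standalone sketch of just that back-off sequence; the helper name and the example sizes are made up for illustration and this is not RMM code.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// The sequence of upstream allocation sizes the reverted try_to_expand loop attempts:
// halve on failure, clamp to the requested size, and try the requested size only once.
std::vector<std::size_t> backoff_sizes(std::size_t try_size, std::size_t min_size)
{
  std::vector<std::size_t> attempts;
  while (try_size >= min_size) {
    attempts.push_back(try_size);
    if (try_size == min_size) { break; }  // only try the requested size once
    try_size = std::max(min_size, try_size / 2);
  }
  return attempts;
}

int main()
{
  auto const mib = std::size_t{1} << 20;
  // e.g. the pool wants to grow by 1 GiB but the caller asked for 100 MiB:
  for (auto size : backoff_sizes(1024 * mib, 100 * mib)) {
    std::cout << (size / mib) << " MiB\n";  // prints 1024, 512, 256, 128, 100
  }
  return 0;
}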
5 changes: 1 addition & 4 deletions include/rmm/mr/device/system_memory_resource.hpp
@@ -19,7 +19,6 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/detail/format.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <cstddef>
@@ -104,9 +103,7 @@ class system_memory_resource final : public device_memory_resource {
return rmm::detail::aligned_host_allocate(
bytes, CUDA_ALLOCATION_ALIGNMENT, [](std::size_t size) { return ::operator new(size); });
} catch (std::bad_alloc const& e) {
auto const msg = std::string("Failed to allocate ") + rmm::detail::format_bytes(bytes) +
std::string("of memory: ") + e.what();
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
RMM_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory);
}
}

3 changes: 2 additions & 1 deletion include/rmm/mr/host/pinned_memory_resource.hpp
@@ -122,7 +122,8 @@ class pinned_memory_resource final : public host_memory_resource {

return rmm::detail::aligned_host_allocate(bytes, alignment, [](std::size_t size) {
void* ptr{nullptr};
RMM_CUDA_TRY_ALLOC(cudaMallocHost(&ptr, size), size);
auto status = cudaMallocHost(&ptr, size);
if (cudaSuccess != status) { throw std::bad_alloc{}; }
return ptr;
});
}
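One behavioral note on the hunk above, with a hedged sketch (the small driver below is illustrative and not part of this diff): after the revert, a failed cudaMallocHost in pinned_memory_resource surfaces as a plain std::bad_alloc with no message, instead of the rmm::bad_alloc / rmm::out_of_memory that RMM_CUDA_TRY_ALLOC would have thrown.

#include <rmm/mr/host/pinned_memory_resource.hpp>

#include <cstddef>
#include <iostream>
#include <new>

int main()
{
  rmm::mr::pinned_memory_resource pinned_mr;
  auto const bytes = std::size_t{1} << 20;
  try {
    void* ptr = pinned_mr.allocate(bytes);
    pinned_mr.deallocate(ptr, bytes);
  } catch (std::bad_alloc const& e) {  // post-revert failure type for pinned allocations
    std::cerr << "pinned allocation failed: " << e.what() << '\n';
  }
  return 0;
}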
2 changes: 1 addition & 1 deletion include/rmm/mr/pinned_host_memory_resource.hpp
@@ -72,7 +72,7 @@ class pinned_host_memory_resource {

return rmm::detail::aligned_host_allocate(bytes, alignment, [](std::size_t size) {
void* ptr{nullptr};
RMM_CUDA_TRY_ALLOC(cudaHostAlloc(&ptr, size, cudaHostAllocDefault), size);
RMM_CUDA_TRY_ALLOC(cudaHostAlloc(&ptr, size, cudaHostAllocDefault));
return ptr;
});
}