Skip to content

Commit

Permalink
fix(lp): replace tbbmalloc by standard malloc for sparse ratings (#77)
Browse files Browse the repository at this point in the history
* fix(lp): replace tbbmalloc by standard malloc for sparse ratings

On some machines with recent TBB versions, tbbmalloc can be really slow
when allocating the sparse array when aggregating ratings of high degree
vertices during single-phase LP. Since I do not know why, let's just
use standard malloc here instead.
  • Loading branch information
DanielSeemaier authored Feb 4, 2025
1 parent dd0a5af commit ab934db
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 10 deletions.
5 changes: 3 additions & 2 deletions kaminpar-common/datastructures/fast_reset_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ template <typename Value, typename Size = std::size_t> class FastResetArray {
using const_reference = const Value &;
using size_type = Size;

explicit FastResetArray(const std::size_t capacity = 0) : _data(capacity, static_array::seq) {
explicit FastResetArray(const std::size_t capacity = 0)
: _data(capacity, static_array::seq, static_array::std_alloc) {
RECORD_DATA_STRUCT(capacity * sizeof(value_type), _struct);
}

Expand Down Expand Up @@ -107,7 +108,7 @@ template <typename Value, typename Size = std::size_t> class FastResetArray {
}

void resize(const std::size_t capacity) {
_data.resize(capacity, static_array::seq);
_data.resize(capacity, static_array::seq, static_array::std_alloc);

IF_HEAP_PROFILING(
_struct->size = std::max(
Expand Down
29 changes: 24 additions & 5 deletions kaminpar-common/datastructures/static_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ constexpr struct overcommit_t {
constexpr struct seq_t {
} seq;

// Tag type: request that the backing memory be obtained from the standard
// allocator (malloc) instead of tbbmalloc; dispatched on by
// StaticArray::resize() / allocate_data().
constexpr struct std_alloc_t {
} std_alloc;

} // namespace static_array

template <typename T> class StaticArray {
Expand Down Expand Up @@ -323,7 +326,8 @@ template <typename T> class StaticArray {
template <typename... Tags>
void resize(const std::size_t size, const value_type init_value, Tags...) {
KASSERT(
_data == _owned_data.get() || _data == _overcommited_data.get(),
_data == _owned_data.get() || _data == _owned_data_std.get() ||
_data == _overcommited_data.get(),
"cannot resize span",
assert::always
);
Expand All @@ -332,7 +336,7 @@ template <typename T> class StaticArray {
const bool use_thp =
size >= KAMINPAR_THP_THRESHOLD && !contains_tag_v<static_array::small_t, Tags...>;

allocate_data(size, overcommit, use_thp);
allocate_data(size, overcommit, use_thp, contains_tag_v<static_array::std_alloc_t, Tags...>);

if constexpr (!contains_tag_v<static_array::noinit_t, Tags...>) {
if constexpr (contains_tag_v<static_array::seq_t, Tags...>) {
Expand Down Expand Up @@ -366,16 +370,25 @@ template <typename T> class StaticArray {
_data = nullptr;

_owned_data.reset();
_owned_data_std.reset();
_overcommited_data.reset();
}

private:
void allocate_data(const std::size_t size, const bool overcommit, const bool thp) {
void allocate_data(
const std::size_t size,
const bool overcommit,
const bool thp,
const bool use_std_alloc = false
) {
// Before allocating the new memory, free the old memory to prevent both from being held in
// memory at the same time
if (_owned_data != nullptr) {
_owned_data.reset();
}
if (_owned_data_std != nullptr) {
_owned_data_std.reset();
}
if (_overcommited_data != nullptr) {
_overcommited_data.reset();
}
Expand All @@ -384,8 +397,13 @@ template <typename T> class StaticArray {
_overcommited_data = heap_profiler::overcommit_memory<value_type>(size);
_data = _overcommited_data.get();
} else {
_owned_data = parallel::make_unique<value_type>(size, thp);
_data = _owned_data.get();
if (use_std_alloc) {
_owned_data_std = make_unique<value_type>(size, thp);
_data = _owned_data_std.get();
} else {
_owned_data = parallel::make_unique<value_type>(size, thp);
_data = _owned_data.get();
}
}

_size = size;
Expand All @@ -397,6 +415,7 @@ template <typename T> class StaticArray {
size_type _size = 0;
size_type _unrestricted_size = 0;
parallel::tbb_unique_ptr<value_type> _owned_data = nullptr;
std::unique_ptr<value_type, deleter<value_type>> _owned_data_std = nullptr;
heap_profiler::unique_ptr<value_type> _overcommited_data = nullptr;
value_type *_data = nullptr;

Expand Down
46 changes: 44 additions & 2 deletions kaminpar-common/parallel/tbb_malloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
#include "sys/mman.h"
#endif

namespace kaminpar::parallel {
namespace kaminpar {

namespace parallel {

template <typename T> struct tbb_deleter {
void operator()(T *p) {
Expand Down Expand Up @@ -60,4 +62,44 @@ tbb_unique_ptr<T> make_unique(const std::size_t size, [[maybe_unused]] const boo
return tbb_unique_ptr<T>(ptr, tbb_deleter<T>{});
}

} // namespace kaminpar::parallel
} // namespace parallel

// Deleter for raw memory obtained from the standard allocator (malloc /
// posix_memalign) by the free function make_unique() below; counterpart to
// parallel::tbb_deleter for tbbmalloc-backed allocations.
// NOTE(review): this only releases the raw block -- ~T() is never invoked and
// make_unique() never constructs the objects, so this presumably assumes
// trivially constructible/destructible T; confirm at call sites.
template <typename T> struct deleter {
  void operator()(T *p) {
    free(p);

    // Mirror the record_alloc() call in make_unique(): report the release to
    // the heap profiler (only when heap profiling without page profiling).
    if constexpr (kHeapProfiling && !kPageProfiling) {
      heap_profiler::HeapProfiler::global().record_free(p);
    }
  }
};

// Allocates uninitialized storage for `size` objects of type T via the
// standard allocator (malloc) instead of tbbmalloc (cf. parallel::make_unique).
// On Linux with KAMINPAR_ENABLE_THP, `thp == true` requests transparent huge
// pages by aligning the allocation to 2 MiB and advising MADV_HUGEPAGE.
// NOTE(review): the returned memory is *not* constructed -- only suitable for
// trivial T; confirm at call sites.
//
// @param size Number of T elements to allocate space for.
// @param thp  Whether to back the allocation with transparent huge pages
//             (ignored unless built with THP support on Linux).
// @return Owning pointer whose deleter releases the memory with free().
template <typename T>
std::unique_ptr<T, deleter<T>>
make_unique(const std::size_t size, [[maybe_unused]] const bool thp) {
  const std::size_t nbytes = sizeof(T) * size;
  T *ptr = nullptr;

#if defined(__linux__) && defined(KAMINPAR_ENABLE_THP)
  if (thp) {
    // posix_memalign() leaves ptr untouched on failure: check its return value
    // and only advise THP when the allocation actually succeeded (previously,
    // madvise() could be called with a null pointer here).
    if (posix_memalign(reinterpret_cast<void **>(&ptr), 1 << 21, nbytes) == 0) {
      madvise(ptr, nbytes, MADV_HUGEPAGE);
    } else {
      ptr = nullptr; // fall through to the out-of-memory assertion below
    }
  } else {
#endif
    ptr = static_cast<T *>(malloc(nbytes));
#if defined(__linux__) && defined(KAMINPAR_ENABLE_THP)
  }
#endif

  KASSERT(
      ptr != nullptr, "out of memory: could not allocate " << nbytes << " bytes", assert::light
  );

  if constexpr (kHeapProfiling && !kPageProfiling) {
    // Reuse nbytes instead of recomputing sizeof(T) * size.
    heap_profiler::HeapProfiler::global().record_alloc(ptr, nbytes);
  }

  return std::unique_ptr<T, deleter<T>>(ptr);
}

} // namespace kaminpar
6 changes: 5 additions & 1 deletion kaminpar-shm/label_propagation.h
Original file line number Diff line number Diff line change
Expand Up @@ -1861,7 +1861,9 @@ class ChunkRandomLabelPropagation : public LabelPropagation<Derived, Config, Gra

template <typename RatingMapETS> void perform(RatingMapETS &rating_map_ets) {
parallel::Atomic<std::size_t> next_chunk = 0;
tbb::parallel_for(static_cast<std::size_t>(0), _chunks.size(), [&](const std::size_t) {
DBG << "Number of chunks: " << _chunks.size();

tbb::parallel_for(static_cast<std::size_t>(0), _chunks.size(), [&](std::size_t) {
if (should_stop()) {
return;
}
Expand All @@ -1878,6 +1880,8 @@ class ChunkRandomLabelPropagation : public LabelPropagation<Derived, Config, Gra
const auto &chunk = _chunks[chunk_id];
const auto &permutation = _random_permutations.get(local_rand);

DBG << chunk_id << " of " << _chunks.size() << " for " << sched_getcpu();

const std::size_t num_sub_chunks =
std::ceil(1.0 * (chunk.end - chunk.start) / Config::kPermutationSize);

Expand Down

0 comments on commit ab934db

Please sign in to comment.