Skip to content

Commit

Permalink
fix(lp): replace tbbmalloc by standard malloc for sparse ratings (#77)
Browse files Browse the repository at this point in the history
* fix(lp): replace tbbmalloc by standard malloc for sparse ratings

On some machines with recent TBB versions, tbbmalloc can be really slow
when allocating the sparse array when aggregating ratings of high degree
vertices during single-phase LP. Since I do not know why, let's just
use standard malloc here instead.
  • Loading branch information
DanielSeemaier authored Feb 4, 2025
1 parent dd0a5af commit ab934db
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 10 deletions.
5 changes: 3 additions & 2 deletions kaminpar-common/datastructures/fast_reset_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ template <typename Value, typename Size = std::size_t> class FastResetArray {
using const_reference = const Value &;
using size_type = Size;

explicit FastResetArray(const std::size_t capacity = 0) : _data(capacity, static_array::seq) {
explicit FastResetArray(const std::size_t capacity = 0)
: _data(capacity, static_array::seq, static_array::std_alloc) {
RECORD_DATA_STRUCT(capacity * sizeof(value_type), _struct);
}

Expand Down Expand Up @@ -107,7 +108,7 @@ template <typename Value, typename Size = std::size_t> class FastResetArray {
}

void resize(const std::size_t capacity) {
_data.resize(capacity, static_array::seq);
_data.resize(capacity, static_array::seq, static_array::std_alloc);

IF_HEAP_PROFILING(
_struct->size = std::max(
Expand Down
29 changes: 24 additions & 5 deletions kaminpar-common/datastructures/static_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ constexpr struct overcommit_t {
constexpr struct seq_t {
} seq;

// Tag type: request that the backing memory be obtained from the standard
// allocator (malloc) instead of tbbmalloc; dispatched on by
// StaticArray::resize() / allocate_data().
constexpr struct std_alloc_t {
} std_alloc;

} // namespace static_array

template <typename T> class StaticArray {
Expand Down Expand Up @@ -323,7 +326,8 @@ template <typename T> class StaticArray {
template <typename... Tags>
void resize(const std::size_t size, const value_type init_value, Tags...) {
KASSERT(
_data == _owned_data.get() || _data == _overcommited_data.get(),
_data == _owned_data.get() || _data == _owned_data_std.get() ||
_data == _overcommited_data.get(),
"cannot resize span",
assert::always
);
Expand All @@ -332,7 +336,7 @@ template <typename T> class StaticArray {
const bool use_thp =
size >= KAMINPAR_THP_THRESHOLD && !contains_tag_v<static_array::small_t, Tags...>;

allocate_data(size, overcommit, use_thp);
allocate_data(size, overcommit, use_thp, contains_tag_v<static_array::std_alloc_t, Tags...>);

if constexpr (!contains_tag_v<static_array::noinit_t, Tags...>) {
if constexpr (contains_tag_v<static_array::seq_t, Tags...>) {
Expand Down Expand Up @@ -366,16 +370,25 @@ template <typename T> class StaticArray {
_data = nullptr;

_owned_data.reset();
_owned_data_std.reset();
_overcommited_data.reset();
}

private:
void allocate_data(const std::size_t size, const bool overcommit, const bool thp) {
void allocate_data(
const std::size_t size,
const bool overcommit,
const bool thp,
const bool use_std_alloc = false
) {
// Before allocating the new memory, free the old memory to prevent both from being held in
// memory at the same time
if (_owned_data != nullptr) {
_owned_data.reset();
}
if (_owned_data_std != nullptr) {
_owned_data_std.reset();
}
if (_overcommited_data != nullptr) {
_overcommited_data.reset();
}
Expand All @@ -384,8 +397,13 @@ template <typename T> class StaticArray {
_overcommited_data = heap_profiler::overcommit_memory<value_type>(size);
_data = _overcommited_data.get();
} else {
_owned_data = parallel::make_unique<value_type>(size, thp);
_data = _owned_data.get();
if (use_std_alloc) {
_owned_data_std = make_unique<value_type>(size, thp);
_data = _owned_data_std.get();
} else {
_owned_data = parallel::make_unique<value_type>(size, thp);
_data = _owned_data.get();
}
}

_size = size;
Expand All @@ -397,6 +415,7 @@ template <typename T> class StaticArray {
size_type _size = 0;
size_type _unrestricted_size = 0;
parallel::tbb_unique_ptr<value_type> _owned_data = nullptr;
std::unique_ptr<value_type, deleter<value_type>> _owned_data_std = nullptr;
heap_profiler::unique_ptr<value_type> _overcommited_data = nullptr;
value_type *_data = nullptr;

Expand Down
46 changes: 44 additions & 2 deletions kaminpar-common/parallel/tbb_malloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
#include "sys/mman.h"
#endif

namespace kaminpar::parallel {
namespace kaminpar {

namespace parallel {

template <typename T> struct tbb_deleter {
void operator()(T *p) {
Expand Down Expand Up @@ -60,4 +62,44 @@ tbb_unique_ptr<T> make_unique(const std::size_t size, [[maybe_unused]] const boo
return tbb_unique_ptr<T>(ptr, tbb_deleter<T>{});
}

} // namespace kaminpar::parallel
} // namespace parallel

// Deleter for raw memory obtained from the standard allocator (malloc /
// posix_memalign) by the free function make_unique() below; counterpart to
// parallel::tbb_deleter for tbbmalloc-backed allocations.
// NOTE(review): this only releases the raw block -- ~T() is never invoked and
// make_unique() never constructs the objects, so this presumably assumes
// trivially constructible/destructible T; confirm at call sites.
template <typename T> struct deleter {
  void operator()(T *p) {
    free(p);

    // Mirror the record_alloc() call in make_unique(): report the release to
    // the heap profiler (only when heap profiling without page profiling).
    if constexpr (kHeapProfiling && !kPageProfiling) {
      heap_profiler::HeapProfiler::global().record_free(p);
    }
  }
};

// Allocates uninitialized storage for `size` objects of type T via the
// standard allocator (malloc) instead of tbbmalloc (cf. parallel::make_unique).
// On Linux with KAMINPAR_ENABLE_THP, `thp == true` requests transparent huge
// pages by aligning the allocation to 2 MiB and advising MADV_HUGEPAGE.
// NOTE(review): the returned memory is *not* constructed -- only suitable for
// trivial T; confirm at call sites.
//
// @param size Number of T elements to allocate space for.
// @param thp  Whether to back the allocation with transparent huge pages
//             (ignored unless built with THP support on Linux).
// @return Owning pointer whose deleter releases the memory with free().
template <typename T>
std::unique_ptr<T, deleter<T>>
make_unique(const std::size_t size, [[maybe_unused]] const bool thp) {
  const std::size_t nbytes = sizeof(T) * size;
  T *ptr = nullptr;

#if defined(__linux__) && defined(KAMINPAR_ENABLE_THP)
  if (thp) {
    // posix_memalign() leaves ptr untouched on failure: check its return value
    // and only advise THP when the allocation actually succeeded (previously,
    // madvise() could be called with a null pointer here).
    if (posix_memalign(reinterpret_cast<void **>(&ptr), 1 << 21, nbytes) == 0) {
      madvise(ptr, nbytes, MADV_HUGEPAGE);
    } else {
      ptr = nullptr; // fall through to the out-of-memory assertion below
    }
  } else {
#endif
    ptr = static_cast<T *>(malloc(nbytes));
#if defined(__linux__) && defined(KAMINPAR_ENABLE_THP)
  }
#endif

  KASSERT(
      ptr != nullptr, "out of memory: could not allocate " << nbytes << " bytes", assert::light
  );

  if constexpr (kHeapProfiling && !kPageProfiling) {
    // Reuse nbytes instead of recomputing sizeof(T) * size.
    heap_profiler::HeapProfiler::global().record_alloc(ptr, nbytes);
  }

  return std::unique_ptr<T, deleter<T>>(ptr);
}

} // namespace kaminpar
6 changes: 5 additions & 1 deletion kaminpar-shm/label_propagation.h
Original file line number Diff line number Diff line change
Expand Up @@ -1861,7 +1861,9 @@ class ChunkRandomLabelPropagation : public LabelPropagation<Derived, Config, Gra

template <typename RatingMapETS> void perform(RatingMapETS &rating_map_ets) {
parallel::Atomic<std::size_t> next_chunk = 0;
tbb::parallel_for(static_cast<std::size_t>(0), _chunks.size(), [&](const std::size_t) {
DBG << "Number of chunks: " << _chunks.size();

tbb::parallel_for(static_cast<std::size_t>(0), _chunks.size(), [&](std::size_t) {
if (should_stop()) {
return;
}
Expand All @@ -1878,6 +1880,8 @@ class ChunkRandomLabelPropagation : public LabelPropagation<Derived, Config, Gra
const auto &chunk = _chunks[chunk_id];
const auto &permutation = _random_permutations.get(local_rand);

DBG << chunk_id << " of " << _chunks.size() << " for " << sched_getcpu();

const std::size_t num_sub_chunks =
std::ceil(1.0 * (chunk.end - chunk.start) / Config::kPermutationSize);

Expand Down

0 comments on commit ab934db

Please sign in to comment.