Merge branch 'main' into shm/feat/graph-compression-interface

KaHIP · Feb 22, 2025 · 4eda96e · 4eda96e
2 parents 2af4877 + 21f0e06
commit 4eda96e
Show file tree

Hide file tree

Showing 15 changed files with 305 additions and 46 deletions.
diff --git a/kaminpar-cli/kaminpar_arguments.cc b/kaminpar-cli/kaminpar_arguments.cc
@@ -82,6 +82,10 @@ CLI::Option_group *create_partitioning_options(CLI::App *app, Context &ctx) {
           "--p-rb-kway-toplevel-refinement", ctx.partitioning.rb_enable_kway_toplevel_refinement
       )
       ->capture_default_str();
+  partitioning->add_option("--p-rb-switch-to-seq-factor", ctx.partitioning.rb_switch_to_seq_factor)
+      ->capture_default_str();
+  partitioning->add_flag("--p-kway-parallel-rb", ctx.partitioning.kway_parallel_rb)
+      ->capture_default_str();
 
   partitioning->add_option("--p-vcycles", ctx.partitioning.vcycles)->capture_default_str();
   partitioning
@@ -206,6 +210,8 @@ Options are:
 
   coarsening->add_option("--c-overlay-levels", ctx.coarsening.overlay_clustering.num_levels)
       ->capture_default_str();
+  coarsening->add_option("--c-overlay-max-level", ctx.coarsening.overlay_clustering.max_level)
+      ->capture_default_str();
 
   create_lp_coarsening_options(app, ctx);
   create_contraction_coarsening_options(app, ctx);

diff --git a/kaminpar-dist/partitioning/deep_multilevel.cc b/kaminpar-dist/partitioning/deep_multilevel.cc
@@ -11,6 +11,8 @@
 
 #include <mpi.h>
 
+#include "kaminpar-mpi/datatype.h"
+
 #include "kaminpar-dist/datastructures/distributed_graph.h"
 #include "kaminpar-dist/datastructures/distributed_partitioned_graph.h"
 #include "kaminpar-dist/debug.h"
@@ -205,20 +207,23 @@ DistributedPartitionedGraph DeepMultilevelPartitioner::partition() {
 
   auto extend_partition = [&](DistributedPartitionedGraph &p_graph,
                               const bool almost_toplevel = false,
-                              const std::string &prefix = " ") -> PartitionContext {
+                              const std::string &prefix = " ",
+                              BlockID desired_k = 0) -> PartitionContext {
     SCOPED_HEAP_PROFILER("Extending partition");
 
-    BlockID desired_k = std::min<BlockID>(
-        _input_ctx.partition.k,
-        math::ceil2(dist_p_graph.global_n() / _input_ctx.coarsening.contraction_limit)
-    );
+    if (desired_k == 0) {
+      desired_k = std::min<BlockID>(
+          _input_ctx.partition.k,
+          math::ceil2(dist_p_graph.global_n() / _input_ctx.coarsening.contraction_limit)
+      );
 
-    // If we (almost) work on the input graph, extend to final number of blocks
-    if (_input_graph.global_n() == p_graph.global_n() ||
-        (_input_ctx.partitioning.avoid_toplevel_bipartitioning && almost_toplevel &&
-         _input_graph.global_n() >
-             2 * _input_ctx.partition.k * _input_ctx.coarsening.contraction_limit)) {
-      desired_k = _input_ctx.partition.k;
+      // If we (almost) work on the input graph, extend to final number of blocks
+      if (_input_graph.global_n() == p_graph.global_n() ||
+          (_input_ctx.partitioning.avoid_toplevel_bipartitioning && almost_toplevel &&
+           _input_graph.global_n() >
+               2 * _input_ctx.partition.k * _input_ctx.coarsening.contraction_limit)) {
+        desired_k = _input_ctx.partition.k;
+      }
     }
 
     PartitionContext ref_p_ctx;
@@ -327,6 +332,16 @@ DistributedPartitionedGraph DeepMultilevelPartitioner::partition() {
       coarsener = get_current_coarsener();
 
       const DistributedGraph *new_graph = &coarsener->current();
+
+      // Before joining the branches, agree on the largest k and extend partitions that lag behind
+      BlockID k = dist_p_graph.k();
+      MPI_Allreduce(
+          MPI_IN_PLACE, &k, 1, mpi::type::get<BlockID>(), MPI_MAX, new_graph->communicator()
+      );
+      if (dist_p_graph.k() < k) {
+        extend_partition(dist_p_graph, false, " ", k);
+      }
+
       dist_p_graph = distribute_best_partition(*new_graph, std::move(dist_p_graph));
 
       _replicated_graphs.pop_back();

diff --git a/kaminpar-dist/presets.cc b/kaminpar-dist/presets.cc
@@ -35,6 +35,8 @@ std::unordered_set<std::string> get_preset_names() {
   return {
       "default",
       "strong",
+      "jet",
+      "4xjet",
       "europar23-fast",
       "europar23-strong",
   };
@@ -47,7 +49,7 @@ Context create_default_context() {
           {
 
               .mode = PartitioningMode::DEEP,
-              .initial_k = 16,
+              .initial_k = 2,
               .extension_k = 0,
               .avoid_toplevel_bipartitioning = true,
               .enable_pe_splitting = true,

diff --git a/kaminpar-shm/coarsening/overlay_cluster_coarsener.cc b/kaminpar-shm/coarsening/overlay_cluster_coarsener.cc
@@ -103,18 +103,29 @@ bool OverlayClusteringCoarsener::coarsen() {
     _clustering_algorithm->set_desired_cluster_count(desired_cluster_count);
   }
 
-  for (auto &clustering : clusterings) {
-    _clustering_algorithm->compute_clustering(clustering, current(), free_allocated_memory);
+  const bool compute_overlays =
+      level() <= static_cast<std::size_t>(_c_ctx.overlay_clustering.max_level);
+
+  if (compute_overlays) {
+    for (auto &clustering : clusterings) {
+      _clustering_algorithm->compute_clustering(clustering, current(), free_allocated_memory);
+    }
+  } else {
+    _clustering_algorithm->compute_clustering(
+        clusterings.front(), current(), free_allocated_memory
+    );
   }
   STOP_TIMER();
   STOP_HEAP_PROFILER();
 
   TIMED_SCOPE("Overlay clusters") {
-    for (int level = _c_ctx.overlay_clustering.num_levels; level > 0; --level) {
-      const int num_overlays_in_level = 1 << level;
-      for (int pair = 0; pair < num_overlays_in_level / 2; ++pair) {
-        clusterings[pair] =
-            overlay(std::move(clusterings[pair]), clusterings[num_overlays_in_level / 2 + pair]);
+    if (compute_overlays) {
+      for (int level = _c_ctx.overlay_clustering.num_levels; level > 0; --level) {
+        const int num_overlays_in_level = 1 << level;
+        for (int pair = 0; pair < num_overlays_in_level / 2; ++pair) {
+          clusterings[pair] =
+              overlay(std::move(clusterings[pair]), clusterings[num_overlays_in_level / 2 + pair]);
+        }
       }
     }
   };

diff --git a/kaminpar-shm/context_io.cc b/kaminpar-shm/context_io.cc
@@ -547,6 +547,17 @@ void print(const PartitioningContext &p_ctx, std::ostream &out) {
   if (p_ctx.mode == PartitioningMode::DEEP) {
     out << "  Deep initial part. mode:    " << p_ctx.deep_initial_partitioning_mode << "\n";
     out << "  Deep initial part. load:    " << p_ctx.deep_initial_partitioning_load << "\n";
+  } else if (p_ctx.mode == PartitioningMode::KWAY) {
+    out << "  Initial partitioning mode:  "
+        << (p_ctx.kway_parallel_rb ? "parallel [1 x P]" : "sequential [P x 1]") << "\n";
+  } else if (p_ctx.mode == PartitioningMode::RB) {
+    out << "  Use flat k-way refinement:  "
+        << (p_ctx.rb_enable_kway_toplevel_refinement ? "yes" : "no") << "\n";
+    out << "  Switch to seq. part.:       "
+        << (p_ctx.rb_switch_to_seq_factor == 0
+                ? "never"
+                : "when k' > p * " + std::to_string(p_ctx.rb_switch_to_seq_factor))
+        << "\n";
   }
   out << "Subgraph memory:              " << (p_ctx.use_lazy_subgraph_memory ? "Lazy" : "Default")
       << "\n";

diff --git a/kaminpar-shm/kaminpar.h b/kaminpar-shm/kaminpar.h
@@ -238,6 +238,7 @@ struct ClusterCoarseningContext {
 
 struct OverlayClusterCoarseningContext {
   int num_levels;
+  int max_level; // overlays are only computed while the coarsener's level() <= max_level
 };
 
 struct CoarseningContext {
@@ -540,6 +541,9 @@ struct PartitioningContext {
   bool restrict_vcycle_refinement;
 
   bool rb_enable_kway_toplevel_refinement;
+  int rb_switch_to_seq_factor;
+
+  bool kway_parallel_rb;
 };
 
 struct GraphCompressionContext {
@@ -609,6 +613,7 @@ Context create_vcycle_context(bool restrict_refinement = false);
 
 Context create_esa21_smallk_context();
 Context create_esa21_largek_context();
+Context create_esa21_largek_fast_context();
 Context create_esa21_strong_context();
 
 } // namespace kaminpar::shm

diff --git a/kaminpar-shm/partitioning/helper.cc b/kaminpar-shm/partitioning/helper.cc
@@ -145,7 +145,7 @@ void extend_partition_recursive(
     graph::SubgraphMemory &subgraph_memory,
     graph::TemporarySubgraphMemory &tmp_extraction_mem_pool,
     InitialBipartitionerWorkerPool &bipartitioner_pool,
-    BipartitionTimingInfo *timings = nullptr
+    BipartitionTimingInfo *timings 
 ) {
   KASSERT(num_subblocks > 1u);
 

diff --git a/kaminpar-shm/partitioning/helper.h b/kaminpar-shm/partitioning/helper.h
@@ -79,6 +79,21 @@ void extend_partition(
     int num_active_threads
 );
 
+void extend_partition_recursive(
+    const Graph &graph,
+    StaticArray<BlockID> &partition,
+    const BlockID current_rel_block,
+    const BlockID current_abs_block,
+    const BlockID num_subblocks, // must be > 1 (asserted by the definition in helper.cc)
+    const BlockID current_k,
+    const Context &input_ctx,
+    const graph::SubgraphMemoryStartPosition position,
+    graph::SubgraphMemory &subgraph_memory,
+    graph::TemporarySubgraphMemory &tmp_extraction_mem_pool,
+    InitialBipartitionerWorkerPool &bipartitioner_pool,
+    BipartitionTimingInfo *timings = nullptr // optional; default lives here, not in helper.cc
+);
+
 void complete_partial_extend_partition(
     PartitionedGraph &p_graph,
     const Context &input_ctx,

diff --git a/kaminpar-shm/partitioning/kway/kway_multilevel.cc b/kaminpar-shm/partitioning/kway/kway_multilevel.cc
@@ -11,6 +11,7 @@
 #include "kaminpar-shm/factories.h"
 #include "kaminpar-shm/partitioning/debug.h"
 #include "kaminpar-shm/partitioning/helper.h"
+#include "kaminpar-shm/partitioning/rb/rb_multilevel.h"
 
 #include "kaminpar-common/console_io.h"
 #include "kaminpar-common/heap_profiler.h"
@@ -41,11 +42,16 @@ KWayMultilevelPartitioner::KWayMultilevelPartitioner(
 
 PartitionedGraph KWayMultilevelPartitioner::partition() {
   cio::print_delimiter("Partitioning");
-  return uncoarsen(initial_partition(coarsen()));
+  const Graph *c_graph = coarsen();
+  // The context flag selects how the coarsest graph obtains its initial k-way partition.
+  const bool use_parallel_rb = _input_ctx.partitioning.kway_parallel_rb;
+  return use_parallel_rb ? uncoarsen(parallel_initial_partition(c_graph))
+                         : uncoarsen(initial_partition(c_graph));
 }
 
 void KWayMultilevelPartitioner::refine(PartitionedGraph &p_graph) {
   SCOPED_HEAP_PROFILER("Refinement");
+  SCOPED_TIMER("Refinement");
 
   // If requested, dump the current partition to disk before refinement ...
   debug::dump_partition_hierarchy(p_graph, _coarsener->level(), "pre-refinement", _input_ctx);
@@ -67,10 +73,13 @@ void KWayMultilevelPartitioner::refine(PartitionedGraph &p_graph) {
 
 PartitionedGraph KWayMultilevelPartitioner::uncoarsen(PartitionedGraph p_graph) {
   SCOPED_HEAP_PROFILER("Uncoarsening");
+  SCOPED_TIMER("Uncoarsening");
 
   refine(p_graph);
 
   while (!_coarsener->empty()) {
+    SCOPED_TIMER("Level", std::to_string(_coarsener->level() - 1));
+
     LOG;
     LOG << "Uncoarsening -> Level " << _coarsener->level() - 1;
 
@@ -85,6 +94,7 @@ PartitionedGraph KWayMultilevelPartitioner::uncoarsen(PartitionedGraph p_graph)
 
 const Graph *KWayMultilevelPartitioner::coarsen() {
   SCOPED_HEAP_PROFILER("Coarsening");
+  SCOPED_TIMER("Coarsening");
 
   const Graph *c_graph = &_input_graph;
   bool shrunk = true;
@@ -95,6 +105,8 @@ const Graph *KWayMultilevelPartitioner::coarsen() {
   LOG;
 
   while (shrunk && c_graph->n() > initial_partitioning_threshold()) {
+    SCOPED_TIMER("Level", std::to_string(_coarsener->level()));
+
     // If requested, dump graph before each coarsening step + after coarsening
     // converged. This way, we also have a dump of the (reordered) input graph,
     // which makes it easier to use the final partition (before reordering it).
@@ -153,21 +165,92 @@ PartitionedGraph KWayMultilevelPartitioner::initial_partition(const Graph *graph
   // Since timers are not multi-threaded, we disable them during (parallel)
   // initial partitioning.
   DISABLE_TIMERS();
-  PartitionedGraph p_graph = _bipartitioner_pool.bipartition(graph, 0, 1, true);
 
-  graph::SubgraphMemory subgraph_memory(p_graph.n(), _input_ctx.partition.k, p_graph.m());
-  partitioning::TemporarySubgraphMemoryEts ip_extraction_pool_ets;
+  std::vector<StaticArray<BlockID>> initial_partitions(_input_ctx.parallel.num_threads);
+  std::vector<std::pair<bool, EdgeWeight>> initial_cuts(_input_ctx.parallel.num_threads);
+
+  tbb::parallel_for<int>(0, _input_ctx.parallel.num_threads, [&](const int t) {
+    graph::SubgraphMemory subgraph_memory(graph->n(), _input_ctx.partition.k, graph->m());
+    graph::TemporarySubgraphMemory ip_extraction_pool;
+
+    StaticArray<BlockID> partition(graph->n());
+
+    partitioning::extend_partition_recursive(
+        *graph,
+        partition,
+        0,
+        0,
+        _input_ctx.partition.k,
+        1,
+        _input_ctx,
+        {.nodes_start_pos = 0, .edges_start_pos = 0},
+        subgraph_memory,
+        ip_extraction_pool,
+        _bipartitioner_pool,
+        nullptr
+    );
+
+    PartitionedGraph p_graph(*graph, _input_ctx.partition.k, std::move(partition));
+    PartitionContext p_ctx = create_kway_context(_input_ctx, p_graph);
+    initial_cuts[t] = {metrics::is_feasible(p_graph, p_ctx), metrics::edge_cut_seq(p_graph)};
+    initial_partitions[t] = std::move(p_graph.take_raw_partition());
+  });
+
+  StaticArray<BlockID> initial_partition;
+  EdgeWeight best_initial_cut = std::numeric_limits<EdgeWeight>::max();
+  bool best_initial_cut_feasible = false;
+  // Pick the winner: feasible beats infeasible; with equal feasibility, the smaller cut wins.
+  for (int t = 0; t < _input_ctx.parallel.num_threads; ++t) {
+    const auto [feasible, cut] = initial_cuts[t];
+    if ((feasible && !best_initial_cut_feasible) ||
+        (feasible == best_initial_cut_feasible && cut < best_initial_cut)) {
+      initial_partition = std::move(initial_partitions[t]);
+      best_initial_cut = cut; // remember the winning cut so later candidates compare against it
+      best_initial_cut_feasible = feasible;
+    }
+  }
+
+  PartitionedGraph p_graph(*graph, _input_ctx.partition.k, std::move(initial_partition));
+  _current_p_ctx = create_kway_context(_input_ctx, p_graph);
+
+  ENABLE_TIMERS();
+
+  // Print some metrics for the initial partition.
+  LOG << "  Number of blocks: " << p_graph.k();
+  if (_print_metrics) {
+    SCOPED_TIMER("Partition metrics");
+    LOG << "  Cut:              " << metrics::edge_cut(p_graph);
+    LOG << "  Imbalance:        " << metrics::imbalance(p_graph);
+    LOG << "  Feasible:         " << (metrics::is_feasible(p_graph, _current_p_ctx) ? "yes" : "no");
+  }
 
-  partitioning::extend_partition(
-      p_graph,
-      _input_ctx.partition.k,
-      _input_ctx,
-      subgraph_memory,
-      ip_extraction_pool_ets,
-      _bipartitioner_pool,
-      _input_ctx.parallel.num_threads
-  );
+  // If requested, dump the coarsest partition -- as noted above, this is not
+  // actually the coarsest partition when using deep multilevel.
+  debug::dump_coarsest_partition(p_graph, _input_ctx);
+  debug::dump_partition_hierarchy(p_graph, _coarsener->level(), "post-refinement", _input_ctx);
+
+  return p_graph;
+}
+
+PartitionedGraph KWayMultilevelPartitioner::parallel_initial_partition(const Graph *graph) {
+  SCOPED_HEAP_PROFILER("Initial partitioning");
+  SCOPED_TIMER("Initial partitioning");
+  LOG << "Initial partitioning:";
+
+  // If requested, dump the coarsest graph to disk. Note that in the context of
+  // deep multilevel, this is not actually the coarsest graph, but rather the
+  // coarsest graph before splitting PEs and duplicating the graph.
+  // Disable worker splitting with --p-deep-initial-partitioning-mode=sequential to obtain coarser
+  // graphs.
+  debug::dump_coarsest_graph(*graph, _input_ctx);
+  debug::dump_graph_hierarchy(*graph, _coarsener->level(), _input_ctx);
+
+  // Since timers are not multi-threaded, we disable them during (parallel)
+  // initial partitioning.
+  DISABLE_TIMERS();
 
+  RBMultilevelPartitioner rb(*graph, _input_ctx);
+  PartitionedGraph p_graph = rb.partition();
   _current_p_ctx = create_kway_context(_input_ctx, p_graph);
 
   ENABLE_TIMERS();

diff --git a/kaminpar-shm/partitioning/kway/kway_multilevel.h b/kaminpar-shm/partitioning/kway/kway_multilevel.h
@@ -38,6 +38,7 @@ class KWayMultilevelPartitioner : public Partitioner {
   NodeID initial_partitioning_threshold();
 
   PartitionedGraph initial_partition(const Graph *graph);
+  PartitionedGraph parallel_initial_partition(const Graph *graph);
 
   const Graph &_input_graph;
   const Context &_input_ctx;