Skip to content

Commit

Permalink
Merge branch 'main' into shm/feat/graph-compression-interface
Browse files Browse the repository at this point in the history
  • Loading branch information
dsalwasser committed Feb 22, 2025
2 parents 2af4877 + 21f0e06 commit 4eda96e
Show file tree
Hide file tree
Showing 15 changed files with 305 additions and 46 deletions.
6 changes: 6 additions & 0 deletions kaminpar-cli/kaminpar_arguments.cc
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ CLI::Option_group *create_partitioning_options(CLI::App *app, Context &ctx) {
"--p-rb-kway-toplevel-refinement", ctx.partitioning.rb_enable_kway_toplevel_refinement
)
->capture_default_str();
partitioning->add_option("--p-rb-switch-to-seq-factor", ctx.partitioning.rb_switch_to_seq_factor)
->capture_default_str();
partitioning->add_flag("--p-kway-parallel-rb", ctx.partitioning.kway_parallel_rb)
->capture_default_str();

partitioning->add_option("--p-vcycles", ctx.partitioning.vcycles)->capture_default_str();
partitioning
Expand Down Expand Up @@ -206,6 +210,8 @@ Options are:

coarsening->add_option("--c-overlay-levels", ctx.coarsening.overlay_clustering.num_levels)
->capture_default_str();
coarsening->add_option("--c-overlay-max-level", ctx.coarsening.overlay_clustering.max_level)
->capture_default_str();

create_lp_coarsening_options(app, ctx);
create_contraction_coarsening_options(app, ctx);
Expand Down
37 changes: 26 additions & 11 deletions kaminpar-dist/partitioning/deep_multilevel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

#include <mpi.h>

#include "kaminpar-mpi/datatype.h"

#include "kaminpar-dist/datastructures/distributed_graph.h"
#include "kaminpar-dist/datastructures/distributed_partitioned_graph.h"
#include "kaminpar-dist/debug.h"
Expand Down Expand Up @@ -205,20 +207,23 @@ DistributedPartitionedGraph DeepMultilevelPartitioner::partition() {

auto extend_partition = [&](DistributedPartitionedGraph &p_graph,
const bool almost_toplevel = false,
const std::string &prefix = " ") -> PartitionContext {
const std::string &prefix = " ",
BlockID desired_k = 0) -> PartitionContext {
SCOPED_HEAP_PROFILER("Extending partition");

BlockID desired_k = std::min<BlockID>(
_input_ctx.partition.k,
math::ceil2(dist_p_graph.global_n() / _input_ctx.coarsening.contraction_limit)
);
if (desired_k == 0) {
desired_k = std::min<BlockID>(
_input_ctx.partition.k,
math::ceil2(dist_p_graph.global_n() / _input_ctx.coarsening.contraction_limit)
);

// If we (almost) work on the input graph, extend to final number of blocks
if (_input_graph.global_n() == p_graph.global_n() ||
(_input_ctx.partitioning.avoid_toplevel_bipartitioning && almost_toplevel &&
_input_graph.global_n() >
2 * _input_ctx.partition.k * _input_ctx.coarsening.contraction_limit)) {
desired_k = _input_ctx.partition.k;
// If we (almost) work on the input graph, extend to final number of blocks
if (_input_graph.global_n() == p_graph.global_n() ||
(_input_ctx.partitioning.avoid_toplevel_bipartitioning && almost_toplevel &&
_input_graph.global_n() >
2 * _input_ctx.partition.k * _input_ctx.coarsening.contraction_limit)) {
desired_k = _input_ctx.partition.k;
}
}

PartitionContext ref_p_ctx;
Expand Down Expand Up @@ -327,6 +332,16 @@ DistributedPartitionedGraph DeepMultilevelPartitioner::partition() {
coarsener = get_current_coarsener();

const DistributedGraph *new_graph = &coarsener->current();

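// Before we join the branches, make sure that all PEs agree on the number of blocks k:
// take the maximum k over the communicator and extend the local partition if it has fewer.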
BlockID k = dist_p_graph.k();
MPI_Allreduce(
MPI_IN_PLACE, &k, 1, mpi::type::get<BlockID>(), MPI_MAX, new_graph->communicator()
);
if (dist_p_graph.k() < k) {
extend_partition(dist_p_graph, false, " ", k);
}

dist_p_graph = distribute_best_partition(*new_graph, std::move(dist_p_graph));

_replicated_graphs.pop_back();
Expand Down
4 changes: 3 additions & 1 deletion kaminpar-dist/presets.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ std::unordered_set<std::string> get_preset_names() {
return {
"default",
"strong",
"jet",
"4xjet",
"europar23-fast",
"europar23-strong",
};
Expand All @@ -47,7 +49,7 @@ Context create_default_context() {
{

.mode = PartitioningMode::DEEP,
.initial_k = 16,
.initial_k = 2,
.extension_k = 0,
.avoid_toplevel_bipartitioning = true,
.enable_pe_splitting = true,
Expand Down
25 changes: 18 additions & 7 deletions kaminpar-shm/coarsening/overlay_cluster_coarsener.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,18 +103,29 @@ bool OverlayClusteringCoarsener::coarsen() {
_clustering_algorithm->set_desired_cluster_count(desired_cluster_count);
}

for (auto &clustering : clusterings) {
_clustering_algorithm->compute_clustering(clustering, current(), free_allocated_memory);
const bool compute_overlays =
level() <= static_cast<std::size_t>(_c_ctx.overlay_clustering.max_level);

if (compute_overlays) {
for (auto &clustering : clusterings) {
_clustering_algorithm->compute_clustering(clustering, current(), free_allocated_memory);
}
} else {
_clustering_algorithm->compute_clustering(
clusterings.front(), current(), free_allocated_memory
);
}
STOP_TIMER();
STOP_HEAP_PROFILER();

TIMED_SCOPE("Overlay clusters") {
for (int level = _c_ctx.overlay_clustering.num_levels; level > 0; --level) {
const int num_overlays_in_level = 1 << level;
for (int pair = 0; pair < num_overlays_in_level / 2; ++pair) {
clusterings[pair] =
overlay(std::move(clusterings[pair]), clusterings[num_overlays_in_level / 2 + pair]);
if (compute_overlays) {
for (int level = _c_ctx.overlay_clustering.num_levels; level > 0; --level) {
const int num_overlays_in_level = 1 << level;
for (int pair = 0; pair < num_overlays_in_level / 2; ++pair) {
clusterings[pair] =
overlay(std::move(clusterings[pair]), clusterings[num_overlays_in_level / 2 + pair]);
}
}
}
};
Expand Down
11 changes: 11 additions & 0 deletions kaminpar-shm/context_io.cc
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,17 @@ void print(const PartitioningContext &p_ctx, std::ostream &out) {
if (p_ctx.mode == PartitioningMode::DEEP) {
out << " Deep initial part. mode: " << p_ctx.deep_initial_partitioning_mode << "\n";
out << " Deep initial part. load: " << p_ctx.deep_initial_partitioning_load << "\n";
} else if (p_ctx.mode == PartitioningMode::KWAY) {
out << " Initial partitioning mode: "
<< (p_ctx.kway_parallel_rb ? "parallel [1 x P]" : "sequential [P x 1]") << "\n";
} else if (p_ctx.mode == PartitioningMode::RB) {
out << " Use flat k-way refinement: "
<< (p_ctx.rb_enable_kway_toplevel_refinement ? "yes" : "no") << "\n";
out << " Switch to seq. part.: "
<< (p_ctx.rb_switch_to_seq_factor == 0
? "never"
: "when k' > p * " + std::to_string(p_ctx.rb_switch_to_seq_factor))
<< "\n";
}
out << "Subgraph memory: " << (p_ctx.use_lazy_subgraph_memory ? "Lazy" : "Default")
<< "\n";
Expand Down
5 changes: 5 additions & 0 deletions kaminpar-shm/kaminpar.h
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ struct ClusterCoarseningContext {

// Settings for the overlay cluster coarsener (OverlayClusteringCoarsener).
struct OverlayClusterCoarseningContext {
// Number of pairwise overlay rounds (CLI: --c-overlay-levels). Round `level` takes
// 1 << level clusterings and overlays them pairwise, halving their number.
int num_levels;
// Highest coarsening level on which overlays are still computed (CLI: --c-overlay-max-level).
// On levels above this value, the coarsener computes a single clustering and skips the
// overlay step.
int max_level;
};

struct CoarseningContext {
Expand Down Expand Up @@ -540,6 +541,9 @@ struct PartitioningContext {
bool restrict_vcycle_refinement;

bool rb_enable_kway_toplevel_refinement;
int rb_switch_to_seq_factor;

bool kway_parallel_rb;
};

struct GraphCompressionContext {
Expand Down Expand Up @@ -609,6 +613,7 @@ Context create_vcycle_context(bool restrict_refinement = false);

Context create_esa21_smallk_context();
Context create_esa21_largek_context();
Context create_esa21_largek_fast_context();
Context create_esa21_strong_context();

} // namespace kaminpar::shm
Expand Down
2 changes: 1 addition & 1 deletion kaminpar-shm/partitioning/helper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ void extend_partition_recursive(
graph::SubgraphMemory &subgraph_memory,
graph::TemporarySubgraphMemory &tmp_extraction_mem_pool,
InitialBipartitionerWorkerPool &bipartitioner_pool,
BipartitionTimingInfo *timings = nullptr
BipartitionTimingInfo *timings
) {
KASSERT(num_subblocks > 1u);

Expand Down
15 changes: 15 additions & 0 deletions kaminpar-shm/partitioning/helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,21 @@ void extend_partition(
int num_active_threads
);

// Declared in the header so that other translation units can call it directly; the k-way
// multilevel partitioner uses it for its per-thread initial partitioning.
//
// The definition in helper.cc asserts num_subblocks > 1.
// NOTE(review): presumably splits the given block of `partition` recursively into
// `num_subblocks` blocks via repeated bipartitioning -- confirm against the definition.
//
// `timings` defaults to nullptr; the default argument lives on this declaration and not on
// the definition.
void extend_partition_recursive(
const Graph &graph,
StaticArray<BlockID> &partition,
const BlockID current_rel_block,
const BlockID current_abs_block,
const BlockID num_subblocks,
const BlockID current_k,
const Context &input_ctx,
const graph::SubgraphMemoryStartPosition position,
graph::SubgraphMemory &subgraph_memory,
graph::TemporarySubgraphMemory &tmp_extraction_mem_pool,
InitialBipartitionerWorkerPool &bipartitioner_pool,
BipartitionTimingInfo *timings = nullptr
);

void complete_partial_extend_partition(
PartitionedGraph &p_graph,
const Context &input_ctx,
Expand Down
109 changes: 96 additions & 13 deletions kaminpar-shm/partitioning/kway/kway_multilevel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "kaminpar-shm/factories.h"
#include "kaminpar-shm/partitioning/debug.h"
#include "kaminpar-shm/partitioning/helper.h"
#include "kaminpar-shm/partitioning/rb/rb_multilevel.h"

#include "kaminpar-common/console_io.h"
#include "kaminpar-common/heap_profiler.h"
Expand Down Expand Up @@ -41,11 +42,16 @@ KWayMultilevelPartitioner::KWayMultilevelPartitioner(

PartitionedGraph KWayMultilevelPartitioner::partition() {
cio::print_delimiter("Partitioning");
return uncoarsen(initial_partition(coarsen()));
if (_input_ctx.partitioning.kway_parallel_rb) {
return uncoarsen(parallel_initial_partition(coarsen()));
} else {
return uncoarsen(initial_partition(coarsen()));
}
}

void KWayMultilevelPartitioner::refine(PartitionedGraph &p_graph) {
SCOPED_HEAP_PROFILER("Refinement");
SCOPED_TIMER("Refinement");

// If requested, dump the current partition to disk before refinement ...
debug::dump_partition_hierarchy(p_graph, _coarsener->level(), "pre-refinement", _input_ctx);
Expand All @@ -67,10 +73,13 @@ void KWayMultilevelPartitioner::refine(PartitionedGraph &p_graph) {

PartitionedGraph KWayMultilevelPartitioner::uncoarsen(PartitionedGraph p_graph) {
SCOPED_HEAP_PROFILER("Uncoarsening");
SCOPED_TIMER("Uncoarsening");

refine(p_graph);

while (!_coarsener->empty()) {
SCOPED_TIMER("Level", std::to_string(_coarsener->level() - 1));

LOG;
LOG << "Uncoarsening -> Level " << _coarsener->level() - 1;

Expand All @@ -85,6 +94,7 @@ PartitionedGraph KWayMultilevelPartitioner::uncoarsen(PartitionedGraph p_graph)

const Graph *KWayMultilevelPartitioner::coarsen() {
SCOPED_HEAP_PROFILER("Coarsening");
SCOPED_TIMER("Coarsening");

const Graph *c_graph = &_input_graph;
bool shrunk = true;
Expand All @@ -95,6 +105,8 @@ const Graph *KWayMultilevelPartitioner::coarsen() {
LOG;

while (shrunk && c_graph->n() > initial_partitioning_threshold()) {
SCOPED_TIMER("Level", std::to_string(_coarsener->level()));

// If requested, dump graph before each coarsening step + after coarsening
// converged. This way, we also have a dump of the (reordered) input graph,
// which makes it easier to use the final partition (before reordering it).
Expand Down Expand Up @@ -153,21 +165,92 @@ PartitionedGraph KWayMultilevelPartitioner::initial_partition(const Graph *graph
// Since timers are not multi-threaded, we disable them during (parallel)
// initial partitioning.
DISABLE_TIMERS();
PartitionedGraph p_graph = _bipartitioner_pool.bipartition(graph, 0, 1, true);

graph::SubgraphMemory subgraph_memory(p_graph.n(), _input_ctx.partition.k, p_graph.m());
partitioning::TemporarySubgraphMemoryEts ip_extraction_pool_ets;
std::vector<StaticArray<BlockID>> initial_partitions(_input_ctx.parallel.num_threads);
std::vector<std::pair<bool, EdgeWeight>> initial_cuts(_input_ctx.parallel.num_threads);

tbb::parallel_for<int>(0, _input_ctx.parallel.num_threads, [&](const int t) {
graph::SubgraphMemory subgraph_memory(graph->n(), _input_ctx.partition.k, graph->m());
graph::TemporarySubgraphMemory ip_extraction_pool;

StaticArray<BlockID> partition(graph->n());

partitioning::extend_partition_recursive(
*graph,
partition,
0,
0,
_input_ctx.partition.k,
1,
_input_ctx,
{.nodes_start_pos = 0, .edges_start_pos = 0},
subgraph_memory,
ip_extraction_pool,
_bipartitioner_pool,
nullptr
);

PartitionedGraph p_graph(*graph, _input_ctx.partition.k, std::move(partition));
PartitionContext p_ctx = create_kway_context(_input_ctx, p_graph);
initial_cuts[t] = {metrics::is_feasible(p_graph, p_ctx), metrics::edge_cut_seq(p_graph)};
initial_partitions[t] = std::move(p_graph.take_raw_partition());
});

// Select the best of the per-thread initial partitions:
//  - a feasible partition always beats an infeasible one;
//  - among candidates of equal feasibility, the smaller edge cut wins.
StaticArray<BlockID> initial_partition;
EdgeWeight best_initial_cut = std::numeric_limits<EdgeWeight>::max();
bool best_initial_cut_feasible = false;

for (int t = 0; t < _input_ctx.parallel.num_threads; ++t) {
  const auto [feasible, cut] = initial_cuts[t];

  const bool gains_feasibility = feasible && !best_initial_cut_feasible;
  const bool improves_cut = (feasible == best_initial_cut_feasible) && cut < best_initial_cut;

  if (gains_feasibility || improves_cut) {
    initial_partition = std::move(initial_partitions[t]);
    best_initial_cut_feasible = feasible;
    // Remember the cut of the current best candidate. Without this update, every later
    // candidate compares against the initial maximum and the last one wins regardless of
    // its cut.
    best_initial_cut = cut;
  }
}

PartitionedGraph p_graph(*graph, _input_ctx.partition.k, std::move(initial_partition));
_current_p_ctx = create_kway_context(_input_ctx, p_graph);

ENABLE_TIMERS();

// Print some metrics for the initial partition.
LOG << " Number of blocks: " << p_graph.k();
if (_print_metrics) {
SCOPED_TIMER("Partition metrics");
LOG << " Cut: " << metrics::edge_cut(p_graph);
LOG << " Imbalance: " << metrics::imbalance(p_graph);
LOG << " Feasible: " << (metrics::is_feasible(p_graph, _current_p_ctx) ? "yes" : "no");
}

partitioning::extend_partition(
p_graph,
_input_ctx.partition.k,
_input_ctx,
subgraph_memory,
ip_extraction_pool_ets,
_bipartitioner_pool,
_input_ctx.parallel.num_threads
);
// If requested, dump the coarsest partition -- as noted above, this is not
// actually the coarsest partition when using deep multilevel.
debug::dump_coarsest_partition(p_graph, _input_ctx);
debug::dump_partition_hierarchy(p_graph, _coarsener->level(), "post-refinement", _input_ctx);

return p_graph;
}

PartitionedGraph KWayMultilevelPartitioner::parallel_initial_partition(const Graph *graph) {
SCOPED_HEAP_PROFILER("Initial partitioning");
SCOPED_TIMER("Initial partitioning");
LOG << "Initial partitioning:";

// If requested, dump the coarsest graph to disk. Note that in the context of
// deep multilevel, this is not actually the coarsest graph, but rather the
// coarsest graph before splitting PEs and duplicating the graph.
// Disable worker splitting with --p-deep-initial-partitioning-mode=sequential to obtain coarser
// graphs.
debug::dump_coarsest_graph(*graph, _input_ctx);
debug::dump_graph_hierarchy(*graph, _coarsener->level(), _input_ctx);

// Since timers are not multi-threaded, we disable them during (parallel)
// initial partitioning.
DISABLE_TIMERS();

RBMultilevelPartitioner rb(*graph, _input_ctx);
PartitionedGraph p_graph = rb.partition();
_current_p_ctx = create_kway_context(_input_ctx, p_graph);

ENABLE_TIMERS();
Expand Down
1 change: 1 addition & 0 deletions kaminpar-shm/partitioning/kway/kway_multilevel.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class KWayMultilevelPartitioner : public Partitioner {
NodeID initial_partitioning_threshold();

PartitionedGraph initial_partition(const Graph *graph);
PartitionedGraph parallel_initial_partition(const Graph *graph);

const Graph &_input_graph;
const Context &_input_ctx;
Expand Down
Loading

0 comments on commit 4eda96e

Please sign in to comment.