From bfd1f112e2a28475f6fd16c405a200d542d23948 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang
Date: Fri, 21 Feb 2025 10:43:42 -0800
Subject: [PATCH] GPU Mapping (#4326)

For perlmutter and frontier, if there are multiple devices available, we
will try to map each GPU to the closest core. For an FFT test on
perlmutter using 256 nodes, the correct mapping reduced the run time from
0.172 to 0.127.

Note that you can achieve a similar effect with `srun ... bash -c "export
CUDA_VISIBLE_DEVICES=\$((3-SLURM_LOCALID)); ..."`, which manually limits
the number of visible devices. But in this commit, we try to do this
automatically for the user.

Also note that MPI appears to crash with `--gpu-bind=closest` on
perlmutter, so we need to use `--gpu-bind=none` there. For frontier, you
could use `--gpu-bind=closest`; but if you use `--gpu-bind=none`, this
commit will try to do the correct mapping for you.

In this commit, we also removed the old machine-topology code and added
new code for machine detection. (For illustration, standalone sketches of
the remapping heuristic and of the machine detection follow the patch.)
---
 Src/Base/AMReX.cpp           |   4 +-
 Src/Base/AMReX_GpuDevice.cpp |  30 +-
 Src/Base/AMReX_Machine.H     |  13 +-
 Src/Base/AMReX_Machine.cpp   | 606 ++---------------------------------
 4 files changed, 61 insertions(+), 592 deletions(-)

diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp
index 9d9edeaeba5..cf4d1eb3c07 100644
--- a/Src/Base/AMReX.cpp
+++ b/Src/Base/AMReX.cpp
@@ -506,6 +506,8 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse,
     }
 #endif
 
+    Machine::Initialize();
+
 #ifdef AMREX_USE_GPU
     // Initialize after ParmParse so that we can read inputs.
     Gpu::Device::Initialize();
@@ -670,8 +672,6 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse,
     BL_PROFILE_INITPARAMS();
 #endif // ifndef BL_AMRPROF
 
-    machine::Initialize();
-
 #ifdef AMREX_USE_HYPRE
     if (init_hypre) {
         HYPRE_Init();
diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp
index 4cb22d69b48..961fdb04060 100644
--- a/Src/Base/AMReX_GpuDevice.cpp
+++ b/Src/Base/AMReX_GpuDevice.cpp
@@ -1,9 +1,10 @@
 #include
+#include
+#include
 #include
 #include
 #include
-#include
 
 #ifdef AMREX_USE_HYPRE
 # include <_hypre_utilities.h>
@@ -207,9 +208,6 @@ Device::Initialize ()
             device_id = 0;
         } else {
-            if (amrex::Verbose() && ParallelDescriptor::IOProcessor()) {
-                amrex::Warning("Multiple GPUs are visible to each MPI rank. This is usually not an issue. But this may lead to incorrect or suboptimal rank-to-GPU mapping.");
-            }
             if (ParallelDescriptor::NProcsPerNode() == gpu_device_count) {
                 device_id = ParallelDescriptor::MyRankInNode();
             } else if (ParallelDescriptor::NProcsPerProcessor() == gpu_device_count) {
@@ -219,6 +217,30 @@ Device::Initialize ()
             }
         }
 
+        if (gpu_device_count > 1) {
+            if (Machine::name() == "nersc.perlmutter") {
+                // The CPU/GPU mapping on perlmutter has the reverse order.
+                device_id = gpu_device_count - device_id - 1;
+                if (amrex::Verbose()) {
+                    amrex::Print() << "Multiple GPUs are visible to each MPI rank. Fixing GPU assignment for Perlmutter according to heuristics.\n";
+                }
+            } else if (Machine::name() == "olcf.frontier") {
+                // The CPU/GPU mapping on frontier is documented at
+                // https://docs.olcf.ornl.gov/systems/frontier_user_guide.html
+                if (gpu_device_count == 8) {
+                    constexpr std::array<int,8> gpu_order = {4,5,2,3,6,7,0,1};
+                    device_id = gpu_order[device_id];
+                    if (amrex::Verbose()) {
+                        amrex::Print() << "Multiple GPUs are visible to each MPI rank. Fixing GPU assignment for Frontier according to heuristics.\n";
+                    }
+                }
+            } else {
+                if (amrex::Verbose() && ParallelDescriptor::IOProcessor()) {
+                    amrex::Warning("Multiple GPUs are visible to each MPI rank. This is usually not an issue. But this may lead to incorrect or suboptimal rank-to-GPU mapping.");
+                }
+            }
+        }
+
         AMREX_HIP_OR_CUDA(AMREX_HIP_SAFE_CALL (hipSetDevice(device_id));,
                           AMREX_CUDA_SAFE_CALL(cudaSetDevice(device_id)); );
diff --git a/Src/Base/AMReX_Machine.H b/Src/Base/AMReX_Machine.H
index dd49c876e28..ea43d3b311d 100644
--- a/Src/Base/AMReX_Machine.H
+++ b/Src/Base/AMReX_Machine.H
@@ -2,20 +2,15 @@
 #define AMREX_MACHINE_H
 
 #include
-#include
+#include
 
-namespace amrex::machine {
+namespace amrex::Machine {
 
 void Initialize (); //!< called in amrex::Initialize()
 
-#ifdef AMREX_USE_MPI
 void Finalize ();
-/**
-* find the best topologically close neighborhood of ranks
-* returns a vector of global or local rank IDs based on flag_local_ranks
-*/
-Vector<int> find_best_nbh (int rank_n, bool flag_local_ranks = false);
-#endif
+
+std::string const& name ();
 
 }
diff --git a/Src/Base/AMReX_Machine.cpp b/Src/Base/AMReX_Machine.cpp
index a7cc0d3993e..19dc50ba41f 100644
--- a/Src/Base/AMReX_Machine.cpp
+++ b/Src/Base/AMReX_Machine.cpp
@@ -1,599 +1,51 @@
-#include
-
-#ifndef AMREX_USE_MPI
-
-namespace amrex::machine {
-    void Initialize () {}
-}
-
-#else
-
-#include
-#include
-#include
-#include
 #include
+#include
+#include
 #include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-using namespace amrex;
-
-namespace {
-
-struct DoubleInt {
-    double d;
-    int i;
-};
-
-using Coord = Array<int,4>;
-
-#if defined(AMREX_DEBUG)
-// returns coordinate in an index space with no switches
-// for dragonfly network
-Coord read_df_node_coord (const std::string & name)
-{
-    int cabx, caby, cab_chas, slot, node;
-    {
-        std::ifstream ifs {"/proc/cray_xt/cname"};
-        if (!ifs) {
-            // not on a cray
-            return Coord {{0,0,0,0}}; // initializer_list
-        }
-        char t0, t1, t2, t3, t4;
-        ifs >> t0 >> cabx >> t1 >> caby >> t2 >> cab_chas >> t3 >> slot >> t4 >> node;
-        AMREX_ALWAYS_ASSERT(t0 == 'c' && t1 == '-' && t2 == 'c' && t3 == 's' && t4 == 'n');
-    }
-
-    int group = 0;
-    if (name == "cori") {
-        group = cabx / 2 + caby * 6; // 2 cabinets per group, 6 groups per row
-    } else {
-        amrex::Abort("Could not determine group!");
-    }
-    int chas = cab_chas + 3*(cabx & 1); // 2 cabinets per group (6 chassis per group)
-
-    return Coord {{node, slot, chas, group}};
-}
-#endif
 
-std::string get_mpi_processor_name ()
+namespace amrex::Machine
 {
-    std::string result;
-    int len;
-    char name[MPI_MAX_PROCESSOR_NAME];
-    MPI_Get_processor_name(name, &len);
-    result = std::string(name);
-    return result;
-}
 
-#if defined(AMREX_DEBUG)
-// assumes groups are in 4x16x6 configuration
-int df_coord_to_id (const Coord & c)
-{
-    return c[0] + 4 * (c[1] + 16 * (c[2] + 6 * c[3]));
+namespace {
+    std::string s_name;
 }
-#endif
 
-// assumes groups are in 4x16x6 configuration
-Coord df_id_to_coord (int id)
+void Initialize ()
 {
-    int node = id % 4; id /= 4;
-    int slot = id % 16; id /= 16;
-    int chas = id % 6; id /= 6;
-    int group = id;
-    return Coord {{node, slot, chas, group}};
-}
+    // Known machines:
+    // nersc.perlmutter: NERSC_HOST=perlmutter
+    //                   LMOD_SITE_NAME=perlmutter
+    // olcf.frontier   : LMOD_SITE_NAME=OLCF
+    //                   LMOD_SYSTEM_NAME=frontier
 
-template <class T, std::size_t N>
-std::string to_str (const Array<T,N> & a)
-{
-    std::ostringstream oss;
-    oss << "(";
-    bool first = true;
-    for (auto const& item : a) {
-        if (!first) { oss << ","; }
-        oss << item;
-        first = false;
-    }
-    oss << ")";
-    return oss.str();
-}
+    auto const* env_nersc_host = std::getenv("NERSC_HOST");
+    auto const* env_lmod_site_name = std::getenv("LMOD_SITE_NAME");
+    auto const* env_lmod_system_name = std::getenv("LMOD_SYSTEM_NAME");
+    auto const* env_slurm_cluster_name = std::getenv("SLURM_CLUSTER_NAME");
 
-template <class T>
-std::string to_str (const Vector<T> & v)
-{
-    std::ostringstream oss;
-    oss << "(";
-    bool first = true;
-    for (auto const& item : v) {
-        if (!first) { oss << ","; }
-        oss << item;
-        first = false;
+    if (env_nersc_host && env_lmod_system_name) {
+        s_name = std::string("nersc.");
+        s_name.append(env_lmod_system_name);
+    } else if (env_lmod_site_name && env_lmod_system_name) {
+        s_name = std::string(env_lmod_site_name);
+        s_name.append(".").append(env_lmod_system_name);
+    } else if (env_slurm_cluster_name) {
+        s_name = std::string(env_slurm_cluster_name);
     }
-    oss << ")";
-    return oss.str();
-}
 
-Vector<int> get_subgroup_ranks ()
-{
-    int rank_n = ParallelContext::NProcsSub();
-    Vector<int> lranks(rank_n);
-    for (int i = 0; i < rank_n; ++i) {
-        lranks[i] = i;
+    if ( ! s_name.empty()) {
+        s_name = amrex::toLower(std::move(s_name));
     }
-    Vector<int> granks(rank_n);
-    ParallelContext::local_to_global_rank(granks.data(), lranks.data(), rank_n);
-    return granks;
+
+    amrex::ExecOnFinalize(Machine::Finalize);
 }
 
-int pair_n (int x) {
-    return x*(x-1)/2;
-}
+void Finalize () {}
 
-int df_dist (const Coord & a, const Coord & b)
+std::string const& name ()
 {
-    if (a[3] != b[3]) {
-        // large penalty for traversing across groups
-        return 20;
-    } else {
-        // same group
-        int slot_diff = (a[1] != b[1] ? 1 : 0);
-        int chas_diff = (a[2] != b[2] ? 1 : 0);
-        if (slot_diff + chas_diff == 0 && a[0] == b[0]) {
-            // same node
-            return 0;
-        } else {
-            // add 2 for first and last node-to-switch hops
-            return 2 + slot_diff + chas_diff;
-        }
-    }
+    return s_name;
 }
 
-Coord id_to_coord (int id)
-{
-    // TODO: implement support for other types of networks
-    return df_id_to_coord(id);
 }
-
-int dist (const Coord & a, const Coord & b)
-{
-    // TODO: implement support for other types of networks
-    return df_dist(a, b);
-}
-
-struct Candidate
-{
-    int id;
-    Coord coord;
-    // how many ranks on this node
-    int rank_n = 0;
-    // sum of pairwise rank distances from the candidate node to already chosen nodes
-    int sum_dist = 0;
-
-    Candidate () = default;
-    Candidate (int i) : id(i), coord(id_to_coord(id)) {}
-};
-
-class NeighborhoodCache
-{
-  public:
-    void add (uint64_t key, Vector<int> val) {
-        AMREX_ASSERT(cache.count(key) == 0);
-        cache[key] = std::move(val);
-    }
-    bool get (uint64_t key, Vector<int> & val) {
-        bool result = cache.count(key) > 0;
-        if (result) {
-            val = cache.at(key);
-        }
-        return result;
-    }
-
-    // result is dependent on both the current set of ranks
-    // and the size of the neighborhood desired
-    static uint64_t hash (const Vector<int> & cur_ranks, int nbh_rank_n) {
-        auto result = hash_vector(cur_ranks);
-        hash_combine(result, nbh_rank_n);
-        return result;
-    }
-
-  private:
-    std::unordered_map<uint64_t, Vector<int>> cache;
-};
-
-class Machine
-{
-  public:
-    Machine () {
-        get_params();
-        get_machine_envs();
-        node_ids = get_node_ids();
-    }
-
-    // find a compact neighborhood of size rank_n in the current ParallelContext subgroup
-    Vector<int> find_best_nbh (int nbh_rank_n, bool flag_local_ranks)
-    {
-        BL_PROFILE("Machine::find_best_nbh()");
-
-        auto sg_g_ranks = get_subgroup_ranks();
-        auto sg_rank_n = sg_g_ranks.size();
-        if (flag_verbose) {
-            Print() << "Machine::find_best_nbh(): called for " << nbh_rank_n
-                    << " of " << sg_rank_n << " ranks" << '\n';
-        }
-
-        Vector<int> result;
-        auto key = NeighborhoodCache::hash(sg_g_ranks, nbh_rank_n);
-        if (nbh_cache.get(key, result)) {
-            if (flag_verbose) {
-                Print() << "Machine::find_best_nbh(): found neighborhood in cache" << '\n';
-            }
-        } else {
-            // get node IDs of current subgroup
-            Vector<int> sg_node_ids(sg_rank_n);
-            std::unordered_map<int, Vector<int>> node_ranks;
-            for (int i = 0; i < sg_rank_n; ++i) {
-                AMREX_ASSERT(sg_g_ranks[i] >= 0 && sg_g_ranks[i] < node_ids.size());
-                sg_node_ids[i] = node_ids[sg_g_ranks[i]];
-                if (flag_local_ranks) {
-                    node_ranks[sg_node_ids[i]].push_back(i);
-                } else {
-                    node_ranks[sg_node_ids[i]].push_back(sg_g_ranks[i]);
-                }
-            }
-
-            if (flag_very_verbose) {
-                Print() << "SubRank: GloRank: Node ID: Node Coord:" << '\n';
-                for (int i = 0; i < sg_rank_n; ++i) {
-                    Print() << "  " << i << ": " << sg_g_ranks[i] << ": " << sg_node_ids[i]
-                            << ": " << to_str(id_to_coord(sg_node_ids[i])) << '\n';
-                }
-            }
-
-            Vector<int> local_nbh;
-            double score;
-            auto rank_me = ParallelContext::MyProcSub();
-            tie(local_nbh, score) = search_local_nbh(rank_me, sg_node_ids, nbh_rank_n);
-
-            if (flag_verbose) {
-                Vector<int> base_nbh;
-                double base_score;
-                tie(base_nbh, base_score) = baseline_score(sg_node_ids, nbh_rank_n);
-
-                Print() << "Baseline neighborhood: " << to_str(base_nbh) << ", score = " << base_score << '\n'
-                        << "Rank 0's neighborhood: " << to_str(local_nbh) << ", score = " << score << '\n';
-            }
-
-            // determine the best neighborhood among ranks
-            DoubleInt my_score_with_id {score, rank_me}, min_score_with_id;
-            MPI_Allreduce(&my_score_with_id, &min_score_with_id, 1, MPI_DOUBLE_INT, MPI_MINLOC, ParallelContext::CommunicatorSub());
-            double winner_score = min_score_with_id.d;
-            int winner_rank = min_score_with_id.i;
-
-            // broadcast the best hood from winner rank to everyone
-            auto local_nbh_size = static_cast<int>(local_nbh.size());
-            MPI_Bcast(&local_nbh_size, 1, MPI_INT, winner_rank, ParallelContext::CommunicatorSub());
-            local_nbh.resize(local_nbh_size);
-            MPI_Bcast(local_nbh.data(), local_nbh_size, MPI_INT, winner_rank, ParallelContext::CommunicatorSub());
-
-            std::sort(local_nbh.begin(), local_nbh.end());
-            if (flag_verbose) {
-                Print() << "Winning neighborhood: " << winner_rank << ": " << to_str(local_nbh)
-                        << ", score = " << winner_score << '\n';
-            }
-
-            result.reserve(nbh_rank_n);
-            for (int i : local_nbh) {
-                for (auto rank : node_ranks.at(i)) {
-                    if (result.size() < nbh_rank_n) {
-                        result.push_back(rank);
-                    }
-                }
-            }
-            nbh_cache.add(key, result);
-        }
-
-        if (flag_very_verbose) {
-            Print() << "Ranks in neighborhood: " << to_str(result) << '\n';
-        }
-
-        return result;
-    }
-
-  private:
-
-    std::string hostname;
-    std::string nersc_host;
-    std::string cluster_name;
-    std::string partition;
-    std::string node_list;
-    std::string topo_addr;
-
-    int flag_verbose = 0;
-    int flag_very_verbose = 0;
-    bool flag_nersc_df;
-    // int my_node_id;
-    Vector<int> node_ids;
-
-    NeighborhoodCache nbh_cache;
-
-    void get_params ()
-    {
-        ParmParse pp("amrex.machine");
-        pp.query("verbose", flag_verbose);
-        pp.query("very_verbose", flag_very_verbose);
-    }
-
-    static std::string get_env_str (const std::string& env_key)
-    {
-        std::string result;
-        auto *val_c_str = std::getenv(env_key.c_str());
-        if (val_c_str) {
-            result = std::string(val_c_str);
-        }
-        return result;
-    }
-
-    void get_machine_envs ()
-    {
-        hostname = get_env_str("HOSTNAME");
-        nersc_host = get_env_str("NERSC_HOST");
-        cluster_name = get_env_str("SLURM_CLUSTER_NAME");
-#ifdef AMREX_USE_CUDA
-        flag_nersc_df = false;
-#else
-        flag_nersc_df = (nersc_host == "cori" ||
-                         nersc_host == "saul");
-#endif
-
-        if (flag_nersc_df) {
-            partition = get_env_str("SLURM_JOB_PARTITION");
-            node_list = get_env_str("SLURM_NODELIST");
-            topo_addr = get_env_str("SLURM_TOPOLOGY_ADDR");
-
-            if (flag_verbose) {
-                Print() << "HOSTNAME = " << hostname << '\n'
-                        << "NERSC_HOST = " << nersc_host << '\n'
-                        << "SLURM_JOB_PARTITION = " << partition << '\n'
-                        << "SLURM_NODELIST = " << node_list << '\n'
-                        << "SLURM_TOPOLOGY_ADDR = " << topo_addr << '\n';
-            }
-        }
-    }
-
-    // get this rank's machine node ID
-    int get_my_node_id ()
-    {
-        int result = -1;
-        if (flag_nersc_df) {
-            std::string tag = "nid";
-            auto pos = topo_addr.find(tag);
-            if (pos != std::string::npos) {
-                result = stoi(topo_addr.substr(pos + tag.size())); // assumes format ".*nid(\d+)"
-                if (flag_verbose) {
-                    Print() << "Got node ID from SLURM_TOPOLOGY_ADDR: " << result << '\n';
-                }
-            } else {
-                if (cluster_name == "escori") {
-                    tag = "cgpu";
-                }
-                auto mpi_proc_name = get_mpi_processor_name();
-                Print() << "MPI_Get_processor_name: " << mpi_proc_name << '\n';
-                pos = mpi_proc_name.find(tag);
-                if (pos != std::string::npos) {
-                    result = stoi(mpi_proc_name.substr(pos + tag.size())); // assumes format ".*nid(\d+)"
-                    if (flag_verbose) {
-                        Print() << "Got node ID from MPI_Get_processor_name(): " << result << '\n';
-                    }
-                }
-            }
-
-            // check result
-            AMREX_ALWAYS_ASSERT(result != -1);
-#ifdef AMREX_DEBUG
-            auto coord = read_df_node_coord(nersc_host);
-            int id_from_coord = df_coord_to_id(coord);
-            AMREX_ALWAYS_ASSERT(id_from_coord == result);
-#endif
-        } else {
-            result = 0;
-        }
-
-        return result;
-    }
-
-    // get all node IDs in this job, indexed by job rank
-    // this is collective over ALL ranks in the job
-    Vector<int> get_node_ids ()
-    {
-        Vector<int> ids(ParallelDescriptor::NProcs(), 0);
-        int node_id = get_my_node_id();
-        ParallelAllGather::AllGather(node_id, ids.data(), ParallelContext::CommunicatorAll());
-        if (flag_verbose) {
-            std::map<int, Vector<int>> node_ranks;
-            for (int i = 0; i < ids.size(); ++i) {
-                node_ranks[ids[i]].push_back(i);
-            }
-            Print() << "Node ID: Node Coord: Ranks:" << '\n';
-            for (const auto & p : node_ranks) {
-                Print() << "  " << p.first << ": " << to_str(id_to_coord(p.first))
-                        << ": " << to_str(p.second) << '\n';
-            }
-        }
-        return ids;
-    }
-
-    // do a local search starting at current node
-    std::pair<Vector<int>, double>
-    baseline_score(const Vector<int> & sg_node_ids, int nbh_rank_n) const
-    {
-        AMREX_ASSERT(!sg_node_ids.empty() && nbh_rank_n > 0 &&
-                     nbh_rank_n <= sg_node_ids.size());
-
-        // construct map of node candidates to select
-        std::map<int, Candidate> cand_map;
-        for (int i = 0; i < nbh_rank_n; ++i) {
-            auto node_id = sg_node_ids[i];
-            if (cand_map.count(node_id) == 0) {
-                cand_map[node_id] = Candidate(node_id);
-            }
-            cand_map.at(node_id).rank_n++;
-        }
-
-        Vector<int> result(cand_map.size());
-        Vector<Candidate> candidates(cand_map.size());
-        int idx = 0;
-        for (auto & p : cand_map) {
-            result[idx] = p.second.id;
-            candidates[idx++] = p.second;
-        }
-
-        int sum_dist = 0;
-        for (int j = 1; j < candidates.size(); ++j) {
-            const auto & b = candidates[j];
-            for (int i = 0; i < j; ++i) {
-                const auto & a = candidates[i];
-                auto pair_dist = dist(a.coord, b.coord);
-                // multiply distance by number of rank pairs across the two nodes
-                sum_dist += pair_dist * (a.rank_n * b.rank_n);
-                if (flag_very_verbose) {
-                    Print() << "  Distance from " << a.id
-                            << " to " << b.id
-                            << ": " << pair_dist << '\n';
-                }
-            }
-        }
-        double score = (nbh_rank_n > 1) ?
-                       (static_cast<double>(sum_dist) / pair_n(nbh_rank_n)) : 0;
-        return std::make_pair(std::move(result), score);
-    }
-
-    // do a local search starting at current node
-    std::pair<Vector<int>, double>
-    search_local_nbh(int rank_me, const Vector<int> & sg_node_ids, int nbh_rank_n) const
-    {
-        BL_PROFILE("Machine::search_local_nbh()");
-
-        if (amrex::Verbose() > 0) {
-            Print() << "Machine::search_local_nbh() called ..." << '\n';
-        }
-
-        Vector<int> result;
-
-        // construct map of node candidates to select
-        std::map<int, Candidate> candidates;
-        for (auto node_id : sg_node_ids) {
-            if (candidates.count(node_id) == 0) {
-                candidates[node_id] = Candidate(node_id);
-            }
-            candidates.at(node_id).rank_n++;
-        }
-
-        if (flag_very_verbose) {
-            Print() << "  Candidates:" << '\n';
-            for (const auto & p : candidates) {
-                const auto & cand = p.second;
-                Print() << "    " << cand.id << " : " << to_str(cand.coord)
-                        << ": " << cand.rank_n << " ranks" << '\n';
-            }
-        }
-
-        AMREX_ASSERT(rank_me >= 0 && rank_me < sg_node_ids.size());
-        Candidate cur_node = candidates.at(sg_node_ids[rank_me]);
-        candidates.erase(cur_node.id);
-
-        // add source_node
-        result.push_back(cur_node.id);
-        int total_rank_n = cur_node.rank_n;
-        int total_pairs_dist = 0;
-        if (flag_verbose) {
-            Print() << "  Added " << cur_node.id
-                    << ": " << to_str(cur_node.coord)
-                    << ", ranks: " << cur_node.rank_n
-                    << ", total ranks: " << total_rank_n
-                    << ", avg dist: " << 0 << '\n';
-        }
-        if (total_rank_n >= nbh_rank_n) {
-            return {std::move(result), 0};
-        }
-
-        double min_avg_dist;
-        while (total_rank_n < nbh_rank_n)
-        {
-            min_avg_dist = std::numeric_limits<double>::max();
-            Candidate * next_node = nullptr;
-            // update candidates with their pairwise rank distances to cur_node
-            for (auto & p : candidates) {
-                Candidate & cand_node = p.second;
-                auto cand_dist = dist(cand_node.coord, cur_node.coord);
-                // multiply distance by number of rank pairs across the two nodes
-                cand_node.sum_dist += cand_dist * (cand_node.rank_n * cur_node.rank_n);
-                double avg_dist = static_cast<double>(cand_node.sum_dist + total_pairs_dist) /
-                                  pair_n(cand_node.rank_n + total_rank_n);
-                if (flag_very_verbose) {
-                    Print() << "  Distance from " << cand_node.id
-                            << " to " << cur_node.id
-                            << ": " << cand_dist
-                            << ", candidate avg: " << avg_dist << '\n';
-                }
-                // keep track of what should be the next node to add
-                if (avg_dist < min_avg_dist) {
-                    next_node = &cand_node;
-                    min_avg_dist = avg_dist;
-                }
-            }
-
-            if (next_node) {
-                cur_node = *next_node;
-                next_node = nullptr;
-                candidates.erase(cur_node.id);
-
-                // add cur_node to result
-                result.push_back(cur_node.id);
-                total_rank_n += cur_node.rank_n;
-                total_pairs_dist += cur_node.sum_dist;
-
-                if (flag_verbose) {
-                    Print() << "  Added " << cur_node.id
-                            << ": " << to_str(cur_node.coord)
-                            << ", ranks: " << cur_node.rank_n
-                            << ", total ranks: " << total_rank_n
-                            << ", avg dist: " << min_avg_dist << '\n';
-                }
-            }
-        }
-
-        return std::make_pair(std::move(result), min_avg_dist);
-    }
-};
-
-std::unique_ptr<Machine> the_machine;
-
-}
-
-namespace amrex::machine {
-
-void Initialize () {
-    the_machine = std::make_unique<Machine>();
-    amrex::ExecOnFinalize(machine::Finalize);
-}
-
-void Finalize () {
-    the_machine.reset();
-}
-
-Vector<int> find_best_nbh (int rank_n, bool flag_local_ranks) {
-    AMREX_ASSERT(the_machine);
-    return the_machine->find_best_nbh(rank_n, flag_local_ranks);
-}
-
-}
-
-#endif
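
For reference, here is a minimal standalone sketch (not part of the patch) of the rank-to-device remapping heuristic that the AMReX_GpuDevice.cpp hunk implements. `map_rank_to_device`, `machine`, `local_rank`, and `num_devices` are hypothetical stand-ins for `Machine::name()`, `ParallelDescriptor::MyRankInNode()`, and the detected device count; the reversal and the permutation table are the ones from the patch.

```cpp
#include <array>
#include <iostream>
#include <string>

// Sketch: pick a device id for a node-local MPI rank when multiple GPUs
// are visible to each rank, using the heuristics from the patch.
int map_rank_to_device (const std::string& machine, int local_rank, int num_devices)
{
    int device_id = local_rank % num_devices; // default round-robin assignment
    if (num_devices > 1) {
        if (machine == "nersc.perlmutter") {
            // perlmutter: the CPU/GPU mapping has the reverse order
            device_id = num_devices - device_id - 1;
        } else if (machine == "olcf.frontier" && num_devices == 8) {
            // frontier: permutation documented in the OLCF user guide
            constexpr std::array<int,8> gpu_order = {4,5,2,3,6,7,0,1};
            device_id = gpu_order[device_id];
        }
    }
    return device_id;
}

int main ()
{
    for (int rank = 0; rank < 4; ++rank) {
        std::cout << rank << " -> "
                  << map_rank_to_device("nersc.perlmutter", rank, 4) << '\n';
    }
}
```

On a 4-GPU perlmutter node this prints 0 -> 3, 1 -> 2, 2 -> 1, 3 -> 0, the same assignment as the manual `CUDA_VISIBLE_DEVICES=\$((3-SLURM_LOCALID))` workaround mentioned in the commit message.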
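And a minimal sketch (again not part of the patch) of how the new `Machine::Initialize()` composes a machine name from environment variables. `detect_machine` is a hypothetical standalone version; a plain `std::tolower` loop stands in for `amrex::toLower`, while the environment variables and the fallback order are the ones the patch reads.

```cpp
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <iostream>
#include <string>

// Sketch: compose a machine name such as "nersc.perlmutter" or
// "olcf.frontier" the way Machine::Initialize() in the patch does.
std::string detect_machine ()
{
    const char* nersc_host    = std::getenv("NERSC_HOST");
    const char* lmod_site     = std::getenv("LMOD_SITE_NAME");
    const char* lmod_system   = std::getenv("LMOD_SYSTEM_NAME");
    const char* slurm_cluster = std::getenv("SLURM_CLUSTER_NAME");

    std::string name;
    if (nersc_host && lmod_system) {        // e.g. "nersc.perlmutter"
        name = std::string("nersc.") + lmod_system;
    } else if (lmod_site && lmod_system) {  // e.g. "OLCF.frontier"
        name = std::string(lmod_site) + "." + lmod_system;
    } else if (slurm_cluster) {             // generic SLURM fallback
        name = slurm_cluster;
    }
    // lowercase the result, standing in for amrex::toLower
    std::transform(name.begin(), name.end(), name.begin(),
                   [] (unsigned char c) { return std::tolower(c); });
    return name; // empty if the machine is unknown
}

int main ()
{
    std::cout << "machine: " << detect_machine() << '\n';
}
```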