diff --git a/examples/client2.c b/examples/client2.c index 026f03bba9..f0af1a9d07 100644 --- a/examples/client2.c +++ b/examples/client2.c @@ -15,7 +15,7 @@ * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2013-2019 Intel, Inc. All rights reserved. * Copyright (c) 2015 Mellanox Technologies, Inc. All rights reserved. - * Copyright (c) 2021 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -128,6 +128,16 @@ int main(int argc, char **argv) PMIX_VALUE_RELEASE(val); fprintf(stderr, "Client %s:%d job size %d\n", myproc.nspace, myproc.rank, nprocs); +#ifdef PMIX_GPU_SUPPORT + /* see if we were given a GPU directive */ + rc = PMIx_Get(&proc, PMIX_GPU_SUPPORT, NULL, 0, &val); + if (PMIX_SUCCESS == rc) { + fprintf(stderr, "%s:%d GPU support: %s\n", myproc.nspace, myproc.rank, val->data.flag ? "ENABLED" : "DISABLED"); + } else { + fprintf(stderr, "%s:%d GPU support: NOT GIVEN\n", myproc.nspace, myproc.rank); + } +#endif + /* put a data array of pmix_value's */ val = (pmix_value_t *) malloc(32 * sizeof(pmix_value_t)); for (n = 0; n < 32; n++) { diff --git a/src/mca/rmaps/base/rmaps_base_map_job.c b/src/mca/rmaps/base/rmaps_base_map_job.c index f0fc5510eb..cc4f5becec 100644 --- a/src/mca/rmaps/base/rmaps_base_map_job.c +++ b/src/mca/rmaps/base/rmaps_base_map_job.c @@ -17,7 +17,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2019 UT-Battelle, LLC. All rights reserved. * - * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ * @@ -80,6 +80,7 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) pmix_data_array_t *darray = NULL; pmix_list_t nodes; int slots, len; + bool flag, *fptr; PRTE_HIDE_UNUSED_PARAMS(fd, args); @@ -99,6 +100,7 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) memset(&options, 0, sizeof(prte_rmaps_options_t)); options.stream = prte_rmaps_base_framework.framework_output; options.verbosity = 5; // usual value for base-level functions + fptr = &flag; /* check and set some general options */ if (prte_get_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL)) { @@ -286,6 +288,13 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) } } } + /* if not already assigned, inherit the parent's GPU support directive */ + if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_GPU_SUPPORT, NULL, PMIX_BOOL)) { + if (prte_get_attribute(&parent->attributes, PRTE_JOB_GPU_SUPPORT, (void **) &fptr, PMIX_BOOL)) { + prte_set_attribute(&jdata->attributes, PRTE_JOB_GPU_SUPPORT, PRTE_ATTR_GLOBAL, fptr, PMIX_BOOL); + } + } + } else { if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, NULL, PMIX_BOOL) && !prte_get_attribute(&jdata->attributes, PRTE_JOB_CORE_CPUS, NULL, PMIX_BOOL)) { diff --git a/src/mca/schizo/ompi/schizo_ompi.c b/src/mca/schizo/ompi/schizo_ompi.c index 3fb46e7ca6..477c5e13e5 100644 --- a/src/mca/schizo/ompi/schizo_ompi.c +++ b/src/mca/schizo/ompi/schizo_ompi.c @@ -18,7 +18,7 @@ * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2018-2022 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. * Copyright (c) 2022-2024 Triad National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ @@ -198,7 +198,9 @@ static struct option ompioptions[] = { /* mpiexec mandated form launch key parameters - MPI 4.0 */ PMIX_OPTION_DEFINE("initial-errhandler", PMIX_ARG_REQD), /* mpiexec mandated form launch key parameters - MPI 4.1*/ - PMIX_OPTION_DEFINE("memory-alloc-kinds", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE(PRTE_CLI_MEM_ALLOC_KIND, PMIX_ARG_REQD), + /* GPU support - on/off */ + PMIX_OPTION_DEFINE(PRTE_CLI_GPU_SUPPORT, PMIX_ARG_REQD), /* Display Commumication Protocol : MPI_Init */ PMIX_OPTION_DEFINE("display-comm", PMIX_ARG_NONE), @@ -1603,7 +1605,7 @@ static int parse_env(char **srcenv, char ***dstenv, } } - if (NULL != (opt = pmix_cmd_line_get_param(results, "memory-alloc-kinds"))) { + if (NULL != (opt = pmix_cmd_line_get_param(results, PRTE_CLI_MEM_ALLOC_KIND))) { rc = check_cache(&cache, &cachevals, "mpi_memory_alloc_kinds", opt->values[0]); if (PRTE_SUCCESS != rc) { PMIX_ARGV_FREE_COMPAT(cache); diff --git a/src/mca/schizo/prte/schizo_prte.c b/src/mca/schizo/prte/schizo_prte.c index 93eb9e6191..ca48562ef4 100644 --- a/src/mca/schizo/prte/schizo_prte.c +++ b/src/mca/schizo/prte/schizo_prte.c @@ -18,7 +18,7 @@ * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2018-2022 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -197,6 +197,7 @@ static struct option prterunoptions[] = { PMIX_OPTION_DEFINE(PRTE_CLI_DO_NOT_AGG_HELP, PMIX_ARG_NONE), PMIX_OPTION_DEFINE(PRTE_CLI_FWD_ENVIRON, PMIX_ARG_OPTIONAL), PMIX_OPTION_DEFINE(PRTE_CLI_MEM_ALLOC_KIND, PMIX_ARG_REQD), + PMIX_OPTION_DEFINE(PRTE_CLI_GPU_SUPPORT, PMIX_ARG_REQD), // output options PMIX_OPTION_DEFINE(PRTE_CLI_OUTPUT, PMIX_ARG_REQD), @@ -312,6 +313,7 @@ static struct option prunoptions[] = { PMIX_OPTION_DEFINE(PRTE_CLI_DO_NOT_AGG_HELP, PMIX_ARG_NONE), PMIX_OPTION_DEFINE(PRTE_CLI_FWD_ENVIRON, PMIX_ARG_OPTIONAL), PMIX_OPTION_DEFINE(PRTE_CLI_MEM_ALLOC_KIND, PMIX_ARG_REQD), + PMIX_OPTION_DEFINE(PRTE_CLI_GPU_SUPPORT, PMIX_ARG_REQD), // output options PMIX_OPTION_DEFINE(PRTE_CLI_OUTPUT, PMIX_ARG_REQD), diff --git a/src/prted/pmix/pmix_server_dyn.c b/src/prted/pmix/pmix_server_dyn.c index 88e9ef003a..6b0f928cb2 100644 --- a/src/prted/pmix/pmix_server_dyn.c +++ b/src/prted/pmix/pmix_server_dyn.c @@ -655,6 +655,13 @@ int prte_pmix_xfer_job_info(prte_job_t *jdata, prte_set_attribute(&jdata->attributes, PRTE_JOB_NOAGG_HELP, PRTE_ATTR_GLOBAL, &flag, PMIX_BOOL); +#ifdef PMIX_GPU_SUPPORT + } else if (PMIX_CHECK_KEY(info, PMIX_GPU_SUPPORT)) { + flag = PMIX_INFO_TRUE(info); + prte_set_attribute(&jdata->attributes, PRTE_JOB_GPU_SUPPORT, PRTE_ATTR_GLOBAL, + &flag, PMIX_BOOL); +#endif + /*** DEFAULT - CACHE FOR INCLUSION WITH JOB INFO ***/ } else { pmix_server_cache_job_info(jdata, info); diff --git a/src/prted/pmix/pmix_server_register_fns.c b/src/prted/pmix/pmix_server_register_fns.c index f9061696d6..39ce230cab 100644 --- a/src/prted/pmix/pmix_server_register_fns.c +++ b/src/prted/pmix/pmix_server_register_fns.c @@ -376,6 +376,13 @@ int prte_pmix_server_register_nspace(prte_job_t *jdata) } #endif + // check for GPU directives +#ifdef PMIX_GPU_SUPPORT + if (prte_get_attribute(&jdata->attributes, PRTE_JOB_GPU_SUPPORT, (void**)&fptr, PMIX_BOOL)) { + PMIX_INFO_LIST_ADD(ret, info, PMIX_GPU_SUPPORT, 
&flag, PMIX_BOOL); + } +#endif + /* for each app in the job, create an app-array */ for (n = 0; n < jdata->apps->size; n++) { if (NULL == (app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, n))) { diff --git a/src/prted/prted.h b/src/prted/prted.h index e714ffd798..9caf363aa7 100644 --- a/src/prted/prted.h +++ b/src/prted/prted.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2019 Intel, Inc. All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -53,6 +53,9 @@ PRTE_EXPORT int prun_common(pmix_cli_result_t *cli, prte_schizo_base_module_t *schizo, int argc, char **argv); +PRTE_EXPORT int prte_prun_parse_common_cli(void *jinfo, pmix_cli_result_t *results, + prte_schizo_base_module_t *schizo, + pmix_list_t *apps); END_C_DECLS #endif /* PRTED_H */ diff --git a/src/prted/prun_common.c b/src/prted/prun_common.c index 3d2897aa25..55d1f88433 100644 --- a/src/prted/prun_common.c +++ b/src/prted/prun_common.c @@ -19,7 +19,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Geoffroy Vallee. All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All Rights * reserved. 
* $COPYRIGHT$ @@ -307,7 +307,7 @@ int prun_common(pmix_cli_result_t *results, prte_schizo_base_module_t *schizo, int pargc, char **pargv) { - int rc = 1, i; + int rc = 1; char *param, *ptr; prte_pmix_lock_t lock, rellock; pmix_list_t apps; @@ -375,7 +375,7 @@ int prun_common(pmix_cli_result_t *results, /* setup the job data global table */ prte_job_data = PMIX_NEW(pmix_pointer_array_t); ret = pmix_pointer_array_init(prte_job_data, PRTE_GLOBAL_ARRAY_BLOCK_SIZE, - PRTE_GLOBAL_ARRAY_MAX_SIZE, + PRTE_GLOBAL_ARRAY_MAX_SIZE, PRTE_GLOBAL_ARRAY_BLOCK_SIZE); if (PRTE_SUCCESS != ret) { PRTE_ERROR_LOG(ret); @@ -542,135 +542,6 @@ int prun_common(pmix_cli_result_t *results, /* we want to be notified upon job completion */ PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_NOTIFY_COMPLETION, &flag, PMIX_BOOL); - /* pass the personality */ - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_PERSONALITY, schizo->name, PMIX_STRING); - - /* get display options */ - opt = pmix_cmd_line_get_param(results, PRTE_CLI_DISPLAY); - if (NULL != opt) { - ret = prte_schizo_base_parse_display(opt, jinfo); - if (PRTE_SUCCESS != ret) { - PRTE_UPDATE_EXIT_STATUS(PRTE_ERR_FATAL); - goto DONE; - } - } - - /* check for output options */ - opt = pmix_cmd_line_get_param(results, PRTE_CLI_OUTPUT); - if (NULL != opt) { - ret = prte_schizo_base_parse_output(opt, jinfo); - if (PRTE_SUCCESS != ret) { - PRTE_UPDATE_EXIT_STATUS(PRTE_ERR_FATAL); - goto DONE; - } - } - - /* check for runtime options */ - opt = pmix_cmd_line_get_param(results, PRTE_CLI_RTOS); - if (NULL != opt) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_RUNTIME_OPTIONS, opt->values[0], PMIX_STRING); - } - - /* check what user wants us to do with stdin */ - opt = pmix_cmd_line_get_param(results, PRTE_CLI_STDIN); - if (NULL != opt) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_STDIN_TGT, opt->values[0], PMIX_STRING); - } - - opt = pmix_cmd_line_get_param(results, PRTE_CLI_MAPBY); - if (NULL != opt) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_MAPBY, opt->values[0], PMIX_STRING); - } - - /* 
if the user specified a ranking policy, then set it */ - opt = pmix_cmd_line_get_param(results, PRTE_CLI_RANKBY); - if (NULL != opt) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_RANKBY, opt->values[0], PMIX_STRING); - } - - /* if the user specified a binding policy, then set it */ - opt = pmix_cmd_line_get_param(results, PRTE_CLI_BINDTO); - if (NULL != opt) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_BINDTO, opt->values[0], PMIX_STRING); - } - - /* check for an exec agent */ - opt = pmix_cmd_line_get_param(results, PRTE_CLI_EXEC_AGENT); - if (NULL != opt) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_EXEC_AGENT, opt->values[0], PMIX_STRING); - } - - /* mark if recovery was enabled on the cmd line */ - if (pmix_cmd_line_is_taken(results, PRTE_CLI_ENABLE_RECOVERY)) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_JOB_RECOVERABLE, &flag, PMIX_BOOL); - } - /* record the max restarts */ - opt = pmix_cmd_line_get_param(results, PRTE_CLI_MAX_RESTARTS); - if (NULL != opt) { - ui32 = strtol(opt->values[0], NULL, 10); - PMIX_LIST_FOREACH(app, &apps, prte_pmix_app_t) - { - PMIX_INFO_LIST_ADD(ret, app->info, PMIX_MAX_RESTARTS, &ui32, PMIX_UINT32); - } - } - /* if continuous operation was specified */ - if (pmix_cmd_line_is_taken(results, PRTE_CLI_CONTINUOUS)) { - /* mark this job as continuously operating */ - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_JOB_CONTINUOUS, &flag, PMIX_BOOL); - } -#ifdef PMIX_ABORT_NONZERO_EXIT - /* if ignore non-zero exit was specified */ - if (pmix_cmd_line_is_taken(results, PRTE_CLI_TERM_NONZERO)) { - /* mark this job to not terminate if a proc exits with non-zero status */ - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_ABORT_NONZERO_EXIT, NULL, PMIX_BOOL); - } -#endif - /* if stop-on-exec was specified */ - if (pmix_cmd_line_is_taken(results, PRTE_CLI_STOP_ON_EXEC)) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_DEBUG_STOP_ON_EXEC, NULL, PMIX_BOOL); - } - - /* check for a job timeout specification, to be provided in seconds - * as that is what MPICH used - */ - i = 0; - opt = 
pmix_cmd_line_get_param(results, PRTE_CLI_TIMEOUT); - if (NULL != opt) { - i = strtol(opt->values[0], NULL, 10); - } else if (NULL != (param = getenv("MPIEXEC_TIMEOUT"))) { - i = strtol(param, NULL, 10); - } - if (0 != i) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_JOB_TIMEOUT, &i, PMIX_INT); - } - - if (pmix_cmd_line_is_taken(results, PRTE_CLI_STACK_TRACES)) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_TIMEOUT_STACKTRACES, &flag, PMIX_BOOL); - } - if (pmix_cmd_line_is_taken(results, PRTE_CLI_REPORT_STATE)) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_TIMEOUT_REPORT_STATE, &flag, PMIX_BOOL); - } - opt = pmix_cmd_line_get_param(results, PRTE_CLI_SPAWN_TIMEOUT); - if (NULL != opt) { - i = strtol(opt->values[0], NULL, 10); - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_SPAWN_TIMEOUT, &i, PMIX_INT); - } - opt = pmix_cmd_line_get_param(results, PRTE_CLI_DO_NOT_AGG_HELP); - if (NULL != opt) { - flag = false; - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_LOG_AGG, &flag, PMIX_BOOL); - } - -#ifdef PMIX_MEM_ALLOC_KIND - opt = pmix_cmd_line_get_param(results, PRTE_CLI_MEM_ALLOC_KIND); - if (NULL != opt) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_MEM_ALLOC_KIND, opt->values[0], PMIX_STRING); - } -#endif - - /* give the schizo components a chance to add to the job info */ - schizo->job_info(results, jinfo); - /* pickup any relevant envars */ ninfo = 4; PMIX_INFO_CREATE(iptr, ninfo); @@ -726,6 +597,11 @@ int prun_common(pmix_cli_result_t *results, goto DONE; } + ret = prte_prun_parse_common_cli(jinfo, results, schizo, &apps); + if (PRTE_SUCCESS != ret) { + goto DONE; + } + /* convert the job info into an array */ PMIX_INFO_LIST_CONVERT(ret, jinfo, &darray); iptr = (pmix_info_t *) darray.array; @@ -880,6 +756,162 @@ int prun_common(pmix_cli_result_t *results, return rc; } +/* Translate the command-line options shared by prte and prun into job-level info entries appended to jinfo (max-restarts is added to each app's info list instead); returns PRTE_SUCCESS, or the error from parsing the display/output options. */ +int prte_prun_parse_common_cli(void *jinfo, pmix_cli_result_t *results, + prte_schizo_base_module_t *schizo, + pmix_list_t *apps) +{ + pmix_cli_item_t *opt; + int ret, i; + uint32_t ui32; + bool flag = true; /* "option was taken" markers below pass &flag and must read a defined true value */ + prte_pmix_app_t *app; + char *param; + 
pmix_info_t info; + + /* pass the personality */ + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_PERSONALITY, schizo->name, PMIX_STRING); + + /* get display options */ + opt = pmix_cmd_line_get_param(results, PRTE_CLI_DISPLAY); + if (NULL != opt) { + ret = prte_schizo_base_parse_display(opt, jinfo); + if (PRTE_SUCCESS != ret) { + PRTE_UPDATE_EXIT_STATUS(PRTE_ERR_FATAL); + return ret; + } + } + + /* check for output options */ + opt = pmix_cmd_line_get_param(results, PRTE_CLI_OUTPUT); + if (NULL != opt) { + ret = prte_schizo_base_parse_output(opt, jinfo); + if (PRTE_SUCCESS != ret) { + PRTE_UPDATE_EXIT_STATUS(PRTE_ERR_FATAL); + return ret; + } + } + + /* check for runtime options */ + opt = pmix_cmd_line_get_param(results, PRTE_CLI_RTOS); + if (NULL != opt) { + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_RUNTIME_OPTIONS, opt->values[0], PMIX_STRING); + } + + /* check what user wants us to do with stdin */ + opt = pmix_cmd_line_get_param(results, PRTE_CLI_STDIN); + if (NULL != opt) { + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_STDIN_TGT, opt->values[0], PMIX_STRING); + } + + opt = pmix_cmd_line_get_param(results, PRTE_CLI_MAPBY); + if (NULL != opt) { + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_MAPBY, opt->values[0], PMIX_STRING); + } + + /* if the user specified a ranking policy, then set it */ + opt = pmix_cmd_line_get_param(results, PRTE_CLI_RANKBY); + if (NULL != opt) { + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_RANKBY, opt->values[0], PMIX_STRING); + } + + /* if the user specified a binding policy, then set it */ + opt = pmix_cmd_line_get_param(results, PRTE_CLI_BINDTO); + if (NULL != opt) { + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_BINDTO, opt->values[0], PMIX_STRING); + } + + /* check for an exec agent */ + opt = pmix_cmd_line_get_param(results, PRTE_CLI_EXEC_AGENT); + if (NULL != opt) { + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_EXEC_AGENT, opt->values[0], PMIX_STRING); + } + + /* mark if recovery was enabled on the cmd line */ + if (pmix_cmd_line_is_taken(results, PRTE_CLI_ENABLE_RECOVERY)) { + 
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_JOB_RECOVERABLE, &flag, PMIX_BOOL); + } + /* record the max restarts */ + opt = pmix_cmd_line_get_param(results, PRTE_CLI_MAX_RESTARTS); + if (NULL != opt) { + ui32 = strtol(opt->values[0], NULL, 10); + PMIX_LIST_FOREACH(app, apps, prte_pmix_app_t) + { + PMIX_INFO_LIST_ADD(ret, app->info, PMIX_MAX_RESTARTS, &ui32, PMIX_UINT32); + } + } + /* if continuous operation was specified */ + if (pmix_cmd_line_is_taken(results, PRTE_CLI_CONTINUOUS)) { + /* mark this job as continuously operating */ + flag = true; + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_JOB_CONTINUOUS, &flag, PMIX_BOOL); + } +#ifdef PMIX_ABORT_NONZERO_EXIT + /* if ignore non-zero exit was specified */ + if (pmix_cmd_line_is_taken(results, PRTE_CLI_TERM_NONZERO)) { + /* mark this job to not terminate if a proc exits with non-zero status */ + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_ABORT_NONZERO_EXIT, NULL, PMIX_BOOL); + } +#endif + /* if stop-on-exec was specified */ + if (pmix_cmd_line_is_taken(results, PRTE_CLI_STOP_ON_EXEC)) { + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_DEBUG_STOP_ON_EXEC, NULL, PMIX_BOOL); + } + + /* check for a job timeout specification, to be provided in seconds + * as that is what MPICH used + */ + i = 0; + opt = pmix_cmd_line_get_param(results, PRTE_CLI_TIMEOUT); + if (NULL != opt) { + i = strtol(opt->values[0], NULL, 10); + } else if (NULL != (param = getenv("MPIEXEC_TIMEOUT"))) { + i = strtol(param, NULL, 10); + } + if (0 != i) { + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_JOB_TIMEOUT, &i, PMIX_INT); + } + + if (pmix_cmd_line_is_taken(results, PRTE_CLI_STACK_TRACES)) { + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_TIMEOUT_STACKTRACES, &flag, PMIX_BOOL); + } + if (pmix_cmd_line_is_taken(results, PRTE_CLI_REPORT_STATE)) { + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_TIMEOUT_REPORT_STATE, &flag, PMIX_BOOL); + } + opt = pmix_cmd_line_get_param(results, PRTE_CLI_SPAWN_TIMEOUT); + if (NULL != opt) { + i = strtol(opt->values[0], NULL, 10); + PMIX_INFO_LIST_ADD(ret, jinfo, 
PMIX_SPAWN_TIMEOUT, &i, PMIX_INT); + } + opt = pmix_cmd_line_get_param(results, PRTE_CLI_DO_NOT_AGG_HELP); + if (NULL != opt) { + flag = false; + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_LOG_AGG, &flag, PMIX_BOOL); + } + +#ifdef PMIX_MEM_ALLOC_KIND + opt = pmix_cmd_line_get_param(results, PRTE_CLI_MEM_ALLOC_KIND); + if (NULL != opt) { + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_MEM_ALLOC_KIND, opt->values[0], PMIX_STRING); + } +#endif +#ifdef PMIX_GPU_SUPPORT + opt = pmix_cmd_line_get_param(results, PRTE_CLI_GPU_SUPPORT); + if (NULL != opt) { + // they could be enabling or disabling it + info.value.type = PMIX_STRING; + info.value.data.string = opt->values[0]; + flag = PMIX_INFO_TRUE(&info); + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_GPU_SUPPORT, &flag, PMIX_BOOL); + } +#endif + + /* give the schizo components a chance to add to the job info */ + schizo->job_info(results, jinfo); + + return PRTE_SUCCESS; +} + static void signal_forward_callback(int signum) { pmix_status_t rc; diff --git a/src/tools/prte/prte.c b/src/tools/prte/prte.c index 2ec8b5daeb..eb269c6b4a 100644 --- a/src/tools/prte/prte.c +++ b/src/tools/prte/prte.c @@ -19,7 +19,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Geoffroy Vallee. All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All Rights * reserved. * Copyright (c) 2022-2023 Triad National Security, LLC. 
All rights @@ -250,18 +250,16 @@ static char *pmix_getline(FILE *fp) int main(int argc, char *argv[]) { int rc = 1, i; - char *param, *timeoutenv, *tpath, *cptr; + char *param, *tpath, *cptr; prte_pmix_lock_t lock; pmix_list_t apps; prte_pmix_app_t *app; pmix_info_t *iptr, *iptr2, info; pmix_status_t ret; - bool flag; size_t n, ninfo, param_len; pmix_app_t *papps; size_t napps; mylock_t mylock; - uint32_t ui32; char **pargv, **split; int pargc; prte_job_t *jdata; @@ -1006,136 +1004,11 @@ int main(int argc, char *argv[]) PMIX_VALUE_RELEASE(val); } - /* pass the personality */ - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_PERSONALITY, personality, PMIX_STRING); - - /* get display options */ - opt = pmix_cmd_line_get_param(&results, PRTE_CLI_DISPLAY); - if (NULL != opt) { - ret = prte_schizo_base_parse_display(opt, jinfo); - if (PRTE_SUCCESS != ret) { - PRTE_UPDATE_EXIT_STATUS(PRTE_ERR_FATAL); - goto DONE; - } - } - - /* get output options */ - opt = pmix_cmd_line_get_param(&results, PRTE_CLI_OUTPUT); - if (NULL != opt) { - ret = prte_schizo_base_parse_output(opt, jinfo); - if (PRTE_SUCCESS != ret) { - PRTE_UPDATE_EXIT_STATUS(PRTE_ERR_FATAL); - goto DONE; - } - } - - /* check for runtime options */ - opt = pmix_cmd_line_get_param(&results, PRTE_CLI_RTOS); - if (NULL != opt) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_RUNTIME_OPTIONS, opt->values[0], PMIX_STRING); - } - - /* check what user wants us to do with stdin */ - opt = pmix_cmd_line_get_param(&results, PRTE_CLI_STDIN); - if (NULL != opt) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_STDIN_TGT, opt->values[0], PMIX_STRING); - } - - opt = pmix_cmd_line_get_param(&results, PRTE_CLI_MAPBY); - if (NULL != opt) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_MAPBY, opt->values[0], PMIX_STRING); - } - - /* if the user specified a ranking policy, then set it */ - opt = pmix_cmd_line_get_param(&results, PRTE_CLI_RANKBY); - if (NULL != opt) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_RANKBY, opt->values[0], PMIX_STRING); - } - - /* if the user 
specified a binding policy, then set it */ - opt = pmix_cmd_line_get_param(&results, PRTE_CLI_BINDTO); - if (NULL != opt) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_BINDTO, opt->values[0], PMIX_STRING); - } - - /* check for an exec agent */ - opt = pmix_cmd_line_get_param(&results, PRTE_CLI_EXEC_AGENT); - if (NULL != opt) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_EXEC_AGENT, opt->values[0], PMIX_STRING); - } - - /* mark if recovery was enabled */ - if (pmix_cmd_line_is_taken(&results, PRTE_CLI_ENABLE_RECOVERY)) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_JOB_RECOVERABLE, NULL, PMIX_BOOL); - } - /* record the max restarts */ - opt = pmix_cmd_line_get_param(&results, PRTE_CLI_MAX_RESTARTS); - if (NULL != opt) { - ui32 = strtol(opt->values[0], NULL, 10); - PMIX_LIST_FOREACH(app, &apps, prte_pmix_app_t) - { - PMIX_INFO_LIST_ADD(ret, app->info, PMIX_MAX_RESTARTS, &ui32, PMIX_UINT32); - } - } - /* if continuous operation was specified */ - if (pmix_cmd_line_is_taken(&results, PRTE_CLI_CONTINUOUS)) { - /* mark this job as continuously operating */ - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_JOB_CONTINUOUS, NULL, PMIX_BOOL); - } -#ifdef PMIX_ABORT_NONZERO_EXIT - /* if ignore non-zero exit was specified */ - if (pmix_cmd_line_is_taken(&results, PRTE_CLI_TERM_NONZERO)) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_ABORT_NONZERO_EXIT, NULL, PMIX_BOOL); - } -#endif - /* if stop-on-exec was specified */ - if (pmix_cmd_line_is_taken(&results, PRTE_CLI_STOP_ON_EXEC)) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_DEBUG_STOP_ON_EXEC, NULL, PMIX_BOOL); - } - - /* check for a job timeout specification, to be provided in seconds - * as that is what MPICH used - */ - timeoutenv = NULL; - opt = pmix_cmd_line_get_param(&results, PRTE_CLI_TIMEOUT); - if (NULL != opt || NULL != (timeoutenv = getenv("MPIEXEC_TIMEOUT"))) { - if (NULL != timeoutenv) { - i = strtol(timeoutenv, NULL, 10); - /* both cannot be present, or they must agree */ - if (NULL != opt) { - n = strtol(opt->values[0], NULL, 10); - if (i != (int)n) { - 
pmix_show_help("help-prun.txt", "prun:timeoutconflict", false, - prte_tool_basename, n, timeoutenv); - PRTE_UPDATE_EXIT_STATUS(1); - goto DONE; - } - } - } else { - i = strtol(opt->values[0], NULL, 10); - } - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_JOB_TIMEOUT, &i, PMIX_INT); - } - if (pmix_cmd_line_is_taken(&results, PRTE_CLI_STACK_TRACES)) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_TIMEOUT_STACKTRACES, NULL, PMIX_BOOL); - } - if (pmix_cmd_line_is_taken(&results, PRTE_CLI_REPORT_STATE)) { - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_TIMEOUT_REPORT_STATE, NULL, PMIX_BOOL); - } - opt = pmix_cmd_line_get_param(&results, PRTE_CLI_SPAWN_TIMEOUT); - if (NULL != opt) { - i = strtol(opt->values[0], NULL, 10); - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_SPAWN_TIMEOUT, &i, PMIX_INT); - } - opt = pmix_cmd_line_get_param(&results, PRTE_CLI_DO_NOT_AGG_HELP); - if (NULL != opt) { - flag = false; - PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_LOG_AGG, &flag, PMIX_BOOL); + ret = prte_prun_parse_common_cli(jinfo, &results, schizo, &apps); + if (PRTE_SUCCESS != ret) { + goto DONE; } - /* give the schizo components a chance to add to the job info */ - schizo->job_info(&results, jinfo); - /* convert the job info into an array */ PMIX_INFO_LIST_CONVERT(ret, jinfo, &darray); if (PMIX_ERR_EMPTY == ret) { diff --git a/src/util/attr.c b/src/util/attr.c index 7f1d3ec5dd..ca4fc54bc0 100644 --- a/src/util/attr.c +++ b/src/util/attr.c @@ -3,7 +3,7 @@ * Copyright (c) 2014-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2018-2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. * Copyright (c) 2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
@@ -509,6 +509,8 @@ const char *prte_attr_key_to_str(prte_attribute_key_t key) return "JOB BINDING LIMIT"; case PRTE_JOB_CHILD_SEP: return "CHILD SEP"; + case PRTE_JOB_GPU_SUPPORT: + return "GPU SUPPORT"; case PRTE_PROC_NOBARRIER: return "PROC-NOBARRIER"; diff --git a/src/util/attr.h b/src/util/attr.h index b7eb2d227d..485f9f5947 100644 --- a/src/util/attr.h +++ b/src/util/attr.h @@ -3,7 +3,7 @@ * Copyright (c) 2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. * Copyright (c) 2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. @@ -231,6 +231,7 @@ typedef uint16_t prte_job_flags_t; #define PRTE_JOB_CHILD_SEP (PRTE_JOB_START_KEY + 116) // bool - child job is to be considered independent // from its parent, do not terminate if // parent dies first +#define PRTE_JOB_GPU_SUPPORT (PRTE_JOB_START_KEY + 117) // bool - enable/disable GPU support in app #define PRTE_JOB_MAX_KEY (PRTE_JOB_START_KEY + 200) diff --git a/src/util/prte_cmd_line.h b/src/util/prte_cmd_line.h index 17990d020b..c0761cb911 100644 --- a/src/util/prte_cmd_line.h +++ b/src/util/prte_cmd_line.h @@ -15,7 +15,7 @@ * Copyright (c) 2016-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2017-2022 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -109,6 +109,7 @@ BEGIN_C_DECLS #define PRTE_CLI_ENABLE_RECOVERY "enable-recovery" // none #define PRTE_CLI_DISABLE_RECOVERY "disable-recovery" // none #define PRTE_CLI_MEM_ALLOC_KIND "memory-alloc-kinds" // required +#define PRTE_CLI_GPU_SUPPORT "gpu-support" // required // Placement options #define PRTE_CLI_MAPBY "map-by" // required