From 4ec4df21a0ee49a748d263f872e1c1b4d45e65c7 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 29 Oct 2024 06:34:10 -0600 Subject: [PATCH] Fix support for hetero LSF environments Don't assume that all nodes share the same topology as the HNP - it is an unnecessary restriction since we know the actual topology of each node. Signed-off-by: Ralph Castain --- .../show-help-files/help-rmaps_rank_file.txt | 6 +++ src/mca/rmaps/rank_file/rmaps_rank_file.c | 54 +++++++++++++++++-- 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/src/docs/show-help-files/help-rmaps_rank_file.txt b/src/docs/show-help-files/help-rmaps_rank_file.txt index 0f99de1900..210a5547ec 100644 --- a/src/docs/show-help-files/help-rmaps_rank_file.txt +++ b/src/docs/show-help-files/help-rmaps_rank_file.txt @@ -102,3 +102,9 @@ to a different process. If this is intentional then you must pass the "overload-allowed" qualifier to the --bind-to option. --bind-to :overload-allowed +# +[resource-not-found] +The specified LSF affinity file contained a node (%s) that is not in your +allocation. We therefore cannot map a process rank to it. Please +check your allocation and affinity file to ensure the latter only +contains allocated nodes. diff --git a/src/mca/rmaps/rank_file/rmaps_rank_file.c b/src/mca/rmaps/rank_file/rmaps_rank_file.c index d1a2401a41..acaf73aaa6 100644 --- a/src/mca/rmaps/rank_file/rmaps_rank_file.c +++ b/src/mca/rmaps/rank_file/rmaps_rank_file.c @@ -756,6 +756,28 @@ static int prte_rmaps_rf_process_lsf_affinity_hostfile(prte_job_t *jdata, return PRTE_SUCCESS; } +static bool quickmatch(prte_node_t *nd, char *name) +{ + int n; + + if (0 == strcmp(nd->name, name)) { + return true; + } + if (0 == strcmp(nd->name, prte_process_info.nodename) && + (0 == strcmp(name, "localhost") || + 0 == strcmp(name, "127.0.0.1"))) { + return true; + } + if (NULL != nd->aliases) { + for (n=0; NULL != nd->aliases[n]; n++) { + if (0 == strcmp(nd->aliases[n], name)) { + return true; + } + } + } + return false; +} + static int prte_rmaps_rf_lsf_convert_affinity_to_rankfile(char *affinity_file, char **aff_rankfile) { FILE *fp; @@ -765,9 +787,9 @@ static int prte_rmaps_rf_lsf_convert_affinity_to_rankfile(char *affinity_file, c char *tmp_str = NULL; size_t len; char **cpus; - int i; + int i, j; hwloc_obj_t obj; - prte_topology_t *my_topo = NULL; + prte_node_t *node, *nptr; if( NULL != *aff_rankfile) { free(*aff_rankfile); @@ -835,11 +857,33 @@ static int prte_rmaps_rf_lsf_convert_affinity_to_rankfile(char *affinity_file, c // Convert the Physical CPU set from LSF to a Hwloc logical CPU set pmix_output_verbose(20, prte_rmaps_base_framework.framework_output, "mca:rmaps:rf: (lsf) Convert Physical CPUSET from <%s>", sep); - my_topo = (prte_topology_t *) pmix_pointer_array_get_item(prte_node_topologies, 0); + + // find the named host + nptr = NULL; + for (j = 0; j < prte_node_pool->size; j++) { + node = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, j); + if (NULL == node) { + continue; + } + if (quickmatch(node, hstname)) { + nptr = node; + break; + } + } + if (NULL == nptr) { + /* wasn't found - that is an error */ + pmix_show_help("help-rmaps_rank_file.txt", + "resource-not-found", true, + hstname); + fclose(fp); + close(fp_rank); + return PRTE_ERROR; + } + cpus = PMIX_ARGV_SPLIT_COMPAT(sep, ','); for(i = 0; NULL != cpus[i]; ++i) { - // assume HNP has the same topology as other nodes - obj = hwloc_get_pu_obj_by_os_index(my_topo->topo, strtol(cpus[i], NULL, 10)) ; + // get the specified object + obj = hwloc_get_pu_obj_by_os_index(nptr->topology->topo, strtol(cpus[i], NULL, 10)) ; if (NULL == obj) { PMIX_ARGV_FREE_COMPAT(cpus); fclose(fp);