Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix add-hostfile and add-host operations #1851

Merged
merged 1 commit into from
Nov 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions src/mca/ras/base/help-ras-base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,18 @@ file could not be opened for reading:
File: %s

Please check the filename and try again.
#
[ras-base:nonuniform-slots]
A request was made to add hosts from a hostfile while operating
in a managed allocation. In this case, either the slots must be
specified in the given hostfile, or the number of slots assigned
by the resource manager on the existing nodes must be uniform.

The current allocation does not conform to that requirement:

Base number of slots: %d
Node: %s
Number of slots: %d

Please assign a number of slots for each node to be added to the
allocation.
107 changes: 88 additions & 19 deletions src/mca/ras/base/ras_base_allocate.c
Original file line number Diff line number Diff line change
Expand Up @@ -753,13 +753,14 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
{
int rc;
pmix_list_t nodes;
int i, k, n, slots;
int i, k, m, n, slots;
prte_app_context_t *app;
prte_node_t *node, *next, *nptr;
char *hosts, *line, *cptr, *ptr, **hostfiles;
char *hosts, *line, *cptr, *ptr, **hostfiles, *nm;
FILE *fp;
bool addslots, found;
bool extend = false;
int default_slots = -1;

PMIX_CONSTRUCT(&nodes, pmix_list_t);

Expand All @@ -776,6 +777,32 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
* can be present
*/

/* if we are in a managed allocation, the best we can do for nodes
* that do not include a specific slot assignment is to (a) check
* to see if there is a uniform assignment on existing nodes and
* use that, or (b) generate an error as we cannot know what the
* host environment might have set
*/
if (prte_managed_allocation) {
for (n = 0; n < prte_node_pool->size; n++) {
nptr = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, n);
if (NULL == nptr) {
continue;
}
if (-1 == default_slots) {
default_slots = nptr->slots;
continue;
}
if (default_slots != nptr->slots) {
// generate an error message
pmix_show_help("help-ras-base.txt", "ras-base:nonuniform-slots", true,
default_slots, nptr->name, nptr->slots);
PMIX_LIST_DESTRUCT(&nodes);
return PRTE_ERR_SILENT;
}
}
}

for (i = 0; i < jdata->apps->size; i++) {
if (NULL == (app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, i))) {
continue;
Expand Down Expand Up @@ -819,28 +846,35 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
free(line);
continue;
}

addslots = false;
// because there can be arbitrary whitespace around keywords,
// we manually parse the line to get the directives
ptr = cptr;
while (NULL != ptr && !isspace(*ptr)) {
while ('\0' != *ptr && !isspace(*ptr)) {
++ptr;
}
*ptr = '\0';
if ('\0' == *ptr) {
// end of the line - just the node name was given
slots = default_slots;
goto process;
}
*ptr = '\0'; // terminate the name
// find the '=' sign
++ptr;
ptr = strchr(ptr, '=');
if (NULL == ptr) {
// didn't specify slots - autodetect them
slots = -1;
while ('\0' != *ptr && ('=' != *ptr || isspace(*ptr))) {
++ptr;
}
if ('\0' == *ptr) {
// didn't specify slots - use the default value
slots = default_slots;
goto process;
}
// find the value
++ptr;
while (NULL != ptr && '\0' != *ptr && isspace(*ptr)) {
while ('\0' != *ptr && isspace(*ptr)) {
++ptr;
}
if (NULL == ptr || '\0' == *ptr) {
if ('\0' == *ptr) {
// bad syntax
PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM);
fclose(fp);
Expand All @@ -851,7 +885,6 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
}
// if it is a '+' or '-', then we are adjusting
// the #slots
addslots = false;
if ('+' == *ptr || '-' == *ptr) {
addslots = true;
}
Expand All @@ -860,23 +893,42 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
process:
// see if we have this node
found = false;
for (n = 0; n < prte_node_pool->size; n++) {
// does the name refer to me?
if (prte_check_host_is_local(cptr)) {
nm = prte_process_info.nodename;
} else {
nm = cptr;
}

for (n = 0; !found && n < prte_node_pool->size; n++) {
nptr = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, n);
if (NULL == nptr) {
continue;
}
if (0 == strcmp(cptr, nptr->name)) {
if (0 == strcmp(nm, nptr->name)) {
// we have the node
if (addslots) {
nptr->slots += slots;
if (0 > nptr->slots) {
nptr->slots = 0;
}
} else {
nptr->slots = slots;
}
found = true;
break;
} else if (NULL != nptr->aliases) {
/* no choice but an exhaustive search - fortunately, these lists are short! */
for (m = 0; NULL != nptr->aliases[m]; m++) {
if (0 == strcmp(cptr, nptr->aliases[m])) {
if (addslots) {
nptr->slots += slots;
if (0 > nptr->slots) {
nptr->slots = 0;
}
}
found = true;
break;
}
}
}
}
if (!found) {
Expand Down Expand Up @@ -942,7 +994,8 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
PMIX_LIST_FOREACH_SAFE(node, next, &nodes, prte_node_t)
{
node->state = PRTE_NODE_STATE_ADDED;
for (n = 0; n < prte_node_pool->size; n++) {
found = false;
for (n = 0; !found && n < prte_node_pool->size; n++) {
nptr = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, n);
if (NULL == nptr) {
continue;
Expand All @@ -956,7 +1009,22 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
}
pmix_list_remove_item(&nodes, &node->super);
PMIX_RELEASE(node);
break;
found = true;
} else if (NULL != nptr->aliases) {
/* no choice but an exhaustive search - fortunately, these lists are short! */
for (m = 0; !found && NULL != nptr->aliases[m]; m++) {
if (0 == strcmp(node->name, nptr->aliases[m])) {
if (prte_get_attribute(&node->attributes, PRTE_NODE_ADD_SLOTS, NULL, PMIX_BOOL)) {
nptr->slots += node->slots;
prte_remove_attribute(&node->attributes, PRTE_NODE_ADD_SLOTS);
} else {
nptr->slots = node->slots;
}
pmix_list_remove_item(&nodes, &node->super);
PMIX_RELEASE(node);
found = true;
}
}
}
}
}
Expand All @@ -981,7 +1049,8 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
}

/* shall we display the results? */
if (0 < pmix_output_get_verbosity(prte_ras_base_framework.framework_output)) {
if (0 < pmix_output_get_verbosity(prte_ras_base_framework.framework_output) ||
prte_get_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_ALLOC, NULL, PMIX_BOOL)) {
prte_ras_base_display_alloc(jdata);
}

Expand Down
42 changes: 42 additions & 0 deletions src/mca/ras/testrm/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#
# Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# Copyright (c) 2017-2020 Intel, Inc. All rights reserved.
# Copyright (c) 2022-2023 Nanook Consulting. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# Files that make up the testrm component; they are assigned below to
# either the component library or the lib library, depending on the
# MCA_BUILD_prte_ras_testrm_DSO conditional.
sources = \
ras_testrm.h \
ras_testrm_component.c \
ras_testrm.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

# Exactly one of the two targets receives the sources: in a DSO build
# "component" names the library and "lib" is left empty; otherwise the
# roles are reversed.
if MCA_BUILD_prte_ras_testrm_DSO
lib =
lib_sources =
component = prte_mca_ras_testrm.la
component_sources = $(sources)
else
lib = libprtemca_ras_testrm.la
lib_sources = $(sources)
component =
component_sources =
endif

# DSO build: the component library is listed under the "mcacomponent"
# prefix, whose directory is $(prtelibdir), and is linked against
# libprrte.
mcacomponentdir = $(prtelibdir)
mcacomponent_LTLIBRARIES = $(component)
prte_mca_ras_testrm_la_SOURCES = $(component_sources)
prte_mca_ras_testrm_la_LDFLAGS = -module -avoid-version
prte_mca_ras_testrm_la_LIBADD = $(top_builddir)/src/libprrte.la

# Static build: the library is listed under noinst_, i.e. it is built
# but not installed on its own.
noinst_LTLIBRARIES = $(lib)
libprtemca_ras_testrm_la_SOURCES = $(lib_sources)
libprtemca_ras_testrm_la_LDFLAGS = -module -avoid-version
54 changes: 54 additions & 0 deletions src/mca/ras/testrm/ras_testrm.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved
* Copyright (c) 2015-2019 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015-2020 Intel, Inc. All rights reserved.
*
* Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "prte_config.h"
#include "constants.h"
#include "types.h"

#include "src/class/pmix_list.h"
#include "src/runtime/prte_globals.h"
#include "src/util/hostfile/hostfile.h"
#include "ras_testrm.h"

/*
 * Local functions: forward declarations of the file-local callbacks
 * that are installed in prte_ras_testrm_module below.
 */
static int allocate(prte_job_t *jdata, pmix_list_t *nodes);
static int finalize(void);

/*
 * Global variable: the module function table for the testrm RAS
 * component. Only the allocate and finalize entry points are provided;
 * init and deallocate are explicitly left NULL.
 */
prte_ras_base_module_t prte_ras_testrm_module = {
.init = NULL,
.allocate = allocate,
.deallocate = NULL,
.finalize = finalize
};

/*
 * Allocation entry point for the testrm module.
 *
 * Hands the caller's node list, together with the hostfile string held
 * in the testrm component structure, to prte_util_add_hostfile_nodes().
 *
 * jdata - job for which the allocation is requested; not consulted here
 * nodes - list passed through to prte_util_add_hostfile_nodes()
 *
 * Returns the status reported by prte_util_add_hostfile_nodes(),
 * unmodified.
 */
static int allocate(prte_job_t *jdata, pmix_list_t *nodes)
{
    char *hostfile = prte_mca_ras_testrm_component.hostfile;

    return prte_util_add_hostfile_nodes(nodes, hostfile);
}

/*
 * Finalize entry point for the testrm module. The module has nothing
 * to tear down, so this always reports success.
 */
static int finalize(void)
{
    const int status = PRTE_SUCCESS;

    return status;
}
34 changes: 34 additions & 0 deletions src/mca/ras/testrm/ras_testrm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2015-2019 Intel, Inc. All rights reserved.
* Copyright (c) 2019 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/

#ifndef PRTE_RAS_TESTRM_H
#define PRTE_RAS_TESTRM_H

#include "prte_config.h"
#include "src/mca/ras/base/base.h"
#include "src/mca/ras/ras.h"

BEGIN_C_DECLS

/*
 * Component structure for the testrm RAS component: the base RAS
 * component extended with one component-specific field.
 */
struct prte_ras_testrm_component_t {
/* base RAS component; declared as the first member */
prte_ras_base_component_t super;
/* string that the module's allocate() passes to
 * prte_util_add_hostfile_nodes() - presumably the path of the hostfile
 * describing the test allocation; confirm against the component's
 * parameter registration */
char *hostfile;
};
typedef struct prte_ras_testrm_component_t prte_ras_testrm_component_t;

/*
 * Exported symbols of the testrm component: the component instance and
 * the module function table. The module is defined in ras_testrm.c; the
 * component instance is presumably defined in ras_testrm_component.c
 * (TODO confirm).
 */
PRTE_EXPORT extern prte_ras_testrm_component_t prte_mca_ras_testrm_component;
PRTE_EXPORT extern prte_ras_base_module_t prte_ras_testrm_module;

END_C_DECLS

#endif
Loading