From 1411d6d35ee896b48f0608bfd43661c78a8b70cd Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:50:08 -0700 Subject: [PATCH 01/53] Cachepath: parse_loc utilities. --- src/client/beboot/parseloc.h | 7 ++ src/utils/parseloc.c | 145 +++++++++++++++++++++++++++++------ 2 files changed, 127 insertions(+), 25 deletions(-) diff --git a/src/client/beboot/parseloc.h b/src/client/beboot/parseloc.h index c5362e2e..1731906a 100644 --- a/src/client/beboot/parseloc.h +++ b/src/client/beboot/parseloc.h @@ -24,6 +24,13 @@ extern "C" { #include "spindle_launch.h" char *parse_location(char *loc, number_t number); +char *parse_location_noerr(char *loc, number_t number); +char *realize(char *path); +char **parse_colonsep_prefixes(char *colonsep_list, number_t number); +int is_local_prefix(const char *path, char **local_prefixes); +/* validateCandidatePath() is static (file-local) in parseloc.c, so it is intentionally not declared in this header. */ +void determineValidCachePaths( uint64_t *validBitIdx, char *origPathList, number_t number ); +void getValidCachePathByIndex( uint64_t validBitIdx, char **realizedCachePath, char **parsedCachePath, char **symbolicCachePath ); #if defined(__cplusplus) } diff --git a/src/utils/parseloc.c b/src/utils/parseloc.c index f0ebae30..8c2a7929 100644 --- a/src/utils/parseloc.c +++ b/src/utils/parseloc.c @@ -22,6 +22,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include #include #include +#include #if !defined(USE_PLUGIN_DEBUG) #include "spindle_debug.h" @@ -34,13 +35,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include "ccwarns.h" #include "spindle_launch.h" -#if defined(__cplusplus) -extern "C" { -#endif - char *parse_location(char *loc, number_t number); -#if defined(__cplusplus) -} -#endif +extern int spindle_mkdir(char *orig_path); #if defined(CUSTOM_GETENV) extern char *custom_getenv(char*); @@ -168,38 +163,64 @@ char *parse_location_noerr(char *loc, number_t number) **/ char *realize(char *path) { + int
local_errno; char *result; - char *origpath, *cur_slash = NULL, *trailing; - struct stat buf; + char *origpath, *cur_slash = NULL, *prev_slash = NULL; + struct stat buf = {0}; char newpath[MAX_PATH_LEN+1]; int lastpos; newpath[MAX_PATH_LEN] = '\0'; origpath = strdup(path); - for (;;) { - if (stat(origpath, &buf) != -1) - break; - if (cur_slash) - *cur_slash = '/'; + errno=0; + while( stat( origpath, &buf ) == -1 ){ + local_errno = errno; + debug_printf("Failed to stat '%s' (%s).\n", origpath, strerror(local_errno)); + prev_slash = cur_slash; cur_slash = strrchr(origpath, '/'); - if (!cur_slash) - break; - *cur_slash = '\0'; + if( prev_slash ) + *prev_slash = '/'; + if( !cur_slash ){ + debug_printf("Nothing in the original path can be stat'ed. (%s)\n", path); + free(origpath); + return NULL; + } + *cur_slash = '\0'; } - if (cur_slash) - trailing = cur_slash + 1; - else - trailing = ""; + errno = 0; result = realpath(origpath, newpath); if (!result) { + local_errno = errno; + err_printf( + "Error: realpath(3) failed to create canonical version of '%s' (%s).
Returning NULL.\n", + origpath, strerror(local_errno) ); + errno = 0; + int rc = stat( origpath, &buf ); + local_errno = errno; + err_printf( + " Statting that path results in rc=%d, errno=%d, error='%s'.\n", + rc, local_errno, strerror(local_errno)); free(origpath); - return path; + return NULL; } + /* newpath now holds the canonical form of the longest stat-able prefix of path. */ - strncat(newpath, "/", MAX_PATH_LEN); - strncat(newpath, trailing, MAX_PATH_LEN); - newpath[MAX_PATH_LEN] = '\0'; + if( cur_slash ){ + if( strlen( newpath ) + 1 + strlen( cur_slash+1 ) > MAX_PATH_LEN ){ + err_printf( + "Error: The realized path exceeds MAX_PATH_LEN (%d).\n" + " Original path: '%s'\n" + " Statable part: '%s'\n" + " Canonical version: '%s'\n" + " Returning original path.\n", + MAX_PATH_LEN, path, origpath, newpath); + free(origpath); + return path; + } + strncat(newpath, "/", 2); + strncat(newpath, cur_slash+1, MAX_PATH_LEN - strlen( newpath )); + } free(origpath); lastpos = strlen(newpath)-1; @@ -280,3 +301,77 @@ int is_local_prefix(const char *path, char **local_prefixes) { return 0; } +/* validateCandidatePath determines if candidatePath passes parse_location(), realize(), and spindle_mkdir(), which is to say, can + * spindle create a directory from this path? + * + * If not NULL, then realizedPath, parsedPath, and/or symbolicPath will hold the respective intermediate/final results. + * + * Return 1 if the candidatePath is valid, otherwise 0.
+ */ +static int validateCandidatePath( char *candidatePath, char **realizedPath, char **parsedPath, char **symbolicPath, number_t number ){ + int rc; + char *parsedCandidatePath, *realizedCandidatePath; + parsedCandidatePath = parse_location( candidatePath, number ); + if( parsedCandidatePath ){ + realizedCandidatePath = realize( parsedCandidatePath ); + if( realizedCandidatePath ){ + rc = spindle_mkdir( parsedCandidatePath ); + if( 0 == rc ){ + if( symbolicPath) *symbolicPath = strdup( candidatePath ); /* copy: candidatePath points into a list the caller frees */ + if( parsedPath ) *parsedPath = parsedCandidatePath; + if( realizedPath) *realizedPath = realizedCandidatePath; + return 1; + }else{ + debug_printf2("Unable to create directory %s, moving on to the next candidate.\n", realizedCandidatePath ); + } + }else{ + debug_printf2( "Unable to realize candidate %s, moving on to the next candidate.\n", parsedCandidatePath ); + } + }else{ + debug_printf2("Unable to parse candidate %s, moving on to the next candidate.\n", candidatePath ); + } + return 0; +} + +/** + * determineValidCachePaths() works exclusively with the cachepaths parameter. Because not all paths may be valid on all + * compute nodes, and because we want to have all nodes reach a consensus on which cache path to use, we + * determine the validity of all paths in the origPathList, save the intermediate results, and return a bit + * index to the user. Via allReduce() all nodes reach a consensus on the set of valid paths, and retrieve + * that information via getValidCachePathByIndex().
+ */ +static char *realizedCachePaths[64], *parsedCachePaths[64], *symbolicCachePaths[64]; + +void determineValidCachePaths( uint64_t *validBitIdx, char *origPathList, number_t number ){ + + char *saveptr, *candidatePath, *pathList = strdup( origPathList ); + uint64_t bitoffset = 0; + + *validBitIdx = 0; + debug_printf2("origPathList='%s', number='%lu'.\n", origPathList, (unsigned long) number ); + + candidatePath = strtok_r( pathList, ":", &saveptr ); + while( NULL != candidatePath && bitoffset < 64 ){ + *validBitIdx |= (uint64_t)validateCandidatePath( + candidatePath, + &realizedCachePaths[bitoffset], + &parsedCachePaths[bitoffset], + &symbolicCachePaths[bitoffset], number ) << bitoffset; + bitoffset++; + candidatePath = strtok_r( NULL, ":", &saveptr ); + } + free( pathList ); +} + +void getValidCachePathByIndex( uint64_t validBitIdx, char **realizedCachePath, char **parsedCachePath, char **symbolicCachePath ){ + uint64_t bitoffset = 0; + if (!validBitIdx){ + return; + } + while( (bitoffset < 64) && ((((uint64_t)1 << bitoffset) & validBitIdx) == 0) ){ + bitoffset++; + } + if( realizedCachePath ) *realizedCachePath = realizedCachePaths[bitoffset]; + if( parsedCachePath ) *parsedCachePath = parsedCachePaths[bitoffset]; + if( symbolicCachePath ) *symbolicCachePath = symbolicCachePaths[bitoffset]; +} From 75a8fc226afd44a92e92e59b29d5a33dbd674e8e Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:50:45 -0700 Subject: [PATCH 02/53] Cachepath: remove/rename [orig_]location.
--- src/client/beboot/spindle_bootstrap.c | 2 +- src/client/client/client.c | 15 +++++++-------- src/client/client/intercept.h | 1 + src/client/client/intercept_exec.c | 5 +---- src/client/client/intercept_readlink.c | 16 +++++++++++----- src/client/client/should_intercept.c | 25 +++++++++++++++---------- src/client/client/should_intercept.h | 1 + 7 files changed, 37 insertions(+), 28 deletions(-) diff --git a/src/client/beboot/spindle_bootstrap.c b/src/client/beboot/spindle_bootstrap.c index 9c82c061..0244284c 100644 --- a/src/client/beboot/spindle_bootstrap.c +++ b/src/client/beboot/spindle_bootstrap.c @@ -115,7 +115,6 @@ static void setup_environment() setenv("LD_AUDIT", client_lib, 1); setenv("LDCS_LOCATION", location, 1); - setenv("LDCS_ORIG_LOCATION", orig_location, 1); setenv("LDCS_NUMBER", number_s, 1); setenv("LDCS_RANKINFO", rankinfo_str, 1); if (connection_str) @@ -162,6 +161,7 @@ static int parse_cmdline(int argc, char *argv[]) } symbolic_location = argv[i++]; + i++; // Skip over candidate_cachepaths. number_s = argv[i++]; number = (number_t) strtoul(number_s, NULL, 0); opts_s = argv[i++]; diff --git a/src/client/client/client.c b/src/client/client/client.c index 0b899e7e..a680b55c 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -41,6 +41,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include "exec_util.h" #include "intercept.h" #include "fixlocale.h" +#include "should_intercept.h" errno_location_t app_errno_location; @@ -70,11 +71,8 @@ static const ElfW(Phdr) *libc_phdrs, *interp_phdrs; static int num_libc_phdrs, num_interp_phdrs; ElfW(Addr) libc_loadoffset, interp_loadoffset; -/* location has the realize'd path to the local file cache. 
orig_location is not realized and - * may contain symlinks - */ -char *location; -char *orig_location; +static char *location; +static char *chosen_realized_cachepath, *chosen_parsed_cachepath, *chosen_symbolic_cachepath; number_t number; static int have_stat_patches; @@ -203,7 +201,6 @@ static int init_server_connection() return 0; location = getenv("LDCS_LOCATION"); - orig_location = getenv("LDCS_ORIG_LOCATION"); number = (number_t) strtoul(getenv("LDCS_NUMBER"), NULL, 0); connection = getenv("LDCS_CONNECTION"); rankinfo_s = getenv("LDCS_RANKINFO"); @@ -223,7 +220,6 @@ static int init_server_connection() debug_printf("Disabling environment variables because we're not following forks\n"); unsetenv("LD_AUDIT"); unsetenv("LDCS_LOCATION"); - unsetenv("LDCS_ORIG_LOCATION"); unsetenv("LDCS_NUMBER"); unsetenv("LDCS_CONNECTION"); unsetenv("LDCS_RANKINFO"); @@ -267,6 +263,9 @@ static int init_server_connection() send_cpu(ldcsid, get_cur_cpu()); #endif } + send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath, &chosen_symbolic_cachepath ); + set_should_intercept_cachepath( chosen_realized_cachepath, chosen_parsed_cachepath, chosen_symbolic_cachepath ); + set_intercept_readlink_cachepath( chosen_realized_cachepath, chosen_parsed_cachepath, chosen_symbolic_cachepath ); snprintf(debugging_name, 32, "Client.%d", rankinfo[0]); LOGGING_INIT(debugging_name); @@ -475,7 +474,7 @@ char *client_library_load(const char *name) char *orig_file_name = (char *) name; if (is_in_spindle_cache(name)) { - debug_printf2("Library %s is in spindle cache (%s). Translating request\n", name, location); + debug_printf2("Library %s is in spindle cache (%s). 
Translating request\n", name, chosen_realized_cachepath); memset(fixed_name, 0, MAX_PATH_LEN+1); send_orig_path_request(ldcsid, orig_file_name, fixed_name); orig_file_name = fixed_name; diff --git a/src/client/client/intercept.h b/src/client/client/intercept.h index 4ace2328..aae968f7 100644 --- a/src/client/client/intercept.h +++ b/src/client/client/intercept.h @@ -89,6 +89,7 @@ int execvpe_wrapper(const char *path, char *const argv[], const char *envp[]); pid_t vfork_wrapper(); char *dlerror_wrapper(); +void set_intercept_readlink_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ); ssize_t readlink_wrapper(const char *path, char *buf, size_t bufsiz); ssize_t readlinkat_wrapper(int dirfd, const char *pathname, char *buf, size_t bufsiz); diff --git a/src/client/client/intercept_exec.c b/src/client/client/intercept_exec.c index dbf9bae8..7af4e73b 100644 --- a/src/client/client/intercept_exec.c +++ b/src/client/client/intercept_exec.c @@ -142,7 +142,6 @@ static char **removeEnvironmentStrs(char **envp) if (strIsPrefix("LD", envp[i])) { if (strIsPrefix("LD_AUDIT=", envp[i]) || strIsPrefix("LDCS_LOCATION=", envp[i]) || - strIsPrefix("LDCS_ORIG_LOCATION=", envp[i]) || strIsPrefix("LDCS_CONNECTION=", envp[i]) || strIsPrefix("LDCS_RANKINFO=", envp[i]) || strIsPrefix("LDCS_OPTIONS=", envp[i]) || @@ -177,7 +176,6 @@ static char **updateEnvironment(char **envp, int *num_modified, int propogate_sp unsetf("SPINDLE"); unsetf("LD_AUDIT"); unsetf("LDCS_LOCATION"); - unsetf("LDCS_ORIG_LOCATION"); unsetf("LDCS_CONNECTION"); unsetf("LDCS_RANKINFO"); unsetf("LDCS_OPTIONS"); @@ -198,13 +196,12 @@ static char **updateEnvironment(char **envp, int *num_modified, int propogate_sp if (envp) { debug_printf2("Propogating spindle environment by copying it to new envp list\n"); for (cur = (char **) envp; *cur; cur++, orig_size++); - new_size = orig_size + 10; + new_size = orig_size + 20; newenv = (char **) malloc(new_size * 
sizeof(char*)); propogateEnvironmentStr(envp, newenv, &pos, "SPINDLE"); propogateEnvironmentStr(envp, newenv, &pos, "LD_AUDIT"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_LOCATION"); - propogateEnvironmentStr(envp, newenv, &pos, "LDCS_ORIG_LOCATION"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_CONNECTION"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_RANKINFO"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_OPTIONS"); diff --git a/src/client/client/intercept_readlink.c b/src/client/client/intercept_readlink.c index af4266a1..2abf03fc 100644 --- a/src/client/client/intercept_readlink.c +++ b/src/client/client/intercept_readlink.c @@ -31,19 +31,25 @@ Place, Suite 330, Boston, MA 02111-1307 USA ssize_t (*orig_readlink)(const char *path, char *buf, size_t bufsiz); ssize_t (*orig_readlinkat)(int dirfd, const char *pathname, char *buf, size_t bufsiz); -extern char *location; +static char *cachepath; + +void set_intercept_readlink_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ){ + cachepath = chosen_realized_cachepath; + chosen_parsed_cachepath = chosen_parsed_cachepath; + chosen_symbolic_cachepath = chosen_symbolic_cachepath; +} static int fix_local_readlink(char *buf, size_t bufsiz) { char spindle_id[32]; - int location_len, result; + int cachepath_len, result; char tmp[MAX_PATH_LEN+1]; - location_len = strlen(location); + cachepath_len = strlen(cachepath); snprintf(spindle_id, sizeof(spindle_id), "spindle.%lx", number); - if (strstr(buf, spindle_id) && strncmp(location, buf, location_len) == 0) { + if (strstr(buf, spindle_id) && strncmp(cachepath, buf, cachepath_len) == 0) { debug_printf2("readlink received spindle cache path %s. 
Translating\n", buf); - result = send_orig_path_request(ldcsid, buf+location_len+1, tmp); + result = send_orig_path_request(ldcsid, buf+cachepath_len+1, tmp); if (result == -1) return -1; debug_printf2("readlink translated spindle local path %s to %s\n", buf, tmp); diff --git a/src/client/client/should_intercept.c b/src/client/client/should_intercept.c index bfabbbf8..3a348d3d 100644 --- a/src/client/client/should_intercept.c +++ b/src/client/client/should_intercept.c @@ -29,22 +29,27 @@ #include "spindle_debug.h" extern int relocate_spindleapi(); +static char *cachepath, *orig_cachepath; + +void set_should_intercept_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ){ + cachepath = chosen_realized_cachepath; + orig_cachepath = chosen_parsed_cachepath; + chosen_symbolic_cachepath = chosen_symbolic_cachepath; +} -extern char *location; -extern char *orig_location; int is_in_spindle_cache(const char *pathname) { - static int location_size = 0; - static int orig_location_size = 0; - if (!location_size) { - location_size = strlen(location); + static int cachepath_size = 0; + static int orig_cachepath_size = 0; + if (!cachepath_size) { + cachepath_size = strlen(cachepath); } - if (!orig_location_size) { - orig_location_size = strlen(orig_location); + if (!orig_cachepath_size) { + orig_cachepath_size = strlen(orig_cachepath); } - return ((strncmp(pathname, location, location_size) == 0) || - (strncmp(pathname, orig_location, orig_location_size) == 0)); + return ((strncmp(pathname, cachepath, cachepath_size) == 0) || + (strncmp(pathname, orig_cachepath, orig_cachepath_size) == 0)); } extern int is_local_prefix(const char *path, char **cached_local_prefixes); diff --git a/src/client/client/should_intercept.h b/src/client/client/should_intercept.h index f6a9b510..6a545913 100644 --- a/src/client/client/should_intercept.h +++ b/src/client/client/should_intercept.h @@ -27,6 +27,7 @@ #define EXCL_OPEN 2 #define ERR_CALL 
3 +void set_should_intercept_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ); int open_filter(const char *fname, int flags); int fopen_filter(const char *fname, const char *flags); int exec_filter(const char *fname); From 8aeec38f28d2c23650d189bd8d1cec455a3039ee Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:51:51 -0700 Subject: [PATCH 03/53] Cachepath: Configure-time support. --- config.h.in | 3 +++ configure | 16 ++++++++++++++++ configure.common.ac | 5 +++++ src/client/config.h.in | 3 +++ src/client/configure | 16 ++++++++++++++++ src/fe/config.h.in | 3 +++ src/fe/configure | 16 ++++++++++++++++ src/server/config.h.in | 3 +++ src/server/configure | 16 ++++++++++++++++ 9 files changed, 81 insertions(+) diff --git a/config.h.in b/config.h.in index c5261ac4..f07f2e20 100644 --- a/config.h.in +++ b/config.h.in @@ -6,6 +6,9 @@ /* Whether we are using a broken srun */ #undef BROKEN_SRUN +/* Colon-separated list of potential back-end cache directories */ +#undef CACHEPATHS + /* Define if were using biter for client/server communication */ #undef COMM_BITER diff --git a/configure b/configure index 126b8b42..c4ac7615 100755 --- a/configure +++ b/configure @@ -847,6 +847,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_cachepaths with_localstorage with_default_local_prefix with_testrm @@ -1590,6 +1591,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-cachepaths=DIR Colon-separated list of potential back-end cache + directories --with-localstorage=DIR Directory on back-ends for storing relocated files --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle @@ -16662,6 +16665,14 @@ else fi +# Check whether --with-cachepaths was given. 
+if test "${with_cachepaths+set}" = set; then : + withval=$with_cachepaths; CACHEPATHS=${withval} +else + CACHEPATHS=$DEFAULT_LOC +fi + + # Check whether --with-localstorage was given. if test "${with_localstorage+set}" = set; then : withval=$with_localstorage; SPINDLE_LOC=${withval} @@ -16698,6 +16709,11 @@ cat >>confdefs.h <<_ACEOF _ACEOF +cat >>confdefs.h <<_ACEOF +#define CACHEPATHS "$CACHEPATHS" +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_LOCAL_PREFIX "$SPINDLE_LOCAL_PREFIX" _ACEOF diff --git a/configure.common.ac b/configure.common.ac index ae2a64df..ea6e5b6f 100644 --- a/configure.common.ac +++ b/configure.common.ac @@ -17,6 +17,10 @@ AC_ARG_WITH(default-num-ports, [AS_HELP_STRING([--with-default-numports=NUM],[Number of TCP/IP ports to scan for Spindle server communication])], [NUM_COBO_PORTS=${withval}], [NUM_COBO_PORTS=$DEFAULT_NUM_COBO_PORTS]) +AC_ARG_WITH(cachepaths, + [AS_HELP_STRING([--with-cachepaths=DIR],[Colon-separated list of potential back-end cache directories])], + [CACHEPATHS=${withval}], + [CACHEPATHS=$DEFAULT_LOC]) AC_ARG_WITH(localstorage, [AS_HELP_STRING([--with-localstorage=DIR],[Directory on back-ends for storing relocated files])], [SPINDLE_LOC=${withval}], @@ -29,6 +33,7 @@ AC_DEFINE_UNQUOTED([SPINDLE_PORT],[$SPINDLE_PORT],[The default port for Spindle] AC_DEFINE_UNQUOTED([NUM_COBO_PORTS],[$NUM_COBO_PORTS],[Number of ports for COBO to search for an open port]) AC_DEFINE_UNQUOTED([SPINDLE_MAX_PORT],[$(($SPINDLE_PORT + $NUM_COBO_PORTS - 1))],[The maximum port value]) AC_DEFINE_UNQUOTED([SPINDLE_LOC],"[$SPINDLE_LOC]",[The default local directory for Spindle]) +AC_DEFINE_UNQUOTED([CACHEPATHS],"[$CACHEPATHS]",[Colon-separated list of potential back-end cache directories]) AC_DEFINE_UNQUOTED([SPINDLE_LOCAL_PREFIX],"[$SPINDLE_LOCAL_PREFIX]",[The default colon-separated list of directories that Spindle will not cache files out of]) TESTRM=unknown diff --git a/src/client/config.h.in b/src/client/config.h.in index 2ddcacba..d133c1ff 
100644 --- a/src/client/config.h.in +++ b/src/client/config.h.in @@ -6,6 +6,9 @@ /* Whether we are using a broken srun */ #undef BROKEN_SRUN +/* Colon-separated list of potential back-end cache directories */ +#undef CACHEPATHS + /* Define if were using biter for client/server communication */ #undef COMM_BITER diff --git a/src/client/configure b/src/client/configure index 8a015a42..b26aeb02 100755 --- a/src/client/configure +++ b/src/client/configure @@ -810,6 +810,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_cachepaths with_localstorage with_default_local_prefix with_testrm @@ -1532,6 +1533,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-cachepaths=DIR Colon-separated list of potential back-end cache + directories --with-localstorage=DIR Directory on back-ends for storing relocated files --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle @@ -12587,6 +12590,14 @@ else fi +# Check whether --with-cachepaths was given. +if test "${with_cachepaths+set}" = set; then : + withval=$with_cachepaths; CACHEPATHS=${withval} +else + CACHEPATHS=$DEFAULT_LOC +fi + + # Check whether --with-localstorage was given. 
if test "${with_localstorage+set}" = set; then : withval=$with_localstorage; SPINDLE_LOC=${withval} @@ -12623,6 +12634,11 @@ cat >>confdefs.h <<_ACEOF _ACEOF +cat >>confdefs.h <<_ACEOF +#define CACHEPATHS "$CACHEPATHS" +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_LOCAL_PREFIX "$SPINDLE_LOCAL_PREFIX" _ACEOF diff --git a/src/fe/config.h.in b/src/fe/config.h.in index 33ba031a..ab6cde5b 100644 --- a/src/fe/config.h.in +++ b/src/fe/config.h.in @@ -6,6 +6,9 @@ /* Whether we are using a broken srun */ #undef BROKEN_SRUN +/* Colon-separated list of potential back-end cache directories */ +#undef CACHEPATHS + /* Define if were using biter for client/server communication */ #undef COMM_BITER diff --git a/src/fe/configure b/src/fe/configure index 1018b37f..5da3f4fc 100755 --- a/src/fe/configure +++ b/src/fe/configure @@ -831,6 +831,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_cachepaths with_localstorage with_default_local_prefix with_testrm @@ -1570,6 +1571,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-cachepaths=DIR Colon-separated list of potential back-end cache + directories --with-localstorage=DIR Directory on back-ends for storing relocated files --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle @@ -16437,6 +16440,14 @@ else fi +# Check whether --with-cachepaths was given. +if test "${with_cachepaths+set}" = set; then : + withval=$with_cachepaths; CACHEPATHS=${withval} +else + CACHEPATHS=$DEFAULT_LOC +fi + + # Check whether --with-localstorage was given. 
if test "${with_localstorage+set}" = set; then : withval=$with_localstorage; SPINDLE_LOC=${withval} @@ -16473,6 +16484,11 @@ cat >>confdefs.h <<_ACEOF _ACEOF +cat >>confdefs.h <<_ACEOF +#define CACHEPATHS "$CACHEPATHS" +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_LOCAL_PREFIX "$SPINDLE_LOCAL_PREFIX" _ACEOF diff --git a/src/server/config.h.in b/src/server/config.h.in index 8d1842cf..0669c8e0 100644 --- a/src/server/config.h.in +++ b/src/server/config.h.in @@ -6,6 +6,9 @@ /* Whether we are using a broken srun */ #undef BROKEN_SRUN +/* Colon-separated list of potential back-end cache directories */ +#undef CACHEPATHS + /* Define if were using biter for client/server communication */ #undef COMM_BITER diff --git a/src/server/configure b/src/server/configure index 81b43b17..d1d3c346 100755 --- a/src/server/configure +++ b/src/server/configure @@ -837,6 +837,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_cachepaths with_localstorage with_default_local_prefix with_testrm @@ -1567,6 +1568,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-cachepaths=DIR Colon-separated list of potential back-end cache + directories --with-localstorage=DIR Directory on back-ends for storing relocated files --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle @@ -16434,6 +16437,14 @@ else fi +# Check whether --with-cachepaths was given. +if test "${with_cachepaths+set}" = set; then : + withval=$with_cachepaths; CACHEPATHS=${withval} +else + CACHEPATHS=$DEFAULT_LOC +fi + + # Check whether --with-localstorage was given. 
if test "${with_localstorage+set}" = set; then : withval=$with_localstorage; SPINDLE_LOC=${withval} @@ -16470,6 +16481,11 @@ cat >>confdefs.h <<_ACEOF _ACEOF +cat >>confdefs.h <<_ACEOF +#define CACHEPATHS "$CACHEPATHS" +_ACEOF + + cat >>confdefs.h <<_ACEOF #define SPINDLE_LOCAL_PREFIX "$SPINDLE_LOCAL_PREFIX" _ACEOF From 52ffab9aac0204793cafe480965d0f8bc2df878b Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:52:29 -0700 Subject: [PATCH 04/53] Cachepath: Internal messaging for path resolution --- src/client/client_comlib/client_api.c | 53 ++++++++++++ src/client/client_comlib/client_api.h | 1 + src/fe/startup/spindle_fe.cc | 18 +++++ src/include/ldcs_api.h | 3 + .../auditserver/ldcs_audit_server_handlers.c | 80 +++++++++++++++++++ .../auditserver/ldcs_audit_server_md_cobo.c | 6 ++ src/server/comlib/ldcs_api_util.c | 3 + 7 files changed, 164 insertions(+) diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index 96390dca..267b8a93 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -37,6 +37,59 @@ static struct lock_t comm_lock; #define COMM_LOCK do { if (lock(&comm_lock) == -1) return -1; } while (0) #define COMM_UNLOCK unlock(&comm_lock) + +int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chosen_parsed_cachepath, char **chosen_symbolic_cachepath ){ + ldcs_message_t message; + char buffer[MAX_PATH_LEN+1]; + buffer[MAX_PATH_LEN] = '\0'; + + message.header.type = LDCS_MSG_CHOSEN_CACHEPATH_REQUEST; + message.header.len = MAX_PATH_LEN; + message.data = buffer; + + COMM_LOCK; + + debug_printf3("sending message of type: request_location_path.\n" ); + client_send_msg(fd, &message); + client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); + + COMM_UNLOCK; + + if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { + err_printf("Got unexpected message of type %d\n", (int) message.header.type); + assert(0); + } + if( 
chosen_realized_cachepath ){ + *chosen_realized_cachepath = strdup( buffer ); + } + + COMM_LOCK; + client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); + COMM_UNLOCK; + + if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { + err_printf("Got unexpected message of type %d\n", (int) message.header.type); + assert(0); + } + if( chosen_parsed_cachepath ){ + *chosen_parsed_cachepath = strdup( buffer ); + } + + COMM_LOCK; + client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); + COMM_UNLOCK; + + if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { + err_printf("Got unexpected message of type %d\n", (int) message.header.type); + assert(0); + } + if( chosen_symbolic_cachepath ){ + *chosen_symbolic_cachepath = strdup( buffer ); + } + + return 0; +} + int send_file_query(int fd, char* path, int dso, char** newpath, int *errcode) { ldcs_message_t message; char buffer[MAX_PATH_LEN+1+sizeof(int)]; diff --git a/src/client/client_comlib/client_api.h b/src/client/client_comlib/client_api.h index 74f82346..982c4b1c 100644 --- a/src/client/client_comlib/client_api.h +++ b/src/client/client_comlib/client_api.h @@ -42,6 +42,7 @@ int send_orig_path_request(int fd, const char *path, char *newpath); int send_dirlists_request(int fd, char **local_result, char **exece_result, char **to_free); int send_procmaps_query(int fd, int pid, char *result); int send_pickone_query(int fd, char *key, int *result); +int send_cachepath_query( int fd, char **chosen_symbolic_cachepath, char **chosen_parsed_cachepath, char **chosen_realized_cachepath ); int get_python_prefix(int fd, char **prefix); diff --git a/src/fe/startup/spindle_fe.cc b/src/fe/startup/spindle_fe.cc index cb53023b..2c2879f5 100644 --- a/src/fe/startup/spindle_fe.cc +++ b/src/fe/startup/spindle_fe.cc @@ -41,6 +41,7 @@ static const char *logging_file = NULL; #endif static const char spindle_bootstrap[] = LIBEXECDIR "/spindle_bootstrap"; static bool 
sendAndWaitForAlive(); +static void determineCachepathConsensus(); #define STARTUP_TIMEOUT 60 @@ -71,6 +72,7 @@ static int pack_data(spindle_args_t *args, void* &buffer, unsigned &buffer_size) buffer_size += sizeof(opt_t); buffer_size += sizeof(unique_id_t); buffer_size += args->location ? strlen(args->location) + 1 : 1; + buffer_size += args->candidate_cachepaths ? strlen(args->candidate_cachepaths) + 1 : 1; buffer_size += args->pythonprefix ? strlen(args->pythonprefix) + 1 : 1; buffer_size += args->preloadfile ? strlen(args->preloadfile) + 1 : 1; buffer_size += args->numa_files ? strlen(args->numa_files) + 1 : 1; @@ -91,6 +93,7 @@ static int pack_data(spindle_args_t *args, void* &buffer, unsigned &buffer_size) pack_param(args->startup_type, buf, pos); pack_param(args->shm_cache_size, buf, pos); pack_param(args->location, buf, pos); + pack_param(args->candidate_cachepaths, buf, pos); pack_param(args->pythonprefix, buf, pos); pack_param(args->preloadfile, buf, pos); pack_param(args->bundle_timeout_ms, buf, pos); @@ -230,6 +233,7 @@ int getApplicationArgsFE(spindle_args_t *params, int *spindle_argc, char ***spin (*spindle_argv)[n++] = strdup(uniqueid_s); } (*spindle_argv)[n++] = strdup(params->location); + (*spindle_argv)[n++] = strdup(params->candidate_cachepaths); (*spindle_argv)[n++] = strdup(number_s); (*spindle_argv)[n++] = strdup(opt_s); (*spindle_argv)[n++] = strdup(cachesize_s); @@ -395,9 +399,11 @@ int spindleInitFE(const char **hosts, spindle_args_t *params) /* Start FE server */ debug_printf("spindle_args_t { number = %lu; port = %u; num_ports = %u; opts = %lu; unique_id = %lu; " "use_launcher = %u; startup_type = %u; shm_cache_size = %u; location = %s; " + "cachepaths = %s; " "pythonprefix = %s; preloadfile = %s; bundle_timeout_ms = %u; bundle_cachesize_kb = %u }\n", (unsigned long) params->number, params->port, params->num_ports, params->opts, params->unique_id, params->use_launcher, params->startup_type, params->shm_cache_size, params->location, + 
params->candidate_cachepaths, params->pythonprefix, params->preloadfile, params->bundle_timeout_ms, params->bundle_cachesize_kb); printSpindleFlags(params->opts); @@ -427,6 +433,7 @@ int spindleInitFE(const char **hosts, spindle_args_t *params) /* Wait for servers to indicate startup */ sendAndWaitForAlive(); + determineCachepathConsensus(); return 0; } @@ -483,6 +490,17 @@ void markRSHPidReapedFE() clear_fe_rsh_pid(); } +static void determineCachepathConsensus( void ){ + ldcs_message_t consensus_req_msg; + consensus_req_msg.header.type = LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS; + consensus_req_msg.header.len = 0; + consensus_req_msg.data = NULL; + int result = ldcs_audit_server_fe_broadcast(&consensus_req_msg, NULL); + if (result == -1) { + debug_printf("Failure sending cachepath consensus message\n"); + } +} + static bool sendAndWaitForAlive() { int result; diff --git a/src/include/ldcs_api.h b/src/include/ldcs_api.h index e8ffa43d..e6ccbafb 100644 --- a/src/include/ldcs_api.h +++ b/src/include/ldcs_api.h @@ -85,6 +85,9 @@ typedef enum { LDCS_MSG_PICKONE_RESP, LDCS_MSG_ALIVE_REQ, LDCS_MSG_ALIVE_RESP, + LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS, + LDCS_MSG_CHOSEN_CACHEPATH_REQUEST, + LDCS_MSG_CHOSEN_CACHEPATH, LDCS_MSG_UNKNOWN } ldcs_message_ids_t; diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index 2791057c..1f6bc354 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -178,7 +178,10 @@ static int handle_setup_alias(ldcs_process_data_t *procdata, char *pathname, cha static int handle_client_dirlists_req(ldcs_process_data_t *procdata, int nc); static int handle_close_client_query(ldcs_process_data_t *procdata, int nc); static int handle_alive_msg(ldcs_process_data_t *procdata, ldcs_message_t *msg); +static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_message_t *msg); +static int 
handle_chosen_cachepath_request(ldcs_process_data_t *procdata, int nc); +extern void getValidCachePathByIndex( uint64_t validBitIdx, char **realizedCachePath, char **parsedCachePath, char **symbolicCachePath ); /** * Query from client to server. Returns info about client's rank in server data structures. **/ @@ -1892,6 +1895,8 @@ int handle_client_message(ldcs_process_data_t *procdata, int nc, ldcs_message_t return handle_client_pickone_msg(procdata, nc, msg); case LDCS_MSG_END: return handle_client_end(procdata, nc); + case LDCS_MSG_CHOSEN_CACHEPATH_REQUEST: + return handle_chosen_cachepath_request(procdata, nc); default: err_printf("Received unexpected message from client %d: %d\n", nc, (int) msg->header.type); assert(0); @@ -1989,6 +1994,8 @@ int handle_server_message(ldcs_process_data_t *procdata, node_peer_t peer, ldcs_ case LDCS_MSG_ALIVE_REQ: case LDCS_MSG_ALIVE_RESP: return handle_alive_msg(procdata, msg); + case LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS: + return handle_cachepath_consensus(procdata, msg); default: err_printf("Received unexpected message from node: %d\n", (int) msg->header.type); assert(0); @@ -2950,6 +2957,79 @@ static int handle_client_pickone_msg(ldcs_process_data_t *procdata, int nc, ldcs } } +/** + * Handle LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS to determine which of the locations, commpaths, and cachepaths are + * available across all of the servers. + */ + +static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_message_t *msg){ + + int num_children = ldcs_audit_server_md_get_num_children(procdata); + + if (num_children) { + spindle_broadcast(procdata, msg); + msgbundle_force_flush(procdata); + } + + ldcs_audit_server_md_consensus(procdata, msg); + + if( procdata->cachepath_bitidx == 0 ){ + err_printf("No valid cachepath path available. Falling back to \"location\" path (%s).\n", procdata->location); + procdata->cachepath = procdata->location; + }else{ + // ldcs_audit_server_filemngt_init() does its own realize() pass. 
+ getValidCachePathByIndex( procdata->cachepath_bitidx, + &procdata->cachepath, + &procdata->parsed_cachepath, + &procdata->symbolic_cachepath); + } + + debug_printf3("Initializing file cache location %s\n", procdata->location); + ldcs_audit_server_filemngt_init(procdata->cachepath); + + test_printf(" cachepath=%s\n", procdata->cachepath); + return 0; +} + +/** + * Handle LDCS_MSG_CHOSEN_CACHEPATH_REQUEST + */ +static int handle_chosen_cachepath_request(ldcs_process_data_t *procdata, int nc){ + ldcs_message_t msg; + int connid; + ldcs_client_t *client; + + assert(nc != -1); + client = procdata->client_table + nc; + connid = client->connid; + if (client->state != LDCS_CLIENT_STATUS_ACTIVE || connid < 0) + return 0; + + + msg.header.type = LDCS_MSG_CHOSEN_CACHEPATH; + + msg.header.len = strlen(procdata->cachepath) + 1; + msg.data = procdata->cachepath; + ldcs_send_msg(connid, &msg); + procdata->server_stat.clientmsg.cnt++; + procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; + + msg.header.len = strlen(procdata->parsed_cachepath) + 1; + msg.data = procdata->parsed_cachepath; + ldcs_send_msg(connid, &msg); + procdata->server_stat.clientmsg.cnt++; + procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; + + msg.header.len = strlen(procdata->symbolic_cachepath) + 1; + msg.data = procdata->symbolic_cachepath; + ldcs_send_msg(connid, &msg); + procdata->server_stat.clientmsg.cnt++; + procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; + + return 0; +} + + /** * Handle alive message, which is a broadcast/response ping through all servers */ diff --git a/src/server/auditserver/ldcs_audit_server_md_cobo.c b/src/server/auditserver/ldcs_audit_server_md_cobo.c index 08c9b952..d8b5442f 100644 --- a/src/server/auditserver/ldcs_audit_server_md_cobo.c +++ b/src/server/auditserver/ldcs_audit_server_md_cobo.c @@ -401,3 +401,9 @@ int 
ldcs_audit_server_md_get_num_children(ldcs_process_data_t *procdata) cobo_get_num_childs(&num_childs); return num_childs; } + +void ldcs_audit_server_md_consensus(ldcs_process_data_t *ldcs_process_data, ldcs_message_t *msg){ + if( msg->header.type == LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS ){ + cobo_allreduce( &ldcs_process_data->cachepath_bitidx, COBO_OP_BITWISE_AND ); + } +} diff --git a/src/server/comlib/ldcs_api_util.c b/src/server/comlib/ldcs_api_util.c index 2bc2455d..b6beb56d 100644 --- a/src/server/comlib/ldcs_api_util.c +++ b/src/server/comlib/ldcs_api_util.c @@ -91,6 +91,9 @@ char* _message_type_to_str (ldcs_message_ids_t type) { STR_CASE(LDCS_MSG_PICKONE_RESP); STR_CASE(LDCS_MSG_ALIVE_REQ); STR_CASE(LDCS_MSG_ALIVE_RESP); + STR_CASE(LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS); + STR_CASE(LDCS_MSG_CHOSEN_CACHEPATH_REQUEST); + STR_CASE(LDCS_MSG_CHOSEN_CACHEPATH); STR_CASE(LDCS_MSG_UNKNOWN); } return "unknown"; From 41ddcf4b96a6113836300a7b216191d0e01ade6f Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:53:25 -0700 Subject: [PATCH 05/53] Cachepath: Adds cobo_allreduce() --- src/cobo/cobo.c | 42 ++++++++++++++++++++++++++++++++++++++++++ src/cobo/ldcs_cobo.h | 16 ++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index 785a5d03..ad2ea878 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -1434,6 +1434,48 @@ int cobo_allgather_str(char* sendstr, char*** recvstr, char** recvbuf) return COBO_SUCCESS; } +int cobo_allreduce( int64_t *pval, cobo_op_t op ){ + + /* if i have any children, receive their data */ + int64_t child_val; + for(int i=cobo_num_child-1; i>=0; i--) { + /* read int64_t from child */ + if (cobo_read_fd(cobo_child_fd[i], &child_val, sizeof(int64_t)) < 0) { + err_printf("Reducing data from child (rank %d) failed\n", cobo_child[i]); + exit(1); + } + + /* compare child's val to our current val */ + switch( op ){ + case COBO_OP_MIN: if( child_val < *pval ) *pval = child_val; break; + case 
COBO_OP_MAX: if( child_val > *pval ) *pval = child_val; break; + case COBO_OP_BITWISE_AND: *pval &= child_val; break; + case COBO_OP_BITWISE_OR: *pval |= child_val; break; + case COBO_OP_LOGICAL_AND: *pval = *pval && child_val; break; + case COBO_OP_LOGICAL_OR: *pval = *pval || child_val; break; + case COBO_OP_SUM: *pval += child_val; break; + case COBO_OP_NOOP: break; + default: + err_printf("Illegal op (%d). Ignoring.\n", op); + break; + } + } + + /* forward data to parent if we're not rank 0, otherwise set the recvbuf */ + if (cobo_me != 0) { + /* not the root, so forward our reduction result to our parent */ + if (cobo_write_fd(cobo_parent_fd, pval, sizeof(*pval)) < 0) { + err_printf("Sending reduced data to parent failed\n"); + exit(1); + } + } + + /* broadcast result of reduction from rank 0 to all tasks */ + cobo_bcast_tree(pval, sizeof(int64_t)); + + return COBO_SUCCESS; +} + /* provide list of ports and number of ports as input, get number of tasks and my rank as output */ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* num_ranks) { diff --git a/src/cobo/ldcs_cobo.h b/src/cobo/ldcs_cobo.h index edacd4b1..30cc9673 100644 --- a/src/cobo/ldcs_cobo.h +++ b/src/cobo/ldcs_cobo.h @@ -55,6 +55,7 @@ extern "C" { #define cobo_allgather COMBINE(COBO_NAMESPACE, cobo_allgather) #define cobo_alltoall COMBINE(COBO_NAMESPACE, cobo_alltoall ) #define cobo_allgather_str COMBINE(COBO_NAMESPACE, cobo_allgather_str) +#define cobo_allreduce COMBINE(COBO_NAMESPACE, cobo_allreduce) #define cobo_server_open COMBINE(COBO_NAMESPACE, cobo_server_open) #define cobo_server_close COMBINE(COBO_NAMESPACE, cobo_server_close) #define cobo_server_get_root_socket COMBINE(COBO_NAMESPACE, cobo_server_get_root_socket) @@ -67,6 +68,19 @@ extern "C" { #define cobo_register_preconnect_cb COMBINE(COBO_NAMESPACE, cobo_register_preconnect_cb) #endif +// Used for cobo_allreduce(). 
+typedef enum{ + COBO_OP_MIN, + COBO_OP_MAX, + COBO_OP_BITWISE_AND, + COBO_OP_BITWISE_OR, + COBO_OP_LOGICAL_AND, + COBO_OP_LOGICAL_OR, + COBO_OP_SUM, + COBO_OP_NOOP, + NUM_COBO_OP +} cobo_op_t; + /* * ========================================================================== * ========================================================================== @@ -128,6 +142,8 @@ int cobo_alltoall (void* sendbuf, int sendcount, void* recvbuf); */ int cobo_allgather_str(char* sendstr, char*** recvstr, char** recvbuf); +int cobo_allreduce(int64_t *pval, cobo_op_t op); + /* * ========================================================================== * ========================================================================== From 093f0f630b790f381bf8964f97a916a693cae9e4 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:54:01 -0700 Subject: [PATCH 06/53] Cachepath: Adds parameters to config_mgr --- src/fe/startup/config_mgr.cc | 23 +++++++++++++++++++++++ src/fe/startup/config_mgr.h | 4 +++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/fe/startup/config_mgr.cc b/src/fe/startup/config_mgr.cc index 36a30b20..eb710b36 100644 --- a/src/fe/startup/config_mgr.cc +++ b/src/fe/startup/config_mgr.cc @@ -56,6 +56,12 @@ using namespace std; #define SPINDLE_LOC_STR "$TMPDIR" #endif +#if defined(CACHEPATHS) +#define SPINDLE_CACHEPATHS_STR CACHEPATHS +#else +#define SPINDLE_CACHEPATHS_STR "$TMPDIR" +#endif + #if defined(SPINDLE_LOCAL_PREFIX) #define SPINDLE_LOCAL_PREFIX_STR SPINDLE_LOCAL_PREFIX #else @@ -269,6 +275,8 @@ void initOptionsList() "Strip debug and symbol information from binaries before distributing them." }, { confLocation, "location", shortLocation, groupMisc, cvString, {}, SPINDLE_LOC_STR, "Back-end directory for storing relocated files. Should be a non-shared location such as a ramdisk." 
}, + { confCachePaths, "cachepaths", shortCachePaths, groupMisc, cvString, {}, SPINDLE_CACHEPATHS_STR, + "Colon-separated list of candidate paths for cached libraries."}, { confNoclean, "noclean", shortNoClean, groupMisc, cvBool, {}, "false", "Don't remove local file cache after execution." }, { confDisableLogging, "disable-logging", shortDisableLogging, groupMisc, cvBool, {}, DISABLE_LOGGING_STR, @@ -740,6 +748,21 @@ bool ConfigMap::toSpindleArgs(spindle_args_t &args, bool alloc_strs) const args.location = strdup(loc.c_str()); break; } + case confCachePaths:{ + // Parameter values are colon-separated lists of paths. + // Append "/spindle.$NUMBER" to each path in the list. + string paths = strresult; + size_t idx = paths.find(":"); + string number_var_with_colon("/spindle.$NUMBER:"); + string number_var_without_colon("/spindle.$NUMBER"); + while( idx != string::npos ){ + paths.replace(idx, 1, number_var_with_colon); + idx = paths.find(":", idx + number_var_with_colon.size()); + }; + paths += number_var_without_colon; + args.candidate_cachepaths = strdup(paths.c_str()); + break; + } case confCachePrefix: case confPythonPrefix: if (args.pythonprefix) diff --git a/src/fe/startup/config_mgr.h b/src/fe/startup/config_mgr.h index 8e70daa6..27be1ae8 100644 --- a/src/fe/startup/config_mgr.h +++ b/src/fe/startup/config_mgr.h @@ -30,6 +30,7 @@ enum SpindleConfigID { confPort, confNumPorts, confLocation, + confCachePaths, confCachePrefix, confPythonPrefix, confLocalPrefix, @@ -125,7 +126,8 @@ enum CmdlineShortOptions { shortSpindleLevel = 296, shortLocalPrefix = 297, shortExecExcludes = 298, - shortPatchLdso + shortPatchLdso, + shortCachePaths, }; enum CmdlineGroups { From 6ed3ac251edab4dd7a76a07b7b2add3f5953a535 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:55:39 -0700 Subject: [PATCH 07/53] Cachepath: adds flux parameter support --- src/flux/flux-spindle.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/flux/flux-spindle.c 
b/src/flux/flux-spindle.c index bd2213f2..6b3fdd40 100644 --- a/src/flux/flux-spindle.c +++ b/src/flux/flux-spindle.c @@ -382,7 +382,7 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) const char *relocaout = NULL, *reloclibs = NULL, *relocexec = NULL, *relocpython = NULL; const char *followfork = NULL, *preload = NULL, *level = NULL; const char *pyprefix = NULL, *location = NULL; - char *numafiles = NULL; + char *numafiles = NULL, *cachepaths = NULL; if (flux_shell_getopt_unpack (shell, "spindle", "o", &opts) < 0) return -1; @@ -404,7 +404,7 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) * supplied by the user, but not unpacked (This handles typos, etc). */ if (json_unpack_ex (opts, &error, JSON_STRICT, - "{s?i s?i s?i s?i s?s s?s s?s s?s s?s s?s s?s s?i s?s s?s s?s}", + "{s?i s?i s?i s?i s?s s?s s?s s?s s?s s?s s?s s?i s?s s?s s?s s?s}", "noclean", &noclean, "nostrip", &nostrip, "push", &push, @@ -419,7 +419,8 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) "numa", &numa, "numa-files", &numafiles, "preload", &preload, - "level", &level) < 0) + "level", &level, + "cachepaths", &cachepaths) < 0) logerrno_printf_and_return(1, "Error in spindle option: %s\n", error.text); if (noclean) @@ -462,6 +463,9 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) free (ctx->params.pythonprefix); ctx->params.pythonprefix = tmp; } + if( cachepaths ){ + ctx->params.candidate_cachepaths = cachepaths; + } if (location) { ctx->params.location = (char *) location; } From 57b480c79d18b4dcea38a43c038ae76ddc236b4d Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:56:05 -0700 Subject: [PATCH 08/53] Cachepath: Adds logging support. 
--- src/logging/spindle_logd.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/logging/spindle_logd.cc b/src/logging/spindle_logd.cc index 0048170b..372b55de 100644 --- a/src/logging/spindle_logd.cc +++ b/src/logging/spindle_logd.cc @@ -202,7 +202,7 @@ class TestVerifier std::vector err_strings; std::set > target_libs; std::set > libs_loaded; - char *location; + char *cachepath; void logerror(std::string s) { @@ -249,7 +249,8 @@ class TestVerifier tmp_s = getenv("TEMPDIR"); if (!tmp_s) tmp_s = "/tmp"; - location = strdup(tmp_s); + // These are reasonable fallbacks that should be replaced via messages, below. + cachepath = strdup(tmp_s); } ~TestVerifier() @@ -269,7 +270,7 @@ class TestVerifier strstr(filename, "bin") == NULL && strstr(filename, ".py") == NULL) return true; - bool is_from_temp = (strstr(filename, location) != NULL) && (strncmp(filename, "/__not_exist", 12) != 0); + bool is_from_temp = (strstr(filename, cachepath) != NULL) && (strncmp(filename, "/__not_exist", 12) != 0); bool is_local_test = strstr(filename, "liblocal") != NULL; if (is_from_temp && !is_local_test && ret_code == -1) { @@ -295,12 +296,12 @@ class TestVerifier char buffer[4096]; int ret; - if (strstr(s, " location=" ) == s ){ - free( location ); - const char *loc_start = strstr( s, "=") + 1; - size_t loc_len = strlen( loc_start ); - location = strdup( loc_start ); - location[ loc_len - 1 ] = '\0'; // Remove trailing '\n'. + if (strstr(s, " cachepath=" ) == s ){ + free( cachepath ); + const char *cachepath_start = strstr( s, "=") + 1; + size_t cachepath_len = strlen( cachepath_start ); + cachepath = strdup( cachepath_start ); + cachepath[ cachepath_len - 1 ] = '\0'; // Remove trailing '\n'. 
} if (strstr(s, "open(") == s) { const char *first_quote, *last_quote, *equals; From 15164a5d60deec9c0b87468d9ca0957f70b5649b Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:56:34 -0700 Subject: [PATCH 09/53] Cachepath: Removes out-of-root cleanup checks. --- src/server/auditserver/cleanup_proc.cc | 4 ++-- testsuite/test_driver.c | 23 ----------------------- 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/src/server/auditserver/cleanup_proc.cc b/src/server/auditserver/cleanup_proc.cc index c23a9f7d..a3d3ddcc 100644 --- a/src/server/auditserver/cleanup_proc.cc +++ b/src/server/auditserver/cleanup_proc.cc @@ -72,7 +72,8 @@ static void rmDirSet(const set &dirs, const char *prefix_dir) continue; if (strncmp(prefix_dir, componentpath.c_str(), prefix_size) != 0) { - err_printf("Tried to clean a file %s that wasn't in our prefix %s\n", componentpath.c_str(), prefix_dir); + // We have multiple directory roots. Not a problem if the directory + // we're looking for isn't in this one. 
continue; } unlink(componentpath.c_str()); @@ -83,7 +84,6 @@ static void rmDirSet(const set &dirs, const char *prefix_dir) sort(ordered_dirs.begin(), ordered_dirs.end(), longest_str_first); for (vector::iterator i = ordered_dirs.begin(); i != ordered_dirs.end(); i++) { if (strncmp(prefix_dir, i->c_str(), prefix_size) != 0) { - err_printf("Tried to rmdir directory %s that wasn't in our prefix %s\n", i->c_str(), prefix_dir); continue; } rmdir(i->c_str()); diff --git a/testsuite/test_driver.c b/testsuite/test_driver.c index acdc54e5..fcd9b7a3 100644 --- a/testsuite/test_driver.c +++ b/testsuite/test_driver.c @@ -1258,27 +1258,6 @@ static char* getCacheLocation(char *env_var) return strdup(last_slash); } -static int checkLinkForLeak(const char *path, const char *spindle_loc) -{ - char link_target[4096]; - int result, error; - memset(link_target, 0, sizeof(link_target)); - - result = readlink(path, link_target, sizeof(link_target)); - if (result == -1) { - error = errno; - err_printf("Failed to read link %s: %s\n", path, strerror(error)); - return -1; - } - - if (strstr(link_target, spindle_loc)) { - err_printf("Link at '%s' has path '%s', which leaks spindle path with '%s'\n", path, link_target, spindle_loc); - return -1; - } - - return 0; -} - static int checkPathForLeak(const char *what, const char *path, const char *spindle_loc) { if (strstr(path, spindle_loc)) { @@ -1375,9 +1354,7 @@ void check_for_path_leaks() continue; strncpy(path, "/proc/self/fd/", sizeof(path)); strncat(path, d->d_name, sizeof(path)-1); - checkLinkForLeak(path, spindle_loc); } - checkLinkForLeak("/proc/self/exe", spindle_loc); /** * Check link_maps for leaked spindle paths From 091407f726448f2c599c909da780b98eb30fd9c6 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 3 Oct 2025 09:57:06 -0700 Subject: [PATCH 10/53] Cachepath: Set of small, miscellaneous patches. 
--- src/include/spindle_launch.h | 5 +++++ src/server/auditserver/ldcs_audit_server_md.h | 3 +++ src/server/auditserver/ldcs_audit_server_process.c | 5 +++-- src/server/auditserver/ldcs_audit_server_process.h | 5 +++++ src/server/startup/spindle_be.cc | 4 +++- 5 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/include/spindle_launch.h b/src/include/spindle_launch.h index 47a4d92e..5476734d 100644 --- a/src/include/spindle_launch.h +++ b/src/include/spindle_launch.h @@ -126,6 +126,11 @@ typedef struct { /* The local-disk location where Spindle will store its cache */ char *location; + /* Path[s] for cached libraries. */ + char *candidate_cachepaths; /* Colon-separated list of candidate paths (max 64) */ + char *chosen_cachepath; /* The consensus path (same across all nodes). */ + uint64_t cachepath_bitidx; /* Bit index used by allReduce() to arrive at consensus. */ + /* Colon-seperated list of directories where Python is installed */ char *pythonprefix; diff --git a/src/server/auditserver/ldcs_audit_server_md.h b/src/server/auditserver/ldcs_audit_server_md.h index eb5bf9f6..ba7943e2 100644 --- a/src/server/auditserver/ldcs_audit_server_md.h +++ b/src/server/auditserver/ldcs_audit_server_md.h @@ -107,6 +107,9 @@ int ldcs_audit_server_md_broadcast_noncontig(ldcs_process_data_t *ldcs_process_d int ldcs_audit_server_md_get_num_children(ldcs_process_data_t *procdata); int ldcs_audit_server_md_is_parent(node_peer_t peer); + +void ldcs_audit_server_md_consensus(ldcs_process_data_t *ldcs_process_data, ldcs_message_t *msg); + #if defined(__cplusplus) } diff --git a/src/server/auditserver/ldcs_audit_server_process.c b/src/server/auditserver/ldcs_audit_server_process.c index 312095be..dca91c20 100644 --- a/src/server/auditserver/ldcs_audit_server_process.c +++ b/src/server/auditserver/ldcs_audit_server_process.c @@ -142,6 +142,9 @@ int ldcs_audit_server_process(spindle_args_t *args) debug_printf3("Initializing server data structures\n"); 
ldcs_process_data.location = args->location; + ldcs_process_data.cachepaths = args->candidate_cachepaths; + ldcs_process_data.cachepath = args->chosen_cachepath; + ldcs_process_data.cachepath_bitidx = args->cachepath_bitidx; ldcs_process_data.number = args->number; ldcs_process_data.pythonprefix = args->pythonprefix; ldcs_process_data.localprefix = args->local_prefixes; @@ -191,8 +194,6 @@ int ldcs_audit_server_process(spindle_args_t *args) } ldcs_process_data.server_stat.hostname=ldcs_process_data.hostname; - debug_printf3("Initializing file cache location %s\n", ldcs_process_data.location); - ldcs_audit_server_filemngt_init(ldcs_process_data.location); if (ldcs_process_data.opts & OPT_PROCCLEAN) init_cleanup_proc(ldcs_process_data.location); diff --git a/src/server/auditserver/ldcs_audit_server_process.h b/src/server/auditserver/ldcs_audit_server_process.h index 18b3320a..9ba1675e 100644 --- a/src/server/auditserver/ldcs_audit_server_process.h +++ b/src/server/auditserver/ldcs_audit_server_process.h @@ -126,6 +126,11 @@ struct ldcs_process_data_struct ldcs_dist_model_t dist_model; ldcs_client_t* client_table; char *location; + char *cachepaths; + char *cachepath; + char *symbolic_cachepath; + char *parsed_cachepath; + int64_t cachepath_bitidx; char *hostname; char *pythonprefix; char *localprefix; diff --git a/src/server/startup/spindle_be.cc b/src/server/startup/spindle_be.cc index 7493c020..4f583756 100644 --- a/src/server/startup/spindle_be.cc +++ b/src/server/startup/spindle_be.cc @@ -59,6 +59,7 @@ static int unpack_data(spindle_args_t *args, void *buffer, int buffer_size) unpack_param(args->startup_type, buf, pos); unpack_param(args->shm_cache_size, buf, pos); unpack_param(args->location, buf, pos); + unpack_param(args->candidate_cachepaths, buf, pos); unpack_param(args->pythonprefix, buf, pos); unpack_param(args->preloadfile, buf, pos); unpack_param(args->bundle_timeout_ms, buf, pos); @@ -152,7 +153,8 @@ int spindleRunBE(unsigned int port, unsigned int 
num_ports, unique_id_t unique_i debug_printf("Translated location from %s to %s\n", args.location, new_location); free(args.location); args.location = new_location; - test_printf(" location=%s\n", args.location); + + determineValidCachePaths( &args.cachepath_bitidx, args.candidate_cachepaths, args.number); result = ldcs_audit_server_process(&args); if (result == -1) { From b2a8c5351d521aa8297e91c2d204f67840952f7b Mon Sep 17 00:00:00 2001 From: Barry Date: Tue, 21 Oct 2025 12:21:37 -0700 Subject: [PATCH 11/53] Fixes per Matt's comments. --- src/client/client/intercept_exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client/client/intercept_exec.c b/src/client/client/intercept_exec.c index 7af4e73b..c8ba97f9 100644 --- a/src/client/client/intercept_exec.c +++ b/src/client/client/intercept_exec.c @@ -196,7 +196,7 @@ static char **updateEnvironment(char **envp, int *num_modified, int propogate_sp if (envp) { debug_printf2("Propogating spindle environment by copying it to new envp list\n"); for (cur = (char **) envp; *cur; cur++, orig_size++); - new_size = orig_size + 20; + new_size = orig_size + 9; newenv = (char **) malloc(new_size * sizeof(char*)); propogateEnvironmentStr(envp, newenv, &pos, "SPINDLE"); From 42a55c897548b553bf6a478f6576940174fab98d Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 22 Oct 2025 12:20:35 -0700 Subject: [PATCH 12/53] Single source of truth for client cachepath. Previously, chosen_realized_cachepath was copied into set_intercept_readlink_cachepath() chosen_realized_cachepath and chosen_parsed_cachepath were copied into set_should_intercept_cachepath() This PR removes both setter functions and makes the original pointers global. 
--- src/client/client/client.c | 7 ++----- src/client/client/intercept.h | 1 - src/client/client/intercept_readlink.c | 12 +++--------- src/client/client/should_intercept.c | 17 +++++------------ src/client/client/should_intercept.h | 1 - 5 files changed, 10 insertions(+), 28 deletions(-) diff --git a/src/client/client/client.c b/src/client/client/client.c index a680b55c..82f82da7 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -72,7 +72,7 @@ static int num_libc_phdrs, num_interp_phdrs; ElfW(Addr) libc_loadoffset, interp_loadoffset; static char *location; -static char *chosen_realized_cachepath, *chosen_parsed_cachepath, *chosen_symbolic_cachepath; +char *chosen_realized_cachepath, *chosen_parsed_cachepath; number_t number; static int have_stat_patches; @@ -263,10 +263,7 @@ static int init_server_connection() send_cpu(ldcsid, get_cur_cpu()); #endif } - send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath, &chosen_symbolic_cachepath ); - set_should_intercept_cachepath( chosen_realized_cachepath, chosen_parsed_cachepath, chosen_symbolic_cachepath ); - set_intercept_readlink_cachepath( chosen_realized_cachepath, chosen_parsed_cachepath, chosen_symbolic_cachepath ); - + send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath, NULL); snprintf(debugging_name, 32, "Client.%d", rankinfo[0]); LOGGING_INIT(debugging_name); diff --git a/src/client/client/intercept.h b/src/client/client/intercept.h index aae968f7..4ace2328 100644 --- a/src/client/client/intercept.h +++ b/src/client/client/intercept.h @@ -89,7 +89,6 @@ int execvpe_wrapper(const char *path, char *const argv[], const char *envp[]); pid_t vfork_wrapper(); char *dlerror_wrapper(); -void set_intercept_readlink_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ); ssize_t readlink_wrapper(const char *path, char *buf, size_t bufsiz); ssize_t readlinkat_wrapper(int dirfd, const 
char *pathname, char *buf, size_t bufsiz); diff --git a/src/client/client/intercept_readlink.c b/src/client/client/intercept_readlink.c index 2abf03fc..28547bf1 100644 --- a/src/client/client/intercept_readlink.c +++ b/src/client/client/intercept_readlink.c @@ -31,23 +31,17 @@ Place, Suite 330, Boston, MA 02111-1307 USA ssize_t (*orig_readlink)(const char *path, char *buf, size_t bufsiz); ssize_t (*orig_readlinkat)(int dirfd, const char *pathname, char *buf, size_t bufsiz); -static char *cachepath; - -void set_intercept_readlink_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ){ - cachepath = chosen_realized_cachepath; - chosen_parsed_cachepath = chosen_parsed_cachepath; - chosen_symbolic_cachepath = chosen_symbolic_cachepath; -} static int fix_local_readlink(char *buf, size_t bufsiz) { char spindle_id[32]; int cachepath_len, result; char tmp[MAX_PATH_LEN+1]; + extern char *chosen_realized_cachepath; - cachepath_len = strlen(cachepath); + cachepath_len = strlen(chosen_realized_cachepath); snprintf(spindle_id, sizeof(spindle_id), "spindle.%lx", number); - if (strstr(buf, spindle_id) && strncmp(cachepath, buf, cachepath_len) == 0) { + if (strstr(buf, spindle_id) && strncmp(chosen_realized_cachepath, buf, cachepath_len) == 0) { debug_printf2("readlink received spindle cache path %s. 
Translating\n", buf); result = send_orig_path_request(ldcsid, buf+cachepath_len+1, tmp); if (result == -1) diff --git a/src/client/client/should_intercept.c b/src/client/client/should_intercept.c index 3a348d3d..cee4e43c 100644 --- a/src/client/client/should_intercept.c +++ b/src/client/client/should_intercept.c @@ -29,27 +29,20 @@ #include "spindle_debug.h" extern int relocate_spindleapi(); -static char *cachepath, *orig_cachepath; - -void set_should_intercept_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ){ - cachepath = chosen_realized_cachepath; - orig_cachepath = chosen_parsed_cachepath; - chosen_symbolic_cachepath = chosen_symbolic_cachepath; -} - int is_in_spindle_cache(const char *pathname) { static int cachepath_size = 0; static int orig_cachepath_size = 0; + extern char *chosen_realized_cachepath, *chosen_parsed_cachepath; if (!cachepath_size) { - cachepath_size = strlen(cachepath); + cachepath_size = strlen(chosen_realized_cachepath); } if (!orig_cachepath_size) { - orig_cachepath_size = strlen(orig_cachepath); + orig_cachepath_size = strlen(chosen_parsed_cachepath); } - return ((strncmp(pathname, cachepath, cachepath_size) == 0) || - (strncmp(pathname, orig_cachepath, orig_cachepath_size) == 0)); + return ((strncmp(pathname, chosen_realized_cachepath, cachepath_size) == 0) || + (strncmp(pathname, chosen_parsed_cachepath, orig_cachepath_size) == 0)); } extern int is_local_prefix(const char *path, char **cached_local_prefixes); diff --git a/src/client/client/should_intercept.h b/src/client/client/should_intercept.h index 6a545913..f6a9b510 100644 --- a/src/client/client/should_intercept.h +++ b/src/client/client/should_intercept.h @@ -27,7 +27,6 @@ #define EXCL_OPEN 2 #define ERR_CALL 3 -void set_should_intercept_cachepath( char *chosen_realized_cachepath, char *chosen_parsed_cachepath, char *chosen_symbolic_cachepath ); int open_filter(const char *fname, int flags); int fopen_filter(const char 
*fname, const char *flags); int exec_filter(const char *fname); From 9183447ef8a376392a7e85d12bbd42bd38ebd104 Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 22 Oct 2025 14:39:23 -0700 Subject: [PATCH 13/53] Comments the cachepath variables. --- .../auditserver/ldcs_audit_server_process.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/server/auditserver/ldcs_audit_server_process.h b/src/server/auditserver/ldcs_audit_server_process.h index 9ba1675e..f9f4c30c 100644 --- a/src/server/auditserver/ldcs_audit_server_process.h +++ b/src/server/auditserver/ldcs_audit_server_process.h @@ -125,12 +125,15 @@ struct ldcs_process_data_struct int exit_readys_recvd; ldcs_dist_model_t dist_model; ldcs_client_t* client_table; - char *location; - char *cachepaths; - char *cachepath; - char *symbolic_cachepath; - char *parsed_cachepath; - int64_t cachepath_bitidx; + char *location; /* Single user-specified path for fifo, daemons, etc. */ + /* (Everything except the cachepath.) */ + char *cachepaths; /* Up to 64 colon-separated list of candidate cachepaths. */ + char *cachepath; /* The earliest path in the list available to all servers. */ + /* (Environment variables replaced, symbolic links realized.) */ + char *symbolic_cachepath; /* The original representation of the cachepath. */ + char *parsed_cachepath; /* The cachepath with environment variables replaced. */ + /* (Symbolic links, if any, remain.) */ + int64_t cachepath_bitidx; /* Bit index of valid cachepaths on a given server. */ char *hostname; char *pythonprefix; char *localprefix; From 28c097c3db494bfd7883ace3500a42349dfe5046 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 13:07:21 -0700 Subject: [PATCH 14/53] Removes internal vars from spindle_launch.h Removes chosen_cachepath and cachepath_bitindex from spindle_launch.h Updates initialization of matching variables in ldcs_process_data. 
determineValidCachePaths() moved from spindle_be.cc to ldcs_audit_server_process.c to get ldcs_process_data visibility. Added #include "parseloc.h" to ldcs_audit_server_process.c to get declaration of determineValidCachePaths(). Relocated "parseloc.h" to src/utils so ldcs_audit_server_process.c could find it. Trued up signedness of types caused by making "parseloc.h" more visible, e.g., cachepath_bitidx is now uint64_t everywhere. --- src/cobo/cobo.c | 2 +- src/cobo/ldcs_cobo.h | 2 +- src/include/spindle_launch.h | 2 -- src/server/auditserver/ldcs_audit_server_process.c | 9 +++++++-- src/server/auditserver/ldcs_audit_server_process.h | 2 +- src/server/startup/spindle_be.cc | 2 -- src/{client/beboot => utils}/parseloc.h | 2 +- 7 files changed, 11 insertions(+), 10 deletions(-) rename src/{client/beboot => utils}/parseloc.h (91%) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index ad2ea878..59c2f809 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -1434,7 +1434,7 @@ int cobo_allgather_str(char* sendstr, char*** recvstr, char** recvbuf) return COBO_SUCCESS; } -int cobo_allreduce( int64_t *pval, cobo_op_t op ){ +int cobo_allreduce( uint64_t *pval, cobo_op_t op ){ /* if i have any children, receive their data */ int64_t child_val; diff --git a/src/cobo/ldcs_cobo.h b/src/cobo/ldcs_cobo.h index 30cc9673..fafbda6a 100644 --- a/src/cobo/ldcs_cobo.h +++ b/src/cobo/ldcs_cobo.h @@ -142,7 +142,7 @@ int cobo_alltoall (void* sendbuf, int sendcount, void* recvbuf); */ int cobo_allgather_str(char* sendstr, char*** recvstr, char** recvbuf); -int cobo_allreduce(int64_t *pval, cobo_op_t op); +int cobo_allreduce(uint64_t *pval, cobo_op_t op); /* * ========================================================================== diff --git a/src/include/spindle_launch.h b/src/include/spindle_launch.h index 5476734d..81c0728e 100644 --- a/src/include/spindle_launch.h +++ b/src/include/spindle_launch.h @@ -128,8 +128,6 @@ typedef struct { /* Path[s] for cached libraries. 
*/ char *candidate_cachepaths; /* Colon-separated list of candidate paths (max 64) */ - char *chosen_cachepath; /* The consensus path (same across all nodes). */ - uint64_t cachepath_bitidx; /* Bit index used by allReduce() to arrive at consensus. */ /* Colon-seperated list of directories where Python is installed */ char *pythonprefix; diff --git a/src/server/auditserver/ldcs_audit_server_process.c b/src/server/auditserver/ldcs_audit_server_process.c index dca91c20..566cad01 100644 --- a/src/server/auditserver/ldcs_audit_server_process.c +++ b/src/server/auditserver/ldcs_audit_server_process.c @@ -37,6 +37,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include "msgbundle.h" #include "exitnote.h" #include "cleanup_proc.h" +#include "parseloc.h" //#define GPERFTOOLS #if defined(GPERFTOOLS) @@ -143,8 +144,8 @@ int ldcs_audit_server_process(spindle_args_t *args) debug_printf3("Initializing server data structures\n"); ldcs_process_data.location = args->location; ldcs_process_data.cachepaths = args->candidate_cachepaths; - ldcs_process_data.cachepath = args->chosen_cachepath; - ldcs_process_data.cachepath_bitidx = args->cachepath_bitidx; + ldcs_process_data.cachepath = NULL; + ldcs_process_data.cachepath_bitidx = 0; ldcs_process_data.number = args->number; ldcs_process_data.pythonprefix = args->pythonprefix; ldcs_process_data.localprefix = args->local_prefixes; @@ -230,6 +231,10 @@ int ldcs_audit_server_process(spindle_args_t *args) if (fd != -1) { ldcs_listen_register_fd(fd, serverid, forceExitCB, (void *) &ldcs_process_data); } + determineValidCachePaths( + &ldcs_process_data.cachepath_bitidx, + ldcs_process_data.cachepaths, + ldcs_process_data.number ); return 0; } diff --git a/src/server/auditserver/ldcs_audit_server_process.h b/src/server/auditserver/ldcs_audit_server_process.h index f9f4c30c..1495cebd 100644 --- a/src/server/auditserver/ldcs_audit_server_process.h +++ b/src/server/auditserver/ldcs_audit_server_process.h @@ -133,7 +133,7 @@ struct 
ldcs_process_data_struct char *symbolic_cachepath; /* The original representation of the cachepath. */ char *parsed_cachepath; /* The cachepath with environment variables replaced. */ /* (Symbolic links, if any, remain.) */ - int64_t cachepath_bitidx; /* Bit index of valid cachepaths on a given server. */ + uint64_t cachepath_bitidx; /* Bit index of valid cachepaths on a given server. */ char *hostname; char *pythonprefix; char *localprefix; diff --git a/src/server/startup/spindle_be.cc b/src/server/startup/spindle_be.cc index 4f583756..733d3244 100644 --- a/src/server/startup/spindle_be.cc +++ b/src/server/startup/spindle_be.cc @@ -154,8 +154,6 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i free(args.location); args.location = new_location; - determineValidCachePaths( &args.cachepath_bitidx, args.candidate_cachepaths, args.number); - result = ldcs_audit_server_process(&args); if (result == -1) { err_printf("Error in ldcs_audit_server_process\n"); diff --git a/src/client/beboot/parseloc.h b/src/utils/parseloc.h similarity index 91% rename from src/client/beboot/parseloc.h rename to src/utils/parseloc.h index 1731906a..a99409c3 100644 --- a/src/client/beboot/parseloc.h +++ b/src/utils/parseloc.h @@ -28,7 +28,7 @@ char *parse_location_noerr(char *loc, number_t number); char *realize(char *path); char **parse_colonsep_prefixes(char *colonsep_list, number_t number); int is_local_prefix(const char *path, char **local_prefixes); -static int validateCandidatePath( char *candidatePath, char **realizedPath, char **parsedPath, char **symbolicPath, number_t number ); +int validateCandidatePath( char *candidatePath, char **realizedPath, char **parsedPath, char **symbolicPath, number_t number ); void determineValidCachePaths( uint64_t *validBitIdx, char *origPathList, number_t number ); void getValidCachePathByIndex( uint64_t validBitIdx, char **realizedCachePath, char **parsedCachePath, char **symbolicCachePath ); From 
0edeac98dce997697b5ad27de9dd706d2d25d743 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 14:18:59 -0700 Subject: [PATCH 15/53] Client cachepath message now uses single response. The three-message-reply response is now a single message with two strings. The symbolic version of the cachepath is no longer communicated as it was not being used. --- src/client/client/client.c | 2 +- src/client/client_comlib/client_api.c | 31 ++++--------------- src/client/client_comlib/client_api.h | 2 +- .../auditserver/ldcs_audit_server_handlers.c | 19 +++--------- 4 files changed, 13 insertions(+), 41 deletions(-) diff --git a/src/client/client/client.c b/src/client/client/client.c index 82f82da7..d715e31e 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -263,7 +263,7 @@ static int init_server_connection() send_cpu(ldcsid, get_cur_cpu()); #endif } - send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath, NULL); + send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath ); snprintf(debugging_name, 32, "Client.%d", rankinfo[0]); LOGGING_INIT(debugging_name); diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index 267b8a93..cdff2bb5 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -36,9 +36,9 @@ static struct lock_t comm_lock; #define COMM_LOCK do { if (lock(&comm_lock) == -1) return -1; } while (0) #define COMM_UNLOCK unlock(&comm_lock) - -int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chosen_parsed_cachepath, char **chosen_symbolic_cachepath ){ + +int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chosen_parsed_cachepath){ ldcs_message_t message; char buffer[MAX_PATH_LEN+1]; buffer[MAX_PATH_LEN] = '\0'; @@ -59,32 +59,13 @@ int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chose err_printf("Got unexpected message of type %d\n", 
(int) message.header.type); assert(0); } + char *local_crc = strdup( buffer ); + char *local_cpc = strdup( &buffer[ strlen(local_crc) + 1 ] ); if( chosen_realized_cachepath ){ - *chosen_realized_cachepath = strdup( buffer ); - } - - COMM_LOCK; - client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); - COMM_UNLOCK; - - if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { - err_printf("Got unexpected message of type %d\n", (int) message.header.type); - assert(0); + *chosen_realized_cachepath = local_crc; } if( chosen_parsed_cachepath ){ - *chosen_parsed_cachepath = strdup( buffer ); - } - - COMM_LOCK; - client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); - COMM_UNLOCK; - - if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { - err_printf("Got unexpected message of type %d\n", (int) message.header.type); - assert(0); - } - if( chosen_symbolic_cachepath ){ - *chosen_symbolic_cachepath = strdup( buffer ); + *chosen_parsed_cachepath = local_cpc; } return 0; diff --git a/src/client/client_comlib/client_api.h b/src/client/client_comlib/client_api.h index 982c4b1c..3d7c41be 100644 --- a/src/client/client_comlib/client_api.h +++ b/src/client/client_comlib/client_api.h @@ -42,7 +42,7 @@ int send_orig_path_request(int fd, const char *path, char *newpath); int send_dirlists_request(int fd, char **local_result, char **exece_result, char **to_free); int send_procmaps_query(int fd, int pid, char *result); int send_pickone_query(int fd, char *key, int *result); -int send_cachepath_query( int fd, char **chosen_symbolic_cachepath, char **chosen_parsed_cachepath, char **chosen_realized_cachepath ); +int send_cachepath_query( int fd, char **chosen_symbolic_cachepath, char **chosen_parsed_cachepath); int get_python_prefix(int fd, char **prefix); diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index 1f6bc354..b8102d6a 100644 --- 
a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -3008,21 +3008,12 @@ static int handle_chosen_cachepath_request(ldcs_process_data_t *procdata, int nc msg.header.type = LDCS_MSG_CHOSEN_CACHEPATH; - msg.header.len = strlen(procdata->cachepath) + 1; - msg.data = procdata->cachepath; - ldcs_send_msg(connid, &msg); - procdata->server_stat.clientmsg.cnt++; - procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; - - msg.header.len = strlen(procdata->parsed_cachepath) + 1; - msg.data = procdata->parsed_cachepath; - ldcs_send_msg(connid, &msg); - procdata->server_stat.clientmsg.cnt++; - procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; - - msg.header.len = strlen(procdata->symbolic_cachepath) + 1; - msg.data = procdata->symbolic_cachepath; + msg.header.len = strlen(procdata->cachepath) + 1 + strlen(procdata->parsed_cachepath) + 1; + msg.data = calloc( 1, msg.header.len ); + strcpy( msg.data, procdata->cachepath ); + strcpy( &msg.data[ strlen(procdata->cachepath)+1 ], procdata->parsed_cachepath ); ldcs_send_msg(connid, &msg); + free( msg.data ); procdata->server_stat.clientmsg.cnt++; procdata->server_stat.clientmsg.time += ldcs_get_time() - client->query_arrival_time; From c9594a51285829c58206f7cdf9df292260e45e4c Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 14:33:07 -0700 Subject: [PATCH 16/53] Removes assert(0) in network error paths. 
--- src/client/client_comlib/client_api.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index cdff2bb5..4b0f9ded 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -57,7 +57,7 @@ int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chose if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { err_printf("Got unexpected message of type %d\n", (int) message.header.type); - assert(0); + return -1; } char *local_crc = strdup( buffer ); char *local_cpc = strdup( &buffer[ strlen(local_crc) + 1 ] ); @@ -102,7 +102,7 @@ int send_file_query(int fd, char* path, int dso, char** newpath, int *errcode) { if (message.header.type != LDCS_MSG_FILE_QUERY_ANSWER) { err_printf("Got unexpected message of type %d\n", (int) message.header.type); - assert(0); + return -1; } if (message.header.len > sizeof(int)) { @@ -195,7 +195,7 @@ int send_existance_test(int fd, char *path, int *exists) if (message.header.type != LDCS_MSG_EXISTS_ANSWER || message.header.len != sizeof(uint32_t)) { err_printf("Got unexpected message after existance test: %d\n", (int) message.header.type); - assert(0); + return -1; } memcpy(exists, buffer, sizeof(*exists)); @@ -232,7 +232,7 @@ int send_orig_path_request(int fd, const char *path, char *newpath) if (message.header.type != LDCS_MSG_ORIGPATH_ANSWER || message.header.len > MAX_PATH_LEN) { err_printf("Got unexpected message after existance test: %d\n", (int) message.header.type); - assert(0); + return -1; } strncpy(newpath, buffer, MAX_PATH_LEN+1); @@ -380,7 +380,7 @@ int send_ldso_info_request(int fd, const char *ldso_path, char *result_path) if (message.header.type != LDCS_MSG_LOADER_DATA_RESP) { err_printf("Got unexpected message after ldso req: %d\n", (int) message.header.type); - assert(0); + return -1; } return 0; } @@ -422,7 +422,7 @@ 
int send_rankinfo_query(int fd, int *mylrank, int *mylsize, int *mymdrank, int * if (message.header.type != LDCS_MSG_MYRANKINFO_QUERY_ANSWER || message.header.len != 4*sizeof(int)) { err_printf("Received incorrect response to rankinfo query %d\n", message.header.type); *mylrank = *mylsize = *mymdrank = *mymdsize = -1; - assert(0); + return -1; } p = (int *) message.data; @@ -457,7 +457,7 @@ int send_procmaps_query(int fd, int pid, char *result) if (message.header.type != LDCS_MSG_PROCMAPS_RESP) { err_printf("Received incorrect response to procmaps query %d\n", message.header.type); - assert(0); + return -1; } memcpy(result, buffer, MAX_PATH_LEN); @@ -488,7 +488,7 @@ int send_pickone_query(int fd, char *key, int *result) if (message.header.type != LDCS_MSG_PICKONE_RESP) { err_printf("Received incorrect response to procmaps query %d\n", message.header.type); - assert(0); + return -1; } *result = *((int *) message.data); From eba3457ce84913d975c093f2b3d8818c781f19fc Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 15:18:40 -0700 Subject: [PATCH 17/53] Renames ldcs_audit_server_md_consensus(). New name is ldcs_audit_server_md_allreduce_AND(). If we get to the point where we're using other allreduce operations we can solve the problem of duplicating the op list in md-land and cobo-land. For now, we're only using one op in md-land, so the op can go into the function name. 
--- src/server/auditserver/ldcs_audit_server_handlers.c | 2 +- src/server/auditserver/ldcs_audit_server_md.h | 2 +- src/server/auditserver/ldcs_audit_server_md_cobo.c | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index b8102d6a..ac39810c 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -2971,7 +2971,7 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag msgbundle_force_flush(procdata); } - ldcs_audit_server_md_consensus(procdata, msg); + ldcs_audit_server_md_allreduce_AND( &procdata->cachepath_bitidx ); if( procdata->cachepath_bitidx == 0 ){ err_printf("No valid cachepath path available. Falling back to \"location\" path (%s).\n", procdata->location); diff --git a/src/server/auditserver/ldcs_audit_server_md.h b/src/server/auditserver/ldcs_audit_server_md.h index ba7943e2..a4640370 100644 --- a/src/server/auditserver/ldcs_audit_server_md.h +++ b/src/server/auditserver/ldcs_audit_server_md.h @@ -108,7 +108,7 @@ int ldcs_audit_server_md_get_num_children(ldcs_process_data_t *procdata); int ldcs_audit_server_md_is_parent(node_peer_t peer); -void ldcs_audit_server_md_consensus(ldcs_process_data_t *ldcs_process_data, ldcs_message_t *msg); +void ldcs_audit_server_md_allreduce_AND( uint64_t *val ); #if defined(__cplusplus) diff --git a/src/server/auditserver/ldcs_audit_server_md_cobo.c b/src/server/auditserver/ldcs_audit_server_md_cobo.c index d8b5442f..27393a55 100644 --- a/src/server/auditserver/ldcs_audit_server_md_cobo.c +++ b/src/server/auditserver/ldcs_audit_server_md_cobo.c @@ -402,8 +402,6 @@ int ldcs_audit_server_md_get_num_children(ldcs_process_data_t *procdata) return num_childs; } -void ldcs_audit_server_md_consensus(ldcs_process_data_t *ldcs_process_data, ldcs_message_t *msg){ - if( msg->header.type == 
LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS ){ - cobo_allreduce( &ldcs_process_data->cachepath_bitidx, COBO_OP_BITWISE_AND ); - } +void ldcs_audit_server_md_allreduce_AND( uint64_t *val ){ + cobo_allreduce( val, COBO_OP_BITWISE_AND ); } From fed846b50eee32e9f8ab0093dbe7197ec2b10046 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 15:27:27 -0700 Subject: [PATCH 18/53] Adds explicit enum values to CmdlineShortOptions. --- src/fe/startup/config_mgr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fe/startup/config_mgr.h b/src/fe/startup/config_mgr.h index 27be1ae8..c2d3cd7e 100644 --- a/src/fe/startup/config_mgr.h +++ b/src/fe/startup/config_mgr.h @@ -126,8 +126,8 @@ enum CmdlineShortOptions { shortSpindleLevel = 296, shortLocalPrefix = 297, shortExecExcludes = 298, - shortPatchLdso, - shortCachePaths, + shortPatchLdso = 299, + shortCachePaths = 300, }; enum CmdlineGroups { From c6ae16b8936444447155ca850ba1886281b282d7 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 23 Oct 2025 15:38:19 -0700 Subject: [PATCH 19/53] Return instead of exit on network errors. 
--- src/cobo/cobo.c | 58 ++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index 59c2f809..65741008 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -160,7 +160,7 @@ static char* cobo_getenv(char* envvar, int type) char* str = getenv(envvar); if (str == NULL && type == ENV_REQUIRED) { err_printf("Missing required environment variable: %s\n", envvar); - exit(1); + return NULL; } return str; } @@ -171,7 +171,7 @@ static void* cobo_malloc(size_t n, char* msg) void* p = malloc(n); if (!p) { err_printf("Call to malloc(%lu) failed: %s (%m errno %d)\n", n, msg, errno); - exit(1); + return NULL; } return p; } @@ -513,7 +513,7 @@ static int cobo_connect_hostname(char* hostname, int rank) break; case HSHAKE_INTERNAL_ERROR: err_printf("Internal error doing handshake: %s", spindle_handshake_last_error_str()); - exit(-1); + return -1; break; case HSHAKE_DROP_CONNECTION: debug_printf3("Handshake said to drop connection\n"); @@ -768,7 +768,7 @@ static int cobo_open_tree() if (sockfd < 0) { err_printf("Creating parent socket (socket() %m errno=%d)\n", errno); - exit(1); + return -1; } setCloseOnExec(sockfd); @@ -817,7 +817,7 @@ static int cobo_open_tree() if (!port_is_bound) { /* TODO: would like to send an abort back to server */ err_printf("Failed to open socket on any port\n"); - exit(1); + return -1; } /* accept a connection from parent and receive socket table */ @@ -837,7 +837,7 @@ static int cobo_open_tree() break; case HSHAKE_INTERNAL_ERROR: err_printf("Internal error doing handshake: %s", spindle_handshake_last_error_str()); - exit(-1); + return -1; break; case HSHAKE_DROP_CONNECTION: debug_printf3("Handshake said to drop connection\n"); @@ -907,26 +907,26 @@ static int cobo_open_tree() /* read our rank number */ if (cobo_read_fd(cobo_parent_fd, &cobo_me, sizeof(int)) < 0) { err_printf("Receiving my rank from parent failed\n"); - exit(1); + return -1; } /* discover how 
many ranks are in our world */ if (cobo_read_fd(cobo_parent_fd, &cobo_nprocs, sizeof(int)) < 0) { err_printf("Receiving number of tasks from parent failed\n"); - exit(1); + return -1; } /* read the size of the hostlist (in bytes) */ if (cobo_read_fd(cobo_parent_fd, &cobo_hostlist_size, sizeof(int)) < 0) { err_printf("Receiving size of hostname table from parent failed\n"); - exit(1); + return -1; } /* allocate space for the hostlist and read it in */ cobo_hostlist = (void*) cobo_malloc(cobo_hostlist_size, "Hostlist data buffer"); if (cobo_read_fd(cobo_parent_fd, cobo_hostlist, cobo_hostlist_size) < 0) { err_printf("Receiving hostname table from parent failed\n"); - exit(1); + return -1; } /* @@ -969,7 +969,7 @@ static int cobo_open_tree() if (cobo_child_fd[i] == -1) { err_printf("Failed to connect to child (rank %d) on %s failed\n", c, child_hostname); - exit(1); + return -1; } /* tell child what rank he is and forward the hostname table to him */ @@ -978,7 +978,7 @@ static int cobo_open_tree() if (forward != COBO_SUCCESS) { err_printf("Failed to forward hostname table to child (rank %d) on %s failed\n", c, child_hostname); - exit(1); + return -1; } /* free the child hostname string */ @@ -1033,7 +1033,7 @@ static int cobo_bcast_tree(void* buf, int size) if (cobo_me != 0) { if (cobo_read_fd(cobo_parent_fd, buf, size) < 0) { err_printf("Receiving broadcast data from parent failed\n"); - exit(1); + return -1; } } @@ -1042,7 +1042,7 @@ static int cobo_bcast_tree(void* buf, int size) if (cobo_write_fd(cobo_child_fd[i], buf, size) < 0) { err_printf("Broadcasting data to child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } } @@ -1058,7 +1058,7 @@ int cobo_bcast_down(void* buf, int size) if (cobo_write_fd(cobo_child_fd[i], buf, size) < 0) { err_printf("Broadcasting data to child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } } return rc; @@ -1080,7 +1080,7 @@ static int cobo_allreduce_max_int_tree(int* sendbuf, int* recvbuf) if 
(cobo_read_fd(cobo_child_fd[i], &child_val, sizeof(child_val)) < 0) { err_printf("Reducing data from child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } /* compare child's max to our current max */ @@ -1094,7 +1094,7 @@ static int cobo_allreduce_max_int_tree(int* sendbuf, int* recvbuf) /* not the root, so forward our reduction result to our parent */ if (cobo_write_fd(cobo_parent_fd, &max_val, sizeof(max_val)) < 0) { err_printf("Sending reduced data to parent failed\n"); - exit(1); + return -1; } } else { /* we're the root, got the result, set the recvbuf */ @@ -1129,7 +1129,7 @@ static int cobo_gather_tree(void* sendbuf, int sendcount, void* recvbuf) if (cobo_read_fd(cobo_child_fd[i], (char*)bigbuf + offset, sendcount * cobo_child_incl[i]) < 0) { err_printf("Gathering data from child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } offset += sendcount * cobo_child_incl[i]; } @@ -1138,7 +1138,7 @@ static int cobo_gather_tree(void* sendbuf, int sendcount, void* recvbuf) if (cobo_me != 0) { if (cobo_write_fd(cobo_parent_fd, bigbuf, bigcount) < 0) { err_printf("Sending gathered data to parent failed\n"); - exit(1); + return -1; } cobo_free(bigbuf); } @@ -1158,7 +1158,7 @@ static int cobo_scatter_tree(void* sendbuf, int sendcount, void* recvbuf) bigbuf = (void*) cobo_malloc(bigcount, "Temporary scatter buffer in cobo_scatter_tree"); if (cobo_read_fd(cobo_parent_fd, bigbuf, bigcount) < 0) { err_printf("Receiving scatter data from parent failed\n"); - exit(1); + return -1; } } @@ -1169,7 +1169,7 @@ static int cobo_scatter_tree(void* sendbuf, int sendcount, void* recvbuf) if (cobo_write_fd(cobo_child_fd[i], (char*)bigbuf + offset, sendcount * cobo_child_incl[i]) < 0) { err_printf("Scattering data to child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } offset += sendcount * cobo_child_incl[i]; } @@ -1257,7 +1257,7 @@ int cobo_bcast(void* buf, int sendcount, int root) rc = cobo_bcast_tree(buf, sendcount); } else { 
err_printf("Cannot execute bcast from non-zero root\n"); - exit(1); + return -1; } cobo_gettimeofday(&end); @@ -1283,7 +1283,7 @@ int cobo_gather(void* sendbuf, int sendcount, void* recvbuf, int root) rc = cobo_gather_tree(sendbuf, sendcount, recvbuf); } else { err_printf("Cannot execute gather to non-zero root\n"); - exit(1); + return -1; } cobo_gettimeofday(&end); @@ -1309,7 +1309,7 @@ int cobo_scatter(void* sendbuf, int sendcount, void* recvbuf, int root) rc = cobo_scatter_tree(sendbuf, sendcount, recvbuf); } else { err_printf("Cannot execute scatter from non-zero root\n"); - exit(1); + return -1; } cobo_gettimeofday(&end); @@ -1354,7 +1354,7 @@ int cobo_alltoall(void* sendbuf, int sendcount, void* recvbuf) int rc = COBO_SUCCESS; err_printf("Cannot execute alltoall\n"); - exit(1); + return -1; cobo_gettimeofday(&end); debug_printf3("Exiting cobo_alltoall(), took %f seconds for %d procs\n", cobo_getsecs(&end,&start), cobo_nprocs); @@ -1442,7 +1442,7 @@ int cobo_allreduce( uint64_t *pval, cobo_op_t op ){ /* read int64_t from child */ if (cobo_read_fd(cobo_child_fd[i], &child_val, sizeof(int64_t)) < 0) { err_printf("Reducing data from child (rank %d) failed\n", cobo_child[i]); - exit(1); + return -1; } /* compare child's val to our current val */ @@ -1466,7 +1466,7 @@ int cobo_allreduce( uint64_t *pval, cobo_op_t op ){ /* not the root, so forward our reduction result to our parent */ if (cobo_write_fd(cobo_parent_fd, pval, sizeof(*pval)) < 0) { err_printf("Sending reduced data to parent failed\n"); - exit(1); + return -1; } } @@ -1524,7 +1524,7 @@ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* cobo_ports = cobo_int_dup(portlist, num_ports); if (cobo_ports == NULL) { err_printf("Failed to copy port list\n"); - exit(1); + return -1; } /* open the tree */ @@ -1533,7 +1533,7 @@ int cobo_open(uint64_t sessionid, int* portlist, int num_ports, int* rank, int* /* need to check that tree opened successfully before returning, so do a barrier 
*/ if (cobo_barrier() != COBO_SUCCESS) { err_printf("Failed to open tree\n"); - exit(1); + return -1; } if (cobo_me == 0) { From 05e78b5ec22bdcf0838a17d70ada1f5c7fc3df79 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 24 Oct 2025 06:54:47 -0700 Subject: [PATCH 20/53] Apply rename to configuration and parameters. --- config.h.in | 6 +++--- configure | 17 +++++++++-------- configure.common.ac | 12 ++++++------ src/client/config.h.in | 6 +++--- src/client/configure | 17 +++++++++-------- src/fe/config.h.in | 6 +++--- src/fe/configure | 17 +++++++++-------- src/fe/startup/config_mgr.cc | 16 ++++++++-------- src/fe/startup/config_mgr.h | 4 ++-- src/flux/sessionmgr.c | 8 ++++---- src/server/config.h.in | 6 +++--- src/server/configure | 17 +++++++++-------- 12 files changed, 68 insertions(+), 64 deletions(-) diff --git a/config.h.in b/config.h.in index f07f2e20..121c551f 100644 --- a/config.h.in +++ b/config.h.in @@ -9,6 +9,9 @@ /* Colon-separated list of potential back-end cache directories */ #undef CACHEPATHS +/* Back-end directory for communication and housekeeping */ +#undef COMMPATH + /* Define if were using biter for client/server communication */ #undef COMM_BITER @@ -140,9 +143,6 @@ /* Default mode for slurm launch */ #undef SLURMLAUNCH_ENABLED -/* The default local directory for Spindle */ -#undef SPINDLE_LOC - /* The default colon-separated list of directories that Spindle will not cache files out of */ #undef SPINDLE_LOCAL_PREFIX diff --git a/configure b/configure index c4ac7615..b22b9b7c 100755 --- a/configure +++ b/configure @@ -848,7 +848,7 @@ enable_maintainer_mode with_default_port with_default_num_ports with_cachepaths -with_localstorage +with_commpath with_default_local_prefix with_testrm with_rm @@ -1593,7 +1593,8 @@ Optional Packages: communication --with-cachepaths=DIR Colon-separated list of potential back-end cache directories - --with-localstorage=DIR Directory on back-ends for storing relocated files + --with-compath=DIR Back-end directory for 
communication and + housekeeping --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle will not cache files out of @@ -16673,11 +16674,11 @@ else fi -# Check whether --with-localstorage was given. -if test "${with_localstorage+set}" = set; then : - withval=$with_localstorage; SPINDLE_LOC=${withval} +# Check whether --with-commpath was given. +if test "${with_commpath+set}" = set; then : + withval=$with_commpath; COMMPATH=${withval} else - SPINDLE_LOC=$DEFAULT_LOC + COMMPATH=$DEFAULT_LOC fi @@ -16685,7 +16686,7 @@ fi if test "${with_default_local_prefix+set}" = set; then : withval=$with_default_local_prefix; SPINDLE_LOCAL_PREFIX=${withval} else - SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$SPINDLE_LOC" + SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$COMMPATH" fi @@ -16705,7 +16706,7 @@ _ACEOF cat >>confdefs.h <<_ACEOF -#define SPINDLE_LOC "$SPINDLE_LOC" +#define COMMPATH "$COMMPATH" _ACEOF diff --git a/configure.common.ac b/configure.common.ac index ea6e5b6f..bc34e009 100644 --- a/configure.common.ac +++ b/configure.common.ac @@ -21,18 +21,18 @@ AC_ARG_WITH(cachepaths, [AS_HELP_STRING([--with-cachepaths=DIR],[Colon-separated list of potential back-end cache directories])], [CACHEPATHS=${withval}], [CACHEPATHS=$DEFAULT_LOC]) -AC_ARG_WITH(localstorage, - [AS_HELP_STRING([--with-localstorage=DIR],[Directory on back-ends for storing relocated files])], - [SPINDLE_LOC=${withval}], - [SPINDLE_LOC=$DEFAULT_LOC]) +AC_ARG_WITH(commpath, + [AS_HELP_STRING([--with-compath=DIR],[Back-end directory for communication and housekeeping])], + [COMMPATH=${withval}], + [COMMPATH=$DEFAULT_LOC]) AC_ARG_WITH(default-local-prefix, [AS_HELP_STRING([--with-default-local-prefix=DIRS],[Colon-seperated list of directories that Spindle will not cache files out of])], [SPINDLE_LOCAL_PREFIX=${withval}], - [SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$SPINDLE_LOC"]) + [SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$COMMPATH"]) 
AC_DEFINE_UNQUOTED([SPINDLE_PORT],[$SPINDLE_PORT],[The default port for Spindle]) AC_DEFINE_UNQUOTED([NUM_COBO_PORTS],[$NUM_COBO_PORTS],[Number of ports for COBO to search for an open port]) AC_DEFINE_UNQUOTED([SPINDLE_MAX_PORT],[$(($SPINDLE_PORT + $NUM_COBO_PORTS - 1))],[The maximum port value]) -AC_DEFINE_UNQUOTED([SPINDLE_LOC],"[$SPINDLE_LOC]",[The default local directory for Spindle]) +AC_DEFINE_UNQUOTED([COMMPATH],"[$COMMPATH]",[Back-end directory for communication and housekeeping]) AC_DEFINE_UNQUOTED([CACHEPATHS],"[$CACHEPATHS]",[Colon-separated list of potential back-end cache directories]) AC_DEFINE_UNQUOTED([SPINDLE_LOCAL_PREFIX],"[$SPINDLE_LOCAL_PREFIX]",[The default colon-separated list of directories that Spindle will not cache files out of]) diff --git a/src/client/config.h.in b/src/client/config.h.in index d133c1ff..044ca9e1 100644 --- a/src/client/config.h.in +++ b/src/client/config.h.in @@ -9,6 +9,9 @@ /* Colon-separated list of potential back-end cache directories */ #undef CACHEPATHS +/* Back-end directory for communication and housekeeping */ +#undef COMMPATH + /* Define if were using biter for client/server communication */ #undef COMM_BITER @@ -127,9 +130,6 @@ /* Default mode for slurm launch */ #undef SLURMLAUNCH_ENABLED -/* The default local directory for Spindle */ -#undef SPINDLE_LOC - /* The default colon-separated list of directories that Spindle will not cache files out of */ #undef SPINDLE_LOCAL_PREFIX diff --git a/src/client/configure b/src/client/configure index b26aeb02..ebfb6be8 100755 --- a/src/client/configure +++ b/src/client/configure @@ -811,7 +811,7 @@ enable_maintainer_mode with_default_port with_default_num_ports with_cachepaths -with_localstorage +with_commpath with_default_local_prefix with_testrm with_rm @@ -1535,7 +1535,8 @@ Optional Packages: communication --with-cachepaths=DIR Colon-separated list of potential back-end cache directories - --with-localstorage=DIR Directory on back-ends for storing relocated files + 
--with-compath=DIR Back-end directory for communication and + housekeeping --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle will not cache files out of @@ -12598,11 +12599,11 @@ else fi -# Check whether --with-localstorage was given. -if test "${with_localstorage+set}" = set; then : - withval=$with_localstorage; SPINDLE_LOC=${withval} +# Check whether --with-commpath was given. +if test "${with_commpath+set}" = set; then : + withval=$with_commpath; COMMPATH=${withval} else - SPINDLE_LOC=$DEFAULT_LOC + COMMPATH=$DEFAULT_LOC fi @@ -12610,7 +12611,7 @@ fi if test "${with_default_local_prefix+set}" = set; then : withval=$with_default_local_prefix; SPINDLE_LOCAL_PREFIX=${withval} else - SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$SPINDLE_LOC" + SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$COMMPATH" fi @@ -12630,7 +12631,7 @@ _ACEOF cat >>confdefs.h <<_ACEOF -#define SPINDLE_LOC "$SPINDLE_LOC" +#define COMMPATH "$COMMPATH" _ACEOF diff --git a/src/fe/config.h.in b/src/fe/config.h.in index ab6cde5b..615997be 100644 --- a/src/fe/config.h.in +++ b/src/fe/config.h.in @@ -9,6 +9,9 @@ /* Colon-separated list of potential back-end cache directories */ #undef CACHEPATHS +/* Back-end directory for communication and housekeeping */ +#undef COMMPATH + /* Define if were using biter for client/server communication */ #undef COMM_BITER @@ -172,9 +175,6 @@ /* Default mode for slurm launch */ #undef SLURMLAUNCH_ENABLED -/* The default local directory for Spindle */ -#undef SPINDLE_LOC - /* The default colon-separated list of directories that Spindle will not cache files out of */ #undef SPINDLE_LOCAL_PREFIX diff --git a/src/fe/configure b/src/fe/configure index 5da3f4fc..fc35c605 100755 --- a/src/fe/configure +++ b/src/fe/configure @@ -832,7 +832,7 @@ enable_maintainer_mode with_default_port with_default_num_ports with_cachepaths -with_localstorage +with_commpath with_default_local_prefix with_testrm with_rm @@ -1573,7 +1573,8 @@ Optional Packages: 
communication --with-cachepaths=DIR Colon-separated list of potential back-end cache directories - --with-localstorage=DIR Directory on back-ends for storing relocated files + --with-compath=DIR Back-end directory for communication and + housekeeping --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle will not cache files out of @@ -16448,11 +16449,11 @@ else fi -# Check whether --with-localstorage was given. -if test "${with_localstorage+set}" = set; then : - withval=$with_localstorage; SPINDLE_LOC=${withval} +# Check whether --with-commpath was given. +if test "${with_commpath+set}" = set; then : + withval=$with_commpath; COMMPATH=${withval} else - SPINDLE_LOC=$DEFAULT_LOC + COMMPATH=$DEFAULT_LOC fi @@ -16460,7 +16461,7 @@ fi if test "${with_default_local_prefix+set}" = set; then : withval=$with_default_local_prefix; SPINDLE_LOCAL_PREFIX=${withval} else - SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$SPINDLE_LOC" + SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$COMMPATH" fi @@ -16480,7 +16481,7 @@ _ACEOF cat >>confdefs.h <<_ACEOF -#define SPINDLE_LOC "$SPINDLE_LOC" +#define COMMPATH "$COMMPATH" _ACEOF diff --git a/src/fe/startup/config_mgr.cc b/src/fe/startup/config_mgr.cc index eb710b36..16c4a0f8 100644 --- a/src/fe/startup/config_mgr.cc +++ b/src/fe/startup/config_mgr.cc @@ -50,10 +50,10 @@ using namespace std; #define SPINDLE_NUM_PORTS_STR "250" #endif -#if defined(SPINDLE_LOC) -#define SPINDLE_LOC_STR SPINDLE_LOC +#if defined(COMMPATH) +#define SPINDLE_COMMPATH_STR COMMPATH #else -#define SPINDLE_LOC_STR "$TMPDIR" +#define SPINDLE_COMMPATH_STR "$TMPDIR" #endif #if defined(CACHEPATHS) @@ -273,8 +273,8 @@ void initOptionsList() "Provides a text file containing a white-space separated list of files that should be relocated to each node before execution begins" }, { confStrip, "strip", shortStrip, groupMisc, cvBool, {}, "true", "Strip debug and symbol information from binaries before distributing them." 
}, - { confLocation, "location", shortLocation, groupMisc, cvString, {}, SPINDLE_LOC_STR, - "Back-end directory for storing relocated files. Should be a non-shared location such as a ramdisk." }, + { confCommPath, "commpath", shortCommPath, groupMisc, cvString, {}, SPINDLE_COMMPATH_STR, + "Back-end directory communication and housekeeping. Should be a non-shared location such as a ramdisk." }, { confCachePaths, "cachepaths", shortCachePaths, groupMisc, cvString, {}, SPINDLE_CACHEPATHS_STR, "Colon-separated list of candidate paths for cached libraries."}, { confNoclean, "noclean", shortNoClean, groupMisc, cvBool, {}, "false", @@ -743,9 +743,9 @@ bool ConfigMap::toSpindleArgs(spindle_args_t &args, bool alloc_strs) const case confNumPorts: args.num_ports = numresult; break; - case confLocation: { - string loc = strresult + "/spindle.$NUMBER"; - args.location = strdup(loc.c_str()); + case confCommPath: { + string path = strresult + "/spindle.$NUMBER"; + args.location = strdup(path.c_str()); break; } case confCachePaths:{ diff --git a/src/fe/startup/config_mgr.h b/src/fe/startup/config_mgr.h index c2d3cd7e..ec3c8135 100644 --- a/src/fe/startup/config_mgr.h +++ b/src/fe/startup/config_mgr.h @@ -29,7 +29,7 @@ enum SpindleConfigID { confCmdlineNewgroup, confPort, confNumPorts, - confLocation, + confCommPath, confCachePaths, confCachePrefix, confPythonPrefix, @@ -83,7 +83,7 @@ enum CmdlineShortOptions { shortAuditType = 'k', shortRelocSO = 'l', shortNoClean = 'n', - shortLocation = 'o', + shortCommPath = 'o', shortPush = 'p', shortPull = 'q', shortPythonPrefix = 'r', diff --git a/src/flux/sessionmgr.c b/src/flux/sessionmgr.c index 49324a2f..17027163 100644 --- a/src/flux/sessionmgr.c +++ b/src/flux/sessionmgr.c @@ -109,16 +109,16 @@ char **strip_start_from_argv(int argc, char **argv) extern char *parse_location(char *loc, int number); extern int spindle_mkdir(char *orig_path); -#if !defined(SPINDLE_LOC) -#error SPINDLE_LOC must be defined in config.h +#if !defined(COMMPATH) 
+#error COMMPATH must be defined in config.h #endif const char *get_session_dir() { int result; char *dir; - dir = parse_location((char *) (SPINDLE_LOC "/spindle_session"), 0); + dir = parse_location((char *) (COMMPATH "/spindle_session"), 0); if (!dir) { - spindle_debug_printf(1, "ERROR: Could not parse directory for spindle session location from %s/spindle_session\n", SPINDLE_LOC); + spindle_debug_printf(1, "ERROR: Could not parse directory for spindle session location from %s/spindle_session\n", COMMPATH); return NULL; } diff --git a/src/server/config.h.in b/src/server/config.h.in index 0669c8e0..24040627 100644 --- a/src/server/config.h.in +++ b/src/server/config.h.in @@ -9,6 +9,9 @@ /* Colon-separated list of potential back-end cache directories */ #undef CACHEPATHS +/* Back-end directory for communication and housekeeping */ +#undef COMMPATH + /* Define if were using biter for client/server communication */ #undef COMM_BITER @@ -154,9 +157,6 @@ /* Default mode for slurm launch */ #undef SLURMLAUNCH_ENABLED -/* The default local directory for Spindle */ -#undef SPINDLE_LOC - /* The default colon-separated list of directories that Spindle will not cache files out of */ #undef SPINDLE_LOCAL_PREFIX diff --git a/src/server/configure b/src/server/configure index d1d3c346..7abbb35d 100755 --- a/src/server/configure +++ b/src/server/configure @@ -838,7 +838,7 @@ enable_maintainer_mode with_default_port with_default_num_ports with_cachepaths -with_localstorage +with_commpath with_default_local_prefix with_testrm with_rm @@ -1570,7 +1570,8 @@ Optional Packages: communication --with-cachepaths=DIR Colon-separated list of potential back-end cache directories - --with-localstorage=DIR Directory on back-ends for storing relocated files + --with-compath=DIR Back-end directory for communication and + housekeeping --with-default-local-prefix=DIRS Colon-seperated list of directories that Spindle will not cache files out of @@ -16445,11 +16446,11 @@ else fi -# Check whether 
--with-localstorage was given. -if test "${with_localstorage+set}" = set; then : - withval=$with_localstorage; SPINDLE_LOC=${withval} +# Check whether --with-commpath was given. +if test "${with_commpath+set}" = set; then : + withval=$with_commpath; COMMPATH=${withval} else - SPINDLE_LOC=$DEFAULT_LOC + COMMPATH=$DEFAULT_LOC fi @@ -16457,7 +16458,7 @@ fi if test "${with_default_local_prefix+set}" = set; then : withval=$with_default_local_prefix; SPINDLE_LOCAL_PREFIX=${withval} else - SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$SPINDLE_LOC" + SPINDLE_LOCAL_PREFIX="$DEFAULT_LOCAL_PREFIX:$COMMPATH" fi @@ -16477,7 +16478,7 @@ _ACEOF cat >>confdefs.h <<_ACEOF -#define SPINDLE_LOC "$SPINDLE_LOC" +#define COMMPATH "$COMMPATH" _ACEOF From 8c1067e407e45e51f66be6e9cfc76b9eb52eda27 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 24 Oct 2025 12:29:41 -0700 Subject: [PATCH 21/53] Renaming location variables/fields to commpath. --- src/fe/startup/config_mgr.cc | 2 +- src/fe/startup/parse_launcher.cc | 4 ++-- src/fe/startup/spindle_fe.cc | 10 +++++----- src/flux/flux-spindle.c | 8 ++++---- src/include/spindle_launch.h | 4 ++-- .../auditserver/ldcs_audit_server_handlers.c | 12 ++++++------ .../auditserver/ldcs_audit_server_process.c | 14 +++++++------- .../auditserver/ldcs_audit_server_process.h | 2 +- src/server/startup/spindle_be.cc | 16 ++++++++-------- 9 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/fe/startup/config_mgr.cc b/src/fe/startup/config_mgr.cc index 16c4a0f8..1b9d9ade 100644 --- a/src/fe/startup/config_mgr.cc +++ b/src/fe/startup/config_mgr.cc @@ -745,7 +745,7 @@ bool ConfigMap::toSpindleArgs(spindle_args_t &args, bool alloc_strs) const break; case confCommPath: { string path = strresult + "/spindle.$NUMBER"; - args.location = strdup(path.c_str()); + args.commpath = strdup(path.c_str()); break; } case confCachePaths:{ diff --git a/src/fe/startup/parse_launcher.cc b/src/fe/startup/parse_launcher.cc index f05ba18a..4484e998 100644 --- 
a/src/fe/startup/parse_launcher.cc +++ b/src/fe/startup/parse_launcher.cc @@ -292,7 +292,7 @@ void ModifyArgv::modifyCmdLine() snprintf(options_str, 32, "%lu", (unsigned long) params->opts); string options(options_str); - string location(params->location); + string commpath(params->commpath); char number_str[32]; snprintf(number_str, 32, "%lu", (unsigned long) params->number); @@ -319,7 +319,7 @@ void ModifyArgv::modifyCmdLine() if (p == parser->appExecutableAt()) { #if defined(os_bluegene) string bg_env_str = parser->getParser()->getBGString(); - parser->getParser()->addBGEnvStr(n, new_argv, bg_env_str, default_libstr, intercept_libstr, location, number, options, shmcache_size); + parser->getParser()->addBGEnvStr(n, new_argv, bg_env_str, default_libstr, intercept_libstr, commpath, number, options, shmcache_size); #else char **a_argv; int a_argc; diff --git a/src/fe/startup/spindle_fe.cc b/src/fe/startup/spindle_fe.cc index 2c2879f5..a038e201 100644 --- a/src/fe/startup/spindle_fe.cc +++ b/src/fe/startup/spindle_fe.cc @@ -71,7 +71,7 @@ static int pack_data(spindle_args_t *args, void* &buffer, unsigned &buffer_size) buffer_size += sizeof(number_t); buffer_size += sizeof(opt_t); buffer_size += sizeof(unique_id_t); - buffer_size += args->location ? strlen(args->location) + 1 : 1; + buffer_size += args->commpath ? strlen(args->commpath) + 1 : 1; buffer_size += args->candidate_cachepaths ? strlen(args->candidate_cachepaths) + 1 : 1; buffer_size += args->pythonprefix ? strlen(args->pythonprefix) + 1 : 1; buffer_size += args->preloadfile ? 
strlen(args->preloadfile) + 1 : 1; @@ -92,7 +92,7 @@ static int pack_data(spindle_args_t *args, void* &buffer, unsigned &buffer_size) pack_param(args->use_launcher, buf, pos); pack_param(args->startup_type, buf, pos); pack_param(args->shm_cache_size, buf, pos); - pack_param(args->location, buf, pos); + pack_param(args->commpath, buf, pos); pack_param(args->candidate_cachepaths, buf, pos); pack_param(args->pythonprefix, buf, pos); pack_param(args->preloadfile, buf, pos); @@ -232,7 +232,7 @@ int getApplicationArgsFE(spindle_args_t *params, int *spindle_argc, char ***spin (*spindle_argv)[n++] = strdup(numports_s); (*spindle_argv)[n++] = strdup(uniqueid_s); } - (*spindle_argv)[n++] = strdup(params->location); + (*spindle_argv)[n++] = strdup(params->commpath); (*spindle_argv)[n++] = strdup(params->candidate_cachepaths); (*spindle_argv)[n++] = strdup(number_s); (*spindle_argv)[n++] = strdup(opt_s); @@ -398,11 +398,11 @@ int spindleInitFE(const char **hosts, spindle_args_t *params) /* Start FE server */ debug_printf("spindle_args_t { number = %lu; port = %u; num_ports = %u; opts = %lu; unique_id = %lu; " - "use_launcher = %u; startup_type = %u; shm_cache_size = %u; location = %s; " + "use_launcher = %u; startup_type = %u; shm_cache_size = %u; commpath = %s; " "cachepaths = %s; " "pythonprefix = %s; preloadfile = %s; bundle_timeout_ms = %u; bundle_cachesize_kb = %u }\n", (unsigned long) params->number, params->port, params->num_ports, params->opts, params->unique_id, - params->use_launcher, params->startup_type, params->shm_cache_size, params->location, + params->use_launcher, params->startup_type, params->shm_cache_size, params->commpath, params->candidate_cachepaths, params->pythonprefix, params->preloadfile, params->bundle_timeout_ms, params->bundle_cachesize_kb); diff --git a/src/flux/flux-spindle.c b/src/flux/flux-spindle.c index 6b3fdd40..dd00aa32 100644 --- a/src/flux/flux-spindle.c +++ b/src/flux/flux-spindle.c @@ -381,7 +381,7 @@ static int sp_getopts 
(flux_shell_t *shell, struct spindle_ctx *ctx) int numa = 0; const char *relocaout = NULL, *reloclibs = NULL, *relocexec = NULL, *relocpython = NULL; const char *followfork = NULL, *preload = NULL, *level = NULL; - const char *pyprefix = NULL, *location = NULL; + const char *pyprefix = NULL, *commpath = NULL; char *numafiles = NULL, *cachepaths = NULL; if (flux_shell_getopt_unpack (shell, "spindle", "o", &opts) < 0) @@ -415,7 +415,7 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) "reloc-exec", &relocexec, "reloc-python", &relocpython, "python-prefix", &pyprefix, - "location", &location, + "commpath", &commpath, "numa", &numa, "numa-files", &numafiles, "preload", &preload, @@ -466,8 +466,8 @@ static int sp_getopts (flux_shell_t *shell, struct spindle_ctx *ctx) if( cachepaths ){ ctx->params.candidate_cachepaths = cachepaths; } - if (location) { - ctx->params.location = (char *) location; + if (commpath) { + ctx->params.commpath = (char *) commpath; } if (level) { if (strcmp(level, "high") == 0) { diff --git a/src/include/spindle_launch.h b/src/include/spindle_launch.h index 81c0728e..ca7b8d3d 100644 --- a/src/include/spindle_launch.h +++ b/src/include/spindle_launch.h @@ -123,8 +123,8 @@ typedef struct { /* Size of client shared memory cache */ unsigned int shm_cache_size; - /* The local-disk location where Spindle will store its cache */ - char *location; + /* The local-disk location for communication and housekeeping. */ + char *commpath; /* Path[s] for cached libraries. 
*/ char *candidate_cachepaths; /* Colon-separated list of candidate paths (max 64) */ diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index ac39810c..f165f562 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -202,7 +202,7 @@ static int handle_client_info_msg(ldcs_process_data_t *procdata, int nc, ldcs_me else if(msg->header.type == LDCS_MSG_LOCATION) { strncpy(client->remote_location, msg->data, sizeof(client->remote_location)-1); client->remote_location[sizeof(client->remote_location)-1] = '\0'; - debug_printf2("Server recvd location %s from %d\n", msg->data, nc); + debug_printf2("Server recvd remote_location %s from %d\n", msg->data, nc); } else if (msg->header.type == LDCS_MSG_CPU) { int clientcpu; @@ -2958,7 +2958,7 @@ static int handle_client_pickone_msg(ldcs_process_data_t *procdata, int nc, ldcs } /** - * Handle LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS to determine which of the locations, commpaths, and cachepaths are + * Handle LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS to determine which cachepaths are * available across all of the servers. */ @@ -2974,8 +2974,8 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag ldcs_audit_server_md_allreduce_AND( &procdata->cachepath_bitidx ); if( procdata->cachepath_bitidx == 0 ){ - err_printf("No valid cachepath path available. Falling back to \"location\" path (%s).\n", procdata->location); - procdata->cachepath = procdata->location; + err_printf("No valid cachepath path available. Falling back to \"commpath\" path (%s).\n", procdata->commpath); + procdata->cachepath = procdata->commpath; }else{ // ldcs_audit_server_filemngt_init() does it's own realize() pass. 
getValidCachePathByIndex( procdata->cachepath_bitidx, @@ -2984,7 +2984,7 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag &procdata->symbolic_cachepath); } - debug_printf3("Initializing file cache location %s\n", procdata->location); + debug_printf3("Initializing file cache cachepath %s\n", procdata->cachepath); ldcs_audit_server_filemngt_init(procdata->cachepath); test_printf(" cachepath=%s\n", procdata->cachepath); @@ -3229,7 +3229,7 @@ int exit_note_cb(int fd, int serverid, void *data) eresult = -1; } - result = handleExitNote(fd, procdata->location); + result = handleExitNote(fd, procdata->commpath); if (result == -1) { debug_printf("handleExitNote failed\n"); eresult = -1; diff --git a/src/server/auditserver/ldcs_audit_server_process.c b/src/server/auditserver/ldcs_audit_server_process.c index 566cad01..a73a7b7f 100644 --- a/src/server/auditserver/ldcs_audit_server_process.c +++ b/src/server/auditserver/ldcs_audit_server_process.c @@ -113,7 +113,7 @@ void startprofile(spindle_args_t *args) char hostname[257]; char *home = getenv("HOME"); if (!home || !*home) - home = ldcs_process_data.location; + home = ldcs_process_data.commpath; gethostname(hostname, sizeof(hostname)); snprintf(filename, 4096, "%s/spindled.%lu.%s.%d.prof", home, (unsigned long) args->number, hostname, getpid()); ProfilerStart(filename); @@ -142,7 +142,7 @@ int ldcs_audit_server_process(spindle_args_t *args) startprofile(args); debug_printf3("Initializing server data structures\n"); - ldcs_process_data.location = args->location; + ldcs_process_data.commpath = args->commpath; ldcs_process_data.cachepaths = args->candidate_cachepaths; ldcs_process_data.cachepath = NULL; ldcs_process_data.cachepath_bitidx = 0; @@ -196,11 +196,11 @@ int ldcs_audit_server_process(spindle_args_t *args) ldcs_process_data.server_stat.hostname=ldcs_process_data.hostname; if (ldcs_process_data.opts & OPT_PROCCLEAN) - init_cleanup_proc(ldcs_process_data.location); + 
init_cleanup_proc(ldcs_process_data.commpath); debug_printf3("Initializing connections for clients at %s and %lu\n", - ldcs_process_data.location, (unsigned long) ldcs_process_data.number); - serverid = ldcs_create_server(ldcs_process_data.location, ldcs_process_data.number); + ldcs_process_data.commpath, (unsigned long) ldcs_process_data.number); + serverid = ldcs_create_server(ldcs_process_data.commpath, ldcs_process_data.number); if (serverid == -1) { err_printf("Unable to setup area for client connections\n"); return -1; @@ -216,7 +216,7 @@ int ldcs_audit_server_process(spindle_args_t *args) ldcs_listen_register_fd(fd, serverid, &_ldcs_server_CB, (void *) &ldcs_process_data); if (args->opts & OPT_BEEXIT) { - fd = createExitNote(args->location); + fd = createExitNote(args->commpath); if (fd != -1) { ldcs_listen_register_fd(fd, serverid, exit_note_cb, (void *) &ldcs_process_data); } @@ -254,7 +254,7 @@ int ldcs_audit_server_run() _ldcs_server_stat_print(&ldcs_process_data.server_stat); - debug_printf("destroy server (%s,%lu)\n", ldcs_process_data.location, (unsigned long) ldcs_process_data.number); + debug_printf("destroy server (%s,%lu)\n", ldcs_process_data.commpath, (unsigned long) ldcs_process_data.number); ldcs_destroy_server(ldcs_process_data.serverid); /* destroy md support (multi-daemon) */ diff --git a/src/server/auditserver/ldcs_audit_server_process.h b/src/server/auditserver/ldcs_audit_server_process.h index 1495cebd..82b60023 100644 --- a/src/server/auditserver/ldcs_audit_server_process.h +++ b/src/server/auditserver/ldcs_audit_server_process.h @@ -125,7 +125,7 @@ struct ldcs_process_data_struct int exit_readys_recvd; ldcs_dist_model_t dist_model; ldcs_client_t* client_table; - char *location; /* Single user-specified path for fifo, daemons, etc. */ + char *commpath; /* Single user-specified path for fifo, daemons, etc. */ /* (Everything except the cachepath.) */ char *cachepaths; /* Up to 64 colon-separated list of candidate cachepaths. 
*/ char *cachepath; /* The earliest path in the list available to all servers. */ diff --git a/src/server/startup/spindle_be.cc b/src/server/startup/spindle_be.cc index 733d3244..fa0eccb3 100644 --- a/src/server/startup/spindle_be.cc +++ b/src/server/startup/spindle_be.cc @@ -58,7 +58,7 @@ static int unpack_data(spindle_args_t *args, void *buffer, int buffer_size) unpack_param(args->use_launcher, buf, pos); unpack_param(args->startup_type, buf, pos); unpack_param(args->shm_cache_size, buf, pos); - unpack_param(args->location, buf, pos); + unpack_param(args->commpath, buf, pos); unpack_param(args->candidate_cachepaths, buf, pos); unpack_param(args->pythonprefix, buf, pos); unpack_param(args->preloadfile, buf, pos); @@ -144,15 +144,15 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i assert(args.port == port); - /* Expand environment variables in location. */ - char *new_location = parse_location(args.location, args.number); - if (!new_location) { - err_printf("Failed to convert location %s\n", args.location); + /* Expand environment variables in commpath. */ + char *new_commpath = parse_location(args.commpath, args.number); + if (!new_commpath) { + err_printf("Failed to convert commpath %s\n", args.commpath); return -1; } - debug_printf("Translated location from %s to %s\n", args.location, new_location); - free(args.location); - args.location = new_location; + debug_printf("Translated commpath from %s to %s\n", args.commpath, new_commpath); + free(args.commpath); + args.commpath = new_commpath; result = ldcs_audit_server_process(&args); if (result == -1) { From 90fa1ecc3cd48195075b7f9d35abe4257056c97f Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 24 Oct 2025 12:45:10 -0700 Subject: [PATCH 22/53] Renames enums. 
--- src/client/client/client.c | 4 ++-- src/client/client/intercept_exec.c | 6 +++--- src/client/client_comlib/client_api.c | 2 +- src/fe/startup/parse_launcher_args.cc | 12 ++++++------ src/include/ldcs_api.h | 2 +- src/server/auditserver/ldcs_audit_server_handlers.c | 4 ++-- .../auditserver/ldcs_audit_server_md_msocket.c | 2 +- src/server/comlib/ldcs_api_util.c | 2 +- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/client/client/client.c b/src/client/client/client.c index d715e31e..11ef9091 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -200,7 +200,7 @@ static int init_server_connection() if (!use_ldcs) return 0; - location = getenv("LDCS_LOCATION"); + location = getenv("LDCS_COMMPATH"); number = (number_t) strtoul(getenv("LDCS_NUMBER"), NULL, 0); connection = getenv("LDCS_CONNECTION"); rankinfo_s = getenv("LDCS_RANKINFO"); @@ -219,7 +219,7 @@ static int init_server_connection() if (!(opts & OPT_FOLLOWFORK)) { debug_printf("Disabling environment variables because we're not following forks\n"); unsetenv("LD_AUDIT"); - unsetenv("LDCS_LOCATION"); + unsetenv("LDCS_COMMPATH"); unsetenv("LDCS_NUMBER"); unsetenv("LDCS_CONNECTION"); unsetenv("LDCS_RANKINFO"); diff --git a/src/client/client/intercept_exec.c b/src/client/client/intercept_exec.c index c8ba97f9..edb0199a 100644 --- a/src/client/client/intercept_exec.c +++ b/src/client/client/intercept_exec.c @@ -141,7 +141,7 @@ static char **removeEnvironmentStrs(char **envp) continue; if (strIsPrefix("LD", envp[i])) { if (strIsPrefix("LD_AUDIT=", envp[i]) || - strIsPrefix("LDCS_LOCATION=", envp[i]) || + strIsPrefix("LDCS_COMMPATH=", envp[i]) || strIsPrefix("LDCS_CONNECTION=", envp[i]) || strIsPrefix("LDCS_RANKINFO=", envp[i]) || strIsPrefix("LDCS_OPTIONS=", envp[i]) || @@ -175,7 +175,7 @@ static char **updateEnvironment(char **envp, int *num_modified, int propogate_sp unsetf = orig_unsetenv ? 
orig_unsetenv : unsetenv; unsetf("SPINDLE"); unsetf("LD_AUDIT"); - unsetf("LDCS_LOCATION"); + unsetf("LDCS_COMMPATH"); unsetf("LDCS_CONNECTION"); unsetf("LDCS_RANKINFO"); unsetf("LDCS_OPTIONS"); @@ -201,7 +201,7 @@ static char **updateEnvironment(char **envp, int *num_modified, int propogate_sp propogateEnvironmentStr(envp, newenv, &pos, "SPINDLE"); propogateEnvironmentStr(envp, newenv, &pos, "LD_AUDIT"); - propogateEnvironmentStr(envp, newenv, &pos, "LDCS_LOCATION"); + propogateEnvironmentStr(envp, newenv, &pos, "LDCS_COMMPATH"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_CONNECTION"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_RANKINFO"); propogateEnvironmentStr(envp, newenv, &pos, "LDCS_OPTIONS"); diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index 4b0f9ded..f5e07cc0 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -349,7 +349,7 @@ int send_cpu(int fd, int cpu) { int send_location(int fd, char *location) { ldcs_message_t message; - message.header.type = LDCS_MSG_LOCATION; + message.header.type = LDCS_MSG_COMMPATH; message.header.len = strlen(location)+1; message.data = location; diff --git a/src/fe/startup/parse_launcher_args.cc b/src/fe/startup/parse_launcher_args.cc index 84cf7f23..710b009d 100644 --- a/src/fe/startup/parse_launcher_args.cc +++ b/src/fe/startup/parse_launcher_args.cc @@ -127,7 +127,7 @@ static cmdoption_t openmpi_options[] = { }; -static const char *openmpi_bg_env_str = "-x LD_AUDIT=%s -x LDCS_LOCATION=%s -x LDCS_NUMBER=%s -x LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; +static const char *openmpi_bg_env_str = "-x LD_AUDIT=%s -x LDCS_COMMPATH=%s -x LDCS_NUMBER=%s -x LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; static const unsigned int openmpi_size = (sizeof(openmpi_options) / sizeof(cmdoption_t)); static cmdoption_t srun_options[] = { @@ -218,11 +218,11 @@ static cmdoption_t srun_options[] = { { NULL, "--usage", 0 }, { "-V", "--version", 0 } }; -static 
const char *srun_bg_env_str = "--runjob-opts=--envs LD_AUDIT=%s LD_PRELOAD=%s LDCS_LOCATION=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; -static const char *srun_bg_env_str_nopreload = "--runjob-opts=--envs LD_AUDIT=%s%s LDCS_LOCATION=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; -static const char *srun_bg_env_bare_str = "%s --envs LD_AUDIT=%s LD_PRELOAD=%s LDCS_LOCATION=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; -static const char *srun_bg_env_bare_str_preload = "%sLD_PRELOAD=%s:%s LD_AUDIT=%s LDCS_LOCATION=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; -static const char *srun_bg_env_bare_str_nopreload = "%s LD_AUDIT=%s LDCS_LOCATION=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; +static const char *srun_bg_env_str = "--runjob-opts=--envs LD_AUDIT=%s LD_PRELOAD=%s LDCS_COMMPATH=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; +static const char *srun_bg_env_str_nopreload = "--runjob-opts=--envs LD_AUDIT=%s%s LDCS_COMMPATH=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; +static const char *srun_bg_env_bare_str = "%s --envs LD_AUDIT=%s LD_PRELOAD=%s LDCS_COMMPATH=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; +static const char *srun_bg_env_bare_str_preload = "%sLD_PRELOAD=%s:%s LD_AUDIT=%s LDCS_COMMPATH=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; +static const char *srun_bg_env_bare_str_nopreload = "%s LD_AUDIT=%s LDCS_COMMPATH=%s LDCS_NUMBER=%s LDCS_OPTIONS=%s LDCS_CACHESIZE=%s"; static const unsigned int srun_size (sizeof(srun_options) / sizeof(cmdoption_t)); diff --git a/src/include/ldcs_api.h b/src/include/ldcs_api.h index e6ccbafb..49ba45e3 100644 --- a/src/include/ldcs_api.h +++ b/src/include/ldcs_api.h @@ -45,7 +45,7 @@ typedef enum { LDCS_MSG_END, LDCS_MSG_CWD, LDCS_MSG_PID, - LDCS_MSG_LOCATION, + LDCS_MSG_COMMPATH, LDCS_MSG_CPU, LDCS_MSG_MYRANKINFO_QUERY, LDCS_MSG_MYRANKINFO_QUERY_ANSWER, diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c 
b/src/server/auditserver/ldcs_audit_server_handlers.c index f165f562..3ab968a9 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -199,7 +199,7 @@ static int handle_client_info_msg(ldcs_process_data_t *procdata, int nc, ldcs_me client->remote_pid=mypid; debug_printf2("Server recvd pid %d from %d\n", mypid, nc); } - else if(msg->header.type == LDCS_MSG_LOCATION) { + else if(msg->header.type == LDCS_MSG_COMMPATH) { strncpy(client->remote_location, msg->data, sizeof(client->remote_location)-1); client->remote_location[sizeof(client->remote_location)-1] = '\0'; debug_printf2("Server recvd remote_location %s from %d\n", msg->data, nc); @@ -1868,7 +1868,7 @@ int handle_client_message(ldcs_process_data_t *procdata, int nc, ldcs_message_t switch (msg->header.type) { case LDCS_MSG_CWD: case LDCS_MSG_PID: - case LDCS_MSG_LOCATION: + case LDCS_MSG_COMMPATH: case LDCS_MSG_CPU: return handle_client_info_msg(procdata, nc, msg); case LDCS_MSG_PYTHONPREFIX_REQ: diff --git a/src/server/auditserver/ldcs_audit_server_md_msocket.c b/src/server/auditserver/ldcs_audit_server_md_msocket.c index c9d616c2..6db18bd4 100644 --- a/src/server/auditserver/ldcs_audit_server_md_msocket.c +++ b/src/server/auditserver/ldcs_audit_server_md_msocket.c @@ -62,7 +62,7 @@ int ldcs_audit_server_md_init ( ldcs_process_data_t *ldcs_process_data ) { int rc=0; char* ldcs_nportsstr=getenv("LDCS_NPORTS"); - char* ldcs_locmodstr=getenv("LDCS_LOCATION_MOD"); + char* ldcs_locmodstr=getenv("LDCS_COMMPATH_MOD"); int usedport; int serverfd, serverid, i; diff --git a/src/server/comlib/ldcs_api_util.c b/src/server/comlib/ldcs_api_util.c index b6beb56d..5ea51ab8 100644 --- a/src/server/comlib/ldcs_api_util.c +++ b/src/server/comlib/ldcs_api_util.c @@ -51,7 +51,7 @@ char* _message_type_to_str (ldcs_message_ids_t type) { STR_CASE(LDCS_MSG_END); STR_CASE(LDCS_MSG_CWD); STR_CASE(LDCS_MSG_PID); - STR_CASE(LDCS_MSG_LOCATION); + 
STR_CASE(LDCS_MSG_COMMPATH); STR_CASE(LDCS_MSG_CPU); STR_CASE(LDCS_MSG_MYRANKINFO_QUERY); STR_CASE(LDCS_MSG_MYRANKINFO_QUERY_ANSWER); From c271b22d39d924bc63adf40048b8ff04f6507cf9 Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 5 Nov 2025 15:28:52 -0800 Subject: [PATCH 23/53] Use strdup() for commpath instead of stack var. Unlikely it would ever make a difference, but this is much more correct. --- src/server/startup/spindle_be.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/startup/spindle_be.cc b/src/server/startup/spindle_be.cc index fa0eccb3..c3e7bbd9 100644 --- a/src/server/startup/spindle_be.cc +++ b/src/server/startup/spindle_be.cc @@ -152,7 +152,7 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i } debug_printf("Translated commpath from %s to %s\n", args.commpath, new_commpath); free(args.commpath); - args.commpath = new_commpath; + args.commpath = strdup(new_commpath); result = ldcs_audit_server_process(&args); if (result == -1) { From 82d6118b8f9c8079074d889a11ad4618e8cff150 Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 12 Nov 2025 10:13:59 -0800 Subject: [PATCH 24/53] Adds LDCS_COMMPATH --- src/client/client/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client/client/client.c b/src/client/client/client.c index 11ef9091..75422800 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -200,7 +200,7 @@ static int init_server_connection() if (!use_ldcs) return 0; - location = getenv("LDCS_COMMPATH"); + location = getenv("LDCS_LOCATION"); number = (number_t) strtoul(getenv("LDCS_NUMBER"), NULL, 0); connection = getenv("LDCS_CONNECTION"); rankinfo_s = getenv("LDCS_RANKINFO"); From 76c4ce171b9f21e682e83f99b42ecd574e9728fd Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 13 Nov 2025 12:56:57 -0800 Subject: [PATCH 25/53] Restores checkLinkForLeak() to test_driver.c --- testsuite/test_driver.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 
insertions(+) diff --git a/testsuite/test_driver.c b/testsuite/test_driver.c index fcd9b7a3..84ac1027 100644 --- a/testsuite/test_driver.c +++ b/testsuite/test_driver.c @@ -1258,6 +1258,28 @@ static char* getCacheLocation(char *env_var) return strdup(last_slash); } +static int checkLinkForLeak(const char *path, const char *spindle_loc) +{ + char link_target[4096]; + int result, error; + memset(link_target, 0, sizeof(link_target)); + + result = readlink(path, link_target, sizeof(link_target)); + if (result == -1) { + error = errno; + err_printf("Failed to read link %s: %s\n", path, strerror(error)); + return -1; + } + + if (strstr(link_target, spindle_loc)) { + err_printf("Link at '%s' has path '%s', which leaks spindle path with '%s'\n", path, link_target, spindle_loc); + return -1; + } + + return 0; +} + + static int checkPathForLeak(const char *what, const char *path, const char *spindle_loc) { if (strstr(path, spindle_loc)) { @@ -1354,7 +1376,9 @@ void check_for_path_leaks() continue; strncpy(path, "/proc/self/fd/", sizeof(path)); strncat(path, d->d_name, sizeof(path)-1); + checkLinkForLeak(path, spindle_loc); } + checkLinkForLeak("/proc/self/exe", spindle_loc); /** * Check link_maps for leaked spindle paths From b32e555108ac933211cc8f5172bf7f5df8a5fbdf Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 13 Nov 2025 15:24:58 -0800 Subject: [PATCH 26/53] Replacing "location" with "commpath" as needed. 
src/client/beboot/spindle_bootstrap.c Moved orig_location from static global to local Renamed symbolic_location to symbolic_commpath Renamed orig_location to orig_commpath Renamed location to commpath Renamed LDCS_LOCATION to LDCS_COMMPATH src/client/client/client.c Renamed LDCS_LOCATION to LDCS_COMMPATH Renamed location to commpath --- src/client/beboot/spindle_bootstrap.c | 24 ++++++++++++------------ src/client/client/client.c | 22 +++++++++++----------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/client/beboot/spindle_bootstrap.c b/src/client/beboot/spindle_bootstrap.c index 0244284c..63525374 100644 --- a/src/client/beboot/spindle_bootstrap.c +++ b/src/client/beboot/spindle_bootstrap.c @@ -53,7 +53,7 @@ static int rankinfo[4]={-1,-1,-1,-1}; number_t number; static int use_cache; static unsigned int cachesize; -static char *location, *number_s, *orig_location, *symbolic_location; +static char *commpath, *number_s, *symbolic_commpath; static char **cmdline; static char *executable; static char *client_lib; @@ -92,7 +92,7 @@ extern char *realize(char *path); static int establish_connection() { debug_printf2("Opening connection to server\n"); - ldcsid = client_open_connection(location, number); + ldcsid = client_open_connection(commpath, number); if (ldcsid == -1) return -1; @@ -114,7 +114,7 @@ static void setup_environment() connection_str = client_get_connection_string(ldcsid); setenv("LD_AUDIT", client_lib, 1); - setenv("LDCS_LOCATION", location, 1); + setenv("LDCS_COMMPATH", commpath, 1); setenv("LDCS_NUMBER", number_s, 1); setenv("LDCS_RANKINFO", rankinfo_str, 1); if (connection_str) @@ -160,7 +160,7 @@ static int parse_cmdline(int argc, char *argv[]) daemon_args[i - 3] = NULL; } - symbolic_location = argv[i++]; + symbolic_commpath = argv[i++]; i++; // Skip over candidate_cachepaths. 
number_s = argv[i++]; number = (number_t) strtoul(number_s, NULL, 0); @@ -174,7 +174,7 @@ static int parse_cmdline(int argc, char *argv[]) return 0; } -static void launch_daemon(char *location) +static void launch_daemon(char *commpath) { /*grand-child fork, then execv daemon. By grand-child forking we ensure that the app won't get confused by seeing an unknown process as a child. */ @@ -184,12 +184,12 @@ static void launch_daemon(char *location) char unique_file[MAX_PATH_LEN+1]; char buffer[32]; - result = spindle_mkdir(location); + result = spindle_mkdir(commpath); if (result == -1) { debug_printf("Exiting due to spindle_mkdir error\n"); exit(-1); } - snprintf(unique_file, MAX_PATH_LEN, "%s/spindle_daemon_pid", location); + snprintf(unique_file, MAX_PATH_LEN, "%s/spindle_daemon_pid", commpath); unique_file[MAX_PATH_LEN] = '\0'; fd = open(unique_file, O_CREAT | O_EXCL | O_WRONLY, 0600); if (fd == -1) { @@ -344,14 +344,14 @@ int main(int argc, char *argv[]) } } - orig_location = parse_location(symbolic_location, number); - if (!orig_location) { + char *orig_commpath = parse_location(symbolic_commpath, number); + if (!orig_commpath) { return -1; } - location = realize(orig_location); + commpath = realize(orig_commpath); if (daemon_args) { - launch_daemon(location); + launch_daemon(commpath); } result = establish_connection(); @@ -375,7 +375,7 @@ int main(int argc, char *argv[]) #else shm_cache_limit = cachesize; #endif - shmcache_init(location, number, cachesize, shm_cache_limit); + shmcache_init(commpath, number, cachesize, shm_cache_limit); use_cache = 1; } diff --git a/src/client/client/client.c b/src/client/client/client.c index 75422800..6a75fb08 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -71,7 +71,7 @@ static const ElfW(Phdr) *libc_phdrs, *interp_phdrs; static int num_libc_phdrs, num_interp_phdrs; ElfW(Addr) libc_loadoffset, interp_loadoffset; -static char *location; +static char *commpath; char *chosen_realized_cachepath, 
*chosen_parsed_cachepath; number_t number; static int have_stat_patches; @@ -200,7 +200,7 @@ static int init_server_connection() if (!use_ldcs) return 0; - location = getenv("LDCS_LOCATION"); + commpath = getenv("LDCS_COMMPATH"); number = (number_t) strtoul(getenv("LDCS_NUMBER"), NULL, 0); connection = getenv("LDCS_CONNECTION"); rankinfo_s = getenv("LDCS_RANKINFO"); @@ -209,9 +209,9 @@ static int init_server_connection() opts = strtoul(opts_s, NULL, 10); shm_cachesize = atoi(cachesize_s) * 1024; - if (strchr(location, '$')) { - location = parse_location(location, number); - if (!location) { + if (strchr(commpath, '$')) { + commpath = parse_location(commpath, number); + if (!commpath) { exit(-1); } } @@ -233,14 +233,14 @@ static int init_server_connection() #else shm_cache_limit = shm_cachesize; #endif - shmcache_init(location, number, shm_cachesize, shm_cache_limit); + shmcache_init(commpath, number, shm_cachesize, shm_cache_limit); } if (connection) { /* boostrapper established the connection for us. Reuse it. 
*/ debug_printf("Recreating existing connection to server\n"); - debug_printf3("location = %s, number = %lu, connection = %s, rankinfo = %s\n", - location, (unsigned long) number, connection, rankinfo_s); + debug_printf3("commpath = %s, number = %lu, connection = %s, rankinfo = %s\n", + commpath, (unsigned long) number, connection, rankinfo_s); ldcsid = client_register_connection(connection); if (ldcsid == -1) return -1; @@ -250,13 +250,13 @@ static int init_server_connection() } else { /* Establish a new connection */ - debug_printf("open connection to ldcs %s %lu\n", location, (unsigned long) number); - ldcsid = client_open_connection(location, number); + debug_printf("open connection to ldcs %s %lu\n", commpath, (unsigned long) number); + ldcsid = client_open_connection(commpath, number); if (ldcsid == -1) return -1; send_pid(ldcsid); - send_location(ldcsid, location); + send_location(ldcsid, commpath); send_rankinfo_query(ldcsid, rankinfo+0, rankinfo+1, rankinfo+2, rankinfo+3); #if defined(LIBNUMA) if (opts & OPT_NUMA) From 41b4467e06867d057b8a95d4696ca3f5dbbcdaf5 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 13 Nov 2025 16:06:35 -0800 Subject: [PATCH 27/53] Continues location rename. 
src/client/client_comlib/client_api.c Added setenv("LDCS_CHOSEN_PARSED_CACHEPATH", local_cpc); testsuite/test_driver.c Replaced LDCS_LOCATION and LDCS_ORIG_LOCATION checks for cachepath with LDCS_CHOSEN_PARSED_CACHEPATH Replaced spindle_loc with cachepath --- src/client/client_comlib/client_api.c | 2 ++ testsuite/test_driver.c | 48 +++++++++++++-------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index f5e07cc0..827acbf0 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -67,6 +67,8 @@ int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chose if( chosen_parsed_cachepath ){ *chosen_parsed_cachepath = local_cpc; } + // Required by testsuite/test_driver.c + setenv("LDCS_CHOSEN_PARSED_CACHEPATH", local_cpc, 1); return 0; } diff --git a/testsuite/test_driver.c b/testsuite/test_driver.c index 84ac1027..e4bb42bb 100644 --- a/testsuite/test_driver.c +++ b/testsuite/test_driver.c @@ -1258,7 +1258,7 @@ static char* getCacheLocation(char *env_var) return strdup(last_slash); } -static int checkLinkForLeak(const char *path, const char *spindle_loc) +static int checkLinkForLeak(const char *path, const char *cachepath) { char link_target[4096]; int result, error; @@ -1271,8 +1271,8 @@ static int checkLinkForLeak(const char *path, const char *spindle_loc) return -1; } - if (strstr(link_target, spindle_loc)) { - err_printf("Link at '%s' has path '%s', which leaks spindle path with '%s'\n", path, link_target, spindle_loc); + if (strstr(link_target, cachepath)) { + err_printf("Link at '%s' has path '%s', which leaks spindle path with '%s'\n", path, link_target, cachepath); return -1; } @@ -1280,10 +1280,10 @@ static int checkLinkForLeak(const char *path, const char *spindle_loc) } -static int checkPathForLeak(const char *what, const char *path, const char *spindle_loc) +static int checkPathForLeak(const char 
*what, const char *path, const char *cachepath) { - if (strstr(path, spindle_loc)) { - err_printf("%s: Path '%s' leaks spindle path with '%s'\n", what, path, spindle_loc); + if (strstr(path, cachepath)) { + err_printf("%s: Path '%s' leaks spindle path with '%s'\n", what, path, cachepath); return -1; } return 0; @@ -1291,14 +1291,14 @@ static int checkPathForLeak(const char *what, const char *path, const char *spin static int leak_check_cb(struct dl_phdr_info *p, size_t psize, void *opaque) { - char *spindle_loc = (char *) opaque; + char *cachepath = (char *) opaque; if (!p->dlpi_name || p->dlpi_name[0] == '\0') return 0; - checkPathForLeak("dl_iterate_phdr", p->dlpi_name, spindle_loc); + checkPathForLeak("dl_iterate_phdr", p->dlpi_name, cachepath); return 0; } -static int check_proc_maps(char *path, char *spindle_loc) +static int check_proc_maps(char *path, char *cachepath) { int fd, error, result; struct stat statbuf; @@ -1337,8 +1337,8 @@ static int check_proc_maps(char *path, char *spindle_loc) maps[filesize] = '\0'; close(fd); - if (strstr(maps, spindle_loc)) { - err_printf("Found leaked spindle path '%s' in maps '%s'\n", spindle_loc, path); + if (strstr(maps, cachepath)) { + err_printf("Found leaked spindle path '%s' in maps '%s'\n", cachepath, path); return -1; } @@ -1348,17 +1348,15 @@ static int check_proc_maps(char *path, char *spindle_loc) void check_for_path_leaks() { - char *spindle_loc = NULL; + char *cachepath = NULL; DIR *proc_fds = NULL; struct dirent *d; char path[4096]; struct link_map *lm; char *dlerr_msg = NULL; - spindle_loc = getCacheLocation("LDCS_LOCATION"); - if (!spindle_loc) - spindle_loc = getCacheLocation("LDCS_ORIG_LOCATION"); - if (!spindle_loc) { + cachepath = getCacheLocation("LDCS_CHOSEN_PARSED_CACHEPATH"); + if (!cachepath) { err_printf("Failed to calculate cache location"); goto done; } @@ -1376,9 +1374,9 @@ void check_for_path_leaks() continue; strncpy(path, "/proc/self/fd/", sizeof(path)); strncat(path, d->d_name, 
sizeof(path)-1); - checkLinkForLeak(path, spindle_loc); + checkLinkForLeak(path, cachepath); } - checkLinkForLeak("/proc/self/exe", spindle_loc); + checkLinkForLeak("/proc/self/exe", cachepath); /** * Check link_maps for leaked spindle paths @@ -1386,22 +1384,22 @@ void check_for_path_leaks() for (lm = _r_debug.r_map; lm != NULL; lm = lm->l_next) { if (!lm->l_name || lm->l_name[0] == '\0') continue; - checkPathForLeak("link_map", lm->l_name, spindle_loc); + checkPathForLeak("link_map", lm->l_name, cachepath); } /** * Check libraries in dl_iterate_phdr for leaked paths **/ - dl_iterate_phdr(leak_check_cb, spindle_loc); + dl_iterate_phdr(leak_check_cb, cachepath); /** * Check /proc/pid/maps under various aliases for leaked names **/ - check_proc_maps("/proc/self/maps", spindle_loc); + check_proc_maps("/proc/self/maps", cachepath); snprintf(path, sizeof(path), "/proc/self/task/%d/maps", getpid()); - check_proc_maps(path, spindle_loc); + check_proc_maps(path, cachepath); snprintf(path, sizeof(path), "/proc/%d/maps", getpid()); - check_proc_maps(path, spindle_loc); + check_proc_maps(path, cachepath); /** * Check that dlerror doesn't leak the /__not_exists/ prefix @@ -1413,8 +1411,8 @@ void check_for_path_leaks() } done: - if (spindle_loc) - free(spindle_loc); + if (cachepath) + free(cachepath); if (proc_fds) closedir(proc_fds); } From 871e7c26f1449bc18a22230d9e758ff22586a679 Mon Sep 17 00:00:00 2001 From: Barry Date: Sun, 7 Dec 2025 11:04:26 -0800 Subject: [PATCH 28/53] Fixes -Wsign-compare warning in new code. 
--- src/cobo/cobo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cobo/cobo.c b/src/cobo/cobo.c index 65741008..2d1a53a9 100644 --- a/src/cobo/cobo.c +++ b/src/cobo/cobo.c @@ -1437,7 +1437,7 @@ int cobo_allgather_str(char* sendstr, char*** recvstr, char** recvbuf) int cobo_allreduce( uint64_t *pval, cobo_op_t op ){ /* if i have any children, receive their data */ - int64_t child_val; + uint64_t child_val; for(int i=cobo_num_child-1; i>=0; i--) { /* read int64_t from child */ if (cobo_read_fd(cobo_child_fd[i], &child_val, sizeof(int64_t)) < 0) { From 9762ca6272c3ec8aafdfca385bf4afda56fa1c96 Mon Sep 17 00:00:00 2001 From: Barry Date: Sun, 7 Dec 2025 11:33:43 -0800 Subject: [PATCH 29/53] LDCS_CHOSEN_PARSED_CACHEPATH set in bootstrap. --- src/client/beboot/spindle_bootstrap.c | 6 +++++- src/client/client_comlib/client_api.c | 2 -- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/client/beboot/spindle_bootstrap.c b/src/client/beboot/spindle_bootstrap.c index 63525374..75b9420f 100644 --- a/src/client/beboot/spindle_bootstrap.c +++ b/src/client/beboot/spindle_bootstrap.c @@ -108,13 +108,17 @@ static void setup_environment() { char rankinfo_str[256]; snprintf(rankinfo_str, 256, "%d %d %d %d %d", ldcsid, rankinfo[0], rankinfo[1], rankinfo[2], rankinfo[3]); - + char *connection_str = NULL; if (opts & OPT_RELOCAOUT) connection_str = client_get_connection_string(ldcsid); + char *chosen_parsed_cachepath; + send_cachepath_query( ldcsid , NULL, &chosen_parsed_cachepath); + setenv("LD_AUDIT", client_lib, 1); setenv("LDCS_COMMPATH", commpath, 1); + setenv("LDCS_CHOSEN_PARSED_CACHEPATH", chosen_parsed_cachepath, 1); setenv("LDCS_NUMBER", number_s, 1); setenv("LDCS_RANKINFO", rankinfo_str, 1); if (connection_str) diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index 827acbf0..f5e07cc0 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -67,8 +67,6 @@ int 
send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chose if( chosen_parsed_cachepath ){ *chosen_parsed_cachepath = local_cpc; } - // Required by testsuite/test_driver.c - setenv("LDCS_CHOSEN_PARSED_CACHEPATH", local_cpc, 1); return 0; } From 6b0e13d1e85930e3ef7b4939e6931cb43da5a678 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 18 Dec 2025 19:46:36 -0800 Subject: [PATCH 30/53] Updates test_driver.c to ignore FIFO files. All tests pass with both distinct and identical commpaths/cachepaths. --- testsuite/test_driver.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/testsuite/test_driver.c b/testsuite/test_driver.c index e4bb42bb..bcee0d12 100644 --- a/testsuite/test_driver.c +++ b/testsuite/test_driver.c @@ -1372,6 +1372,11 @@ void check_for_path_leaks() for (d = readdir(proc_fds); d != NULL; d = readdir(proc_fds)) { if (d->d_name[0] == '.') continue; + // Ignore Spindle fifo files for now. + if ( strncmp( "315", d->d_name, 3 ) == 0 ) + continue; + if ( strncmp( "316", d->d_name, 3 ) == 0 ) + continue; strncpy(path, "/proc/self/fd/", sizeof(path)); strncat(path, d->d_name, sizeof(path)-1); checkLinkForLeak(path, cachepath); From 291d129456d1e98dfe699498e133cb7b2b3e5035 Mon Sep 17 00:00:00 2001 From: Barry Date: Mon, 12 Jan 2026 01:38:42 -0800 Subject: [PATCH 31/53] Sets TMPDIR=/tmp in each Dockerfile Additionally populates /etc/environment just in case ssh is used to set up the servers. 
--- containers/spindle-flux-ubuntu/Dockerfile | 2 ++ containers/spindle-serial-ubuntu/Dockerfile | 2 ++ containers/spindle-slurm-ubuntu/base/Dockerfile | 2 ++ containers/spindle-slurm-ubuntu/testing/Dockerfile | 2 ++ 4 files changed, 8 insertions(+) diff --git a/containers/spindle-flux-ubuntu/Dockerfile b/containers/spindle-flux-ubuntu/Dockerfile index 3af607a8..57badbfc 100644 --- a/containers/spindle-flux-ubuntu/Dockerfile +++ b/containers/spindle-flux-ubuntu/Dockerfile @@ -5,6 +5,8 @@ FROM fluxrm/flux-sched:${flux_sched_version} AS builder ARG replicas=4 ENV workers=${replicas} USER root +ENV TMPDIR=/tmp +RUN echo 'TMPDIR="/tmp"' >> /etc/environment RUN DEBIAN_FRONTEND="noninteractive" apt-get update \ && apt-get -qq install -y --no-install-recommends \ diff --git a/containers/spindle-serial-ubuntu/Dockerfile b/containers/spindle-serial-ubuntu/Dockerfile index 3070596e..62c0cbf5 100644 --- a/containers/spindle-serial-ubuntu/Dockerfile +++ b/containers/spindle-serial-ubuntu/Dockerfile @@ -1,6 +1,8 @@ ARG ubuntu_version=noble FROM ubuntu:${ubuntu_version} USER root +ENV TMPDIR=/tmp +RUN echo 'TMPDIR="/tmp"' >> /etc/environment RUN DEBIAN_FRONTEND="noninteractive" apt-get update \ # install latest pkg utils: diff --git a/containers/spindle-slurm-ubuntu/base/Dockerfile b/containers/spindle-slurm-ubuntu/base/Dockerfile index d4724276..d6eee9f2 100644 --- a/containers/spindle-slurm-ubuntu/base/Dockerfile +++ b/containers/spindle-slurm-ubuntu/base/Dockerfile @@ -1,6 +1,8 @@ ARG UBUNTU_VERSION=noble FROM ubuntu:${UBUNTU_VERSION} USER root +ENV TMPDIR=/tmp +RUN echo 'TMPDIR="/tmp"' >> /etc/environment RUN apt-get update \ && DEBIAN_FRONTEND="noninteractive" apt-get -qq install -y --no-install-recommends \ diff --git a/containers/spindle-slurm-ubuntu/testing/Dockerfile b/containers/spindle-slurm-ubuntu/testing/Dockerfile index 91d6901c..99768535 100644 --- a/containers/spindle-slurm-ubuntu/testing/Dockerfile +++ b/containers/spindle-slurm-ubuntu/testing/Dockerfile @@ -2,6 
+2,8 @@ ARG BASE_VERSION=latest FROM ghcr.io/llnl/spindle-slurm-base:${BASE_VERSION} ARG replicas=4 ENV workers=${replicas} +ENV TMPDIR=/tmp +RUN echo 'TMPDIR="/tmp"' >> /etc/environment ARG BUILD_ROOT=containers/spindle-slurm-ubuntu/testing From 31e1deb57c46dd8195ccc415340f350a82742105 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Feb 2026 10:41:36 -0800 Subject: [PATCH 32/53] Restores --with-localstorage to generate error. The option is marked as obsolete in configure --help and will cause an error in configure if it is specified. Also updates the CI configure scripts to use --with-cachepaths and --with-commpath instead of --with-localstorage. --- configure | 9 +++++++++ configure.common.ac | 4 ++++ containers/spindle-flux-ubuntu/scripts/build_spindle.sh | 2 +- .../spindle-serial-ubuntu/scripts/build_spindle.sh | 2 +- .../testing/scripts/build_spindle.sh | 2 +- src/client/configure | 9 +++++++++ src/fe/configure | 9 +++++++++ src/server/configure | 9 +++++++++ 8 files changed, 43 insertions(+), 3 deletions(-) diff --git a/configure b/configure index b22b9b7c..0e99b0ac 100755 --- a/configure +++ b/configure @@ -847,6 +847,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_localstorage with_cachepaths with_commpath with_default_local_prefix @@ -1591,6 +1592,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-localstorage=DIR (obsolete) + Use --with-cachepaths and --with-commpath instead. --with-cachepaths=DIR Colon-separated list of potential back-end cache directories --with-compath=DIR Back-end directory for communication and @@ -16666,6 +16669,12 @@ else fi +# Check whether --with-localstorage was given. +if test "${with_localstorage+set}" = set; then : + withval=$with_localstorage; as_fn_error $? "requested obsolete option --with-localstorage. Use --with-cachepaths and --with-commpath instead." 
"$LINENO" 5 +fi + + # Check whether --with-cachepaths was given. if test "${with_cachepaths+set}" = set; then : withval=$with_cachepaths; CACHEPATHS=${withval} diff --git a/configure.common.ac b/configure.common.ac index bc34e009..9096bb24 100644 --- a/configure.common.ac +++ b/configure.common.ac @@ -17,6 +17,10 @@ AC_ARG_WITH(default-num-ports, [AS_HELP_STRING([--with-default-numports=NUM],[Number of TCP/IP ports to scan for Spindle server communication])], [NUM_COBO_PORTS=${withval}], [NUM_COBO_PORTS=$DEFAULT_NUM_COBO_PORTS]) +AC_ARG_WITH(localstorage, + [AS_HELP_STRING([--with-localstorage=DIR (obsolete)],[Use --with-cachepaths and --with-commpath instead.])], + [AC_MSG_ERROR(requested obsolete option --with-localstorage. Use --with-cachepaths and --with-commpath instead.)], + []) AC_ARG_WITH(cachepaths, [AS_HELP_STRING([--with-cachepaths=DIR],[Colon-separated list of potential back-end cache directories])], [CACHEPATHS=${withval}], diff --git a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh index 9257f85e..283e4451 100755 --- a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh +++ b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-localstorage=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-cachepaths=/tmp --with-commpath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh index 37b6491a..4fee85b4 100755 --- a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh +++ 
b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-localstorage=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-cachepaths=/tmp --with-commpath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh index 6943e49a..17e7197f 100755 --- a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch --with-rsh-cmd=/usr/bin/ssh --with-localstorage=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch --with-rsh-cmd=/usr/bin/ssh --with-cachepaths=/tmp --with-commpath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/src/client/configure b/src/client/configure index ebfb6be8..00f93824 100755 --- a/src/client/configure +++ b/src/client/configure @@ -810,6 +810,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_localstorage with_cachepaths with_commpath with_default_local_prefix @@ -1533,6 +1534,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-localstorage=DIR (obsolete) + Use --with-cachepaths and --with-commpath instead. 
--with-cachepaths=DIR Colon-separated list of potential back-end cache directories --with-compath=DIR Back-end directory for communication and @@ -12591,6 +12594,12 @@ else fi +# Check whether --with-localstorage was given. +if test "${with_localstorage+set}" = set; then : + withval=$with_localstorage; as_fn_error $? "requested obsolete option --with-localstorage. Use --with-cachepaths and --with-commpath instead." "$LINENO" 5 +fi + + # Check whether --with-cachepaths was given. if test "${with_cachepaths+set}" = set; then : withval=$with_cachepaths; CACHEPATHS=${withval} diff --git a/src/fe/configure b/src/fe/configure index fc35c605..89be0536 100755 --- a/src/fe/configure +++ b/src/fe/configure @@ -831,6 +831,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_localstorage with_cachepaths with_commpath with_default_local_prefix @@ -1571,6 +1572,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-localstorage=DIR (obsolete) + Use --with-cachepaths and --with-commpath instead. --with-cachepaths=DIR Colon-separated list of potential back-end cache directories --with-compath=DIR Back-end directory for communication and @@ -16441,6 +16444,12 @@ else fi +# Check whether --with-localstorage was given. +if test "${with_localstorage+set}" = set; then : + withval=$with_localstorage; as_fn_error $? "requested obsolete option --with-localstorage. Use --with-cachepaths and --with-commpath instead." "$LINENO" 5 +fi + + # Check whether --with-cachepaths was given. 
if test "${with_cachepaths+set}" = set; then : withval=$with_cachepaths; CACHEPATHS=${withval} diff --git a/src/server/configure b/src/server/configure index 7abbb35d..a5ff6885 100755 --- a/src/server/configure +++ b/src/server/configure @@ -837,6 +837,7 @@ enable_libtool_lock enable_maintainer_mode with_default_port with_default_num_ports +with_localstorage with_cachepaths with_commpath with_default_local_prefix @@ -1568,6 +1569,8 @@ Optional Packages: --with-default-numports=NUM Number of TCP/IP ports to scan for Spindle server communication + --with-localstorage=DIR (obsolete) + Use --with-cachepaths and --with-commpath instead. --with-cachepaths=DIR Colon-separated list of potential back-end cache directories --with-compath=DIR Back-end directory for communication and @@ -16438,6 +16441,12 @@ else fi +# Check whether --with-localstorage was given. +if test "${with_localstorage+set}" = set; then : + withval=$with_localstorage; as_fn_error $? "requested obsolete option --with-localstorage. Use --with-cachepaths and --with-commpath instead." "$LINENO" 5 +fi + + # Check whether --with-cachepaths was given. if test "${with_cachepaths+set}" = set; then : withval=$with_cachepaths; CACHEPATHS=${withval} From e11f36bdca0663c2ad6733c0c4bd1a962749948e Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Feb 2026 16:27:54 -0800 Subject: [PATCH 33/53] Updates spank plugin to use commpath. Replaces args->location with args->commpath. 
--- src/slurm_plugin/slurm_plugin.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index f2fedfeb..3e2228ad 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -104,6 +104,7 @@ static int enable_spindle = 0; static int start_session = 0; extern char *parse_location(char *loc, number_t number); +extern char *realize(char *path); // CLI options for srun struct spank_option spank_options[] = @@ -703,7 +704,7 @@ static unique_id_t getUniqueID(spank_t spank, int session_enabled) static int fillInArgs(spank_t spank, spindle_args_t *args, int argc, char **argv, unique_id_t unique_id, int session_enabled) { int result; - char *oldlocation; + char *symbolic_commpath, *orig_commpath; char *err_string; args->unique_id = unique_id; @@ -730,10 +731,15 @@ static int fillInArgs(spank_t spank, spindle_args_t *args, int argc, char **argv args->opts |= OPT_BEEXIT; } - oldlocation = args->location; + + symbolic_commpath = args->commpath; + orig_commpath = parse_location(xmbolic_commpath, args->number); + if( !orig_commpath ){ + return -1; + } + args->commpath = realize(orig_commpath) + current_spank = spank; - args->location = parse_location(oldlocation, args->number); - free(oldlocation); return 0; } @@ -1261,8 +1267,8 @@ static int handleExit(void *params, char **output_str) return 0; } - if (!args.location) { - sdprintf(2, "WARNING: spindleExitBE not called since location is NULL\n"); + if (!args.commpath) { + sdprintf(2, "WARNING: spindleExitBE not called since commpath is NULL\n"); } else { // The task_exit callback is run for _each proc_, so we use // isBEProc to pick only one proc per node to call spindleExitBE. 
@@ -1279,9 +1285,9 @@ static int handleExit(void *params, char **output_str) return 0; } } else { - result = spindleExitBE(args.location); + result = spindleExitBE(args.commpath); if (result == -1) { - sdprintf(1, "ERROR: spindleExitBE returned an error on location %s\n", args.location); + sdprintf(1, "ERROR: spindleExitBE returned an error on commpath %s\n", args.commpath); return -1; } } From be289da33863d29f663d656665ac9c8ccb03fbb2 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Feb 2026 17:15:21 -0800 Subject: [PATCH 34/53] Updates a configure script to use commpath. --- .../testing-plugin/scripts/build_spindle.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh index 2a252b32..879a28c6 100755 --- a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-localstorage=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-commpath=/tmp --with-cachepath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install From 825e94703faaa2b597f43e58d7cc0eb80736418f Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 12 Feb 2026 20:12:23 -0800 Subject: [PATCH 35/53] Rebasing commpath on devel Additional integration for commpath + spank-plugin. 
--- src/slurm_plugin/plugin_utils.c | 10 ++++------ src/slurm_plugin/slurm_plugin.c | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/slurm_plugin/plugin_utils.c b/src/slurm_plugin/plugin_utils.c index 9a00e111..f77531e4 100644 --- a/src/slurm_plugin/plugin_utils.c +++ b/src/slurm_plugin/plugin_utils.c @@ -301,10 +301,8 @@ int isFEHost(char **hostlist, unsigned int num_hosts) static char* locSpecificDir(spindle_args_t *params) { char *dir = NULL, *expanded_dir = NULL, *realized_dir = NULL; - char hostname[256], session_id_str[32]; - size_t unique_file_len; - - dir = params->location; + + dir = params->commpath; if (!dir) { sdprintf(1, "ERROR: Location not filled in\n"); goto done; @@ -858,7 +856,7 @@ int registerFEPid(pid_t pid, spindle_args_t *args) int fd; int result; - snprintf(pid_file, sizeof(pid_file), "%s/fepid", args->location); + snprintf(pid_file, sizeof(pid_file), "%s/fepid", args->commpath); pid_file[sizeof(pid_file)-1] = '\0'; snprintf(pid_s, sizeof(pid_s), "%d\n", (int) pid); @@ -889,7 +887,7 @@ int readFEPid(pid_t *pid, spindle_args_t *args) pid_t pid_result; int fd, result; - snprintf(pid_file, sizeof(pid_file), "%s/fepid", args->location); + snprintf(pid_file, sizeof(pid_file), "%s/fepid", args->commpath); pid_file[sizeof(pid_file)-1] = '\0'; sdprintf(2, "Reading FE pid from %s\n", pid_file); diff --git a/src/slurm_plugin/slurm_plugin.c b/src/slurm_plugin/slurm_plugin.c index 3e2228ad..0989299c 100644 --- a/src/slurm_plugin/slurm_plugin.c +++ b/src/slurm_plugin/slurm_plugin.c @@ -733,11 +733,11 @@ static int fillInArgs(spank_t spank, spindle_args_t *args, int argc, char **argv symbolic_commpath = args->commpath; - orig_commpath = parse_location(xmbolic_commpath, args->number); + orig_commpath = parse_location(symbolic_commpath, args->number); if( !orig_commpath ){ return -1; } - args->commpath = realize(orig_commpath) + args->commpath = realize(orig_commpath); current_spank = spank; From 
c97e946651f17039dd7dc80c7024765073a44b15 Mon Sep 17 00:00:00 2001 From: Barry Date: Sat, 14 Feb 2026 11:45:32 -0800 Subject: [PATCH 36/53] Fixes two silly bugs. TMPDIR left out of a docker script, and --cachepath instead of --cachepaths. --- configure | 8 ++++++++ configure.common.ac | 4 ++++ containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile | 1 + .../testing-plugin/scripts/build_spindle.sh | 2 +- src/client/configure | 8 ++++++++ src/fe/configure | 8 ++++++++ src/server/configure | 8 ++++++++ 7 files changed, 38 insertions(+), 1 deletion(-) diff --git a/configure b/configure index 0e99b0ac..ac8d1434 100755 --- a/configure +++ b/configure @@ -849,6 +849,7 @@ with_default_port with_default_num_ports with_localstorage with_cachepaths +with_cachepath with_commpath with_default_local_prefix with_testrm @@ -1596,6 +1597,7 @@ Optional Packages: Use --with-cachepaths and --with-commpath instead. --with-cachepaths=DIR Colon-separated list of potential back-end cache directories +, --with-compath=DIR Back-end directory for communication and housekeeping --with-default-local-prefix=DIRS @@ -16683,6 +16685,12 @@ else fi +# Check whether --with-cachepath was given. +if test "${with_cachepath+set}" = set; then : + withval=$with_cachepath; as_fn_error $? "use --with-cachepaths=DIRS (plural) instead of --with-cachepath=DIR to specify one or more cache paths" "$LINENO" 5 +fi + + # Check whether --with-commpath was given. 
if test "${with_commpath+set}" = set; then : withval=$with_commpath; COMMPATH=${withval} diff --git a/configure.common.ac b/configure.common.ac index 9096bb24..baa9c0a9 100644 --- a/configure.common.ac +++ b/configure.common.ac @@ -25,6 +25,10 @@ AC_ARG_WITH(cachepaths, [AS_HELP_STRING([--with-cachepaths=DIR],[Colon-separated list of potential back-end cache directories])], [CACHEPATHS=${withval}], [CACHEPATHS=$DEFAULT_LOC]) +AC_ARG_WITH(cachepath, + [[],[]], + [AC_MSG_ERROR(use --with-cachepaths=DIRS (plural) instead of --with-cachepath=DIR to specify one or more cache paths)], + []) AC_ARG_WITH(commpath, [AS_HELP_STRING([--with-compath=DIR],[Back-end directory for communication and housekeeping])], [COMMPATH=${withval}], diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile b/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile index 7b66a155..da39cbe5 100644 --- a/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile +++ b/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile @@ -2,6 +2,7 @@ ARG BASE_VERSION=latest FROM ghcr.io/llnl/spindle-slurm-base:${BASE_VERSION} ARG replicas=4 ENV workers=${replicas} +ENV TMPDIR=/tmp ARG BUILD_ROOT=containers/spindle-slurm-ubuntu/testing-plugin diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh index 879a28c6..1aa9889c 100755 --- a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-commpath=/tmp --with-cachepath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge 
--with-rm=slurm-plugin --enable-slurm-plugin --with-commpath=/tmp --with-cachepaths=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/src/client/configure b/src/client/configure index 00f93824..eb84ee07 100755 --- a/src/client/configure +++ b/src/client/configure @@ -812,6 +812,7 @@ with_default_port with_default_num_ports with_localstorage with_cachepaths +with_cachepath with_commpath with_default_local_prefix with_testrm @@ -1538,6 +1539,7 @@ Optional Packages: Use --with-cachepaths and --with-commpath instead. --with-cachepaths=DIR Colon-separated list of potential back-end cache directories +, --with-compath=DIR Back-end directory for communication and housekeeping --with-default-local-prefix=DIRS @@ -12608,6 +12610,12 @@ else fi +# Check whether --with-cachepath was given. +if test "${with_cachepath+set}" = set; then : + withval=$with_cachepath; as_fn_error $? "use --with-cachepaths=DIRS (plural) instead of --with-cachepath=DIR to specify one or more cache paths" "$LINENO" 5 +fi + + # Check whether --with-commpath was given. if test "${with_commpath+set}" = set; then : withval=$with_commpath; COMMPATH=${withval} diff --git a/src/fe/configure b/src/fe/configure index 89be0536..4c11bdc7 100755 --- a/src/fe/configure +++ b/src/fe/configure @@ -833,6 +833,7 @@ with_default_port with_default_num_ports with_localstorage with_cachepaths +with_cachepath with_commpath with_default_local_prefix with_testrm @@ -1576,6 +1577,7 @@ Optional Packages: Use --with-cachepaths and --with-commpath instead. --with-cachepaths=DIR Colon-separated list of potential back-end cache directories +, --with-compath=DIR Back-end directory for communication and housekeeping --with-default-local-prefix=DIRS @@ -16458,6 +16460,12 @@ else fi +# Check whether --with-cachepath was given. +if test "${with_cachepath+set}" = set; then : + withval=$with_cachepath; as_fn_error $? 
"use --with-cachepaths=DIRS (plural) instead of --with-cachepath=DIR to specify one or more cache paths" "$LINENO" 5 +fi + + # Check whether --with-commpath was given. if test "${with_commpath+set}" = set; then : withval=$with_commpath; COMMPATH=${withval} diff --git a/src/server/configure b/src/server/configure index a5ff6885..83bff908 100755 --- a/src/server/configure +++ b/src/server/configure @@ -839,6 +839,7 @@ with_default_port with_default_num_ports with_localstorage with_cachepaths +with_cachepath with_commpath with_default_local_prefix with_testrm @@ -1573,6 +1574,7 @@ Optional Packages: Use --with-cachepaths and --with-commpath instead. --with-cachepaths=DIR Colon-separated list of potential back-end cache directories +, --with-compath=DIR Back-end directory for communication and housekeeping --with-default-local-prefix=DIRS @@ -16455,6 +16457,12 @@ else fi +# Check whether --with-cachepath was given. +if test "${with_cachepath+set}" = set; then : + withval=$with_cachepath; as_fn_error $? "use --with-cachepaths=DIRS (plural) instead of --with-cachepath=DIR to specify one or more cache paths" "$LINENO" 5 +fi + + # Check whether --with-commpath was given. if test "${with_commpath+set}" = set; then : withval=$with_commpath; COMMPATH=${withval} From a2351a1b2223b8f7bb3fd0c844b86e04b0a3bb93 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 19 Feb 2026 23:29:55 -0800 Subject: [PATCH 37/53] Testing non-overlapping cache/commpath directories. 
--- containers/spindle-flux-ubuntu/scripts/build_spindle.sh | 2 +- containers/spindle-serial-ubuntu/scripts/build_spindle.sh | 2 +- containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile | 1 + .../testing-plugin/scripts/build_spindle.sh | 2 +- .../spindle-slurm-ubuntu/testing/scripts/build_spindle.sh | 2 +- 5 files changed, 5 insertions(+), 4 deletions(-) diff --git a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh index 283e4451..90dbbb21 100755 --- a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh +++ b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-cachepaths=/tmp --with-commpath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh index 4fee85b4..cdc18537 100755 --- a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh +++ b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-cachepaths=/tmp --with-commpath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make 
install diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile b/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile index da39cbe5..951480f1 100644 --- a/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile +++ b/containers/spindle-slurm-ubuntu/testing-plugin/Dockerfile @@ -3,6 +3,7 @@ FROM ghcr.io/llnl/spindle-slurm-base:${BASE_VERSION} ARG replicas=4 ENV workers=${replicas} ENV TMPDIR=/tmp +RUN echo 'TMPDIR="/tmp"' >> /etc/environment ARG BUILD_ROOT=containers/spindle-slurm-ubuntu/testing-plugin diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh index 1aa9889c..5fbb5c97 100755 --- a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-commpath=/tmp --with-cachepaths=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-commpath=/tmp/commpath --with-cachepaths=/tmp/cachepath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh index 17e7197f..be6b933a 100755 --- a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch 
--with-rsh-cmd=/usr/bin/ssh --with-cachepaths=/tmp --with-commpath=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch --with-rsh-cmd=/usr/bin/ssh --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install From f7c739bdcf7219717ba8942039357f6e1e5b7c20 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 20 Feb 2026 09:10:25 -0800 Subject: [PATCH 38/53] Testing commpath as subdirectory of cachepath. --- containers/spindle-flux-ubuntu/scripts/build_spindle.sh | 2 +- containers/spindle-serial-ubuntu/scripts/build_spindle.sh | 2 +- .../testing-plugin/scripts/build_spindle.sh | 2 +- .../spindle-slurm-ubuntu/testing/scripts/build_spindle.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh index 90dbbb21..02f44096 100755 --- a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh +++ b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh index cdc18537..42604a2f 100755 --- a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh +++ 
b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh index 5fbb5c97..1e88d116 100755 --- a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-commpath=/tmp/commpath --with-cachepaths=/tmp/cachepath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh index be6b933a..fffe6a55 100755 --- a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build 
-/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch --with-rsh-cmd=/usr/bin/ssh --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch --with-rsh-cmd=/usr/bin/ssh --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install From 8137cc042a6a1dad14b7ae07ca5b367a046a880d Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 20 Feb 2026 09:22:33 -0800 Subject: [PATCH 39/53] Testing cachepath as a subdirectory of commpath. --- containers/spindle-flux-ubuntu/scripts/build_spindle.sh | 2 +- containers/spindle-serial-ubuntu/scripts/build_spindle.sh | 2 +- .../testing-plugin/scripts/build_spindle.sh | 2 +- .../spindle-slurm-ubuntu/testing/scripts/build_spindle.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh index 02f44096..a63b5d8d 100755 --- a/containers/spindle-flux-ubuntu/scripts/build_spindle.sh +++ b/containers/spindle-flux-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=flux --enable-flux-plugin --with-cachepaths=/tmp/commpath/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh 
b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh index 42604a2f..1022ba14 100755 --- a/containers/spindle-serial-ubuntu/scripts/build_spindle.sh +++ b/containers/spindle-serial-ubuntu/scripts/build_spindle.sh @@ -4,7 +4,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=serial --with-cachepaths=/tmp/commpath/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh index 1e88d116..2b36be90 100755 --- a/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing-plugin/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm-plugin --enable-slurm-plugin --with-cachepaths=/tmp/commpath/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install diff --git a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh index fffe6a55..7fcb48e3 100755 --- a/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh +++ 
b/containers/spindle-slurm-ubuntu/testing/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch --with-rsh-cmd=/usr/bin/ssh --with-cachepaths=/tmp/cachepath --with-commpath=/tmp/cachepath/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-rsh-launch --with-rsh-cmd=/usr/bin/ssh --with-cachepaths=/tmp/commpath/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install From 590377232564a2cbabd930ea52d4f1118844e4e2 Mon Sep 17 00:00:00 2001 From: Barry Date: Mon, 9 Mar 2026 16:23:24 -0700 Subject: [PATCH 40/53] ci.yml Forcing commpath to use current devel ver. --- src/server/startup/spindle_be.cc | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/server/startup/spindle_be.cc b/src/server/startup/spindle_be.cc index c3e7bbd9..f91ce2f6 100644 --- a/src/server/startup/spindle_be.cc +++ b/src/server/startup/spindle_be.cc @@ -136,6 +136,8 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i result = ldcs_audit_server_network_setup(port, num_ports, unique_id, &setup_data, &setup_data_size); if (result == -1) { err_printf("Error setting up network in spindleRunBE\n"); + if (args.startup_type == startup_external) + LOGGING_FINI; return -1; } unpack_data(&args, setup_data, setup_data_size); @@ -148,6 +150,8 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i char *new_commpath = parse_location(args.commpath, args.number); if (!new_commpath) { err_printf("Failed to convert commpath %s\n", args.commpath); + if (args.startup_type == startup_external) + LOGGING_FINI; return -1; } debug_printf("Translated commpath from %s to %s\n", args.commpath, 
new_commpath); @@ -157,6 +161,8 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i result = ldcs_audit_server_process(&args); if (result == -1) { err_printf("Error in ldcs_audit_server_process\n"); + if (args.startup_type == startup_external) + LOGGING_FINI; return -1; } @@ -164,6 +170,8 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i result = post_setup(&args); if (result == -1) { err_printf("post_setup callback errored. Returning\n"); + if (args.startup_type == startup_external) + LOGGING_FINI; return -1; } } @@ -172,11 +180,12 @@ int spindleRunBE(unsigned int port, unsigned int num_ports, unique_id_t unique_i ldcs_audit_server_run(); if (result == -1) { err_printf("Error in ldcs_audit_server_process\n"); + if (args.startup_type == startup_external) + LOGGING_FINI; return -1; } - - if (args.startup_type == startup_external) + if (args.startup_type == startup_external) LOGGING_FINI; return 0; From 1febb0870e9513b0ad799df1df91e69e2edf163b Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 20 Mar 2026 07:28:29 -0700 Subject: [PATCH 41/53] Adds NO_CACHEPATH_CONSENSUS_YET and retry logic. This commit fixes a race condition where eager client processes can submit a LDCS_MSG_CHOSEN_CACHEPATH_REQUEST before the servers have come to a consensus. When that is the case, the server now responds with LDCS_MSG_NO_CACHEPATH_CONSENSUS_YET and the client sleeps for one second (max 10 retries) before sending the message again. 1) include/ldcs_api.h Added LDCS_MSG_NO_CACHEPATH_CONSENSUS_YET 2) server/comlib/ldcs_api_util.c Added STR_CASE entry for that message. 
3) server/auditserver/ldcs_audit_server_handlers.c Added global "static bool cachepath_consensus_reached" above handle_cachepath_consensus() Set that variable to true inside handle_cachepath_consensus() In handle_chosen_cachepath_request(), made msg.header.type conditional on cachepath_consensus_reached 4) client/client_comlib/client_api.c This is the only place LDCS_MSG_CHOSEN_CACHEPATH_REQUEST is used. Will sleep for 1 second after each LDCS_MSG_NO_CACHEPATH_CONSENSUS_YET message. --- src/client/client_comlib/client_api.c | 32 ++++++++++++++----- src/include/ldcs_api.h | 1 + .../auditserver/ldcs_audit_server_handlers.c | 19 +++++++---- src/server/comlib/ldcs_api_util.c | 1 + 4 files changed, 39 insertions(+), 14 deletions(-) diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index f5e07cc0..bd99794e 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "ldcs_api.h" #include "client_api.h" @@ -39,21 +40,36 @@ static struct lock_t comm_lock; int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chosen_parsed_cachepath){ + int retries = 0, max_retries = 10; + struct timespec one_second = { .tv_sec = 1, .tv_nsec = 0 }; ldcs_message_t message; char buffer[MAX_PATH_LEN+1]; buffer[MAX_PATH_LEN] = '\0'; - message.header.type = LDCS_MSG_CHOSEN_CACHEPATH_REQUEST; - message.header.len = MAX_PATH_LEN; - message.data = buffer; - COMM_LOCK; + do{ + message.header.type = LDCS_MSG_CHOSEN_CACHEPATH_REQUEST; + message.header.len = MAX_PATH_LEN; + message.data = buffer; - debug_printf3("sending message of type: request_location_path.\n" ); - client_send_msg(fd, &message); - client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); + COMM_LOCK; - COMM_UNLOCK; + debug_printf3("sending message of type: request_location_path.\n" ); + client_send_msg(fd, &message); + client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); + + 
COMM_UNLOCK; + + if( message.header.type == LDCS_MSG_NO_CACHEPATH_CONSENSUS_YET ){ + if( retries++ >= max_retries ){ + break; + } + nanosleep( &one_second, NULL ); + continue; + } + break; + + }while( 1 ); if (message.header.type != LDCS_MSG_CHOSEN_CACHEPATH || message.header.len > MAX_PATH_LEN) { err_printf("Got unexpected message of type %d\n", (int) message.header.type); diff --git a/src/include/ldcs_api.h b/src/include/ldcs_api.h index 49ba45e3..0bcd8f40 100644 --- a/src/include/ldcs_api.h +++ b/src/include/ldcs_api.h @@ -88,6 +88,7 @@ typedef enum { LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS, LDCS_MSG_CHOSEN_CACHEPATH_REQUEST, LDCS_MSG_CHOSEN_CACHEPATH, + LDCS_MSG_NO_CACHEPATH_CONSENSUS_YET, LDCS_MSG_UNKNOWN } ldcs_message_ids_t; diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index 3ab968a9..dbd4cfd7 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -2961,7 +2961,7 @@ static int handle_client_pickone_msg(ldcs_process_data_t *procdata, int nc, ldcs * Handle LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS to determine which cachepaths are * available across all of the servers. 
*/ - +static bool cachepath_consensus_reached; static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_message_t *msg){ int num_children = ldcs_audit_server_md_get_num_children(procdata); @@ -2988,6 +2988,7 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag ldcs_audit_server_filemngt_init(procdata->cachepath); test_printf(" cachepath=%s\n", procdata->cachepath); + cachepath_consensus_reached = true; return 0; } @@ -3006,12 +3007,18 @@ static int handle_chosen_cachepath_request(ldcs_process_data_t *procdata, int nc return 0; - msg.header.type = LDCS_MSG_CHOSEN_CACHEPATH; + if( cachepath_consensus_reached ){ + msg.header.type = LDCS_MSG_CHOSEN_CACHEPATH; + msg.header.len = strlen(procdata->cachepath) + 1 + strlen(procdata->parsed_cachepath) + 1; + msg.data = calloc( 1, msg.header.len ); + strcpy( msg.data, procdata->cachepath ); + strcpy( &msg.data[ strlen(procdata->cachepath)+1 ], procdata->parsed_cachepath ); + }else{ + msg.header.type = LDCS_MSG_NO_CACHEPATH_CONSENSUS_YET; + msg.header.len = 0; + msg.data = NULL; + } - msg.header.len = strlen(procdata->cachepath) + 1 + strlen(procdata->parsed_cachepath) + 1; - msg.data = calloc( 1, msg.header.len ); - strcpy( msg.data, procdata->cachepath ); - strcpy( &msg.data[ strlen(procdata->cachepath)+1 ], procdata->parsed_cachepath ); ldcs_send_msg(connid, &msg); free( msg.data ); procdata->server_stat.clientmsg.cnt++; diff --git a/src/server/comlib/ldcs_api_util.c b/src/server/comlib/ldcs_api_util.c index 5ea51ab8..af101815 100644 --- a/src/server/comlib/ldcs_api_util.c +++ b/src/server/comlib/ldcs_api_util.c @@ -94,6 +94,7 @@ char* _message_type_to_str (ldcs_message_ids_t type) { STR_CASE(LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS); STR_CASE(LDCS_MSG_CHOSEN_CACHEPATH_REQUEST); STR_CASE(LDCS_MSG_CHOSEN_CACHEPATH); + STR_CASE(LDCS_MSG_NO_CACHEPATH_CONSENSUS_YET); STR_CASE(LDCS_MSG_UNKNOWN); } return "unknown"; From 8baee827a4395da0291d9c094f2f873bba0969c9 Mon Sep 17 00:00:00 2001 
From: Barry Date: Fri, 20 Mar 2026 09:28:57 -0700 Subject: [PATCH 42/53] Changes bool/true to int/1 --- src/server/auditserver/ldcs_audit_server_handlers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index dbd4cfd7..3cfe9603 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -2961,7 +2961,7 @@ static int handle_client_pickone_msg(ldcs_process_data_t *procdata, int nc, ldcs * Handle LDCS_MSG_REQUEST_CACHEPATH_CONSENSUS to determine which cachepaths are * available across all of the servers. */ -static bool cachepath_consensus_reached; +static int cachepath_consensus_reached; static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_message_t *msg){ int num_children = ldcs_audit_server_md_get_num_children(procdata); @@ -2988,7 +2988,7 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag ldcs_audit_server_filemngt_init(procdata->cachepath); test_printf(" cachepath=%s\n", procdata->cachepath); - cachepath_consensus_reached = true; + cachepath_consensus_reached = 1; return 0; } From 5585081b8921cff05ffe0df85e61fd96107ad7ff Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 25 Mar 2026 14:09:56 -0700 Subject: [PATCH 43/53] Adds delay to flush out consensus race condition. The theory is that eager clients are using an uninitialized cachepath variable. By delaying the consensus, the failure should happen more often.
--- src/server/auditserver/ldcs_audit_server_handlers.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index 3cfe9603..9d9e8b8d 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -21,6 +21,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include #include #include +#include #include "ldcs_api.h" #include "ldcs_api_listen.h" @@ -2964,6 +2965,7 @@ static int handle_client_pickone_msg(ldcs_process_data_t *procdata, int nc, ldcs static int cachepath_consensus_reached; static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_message_t *msg){ + struct timespec seconds = { .tv_sec = 4, .tv_nsec = 0 }; int num_children = ldcs_audit_server_md_get_num_children(procdata); if (num_children) { @@ -2984,6 +2986,10 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag &procdata->symbolic_cachepath); } + debug_printf( "Arrived at cachepath consensus: %s. Now delaying to flush race condition.\n", procdata->cachepath ); + nanosleep( &seconds, NULL ); + debug_printf( "Delay completed.\n"); + debug_printf3("Initializing file cache cachepath %s\n", procdata->cachepath); ldcs_audit_server_filemngt_init(procdata->cachepath); From e1e707b5f548a98a527f87828682bf24e4bbd426 Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 25 Mar 2026 16:35:33 -0700 Subject: [PATCH 44/53] Adds lots of asserts around cachepath variables. 
--- src/client/beboot/spindle_bootstrap.c | 3 ++- src/client/client/client.c | 3 +++ src/client/client/intercept_readlink.c | 1 + src/client/client/should_intercept.c | 1 + src/server/startup/spindle_be.cc | 2 +- testsuite/test_driver.c | 1 + 6 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/client/beboot/spindle_bootstrap.c b/src/client/beboot/spindle_bootstrap.c index 75b9420f..98f91913 100644 --- a/src/client/beboot/spindle_bootstrap.c +++ b/src/client/beboot/spindle_bootstrap.c @@ -113,8 +113,9 @@ static void setup_environment() if (opts & OPT_RELOCAOUT) connection_str = client_get_connection_string(ldcsid); - char *chosen_parsed_cachepath; + char *chosen_parsed_cachepath = NULL; send_cachepath_query( ldcsid , NULL, &chosen_parsed_cachepath); + assert( chosen_parsed_cachepath ); setenv("LD_AUDIT", client_lib, 1); setenv("LDCS_COMMPATH", commpath, 1); diff --git a/src/client/client/client.c b/src/client/client/client.c index 6a75fb08..b30b96f2 100644 --- a/src/client/client/client.c +++ b/src/client/client/client.c @@ -264,6 +264,8 @@ static int init_server_connection() #endif } send_cachepath_query( ldcsid, &chosen_realized_cachepath, &chosen_parsed_cachepath ); + assert( chosen_realized_cachepath ); + assert( chosen_parsed_cachepath ); snprintf(debugging_name, 32, "Client.%d", rankinfo[0]); LOGGING_INIT(debugging_name); @@ -471,6 +473,7 @@ char *client_library_load(const char *name) char *orig_file_name = (char *) name; if (is_in_spindle_cache(name)) { + assert( chosen_realized_cachepath ); debug_printf2("Library %s is in spindle cache (%s). 
Translating request\n", name, chosen_realized_cachepath); memset(fixed_name, 0, MAX_PATH_LEN+1); send_orig_path_request(ldcsid, orig_file_name, fixed_name); diff --git a/src/client/client/intercept_readlink.c b/src/client/client/intercept_readlink.c index 28547bf1..93fb879e 100644 --- a/src/client/client/intercept_readlink.c +++ b/src/client/client/intercept_readlink.c @@ -39,6 +39,7 @@ static int fix_local_readlink(char *buf, size_t bufsiz) char tmp[MAX_PATH_LEN+1]; extern char *chosen_realized_cachepath; + assert( chosen_realized_cachepath ); cachepath_len = strlen(chosen_realized_cachepath); snprintf(spindle_id, sizeof(spindle_id), "spindle.%lx", number); if (strstr(buf, spindle_id) && strncmp(chosen_realized_cachepath, buf, cachepath_len) == 0) { diff --git a/src/client/client/should_intercept.c b/src/client/client/should_intercept.c index cee4e43c..89e4133d 100644 --- a/src/client/client/should_intercept.c +++ b/src/client/client/should_intercept.c @@ -35,6 +35,7 @@ int is_in_spindle_cache(const char *pathname) static int cachepath_size = 0; static int orig_cachepath_size = 0; extern char *chosen_realized_cachepath, *chosen_parsed_cachepath; + assert( chosen_realized_cachepath ); if (!cachepath_size) { cachepath_size = strlen(chosen_realized_cachepath); } diff --git a/src/server/startup/spindle_be.cc b/src/server/startup/spindle_be.cc index f91ce2f6..d5a234a1 100644 --- a/src/server/startup/spindle_be.cc +++ b/src/server/startup/spindle_be.cc @@ -71,7 +71,7 @@ static int unpack_data(spindle_args_t *args, void *buffer, int buffer_size) unpack_param(args->session_key, buf, pos); unpack_param(args->exec_excludes, buf, pos); assert(pos == buffer_size); - + assert( args->candidate_cachepaths ); return 0; } diff --git a/testsuite/test_driver.c b/testsuite/test_driver.c index bcee0d12..50ed9102 100644 --- a/testsuite/test_driver.c +++ b/testsuite/test_driver.c @@ -1292,6 +1292,7 @@ static int checkPathForLeak(const char *what, const char *path, const char *cach 
static int leak_check_cb(struct dl_phdr_info *p, size_t psize, void *opaque) { char *cachepath = (char *) opaque; + assert( cachepath ); if (!p->dlpi_name || p->dlpi_name[0] == '\0') return 0; checkPathForLeak("dl_iterate_phdr", p->dlpi_name, cachepath); From 3712be1135d1208f659cf0c8f07838270e764131 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 26 Mar 2026 12:45:29 -0700 Subject: [PATCH 45/53] FIX: Update debug3 msg text "sending message of type: request_location_path" is now "sending message of type: CHOSEN_CACHEPATH_REQUEST" --- src/client/client_comlib/client_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index bd99794e..6599f9f8 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -54,7 +54,7 @@ int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chose COMM_LOCK; - debug_printf3("sending message of type: request_location_path.\n" ); + debug_printf3("sending message of type: CHOSEN_CACHEPATH_REQUEST.\n" ); client_send_msg(fd, &message); client_recv_msg_static(fd, &message, LDCS_READ_BLOCK); From caa057b6b200f64ede5c0679170b502490621132 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 27 Mar 2026 07:04:59 -0700 Subject: [PATCH 46/53] FIX: Fixes use-after-free error Known to affect the symbolic form of candidate cachepaths. Not sure that's ever being used, but it's fixed now. 
--- .../auditserver/ldcs_audit_server_handlers.c | 15 +++++++++++++++ src/utils/parseloc.c | 13 ++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index 9d9e8b8d..85636565 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -22,6 +22,8 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include #include #include +#include +#include #include "ldcs_api.h" #include "ldcs_api_listen.h" @@ -2968,12 +2970,22 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag struct timespec seconds = { .tv_sec = 4, .tv_nsec = 0 }; int num_children = ldcs_audit_server_md_get_num_children(procdata); + debug_printf( "Processing REQUEST_CACHEPATH_CONSENSUS." ); + debug_printf( " procdata->cachepath_bitidx = %#"PRIx64"\n", procdata->cachepath_bitidx ); + debug_printf( " procdata->cachepaths = %s\n", procdata->cachepaths ); + debug_printf( " procdata->cachepath = %s [should be null]\n", procdata->cachepath ); + debug_printf( " procdata->commpath = %s\n", procdata->commpath ); + debug_printf( " num_children = %d\n", num_children ); + if (num_children) { spindle_broadcast(procdata, msg); + debug_printf( "Successfully broadcast REQUEST_CACHEPATH_CONSENSUS\n" ); msgbundle_force_flush(procdata); + debug_printf( "Successfully flushed the broadcast of REQUEST_CACHEPATH_CONSENSUS\n" ); } ldcs_audit_server_md_allreduce_AND( &procdata->cachepath_bitidx ); + debug_printf( "The consensus value for procdata->cachepath_bitidx is: %#"PRIx64"\n", procdata->cachepath_bitidx ); if( procdata->cachepath_bitidx == 0 ){ err_printf("No valid cachepath path available. 
Falling back to \"commpath\" path (%s).\n", procdata->commpath); @@ -2984,6 +2996,9 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag &procdata->cachepath, &procdata->parsed_cachepath, &procdata->symbolic_cachepath); + debug_printf( "The consensus cachepath is: %s\n", procdata->cachepath ); + debug_printf( "The consensus parsed_cachepath is: %s\n", procdata->parsed_cachepath ); + debug_printf( "The consensus symbolic_cachepath is: %s\n", procdata->symbolic_cachepath ); } debug_printf( "Arrived at cachepath consensus: %s. Now delaying to flush race condition.\n", procdata->cachepath ); diff --git a/src/utils/parseloc.c b/src/utils/parseloc.c index 8c2a7929..b14cab00 100644 --- a/src/utils/parseloc.c +++ b/src/utils/parseloc.c @@ -317,9 +317,14 @@ static int validateCandidatePath( char *candidatePath, char **realizedPath, char if( realizedCandidatePath ){ rc = spindle_mkdir( parsedCandidatePath ); if( 0 == rc ){ - if( symbolicPath) *symbolicPath = candidatePath; - if( parsedPath ) *parsedPath = parsedCandidatePath; - if( realizedPath) *realizedPath = realizedCandidatePath; + // candidatePath is going to be freed in the calling function. + // symbolicPath needs a strdup(). parsedPath() and realizedPath() + // allocate their own memory for strings right now; the extra + // strdup()s are just in case a future implementation decides + // to modify the string passed instead of returning a new one. 
+ if( symbolicPath) *symbolicPath = strdup(candidatePath); + if( parsedPath ) *parsedPath = strdup(parsedCandidatePath); + if( realizedPath) *realizedPath = strdup(realizedCandidatePath); return 1; }else{ debug_printf2("Unable to create directory %s, moving on to the next candidate.\n", realizedCandidatePath ); @@ -345,6 +350,7 @@ static char *realizedCachePaths[64], *parsedCachePaths[64], *symbolicCachePaths[ void determineValidCachePaths( uint64_t *validBitIdx, char *origPathList, number_t number ){ char *saveptr, *candidatePath, *pathList = strdup( origPathList ); + size_t pathList_len = strlen( pathList ); uint64_t bitoffset = 0; *validBitIdx = 0; @@ -360,6 +366,7 @@ void determineValidCachePaths( uint64_t *validBitIdx, char *origPathList, number bitoffset++; candidatePath = strtok_r( NULL, ":", &saveptr ); } + memset( pathList, 'Q', pathList_len ); free( pathList ); } From 3b4503c47ca0dabc60a0b7c76d110d676157f1e4 Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 27 Mar 2026 14:14:53 -0700 Subject: [PATCH 47/53] FIX: Greater message detail in FE code _message_type_to_str() can now be used in cobo_fe_comm.c. ldcs_audit_server_fe_broadcast() now reports message type. Only two messages are expected to be routed through there, but it's the correct way to report it. 
--- src/fe/comlib/Makefile.am | 2 +- src/fe/comlib/Makefile.in | 30 +++++++++++++++++++++++++----- src/fe/comlib/cobo_fe_comm.c | 2 +- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/src/fe/comlib/Makefile.am b/src/fe/comlib/Makefile.am index 56a25ea2..2bb98236 100644 --- a/src/fe/comlib/Makefile.am +++ b/src/fe/comlib/Makefile.am @@ -1,5 +1,5 @@ noinst_LTLIBRARIES = libfe_cobo.la #noinst_LTLIBRARIES = libfe_msocket.la AM_CPPFLAGS = -I$(top_srcdir)/../logging -I$(top_srcdir)/../include -I$(top_srcdir)/../cobo -libfe_cobo_la_SOURCES = cobo_fe_comm.c +libfe_cobo_la_SOURCES = cobo_fe_comm.c ../../server/comlib/ldcs_api_util.c #libfe_msocket_la_SOURCES = msocket_fe_comm.c diff --git a/src/fe/comlib/Makefile.in b/src/fe/comlib/Makefile.in index ba02a246..b9f4599b 100644 --- a/src/fe/comlib/Makefile.in +++ b/src/fe/comlib/Makefile.in @@ -110,7 +110,9 @@ CONFIG_CLEAN_FILES = CONFIG_CLEAN_VPATH_FILES = LTLIBRARIES = $(noinst_LTLIBRARIES) libfe_cobo_la_LIBADD = -am_libfe_cobo_la_OBJECTS = cobo_fe_comm.lo +am__dirstamp = $(am__leading_dot)dirstamp +am_libfe_cobo_la_OBJECTS = cobo_fe_comm.lo \ + ../../server/comlib/ldcs_api_util.lo libfe_cobo_la_OBJECTS = $(am_libfe_cobo_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) @@ -131,7 +133,8 @@ am__v_at_1 = DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) depcomp = $(SHELL) $(top_srcdir)/../../scripts/depcomp am__maybe_remake_depfiles = depfiles -am__depfiles_remade = ./$(DEPDIR)/cobo_fe_comm.Plo +am__depfiles_remade = ../../server/comlib/$(DEPDIR)/ldcs_api_util.Plo \ + ./$(DEPDIR)/cobo_fe_comm.Plo am__mv = mv -f COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) @@ -332,7 +335,7 @@ top_srcdir = @top_srcdir@ noinst_LTLIBRARIES = libfe_cobo.la #noinst_LTLIBRARIES = libfe_msocket.la AM_CPPFLAGS = -I$(top_srcdir)/../logging -I$(top_srcdir)/../include -I$(top_srcdir)/../cobo -libfe_cobo_la_SOURCES = cobo_fe_comm.c 
+libfe_cobo_la_SOURCES = cobo_fe_comm.c ../../server/comlib/ldcs_api_util.c all: all-am .SUFFIXES: @@ -377,16 +380,28 @@ clean-noinstLTLIBRARIES: echo rm -f $${locs}; \ rm -f $${locs}; \ } +../../server/comlib/$(am__dirstamp): + @$(MKDIR_P) ../../server/comlib + @: > ../../server/comlib/$(am__dirstamp) +../../server/comlib/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) ../../server/comlib/$(DEPDIR) + @: > ../../server/comlib/$(DEPDIR)/$(am__dirstamp) +../../server/comlib/ldcs_api_util.lo: \ + ../../server/comlib/$(am__dirstamp) \ + ../../server/comlib/$(DEPDIR)/$(am__dirstamp) libfe_cobo.la: $(libfe_cobo_la_OBJECTS) $(libfe_cobo_la_DEPENDENCIES) $(EXTRA_libfe_cobo_la_DEPENDENCIES) $(AM_V_CCLD)$(LINK) $(libfe_cobo_la_OBJECTS) $(libfe_cobo_la_LIBADD) $(LIBS) mostlyclean-compile: -rm -f *.$(OBJEXT) + -rm -f ../../server/comlib/*.$(OBJEXT) + -rm -f ../../server/comlib/*.lo distclean-compile: -rm -f *.tab.c +@AMDEP_TRUE@@am__include@ @am__quote@../../server/comlib/$(DEPDIR)/ldcs_api_util.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cobo_fe_comm.Plo@am__quote@ # am--include-marker $(am__depfiles_remade): @@ -424,6 +439,7 @@ mostlyclean-libtool: clean-libtool: -rm -rf .libs _libs + -rm -rf ../../server/comlib/.libs ../../server/comlib/_libs ID: $(am__tagged_files) $(am__define_uniq_tagged_files); mkid -fID $$unique @@ -540,6 +556,8 @@ clean-generic: distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) -test . 
= "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + -rm -f ../../server/comlib/$(DEPDIR)/$(am__dirstamp) + -rm -f ../../server/comlib/$(am__dirstamp) maintainer-clean-generic: @echo "This command is intended for maintainers to use" @@ -550,7 +568,8 @@ clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ mostlyclean-am distclean: distclean-am - -rm -f ./$(DEPDIR)/cobo_fe_comm.Plo + -rm -f ../../server/comlib/$(DEPDIR)/ldcs_api_util.Plo + -rm -f ./$(DEPDIR)/cobo_fe_comm.Plo -rm -f Makefile distclean-am: clean-am distclean-compile distclean-generic \ distclean-tags @@ -596,7 +615,8 @@ install-ps-am: installcheck-am: maintainer-clean: maintainer-clean-am - -rm -f ./$(DEPDIR)/cobo_fe_comm.Plo + -rm -f ../../server/comlib/$(DEPDIR)/ldcs_api_util.Plo + -rm -f ./$(DEPDIR)/cobo_fe_comm.Plo -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic diff --git a/src/fe/comlib/cobo_fe_comm.c b/src/fe/comlib/cobo_fe_comm.c index cef2f225..297fc1d3 100644 --- a/src/fe/comlib/cobo_fe_comm.c +++ b/src/fe/comlib/cobo_fe_comm.c @@ -187,7 +187,7 @@ int ldcs_audit_server_fe_broadcast(ldcs_message_t *msg, void *data) (void)data; int root_fd; - debug_printf("Broadcasting message to daemons\n"); + debug_printf("Broadcasting message %s to daemons\n",_message_type_to_str( msg->header.type ) ); cobo_server_get_root_socket(&root_fd); return write_msg(root_fd, msg); From b01b3a03452f106b821e3573c8701e665168b16e Mon Sep 17 00:00:00 2001 From: Barry Date: Sun, 29 Mar 2026 20:31:19 -0700 Subject: [PATCH 48/53] Fixed missing carriage return. 
--- src/server/auditserver/ldcs_audit_server_handlers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index 85636565..849d294d 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -2970,7 +2970,7 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag struct timespec seconds = { .tv_sec = 4, .tv_nsec = 0 }; int num_children = ldcs_audit_server_md_get_num_children(procdata); - debug_printf( "Processing REQUEST_CACHEPATH_CONSENSUS." ); + debug_printf( "Processing REQUEST_CACHEPATH_CONSENSUS.\n" ); debug_printf( " procdata->cachepath_bitidx = %#"PRIx64"\n", procdata->cachepath_bitidx ); debug_printf( " procdata->cachepaths = %s\n", procdata->cachepaths ); debug_printf( " procdata->cachepath = %s [should be null]\n", procdata->cachepath ); From 0e5c4b6678686912d1b8e18c698833a0a6d30b03 Mon Sep 17 00:00:00 2001 From: Barry Date: Mon, 30 Mar 2026 11:31:38 -0700 Subject: [PATCH 49/53] FIX: Don't send server 4k of uninitialized data. 
--- src/client/client_comlib/client_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client/client_comlib/client_api.c b/src/client/client_comlib/client_api.c index 6599f9f8..6101a494 100644 --- a/src/client/client_comlib/client_api.c +++ b/src/client/client_comlib/client_api.c @@ -49,7 +49,7 @@ int send_cachepath_query( int fd, char **chosen_realized_cachepath, char **chose do{ message.header.type = LDCS_MSG_CHOSEN_CACHEPATH_REQUEST; - message.header.len = MAX_PATH_LEN; + message.header.len = 0; message.data = buffer; COMM_LOCK; From 5f713a4be21edfa1a24ad65051d26e0b1b575d03 Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 8 Apr 2026 21:46:35 -0700 Subject: [PATCH 50/53] FIX: Removes 4-second delay in cachepath process --- src/server/auditserver/ldcs_audit_server_handlers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index 849d294d..9b4dadea 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -2967,7 +2967,7 @@ static int handle_client_pickone_msg(ldcs_process_data_t *procdata, int nc, ldcs static int cachepath_consensus_reached; static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_message_t *msg){ - struct timespec seconds = { .tv_sec = 4, .tv_nsec = 0 }; + /* QQQ struct timespec seconds = { .tv_sec = 4, .tv_nsec = 0 }; */ int num_children = ldcs_audit_server_md_get_num_children(procdata); debug_printf( "Processing REQUEST_CACHEPATH_CONSENSUS.\n" ); @@ -3002,7 +3002,7 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag } debug_printf( "Arrived at cachepath consensus: %s. 
Now delaying to flush race condition.\n", procdata->cachepath ); - nanosleep( &seconds, NULL ); + /* QQQ nanosleep( &seconds, NULL ); */ debug_printf( "Delay completed.\n"); debug_printf3("Initializing file cache cachepath %s\n", procdata->cachepath); From 1a6fc91469e30782dac38b87d5a2c4e79b3cce3e Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 21 May 2026 21:01:51 -0700 Subject: [PATCH 51/53] Reworks cleanup_created_dirs() and CleanupProc. Cleanup now takes both commpath and cachepath as prefixes for removing files created by Spindle. --- src/server/auditserver/cleanup_proc.cc | 55 +++++++++++-------- src/server/auditserver/cleanup_proc.h | 8 +-- .../auditserver/ldcs_audit_server_filemngt.c | 29 +++++----- .../auditserver/ldcs_audit_server_filemngt.h | 2 +- .../auditserver/ldcs_audit_server_handlers.c | 10 ++-- .../auditserver/ldcs_audit_server_process.c | 2 +- 6 files changed, 58 insertions(+), 48 deletions(-) diff --git a/src/server/auditserver/cleanup_proc.cc b/src/server/auditserver/cleanup_proc.cc index a3d3ddcc..8f0e7f6a 100644 --- a/src/server/auditserver/cleanup_proc.cc +++ b/src/server/auditserver/cleanup_proc.cc @@ -51,11 +51,17 @@ static bool longest_str_first(const string &a, const string &b) return a.size() > b.size(); } -static void rmDirSet(const set &dirs, const char *prefix_dir) +static void rmDirSet(const set &dirs, const char *cachepath, const char *commpath) { - string path_sep("/"); - size_t prefix_size = prefix_dir ? strlen(prefix_dir) : 0; - + string path_sep("/"); + if( !cachepath || !commpath ){ + // Should never happen. + err_printf( "cachepath (%s) and/or commpath (%s) is NULL. 
Unable to cleanup files.\n", cachepath ? cachepath : "(null)", commpath ? commpath : "(null)" ); + return; + } + size_t cachepath_len = strlen(cachepath); + size_t commpath_len = strlen(commpath); + for (set::const_iterator i = dirs.begin(); i != dirs.end(); i++) { DIR *dir = opendir(i->c_str()); if (!dir) @@ -71,9 +77,10 @@ static void rmDirSet(const set &dirs, const char *prefix_dir) if (dirs.find(componentpath) != dirs.end()) continue; - if (strncmp(prefix_dir, componentpath.c_str(), prefix_size) != 0) { - // We have multiple directory roots. Not a problem if the directory - // we're looking for isn't in this one. + if ( (strncmp(cachepath, componentpath.c_str(), cachepath_len) != 0) && + (strncmp(commpath, componentpath.c_str(), commpath_len ) != 0) ){ + err_printf( "File for deletion (%s) is outside of cachepath (%s) and commpath (%s).\n", + componentpath.c_str(), cachepath, commpath ); continue; } unlink(componentpath.c_str()); @@ -83,25 +90,28 @@ static void rmDirSet(const set &dirs, const char *prefix_dir) vector ordered_dirs(dirs.begin(), dirs.end()); sort(ordered_dirs.begin(), ordered_dirs.end(), longest_str_first); for (vector::iterator i = ordered_dirs.begin(); i != ordered_dirs.end(); i++) { - if (strncmp(prefix_dir, i->c_str(), prefix_size) != 0) { - continue; - } + if ( (strncmp(cachepath, i->c_str(), cachepath_len) != 0) && + (strncmp(commpath, i->c_str(), commpath_len) != 0) ){ + err_printf( "Directory for deletion (%s) is outside of cachepath (%s) and commpath (%s).\n", + i->c_str(), cachepath, commpath ); + continue; + } rmdir(i->c_str()); - } + } } class CleanupProc { - friend void init_cleanup_proc(const char *); + friend void init_cleanup_proc(const char *, const char *); private: set dirs; int write_dir_fd; int read_dir_fd; bool has_error; pid_t child_pid; - const char *prefix_dir; + const char *cachepath, *commpath; - CleanupProc(const char *pd); + CleanupProc(const char *cachepath, const char *commpath); void rmDirs(); void cleanupMain(); public: @@ -110,11 +120,12 @@ class CleanupProc bool hadError(); }; 
-CleanupProc::CleanupProc(const char *pd) : +CleanupProc::CleanupProc(const char *cachepath, const char *commpath) : write_dir_fd(-1), read_dir_fd(-1), has_error(false), - prefix_dir(pd) + cachepath(cachepath), + commpath(commpath) { int fds[2]; int result; @@ -185,7 +196,7 @@ bool CleanupProc::hadError() void CleanupProc::rmDirs() { - rmDirSet(dirs, prefix_dir); + rmDirSet(dirs, cachepath, commpath); } void CleanupProc::cleanupMain() @@ -240,10 +251,10 @@ void CleanupProc::addDir(const char *dir) static CleanupProc *proc = NULL; static set local_dircache; -void init_cleanup_proc(const char *location_dir) +void init_cleanup_proc(const char *cachepath, const char *commpath) { assert(!proc); - proc = new CleanupProc(location_dir); + proc = new CleanupProc(cachepath, commpath); if (proc->hadError()) { delete proc; proc = NULL; @@ -269,13 +280,13 @@ int lookup_prev_mkdir(const char *dir) return (i != local_dircache.end()) ? 1 : 0; } -void cleanup_created_dirs(const char *prefix_dir) +void cleanup_created_dirs(const char *cachepath, const char *commpath) { if (proc) { proc->triggerCleanup(); } else { debug_printf("Cleaning files with local unlink/rmdirs.\n"); - rmDirSet(local_dircache, prefix_dir); - } + rmDirSet(local_dircache, cachepath, commpath); + } } diff --git a/src/server/auditserver/cleanup_proc.h b/src/server/auditserver/cleanup_proc.h index f5f71123..ab8bb939 100644 --- a/src/server/auditserver/cleanup_proc.h +++ b/src/server/auditserver/cleanup_proc.h @@ -18,11 +18,11 @@ Place, Suite 330, Boston, MA 02111-1307 USA extern "C" { #endif -void init_cleanup_proc(const char *location_dir); +void init_cleanup_proc(const char *cachepath, const char *commpath); void track_mkdir(const char *dir); -int lookup_prev_mkdir(const char *dir); -void cleanup_created_dirs(const char *location_dir); - +int lookup_prev_mkdir(const char *dir); +void cleanup_created_dirs(const char *cachepath, const char *commpath); + #if defined(__cplusplus) } #endif diff --git 
a/src/server/auditserver/ldcs_audit_server_filemngt.c b/src/server/auditserver/ldcs_audit_server_filemngt.c index bb2e9294..1d0f410b 100644 --- a/src/server/auditserver/ldcs_audit_server_filemngt.c +++ b/src/server/auditserver/ldcs_audit_server_filemngt.c @@ -45,9 +45,9 @@ Place, Suite 330, Boston, MA 02111-1307 USA #error LIBEXECDIR must be defined #endif -char *_ldcs_audit_server_tmpdir; +char *_ldcs_audit_server_cachepath; static char *normalized_tmpdir; - +static char *_ldcs_audit_server_commpath; extern int spindle_mkdir(char *path); static char *filemngt_normalize_dir(char *dir) { @@ -55,25 +55,26 @@ static char *filemngt_normalize_dir(char *dir) { return newpath ? newpath : dir; } -int ldcs_audit_server_filemngt_init (char* location) { +int ldcs_audit_server_filemngt_init (char *cachepath, char *commpath) { int rc=0; - _ldcs_audit_server_tmpdir = location; - if (-1 == spindle_mkdir(_ldcs_audit_server_tmpdir)) { - err_printf("mkdir: ERROR during mkdir %s\n", _ldcs_audit_server_tmpdir); + _ldcs_audit_server_cachepath = cachepath; + _ldcs_audit_server_commpath = commpath; + if (-1 == spindle_mkdir(_ldcs_audit_server_cachepath)) { + err_printf("mkdir: ERROR during mkdir %s\n", _ldcs_audit_server_cachepath); _error("mkdir failed"); } - normalized_tmpdir = filemngt_normalize_dir(location); + normalized_tmpdir = filemngt_normalize_dir(cachepath); return(rc); } /* Returns NULL if not a cached file. 
Otherwise, returns pointer to global portion of string */ char* ldcs_is_a_cachedfile (char* filename) { - int len = strlen(_ldcs_audit_server_tmpdir); + int len = strlen(_ldcs_audit_server_cachepath); int norm_len = strlen(normalized_tmpdir); - if ( strncmp(_ldcs_audit_server_tmpdir, filename, len) == 0 ) + if ( strncmp(_ldcs_audit_server_cachepath, filename, len) == 0 ) return filename + len + 1; if ( strncmp(normalized_tmpdir, filename, norm_len) == 0 ) return filename + norm_len + 1; @@ -105,7 +106,7 @@ char *filemngt_calc_localname(char *global_name, calc_local_t reqtype) size_t dirpart_size, filepart_size; int cut_dirpart_slash; - lastslash = strrchr(_ldcs_audit_server_tmpdir, '/'); + lastslash = strrchr(_ldcs_audit_server_cachepath, '/'); if (lastslash && lastslash[1] == '\0') endslash = ""; else @@ -153,10 +154,10 @@ char *filemngt_calc_localname(char *global_name, calc_local_t reqtype) cut_dirpart_slash = (dirpart[0] == '/') ? 1 : 0; - snprintf(target, sizeof(target), "%s%s%s", _ldcs_audit_server_tmpdir, endslash, dirpart+cut_dirpart_slash); + snprintf(target, sizeof(target), "%s%s%s", _ldcs_audit_server_cachepath, endslash, dirpart+cut_dirpart_slash); spindle_mkdir(target); - snprintf(target, sizeof(target), "%s%s%s/%s", _ldcs_audit_server_tmpdir, endslash, dirpart+cut_dirpart_slash, filepart); + snprintf(target, sizeof(target), "%s%s%s/%s", _ldcs_audit_server_cachepath, endslash, dirpart+cut_dirpart_slash, filepart); GCC7_ENABLE_WARNING; @@ -287,7 +288,7 @@ int filemngt_decode_packet(node_peer_t peer, ldcs_message_t *msg, char *filename **/ int ldcs_audit_server_filemngt_clean() { - cleanup_created_dirs(_ldcs_audit_server_tmpdir); + cleanup_created_dirs(_ldcs_audit_server_cachepath, _ldcs_audit_server_commpath); return 0; } @@ -829,7 +830,7 @@ int filemngt_convert_proc_maps(int pid, char *new_maps_filename, int new_maps_fi int result; debug_printf2("Asked to convert /proc/%d/maps to remove spindle paths\n", pid); - result = 
translate_proc_pid_maps(_ldcs_audit_server_tmpdir, pid, new_maps_filename, new_maps_filename_size); + result = translate_proc_pid_maps(_ldcs_audit_server_cachepath, pid, new_maps_filename, new_maps_filename_size); if (result == -1) { new_maps_filename[0] = '\0'; return -1; diff --git a/src/server/auditserver/ldcs_audit_server_filemngt.h b/src/server/auditserver/ldcs_audit_server_filemngt.h index 6f45c485..592b9c82 100644 --- a/src/server/auditserver/ldcs_audit_server_filemngt.h +++ b/src/server/auditserver/ldcs_audit_server_filemngt.h @@ -23,7 +23,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include "ldcs_audit_server_md.h" -int ldcs_audit_server_filemngt_init (char* location); +int ldcs_audit_server_filemngt_init (char *cachepath, char *commpath); int filemngt_read_file(char *filename, void *buffer, size_t *size, int strip, int *err, int *was_stripped); int filemngt_encode_packet(char *filename, void *filecontents, size_t filesize, diff --git a/src/server/auditserver/ldcs_audit_server_handlers.c b/src/server/auditserver/ldcs_audit_server_handlers.c index 9b4dadea..67d2a04b 100644 --- a/src/server/auditserver/ldcs_audit_server_handlers.c +++ b/src/server/auditserver/ldcs_audit_server_handlers.c @@ -2298,7 +2298,7 @@ static int handle_client_fileexist_msg(ldcs_process_data_t *procdata, int nc, ld return handle_client_progress(procdata, nc); } -extern char *_ldcs_audit_server_tmpdir; +extern char *_ldcs_audit_server_cachepath; static int handle_client_origpath_msg(ldcs_process_data_t *procdata, int nc, ldcs_message_t *msg) { ldcs_client_t *client; @@ -2313,7 +2313,7 @@ static int handle_client_origpath_msg(ldcs_process_data_t *procdata, int nc, ldc lookuppath[MAX_PATH_LEN] = '\0'; if (*origpath != '/' && *origpath != '.') - snprintf(lookuppath, MAX_PATH_LEN, "%s/%s", _ldcs_audit_server_tmpdir, origpath); + snprintf(lookuppath, MAX_PATH_LEN, "%s/%s", _ldcs_audit_server_cachepath, origpath); else strncpy(lookuppath, origpath, MAX_PATH_LEN); @@ -2967,7 +2967,6 @@ 
static int handle_client_pickone_msg(ldcs_process_data_t *procdata, int nc, ldcs static int cachepath_consensus_reached; static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_message_t *msg){ - /* QQQ struct timespec seconds = { .tv_sec = 4, .tv_nsec = 0 }; */ int num_children = ldcs_audit_server_md_get_num_children(procdata); debug_printf( "Processing REQUEST_CACHEPATH_CONSENSUS.\n" ); @@ -2991,7 +2990,6 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag err_printf("No valid cachepath path available. Falling back to \"commpath\" path (%s).\n", procdata->commpath); procdata->cachepath = procdata->commpath; }else{ - // ldcs_audit_server_filemngt_init() does it's own realize() pass. getValidCachePathByIndex( procdata->cachepath_bitidx, &procdata->cachepath, &procdata->parsed_cachepath, @@ -2999,14 +2997,14 @@ static int handle_cachepath_consensus(ldcs_process_data_t *procdata, ldcs_messag debug_printf( "The consensus cachepath is: %s\n", procdata->cachepath ); debug_printf( "The consensus parsed_cachepath is: %s\n", procdata->parsed_cachepath ); debug_printf( "The consensus symbolic_cachepath is: %s\n", procdata->symbolic_cachepath ); + debug_printf( "The commpath is: %s\n", procdata->commpath ); } debug_printf( "Arrived at cachepath consensus: %s. 
Now delaying to flush race condition.\n", procdata->cachepath ); - /* QQQ nanosleep( &seconds, NULL ); */ debug_printf( "Delay completed.\n"); debug_printf3("Initializing file cache cachepath %s\n", procdata->cachepath); - ldcs_audit_server_filemngt_init(procdata->cachepath); + ldcs_audit_server_filemngt_init(procdata->cachepath, procdata->commpath); test_printf(" cachepath=%s\n", procdata->cachepath); cachepath_consensus_reached = 1; diff --git a/src/server/auditserver/ldcs_audit_server_process.c b/src/server/auditserver/ldcs_audit_server_process.c index a73a7b7f..b6ecdbff 100644 --- a/src/server/auditserver/ldcs_audit_server_process.c +++ b/src/server/auditserver/ldcs_audit_server_process.c @@ -196,7 +196,7 @@ int ldcs_audit_server_process(spindle_args_t *args) ldcs_process_data.server_stat.hostname=ldcs_process_data.hostname; if (ldcs_process_data.opts & OPT_PROCCLEAN) - init_cleanup_proc(ldcs_process_data.commpath); + init_cleanup_proc(ldcs_process_data.cachepath, ldcs_process_data.commpath); debug_printf3("Initializing connections for clients at %s and %lu\n", ldcs_process_data.commpath, (unsigned long) ldcs_process_data.number); From 9b5911fe851c0a03c68c7d2b97cfee7dddfd5244 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 21 May 2026 23:30:01 -0700 Subject: [PATCH 52/53] Removes LDCS_[LOCATION|COMMPATH]_MOD The original LDCS_LOCATION_MOD checked to see if there were multiple servers running on a node and, if so, modified the location string so that each server had its own location. The code did not handle the case where the directory above the requested directory was not writeable, e.g., if the user passed in --location=/tmp, the code would try to create a directory /tmp-00 for the first server. That fails. With commpath and cachepath replacing location, and with new initialization paths, the existing code would modify only commpath after the commpath directory had been created. 
If the multiple-server case needs to be supported, commpath- and cachepath-specific code needs to be added back in. --- .../auditserver/ldcs_audit_server_md_msocket.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/server/auditserver/ldcs_audit_server_md_msocket.c b/src/server/auditserver/ldcs_audit_server_md_msocket.c index 6db18bd4..5cfe9f2d 100644 --- a/src/server/auditserver/ldcs_audit_server_md_msocket.c +++ b/src/server/auditserver/ldcs_audit_server_md_msocket.c @@ -62,7 +62,6 @@ int ldcs_audit_server_md_init ( ldcs_process_data_t *ldcs_process_data ) { int rc=0; char* ldcs_nportsstr=getenv("LDCS_NPORTS"); - char* ldcs_locmodstr=getenv("LDCS_COMMPATH_MOD"); int usedport; int serverfd, serverid, i; @@ -128,21 +127,6 @@ int ldcs_audit_server_md_init ( ldcs_process_data_t *ldcs_process_data ) { ldcs_listen_unregister_fd(serverfd); - - if(ldcs_locmodstr) { - int ldcs_locmod=atoi(ldcs_locmodstr); - if(ldcs_locmod>0) { - char buffer[MAX_PATH_LEN]; - debug_printf3("multiple server per node add modifier to location mod=%d\n",ldcs_locmod); - if(strlen(ldcs_process_data->location)+10location,ldcs_process_data->md_rank%ldcs_locmod); - debug_printf3("change location to %s (locmod=%d)\n",buffer,ldcs_locmod); - free(ldcs_process_data->location); - ldcs_process_data->location=strdup(buffer); - } else _error("location path too long"); - } - } - return(rc); } From b5970ec4d2e97c3742f50e2c143adecaa689f98f Mon Sep 17 00:00:00 2001 From: Barry Date: Fri, 22 May 2026 00:43:00 -0700 Subject: [PATCH 53/53] Removes --with-localstorage from slurm testcase That configure parameter is no longer supported. 
Replaced with --with-cachepaths=/tmp/commpath/cachepath --with-commpath=/tmp/commpath --- .../spindle-slurm-ubuntu/testing-srun/scripts/build_spindle.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/containers/spindle-slurm-ubuntu/testing-srun/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing-srun/scripts/build_spindle.sh index acf1ef0b..125fe2eb 100755 --- a/containers/spindle-slurm-ubuntu/testing-srun/scripts/build_spindle.sh +++ b/containers/spindle-slurm-ubuntu/testing-srun/scripts/build_spindle.sh @@ -3,7 +3,7 @@ set -euxo pipefail mkdir -p /home/${USER}/Spindle-build cd /home/${USER}/Spindle-build -/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-localstorage=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-cachepaths=/tmp/commpath/cachepath --with-commpath=/tmp/commpath CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" make -j$(nproc) make install