From a0232a2bd75e561d853eb85c49057425a68ede2e Mon Sep 17 00:00:00 2001 From: Matt Ezell Date: Fri, 17 Jan 2025 01:10:12 -0500 Subject: [PATCH 1/3] Use --external-launcher for Slurm launch --- configure.common.ac | 11 ++++------- src/fe/startup/launch_slurm.cc | 4 +--- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/configure.common.ac b/configure.common.ac index 4e6141c0..1b7baf94 100644 --- a/configure.common.ac +++ b/configure.common.ac @@ -141,8 +141,8 @@ AS_CASE([$sysconfdir], BROKEN_SRUN=0 bad_srun_major=20 bad_srun_minor=11 -good_srun_major=0 -good_srun_minor=0 +good_srun_major=23 +good_srun_minor=11 if test "x$ENABLE_SLURM" == "xtrue"; then AC_PATH_PROG(SRUN_PATH, "srun", "none") if test "x$SRUN_PATH" == "xnone"; then @@ -153,10 +153,7 @@ if test "x$ENABLE_SLURM" == "xtrue"; then srun_major=`echo $srun_version | cut -d . -f 1` srun_minor=`echo $srun_version | cut -d . -f 2` srun_point=`echo $srun_version | cut -d . -f 3` - if ( [[ "$srun_major" == "$bad_srun_major" ]] && [[ $srun_minor -ge $bad_srun_minor ]] ) || [[ $srun_major -gt $bad_srun_major ]] ; then BROKEN_SRUN=1; fi - if test "x$good_srun_major" != "x0"; then - if ( [[ "$srun_major" == "$good_srun_major" ]] && [[ $srun_minor -ge $good_srun_minor ]] ) || [[ $srun_major -gt $good_srun_major ]] ; then BROKEN_SRUN=0; fi - fi + if ( [ "$srun_major" == "$good_srun_major" ] && [ $srun_minor -lt $good_srun_minor ] ) || [ $srun_major -lt $good_srun_major ]; then BROKEN_SRUN=1; fi if test "x$BROKEN_SRUN" == "x1"; then AC_MSG_RESULT([no]) @@ -183,7 +180,7 @@ fi if test "x$ENABLE_RSH_LAUNCH" == "x1"; then AC_DEFINE_UNQUOTED([RSHLAUNCH_ENABLED],[1],[Default mode for rsh launch]) fi -AC_DEFINE_UNQUOTED([BROKEN_SRUN],[1],[Whether we are using a broken srun]) +AC_DEFINE_UNQUOTED([BROKEN_SRUN],[$BROKEN_SRUN],[Whether we are using a broken srun]) #Runmode detection (pipe/socket or cobo/msocket communications) if test "x$OS_BUILD" == "xlinux"; then diff --git a/src/fe/startup/launch_slurm.cc b/src/fe/startup/launch_slurm.cc index db490685..d4dce4fc 100644 --- a/src/fe/startup/launch_slurm.cc +++ b/src/fe/startup/launch_slurm.cc @@ -159,10 +159,8 @@ bool SlurmLauncher::spawnDaemon() char count_buffer[64]; snprintf(count_buffer, 64, "%d", nnodes); new_daemon_args[i++] = const_cast("srun"); - new_daemon_args[i++] = const_cast("--ntasks-per-node=1"); + new_daemon_args[i++] = const_cast("--external-launcher"); new_daemon_args[i++] = const_cast("--wait=0"); - new_daemon_args[i++] = const_cast("--gres=none"); - new_daemon_args[i++] = const_cast("--mem=0"); new_daemon_args[i++] = const_cast("-n"); new_daemon_args[i++] = count_buffer; new_daemon_args[i++] = const_cast("-N"); From 71b61ab74061499f2bd84b79fb0551f64e98f69d Mon Sep 17 00:00:00 2001 From: Nicholas Chaimov Date: Wed, 13 May 2026 09:41:55 -0700 Subject: [PATCH 2/3] Fall back to old behavior for Slurm before 23.11 --- config.h.in | 3 +++ configure | 17 ++++++++++++----- configure.common.ac | 10 ++++++++-- src/client/config.h.in | 3 +++ src/client/configure | 17 ++++++++++++----- src/fe/config.h.in | 3 +++ src/fe/configure | 17 ++++++++++++----- src/fe/startup/launch_slurm.cc | 8 ++++++++ src/server/config.h.in | 3 +++ src/server/configure | 17 ++++++++++++----- 10 files changed, 76 insertions(+), 22 deletions(-) diff --git a/config.h.in b/config.h.in index a2afe8a8..c5261ac4 100644 --- a/config.h.in +++ b/config.h.in @@ -55,6 +55,9 @@ /* Define to 1 if you have MPI libs and headers. */ #undef HAVE_MPI +/* Whether srun supports --external-launcher */ +#undef HAVE_SRUN_EXTERNAL_LAUNCHER + /* Define to 1 if you have the header file. */ #undef HAVE_STDINT_H diff --git a/configure b/configure index 0a52a68f..126b8b42 100755 --- a/configure +++ b/configure @@ -16885,10 +16885,11 @@ case $sysconfdir in #( esac BROKEN_SRUN=0 +HAVE_EXT_LAUNCHER=0 bad_srun_major=20 bad_srun_minor=11 -good_srun_major=0 -good_srun_minor=0 +good_srun_major=23 +good_srun_minor=11 if test "x$ENABLE_SLURM" == "xtrue"; then # Extract the first word of ""srun"", so it can be a program name with args. set dummy "srun"; ac_word=$2 @@ -16941,8 +16942,9 @@ $as_echo_n "checking slurm version for compatibility... " >&6; } srun_minor=`echo $srun_version | cut -d . -f 2` srun_point=`echo $srun_version | cut -d . -f 3` if ( [ "$srun_major" == "$bad_srun_major" ] && [ $srun_minor -ge $bad_srun_minor ] ) || [ $srun_major -gt $bad_srun_major ] ; then BROKEN_SRUN=1; fi - if test "x$good_srun_major" != "x0"; then - if ( [ "$srun_major" == "$good_srun_major" ] && [ $srun_minor -ge $good_srun_minor ] ) || [ $srun_major -gt $good_srun_major ] ; then BROKEN_SRUN=0; fi + if ( [ "$srun_major" == "$good_srun_major" ] && [ $srun_minor -ge $good_srun_minor ] ) || [ $srun_major -gt $good_srun_major ] ; then + HAVE_EXT_LAUNCHER=1 + BROKEN_SRUN=0 fi if test "x$BROKEN_SRUN" == "x1"; then @@ -16983,7 +16985,12 @@ _ACEOF fi cat >>confdefs.h <<_ACEOF -#define BROKEN_SRUN 1 +#define BROKEN_SRUN $BROKEN_SRUN +_ACEOF + + +cat >>confdefs.h <<_ACEOF +#define HAVE_SRUN_EXTERNAL_LAUNCHER $HAVE_EXT_LAUNCHER _ACEOF diff --git a/configure.common.ac b/configure.common.ac index 1b7baf94..ae2a64df 100644 --- a/configure.common.ac +++ b/configure.common.ac @@ -139,6 +139,7 @@ AS_CASE([$sysconfdir], [PKGSYSCONF_DIR=$sysconfdir/spindle]) BROKEN_SRUN=0 +HAVE_EXT_LAUNCHER=0 bad_srun_major=20 bad_srun_minor=11 good_srun_major=23 @@ -153,8 +154,12 @@ if test "x$ENABLE_SLURM" == "xtrue"; then srun_major=`echo $srun_version | cut -d . -f 1` srun_minor=`echo $srun_version | cut -d . -f 2` srun_point=`echo $srun_version | cut -d . -f 3` - if ( [ "$srun_major" == "$good_srun_major" ] && [ $srun_minor -lt $good_srun_minor ] ) || [ $srun_major -lt $good_srun_major ]; then BROKEN_SRUN=1; fi - + if ( [[ "$srun_major" == "$bad_srun_major" ]] && [[ $srun_minor -ge $bad_srun_minor ]] ) || [[ $srun_major -gt $bad_srun_major ]] ; then BROKEN_SRUN=1; fi + if ( [[ "$srun_major" == "$good_srun_major" ]] && [[ $srun_minor -ge $good_srun_minor ]] ) || [[ $srun_major -gt $good_srun_major ]] ; then + HAVE_EXT_LAUNCHER=1 + BROKEN_SRUN=0 + fi + if test "x$BROKEN_SRUN" == "x1"; then AC_MSG_RESULT([no]) else @@ -181,6 +186,7 @@ if test "x$ENABLE_RSH_LAUNCH" == "x1"; then AC_DEFINE_UNQUOTED([RSHLAUNCH_ENABLED],[1],[Default mode for rsh launch]) fi AC_DEFINE_UNQUOTED([BROKEN_SRUN],[$BROKEN_SRUN],[Whether we are using a broken srun]) +AC_DEFINE_UNQUOTED([HAVE_SRUN_EXTERNAL_LAUNCHER],[$HAVE_EXT_LAUNCHER],[Whether srun supports --external-launcher]) #Runmode detection (pipe/socket or cobo/msocket communications) if test "x$OS_BUILD" == "xlinux"; then diff --git a/src/client/config.h.in b/src/client/config.h.in index 6d531908..2ddcacba 100644 --- a/src/client/config.h.in +++ b/src/client/config.h.in @@ -45,6 +45,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_MEMORY_H +/* Whether srun supports --external-launcher */ +#undef HAVE_SRUN_EXTERNAL_LAUNCHER + /* Define to 1 if you have the header file. */ #undef HAVE_STDINT_H diff --git a/src/client/configure b/src/client/configure index 43fcc696..8a015a42 100755 --- a/src/client/configure +++ b/src/client/configure @@ -12810,10 +12810,11 @@ case $sysconfdir in #( esac BROKEN_SRUN=0 +HAVE_EXT_LAUNCHER=0 bad_srun_major=20 bad_srun_minor=11 -good_srun_major=0 -good_srun_minor=0 +good_srun_major=23 +good_srun_minor=11 if test "x$ENABLE_SLURM" == "xtrue"; then # Extract the first word of ""srun"", so it can be a program name with args. set dummy "srun"; ac_word=$2 @@ -12866,8 +12867,9 @@ $as_echo_n "checking slurm version for compatibility... " >&6; } srun_minor=`echo $srun_version | cut -d . -f 2` srun_point=`echo $srun_version | cut -d . -f 3` if ( [ "$srun_major" == "$bad_srun_major" ] && [ $srun_minor -ge $bad_srun_minor ] ) || [ $srun_major -gt $bad_srun_major ] ; then BROKEN_SRUN=1; fi - if test "x$good_srun_major" != "x0"; then - if ( [ "$srun_major" == "$good_srun_major" ] && [ $srun_minor -ge $good_srun_minor ] ) || [ $srun_major -gt $good_srun_major ] ; then BROKEN_SRUN=0; fi + if ( [ "$srun_major" == "$good_srun_major" ] && [ $srun_minor -ge $good_srun_minor ] ) || [ $srun_major -gt $good_srun_major ] ; then + HAVE_EXT_LAUNCHER=1 + BROKEN_SRUN=0 fi if test "x$BROKEN_SRUN" == "x1"; then @@ -12908,7 +12910,12 @@ _ACEOF fi cat >>confdefs.h <<_ACEOF -#define BROKEN_SRUN 1 +#define BROKEN_SRUN $BROKEN_SRUN +_ACEOF + + +cat >>confdefs.h <<_ACEOF +#define HAVE_SRUN_EXTERNAL_LAUNCHER $HAVE_EXT_LAUNCHER _ACEOF diff --git a/src/fe/config.h.in b/src/fe/config.h.in index 5d9963cc..33ba031a 100644 --- a/src/fe/config.h.in +++ b/src/fe/config.h.in @@ -75,6 +75,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_MEMORY_H +/* Whether srun supports --external-launcher */ +#undef HAVE_SRUN_EXTERNAL_LAUNCHER + /* Define to 1 if you have the header file. */ #undef HAVE_STDINT_H diff --git a/src/fe/configure b/src/fe/configure index 53335db1..1018b37f 100755 --- a/src/fe/configure +++ b/src/fe/configure @@ -16660,10 +16660,11 @@ case $sysconfdir in #( esac BROKEN_SRUN=0 +HAVE_EXT_LAUNCHER=0 bad_srun_major=20 bad_srun_minor=11 -good_srun_major=0 -good_srun_minor=0 +good_srun_major=23 +good_srun_minor=11 if test "x$ENABLE_SLURM" == "xtrue"; then # Extract the first word of ""srun"", so it can be a program name with args. set dummy "srun"; ac_word=$2 @@ -16716,8 +16717,9 @@ $as_echo_n "checking slurm version for compatibility... " >&6; } srun_minor=`echo $srun_version | cut -d . -f 2` srun_point=`echo $srun_version | cut -d . -f 3` if ( [ "$srun_major" == "$bad_srun_major" ] && [ $srun_minor -ge $bad_srun_minor ] ) || [ $srun_major -gt $bad_srun_major ] ; then BROKEN_SRUN=1; fi - if test "x$good_srun_major" != "x0"; then - if ( [ "$srun_major" == "$good_srun_major" ] && [ $srun_minor -ge $good_srun_minor ] ) || [ $srun_major -gt $good_srun_major ] ; then BROKEN_SRUN=0; fi + if ( [ "$srun_major" == "$good_srun_major" ] && [ $srun_minor -ge $good_srun_minor ] ) || [ $srun_major -gt $good_srun_major ] ; then + HAVE_EXT_LAUNCHER=1 + BROKEN_SRUN=0 fi if test "x$BROKEN_SRUN" == "x1"; then @@ -16758,7 +16760,12 @@ _ACEOF fi cat >>confdefs.h <<_ACEOF -#define BROKEN_SRUN 1 +#define BROKEN_SRUN $BROKEN_SRUN +_ACEOF + + +cat >>confdefs.h <<_ACEOF +#define HAVE_SRUN_EXTERNAL_LAUNCHER $HAVE_EXT_LAUNCHER _ACEOF diff --git a/src/fe/startup/launch_slurm.cc b/src/fe/startup/launch_slurm.cc index d4dce4fc..07408f55 100644 --- a/src/fe/startup/launch_slurm.cc +++ b/src/fe/startup/launch_slurm.cc @@ -14,6 +14,7 @@ program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include "config.h" #include "launcher.h" #include "spindle_debug.h" @@ -159,8 +160,15 @@ bool SlurmLauncher::spawnDaemon() char count_buffer[64]; snprintf(count_buffer, 64, "%d", nnodes); new_daemon_args[i++] = const_cast("srun"); +#if HAVE_SRUN_EXTERNAL_LAUNCHER new_daemon_args[i++] = const_cast("--external-launcher"); new_daemon_args[i++] = const_cast("--wait=0"); +#else + new_daemon_args[i++] = const_cast("--ntasks-per-node=1"); + new_daemon_args[i++] = const_cast("--wait=0"); + new_daemon_args[i++] = const_cast("--gres=none"); + new_daemon_args[i++] = const_cast("--mem=0"); +#endif new_daemon_args[i++] = const_cast("-n"); new_daemon_args[i++] = count_buffer; new_daemon_args[i++] = const_cast("-N"); diff --git a/src/server/config.h.in b/src/server/config.h.in index 931b768c..8d1842cf 100644 --- a/src/server/config.h.in +++ b/src/server/config.h.in @@ -57,6 +57,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_MEMORY_H +/* Whether srun supports --external-launcher */ +#undef HAVE_SRUN_EXTERNAL_LAUNCHER + /* Define to 1 if you have the header file. */ #undef HAVE_STDINT_H diff --git a/src/server/configure b/src/server/configure index cb6d1366..81b43b17 100755 --- a/src/server/configure +++ b/src/server/configure @@ -16657,10 +16657,11 @@ case $sysconfdir in #( esac BROKEN_SRUN=0 +HAVE_EXT_LAUNCHER=0 bad_srun_major=20 bad_srun_minor=11 -good_srun_major=0 -good_srun_minor=0 +good_srun_major=23 +good_srun_minor=11 if test "x$ENABLE_SLURM" == "xtrue"; then # Extract the first word of ""srun"", so it can be a program name with args. set dummy "srun"; ac_word=$2 @@ -16713,8 +16714,9 @@ $as_echo_n "checking slurm version for compatibility... " >&6; } srun_minor=`echo $srun_version | cut -d . -f 2` srun_point=`echo $srun_version | cut -d . -f 3` if ( [ "$srun_major" == "$bad_srun_major" ] && [ $srun_minor -ge $bad_srun_minor ] ) || [ $srun_major -gt $bad_srun_major ] ; then BROKEN_SRUN=1; fi - if test "x$good_srun_major" != "x0"; then - if ( [ "$srun_major" == "$good_srun_major" ] && [ $srun_minor -ge $good_srun_minor ] ) || [ $srun_major -gt $good_srun_major ] ; then BROKEN_SRUN=0; fi + if ( [ "$srun_major" == "$good_srun_major" ] && [ $srun_minor -ge $good_srun_minor ] ) || [ $srun_major -gt $good_srun_major ] ; then + HAVE_EXT_LAUNCHER=1 + BROKEN_SRUN=0 fi if test "x$BROKEN_SRUN" == "x1"; then @@ -16755,7 +16757,12 @@ _ACEOF fi cat >>confdefs.h <<_ACEOF -#define BROKEN_SRUN 1 +#define BROKEN_SRUN $BROKEN_SRUN +_ACEOF + + +cat >>confdefs.h <<_ACEOF +#define HAVE_SRUN_EXTERNAL_LAUNCHER $HAVE_EXT_LAUNCHER _ACEOF From 82460d43575fadcc98e80f9b72ee1a1d3e7b04fa Mon Sep 17 00:00:00 2001 From: Nicholas Chaimov Date: Wed, 13 May 2026 11:26:36 -0700 Subject: [PATCH 3/3] CI job for Slurm with srun wrapper --- .github/workflows/ci.yml | 60 ++++++++- .../testing-srun/Dockerfile | 35 +++++ .../testing-srun/conf/cgroup.conf | 1 + .../testing-srun/conf/slurm.conf | 42 ++++++ .../testing-srun/conf/slurmdbd.conf.template | 10 ++ .../testing-srun/docker-compose.yml | 126 ++++++++++++++++++ .../testing-srun/generate_config.sh | 10 ++ .../testing-srun/scripts/add_docker_user.sh | 10 ++ .../testing-srun/scripts/build_spindle.sh | 9 ++ .../testing-srun/scripts/entrypoint.sh | 19 +++ .../testing-srun/scripts/setup_slurm.sh | 10 ++ 11 files changed, 331 insertions(+), 1 deletion(-) create mode 100644 containers/spindle-slurm-ubuntu/testing-srun/Dockerfile create mode 100644 containers/spindle-slurm-ubuntu/testing-srun/conf/cgroup.conf create mode 100644 containers/spindle-slurm-ubuntu/testing-srun/conf/slurm.conf create mode 100644 containers/spindle-slurm-ubuntu/testing-srun/conf/slurmdbd.conf.template create mode 100644 containers/spindle-slurm-ubuntu/testing-srun/docker-compose.yml create mode 100755 containers/spindle-slurm-ubuntu/testing-srun/generate_config.sh create mode 100755 containers/spindle-slurm-ubuntu/testing-srun/scripts/add_docker_user.sh create mode 100755 containers/spindle-slurm-ubuntu/testing-srun/scripts/build_spindle.sh create mode 100755 containers/spindle-slurm-ubuntu/testing-srun/scripts/entrypoint.sh create mode 100755 containers/spindle-slurm-ubuntu/testing-srun/scripts/setup_slurm.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 634ce790..8e872cac 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -151,7 +151,7 @@ jobs: docker compose down spindle-slurm-ubuntu: - name: Testsuite (Slurm, Ubuntu) + name: Testsuite (Slurm rshlaunch, Ubuntu) environment: Spindle CI runs-on: ubuntu-latest timeout-minutes: 20 @@ -208,6 +208,64 @@ jobs: cd containers/spindle-slurm-ubuntu/testing docker compose down + spindle-slurm-srun-ubuntu: + name: Testsuite (Slurm srun, Ubuntu) + environment: Spindle CI + runs-on: ubuntu-latest + timeout-minutes: 20 + steps: + - name: Check out Spindle + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd + + - name: Setup Docker Compose + uses: docker/setup-compose-action@8cccb8c14b6500aaffebff1aa49c502c34d2e5e6 + with: + version: latest + + - name: Login to GitHub Container Registry + if: ${{ !env.ACT }} + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Generate MariaDB configuration + id: slurm-ubuntu-mariadb + run: | + cd containers/spindle-slurm-ubuntu/testing-srun + ./generate_config.sh + + - name: Build spindle-slurm-ubuntu image + id: slurm-ubuntu-build + run: | + cd containers/spindle-slurm-ubuntu/testing-srun + docker compose --progress=plain build + + - name: Bring spindle-slurm-ubuntu up + id: slurm-ubuntu-up + run: | + cd containers/spindle-slurm-ubuntu/testing-srun + docker compose up -d --wait --wait-timeout 120 + + - name: Verify munge works in spindle-slurm-srun-ubuntu + id: slurm-ubuntu-munge + run: | + docker exec slurm-srun-head bash -c 'munge -n | unmunge' + + - name: Run spindle-slurm-srun-ubuntu testsuite + id: slurm-ubuntu-testsuite + run: | + docker exec slurm-srun-head bash -c 'cd Spindle-build/testsuite && salloc -n${workers} -N${workers} ./runTests ${workers}' + + - name: Bring spindle-slurm-ubuntu down + id: slurm-ubuntu-down + if: ${{ always() }} + continue-on-error: true + run: | + cd containers/spindle-slurm-ubuntu/testing-srun + docker compose down + spindle-slurm-plugin-ubuntu: name: Testsuite (Slurm Plugin, Ubuntu) environment: Spindle CI diff --git a/containers/spindle-slurm-ubuntu/testing-srun/Dockerfile b/containers/spindle-slurm-ubuntu/testing-srun/Dockerfile new file mode 100644 index 00000000..5b281c3f --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-srun/Dockerfile @@ -0,0 +1,35 @@ +ARG BASE_VERSION=latest +FROM ghcr.io/llnl/spindle-slurm-base:${BASE_VERSION} +ARG replicas=4 +ENV workers=${replicas} + +ARG BUILD_ROOT=containers/spindle-slurm-ubuntu/testing-srun + +# Slurm daemons run as $SLURM_USER +ARG SLURM_USER=slurm + +# Applications run as $USER +ARG USER=slurmuser +ARG UID=1001 + +# Set up the Slurm install already present in the base image +COPY ${BUILD_ROOT}/scripts/setup_slurm.sh /setup_slurm.sh +COPY ${BUILD_ROOT}/conf/slurm.conf /home/${SLURM_USER}/slurm.conf +COPY ${BUILD_ROOT}/conf/slurmdbd.conf /home/${SLURM_USER}/slurmdbd.conf +COPY ${BUILD_ROOT}/conf/cgroup.conf /home/${SLURM_USER}/cgroup.conf +RUN /setup_slurm.sh + +USER ${USER} +WORKDIR /home/${USER} + +# Copy the Spindle repo into the container and build it +RUN mkdir -p /home/${USER}/Spindle +COPY . /home/${USER}/Spindle +COPY ${BUILD_ROOT}/scripts/build_spindle.sh /home/${USER}/build_spindle.sh +RUN ./build_spindle.sh + +COPY ${BUILD_ROOT}/scripts/entrypoint.sh /home/${USER}/entrypoint.sh +ENV PATH /home/${USER}/Spindle-inst/bin:$PATH + +ENTRYPOINT /bin/bash ./entrypoint.sh + diff --git a/containers/spindle-slurm-ubuntu/testing-srun/conf/cgroup.conf b/containers/spindle-slurm-ubuntu/testing-srun/conf/cgroup.conf new file mode 100644 index 00000000..e59e9aee --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-srun/conf/cgroup.conf @@ -0,0 +1 @@ +CgroupPlugin=cgroup/v1 diff --git a/containers/spindle-slurm-ubuntu/testing-srun/conf/slurm.conf b/containers/spindle-slurm-ubuntu/testing-srun/conf/slurm.conf new file mode 100644 index 00000000..abf060d5 --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-srun/conf/slurm.conf @@ -0,0 +1,42 @@ +ClusterName=linux +ControlMachine=slurm-head +ControlAddr=slurm-head +SlurmUser=slurm +SlurmctldPort=6817 +SlurmdPort=6818 +AuthType=auth/munge +StateSaveLocation=/var/lib/slurmd +SlurmdSpoolDir=/var/spool/slurmd +SwitchType=switch/none +MpiDefault=none +SlurmctldPidFile=/var/run/slurmd/slurmctld.pid +SlurmdPidFile=/var/run/slurmd/slurmd.pid +ProctrackType=proctrack/linuxproc +TaskPlugin=task/affinity +ReturnToService=2 +SlurmctldTimeout=300 +SlurmdTimeout=300 +InactiveLimit=0 +MinJobAge=300 +KillWait=30 +Waittime=0 +SchedulerType=sched/backfill +SelectType=select/cons_tres +SelectTypeParameters=CR_Core_Memory +SlurmctldDebug=3 +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdDebug=3 +SlurmdLogFile=/var/log/slurm/slurmd.log +JobCompType=jobcomp/filetxt +JobCompLoc=/var/log/slurm/jobcomp.log +JobAcctGatherType=jobacct_gather/linux +JobAcctGatherFrequency=30 +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageHost=slurm-db +AccountingStoragePort=6819 +NodeName=slurm-node-1 NodeAddr=slurm-node-1 CPUs=3 RealMemory=1000 State=UNKNOWN +NodeName=slurm-node-2 NodeAddr=slurm-node-2 CPUs=3 RealMemory=1000 State=UNKNOWN +NodeName=slurm-node-3 NodeAddr=slurm-node-3 CPUs=3 RealMemory=1000 State=UNKNOWN +NodeName=slurm-node-4 NodeAddr=slurm-node-4 CPUs=3 RealMemory=1000 State=UNKNOWN +PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP + diff --git a/containers/spindle-slurm-ubuntu/testing-srun/conf/slurmdbd.conf.template b/containers/spindle-slurm-ubuntu/testing-srun/conf/slurmdbd.conf.template new file mode 100644 index 00000000..0e274118 --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-srun/conf/slurmdbd.conf.template @@ -0,0 +1,10 @@ +AuthType=auth/munge +DbdAddr=slurm-db +DbdHost=slurm-db +SlurmUser=slurm +DebugLevel=4 +LogFile=/var/log/slurm/slurmdbd.log +PidFile=/var/run/slurmdbd/slurmdbd.pid +StorageType=accounting_storage/mysql +StorageHost=slurm-mariadb +StorageUser=slurm diff --git a/containers/spindle-slurm-ubuntu/testing-srun/docker-compose.yml b/containers/spindle-slurm-ubuntu/testing-srun/docker-compose.yml new file mode 100644 index 00000000..39332e3e --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-srun/docker-compose.yml @@ -0,0 +1,126 @@ +# `replicas` must match the number of nodes defined in the services section +x-shared-workers: + &workers + replicas: 4 + +# Base image version to use +x-shared-build-args: &shared-build-args + BASE_VERSION: latest + <<: *workers + +# Docker prohibits copying files from outside of the build context. +# In order to be able to copy the whole repo into the container, +# we have to set the context to be the root of the repo. +# We then have to specify the path from there to the Dockerfile. +x-shared-build-context: &shared-build-context + context: ../../.. + dockerfile: containers/spindle-slurm-ubuntu/testing-srun/Dockerfile + args: *shared-build-args + +# Name of the head node +x-shared-environment: &shared-environment + SLURM_HEAD_NODE: slurm-head + <<: *workers + +# The entrypoint runs different services depending +# on the node's role. Valid options are: +# - worker: runs slurmd +# - db: runs slurmdbd +# - ctl: runs slurmctld +x-worker-environment: &worker-environment + SLURM_ROLE: worker + <<: *shared-environment + +networks: + slurm: + driver: bridge + +# Common parameters for all nodes. +x-shared-node-parameters: &shared-node-parameters + build: *shared-build-context + networks: + - slurm + cap_add: + - SYS_NICE # Required for libnuma + +x-healthcheck-parameters: &healthcheck-parameters + start_period: 3s + interval: 3s + timeout: 5s + retries: 5 + +x-worker-parameters: &worker-node-parameters + <<: *shared-node-parameters + environment: *worker-environment + depends_on: + slurm-head: + condition: service_healthy + healthcheck: + test: ["CMD", "stat", "/var/run/slurmd/slurmd.pid"] + <<: *healthcheck-parameters + +services: + slurm-mariadb: + image: mariadb:12 + networks: + - slurm + hostname: slurm-mariadb + container_name: slurm-srun-mariadb + env_file: mariadb.env + environment: + MYSQL_RANDOM_ROOT_PASSWORD: "yes" + MYSQL_DATABASE: "slurm_acct_db" + MYSQL_USER: "slurm" + healthcheck: + test: ["CMD", "healthcheck.sh", "--connect", "--innodb_initialized"] + <<: *healthcheck-parameters + + slurm-db: + <<: *shared-node-parameters + hostname: slurm-db + container_name: slurm-srun-db + environment: + SLURM_ROLE: db + <<: *shared-environment + depends_on: + slurm-mariadb: + condition: service_healthy + healthcheck: + test: ["CMD", "stat", "/var/run/slurmdbd/slurmdbd.pid"] + <<: *healthcheck-parameters + + slurm-head: + <<: *shared-node-parameters + hostname: slurm-head + container_name: slurm-srun-head + tty: true + environment: + SLURM_ROLE: ctl + <<: *shared-environment + depends_on: + slurm-db: + condition: service_healthy + healthcheck: + test: ["CMD", "stat", "/var/run/slurmd/slurmctld.pid"] + <<: *healthcheck-parameters + + slurm-node-1: + <<: *worker-node-parameters + hostname: slurm-node-1 + container_name: slurm-srun-node-1 + + slurm-node-2: + <<: *worker-node-parameters + hostname: slurm-node-2 + container_name: slurm-srun-node-2 + + slurm-node-3: + <<: *worker-node-parameters + hostname: slurm-node-3 + container_name: slurm-srun-node-3 + + slurm-node-4: + <<: *worker-node-parameters + hostname: slurm-node-4 + container_name: slurm-srun-node-4 + diff --git a/containers/spindle-slurm-ubuntu/testing-srun/generate_config.sh b/containers/spindle-slurm-ubuntu/testing-srun/generate_config.sh new file mode 100755 index 00000000..7f911c32 --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-srun/generate_config.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# Generate random password for the MariaDB slurm user +# and set it in config files + +MARIADB_PASS=$(openssl rand --base64 16 | head -c -3) +echo "MARIADB_PASSWORD: \"${MARIADB_PASS}\"" > mariadb.env +cp conf/slurmdbd.conf.template conf/slurmdbd.conf +echo "StoragePass=${MARIADB_PASS}" >> conf/slurmdbd.conf + diff --git a/containers/spindle-slurm-ubuntu/testing-srun/scripts/add_docker_user.sh b/containers/spindle-slurm-ubuntu/testing-srun/scripts/add_docker_user.sh new file mode 100755 index 00000000..ace8c619 --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-srun/scripts/add_docker_user.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -euxo pipefail + +sudo groupadd -g ${UID} ${USER} +sudo useradd -g ${USER} -u ${UID} -d /home/${USER} -m ${USER} +# Allow user to run as other users so that munge can be started as the munge user +sudo sh -c "printf \"${USER} ALL=(ALL) NOPASSWD: ALL\\n\" >> /etc/sudoers" +sudo adduser ${USER} sudo +sudo usermod -s /bin/bash ${USER} + diff --git a/containers/spindle-slurm-ubuntu/testing-srun/scripts/build_spindle.sh b/containers/spindle-slurm-ubuntu/testing-srun/scripts/build_spindle.sh new file mode 100755 index 00000000..acf1ef0b --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-srun/scripts/build_spindle.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +set -euxo pipefail + +mkdir -p /home/${USER}/Spindle-build +cd /home/${USER}/Spindle-build +/home/${USER}/Spindle/configure --prefix=/home/${USER}/Spindle-inst --enable-sec-munge --with-rm=slurm --with-localstorage=/tmp CFLAGS="-O2 -g" CXXFLAGS="-O2 -g" +make -j$(nproc) +make install + diff --git a/containers/spindle-slurm-ubuntu/testing-srun/scripts/entrypoint.sh b/containers/spindle-slurm-ubuntu/testing-srun/scripts/entrypoint.sh new file mode 100755 index 00000000..5b7b8c17 --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-srun/scripts/entrypoint.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +echo "SLURM_ROLE: ${SLURM_ROLE}" + +echo "Starting munged..." +sudo -u munge /usr/sbin/munged + +if [ "${SLURM_ROLE}" = "db" ]; then + echo "Starting slurmdbd..." + sudo -u slurm /usr/sbin/slurmdbd -Dvvv +elif [ "${SLURM_ROLE}" = "ctl" ] ; then + echo "Starting slurmctld..." + sudo -u slurm /usr/sbin/slurmctld -i -Dvvv +elif [ "${SLURM_ROLE}" = "worker" ] ; then + echo "Starting slurmd..." + sudo /usr/sbin/slurmd -Dvvv +fi + +sleep inf diff --git a/containers/spindle-slurm-ubuntu/testing-srun/scripts/setup_slurm.sh b/containers/spindle-slurm-ubuntu/testing-srun/scripts/setup_slurm.sh new file mode 100755 index 00000000..186beea0 --- /dev/null +++ b/containers/spindle-slurm-ubuntu/testing-srun/scripts/setup_slurm.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -euxo pipefail + +mkdir -p /etc/slurm /etc/sysconfig/slurm /var/spool/slurmd /var/spool/slurmctld /var/run/slurmd /var/run/slurmdbd /var/lib/slurmd /var/log/slurm +touch /var/lib/slurmd/node_state /var/lib/slurmd/front_end_state /var/lib/slurmd/job_state /var/lib/slurmd/resv_state /var/lib/slurmd/trigger_state /var/lib/slurmd/assoc_mgr_state /var/lib/slurmd/assoc_usage /var/lib/slurmd/qos_usage /var/lib/slurmd/fed_mgr_state +cp /home/${SLURM_USER}/slurm.conf /etc/slurm/slurm.conf +cp /home/${SLURM_USER}/slurmdbd.conf /etc/slurm/slurmdbd.conf +cp /home/${SLURM_USER}/cgroup.conf /etc/slurm/cgroup.conf +chown -R slurm:slurm /etc/slurm /etc/sysconfig/slurm /var/spool/slurmd /var/spool/slurmctld /var/run/slurmd /var/run/slurmdbd /var/lib/slurmd /var/log/slurm +chmod 600 /etc/slurm/slurmdbd.conf