Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 59 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ jobs:
docker compose down

spindle-slurm-ubuntu:
name: Testsuite (Slurm, Ubuntu)
name: Testsuite (Slurm rshlaunch, Ubuntu)
environment: Spindle CI
runs-on: ubuntu-latest
timeout-minutes: 20
Expand Down Expand Up @@ -208,6 +208,64 @@ jobs:
cd containers/spindle-slurm-ubuntu/testing
docker compose down

# Testsuite job for the srun launch mode. It builds and starts the Docker
# Compose cluster under containers/spindle-slurm-ubuntu/testing-srun, runs the
# Spindle testsuite on the head node, and always tears the cluster down.
spindle-slurm-srun-ubuntu:
  name: Testsuite (Slurm srun, Ubuntu)
  environment: Spindle CI
  runs-on: ubuntu-latest
  timeout-minutes: 20
  steps:
    # Third-party actions are pinned to full commit SHAs.
    - name: Check out Spindle
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd

    - name: Setup Docker Compose
      uses: docker/setup-compose-action@8cccb8c14b6500aaffebff1aa49c502c34d2e5e6
      with:
        version: latest

    # Skipped when env.ACT is set (presumably a local run under `act`,
    # where no registry credentials are available -- confirm).
    - name: Login to GitHub Container Registry
      if: ${{ !env.ACT }}
      uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    # generate_config.sh uses paths relative to the testing-srun directory,
    # hence the `cd` before invoking it.
    - name: Generate MariaDB configuration
      id: slurm-ubuntu-mariadb
      run: |
        cd containers/spindle-slurm-ubuntu/testing-srun
        ./generate_config.sh

    - name: Build spindle-slurm-ubuntu image
      id: slurm-ubuntu-build
      run: |
        cd containers/spindle-slurm-ubuntu/testing-srun
        docker compose --progress=plain build

    # --wait blocks until the services report healthy (or 120 s elapse).
    - name: Bring spindle-slurm-ubuntu up
      id: slurm-ubuntu-up
      run: |
        cd containers/spindle-slurm-ubuntu/testing-srun
        docker compose up -d --wait --wait-timeout 120

    # `slurm-srun-head` is the container_name of the head node in the
    # testing-srun docker-compose.yml.
    - name: Verify munge works in spindle-slurm-srun-ubuntu
      id: slurm-ubuntu-munge
      run: |
        docker exec slurm-srun-head bash -c 'munge -n | unmunge'

    # The command is single-quoted, so ${workers} is expanded by the bash
    # running inside the container, not by the runner's shell.
    - name: Run spindle-slurm-srun-ubuntu testsuite
      id: slurm-ubuntu-testsuite
      run: |
        docker exec slurm-srun-head bash -c 'cd Spindle-build/testsuite && salloc -n${workers} -N${workers} ./runTests ${workers}'

    # Runs even when an earlier step failed; a teardown failure does not
    # fail the job.
    - name: Bring spindle-slurm-ubuntu down
      id: slurm-ubuntu-down
      if: ${{ always() }}
      continue-on-error: true
      run: |
        cd containers/spindle-slurm-ubuntu/testing-srun
        docker compose down

spindle-slurm-plugin-ubuntu:
name: Testsuite (Slurm Plugin, Ubuntu)
environment: Spindle CI
Expand Down
3 changes: 3 additions & 0 deletions config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@
/* Define to 1 if you have MPI libs and headers. */
#undef HAVE_MPI

/* Whether srun supports --external-launcher */
#undef HAVE_SRUN_EXTERNAL_LAUNCHER

/* Define to 1 if you have the <stdint.h> header file. */
#undef HAVE_STDINT_H

Expand Down
17 changes: 12 additions & 5 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -16885,10 +16885,11 @@ case $sysconfdir in #(
esac

BROKEN_SRUN=0
HAVE_EXT_LAUNCHER=0
bad_srun_major=20
bad_srun_minor=11
good_srun_major=0
good_srun_minor=0
good_srun_major=23
good_srun_minor=11
if test "x$ENABLE_SLURM" == "xtrue"; then
# Extract the first word of ""srun"", so it can be a program name with args.
set dummy "srun"; ac_word=$2
Expand Down Expand Up @@ -16941,8 +16942,9 @@ $as_echo_n "checking slurm version for compatibility... " >&6; }
srun_minor=`echo $srun_version | cut -d . -f 2`
srun_point=`echo $srun_version | cut -d . -f 3`
if ( [ "$srun_major" == "$bad_srun_major" ] && [ $srun_minor -ge $bad_srun_minor ] ) || [ $srun_major -gt $bad_srun_major ] ; then BROKEN_SRUN=1; fi
if test "x$good_srun_major" != "x0"; then
if ( [ "$srun_major" == "$good_srun_major" ] && [ $srun_minor -ge $good_srun_minor ] ) || [ $srun_major -gt $good_srun_major ] ; then BROKEN_SRUN=0; fi
if ( [ "$srun_major" == "$good_srun_major" ] && [ $srun_minor -ge $good_srun_minor ] ) || [ $srun_major -gt $good_srun_major ] ; then
HAVE_EXT_LAUNCHER=1
BROKEN_SRUN=0
fi

if test "x$BROKEN_SRUN" == "x1"; then
Expand Down Expand Up @@ -16983,7 +16985,12 @@ _ACEOF
fi

cat >>confdefs.h <<_ACEOF
#define BROKEN_SRUN 1
#define BROKEN_SRUN $BROKEN_SRUN
_ACEOF


cat >>confdefs.h <<_ACEOF
#define HAVE_SRUN_EXTERNAL_LAUNCHER $HAVE_EXT_LAUNCHER
_ACEOF


Expand Down
15 changes: 9 additions & 6 deletions configure.common.ac
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,11 @@ AS_CASE([$sysconfdir],
[PKGSYSCONF_DIR=$sysconfdir/spindle])

BROKEN_SRUN=0
HAVE_EXT_LAUNCHER=0
bad_srun_major=20
bad_srun_minor=11
good_srun_major=0
good_srun_minor=0
good_srun_major=23
good_srun_minor=11
if test "x$ENABLE_SLURM" == "xtrue"; then
AC_PATH_PROG(SRUN_PATH, "srun", "none")
if test "x$SRUN_PATH" == "xnone"; then
Expand All @@ -154,10 +155,11 @@ if test "x$ENABLE_SLURM" == "xtrue"; then
srun_minor=`echo $srun_version | cut -d . -f 2`
srun_point=`echo $srun_version | cut -d . -f 3`
if ( [[ "$srun_major" == "$bad_srun_major" ]] && [[ $srun_minor -ge $bad_srun_minor ]] ) || [[ $srun_major -gt $bad_srun_major ]] ; then BROKEN_SRUN=1; fi
if test "x$good_srun_major" != "x0"; then
if ( [[ "$srun_major" == "$good_srun_major" ]] && [[ $srun_minor -ge $good_srun_minor ]] ) || [[ $srun_major -gt $good_srun_major ]] ; then BROKEN_SRUN=0; fi
if ( [[ "$srun_major" == "$good_srun_major" ]] && [[ $srun_minor -ge $good_srun_minor ]] ) || [[ $srun_major -gt $good_srun_major ]] ; then
HAVE_EXT_LAUNCHER=1
BROKEN_SRUN=0
fi

if test "x$BROKEN_SRUN" == "x1"; then
AC_MSG_RESULT([no])
else
Expand All @@ -183,7 +185,8 @@ fi
if test "x$ENABLE_RSH_LAUNCH" == "x1"; then
AC_DEFINE_UNQUOTED([RSHLAUNCH_ENABLED],[1],[Default mode for rsh launch])
fi
AC_DEFINE_UNQUOTED([BROKEN_SRUN],[1],[Whether we are using a broken srun])
AC_DEFINE_UNQUOTED([BROKEN_SRUN],[$BROKEN_SRUN],[Whether we are using a broken srun])
AC_DEFINE_UNQUOTED([HAVE_SRUN_EXTERNAL_LAUNCHER],[$HAVE_EXT_LAUNCHER],[Whether srun supports --external-launcher])

#Runmode detection (pipe/socket or cobo/msocket communications)
if test "x$OS_BUILD" == "xlinux"; then
Expand Down
35 changes: 35 additions & 0 deletions containers/spindle-slurm-ubuntu/testing-srun/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Test image for the Slurm srun testsuite. It configures the Slurm install
# shipped in the base image, then copies the Spindle repo in and builds it.
# The build context is expected to be the repository root (see BUILD_ROOT).
ARG BASE_VERSION=latest
FROM ghcr.io/llnl/spindle-slurm-base:${BASE_VERSION}

# Number of worker nodes. Exported as `workers` so it is also available in
# the running container's environment.
ARG replicas=4
ENV workers=${replicas}

# Path from the build context (repo root) to this directory.
ARG BUILD_ROOT=containers/spindle-slurm-ubuntu/testing-srun

# Slurm daemons run as $SLURM_USER
ARG SLURM_USER=slurm

# Applications run as $USER
ARG USER=slurmuser
ARG UID=1001

# Set up the Slurm install already present in the base image
COPY ${BUILD_ROOT}/scripts/setup_slurm.sh /setup_slurm.sh
COPY ${BUILD_ROOT}/conf/slurm.conf /home/${SLURM_USER}/slurm.conf
COPY ${BUILD_ROOT}/conf/slurmdbd.conf /home/${SLURM_USER}/slurmdbd.conf
COPY ${BUILD_ROOT}/conf/cgroup.conf /home/${SLURM_USER}/cgroup.conf
RUN /setup_slurm.sh

USER ${USER}
WORKDIR /home/${USER}

# Copy the Spindle repo into the container and build it
RUN mkdir -p /home/${USER}/Spindle
COPY . /home/${USER}/Spindle
COPY ${BUILD_ROOT}/scripts/build_spindle.sh /home/${USER}/build_spindle.sh
RUN ./build_spindle.sh

COPY ${BUILD_ROOT}/scripts/entrypoint.sh /home/${USER}/entrypoint.sh
# Use the key=value form; the whitespace-separated `ENV key value` form is
# legacy syntax.
ENV PATH=/home/${USER}/Spindle-inst/bin:$PATH

# NOTE(review): shell-form ENTRYPOINT runs under `/bin/sh -c` and ignores any
# CMD. Left unchanged because the base image's CMD is not visible here.
ENTRYPOINT /bin/bash ./entrypoint.sh

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Select the cgroup v1 plugin for Slurm.
CgroupPlugin=cgroup/v1
42 changes: 42 additions & 0 deletions containers/spindle-slurm-ubuntu/testing-srun/conf/slurm.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Slurm configuration for the srun test cluster.
# Host names used here (slurm-head, slurm-db, slurm-node-N) are the service
# hostnames declared in the testing-srun docker-compose.yml.

# Cluster identity and controller endpoint
ClusterName=linux
ControlMachine=slurm-head
ControlAddr=slurm-head
SlurmUser=slurm
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge

# State and spool directories
StateSaveLocation=/var/lib/slurmd
SlurmdSpoolDir=/var/spool/slurmd
SwitchType=switch/none
MpiDefault=none

# PID files; the docker-compose healthchecks `stat` these two paths.
SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
SlurmdPidFile=/var/run/slurmd/slurmd.pid

# Process tracking and task placement
ProctrackType=proctrack/linuxproc
TaskPlugin=task/affinity

# Timeouts and job lifecycle
ReturnToService=2
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0

# Scheduling
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core_Memory

# Logging
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurm/slurmd.log

# Job completion records and accounting (stored via slurmdbd on slurm-db)
JobCompType=jobcomp/filetxt
JobCompLoc=/var/log/slurm/jobcomp.log
JobAcctGatherType=jobacct_gather/linux
JobAcctGatherFrequency=30
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=slurm-db
AccountingStoragePort=6819

# Compute nodes: one entry per slurm-node-N service in docker-compose.yml.
# Keep this list in sync with `replicas` there.
NodeName=slurm-node-1 NodeAddr=slurm-node-1 CPUs=3 RealMemory=1000 State=UNKNOWN
NodeName=slurm-node-2 NodeAddr=slurm-node-2 CPUs=3 RealMemory=1000 State=UNKNOWN
NodeName=slurm-node-3 NodeAddr=slurm-node-3 CPUs=3 RealMemory=1000 State=UNKNOWN
NodeName=slurm-node-4 NodeAddr=slurm-node-4 CPUs=3 RealMemory=1000 State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP

Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# slurmdbd configuration for the srun test cluster.
# No StoragePass is set here. NOTE(review): presumably this is the template
# that generate_config.sh copies and appends a generated StoragePass to --
# confirm the file name.
AuthType=auth/munge

# slurmdbd runs on the slurm-db host.
DbdAddr=slurm-db
DbdHost=slurm-db
SlurmUser=slurm
DebugLevel=4
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurmdbd/slurmdbd.pid

# Backing database: MySQL-protocol server on slurm-mariadb, user `slurm`.
StorageType=accounting_storage/mysql
StorageHost=slurm-mariadb
StorageUser=slurm
126 changes: 126 additions & 0 deletions containers/spindle-slurm-ubuntu/testing-srun/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# Compose definition of the srun test cluster: a MariaDB server, a slurmdbd
# node, a head (slurmctld) node and four worker (slurmd) nodes.
# Shared settings live in `x-*` extension fields and are reused through YAML
# anchors and merge keys.

# `replicas` must match the number of nodes defined in the services section
x-shared-workers:
  &workers
  replicas: 4

# Base image version to use
x-shared-build-args: &shared-build-args
  BASE_VERSION: latest
  <<: *workers

# Docker prohibits copying files from outside of the build context.
# In order to be able to copy the whole repo into the container,
# we have to set the context to be the root of the repo.
# We then have to specify the path from there to the Dockerfile.
x-shared-build-context: &shared-build-context
  context: ../../..
  dockerfile: containers/spindle-slurm-ubuntu/testing-srun/Dockerfile
  args: *shared-build-args

# Name of the head node
x-shared-environment: &shared-environment
  SLURM_HEAD_NODE: slurm-head
  <<: *workers

# The entrypoint runs different services depending
# on the node's role. Valid options are:
# - worker: runs slurmd
# - db: runs slurmdbd
# - ctl: runs slurmctld
x-worker-environment: &worker-environment
  SLURM_ROLE: worker
  <<: *shared-environment

networks:
  slurm:
    driver: bridge

# Common parameters for all nodes.
x-shared-node-parameters: &shared-node-parameters
  build: *shared-build-context
  networks:
    - slurm
  cap_add:
    - SYS_NICE  # Required for libnuma

# Timing shared by every healthcheck below.
x-healthcheck-parameters: &healthcheck-parameters
  start_period: 3s
  interval: 3s
  timeout: 5s
  retries: 5

# Worker nodes start only after the head node is healthy and are themselves
# healthy once the slurmd pid file exists.
x-worker-parameters: &worker-node-parameters
  <<: *shared-node-parameters
  environment: *worker-environment
  depends_on:
    slurm-head:
      condition: service_healthy
  healthcheck:
    test: ["CMD", "stat", "/var/run/slurmd/slurmd.pid"]
    <<: *healthcheck-parameters

# Startup order enforced through depends_on/service_healthy:
#   slurm-mariadb -> slurm-db -> slurm-head -> slurm-node-N
services:
  slurm-mariadb:
    image: mariadb:12
    networks:
      - slurm
    hostname: slurm-mariadb
    container_name: slurm-srun-mariadb
    # mariadb.env is written by generate_config.sh and carries
    # MARIADB_PASSWORD.
    env_file: mariadb.env
    environment:
      MYSQL_RANDOM_ROOT_PASSWORD: "yes"
      MYSQL_DATABASE: "slurm_acct_db"
      MYSQL_USER: "slurm"
    healthcheck:
      test: ["CMD", "healthcheck.sh", "--connect", "--innodb_initialized"]
      <<: *healthcheck-parameters

  slurm-db:
    <<: *shared-node-parameters
    hostname: slurm-db
    container_name: slurm-srun-db
    environment:
      SLURM_ROLE: db
      <<: *shared-environment
    depends_on:
      slurm-mariadb:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "stat", "/var/run/slurmdbd/slurmdbd.pid"]
      <<: *healthcheck-parameters

  # The CI workflow runs `docker exec slurm-srun-head ...` against this
  # container name.
  slurm-head:
    <<: *shared-node-parameters
    hostname: slurm-head
    container_name: slurm-srun-head
    tty: true
    environment:
      SLURM_ROLE: ctl
      <<: *shared-environment
    depends_on:
      slurm-db:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "stat", "/var/run/slurmd/slurmctld.pid"]
      <<: *healthcheck-parameters

  slurm-node-1:
    <<: *worker-node-parameters
    hostname: slurm-node-1
    container_name: slurm-srun-node-1

  slurm-node-2:
    <<: *worker-node-parameters
    hostname: slurm-node-2
    container_name: slurm-srun-node-2

  slurm-node-3:
    <<: *worker-node-parameters
    hostname: slurm-node-3
    container_name: slurm-srun-node-3

  slurm-node-4:
    <<: *worker-node-parameters
    hostname: slurm-node-4
    container_name: slurm-srun-node-4

10 changes: 10 additions & 0 deletions containers/spindle-slurm-ubuntu/testing-srun/generate_config.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
#
# Generate a random password for the MariaDB slurm user
# and set it in config files:
#   - mariadb.env         (MARIADB_PASSWORD entry)
#   - conf/slurmdbd.conf  (copy of conf/slurmdbd.conf.template + StoragePass)
#
# Both files are written relative to the directory containing this script,
# so the script can be invoked from any working directory.

# Abort on the first failing command, unset variable, or failed pipeline
# stage, so that a missing/failing `openssl` cannot silently produce an
# empty password in the generated files.
set -euo pipefail

cd "$(dirname "${BASH_SOURCE[0]}")"

# 16 random bytes -> 24 base64 characters ending in "==" plus a newline.
# Strip the padding and the newline, leaving 22 characters. `tr` is used
# instead of `head -c -3` because negative byte counts are a GNU extension.
MARIADB_PASS=$(openssl rand -base64 16 | tr -d '=\n')

echo "MARIADB_PASSWORD: \"${MARIADB_PASS}\"" > mariadb.env
cp conf/slurmdbd.conf.template conf/slurmdbd.conf
echo "StoragePass=${MARIADB_PASS}" >> conf/slurmdbd.conf

Loading