Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Dependabot configuration: keeps the GitHub Actions referenced by the
# workflows up to date with a monthly check.
version: 2
updates:
  - package-ecosystem: "github-actions"
    # For the github-actions ecosystem the documented value is "/";
    # Dependabot then scans /.github/workflows on its own.
    directory: "/"
    schedule:
      interval: "monthly"
106 changes: 106 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Workflow name shown in the GitHub Actions UI.
name: ci

# Triggered by pushes to `devel` and by pull requests that target `devel`.
on:
  push:
    branches: [devel]
  pull_request:
    branches: [devel]

# Limit the workflow token to read-only access to the repository contents.
permissions:
  contents: read

# Runs are grouped by ref plus pull-request number (or the run number when
# the event carries no pull request); a new run in a group cancels the one
# that is still in progress.
concurrency:
  group: "ci-${{github.ref}}-${{github.event.pull_request.number || github.run_number}}"
  cancel-in-progress: true

jobs:
  # Builds the serial container, checks munge inside `spindlenode`, runs the
  # testsuite there and always tears the compose project down afterwards.
  spindle-serial-ubuntu:
    name: 'Testsuite (Serial, Ubuntu)'
    environment: 'Spindle CI'
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      - name: Check out Spindle
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8

      - name: Setup Docker Compose
        uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746
        with:
          version: latest

      - name: Build spindle-serial-ubuntu image
        id: serial-ubuntu-build
        working-directory: containers/spindle-serial-ubuntu
        run: docker compose --progress=plain build

      - name: Bring spindle-serial-ubuntu up
        id: serial-ubuntu-up
        working-directory: containers/spindle-serial-ubuntu
        run: docker compose up -d

      - name: Verify munge works in spindle-serial-ubuntu
        id: serial-ubuntu-munge
        run: docker exec spindlenode bash -c 'munge -n | unmunge'

      - name: Run spindle-serial-ubuntu testsuite
        id: serial-ubuntu-testsuite
        run: docker exec spindlenode bash -c 'cd Spindle-build/testsuite && ./runTests'

      # Cleanup runs even after a failed step and never fails the job itself.
      - name: Bring spindle-serial-ubuntu down
        id: serial-ubuntu-down
        if: always()
        continue-on-error: true
        working-directory: containers/spindle-serial-ubuntu
        run: docker compose down

  # Same flow against the Flux cluster: the testsuite is launched from
  # `node-1` through `flux alloc`.
  spindle-flux-ubuntu:
    name: 'Testsuite (Flux, Ubuntu)'
    environment: 'Spindle CI'
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      - name: Check out Spindle
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8

      - name: Setup Docker Compose
        uses: docker/setup-compose-action@364cc21a5de5b1ee4a7f5f9d3fa374ce0ccde746
        with:
          version: latest

      - name: Build spindle-flux-ubuntu image
        id: flux-ubuntu-build
        working-directory: containers/spindle-flux-ubuntu
        run: docker compose --progress=plain build

      # --wait blocks until the services report running/healthy, giving up
      # after 60 seconds.
      - name: Bring spindle-flux-ubuntu up
        id: flux-ubuntu-up
        working-directory: containers/spindle-flux-ubuntu
        run: docker compose up -d --wait --wait-timeout 60

      - name: Verify munge works in spindle-flux-ubuntu
        id: flux-ubuntu-munge
        run: docker exec node-1 bash -c 'munge -n | unmunge'

      # ${workers} is single-quoted, so it is expanded by the shell inside
      # the container rather than on the runner.
      - name: Run spindle-flux-ubuntu testsuite
        id: flux-ubuntu-testsuite
        run: >-
          docker exec node-1 bash -c
          'cd Spindle-build/testsuite &&
          flux alloc --nodes=${workers}
          ./runTests --nodes=${workers} --tasks-per-node=3'

      # Cleanup runs even after a failed step and never fails the job itself.
      - name: Bring spindle-flux-ubuntu down
        id: flux-ubuntu-down
        if: always()
        continue-on-error: true
        working-directory: containers/spindle-flux-ubuntu
        run: docker compose down

30 changes: 30 additions & 0 deletions .github/workflows/container.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Builds the image in containers/spindle-slurm-ubuntu/base and pushes it to
# the GitHub Container Registry. Runs only when started manually or when
# called from another workflow.
name: Build & Push Slurm Base Container
on:
  workflow_dispatch:
  workflow_call:

jobs:
  build:
    runs-on: ubuntu-latest
    # The job logs in to ghcr.io with GITHUB_TOKEN and pushes an image and a
    # build cache, which requires write access to packages. Granting it here
    # keeps the push working when the repository's default token permissions
    # are read-only. When invoked through workflow_call, the calling workflow
    # must allow `packages: write` as well.
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2

      - name: Login to GitHub Container Registry
        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # The registry is used both for the image and for the layer cache.
      - name: Build & Push Slurm Base Image
        uses: docker/build-push-action@1dc73863535b631f98b2378be8619f83b136f4a0
        with:
          context: ./containers/spindle-slurm-ubuntu/base
          platforms: linux/amd64
          push: true
          tags: ghcr.io/paratoolsinc/spindle-slurm-base:latest
          cache-from: type=registry,ref=ghcr.io/paratoolsinc/spindle-slurm-base:buildcache
          cache-to: type=registry,ref=ghcr.io/paratoolsinc/spindle-slurm-base:buildcache,mode=max
67 changes: 67 additions & 0 deletions containers/spindle-flux-ubuntu/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Image for one node of the containerized Flux cluster, with Spindle built
# from the repository that is used as the build context.
#
# This is based on the Flux Container Tutorial
# See https://flux-framework.readthedocs.io/en/latest/tutorials/containers
ARG flux_sched_version=noble
FROM fluxrm/flux-sched:${flux_sched_version} AS builder
# Number of nodes in the cluster; kept in the image environment as `workers`.
ARG replicas=4
ENV workers=${replicas}
USER root

# DEBIAN_FRONTEND has to prefix the install command: a variable assignment in
# front of a command applies to that command only, so setting it on
# `apt-get update` would leave the install interactive. The package lists are
# removed in the same layer to keep them out of the image.
RUN apt-get update \
    && DEBIAN_FRONTEND="noninteractive" apt-get -qq install -y --no-install-recommends \
        autotools-dev \
        autoconf \
        automake \
        cmake \
        git \
        python3 \
        openssh-server \
        openssh-client \
        libdb-dev \
        apt-utils \
        dnsutils \
        iputils-ping \
        python3-pip \
        libgcrypt20 \
        libgcrypt20-dev \
        gdb \
        software-properties-common \
    && rm -rf /var/lib/apt/lists/*

# Account that owns the build and runs the broker, and the location of this
# container's configuration relative to the build context (the repo root).
ARG USER=fluxuser
ARG CONFIG_ROOT=containers/spindle-flux-ubuntu

# Allow fluxuser to run as other users so it can start munged
RUN sh -c "printf \"${USER} ALL=(ALL) NOPASSWD: ALL\\n\" >> /etc/sudoers"

# Configure flux
ENV STATE_DIR=/var/lib/flux
RUN mkdir -p ${STATE_DIR} /etc/flux/system /etc/flux/system/cron.d /etc/flux/config /run/flux /etc/flux/imp/conf.d
COPY ${CONFIG_ROOT}/flux/imp.toml /etc/flux/imp/conf.d/
COPY ${CONFIG_ROOT}/flux/broker.toml /etc/flux/config/
# Generate the CURVE certificate and the resource description (R) for hosts
# node-1 .. node-${workers}, then hand the runtime directories to ${USER}.
RUN mkdir -p /mnt/curve && \
    flux keygen /mnt/curve/curve.cert && \
    flux R encode --hosts="node-[1-${workers}]" > /etc/flux/system/R && \
    chown -R ${USER}:${USER} /run/flux ${STATE_DIR} /mnt/curve/curve.cert

# Build Spindle
WORKDIR /home/${USER}
# Copy the whole git repo into the container.
COPY . /home/${USER}/Spindle
COPY ${CONFIG_ROOT}/scripts/build_spindle.sh /home/${USER}/build_spindle.sh
RUN ./build_spindle.sh

# The build ran as root; give the home directory (sources, build tree and
# install prefix) to ${USER}. The path follows the USER build argument
# instead of assuming the default account name.
RUN chown -R ${USER}:${USER} /home/${USER} && \
    chown -R ${USER}:${USER} /run/flux

USER ${USER}
COPY ${CONFIG_ROOT}/scripts/flux_healthcheck.sh ./
COPY ${CONFIG_ROOT}/scripts/entrypoint.sh ./
ENV PATH=/home/${USER}/Spindle-inst/bin:${PATH}
# Make libfabric work with fork.
ENV RDMAV_FORK_SAFE=1
# Silence warning from hwloc about unsupported PCI device
# on GitHub-hosted runners.
ENV HWLOC_HIDE_ERRORS=2

# Exec form: bash runs the script directly instead of under an extra
# `/bin/sh -c` wrapper, so signals sent to the container reach it.
ENTRYPOINT ["/bin/bash", "./entrypoint.sh"]

70 changes: 70 additions & 0 deletions containers/spindle-flux-ubuntu/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Compose project for a Flux cluster of four nodes (node-1 .. node-4).
# Background on running Flux in containers:
# https://flux-framework.readthedocs.io/en/latest/tutorials/containers

# Node count. Keep `replicas` equal to the number of node-N services below.
x-shared-workers: &workers
  replicas: 4

# Build arguments: the node count plus the base image tag
# (noble = Ubuntu 24.04).
x-shared-build-args: &shared-build-args
  <<: *workers
  flux_sched_version: 'noble'

# Docker refuses to copy files that live outside the build context. The whole
# repository is copied into the image, so the context is the repository root
# and the Dockerfile is addressed relative to it.
x-shared-build-context: &shared-build-context
  context: '../..'
  dockerfile: 'containers/spindle-flux-ubuntu/Dockerfile'
  args: *shared-build-args

# Container environment: the node count and `mainHost`, the name of the node
# that runs the Flux broker.
x-shared-environment: &shared-environment
  <<: *workers
  mainHost: 'node-1'

# Settings applied to every node service.
x-shared-node-parameters: &shared-node-parameters
  build: *shared-build-context
  networks: [flux]
  environment: *shared-environment
  cap_add:
    - SYS_NICE  # Required for libnuma

networks:
  flux:
    driver: bridge

services:
  node-1:
    <<: *shared-node-parameters
    hostname: node-1
    container_name: node-1
    # Healthy once every worker has registered with the broker on the
    # head node.
    healthcheck:
      test:
        - CMD
        - ./flux_healthcheck.sh
      start_period: 15s
      interval: 5s
      timeout: 10s
      retries: 5

  node-2:
    <<: *shared-node-parameters
    hostname: node-2
    container_name: node-2

  node-3:
    <<: *shared-node-parameters
    hostname: node-3
    container_name: node-3

  node-4:
    <<: *shared-node-parameters
    hostname: node-4
    container_name: node-4

19 changes: 19 additions & 0 deletions containers/spindle-flux-ubuntu/flux/broker.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Flux broker configuration for the containerized cluster. The image copies
# this file into /etc/flux/config/, and the entrypoint script starts the
# broker with that directory as its configuration.

# Location of the IMP executable.
[exec]
imp = "/usr/libexec/flux/flux-imp"

# Access policy flags; see flux-config-access(5) for their exact meaning.
[access]
allow-guest-user = true
allow-root-owner = true

# Resource description (R). The Dockerfile writes this file at build time
# with `flux R encode`.
[resource]
path = "/etc/flux/system/R"

# Broker bootstrap: the certificate is created by `flux keygen` in the
# Dockerfile, and the host range has to cover the node-N services of the
# compose file (node-1 .. node-4).
# NOTE(review): flux-config-bootstrap(5) describes the substitutions as %h
# and %p; confirm that the doubled "%%" below is intended.
[bootstrap]
curve_cert = "/mnt/curve/curve.cert"
default_port = 8050
default_bind = "tcp://eth0:%%p"
default_connect = "tcp://%%h:%%p"
hosts = [
{ host="node-[1-4]"},
]

3 changes: 3 additions & 0 deletions containers/spindle-flux-ubuntu/flux/imp.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Flux IMP configuration; the image copies this file into
# /etc/flux/imp/conf.d/.
# NOTE(review): the image runs as `fluxuser` by default (USER build argument)
# while allowed-users names `flux` and `root` -- confirm this is intended.
[exec]
allowed-users = [ "flux", "root" ]
allowed-shells = [ "/usr/libexec/flux/flux-shell" ]
10 changes: 10 additions & 0 deletions containers/spindle-flux-ubuntu/scripts/build_spindle.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
#
# Configures, builds and installs Spindle out of tree:
#   sources      /home/$USER/Spindle
#   build tree   /home/$USER/Spindle-build
#   install      /home/$USER/Spindle-inst
#
# USER must be set; with `set -u` the script aborts when it is not. The
# Dockerfile declares a USER build argument before running this script.

set -euxo pipefail

home_dir="/home/${USER}"
src_dir="${home_dir}/Spindle"
build_dir="${home_dir}/Spindle-build"
install_dir="${home_dir}/Spindle-inst"

mkdir -p "${build_dir}"
cd "${build_dir}"

# Munge security, the Flux resource manager and the Flux plugin are enabled;
# local storage goes to /tmp.
"${src_dir}/configure" \
    --prefix="${install_dir}" \
    --enable-sec-munge \
    --with-rm=flux \
    --enable-flux-plugin \
    --with-localstorage=/tmp \
    CFLAGS="-O2 -g" \
    CXXFLAGS="-O2 -g"

make -j"$(nproc)"
make install

36 changes: 36 additions & 0 deletions containers/spindle-flux-ubuntu/scripts/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash

# Starts munged and the flux broker.
#
# For documentation on running Flux in containers, see
# https://flux-framework.readthedocs.io/en/latest/tutorials/containers
#
# Environment:
#   mainHost         name of the head node; every other node waits before
#                    starting its broker.
#   STATE_DIRECTORY  broker state directory; falls back to STATE_DIR (the
#                    name the Dockerfile exports) and then to /var/lib/flux.

stateDir="${STATE_DIRECTORY:-${STATE_DIR:-/var/lib/flux}}"

# Kept as an array so each option is passed as exactly one argument.
brokerOptions=(
    -Scron.directory=/etc/flux/system/cron.d
    -Stbon.fanout=256
    -Srundir=/run/flux
    "-Sstatedir=${stateDir}"
    -Slocal-uri=local:///run/flux/local
    -Slog-stderr-level=6
    -Slog-stderr-mode=local
)

# Get the hostname that will resolve for the Docker bridge network.
# The first line of the reverse lookup of this container's address is split
# on whitespace and '='; its third field is the fully qualified name, and
# everything before the first dot is the short host name.
address=$(echo $( nslookup "$( hostname -i )" | head -n 1 ))
parts=(${address//=/ })
hostName=${parts[2]}
thisHost=(${hostName//./ })
thisHost=${thisHost[0]}
echo "$thisHost"
# Exported, so the broker started below inherits it.
export FLUX_FAKE_HOSTNAME=$thisHost

# Start munged
sudo -u munge /usr/sbin/munged

# Quoted on both sides: an empty lookup result must not turn the test into
# a syntax error.
if [ "${thisHost}" != "${mainHost}" ]; then
    # Worker node -- wait for head node before connecting
    sleep 15
fi

# Head and worker nodes start the broker the same way.
flux start -o --config /etc/flux/config "${brokerOptions[@]}" sleep inf

11 changes: 11 additions & 0 deletions containers/spindle-flux-ubuntu/scripts/flux_healthcheck.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
#
# Container health check: succeeds only when the number of free Flux nodes
# equals the expected number of nodes.
#
# Environment:
#   replicas  expected node count (set for the containers by the compose
#             file); falls back to `workers`, which the image exports.
#
# Exit status: 0 when the counts match, 1 otherwise -- including when either
# value is missing or is not a plain non-negative integer, so that a broker
# that is not answering yet can never be reported as healthy.

replicas="${replicas:-${workers:-}}"

# Second column of the header-less listing of free resources, taken to be
# the node count. Empty when the command fails or prints nothing.
FLUX_FREE_NODES=$(flux resource list -s free -n | awk '{print $2}')

# True when $1 consists of decimal digits only.
is_count() {
    [[ $1 =~ ^[0-9]+$ ]]
}

# 10# forces base 10 so a leading zero is not read as octal.
if ! is_count "${FLUX_FREE_NODES}" || ! is_count "${replicas}" \
    || (( 10#${FLUX_FREE_NODES} != 10#${replicas} )); then
    echo "FAILED: Incorrect number of Flux free nodes: expected ${replicas}, got ${FLUX_FREE_NODES}"
    exit 1
fi

echo "PASSED: Found ${FLUX_FREE_NODES} nodes out of ${replicas}; all nodes registered."

Loading