From e5aae029897be592deef18a3b3ba50215d716113 Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 22 Apr 2026 11:45:13 -0700 Subject: [PATCH 1/8] feat: add support for arm Signed-off-by: vsoch --- .github/workflows/build-deploy.yaml | 29 +++++++- .github/workflows/main.yaml | 2 +- .github/workflows/slurm-containers.yaml | 66 ++++++++++++++++- Makefile | 12 ++++ docker/Dockerfile.ubuntu | 96 +++++++++++++++++++++++++ examples/tests/hello-world/README.md | 6 +- 6 files changed, 204 insertions(+), 7 deletions(-) create mode 100644 docker/Dockerfile.ubuntu diff --git a/.github/workflows/build-deploy.yaml b/.github/workflows/build-deploy.yaml index 19b8a9e..bef54a4 100644 --- a/.github/workflows/build-deploy.yaml +++ b/.github/workflows/build-deploy.yaml @@ -1,13 +1,40 @@ name: build slurm-operator on: - pull_request: [] + pull_request: {} push: branches: - main workflow_dispatch: jobs: + build-arm: + if: (github.event_name != 'pull_request') + runs-on: ubuntu-latest + name: make and build arm + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + - uses: actions/setup-go@v3 + with: + go-version: ^1.23 + - name: GHCR Login + if: (github.event_name != 'pull_request') + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Add custom buildx ARM builder + run: | + docker buildx create --name armbuilder + docker buildx use armbuilder + docker buildx inspect --bootstrap + + - name: Deploy Container + run: make arm-deploy + build: runs-on: ubuntu-latest strategy: diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 6299d4b..f5ab991 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -1,7 +1,7 @@ name: test slurm-operator on: - pull_request: [] + pull_request: {} jobs: formatting: diff --git a/.github/workflows/slurm-containers.yaml b/.github/workflows/slurm-containers.yaml index 7b39c78..ffea01c 100644 --- 
a/.github/workflows/slurm-containers.yaml +++ b/.github/workflows/slurm-containers.yaml @@ -1,13 +1,70 @@ name: build slurm-containers on: - pull_request: [] + pull_request: {} push: branches: - main workflow_dispatch: jobs: + build-arm-ubuntu: + env: + container: ghcr.io/converged-computing/slurm + runs-on: ubuntu-latest + name: make and build arm + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + - uses: actions/setup-go@v3 + with: + go-version: ^1.23 + - name: GHCR Login + if: (github.event_name != 'pull_request') + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Add custom buildx ARM builder + run: | + docker buildx create --name armbuilder + docker buildx use armbuilder + docker buildx inspect --bootstrap + + - name: Build and Deploy Container + run: docker buildx build -f docker/Dockerfile.ubuntu --platform linux/arm64 --push -t ${{ env.container }}:ubuntu-arm ./docker + + build-arm: + env: + container: ghcr.io/converged-computing/slurm + runs-on: ubuntu-latest + name: make and build arm + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + - uses: actions/setup-go@v3 + with: + go-version: ^1.23 + - name: GHCR Login + if: (github.event_name != 'pull_request') + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Add custom buildx ARM builder + run: | + docker buildx create --name armbuilder + docker buildx use armbuilder + docker buildx inspect --bootstrap + + - name: Build and Deploy Container + run: docker buildx build -f docker/Dockerfile --platform linux/arm64 --push -t ${{ env.container }}:arm ./docker + + build: env: container: ghcr.io/converged-computing/slurm @@ -27,6 +84,11 @@ jobs: - name: Build Container run: docker build -f docker/Dockerfile -t ${{ env.container }} ./docker + - name: Build Ubuntu Container + run: docker build -f 
docker/Dockerfile.ubuntu -t ${{ env.container }}:ubuntu ./docker + - name: Deploy Container if: (github.event_name != 'pull_request') - run: docker push ${{ env.container }} + run: | + docker push ${{ env.container }} + docker push ${{ env.container }}:ubuntu \ No newline at end of file diff --git a/Makefile b/Makefile index 95f1b17..9b50680 100644 --- a/Makefile +++ b/Makefile @@ -48,6 +48,7 @@ endif # Image URL to use all building/pushing image targets IMG ?= ghcr.io/converged-computing/slurm-operator:latest +ARMIMG ?= ghcr.io/converged-computing/slurm-operator:arm # Testing image (for development mostly) DEVIMG ?= ghcr.io/converged-computing/slurm-operator:test @@ -126,6 +127,17 @@ run: manifests generate fmt vet ## Run a controller from your host. docker-build: test ## Build docker image with the manager. docker build -t ${IMG} . +.PHONY: arm-build +arm-build: test ## Build docker image with the manager. + docker buildx build --platform linux/arm64 -t ${ARMIMG} . + +.PHONY: arm-deploy +arm-deploy: manifests kustomize + docker buildx build --platform linux/arm64 --push -t ${ARMIMG} . + cd config/manager && $(KUSTOMIZE) edit set image controller=${ARMIMG} + $(KUSTOMIZE) build config/default > examples/dist/slurm-operator-arm.yaml + + .PHONY: docker-push docker-push: ## Push docker image with the manager. 
docker push ${IMG} diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu new file mode 100644 index 0000000..0abed41 --- /dev/null +++ b/docker/Dockerfile.ubuntu @@ -0,0 +1,96 @@ +FROM ubuntu:24.04 + +LABEL org.opencontainers.image.source="https://github.com/converged-computing/slurm-operator" \ + org.opencontainers.image.title="slurm-operator" \ + org.opencontainers.image.description="Slurm in Kubernetes on Ubuntu" \ + maintainer="Vanessa Sochat" + +ARG SLURM_TAG=slurm-21-08-6-1 + +# Prevent interactive prompts +ENV DEBIAN_FRONTEND=noninteractive + +RUN set -ex \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + wget \ + bzip2 \ + perl \ + gcc \ + g++ \ + git \ + gnupg \ + make \ + munge \ + libmunge-dev \ + python3-dev \ + python3-pip \ + python3-setuptools \ + mariadb-server \ + libmariadb-dev \ + libmariadb-dev-compat \ + psmisc \ + bash-completion \ + vim \ + libhttp-parser-dev \ + libjson-c-dev \ + build-essential \ + pkg-config \ + gosu \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Note: Ubuntu 24.04 uses PEP 668 (externally managed environments). +# If pip install fails, use --break-system-packages or install in a venv. 
+RUN pip3 install --break-system-packages Cython nose + +RUN set -x \ + && git clone -b ${SLURM_TAG} --single-branch --depth=1 https://github.com/SchedMD/slurm.git \ + && cd slurm \ + && ./configure --enable-debug --prefix=/usr --sysconfdir=/etc/slurm \ + --with-mysql_config=/usr/bin --libdir=/usr/lib \ + && make install \ + && install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \ + && install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \ + && install -D -m644 etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf.example \ + && install -D -m644 contribs/slurm_completion_help/slurm_completion.sh /etc/profile.d/slurm_completion.sh \ + && cd ../ \ + && rm -rf slurm \ + && groupadd -r --gid=990 slurm \ + && useradd -r -g slurm --uid=990 slurm \ + && mkdir -p /etc/sysconfig/slurm \ + /var/spool/slurmd \ + /var/run/slurmd \ + /var/run/slurmdbd \ + /var/lib/slurmd \ + /var/log/slurm \ + /data \ + && touch /var/lib/slurmd/node_state \ + /var/lib/slurmd/front_end_state \ + /var/lib/slurmd/job_state \ + /var/lib/slurmd/resv_state \ + /var/lib/slurmd/trigger_state \ + /var/lib/slurmd/assoc_mgr_state \ + /var/lib/slurmd/assoc_usage \ + /var/lib/slurmd/qos_usage \ + /var/lib/slurmd/fed_mgr_state \ + && chown -R slurm:slurm /var/lib/slurmd /var/run/slurmd /var/run/slurmdbd /var/log/slurm \ + && /usr/sbin/mungekey --create --force --keyfile /etc/munge/munge.key \ + && chown munge:munge /etc/munge/munge.key \ + && chmod 400 /etc/munge/munge.key + +# These will be overridden by your local files +COPY slurm.conf /etc/slurm/slurm.conf +COPY slurmdbd.conf /etc/slurm/slurmdbd.conf +RUN set -x \ + && chown slurm:slurm /etc/slurm/slurmdbd.conf \ + && chmod 600 /etc/slurm/slurmdbd.conf + +COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh +ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] + +EXPOSE 6717 +EXPOSE 6718 +EXPOSE 6719 + +CMD ["slurmdbd"] \ No newline at end of file diff --git a/examples/tests/hello-world/README.md 
b/examples/tests/hello-world/README.md index 1a3a784..e96f428 100644 --- a/examples/tests/hello-world/README.md +++ b/examples/tests/hello-world/README.md @@ -3,7 +3,7 @@ Create a cluster with kind: ```bash -$ kind create cluster +kind create cluster ``` You'll need to install the jobset API, which eventually will be added to Kubernetes proper (but is not yet!) @@ -12,7 +12,7 @@ You'll need to install the jobset API, which eventually will be added to Kuberne kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.7.0/manifests.yaml ``` -Install the Slurl operator: +Install the Slurm operator: ```bash # From GitHub @@ -121,7 +121,7 @@ Once you've verified the controller is running, you can shell into the control l kubectl exec -it slurm-sample-s-0-0-xj5zr bash ``` ```bash -$ sinfo +sinfo ``` ```console PARTITION AVAIL TIMELIMIT NODES STATE NODELIST From 8cbdc8de139a88c3930d055bf3c671d6a85b8db1 Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 22 Apr 2026 12:12:51 -0700 Subject: [PATCH 2/8] tweak module to use 1.24 Signed-off-by: vsoch --- .github/workflows/build-deploy.yaml | 4 ++-- .github/workflows/main.yaml | 4 ++-- Dockerfile | 2 +- go.mod | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-deploy.yaml b/.github/workflows/build-deploy.yaml index bef54a4..d517dee 100644 --- a/.github/workflows/build-deploy.yaml +++ b/.github/workflows/build-deploy.yaml @@ -17,7 +17,7 @@ jobs: uses: actions/checkout@v3 - uses: actions/setup-go@v3 with: - go-version: ^1.23 + go-version: ^1.24 - name: GHCR Login if: (github.event_name != 'pull_request') uses: docker/login-action@v2 @@ -48,7 +48,7 @@ jobs: uses: actions/checkout@v4 - uses: actions/setup-go@v3 with: - go-version: ^1.23 + go-version: ^1.24 - name: GHCR Login if: (github.event_name != 'pull_request') uses: docker/login-action@v2 diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index f5ab991..9919ca9 100644 --- 
a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -27,7 +27,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v3 with: - go-version: ^1.23 + go-version: ^1.24 - name: fmt check run: make fmt @@ -60,7 +60,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v3 with: - go-version: ^1.23 + go-version: ^1.24 - name: Start minikube uses: medyagh/setup-minikube@697f2b7aaed5f70bf2a94ee21a4ec3dde7b12f92 # v0.0.9 diff --git a/Dockerfile b/Dockerfile index 644f12e..a4e6f57 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Build the manager binary -FROM golang:1.23 as builder +FROM golang:1.24 AS builder ARG TARGETOS ARG TARGETARCH diff --git a/go.mod b/go.mod index 6a6fff4..cb8d8aa 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/converged-computing/slurm-operator -go 1.23.0 +go 1.24.0 require ( github.com/go-logr/logr v1.4.2 From 949d384e074ef1c071fb0990ca84f1f90b427702 Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 22 Apr 2026 13:14:08 -0700 Subject: [PATCH 3/8] build: update for arm Signed-off-by: vsoch --- .github/workflows/slurm-containers.yaml | 8 +- Makefile | 12 +- .../crd/bases/flux-framework.org_slurms.yaml | 2 +- config/manager/kustomization.yaml | 2 +- config/rbac/role.yaml | 157 +--- docker/Dockerfile | 8 +- docker/Dockerfile.ubuntu | 3 + examples/dist/slurm-operator-arm.yaml | 737 ++++++++++++++++++ examples/dist/slurm-operator.yaml | 159 +--- 9 files changed, 780 insertions(+), 308 deletions(-) create mode 100644 examples/dist/slurm-operator-arm.yaml diff --git a/.github/workflows/slurm-containers.yaml b/.github/workflows/slurm-containers.yaml index ffea01c..4b15bd5 100644 --- a/.github/workflows/slurm-containers.yaml +++ b/.github/workflows/slurm-containers.yaml @@ -18,7 +18,7 @@ jobs: uses: actions/checkout@v3 - uses: actions/setup-go@v3 with: - go-version: ^1.23 + go-version: ^1.24 - name: GHCR Login if: (github.event_name != 'pull_request') uses: docker/login-action@v2 @@ -34,7 +34,7 @@ jobs: docker buildx 
inspect --bootstrap - name: Build and Deploy Container - run: docker buildx build -f docker/Dockerfile.ubuntu --platform linux/arm64 --push -t ${{ env.container }}:ubuntu-arm ./docker + run: docker buildx build -f docker/Dockerfile.ubuntu --build-arg ARCH=arm64 --platform linux/arm64 --push -t ${{ env.container }}:ubuntu-arm ./docker build-arm: env: @@ -46,7 +46,7 @@ jobs: uses: actions/checkout@v3 - uses: actions/setup-go@v3 with: - go-version: ^1.23 + go-version: ^1.24 - name: GHCR Login if: (github.event_name != 'pull_request') uses: docker/login-action@v2 @@ -62,7 +62,7 @@ jobs: docker buildx inspect --bootstrap - name: Build and Deploy Container - run: docker buildx build -f docker/Dockerfile --platform linux/arm64 --push -t ${{ env.container }}:arm ./docker + run: docker buildx build -f docker/Dockerfile --build-arg ARCH=arm64 --platform linux/arm64 --push -t ${{ env.container }}:arm ./docker build: diff --git a/Makefile b/Makefile index 9b50680..298bc5b 100644 --- a/Makefile +++ b/Makefile @@ -129,11 +129,11 @@ docker-build: test ## Build docker image with the manager. .PHONY: arm-build arm-build: test ## Build docker image with the manager. - docker buildx build --platform linux/arm64 -t ${ARMIMG} . + docker buildx build --build-arg ARCH=arm64 --platform linux/arm64 -t ${ARMIMG} . .PHONY: arm-deploy arm-deploy: manifests kustomize - docker buildx build --platform linux/arm64 --push -t ${ARMIMG} . + docker buildx build --platform linux/arm64 --build-arg ARCH=arm64 --push -t ${ARMIMG} . 
cd config/manager && $(KUSTOMIZE) edit set image controller=${ARMIMG} $(KUSTOMIZE) build config/default > examples/dist/slurm-operator-arm.yaml @@ -218,6 +218,12 @@ test-deploy: manifests kustomize $(KUSTOMIZE) build config/default > examples/dist/slurm-operator-dev.yaml sed -i 's/ imagePullPolicy: IfNotPresent/ imagePullPolicy: Always/' examples/dist/slurm-operator-dev.yaml + +.PHONY: build-config-arm +build-config-arm: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config. + cd config/manager && $(KUSTOMIZE) edit set image controller=${ARMIMG} + $(KUSTOMIZE) build config/default > examples/dist/slurm-operator-arm.yaml + .PHONY: test-deploy-recreate test-deploy-recreate: test-deploy kubectl delete -f ./examples/dist/slurm-operator-dev.yaml || echo "Already deleted" @@ -229,7 +235,7 @@ list: ## Tool Versions KUSTOMIZE_VERSION ?= v3.8.7 -CONTROLLER_TOOLS_VERSION ?= v0.14.0 +CONTROLLER_TOOLS_VERSION ?= v0.19.0 KUSTOMIZE_INSTALL_SCRIPT ?= "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" .PHONY: kustomize diff --git a/config/crd/bases/flux-framework.org_slurms.yaml b/config/crd/bases/flux-framework.org_slurms.yaml index 8e1a960..70a03a1 100644 --- a/config/crd/bases/flux-framework.org_slurms.yaml +++ b/config/crd/bases/flux-framework.org_slurms.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.14.0 + controller-gen.kubebuilder.io/version: v0.19.0 name: slurms.flux-framework.org spec: group: flux-framework.org diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 8aeb3c5..808644e 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -5,4 +5,4 @@ kind: Kustomization images: - name: controller newName: ghcr.io/converged-computing/slurm-operator - newTag: latest + newTag: arm diff --git a/config/rbac/role.yaml 
b/config/rbac/role.yaml index 4fa1520..c842ada 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -7,151 +7,18 @@ rules: - apiGroups: - "" resources: - - events - verbs: - - create - - update - - watch -- apiGroups: - - batch - resources: - - jobs - verbs: - - create - - delete - - exec - - get - - list - - patch - - update - - watch -- apiGroups: - - batch - resources: - - jobs/status - verbs: - - create - - delete - - exec - - get - - list - - patch - - update - - watch -- apiGroups: - "" - resources: - - "" - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - batch - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - configmaps - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - events - verbs: - - create - - patch -- apiGroups: - - "" - resources: - jobs - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - networks - verbs: - - create - - patch -- apiGroups: - - "" - resources: - persistentvolumeclaims - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - persistentvolumes - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - pods - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - pods/exec - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - pods/log + - secrets + - services + - statefulsets verbs: - create - delete @@ -163,34 +30,28 @@ rules: - apiGroups: - "" resources: - - secrets + - events verbs: - create - - delete - - get - - list - patch - update - watch - apiGroups: - "" resources: - - services + - networks verbs: - create - - delete - - get - - 
list - patch - - update - - watch - apiGroups: - - "" + - batch resources: - - statefulsets + - jobs + - jobs/status verbs: - create - delete + - exec - get - list - patch diff --git a/docker/Dockerfile b/docker/Dockerfile index c612bdd..5d483bf 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,5 +1,9 @@ FROM rockylinux:9 +# docker build --network=host -t ghcr.io/converged-computing/slurm:arm-hpc7g . +ARG ARCH=amd64 +ENV ARCH=$ARCH + # From https://github.com/giovtorres/slurm-docker-cluster/blob/52b5f9e5a9a7b149900404077e377e8daedf1a8c/Dockerfile # Moved here to have automated build LABEL org.opencontainers.image.source="https://github.com/converged-computing/slurm-operator" \ @@ -42,8 +46,8 @@ RUN set -ex \ RUN pip3 install Cython nose RUN set -ex \ - && wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" \ - && wget -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64.asc" \ + && wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-${ARCH}" \ + && wget -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-${ARCH}.asc" \ && export GNUPGHOME="$(mktemp -d)" \ && gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4 \ && gpg --batch --verify /usr/local/bin/gosu.asc /usr/local/bin/gosu \ diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index 0abed41..3582e1a 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -1,5 +1,8 @@ FROM ubuntu:24.04 +# for arm, hpc7g on aws 4/22/2026 +# docker build --network=host -f Dockerfile.ubuntu -t ghcr.io/converged-computing/slurm:ubuntu-arm-hpc7g . 
+ LABEL org.opencontainers.image.source="https://github.com/converged-computing/slurm-operator" \ org.opencontainers.image.title="slurm-operator" \ org.opencontainers.image.description="Slurm in Kubernetes on Ubuntu" \ diff --git a/examples/dist/slurm-operator-arm.yaml b/examples/dist/slurm-operator-arm.yaml new file mode 100644 index 0000000..ec1d021 --- /dev/null +++ b/examples/dist/slurm-operator-arm.yaml @@ -0,0 +1,737 @@ +apiVersion: v1 +kind: Namespace +metadata: + labels: + app.kubernetes.io/component: manager + app.kubernetes.io/created-by: slurm-operator + app.kubernetes.io/instance: system + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: namespace + app.kubernetes.io/part-of: slurm-operator + control-plane: controller-manager + name: slurm-operator-system +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: slurms.flux-framework.org +spec: + group: flux-framework.org + names: + kind: Slurm + listKind: SlurmList + plural: slurms + singular: slurm + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: Slurm is the Schema for the slurms API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: SlurmSpec defines the desired state of slurm + properties: + clusterName: + default: linux + description: Name of the cluster + type: string + daemon: + description: Slurm dbd "daemon" + properties: + command: + description: Command will be honored by a server node + type: string + commands: + description: Commands to run around different parts of the setup + properties: + init: + description: Init runs before anything in both scripts + type: string + type: object + environment: + additionalProperties: + type: string + description: Key/value pairs for the environment + type: object + image: + default: ghcr.io/converged-computing/slurm + description: Image to use for slurm + type: string + ports: + description: |- + Ports to be exposed to other containers in the cluster + We take a single list of integers and map to the same + items: + format: int32 + type: integer + type: array + x-kubernetes-list-type: atomic + pullAlways: + description: PullAlways will always pull the container + type: boolean + pullSecret: + description: PullSecret for the node, if needed + type: string + resources: + description: Resources include limits and requests + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + type: object + type: object + workingDir: + description: Working directory + type: string + type: object + database: + description: Database is the database service spec + properties: + databaseName: + default: slurm_acct_db + description: Name of the database + type: string + environment: + additionalProperties: + type: string + description: |- + Default Environment, will be set if not defined here + Note 
that by defalt we set MYSQL_* envars. + If you use a different database, be sure to set them all + Username and password are set separately below! + type: object + host: + description: |- + Custom database host + This should only be set if you are deploying your own database + and DeployDatabase is false + type: string + image: + default: mariadb:10.10 + description: |- + Image to use for the database + We assume we don't need to tweak the command + type: string + password: + default: password + description: Database password + type: string + pullAlways: + description: PullAlways will always pull the container + type: boolean + user: + default: slurm + description: Database user + type: string + type: object + deadlineSeconds: + default: 31500000 + description: |- + Time limit for the job + Approximately one year. This cannot be zero or job won't start + format: int64 + type: integer + deployDatabase: + default: true + description: Deploy the database (or not) + type: boolean + interactive: + description: Interactive mode keeps the cluster running + type: boolean + network: + description: Network options (service name and selector) + properties: + selector: + description: Selector name for network + type: string + serviceName: + description: Service name (e.g., helpful if already exists) + type: string + type: object + node: + description: The generic login node + properties: + command: + description: Command will be honored by a server node + type: string + commands: + description: Commands to run around different parts of the setup + properties: + init: + description: Init runs before anything in both scripts + type: string + type: object + environment: + additionalProperties: + type: string + description: Key/value pairs for the environment + type: object + image: + default: ghcr.io/converged-computing/slurm + description: Image to use for slurm + type: string + ports: + description: |- + Ports to be exposed to other containers in the cluster + We take a single 
list of integers and map to the same + items: + format: int32 + type: integer + type: array + x-kubernetes-list-type: atomic + pullAlways: + description: PullAlways will always pull the container + type: boolean + pullSecret: + description: PullSecret for the node, if needed + type: string + resources: + description: Resources include limits and requests + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + type: object + type: object + workingDir: + description: Working directory + type: string + type: object + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + description: Resources include limits and requests + type: object + size: + description: Size of the slurm (1 server + (N-1) nodes) + format: int32 + type: integer + slurmVersion: + default: 19.05.2 + description: Release of slurm to installed (if sbinary not found in PATH) + type: string + worker: + description: |- + Worker is the worker node spec, does not include login slurmctl or slurmdbd + Defaults to be same spec as the server + properties: + command: + description: Command will be honored by a server node + type: string + commands: + description: Commands to run around different parts of the setup + properties: + init: + description: Init runs before anything in both scripts + type: string + type: object + environment: + additionalProperties: + type: string + description: Key/value pairs for the environment + type: object + image: + default: ghcr.io/converged-computing/slurm + description: Image to use for slurm + type: string + ports: + description: |- + Ports to be exposed to other containers in the cluster + We take a single list of integers and map to the same + items: + format: int32 + type: integer + type: array + 
x-kubernetes-list-type: atomic + pullAlways: + description: PullAlways will always pull the container + type: boolean + pullSecret: + description: PullSecret for the node, if needed + type: string + resources: + description: Resources include limits and requests + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + type: object + type: object + workingDir: + description: Working directory + type: string + type: object + required: + - node + - size + type: object + status: + description: SlurmStatus defines the observed state of slurm + type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: rbac + app.kubernetes.io/created-by: slurm-operator + app.kubernetes.io/instance: controller-manager + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: serviceaccount + app.kubernetes.io/part-of: slurm-operator + name: slurm-operator-controller-manager + namespace: slurm-operator-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + labels: + app.kubernetes.io/component: rbac + app.kubernetes.io/created-by: slurm-operator + app.kubernetes.io/instance: leader-election-role + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: role + app.kubernetes.io/part-of: slurm-operator + name: slurm-operator-leader-election-role + namespace: slurm-operator-system +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch 
+--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: slurm-operator-manager-role +rules: +- apiGroups: + - "" + resources: + - "" + - batch + - configmaps + - jobs + - persistentvolumeclaims + - persistentvolumes + - pods + - pods/exec + - pods/log + - secrets + - services + - statefulsets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - update + - watch +- apiGroups: + - "" + resources: + - networks + verbs: + - create + - patch +- apiGroups: + - batch + resources: + - jobs + - jobs/status + verbs: + - create + - delete + - exec + - get + - list + - patch + - update + - watch +- apiGroups: + - flux-framework.org + resources: + - slurms + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - flux-framework.org + resources: + - slurms/finalizers + verbs: + - update +- apiGroups: + - flux-framework.org + resources: + - slurms/status + verbs: + - get + - patch + - update +- apiGroups: + - jobset.x-k8s.io + resources: + - jobsets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - jobset.x-k8s.io + resources: + - jobsets/finalizers + verbs: + - update +- apiGroups: + - jobset.x-k8s.io + resources: + - jobsets/status + verbs: + - get + - patch + - update +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: kube-rbac-proxy + app.kubernetes.io/created-by: slurm-operator + app.kubernetes.io/instance: metrics-reader + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: clusterrole + app.kubernetes.io/part-of: slurm-operator + name: slurm-operator-metrics-reader +rules: +- nonResourceURLs: + - /metrics + verbs: + - get +--- 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: kube-rbac-proxy + app.kubernetes.io/created-by: slurm-operator + app.kubernetes.io/instance: proxy-role + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: clusterrole + app.kubernetes.io/part-of: slurm-operator + name: slurm-operator-proxy-role +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + labels: + app.kubernetes.io/component: rbac + app.kubernetes.io/created-by: slurm-operator + app.kubernetes.io/instance: leader-election-rolebinding + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: rolebinding + app.kubernetes.io/part-of: slurm-operator + name: slurm-operator-leader-election-rolebinding + namespace: slurm-operator-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: slurm-operator-leader-election-role +subjects: +- kind: ServiceAccount + name: slurm-operator-controller-manager + namespace: slurm-operator-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: rbac + app.kubernetes.io/created-by: slurm-operator + app.kubernetes.io/instance: manager-rolebinding + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: clusterrolebinding + app.kubernetes.io/part-of: slurm-operator + name: slurm-operator-manager-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: slurm-operator-manager-role +subjects: +- kind: ServiceAccount + name: slurm-operator-controller-manager + namespace: slurm-operator-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: kube-rbac-proxy + 
app.kubernetes.io/created-by: slurm-operator + app.kubernetes.io/instance: proxy-rolebinding + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: clusterrolebinding + app.kubernetes.io/part-of: slurm-operator + name: slurm-operator-proxy-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: slurm-operator-proxy-role +subjects: +- kind: ServiceAccount + name: slurm-operator-controller-manager + namespace: slurm-operator-system +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: kube-rbac-proxy + app.kubernetes.io/created-by: slurm-operator + app.kubernetes.io/instance: controller-manager-metrics-service + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: service + app.kubernetes.io/part-of: slurm-operator + control-plane: controller-manager + name: slurm-operator-controller-manager-metrics-service + namespace: slurm-operator-system +spec: + ports: + - name: https + port: 8443 + protocol: TCP + targetPort: https + selector: + control-plane: controller-manager +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: manager + app.kubernetes.io/created-by: slurm-operator + app.kubernetes.io/instance: controller-manager + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: deployment + app.kubernetes.io/part-of: slurm-operator + control-plane: controller-manager + name: slurm-operator-controller-manager + namespace: slurm-operator-system +spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/arch + operator: In + values: + - amd64 + - arm64 + - ppc64le + - s390x + - key: kubernetes.io/os + operator: In + 
values: + - linux + containers: + - args: + - --secure-listen-address=0.0.0.0:8443 + - --upstream=http://127.0.0.1:8080/ + - --logtostderr=true + - --v=0 + image: gcr.io/kubebuilder/kube-rbac-proxy:v0.13.1 + name: kube-rbac-proxy + ports: + - containerPort: 8443 + name: https + protocol: TCP + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 5m + memory: 64Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + - args: + - --health-probe-bind-address=:8081 + - --metrics-bind-address=127.0.0.1:8080 + - --leader-elect + command: + - /manager + image: ghcr.io/converged-computing/slurm-operator:arm + imagePullPolicy: IfNotPresent + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + securityContext: + runAsNonRoot: true + serviceAccountName: slurm-operator-controller-manager + terminationGracePeriodSeconds: 10 diff --git a/examples/dist/slurm-operator.yaml b/examples/dist/slurm-operator.yaml index 5214fbc..326c8bd 100644 --- a/examples/dist/slurm-operator.yaml +++ b/examples/dist/slurm-operator.yaml @@ -15,7 +15,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.14.0 + controller-gen.kubebuilder.io/version: v0.19.0 name: slurms.flux-framework.org spec: group: flux-framework.org @@ -393,151 +393,18 @@ rules: - apiGroups: - "" resources: - - events - verbs: - - create - - update - - watch -- apiGroups: - - batch - resources: - - jobs - verbs: - - create - - delete - - exec - - get - - list - - patch - - update - - watch -- apiGroups: - - batch - resources: - - jobs/status - verbs: - - create - 
- delete - - exec - - get - - list - - patch - - update - - watch -- apiGroups: - "" - resources: - - "" - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - batch - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - configmaps - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - events - verbs: - - create - - patch -- apiGroups: - - "" - resources: - jobs - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - networks - verbs: - - create - - patch -- apiGroups: - - "" - resources: - persistentvolumeclaims - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - persistentvolumes - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - pods - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - pods/exec - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - pods/log + - secrets + - services + - statefulsets verbs: - create - delete @@ -549,34 +416,28 @@ rules: - apiGroups: - "" resources: - - secrets + - events verbs: - create - - delete - - get - - list - patch - update - watch - apiGroups: - "" resources: - - services + - networks verbs: - create - - delete - - get - - list - patch - - update - - watch - apiGroups: - - "" + - batch resources: - - statefulsets + - jobs + - jobs/status verbs: - create - delete + - exec - get - list - patch From ef8be5edb12a20071ccd1f03d0f8d028e09a2a7e Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 22 Apr 2026 16:15:51 -0700 Subject: [PATCH 4/8] bug: remove reliance on proxy Signed-off-by: vsoch --- config/default/kustomization.yaml | 7 ++--- 
config/manager/kustomization.yaml | 2 +- examples/dist/slurm-operator-arm.yaml | 41 --------------------------- examples/dist/slurm-operator.yaml | 41 --------------------------- 4 files changed, 4 insertions(+), 87 deletions(-) diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml index e3ad3ac..363c865 100644 --- a/config/default/kustomization.yaml +++ b/config/default/kustomization.yaml @@ -12,7 +12,7 @@ namePrefix: slurm-operator- #commonLabels: # someName: someValue -bases: +resources: - ../crd - ../rbac - ../manager @@ -28,9 +28,8 @@ patchesStrategicMerge: # Protect the /metrics endpoint by putting it behind auth. # If you want your controller-manager to expose the /metrics # endpoint w/o any authn/z, please comment the following line. -- manager_auth_proxy_patch.yaml - - +# we have removed this because the image is deprecated 3/2026 +# - manager_auth_proxy_patch.yaml # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in # crd/kustomization.yaml diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 808644e..8aeb3c5 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -5,4 +5,4 @@ kind: Kustomization images: - name: controller newName: ghcr.io/converged-computing/slurm-operator - newTag: arm + newTag: latest diff --git a/examples/dist/slurm-operator-arm.yaml b/examples/dist/slurm-operator-arm.yaml index ec1d021..19a4857 100644 --- a/examples/dist/slurm-operator-arm.yaml +++ b/examples/dist/slurm-operator-arm.yaml @@ -658,49 +658,8 @@ spec: labels: control-plane: controller-manager spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: kubernetes.io/arch - operator: In - values: - - amd64 - - arm64 - - ppc64le - - s390x - - key: kubernetes.io/os - operator: In - values: - - linux containers: - args: - - 
--secure-listen-address=0.0.0.0:8443 - - --upstream=http://127.0.0.1:8080/ - - --logtostderr=true - - --v=0 - image: gcr.io/kubebuilder/kube-rbac-proxy:v0.13.1 - name: kube-rbac-proxy - ports: - - containerPort: 8443 - name: https - protocol: TCP - resources: - limits: - cpu: 500m - memory: 128Mi - requests: - cpu: 5m - memory: 64Mi - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - - args: - - --health-probe-bind-address=:8081 - - --metrics-bind-address=127.0.0.1:8080 - --leader-elect command: - /manager diff --git a/examples/dist/slurm-operator.yaml b/examples/dist/slurm-operator.yaml index 326c8bd..b33d4c9 100644 --- a/examples/dist/slurm-operator.yaml +++ b/examples/dist/slurm-operator.yaml @@ -658,49 +658,8 @@ spec: labels: control-plane: controller-manager spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: kubernetes.io/arch - operator: In - values: - - amd64 - - arm64 - - ppc64le - - s390x - - key: kubernetes.io/os - operator: In - values: - - linux containers: - args: - - --secure-listen-address=0.0.0.0:8443 - - --upstream=http://127.0.0.1:8080/ - - --logtostderr=true - - --v=0 - image: gcr.io/kubebuilder/kube-rbac-proxy:v0.13.1 - name: kube-rbac-proxy - ports: - - containerPort: 8443 - name: https - protocol: TCP - resources: - limits: - cpu: 500m - memory: 128Mi - requests: - cpu: 5m - memory: 64Mi - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - - args: - - --health-probe-bind-address=:8081 - - --metrics-bind-address=127.0.0.1:8080 - --leader-elect command: - /manager From a50f5ec55b3199d6a99cd646a5956cea33410b5e Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 22 Apr 2026 16:27:56 -0700 Subject: [PATCH 5/8] munge: ensure use consistent template Signed-off-by: vsoch --- controllers/slurm/templates/components.sh | 18 +++++++++++++++++- controllers/slurm/templates/daemon.sh | 3 +-- 
controllers/slurm/templates/server.sh | 4 +--- controllers/slurm/templates/worker.sh | 5 ++--- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/controllers/slurm/templates/components.sh b/controllers/slurm/templates/components.sh index 676835b..4d0a513 100644 --- a/controllers/slurm/templates/components.sh +++ b/controllers/slurm/templates/components.sh @@ -29,4 +29,20 @@ mkdir -p ${workdir} {{define "exit"}} sleep infinity {{ if .Spec.Interactive }}sleep infinity{{ end }} -{{ end }} \ No newline at end of file +{{ end }} + +{{ define "munge"}} +echo "---> Starting the MUNGE Authentication service (munged) ..." + +# Create the missing directory +mkdir -p /run/munge + +# Set ownership so the munge user can write the socket +chown -R munge:munge /run/munge + +# Start the daemon manually +gosu munge /usr/sbin/munged --force + +# Verify it works +munge -n | unmunge +{{end}} \ No newline at end of file diff --git a/controllers/slurm/templates/daemon.sh b/controllers/slurm/templates/daemon.sh index 7f18f7b..8e16959 100644 --- a/controllers/slurm/templates/daemon.sh +++ b/controllers/slurm/templates/daemon.sh @@ -5,8 +5,7 @@ echo "Hello, I am a worker with $(hostname)" # Shared logic to install hq {{template "init" .}} -echo "---> Starting the MUNGE Authentication service (munged) ..." -gosu munge /usr/sbin/munged +{{template "munge" .}} echo "---> Starting the Slurm Database Daemon (slurmdbd) ..." diff --git a/controllers/slurm/templates/server.sh b/controllers/slurm/templates/server.sh index 76673bf..d30aaba 100644 --- a/controllers/slurm/templates/server.sh +++ b/controllers/slurm/templates/server.sh @@ -5,9 +5,7 @@ echo "Hello, I am a server with $(hostname)" # This script handles shared start logic {{template "init" .}} -# Default entrypoint with slurmctld, this is like a login node -echo "---> Starting the MUNGE Authentication service (munged) ..." 
-gosu munge /usr/sbin/munged +{{template "munge" .}} echo "---> Sleeping for slurmdbd to become active before starting slurmctld ..." diff --git a/controllers/slurm/templates/worker.sh b/controllers/slurm/templates/worker.sh index 7cb9bba..fd36954 100644 --- a/controllers/slurm/templates/worker.sh +++ b/controllers/slurm/templates/worker.sh @@ -2,12 +2,11 @@ echo "Hello, I am a worker with $(hostname)" -# Shared logic to install hq +# Shared logic {{template "init" .}} # This is a worker node -echo "---> Starting the MUNGE Authentication service (munged) ..." -gosu munge /usr/sbin/munged +{{ template "munge" .}} echo "---> Waiting for slurmctld to become active before starting slurmd..." sleep 30 From b797793b39a4e31ac31433719ea7b1e0f8ec502c Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 22 Apr 2026 17:13:02 -0700 Subject: [PATCH 6/8] feat: allow to specify nodespec Signed-off-by: vsoch --- api/v1alpha1/slurm_types.go | 9 +++++++++ config/crd/bases/flux-framework.org_slurms.yaml | 15 +++++++++++++++ controllers/slurm/templates.go | 2 ++ controllers/slurm/templates/slurm.conf | 3 +-- examples/dist/slurm-operator-arm.yaml | 15 +++++++++++++++ examples/dist/slurm-operator.yaml | 15 +++++++++++++++ 6 files changed, 57 insertions(+), 2 deletions(-) diff --git a/api/v1alpha1/slurm_types.go b/api/v1alpha1/slurm_types.go index e48ab80..325da5f 100644 --- a/api/v1alpha1/slurm_types.go +++ b/api/v1alpha1/slurm_types.go @@ -168,6 +168,11 @@ type Node struct { // +optional WorkingDir string `json:"workingDir,omitempty"` + // Node specification. 
Leave empty for testing cluster + // This does not include hostlist (generated automatically) + // +optional + Nodespec string `json:"nodespec,omitempty"` + // PullAlways will always pull the container // +optional PullAlways bool `json:"pullAlways"` @@ -238,6 +243,10 @@ func (s *Slurm) Validate() bool { s.Spec.ClusterName = "linux" } + // Default node spec + if s.Spec.Node.Nodespec == "" { + s.Spec.Node.Nodespec = "RealMemory=1000 CPUs=1 State=UNKNOWN" + } // Along with a username and password if s.Spec.Database.DatabaseName == "" { s.Spec.Database.DatabaseName = "slurm_acct_db" diff --git a/config/crd/bases/flux-framework.org_slurms.yaml b/config/crd/bases/flux-framework.org_slurms.yaml index 70a03a1..88ddbb9 100644 --- a/config/crd/bases/flux-framework.org_slurms.yaml +++ b/config/crd/bases/flux-framework.org_slurms.yaml @@ -65,6 +65,11 @@ spec: default: ghcr.io/converged-computing/slurm description: Image to use for slurm type: string + nodespec: + description: |- + Node specification. Leave empty for testing cluster + This does not include hostlist (generated automatically) + type: string ports: description: |- Ports to be exposed to other containers in the cluster @@ -188,6 +193,11 @@ spec: default: ghcr.io/converged-computing/slurm description: Image to use for slurm type: string + nodespec: + description: |- + Node specification. Leave empty for testing cluster + This does not include hostlist (generated automatically) + type: string ports: description: |- Ports to be exposed to other containers in the cluster @@ -266,6 +276,11 @@ spec: default: ghcr.io/converged-computing/slurm description: Image to use for slurm type: string + nodespec: + description: |- + Node specification. 
Leave empty for testing cluster + This does not include hostlist (generated automatically) + type: string ports: description: |- Ports to be exposed to other containers in the cluster diff --git a/controllers/slurm/templates.go b/controllers/slurm/templates.go index 64b4202..5f92c63 100644 --- a/controllers/slurm/templates.go +++ b/controllers/slurm/templates.go @@ -50,6 +50,7 @@ type ConfigTemplate struct { ControlHost string DatabaseHost string Hostlist string + Nodespec string } // combineTemplates into one "start" @@ -124,6 +125,7 @@ func generateConfig(cluster *api.Slurm, startTemplate string) (string, error) { DatabaseHost: database, DaemonHost: daemon, Hostlist: generateHostlist(cluster), + Nodespec: cluster.Spec.Worker.Nodespec, } // Wrap the named template to identify it later diff --git a/controllers/slurm/templates/slurm.conf b/controllers/slurm/templates/slurm.conf index 104a198..3ac6e6d 100644 --- a/controllers/slurm/templates/slurm.conf +++ b/controllers/slurm/templates/slurm.conf @@ -89,8 +89,7 @@ AccountingStoragePort=6819 #AccountingStorageUser= # # COMPUTE NODES -# TODO customize memory here? -NodeName={{ .Hostlist }} RealMemory=1000 State=UNKNOWN +NodeName={{ .Hostlist }} {{ .Spec.Node.Nodespec }} # # PARTITIONS PartitionName=normal Default=yes Nodes={{ .Hostlist }} Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP \ No newline at end of file diff --git a/examples/dist/slurm-operator-arm.yaml b/examples/dist/slurm-operator-arm.yaml index 19a4857..c8fd4b1 100644 --- a/examples/dist/slurm-operator-arm.yaml +++ b/examples/dist/slurm-operator-arm.yaml @@ -77,6 +77,11 @@ spec: default: ghcr.io/converged-computing/slurm description: Image to use for slurm type: string + nodespec: + description: |- + Node specification. 
Leave empty for testing cluster + This does not include hostlist (generated automatically) + type: string ports: description: |- Ports to be exposed to other containers in the cluster @@ -200,6 +205,11 @@ spec: default: ghcr.io/converged-computing/slurm description: Image to use for slurm type: string + nodespec: + description: |- + Node specification. Leave empty for testing cluster + This does not include hostlist (generated automatically) + type: string ports: description: |- Ports to be exposed to other containers in the cluster @@ -277,6 +287,11 @@ spec: default: ghcr.io/converged-computing/slurm description: Image to use for slurm type: string + nodespec: + description: |- + Node specification. Leave empty for testing cluster + This does not include hostlist (generated automatically) + type: string ports: description: |- Ports to be exposed to other containers in the cluster diff --git a/examples/dist/slurm-operator.yaml b/examples/dist/slurm-operator.yaml index b33d4c9..e93fa3f 100644 --- a/examples/dist/slurm-operator.yaml +++ b/examples/dist/slurm-operator.yaml @@ -77,6 +77,11 @@ spec: default: ghcr.io/converged-computing/slurm description: Image to use for slurm type: string + nodespec: + description: |- + Node specification. Leave empty for testing cluster + This does not include hostlist (generated automatically) + type: string ports: description: |- Ports to be exposed to other containers in the cluster @@ -200,6 +205,11 @@ spec: default: ghcr.io/converged-computing/slurm description: Image to use for slurm type: string + nodespec: + description: |- + Node specification. Leave empty for testing cluster + This does not include hostlist (generated automatically) + type: string ports: description: |- Ports to be exposed to other containers in the cluster @@ -277,6 +287,11 @@ spec: default: ghcr.io/converged-computing/slurm description: Image to use for slurm type: string + nodespec: + description: |- + Node specification. 
Leave empty for testing cluster + This does not include hostlist (generated automatically) + type: string ports: description: |- Ports to be exposed to other containers in the cluster From 27d8ca4aa89ee5b738496944ba83ee6c098613cd Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 22 Apr 2026 20:46:32 -0700 Subject: [PATCH 7/8] make size simpler - just is number of worker nodes Signed-off-by: vsoch --- api/v1alpha1/slurm_types.go | 12 +++------- .../crd/bases/flux-framework.org_slurms.yaml | 2 +- controllers/slurm/jobset.go | 5 ++-- controllers/slurm/templates.go | 2 +- docker/Dockerfile | 19 ++++++++++++++- docker/Dockerfile.ubuntu | 23 ++++++++++++++++--- examples/dist/slurm-operator.yaml | 2 +- 7 files changed, 46 insertions(+), 19 deletions(-) diff --git a/api/v1alpha1/slurm_types.go b/api/v1alpha1/slurm_types.go index 325da5f..3abb333 100644 --- a/api/v1alpha1/slurm_types.go +++ b/api/v1alpha1/slurm_types.go @@ -63,7 +63,7 @@ type SlurmSpec struct { // +optional SlurmVersion string `json:"slurmVersion,omitempty"` - // Size of the slurm (1 server + (N-1) nodes) + // Size is number of worker nodes Size int32 `json:"size"` // Interactive mode keeps the cluster running @@ -230,8 +230,8 @@ func (s *Slurm) SelectorName() string { // Validate the slurm func (s *Slurm) Validate() bool { - if s.WorkerNodes() < 1 { - fmt.Printf("😥️ Slurm cluster must have at least one worker node, Size >= 2.\n") + if s.Spec.Size < 1 { + fmt.Printf("😥️ Slurm cluster must have 1 or more worker nodes.\n") return false } // Ensure we have the default image set @@ -258,12 +258,6 @@ func (s *Slurm) Validate() bool { return true } -// WorkerNodes returns the number of worker nodes -// At this point we've already validated size is >= 1 -func (s *Slurm) WorkerNodes() int32 { - return s.Spec.Size - 1 -} - // WorkerNode returns the worker node (if defined) or falls back to the server func (s *Slurm) WorkerNode() Node { diff --git a/config/crd/bases/flux-framework.org_slurms.yaml 
b/config/crd/bases/flux-framework.org_slurms.yaml index 88ddbb9..cdb7b94 100644 --- a/config/crd/bases/flux-framework.org_slurms.yaml +++ b/config/crd/bases/flux-framework.org_slurms.yaml @@ -244,7 +244,7 @@ spec: description: Resources include limits and requests type: object size: - description: Size of the slurm (1 server + (N-1) nodes) + description: Size is number of worker nodes format: int32 type: integer slurmVersion: diff --git a/controllers/slurm/jobset.go b/controllers/slurm/jobset.go index 15c0a59..9505ba8 100644 --- a/controllers/slurm/jobset.go +++ b/controllers/slurm/jobset.go @@ -93,9 +93,8 @@ func (r *SlurmReconciler) newJobSet( } - // Create a cluster (JobSet) with workers (required) - workerNodes := cluster.WorkerNodes() - workerJob, err := r.getJob(cluster, cluster.WorkerNode(), workerNodes, "w", true) + // Create a cluster (JobSet) with workers + workerJob, err := r.getJob(cluster, cluster.WorkerNode(), cluster.Spec.Size, "w", true) if err != nil { r.Log.Error(err, "There was an error getting the worker ReplicatedJob") return &jobs, err diff --git a/controllers/slurm/templates.go b/controllers/slurm/templates.go index 5f92c63..6df9402 100644 --- a/controllers/slurm/templates.go +++ b/controllers/slurm/templates.go @@ -95,7 +95,7 @@ func generateHostlist(cluster *api.Slurm) string { hosts := "" serviceName := cluster.ServiceName() - for i := 0; i < int(cluster.WorkerNodes()); i++ { + for i := 0; i < int(cluster.Spec.Size); i++ { if hosts == "" { hosts = fmt.Sprintf("%s-w-0-%d.%s.%s.svc.cluster.local", cluster.Name, i, serviceName, cluster.Namespace) } else { diff --git a/docker/Dockerfile b/docker/Dockerfile index 5d483bf..2950be9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -39,12 +39,22 @@ RUN set -ex \ bash-completion \ vim-enhanced \ http-parser-devel \ + hwloc-devel \ + libevent-devel \ json-c-devel \ && yum clean all \ && rm -rf /var/cache/yum RUN pip3 install Cython nose +RUN set -x \ + && wget 
https://github.com/openpmix/openpmix/releases/download/v3.2.3/pmix-3.2.3.tar.gz \ + && tar -xvf pmix-3.2.3.tar.gz \ + && cd pmix-3.2.3 \ + && ./configure --prefix=/opt/pmix --disable-static \ + && make -j$(nproc) install \ + && cd .. && rm -rf pmix-3.2.3* + RUN set -ex \ && wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-${ARCH}" \ && wget -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-${ARCH}.asc" \ @@ -58,8 +68,15 @@ RUN set -ex \ RUN set -x \ && git clone -b ${SLURM_TAG} --single-branch --depth=1 https://github.com/SchedMD/slurm.git \ && pushd slurm \ + && export CFLAGS="-I/usr/include" \ + && export LDFLAGS="-L/usr/lib/$(uname -m)-linux-gnu" \ && ./configure --enable-debug --prefix=/usr --sysconfdir=/etc/slurm \ - --with-mysql_config=/usr/bin --libdir=/usr/lib64 \ + --with-mysql_config=/usr/bin \ + --libdir=/usr/lib \ + --with-pmix=/opt/pmix \ + --with-hwloc=/usr \ + --without-hdf5 \ + --with-libevent=/usr \ && make install \ && install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \ && install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \ diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index 3582e1a..0ea2ef1 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -40,6 +40,8 @@ RUN set -ex \ build-essential \ pkg-config \ gosu \ + libhwloc-dev \ + libevent-dev \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* @@ -47,11 +49,26 @@ RUN set -ex \ # If pip install fails, use --break-system-packages or install in a venv. RUN pip3 install --break-system-packages Cython nose +RUN set -x \ + && wget https://github.com/openpmix/openpmix/releases/download/v3.2.3/pmix-3.2.3.tar.gz \ + && tar -xvf pmix-3.2.3.tar.gz \ + && cd pmix-3.2.3 \ + && ./configure --prefix=/opt/pmix --disable-static \ + && make -j$(nproc) install \ + && cd .. 
&& rm -rf pmix-3.2.3* + RUN set -x \ && git clone -b ${SLURM_TAG} --single-branch --depth=1 https://github.com/SchedMD/slurm.git \ - && cd slurm \ + && export CFLAGS="-I/usr/include" \ + && export LDFLAGS="-L/usr/lib/$(uname -m)-linux-gnu" \ + && cd ./slurm \ && ./configure --enable-debug --prefix=/usr --sysconfdir=/etc/slurm \ - --with-mysql_config=/usr/bin --libdir=/usr/lib \ + --with-mysql_config=/usr/bin \ + --libdir=/usr/lib \ + --with-pmix=/opt/pmix \ + --without-hdf5 \ + --with-hwloc=/usr \ + --with-libevent=/usr \ && make install \ && install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \ && install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \ @@ -96,4 +113,4 @@ EXPOSE 6717 EXPOSE 6718 EXPOSE 6719 -CMD ["slurmdbd"] \ No newline at end of file +CMD ["slurmdbd"] diff --git a/examples/dist/slurm-operator.yaml b/examples/dist/slurm-operator.yaml index e93fa3f..0023f39 100644 --- a/examples/dist/slurm-operator.yaml +++ b/examples/dist/slurm-operator.yaml @@ -256,7 +256,7 @@ spec: description: Resources include limits and requests type: object size: - description: Size of the slurm (1 server + (N-1) nodes) + description: Size is number of worker nodes format: int32 type: integer slurmVersion: From 55b835552ca07a36fd30a04d17960d60db8fd7f0 Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 22 Apr 2026 22:32:41 -0700 Subject: [PATCH 8/8] ci: do not build arm on pr Signed-off-by: vsoch --- .github/workflows/build-deploy.yaml | 2 ++ .github/workflows/slurm-containers.yaml | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-deploy.yaml b/.github/workflows/build-deploy.yaml index d517dee..e153325 100644 --- a/.github/workflows/build-deploy.yaml +++ b/.github/workflows/build-deploy.yaml @@ -27,12 +27,14 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} - name: Add custom buildx ARM builder + if: (github.event_name != 'pull_request') run: | docker buildx create --name armbuilder docker buildx use 
armbuilder docker buildx inspect --bootstrap - name: Deploy Container + if: (github.event_name != 'pull_request') run: make arm-deploy build: diff --git a/.github/workflows/slurm-containers.yaml b/.github/workflows/slurm-containers.yaml index 4b15bd5..a54ecfe 100644 --- a/.github/workflows/slurm-containers.yaml +++ b/.github/workflows/slurm-containers.yaml @@ -12,7 +12,7 @@ jobs: env: container: ghcr.io/converged-computing/slurm runs-on: ubuntu-latest - name: make and build arm + name: build arm slurm ubuntu steps: - name: Checkout Repository uses: actions/checkout@v3 @@ -28,12 +28,14 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} - name: Add custom buildx ARM builder + if: (github.event_name != 'pull_request') run: | docker buildx create --name armbuilder docker buildx use armbuilder docker buildx inspect --bootstrap - name: Build and Deploy Container + if: (github.event_name != 'pull_request') run: docker buildx build -f docker/Dockerfile.ubuntu --build-arg ARCH=arm64 --platform linux/arm64 --push -t ${{ env.container }}:ubuntu-arm ./docker build-arm: @@ -56,12 +58,14 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} - name: Add custom buildx ARM builder + if: (github.event_name != 'pull_request') run: | docker buildx create --name armbuilder docker buildx use armbuilder docker buildx inspect --bootstrap - name: Build and Deploy Container + if: (github.event_name != 'pull_request') run: docker buildx build -f docker/Dockerfile --build-arg ARCH=arm64 --platform linux/arm64 --push -t ${{ env.container }}:arm ./docker