diff --git a/.gitignore b/.gitignore index 8f8ed15d..e876b4c1 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,21 @@ wandb/ checkpoints/ rollout_results/ reference_projects/ -docs/ \ No newline at end of file +docs/ + +# SLURM job outputs +*.out +*.err + +# Cluster results +results/ + +# Private cluster configs and logs +worklogs/ + +# Cluster artifacts (generated locally) +cluster.yaml +opencode.def +test_results.log +tmp_defs/ +sif_images/ diff --git a/examples/calculator/README.md b/examples/calculator/README.md index 2ebee5e6..895d56a5 100644 --- a/examples/calculator/README.md +++ b/examples/calculator/README.md @@ -101,4 +101,70 @@ Each rollout then prepares a fresh workspace by: - installing the harness CLI or SDK for that run - creating `/polar/session/workspace` - uploading `calculator.py` and `test_calculator.py` -- initializing a git repo used by the evaluator \ No newline at end of file +- initializing a git repo used by the evaluator + +## Cluster Deployment (SLURM) + +For running on a SLURM cluster with Apptainer containers and vLLM inference. +See [examples/slurm/README.md](../slurm/README.md) for full documentation. + +### 1. Configure + +```bash +cp examples/slurm/cluster.yaml.example my-cluster.yaml +# Edit my-cluster.yaml with your cluster details +``` + +### 2. One-Time Setup + +```bash +polar cluster setup -c my-cluster.yaml +``` + +### 3. Build SIF Image + +```bash +# Single harness: +polar cluster build-sif -c my-cluster.yaml --example calculator --harness opencode + +# Multiple harnesses: +polar cluster build-sif -c my-cluster.yaml --example calculator --harness opencode,codex,swe_agent +``` + +### 4. Start Services + +```bash +polar cluster serve -c my-cluster.yaml +``` + +Once services are ready, the command prints the job ID and a sample `submit-task` command. + +### 5. Submit Tasks + +```bash +# Use the job ID from step 4 +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example calculator --harness opencode + +# Multiple harnesses against the same running service +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example calculator --harness codex +``` + +### 6. Stop Services + +```bash +scancel JOB_ID +``` + +### 7. Collect Results + +```bash +polar cluster sync -c my-cluster.yaml +``` + +**One-shot alternative** — start services, run tasks, and exit in one command: + +```bash +polar cluster launch -c my-cluster.yaml --example calculator --harness opencode +``` diff --git a/examples/calculator/swe_agent/Dockerfile b/examples/calculator/swe_agent/Dockerfile new file mode 100644 index 00000000..47c53664 --- /dev/null +++ b/examples/calculator/swe_agent/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.12-slim + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + bash \ + ca-certificates \ + curl \ + git \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# Install SWE-agent from git (PyPI version has missing dependency) +RUN pip install --no-cache-dir "git+https://github.com/SWE-agent/SWE-agent.git" \ + && SITE=$(python -c "import site; print(site.getsitepackages()[0])") \ + && git clone --depth 1 https://github.com/SWE-agent/SWE-agent.git /tmp/swe-agent-src \ + && cp -r /tmp/swe-agent-src/config "$SITE/config" \ + && cp -r /tmp/swe-agent-src/tools "$SITE/tools" \ + && mkdir -p "$SITE/trajectories" \ + && rm -rf /tmp/swe-agent-src + +# Pre-install tool dependencies so install.sh is a no-op at runtime +# (compute nodes may lack internet access). 
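+# NOTE (assumption): the explicit tree-sitter pin below targets the pre-0.22
+# binding API that the tree-sitter-languages wheels are built against; relax
+# the pin if the harness's tooling updates that dependency.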
+RUN pip install --no-cache-dir 'tree-sitter==0.21.3' 'tree-sitter-languages' + +WORKDIR /polar/session diff --git a/examples/slurm/README.md b/examples/slurm/README.md new file mode 100644 index 00000000..f000cc1b --- /dev/null +++ b/examples/slurm/README.md @@ -0,0 +1,415 @@ +# Polar on SLURM + +Deploy Polar agent rollout jobs on a SLURM cluster using Apptainer containers and vLLM inference. + +## Prerequisites + +- SSH access to a SLURM login node +- SLURM account with GPU allocation +- Shared filesystem (Lustre, GPFS, etc.) accessible from all nodes +- Apptainer on the cluster (for running agent containers) +- Python 3.10+ on the cluster + +## Install Polar + +From a checkout of this repository (laptop or cluster login node): + +```bash +python3 -m venv .venv +source .venv/bin/activate +uv pip install -e . +polar --help +``` + +After `polar cluster setup`, the cluster workspace venv also provides the `polar` command. + +## Quick Start + +### 1. Configure + +```bash +cp examples/slurm/cluster.yaml.example my-cluster.yaml +# Edit my-cluster.yaml with your cluster details +``` + +Key fields to set: +- `slurm.login_node` -- SSH hostname of your login node +- `slurm.account` -- your SLURM account string +- `slurm.partition` -- SLURM partition name +- `paths.workspace` -- base directory on shared filesystem + +### 2. One-Time Setup + +```bash +polar cluster setup -c my-cluster.yaml +``` + +This syncs code to the cluster and: +- Creates directories (sif_images/, results/, apptainer_cache/) +- Creates a Python venv and installs Polar +- Verifies Apptainer and CUDA are accessible + +### 3. Build SIF Images + +```bash +# Build a single harness +polar cluster build-sif -c my-cluster.yaml --example calculator --harness opencode + +# Build multiple harnesses at once +polar cluster build-sif -c my-cluster.yaml --example calculator --harness opencode,codex,swe_agent +``` + +### 4. Start Services + +```bash +# Start vLLM + rollout + gateway (waits until ready) +polar cluster serve -c my-cluster.yaml + +# Override model or resources +polar cluster serve -c my-cluster.yaml --model "Qwen/Qwen3.5-72B" --nodes 2 + +# Preview without submitting +polar cluster serve -c my-cluster.yaml --dry-run +``` + +Once ready, the command prints the job ID and a sample `submit-task` command. + +### 5. Submit Tasks + +```bash +# Submit against the running service (use job ID from step 4) +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example calculator --harness opencode + +# Multiple harnesses can reuse the same running service +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example calculator --harness codex +``` + +### 6. Stop Services + +```bash +scancel JOB_ID +``` + +### 7. One-Shot Mode (Alternative) + +If you prefer a single command that starts services, runs tasks, and exits: + +```bash +polar cluster launch -c my-cluster.yaml --example calculator --harness opencode +``` + +Services stop after tasks complete. + +### 8. Monitor + +```bash +# Check job status +polar cluster status -c my-cluster.yaml + +# Or directly via SSH +ssh squeue -u \$USER +``` + +### 9. 
Sync Results + +```bash +# Sync everything +polar cluster sync -c my-cluster.yaml + +# Sync a specific job +polar cluster sync -c my-cluster.yaml --job-id 12345 + +# Sync only code changes +polar cluster sync -c my-cluster.yaml --code-only +``` + +## Example Workflows + +### Calculator (Quick Validation) + +Build SIFs and submit tasks for one or more harnesses: + +```bash +polar cluster build-sif -c my-cluster.yaml \ + --example calculator --harness opencode,codex,swe_agent + +polar cluster serve -c my-cluster.yaml + +# Use the job ID printed by serve +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example calculator --harness opencode +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example calculator --harness codex +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example calculator --harness swe_agent +``` + +**Supported calculator harnesses:** + +| Harness | API | vLLM Compatible | Status | +|---------|-----|-----------------|--------| +| opencode | OpenAI Chat | Yes | Verified | +| codex | OpenAI Responses | Yes | Verified | +| swe_agent | OpenAI Chat | Yes | Verified | +| qwen_code | OpenAI Chat | Yes | Available | +| openhands_sdk | OpenAI Chat | Yes | Available | +| claude_code | Anthropic Messages | No (needs native API) | Local only | +| gemini_cli | Google Generative AI | No (needs native API) | Local only | + +### SWE-Gym (10 Curated Tasks) + +Each SWE-Gym instance needs its own SIF: + +```bash +# Build all 10 sample SIFs +polar cluster build-sif -c my-cluster.yaml \ + --example swegym --harness swe_agent + +# Start services and submit +polar cluster serve -c my-cluster.yaml +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example swegym --harness swe_agent \ + --timeout-seconds 2400 + +# Or a single instance +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example swegym --harness swe_agent \ + --timeout-seconds 2400 --instance-id getmoto__moto-7365 +``` + +Sample instances: + +| Instance | Repo | +|----------|------| +| `getmoto__moto-7365` | getmoto/moto | +| `python__mypy-10392` | python/mypy | +| `conan-io__conan-13721` | conan-io/conan | +| `iterative__dvc-1809` | iterative/dvc | +| `dask__dask-10441` | dask/dask | +| `pydantic__pydantic-8072` | pydantic/pydantic | +| `pandas-dev__pandas-58335` | pandas-dev/pandas | +| `facebookresearch__hydra-1783` | facebookresearch/hydra | +| `bokeh__bokeh-13636` | bokeh/bokeh | +| `Project-MONAI__MONAI-2238` | Project-MONAI/MONAI | + +### SWE-bench Verified (500 Tasks) + +Full benchmark evaluation with per-instance containers: + +```bash +# Cache dataset (once) +python -c "from examples.swebench_verified.dataset import load_swebench_verified; load_swebench_verified()" + +# Build per-instance SIFs +polar cluster build-sif -c my-cluster.yaml \ + --example swebench_verified --harness opencode \ + --instance-id django__django-15098 + +# Start services and submit +polar cluster serve -c my-cluster.yaml +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example swebench_verified --harness opencode \ + --timeout-seconds 3600 --instance-id django__django-15098 +``` + +### SWE-Gym Slime GRPO (RL Training) + +Distributed RL training using Polar for rollout and Slime + Megatron for GRPO training. +See [examples/swegym_slime_grpo/README.md](../swegym_slime_grpo/README.md) for detailed setup. 
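+
+Before submitting, it can help to sanity-check that the prompt JSONL parses
+cleanly, since a malformed line would otherwise only surface inside the
+training job. A minimal stdlib sketch (the path matches the command below; the
+per-record schema is deliberately not assumed):
+
+```python
+import json
+from pathlib import Path
+
+# Prompt file later passed to `polar cluster train --prompt-data ...`.
+prompt_path = Path("examples/swegym_slime_grpo/swegym_10_tasks.jsonl")
+
+records = []
+for lineno, line in enumerate(prompt_path.read_text().splitlines(), start=1):
+    if not line.strip():
+        continue  # tolerate blank lines
+    try:
+        records.append(json.loads(line))
+    except json.JSONDecodeError as exc:
+        raise SystemExit(f"{prompt_path}:{lineno} is not valid JSON: {exc}")
+
+print(f"{len(records)} training prompts in {prompt_path}")
+```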
+ +```bash +# Build training SIF (uses slimerl/slime Docker image as base) +polar cluster build-sif -c my-cluster.yaml --example train + +# Submit training job +polar cluster train -c my-cluster.yaml \ + --polar-config examples/swegym_slime_grpo/polar_config.yaml \ + --prompt-data examples/swegym_slime_grpo/swegym_10_tasks.jsonl \ + --num-rollouts 5 +``` + +## CLI Commands + +### `polar cluster setup` + +One-time cluster initialization. + +### `polar cluster build-sif` + +Build Apptainer SIF images on the cluster. + +| Option | Default | Description | +|--------|---------|-------------| +| `--example` | calculator | Task type: `calculator`, `swegym`, `swebench_verified`, `train` | +| `--harness` | opencode | Agent: `opencode`, `codex`, `swe_agent`, etc. | +| `--instance-id` | -- | Specific instance (repeatable, for swegym/swebench) | +| `--force` | -- | Rebuild even if SIF exists | + +### `polar cluster serve` + +Start vLLM + rollout + gateway services and wait until ready. + +| Option | Default | Description | +|--------|---------|-------------| +| `--model` | Qwen/Qwen3.5-27B | Model name for vLLM | +| `--nodes` | 1 | Number of SLURM nodes | +| `--gpus` | 8 | GPUs per node | +| `--time` | 04:00:00 | Job time limit | +| `--no-sync` | -- | Skip rsync to cluster | +| `--no-wait` | -- | Submit and return immediately | +| `--wait-timeout` | 600 | Seconds to wait for services | +| `--dry-run` | -- | Print sbatch command only | + +### `polar cluster submit-task` + +Submit tasks to a running service. + +| Option | Default | Description | +|--------|---------|-------------| +| `--job-id` | (required) | SLURM job ID from `serve` | +| `--example` | calculator | Task: `calculator`, `swegym`, `swebench_verified` | +| `--harness` | opencode | Agent: `opencode`, `codex`, `swe_agent`, etc. | +| `--num-rollouts` | 4 | Rollouts per task | +| `--timeout-seconds` | 900 | Per-session timeout | +| `--instance-id` | -- | Specific instance (repeatable) | + +### `polar cluster launch` (one-shot) + +Starts services, runs tasks, and exits in one SLURM job. + +| Option | Default | Description | +|--------|---------|-------------| +| `--example` | calculator | Task: `calculator`, `swegym`, `swebench_verified` | +| `--harness` | opencode | Agent: `opencode`, `codex`, `swe_agent`, etc. | +| `--model` | Qwen/Qwen3.5-27B | Model name for vLLM | +| `--nodes` | 1 | Number of SLURM nodes | +| `--gpus` | 8 | GPUs per node | +| `--time` | 04:00:00 | Job time limit | +| `--num-rollouts` | 4 | Rollouts per task | +| `--timeout-seconds` | 900 | Per-session timeout | +| `--instance-id` | -- | Specific instance (repeatable) | +| `--no-sync` | -- | Skip rsync to cluster | +| `--dry-run` | -- | Print sbatch command only | + +### `polar cluster train` + +Submit RL training job (Slime + Megatron GRPO). + +| Option | Default | Description | +|--------|---------|-------------| +| `--polar-config` | (required) | Path to polar_config.yaml | +| `--prompt-data` | (required) | Path to JSONL training data | +| `--hf-checkpoint` | Qwen/Qwen3-4B | HuggingFace model checkpoint | +| `--num-rollouts` | 5 | Training steps | +| `--no-sync` | -- | Skip rsync to cluster | +| `--no-wait` | -- | Submit and return immediately | +| `--dry-run` | -- | Print sbatch command only | + +### `polar cluster status` + +Show job status and running services. + +### `polar cluster sync` + +Sync code and results between local and cluster. 
+ +| Option | Default | Description | +|--------|---------|-------------| +| `--results-only` | -- | Only sync results (no code) | +| `--code-only` | -- | Only sync code (no results) | +| `--job-id` | -- | Sync a specific job's results | + +## Architecture + +### Two-Phase (serve + submit-task) + +The `serve` command submits a SLURM job that: + +``` +1. Discover hostnames (scontrol show hostnames) +2. Generate topology.yaml +3. Start vLLM server (GPU, ~2-5min to load model) +4. Start rollout service (CPU) +5. Start gateway node(s) (CPU, manages Apptainer containers) +6. Write .services_ready sentinel +7. Wait indefinitely (until scancel or SLURM timeout) +``` + +The `submit-task` command discovers the running service via the sentinel file and submits tasks. + +### One-Shot (launch) + +The `launch` command bundles everything into one SLURM job (steps 1-5 above + submit tasks + wait + cleanup). + +For multi-node jobs: +- Node 0: vLLM + rollout + gateway +- Node 1+: additional gateway nodes (via `srun --overlap`) + +## Results Structure + +``` +polar-serve_/ + topology.yaml + .services_ready + logs/ + vllm.log + rollout.log + gateway_node-00.log + rollout_results/ + task_/ses_.json + tasks/ + request.json # (calculator) + response.json # (calculator) + manifest.json # (swegym/swebench) + / # (swegym/swebench: one dir per instance) + request.json + response.json + summary.json # (swegym/swebench) +``` + +## Tuning Parameters + +| Parameter | Default | Notes | +|-----------|---------|-------| +| `--num-rollouts` | 4 | Number of parallel sessions per task | +| `--timeout-seconds` | 900 | Per-session timeout. Use 2400 for swegym, 3600 for swebench | +| `max_model_len` | 16384 | In cluster.yaml `model` section. Increase for complex tasks | +| `max_num_seqs` | 64 | Max concurrent vLLM sequences | +| `gpu_memory_utilization` | 0.90 | Reduce if OOM | +| `tensor_parallel_size` | 8 | Match `gpus_per_node` | +| `tool_call_parser` | qwen3_xml | Must match model. qwen3_xml for Qwen3.5, hermes for others | + +## Switching Models + +Edit your `cluster.yaml`: + +```yaml +model: + name: "Qwen/Qwen3.5-27B" # Change to your model + tensor_parallel_size: 8 # Adjust for model size + tool_call_parser: "qwen3_xml" # Match model's tool call format +``` + +Or override on the command line: + +```bash +polar cluster serve -c my-cluster.yaml --model "Qwen/Qwen3.5-72B" +``` + +**Note**: The model must be cached in `$HF_HOME` on the cluster (compute nodes run with `HF_HUB_OFFLINE=1`). To cache a new model: + +```bash +huggingface-cli download Qwen/Qwen3.5-72B +``` + +## Backend Support + +The cluster config supports multiple backends via the `backend` field: + +| Backend | Status | Description | +|---------|--------|-------------| +| `slurm` | Supported | SSH + sbatch job submission | +| `local` | Stub | Run services locally (use example scripts) | +| `k8s` | Planned | Kubernetes pod scheduling | diff --git a/examples/slurm/cluster.yaml.example b/examples/slurm/cluster.yaml.example new file mode 100644 index 00000000..1b550f8b --- /dev/null +++ b/examples/slurm/cluster.yaml.example @@ -0,0 +1,71 @@ +# Polar cluster configuration. +# +# Copy this file and fill in your cluster details: +# cp examples/slurm/cluster.yaml.example my-cluster.yaml +# # Edit my-cluster.yaml +# polar cluster launch -c my-cluster.yaml +# +# The same config works for local, SLURM, and (future) Kubernetes backends. +# Set 'backend' to select the deployment target. 
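+#
+# Minimum fields to fill in for a SLURM run: slurm.login_node, slurm.account,
+# slurm.partition, and paths.workspace; the remaining sections have defaults below.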
+ +# ── Backend ───────────────────────────────────────────────────────────────── +backend: slurm # "local" | "slurm" | "k8s" + +# ── SLURM connection (required when backend=slurm) ───────────────────────── +slurm: + login_node: "" # SSH host for job submission + account: "" # SLURM account/project string + partition: "" # SLURM partition name + +# ── Paths on the cluster filesystem ───────────────────────────────────────── +paths: + # Base workspace directory (required). All other paths derive from this. + workspace: "/path/to/your/workspace" + + # Override any derived path if your layout differs: + # polar_root: "${workspace}/polar" + # code: "${polar_root}/ProRL-Agent-Server" + # sif_dir: "${polar_root}/sif_images" + # results: "${polar_root}/results" + # venv: "${polar_root}/.venv" + + # Optional: custom binary paths (leave commented if system-installed) + # apptainer_bin_dir: "/path/to/apptainer/usr/bin" + # cuda_home: "/path/to/cuda-12.x" + +# ── Model configuration ────────────────────────────────────────────────────── +model: + name: "Qwen/Qwen3.5-27B" # HuggingFace model ID + tensor_parallel_size: 8 # TP size for vLLM + max_model_len: 16384 + gpu_memory_utilization: 0.90 + max_num_seqs: 64 + tool_call_parser: "qwen3_xml" # qwen3_xml, hermes, etc. + +# ── Task defaults ──────────────────────────────────────────────────────────── +task: + example: "calculator" # calculator or swegym + harness: "opencode" # opencode, codex, swe_agent, etc. + num_rollouts: 4 + timeout_seconds: 900 + +# ── Resource allocation ────────────────────────────────────────────────────── +resources: + nodes: 1 + gpus_per_node: 8 + cpus_per_task: 64 + mem: "512G" + time: "04:00:00" # HH:MM:SS job time limit + +# ── Service ports ──────────────────────────────────────────────────────────── +ports: + vllm: 18000 + rollout: 18080 + gateway_base: 18100 + +# ── Gateway tuning ─────────────────────────────────────────────────────────── +gateway: + max_init_workers: 8 + max_run_workers: 4 + max_postrun_workers: 4 + ready_buffer_target: 4 diff --git a/examples/slurm/setup_cluster.sh b/examples/slurm/setup_cluster.sh new file mode 100644 index 00000000..272ef07f --- /dev/null +++ b/examples/slurm/setup_cluster.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# setup_cluster.sh — One-time cluster setup for Polar on SLURM. +# +# Run via: +# polar cluster setup -c my-cluster.yaml +# +# Or manually on the cluster: +# export POLAR_WORKSPACE=/path/to/workspace +# bash examples/slurm/setup_cluster.sh +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +# Source env.sh from the cluster templates +source "${REPO_ROOT}/src/polar/cluster/templates/env.sh" + +echo "===============================================================" +echo "Polar Cluster Setup" +echo "===============================================================" + +# ── 1. Create directory structure ────────────────────────────────────────────── +echo "[setup] Creating directories..." +mkdir -p "${POLAR_ROOT}" +mkdir -p "${POLAR_SIFS}" +mkdir -p "${POLAR_RESULTS}" +mkdir -p "${APPTAINER_CACHEDIR}" +echo " POLAR_ROOT: ${POLAR_ROOT}" +echo " POLAR_SIFS: ${POLAR_SIFS}" +echo " POLAR_RESULTS: ${POLAR_RESULTS}" +echo " APPTAINER_CACHEDIR: ${APPTAINER_CACHEDIR}" + +# ── 2. Check Apptainer ──────────────────────────────────────────────────────── +echo "" +echo "[setup] Checking Apptainer..." 
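+# Missing Apptainer is non-fatal at this stage; it is required later for
+# `polar cluster build-sif`, either on PATH or via paths.apptainer_bin_dir.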
+if command -v apptainer &>/dev/null; then + echo " Apptainer: $(which apptainer)" + apptainer --version +else + echo " WARNING: Apptainer not found on PATH." + echo " Set paths.apptainer_bin_dir in your cluster.yaml." +fi + +# ── 3. Create/update Python venv ────────────────────────────────────────────── +echo "" +echo "[setup] Setting up Python venv at ${POLAR_VENV}..." +if [ ! -d "${POLAR_VENV}" ]; then + echo " Creating new venv..." + python3 -m venv "${POLAR_VENV}" +fi +source "${POLAR_VENV}/bin/activate" + +echo " Python: $(which python) ($(python --version))" + +# Install polar in editable mode +echo " Installing polar..." +pip install --upgrade pip +pip install -e "${POLAR_CODE}" 2>&1 | tail -5 + +# Check if vLLM is installed +if python -c "import vllm" 2>/dev/null; then + echo " vLLM: $(python -c 'import vllm; print(vllm.__version__)')" +else + echo "" + echo " WARNING: vLLM is not installed in this venv." + echo " To install: pip install vllm" +fi + +# ── 4. Verify polar CLI ─────────────────────────────────────────────────────── +echo "" +echo "[setup] Verifying polar CLI..." +polar --help > /dev/null 2>&1 && echo " polar CLI: OK" || echo " ERROR: polar CLI not working" + +# ── 5. Summary ───────────────────────────────────────────────────────────────── +echo "" +echo "===============================================================" +echo "Setup complete!" +echo "" +echo "Next steps:" +echo " 1. Build SIF images:" +echo " polar cluster build-sif -c cluster.yaml --example calculator --harness opencode" +echo "" +echo " 2. Submit a job:" +echo " polar cluster launch -c cluster.yaml" +echo "===============================================================" diff --git a/examples/swebench_verified/README.md b/examples/swebench_verified/README.md index 5dbc37b7..1385b0a9 100644 --- a/examples/swebench_verified/README.md +++ b/examples/swebench_verified/README.md @@ -87,3 +87,87 @@ uv run python examples/swebench_verified/submit_swebench_tasks.py \ --max-concurrent 4 \ --max-tasks 10 ``` + +Supported harness names: `claude_code`, `codex`, `opencode`, `openhands_sdk` + +## Cluster Deployment (SLURM) + +For running on a SLURM cluster with Apptainer containers and vLLM inference. +See [examples/slurm/README.md](../slurm/README.md) for full documentation. + +### 1. Configure + +```bash +cp examples/slurm/cluster.yaml.example my-cluster.yaml +# Edit my-cluster.yaml with your cluster details +``` + +### 2. Populate Dataset Cache + +The task runner needs the full SWE-bench Verified dataset cached locally. +Run once (requires `datasets` library): + +```bash +python -c "from examples.swebench_verified.dataset import load_swebench_verified; load_swebench_verified()" +``` + +### 3. Build SIF Images + +Each SWE-bench Verified instance needs a per-instance SIF: + +```bash +# Build SIF for a specific instance + harness +polar cluster build-sif -c my-cluster.yaml \ + --example swebench_verified --harness opencode \ + --instance-id django__django-15098 + +# Build multiple instances +polar cluster build-sif -c my-cluster.yaml \ + --example swebench_verified --harness opencode \ + --instance-id django__django-15098 \ + --instance-id sympy__sympy-18835 +``` + +### 4. Start Services + +```bash +polar cluster serve -c my-cluster.yaml +``` + +Once services are ready, the command prints the job ID. + +### 5. 
Submit Tasks + +```bash +# Submit a single instance (use job ID from step 4) +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example swebench_verified --harness opencode \ + --timeout-seconds 3600 --instance-id django__django-15098 + +# Submit multiple instances +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example swebench_verified --harness opencode \ + --timeout-seconds 3600 \ + --instance-id django__django-15098 \ + --instance-id sympy__sympy-18835 +``` + +### 6. Stop Services + +```bash +scancel JOB_ID +``` + +### 7. Collect Results + +```bash +polar cluster sync -c my-cluster.yaml +``` + +**One-shot alternative** — start services, run tasks, and exit in one command: + +```bash +polar cluster launch -c my-cluster.yaml \ + --example swebench_verified --harness opencode \ + --timeout-seconds 3600 --instance-id django__django-15098 +``` diff --git a/examples/swebench_verified/dataset.py b/examples/swebench_verified/dataset.py index 7078dd69..143915b8 100644 --- a/examples/swebench_verified/dataset.py +++ b/examples/swebench_verified/dataset.py @@ -12,7 +12,10 @@ DEFAULT_CACHE_PATH = Path.home() / ".cache" / "polar" / "swebench_verified.json" HARNESS_IMAGE_PREFIX = "polar-swebench" -SUPPORTED_HARNESSES = ("opencode", "codex", "claude_code") +SUPPORTED_HARNESSES = ( + "opencode", "codex", "claude_code", + "gemini_cli", "qwen_code", "swe_agent", "openhands_sdk", +) def sanitize_instance_id(instance_id: str) -> str: diff --git a/examples/swebench_verified/submit_swebench_tasks.py b/examples/swebench_verified/submit_swebench_tasks.py index 24be1290..f53fb025 100644 --- a/examples/swebench_verified/submit_swebench_tasks.py +++ b/examples/swebench_verified/submit_swebench_tasks.py @@ -34,6 +34,8 @@ "codex": "@openai/codex@latest", "opencode": "opencode-ai@latest", "claude_code": "@anthropic-ai/claude-code@latest", + "gemini_cli": "@google/gemini-cli@latest", + "qwen_code": "@qwen-code/qwen-code@latest", } _PREPARE_BASE = ( @@ -48,8 +50,18 @@ def prepare_command_for_harness(harness: str) -> str: - pkg = HARNESS_NPM_PACKAGE[harness] - return f"npm install -g {pkg} && {_PREPARE_BASE}" + if harness in HARNESS_NPM_PACKAGE: + pkg = HARNESS_NPM_PACKAGE[harness] + return f"npm install -g {pkg} && {_PREPARE_BASE}" + if harness == "swe_agent": + return ( + "source /opt/miniconda3/etc/profile.d/conda.sh && " + "conda activate polar-sweagent && " + f"{_PREPARE_BASE}" + ) + if harness == "openhands_sdk": + return _PREPARE_BASE + raise ValueError(f"Unknown harness: {harness}") def runtime_env_for_harness(harness: str) -> dict[str, str]: @@ -146,6 +158,31 @@ def select_instances(args: argparse.Namespace) -> list[dict[str, Any]]: return instances +def agent_settings_for_harness(harness: str) -> dict[str, Any]: + if harness == "swe_agent": + return { + "repo_path": "/polar/session/workspace", + "shell_preamble": ( + "source /opt/miniconda3/etc/profile.d/conda.sh && " + "conda activate polar-sweagent && " + "export PATH=/opt/miniconda3/envs/testbed/bin:$PATH" + ), + } + return {} + + +def agent_env_for_harness(harness: str) -> dict[str, str]: + if harness in ("openhands_sdk", "openhands"): + return {"WORKSPACE_BASE": "/polar/session/workspace"} + return {} + + +def runtime_kwargs_for_harness(harness: str) -> dict[str, Any]: + if harness == "swe_agent": + return {"fakeroot": True} + return {} + + def build_task_request( args: argparse.Namespace, *, @@ -154,6 +191,7 @@ def build_task_request( ) -> dict[str, Any]: instance_id = str(instance["instance_id"]) image = 
runtime_image_for_instance(instance_id) + kwargs = runtime_kwargs_for_harness(args.harness) return { "task_id": f"swebench-{args.harness}-{sanitize_instance_id(instance_id)}-{batch_id}", "instruction": str(instance["problem_statement"]).strip(), @@ -166,11 +204,12 @@ def build_task_request( "env": runtime_env_for_harness(args.harness), "network": "host", "workdir": "/polar/session/workspace", + **({"kwargs": kwargs} if kwargs else {}), }, "agent": { "harness": args.harness, - "settings": {}, - "env": {}, + "settings": agent_settings_for_harness(args.harness), + "env": agent_env_for_harness(args.harness), }, "builder": {"strategy": "prefix_merging"}, "evaluator": { diff --git a/examples/swegym/README.md b/examples/swegym/README.md index 67edf889..10441126 100644 --- a/examples/swegym/README.md +++ b/examples/swegym/README.md @@ -86,3 +86,63 @@ examples/swegym//batches// - The sample is text-only SWE-Gym data. - The submit helper extracts patches from `/polar/session/workspace`, then replays them onto `/testbed` for grading. - `swe_agent` uses a dedicated `polar-sweagent` environment inside the derived image. +- `openhands_sdk` only builds on benchmark images whose native Python is already compatible. + +## Cluster Deployment (SLURM) + +For running on a SLURM cluster with Apptainer containers and vLLM inference. +See [examples/slurm/README.md](../slurm/README.md) for full documentation. + +### 1. Configure + +```bash +cp examples/slurm/cluster.yaml.example my-cluster.yaml +# Edit my-cluster.yaml with your cluster details +``` + +### 2. Build SIF Images + +```bash +polar cluster build-sif -c my-cluster.yaml --example swegym --harness swe_agent +``` + +### 3. Start Services + +```bash +polar cluster serve -c my-cluster.yaml +``` + +Once services are ready, the command prints the job ID. + +### 4. Submit Tasks + +```bash +# All 10 sample instances (use job ID from step 3) +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example swegym --harness swe_agent \ + --timeout-seconds 2400 + +# Or a single instance +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example swegym --harness swe_agent \ + --timeout-seconds 2400 --instance-id getmoto__moto-7365 +``` + +### 5. Stop Services + +```bash +scancel JOB_ID +``` + +### 6. 
Collect Results + +```bash +polar cluster sync -c my-cluster.yaml +``` + +**One-shot alternative** — start services, run tasks, and exit in one command: + +```bash +polar cluster launch -c my-cluster.yaml --example swegym --harness swe_agent \ + --timeout-seconds 2400 +``` diff --git a/examples/swegym/sample_tasks.py b/examples/swegym/sample_tasks.py index 6cbba34e..65a5372c 100644 --- a/examples/swegym/sample_tasks.py +++ b/examples/swegym/sample_tasks.py @@ -152,3 +152,10 @@ def fetch_sample_instances( cache_file.parent.mkdir(parents=True, exist_ok=True) cache_file.write_text(json.dumps(ordered, indent=2, ensure_ascii=True, sort_keys=True)) return ordered + + +if __name__ == "__main__": + instances = fetch_sample_instances() + print(f"Cached {len(instances)} instances to {DEFAULT_CACHE_PATH}") + for inst in instances: + print(f" {inst['instance_id']}") diff --git a/pyproject.toml b/pyproject.toml index 1cc4a30c..590c3010 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,3 +40,6 @@ package-dir = {"" = "src"} [tool.setuptools.packages.find] where = ["src"] include = ["polar*"] + +[tool.setuptools.package-data] +"polar.cluster" = ["templates/*.sbatch", "templates/*.sh"] diff --git a/src/polar/agent/harnesses/codex.py b/src/polar/agent/harnesses/codex.py index 18e1afd8..cac1f0a9 100644 --- a/src/polar/agent/harnesses/codex.py +++ b/src/polar/agent/harnesses/codex.py @@ -16,7 +16,7 @@ class CodexHarness(BaseHarness): def __init__(self, agent_spec: AgentSpec) -> None: super().__init__(agent_spec) - self._codex_home = "$HOME/.codex" + self._codex_home = "/root/.codex" async def setup(self, runtime: BaseRuntime) -> None: await runtime.exec(f"mkdir -p {self._codex_home}") @@ -82,6 +82,7 @@ def run_steps(self, instruction: str) -> list[ExecInput]: return [ ExecInput( command=( + f"set -o pipefail && " f"codex exec {flags_str} -- {escaped} " f"2>&1 | tee {RUNTIME_AGENT_LOG_DIR}/codex.txt" ), diff --git a/src/polar/agent/harnesses/opencode.py b/src/polar/agent/harnesses/opencode.py index 48659f05..95517148 100644 --- a/src/polar/agent/harnesses/opencode.py +++ b/src/polar/agent/harnesses/opencode.py @@ -70,6 +70,7 @@ def run_steps(self, instruction: str) -> list[ExecInput]: return [ ExecInput( command=( + f"set -o pipefail; " f"opencode -m {shlex.quote(model)} run " f"--format=json -- {escaped} " f"2>&1 | tee {RUNTIME_AGENT_LOG_DIR}/opencode.txt" diff --git a/src/polar/agent/harnesses/openhands_sdk.py b/src/polar/agent/harnesses/openhands_sdk.py index 4c393c9d..ab0ad7a9 100644 --- a/src/polar/agent/harnesses/openhands_sdk.py +++ b/src/polar/agent/harnesses/openhands_sdk.py @@ -79,7 +79,8 @@ def run_steps(self, instruction: str) -> list[ExecInput]: ExecInput( command=( 'export LLM_API_KEY="$OPENAI_API_KEY" LLM_BASE_URL="$OPENAI_BASE_URL" && ' - 'PYTHON_BIN="$HOME/.venv/bin/python"; ' + 'PYTHON_BIN="/opt/miniconda3/envs/polar-openhands/bin/python"; ' + '[ -x "$PYTHON_BIN" ] || PYTHON_BIN="$HOME/.venv/bin/python"; ' '[ -x "$PYTHON_BIN" ] || PYTHON_BIN="/opt/openhands-sdk-venv/bin/python"; ' '[ -x "$PYTHON_BIN" ] || PYTHON_BIN="$(command -v python3 || command -v python)"; ' '"$PYTHON_BIN" ' @@ -93,12 +94,19 @@ def run_steps(self, instruction: str) -> list[ExecInput]: _RUNNER_SCRIPT = r'''#!/usr/bin/env python3 """OpenHands SDK runner for Polar.""" -from __future__ import annotations +import sys +if sys.version_info < (3, 10): + print( + f"Error: OpenHands SDK requires Python >= 3.10, " + f"got {'.'.join(map(str, sys.version_info[:3]))}", + file=sys.stderr, + ) + sys.exit(1) import json import os 
-import sys from pathlib import Path +from typing import Optional def _load_skills(skill_paths_raw: str) -> list[object]: @@ -136,7 +144,7 @@ def _load_skills(skill_paths_raw: str) -> list[object]: return skills -def _load_mcp_config() -> dict[str, object] | None: +def _load_mcp_config() -> Optional[dict[str, object]]: raw = os.environ.get("MCP_SERVERS_JSON") if not raw: return None diff --git a/src/polar/agent/harnesses/swe_agent.py b/src/polar/agent/harnesses/swe_agent.py index f9ae21fa..39bf5db9 100644 --- a/src/polar/agent/harnesses/swe_agent.py +++ b/src/polar/agent/harnesses/swe_agent.py @@ -86,6 +86,7 @@ def run_steps(self, instruction: str) -> list[ExecInput]: command=( f"cat > {self._problem_statement_path} << 'POLARINST'\n{safe_instruction}\nPOLARINST\n" f"{preamble}" + 'set -o pipefail && ' 'export OPENAI_API_KEY="$OPENAI_API_KEY" OPENAI_BASE_URL="$OPENAI_BASE_URL" && ' f"sweagent run " f"--agent.model.name={shlex.quote(model)} " diff --git a/src/polar/cli.py b/src/polar/cli.py index ea96b9f3..66b78d1e 100644 --- a/src/polar/cli.py +++ b/src/polar/cli.py @@ -5,6 +5,7 @@ import argparse import json from pathlib import Path +import subprocess import sys from typing import Any from urllib.parse import urlparse @@ -93,6 +94,108 @@ def build_parser() -> argparse.ArgumentParser: help="Print the raw response JSON.", ) + # ── cluster subcommands ───────────────────────────────────────────────── + cluster_parser = subparsers.add_parser( + "cluster", + help="Cluster deployment operations (launch, setup, sync, build-sif, status, train).", + ) + cluster_sub = cluster_parser.add_subparsers( + dest="cluster_command", + required=True, + ) + + # polar cluster launch + launch_p = cluster_sub.add_parser("launch", help="Sync code and submit a cluster job.") + launch_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + launch_p.add_argument("--example", default=None, help="Override task.example") + launch_p.add_argument("--harness", default=None, help="Override task.harness") + launch_p.add_argument("--model", default=None, help="Override model.name") + launch_p.add_argument("--nodes", type=int, default=None, help="Override resources.nodes") + launch_p.add_argument("--gpus", type=int, default=None, help="Override resources.gpus_per_node") + launch_p.add_argument("--time", default=None, help="Override resources.time (HH:MM:SS)") + launch_p.add_argument("--num-rollouts", type=int, default=None) + launch_p.add_argument("--timeout-seconds", type=float, default=None) + launch_p.add_argument( + "--instance-id", action="append", default=None, + help="SWE-Gym instance ID (repeatable; defaults to sample 10)", + ) + launch_p.add_argument("--no-sync", action="store_true", help="Skip rsync to cluster") + launch_p.add_argument("--dry-run", action="store_true", help="Print sbatch command only") + + # polar cluster setup + setup_p = cluster_sub.add_parser("setup", help="One-time cluster environment setup.") + setup_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + + # polar cluster status + cstatus_p = cluster_sub.add_parser("status", help="Check SLURM job status.") + cstatus_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + cstatus_p.add_argument("--job-id", default=None, help="Specific job ID to query") + + # polar cluster sync + sync_p = cluster_sub.add_parser("sync", help="Sync code/results from cluster.") + sync_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + sync_p.add_argument("--job-id", 
default=None, help="Sync specific job results") + sync_p.add_argument("--code-only", action="store_true") + sync_p.add_argument("--results-only", action="store_true") + sync_p.add_argument("--dry-run", action="store_true") + + # polar cluster build-sif + sif_p = cluster_sub.add_parser("build-sif", help="Build Apptainer SIF images.") + sif_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + sif_p.add_argument("--example", required=True, help="Example name (calculator, swegym, swebench_verified, train)") + sif_p.add_argument("--harness", default=None, help="Comma-separated harness names (required except for --example train)") + sif_p.add_argument("--force", action="store_true", help="Rebuild even if SIF exists") + sif_p.add_argument( + "--instance-id", action="append", default=None, + help="SWE-Gym instance ID (repeatable; defaults to sample 10)", + ) + + # polar cluster serve + serve_p = cluster_sub.add_parser("serve", help="Start services (vLLM + rollout + gateway).") + serve_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + serve_p.add_argument("--model", default=None, help="Override model.name") + serve_p.add_argument("--nodes", type=int, default=None, help="Override resources.nodes") + serve_p.add_argument("--gpus", type=int, default=None, help="Override resources.gpus_per_node") + serve_p.add_argument("--time", default=None, help="Override resources.time (HH:MM:SS)") + serve_p.add_argument("--no-sync", action="store_true", help="Skip rsync to cluster") + serve_p.add_argument("--no-wait", action="store_true", help="Don't wait for services to be ready") + serve_p.add_argument("--wait-timeout", type=int, default=600, help="Seconds to wait for readiness (default: 600)") + serve_p.add_argument("--dry-run", action="store_true", help="Print sbatch command only") + + # polar cluster submit-task + submit_task_p = cluster_sub.add_parser("submit-task", help="Submit tasks to a running serve job.") + submit_task_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + submit_task_p.add_argument("--job-id", required=True, help="SLURM job ID of the serve job") + submit_task_p.add_argument("--example", default=None, help="Override task.example") + submit_task_p.add_argument("--harness", default=None, help="Override task.harness") + submit_task_p.add_argument("--num-rollouts", type=int, default=None) + submit_task_p.add_argument("--timeout-seconds", type=float, default=None) + submit_task_p.add_argument( + "--instance-id", action="append", default=None, + help="SWE-Gym instance ID (repeatable; defaults to sample 10)", + ) + + # polar cluster train + train_p = cluster_sub.add_parser("train", help="Submit a distributed RL training job.") + train_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + train_p.add_argument("--polar-config", default=None, help="Path to polar_config.yaml (bridge config)") + train_p.add_argument("--prompt-data", default=None, help="Path to JSONL training data") + train_p.add_argument("--hf-checkpoint", default=None, help="HuggingFace model checkpoint") + train_p.add_argument("--num-rollouts", type=int, default=None, help="Number of training steps") + train_p.add_argument("--rollout-batch-size", type=int, default=None) + train_p.add_argument("--n-samples-per-prompt", type=int, default=None) + train_p.add_argument("--global-batch-size", type=int, default=None) + train_p.add_argument("--actor-gpus", type=int, default=None) + train_p.add_argument("--rollout-gpus", type=int, 
default=None) + train_p.add_argument("--tp-size", type=int, default=None) + train_p.add_argument("--nodes", type=int, default=None, help="Override resources.nodes") + train_p.add_argument("--gpus", type=int, default=None, help="Override resources.gpus_per_node") + train_p.add_argument("--time", default=None, help="Override resources.time (HH:MM:SS)") + train_p.add_argument("--no-sync", action="store_true", help="Skip rsync to cluster") + train_p.add_argument("--no-wait", action="store_true", help="Don't wait for training to complete") + train_p.add_argument("--wait-timeout", type=int, default=3600, help="Seconds to wait (default: 3600)") + train_p.add_argument("--dry-run", action="store_true", help="Print sbatch command only") + return parser @@ -111,6 +214,8 @@ def main(argv: list[str] | None = None) -> int: return _handle_submit(args) if args.command == "status": return _handle_status(args) + if args.command == "cluster": + return _handle_cluster(args) except httpx.HTTPStatusError as exc: body = exc.response.text.strip() if body: @@ -127,14 +232,161 @@ def main(argv: list[str] | None = None) -> int: except httpx.HTTPError as exc: print(f"error: could not reach the rollout service: {exc}", file=sys.stderr) return 1 - except (FileNotFoundError, ValueError) as exc: + except NotImplementedError as exc: print(f"error: {exc}", file=sys.stderr) return 1 + except (FileNotFoundError, ValueError, TimeoutError) as exc: + print(f"error: {exc}", file=sys.stderr) + return 1 + except subprocess.CalledProcessError as exc: + print(f"error: command failed with exit code {exc.returncode}", file=sys.stderr) + if exc.stderr: + print(exc.stderr.strip(), file=sys.stderr) + return 1 parser.error(f"Unknown command: {args.command}") return 2 +def _handle_cluster(args: argparse.Namespace) -> int: + # Lazy imports — avoid loading cluster modules for non-cluster commands. 
+ from polar.cluster.config import ClusterConfig + from polar.cluster.backend import get_backend + + config = ClusterConfig.load(args.config) + overrides = _build_cluster_overrides(args) + if overrides: + config = config.apply_overrides(overrides) + + repo_root = Path.cwd() + backend = get_backend(config) + + cmd = args.cluster_command + if cmd == "launch": + backend.launch(repo_root, dry_run=args.dry_run, no_sync=args.no_sync) + return 0 + if cmd == "setup": + backend.setup(repo_root) + return 0 + if cmd == "status": + result = backend.status(job_id=args.job_id) + jobs = result.get("jobs", []) + if not jobs: + print("No jobs found.") + else: + print(f"{'JOB_ID':<12} {'NAME':<30} {'STATE':<12} {'TIME':<10} {'NODES'}") + for j in jobs: + print(f"{j.get('job_id',''):<12} {j.get('name',''):<30} {j.get('state',''):<12} {j.get('time',''):<10} {j.get('nodes','')}") + return 0 + if cmd == "sync": + backend.sync( + repo_root, + job_id=args.job_id, + code_only=args.code_only, + results_only=args.results_only, + dry_run=args.dry_run, + ) + return 0 + if cmd == "build-sif": + if args.example != "train" and not args.harness: + print("error: --harness is required for non-train examples", file=sys.stderr) + return 1 + harnesses = [h.strip() for h in args.harness.split(",")] if args.harness else [] + results = backend.build_sif( + repo_root, args.example, harnesses, + force=args.force, + instance_ids=getattr(args, "instance_id", None), + ) + for key, sif_path in results.items(): + print(f" {key}: {sif_path}") + return 0 + if cmd == "serve": + result = backend.serve( + repo_root, + dry_run=args.dry_run, + no_sync=args.no_sync, + wait=not args.no_wait, + wait_timeout=args.wait_timeout, + ) + if result: + print(f"\n[cluster] Services ready.") + print(f"[cluster] Job ID: {result['job_id']}") + print(f"[cluster] Topology: {result['topology']}") + print(f"\n[cluster] Submit tasks with:") + print(f" polar cluster submit-task -c {args.config} \\") + print(f" --job-id {result['job_id']} --example calculator --harness opencode") + return 0 + if cmd == "submit-task": + return backend.submit_task( + repo_root, + job_id=args.job_id, + example=getattr(args, "example", None), + harness=getattr(args, "harness", None), + ) + if cmd == "train": + result = backend.train( + repo_root, + dry_run=args.dry_run, + no_sync=args.no_sync, + wait=not args.no_wait, + wait_timeout=args.wait_timeout, + ) + if result: + print(f"\n[cluster] Training job info:") + for k, v in result.items(): + print(f" {k}: {v}") + return 0 + + print(f"Unknown cluster command: {cmd}", file=sys.stderr) + return 2 + + +def _build_cluster_overrides(args: argparse.Namespace) -> dict: + """Extract CLI flag overrides into a nested dict for ``ClusterConfig.apply_overrides``.""" + overrides: dict = {} + if getattr(args, "example", None): + overrides.setdefault("task", {})["example"] = args.example + if getattr(args, "harness", None) and args.cluster_command == "launch": + overrides.setdefault("task", {})["harness"] = args.harness + if getattr(args, "model", None): + overrides.setdefault("model", {})["name"] = args.model + if getattr(args, "nodes", None) is not None: + overrides.setdefault("resources", {})["nodes"] = args.nodes + if getattr(args, "gpus", None) is not None: + overrides.setdefault("resources", {})["gpus_per_node"] = args.gpus + if getattr(args, "time", None): + overrides.setdefault("resources", {})["time"] = args.time + if getattr(args, "num_rollouts", None) is not None: + if getattr(args, "cluster_command", None) == "train": + 
overrides.setdefault("train", {})["num_rollouts"] = args.num_rollouts + else: + overrides.setdefault("task", {})["num_rollouts"] = args.num_rollouts + if getattr(args, "timeout_seconds", None) is not None: + overrides.setdefault("task", {})["timeout_seconds"] = args.timeout_seconds + if getattr(args, "instance_id", None): + overrides.setdefault("task", {})["instance_ids"] = args.instance_id + # Train-specific overrides + if getattr(args, "polar_config", None): + overrides.setdefault("train", {})["polar_config"] = args.polar_config + if getattr(args, "prompt_data", None): + overrides.setdefault("train", {})["prompt_data"] = args.prompt_data + if getattr(args, "hf_checkpoint", None): + overrides.setdefault("train", {})["hf_checkpoint"] = args.hf_checkpoint + if getattr(args, "rollout_batch_size", None) is not None: + overrides.setdefault("train", {})["rollout_batch_size"] = args.rollout_batch_size + if getattr(args, "n_samples_per_prompt", None) is not None: + overrides.setdefault("train", {})["n_samples_per_prompt"] = args.n_samples_per_prompt + if getattr(args, "global_batch_size", None) is not None: + overrides.setdefault("train", {})["global_batch_size"] = args.global_batch_size + if getattr(args, "actor_gpus", None) is not None: + overrides.setdefault("train", {})["actor_gpus"] = args.actor_gpus + if getattr(args, "rollout_gpus", None) is not None: + overrides.setdefault("train", {})["rollout_gpus"] = args.rollout_gpus + if getattr(args, "tp_size", None) is not None: + overrides.setdefault("train", {})["tp_size"] = args.tp_size + return overrides + + def _handle_submit(args: argparse.Namespace) -> int: rollout_url = _resolve_rollout_url(args.config, args.rollout_url) payload = _load_structured_file(args.task_file) diff --git a/src/polar/cluster/__init__.py b/src/polar/cluster/__init__.py new file mode 100644 index 00000000..a7ec9ca0 --- /dev/null +++ b/src/polar/cluster/__init__.py @@ -0,0 +1,6 @@ +"""Polar cluster deployment — launch jobs on local, SLURM, or K8s backends.""" + +from polar.cluster.config import ClusterConfig +from polar.cluster.backend import ClusterBackend, get_backend + +__all__ = ["ClusterConfig", "ClusterBackend", "get_backend"] diff --git a/src/polar/cluster/backend.py b/src/polar/cluster/backend.py new file mode 100644 index 00000000..5e239420 --- /dev/null +++ b/src/polar/cluster/backend.py @@ -0,0 +1,127 @@ +"""Abstract cluster backend and factory.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Any + +from polar.cluster.config import ClusterConfig + + +class ClusterBackend(ABC): + """Base class for deployment backends (local, SLURM, K8s, ...).""" + + def __init__(self, config: ClusterConfig) -> None: + self.config = config + + @abstractmethod + def launch( + self, + repo_root: Path, + *, + dry_run: bool = False, + no_sync: bool = False, + ) -> str: + """Sync code and launch a job. 
Return a job identifier string.""" + + @abstractmethod + def setup(self, repo_root: Path) -> None: + """One-time environment setup on the target cluster.""" + + @abstractmethod + def status(self, job_id: str | None = None) -> dict[str, Any]: + """Query job / service status.""" + + @abstractmethod + def sync( + self, + repo_root: Path, + *, + job_id: str | None = None, + code_only: bool = False, + results_only: bool = False, + dry_run: bool = False, + ) -> None: + """Sync results (and optionally code) back from the cluster.""" + + @abstractmethod + def build_sif( + self, + repo_root: Path, + example: str, + harnesses: list[str], + *, + force: bool = False, + instance_ids: list[str] | None = None, + ) -> dict[str, Path]: + """Build Apptainer SIF images. Return ``{key: sif_path}``. + + For calculator, *key* is the harness name. + For swegym, *key* is ``harness/sanitized_instance_id``. + When *instance_ids* is ``None`` for swegym, builds all sample instances. + """ + + + # ── Optional two-phase methods (non-abstract) ───────────────────────── + + def serve( + self, + repo_root: Path, + *, + dry_run: bool = False, + no_sync: bool = False, + wait: bool = True, + wait_timeout: int = 600, + ) -> dict[str, str]: + """Start services without submitting tasks. Return job info.""" + raise NotImplementedError( + f"The {type(self).__name__} backend does not support 'serve'. " + "Use 'polar cluster launch' for a combined workflow." + ) + + def submit_task( + self, + repo_root: Path, + *, + job_id: str, + example: str | None = None, + harness: str | None = None, + ) -> int: + """Submit tasks to a running service. Return exit code.""" + raise NotImplementedError( + f"The {type(self).__name__} backend does not support 'submit-task'. " + "Use 'polar cluster launch' for a combined workflow." + ) + + def train( + self, + repo_root: Path, + *, + dry_run: bool = False, + no_sync: bool = False, + wait: bool = True, + wait_timeout: int = 3600, + ) -> dict[str, str]: + """Submit a training job. Return job info dict.""" + raise NotImplementedError( + f"The {type(self).__name__} backend does not support 'train'." + ) + + +def get_backend(config: ClusterConfig) -> ClusterBackend: + """Return the appropriate backend for *config.backend*.""" + if config.backend == "slurm": + from polar.cluster.slurm import SlurmBackend + + return SlurmBackend(config) + if config.backend == "local": + from polar.cluster.local import LocalBackend + + return LocalBackend(config) + if config.backend == "k8s": + raise NotImplementedError( + "Kubernetes backend is not yet implemented. " + "Contributions welcome!" + ) + raise ValueError(f"Unknown backend: {config.backend!r}") diff --git a/src/polar/cluster/config.py b/src/polar/cluster/config.py new file mode 100644 index 00000000..b0b332de --- /dev/null +++ b/src/polar/cluster/config.py @@ -0,0 +1,300 @@ +"""Unified cluster configuration model. + +Parses a ``cluster.yaml`` file into a typed Pydantic model. The config is +backend-agnostic: the same YAML schema works for local, SLURM, and (future) +K8s backends — only the ``backend`` field and its corresponding connection +section differ. 
+""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Literal + +import yaml +from pydantic import BaseModel, model_validator + + +class SlurmConnection(BaseModel): + """SLURM-specific connection details (only required when ``backend: slurm``).""" + + login_node: str = "" + account: str = "" + partition: str = "" + + +class ClusterPaths(BaseModel): + """Filesystem paths on the target cluster / machine.""" + + workspace: str = "" + polar_root: str = "" + code: str = "" + sif_dir: str = "" + results: str = "" + venv: str = "" + apptainer_bin_dir: str = "" + cuda_home: str = "" + + @model_validator(mode="after") + def _derive_paths(self) -> "ClusterPaths": + if self.workspace: + if not self.polar_root: + self.polar_root = f"{self.workspace}/polar" + if not self.code: + self.code = f"{self.polar_root}/ProRL-Agent-Server" + if not self.sif_dir: + self.sif_dir = f"{self.polar_root}/sif_images" + if not self.results: + self.results = f"{self.polar_root}/results" + if not self.venv: + self.venv = f"{self.polar_root}/.venv" + return self + + +class ModelConfig(BaseModel): + """LLM model configuration for vLLM.""" + + name: str = "Qwen/Qwen3.5-27B" + tensor_parallel_size: int = 8 + max_model_len: int = 16384 + gpu_memory_utilization: float = 0.90 + max_num_seqs: int = 64 + tool_call_parser: str = "qwen3_xml" + + +class TaskConfig(BaseModel): + """Default task / example settings.""" + + example: str = "calculator" + harness: str = "opencode" + num_rollouts: int = 4 + timeout_seconds: float = 900.0 + instance_ids: list[str] = [] + + +class ResourceConfig(BaseModel): + """Compute resource allocation.""" + + nodes: int = 1 + gpus_per_node: int = 8 + cpus_per_task: int = 64 + mem: str = "512G" + time: str = "04:00:00" + + +class PortConfig(BaseModel): + """Service port assignments.""" + + vllm: int = 18000 + rollout: int = 18080 + gateway_base: int = 18100 + + +class GatewayTuning(BaseModel): + """Gateway worker pool sizing.""" + + max_init_workers: int = 8 + max_run_workers: int = 4 + max_postrun_workers: int = 4 + ready_buffer_target: int = 4 + + +class TrainConfig(BaseModel): + """RL training configuration for Slime + Megatron GRPO.""" + + polar_config: str = "" + prompt_data: str = "" + hf_checkpoint: str = "Qwen/Qwen3-4B" + torch_dist_dir: str = "" + save_dir: str = "" + model_args: list[str] = [ + "--swiglu", + "--num-layers", "36", + "--hidden-size", "2560", + "--ffn-hidden-size", "9728", + "--num-attention-heads", "32", + "--group-query-attention", + "--num-query-groups", "8", + "--use-rotary-position-embeddings", + "--disable-bias-linear", + "--normalization", "RMSNorm", + "--norm-epsilon", "1e-6", + "--rotary-base", "1000000", + "--vocab-size", "151936", + "--kv-channels", "128", + "--qk-layernorm", + ] + num_rollouts: int = 5 + rollout_batch_size: int = 2 + n_samples_per_prompt: int = 16 + global_batch_size: int = 32 + actor_gpus: int = 4 + rollout_gpus: int = 4 + tp_size: int = 2 + sglang_router_port: int = 9000 + ray_port: int = 6379 + ray_dashboard_port: int = 8265 + extra_args: list[str] = [] + wandb_project: str = "" + wandb_exp_name: str = "" + wandb_group: str = "" + + +class ClusterConfig(BaseModel): + """Top-level cluster configuration. + + The ``backend`` field selects which deployment backend to use: + ``"local"``, ``"slurm"``, or ``"k8s"`` (future). 
+ """ + + backend: Literal["local", "slurm", "k8s"] = "slurm" + slurm: SlurmConnection = SlurmConnection() + paths: ClusterPaths = ClusterPaths() + model: ModelConfig = ModelConfig() + task: TaskConfig = TaskConfig() + resources: ResourceConfig = ResourceConfig() + ports: PortConfig = PortConfig() + gateway: GatewayTuning = GatewayTuning() + train: TrainConfig = TrainConfig() + + # ── Constructors ───────────────────────────────────────────────────────── + + @classmethod + def load(cls, path: str | Path) -> "ClusterConfig": + """Load configuration from a YAML file.""" + p = Path(path) + if not p.exists(): + raise FileNotFoundError(f"Cluster config not found: {p}") + with p.open() as fh: + raw = yaml.safe_load(fh) or {} + if not isinstance(raw, dict): + raise ValueError(f"Cluster config must be a YAML mapping: {p}") + # Support legacy configs that use 'cluster' section for slurm fields + if "cluster" in raw and "slurm" not in raw: + raw["slurm"] = raw.pop("cluster") + return cls.model_validate(raw) + + # ── Helpers ─────────────────────────────────────────────────────────────── + + def sbatch_export_vars(self) -> dict[str, str]: + """Build the flat env-var dict passed to ``sbatch --export``.""" + v: dict[str, str] = { + "POLAR_CODE": self.paths.code, + "POLAR_WORKSPACE": self.paths.workspace, + "EXAMPLE": self.task.example, + "HARNESS": self.task.harness, + "MODEL_NAME": self.model.name, + "MODEL_PATH": self.model.name, + "TENSOR_PARALLEL_SIZE": str(self.model.tensor_parallel_size), + "NUM_ROLLOUTS": str(self.task.num_rollouts), + "TIMEOUT_SECONDS": str(self.task.timeout_seconds), + "VLLM_PORT": str(self.ports.vllm), + "ROLLOUT_PORT": str(self.ports.rollout), + "GATEWAY_BASE_PORT": str(self.ports.gateway_base), + "MAX_INIT_WORKERS": str(self.gateway.max_init_workers), + "MAX_RUN_WORKERS": str(self.gateway.max_run_workers), + "MAX_POSTRUN_WORKERS": str(self.gateway.max_postrun_workers), + "READY_BUFFER_TARGET": str(self.gateway.ready_buffer_target), + "GPU_MEMORY_UTILIZATION": str(self.model.gpu_memory_utilization), + "MAX_MODEL_LEN": str(self.model.max_model_len), + "MAX_NUM_SEQS": str(self.model.max_num_seqs), + "TOOL_CALL_PARSER": self.model.tool_call_parser, + } + if self.task.instance_ids: + v["INSTANCE_IDS"] = ",".join(self.task.instance_ids) + if self.paths.apptainer_bin_dir: + v["APPTAINER_BIN_DIR"] = self.paths.apptainer_bin_dir + if self.paths.cuda_home: + v["CUDA_HOME"] = self.paths.cuda_home + # swe_agent's swerex needs --fakeroot in Apptainer for chown support + if self.task.harness == "swe_agent": + v["RUNTIME_FAKEROOT"] = "true" + return v + + def sbatch_serve_export_vars(self) -> dict[str, str]: + """Build env-var dict for serve-only sbatch (no task-specific vars).""" + v: dict[str, str] = { + "POLAR_CODE": self.paths.code, + "POLAR_WORKSPACE": self.paths.workspace, + "MODEL_NAME": self.model.name, + "MODEL_PATH": self.model.name, + "TENSOR_PARALLEL_SIZE": str(self.model.tensor_parallel_size), + "VLLM_PORT": str(self.ports.vllm), + "ROLLOUT_PORT": str(self.ports.rollout), + "GATEWAY_BASE_PORT": str(self.ports.gateway_base), + "MAX_INIT_WORKERS": str(self.gateway.max_init_workers), + "MAX_RUN_WORKERS": str(self.gateway.max_run_workers), + "MAX_POSTRUN_WORKERS": str(self.gateway.max_postrun_workers), + "READY_BUFFER_TARGET": str(self.gateway.ready_buffer_target), + "GPU_MEMORY_UTILIZATION": str(self.model.gpu_memory_utilization), + "MAX_MODEL_LEN": str(self.model.max_model_len), + "MAX_NUM_SEQS": str(self.model.max_num_seqs), + "TOOL_CALL_PARSER": self.model.tool_call_parser, + 
} + if self.paths.apptainer_bin_dir: + v["APPTAINER_BIN_DIR"] = self.paths.apptainer_bin_dir + if self.paths.cuda_home: + v["CUDA_HOME"] = self.paths.cuda_home + return v + + def sbatch_train_export_vars(self) -> dict[str, str]: + """Build env-var dict for the training sbatch job.""" + t = self.train + v: dict[str, str] = { + "POLAR_CODE": self.paths.code, + "POLAR_WORKSPACE": self.paths.workspace, + "POLAR_CONFIG_PATH": t.polar_config, + "PROMPT_DATA": t.prompt_data, + "HF_CHECKPOINT": t.hf_checkpoint, + "MODEL_NAME": t.hf_checkpoint, + "TRAIN_NUM_ROLLOUTS": str(t.num_rollouts), + "ROLLOUT_BATCH_SIZE": str(t.rollout_batch_size), + "N_SAMPLES_PER_PROMPT": str(t.n_samples_per_prompt), + "GLOBAL_BATCH_SIZE": str(t.global_batch_size), + "ACTOR_GPUS": str(t.actor_gpus), + "ROLLOUT_GPUS": str(t.rollout_gpus), + "TP_SIZE": str(t.tp_size), + "SGLANG_ROUTER_PORT": str(t.sglang_router_port), + "RAY_PORT": str(t.ray_port), + "RAY_DASHBOARD_PORT": str(t.ray_dashboard_port), + "ROLLOUT_PORT": str(self.ports.rollout), + "GATEWAY_BASE_PORT": str(self.ports.gateway_base), + "MAX_INIT_WORKERS": str(self.gateway.max_init_workers), + "MAX_RUN_WORKERS": str(self.gateway.max_run_workers), + "MAX_POSTRUN_WORKERS": str(self.gateway.max_postrun_workers), + "READY_BUFFER_TARGET": str(self.gateway.ready_buffer_target), + } + if t.torch_dist_dir: + v["TORCH_DIST_DIR"] = t.torch_dist_dir + if t.save_dir: + v["TRAIN_SAVE_DIR"] = t.save_dir + if t.model_args: + v["MODEL_ARGS"] = " ".join(t.model_args) + if t.extra_args: + v["EXTRA_TRAIN_ARGS"] = " ".join(t.extra_args) + if t.wandb_project: + v["WANDB_PROJECT"] = t.wandb_project + if t.wandb_exp_name: + v["WANDB_EXP_NAME"] = t.wandb_exp_name + if t.wandb_group: + v["WANDB_GROUP"] = t.wandb_group + if self.paths.apptainer_bin_dir: + v["APPTAINER_BIN_DIR"] = self.paths.apptainer_bin_dir + if self.paths.cuda_home: + v["CUDA_HOME"] = self.paths.cuda_home + return v + + def apply_overrides(self, overrides: dict[str, Any]) -> "ClusterConfig": + """Return a new config with *overrides* deep-merged on top.""" + data = self.model_dump() + _deep_merge(data, overrides) + return ClusterConfig.model_validate(data) + + +def _deep_merge(base: dict, overlay: dict) -> None: + """Recursively merge *overlay* into *base* in place.""" + for key, value in overlay.items(): + if key in base and isinstance(base[key], dict) and isinstance(value, dict): + _deep_merge(base[key], value) + else: + base[key] = value diff --git a/src/polar/cluster/local.py b/src/polar/cluster/local.py new file mode 100644 index 00000000..293da892 --- /dev/null +++ b/src/polar/cluster/local.py @@ -0,0 +1,57 @@ +"""Local backend — run Polar services directly on the current machine.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from polar.cluster.backend import ClusterBackend + + +class LocalBackend(ClusterBackend): + """Run Polar services locally (no cluster scheduler). + + For local development the recommended flow is the per-example + ``submit_tasks.py`` scripts under ``examples/``. This backend exists + as a placeholder so the unified config schema works with + ``backend: local`` and can be extended in the future. 
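+
+    A minimal sketch of how this backend is constructed (assuming it
+    inherits ``ClusterBackend.__init__(config)``, as ``SlurmBackend``
+    does; the config path here is illustrative)::
+
+        from pathlib import Path
+        from polar.cluster.config import ClusterConfig
+        from polar.cluster.local import LocalBackend
+
+        cfg = ClusterConfig.load("my-cluster.yaml").apply_overrides(
+            {"backend": "local"}
+        )
+        backend = LocalBackend(cfg)
+        backend.setup(Path("."))  # prints "[local] No setup required ..."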
+ """ + + def launch(self, repo_root: Path, *, dry_run: bool = False, no_sync: bool = False) -> str: + raise NotImplementedError( + "Local launch is not yet integrated into 'polar cluster launch'.\n" + "Use the per-example submit scripts instead:\n" + " python examples/calculator/opencode/submit_tasks.py" + ) + + def setup(self, repo_root: Path) -> None: + print("[local] No setup required for local backend.") + + def status(self, job_id: str | None = None) -> dict[str, Any]: + return {"backend": "local", "status": "not implemented"} + + def sync( + self, + repo_root: Path, + *, + job_id: str | None = None, + code_only: bool = False, + results_only: bool = False, + dry_run: bool = False, + ) -> None: + print("[local] No sync needed for local backend.") + + def build_sif( + self, + repo_root: Path, + example: str, + harnesses: list[str], + *, + force: bool = False, + instance_ids: list[str] | None = None, + ) -> dict[str, Path]: + raise NotImplementedError( + "Local SIF builds require Docker or Apptainer installed locally.\n" + "Use 'docker build' in the example directory, or set backend: slurm\n" + "to build on a cluster with Apptainer." + ) diff --git a/src/polar/cluster/slurm.py b/src/polar/cluster/slurm.py new file mode 100644 index 00000000..40618e98 --- /dev/null +++ b/src/polar/cluster/slurm.py @@ -0,0 +1,1065 @@ +"""SLURM cluster backend — rsync code, submit sbatch jobs, sync results.""" + +from __future__ import annotations + +import importlib.resources +import platform +import re +import shlex +import subprocess +import sys +import time +from pathlib import Path +from typing import Any + +from polar.cluster.backend import ClusterBackend +from polar.cluster.config import ClusterConfig + +# Files and directories excluded from rsync to the cluster. 
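+# Patterns use rsync --exclude syntax; entries ending in "/" match directories only.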
+_RSYNC_EXCLUDES = [ + ".git", + "__pycache__", + "*.pyc", + ".venv", + "node_modules", + "worklogs/", + "results/", + "rollout_results/", + "*.egg-info", +] + + +class SlurmBackend(ClusterBackend): + """Deploy Polar via SSH + sbatch on a SLURM cluster.""" + + def __init__(self, config: ClusterConfig) -> None: + super().__init__(config) + slurm = config.slurm + if not slurm.login_node: + raise ValueError("slurm.login_node is required for the SLURM backend") + if not slurm.account: + raise ValueError("slurm.account is required for the SLURM backend") + if not slurm.partition: + raise ValueError("slurm.partition is required for the SLURM backend") + if not config.paths.workspace: + raise ValueError("paths.workspace is required for the SLURM backend") + + # ── Public API ──────────────────────────────────────────────────────────── + + def launch( + self, + repo_root: Path, + *, + dry_run: bool = False, + no_sync: bool = False, + ) -> str: + cfg = self.config + self._print_summary() + + if not no_sync: + self._sync_code_to_cluster(repo_root) + + # Pre-populate swegym sample cache so the job can find instance data + if cfg.task.example == "swegym": + self._sync_swegym_cache() + if cfg.task.example == "swebench_verified": + self._sync_swebench_cache() + + sbatch_cmd = self._build_sbatch_command() + print(f"\n[cluster] sbatch command:\n {sbatch_cmd}") + + if dry_run: + print("\n[cluster] Dry run — not submitting.") + return "" + + results_dir = cfg.paths.results + self._ssh_run(f"mkdir -p '{results_dir}'") + out = self._ssh_run(sbatch_cmd, capture=True) + job_id = out.strip().split()[-1] + + login = cfg.slurm.login_node + example, harness = cfg.task.example, cfg.task.harness + print(f"\n[cluster] Job submitted: {job_id}") + print(f"[cluster] Monitor:") + print(f" ssh {login} squeue -j {job_id}") + print(f" ssh {login} 'tail -f {results_dir}/polar-{example}-{harness}_{job_id}.out'") + print(f"\n[cluster] Sync results after completion:") + print(f" polar cluster sync -c ") + return job_id + + def setup(self, repo_root: Path) -> None: + cfg = self.config + self._sync_code_to_cluster(repo_root) + print(f"\n[cluster] Running setup on {cfg.slurm.login_node}...") + env_parts = [f"export POLAR_WORKSPACE='{cfg.paths.workspace}'"] + if cfg.paths.apptainer_bin_dir: + env_parts.append(f"export APPTAINER_BIN_DIR='{cfg.paths.apptainer_bin_dir}'") + if cfg.paths.cuda_home: + env_parts.append(f"export CUDA_HOME='{cfg.paths.cuda_home}'") + env_str = " && ".join(env_parts) + setup_script = f"{cfg.paths.code}/examples/slurm/setup_cluster.sh" + self._ssh_run(f"{env_str} && cd '{cfg.paths.code}' && bash '{setup_script}'") + print("[cluster] Setup complete.") + + def status(self, job_id: str | None = None) -> dict[str, Any]: + if job_id: + out = self._ssh_run( + f"squeue -j {job_id} --format='%i %j %T %M %N' --noheader", + capture=True, + ) + else: + out = self._ssh_run( + "squeue -u $USER --format='%i %j %T %M %N' --noheader", + capture=True, + ) + jobs: list[dict[str, str]] = [] + for line in out.strip().splitlines(): + parts = line.split(None, 4) + if len(parts) >= 3: + jobs.append({ + "job_id": parts[0], + "name": parts[1] if len(parts) > 1 else "", + "state": parts[2] if len(parts) > 2 else "", + "time": parts[3] if len(parts) > 3 else "", + "nodes": parts[4] if len(parts) > 4 else "", + }) + return {"jobs": jobs} + + def sync( + self, + repo_root: Path, + *, + job_id: str | None = None, + code_only: bool = False, + results_only: bool = False, + dry_run: bool = False, + ) -> None: + cfg = self.config + login = 
cfg.slurm.login_node + local = self._is_local() + extra: list[str] = [] + if dry_run: + extra.append("--dry-run") + + def _remote_path(p: str) -> str: + return p if local else f"{login}:{p}" + + if not results_only: + print(f"[cluster] Syncing code from {login}...") + src = _remote_path(cfg.paths.code + "/") + dst = str(repo_root) + "/" + self._rsync(src, dst, extra_args=extra, exclude=_RSYNC_EXCLUDES) + + if not code_only: + print(f"[cluster] Syncing results from {login}...") + if job_id: + pattern = f"*_{job_id}" + src = _remote_path(f"{cfg.paths.results}/{pattern}/") + dst_dir = repo_root / "results" / pattern + dst_dir.mkdir(parents=True, exist_ok=True) + self._rsync(src, str(dst_dir) + "/", extra_args=extra) + else: + src = _remote_path(cfg.paths.results + "/") + dst_dir = repo_root / "results" + dst_dir.mkdir(parents=True, exist_ok=True) + self._rsync(src, str(dst_dir) + "/", extra_args=extra) + + print("[cluster] Sync complete.") + + def build_sif( + self, + repo_root: Path, + example: str, + harnesses: list[str], + *, + force: bool = False, + instance_ids: list[str] | None = None, + ) -> dict[str, Path]: + cfg = self.config + results: dict[str, Path] = {} + + if example == "train": + sif_name = "train-slime-grpo.sif" + def_content = _generate_train_def(cfg.paths.code) + sif_path = self._build_single_sif( + sif_name, def_content, force=force, + srun_time="01:00:00", srun_mem="64G", + ) + results["train"] = Path(sif_path) + return results + + if example in ("swegym", "swebench_verified"): + # Per-instance examples: one SIF per (harness, instance_id) + if not instance_ids: + if example == "swegym": + from polar.cluster.tasks import SWEGYM_SAMPLE + instance_ids = [i["instance_id"] for i in SWEGYM_SAMPLE] + else: + raise ValueError( + f"--instance-id is required for {example} SIF builds." + ) + prefix = "swegym" if example == "swegym" else "swebench" + for harness in harnesses: + for instance_id in instance_ids: + sanitized = _sanitize_instance_id(instance_id) + sif_name = f"{prefix}-{harness}-{sanitized}.sif" + def_content = _generate_def_file( + repo_root, example, harness, instance_id=instance_id, + ) + if def_content is None: + print(f"[cluster] WARNING: No .def for {example}/{harness}/{instance_id}") + continue + sif_path = self._build_single_sif( + sif_name, def_content, force=force, + ) + key = f"{harness}/{sanitized}" + results[key] = Path(sif_path) + else: + # Calculator and other examples: one SIF per harness + for harness in harnesses: + sif_name = f"{example}-{harness}.sif" + def_content = _generate_def_file(repo_root, example, harness) + if def_content is None: + print(f"[cluster] WARNING: Cannot generate .def for {example}/{harness}, skipping") + continue + sif_path = self._build_single_sif( + sif_name, def_content, force=force, + ) + results[harness] = Path(sif_path) + + return results + + def serve( + self, + repo_root: Path, + *, + dry_run: bool = False, + no_sync: bool = False, + wait: bool = True, + wait_timeout: int = 600, + ) -> dict[str, str]: + """Submit a serve-only sbatch job. 
Return {job_id, topology}.""" + cfg = self.config + self._print_serve_summary() + + if not no_sync: + self._sync_code_to_cluster(repo_root) + + sbatch_cmd = self._build_serve_sbatch_command() + print(f"\n[cluster] sbatch command:\n {sbatch_cmd}") + + if dry_run: + print("\n[cluster] Dry run — not submitting.") + return {} + + results_dir = cfg.paths.results + self._ssh_run(f"mkdir -p '{results_dir}'") + out = self._ssh_run(sbatch_cmd, capture=True) + job_id = out.strip().split()[-1] + + job_dir = f"{results_dir}/polar-serve_{job_id}" + sentinel = f"{job_dir}/.services_ready" + + print(f"\n[cluster] Job submitted: {job_id}") + + if not wait: + print(f"[cluster] Not waiting. Check readiness with:") + print(f" polar cluster status -c --job-id {job_id}") + return {"job_id": job_id, "topology": f"{job_dir}/topology.yaml"} + + print(f"[cluster] Waiting for services to be ready (timeout: {wait_timeout}s)...") + poll_interval = 10 + for attempt in range(wait_timeout // poll_interval): + content = self._ssh_run( + f"cat '{sentinel}' 2>/dev/null || true", + capture=True, + ) + if "TOPOLOGY=" in content: + topology = "" + for line in content.strip().splitlines(): + if line.startswith("TOPOLOGY="): + topology = line.split("=", 1)[1] + break + return {"job_id": job_id, "topology": topology} + + # Check job is still alive + state = self._get_job_state(job_id) + if state in ("FAILED", "CANCELLED", "TIMEOUT", "COMPLETED", ""): + raise RuntimeError( + f"Serve job {job_id} entered state '{state}' before services were ready. " + f"Check logs: {job_dir}/logs/" + ) + elapsed = (attempt + 1) * poll_interval + if elapsed % 30 == 0: + print(f"[cluster] Still waiting... ({elapsed}s, job state: {state})") + time.sleep(poll_interval) + + raise TimeoutError( + f"Services not ready after {wait_timeout}s. " + f"Check job logs: {job_dir}/logs/" + ) + + def submit_task( + self, + repo_root: Path, + *, + job_id: str, + example: str | None = None, + harness: str | None = None, + ) -> int: + """Submit tasks to a running serve job. 
Return exit code.""" + cfg = self.config + example = example or cfg.task.example + harness = harness or cfg.task.harness + + # Discover topology from job ID + topology_path = self._find_topology(job_id) + job_dir = str(Path(topology_path).parent) + + print(f"[cluster] Submitting tasks to job {job_id}") + print(f"[cluster] Topology: {topology_path}") + print(f"[cluster] Example: {example}") + print(f"[cluster] Harness: {harness}") + + # Build instance-id args + instance_id_args = "" + if cfg.task.instance_ids: + for iid in cfg.task.instance_ids: + instance_id_args += f" --instance-id {iid}" + + env_setup = ( + f"export POLAR_WORKSPACE='{cfg.paths.workspace}' && " + f"source '{cfg.paths.code}/src/polar/cluster/templates/env.sh'" + ) + task_cmd = ( + f"{env_setup} && " + f"python -m polar.cluster.tasks " + f"--example {example} --harness {harness} " + f"--topology {topology_path} " + f"--sif-dir {cfg.paths.sif_dir} " + f"--output-dir {job_dir}/tasks/{harness} " + f"--num-rollouts {cfg.task.num_rollouts} " + f"--timeout-seconds {cfg.task.timeout_seconds}" + f"{instance_id_args}" + ) + + try: + self._ssh_run(task_cmd) + print("[cluster] Task submission complete.") + return 0 + except subprocess.CalledProcessError as exc: + print(f"[cluster] Task submission failed (exit code {exc.returncode})") + return exc.returncode + + def train( + self, + repo_root: Path, + *, + dry_run: bool = False, + no_sync: bool = False, + wait: bool = True, + wait_timeout: int = 3600, + ) -> dict[str, str]: + """Submit a training sbatch job. Return job info dict.""" + cfg = self.config + self._print_train_summary() + + if not no_sync: + self._sync_code_to_cluster(repo_root) + + sbatch_cmd = self._build_train_sbatch_command() + print(f"\n[cluster] sbatch command:\n {sbatch_cmd}") + + if dry_run: + print("\n[cluster] Dry run — not submitting.") + return {} + + results_dir = cfg.paths.results + self._ssh_run(f"mkdir -p '{results_dir}'") + out = self._ssh_run(sbatch_cmd, capture=True) + job_id = out.strip().split()[-1] + + job_dir = f"{results_dir}/polar-train_{job_id}" + login = cfg.slurm.login_node + + print(f"\n[cluster] Training job submitted: {job_id}") + print(f"[cluster] Monitor:") + print(f" ssh {login} squeue -j {job_id}") + print(f" ssh {login} 'tail -f {results_dir}/polar-train_{job_id}.out'") + + if not wait: + return {"job_id": job_id, "job_dir": job_dir} + + print(f"[cluster] Waiting for training to complete (timeout: {wait_timeout}s)...") + poll_interval = 30 + for attempt in range(wait_timeout // poll_interval): + state = self._get_job_state(job_id) + if state == "COMPLETED": + print(f"[cluster] Training job {job_id} completed successfully.") + return {"job_id": job_id, "job_dir": job_dir, "state": "COMPLETED"} + if state in ("FAILED", "CANCELLED", "TIMEOUT", ""): + raise RuntimeError( + f"Training job {job_id} entered state '{state}'. " + f"Check logs: ssh {login} 'tail -100 {results_dir}/polar-train_{job_id}.out'" + ) + elapsed = (attempt + 1) * poll_interval + if elapsed % 120 == 0: + print(f"[cluster] Training still running... ({elapsed}s, state: {state})") + time.sleep(poll_interval) + + raise TimeoutError( + f"Training job {job_id} not completed after {wait_timeout}s. " + f"Job may still be running. 
Check: ssh {login} squeue -j {job_id}" + ) + + def _build_single_sif( + self, + sif_name: str, + def_content: str, + *, + force: bool = False, + srun_time: str = "00:30:00", + srun_mem: str = "32G", + ) -> str: + """Build a single SIF image on the cluster and return its path.""" + cfg = self.config + sif_path = f"{cfg.paths.sif_dir}/{sif_name}" + + # Check if SIF already exists (skip unless --force) + if not force: + try: + self._ssh_run(f"test -f '{sif_path}'", check=True) + print(f"[cluster] SIF exists, skipping: {sif_name}") + return sif_path + except subprocess.CalledProcessError: + pass # file doesn't exist, proceed + + print(f"[cluster] Building SIF: {sif_name}") + def_dir = f"{cfg.paths.polar_root}/tmp_defs" + remote_def = f"{def_dir}/{sif_name}.def" + self._ssh_run(f"mkdir -p '{def_dir}'") + self._ssh_run(f"cat > '{remote_def}' << 'POLAREOF'\n{def_content}\nPOLAREOF") + self._ssh_run(f"mkdir -p '{cfg.paths.sif_dir}'") + + force_flag = "--force" if force else "" + cache_dir = f"{cfg.paths.polar_root}/apptainer_cache" + path_prefix = "" + if cfg.paths.apptainer_bin_dir: + path_prefix = f"export PATH='{cfg.paths.apptainer_bin_dir}':$PATH && " + build_cmd = ( + f"{path_prefix}" + f"export APPTAINER_CACHEDIR='{cache_dir}' && mkdir -p '{cache_dir}' && " + f"apptainer build {force_flag} '{sif_path}' '{remote_def}'" + ) + account = cfg.slurm.account + try: + self._ssh_run( + f"srun --account={account} --partition=cpu_short --time={srun_time} " + f"--cpus-per-task=8 --mem={srun_mem} bash -c {shlex.quote(build_cmd)}" + ) + except subprocess.CalledProcessError: + print(f"[cluster] srun failed, trying direct build...") + self._ssh_run(build_cmd) + + self._ssh_run(f"rm -f '{remote_def}'") + print(f"[cluster] Built: {sif_path}") + return sif_path + + # ── Internal helpers ────────────────────────────────────────────────────── + + def _is_local(self) -> bool: + """Return True if we're already on the login node (sbatch available).""" + if not hasattr(self, "_local_cache"): + import shutil + self._local_cache = shutil.which("sbatch") is not None + return self._local_cache + + def _ssh_run( + self, + command: str, + *, + capture: bool = False, + check: bool = True, + ) -> str: + if self._is_local(): + cmd = ["bash", "-l", "-c", command] + else: + login = self.config.slurm.login_node + cmd = ["ssh", "-o", "ConnectTimeout=10", login, command] + result = subprocess.run( + cmd, + capture_output=capture, + text=True, + check=check, + ) + if capture: + return result.stdout + return "" + + def _rsync( + self, + src: str, + dst: str, + *, + exclude: list[str] | None = None, + extra_args: list[str] | None = None, + ) -> None: + cmd = ["rsync", "-avz", "--delete"] + for pattern in exclude or []: + cmd.extend(["--exclude", pattern]) + cmd.extend(extra_args or []) + cmd.extend([src, dst]) + subprocess.run(cmd, check=True) + + def _sync_code_to_cluster(self, repo_root: Path) -> None: + cfg = self.config + login = cfg.slurm.login_node + print(f"\n[cluster] Syncing code to {login}:{cfg.paths.code}/ ...") + self._ssh_run(f"mkdir -p '{cfg.paths.code}'") + if self._is_local(): + src = str(repo_root) + "/" + dst = cfg.paths.code + "/" + else: + src = str(repo_root) + "/" + dst = f"{login}:{cfg.paths.code}/" + self._rsync(src, dst, exclude=_RSYNC_EXCLUDES) + print("[cluster] Sync complete.") + + def _sync_swegym_cache(self) -> None: + """Ensure the SWE-Gym sample instance cache exists on the cluster.""" + cache_file = Path.home() / ".cache" / "polar" / "swegym_sample_10.json" + if not cache_file.exists(): + try: + from 
examples.swegym.sample_tasks import fetch_sample_instances + print("[cluster] Fetching SWE-Gym sample data from HuggingFace...") + fetch_sample_instances() + except Exception as exc: + print(f"[cluster] WARNING: Could not fetch SWE-Gym sample data: {exc}") + return + if cache_file.exists(): + if self._is_local(): + # Already on the login node — cache is in place, nothing to sync. + print("[cluster] SWE-Gym sample cache already present.") + else: + login = self.config.slurm.login_node + self._ssh_run("mkdir -p ~/.cache/polar/") + self._rsync(str(cache_file), f"{login}:~/.cache/polar/swegym_sample_10.json") + print("[cluster] SWE-Gym sample cache synced.") + + def _sync_swebench_cache(self) -> None: + """Ensure the SWE-bench Verified dataset cache exists on the cluster.""" + cache_file = Path.home() / ".cache" / "polar" / "swebench_verified.json" + if not cache_file.exists(): + try: + import importlib + spec = importlib.util.spec_from_file_location( + "dataset", + Path(__file__).resolve().parents[2] / "examples" / "swebench_verified" / "dataset.py", + ) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + print("[cluster] Fetching SWE-bench Verified dataset from HuggingFace...") + mod.load_swebench_verified() + except Exception as exc: + print(f"[cluster] WARNING: Could not fetch SWE-bench Verified data: {exc}") + return + if cache_file.exists(): + if self._is_local(): + print("[cluster] SWE-bench Verified cache already present.") + else: + login = self.config.slurm.login_node + self._ssh_run("mkdir -p ~/.cache/polar/") + self._rsync(str(cache_file), f"{login}:~/.cache/polar/swebench_verified.json") + print("[cluster] SWE-bench Verified cache synced.") + + def _build_sbatch_command(self) -> str: + cfg = self.config + export_vars = cfg.sbatch_export_vars() + export_str = "ALL," + ",".join(f"{k}={v}" for k, v in export_vars.items()) + + example = cfg.task.example + harness = cfg.task.harness + results_dir = cfg.paths.results + job_name = f"polar-{example}-{harness}" + sbatch_path = f"{cfg.paths.code}/src/polar/cluster/templates/polar_slurm.sbatch" + + parts = [ + "sbatch", + f"--account={cfg.slurm.account}", + f"--partition={cfg.slurm.partition}", + f"--nodes={cfg.resources.nodes}", + f"--gres=gpu:{cfg.resources.gpus_per_node}", + f"--cpus-per-task={cfg.resources.cpus_per_task}", + f"--mem={cfg.resources.mem}", + f"--time={cfg.resources.time}", + f"--job-name={job_name}", + f"--output={results_dir}/{job_name}_%j.out", + f"--error={results_dir}/{job_name}_%j.err", + f"--export={export_str}", + sbatch_path, + ] + return " ".join(parts) + + def _build_serve_sbatch_command(self) -> str: + cfg = self.config + export_vars = cfg.sbatch_serve_export_vars() + export_str = "ALL," + ",".join(f"{k}={v}" for k, v in export_vars.items()) + + results_dir = cfg.paths.results + job_name = "polar-serve" + sbatch_path = f"{cfg.paths.code}/src/polar/cluster/templates/polar_slurm_serve.sbatch" + + parts = [ + "sbatch", + f"--account={cfg.slurm.account}", + f"--partition={cfg.slurm.partition}", + f"--nodes={cfg.resources.nodes}", + f"--gres=gpu:{cfg.resources.gpus_per_node}", + f"--cpus-per-task={cfg.resources.cpus_per_task}", + f"--mem={cfg.resources.mem}", + f"--time={cfg.resources.time}", + f"--job-name={job_name}", + f"--output={results_dir}/{job_name}_%j.out", + f"--error={results_dir}/{job_name}_%j.err", + f"--export={export_str}", + sbatch_path, + ] + return " ".join(parts) + + def _build_train_sbatch_command(self) -> str: + cfg = self.config + export_vars = 
cfg.sbatch_train_export_vars() + export_str = "ALL," + ",".join(f"{k}={v}" for k, v in export_vars.items()) + + results_dir = cfg.paths.results + job_name = "polar-train" + sbatch_path = f"{cfg.paths.code}/src/polar/cluster/templates/polar_slurm_train.sbatch" + + parts = [ + "sbatch", + f"--account={cfg.slurm.account}", + f"--partition={cfg.slurm.partition}", + f"--nodes={cfg.resources.nodes}", + f"--gres=gpu:{cfg.resources.gpus_per_node}", + f"--cpus-per-task={cfg.resources.cpus_per_task}", + f"--mem={cfg.resources.mem}", + f"--time={cfg.resources.time}", + f"--job-name={job_name}", + f"--output={results_dir}/{job_name}_%j.out", + f"--error={results_dir}/{job_name}_%j.err", + # Quote the export string — values like MODEL_ARGS contain spaces + f"--export={shlex.quote(export_str)}", + sbatch_path, + ] + return " ".join(parts) + + def _get_job_state(self, job_id: str) -> str: + """Query SLURM for the current state of a job.""" + out = self._ssh_run( + f"squeue -j {job_id} --format='%T' --noheader 2>/dev/null || true", + capture=True, + ) + return out.strip() + + def _find_topology(self, job_id: str) -> str: + """Discover the topology.yaml path for a running serve job.""" + cfg = self.config + results_dir = cfg.paths.results + + # Try the sentinel file first (written by polar_slurm_serve.sbatch) + sentinel = f"{results_dir}/polar-serve_{job_id}/.services_ready" + content = self._ssh_run( + f"cat '{sentinel}' 2>/dev/null || true", + capture=True, + ) + if "TOPOLOGY=" in content: + for line in content.strip().splitlines(): + if line.startswith("TOPOLOGY="): + return line.split("=", 1)[1] + + # Fallback: search for topology.yaml matching the job ID + out = self._ssh_run( + f"ls '{results_dir}'/*_{job_id}/topology.yaml 2>/dev/null || true", + capture=True, + ) + path = out.strip().splitlines()[0] if out.strip() else "" + if path: + return path + + raise FileNotFoundError( + f"Cannot find topology for job {job_id}. " + f"Is the serve job running? 
Check: polar cluster status -c --job-id {job_id}" + ) + + def _print_serve_summary(self) -> None: + cfg = self.config + lines = [ + "=" * 65, + "Polar SLURM Serve (services only)", + "=" * 65, + f" Model: {cfg.model.name}", + f" TP size: {cfg.model.tensor_parallel_size}", + " " + "-" * 60, + f" Login node: {cfg.slurm.login_node}", + f" Account: {cfg.slurm.account}", + f" Partition: {cfg.slurm.partition}", + f" Nodes: {cfg.resources.nodes}", + f" GPUs/node: {cfg.resources.gpus_per_node}", + f" Time limit: {cfg.resources.time}", + f" Workspace: {cfg.paths.workspace}", + "=" * 65, + ] + print("\n".join(lines)) + + def _print_train_summary(self) -> None: + cfg = self.config + t = cfg.train + lines = [ + "=" * 65, + "Polar SLURM Training Job (Slime + Megatron GRPO)", + "=" * 65, + f" HF checkpoint: {t.hf_checkpoint}", + f" Actor GPUs: {t.actor_gpus} (TP={t.tp_size})", + f" Rollout GPUs: {t.rollout_gpus}", + f" Num rollouts: {t.num_rollouts}", + f" Batch: {t.rollout_batch_size} prompts x {t.n_samples_per_prompt} samples", + f" Global batch: {t.global_batch_size}", + " " + "-" * 60, + f" Login node: {cfg.slurm.login_node}", + f" Account: {cfg.slurm.account}", + f" Partition: {cfg.slurm.partition}", + f" Nodes: {cfg.resources.nodes}", + f" GPUs/node: {cfg.resources.gpus_per_node}", + f" Time limit: {cfg.resources.time}", + f" Workspace: {cfg.paths.workspace}", + "=" * 65, + ] + print("\n".join(lines)) + + def _print_summary(self) -> None: + cfg = self.config + lines = [ + "=" * 65, + "Polar SLURM Job Submission", + "=" * 65, + f" Example: {cfg.task.example}", + f" Harness: {cfg.task.harness}", + f" Model: {cfg.model.name}", + f" TP size: {cfg.model.tensor_parallel_size}", + f" Rollouts: {cfg.task.num_rollouts}", + f" Timeout: {cfg.task.timeout_seconds}s", + " " + "-" * 60, + f" Login node: {cfg.slurm.login_node}", + f" Account: {cfg.slurm.account}", + f" Partition: {cfg.slurm.partition}", + f" Nodes: {cfg.resources.nodes}", + f" GPUs/node: {cfg.resources.gpus_per_node}", + f" Time limit: {cfg.resources.time}", + f" Workspace: {cfg.paths.workspace}", + "=" * 65, + ] + print("\n".join(lines)) + + +# ── SIF definition file generation ──────────────────────────────────────────── + + +def _sanitize_instance_id(instance_id: str) -> str: + """Normalize instance ID for use in filenames.""" + normalized = instance_id.strip().lower() + normalized = re.sub(r"[^a-z0-9_.-]+", "-", normalized.replace("__", "--")) + normalized = re.sub(r"-{2,}", "-", normalized) + return normalized.strip("-") + + +# Maps harness name to (base_image, install_commands) +_CALCULATOR_HARNESS_DEFS: dict[str, tuple[str, list[str]]] = { + "opencode": ( + "node:22-bookworm-slim", + [ + "apt-get update && apt-get install -y --no-install-recommends bash ca-certificates curl git python-is-python3 python3 && rm -rf /var/lib/apt/lists/*", + "npm install -g opencode-ai@latest", + ], + ), + "codex": ( + "node:22-bookworm-slim", + [ + "apt-get update && apt-get install -y --no-install-recommends bash ca-certificates curl git python-is-python3 python3 && rm -rf /var/lib/apt/lists/*", + "npm install -g @openai/codex@latest", + ], + ), + "claude_code": ( + "node:22-bookworm-slim", + [ + "apt-get update && apt-get install -y --no-install-recommends bash ca-certificates curl git python-is-python3 python3 && rm -rf /var/lib/apt/lists/*", + "npm install -g @anthropic-ai/claude-code@latest", + ], + ), + "gemini_cli": ( + "node:22-bookworm-slim", + [ + "apt-get update && apt-get install -y --no-install-recommends bash ca-certificates curl git 
python-is-python3 python3 && rm -rf /var/lib/apt/lists/*", + "npm install -g @google/gemini-cli@latest", + ], + ), + "qwen_code": ( + "node:22-bookworm-slim", + [ + "apt-get update && apt-get install -y --no-install-recommends bash ca-certificates curl git python-is-python3 python3 && rm -rf /var/lib/apt/lists/*", + "npm install -g @qwen-code/qwen-code@latest", + ], + ), + "swe_agent": ( + "python:3.12-slim", + [ + "apt-get update && apt-get install -y --no-install-recommends bash ca-certificates curl git build-essential && rm -rf /var/lib/apt/lists/*", + "pip install --no-cache-dir 'sweagent[all] @ git+https://github.com/SWE-agent/SWE-agent.git'", + "pip install --no-cache-dir tree-sitter==0.21.3 tree-sitter-languages", + "SITE=$(python -c 'import site; print(site.getsitepackages()[0])') && " + "git clone --depth 1 https://github.com/SWE-agent/SWE-agent.git /tmp/swe-agent-src && " + "cp -r /tmp/swe-agent-src/config $SITE/config && " + "cp -r /tmp/swe-agent-src/tools $SITE/tools && " + "mkdir -p $SITE/trajectories && " + "rm -rf /tmp/swe-agent-src", + ], + ), + "openhands_sdk": ( + "python:3.12-slim", + [ + "apt-get update && apt-get install -y --no-install-recommends bash ca-certificates curl git && rm -rf /var/lib/apt/lists/*", + "pip install --no-cache-dir openhands-sdk openhands-tools", + ], + ), +} + + +def _generate_def_file( + repo_root: Path, + example: str, + harness: str, + instance_id: str | None = None, +) -> str | None: + """Generate an Apptainer ``.def`` file for building a harness SIF.""" + if example == "calculator": + spec = _CALCULATOR_HARNESS_DEFS.get(harness) + if spec is None: + return None + base_image, commands = spec + post = "\n ".join(commands) + return ( + f"Bootstrap: docker\n" + f"From: {base_image}\n" + f"\n" + f"%post\n" + f" {post}\n" + f" mkdir -p /polar/session/workspace /polar/session/logs/agent\n" + f"\n" + f"%environment\n" + f" export DEBIAN_FRONTEND=noninteractive\n" + f"\n" + f"%labels\n" + f" io.polar.example {example}\n" + f" io.polar.harness {harness}\n" + ) + if example == "swegym": + return _generate_swegym_def(harness, instance_id) + if example == "swebench_verified": + return _generate_swebench_def(harness, instance_id) + return None + + +def _swegym_base_image(instance_id: str) -> str: + """Derive the SWE-Gym eval base image from an instance ID.""" + suffix = instance_id.replace("__", "_s_").lower() + return f"docker.io/xingyaoww/sweb.eval.x86_64.{suffix}:latest" + + +# SWE-Gym harness install commands (layered on top of per-instance base image). +# Base images already have conda + testbed env; we add the agent harness tools. 
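+# For example, instance "getmoto__moto-7365" maps via _swegym_base_image to
+# docker.io/xingyaoww/sweb.eval.x86_64.getmoto_s_moto-7365:latest.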
+_SWEGYM_HARNESS_DEFS: dict[str, list[str]] = { + "swe_agent": [ + "/opt/miniconda3/bin/conda create -y -n polar-sweagent python=3.11 pip", + "/opt/miniconda3/envs/polar-sweagent/bin/python -m pip install --no-cache-dir " + "'git+https://github.com/SWE-agent/SWE-agent.git'", + "/opt/miniconda3/envs/polar-sweagent/bin/python -m pip install --no-cache-dir " + "tree-sitter==0.21.3 tree-sitter-languages", + "SITE=$(/opt/miniconda3/envs/polar-sweagent/bin/python -c " + "\"import site; print(site.getsitepackages()[0])\") && " + "git clone --depth 1 https://github.com/SWE-agent/SWE-agent.git /tmp/swe-agent-src && " + "cp -r /tmp/swe-agent-src/config $SITE/config && " + "cp -r /tmp/swe-agent-src/tools $SITE/tools && " + "mkdir -p $SITE/trajectories && " + "/opt/miniconda3/bin/conda clean -afy && " + "rm -rf /tmp/swe-agent-src", + ], +} + + +def _generate_swegym_def( + harness: str, + instance_id: str | None, +) -> str | None: + """Generate an Apptainer .def for a SWE-Gym per-instance SIF.""" + if instance_id is None: + return None + spec = _SWEGYM_HARNESS_DEFS.get(harness) + if spec is None: + return None + base_image = _swegym_base_image(instance_id) + post = "\n ".join(spec) + return ( + f"Bootstrap: docker\n" + f"From: {base_image}\n" + f"\n" + f"%post\n" + f" {post}\n" + f" mkdir -p /polar/session/workspace /polar/session/logs/agent\n" + f"\n" + f"%environment\n" + f" export DEBIAN_FRONTEND=noninteractive\n" + f" export PATH=/opt/miniconda3/envs/testbed/bin:" + f"/opt/miniconda3/envs/polar-sweagent/bin:$PATH\n" + f"\n" + f"%labels\n" + f" io.polar.example swegym\n" + f" io.polar.harness {harness}\n" + f" io.polar.instance_id {instance_id}\n" + ) + + +# ── SWE-bench Verified SIF definitions ────────────────────────────────────── + +_SWEBENCH_NODE_INSTALL = ( + "apt-get update && " + "apt-get install -y --no-install-recommends ca-certificates curl gnupg && " + "curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && " + "apt-get install -y --no-install-recommends nodejs && " + "apt-get clean && rm -rf /var/lib/apt/lists/*" +) + +_SWEBENCH_HARNESS_DEFS: dict[str, list[str]] = { + "opencode": [ + _SWEBENCH_NODE_INSTALL, + "npm install -g opencode-ai@latest", + ], + "codex": [ + _SWEBENCH_NODE_INSTALL, + "npm install -g @openai/codex@latest", + ], + "claude_code": [ + _SWEBENCH_NODE_INSTALL, + "npm install -g @anthropic-ai/claude-code@latest", + ], + "gemini_cli": [ + _SWEBENCH_NODE_INSTALL, + "npm install -g @google/gemini-cli@latest", + ], + "qwen_code": [ + _SWEBENCH_NODE_INSTALL, + "npm install -g @qwen-code/qwen-code@latest", + ], + "swe_agent": [ + "apt-get update && apt-get install -y --no-install-recommends " + "bash ca-certificates curl git build-essential && rm -rf /var/lib/apt/lists/*", + "/opt/miniconda3/bin/conda create -y -n polar-sweagent python=3.11 pip", + "/opt/miniconda3/envs/polar-sweagent/bin/python -m pip install --no-cache-dir " + "'git+https://github.com/SWE-agent/SWE-agent.git'", + "/opt/miniconda3/envs/polar-sweagent/bin/python -m pip install --no-cache-dir " + "tree-sitter==0.21.3 tree-sitter-languages", + "SITE=$(/opt/miniconda3/envs/polar-sweagent/bin/python -c " + "\"import site; print(site.getsitepackages()[0])\") && " + "git clone --depth 1 https://github.com/SWE-agent/SWE-agent.git /tmp/swe-agent-src && " + "cp -r /tmp/swe-agent-src/config $SITE/config && " + "cp -r /tmp/swe-agent-src/tools $SITE/tools && " + "mkdir -p $SITE/trajectories && " + "/opt/miniconda3/bin/conda clean -afy && " + "rm -rf /tmp/swe-agent-src", + ], + "openhands_sdk": [ + "apt-get update 
&& apt-get install -y --no-install-recommends " + "bash ca-certificates curl git && rm -rf /var/lib/apt/lists/*", + "/opt/miniconda3/bin/conda create -y -n polar-openhands python=3.12 pip", + "/opt/miniconda3/envs/polar-openhands/bin/python -m pip install --no-cache-dir " + "openhands-sdk openhands-tools", + "/opt/miniconda3/bin/conda clean -afy", + ], +} + + +def _swebench_base_image(instance_id: str) -> str: + """Derive the SWE-bench eval base image from an instance ID.""" + suffix = instance_id.replace("__", "_s_").lower() + return f"docker.io/xingyaoww/sweb.eval.x86_64.{suffix}:latest" + + +def _generate_swebench_def( + harness: str, + instance_id: str | None, +) -> str | None: + """Generate an Apptainer .def for a SWE-bench Verified per-instance SIF.""" + if instance_id is None: + return None + spec = _SWEBENCH_HARNESS_DEFS.get(harness) + if spec is None: + return None + base_image = _swebench_base_image(instance_id) + post = "\n ".join(spec) + + # Build PATH: always include testbed; add harness-specific conda envs + path_parts = ["/opt/miniconda3/envs/testbed/bin"] + if harness == "swe_agent": + path_parts.insert(0, "/opt/miniconda3/envs/polar-sweagent/bin") + elif harness == "openhands_sdk": + path_parts.insert(0, "/opt/miniconda3/envs/polar-openhands/bin") + path_env = ":".join(path_parts) + + return ( + f"Bootstrap: docker\n" + f"From: {base_image}\n" + f"\n" + f"%post\n" + f" {post}\n" + f" mkdir -p /polar/session/workspace /polar/session/logs/agent\n" + f"\n" + f"%environment\n" + f" export DEBIAN_FRONTEND=noninteractive\n" + f" export PATH={path_env}:$PATH\n" + f"\n" + f"%labels\n" + f" io.polar.example swebench_verified\n" + f" io.polar.harness {harness}\n" + f" io.polar.instance_id {instance_id}\n" + ) + + +def _generate_train_def(code_path: str) -> str: + """Generate an Apptainer .def for the Slime+Megatron GRPO training SIF. + + Uses slimerl/slime Docker image which ships sglang v0.5.9, Megatron-LM, + flash-attn, transformer_engine, apex, mbridge, and all training deps + pre-built. We only add Polar and apply Polar's Slime patch on top. + """ + return ( + "Bootstrap: docker\n" + "From: slimerl/slime:nightly-dev-20260329a\n" + "\n" + "%files\n" + f" {code_path} /opt/polar\n" + "\n" + "%post\n" + " # Install Polar on top of the Slime image\n" + " pip install -e /opt/polar\n" + "\n" + " # Apply Slime patch (adds external advantage estimator for Polar)\n" + " bash /opt/polar/scripts/patch/patch_slime.sh\n" + "\n" + "%environment\n" + ' export PYTHONPATH="/opt/polar/src:/root/Megatron-LM:${PYTHONPATH:-}"\n' + " export CUDA_DEVICE_MAX_CONNECTIONS=1\n" + " export PYTHONNOUSERSITE=1\n" + ' export LD_LIBRARY_PATH="/usr/local/cuda/compat:${LD_LIBRARY_PATH:-}"\n' + "\n" + "%labels\n" + " io.polar.example train\n" + " io.polar.framework slime-grpo\n" + ) diff --git a/src/polar/cluster/tasks.py b/src/polar/cluster/tasks.py new file mode 100644 index 00000000..373ec2f3 --- /dev/null +++ b/src/polar/cluster/tasks.py @@ -0,0 +1,632 @@ +"""Build and submit Polar task payloads on SLURM. + +Called inside the SLURM job to construct task JSON with correct absolute SIF +image paths and submit them through the Polar CLI. 
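+Each task payload is written to ``request.json`` and submitted with
+``python -m polar.cli submit request.json -c topology.yaml --json``; the
+response is saved alongside it as ``response.json``.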
+ +Usage (from sbatch script):: + + python -m polar.cluster.tasks --example calculator --harness opencode \\ + --topology /path/to/topology.yaml --sif-dir /lustre/.../sif_images +""" + +from __future__ import annotations + +import argparse +import json +import os +import re +import subprocess +import sys +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + + +# ── Helpers ─────────────────────────────────────────────────────────────────── + +def _write_json(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2, ensure_ascii=True, sort_keys=True)) + + +def _submit_task(request_path: Path, topology_path: str) -> dict[str, Any]: + """Submit a task JSON via ``polar submit`` and return the response.""" + command = [ + sys.executable, "-m", "polar.cli", + "submit", str(request_path), + "-c", topology_path, + "--json", + ] + print(f"[tasks] Running: {' '.join(command)}") + completed = subprocess.run(command, check=True, capture_output=True, text=True) + return json.loads(completed.stdout) + + +def _summarize_result(response: dict[str, Any]) -> dict[str, Any]: + sessions = response.get("results") or [] + completed = sum(1 for s in sessions if s.get("status") == "COMPLETED") + reward_one = 0 + for s in sessions: + traces = (s.get("trajectory") or {}).get("traces") or [] + if traces and traces[-1].get("reward") == 1.0: + reward_one += 1 + return { + "total_sessions": len(sessions), + "completed_sessions": completed, + "reward_one_sessions": reward_one, + } + + +def _sanitize_instance_id(instance_id: str) -> str: + normalized = instance_id.strip().lower() + normalized = re.sub(r"[^a-z0-9_.-]+", "-", normalized.replace("__", "--")) + normalized = re.sub(r"-{2,}", "-", normalized) + return normalized.strip("-") + + +# ── Calculator ──────────────────────────────────────────────────────────────── + +CALCULATOR_INSTRUCTION = """\ +Write a Python calculator with no extra imports. Support arithmetic expressions over integers and +parentheses. Save it as `calculator.py`. + +Expose a `Calculator` class that can be called with a string expression. 
+ +Example: + +from calculator import Calculator +cal = Calculator() +print(cal("4*3-3")) # should print 9""" + +CALCULATOR_TEST = """\ +from calculator import Calculator + +cal = Calculator() + +assert cal("4*3-3") == 9 +assert cal("(2+3)*4") == 20 +assert cal("10/2+7") == 12 +assert cal("18-(3*4)") == 6 +assert cal(" 8 + 2 * 5 ") == 18 + +print("calculator tests passed") +""" + + +def build_calculator_task( + harness: str, + sif_dir: str, + output_dir: str, + *, + agent_model: str = "openai/gpt-4o", + num_rollouts: int = 4, + timeout_seconds: float = 900.0, + batch_id: str = "", +) -> dict[str, Any]: + """Build a calculator task payload.""" + sif_path = os.path.join(sif_dir, f"calculator-{harness}.sif") + if not os.path.isfile(sif_path): + raise FileNotFoundError( + f"Calculator SIF not found: {sif_path}\n" + f"Build it with: polar cluster build-sif --example calculator --harness {harness}" + ) + + test_dir = Path(output_dir) / "assets" + test_dir.mkdir(parents=True, exist_ok=True) + test_file = test_dir / "test_calculator.py" + test_file.write_text(CALCULATOR_TEST) + + return { + "task_id": f"calculator-{harness}-slurm-{batch_id}", + "instruction": CALCULATOR_INSTRUCTION, + "num_rollouts": num_rollouts, + "timeout_seconds": timeout_seconds, + "runtime": { + "backend": "apptainer", + "image": sif_path, + "prepare": [ + { + "type": "exec", + "command": ( + "mkdir -p /polar/session/workspace /polar/session/logs/agent && " + "cd /polar/session/workspace && git init && " + "git config user.email 'polar@test' && " + "git config user.name 'Polar'" + ), + }, + { + "type": "upload_file", + "source": str(test_file.resolve()), + "target": "/polar/session/workspace/test_calculator.py", + }, + { + "type": "exec", + "command": "cd /polar/session/workspace && git add -A && git commit -m 'initial'", + }, + ], + "env": {}, + "network": "host", + "workdir": "/polar/session/workspace", + # swe_agent's swerex does chown inside the container; Apptainer + # needs --fakeroot to support ownership changes on overlayFS. 
+ **({"kwargs": {"fakeroot": True}} if harness == "swe_agent" else {}), + }, + "agent": { + "harness": harness, + "model_name": agent_model, + "settings": {}, + "env": {}, + }, + "builder": {"strategy": "prefix_merging"}, + "evaluator": { + "strategy": "swegym_git_diff", + "config": { + "repo_dir": "/polar/session/workspace", + "patch_command": ( + "cd /polar/session/workspace && git add -A && git diff --cached --binary" + ), + "test_command": ( + "cd /polar/session/workspace && python3 test_calculator.py && " + "echo 'PASSED test_calculator'" + ), + "test_timeout": 60.0, + "expected_output_json": {"test_calculator": "PASSED"}, + }, + "refresh_runtime": False, + }, + } + + +# ── SWE-Gym ────────────────────────────────────────────────────────────────── + +SWEGYM_SAMPLE = [ + {"instance_id": "getmoto__moto-7365", "repo": "getmoto/moto"}, + {"instance_id": "python__mypy-10392", "repo": "python/mypy"}, + {"instance_id": "conan-io__conan-13721", "repo": "conan-io/conan"}, + {"instance_id": "iterative__dvc-1809", "repo": "iterative/dvc"}, + {"instance_id": "dask__dask-10441", "repo": "dask/dask"}, + {"instance_id": "pydantic__pydantic-8072", "repo": "pydantic/pydantic"}, + {"instance_id": "pandas-dev__pandas-58335", "repo": "pandas-dev/pandas"}, + {"instance_id": "facebookresearch__hydra-1783", "repo": "facebookresearch/hydra"}, + {"instance_id": "bokeh__bokeh-13636", "repo": "bokeh/bokeh"}, + {"instance_id": "Project-MONAI__MONAI-2238", "repo": "Project-MONAI/MONAI"}, +] + +SWEGYM_PREPARE = ( + "rm -rf /polar/session/workspace && " + "mkdir -p /polar/session/logs/agent /polar/session/workspace /root/.venv/bin && " + "cp -a /testbed/. /polar/session/workspace/ && " + # swerex's shutil.copytree fails on dangling symlinks (e.g. bokeh repo) + "find /polar/session/workspace -xtype l -delete 2>/dev/null; " + "ln -sf /opt/miniconda3/envs/testbed/bin/python /root/.venv/bin/python && " + "ln -sf /opt/miniconda3/envs/testbed/bin/python /root/.venv/bin/python3 && " + "git config --global core.pager '' && " + "cd /polar/session/workspace && git reset --hard" +) + + +def build_swegym_task( + harness: str, + sif_dir: str, + instance: dict[str, Any], + *, + agent_model: str = "openai/gpt-4o", + num_rollouts: int = 4, + timeout_seconds: float = 900.0, + batch_id: str = "", +) -> dict[str, Any]: + """Build a SWE-Gym task payload for a single instance.""" + instance_id = instance["instance_id"] + sif_name = f"swegym-{harness}-{_sanitize_instance_id(instance_id)}.sif" + sif_path = os.path.join(sif_dir, sif_name) + + if not os.path.isfile(sif_path): + raise FileNotFoundError(f"SWE-Gym SIF not found: {sif_path}") + + agent_settings: dict[str, Any] = {} + agent_env: dict[str, str] = {} + if harness == "swe_agent": + agent_settings = { + "repo_path": "/polar/session/workspace", + "shell_preamble": ( + "source /opt/miniconda3/etc/profile.d/conda.sh && " + "conda activate polar-sweagent && " + "export PATH=/opt/miniconda3/envs/testbed/bin:$PATH" + ), + } + elif harness in ("openhands_sdk", "openhands"): + agent_env = {"WORKSPACE_BASE": "/polar/session/workspace"} + + return { + "task_id": f"swegym-{harness}-{_sanitize_instance_id(instance_id)}-{batch_id}", + "instruction": str(instance.get("problem_statement", "")).strip(), + "num_rollouts": num_rollouts, + "timeout_seconds": timeout_seconds, + "runtime": { + "backend": "apptainer", + "image": sif_path, + "prepare": [{"type": "exec", "command": SWEGYM_PREPARE}], + "env": {}, + "network": "host", + "workdir": "/polar/session/workspace", + **({"kwargs": {"fakeroot": 
True}} if harness == "swe_agent" else {}), + }, + "agent": { + "harness": harness, + "model_name": agent_model, + "settings": agent_settings, + "env": agent_env, + }, + "builder": {"strategy": "prefix_merging"}, + "evaluator": { + "strategy": "swegym_git_diff", + "config": { + "repo_dir": "/testbed", + "patch_command": "cd /polar/session/workspace && git add -A && git diff --cached --binary --submodule=diff", + "instance": instance, + }, + "refresh_runtime": False, + }, + } + + +# ── SWE-bench Verified ──────────────────────────────────────────────────────── + +SWEBENCH_PREPARE_BASE = ( + "rm -rf /polar/session/workspace && " + "mkdir -p /polar/session/logs/agent /polar/session/workspace /root/.venv/bin && " + "cp -a /testbed/. /polar/session/workspace/ && " + "find /polar/session/workspace -xtype l -delete 2>/dev/null; " + "ln -sf /opt/miniconda3/envs/testbed/bin/python /root/.venv/bin/python && " + "ln -sf /opt/miniconda3/envs/testbed/bin/python /root/.venv/bin/python3 && " + "git config --global core.pager '' && " + "cd /polar/session/workspace && git reset --hard; true" +) + + +def build_swebench_task( + harness: str, + sif_dir: str, + instance: dict[str, Any], + *, + agent_model: str = "openai/gpt-4o", + num_rollouts: int = 1, + timeout_seconds: float = 3600.0, + batch_id: str = "", +) -> dict[str, Any]: + """Build a SWE-bench Verified task payload for a single instance.""" + instance_id = instance["instance_id"] + sif_name = f"swebench-{harness}-{_sanitize_instance_id(instance_id)}.sif" + sif_path = os.path.join(sif_dir, sif_name) + + if not os.path.isfile(sif_path): + raise FileNotFoundError(f"SWE-bench SIF not found: {sif_path}") + + runtime_env: dict[str, str] = {} + if harness == "opencode": + runtime_env["OPENCODE_FAKE_VCS"] = "git" + + exclude_patterns: list[str] = [] + if harness == "claude_code": + exclude_patterns.extend([".claude/**", "**/.claude/**"]) + + agent_settings: dict[str, Any] = {} + agent_env: dict[str, str] = {} + if harness == "swe_agent": + agent_settings = { + "repo_path": "/polar/session/workspace", + "shell_preamble": ( + "source /opt/miniconda3/etc/profile.d/conda.sh && " + "conda activate polar-sweagent && " + "export PATH=/opt/miniconda3/envs/testbed/bin:$PATH" + ), + } + elif harness in ("openhands_sdk", "openhands"): + agent_env = {"WORKSPACE_BASE": "/polar/session/workspace"} + + runtime_kwargs: dict[str, Any] = {} + if harness == "swe_agent": + runtime_kwargs["fakeroot"] = True + + return { + "task_id": f"swebench-{harness}-{_sanitize_instance_id(instance_id)}-{batch_id}", + "instruction": str(instance.get("problem_statement", "")).strip(), + "num_rollouts": num_rollouts, + "timeout_seconds": timeout_seconds, + "runtime": { + "backend": "apptainer", + "image": sif_path, + "prepare": [{"type": "exec", "command": SWEBENCH_PREPARE_BASE}], + "env": runtime_env, + "network": "host", + "workdir": "/polar/session/workspace", + **({"kwargs": runtime_kwargs} if runtime_kwargs else {}), + }, + "agent": { + "harness": harness, + "model_name": agent_model, + "settings": agent_settings, + "env": agent_env, + }, + "builder": {"strategy": "prefix_merging"}, + "evaluator": { + "strategy": "swegym_git_diff", + "config": { + "repo_dir": "/testbed", + "patch_command": ( + "cd /polar/session/workspace && " + "git add -A && git diff --cached --binary" + ), + "instance": instance, + **({"exclude_patterns": exclude_patterns} if exclude_patterns else {}), + }, + "refresh_runtime": False, + }, + } + + +# ── Runner functions 
────────────────────────────────────────────────────────── + +def run_calculator(args: argparse.Namespace) -> int: + batch_id = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") + output_dir = Path(args.output_dir) + + payload = build_calculator_task( + args.harness, + args.sif_dir, + args.output_dir, + agent_model=args.agent_model, + num_rollouts=args.num_rollouts, + timeout_seconds=args.timeout_seconds, + batch_id=batch_id, + ) + request_path = output_dir / "request.json" + response_path = output_dir / "response.json" + _write_json(request_path, payload) + print(f"[calculator] Wrote request to {request_path}") + + if args.dry_run: + print("[calculator] Dry run — not submitting.") + return 0 + + result = _submit_task(request_path, args.topology) + _write_json(response_path, result) + summary = _summarize_result(result) + print(f"[calculator] Done: {summary['reward_one_sessions']}/{summary['total_sessions']} reward=1.0") + print(f"[calculator] Response: {response_path}") + return 0 + + +def run_swegym(args: argparse.Namespace) -> int: + batch_id = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") + output_dir = Path(args.output_dir) + + instances = SWEGYM_SAMPLE + if args.instance_id: + wanted = set(args.instance_id) + instances = [i for i in instances if i["instance_id"] in wanted] + missing = wanted - {i["instance_id"] for i in instances} + if missing: + print(f"[swegym] WARNING: Unknown instance_ids: {missing}") + instances = instances[: args.max_tasks] + + if not instances: + print("[swegym] No instances selected.") + return 1 + + # Try to load full instance data from cache + cache_path = Path.home() / ".cache" / "polar" / "swegym_sample_10.json" + full_instances: dict[str, dict[str, Any]] = {} + if cache_path.exists(): + cached = json.loads(cache_path.read_text()) + full_instances = {str(i.get("instance_id")): i for i in cached} + + manifest = { + "batch_id": batch_id, + "harness": args.harness, + "model_name": args.model_name, + "num_rollouts": args.num_rollouts, + "tasks": [i["instance_id"] for i in instances], + } + _write_json(output_dir / "manifest.json", manifest) + + summaries: list[dict[str, Any]] = [] + for instance_meta in instances: + instance_id = instance_meta["instance_id"] + instance = {**instance_meta} + if instance_id in full_instances: + instance = full_instances[instance_id] + + if not instance.get("problem_statement"): + print( + f"[swegym] WARNING: No problem_statement for {instance_id}. " + f"Run: python examples/swegym/sample_tasks.py to populate cache." 
+ ) + continue + + task_dir = output_dir / _sanitize_instance_id(instance_id) + request_path = task_dir / "request.json" + response_path = task_dir / "response.json" + + try: + payload = build_swegym_task( + args.harness, + args.sif_dir, + instance, + agent_model=args.agent_model, + num_rollouts=args.num_rollouts, + timeout_seconds=args.timeout_seconds, + batch_id=batch_id, + ) + except FileNotFoundError as e: + print(f"[swegym] Skipping {instance_id}: {e}") + continue + + _write_json(request_path, payload) + print(f"[swegym] [{instance_id}] Wrote request to {request_path}") + + if args.dry_run: + summaries.append({"instance_id": instance_id, "dry_run": True}) + continue + + try: + result = _submit_task(request_path, args.topology) + _write_json(response_path, result) + summary = { + "instance_id": instance_id, + "task_id": payload["task_id"], + **_summarize_result(result), + } + summaries.append(summary) + print( + f"[swegym] [{instance_id}] Done: " + f"reward_1={summary['reward_one_sessions']}/{summary['total_sessions']}" + ) + except subprocess.CalledProcessError as e: + print(f"[swegym] [{instance_id}] FAILED: {e}") + if e.stderr: + print(f" stderr: {e.stderr[:500]}") + summaries.append({"instance_id": instance_id, "error": str(e)}) + + _write_json(output_dir / "summary.json", summaries) + print(f"[swegym] Batch summary: {output_dir / 'summary.json'}") + return 0 + + +def run_swebench(args: argparse.Namespace) -> int: + batch_id = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") + output_dir = Path(args.output_dir) + + cache_path = Path.home() / ".cache" / "polar" / "swebench_verified.json" + if not cache_path.exists(): + print( + f"[swebench] ERROR: Dataset cache not found at {cache_path}\n" + f" Populate it with: python -c \"" + f"from examples.swebench_verified.dataset import load_swebench_verified; " + f"load_swebench_verified()\"" + ) + return 1 + + all_instances = json.loads(cache_path.read_text()) + instances_by_id: dict[str, dict[str, Any]] = { + str(i["instance_id"]): i for i in all_instances + } + + if args.instance_id: + wanted = set(args.instance_id) + instances = [instances_by_id[iid] for iid in wanted if iid in instances_by_id] + missing = wanted - {str(i["instance_id"]) for i in instances} + if missing: + print(f"[swebench] WARNING: Unknown instance_ids: {missing}") + else: + instances = all_instances + instances = instances[: args.max_tasks] + + if not instances: + print("[swebench] No instances selected.") + return 1 + + manifest = { + "batch_id": batch_id, + "harness": args.harness, + "num_rollouts": args.num_rollouts, + "tasks": [str(i["instance_id"]) for i in instances], + } + _write_json(output_dir / "manifest.json", manifest) + + summaries: list[dict[str, Any]] = [] + for instance in instances: + instance_id = str(instance["instance_id"]) + task_dir = output_dir / _sanitize_instance_id(instance_id) + request_path = task_dir / "request.json" + response_path = task_dir / "response.json" + + if not instance.get("problem_statement"): + print(f"[swebench] WARNING: No problem_statement for {instance_id}, skipping.") + continue + + try: + payload = build_swebench_task( + args.harness, + args.sif_dir, + instance, + agent_model=args.agent_model, + num_rollouts=args.num_rollouts, + timeout_seconds=args.timeout_seconds, + batch_id=batch_id, + ) + except FileNotFoundError as e: + print(f"[swebench] Skipping {instance_id}: {e}") + continue + + _write_json(request_path, payload) + print(f"[swebench] [{instance_id}] Wrote request to {request_path}") + + if args.dry_run: + 
summaries.append({"instance_id": instance_id, "dry_run": True}) + continue + + try: + result = _submit_task(request_path, args.topology) + _write_json(response_path, result) + summary = { + "instance_id": instance_id, + "task_id": payload["task_id"], + **_summarize_result(result), + } + summaries.append(summary) + print( + f"[swebench] [{instance_id}] Done: " + f"reward_1={summary['reward_one_sessions']}/{summary['total_sessions']}" + ) + except subprocess.CalledProcessError as e: + print(f"[swebench] [{instance_id}] FAILED: {e}") + if e.stderr: + print(f" stderr: {e.stderr[:500]}") + summaries.append({"instance_id": instance_id, "error": str(e)}) + + _write_json(output_dir / "summary.json", summaries) + print(f"[swebench] Batch summary: {output_dir / 'summary.json'}") + return 0 + + +# ── CLI entry point ─────────────────────────────────────────────────────────── + +def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Build and submit Polar tasks on SLURM") + parser.add_argument("--example", required=True, choices=["calculator", "swegym", "swebench_verified"]) + parser.add_argument("--harness", default="opencode") + parser.add_argument("--topology", required=True, help="Path to topology.yaml") + parser.add_argument("--sif-dir", required=True, help="Directory containing SIF images") + parser.add_argument("--output-dir", default="./task_outputs") + parser.add_argument("--num-rollouts", type=int, default=4) + parser.add_argument("--timeout-seconds", type=float, default=900.0) + parser.add_argument( + "--model-name", + default=os.environ.get("MODEL_NAME", "Qwen/Qwen3.5-27B"), + ) + parser.add_argument( + "--agent-model", + default=os.environ.get("AGENT_MODEL", "openai/gpt-4o"), + ) + parser.add_argument("--max-tasks", type=int, default=10) + parser.add_argument("--instance-id", action="append", default=[]) + parser.add_argument("--dry-run", action="store_true") + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = _parse_args(argv) + if args.example == "calculator": + return run_calculator(args) + elif args.example == "swegym": + return run_swegym(args) + elif args.example == "swebench_verified": + return run_swebench(args) + else: + print(f"Unknown example: {args.example}") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/polar/cluster/templates/__init__.py b/src/polar/cluster/templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/polar/cluster/templates/env.sh b/src/polar/cluster/templates/env.sh new file mode 100644 index 00000000..05186b5f --- /dev/null +++ b/src/polar/cluster/templates/env.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# Polar SLURM environment configuration. +# Source this file in all SLURM job scripts and helpers. +# +# Required: POLAR_WORKSPACE must be set before sourcing. +# The easiest way is via 'polar cluster launch -c cluster.yaml'. 
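+#
+# Typical manual use on a login or compute node (the path shown is
+# illustrative; it matches the POLAR_CODE default below):
+#   export POLAR_WORKSPACE=/lustre/$USER/polar_workspace
+#   source "$POLAR_WORKSPACE/polar/ProRL-Agent-Server/src/polar/cluster/templates/env.sh"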
+ +# ── Cluster paths ────────────────────────────────────────────────────────────── +if [ -z "${POLAR_WORKSPACE:-}" ]; then + echo "ERROR: POLAR_WORKSPACE must be set before sourcing env.sh" >&2 + echo " Use: polar cluster launch -c cluster.yaml" >&2 + return 1 2>/dev/null || exit 1 +fi + +export POLAR_ROOT="${POLAR_ROOT:-${POLAR_WORKSPACE}/polar}" +export POLAR_CODE="${POLAR_CODE:-${POLAR_ROOT}/ProRL-Agent-Server}" +export POLAR_SIFS="${POLAR_SIFS:-${POLAR_ROOT}/sif_images}" +export POLAR_RESULTS="${POLAR_RESULTS:-${POLAR_ROOT}/results}" +export POLAR_VENV="${POLAR_VENV:-${POLAR_ROOT}/.venv}" + +# ── Apptainer (optional — skip if system-installed) ─────────────────────────── +if [ -n "${APPTAINER_BIN_DIR:-}" ]; then + export PATH="${APPTAINER_BIN_DIR}:${PATH}" +fi +export APPTAINER_CACHEDIR="${APPTAINER_CACHEDIR:-${POLAR_ROOT}/apptainer_cache}" + +# ── CUDA (optional — for FlashInfer GDN kernel JIT compilation) ─────────────── +if [ -n "${CUDA_HOME:-}" ]; then + export PATH="${CUDA_HOME}/bin:${PATH}" + export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH:-}" +fi + +# ── Caches (redirect to shared storage to avoid home directory quota) ───────── +export HF_HOME="${HF_HOME:-${POLAR_WORKSPACE}/hf_cache}" +export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}" +export TORCH_HOME="${TORCH_HOME:-${POLAR_WORKSPACE}/torch_cache}" +export PIP_CACHE_DIR="${PIP_CACHE_DIR:-${POLAR_WORKSPACE}/pip_cache}" +export XDG_CACHE_HOME="${XDG_CACHE_HOME:-${POLAR_WORKSPACE}/xdg_cache}" +export TRITON_CACHE_DIR="${TRITON_CACHE_DIR:-${POLAR_WORKSPACE}/triton_cache}" + +# ── Python ───────────────────────────────────────────────────────────────────── +if [ -d "${POLAR_VENV}" ]; then + source "${POLAR_VENV}/bin/activate" +fi + +# Ensure polar source tree is importable even without pip install -e +export PYTHONPATH="${POLAR_CODE}/src${PYTHONPATH:+:${PYTHONPATH}}" + +# ── Service ports ────────────────────────────────────────────────────────────── +export VLLM_PORT="${VLLM_PORT:-18000}" +export ROLLOUT_PORT="${ROLLOUT_PORT:-18080}" +export GATEWAY_BASE_PORT="${GATEWAY_BASE_PORT:-18100}" + +# ── vLLM defaults ────────────────────────────────────────────────────────────── +export MODEL_NAME="${MODEL_NAME:-Qwen/Qwen3.5-27B}" +export MODEL_PATH="${MODEL_PATH:-${MODEL_NAME}}" +export TENSOR_PARALLEL_SIZE="${TENSOR_PARALLEL_SIZE:-8}" +export MAX_MODEL_LEN="${MAX_MODEL_LEN:-16384}" +export GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}" +export MAX_NUM_SEQS="${MAX_NUM_SEQS:-64}" +export TOOL_CALL_PARSER="${TOOL_CALL_PARSER:-qwen3_xml}" + +# ── Gateway defaults ────────────────────────────────────────────────────────── +export MAX_INIT_WORKERS="${MAX_INIT_WORKERS:-8}" +export MAX_RUN_WORKERS="${MAX_RUN_WORKERS:-4}" +export MAX_POSTRUN_WORKERS="${MAX_POSTRUN_WORKERS:-4}" +export READY_BUFFER_TARGET="${READY_BUFFER_TARGET:-4}" + +# ── Training defaults (used by polar_slurm_train.sbatch) ───────────────────── +export SGLANG_ROUTER_PORT="${SGLANG_ROUTER_PORT:-9000}" +export RAY_PORT="${RAY_PORT:-6379}" +export RAY_DASHBOARD_PORT="${RAY_DASHBOARD_PORT:-8265}" diff --git a/src/polar/cluster/templates/polar_slurm.sbatch b/src/polar/cluster/templates/polar_slurm.sbatch new file mode 100644 index 00000000..68f052b4 --- /dev/null +++ b/src/polar/cluster/templates/polar_slurm.sbatch @@ -0,0 +1,252 @@ +#!/bin/bash +# ──────────────────────────────────────────────────────────────────────────────── +# Polar SLURM Job Script +# +# Orchestrates: vLLM server → Rollout service → Gateway node(s) → Task submission +# +# All SBATCH 
directives are passed via sbatch CLI flags from 'polar cluster launch'. +# No hardcoded account, partition, or resource values. +# +# Required env vars (pass via --export): +# POLAR_CODE - Path to ProRL-Agent-Server on cluster +# POLAR_WORKSPACE - Base workspace path on cluster +# EXAMPLE - "calculator" or "swegym" (default: calculator) +# HARNESS - Agent harness name (default: opencode) +# +# Optional env vars: +# MODEL_NAME - Model to serve (default from env.sh) +# MODEL_PATH - Model path/name for vLLM (default: $MODEL_NAME) +# TENSOR_PARALLEL_SIZE - TP size for vLLM (default: 8) +# NUM_ROLLOUTS - Number of rollouts per task (default: 4) +# TIMEOUT_SECONDS - Per-task timeout (default: 900) +# DEFAULT_SIF_IMAGE - Override SIF path for agent containers +# TASK_FILE - Submit a specific task JSON instead of using the example +# ──────────────────────────────────────────────────────────────────────────────── +set -euo pipefail + +# SLURM copies the script to a local spool directory, so BASH_SOURCE[0] won't +# point back to the original source tree. POLAR_CODE must be passed via --export. +if [ -z "${POLAR_CODE:-}" ]; then + echo "FATAL: POLAR_CODE not set. Use 'polar cluster launch -c cluster.yaml'." >&2 + exit 1 +fi +if [ -z "${POLAR_WORKSPACE:-}" ]; then + echo "FATAL: POLAR_WORKSPACE not set. Use 'polar cluster launch -c cluster.yaml'." >&2 + exit 1 +fi + +# Source environment setup from the cluster templates +TEMPLATE_DIR="${POLAR_CODE}/src/polar/cluster/templates" +source "${TEMPLATE_DIR}/env.sh" + +# ── Defaults ─────────────────────────────────────────────────────────────────── +EXAMPLE="${EXAMPLE:-calculator}" +HARNESS="${HARNESS:-opencode}" +NUM_ROLLOUTS="${NUM_ROLLOUTS:-4}" +TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-900}" + +# ── Job directory ────────────────────────────────────────────────────────────── +JOB_DIR="${POLAR_RESULTS}/${SLURM_JOB_NAME}_${SLURM_JOB_ID}" +mkdir -p "${JOB_DIR}/logs" +echo "[polar_slurm] Job directory: ${JOB_DIR}" +echo "[polar_slurm] Job ID: ${SLURM_JOB_ID}" +echo "[polar_slurm] Nodes: ${SLURM_JOB_NODELIST}" +echo "[polar_slurm] GPUs per node: ${SLURM_GPUS_ON_NODE:-unknown}" + +# ── Discover hostnames ───────────────────────────────────────────────────────── +HOSTNAMES=($(scontrol show hostnames "${SLURM_JOB_NODELIST}")) +NODE_0="${HOSTNAMES[0]}" +NUM_NODES="${#HOSTNAMES[@]}" +echo "[polar_slurm] Allocated ${NUM_NODES} node(s): ${HOSTNAMES[*]}" + +# ── Resolve SIF image ───────────────────────────────────────────────────────── +if [ -z "${DEFAULT_SIF_IMAGE:-}" ]; then + if [ "${EXAMPLE}" = "calculator" ]; then + DEFAULT_SIF_IMAGE="${POLAR_SIFS}/calculator-${HARNESS}.sif" + else + # For swegym, each task has its own SIF — submit script handles this + DEFAULT_SIF_IMAGE="" + fi +fi + +if [ -n "${DEFAULT_SIF_IMAGE}" ] && [ ! 
-f "${DEFAULT_SIF_IMAGE}" ]; then + echo "[polar_slurm] WARNING: SIF not found: ${DEFAULT_SIF_IMAGE}" + echo "[polar_slurm] Build it first with: polar cluster build-sif --example ${EXAMPLE} --harness ${HARNESS}" +fi + +# ── Save results directory ───────────────────────────────────────────────────── +export SAVE_DIR="${JOB_DIR}/rollout_results" +mkdir -p "${SAVE_DIR}" + +# ── Generate topology.yaml ───────────────────────────────────────────────────── +TOPOLOGY_PATH="${JOB_DIR}/topology.yaml" +python -m polar.cluster.topology \ + --output "${TOPOLOGY_PATH}" \ + --save-dir "${SAVE_DIR}" \ + ${DEFAULT_SIF_IMAGE:+--default-sif "${DEFAULT_SIF_IMAGE}"} + +export POLAR_TOPOLOGY="${TOPOLOGY_PATH}" +echo "[polar_slurm] Generated topology: ${TOPOLOGY_PATH}" +cat "${TOPOLOGY_PATH}" + +# ── Kill stale processes on ports from previous runs ───────────────────────── +echo "[polar_slurm] Cleaning stale processes on ports..." +for port in "${VLLM_PORT}" "${ROLLOUT_PORT}"; do + stale_pid=$(lsof -ti ":${port}" 2>/dev/null || true) + if [ -n "${stale_pid}" ]; then + echo "[polar_slurm] Killing stale PID ${stale_pid} on port ${port}" + kill -9 ${stale_pid} 2>/dev/null || true + sleep 1 + fi +done +for ((i=0; i/dev/null || true) + if [ -n "${stale_pid}" ]; then + echo "[polar_slurm] Killing stale PID ${stale_pid} on port ${gw_port}" + kill -9 ${stale_pid} 2>/dev/null || true + sleep 1 + fi +done + +# ── PID tracking for cleanup ─────────────────────────────────────────────────── +PIDS=() +cleanup() { + echo "[polar_slurm] Cleaning up..." + for pid in "${PIDS[@]}"; do + if kill -0 "${pid}" 2>/dev/null; then + echo "[polar_slurm] Killing PID ${pid}" + kill "${pid}" 2>/dev/null || true + fi + done + wait 2>/dev/null || true + echo "[polar_slurm] Cleanup complete." +} +trap cleanup EXIT INT TERM + +# ── Source readiness poller ──────────────────────────────────────────────────── +source "${TEMPLATE_DIR}/wait_for_service.sh" + +# ── Launch vLLM ──────────────────────────────────────────────────────────────── +echo "[polar_slurm] Starting vLLM server on ${NODE_0}:${VLLM_PORT}..." +echo "[polar_slurm] Model: ${MODEL_PATH}" +echo "[polar_slurm] TP size: ${TENSOR_PARALLEL_SIZE}" + +python -m vllm.entrypoints.openai.api_server \ + --model "${MODEL_PATH}" \ + --port "${VLLM_PORT}" \ + --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \ + --max-model-len "${MAX_MODEL_LEN}" \ + --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION:-0.90}" \ + --max-num-seqs "${MAX_NUM_SEQS:-64}" \ + --enable-auto-tool-choice \ + --tool-call-parser "${TOOL_CALL_PARSER:-hermes}" \ + --gdn-prefill-backend triton \ + --trust-remote-code \ + --host 0.0.0.0 \ + > "${JOB_DIR}/logs/vllm.log" 2>&1 & +PIDS+=($!) +echo "[polar_slurm] vLLM PID: ${PIDS[-1]}" + +wait_for_service "http://${NODE_0}:${VLLM_PORT}/health" "vLLM" 240 5 || exit 1 + +# ── Launch Rollout Service ───────────────────────────────────────────────────── +echo "[polar_slurm] Starting rollout service on ${NODE_0}:${ROLLOUT_PORT}..." + +polar serve_rollout -c "${TOPOLOGY_PATH}" \ + > "${JOB_DIR}/logs/rollout.log" 2>&1 & +PIDS+=($!) +echo "[polar_slurm] Rollout PID: ${PIDS[-1]}" + +wait_for_service "http://${NODE_0}:${ROLLOUT_PORT}/health" "Rollout" 30 2 || exit 1 + +# ── Launch Gateway Node(s) ───────────────────────────────────────────────────── +for ((i=0; i "${JOB_DIR}/logs/gateway_${NODE_ID}.log" 2>&1 & + PIDS+=($!) 
+ else + # Gateway on remote node via srun + srun --overlap --nodes=1 --ntasks=1 --nodelist="${GW_HOST}" \ + --output="${JOB_DIR}/logs/gateway_${NODE_ID}.log" \ + --error="${JOB_DIR}/logs/gateway_${NODE_ID}.err" \ + bash -c " + source '${TEMPLATE_DIR}/env.sh' + export POLAR_TOPOLOGY='${TOPOLOGY_PATH}' + export POLAR_GATEWAY_NODE_ID='${NODE_ID}' + python -m polar.gateway.server + " & + PIDS+=($!) + fi + echo "[polar_slurm] Gateway ${NODE_ID} PID: ${PIDS[-1]}" +done + +# Wait for all gateways +for ((i=0; i "${JOB_DIR}/logs/status.json" 2>&1 || true +polar status -c "${TOPOLOGY_PATH}" || true + +# ── Submit Tasks ─────────────────────────────────────────────────────────────── +echo "" +echo "[polar_slurm] ════════════════════════════════════════════════════════" +echo "[polar_slurm] Submitting tasks: EXAMPLE=${EXAMPLE}, HARNESS=${HARNESS}" +echo "[polar_slurm] ════════════════════════════════════════════════════════" + +if [ -n "${TASK_FILE:-}" ]; then + # Direct task file submission + echo "[polar_slurm] Submitting task file: ${TASK_FILE}" + polar submit "${TASK_FILE}" \ + -c "${TOPOLOGY_PATH}" \ + --json | tee "${JOB_DIR}/response.json" +else + # Use the cluster task builder module + INSTANCE_ID_ARGS="" + if [ -n "${INSTANCE_IDS:-}" ]; then + IFS=',' read -ra _IDS <<< "${INSTANCE_IDS}" + for _id in "${_IDS[@]}"; do + INSTANCE_ID_ARGS="${INSTANCE_ID_ARGS} --instance-id ${_id}" + done + fi + python -m polar.cluster.tasks \ + --example "${EXAMPLE}" \ + --harness "${HARNESS}" \ + --topology "${TOPOLOGY_PATH}" \ + --sif-dir "${POLAR_SIFS}" \ + --output-dir "${JOB_DIR}/tasks" \ + --num-rollouts "${NUM_ROLLOUTS}" \ + --timeout-seconds "${TIMEOUT_SECONDS}" \ + ${INSTANCE_ID_ARGS} +fi + +SUBMIT_EXIT=$? + +# ── Results ──────────────────────────────────────────────────────────────────── +echo "" +echo "[polar_slurm] ════════════════════════════════════════════════════════" +echo "[polar_slurm] Task submission finished (exit code: ${SUBMIT_EXIT})" +echo "[polar_slurm] Results directory: ${JOB_DIR}" +echo "[polar_slurm] Rollout results: ${SAVE_DIR}" +echo "[polar_slurm] ════════════════════════════════════════════════════════" + +# Final status check +polar status -c "${TOPOLOGY_PATH}" || true + +echo "[polar_slurm] Job complete." +exit ${SUBMIT_EXIT} diff --git a/src/polar/cluster/templates/polar_slurm_serve.sbatch b/src/polar/cluster/templates/polar_slurm_serve.sbatch new file mode 100644 index 00000000..d16520db --- /dev/null +++ b/src/polar/cluster/templates/polar_slurm_serve.sbatch @@ -0,0 +1,202 @@ +#!/bin/bash +# ──────────────────────────────────────────────────────────────────────────────── +# Polar SLURM Serve-Only Job Script +# +# Starts: vLLM server → Rollout service → Gateway node(s), then waits. +# Tasks are submitted separately via 'polar cluster submit-task'. +# +# All SBATCH directives are passed via sbatch CLI flags from 'polar cluster serve'. +# No hardcoded account, partition, or resource values. +# +# Required env vars (pass via --export): +# POLAR_CODE - Path to ProRL-Agent-Server on cluster +# POLAR_WORKSPACE - Base workspace path on cluster +# +# Optional env vars: +# MODEL_NAME - Model to serve (default from env.sh) +# MODEL_PATH - Model path/name for vLLM (default: $MODEL_NAME) +# TENSOR_PARALLEL_SIZE - TP size for vLLM (default: 8) +# ──────────────────────────────────────────────────────────────────────────────── +set -euo pipefail + +if [ -z "${POLAR_CODE:-}" ]; then + echo "FATAL: POLAR_CODE not set. Use 'polar cluster serve -c cluster.yaml'." 
>&2 + exit 1 +fi +if [ -z "${POLAR_WORKSPACE:-}" ]; then + echo "FATAL: POLAR_WORKSPACE not set. Use 'polar cluster serve -c cluster.yaml'." >&2 + exit 1 +fi + +# Source environment setup from the cluster templates +TEMPLATE_DIR="${POLAR_CODE}/src/polar/cluster/templates" +source "${TEMPLATE_DIR}/env.sh" + +# ── Job directory ────────────────────────────────────────────────────────────── +JOB_DIR="${POLAR_RESULTS}/${SLURM_JOB_NAME}_${SLURM_JOB_ID}" +mkdir -p "${JOB_DIR}/logs" +echo "[polar_serve] Job directory: ${JOB_DIR}" +echo "[polar_serve] Job ID: ${SLURM_JOB_ID}" +echo "[polar_serve] Nodes: ${SLURM_JOB_NODELIST}" +echo "[polar_serve] GPUs per node: ${SLURM_GPUS_ON_NODE:-unknown}" + +# ── Discover hostnames ───────────────────────────────────────────────────────── +HOSTNAMES=($(scontrol show hostnames "${SLURM_JOB_NODELIST}")) +NODE_0="${HOSTNAMES[0]}" +NUM_NODES="${#HOSTNAMES[@]}" +echo "[polar_serve] Allocated ${NUM_NODES} node(s): ${HOSTNAMES[*]}" + +# ── Save results directory ───────────────────────────────────────────────────── +export SAVE_DIR="${JOB_DIR}/rollout_results" +mkdir -p "${SAVE_DIR}" + +# ── Generate topology.yaml ───────────────────────────────────────────────────── +TOPOLOGY_PATH="${JOB_DIR}/topology.yaml" +python -m polar.cluster.topology \ + --output "${TOPOLOGY_PATH}" \ + --save-dir "${SAVE_DIR}" + +export POLAR_TOPOLOGY="${TOPOLOGY_PATH}" +echo "[polar_serve] Generated topology: ${TOPOLOGY_PATH}" +cat "${TOPOLOGY_PATH}" + +# ── Kill stale processes on ports from previous runs ───────────────────────── +echo "[polar_serve] Cleaning stale processes on ports..." +for port in "${VLLM_PORT}" "${ROLLOUT_PORT}"; do + stale_pid=$(lsof -ti ":${port}" 2>/dev/null || true) + if [ -n "${stale_pid}" ]; then + echo "[polar_serve] Killing stale PID ${stale_pid} on port ${port}" + kill -9 ${stale_pid} 2>/dev/null || true + sleep 1 + fi +done +for ((i=0; i/dev/null || true) + if [ -n "${stale_pid}" ]; then + echo "[polar_serve] Killing stale PID ${stale_pid} on port ${gw_port}" + kill -9 ${stale_pid} 2>/dev/null || true + sleep 1 + fi +done + +# ── PID tracking for cleanup ─────────────────────────────────────────────────── +PIDS=() +cleanup() { + echo "[polar_serve] Cleaning up..." + for pid in "${PIDS[@]}"; do + if kill -0 "${pid}" 2>/dev/null; then + echo "[polar_serve] Killing PID ${pid}" + kill "${pid}" 2>/dev/null || true + fi + done + wait 2>/dev/null || true + echo "[polar_serve] Cleanup complete." +} +trap cleanup EXIT INT TERM + +# ── Source readiness poller ──────────────────────────────────────────────────── +source "${TEMPLATE_DIR}/wait_for_service.sh" + +# ── Launch vLLM ──────────────────────────────────────────────────────────────── +echo "[polar_serve] Starting vLLM server on ${NODE_0}:${VLLM_PORT}..." +echo "[polar_serve] Model: ${MODEL_PATH}" +echo "[polar_serve] TP size: ${TENSOR_PARALLEL_SIZE}" + +python -m vllm.entrypoints.openai.api_server \ + --model "${MODEL_PATH}" \ + --port "${VLLM_PORT}" \ + --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \ + --max-model-len "${MAX_MODEL_LEN}" \ + --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION:-0.90}" \ + --max-num-seqs "${MAX_NUM_SEQS:-64}" \ + --enable-auto-tool-choice \ + --tool-call-parser "${TOOL_CALL_PARSER:-hermes}" \ + --gdn-prefill-backend triton \ + --trust-remote-code \ + --host 0.0.0.0 \ + > "${JOB_DIR}/logs/vllm.log" 2>&1 & +PIDS+=($!) 
+echo "[polar_serve] vLLM PID: ${PIDS[-1]}" + +wait_for_service "http://${NODE_0}:${VLLM_PORT}/health" "vLLM" 240 5 || exit 1 + +# ── Launch Rollout Service ───────────────────────────────────────────────────── +echo "[polar_serve] Starting rollout service on ${NODE_0}:${ROLLOUT_PORT}..." + +polar serve_rollout -c "${TOPOLOGY_PATH}" \ + > "${JOB_DIR}/logs/rollout.log" 2>&1 & +PIDS+=($!) +echo "[polar_serve] Rollout PID: ${PIDS[-1]}" + +wait_for_service "http://${NODE_0}:${ROLLOUT_PORT}/health" "Rollout" 30 2 || exit 1 + +# ── Launch Gateway Node(s) ───────────────────────────────────────────────────── +for ((i=0; i "${JOB_DIR}/logs/gateway_${NODE_ID}.log" 2>&1 & + PIDS+=($!) + else + srun --overlap --nodes=1 --ntasks=1 --nodelist="${GW_HOST}" \ + --output="${JOB_DIR}/logs/gateway_${NODE_ID}.log" \ + --error="${JOB_DIR}/logs/gateway_${NODE_ID}.err" \ + bash -c " + source '${TEMPLATE_DIR}/env.sh' + export POLAR_TOPOLOGY='${TOPOLOGY_PATH}' + export POLAR_GATEWAY_NODE_ID='${NODE_ID}' + python -m polar.gateway.server + " & + PIDS+=($!) + fi + echo "[polar_serve] Gateway ${NODE_ID} PID: ${PIDS[-1]}" +done + +# Wait for all gateways +for ((i=0; i "${JOB_DIR}/logs/status.json" 2>&1 || true +polar status -c "${TOPOLOGY_PATH}" || true + +# Write sentinel file so 'polar cluster serve' knows services are ready +cat > "${JOB_DIR}/.services_ready" < --job-id ${SLURM_JOB_ID} --example calculator --harness opencode" +echo "[polar_serve] ════════════════════════════════════════════════════════" + +# Wait indefinitely — services stay alive until SLURM time limit or scancel +while true; do + sleep 60 + # Periodic health check + if ! kill -0 "${PIDS[0]}" 2>/dev/null; then + echo "[polar_serve] vLLM process died. Exiting." + exit 1 + fi +done diff --git a/src/polar/cluster/templates/polar_slurm_train.sbatch b/src/polar/cluster/templates/polar_slurm_train.sbatch new file mode 100644 index 00000000..68f64c96 --- /dev/null +++ b/src/polar/cluster/templates/polar_slurm_train.sbatch @@ -0,0 +1,360 @@ +#!/bin/bash +# ──────────────────────────────────────────────────────────────────────────────── +# Polar SLURM Training Job Script +# +# Orchestrates distributed RL training: Polar services (rollout + gateway) for +# agent session management, and Slime/Ray/SGLang/Megatron for GRPO training. +# +# GPU layout (single trainer node, 8 GPUs): +# GPU 0..ROLLOUT_GPUS-1 – SGLang inference (Slime/Ray-managed, weight-synced) +# GPU ROLLOUT_GPUS..7 – Megatron GRPO training (TP=TP_SIZE, DP=auto) +# +# Polar services (CPU-only) run on the host using POLAR_VENV. +# Training commands run inside the training SIF via apptainer exec. +# +# Required env vars (pass via --export): +# POLAR_CODE – Path to ProRL-Agent-Server on cluster +# POLAR_WORKSPACE – Base workspace path on cluster +# POLAR_CONFIG_PATH – Path to polar_config.yaml (bridge config) +# PROMPT_DATA – Path to JSONL training data +# HF_CHECKPOINT – HuggingFace model checkpoint name +# TRAIN_SIF – Path to training SIF image +# ──────────────────────────────────────────────────────────────────────────────── +set -euo pipefail + +if [ -z "${POLAR_CODE:-}" ]; then + echo "FATAL: POLAR_CODE not set." >&2; exit 1 +fi +if [ -z "${POLAR_WORKSPACE:-}" ]; then + echo "FATAL: POLAR_WORKSPACE not set." 
>&2; exit 1 +fi + +# Source environment setup +TEMPLATE_DIR="${POLAR_CODE}/src/polar/cluster/templates" +source "${TEMPLATE_DIR}/env.sh" + +# ── Defaults ────────────────────────────────────────────────────────────────── +ACTOR_GPUS="${ACTOR_GPUS:-4}" +ROLLOUT_GPUS="${ROLLOUT_GPUS:-4}" +TP_SIZE="${TP_SIZE:-2}" +SGLANG_ROUTER_PORT="${SGLANG_ROUTER_PORT:-9000}" +RAY_PORT="${RAY_PORT:-6379}" +RAY_DASHBOARD_PORT="${RAY_DASHBOARD_PORT:-8265}" +TRAIN_NUM_ROLLOUTS="${TRAIN_NUM_ROLLOUTS:-5}" +ROLLOUT_BATCH_SIZE="${ROLLOUT_BATCH_SIZE:-2}" +N_SAMPLES_PER_PROMPT="${N_SAMPLES_PER_PROMPT:-16}" +GLOBAL_BATCH_SIZE="${GLOBAL_BATCH_SIZE:-32}" +TOTAL_GPUS=$((ACTOR_GPUS + ROLLOUT_GPUS)) + +# Training SIF — must be pre-built via 'polar cluster build-sif --example train' +TRAIN_SIF="${TRAIN_SIF:-${POLAR_SIFS}/train-slime-grpo.sif}" +if [ ! -f "${TRAIN_SIF}" ]; then + echo "FATAL: Training SIF not found: ${TRAIN_SIF}" >&2 + echo " Build with: polar cluster build-sif -c cluster.yaml --example train" >&2 + exit 1 +fi + +# HF cache on lustre (shared, persistent across jobs) +HF_HOME="${POLAR_WORKSPACE}/hf_cache" +mkdir -p "${HF_HOME}" + +# Apptainer exec prefix for GPU commands. +# PYTHONNOUSERSITE prevents user-installed packages (e.g. torch in ~/.local) +# from shadowing the NGC container's system packages. +# LD_LIBRARY_PATH includes CUDA compat for forward-compatibility with older drivers. +# HF_HOME on lustre so the container can access downloaded models. +APPTAINER_EXEC="apptainer exec --nv --writable-tmpfs --no-home \ + --bind ${POLAR_WORKSPACE}:${POLAR_WORKSPACE} \ + --env PYTHONNOUSERSITE=1 \ + --env LD_LIBRARY_PATH=/usr/local/cuda/compat:\${LD_LIBRARY_PATH:-} \ + --env HF_HOME=${HF_HOME} \ + --env HF_HUB_OFFLINE=0 \ + ${TRAIN_SIF}" + +# ── Job directory ───────────────────────────────────────────────────────────── +JOB_DIR="${POLAR_RESULTS}/${SLURM_JOB_NAME}_${SLURM_JOB_ID}" +mkdir -p "${JOB_DIR}/logs" +echo "[polar_train] Job directory: ${JOB_DIR}" +echo "[polar_train] Job ID: ${SLURM_JOB_ID}" +echo "[polar_train] Nodes: ${SLURM_JOB_NODELIST}" +echo "[polar_train] Training SIF: ${TRAIN_SIF}" + +# ── Discover hostnames ──────────────────────────────────────────────────────── +HOSTNAMES=($(scontrol show hostnames "${SLURM_JOB_NODELIST}")) +TRAINER_NODE="${HOSTNAMES[0]}" +NUM_NODES="${#HOSTNAMES[@]}" +echo "[polar_train] Trainer node: ${TRAINER_NODE}" +echo "[polar_train] Allocated ${NUM_NODES} node(s): ${HOSTNAMES[*]}" + +# ── Derived paths ───────────────────────────────────────────────────────────── +HF_CKPT_BASENAME="${HF_CHECKPOINT##*/}" +TORCH_DIST_DIR="${TORCH_DIST_DIR:-${POLAR_WORKSPACE}/checkpoints/${HF_CKPT_BASENAME}_torch_dist}" +SAVE_DIR="${TRAIN_SAVE_DIR:-${JOB_DIR}/checkpoints}" +mkdir -p "${SAVE_DIR}" +export SAVE_DIR + +ROLLOUT_SAVE_DIR="${JOB_DIR}/rollout_results" +mkdir -p "${ROLLOUT_SAVE_DIR}" + +# ── Generate topology.yaml ──────────────────────────────────────────────────── +# Points gateway at SGLang router (on trainer node), NOT at vLLM +TOPOLOGY_PATH="${JOB_DIR}/topology.yaml" +SGLANG_URL="http://${TRAINER_NODE}:${SGLANG_ROUTER_PORT}" + +python -m polar.cluster.topology \ + --output "${TOPOLOGY_PATH}" \ + --save-dir "${ROLLOUT_SAVE_DIR}" \ + --sglang-base-url "${SGLANG_URL}" + +export POLAR_TOPOLOGY="${TOPOLOGY_PATH}" +echo "[polar_train] Generated topology: ${TOPOLOGY_PATH}" +cat "${TOPOLOGY_PATH}" + +# ── Patch polar_config.yaml with actual rollout URL ─────────────────────────── +PATCHED_POLAR_CONFIG="${JOB_DIR}/polar_config.yaml" 
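+# The sed below rewrites the bridge config's rollout endpoint, e.g.
+#   polar_rollout_url: "http://localhost:18080"
+# becomes (host/port illustrative; the real values come from the job env)
+#   polar_rollout_url: "http://<trainer-node>:18080"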
+POLAR_ROLLOUT_URL="http://${TRAINER_NODE}:${ROLLOUT_PORT}" +sed "s|polar_rollout_url:.*|polar_rollout_url: \"${POLAR_ROLLOUT_URL}\"|" \ + "${POLAR_CONFIG_PATH}" > "${PATCHED_POLAR_CONFIG}" +echo "[polar_train] Patched polar_config.yaml: polar_rollout_url → ${POLAR_ROLLOUT_URL}" + +# ── PID tracking for cleanup ────────────────────────────────────────────────── +PIDS=() +cleanup() { + echo "[polar_train] Cleaning up..." + for pid in "${PIDS[@]}"; do + if kill -0 "${pid}" 2>/dev/null; then + echo "[polar_train] Killing PID ${pid}" + kill "${pid}" 2>/dev/null || true + fi + done + wait 2>/dev/null || true + echo "[polar_train] Cleanup complete." +} +trap cleanup EXIT INT TERM + +source "${TEMPLATE_DIR}/wait_for_service.sh" + +# ── Step 1: Start Polar services (CPU, on host using POLAR_VENV) ────────────── +echo "[polar_train] Starting Polar rollout server on ${TRAINER_NODE}:${ROLLOUT_PORT}..." +polar serve_rollout -c "${TOPOLOGY_PATH}" \ + > "${JOB_DIR}/logs/rollout.log" 2>&1 & +PIDS+=($!) + +wait_for_service "http://${TRAINER_NODE}:${ROLLOUT_PORT}/health" "Rollout" 30 2 || exit 1 + +# Start gateway(s) on all nodes +for ((i=0; i "${JOB_DIR}/logs/gateway_${NODE_ID}.log" 2>&1 & + PIDS+=($!) + else + srun --overlap --nodes=1 --ntasks=1 --nodelist="${GW_HOST}" \ + --output="${JOB_DIR}/logs/gateway_${NODE_ID}.log" \ + --error="${JOB_DIR}/logs/gateway_${NODE_ID}.err" \ + bash -c " + source '${TEMPLATE_DIR}/env.sh' + export POLAR_TOPOLOGY='${TOPOLOGY_PATH}' + export POLAR_GATEWAY_NODE_ID='${NODE_ID}' + python -m polar.gateway.server + " & + PIDS+=($!) + fi +done + +for ((i=0; i ${HF_LOCAL_DIR}" + HF_HUB_OFFLINE=0 HF_HOME="${HF_HOME}" python -c " +from huggingface_hub import snapshot_download +snapshot_download('${HF_CHECKPOINT}', local_dir='${HF_LOCAL_DIR}') +" + echo "[polar_train] Download complete." + else + echo "[polar_train] HF model already cached at ${HF_LOCAL_DIR}" + fi + HF_CHECKPOINT="${HF_LOCAL_DIR}" +fi + +if [ ! -d "${TORCH_DIST_DIR}/release" ]; then + echo "[polar_train] Converting HF weights: ${HF_CHECKPOINT} -> ${TORCH_DIST_DIR}" + mkdir -p "${TORCH_DIST_DIR}" + ${APPTAINER_EXEC} bash -c " + export CUDA_DEVICE_MAX_CONNECTIONS=1 + export PYTHONPATH='/opt/polar/src:/root/Megatron-LM:\${PYTHONPATH:-}' + torchrun --nproc_per_node 1 \ + /root/slime/tools/convert_hf_to_torch_dist.py \ + ${MODEL_ARGS} \ + --hf-checkpoint '${HF_CHECKPOINT}' \ + --save '${TORCH_DIST_DIR}' \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --context-parallel-size 1 \ + --expert-model-parallel-size 1 \ + --expert-tensor-parallel-size 1 + " > "${JOB_DIR}/logs/convert_weights.log" 2>&1 + echo "[polar_train] Weight conversion complete." +else + echo "[polar_train] Weights already converted at ${TORCH_DIST_DIR}" +fi + +# ── Step 3+4: Start Ray and launch training (single container session) ──────── +# Ray start daemonizes processes in the container's PID/network namespace. +# ray job submit must run in the SAME apptainer exec invocation so it can +# reach the Ray dashboard via localhost. Separate invocations create isolated +# namespaces, causing DNS/socket errors (OSError 107, Transport endpoint). +echo "[polar_train] Starting Ray + training (single container session)..." 
+echo "[polar_train] Steps: ${TRAIN_NUM_ROLLOUTS}" +echo "[polar_train] Batch: ${ROLLOUT_BATCH_SIZE} prompts x ${N_SAMPLES_PER_PROMPT} samples" +echo "[polar_train] Global batch size: ${GLOBAL_BATCH_SIZE}" +echo "[polar_train] Actor GPUs: ${ACTOR_GPUS} (TP=${TP_SIZE}), Rollout GPUs: ${ROLLOUT_GPUS}" + +RUNTIME_ENV_JSON="{ + \"env_vars\": { + \"PYTHONPATH\": \"/opt/polar/src:/root/Megatron-LM\", + \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\" + } +}" + +${APPTAINER_EXEC} bash -c " + set -euo pipefail + + # Verify critical training dependencies are importable + echo '[polar_train] Verifying runtime dependencies...' + python3 -c 'import sglang; import slime; import megatron; import mbridge; print(\"All training deps OK\")' || { + echo '[polar_train] ERROR: Missing training dependencies in SIF. Rebuild with: polar cluster build-sif --example train --force' >&2 + exit 1 + } + + # Clean up any stale Ray processes + ray stop --force 2>/dev/null || true + sleep 2 + + # Start Ray head node (daemonizes) + ray start --head \ + --node-ip-address '${TRAINER_NODE}' \ + --port '${RAY_PORT}' \ + --dashboard-host 0.0.0.0 \ + --dashboard-port '${RAY_DASHBOARD_PORT}' \ + --num-gpus '${TOTAL_GPUS}' \ + --disable-usage-stats + + echo '[polar_train] Ray cluster started. Dashboard: http://${TRAINER_NODE}:${RAY_DASHBOARD_PORT}' + + # Wait for Ray dashboard to be ready + for i in \$(seq 1 30); do + if curl -sf http://127.0.0.1:${RAY_DASHBOARD_PORT}/api/version >/dev/null 2>&1; then + echo '[polar_train] Ray dashboard is ready.' + break + fi + echo '[polar_train] Waiting for Ray dashboard... ('\$i'/30)' + sleep 2 + done + + # Submit training job (uses localhost since we're in the same namespace) + export PYTHONPATH='/opt/polar/src:/root/Megatron-LM:\${PYTHONPATH:-}' + export CUDA_DEVICE_MAX_CONNECTIONS=1 + + ray job submit \ + --address='http://127.0.0.1:${RAY_DASHBOARD_PORT}' \ + --runtime-env-json='${RUNTIME_ENV_JSON}' \ + -- python3 /root/slime/train_async.py \ + --actor-num-nodes 1 \ + --actor-num-gpus-per-node '${ACTOR_GPUS}' \ + --rollout-num-gpus '${ROLLOUT_GPUS}' \ + --rollout-num-gpus-per-engine 1 \ + ${MODEL_ARGS} \ + --hf-checkpoint '${HF_CHECKPOINT}' \ + --ref-load '${TORCH_DIST_DIR}' \ + --load '${SAVE_DIR}' \ + --save '${SAVE_DIR}' \ + --save-interval 5 \ + --update-weights-interval 1 \ + --rollout-function-path polar.slime.rollout.generate_rollout_polar_async \ + --custom-rm-path polar.slime.reward.reward_func \ + --custom-config-path '${PATCHED_POLAR_CONFIG}' \ + --prompt-data '${PROMPT_DATA}' \ + --input-key prompt \ + --label-key label \ + --metadata-key metadata \ + --rollout-shuffle \ + --reward-key score \ + --num-rollout '${TRAIN_NUM_ROLLOUTS}' \ + --rollout-batch-size '${ROLLOUT_BATCH_SIZE}' \ + --n-samples-per-prompt '${N_SAMPLES_PER_PROMPT}' \ + --rollout-max-response-len 8192 \ + --rollout-max-prompt-len 4096 \ + --global-batch-size '${GLOBAL_BATCH_SIZE}' \ + --rollout-global-dataset \ + --disable-rollout-trim-samples \ + --tensor-model-parallel-size '${TP_SIZE}' \ + --sequence-parallel \ + --pipeline-model-parallel-size 1 \ + --context-parallel-size 1 \ + --expert-model-parallel-size 1 \ + --expert-tensor-parallel-size 1 \ + --recompute-granularity full \ + --recompute-method uniform \ + --recompute-num-layers 1 \ + --use-dynamic-batch-size \ + --max-tokens-per-gpu 8192 \ + --advantage-estimator '${ADVANTAGE_ESTIMATOR:-grpo}' \ + --use-rollout-logprobs \ + --use-kl-loss \ + --kl-loss-coef 0.001 \ + --kl-loss-type low_var_kl \ + --entropy-coef 0.0 \ + --eps-clip 0.2 \ + --eps-clip-high 0.28 \ + 
--optimizer adam \ + --lr 1e-6 \ + --lr-decay-style constant \ + --weight-decay 0.1 \ + --adam-beta1 0.9 \ + --adam-beta2 0.98 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --accumulate-allreduce-grads-in-fp32 \ + --attention-softmax-in-fp32 \ + --attention-backend flash \ + --no-gradient-accumulation-fusion \ + ${WANDB_PROJECT:+--use-wandb --wandb-project '${WANDB_PROJECT}'} \ + ${WANDB_EXP_NAME:+--wandb-exp-name '${WANDB_EXP_NAME}'} \ + ${WANDB_GROUP:+--wandb-group '${WANDB_GROUP}'} \ + --sglang-router-port '${SGLANG_ROUTER_PORT}' \ + --sglang-disable-cuda-graph \ + ${EXTRA_TRAIN_ARGS:-} +" 2>&1 | tee "${JOB_DIR}/logs/training.log" + +TRAIN_EXIT=${PIPESTATUS[0]} + +# ── Results ─────────────────────────────────────────────────────────────────── +echo "" +echo "[polar_train] ════════════════════════════════════════════════════════" +echo "[polar_train] Training finished (exit code: ${TRAIN_EXIT})" +echo "[polar_train] Checkpoints: ${SAVE_DIR}" +echo "[polar_train] Rollout results: ${ROLLOUT_SAVE_DIR}" +echo "[polar_train] Logs: ${JOB_DIR}/logs/" +echo "[polar_train] ════════════════════════════════════════════════════════" + +exit ${TRAIN_EXIT} diff --git a/src/polar/cluster/templates/wait_for_service.sh b/src/polar/cluster/templates/wait_for_service.sh new file mode 100644 index 00000000..6e14cbd6 --- /dev/null +++ b/src/polar/cluster/templates/wait_for_service.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# wait_for_service.sh — Poll an HTTP endpoint until it responds 2xx. +# +# Usage: +# source wait_for_service.sh +# wait_for_service "http://host:8000/health" "vLLM" 120 5 + +wait_for_service() { + local url="$1" + local name="${2:-service}" + local max_attempts="${3:-60}" + local interval="${4:-5}" + + echo "[wait] Waiting for ${name} at ${url} (max ${max_attempts} attempts, ${interval}s interval)..." + for ((i=1; i<=max_attempts; i++)); do + if curl -sf --max-time 5 "${url}" > /dev/null 2>&1; then + echo "[wait] ${name} is ready at ${url} (attempt ${i}/${max_attempts})" + return 0 + fi + if (( i % 10 == 0 )); then + echo "[wait] Still waiting for ${name}... (${i}/${max_attempts})" + fi + sleep "${interval}" + done + echo "[wait] FATAL: ${name} did not become ready at ${url} after ${max_attempts} attempts" + return 1 +} + +# Allow direct invocation +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + wait_for_service "$@" +fi diff --git a/src/polar/cluster/topology.py b/src/polar/cluster/topology.py new file mode 100644 index 00000000..602a2f50 --- /dev/null +++ b/src/polar/cluster/topology.py @@ -0,0 +1,195 @@ +"""Generate topology.yaml for a Polar SLURM job. + +Called inside the SLURM job after hostname discovery. Reads +``SLURM_JOB_NODELIST`` and environment variables to produce a complete +``topology.yaml`` consumed by ``polar serve_rollout`` / ``polar serve_gateway``. 
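+
+The output has two top-level sections, ``rollout`` and ``gateway`` (hostnames
+and ports below are illustrative)::
+
+    rollout:
+      public_url: http://<node-0>:<rollout-port>
+      save_dir: ./rollout_results
+    gateway:
+      rollout_server_url: http://<node-0>:<rollout-port>
+      nodes:
+        - id: node-00
+          public_url: http://<node-0>:<gateway-port>
+          vllm:
+            base_url: http://<node-0>:<vllm-port>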
+ +Usage (from sbatch script):: + + python -m polar.cluster.topology --output /path/to/topology.yaml +""" + +from __future__ import annotations + +import argparse +import os +import socket +import subprocess +import sys +from pathlib import Path + +import yaml + + +def discover_hostnames() -> list[str]: + """Return the SLURM-allocated hostnames, or ``[localhost]`` for local testing.""" + nodelist = os.environ.get("SLURM_JOB_NODELIST") + if not nodelist: + hostname = socket.gethostname() + print(f"[topology] No SLURM_JOB_NODELIST; using local hostname: {hostname}") + return [hostname] + + result = subprocess.run( + ["scontrol", "show", "hostnames", nodelist], + capture_output=True, + text=True, + check=True, + ) + hostnames = [h.strip() for h in result.stdout.strip().split("\n") if h.strip()] + if not hostnames: + raise RuntimeError(f"scontrol returned no hostnames for {nodelist}") + print(f"[topology] Discovered {len(hostnames)} node(s): {hostnames}") + return hostnames + + +def build_topology( + hostnames: list[str], + *, + vllm_port: int | None = None, + sglang_base_url: str | None = None, + rollout_port: int | None = None, + gateway_base_port: int | None = None, + model_name: str | None = None, + default_sif: str | None = None, + max_init_workers: int | None = None, + max_run_workers: int | None = None, + max_postrun_workers: int | None = None, + ready_buffer_target: int | None = None, + save_dir: str | None = None, + vllm_timeout: int | None = None, +) -> dict: + """Build the topology dict from discovered hostnames and configuration. + + Each parameter falls back to the corresponding environment variable and + then to a hardcoded default — matching the contract of the old shell-based + ``generate_topology.py``. + """ + _vllm_port = vllm_port or int(os.environ.get("VLLM_PORT", "8000")) + _rollout_port = rollout_port or int(os.environ.get("ROLLOUT_PORT", "8080")) + _gw_base = gateway_base_port or int(os.environ.get("GATEWAY_BASE_PORT", "8100")) + _model = model_name or os.environ.get("MODEL_NAME", "Qwen/Qwen3.5-27B") + _sif = default_sif or os.environ.get("DEFAULT_SIF_IMAGE", "") + _init = max_init_workers or int(os.environ.get("MAX_INIT_WORKERS", "8")) + _run = max_run_workers or int(os.environ.get("MAX_RUN_WORKERS", "4")) + _post = max_postrun_workers or int(os.environ.get("MAX_POSTRUN_WORKERS", "4")) + _buf = ready_buffer_target or int(os.environ.get("READY_BUFFER_TARGET", "4")) + _save = save_dir or os.environ.get("SAVE_DIR", "./rollout_results") + _timeout = vllm_timeout or int(os.environ.get("VLLM_TIMEOUT", "300")) + + vllm_host = hostnames[0] + rollout_host = hostnames[0] + + gateway_nodes = [] + for i, hostname in enumerate(hostnames): + port = _gw_base + i + node: dict = { + "id": f"node-{i:02d}", + "host": "0.0.0.0", + "port": port, + "public_url": f"http://{hostname}:{port}", + "max_init_workers": _init, + "max_run_workers": _run, + "max_postrun_workers": _post, + "ready_buffer_target": _buf, + "model_served": _model, + } + if sglang_base_url: + node["sglang"] = { + "base_url": sglang_base_url, + "timeout": _timeout, + } + else: + node["vllm"] = { + "base_url": f"http://{vllm_host}:{_vllm_port}", + "timeout": _timeout, + } + if _sif: + runtime_cfg: dict = { + "backend": "apptainer", + "image": _sif, + "network": "host", + } + # swe_agent's swerex needs chown inside the container, which + # requires fakeroot in Apptainer. 
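+            # Opt in by exporting RUNTIME_FAKEROOT=1 (or "true"/"yes") in the
+            # job environment; it stays off by default.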
+ if os.environ.get("RUNTIME_FAKEROOT", "").lower() in ("1", "true", "yes"): + runtime_cfg["kwargs"] = {"fakeroot": True} + node["default_runtime"] = runtime_cfg + gateway_nodes.append(node) + + return { + "rollout": { + "host": "0.0.0.0", + "port": _rollout_port, + "public_url": f"http://{rollout_host}:{_rollout_port}", + "save_dir": _save, + "dispatch_poll_interval_seconds": 1.0, + "callback_grace_seconds": 10.0, + }, + "gateway": { + "heartbeat_interval_seconds": 15, + "rollout_server_url": f"http://{rollout_host}:{_rollout_port}", + "nodes": gateway_nodes, + }, + } + + +def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate Polar topology.yaml for SLURM jobs", + ) + parser.add_argument( + "-o", "--output", + default=None, + help="Output path for topology.yaml (default: stdout)", + ) + parser.add_argument( + "--default-sif", + default=None, + help="Default SIF image path for agent containers", + ) + parser.add_argument( + "--save-dir", + default=None, + help="Rollout results save directory", + ) + parser.add_argument( + "--sglang-base-url", + default=None, + help="SGLang router URL (use sglang instead of vllm backend)", + ) + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = _parse_args(argv) + hostnames = discover_hostnames() + topology = build_topology( + hostnames, + default_sif=args.default_sif or None, + save_dir=args.save_dir or None, + sglang_base_url=args.sglang_base_url or None, + ) + output = yaml.dump(topology, default_flow_style=False, sort_keys=False) + + if args.output: + path = Path(args.output) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(output) + print(f"[topology] Wrote topology to {args.output}") + else: + print(output) + + nodes = topology["gateway"]["nodes"] + print("[topology] Summary:") + print(f" Rollout: {topology['rollout']['public_url']}") + if "vllm" in nodes[0]: + print(f" vLLM: {nodes[0]['vllm']['base_url']}") + elif "sglang" in nodes[0]: + print(f" SGLang: {nodes[0]['sglang']['base_url']}") + for node in nodes: + print(f" Gateway: {node['id']} @ {node['public_url']}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/polar/config/topology.py b/src/polar/config/topology.py index a46d11ff..c04dde29 100644 --- a/src/polar/config/topology.py +++ b/src/polar/config/topology.py @@ -161,7 +161,10 @@ def _parse_gateway(raw: Any, rollout_public_url: str) -> GatewayConfig: if public_url_raw is not None else _default_public_url(host, port) ) - sglang = _require_mapping(node.get("sglang"), f"gateway.nodes[{index}].sglang") + sglang = _require_mapping( + node.get("vllm") or node.get("sglang"), + f"gateway.nodes[{index}].vllm", + ) default_runtime_raw = node.get("default_runtime") default_runtime = None if default_runtime_raw is not None: diff --git a/src/polar/gateway/node.py b/src/polar/gateway/node.py index cb57e767..11959d60 100644 --- a/src/polar/gateway/node.py +++ b/src/polar/gateway/node.py @@ -313,6 +313,8 @@ async def _run_exec_inputs( log_dir = managed.session_dir / "logs" / "agent" log_dir.mkdir(parents=True, exist_ok=True) + last_stdout: str | None = None + last_stderr: str | None = None for i, step in enumerate(steps): if managed.cancel_requested: return AgentRunResult( @@ -325,28 +327,43 @@ async def _run_exec_inputs( env=merged_env, timeout_sec=self._remaining_budget(managed), ) + last_stdout = result.stdout + last_stderr = result.stderr self._write_exec_log( log_dir, f"step.{i:02d}", 
result.stdout, result.stderr ) + logger.info( + "Step %d for session %s: rc=%s stdout_tail=%s", + i, + managed.request.session_id, + result.return_code, + (result.stdout or "")[-500:], + ) if result.return_code == -1: return AgentRunResult( status="timeout", return_code=-1, error=f"step {i} timed out", - metadata=self._step_metadata(log_dir, i, managed), + metadata=self._step_metadata( + log_dir, i, managed, last_stdout, last_stderr + ), ) if result.return_code != 0: return AgentRunResult( status="failed", return_code=result.return_code, error=f"step {i} exited with code {result.return_code}", - metadata=self._step_metadata(log_dir, i, managed), + metadata=self._step_metadata( + log_dir, i, managed, last_stdout, last_stderr + ), ) return AgentRunResult( status="completed", return_code=0, - metadata=self._step_metadata(log_dir, len(steps) - 1, managed), + metadata=self._step_metadata( + log_dir, len(steps) - 1, managed, last_stdout, last_stderr + ), ) # ------------------------------------------------------------------ @@ -751,12 +768,24 @@ def _write_exec_log( (log_dir / f"{prefix}.stderr.log").write_text(stderr) @staticmethod - def _step_metadata(log_dir: Path, step_index: int, managed: ManagedSession) -> dict: - return { + def _step_metadata( + log_dir: Path, + step_index: int, + managed: ManagedSession, + last_stdout: str | None = None, + last_stderr: str | None = None, + ) -> dict: + meta: dict = { "log_dir": str(log_dir), "last_step": step_index, "cwd": str(managed.session_dir), } + # Include truncated output tails so they survive session dir cleanup + if last_stdout: + meta["stdout_tail"] = last_stdout[-4000:] + if last_stderr: + meta["stderr_tail"] = last_stderr[-4000:] + return meta def _error_result( self, diff --git a/src/polar/gateway/server.py b/src/polar/gateway/server.py index 215277b5..aad9d445 100644 --- a/src/polar/gateway/server.py +++ b/src/polar/gateway/server.py @@ -72,6 +72,93 @@ class GatewayState: _configured_topology_path: str | None = None _configured_node_id: str | None = None +# Cached max_model_len from the backend model (populated lazily). +_max_model_len: int | None = None +_DEFAULT_MAX_OUTPUT_TOKENS = 4096 # Sensible default if model info unavailable + + +async def _fetch_max_model_len(base_url: str) -> int | None: + """Query backend for max_model_len via /v1/models.""" + try: + import httpx + + async with httpx.AsyncClient(base_url=base_url, timeout=10.0) as client: + resp = await client.get("/v1/models") + if resp.is_success: + data = resp.json().get("data", []) + if data: + return data[0].get("max_model_len") + except Exception: + pass + return None + + +def _clamp_max_tokens(request: dict[str, Any]) -> None: + """Reduce max_tokens / max_completion_tokens so it leaves room for input. + + Reserves at least 25% of the context window (min 2048 tokens) for the + input prompt. This avoids the common failure where + ``max_tokens == max_model_len`` leaves zero tokens for the prompt. + + Handles both ``max_tokens`` (legacy) and ``max_completion_tokens`` + (modern OpenAI API used by litellm/openhands). 
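+
+    For example, with ``max_model_len == 16384`` the input reserve is
+    ``max(16384 // 4, 2048) == 4096``, so any requested value above 12288 is
+    clamped down to 12288.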
+ """ + global _max_model_len + if _max_model_len: + input_reserve = max(_max_model_len // 4, 2048) + limit = _max_model_len - input_reserve + else: + limit = _DEFAULT_MAX_OUTPUT_TOKENS + for key in ("max_tokens", "max_completion_tokens"): + val = request.get(key) + if val is not None and isinstance(val, int) and val > limit: + logger.info( + "Clamped %s from %d to %d (model limit %s, input reserve %d)", + key, val, limit, _max_model_len or "default", + input_reserve if _max_model_len else 0, + ) + request[key] = limit + + +def _try_reduce_max_tokens_from_error( + error_msg: str, + request: dict[str, Any], +) -> bool: + """On a vLLM token-limit 400 error, shrink output-token fields by ~30%. + + Handles both ``max_tokens`` and ``max_completion_tokens``. + Returns ``True`` if any field was lowered (caller should retry). + + We deliberately avoid parsing the reported input-token count from the + error because vLLM reports a *derived* value (``context_len + 1 - + max_tokens``) rather than the true tokenised length, which makes + error-guided reduction unreliable. A fixed 30% reduction converges + quickly in practice. + """ + if "maximum context length" not in error_msg: + return False + + changed = False + for key in ("max_tokens", "max_completion_tokens"): + old = request.get(key) + if isinstance(old, int) and old > 128: + new = max(128, old * 7 // 10) # ~30% reduction + logger.info("Auto-reducing %s from %d to %d", key, old, new) + request[key] = new + changed = True + + # If no output-token field exists, add one at ¼ of context. + if not changed and _max_model_len: + for key in ("max_tokens", "max_completion_tokens"): + if key not in request: + new = _max_model_len // 4 + logger.info("Adding %s=%d to constrain output", key, new) + request[key] = new + changed = True + break + + return changed + def configure_server(topology_path: str = "topology.yaml", *, node_id: str | None = None) -> None: global _configured_topology_path, _configured_node_id, _state @@ -142,10 +229,20 @@ def get_state() -> GatewayState: @asynccontextmanager async def _lifespan(_: FastAPI): + global _max_model_len state = get_state() await state.node_manager.start() if state.control_client is not None: await state.control_client.start() + # Cache backend model's max_model_len for request clamping. + _max_model_len = await _fetch_max_model_len(state.node.sglang_base_url) + if _max_model_len: + logger.info("Backend max_model_len: %d", _max_model_len) + else: + logger.warning( + "Could not fetch max_model_len from backend; using default cap %d", + _DEFAULT_MAX_OUTPUT_TOKENS, + ) try: yield finally: @@ -500,13 +597,87 @@ async def proxy_request(request: Request, path: str): request.method, full_path, api_type.value, original_model, session_id, ) + # Debug: log request body keys and previous_response_id for diagnostics + body_keys = sorted(body.keys()) if isinstance(body, dict) else "not-a-dict" + prev_resp_id = body.get("previous_response_id") if isinstance(body, dict) else None + input_type = type(body.get("input", "")).__name__ if isinstance(body, dict) else "?" 
+ input_len = len(body.get("input", "")) if isinstance(body, dict) and isinstance(body.get("input"), (str, list)) else 0 + logger.info( + " body_keys=%s prev_response_id=%s input_type=%s input_len=%s stream=%s", + body_keys, prev_resp_id, input_type, input_len, body.get("stream"), + ) + if api_type == APIType.GOOGLE and "streamGenerateContent" in full_path: body["_streaming"] = True - transformed_body = body.copy() - transformed_body["_polar_model_served"] = state.node.model_served - openai_request = transformer.transform_request(transformed_body) + # Resolve previous_response_id for multi-turn Responses API conversations + if ( + api_type == APIType.OPENAI_RESPONSES + and isinstance(body, dict) + and body.get("previous_response_id") + ): + prev_id = body["previous_response_id"] + logger.info(" Resolving previous_response_id=%s from session %s", prev_id, session_id) + session_data = state.storage.load_completion_session(session_id) + if session_data and session_data.completions: + # Rebuild conversation history from stored completions + history_items: list[dict[str, Any]] = [] + for rec in session_data.completions: + req_msgs = rec.request.get("messages", []) + resp_choices = rec.response.get("choices", []) + # Add the request messages (skip system — instructions handles that) + for msg in req_msgs: + role = msg.get("role", "") + if role == "system": + continue + if role == "user": + history_items.append({"type": "message", "role": "user", "content": msg.get("content", "")}) + elif role == "tool": + history_items.append({ + "type": "function_call_output", + "call_id": msg.get("tool_call_id", ""), + "output": msg.get("content", ""), + }) + # Add the assistant response + if resp_choices: + resp_msg = resp_choices[0].get("message", {}) + content = resp_msg.get("content", "") + tool_calls = resp_msg.get("tool_calls", []) + if content: + history_items.append({ + "type": "message", + "role": "assistant", + "content": [{"type": "output_text", "text": content}], + }) + for tc in tool_calls: + func = tc.get("function", {}) + history_items.append({ + "type": "function_call", + "call_id": tc.get("id", ""), + "name": func.get("name", ""), + "arguments": func.get("arguments", "{}"), + }) + + # Merge: history_items + current input items + current_input = body.get("input", []) + if isinstance(current_input, str): + current_input = [{"type": "message", "role": "user", "content": current_input}] + elif not isinstance(current_input, list): + current_input = [] + body["input"] = history_items + current_input + logger.info( + " Resolved history: %d records, %d history items + %d current items", + len(session_data.completions), len(history_items), len(current_input), + ) + else: + logger.warning(" No session data found for previous_response_id=%s", prev_id) + + openai_request = transformer.transform_request(body) openai_request["model"] = state.node.model_served + + # Clamp max_tokens so it never exceeds the backend model's capacity. 
+ _clamp_max_tokens(openai_request) + is_streaming = openai_request.get("stream", False) if is_streaming: @@ -541,11 +712,28 @@ async def _handle_non_streaming( session_info: Any | None, ) -> JSONResponse: state = get_state() - try: - response = await state.sglang.completion(openai_request) - except UpstreamError as exc: - logger.warning("Non-streaming upstream error for session %s: %s", session_id, exc) - return _upstream_error_response(api_type, exc) + response = None + last_exc: Exception | None = None + for _attempt in range(4): + try: + response = await state.sglang.completion(openai_request) + break + except UpstreamHTTPError as exc: + last_exc = exc + if ( + exc.status_code == 400 + and _try_reduce_max_tokens_from_error(str(exc), openai_request) + ): + logger.info("Retrying (%d) with reduced max_tokens", _attempt + 1) + continue + logger.warning("Non-streaming upstream error for session %s: %s", session_id, exc) + return _upstream_error_response(api_type, exc) + except UpstreamError as exc: + logger.warning("Non-streaming upstream error for session %s: %s", session_id, exc) + return _upstream_error_response(api_type, exc) + if response is None: + logger.warning("All retries exhausted for session %s: %s", session_id, last_exc) + return _upstream_error_response(api_type, last_exc or UpstreamError("max_tokens retries exhausted")) state.storage.save_message( session_id, @@ -573,11 +761,30 @@ async def _handle_streaming( session_info: Any | None, ) -> StreamingResponse | JSONResponse: state = get_state() - try: - raw_stream = await state.sglang.open_completion_stream(openai_request) - except UpstreamError as exc: - logger.warning("Streaming setup error for session %s: %s", session_id, exc) - return _upstream_error_response(api_type, exc) + # Try opening the stream; on a token-limit 400 error, auto-reduce + # max_tokens and retry (up to 3 retries, so 4 total attempts). + raw_stream = None + last_exc: Exception | None = None + for _attempt in range(4): + try: + raw_stream = await state.sglang.open_completion_stream(openai_request) + break + except UpstreamHTTPError as exc: + last_exc = exc + if ( + exc.status_code == 400 + and _try_reduce_max_tokens_from_error(str(exc), openai_request) + ): + logger.info("Retrying (%d) with reduced max_tokens", _attempt + 1) + continue + logger.warning("Streaming setup error for session %s: %s", session_id, exc) + return _upstream_error_response(api_type, exc) + except UpstreamError as exc: + logger.warning("Streaming setup error for session %s: %s", session_id, exc) + return _upstream_error_response(api_type, exc) + if raw_stream is None: + logger.warning("All retries exhausted for session %s: %s", session_id, last_exc) + return _upstream_error_response(api_type, last_exc or UpstreamError("max_tokens retries exhausted")) accumulator = StreamAccumulator() stream_state = transformer.create_stream_state(original_request) diff --git a/src/polar/gateway/transform/openai_responses.py b/src/polar/gateway/transform/openai_responses.py index d485ea29..9668faf4 100644 --- a/src/polar/gateway/transform/openai_responses.py +++ b/src/polar/gateway/transform/openai_responses.py @@ -263,6 +263,16 @@ def transform_request(self, body: dict[str, Any]) -> dict[str, Any]: elif isinstance(input_data, list): messages.extend(self._convert_input_items_to_messages(input_data)) + # vLLM / Qwen chat templates require exactly ONE system message at + # position 0. Responses API clients (e.g. 
codex CLI) may produce + # multiple system messages (from "instructions" + "developer" items). + # Merge them into a single system message. + system_parts = [m["content"] for m in messages if m.get("role") == "system" and m.get("content")] + other_msgs = [m for m in messages if m.get("role") != "system"] + if system_parts: + messages = [{"role": "system", "content": "\n\n".join(system_parts)}] + other_msgs + else: + messages = other_msgs result: dict[str, Any] = {"messages": messages} if "max_tokens" in body: @@ -322,9 +332,11 @@ def transform_response( else: output_items.append({ "type": "function_call", + "id": f"fc_{uuid.uuid4().hex[:24]}", "call_id": tc.get("id", ""), "name": name, "arguments": func.get("arguments", "{}"), + "status": "completed", }) usage = response.get("usage", {}) @@ -378,6 +390,10 @@ def _convert_input_items_to_messages( pending_tool_outputs = [] role = item.get("role", "user") + # Responses API uses "developer" for system-level instructions; + # map to "system" for Chat Completions compatibility. + if role == "developer": + role = "system" content = self._extract_text_from_content(item.get("content", "")) messages.append({"role": role, "content": content}) @@ -446,6 +462,8 @@ def _convert_response_item_to_message(self, item: dict[str, Any]) -> Optional[di if item_type == "message": role = item.get("role", "user") + if role == "developer": + role = "system" content_items = item.get("content", []) text_parts = [] for c in content_items if isinstance(content_items, list) else []: @@ -466,6 +484,8 @@ def _convert_response_item_to_message(self, item: dict[str, Any]) -> Optional[di # Fallback: plain {role, content} dict if not item_type and "role" in item and "content" in item: role = item["role"] + if role == "developer": + role = "system" content = item["content"] if isinstance(content, str): return {"role": role, "content": content} diff --git a/src/polar/runtime/apptainer.py b/src/polar/runtime/apptainer.py index be85449b..e386bba8 100644 --- a/src/polar/runtime/apptainer.py +++ b/src/polar/runtime/apptainer.py @@ -2,9 +2,11 @@ from __future__ import annotations +import asyncio import hashlib import logging import os +import logging import shlex import shutil from pathlib import Path @@ -23,8 +25,15 @@ def __init__(self, spec: RuntimeSpec, session_id: str, session_dir: Path) -> Non # Use a hash suffix to guarantee uniqueness even when session IDs # share a long prefix (e.g. "sk-polar-...-eval" vs "sk-polar-..."). short_hash = hashlib.sha256(session_id.encode()).hexdigest()[:8] - safe_name = session_id.replace("/", "-")[:30] - self._instance_name = f"polar-{safe_name}-{short_hash}" + safe_name = session_id.replace("/", "-") + if len(safe_name) > 40: + # Keep a recognisable prefix plus a hash suffix to guarantee + # uniqueness even when session IDs share a long common prefix + # (e.g. "sk-polar-" vs "sk-polar--eval"). + prefix = safe_name[:24] + suffix = hashlib.sha256(safe_name.encode()).hexdigest()[:12] + safe_name = f"{prefix}-{suffix}" + self._instance_name = f"polar-{safe_name}" self._binary = self._resolve_binary() @property @@ -42,12 +51,15 @@ def can_disable_internet(self) -> bool: async def start(self) -> None: if self._destroyed: raise RuntimeError("apptainer runtime was already destroyed") - # Use a host-backed overlay directory instead of --writable-tmpfs - # (default tmpfs overlay is only 64 MB, too small for most workloads). 
- self._overlay_dir = self.session_dir / "overlay" - self._overlay_dir.mkdir(parents=True, exist_ok=True) - args = [self._binary, "instance", "start", - "--overlay", str(self._overlay_dir)] + args = [self._binary, "instance", "start"] + # --writable-tmpfs gives a small (64 MB) writable layer on top of the + # read-only SIF for caches/configs. Actual workload data goes through + # the bind mount (session_dir → /polar/session). + # --no-home avoids mounting the host home directory which would leak + # host-specific paths into the container. + args.extend(["--writable-tmpfs", "--no-home"]) + if self.spec.kwargs.get("fakeroot"): + args.append("--fakeroot") if self.spec.gpus > 0: args.append("--nv") network_name: str | None @@ -59,10 +71,26 @@ async def start(self) -> None: args.extend(["--net", "--network", network_name]) args.extend(["--bind", f"{self.session_dir}:{self.runtime_session_dir}"]) args.extend([self.spec.image, self._instance_name]) - rc, _, _ = await self._run_local_command(*args) + # Do NOT use capture=True here. `apptainer instance start` forks a + # daemon that inherits pipe fds; asyncio.communicate() then blocks + # forever waiting for the daemon to close them. We redirect stderr + # to a temp file so we can still report errors on failure. + stderr_path = self.session_dir / "apptainer_start.err" + stderr_fh = stderr_path.open("w") + try: + proc = await asyncio.create_subprocess_exec( + *args, + stdout=asyncio.subprocess.DEVNULL, + stderr=stderr_fh, + ) + rc = await proc.wait() + finally: + stderr_fh.close() if rc != 0: + detail = stderr_path.read_text().strip() raise RuntimeError( f"{self._binary} instance start failed with exit code {rc}" + + (f": {detail}" if detail else "") ) _STOP_TIMEOUT = 30.0 @@ -94,9 +122,14 @@ async def exec( if effective_workdir: wrapped_command = f"cd {shlex.quote(effective_workdir)} && {command}" args = [self._binary, "exec", f"instance://{self._instance_name}"] + # Ensure HOME is set inside the container (--no-home leaves it + # pointing at the non-existent host home) and clear host-specific + # cache dirs that would fail inside a read-only overlay. + effective_env: dict[str, str] = {"HOME": "/root"} if env: - args.append("env") - args.extend(f"{key}={value}" for key, value in env.items()) + effective_env.update(env) + args.append("env") + args.extend(f"{key}={value}" for key, value in effective_env.items()) args.extend(["bash", "-lc", wrapped_command]) rc, stdout, stderr = await self._run_local_command( *args, timeout=timeout_sec, capture=True