diff --git a/.gitignore b/.gitignore index 8f8ed15d..e876b4c1 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,21 @@ wandb/ checkpoints/ rollout_results/ reference_projects/ -docs/ \ No newline at end of file +docs/ + +# SLURM job outputs +*.out +*.err + +# Cluster results +results/ + +# Private cluster configs and logs +worklogs/ + +# Cluster artifacts (generated locally) +cluster.yaml +opencode.def +test_results.log +tmp_defs/ +sif_images/ diff --git a/examples/calculator/README.md b/examples/calculator/README.md index 2ebee5e6..895d56a5 100644 --- a/examples/calculator/README.md +++ b/examples/calculator/README.md @@ -101,4 +101,70 @@ Each rollout then prepares a fresh workspace by: - installing the harness CLI or SDK for that run - creating `/polar/session/workspace` - uploading `calculator.py` and `test_calculator.py` -- initializing a git repo used by the evaluator \ No newline at end of file +- initializing a git repo used by the evaluator + +## Cluster Deployment (SLURM) + +For running on a SLURM cluster with Apptainer containers and vLLM inference. +See [examples/slurm/README.md](../slurm/README.md) for full documentation. + +### 1. Configure + +```bash +cp examples/slurm/cluster.yaml.example my-cluster.yaml +# Edit my-cluster.yaml with your cluster details +``` + +### 2. One-Time Setup + +```bash +polar cluster setup -c my-cluster.yaml +``` + +### 3. Build SIF Image + +```bash +# Single harness: +polar cluster build-sif -c my-cluster.yaml --example calculator --harness opencode + +# Multiple harnesses: +polar cluster build-sif -c my-cluster.yaml --example calculator --harness opencode,codex,swe_agent +``` + +### 4. Start Services + +```bash +polar cluster serve -c my-cluster.yaml +``` + +Once services are ready, the command prints the job ID and a sample `submit-task` command. + +### 5. Submit Tasks + +```bash +# Use the job ID from step 4 +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example calculator --harness opencode + +# Multiple harnesses against the same running service +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example calculator --harness codex +``` + +### 6. Stop Services + +```bash +scancel JOB_ID +``` + +### 7. Collect Results + +```bash +polar cluster sync -c my-cluster.yaml +``` + +**One-shot alternative** — start services, run tasks, and exit in one command: + +```bash +polar cluster launch -c my-cluster.yaml --example calculator --harness opencode +``` diff --git a/examples/calculator/swe_agent/Dockerfile b/examples/calculator/swe_agent/Dockerfile new file mode 100644 index 00000000..47c53664 --- /dev/null +++ b/examples/calculator/swe_agent/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.12-slim + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + bash \ + ca-certificates \ + curl \ + git \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# Install SWE-agent from git (PyPI version has missing dependency) +RUN pip install --no-cache-dir "git+https://github.com/SWE-agent/SWE-agent.git" \ + && SITE=$(python -c "import site; print(site.getsitepackages()[0])") \ + && git clone --depth 1 https://github.com/SWE-agent/SWE-agent.git /tmp/swe-agent-src \ + && cp -r /tmp/swe-agent-src/config "$SITE/config" \ + && cp -r /tmp/swe-agent-src/tools "$SITE/tools" \ + && mkdir -p "$SITE/trajectories" \ + && rm -rf /tmp/swe-agent-src + +# Pre-install tool dependencies so install.sh is a no-op at runtime +# (compute nodes may lack internet access). 
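+# NOTE (assumption): the explicit tree-sitter pin below targets the pre-0.22
+# binding API that the tree-sitter-languages wheels are built against; relax
+# the pin if the harness's tooling updates that dependency.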
+RUN pip install --no-cache-dir 'tree-sitter==0.21.3' 'tree-sitter-languages' + +WORKDIR /polar/session diff --git a/examples/slurm/README.md b/examples/slurm/README.md new file mode 100644 index 00000000..f000cc1b --- /dev/null +++ b/examples/slurm/README.md @@ -0,0 +1,415 @@ +# Polar on SLURM + +Deploy Polar agent rollout jobs on a SLURM cluster using Apptainer containers and vLLM inference. + +## Prerequisites + +- SSH access to a SLURM login node +- SLURM account with GPU allocation +- Shared filesystem (Lustre, GPFS, etc.) accessible from all nodes +- Apptainer on the cluster (for running agent containers) +- Python 3.10+ on the cluster + +## Install Polar + +From a checkout of this repository (laptop or cluster login node): + +```bash +python3 -m venv .venv +source .venv/bin/activate +uv pip install -e . +polar --help +``` + +After `polar cluster setup`, the cluster workspace venv also provides the `polar` command. + +## Quick Start + +### 1. Configure + +```bash +cp examples/slurm/cluster.yaml.example my-cluster.yaml +# Edit my-cluster.yaml with your cluster details +``` + +Key fields to set: +- `slurm.login_node` -- SSH hostname of your login node +- `slurm.account` -- your SLURM account string +- `slurm.partition` -- SLURM partition name +- `paths.workspace` -- base directory on shared filesystem + +### 2. One-Time Setup + +```bash +polar cluster setup -c my-cluster.yaml +``` + +This syncs code to the cluster and: +- Creates directories (sif_images/, results/, apptainer_cache/) +- Creates a Python venv and installs Polar +- Verifies Apptainer and CUDA are accessible + +### 3. Build SIF Images + +```bash +# Build a single harness +polar cluster build-sif -c my-cluster.yaml --example calculator --harness opencode + +# Build multiple harnesses at once +polar cluster build-sif -c my-cluster.yaml --example calculator --harness opencode,codex,swe_agent +``` + +### 4. Start Services + +```bash +# Start vLLM + rollout + gateway (waits until ready) +polar cluster serve -c my-cluster.yaml + +# Override model or resources +polar cluster serve -c my-cluster.yaml --model "Qwen/Qwen3.5-72B" --nodes 2 + +# Preview without submitting +polar cluster serve -c my-cluster.yaml --dry-run +``` + +Once ready, the command prints the job ID and a sample `submit-task` command. + +### 5. Submit Tasks + +```bash +# Submit against the running service (use job ID from step 4) +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example calculator --harness opencode + +# Multiple harnesses can reuse the same running service +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example calculator --harness codex +``` + +### 6. Stop Services + +```bash +scancel JOB_ID +``` + +### 7. One-Shot Mode (Alternative) + +If you prefer a single command that starts services, runs tasks, and exits: + +```bash +polar cluster launch -c my-cluster.yaml --example calculator --harness opencode +``` + +Services stop after tasks complete. + +### 8. Monitor + +```bash +# Check job status +polar cluster status -c my-cluster.yaml + +# Or directly via SSH +ssh squeue -u \$USER +``` + +### 9. 
Sync Results + +```bash +# Sync everything +polar cluster sync -c my-cluster.yaml + +# Sync a specific job +polar cluster sync -c my-cluster.yaml --job-id 12345 + +# Sync only code changes +polar cluster sync -c my-cluster.yaml --code-only +``` + +## Example Workflows + +### Calculator (Quick Validation) + +Build SIFs and submit tasks for one or more harnesses: + +```bash +polar cluster build-sif -c my-cluster.yaml \ + --example calculator --harness opencode,codex,swe_agent + +polar cluster serve -c my-cluster.yaml + +# Use the job ID printed by serve +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example calculator --harness opencode +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example calculator --harness codex +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example calculator --harness swe_agent +``` + +**Supported calculator harnesses:** + +| Harness | API | vLLM Compatible | Status | +|---------|-----|-----------------|--------| +| opencode | OpenAI Chat | Yes | Verified | +| codex | OpenAI Responses | Yes | Verified | +| swe_agent | OpenAI Chat | Yes | Verified | +| qwen_code | OpenAI Chat | Yes | Available | +| openhands_sdk | OpenAI Chat | Yes | Available | +| claude_code | Anthropic Messages | No (needs native API) | Local only | +| gemini_cli | Google Generative AI | No (needs native API) | Local only | + +### SWE-Gym (10 Curated Tasks) + +Each SWE-Gym instance needs its own SIF: + +```bash +# Build all 10 sample SIFs +polar cluster build-sif -c my-cluster.yaml \ + --example swegym --harness swe_agent + +# Start services and submit +polar cluster serve -c my-cluster.yaml +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example swegym --harness swe_agent \ + --timeout-seconds 2400 + +# Or a single instance +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example swegym --harness swe_agent \ + --timeout-seconds 2400 --instance-id getmoto__moto-7365 +``` + +Sample instances: + +| Instance | Repo | +|----------|------| +| `getmoto__moto-7365` | getmoto/moto | +| `python__mypy-10392` | python/mypy | +| `conan-io__conan-13721` | conan-io/conan | +| `iterative__dvc-1809` | iterative/dvc | +| `dask__dask-10441` | dask/dask | +| `pydantic__pydantic-8072` | pydantic/pydantic | +| `pandas-dev__pandas-58335` | pandas-dev/pandas | +| `facebookresearch__hydra-1783` | facebookresearch/hydra | +| `bokeh__bokeh-13636` | bokeh/bokeh | +| `Project-MONAI__MONAI-2238` | Project-MONAI/MONAI | + +### SWE-bench Verified (500 Tasks) + +Full benchmark evaluation with per-instance containers: + +```bash +# Cache dataset (once) +python -c "from examples.swebench_verified.dataset import load_swebench_verified; load_swebench_verified()" + +# Build per-instance SIFs +polar cluster build-sif -c my-cluster.yaml \ + --example swebench_verified --harness opencode \ + --instance-id django__django-15098 + +# Start services and submit +polar cluster serve -c my-cluster.yaml +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example swebench_verified --harness opencode \ + --timeout-seconds 3600 --instance-id django__django-15098 +``` + +### SWE-Gym Slime GRPO (RL Training) + +Distributed RL training using Polar for rollout and Slime + Megatron for GRPO training. +See [examples/swegym_slime_grpo/README.md](../swegym_slime_grpo/README.md) for detailed setup. 
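+
+Before submitting, it can help to sanity-check that the prompt JSONL parses
+cleanly, since a malformed line would otherwise only surface inside the
+training job. A minimal stdlib sketch (the path matches the command below; the
+per-record schema is deliberately not assumed):
+
+```python
+import json
+from pathlib import Path
+
+# Prompt file later passed to `polar cluster train --prompt-data ...`.
+prompt_path = Path("examples/swegym_slime_grpo/swegym_10_tasks.jsonl")
+
+records = []
+for lineno, line in enumerate(prompt_path.read_text().splitlines(), start=1):
+    if not line.strip():
+        continue  # tolerate blank lines
+    try:
+        records.append(json.loads(line))
+    except json.JSONDecodeError as exc:
+        raise SystemExit(f"{prompt_path}:{lineno} is not valid JSON: {exc}")
+
+print(f"{len(records)} training prompts in {prompt_path}")
+```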
+ +```bash +# Build training SIF (uses slimerl/slime Docker image as base) +polar cluster build-sif -c my-cluster.yaml --example train + +# Submit training job +polar cluster train -c my-cluster.yaml \ + --polar-config examples/swegym_slime_grpo/polar_config.yaml \ + --prompt-data examples/swegym_slime_grpo/swegym_10_tasks.jsonl \ + --num-rollouts 5 +``` + +## CLI Commands + +### `polar cluster setup` + +One-time cluster initialization. + +### `polar cluster build-sif` + +Build Apptainer SIF images on the cluster. + +| Option | Default | Description | +|--------|---------|-------------| +| `--example` | calculator | Task type: `calculator`, `swegym`, `swebench_verified`, `train` | +| `--harness` | opencode | Agent: `opencode`, `codex`, `swe_agent`, etc. | +| `--instance-id` | -- | Specific instance (repeatable, for swegym/swebench) | +| `--force` | -- | Rebuild even if SIF exists | + +### `polar cluster serve` + +Start vLLM + rollout + gateway services and wait until ready. + +| Option | Default | Description | +|--------|---------|-------------| +| `--model` | Qwen/Qwen3.5-27B | Model name for vLLM | +| `--nodes` | 1 | Number of SLURM nodes | +| `--gpus` | 8 | GPUs per node | +| `--time` | 04:00:00 | Job time limit | +| `--no-sync` | -- | Skip rsync to cluster | +| `--no-wait` | -- | Submit and return immediately | +| `--wait-timeout` | 600 | Seconds to wait for services | +| `--dry-run` | -- | Print sbatch command only | + +### `polar cluster submit-task` + +Submit tasks to a running service. + +| Option | Default | Description | +|--------|---------|-------------| +| `--job-id` | (required) | SLURM job ID from `serve` | +| `--example` | calculator | Task: `calculator`, `swegym`, `swebench_verified` | +| `--harness` | opencode | Agent: `opencode`, `codex`, `swe_agent`, etc. | +| `--num-rollouts` | 4 | Rollouts per task | +| `--timeout-seconds` | 900 | Per-session timeout | +| `--instance-id` | -- | Specific instance (repeatable) | + +### `polar cluster launch` (one-shot) + +Starts services, runs tasks, and exits in one SLURM job. + +| Option | Default | Description | +|--------|---------|-------------| +| `--example` | calculator | Task: `calculator`, `swegym`, `swebench_verified` | +| `--harness` | opencode | Agent: `opencode`, `codex`, `swe_agent`, etc. | +| `--model` | Qwen/Qwen3.5-27B | Model name for vLLM | +| `--nodes` | 1 | Number of SLURM nodes | +| `--gpus` | 8 | GPUs per node | +| `--time` | 04:00:00 | Job time limit | +| `--num-rollouts` | 4 | Rollouts per task | +| `--timeout-seconds` | 900 | Per-session timeout | +| `--instance-id` | -- | Specific instance (repeatable) | +| `--no-sync` | -- | Skip rsync to cluster | +| `--dry-run` | -- | Print sbatch command only | + +### `polar cluster train` + +Submit RL training job (Slime + Megatron GRPO). + +| Option | Default | Description | +|--------|---------|-------------| +| `--polar-config` | (required) | Path to polar_config.yaml | +| `--prompt-data` | (required) | Path to JSONL training data | +| `--hf-checkpoint` | Qwen/Qwen3-4B | HuggingFace model checkpoint | +| `--num-rollouts` | 5 | Training steps | +| `--no-sync` | -- | Skip rsync to cluster | +| `--no-wait` | -- | Submit and return immediately | +| `--dry-run` | -- | Print sbatch command only | + +### `polar cluster status` + +Show job status and running services. + +### `polar cluster sync` + +Sync code and results between local and cluster. 
+ +| Option | Default | Description | +|--------|---------|-------------| +| `--results-only` | -- | Only sync results (no code) | +| `--code-only` | -- | Only sync code (no results) | +| `--job-id` | -- | Sync a specific job's results | + +## Architecture + +### Two-Phase (serve + submit-task) + +The `serve` command submits a SLURM job that: + +``` +1. Discover hostnames (scontrol show hostnames) +2. Generate topology.yaml +3. Start vLLM server (GPU, ~2-5min to load model) +4. Start rollout service (CPU) +5. Start gateway node(s) (CPU, manages Apptainer containers) +6. Write .services_ready sentinel +7. Wait indefinitely (until scancel or SLURM timeout) +``` + +The `submit-task` command discovers the running service via the sentinel file and submits tasks. + +### One-Shot (launch) + +The `launch` command bundles everything into one SLURM job (steps 1-5 above + submit tasks + wait + cleanup). + +For multi-node jobs: +- Node 0: vLLM + rollout + gateway +- Node 1+: additional gateway nodes (via `srun --overlap`) + +## Results Structure + +``` +polar-serve_/ + topology.yaml + .services_ready + logs/ + vllm.log + rollout.log + gateway_node-00.log + rollout_results/ + task_/ses_.json + tasks/ + request.json # (calculator) + response.json # (calculator) + manifest.json # (swegym/swebench) + / # (swegym/swebench: one dir per instance) + request.json + response.json + summary.json # (swegym/swebench) +``` + +## Tuning Parameters + +| Parameter | Default | Notes | +|-----------|---------|-------| +| `--num-rollouts` | 4 | Number of parallel sessions per task | +| `--timeout-seconds` | 900 | Per-session timeout. Use 2400 for swegym, 3600 for swebench | +| `max_model_len` | 16384 | In cluster.yaml `model` section. Increase for complex tasks | +| `max_num_seqs` | 64 | Max concurrent vLLM sequences | +| `gpu_memory_utilization` | 0.90 | Reduce if OOM | +| `tensor_parallel_size` | 8 | Match `gpus_per_node` | +| `tool_call_parser` | qwen3_xml | Must match model. qwen3_xml for Qwen3.5, hermes for others | + +## Switching Models + +Edit your `cluster.yaml`: + +```yaml +model: + name: "Qwen/Qwen3.5-27B" # Change to your model + tensor_parallel_size: 8 # Adjust for model size + tool_call_parser: "qwen3_xml" # Match model's tool call format +``` + +Or override on the command line: + +```bash +polar cluster serve -c my-cluster.yaml --model "Qwen/Qwen3.5-72B" +``` + +**Note**: The model must be cached in `$HF_HOME` on the cluster (compute nodes run with `HF_HUB_OFFLINE=1`). To cache a new model: + +```bash +huggingface-cli download Qwen/Qwen3.5-72B +``` + +## Backend Support + +The cluster config supports multiple backends via the `backend` field: + +| Backend | Status | Description | +|---------|--------|-------------| +| `slurm` | Supported | SSH + sbatch job submission | +| `local` | Stub | Run services locally (use example scripts) | +| `k8s` | Planned | Kubernetes pod scheduling | diff --git a/examples/slurm/cluster.yaml.example b/examples/slurm/cluster.yaml.example new file mode 100644 index 00000000..1b550f8b --- /dev/null +++ b/examples/slurm/cluster.yaml.example @@ -0,0 +1,71 @@ +# Polar cluster configuration. +# +# Copy this file and fill in your cluster details: +# cp examples/slurm/cluster.yaml.example my-cluster.yaml +# # Edit my-cluster.yaml +# polar cluster launch -c my-cluster.yaml +# +# The same config works for local, SLURM, and (future) Kubernetes backends. +# Set 'backend' to select the deployment target. 
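+#
+# Minimum fields to fill in for a SLURM run: slurm.login_node, slurm.account,
+# slurm.partition, and paths.workspace; the remaining sections have defaults below.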
+ +# ── Backend ───────────────────────────────────────────────────────────────── +backend: slurm # "local" | "slurm" | "k8s" + +# ── SLURM connection (required when backend=slurm) ───────────────────────── +slurm: + login_node: "" # SSH host for job submission + account: "" # SLURM account/project string + partition: "" # SLURM partition name + +# ── Paths on the cluster filesystem ───────────────────────────────────────── +paths: + # Base workspace directory (required). All other paths derive from this. + workspace: "/path/to/your/workspace" + + # Override any derived path if your layout differs: + # polar_root: "${workspace}/polar" + # code: "${polar_root}/ProRL-Agent-Server" + # sif_dir: "${polar_root}/sif_images" + # results: "${polar_root}/results" + # venv: "${polar_root}/.venv" + + # Optional: custom binary paths (leave commented if system-installed) + # apptainer_bin_dir: "/path/to/apptainer/usr/bin" + # cuda_home: "/path/to/cuda-12.x" + +# ── Model configuration ────────────────────────────────────────────────────── +model: + name: "Qwen/Qwen3.5-27B" # HuggingFace model ID + tensor_parallel_size: 8 # TP size for vLLM + max_model_len: 16384 + gpu_memory_utilization: 0.90 + max_num_seqs: 64 + tool_call_parser: "qwen3_xml" # qwen3_xml, hermes, etc. + +# ── Task defaults ──────────────────────────────────────────────────────────── +task: + example: "calculator" # calculator or swegym + harness: "opencode" # opencode, codex, swe_agent, etc. + num_rollouts: 4 + timeout_seconds: 900 + +# ── Resource allocation ────────────────────────────────────────────────────── +resources: + nodes: 1 + gpus_per_node: 8 + cpus_per_task: 64 + mem: "512G" + time: "04:00:00" # HH:MM:SS job time limit + +# ── Service ports ──────────────────────────────────────────────────────────── +ports: + vllm: 18000 + rollout: 18080 + gateway_base: 18100 + +# ── Gateway tuning ─────────────────────────────────────────────────────────── +gateway: + max_init_workers: 8 + max_run_workers: 4 + max_postrun_workers: 4 + ready_buffer_target: 4 diff --git a/examples/slurm/setup_cluster.sh b/examples/slurm/setup_cluster.sh new file mode 100644 index 00000000..272ef07f --- /dev/null +++ b/examples/slurm/setup_cluster.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# setup_cluster.sh — One-time cluster setup for Polar on SLURM. +# +# Run via: +# polar cluster setup -c my-cluster.yaml +# +# Or manually on the cluster: +# export POLAR_WORKSPACE=/path/to/workspace +# bash examples/slurm/setup_cluster.sh +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +# Source env.sh from the cluster templates +source "${REPO_ROOT}/src/polar/cluster/templates/env.sh" + +echo "===============================================================" +echo "Polar Cluster Setup" +echo "===============================================================" + +# ── 1. Create directory structure ────────────────────────────────────────────── +echo "[setup] Creating directories..." +mkdir -p "${POLAR_ROOT}" +mkdir -p "${POLAR_SIFS}" +mkdir -p "${POLAR_RESULTS}" +mkdir -p "${APPTAINER_CACHEDIR}" +echo " POLAR_ROOT: ${POLAR_ROOT}" +echo " POLAR_SIFS: ${POLAR_SIFS}" +echo " POLAR_RESULTS: ${POLAR_RESULTS}" +echo " APPTAINER_CACHEDIR: ${APPTAINER_CACHEDIR}" + +# ── 2. Check Apptainer ──────────────────────────────────────────────────────── +echo "" +echo "[setup] Checking Apptainer..." 
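+# Missing Apptainer is non-fatal at this stage; it is required later for
+# `polar cluster build-sif`, either on PATH or via paths.apptainer_bin_dir.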
+if command -v apptainer &>/dev/null; then + echo " Apptainer: $(which apptainer)" + apptainer --version +else + echo " WARNING: Apptainer not found on PATH." + echo " Set paths.apptainer_bin_dir in your cluster.yaml." +fi + +# ── 3. Create/update Python venv ────────────────────────────────────────────── +echo "" +echo "[setup] Setting up Python venv at ${POLAR_VENV}..." +if [ ! -d "${POLAR_VENV}" ]; then + echo " Creating new venv..." + python3 -m venv "${POLAR_VENV}" +fi +source "${POLAR_VENV}/bin/activate" + +echo " Python: $(which python) ($(python --version))" + +# Install polar in editable mode +echo " Installing polar..." +pip install --upgrade pip +pip install -e "${POLAR_CODE}" 2>&1 | tail -5 + +# Check if vLLM is installed +if python -c "import vllm" 2>/dev/null; then + echo " vLLM: $(python -c 'import vllm; print(vllm.__version__)')" +else + echo "" + echo " WARNING: vLLM is not installed in this venv." + echo " To install: pip install vllm" +fi + +# ── 4. Verify polar CLI ─────────────────────────────────────────────────────── +echo "" +echo "[setup] Verifying polar CLI..." +polar --help > /dev/null 2>&1 && echo " polar CLI: OK" || echo " ERROR: polar CLI not working" + +# ── 5. Summary ───────────────────────────────────────────────────────────────── +echo "" +echo "===============================================================" +echo "Setup complete!" +echo "" +echo "Next steps:" +echo " 1. Build SIF images:" +echo " polar cluster build-sif -c cluster.yaml --example calculator --harness opencode" +echo "" +echo " 2. Submit a job:" +echo " polar cluster launch -c cluster.yaml" +echo "===============================================================" diff --git a/examples/swebench_verified/README.md b/examples/swebench_verified/README.md index 5dbc37b7..1385b0a9 100644 --- a/examples/swebench_verified/README.md +++ b/examples/swebench_verified/README.md @@ -87,3 +87,87 @@ uv run python examples/swebench_verified/submit_swebench_tasks.py \ --max-concurrent 4 \ --max-tasks 10 ``` + +Supported harness names: `claude_code`, `codex`, `opencode`, `openhands_sdk` + +## Cluster Deployment (SLURM) + +For running on a SLURM cluster with Apptainer containers and vLLM inference. +See [examples/slurm/README.md](../slurm/README.md) for full documentation. + +### 1. Configure + +```bash +cp examples/slurm/cluster.yaml.example my-cluster.yaml +# Edit my-cluster.yaml with your cluster details +``` + +### 2. Populate Dataset Cache + +The task runner needs the full SWE-bench Verified dataset cached locally. +Run once (requires `datasets` library): + +```bash +python -c "from examples.swebench_verified.dataset import load_swebench_verified; load_swebench_verified()" +``` + +### 3. Build SIF Images + +Each SWE-bench Verified instance needs a per-instance SIF: + +```bash +# Build SIF for a specific instance + harness +polar cluster build-sif -c my-cluster.yaml \ + --example swebench_verified --harness opencode \ + --instance-id django__django-15098 + +# Build multiple instances +polar cluster build-sif -c my-cluster.yaml \ + --example swebench_verified --harness opencode \ + --instance-id django__django-15098 \ + --instance-id sympy__sympy-18835 +``` + +### 4. Start Services + +```bash +polar cluster serve -c my-cluster.yaml +``` + +Once services are ready, the command prints the job ID. + +### 5. 
Submit Tasks + +```bash +# Submit a single instance (use job ID from step 4) +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example swebench_verified --harness opencode \ + --timeout-seconds 3600 --instance-id django__django-15098 + +# Submit multiple instances +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example swebench_verified --harness opencode \ + --timeout-seconds 3600 \ + --instance-id django__django-15098 \ + --instance-id sympy__sympy-18835 +``` + +### 6. Stop Services + +```bash +scancel JOB_ID +``` + +### 7. Collect Results + +```bash +polar cluster sync -c my-cluster.yaml +``` + +**One-shot alternative** — start services, run tasks, and exit in one command: + +```bash +polar cluster launch -c my-cluster.yaml \ + --example swebench_verified --harness opencode \ + --timeout-seconds 3600 --instance-id django__django-15098 +``` diff --git a/examples/swebench_verified/dataset.py b/examples/swebench_verified/dataset.py index 7078dd69..143915b8 100644 --- a/examples/swebench_verified/dataset.py +++ b/examples/swebench_verified/dataset.py @@ -12,7 +12,10 @@ DEFAULT_CACHE_PATH = Path.home() / ".cache" / "polar" / "swebench_verified.json" HARNESS_IMAGE_PREFIX = "polar-swebench" -SUPPORTED_HARNESSES = ("opencode", "codex", "claude_code") +SUPPORTED_HARNESSES = ( + "opencode", "codex", "claude_code", + "gemini_cli", "qwen_code", "swe_agent", "openhands_sdk", +) def sanitize_instance_id(instance_id: str) -> str: diff --git a/examples/swebench_verified/submit_swebench_tasks.py b/examples/swebench_verified/submit_swebench_tasks.py index 24be1290..f53fb025 100644 --- a/examples/swebench_verified/submit_swebench_tasks.py +++ b/examples/swebench_verified/submit_swebench_tasks.py @@ -34,6 +34,8 @@ "codex": "@openai/codex@latest", "opencode": "opencode-ai@latest", "claude_code": "@anthropic-ai/claude-code@latest", + "gemini_cli": "@google/gemini-cli@latest", + "qwen_code": "@qwen-code/qwen-code@latest", } _PREPARE_BASE = ( @@ -48,8 +50,18 @@ def prepare_command_for_harness(harness: str) -> str: - pkg = HARNESS_NPM_PACKAGE[harness] - return f"npm install -g {pkg} && {_PREPARE_BASE}" + if harness in HARNESS_NPM_PACKAGE: + pkg = HARNESS_NPM_PACKAGE[harness] + return f"npm install -g {pkg} && {_PREPARE_BASE}" + if harness == "swe_agent": + return ( + "source /opt/miniconda3/etc/profile.d/conda.sh && " + "conda activate polar-sweagent && " + f"{_PREPARE_BASE}" + ) + if harness == "openhands_sdk": + return _PREPARE_BASE + raise ValueError(f"Unknown harness: {harness}") def runtime_env_for_harness(harness: str) -> dict[str, str]: @@ -146,6 +158,31 @@ def select_instances(args: argparse.Namespace) -> list[dict[str, Any]]: return instances +def agent_settings_for_harness(harness: str) -> dict[str, Any]: + if harness == "swe_agent": + return { + "repo_path": "/polar/session/workspace", + "shell_preamble": ( + "source /opt/miniconda3/etc/profile.d/conda.sh && " + "conda activate polar-sweagent && " + "export PATH=/opt/miniconda3/envs/testbed/bin:$PATH" + ), + } + return {} + + +def agent_env_for_harness(harness: str) -> dict[str, str]: + if harness in ("openhands_sdk", "openhands"): + return {"WORKSPACE_BASE": "/polar/session/workspace"} + return {} + + +def runtime_kwargs_for_harness(harness: str) -> dict[str, Any]: + if harness == "swe_agent": + return {"fakeroot": True} + return {} + + def build_task_request( args: argparse.Namespace, *, @@ -154,6 +191,7 @@ def build_task_request( ) -> dict[str, Any]: instance_id = str(instance["instance_id"]) image = 
runtime_image_for_instance(instance_id) + kwargs = runtime_kwargs_for_harness(args.harness) return { "task_id": f"swebench-{args.harness}-{sanitize_instance_id(instance_id)}-{batch_id}", "instruction": str(instance["problem_statement"]).strip(), @@ -166,11 +204,12 @@ def build_task_request( "env": runtime_env_for_harness(args.harness), "network": "host", "workdir": "/polar/session/workspace", + **({"kwargs": kwargs} if kwargs else {}), }, "agent": { "harness": args.harness, - "settings": {}, - "env": {}, + "settings": agent_settings_for_harness(args.harness), + "env": agent_env_for_harness(args.harness), }, "builder": {"strategy": "prefix_merging"}, "evaluator": { diff --git a/examples/swegym/README.md b/examples/swegym/README.md index 67edf889..10441126 100644 --- a/examples/swegym/README.md +++ b/examples/swegym/README.md @@ -86,3 +86,63 @@ examples/swegym//batches// - The sample is text-only SWE-Gym data. - The submit helper extracts patches from `/polar/session/workspace`, then replays them onto `/testbed` for grading. - `swe_agent` uses a dedicated `polar-sweagent` environment inside the derived image. +- `openhands_sdk` only builds on benchmark images whose native Python is already compatible. + +## Cluster Deployment (SLURM) + +For running on a SLURM cluster with Apptainer containers and vLLM inference. +See [examples/slurm/README.md](../slurm/README.md) for full documentation. + +### 1. Configure + +```bash +cp examples/slurm/cluster.yaml.example my-cluster.yaml +# Edit my-cluster.yaml with your cluster details +``` + +### 2. Build SIF Images + +```bash +polar cluster build-sif -c my-cluster.yaml --example swegym --harness swe_agent +``` + +### 3. Start Services + +```bash +polar cluster serve -c my-cluster.yaml +``` + +Once services are ready, the command prints the job ID. + +### 4. Submit Tasks + +```bash +# All 10 sample instances (use job ID from step 3) +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example swegym --harness swe_agent \ + --timeout-seconds 2400 + +# Or a single instance +polar cluster submit-task -c my-cluster.yaml \ + --job-id JOB_ID --example swegym --harness swe_agent \ + --timeout-seconds 2400 --instance-id getmoto__moto-7365 +``` + +### 5. Stop Services + +```bash +scancel JOB_ID +``` + +### 6. 
Collect Results + +```bash +polar cluster sync -c my-cluster.yaml +``` + +**One-shot alternative** — start services, run tasks, and exit in one command: + +```bash +polar cluster launch -c my-cluster.yaml --example swegym --harness swe_agent \ + --timeout-seconds 2400 +``` diff --git a/examples/swegym/sample_tasks.py b/examples/swegym/sample_tasks.py index 6cbba34e..65a5372c 100644 --- a/examples/swegym/sample_tasks.py +++ b/examples/swegym/sample_tasks.py @@ -152,3 +152,10 @@ def fetch_sample_instances( cache_file.parent.mkdir(parents=True, exist_ok=True) cache_file.write_text(json.dumps(ordered, indent=2, ensure_ascii=True, sort_keys=True)) return ordered + + +if __name__ == "__main__": + instances = fetch_sample_instances() + print(f"Cached {len(instances)} instances to {DEFAULT_CACHE_PATH}") + for inst in instances: + print(f" {inst['instance_id']}") diff --git a/pyproject.toml b/pyproject.toml index 1cc4a30c..590c3010 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,3 +40,6 @@ package-dir = {"" = "src"} [tool.setuptools.packages.find] where = ["src"] include = ["polar*"] + +[tool.setuptools.package-data] +"polar.cluster" = ["templates/*.sbatch", "templates/*.sh"] diff --git a/src/polar/agent/harnesses/codex.py b/src/polar/agent/harnesses/codex.py index 18e1afd8..cac1f0a9 100644 --- a/src/polar/agent/harnesses/codex.py +++ b/src/polar/agent/harnesses/codex.py @@ -16,7 +16,7 @@ class CodexHarness(BaseHarness): def __init__(self, agent_spec: AgentSpec) -> None: super().__init__(agent_spec) - self._codex_home = "$HOME/.codex" + self._codex_home = "/root/.codex" async def setup(self, runtime: BaseRuntime) -> None: await runtime.exec(f"mkdir -p {self._codex_home}") @@ -82,6 +82,7 @@ def run_steps(self, instruction: str) -> list[ExecInput]: return [ ExecInput( command=( + f"set -o pipefail && " f"codex exec {flags_str} -- {escaped} " f"2>&1 | tee {RUNTIME_AGENT_LOG_DIR}/codex.txt" ), diff --git a/src/polar/agent/harnesses/opencode.py b/src/polar/agent/harnesses/opencode.py index 48659f05..95517148 100644 --- a/src/polar/agent/harnesses/opencode.py +++ b/src/polar/agent/harnesses/opencode.py @@ -70,6 +70,7 @@ def run_steps(self, instruction: str) -> list[ExecInput]: return [ ExecInput( command=( + f"set -o pipefail; " f"opencode -m {shlex.quote(model)} run " f"--format=json -- {escaped} " f"2>&1 | tee {RUNTIME_AGENT_LOG_DIR}/opencode.txt" diff --git a/src/polar/agent/harnesses/openhands_sdk.py b/src/polar/agent/harnesses/openhands_sdk.py index 4c393c9d..ab0ad7a9 100644 --- a/src/polar/agent/harnesses/openhands_sdk.py +++ b/src/polar/agent/harnesses/openhands_sdk.py @@ -79,7 +79,8 @@ def run_steps(self, instruction: str) -> list[ExecInput]: ExecInput( command=( 'export LLM_API_KEY="$OPENAI_API_KEY" LLM_BASE_URL="$OPENAI_BASE_URL" && ' - 'PYTHON_BIN="$HOME/.venv/bin/python"; ' + 'PYTHON_BIN="/opt/miniconda3/envs/polar-openhands/bin/python"; ' + '[ -x "$PYTHON_BIN" ] || PYTHON_BIN="$HOME/.venv/bin/python"; ' '[ -x "$PYTHON_BIN" ] || PYTHON_BIN="/opt/openhands-sdk-venv/bin/python"; ' '[ -x "$PYTHON_BIN" ] || PYTHON_BIN="$(command -v python3 || command -v python)"; ' '"$PYTHON_BIN" ' @@ -93,12 +94,19 @@ def run_steps(self, instruction: str) -> list[ExecInput]: _RUNNER_SCRIPT = r'''#!/usr/bin/env python3 """OpenHands SDK runner for Polar.""" -from __future__ import annotations +import sys +if sys.version_info < (3, 10): + print( + f"Error: OpenHands SDK requires Python >= 3.10, " + f"got {'.'.join(map(str, sys.version_info[:3]))}", + file=sys.stderr, + ) + sys.exit(1) import json import os 
-import sys from pathlib import Path +from typing import Optional def _load_skills(skill_paths_raw: str) -> list[object]: @@ -136,7 +144,7 @@ def _load_skills(skill_paths_raw: str) -> list[object]: return skills -def _load_mcp_config() -> dict[str, object] | None: +def _load_mcp_config() -> Optional[dict[str, object]]: raw = os.environ.get("MCP_SERVERS_JSON") if not raw: return None diff --git a/src/polar/agent/harnesses/swe_agent.py b/src/polar/agent/harnesses/swe_agent.py index f9ae21fa..39bf5db9 100644 --- a/src/polar/agent/harnesses/swe_agent.py +++ b/src/polar/agent/harnesses/swe_agent.py @@ -86,6 +86,7 @@ def run_steps(self, instruction: str) -> list[ExecInput]: command=( f"cat > {self._problem_statement_path} << 'POLARINST'\n{safe_instruction}\nPOLARINST\n" f"{preamble}" + 'set -o pipefail && ' 'export OPENAI_API_KEY="$OPENAI_API_KEY" OPENAI_BASE_URL="$OPENAI_BASE_URL" && ' f"sweagent run " f"--agent.model.name={shlex.quote(model)} " diff --git a/src/polar/cli.py b/src/polar/cli.py index ea96b9f3..66b78d1e 100644 --- a/src/polar/cli.py +++ b/src/polar/cli.py @@ -5,6 +5,7 @@ import argparse import json from pathlib import Path +import subprocess import sys from typing import Any from urllib.parse import urlparse @@ -93,6 +94,108 @@ def build_parser() -> argparse.ArgumentParser: help="Print the raw response JSON.", ) + # ── cluster subcommands ───────────────────────────────────────────────── + cluster_parser = subparsers.add_parser( + "cluster", + help="Cluster deployment operations (launch, setup, sync, build-sif, status, train).", + ) + cluster_sub = cluster_parser.add_subparsers( + dest="cluster_command", + required=True, + ) + + # polar cluster launch + launch_p = cluster_sub.add_parser("launch", help="Sync code and submit a cluster job.") + launch_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + launch_p.add_argument("--example", default=None, help="Override task.example") + launch_p.add_argument("--harness", default=None, help="Override task.harness") + launch_p.add_argument("--model", default=None, help="Override model.name") + launch_p.add_argument("--nodes", type=int, default=None, help="Override resources.nodes") + launch_p.add_argument("--gpus", type=int, default=None, help="Override resources.gpus_per_node") + launch_p.add_argument("--time", default=None, help="Override resources.time (HH:MM:SS)") + launch_p.add_argument("--num-rollouts", type=int, default=None) + launch_p.add_argument("--timeout-seconds", type=float, default=None) + launch_p.add_argument( + "--instance-id", action="append", default=None, + help="SWE-Gym instance ID (repeatable; defaults to sample 10)", + ) + launch_p.add_argument("--no-sync", action="store_true", help="Skip rsync to cluster") + launch_p.add_argument("--dry-run", action="store_true", help="Print sbatch command only") + + # polar cluster setup + setup_p = cluster_sub.add_parser("setup", help="One-time cluster environment setup.") + setup_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + + # polar cluster status + cstatus_p = cluster_sub.add_parser("status", help="Check SLURM job status.") + cstatus_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + cstatus_p.add_argument("--job-id", default=None, help="Specific job ID to query") + + # polar cluster sync + sync_p = cluster_sub.add_parser("sync", help="Sync code/results from cluster.") + sync_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + sync_p.add_argument("--job-id", 
default=None, help="Sync specific job results") + sync_p.add_argument("--code-only", action="store_true") + sync_p.add_argument("--results-only", action="store_true") + sync_p.add_argument("--dry-run", action="store_true") + + # polar cluster build-sif + sif_p = cluster_sub.add_parser("build-sif", help="Build Apptainer SIF images.") + sif_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + sif_p.add_argument("--example", required=True, help="Example name (calculator, swegym, swebench_verified, train)") + sif_p.add_argument("--harness", default=None, help="Comma-separated harness names (required except for --example train)") + sif_p.add_argument("--force", action="store_true", help="Rebuild even if SIF exists") + sif_p.add_argument( + "--instance-id", action="append", default=None, + help="SWE-Gym instance ID (repeatable; defaults to sample 10)", + ) + + # polar cluster serve + serve_p = cluster_sub.add_parser("serve", help="Start services (vLLM + rollout + gateway).") + serve_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + serve_p.add_argument("--model", default=None, help="Override model.name") + serve_p.add_argument("--nodes", type=int, default=None, help="Override resources.nodes") + serve_p.add_argument("--gpus", type=int, default=None, help="Override resources.gpus_per_node") + serve_p.add_argument("--time", default=None, help="Override resources.time (HH:MM:SS)") + serve_p.add_argument("--no-sync", action="store_true", help="Skip rsync to cluster") + serve_p.add_argument("--no-wait", action="store_true", help="Don't wait for services to be ready") + serve_p.add_argument("--wait-timeout", type=int, default=600, help="Seconds to wait for readiness (default: 600)") + serve_p.add_argument("--dry-run", action="store_true", help="Print sbatch command only") + + # polar cluster submit-task + submit_task_p = cluster_sub.add_parser("submit-task", help="Submit tasks to a running serve job.") + submit_task_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + submit_task_p.add_argument("--job-id", required=True, help="SLURM job ID of the serve job") + submit_task_p.add_argument("--example", default=None, help="Override task.example") + submit_task_p.add_argument("--harness", default=None, help="Override task.harness") + submit_task_p.add_argument("--num-rollouts", type=int, default=None) + submit_task_p.add_argument("--timeout-seconds", type=float, default=None) + submit_task_p.add_argument( + "--instance-id", action="append", default=None, + help="SWE-Gym instance ID (repeatable; defaults to sample 10)", + ) + + # polar cluster train + train_p = cluster_sub.add_parser("train", help="Submit a distributed RL training job.") + train_p.add_argument("-c", "--config", required=True, help="Path to cluster.yaml") + train_p.add_argument("--polar-config", default=None, help="Path to polar_config.yaml (bridge config)") + train_p.add_argument("--prompt-data", default=None, help="Path to JSONL training data") + train_p.add_argument("--hf-checkpoint", default=None, help="HuggingFace model checkpoint") + train_p.add_argument("--num-rollouts", type=int, default=None, help="Number of training steps") + train_p.add_argument("--rollout-batch-size", type=int, default=None) + train_p.add_argument("--n-samples-per-prompt", type=int, default=None) + train_p.add_argument("--global-batch-size", type=int, default=None) + train_p.add_argument("--actor-gpus", type=int, default=None) + train_p.add_argument("--rollout-gpus", type=int, 
default=None) + train_p.add_argument("--tp-size", type=int, default=None) + train_p.add_argument("--nodes", type=int, default=None, help="Override resources.nodes") + train_p.add_argument("--gpus", type=int, default=None, help="Override resources.gpus_per_node") + train_p.add_argument("--time", default=None, help="Override resources.time (HH:MM:SS)") + train_p.add_argument("--no-sync", action="store_true", help="Skip rsync to cluster") + train_p.add_argument("--no-wait", action="store_true", help="Don't wait for training to complete") + train_p.add_argument("--wait-timeout", type=int, default=3600, help="Seconds to wait (default: 3600)") + train_p.add_argument("--dry-run", action="store_true", help="Print sbatch command only") + return parser @@ -111,6 +214,8 @@ def main(argv: list[str] | None = None) -> int: return _handle_submit(args) if args.command == "status": return _handle_status(args) + if args.command == "cluster": + return _handle_cluster(args) except httpx.HTTPStatusError as exc: body = exc.response.text.strip() if body: @@ -127,14 +232,161 @@ def main(argv: list[str] | None = None) -> int: except httpx.HTTPError as exc: print(f"error: could not reach the rollout service: {exc}", file=sys.stderr) return 1 - except (FileNotFoundError, ValueError) as exc: + except NotImplementedError as exc: print(f"error: {exc}", file=sys.stderr) return 1 + except (FileNotFoundError, ValueError, TimeoutError) as exc: + print(f"error: {exc}", file=sys.stderr) + return 1 + except subprocess.CalledProcessError as exc: + print(f"error: command failed with exit code {exc.returncode}", file=sys.stderr) + if exc.stderr: + print(exc.stderr.strip(), file=sys.stderr) + return 1 parser.error(f"Unknown command: {args.command}") return 2 +def _handle_cluster(args: argparse.Namespace) -> int: + # Lazy imports — avoid loading cluster modules for non-cluster commands. 
+ from polar.cluster.config import ClusterConfig + from polar.cluster.backend import get_backend + + config = ClusterConfig.load(args.config) + overrides = _build_cluster_overrides(args) + if overrides: + config = config.apply_overrides(overrides) + + repo_root = Path.cwd() + backend = get_backend(config) + + cmd = args.cluster_command + if cmd == "launch": + backend.launch(repo_root, dry_run=args.dry_run, no_sync=args.no_sync) + return 0 + if cmd == "setup": + backend.setup(repo_root) + return 0 + if cmd == "status": + result = backend.status(job_id=args.job_id) + jobs = result.get("jobs", []) + if not jobs: + print("No jobs found.") + else: + print(f"{'JOB_ID':<12} {'NAME':<30} {'STATE':<12} {'TIME':<10} {'NODES'}") + for j in jobs: + print(f"{j.get('job_id',''):<12} {j.get('name',''):<30} {j.get('state',''):<12} {j.get('time',''):<10} {j.get('nodes','')}") + return 0 + if cmd == "sync": + backend.sync( + repo_root, + job_id=args.job_id, + code_only=args.code_only, + results_only=args.results_only, + dry_run=args.dry_run, + ) + return 0 + if cmd == "build-sif": + if args.example != "train" and not args.harness: + print("error: --harness is required for non-train examples", file=sys.stderr) + return 1 + harnesses = [h.strip() for h in args.harness.split(",")] if args.harness else [] + results = backend.build_sif( + repo_root, args.example, harnesses, + force=args.force, + instance_ids=getattr(args, "instance_id", None), + ) + for key, sif_path in results.items(): + print(f" {key}: {sif_path}") + return 0 + if cmd == "serve": + result = backend.serve( + repo_root, + dry_run=args.dry_run, + no_sync=args.no_sync, + wait=not args.no_wait, + wait_timeout=args.wait_timeout, + ) + if result: + print(f"\n[cluster] Services ready.") + print(f"[cluster] Job ID: {result['job_id']}") + print(f"[cluster] Topology: {result['topology']}") + print(f"\n[cluster] Submit tasks with:") + print(f" polar cluster submit-task -c {args.config} \\") + print(f" --job-id {result['job_id']} --example calculator --harness opencode") + return 0 + if cmd == "submit-task": + return backend.submit_task( + repo_root, + job_id=args.job_id, + example=getattr(args, "example", None), + harness=getattr(args, "harness", None), + ) + if cmd == "train": + result = backend.train( + repo_root, + dry_run=args.dry_run, + no_sync=args.no_sync, + wait=not args.no_wait, + wait_timeout=args.wait_timeout, + ) + if result: + print(f"\n[cluster] Training job info:") + for k, v in result.items(): + print(f" {k}: {v}") + return 0 + + print(f"Unknown cluster command: {cmd}", file=sys.stderr) + return 2 + + +def _build_cluster_overrides(args: argparse.Namespace) -> dict: + """Extract CLI flag overrides into a nested dict for ``ClusterConfig.apply_overrides``.""" + overrides: dict = {} + if getattr(args, "example", None): + overrides.setdefault("task", {})["example"] = args.example + if getattr(args, "harness", None) and args.cluster_command == "launch": + overrides.setdefault("task", {})["harness"] = args.harness + if getattr(args, "model", None): + overrides.setdefault("model", {})["name"] = args.model + if getattr(args, "nodes", None) is not None: + overrides.setdefault("resources", {})["nodes"] = args.nodes + if getattr(args, "gpus", None) is not None: + overrides.setdefault("resources", {})["gpus_per_node"] = args.gpus + if getattr(args, "time", None): + overrides.setdefault("resources", {})["time"] = args.time + if getattr(args, "num_rollouts", None) is not None: + if getattr(args, "cluster_command", None) == "train": + 
overrides.setdefault("train", {})["num_rollouts"] = args.num_rollouts + else: + overrides.setdefault("task", {})["num_rollouts"] = args.num_rollouts + if getattr(args, "timeout_seconds", None) is not None: + overrides.setdefault("task", {})["timeout_seconds"] = args.timeout_seconds + if getattr(args, "instance_id", None): + overrides.setdefault("task", {})["instance_ids"] = args.instance_id + # Train-specific overrides + if getattr(args, "polar_config", None): + overrides.setdefault("train", {})["polar_config"] = args.polar_config + if getattr(args, "prompt_data", None): + overrides.setdefault("train", {})["prompt_data"] = args.prompt_data + if getattr(args, "hf_checkpoint", None): + overrides.setdefault("train", {})["hf_checkpoint"] = args.hf_checkpoint + if getattr(args, "rollout_batch_size", None) is not None: + overrides.setdefault("train", {})["rollout_batch_size"] = args.rollout_batch_size + if getattr(args, "n_samples_per_prompt", None) is not None: + overrides.setdefault("train", {})["n_samples_per_prompt"] = args.n_samples_per_prompt + if getattr(args, "global_batch_size", None) is not None: + overrides.setdefault("train", {})["global_batch_size"] = args.global_batch_size + if getattr(args, "actor_gpus", None) is not None: + overrides.setdefault("train", {})["actor_gpus"] = args.actor_gpus + if getattr(args, "rollout_gpus", None) is not None: + overrides.setdefault("train", {})["rollout_gpus"] = args.rollout_gpus + if getattr(args, "tp_size", None) is not None: + overrides.setdefault("train", {})["tp_size"] = args.tp_size + return overrides + + def _handle_submit(args: argparse.Namespace) -> int: rollout_url = _resolve_rollout_url(args.config, args.rollout_url) payload = _load_structured_file(args.task_file) diff --git a/src/polar/cluster/__init__.py b/src/polar/cluster/__init__.py new file mode 100644 index 00000000..a7ec9ca0 --- /dev/null +++ b/src/polar/cluster/__init__.py @@ -0,0 +1,6 @@ +"""Polar cluster deployment — launch jobs on local, SLURM, or K8s backends.""" + +from polar.cluster.config import ClusterConfig +from polar.cluster.backend import ClusterBackend, get_backend + +__all__ = ["ClusterConfig", "ClusterBackend", "get_backend"] diff --git a/src/polar/cluster/backend.py b/src/polar/cluster/backend.py new file mode 100644 index 00000000..5e239420 --- /dev/null +++ b/src/polar/cluster/backend.py @@ -0,0 +1,127 @@ +"""Abstract cluster backend and factory.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Any + +from polar.cluster.config import ClusterConfig + + +class ClusterBackend(ABC): + """Base class for deployment backends (local, SLURM, K8s, ...).""" + + def __init__(self, config: ClusterConfig) -> None: + self.config = config + + @abstractmethod + def launch( + self, + repo_root: Path, + *, + dry_run: bool = False, + no_sync: bool = False, + ) -> str: + """Sync code and launch a job. 
Return a job identifier string.""" + + @abstractmethod + def setup(self, repo_root: Path) -> None: + """One-time environment setup on the target cluster.""" + + @abstractmethod + def status(self, job_id: str | None = None) -> dict[str, Any]: + """Query job / service status.""" + + @abstractmethod + def sync( + self, + repo_root: Path, + *, + job_id: str | None = None, + code_only: bool = False, + results_only: bool = False, + dry_run: bool = False, + ) -> None: + """Sync results (and optionally code) back from the cluster.""" + + @abstractmethod + def build_sif( + self, + repo_root: Path, + example: str, + harnesses: list[str], + *, + force: bool = False, + instance_ids: list[str] | None = None, + ) -> dict[str, Path]: + """Build Apptainer SIF images. Return ``{key: sif_path}``. + + For calculator, *key* is the harness name. + For swegym, *key* is ``harness/sanitized_instance_id``. + When *instance_ids* is ``None`` for swegym, builds all sample instances. + """ + + + # ── Optional two-phase methods (non-abstract) ───────────────────────── + + def serve( + self, + repo_root: Path, + *, + dry_run: bool = False, + no_sync: bool = False, + wait: bool = True, + wait_timeout: int = 600, + ) -> dict[str, str]: + """Start services without submitting tasks. Return job info.""" + raise NotImplementedError( + f"The {type(self).__name__} backend does not support 'serve'. " + "Use 'polar cluster launch' for a combined workflow." + ) + + def submit_task( + self, + repo_root: Path, + *, + job_id: str, + example: str | None = None, + harness: str | None = None, + ) -> int: + """Submit tasks to a running service. Return exit code.""" + raise NotImplementedError( + f"The {type(self).__name__} backend does not support 'submit-task'. " + "Use 'polar cluster launch' for a combined workflow." + ) + + def train( + self, + repo_root: Path, + *, + dry_run: bool = False, + no_sync: bool = False, + wait: bool = True, + wait_timeout: int = 3600, + ) -> dict[str, str]: + """Submit a training job. Return job info dict.""" + raise NotImplementedError( + f"The {type(self).__name__} backend does not support 'train'." + ) + + +def get_backend(config: ClusterConfig) -> ClusterBackend: + """Return the appropriate backend for *config.backend*.""" + if config.backend == "slurm": + from polar.cluster.slurm import SlurmBackend + + return SlurmBackend(config) + if config.backend == "local": + from polar.cluster.local import LocalBackend + + return LocalBackend(config) + if config.backend == "k8s": + raise NotImplementedError( + "Kubernetes backend is not yet implemented. " + "Contributions welcome!" + ) + raise ValueError(f"Unknown backend: {config.backend!r}") diff --git a/src/polar/cluster/config.py b/src/polar/cluster/config.py new file mode 100644 index 00000000..b0b332de --- /dev/null +++ b/src/polar/cluster/config.py @@ -0,0 +1,300 @@ +"""Unified cluster configuration model. + +Parses a ``cluster.yaml`` file into a typed Pydantic model. The config is +backend-agnostic: the same YAML schema works for local, SLURM, and (future) +K8s backends — only the ``backend`` field and its corresponding connection +section differ. 
+""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Literal + +import yaml +from pydantic import BaseModel, model_validator + + +class SlurmConnection(BaseModel): + """SLURM-specific connection details (only required when ``backend: slurm``).""" + + login_node: str = "" + account: str = "" + partition: str = "" + + +class ClusterPaths(BaseModel): + """Filesystem paths on the target cluster / machine.""" + + workspace: str = "" + polar_root: str = "" + code: str = "" + sif_dir: str = "" + results: str = "" + venv: str = "" + apptainer_bin_dir: str = "" + cuda_home: str = "" + + @model_validator(mode="after") + def _derive_paths(self) -> "ClusterPaths": + if self.workspace: + if not self.polar_root: + self.polar_root = f"{self.workspace}/polar" + if not self.code: + self.code = f"{self.polar_root}/ProRL-Agent-Server" + if not self.sif_dir: + self.sif_dir = f"{self.polar_root}/sif_images" + if not self.results: + self.results = f"{self.polar_root}/results" + if not self.venv: + self.venv = f"{self.polar_root}/.venv" + return self + + +class ModelConfig(BaseModel): + """LLM model configuration for vLLM.""" + + name: str = "Qwen/Qwen3.5-27B" + tensor_parallel_size: int = 8 + max_model_len: int = 16384 + gpu_memory_utilization: float = 0.90 + max_num_seqs: int = 64 + tool_call_parser: str = "qwen3_xml" + + +class TaskConfig(BaseModel): + """Default task / example settings.""" + + example: str = "calculator" + harness: str = "opencode" + num_rollouts: int = 4 + timeout_seconds: float = 900.0 + instance_ids: list[str] = [] + + +class ResourceConfig(BaseModel): + """Compute resource allocation.""" + + nodes: int = 1 + gpus_per_node: int = 8 + cpus_per_task: int = 64 + mem: str = "512G" + time: str = "04:00:00" + + +class PortConfig(BaseModel): + """Service port assignments.""" + + vllm: int = 18000 + rollout: int = 18080 + gateway_base: int = 18100 + + +class GatewayTuning(BaseModel): + """Gateway worker pool sizing.""" + + max_init_workers: int = 8 + max_run_workers: int = 4 + max_postrun_workers: int = 4 + ready_buffer_target: int = 4 + + +class TrainConfig(BaseModel): + """RL training configuration for Slime + Megatron GRPO.""" + + polar_config: str = "" + prompt_data: str = "" + hf_checkpoint: str = "Qwen/Qwen3-4B" + torch_dist_dir: str = "" + save_dir: str = "" + model_args: list[str] = [ + "--swiglu", + "--num-layers", "36", + "--hidden-size", "2560", + "--ffn-hidden-size", "9728", + "--num-attention-heads", "32", + "--group-query-attention", + "--num-query-groups", "8", + "--use-rotary-position-embeddings", + "--disable-bias-linear", + "--normalization", "RMSNorm", + "--norm-epsilon", "1e-6", + "--rotary-base", "1000000", + "--vocab-size", "151936", + "--kv-channels", "128", + "--qk-layernorm", + ] + num_rollouts: int = 5 + rollout_batch_size: int = 2 + n_samples_per_prompt: int = 16 + global_batch_size: int = 32 + actor_gpus: int = 4 + rollout_gpus: int = 4 + tp_size: int = 2 + sglang_router_port: int = 9000 + ray_port: int = 6379 + ray_dashboard_port: int = 8265 + extra_args: list[str] = [] + wandb_project: str = "" + wandb_exp_name: str = "" + wandb_group: str = "" + + +class ClusterConfig(BaseModel): + """Top-level cluster configuration. + + The ``backend`` field selects which deployment backend to use: + ``"local"``, ``"slurm"``, or ``"k8s"`` (future). 
+ """ + + backend: Literal["local", "slurm", "k8s"] = "slurm" + slurm: SlurmConnection = SlurmConnection() + paths: ClusterPaths = ClusterPaths() + model: ModelConfig = ModelConfig() + task: TaskConfig = TaskConfig() + resources: ResourceConfig = ResourceConfig() + ports: PortConfig = PortConfig() + gateway: GatewayTuning = GatewayTuning() + train: TrainConfig = TrainConfig() + + # ── Constructors ───────────────────────────────────────────────────────── + + @classmethod + def load(cls, path: str | Path) -> "ClusterConfig": + """Load configuration from a YAML file.""" + p = Path(path) + if not p.exists(): + raise FileNotFoundError(f"Cluster config not found: {p}") + with p.open() as fh: + raw = yaml.safe_load(fh) or {} + if not isinstance(raw, dict): + raise ValueError(f"Cluster config must be a YAML mapping: {p}") + # Support legacy configs that use 'cluster' section for slurm fields + if "cluster" in raw and "slurm" not in raw: + raw["slurm"] = raw.pop("cluster") + return cls.model_validate(raw) + + # ── Helpers ─────────────────────────────────────────────────────────────── + + def sbatch_export_vars(self) -> dict[str, str]: + """Build the flat env-var dict passed to ``sbatch --export``.""" + v: dict[str, str] = { + "POLAR_CODE": self.paths.code, + "POLAR_WORKSPACE": self.paths.workspace, + "EXAMPLE": self.task.example, + "HARNESS": self.task.harness, + "MODEL_NAME": self.model.name, + "MODEL_PATH": self.model.name, + "TENSOR_PARALLEL_SIZE": str(self.model.tensor_parallel_size), + "NUM_ROLLOUTS": str(self.task.num_rollouts), + "TIMEOUT_SECONDS": str(self.task.timeout_seconds), + "VLLM_PORT": str(self.ports.vllm), + "ROLLOUT_PORT": str(self.ports.rollout), + "GATEWAY_BASE_PORT": str(self.ports.gateway_base), + "MAX_INIT_WORKERS": str(self.gateway.max_init_workers), + "MAX_RUN_WORKERS": str(self.gateway.max_run_workers), + "MAX_POSTRUN_WORKERS": str(self.gateway.max_postrun_workers), + "READY_BUFFER_TARGET": str(self.gateway.ready_buffer_target), + "GPU_MEMORY_UTILIZATION": str(self.model.gpu_memory_utilization), + "MAX_MODEL_LEN": str(self.model.max_model_len), + "MAX_NUM_SEQS": str(self.model.max_num_seqs), + "TOOL_CALL_PARSER": self.model.tool_call_parser, + } + if self.task.instance_ids: + v["INSTANCE_IDS"] = ",".join(self.task.instance_ids) + if self.paths.apptainer_bin_dir: + v["APPTAINER_BIN_DIR"] = self.paths.apptainer_bin_dir + if self.paths.cuda_home: + v["CUDA_HOME"] = self.paths.cuda_home + # swe_agent's swerex needs --fakeroot in Apptainer for chown support + if self.task.harness == "swe_agent": + v["RUNTIME_FAKEROOT"] = "true" + return v + + def sbatch_serve_export_vars(self) -> dict[str, str]: + """Build env-var dict for serve-only sbatch (no task-specific vars).""" + v: dict[str, str] = { + "POLAR_CODE": self.paths.code, + "POLAR_WORKSPACE": self.paths.workspace, + "MODEL_NAME": self.model.name, + "MODEL_PATH": self.model.name, + "TENSOR_PARALLEL_SIZE": str(self.model.tensor_parallel_size), + "VLLM_PORT": str(self.ports.vllm), + "ROLLOUT_PORT": str(self.ports.rollout), + "GATEWAY_BASE_PORT": str(self.ports.gateway_base), + "MAX_INIT_WORKERS": str(self.gateway.max_init_workers), + "MAX_RUN_WORKERS": str(self.gateway.max_run_workers), + "MAX_POSTRUN_WORKERS": str(self.gateway.max_postrun_workers), + "READY_BUFFER_TARGET": str(self.gateway.ready_buffer_target), + "GPU_MEMORY_UTILIZATION": str(self.model.gpu_memory_utilization), + "MAX_MODEL_LEN": str(self.model.max_model_len), + "MAX_NUM_SEQS": str(self.model.max_num_seqs), + "TOOL_CALL_PARSER": self.model.tool_call_parser, + 
} + if self.paths.apptainer_bin_dir: + v["APPTAINER_BIN_DIR"] = self.paths.apptainer_bin_dir + if self.paths.cuda_home: + v["CUDA_HOME"] = self.paths.cuda_home + return v + + def sbatch_train_export_vars(self) -> dict[str, str]: + """Build env-var dict for the training sbatch job.""" + t = self.train + v: dict[str, str] = { + "POLAR_CODE": self.paths.code, + "POLAR_WORKSPACE": self.paths.workspace, + "POLAR_CONFIG_PATH": t.polar_config, + "PROMPT_DATA": t.prompt_data, + "HF_CHECKPOINT": t.hf_checkpoint, + "MODEL_NAME": t.hf_checkpoint, + "TRAIN_NUM_ROLLOUTS": str(t.num_rollouts), + "ROLLOUT_BATCH_SIZE": str(t.rollout_batch_size), + "N_SAMPLES_PER_PROMPT": str(t.n_samples_per_prompt), + "GLOBAL_BATCH_SIZE": str(t.global_batch_size), + "ACTOR_GPUS": str(t.actor_gpus), + "ROLLOUT_GPUS": str(t.rollout_gpus), + "TP_SIZE": str(t.tp_size), + "SGLANG_ROUTER_PORT": str(t.sglang_router_port), + "RAY_PORT": str(t.ray_port), + "RAY_DASHBOARD_PORT": str(t.ray_dashboard_port), + "ROLLOUT_PORT": str(self.ports.rollout), + "GATEWAY_BASE_PORT": str(self.ports.gateway_base), + "MAX_INIT_WORKERS": str(self.gateway.max_init_workers), + "MAX_RUN_WORKERS": str(self.gateway.max_run_workers), + "MAX_POSTRUN_WORKERS": str(self.gateway.max_postrun_workers), + "READY_BUFFER_TARGET": str(self.gateway.ready_buffer_target), + } + if t.torch_dist_dir: + v["TORCH_DIST_DIR"] = t.torch_dist_dir + if t.save_dir: + v["TRAIN_SAVE_DIR"] = t.save_dir + if t.model_args: + v["MODEL_ARGS"] = " ".join(t.model_args) + if t.extra_args: + v["EXTRA_TRAIN_ARGS"] = " ".join(t.extra_args) + if t.wandb_project: + v["WANDB_PROJECT"] = t.wandb_project + if t.wandb_exp_name: + v["WANDB_EXP_NAME"] = t.wandb_exp_name + if t.wandb_group: + v["WANDB_GROUP"] = t.wandb_group + if self.paths.apptainer_bin_dir: + v["APPTAINER_BIN_DIR"] = self.paths.apptainer_bin_dir + if self.paths.cuda_home: + v["CUDA_HOME"] = self.paths.cuda_home + return v + + def apply_overrides(self, overrides: dict[str, Any]) -> "ClusterConfig": + """Return a new config with *overrides* deep-merged on top.""" + data = self.model_dump() + _deep_merge(data, overrides) + return ClusterConfig.model_validate(data) + + +def _deep_merge(base: dict, overlay: dict) -> None: + """Recursively merge *overlay* into *base* in place.""" + for key, value in overlay.items(): + if key in base and isinstance(base[key], dict) and isinstance(value, dict): + _deep_merge(base[key], value) + else: + base[key] = value diff --git a/src/polar/cluster/local.py b/src/polar/cluster/local.py new file mode 100644 index 00000000..293da892 --- /dev/null +++ b/src/polar/cluster/local.py @@ -0,0 +1,57 @@ +"""Local backend — run Polar services directly on the current machine.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from polar.cluster.backend import ClusterBackend + + +class LocalBackend(ClusterBackend): + """Run Polar services locally (no cluster scheduler). + + For local development the recommended flow is the per-example + ``submit_tasks.py`` scripts under ``examples/``. This backend exists + as a placeholder so the unified config schema works with + ``backend: local`` and can be extended in the future. 
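+
+    A minimal sketch of how this backend is constructed (assuming it
+    inherits ``ClusterBackend.__init__(config)``, as ``SlurmBackend``
+    does; the config path here is illustrative)::
+
+        from pathlib import Path
+        from polar.cluster.config import ClusterConfig
+        from polar.cluster.local import LocalBackend
+
+        cfg = ClusterConfig.load("my-cluster.yaml").apply_overrides(
+            {"backend": "local"}
+        )
+        backend = LocalBackend(cfg)
+        backend.setup(Path("."))  # prints "[local] No setup required ..."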
+ """ + + def launch(self, repo_root: Path, *, dry_run: bool = False, no_sync: bool = False) -> str: + raise NotImplementedError( + "Local launch is not yet integrated into 'polar cluster launch'.\n" + "Use the per-example submit scripts instead:\n" + " python examples/calculator/opencode/submit_tasks.py" + ) + + def setup(self, repo_root: Path) -> None: + print("[local] No setup required for local backend.") + + def status(self, job_id: str | None = None) -> dict[str, Any]: + return {"backend": "local", "status": "not implemented"} + + def sync( + self, + repo_root: Path, + *, + job_id: str | None = None, + code_only: bool = False, + results_only: bool = False, + dry_run: bool = False, + ) -> None: + print("[local] No sync needed for local backend.") + + def build_sif( + self, + repo_root: Path, + example: str, + harnesses: list[str], + *, + force: bool = False, + instance_ids: list[str] | None = None, + ) -> dict[str, Path]: + raise NotImplementedError( + "Local SIF builds require Docker or Apptainer installed locally.\n" + "Use 'docker build' in the example directory, or set backend: slurm\n" + "to build on a cluster with Apptainer." + ) diff --git a/src/polar/cluster/slurm.py b/src/polar/cluster/slurm.py new file mode 100644 index 00000000..40618e98 --- /dev/null +++ b/src/polar/cluster/slurm.py @@ -0,0 +1,1065 @@ +"""SLURM cluster backend — rsync code, submit sbatch jobs, sync results.""" + +from __future__ import annotations + +import importlib.resources +import platform +import re +import shlex +import subprocess +import sys +import time +from pathlib import Path +from typing import Any + +from polar.cluster.backend import ClusterBackend +from polar.cluster.config import ClusterConfig + +# Files and directories excluded from rsync to the cluster. 
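+# Patterns use rsync --exclude syntax; entries ending in "/" match directories only.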
+_RSYNC_EXCLUDES = [ + ".git", + "__pycache__", + "*.pyc", + ".venv", + "node_modules", + "worklogs/", + "results/", + "rollout_results/", + "*.egg-info", +] + + +class SlurmBackend(ClusterBackend): + """Deploy Polar via SSH + sbatch on a SLURM cluster.""" + + def __init__(self, config: ClusterConfig) -> None: + super().__init__(config) + slurm = config.slurm + if not slurm.login_node: + raise ValueError("slurm.login_node is required for the SLURM backend") + if not slurm.account: + raise ValueError("slurm.account is required for the SLURM backend") + if not slurm.partition: + raise ValueError("slurm.partition is required for the SLURM backend") + if not config.paths.workspace: + raise ValueError("paths.workspace is required for the SLURM backend") + + # ── Public API ──────────────────────────────────────────────────────────── + + def launch( + self, + repo_root: Path, + *, + dry_run: bool = False, + no_sync: bool = False, + ) -> str: + cfg = self.config + self._print_summary() + + if not no_sync: + self._sync_code_to_cluster(repo_root) + + # Pre-populate swegym sample cache so the job can find instance data + if cfg.task.example == "swegym": + self._sync_swegym_cache() + if cfg.task.example == "swebench_verified": + self._sync_swebench_cache() + + sbatch_cmd = self._build_sbatch_command() + print(f"\n[cluster] sbatch command:\n {sbatch_cmd}") + + if dry_run: + print("\n[cluster] Dry run — not submitting.") + return "" + + results_dir = cfg.paths.results + self._ssh_run(f"mkdir -p '{results_dir}'") + out = self._ssh_run(sbatch_cmd, capture=True) + job_id = out.strip().split()[-1] + + login = cfg.slurm.login_node + example, harness = cfg.task.example, cfg.task.harness + print(f"\n[cluster] Job submitted: {job_id}") + print(f"[cluster] Monitor:") + print(f" ssh {login} squeue -j {job_id}") + print(f" ssh {login} 'tail -f {results_dir}/polar-{example}-{harness}_{job_id}.out'") + print(f"\n[cluster] Sync results after completion:") + print(f" polar cluster sync -c ") + return job_id + + def setup(self, repo_root: Path) -> None: + cfg = self.config + self._sync_code_to_cluster(repo_root) + print(f"\n[cluster] Running setup on {cfg.slurm.login_node}...") + env_parts = [f"export POLAR_WORKSPACE='{cfg.paths.workspace}'"] + if cfg.paths.apptainer_bin_dir: + env_parts.append(f"export APPTAINER_BIN_DIR='{cfg.paths.apptainer_bin_dir}'") + if cfg.paths.cuda_home: + env_parts.append(f"export CUDA_HOME='{cfg.paths.cuda_home}'") + env_str = " && ".join(env_parts) + setup_script = f"{cfg.paths.code}/examples/slurm/setup_cluster.sh" + self._ssh_run(f"{env_str} && cd '{cfg.paths.code}' && bash '{setup_script}'") + print("[cluster] Setup complete.") + + def status(self, job_id: str | None = None) -> dict[str, Any]: + if job_id: + out = self._ssh_run( + f"squeue -j {job_id} --format='%i %j %T %M %N' --noheader", + capture=True, + ) + else: + out = self._ssh_run( + "squeue -u $USER --format='%i %j %T %M %N' --noheader", + capture=True, + ) + jobs: list[dict[str, str]] = [] + for line in out.strip().splitlines(): + parts = line.split(None, 4) + if len(parts) >= 3: + jobs.append({ + "job_id": parts[0], + "name": parts[1] if len(parts) > 1 else "", + "state": parts[2] if len(parts) > 2 else "", + "time": parts[3] if len(parts) > 3 else "", + "nodes": parts[4] if len(parts) > 4 else "", + }) + return {"jobs": jobs} + + def sync( + self, + repo_root: Path, + *, + job_id: str | None = None, + code_only: bool = False, + results_only: bool = False, + dry_run: bool = False, + ) -> None: + cfg = self.config + login = 
cfg.slurm.login_node + local = self._is_local() + extra: list[str] = [] + if dry_run: + extra.append("--dry-run") + + def _remote_path(p: str) -> str: + return p if local else f"{login}:{p}" + + if not results_only: + print(f"[cluster] Syncing code from {login}...") + src = _remote_path(cfg.paths.code + "/") + dst = str(repo_root) + "/" + self._rsync(src, dst, extra_args=extra, exclude=_RSYNC_EXCLUDES) + + if not code_only: + print(f"[cluster] Syncing results from {login}...") + if job_id: + pattern = f"*_{job_id}" + src = _remote_path(f"{cfg.paths.results}/{pattern}/") + dst_dir = repo_root / "results" / pattern + dst_dir.mkdir(parents=True, exist_ok=True) + self._rsync(src, str(dst_dir) + "/", extra_args=extra) + else: + src = _remote_path(cfg.paths.results + "/") + dst_dir = repo_root / "results" + dst_dir.mkdir(parents=True, exist_ok=True) + self._rsync(src, str(dst_dir) + "/", extra_args=extra) + + print("[cluster] Sync complete.") + + def build_sif( + self, + repo_root: Path, + example: str, + harnesses: list[str], + *, + force: bool = False, + instance_ids: list[str] | None = None, + ) -> dict[str, Path]: + cfg = self.config + results: dict[str, Path] = {} + + if example == "train": + sif_name = "train-slime-grpo.sif" + def_content = _generate_train_def(cfg.paths.code) + sif_path = self._build_single_sif( + sif_name, def_content, force=force, + srun_time="01:00:00", srun_mem="64G", + ) + results["train"] = Path(sif_path) + return results + + if example in ("swegym", "swebench_verified"): + # Per-instance examples: one SIF per (harness, instance_id) + if not instance_ids: + if example == "swegym": + from polar.cluster.tasks import SWEGYM_SAMPLE + instance_ids = [i["instance_id"] for i in SWEGYM_SAMPLE] + else: + raise ValueError( + f"--instance-id is required for {example} SIF builds." + ) + prefix = "swegym" if example == "swegym" else "swebench" + for harness in harnesses: + for instance_id in instance_ids: + sanitized = _sanitize_instance_id(instance_id) + sif_name = f"{prefix}-{harness}-{sanitized}.sif" + def_content = _generate_def_file( + repo_root, example, harness, instance_id=instance_id, + ) + if def_content is None: + print(f"[cluster] WARNING: No .def for {example}/{harness}/{instance_id}") + continue + sif_path = self._build_single_sif( + sif_name, def_content, force=force, + ) + key = f"{harness}/{sanitized}" + results[key] = Path(sif_path) + else: + # Calculator and other examples: one SIF per harness + for harness in harnesses: + sif_name = f"{example}-{harness}.sif" + def_content = _generate_def_file(repo_root, example, harness) + if def_content is None: + print(f"[cluster] WARNING: Cannot generate .def for {example}/{harness}, skipping") + continue + sif_path = self._build_single_sif( + sif_name, def_content, force=force, + ) + results[harness] = Path(sif_path) + + return results + + def serve( + self, + repo_root: Path, + *, + dry_run: bool = False, + no_sync: bool = False, + wait: bool = True, + wait_timeout: int = 600, + ) -> dict[str, str]: + """Submit a serve-only sbatch job. 
Return {job_id, topology}.""" + cfg = self.config + self._print_serve_summary() + + if not no_sync: + self._sync_code_to_cluster(repo_root) + + sbatch_cmd = self._build_serve_sbatch_command() + print(f"\n[cluster] sbatch command:\n {sbatch_cmd}") + + if dry_run: + print("\n[cluster] Dry run — not submitting.") + return {} + + results_dir = cfg.paths.results + self._ssh_run(f"mkdir -p '{results_dir}'") + out = self._ssh_run(sbatch_cmd, capture=True) + job_id = out.strip().split()[-1] + + job_dir = f"{results_dir}/polar-serve_{job_id}" + sentinel = f"{job_dir}/.services_ready" + + print(f"\n[cluster] Job submitted: {job_id}") + + if not wait: + print(f"[cluster] Not waiting. Check readiness with:") + print(f" polar cluster status -c --job-id {job_id}") + return {"job_id": job_id, "topology": f"{job_dir}/topology.yaml"} + + print(f"[cluster] Waiting for services to be ready (timeout: {wait_timeout}s)...") + poll_interval = 10 + for attempt in range(wait_timeout // poll_interval): + content = self._ssh_run( + f"cat '{sentinel}' 2>/dev/null || true", + capture=True, + ) + if "TOPOLOGY=" in content: + topology = "" + for line in content.strip().splitlines(): + if line.startswith("TOPOLOGY="): + topology = line.split("=", 1)[1] + break + return {"job_id": job_id, "topology": topology} + + # Check job is still alive + state = self._get_job_state(job_id) + if state in ("FAILED", "CANCELLED", "TIMEOUT", "COMPLETED", ""): + raise RuntimeError( + f"Serve job {job_id} entered state '{state}' before services were ready. " + f"Check logs: {job_dir}/logs/" + ) + elapsed = (attempt + 1) * poll_interval + if elapsed % 30 == 0: + print(f"[cluster] Still waiting... ({elapsed}s, job state: {state})") + time.sleep(poll_interval) + + raise TimeoutError( + f"Services not ready after {wait_timeout}s. " + f"Check job logs: {job_dir}/logs/" + ) + + def submit_task( + self, + repo_root: Path, + *, + job_id: str, + example: str | None = None, + harness: str | None = None, + ) -> int: + """Submit tasks to a running serve job. 
Return exit code.""" + cfg = self.config + example = example or cfg.task.example + harness = harness or cfg.task.harness + + # Discover topology from job ID + topology_path = self._find_topology(job_id) + job_dir = str(Path(topology_path).parent) + + print(f"[cluster] Submitting tasks to job {job_id}") + print(f"[cluster] Topology: {topology_path}") + print(f"[cluster] Example: {example}") + print(f"[cluster] Harness: {harness}") + + # Build instance-id args + instance_id_args = "" + if cfg.task.instance_ids: + for iid in cfg.task.instance_ids: + instance_id_args += f" --instance-id {iid}" + + env_setup = ( + f"export POLAR_WORKSPACE='{cfg.paths.workspace}' && " + f"source '{cfg.paths.code}/src/polar/cluster/templates/env.sh'" + ) + task_cmd = ( + f"{env_setup} && " + f"python -m polar.cluster.tasks " + f"--example {example} --harness {harness} " + f"--topology {topology_path} " + f"--sif-dir {cfg.paths.sif_dir} " + f"--output-dir {job_dir}/tasks/{harness} " + f"--num-rollouts {cfg.task.num_rollouts} " + f"--timeout-seconds {cfg.task.timeout_seconds}" + f"{instance_id_args}" + ) + + try: + self._ssh_run(task_cmd) + print("[cluster] Task submission complete.") + return 0 + except subprocess.CalledProcessError as exc: + print(f"[cluster] Task submission failed (exit code {exc.returncode})") + return exc.returncode + + def train( + self, + repo_root: Path, + *, + dry_run: bool = False, + no_sync: bool = False, + wait: bool = True, + wait_timeout: int = 3600, + ) -> dict[str, str]: + """Submit a training sbatch job. Return job info dict.""" + cfg = self.config + self._print_train_summary() + + if not no_sync: + self._sync_code_to_cluster(repo_root) + + sbatch_cmd = self._build_train_sbatch_command() + print(f"\n[cluster] sbatch command:\n {sbatch_cmd}") + + if dry_run: + print("\n[cluster] Dry run — not submitting.") + return {} + + results_dir = cfg.paths.results + self._ssh_run(f"mkdir -p '{results_dir}'") + out = self._ssh_run(sbatch_cmd, capture=True) + job_id = out.strip().split()[-1] + + job_dir = f"{results_dir}/polar-train_{job_id}" + login = cfg.slurm.login_node + + print(f"\n[cluster] Training job submitted: {job_id}") + print(f"[cluster] Monitor:") + print(f" ssh {login} squeue -j {job_id}") + print(f" ssh {login} 'tail -f {results_dir}/polar-train_{job_id}.out'") + + if not wait: + return {"job_id": job_id, "job_dir": job_dir} + + print(f"[cluster] Waiting for training to complete (timeout: {wait_timeout}s)...") + poll_interval = 30 + for attempt in range(wait_timeout // poll_interval): + state = self._get_job_state(job_id) + if state == "COMPLETED": + print(f"[cluster] Training job {job_id} completed successfully.") + return {"job_id": job_id, "job_dir": job_dir, "state": "COMPLETED"} + if state in ("FAILED", "CANCELLED", "TIMEOUT", ""): + raise RuntimeError( + f"Training job {job_id} entered state '{state}'. " + f"Check logs: ssh {login} 'tail -100 {results_dir}/polar-train_{job_id}.out'" + ) + elapsed = (attempt + 1) * poll_interval + if elapsed % 120 == 0: + print(f"[cluster] Training still running... ({elapsed}s, state: {state})") + time.sleep(poll_interval) + + raise TimeoutError( + f"Training job {job_id} not completed after {wait_timeout}s. " + f"Job may still be running. 
Check: ssh {login} squeue -j {job_id}" + ) + + def _build_single_sif( + self, + sif_name: str, + def_content: str, + *, + force: bool = False, + srun_time: str = "00:30:00", + srun_mem: str = "32G", + ) -> str: + """Build a single SIF image on the cluster and return its path.""" + cfg = self.config + sif_path = f"{cfg.paths.sif_dir}/{sif_name}" + + # Check if SIF already exists (skip unless --force) + if not force: + try: + self._ssh_run(f"test -f '{sif_path}'", check=True) + print(f"[cluster] SIF exists, skipping: {sif_name}") + return sif_path + except subprocess.CalledProcessError: + pass # file doesn't exist, proceed + + print(f"[cluster] Building SIF: {sif_name}") + def_dir = f"{cfg.paths.polar_root}/tmp_defs" + remote_def = f"{def_dir}/{sif_name}.def" + self._ssh_run(f"mkdir -p '{def_dir}'") + self._ssh_run(f"cat > '{remote_def}' << 'POLAREOF'\n{def_content}\nPOLAREOF") + self._ssh_run(f"mkdir -p '{cfg.paths.sif_dir}'") + + force_flag = "--force" if force else "" + cache_dir = f"{cfg.paths.polar_root}/apptainer_cache" + path_prefix = "" + if cfg.paths.apptainer_bin_dir: + path_prefix = f"export PATH='{cfg.paths.apptainer_bin_dir}':$PATH && " + build_cmd = ( + f"{path_prefix}" + f"export APPTAINER_CACHEDIR='{cache_dir}' && mkdir -p '{cache_dir}' && " + f"apptainer build {force_flag} '{sif_path}' '{remote_def}'" + ) + account = cfg.slurm.account + try: + self._ssh_run( + f"srun --account={account} --partition=cpu_short --time={srun_time} " + f"--cpus-per-task=8 --mem={srun_mem} bash -c {shlex.quote(build_cmd)}" + ) + except subprocess.CalledProcessError: + print(f"[cluster] srun failed, trying direct build...") + self._ssh_run(build_cmd) + + self._ssh_run(f"rm -f '{remote_def}'") + print(f"[cluster] Built: {sif_path}") + return sif_path + + # ── Internal helpers ────────────────────────────────────────────────────── + + def _is_local(self) -> bool: + """Return True if we're already on the login node (sbatch available).""" + if not hasattr(self, "_local_cache"): + import shutil + self._local_cache = shutil.which("sbatch") is not None + return self._local_cache + + def _ssh_run( + self, + command: str, + *, + capture: bool = False, + check: bool = True, + ) -> str: + if self._is_local(): + cmd = ["bash", "-l", "-c", command] + else: + login = self.config.slurm.login_node + cmd = ["ssh", "-o", "ConnectTimeout=10", login, command] + result = subprocess.run( + cmd, + capture_output=capture, + text=True, + check=check, + ) + if capture: + return result.stdout + return "" + + def _rsync( + self, + src: str, + dst: str, + *, + exclude: list[str] | None = None, + extra_args: list[str] | None = None, + ) -> None: + cmd = ["rsync", "-avz", "--delete"] + for pattern in exclude or []: + cmd.extend(["--exclude", pattern]) + cmd.extend(extra_args or []) + cmd.extend([src, dst]) + subprocess.run(cmd, check=True) + + def _sync_code_to_cluster(self, repo_root: Path) -> None: + cfg = self.config + login = cfg.slurm.login_node + print(f"\n[cluster] Syncing code to {login}:{cfg.paths.code}/ ...") + self._ssh_run(f"mkdir -p '{cfg.paths.code}'") + if self._is_local(): + src = str(repo_root) + "/" + dst = cfg.paths.code + "/" + else: + src = str(repo_root) + "/" + dst = f"{login}:{cfg.paths.code}/" + self._rsync(src, dst, exclude=_RSYNC_EXCLUDES) + print("[cluster] Sync complete.") + + def _sync_swegym_cache(self) -> None: + """Ensure the SWE-Gym sample instance cache exists on the cluster.""" + cache_file = Path.home() / ".cache" / "polar" / "swegym_sample_10.json" + if not cache_file.exists(): + try: + from 
examples.swegym.sample_tasks import fetch_sample_instances + print("[cluster] Fetching SWE-Gym sample data from HuggingFace...") + fetch_sample_instances() + except Exception as exc: + print(f"[cluster] WARNING: Could not fetch SWE-Gym sample data: {exc}") + return + if cache_file.exists(): + if self._is_local(): + # Already on the login node — cache is in place, nothing to sync. + print("[cluster] SWE-Gym sample cache already present.") + else: + login = self.config.slurm.login_node + self._ssh_run("mkdir -p ~/.cache/polar/") + self._rsync(str(cache_file), f"{login}:~/.cache/polar/swegym_sample_10.json") + print("[cluster] SWE-Gym sample cache synced.") + + def _sync_swebench_cache(self) -> None: + """Ensure the SWE-bench Verified dataset cache exists on the cluster.""" + cache_file = Path.home() / ".cache" / "polar" / "swebench_verified.json" + if not cache_file.exists(): + try: + import importlib + spec = importlib.util.spec_from_file_location( + "dataset", + Path(__file__).resolve().parents[2] / "examples" / "swebench_verified" / "dataset.py", + ) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + print("[cluster] Fetching SWE-bench Verified dataset from HuggingFace...") + mod.load_swebench_verified() + except Exception as exc: + print(f"[cluster] WARNING: Could not fetch SWE-bench Verified data: {exc}") + return + if cache_file.exists(): + if self._is_local(): + print("[cluster] SWE-bench Verified cache already present.") + else: + login = self.config.slurm.login_node + self._ssh_run("mkdir -p ~/.cache/polar/") + self._rsync(str(cache_file), f"{login}:~/.cache/polar/swebench_verified.json") + print("[cluster] SWE-bench Verified cache synced.") + + def _build_sbatch_command(self) -> str: + cfg = self.config + export_vars = cfg.sbatch_export_vars() + export_str = "ALL," + ",".join(f"{k}={v}" for k, v in export_vars.items()) + + example = cfg.task.example + harness = cfg.task.harness + results_dir = cfg.paths.results + job_name = f"polar-{example}-{harness}" + sbatch_path = f"{cfg.paths.code}/src/polar/cluster/templates/polar_slurm.sbatch" + + parts = [ + "sbatch", + f"--account={cfg.slurm.account}", + f"--partition={cfg.slurm.partition}", + f"--nodes={cfg.resources.nodes}", + f"--gres=gpu:{cfg.resources.gpus_per_node}", + f"--cpus-per-task={cfg.resources.cpus_per_task}", + f"--mem={cfg.resources.mem}", + f"--time={cfg.resources.time}", + f"--job-name={job_name}", + f"--output={results_dir}/{job_name}_%j.out", + f"--error={results_dir}/{job_name}_%j.err", + f"--export={export_str}", + sbatch_path, + ] + return " ".join(parts) + + def _build_serve_sbatch_command(self) -> str: + cfg = self.config + export_vars = cfg.sbatch_serve_export_vars() + export_str = "ALL," + ",".join(f"{k}={v}" for k, v in export_vars.items()) + + results_dir = cfg.paths.results + job_name = "polar-serve" + sbatch_path = f"{cfg.paths.code}/src/polar/cluster/templates/polar_slurm_serve.sbatch" + + parts = [ + "sbatch", + f"--account={cfg.slurm.account}", + f"--partition={cfg.slurm.partition}", + f"--nodes={cfg.resources.nodes}", + f"--gres=gpu:{cfg.resources.gpus_per_node}", + f"--cpus-per-task={cfg.resources.cpus_per_task}", + f"--mem={cfg.resources.mem}", + f"--time={cfg.resources.time}", + f"--job-name={job_name}", + f"--output={results_dir}/{job_name}_%j.out", + f"--error={results_dir}/{job_name}_%j.err", + f"--export={export_str}", + sbatch_path, + ] + return " ".join(parts) + + def _build_train_sbatch_command(self) -> str: + cfg = self.config + export_vars = 
cfg.sbatch_train_export_vars() + export_str = "ALL," + ",".join(f"{k}={v}" for k, v in export_vars.items()) + + results_dir = cfg.paths.results + job_name = "polar-train" + sbatch_path = f"{cfg.paths.code}/src/polar/cluster/templates/polar_slurm_train.sbatch" + + parts = [ + "sbatch", + f"--account={cfg.slurm.account}", + f"--partition={cfg.slurm.partition}", + f"--nodes={cfg.resources.nodes}", + f"--gres=gpu:{cfg.resources.gpus_per_node}", + f"--cpus-per-task={cfg.resources.cpus_per_task}", + f"--mem={cfg.resources.mem}", + f"--time={cfg.resources.time}", + f"--job-name={job_name}", + f"--output={results_dir}/{job_name}_%j.out", + f"--error={results_dir}/{job_name}_%j.err", + # Quote the export string — values like MODEL_ARGS contain spaces + f"--export={shlex.quote(export_str)}", + sbatch_path, + ] + return " ".join(parts) + + def _get_job_state(self, job_id: str) -> str: + """Query SLURM for the current state of a job.""" + out = self._ssh_run( + f"squeue -j {job_id} --format='%T' --noheader 2>/dev/null || true", + capture=True, + ) + return out.strip() + + def _find_topology(self, job_id: str) -> str: + """Discover the topology.yaml path for a running serve job.""" + cfg = self.config + results_dir = cfg.paths.results + + # Try the sentinel file first (written by polar_slurm_serve.sbatch) + sentinel = f"{results_dir}/polar-serve_{job_id}/.services_ready" + content = self._ssh_run( + f"cat '{sentinel}' 2>/dev/null || true", + capture=True, + ) + if "TOPOLOGY=" in content: + for line in content.strip().splitlines(): + if line.startswith("TOPOLOGY="): + return line.split("=", 1)[1] + + # Fallback: search for topology.yaml matching the job ID + out = self._ssh_run( + f"ls '{results_dir}'/*_{job_id}/topology.yaml 2>/dev/null || true", + capture=True, + ) + path = out.strip().splitlines()[0] if out.strip() else "" + if path: + return path + + raise FileNotFoundError( + f"Cannot find topology for job {job_id}. " + f"Is the serve job running? 
Check: polar cluster status -c --job-id {job_id}" + ) + + def _print_serve_summary(self) -> None: + cfg = self.config + lines = [ + "=" * 65, + "Polar SLURM Serve (services only)", + "=" * 65, + f" Model: {cfg.model.name}", + f" TP size: {cfg.model.tensor_parallel_size}", + " " + "-" * 60, + f" Login node: {cfg.slurm.login_node}", + f" Account: {cfg.slurm.account}", + f" Partition: {cfg.slurm.partition}", + f" Nodes: {cfg.resources.nodes}", + f" GPUs/node: {cfg.resources.gpus_per_node}", + f" Time limit: {cfg.resources.time}", + f" Workspace: {cfg.paths.workspace}", + "=" * 65, + ] + print("\n".join(lines)) + + def _print_train_summary(self) -> None: + cfg = self.config + t = cfg.train + lines = [ + "=" * 65, + "Polar SLURM Training Job (Slime + Megatron GRPO)", + "=" * 65, + f" HF checkpoint: {t.hf_checkpoint}", + f" Actor GPUs: {t.actor_gpus} (TP={t.tp_size})", + f" Rollout GPUs: {t.rollout_gpus}", + f" Num rollouts: {t.num_rollouts}", + f" Batch: {t.rollout_batch_size} prompts x {t.n_samples_per_prompt} samples", + f" Global batch: {t.global_batch_size}", + " " + "-" * 60, + f" Login node: {cfg.slurm.login_node}", + f" Account: {cfg.slurm.account}", + f" Partition: {cfg.slurm.partition}", + f" Nodes: {cfg.resources.nodes}", + f" GPUs/node: {cfg.resources.gpus_per_node}", + f" Time limit: {cfg.resources.time}", + f" Workspace: {cfg.paths.workspace}", + "=" * 65, + ] + print("\n".join(lines)) + + def _print_summary(self) -> None: + cfg = self.config + lines = [ + "=" * 65, + "Polar SLURM Job Submission", + "=" * 65, + f" Example: {cfg.task.example}", + f" Harness: {cfg.task.harness}", + f" Model: {cfg.model.name}", + f" TP size: {cfg.model.tensor_parallel_size}", + f" Rollouts: {cfg.task.num_rollouts}", + f" Timeout: {cfg.task.timeout_seconds}s", + " " + "-" * 60, + f" Login node: {cfg.slurm.login_node}", + f" Account: {cfg.slurm.account}", + f" Partition: {cfg.slurm.partition}", + f" Nodes: {cfg.resources.nodes}", + f" GPUs/node: {cfg.resources.gpus_per_node}", + f" Time limit: {cfg.resources.time}", + f" Workspace: {cfg.paths.workspace}", + "=" * 65, + ] + print("\n".join(lines)) + + +# ── SIF definition file generation ──────────────────────────────────────────── + + +def _sanitize_instance_id(instance_id: str) -> str: + """Normalize instance ID for use in filenames.""" + normalized = instance_id.strip().lower() + normalized = re.sub(r"[^a-z0-9_.-]+", "-", normalized.replace("__", "--")) + normalized = re.sub(r"-{2,}", "-", normalized) + return normalized.strip("-") + + +# Maps harness name to (base_image, install_commands) +_CALCULATOR_HARNESS_DEFS: dict[str, tuple[str, list[str]]] = { + "opencode": ( + "node:22-bookworm-slim", + [ + "apt-get update && apt-get install -y --no-install-recommends bash ca-certificates curl git python-is-python3 python3 && rm -rf /var/lib/apt/lists/*", + "npm install -g opencode-ai@latest", + ], + ), + "codex": ( + "node:22-bookworm-slim", + [ + "apt-get update && apt-get install -y --no-install-recommends bash ca-certificates curl git python-is-python3 python3 && rm -rf /var/lib/apt/lists/*", + "npm install -g @openai/codex@latest", + ], + ), + "claude_code": ( + "node:22-bookworm-slim", + [ + "apt-get update && apt-get install -y --no-install-recommends bash ca-certificates curl git python-is-python3 python3 && rm -rf /var/lib/apt/lists/*", + "npm install -g @anthropic-ai/claude-code@latest", + ], + ), + "gemini_cli": ( + "node:22-bookworm-slim", + [ + "apt-get update && apt-get install -y --no-install-recommends bash ca-certificates curl git 
python-is-python3 python3 && rm -rf /var/lib/apt/lists/*", + "npm install -g @google/gemini-cli@latest", + ], + ), + "qwen_code": ( + "node:22-bookworm-slim", + [ + "apt-get update && apt-get install -y --no-install-recommends bash ca-certificates curl git python-is-python3 python3 && rm -rf /var/lib/apt/lists/*", + "npm install -g @qwen-code/qwen-code@latest", + ], + ), + "swe_agent": ( + "python:3.12-slim", + [ + "apt-get update && apt-get install -y --no-install-recommends bash ca-certificates curl git build-essential && rm -rf /var/lib/apt/lists/*", + "pip install --no-cache-dir 'sweagent[all] @ git+https://github.com/SWE-agent/SWE-agent.git'", + "pip install --no-cache-dir tree-sitter==0.21.3 tree-sitter-languages", + "SITE=$(python -c 'import site; print(site.getsitepackages()[0])') && " + "git clone --depth 1 https://github.com/SWE-agent/SWE-agent.git /tmp/swe-agent-src && " + "cp -r /tmp/swe-agent-src/config $SITE/config && " + "cp -r /tmp/swe-agent-src/tools $SITE/tools && " + "mkdir -p $SITE/trajectories && " + "rm -rf /tmp/swe-agent-src", + ], + ), + "openhands_sdk": ( + "python:3.12-slim", + [ + "apt-get update && apt-get install -y --no-install-recommends bash ca-certificates curl git && rm -rf /var/lib/apt/lists/*", + "pip install --no-cache-dir openhands-sdk openhands-tools", + ], + ), +} + + +def _generate_def_file( + repo_root: Path, + example: str, + harness: str, + instance_id: str | None = None, +) -> str | None: + """Generate an Apptainer ``.def`` file for building a harness SIF.""" + if example == "calculator": + spec = _CALCULATOR_HARNESS_DEFS.get(harness) + if spec is None: + return None + base_image, commands = spec + post = "\n ".join(commands) + return ( + f"Bootstrap: docker\n" + f"From: {base_image}\n" + f"\n" + f"%post\n" + f" {post}\n" + f" mkdir -p /polar/session/workspace /polar/session/logs/agent\n" + f"\n" + f"%environment\n" + f" export DEBIAN_FRONTEND=noninteractive\n" + f"\n" + f"%labels\n" + f" io.polar.example {example}\n" + f" io.polar.harness {harness}\n" + ) + if example == "swegym": + return _generate_swegym_def(harness, instance_id) + if example == "swebench_verified": + return _generate_swebench_def(harness, instance_id) + return None + + +def _swegym_base_image(instance_id: str) -> str: + """Derive the SWE-Gym eval base image from an instance ID.""" + suffix = instance_id.replace("__", "_s_").lower() + return f"docker.io/xingyaoww/sweb.eval.x86_64.{suffix}:latest" + + +# SWE-Gym harness install commands (layered on top of per-instance base image). +# Base images already have conda + testbed env; we add the agent harness tools. 
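+# For example, instance "getmoto__moto-7365" maps via _swegym_base_image to
+# docker.io/xingyaoww/sweb.eval.x86_64.getmoto_s_moto-7365:latest.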
+_SWEGYM_HARNESS_DEFS: dict[str, list[str]] = { + "swe_agent": [ + "/opt/miniconda3/bin/conda create -y -n polar-sweagent python=3.11 pip", + "/opt/miniconda3/envs/polar-sweagent/bin/python -m pip install --no-cache-dir " + "'git+https://github.com/SWE-agent/SWE-agent.git'", + "/opt/miniconda3/envs/polar-sweagent/bin/python -m pip install --no-cache-dir " + "tree-sitter==0.21.3 tree-sitter-languages", + "SITE=$(/opt/miniconda3/envs/polar-sweagent/bin/python -c " + "\"import site; print(site.getsitepackages()[0])\") && " + "git clone --depth 1 https://github.com/SWE-agent/SWE-agent.git /tmp/swe-agent-src && " + "cp -r /tmp/swe-agent-src/config $SITE/config && " + "cp -r /tmp/swe-agent-src/tools $SITE/tools && " + "mkdir -p $SITE/trajectories && " + "/opt/miniconda3/bin/conda clean -afy && " + "rm -rf /tmp/swe-agent-src", + ], +} + + +def _generate_swegym_def( + harness: str, + instance_id: str | None, +) -> str | None: + """Generate an Apptainer .def for a SWE-Gym per-instance SIF.""" + if instance_id is None: + return None + spec = _SWEGYM_HARNESS_DEFS.get(harness) + if spec is None: + return None + base_image = _swegym_base_image(instance_id) + post = "\n ".join(spec) + return ( + f"Bootstrap: docker\n" + f"From: {base_image}\n" + f"\n" + f"%post\n" + f" {post}\n" + f" mkdir -p /polar/session/workspace /polar/session/logs/agent\n" + f"\n" + f"%environment\n" + f" export DEBIAN_FRONTEND=noninteractive\n" + f" export PATH=/opt/miniconda3/envs/testbed/bin:" + f"/opt/miniconda3/envs/polar-sweagent/bin:$PATH\n" + f"\n" + f"%labels\n" + f" io.polar.example swegym\n" + f" io.polar.harness {harness}\n" + f" io.polar.instance_id {instance_id}\n" + ) + + +# ── SWE-bench Verified SIF definitions ────────────────────────────────────── + +_SWEBENCH_NODE_INSTALL = ( + "apt-get update && " + "apt-get install -y --no-install-recommends ca-certificates curl gnupg && " + "curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && " + "apt-get install -y --no-install-recommends nodejs && " + "apt-get clean && rm -rf /var/lib/apt/lists/*" +) + +_SWEBENCH_HARNESS_DEFS: dict[str, list[str]] = { + "opencode": [ + _SWEBENCH_NODE_INSTALL, + "npm install -g opencode-ai@latest", + ], + "codex": [ + _SWEBENCH_NODE_INSTALL, + "npm install -g @openai/codex@latest", + ], + "claude_code": [ + _SWEBENCH_NODE_INSTALL, + "npm install -g @anthropic-ai/claude-code@latest", + ], + "gemini_cli": [ + _SWEBENCH_NODE_INSTALL, + "npm install -g @google/gemini-cli@latest", + ], + "qwen_code": [ + _SWEBENCH_NODE_INSTALL, + "npm install -g @qwen-code/qwen-code@latest", + ], + "swe_agent": [ + "apt-get update && apt-get install -y --no-install-recommends " + "bash ca-certificates curl git build-essential && rm -rf /var/lib/apt/lists/*", + "/opt/miniconda3/bin/conda create -y -n polar-sweagent python=3.11 pip", + "/opt/miniconda3/envs/polar-sweagent/bin/python -m pip install --no-cache-dir " + "'git+https://github.com/SWE-agent/SWE-agent.git'", + "/opt/miniconda3/envs/polar-sweagent/bin/python -m pip install --no-cache-dir " + "tree-sitter==0.21.3 tree-sitter-languages", + "SITE=$(/opt/miniconda3/envs/polar-sweagent/bin/python -c " + "\"import site; print(site.getsitepackages()[0])\") && " + "git clone --depth 1 https://github.com/SWE-agent/SWE-agent.git /tmp/swe-agent-src && " + "cp -r /tmp/swe-agent-src/config $SITE/config && " + "cp -r /tmp/swe-agent-src/tools $SITE/tools && " + "mkdir -p $SITE/trajectories && " + "/opt/miniconda3/bin/conda clean -afy && " + "rm -rf /tmp/swe-agent-src", + ], + "openhands_sdk": [ + "apt-get update 
&& apt-get install -y --no-install-recommends " + "bash ca-certificates curl git && rm -rf /var/lib/apt/lists/*", + "/opt/miniconda3/bin/conda create -y -n polar-openhands python=3.12 pip", + "/opt/miniconda3/envs/polar-openhands/bin/python -m pip install --no-cache-dir " + "openhands-sdk openhands-tools", + "/opt/miniconda3/bin/conda clean -afy", + ], +} + + +def _swebench_base_image(instance_id: str) -> str: + """Derive the SWE-bench eval base image from an instance ID.""" + suffix = instance_id.replace("__", "_s_").lower() + return f"docker.io/xingyaoww/sweb.eval.x86_64.{suffix}:latest" + + +def _generate_swebench_def( + harness: str, + instance_id: str | None, +) -> str | None: + """Generate an Apptainer .def for a SWE-bench Verified per-instance SIF.""" + if instance_id is None: + return None + spec = _SWEBENCH_HARNESS_DEFS.get(harness) + if spec is None: + return None + base_image = _swebench_base_image(instance_id) + post = "\n ".join(spec) + + # Build PATH: always include testbed; add harness-specific conda envs + path_parts = ["/opt/miniconda3/envs/testbed/bin"] + if harness == "swe_agent": + path_parts.insert(0, "/opt/miniconda3/envs/polar-sweagent/bin") + elif harness == "openhands_sdk": + path_parts.insert(0, "/opt/miniconda3/envs/polar-openhands/bin") + path_env = ":".join(path_parts) + + return ( + f"Bootstrap: docker\n" + f"From: {base_image}\n" + f"\n" + f"%post\n" + f" {post}\n" + f" mkdir -p /polar/session/workspace /polar/session/logs/agent\n" + f"\n" + f"%environment\n" + f" export DEBIAN_FRONTEND=noninteractive\n" + f" export PATH={path_env}:$PATH\n" + f"\n" + f"%labels\n" + f" io.polar.example swebench_verified\n" + f" io.polar.harness {harness}\n" + f" io.polar.instance_id {instance_id}\n" + ) + + +def _generate_train_def(code_path: str) -> str: + """Generate an Apptainer .def for the Slime+Megatron GRPO training SIF. + + Uses slimerl/slime Docker image which ships sglang v0.5.9, Megatron-LM, + flash-attn, transformer_engine, apex, mbridge, and all training deps + pre-built. We only add Polar and apply Polar's Slime patch on top. + """ + return ( + "Bootstrap: docker\n" + "From: slimerl/slime:nightly-dev-20260329a\n" + "\n" + "%files\n" + f" {code_path} /opt/polar\n" + "\n" + "%post\n" + " # Install Polar on top of the Slime image\n" + " pip install -e /opt/polar\n" + "\n" + " # Apply Slime patch (adds external advantage estimator for Polar)\n" + " bash /opt/polar/scripts/patch/patch_slime.sh\n" + "\n" + "%environment\n" + ' export PYTHONPATH="/opt/polar/src:/root/Megatron-LM:${PYTHONPATH:-}"\n' + " export CUDA_DEVICE_MAX_CONNECTIONS=1\n" + " export PYTHONNOUSERSITE=1\n" + ' export LD_LIBRARY_PATH="/usr/local/cuda/compat:${LD_LIBRARY_PATH:-}"\n' + "\n" + "%labels\n" + " io.polar.example train\n" + " io.polar.framework slime-grpo\n" + ) diff --git a/src/polar/cluster/tasks.py b/src/polar/cluster/tasks.py new file mode 100644 index 00000000..373ec2f3 --- /dev/null +++ b/src/polar/cluster/tasks.py @@ -0,0 +1,632 @@ +"""Build and submit Polar task payloads on SLURM. + +Called inside the SLURM job to construct task JSON with correct absolute SIF +image paths and submit them through the Polar CLI. 
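+Each task payload is written to ``request.json`` and submitted with
+``python -m polar.cli submit request.json -c topology.yaml --json``; the
+response is saved alongside it as ``response.json``.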
+ +Usage (from sbatch script):: + + python -m polar.cluster.tasks --example calculator --harness opencode \\ + --topology /path/to/topology.yaml --sif-dir /lustre/.../sif_images +""" + +from __future__ import annotations + +import argparse +import json +import os +import re +import subprocess +import sys +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + + +# ── Helpers ─────────────────────────────────────────────────────────────────── + +def _write_json(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2, ensure_ascii=True, sort_keys=True)) + + +def _submit_task(request_path: Path, topology_path: str) -> dict[str, Any]: + """Submit a task JSON via ``polar submit`` and return the response.""" + command = [ + sys.executable, "-m", "polar.cli", + "submit", str(request_path), + "-c", topology_path, + "--json", + ] + print(f"[tasks] Running: {' '.join(command)}") + completed = subprocess.run(command, check=True, capture_output=True, text=True) + return json.loads(completed.stdout) + + +def _summarize_result(response: dict[str, Any]) -> dict[str, Any]: + sessions = response.get("results") or [] + completed = sum(1 for s in sessions if s.get("status") == "COMPLETED") + reward_one = 0 + for s in sessions: + traces = (s.get("trajectory") or {}).get("traces") or [] + if traces and traces[-1].get("reward") == 1.0: + reward_one += 1 + return { + "total_sessions": len(sessions), + "completed_sessions": completed, + "reward_one_sessions": reward_one, + } + + +def _sanitize_instance_id(instance_id: str) -> str: + normalized = instance_id.strip().lower() + normalized = re.sub(r"[^a-z0-9_.-]+", "-", normalized.replace("__", "--")) + normalized = re.sub(r"-{2,}", "-", normalized) + return normalized.strip("-") + + +# ── Calculator ──────────────────────────────────────────────────────────────── + +CALCULATOR_INSTRUCTION = """\ +Write a Python calculator with no extra imports. Support arithmetic expressions over integers and +parentheses. Save it as `calculator.py`. + +Expose a `Calculator` class that can be called with a string expression. 
+ +Example: + +from calculator import Calculator +cal = Calculator() +print(cal("4*3-3")) # should print 9""" + +CALCULATOR_TEST = """\ +from calculator import Calculator + +cal = Calculator() + +assert cal("4*3-3") == 9 +assert cal("(2+3)*4") == 20 +assert cal("10/2+7") == 12 +assert cal("18-(3*4)") == 6 +assert cal(" 8 + 2 * 5 ") == 18 + +print("calculator tests passed") +""" + + +def build_calculator_task( + harness: str, + sif_dir: str, + output_dir: str, + *, + agent_model: str = "openai/gpt-4o", + num_rollouts: int = 4, + timeout_seconds: float = 900.0, + batch_id: str = "", +) -> dict[str, Any]: + """Build a calculator task payload.""" + sif_path = os.path.join(sif_dir, f"calculator-{harness}.sif") + if not os.path.isfile(sif_path): + raise FileNotFoundError( + f"Calculator SIF not found: {sif_path}\n" + f"Build it with: polar cluster build-sif --example calculator --harness {harness}" + ) + + test_dir = Path(output_dir) / "assets" + test_dir.mkdir(parents=True, exist_ok=True) + test_file = test_dir / "test_calculator.py" + test_file.write_text(CALCULATOR_TEST) + + return { + "task_id": f"calculator-{harness}-slurm-{batch_id}", + "instruction": CALCULATOR_INSTRUCTION, + "num_rollouts": num_rollouts, + "timeout_seconds": timeout_seconds, + "runtime": { + "backend": "apptainer", + "image": sif_path, + "prepare": [ + { + "type": "exec", + "command": ( + "mkdir -p /polar/session/workspace /polar/session/logs/agent && " + "cd /polar/session/workspace && git init && " + "git config user.email 'polar@test' && " + "git config user.name 'Polar'" + ), + }, + { + "type": "upload_file", + "source": str(test_file.resolve()), + "target": "/polar/session/workspace/test_calculator.py", + }, + { + "type": "exec", + "command": "cd /polar/session/workspace && git add -A && git commit -m 'initial'", + }, + ], + "env": {}, + "network": "host", + "workdir": "/polar/session/workspace", + # swe_agent's swerex does chown inside the container; Apptainer + # needs --fakeroot to support ownership changes on overlayFS. 
+ **({"kwargs": {"fakeroot": True}} if harness == "swe_agent" else {}), + }, + "agent": { + "harness": harness, + "model_name": agent_model, + "settings": {}, + "env": {}, + }, + "builder": {"strategy": "prefix_merging"}, + "evaluator": { + "strategy": "swegym_git_diff", + "config": { + "repo_dir": "/polar/session/workspace", + "patch_command": ( + "cd /polar/session/workspace && git add -A && git diff --cached --binary" + ), + "test_command": ( + "cd /polar/session/workspace && python3 test_calculator.py && " + "echo 'PASSED test_calculator'" + ), + "test_timeout": 60.0, + "expected_output_json": {"test_calculator": "PASSED"}, + }, + "refresh_runtime": False, + }, + } + + +# ── SWE-Gym ────────────────────────────────────────────────────────────────── + +SWEGYM_SAMPLE = [ + {"instance_id": "getmoto__moto-7365", "repo": "getmoto/moto"}, + {"instance_id": "python__mypy-10392", "repo": "python/mypy"}, + {"instance_id": "conan-io__conan-13721", "repo": "conan-io/conan"}, + {"instance_id": "iterative__dvc-1809", "repo": "iterative/dvc"}, + {"instance_id": "dask__dask-10441", "repo": "dask/dask"}, + {"instance_id": "pydantic__pydantic-8072", "repo": "pydantic/pydantic"}, + {"instance_id": "pandas-dev__pandas-58335", "repo": "pandas-dev/pandas"}, + {"instance_id": "facebookresearch__hydra-1783", "repo": "facebookresearch/hydra"}, + {"instance_id": "bokeh__bokeh-13636", "repo": "bokeh/bokeh"}, + {"instance_id": "Project-MONAI__MONAI-2238", "repo": "Project-MONAI/MONAI"}, +] + +SWEGYM_PREPARE = ( + "rm -rf /polar/session/workspace && " + "mkdir -p /polar/session/logs/agent /polar/session/workspace /root/.venv/bin && " + "cp -a /testbed/. /polar/session/workspace/ && " + # swerex's shutil.copytree fails on dangling symlinks (e.g. bokeh repo) + "find /polar/session/workspace -xtype l -delete 2>/dev/null; " + "ln -sf /opt/miniconda3/envs/testbed/bin/python /root/.venv/bin/python && " + "ln -sf /opt/miniconda3/envs/testbed/bin/python /root/.venv/bin/python3 && " + "git config --global core.pager '' && " + "cd /polar/session/workspace && git reset --hard" +) + + +def build_swegym_task( + harness: str, + sif_dir: str, + instance: dict[str, Any], + *, + agent_model: str = "openai/gpt-4o", + num_rollouts: int = 4, + timeout_seconds: float = 900.0, + batch_id: str = "", +) -> dict[str, Any]: + """Build a SWE-Gym task payload for a single instance.""" + instance_id = instance["instance_id"] + sif_name = f"swegym-{harness}-{_sanitize_instance_id(instance_id)}.sif" + sif_path = os.path.join(sif_dir, sif_name) + + if not os.path.isfile(sif_path): + raise FileNotFoundError(f"SWE-Gym SIF not found: {sif_path}") + + agent_settings: dict[str, Any] = {} + agent_env: dict[str, str] = {} + if harness == "swe_agent": + agent_settings = { + "repo_path": "/polar/session/workspace", + "shell_preamble": ( + "source /opt/miniconda3/etc/profile.d/conda.sh && " + "conda activate polar-sweagent && " + "export PATH=/opt/miniconda3/envs/testbed/bin:$PATH" + ), + } + elif harness in ("openhands_sdk", "openhands"): + agent_env = {"WORKSPACE_BASE": "/polar/session/workspace"} + + return { + "task_id": f"swegym-{harness}-{_sanitize_instance_id(instance_id)}-{batch_id}", + "instruction": str(instance.get("problem_statement", "")).strip(), + "num_rollouts": num_rollouts, + "timeout_seconds": timeout_seconds, + "runtime": { + "backend": "apptainer", + "image": sif_path, + "prepare": [{"type": "exec", "command": SWEGYM_PREPARE}], + "env": {}, + "network": "host", + "workdir": "/polar/session/workspace", + **({"kwargs": {"fakeroot": 
True}} if harness == "swe_agent" else {}), + }, + "agent": { + "harness": harness, + "model_name": agent_model, + "settings": agent_settings, + "env": agent_env, + }, + "builder": {"strategy": "prefix_merging"}, + "evaluator": { + "strategy": "swegym_git_diff", + "config": { + "repo_dir": "/testbed", + "patch_command": "cd /polar/session/workspace && git add -A && git diff --cached --binary --submodule=diff", + "instance": instance, + }, + "refresh_runtime": False, + }, + } + + +# ── SWE-bench Verified ──────────────────────────────────────────────────────── + +SWEBENCH_PREPARE_BASE = ( + "rm -rf /polar/session/workspace && " + "mkdir -p /polar/session/logs/agent /polar/session/workspace /root/.venv/bin && " + "cp -a /testbed/. /polar/session/workspace/ && " + "find /polar/session/workspace -xtype l -delete 2>/dev/null; " + "ln -sf /opt/miniconda3/envs/testbed/bin/python /root/.venv/bin/python && " + "ln -sf /opt/miniconda3/envs/testbed/bin/python /root/.venv/bin/python3 && " + "git config --global core.pager '' && " + "cd /polar/session/workspace && git reset --hard; true" +) + + +def build_swebench_task( + harness: str, + sif_dir: str, + instance: dict[str, Any], + *, + agent_model: str = "openai/gpt-4o", + num_rollouts: int = 1, + timeout_seconds: float = 3600.0, + batch_id: str = "", +) -> dict[str, Any]: + """Build a SWE-bench Verified task payload for a single instance.""" + instance_id = instance["instance_id"] + sif_name = f"swebench-{harness}-{_sanitize_instance_id(instance_id)}.sif" + sif_path = os.path.join(sif_dir, sif_name) + + if not os.path.isfile(sif_path): + raise FileNotFoundError(f"SWE-bench SIF not found: {sif_path}") + + runtime_env: dict[str, str] = {} + if harness == "opencode": + runtime_env["OPENCODE_FAKE_VCS"] = "git" + + exclude_patterns: list[str] = [] + if harness == "claude_code": + exclude_patterns.extend([".claude/**", "**/.claude/**"]) + + agent_settings: dict[str, Any] = {} + agent_env: dict[str, str] = {} + if harness == "swe_agent": + agent_settings = { + "repo_path": "/polar/session/workspace", + "shell_preamble": ( + "source /opt/miniconda3/etc/profile.d/conda.sh && " + "conda activate polar-sweagent && " + "export PATH=/opt/miniconda3/envs/testbed/bin:$PATH" + ), + } + elif harness in ("openhands_sdk", "openhands"): + agent_env = {"WORKSPACE_BASE": "/polar/session/workspace"} + + runtime_kwargs: dict[str, Any] = {} + if harness == "swe_agent": + runtime_kwargs["fakeroot"] = True + + return { + "task_id": f"swebench-{harness}-{_sanitize_instance_id(instance_id)}-{batch_id}", + "instruction": str(instance.get("problem_statement", "")).strip(), + "num_rollouts": num_rollouts, + "timeout_seconds": timeout_seconds, + "runtime": { + "backend": "apptainer", + "image": sif_path, + "prepare": [{"type": "exec", "command": SWEBENCH_PREPARE_BASE}], + "env": runtime_env, + "network": "host", + "workdir": "/polar/session/workspace", + **({"kwargs": runtime_kwargs} if runtime_kwargs else {}), + }, + "agent": { + "harness": harness, + "model_name": agent_model, + "settings": agent_settings, + "env": agent_env, + }, + "builder": {"strategy": "prefix_merging"}, + "evaluator": { + "strategy": "swegym_git_diff", + "config": { + "repo_dir": "/testbed", + "patch_command": ( + "cd /polar/session/workspace && " + "git add -A && git diff --cached --binary" + ), + "instance": instance, + **({"exclude_patterns": exclude_patterns} if exclude_patterns else {}), + }, + "refresh_runtime": False, + }, + } + + +# ── Runner functions 
────────────────────────────────────────────────────────── + +def run_calculator(args: argparse.Namespace) -> int: + batch_id = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") + output_dir = Path(args.output_dir) + + payload = build_calculator_task( + args.harness, + args.sif_dir, + args.output_dir, + agent_model=args.agent_model, + num_rollouts=args.num_rollouts, + timeout_seconds=args.timeout_seconds, + batch_id=batch_id, + ) + request_path = output_dir / "request.json" + response_path = output_dir / "response.json" + _write_json(request_path, payload) + print(f"[calculator] Wrote request to {request_path}") + + if args.dry_run: + print("[calculator] Dry run — not submitting.") + return 0 + + result = _submit_task(request_path, args.topology) + _write_json(response_path, result) + summary = _summarize_result(result) + print(f"[calculator] Done: {summary['reward_one_sessions']}/{summary['total_sessions']} reward=1.0") + print(f"[calculator] Response: {response_path}") + return 0 + + +def run_swegym(args: argparse.Namespace) -> int: + batch_id = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") + output_dir = Path(args.output_dir) + + instances = SWEGYM_SAMPLE + if args.instance_id: + wanted = set(args.instance_id) + instances = [i for i in instances if i["instance_id"] in wanted] + missing = wanted - {i["instance_id"] for i in instances} + if missing: + print(f"[swegym] WARNING: Unknown instance_ids: {missing}") + instances = instances[: args.max_tasks] + + if not instances: + print("[swegym] No instances selected.") + return 1 + + # Try to load full instance data from cache + cache_path = Path.home() / ".cache" / "polar" / "swegym_sample_10.json" + full_instances: dict[str, dict[str, Any]] = {} + if cache_path.exists(): + cached = json.loads(cache_path.read_text()) + full_instances = {str(i.get("instance_id")): i for i in cached} + + manifest = { + "batch_id": batch_id, + "harness": args.harness, + "model_name": args.model_name, + "num_rollouts": args.num_rollouts, + "tasks": [i["instance_id"] for i in instances], + } + _write_json(output_dir / "manifest.json", manifest) + + summaries: list[dict[str, Any]] = [] + for instance_meta in instances: + instance_id = instance_meta["instance_id"] + instance = {**instance_meta} + if instance_id in full_instances: + instance = full_instances[instance_id] + + if not instance.get("problem_statement"): + print( + f"[swegym] WARNING: No problem_statement for {instance_id}. " + f"Run: python examples/swegym/sample_tasks.py to populate cache." 
+ ) + continue + + task_dir = output_dir / _sanitize_instance_id(instance_id) + request_path = task_dir / "request.json" + response_path = task_dir / "response.json" + + try: + payload = build_swegym_task( + args.harness, + args.sif_dir, + instance, + agent_model=args.agent_model, + num_rollouts=args.num_rollouts, + timeout_seconds=args.timeout_seconds, + batch_id=batch_id, + ) + except FileNotFoundError as e: + print(f"[swegym] Skipping {instance_id}: {e}") + continue + + _write_json(request_path, payload) + print(f"[swegym] [{instance_id}] Wrote request to {request_path}") + + if args.dry_run: + summaries.append({"instance_id": instance_id, "dry_run": True}) + continue + + try: + result = _submit_task(request_path, args.topology) + _write_json(response_path, result) + summary = { + "instance_id": instance_id, + "task_id": payload["task_id"], + **_summarize_result(result), + } + summaries.append(summary) + print( + f"[swegym] [{instance_id}] Done: " + f"reward_1={summary['reward_one_sessions']}/{summary['total_sessions']}" + ) + except subprocess.CalledProcessError as e: + print(f"[swegym] [{instance_id}] FAILED: {e}") + if e.stderr: + print(f" stderr: {e.stderr[:500]}") + summaries.append({"instance_id": instance_id, "error": str(e)}) + + _write_json(output_dir / "summary.json", summaries) + print(f"[swegym] Batch summary: {output_dir / 'summary.json'}") + return 0 + + +def run_swebench(args: argparse.Namespace) -> int: + batch_id = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") + output_dir = Path(args.output_dir) + + cache_path = Path.home() / ".cache" / "polar" / "swebench_verified.json" + if not cache_path.exists(): + print( + f"[swebench] ERROR: Dataset cache not found at {cache_path}\n" + f" Populate it with: python -c \"" + f"from examples.swebench_verified.dataset import load_swebench_verified; " + f"load_swebench_verified()\"" + ) + return 1 + + all_instances = json.loads(cache_path.read_text()) + instances_by_id: dict[str, dict[str, Any]] = { + str(i["instance_id"]): i for i in all_instances + } + + if args.instance_id: + wanted = set(args.instance_id) + instances = [instances_by_id[iid] for iid in wanted if iid in instances_by_id] + missing = wanted - {str(i["instance_id"]) for i in instances} + if missing: + print(f"[swebench] WARNING: Unknown instance_ids: {missing}") + else: + instances = all_instances + instances = instances[: args.max_tasks] + + if not instances: + print("[swebench] No instances selected.") + return 1 + + manifest = { + "batch_id": batch_id, + "harness": args.harness, + "num_rollouts": args.num_rollouts, + "tasks": [str(i["instance_id"]) for i in instances], + } + _write_json(output_dir / "manifest.json", manifest) + + summaries: list[dict[str, Any]] = [] + for instance in instances: + instance_id = str(instance["instance_id"]) + task_dir = output_dir / _sanitize_instance_id(instance_id) + request_path = task_dir / "request.json" + response_path = task_dir / "response.json" + + if not instance.get("problem_statement"): + print(f"[swebench] WARNING: No problem_statement for {instance_id}, skipping.") + continue + + try: + payload = build_swebench_task( + args.harness, + args.sif_dir, + instance, + agent_model=args.agent_model, + num_rollouts=args.num_rollouts, + timeout_seconds=args.timeout_seconds, + batch_id=batch_id, + ) + except FileNotFoundError as e: + print(f"[swebench] Skipping {instance_id}: {e}") + continue + + _write_json(request_path, payload) + print(f"[swebench] [{instance_id}] Wrote request to {request_path}") + + if args.dry_run: + 
summaries.append({"instance_id": instance_id, "dry_run": True}) + continue + + try: + result = _submit_task(request_path, args.topology) + _write_json(response_path, result) + summary = { + "instance_id": instance_id, + "task_id": payload["task_id"], + **_summarize_result(result), + } + summaries.append(summary) + print( + f"[swebench] [{instance_id}] Done: " + f"reward_1={summary['reward_one_sessions']}/{summary['total_sessions']}" + ) + except subprocess.CalledProcessError as e: + print(f"[swebench] [{instance_id}] FAILED: {e}") + if e.stderr: + print(f" stderr: {e.stderr[:500]}") + summaries.append({"instance_id": instance_id, "error": str(e)}) + + _write_json(output_dir / "summary.json", summaries) + print(f"[swebench] Batch summary: {output_dir / 'summary.json'}") + return 0 + + +# ── CLI entry point ─────────────────────────────────────────────────────────── + +def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Build and submit Polar tasks on SLURM") + parser.add_argument("--example", required=True, choices=["calculator", "swegym", "swebench_verified"]) + parser.add_argument("--harness", default="opencode") + parser.add_argument("--topology", required=True, help="Path to topology.yaml") + parser.add_argument("--sif-dir", required=True, help="Directory containing SIF images") + parser.add_argument("--output-dir", default="./task_outputs") + parser.add_argument("--num-rollouts", type=int, default=4) + parser.add_argument("--timeout-seconds", type=float, default=900.0) + parser.add_argument( + "--model-name", + default=os.environ.get("MODEL_NAME", "Qwen/Qwen3.5-27B"), + ) + parser.add_argument( + "--agent-model", + default=os.environ.get("AGENT_MODEL", "openai/gpt-4o"), + ) + parser.add_argument("--max-tasks", type=int, default=10) + parser.add_argument("--instance-id", action="append", default=[]) + parser.add_argument("--dry-run", action="store_true") + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = _parse_args(argv) + if args.example == "calculator": + return run_calculator(args) + elif args.example == "swegym": + return run_swegym(args) + elif args.example == "swebench_verified": + return run_swebench(args) + else: + print(f"Unknown example: {args.example}") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/polar/cluster/templates/__init__.py b/src/polar/cluster/templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/polar/cluster/templates/env.sh b/src/polar/cluster/templates/env.sh new file mode 100644 index 00000000..05186b5f --- /dev/null +++ b/src/polar/cluster/templates/env.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# Polar SLURM environment configuration. +# Source this file in all SLURM job scripts and helpers. +# +# Required: POLAR_WORKSPACE must be set before sourcing. +# The easiest way is via 'polar cluster launch -c cluster.yaml'. 
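+#
+# Typical manual use on a login or compute node (the path shown is
+# illustrative; it matches the POLAR_CODE default below):
+#   export POLAR_WORKSPACE=/lustre/$USER/polar_workspace
+#   source "$POLAR_WORKSPACE/polar/ProRL-Agent-Server/src/polar/cluster/templates/env.sh"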
+ +# ── Cluster paths ────────────────────────────────────────────────────────────── +if [ -z "${POLAR_WORKSPACE:-}" ]; then + echo "ERROR: POLAR_WORKSPACE must be set before sourcing env.sh" >&2 + echo " Use: polar cluster launch -c cluster.yaml" >&2 + return 1 2>/dev/null || exit 1 +fi + +export POLAR_ROOT="${POLAR_ROOT:-${POLAR_WORKSPACE}/polar}" +export POLAR_CODE="${POLAR_CODE:-${POLAR_ROOT}/ProRL-Agent-Server}" +export POLAR_SIFS="${POLAR_SIFS:-${POLAR_ROOT}/sif_images}" +export POLAR_RESULTS="${POLAR_RESULTS:-${POLAR_ROOT}/results}" +export POLAR_VENV="${POLAR_VENV:-${POLAR_ROOT}/.venv}" + +# ── Apptainer (optional — skip if system-installed) ─────────────────────────── +if [ -n "${APPTAINER_BIN_DIR:-}" ]; then + export PATH="${APPTAINER_BIN_DIR}:${PATH}" +fi +export APPTAINER_CACHEDIR="${APPTAINER_CACHEDIR:-${POLAR_ROOT}/apptainer_cache}" + +# ── CUDA (optional — for FlashInfer GDN kernel JIT compilation) ─────────────── +if [ -n "${CUDA_HOME:-}" ]; then + export PATH="${CUDA_HOME}/bin:${PATH}" + export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH:-}" +fi + +# ── Caches (redirect to shared storage to avoid home directory quota) ───────── +export HF_HOME="${HF_HOME:-${POLAR_WORKSPACE}/hf_cache}" +export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}" +export TORCH_HOME="${TORCH_HOME:-${POLAR_WORKSPACE}/torch_cache}" +export PIP_CACHE_DIR="${PIP_CACHE_DIR:-${POLAR_WORKSPACE}/pip_cache}" +export XDG_CACHE_HOME="${XDG_CACHE_HOME:-${POLAR_WORKSPACE}/xdg_cache}" +export TRITON_CACHE_DIR="${TRITON_CACHE_DIR:-${POLAR_WORKSPACE}/triton_cache}" + +# ── Python ───────────────────────────────────────────────────────────────────── +if [ -d "${POLAR_VENV}" ]; then + source "${POLAR_VENV}/bin/activate" +fi + +# Ensure polar source tree is importable even without pip install -e +export PYTHONPATH="${POLAR_CODE}/src${PYTHONPATH:+:${PYTHONPATH}}" + +# ── Service ports ────────────────────────────────────────────────────────────── +export VLLM_PORT="${VLLM_PORT:-18000}" +export ROLLOUT_PORT="${ROLLOUT_PORT:-18080}" +export GATEWAY_BASE_PORT="${GATEWAY_BASE_PORT:-18100}" + +# ── vLLM defaults ────────────────────────────────────────────────────────────── +export MODEL_NAME="${MODEL_NAME:-Qwen/Qwen3.5-27B}" +export MODEL_PATH="${MODEL_PATH:-${MODEL_NAME}}" +export TENSOR_PARALLEL_SIZE="${TENSOR_PARALLEL_SIZE:-8}" +export MAX_MODEL_LEN="${MAX_MODEL_LEN:-16384}" +export GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}" +export MAX_NUM_SEQS="${MAX_NUM_SEQS:-64}" +export TOOL_CALL_PARSER="${TOOL_CALL_PARSER:-qwen3_xml}" + +# ── Gateway defaults ────────────────────────────────────────────────────────── +export MAX_INIT_WORKERS="${MAX_INIT_WORKERS:-8}" +export MAX_RUN_WORKERS="${MAX_RUN_WORKERS:-4}" +export MAX_POSTRUN_WORKERS="${MAX_POSTRUN_WORKERS:-4}" +export READY_BUFFER_TARGET="${READY_BUFFER_TARGET:-4}" + +# ── Training defaults (used by polar_slurm_train.sbatch) ───────────────────── +export SGLANG_ROUTER_PORT="${SGLANG_ROUTER_PORT:-9000}" +export RAY_PORT="${RAY_PORT:-6379}" +export RAY_DASHBOARD_PORT="${RAY_DASHBOARD_PORT:-8265}" diff --git a/src/polar/cluster/templates/polar_slurm.sbatch b/src/polar/cluster/templates/polar_slurm.sbatch new file mode 100644 index 00000000..68f052b4 --- /dev/null +++ b/src/polar/cluster/templates/polar_slurm.sbatch @@ -0,0 +1,252 @@ +#!/bin/bash +# ──────────────────────────────────────────────────────────────────────────────── +# Polar SLURM Job Script +# +# Orchestrates: vLLM server → Rollout service → Gateway node(s) → Task submission +# +# All SBATCH 
directives are passed via sbatch CLI flags from 'polar cluster launch'. +# No hardcoded account, partition, or resource values. +# +# Required env vars (pass via --export): +# POLAR_CODE - Path to ProRL-Agent-Server on cluster +# POLAR_WORKSPACE - Base workspace path on cluster +# EXAMPLE - "calculator" or "swegym" (default: calculator) +# HARNESS - Agent harness name (default: opencode) +# +# Optional env vars: +# MODEL_NAME - Model to serve (default from env.sh) +# MODEL_PATH - Model path/name for vLLM (default: $MODEL_NAME) +# TENSOR_PARALLEL_SIZE - TP size for vLLM (default: 8) +# NUM_ROLLOUTS - Number of rollouts per task (default: 4) +# TIMEOUT_SECONDS - Per-task timeout (default: 900) +# DEFAULT_SIF_IMAGE - Override SIF path for agent containers +# TASK_FILE - Submit a specific task JSON instead of using the example +# ──────────────────────────────────────────────────────────────────────────────── +set -euo pipefail + +# SLURM copies the script to a local spool directory, so BASH_SOURCE[0] won't +# point back to the original source tree. POLAR_CODE must be passed via --export. +if [ -z "${POLAR_CODE:-}" ]; then + echo "FATAL: POLAR_CODE not set. Use 'polar cluster launch -c cluster.yaml'." >&2 + exit 1 +fi +if [ -z "${POLAR_WORKSPACE:-}" ]; then + echo "FATAL: POLAR_WORKSPACE not set. Use 'polar cluster launch -c cluster.yaml'." >&2 + exit 1 +fi + +# Source environment setup from the cluster templates +TEMPLATE_DIR="${POLAR_CODE}/src/polar/cluster/templates" +source "${TEMPLATE_DIR}/env.sh" + +# ── Defaults ─────────────────────────────────────────────────────────────────── +EXAMPLE="${EXAMPLE:-calculator}" +HARNESS="${HARNESS:-opencode}" +NUM_ROLLOUTS="${NUM_ROLLOUTS:-4}" +TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-900}" + +# ── Job directory ────────────────────────────────────────────────────────────── +JOB_DIR="${POLAR_RESULTS}/${SLURM_JOB_NAME}_${SLURM_JOB_ID}" +mkdir -p "${JOB_DIR}/logs" +echo "[polar_slurm] Job directory: ${JOB_DIR}" +echo "[polar_slurm] Job ID: ${SLURM_JOB_ID}" +echo "[polar_slurm] Nodes: ${SLURM_JOB_NODELIST}" +echo "[polar_slurm] GPUs per node: ${SLURM_GPUS_ON_NODE:-unknown}" + +# ── Discover hostnames ───────────────────────────────────────────────────────── +HOSTNAMES=($(scontrol show hostnames "${SLURM_JOB_NODELIST}")) +NODE_0="${HOSTNAMES[0]}" +NUM_NODES="${#HOSTNAMES[@]}" +echo "[polar_slurm] Allocated ${NUM_NODES} node(s): ${HOSTNAMES[*]}" + +# ── Resolve SIF image ───────────────────────────────────────────────────────── +if [ -z "${DEFAULT_SIF_IMAGE:-}" ]; then + if [ "${EXAMPLE}" = "calculator" ]; then + DEFAULT_SIF_IMAGE="${POLAR_SIFS}/calculator-${HARNESS}.sif" + else + # For swegym, each task has its own SIF — submit script handles this + DEFAULT_SIF_IMAGE="" + fi +fi + +if [ -n "${DEFAULT_SIF_IMAGE}" ] && [ ! 
-f "${DEFAULT_SIF_IMAGE}" ]; then + echo "[polar_slurm] WARNING: SIF not found: ${DEFAULT_SIF_IMAGE}" + echo "[polar_slurm] Build it first with: polar cluster build-sif --example ${EXAMPLE} --harness ${HARNESS}" +fi + +# ── Save results directory ───────────────────────────────────────────────────── +export SAVE_DIR="${JOB_DIR}/rollout_results" +mkdir -p "${SAVE_DIR}" + +# ── Generate topology.yaml ───────────────────────────────────────────────────── +TOPOLOGY_PATH="${JOB_DIR}/topology.yaml" +python -m polar.cluster.topology \ + --output "${TOPOLOGY_PATH}" \ + --save-dir "${SAVE_DIR}" \ + ${DEFAULT_SIF_IMAGE:+--default-sif "${DEFAULT_SIF_IMAGE}"} + +export POLAR_TOPOLOGY="${TOPOLOGY_PATH}" +echo "[polar_slurm] Generated topology: ${TOPOLOGY_PATH}" +cat "${TOPOLOGY_PATH}" + +# ── Kill stale processes on ports from previous runs ───────────────────────── +echo "[polar_slurm] Cleaning stale processes on ports..." +for port in "${VLLM_PORT}" "${ROLLOUT_PORT}"; do + stale_pid=$(lsof -ti ":${port}" 2>/dev/null || true) + if [ -n "${stale_pid}" ]; then + echo "[polar_slurm] Killing stale PID ${stale_pid} on port ${port}" + kill -9 ${stale_pid} 2>/dev/null || true + sleep 1 + fi +done +for ((i=0; i/dev/null || true) + if [ -n "${stale_pid}" ]; then + echo "[polar_slurm] Killing stale PID ${stale_pid} on port ${gw_port}" + kill -9 ${stale_pid} 2>/dev/null || true + sleep 1 + fi +done + +# ── PID tracking for cleanup ─────────────────────────────────────────────────── +PIDS=() +cleanup() { + echo "[polar_slurm] Cleaning up..." + for pid in "${PIDS[@]}"; do + if kill -0 "${pid}" 2>/dev/null; then + echo "[polar_slurm] Killing PID ${pid}" + kill "${pid}" 2>/dev/null || true + fi + done + wait 2>/dev/null || true + echo "[polar_slurm] Cleanup complete." +} +trap cleanup EXIT INT TERM + +# ── Source readiness poller ──────────────────────────────────────────────────── +source "${TEMPLATE_DIR}/wait_for_service.sh" + +# ── Launch vLLM ──────────────────────────────────────────────────────────────── +echo "[polar_slurm] Starting vLLM server on ${NODE_0}:${VLLM_PORT}..." +echo "[polar_slurm] Model: ${MODEL_PATH}" +echo "[polar_slurm] TP size: ${TENSOR_PARALLEL_SIZE}" + +python -m vllm.entrypoints.openai.api_server \ + --model "${MODEL_PATH}" \ + --port "${VLLM_PORT}" \ + --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \ + --max-model-len "${MAX_MODEL_LEN}" \ + --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION:-0.90}" \ + --max-num-seqs "${MAX_NUM_SEQS:-64}" \ + --enable-auto-tool-choice \ + --tool-call-parser "${TOOL_CALL_PARSER:-hermes}" \ + --gdn-prefill-backend triton \ + --trust-remote-code \ + --host 0.0.0.0 \ + > "${JOB_DIR}/logs/vllm.log" 2>&1 & +PIDS+=($!) +echo "[polar_slurm] vLLM PID: ${PIDS[-1]}" + +wait_for_service "http://${NODE_0}:${VLLM_PORT}/health" "vLLM" 240 5 || exit 1 + +# ── Launch Rollout Service ───────────────────────────────────────────────────── +echo "[polar_slurm] Starting rollout service on ${NODE_0}:${ROLLOUT_PORT}..." + +polar serve_rollout -c "${TOPOLOGY_PATH}" \ + > "${JOB_DIR}/logs/rollout.log" 2>&1 & +PIDS+=($!) +echo "[polar_slurm] Rollout PID: ${PIDS[-1]}" + +wait_for_service "http://${NODE_0}:${ROLLOUT_PORT}/health" "Rollout" 30 2 || exit 1 + +# ── Launch Gateway Node(s) ───────────────────────────────────────────────────── +for ((i=0; i "${JOB_DIR}/logs/gateway_${NODE_ID}.log" 2>&1 & + PIDS+=($!) 
+ else + # Gateway on remote node via srun + srun --overlap --nodes=1 --ntasks=1 --nodelist="${GW_HOST}" \ + --output="${JOB_DIR}/logs/gateway_${NODE_ID}.log" \ + --error="${JOB_DIR}/logs/gateway_${NODE_ID}.err" \ + bash -c " + source '${TEMPLATE_DIR}/env.sh' + export POLAR_TOPOLOGY='${TOPOLOGY_PATH}' + export POLAR_GATEWAY_NODE_ID='${NODE_ID}' + python -m polar.gateway.server + " & + PIDS+=($!) + fi + echo "[polar_slurm] Gateway ${NODE_ID} PID: ${PIDS[-1]}" +done + +# Wait for all gateways +for ((i=0; i "${JOB_DIR}/logs/status.json" 2>&1 || true +polar status -c "${TOPOLOGY_PATH}" || true + +# ── Submit Tasks ─────────────────────────────────────────────────────────────── +echo "" +echo "[polar_slurm] ════════════════════════════════════════════════════════" +echo "[polar_slurm] Submitting tasks: EXAMPLE=${EXAMPLE}, HARNESS=${HARNESS}" +echo "[polar_slurm] ════════════════════════════════════════════════════════" + +if [ -n "${TASK_FILE:-}" ]; then + # Direct task file submission + echo "[polar_slurm] Submitting task file: ${TASK_FILE}" + polar submit "${TASK_FILE}" \ + -c "${TOPOLOGY_PATH}" \ + --json | tee "${JOB_DIR}/response.json" +else + # Use the cluster task builder module + INSTANCE_ID_ARGS="" + if [ -n "${INSTANCE_IDS:-}" ]; then + IFS=',' read -ra _IDS <<< "${INSTANCE_IDS}" + for _id in "${_IDS[@]}"; do + INSTANCE_ID_ARGS="${INSTANCE_ID_ARGS} --instance-id ${_id}" + done + fi + python -m polar.cluster.tasks \ + --example "${EXAMPLE}" \ + --harness "${HARNESS}" \ + --topology "${TOPOLOGY_PATH}" \ + --sif-dir "${POLAR_SIFS}" \ + --output-dir "${JOB_DIR}/tasks" \ + --num-rollouts "${NUM_ROLLOUTS}" \ + --timeout-seconds "${TIMEOUT_SECONDS}" \ + ${INSTANCE_ID_ARGS} +fi + +SUBMIT_EXIT=$? + +# ── Results ──────────────────────────────────────────────────────────────────── +echo "" +echo "[polar_slurm] ════════════════════════════════════════════════════════" +echo "[polar_slurm] Task submission finished (exit code: ${SUBMIT_EXIT})" +echo "[polar_slurm] Results directory: ${JOB_DIR}" +echo "[polar_slurm] Rollout results: ${SAVE_DIR}" +echo "[polar_slurm] ════════════════════════════════════════════════════════" + +# Final status check +polar status -c "${TOPOLOGY_PATH}" || true + +echo "[polar_slurm] Job complete." +exit ${SUBMIT_EXIT} diff --git a/src/polar/cluster/templates/polar_slurm_serve.sbatch b/src/polar/cluster/templates/polar_slurm_serve.sbatch new file mode 100644 index 00000000..d16520db --- /dev/null +++ b/src/polar/cluster/templates/polar_slurm_serve.sbatch @@ -0,0 +1,202 @@ +#!/bin/bash +# ──────────────────────────────────────────────────────────────────────────────── +# Polar SLURM Serve-Only Job Script +# +# Starts: vLLM server → Rollout service → Gateway node(s), then waits. +# Tasks are submitted separately via 'polar cluster submit-task'. +# +# All SBATCH directives are passed via sbatch CLI flags from 'polar cluster serve'. +# No hardcoded account, partition, or resource values. +# +# Required env vars (pass via --export): +# POLAR_CODE - Path to ProRL-Agent-Server on cluster +# POLAR_WORKSPACE - Base workspace path on cluster +# +# Optional env vars: +# MODEL_NAME - Model to serve (default from env.sh) +# MODEL_PATH - Model path/name for vLLM (default: $MODEL_NAME) +# TENSOR_PARALLEL_SIZE - TP size for vLLM (default: 8) +# ──────────────────────────────────────────────────────────────────────────────── +set -euo pipefail + +if [ -z "${POLAR_CODE:-}" ]; then + echo "FATAL: POLAR_CODE not set. Use 'polar cluster serve -c cluster.yaml'." 
>&2 + exit 1 +fi +if [ -z "${POLAR_WORKSPACE:-}" ]; then + echo "FATAL: POLAR_WORKSPACE not set. Use 'polar cluster serve -c cluster.yaml'." >&2 + exit 1 +fi + +# Source environment setup from the cluster templates +TEMPLATE_DIR="${POLAR_CODE}/src/polar/cluster/templates" +source "${TEMPLATE_DIR}/env.sh" + +# ── Job directory ────────────────────────────────────────────────────────────── +JOB_DIR="${POLAR_RESULTS}/${SLURM_JOB_NAME}_${SLURM_JOB_ID}" +mkdir -p "${JOB_DIR}/logs" +echo "[polar_serve] Job directory: ${JOB_DIR}" +echo "[polar_serve] Job ID: ${SLURM_JOB_ID}" +echo "[polar_serve] Nodes: ${SLURM_JOB_NODELIST}" +echo "[polar_serve] GPUs per node: ${SLURM_GPUS_ON_NODE:-unknown}" + +# ── Discover hostnames ───────────────────────────────────────────────────────── +HOSTNAMES=($(scontrol show hostnames "${SLURM_JOB_NODELIST}")) +NODE_0="${HOSTNAMES[0]}" +NUM_NODES="${#HOSTNAMES[@]}" +echo "[polar_serve] Allocated ${NUM_NODES} node(s): ${HOSTNAMES[*]}" + +# ── Save results directory ───────────────────────────────────────────────────── +export SAVE_DIR="${JOB_DIR}/rollout_results" +mkdir -p "${SAVE_DIR}" + +# ── Generate topology.yaml ───────────────────────────────────────────────────── +TOPOLOGY_PATH="${JOB_DIR}/topology.yaml" +python -m polar.cluster.topology \ + --output "${TOPOLOGY_PATH}" \ + --save-dir "${SAVE_DIR}" + +export POLAR_TOPOLOGY="${TOPOLOGY_PATH}" +echo "[polar_serve] Generated topology: ${TOPOLOGY_PATH}" +cat "${TOPOLOGY_PATH}" + +# ── Kill stale processes on ports from previous runs ───────────────────────── +echo "[polar_serve] Cleaning stale processes on ports..." +for port in "${VLLM_PORT}" "${ROLLOUT_PORT}"; do + stale_pid=$(lsof -ti ":${port}" 2>/dev/null || true) + if [ -n "${stale_pid}" ]; then + echo "[polar_serve] Killing stale PID ${stale_pid} on port ${port}" + kill -9 ${stale_pid} 2>/dev/null || true + sleep 1 + fi +done +for ((i=0; i/dev/null || true) + if [ -n "${stale_pid}" ]; then + echo "[polar_serve] Killing stale PID ${stale_pid} on port ${gw_port}" + kill -9 ${stale_pid} 2>/dev/null || true + sleep 1 + fi +done + +# ── PID tracking for cleanup ─────────────────────────────────────────────────── +PIDS=() +cleanup() { + echo "[polar_serve] Cleaning up..." + for pid in "${PIDS[@]}"; do + if kill -0 "${pid}" 2>/dev/null; then + echo "[polar_serve] Killing PID ${pid}" + kill "${pid}" 2>/dev/null || true + fi + done + wait 2>/dev/null || true + echo "[polar_serve] Cleanup complete." +} +trap cleanup EXIT INT TERM + +# ── Source readiness poller ──────────────────────────────────────────────────── +source "${TEMPLATE_DIR}/wait_for_service.sh" + +# ── Launch vLLM ──────────────────────────────────────────────────────────────── +echo "[polar_serve] Starting vLLM server on ${NODE_0}:${VLLM_PORT}..." +echo "[polar_serve] Model: ${MODEL_PATH}" +echo "[polar_serve] TP size: ${TENSOR_PARALLEL_SIZE}" + +python -m vllm.entrypoints.openai.api_server \ + --model "${MODEL_PATH}" \ + --port "${VLLM_PORT}" \ + --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \ + --max-model-len "${MAX_MODEL_LEN}" \ + --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION:-0.90}" \ + --max-num-seqs "${MAX_NUM_SEQS:-64}" \ + --enable-auto-tool-choice \ + --tool-call-parser "${TOOL_CALL_PARSER:-hermes}" \ + --gdn-prefill-backend triton \ + --trust-remote-code \ + --host 0.0.0.0 \ + > "${JOB_DIR}/logs/vllm.log" 2>&1 & +PIDS+=($!) 
+echo "[polar_serve] vLLM PID: ${PIDS[-1]}" + +wait_for_service "http://${NODE_0}:${VLLM_PORT}/health" "vLLM" 240 5 || exit 1 + +# ── Launch Rollout Service ───────────────────────────────────────────────────── +echo "[polar_serve] Starting rollout service on ${NODE_0}:${ROLLOUT_PORT}..." + +polar serve_rollout -c "${TOPOLOGY_PATH}" \ + > "${JOB_DIR}/logs/rollout.log" 2>&1 & +PIDS+=($!) +echo "[polar_serve] Rollout PID: ${PIDS[-1]}" + +wait_for_service "http://${NODE_0}:${ROLLOUT_PORT}/health" "Rollout" 30 2 || exit 1 + +# ── Launch Gateway Node(s) ───────────────────────────────────────────────────── +for ((i=0; i "${JOB_DIR}/logs/gateway_${NODE_ID}.log" 2>&1 & + PIDS+=($!) + else + srun --overlap --nodes=1 --ntasks=1 --nodelist="${GW_HOST}" \ + --output="${JOB_DIR}/logs/gateway_${NODE_ID}.log" \ + --error="${JOB_DIR}/logs/gateway_${NODE_ID}.err" \ + bash -c " + source '${TEMPLATE_DIR}/env.sh' + export POLAR_TOPOLOGY='${TOPOLOGY_PATH}' + export POLAR_GATEWAY_NODE_ID='${NODE_ID}' + python -m polar.gateway.server + " & + PIDS+=($!) + fi + echo "[polar_serve] Gateway ${NODE_ID} PID: ${PIDS[-1]}" +done + +# Wait for all gateways +for ((i=0; i "${JOB_DIR}/logs/status.json" 2>&1 || true +polar status -c "${TOPOLOGY_PATH}" || true + +# Write sentinel file so 'polar cluster serve' knows services are ready +cat > "${JOB_DIR}/.services_ready" < --job-id ${SLURM_JOB_ID} --example calculator --harness opencode" +echo "[polar_serve] ════════════════════════════════════════════════════════" + +# Wait indefinitely — services stay alive until SLURM time limit or scancel +while true; do + sleep 60 + # Periodic health check + if ! kill -0 "${PIDS[0]}" 2>/dev/null; then + echo "[polar_serve] vLLM process died. Exiting." + exit 1 + fi +done diff --git a/src/polar/cluster/templates/polar_slurm_train.sbatch b/src/polar/cluster/templates/polar_slurm_train.sbatch new file mode 100644 index 00000000..68f64c96 --- /dev/null +++ b/src/polar/cluster/templates/polar_slurm_train.sbatch @@ -0,0 +1,360 @@ +#!/bin/bash +# ──────────────────────────────────────────────────────────────────────────────── +# Polar SLURM Training Job Script +# +# Orchestrates distributed RL training: Polar services (rollout + gateway) for +# agent session management, and Slime/Ray/SGLang/Megatron for GRPO training. +# +# GPU layout (single trainer node, 8 GPUs): +# GPU 0..ROLLOUT_GPUS-1 – SGLang inference (Slime/Ray-managed, weight-synced) +# GPU ROLLOUT_GPUS..7 – Megatron GRPO training (TP=TP_SIZE, DP=auto) +# +# Polar services (CPU-only) run on the host using POLAR_VENV. +# Training commands run inside the training SIF via apptainer exec. +# +# Required env vars (pass via --export): +# POLAR_CODE – Path to ProRL-Agent-Server on cluster +# POLAR_WORKSPACE – Base workspace path on cluster +# POLAR_CONFIG_PATH – Path to polar_config.yaml (bridge config) +# PROMPT_DATA – Path to JSONL training data +# HF_CHECKPOINT – HuggingFace model checkpoint name +# TRAIN_SIF – Path to training SIF image +# ──────────────────────────────────────────────────────────────────────────────── +set -euo pipefail + +if [ -z "${POLAR_CODE:-}" ]; then + echo "FATAL: POLAR_CODE not set." >&2; exit 1 +fi +if [ -z "${POLAR_WORKSPACE:-}" ]; then + echo "FATAL: POLAR_WORKSPACE not set." 
>&2; exit 1 +fi + +# Source environment setup +TEMPLATE_DIR="${POLAR_CODE}/src/polar/cluster/templates" +source "${TEMPLATE_DIR}/env.sh" + +# ── Defaults ────────────────────────────────────────────────────────────────── +ACTOR_GPUS="${ACTOR_GPUS:-4}" +ROLLOUT_GPUS="${ROLLOUT_GPUS:-4}" +TP_SIZE="${TP_SIZE:-2}" +SGLANG_ROUTER_PORT="${SGLANG_ROUTER_PORT:-9000}" +RAY_PORT="${RAY_PORT:-6379}" +RAY_DASHBOARD_PORT="${RAY_DASHBOARD_PORT:-8265}" +TRAIN_NUM_ROLLOUTS="${TRAIN_NUM_ROLLOUTS:-5}" +ROLLOUT_BATCH_SIZE="${ROLLOUT_BATCH_SIZE:-2}" +N_SAMPLES_PER_PROMPT="${N_SAMPLES_PER_PROMPT:-16}" +GLOBAL_BATCH_SIZE="${GLOBAL_BATCH_SIZE:-32}" +TOTAL_GPUS=$((ACTOR_GPUS + ROLLOUT_GPUS)) + +# Training SIF — must be pre-built via 'polar cluster build-sif --example train' +TRAIN_SIF="${TRAIN_SIF:-${POLAR_SIFS}/train-slime-grpo.sif}" +if [ ! -f "${TRAIN_SIF}" ]; then + echo "FATAL: Training SIF not found: ${TRAIN_SIF}" >&2 + echo " Build with: polar cluster build-sif -c cluster.yaml --example train" >&2 + exit 1 +fi + +# HF cache on lustre (shared, persistent across jobs) +HF_HOME="${POLAR_WORKSPACE}/hf_cache" +mkdir -p "${HF_HOME}" + +# Apptainer exec prefix for GPU commands. +# PYTHONNOUSERSITE prevents user-installed packages (e.g. torch in ~/.local) +# from shadowing the NGC container's system packages. +# LD_LIBRARY_PATH includes CUDA compat for forward-compatibility with older drivers. +# HF_HOME on lustre so the container can access downloaded models. +APPTAINER_EXEC="apptainer exec --nv --writable-tmpfs --no-home \ + --bind ${POLAR_WORKSPACE}:${POLAR_WORKSPACE} \ + --env PYTHONNOUSERSITE=1 \ + --env LD_LIBRARY_PATH=/usr/local/cuda/compat:\${LD_LIBRARY_PATH:-} \ + --env HF_HOME=${HF_HOME} \ + --env HF_HUB_OFFLINE=0 \ + ${TRAIN_SIF}" + +# ── Job directory ───────────────────────────────────────────────────────────── +JOB_DIR="${POLAR_RESULTS}/${SLURM_JOB_NAME}_${SLURM_JOB_ID}" +mkdir -p "${JOB_DIR}/logs" +echo "[polar_train] Job directory: ${JOB_DIR}" +echo "[polar_train] Job ID: ${SLURM_JOB_ID}" +echo "[polar_train] Nodes: ${SLURM_JOB_NODELIST}" +echo "[polar_train] Training SIF: ${TRAIN_SIF}" + +# ── Discover hostnames ──────────────────────────────────────────────────────── +HOSTNAMES=($(scontrol show hostnames "${SLURM_JOB_NODELIST}")) +TRAINER_NODE="${HOSTNAMES[0]}" +NUM_NODES="${#HOSTNAMES[@]}" +echo "[polar_train] Trainer node: ${TRAINER_NODE}" +echo "[polar_train] Allocated ${NUM_NODES} node(s): ${HOSTNAMES[*]}" + +# ── Derived paths ───────────────────────────────────────────────────────────── +HF_CKPT_BASENAME="${HF_CHECKPOINT##*/}" +TORCH_DIST_DIR="${TORCH_DIST_DIR:-${POLAR_WORKSPACE}/checkpoints/${HF_CKPT_BASENAME}_torch_dist}" +SAVE_DIR="${TRAIN_SAVE_DIR:-${JOB_DIR}/checkpoints}" +mkdir -p "${SAVE_DIR}" +export SAVE_DIR + +ROLLOUT_SAVE_DIR="${JOB_DIR}/rollout_results" +mkdir -p "${ROLLOUT_SAVE_DIR}" + +# ── Generate topology.yaml ──────────────────────────────────────────────────── +# Points gateway at SGLang router (on trainer node), NOT at vLLM +TOPOLOGY_PATH="${JOB_DIR}/topology.yaml" +SGLANG_URL="http://${TRAINER_NODE}:${SGLANG_ROUTER_PORT}" + +python -m polar.cluster.topology \ + --output "${TOPOLOGY_PATH}" \ + --save-dir "${ROLLOUT_SAVE_DIR}" \ + --sglang-base-url "${SGLANG_URL}" + +export POLAR_TOPOLOGY="${TOPOLOGY_PATH}" +echo "[polar_train] Generated topology: ${TOPOLOGY_PATH}" +cat "${TOPOLOGY_PATH}" + +# ── Patch polar_config.yaml with actual rollout URL ─────────────────────────── +PATCHED_POLAR_CONFIG="${JOB_DIR}/polar_config.yaml" 
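+# The sed below rewrites the bridge config's rollout endpoint, e.g.
+#   polar_rollout_url: "http://localhost:18080"
+# becomes (host/port illustrative; the real values come from the job env)
+#   polar_rollout_url: "http://<trainer-node>:18080"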
+POLAR_ROLLOUT_URL="http://${TRAINER_NODE}:${ROLLOUT_PORT}" +sed "s|polar_rollout_url:.*|polar_rollout_url: \"${POLAR_ROLLOUT_URL}\"|" \ + "${POLAR_CONFIG_PATH}" > "${PATCHED_POLAR_CONFIG}" +echo "[polar_train] Patched polar_config.yaml: polar_rollout_url → ${POLAR_ROLLOUT_URL}" + +# ── PID tracking for cleanup ────────────────────────────────────────────────── +PIDS=() +cleanup() { + echo "[polar_train] Cleaning up..." + for pid in "${PIDS[@]}"; do + if kill -0 "${pid}" 2>/dev/null; then + echo "[polar_train] Killing PID ${pid}" + kill "${pid}" 2>/dev/null || true + fi + done + wait 2>/dev/null || true + echo "[polar_train] Cleanup complete." +} +trap cleanup EXIT INT TERM + +source "${TEMPLATE_DIR}/wait_for_service.sh" + +# ── Step 1: Start Polar services (CPU, on host using POLAR_VENV) ────────────── +echo "[polar_train] Starting Polar rollout server on ${TRAINER_NODE}:${ROLLOUT_PORT}..." +polar serve_rollout -c "${TOPOLOGY_PATH}" \ + > "${JOB_DIR}/logs/rollout.log" 2>&1 & +PIDS+=($!) + +wait_for_service "http://${TRAINER_NODE}:${ROLLOUT_PORT}/health" "Rollout" 30 2 || exit 1 + +# Start gateway(s) on all nodes +for ((i=0; i "${JOB_DIR}/logs/gateway_${NODE_ID}.log" 2>&1 & + PIDS+=($!) + else + srun --overlap --nodes=1 --ntasks=1 --nodelist="${GW_HOST}" \ + --output="${JOB_DIR}/logs/gateway_${NODE_ID}.log" \ + --error="${JOB_DIR}/logs/gateway_${NODE_ID}.err" \ + bash -c " + source '${TEMPLATE_DIR}/env.sh' + export POLAR_TOPOLOGY='${TOPOLOGY_PATH}' + export POLAR_GATEWAY_NODE_ID='${NODE_ID}' + python -m polar.gateway.server + " & + PIDS+=($!) + fi +done + +for ((i=0; i ${HF_LOCAL_DIR}" + HF_HUB_OFFLINE=0 HF_HOME="${HF_HOME}" python -c " +from huggingface_hub import snapshot_download +snapshot_download('${HF_CHECKPOINT}', local_dir='${HF_LOCAL_DIR}') +" + echo "[polar_train] Download complete." + else + echo "[polar_train] HF model already cached at ${HF_LOCAL_DIR}" + fi + HF_CHECKPOINT="${HF_LOCAL_DIR}" +fi + +if [ ! -d "${TORCH_DIST_DIR}/release" ]; then + echo "[polar_train] Converting HF weights: ${HF_CHECKPOINT} -> ${TORCH_DIST_DIR}" + mkdir -p "${TORCH_DIST_DIR}" + ${APPTAINER_EXEC} bash -c " + export CUDA_DEVICE_MAX_CONNECTIONS=1 + export PYTHONPATH='/opt/polar/src:/root/Megatron-LM:\${PYTHONPATH:-}' + torchrun --nproc_per_node 1 \ + /root/slime/tools/convert_hf_to_torch_dist.py \ + ${MODEL_ARGS} \ + --hf-checkpoint '${HF_CHECKPOINT}' \ + --save '${TORCH_DIST_DIR}' \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --context-parallel-size 1 \ + --expert-model-parallel-size 1 \ + --expert-tensor-parallel-size 1 + " > "${JOB_DIR}/logs/convert_weights.log" 2>&1 + echo "[polar_train] Weight conversion complete." +else + echo "[polar_train] Weights already converted at ${TORCH_DIST_DIR}" +fi + +# ── Step 3+4: Start Ray and launch training (single container session) ──────── +# Ray start daemonizes processes in the container's PID/network namespace. +# ray job submit must run in the SAME apptainer exec invocation so it can +# reach the Ray dashboard via localhost. Separate invocations create isolated +# namespaces, causing DNS/socket errors (OSError 107, Transport endpoint). +echo "[polar_train] Starting Ray + training (single container session)..." 
+echo "[polar_train] Steps: ${TRAIN_NUM_ROLLOUTS}" +echo "[polar_train] Batch: ${ROLLOUT_BATCH_SIZE} prompts x ${N_SAMPLES_PER_PROMPT} samples" +echo "[polar_train] Global batch size: ${GLOBAL_BATCH_SIZE}" +echo "[polar_train] Actor GPUs: ${ACTOR_GPUS} (TP=${TP_SIZE}), Rollout GPUs: ${ROLLOUT_GPUS}" + +RUNTIME_ENV_JSON="{ + \"env_vars\": { + \"PYTHONPATH\": \"/opt/polar/src:/root/Megatron-LM\", + \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\" + } +}" + +${APPTAINER_EXEC} bash -c " + set -euo pipefail + + # Verify critical training dependencies are importable + echo '[polar_train] Verifying runtime dependencies...' + python3 -c 'import sglang; import slime; import megatron; import mbridge; print(\"All training deps OK\")' || { + echo '[polar_train] ERROR: Missing training dependencies in SIF. Rebuild with: polar cluster build-sif --example train --force' >&2 + exit 1 + } + + # Clean up any stale Ray processes + ray stop --force 2>/dev/null || true + sleep 2 + + # Start Ray head node (daemonizes) + ray start --head \ + --node-ip-address '${TRAINER_NODE}' \ + --port '${RAY_PORT}' \ + --dashboard-host 0.0.0.0 \ + --dashboard-port '${RAY_DASHBOARD_PORT}' \ + --num-gpus '${TOTAL_GPUS}' \ + --disable-usage-stats + + echo '[polar_train] Ray cluster started. Dashboard: http://${TRAINER_NODE}:${RAY_DASHBOARD_PORT}' + + # Wait for Ray dashboard to be ready + for i in \$(seq 1 30); do + if curl -sf http://127.0.0.1:${RAY_DASHBOARD_PORT}/api/version >/dev/null 2>&1; then + echo '[polar_train] Ray dashboard is ready.' + break + fi + echo '[polar_train] Waiting for Ray dashboard... ('\$i'/30)' + sleep 2 + done + + # Submit training job (uses localhost since we're in the same namespace) + export PYTHONPATH='/opt/polar/src:/root/Megatron-LM:\${PYTHONPATH:-}' + export CUDA_DEVICE_MAX_CONNECTIONS=1 + + ray job submit \ + --address='http://127.0.0.1:${RAY_DASHBOARD_PORT}' \ + --runtime-env-json='${RUNTIME_ENV_JSON}' \ + -- python3 /root/slime/train_async.py \ + --actor-num-nodes 1 \ + --actor-num-gpus-per-node '${ACTOR_GPUS}' \ + --rollout-num-gpus '${ROLLOUT_GPUS}' \ + --rollout-num-gpus-per-engine 1 \ + ${MODEL_ARGS} \ + --hf-checkpoint '${HF_CHECKPOINT}' \ + --ref-load '${TORCH_DIST_DIR}' \ + --load '${SAVE_DIR}' \ + --save '${SAVE_DIR}' \ + --save-interval 5 \ + --update-weights-interval 1 \ + --rollout-function-path polar.slime.rollout.generate_rollout_polar_async \ + --custom-rm-path polar.slime.reward.reward_func \ + --custom-config-path '${PATCHED_POLAR_CONFIG}' \ + --prompt-data '${PROMPT_DATA}' \ + --input-key prompt \ + --label-key label \ + --metadata-key metadata \ + --rollout-shuffle \ + --reward-key score \ + --num-rollout '${TRAIN_NUM_ROLLOUTS}' \ + --rollout-batch-size '${ROLLOUT_BATCH_SIZE}' \ + --n-samples-per-prompt '${N_SAMPLES_PER_PROMPT}' \ + --rollout-max-response-len 8192 \ + --rollout-max-prompt-len 4096 \ + --global-batch-size '${GLOBAL_BATCH_SIZE}' \ + --rollout-global-dataset \ + --disable-rollout-trim-samples \ + --tensor-model-parallel-size '${TP_SIZE}' \ + --sequence-parallel \ + --pipeline-model-parallel-size 1 \ + --context-parallel-size 1 \ + --expert-model-parallel-size 1 \ + --expert-tensor-parallel-size 1 \ + --recompute-granularity full \ + --recompute-method uniform \ + --recompute-num-layers 1 \ + --use-dynamic-batch-size \ + --max-tokens-per-gpu 8192 \ + --advantage-estimator '${ADVANTAGE_ESTIMATOR:-grpo}' \ + --use-rollout-logprobs \ + --use-kl-loss \ + --kl-loss-coef 0.001 \ + --kl-loss-type low_var_kl \ + --entropy-coef 0.0 \ + --eps-clip 0.2 \ + --eps-clip-high 0.28 \ + 
--optimizer adam \ + --lr 1e-6 \ + --lr-decay-style constant \ + --weight-decay 0.1 \ + --adam-beta1 0.9 \ + --adam-beta2 0.98 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --accumulate-allreduce-grads-in-fp32 \ + --attention-softmax-in-fp32 \ + --attention-backend flash \ + --no-gradient-accumulation-fusion \ + ${WANDB_PROJECT:+--use-wandb --wandb-project '${WANDB_PROJECT}'} \ + ${WANDB_EXP_NAME:+--wandb-exp-name '${WANDB_EXP_NAME}'} \ + ${WANDB_GROUP:+--wandb-group '${WANDB_GROUP}'} \ + --sglang-router-port '${SGLANG_ROUTER_PORT}' \ + --sglang-disable-cuda-graph \ + ${EXTRA_TRAIN_ARGS:-} +" 2>&1 | tee "${JOB_DIR}/logs/training.log" + +TRAIN_EXIT=${PIPESTATUS[0]} + +# ── Results ─────────────────────────────────────────────────────────────────── +echo "" +echo "[polar_train] ════════════════════════════════════════════════════════" +echo "[polar_train] Training finished (exit code: ${TRAIN_EXIT})" +echo "[polar_train] Checkpoints: ${SAVE_DIR}" +echo "[polar_train] Rollout results: ${ROLLOUT_SAVE_DIR}" +echo "[polar_train] Logs: ${JOB_DIR}/logs/" +echo "[polar_train] ════════════════════════════════════════════════════════" + +exit ${TRAIN_EXIT} diff --git a/src/polar/cluster/templates/wait_for_service.sh b/src/polar/cluster/templates/wait_for_service.sh new file mode 100644 index 00000000..6e14cbd6 --- /dev/null +++ b/src/polar/cluster/templates/wait_for_service.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# wait_for_service.sh — Poll an HTTP endpoint until it responds 2xx. +# +# Usage: +# source wait_for_service.sh +# wait_for_service "http://host:8000/health" "vLLM" 120 5 + +wait_for_service() { + local url="$1" + local name="${2:-service}" + local max_attempts="${3:-60}" + local interval="${4:-5}" + + echo "[wait] Waiting for ${name} at ${url} (max ${max_attempts} attempts, ${interval}s interval)..." + for ((i=1; i<=max_attempts; i++)); do + if curl -sf --max-time 5 "${url}" > /dev/null 2>&1; then + echo "[wait] ${name} is ready at ${url} (attempt ${i}/${max_attempts})" + return 0 + fi + if (( i % 10 == 0 )); then + echo "[wait] Still waiting for ${name}... (${i}/${max_attempts})" + fi + sleep "${interval}" + done + echo "[wait] FATAL: ${name} did not become ready at ${url} after ${max_attempts} attempts" + return 1 +} + +# Allow direct invocation +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + wait_for_service "$@" +fi diff --git a/src/polar/cluster/topology.py b/src/polar/cluster/topology.py new file mode 100644 index 00000000..602a2f50 --- /dev/null +++ b/src/polar/cluster/topology.py @@ -0,0 +1,195 @@ +"""Generate topology.yaml for a Polar SLURM job. + +Called inside the SLURM job after hostname discovery. Reads +``SLURM_JOB_NODELIST`` and environment variables to produce a complete +``topology.yaml`` consumed by ``polar serve_rollout`` / ``polar serve_gateway``. 
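+
+The output has two top-level sections, ``rollout`` and ``gateway`` (hostnames
+and ports below are illustrative)::
+
+    rollout:
+      public_url: http://<node-0>:<rollout-port>
+      save_dir: ./rollout_results
+    gateway:
+      rollout_server_url: http://<node-0>:<rollout-port>
+      nodes:
+        - id: node-00
+          public_url: http://<node-0>:<gateway-port>
+          vllm:
+            base_url: http://<node-0>:<vllm-port>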
+ +Usage (from sbatch script):: + + python -m polar.cluster.topology --output /path/to/topology.yaml +""" + +from __future__ import annotations + +import argparse +import os +import socket +import subprocess +import sys +from pathlib import Path + +import yaml + + +def discover_hostnames() -> list[str]: + """Return the SLURM-allocated hostnames, or ``[localhost]`` for local testing.""" + nodelist = os.environ.get("SLURM_JOB_NODELIST") + if not nodelist: + hostname = socket.gethostname() + print(f"[topology] No SLURM_JOB_NODELIST; using local hostname: {hostname}") + return [hostname] + + result = subprocess.run( + ["scontrol", "show", "hostnames", nodelist], + capture_output=True, + text=True, + check=True, + ) + hostnames = [h.strip() for h in result.stdout.strip().split("\n") if h.strip()] + if not hostnames: + raise RuntimeError(f"scontrol returned no hostnames for {nodelist}") + print(f"[topology] Discovered {len(hostnames)} node(s): {hostnames}") + return hostnames + + +def build_topology( + hostnames: list[str], + *, + vllm_port: int | None = None, + sglang_base_url: str | None = None, + rollout_port: int | None = None, + gateway_base_port: int | None = None, + model_name: str | None = None, + default_sif: str | None = None, + max_init_workers: int | None = None, + max_run_workers: int | None = None, + max_postrun_workers: int | None = None, + ready_buffer_target: int | None = None, + save_dir: str | None = None, + vllm_timeout: int | None = None, +) -> dict: + """Build the topology dict from discovered hostnames and configuration. + + Each parameter falls back to the corresponding environment variable and + then to a hardcoded default — matching the contract of the old shell-based + ``generate_topology.py``. + """ + _vllm_port = vllm_port or int(os.environ.get("VLLM_PORT", "8000")) + _rollout_port = rollout_port or int(os.environ.get("ROLLOUT_PORT", "8080")) + _gw_base = gateway_base_port or int(os.environ.get("GATEWAY_BASE_PORT", "8100")) + _model = model_name or os.environ.get("MODEL_NAME", "Qwen/Qwen3.5-27B") + _sif = default_sif or os.environ.get("DEFAULT_SIF_IMAGE", "") + _init = max_init_workers or int(os.environ.get("MAX_INIT_WORKERS", "8")) + _run = max_run_workers or int(os.environ.get("MAX_RUN_WORKERS", "4")) + _post = max_postrun_workers or int(os.environ.get("MAX_POSTRUN_WORKERS", "4")) + _buf = ready_buffer_target or int(os.environ.get("READY_BUFFER_TARGET", "4")) + _save = save_dir or os.environ.get("SAVE_DIR", "./rollout_results") + _timeout = vllm_timeout or int(os.environ.get("VLLM_TIMEOUT", "300")) + + vllm_host = hostnames[0] + rollout_host = hostnames[0] + + gateway_nodes = [] + for i, hostname in enumerate(hostnames): + port = _gw_base + i + node: dict = { + "id": f"node-{i:02d}", + "host": "0.0.0.0", + "port": port, + "public_url": f"http://{hostname}:{port}", + "max_init_workers": _init, + "max_run_workers": _run, + "max_postrun_workers": _post, + "ready_buffer_target": _buf, + "model_served": _model, + } + if sglang_base_url: + node["sglang"] = { + "base_url": sglang_base_url, + "timeout": _timeout, + } + else: + node["vllm"] = { + "base_url": f"http://{vllm_host}:{_vllm_port}", + "timeout": _timeout, + } + if _sif: + runtime_cfg: dict = { + "backend": "apptainer", + "image": _sif, + "network": "host", + } + # swe_agent's swerex needs chown inside the container, which + # requires fakeroot in Apptainer. 
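+            # Opt in by exporting RUNTIME_FAKEROOT=1 (or "true"/"yes") in the
+            # job environment; it stays off by default.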
+ if os.environ.get("RUNTIME_FAKEROOT", "").lower() in ("1", "true", "yes"): + runtime_cfg["kwargs"] = {"fakeroot": True} + node["default_runtime"] = runtime_cfg + gateway_nodes.append(node) + + return { + "rollout": { + "host": "0.0.0.0", + "port": _rollout_port, + "public_url": f"http://{rollout_host}:{_rollout_port}", + "save_dir": _save, + "dispatch_poll_interval_seconds": 1.0, + "callback_grace_seconds": 10.0, + }, + "gateway": { + "heartbeat_interval_seconds": 15, + "rollout_server_url": f"http://{rollout_host}:{_rollout_port}", + "nodes": gateway_nodes, + }, + } + + +def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate Polar topology.yaml for SLURM jobs", + ) + parser.add_argument( + "-o", "--output", + default=None, + help="Output path for topology.yaml (default: stdout)", + ) + parser.add_argument( + "--default-sif", + default=None, + help="Default SIF image path for agent containers", + ) + parser.add_argument( + "--save-dir", + default=None, + help="Rollout results save directory", + ) + parser.add_argument( + "--sglang-base-url", + default=None, + help="SGLang router URL (use sglang instead of vllm backend)", + ) + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = _parse_args(argv) + hostnames = discover_hostnames() + topology = build_topology( + hostnames, + default_sif=args.default_sif or None, + save_dir=args.save_dir or None, + sglang_base_url=args.sglang_base_url or None, + ) + output = yaml.dump(topology, default_flow_style=False, sort_keys=False) + + if args.output: + path = Path(args.output) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(output) + print(f"[topology] Wrote topology to {args.output}") + else: + print(output) + + nodes = topology["gateway"]["nodes"] + print("[topology] Summary:") + print(f" Rollout: {topology['rollout']['public_url']}") + if "vllm" in nodes[0]: + print(f" vLLM: {nodes[0]['vllm']['base_url']}") + elif "sglang" in nodes[0]: + print(f" SGLang: {nodes[0]['sglang']['base_url']}") + for node in nodes: + print(f" Gateway: {node['id']} @ {node['public_url']}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/polar/config/topology.py b/src/polar/config/topology.py index a46d11ff..c04dde29 100644 --- a/src/polar/config/topology.py +++ b/src/polar/config/topology.py @@ -161,7 +161,10 @@ def _parse_gateway(raw: Any, rollout_public_url: str) -> GatewayConfig: if public_url_raw is not None else _default_public_url(host, port) ) - sglang = _require_mapping(node.get("sglang"), f"gateway.nodes[{index}].sglang") + sglang = _require_mapping( + node.get("vllm") or node.get("sglang"), + f"gateway.nodes[{index}].vllm", + ) default_runtime_raw = node.get("default_runtime") default_runtime = None if default_runtime_raw is not None: diff --git a/src/polar/gateway/node.py b/src/polar/gateway/node.py index cb57e767..11959d60 100644 --- a/src/polar/gateway/node.py +++ b/src/polar/gateway/node.py @@ -313,6 +313,8 @@ async def _run_exec_inputs( log_dir = managed.session_dir / "logs" / "agent" log_dir.mkdir(parents=True, exist_ok=True) + last_stdout: str | None = None + last_stderr: str | None = None for i, step in enumerate(steps): if managed.cancel_requested: return AgentRunResult( @@ -325,28 +327,43 @@ async def _run_exec_inputs( env=merged_env, timeout_sec=self._remaining_budget(managed), ) + last_stdout = result.stdout + last_stderr = result.stderr self._write_exec_log( log_dir, f"step.{i:02d}", 
result.stdout, result.stderr ) + logger.info( + "Step %d for session %s: rc=%s stdout_tail=%s", + i, + managed.request.session_id, + result.return_code, + (result.stdout or "")[-500:], + ) if result.return_code == -1: return AgentRunResult( status="timeout", return_code=-1, error=f"step {i} timed out", - metadata=self._step_metadata(log_dir, i, managed), + metadata=self._step_metadata( + log_dir, i, managed, last_stdout, last_stderr + ), ) if result.return_code != 0: return AgentRunResult( status="failed", return_code=result.return_code, error=f"step {i} exited with code {result.return_code}", - metadata=self._step_metadata(log_dir, i, managed), + metadata=self._step_metadata( + log_dir, i, managed, last_stdout, last_stderr + ), ) return AgentRunResult( status="completed", return_code=0, - metadata=self._step_metadata(log_dir, len(steps) - 1, managed), + metadata=self._step_metadata( + log_dir, len(steps) - 1, managed, last_stdout, last_stderr + ), ) # ------------------------------------------------------------------ @@ -751,12 +768,24 @@ def _write_exec_log( (log_dir / f"{prefix}.stderr.log").write_text(stderr) @staticmethod - def _step_metadata(log_dir: Path, step_index: int, managed: ManagedSession) -> dict: - return { + def _step_metadata( + log_dir: Path, + step_index: int, + managed: ManagedSession, + last_stdout: str | None = None, + last_stderr: str | None = None, + ) -> dict: + meta: dict = { "log_dir": str(log_dir), "last_step": step_index, "cwd": str(managed.session_dir), } + # Include truncated output tails so they survive session dir cleanup + if last_stdout: + meta["stdout_tail"] = last_stdout[-4000:] + if last_stderr: + meta["stderr_tail"] = last_stderr[-4000:] + return meta def _error_result( self, diff --git a/src/polar/gateway/server.py b/src/polar/gateway/server.py index 215277b5..aad9d445 100644 --- a/src/polar/gateway/server.py +++ b/src/polar/gateway/server.py @@ -72,6 +72,93 @@ class GatewayState: _configured_topology_path: str | None = None _configured_node_id: str | None = None +# Cached max_model_len from the backend model (populated lazily). +_max_model_len: int | None = None +_DEFAULT_MAX_OUTPUT_TOKENS = 4096 # Sensible default if model info unavailable + + +async def _fetch_max_model_len(base_url: str) -> int | None: + """Query backend for max_model_len via /v1/models.""" + try: + import httpx + + async with httpx.AsyncClient(base_url=base_url, timeout=10.0) as client: + resp = await client.get("/v1/models") + if resp.is_success: + data = resp.json().get("data", []) + if data: + return data[0].get("max_model_len") + except Exception: + pass + return None + + +def _clamp_max_tokens(request: dict[str, Any]) -> None: + """Reduce max_tokens / max_completion_tokens so it leaves room for input. + + Reserves at least 25% of the context window (min 2048 tokens) for the + input prompt. This avoids the common failure where + ``max_tokens == max_model_len`` leaves zero tokens for the prompt. + + Handles both ``max_tokens`` (legacy) and ``max_completion_tokens`` + (modern OpenAI API used by litellm/openhands). 
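+
+    For example, with ``max_model_len == 16384`` the input reserve is
+    ``max(16384 // 4, 2048) == 4096``, so any requested value above 12288 is
+    clamped down to 12288.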
+ """ + global _max_model_len + if _max_model_len: + input_reserve = max(_max_model_len // 4, 2048) + limit = _max_model_len - input_reserve + else: + limit = _DEFAULT_MAX_OUTPUT_TOKENS + for key in ("max_tokens", "max_completion_tokens"): + val = request.get(key) + if val is not None and isinstance(val, int) and val > limit: + logger.info( + "Clamped %s from %d to %d (model limit %s, input reserve %d)", + key, val, limit, _max_model_len or "default", + input_reserve if _max_model_len else 0, + ) + request[key] = limit + + +def _try_reduce_max_tokens_from_error( + error_msg: str, + request: dict[str, Any], +) -> bool: + """On a vLLM token-limit 400 error, shrink output-token fields by ~30%. + + Handles both ``max_tokens`` and ``max_completion_tokens``. + Returns ``True`` if any field was lowered (caller should retry). + + We deliberately avoid parsing the reported input-token count from the + error because vLLM reports a *derived* value (``context_len + 1 - + max_tokens``) rather than the true tokenised length, which makes + error-guided reduction unreliable. A fixed 30% reduction converges + quickly in practice. + """ + if "maximum context length" not in error_msg: + return False + + changed = False + for key in ("max_tokens", "max_completion_tokens"): + old = request.get(key) + if isinstance(old, int) and old > 128: + new = max(128, old * 7 // 10) # ~30% reduction + logger.info("Auto-reducing %s from %d to %d", key, old, new) + request[key] = new + changed = True + + # If no output-token field exists, add one at ¼ of context. + if not changed and _max_model_len: + for key in ("max_tokens", "max_completion_tokens"): + if key not in request: + new = _max_model_len // 4 + logger.info("Adding %s=%d to constrain output", key, new) + request[key] = new + changed = True + break + + return changed + def configure_server(topology_path: str = "topology.yaml", *, node_id: str | None = None) -> None: global _configured_topology_path, _configured_node_id, _state @@ -142,10 +229,20 @@ def get_state() -> GatewayState: @asynccontextmanager async def _lifespan(_: FastAPI): + global _max_model_len state = get_state() await state.node_manager.start() if state.control_client is not None: await state.control_client.start() + # Cache backend model's max_model_len for request clamping. + _max_model_len = await _fetch_max_model_len(state.node.sglang_base_url) + if _max_model_len: + logger.info("Backend max_model_len: %d", _max_model_len) + else: + logger.warning( + "Could not fetch max_model_len from backend; using default cap %d", + _DEFAULT_MAX_OUTPUT_TOKENS, + ) try: yield finally: @@ -500,13 +597,87 @@ async def proxy_request(request: Request, path: str): request.method, full_path, api_type.value, original_model, session_id, ) + # Debug: log request body keys and previous_response_id for diagnostics + body_keys = sorted(body.keys()) if isinstance(body, dict) else "not-a-dict" + prev_resp_id = body.get("previous_response_id") if isinstance(body, dict) else None + input_type = type(body.get("input", "")).__name__ if isinstance(body, dict) else "?" 
+ input_len = len(body.get("input", "")) if isinstance(body, dict) and isinstance(body.get("input"), (str, list)) else 0 + logger.info( + " body_keys=%s prev_response_id=%s input_type=%s input_len=%s stream=%s", + body_keys, prev_resp_id, input_type, input_len, body.get("stream"), + ) + if api_type == APIType.GOOGLE and "streamGenerateContent" in full_path: body["_streaming"] = True - transformed_body = body.copy() - transformed_body["_polar_model_served"] = state.node.model_served - openai_request = transformer.transform_request(transformed_body) + # Resolve previous_response_id for multi-turn Responses API conversations + if ( + api_type == APIType.OPENAI_RESPONSES + and isinstance(body, dict) + and body.get("previous_response_id") + ): + prev_id = body["previous_response_id"] + logger.info(" Resolving previous_response_id=%s from session %s", prev_id, session_id) + session_data = state.storage.load_completion_session(session_id) + if session_data and session_data.completions: + # Rebuild conversation history from stored completions + history_items: list[dict[str, Any]] = [] + for rec in session_data.completions: + req_msgs = rec.request.get("messages", []) + resp_choices = rec.response.get("choices", []) + # Add the request messages (skip system — instructions handles that) + for msg in req_msgs: + role = msg.get("role", "") + if role == "system": + continue + if role == "user": + history_items.append({"type": "message", "role": "user", "content": msg.get("content", "")}) + elif role == "tool": + history_items.append({ + "type": "function_call_output", + "call_id": msg.get("tool_call_id", ""), + "output": msg.get("content", ""), + }) + # Add the assistant response + if resp_choices: + resp_msg = resp_choices[0].get("message", {}) + content = resp_msg.get("content", "") + tool_calls = resp_msg.get("tool_calls", []) + if content: + history_items.append({ + "type": "message", + "role": "assistant", + "content": [{"type": "output_text", "text": content}], + }) + for tc in tool_calls: + func = tc.get("function", {}) + history_items.append({ + "type": "function_call", + "call_id": tc.get("id", ""), + "name": func.get("name", ""), + "arguments": func.get("arguments", "{}"), + }) + + # Merge: history_items + current input items + current_input = body.get("input", []) + if isinstance(current_input, str): + current_input = [{"type": "message", "role": "user", "content": current_input}] + elif not isinstance(current_input, list): + current_input = [] + body["input"] = history_items + current_input + logger.info( + " Resolved history: %d records, %d history items + %d current items", + len(session_data.completions), len(history_items), len(current_input), + ) + else: + logger.warning(" No session data found for previous_response_id=%s", prev_id) + + openai_request = transformer.transform_request(body) openai_request["model"] = state.node.model_served + + # Clamp max_tokens so it never exceeds the backend model's capacity. 
+ _clamp_max_tokens(openai_request) + is_streaming = openai_request.get("stream", False) if is_streaming: @@ -541,11 +712,28 @@ async def _handle_non_streaming( session_info: Any | None, ) -> JSONResponse: state = get_state() - try: - response = await state.sglang.completion(openai_request) - except UpstreamError as exc: - logger.warning("Non-streaming upstream error for session %s: %s", session_id, exc) - return _upstream_error_response(api_type, exc) + response = None + last_exc: Exception | None = None + for _attempt in range(4): + try: + response = await state.sglang.completion(openai_request) + break + except UpstreamHTTPError as exc: + last_exc = exc + if ( + exc.status_code == 400 + and _try_reduce_max_tokens_from_error(str(exc), openai_request) + ): + logger.info("Retrying (%d) with reduced max_tokens", _attempt + 1) + continue + logger.warning("Non-streaming upstream error for session %s: %s", session_id, exc) + return _upstream_error_response(api_type, exc) + except UpstreamError as exc: + logger.warning("Non-streaming upstream error for session %s: %s", session_id, exc) + return _upstream_error_response(api_type, exc) + if response is None: + logger.warning("All retries exhausted for session %s: %s", session_id, last_exc) + return _upstream_error_response(api_type, last_exc or UpstreamError("max_tokens retries exhausted")) state.storage.save_message( session_id, @@ -573,11 +761,30 @@ async def _handle_streaming( session_info: Any | None, ) -> StreamingResponse | JSONResponse: state = get_state() - try: - raw_stream = await state.sglang.open_completion_stream(openai_request) - except UpstreamError as exc: - logger.warning("Streaming setup error for session %s: %s", session_id, exc) - return _upstream_error_response(api_type, exc) + # Try opening the stream; on a token-limit 400 error, auto-reduce + # max_tokens and retry (up to 3 retries, so 4 total attempts). + raw_stream = None + last_exc: Exception | None = None + for _attempt in range(4): + try: + raw_stream = await state.sglang.open_completion_stream(openai_request) + break + except UpstreamHTTPError as exc: + last_exc = exc + if ( + exc.status_code == 400 + and _try_reduce_max_tokens_from_error(str(exc), openai_request) + ): + logger.info("Retrying (%d) with reduced max_tokens", _attempt + 1) + continue + logger.warning("Streaming setup error for session %s: %s", session_id, exc) + return _upstream_error_response(api_type, exc) + except UpstreamError as exc: + logger.warning("Streaming setup error for session %s: %s", session_id, exc) + return _upstream_error_response(api_type, exc) + if raw_stream is None: + logger.warning("All retries exhausted for session %s: %s", session_id, last_exc) + return _upstream_error_response(api_type, last_exc or UpstreamError("max_tokens retries exhausted")) accumulator = StreamAccumulator() stream_state = transformer.create_stream_state(original_request) diff --git a/src/polar/gateway/transform/openai_responses.py b/src/polar/gateway/transform/openai_responses.py index d485ea29..9668faf4 100644 --- a/src/polar/gateway/transform/openai_responses.py +++ b/src/polar/gateway/transform/openai_responses.py @@ -263,6 +263,16 @@ def transform_request(self, body: dict[str, Any]) -> dict[str, Any]: elif isinstance(input_data, list): messages.extend(self._convert_input_items_to_messages(input_data)) + # vLLM / Qwen chat templates require exactly ONE system message at + # position 0. Responses API clients (e.g. 
codex CLI) may produce + # multiple system messages (from "instructions" + "developer" items). + # Merge them into a single system message. + system_parts = [m["content"] for m in messages if m.get("role") == "system" and m.get("content")] + other_msgs = [m for m in messages if m.get("role") != "system"] + if system_parts: + messages = [{"role": "system", "content": "\n\n".join(system_parts)}] + other_msgs + else: + messages = other_msgs result: dict[str, Any] = {"messages": messages} if "max_tokens" in body: @@ -322,9 +332,11 @@ def transform_response( else: output_items.append({ "type": "function_call", + "id": f"fc_{uuid.uuid4().hex[:24]}", "call_id": tc.get("id", ""), "name": name, "arguments": func.get("arguments", "{}"), + "status": "completed", }) usage = response.get("usage", {}) @@ -378,6 +390,10 @@ def _convert_input_items_to_messages( pending_tool_outputs = [] role = item.get("role", "user") + # Responses API uses "developer" for system-level instructions; + # map to "system" for Chat Completions compatibility. + if role == "developer": + role = "system" content = self._extract_text_from_content(item.get("content", "")) messages.append({"role": role, "content": content}) @@ -446,6 +462,8 @@ def _convert_response_item_to_message(self, item: dict[str, Any]) -> Optional[di if item_type == "message": role = item.get("role", "user") + if role == "developer": + role = "system" content_items = item.get("content", []) text_parts = [] for c in content_items if isinstance(content_items, list) else []: @@ -466,6 +484,8 @@ def _convert_response_item_to_message(self, item: dict[str, Any]) -> Optional[di # Fallback: plain {role, content} dict if not item_type and "role" in item and "content" in item: role = item["role"] + if role == "developer": + role = "system" content = item["content"] if isinstance(content, str): return {"role": role, "content": content} diff --git a/src/polar/runtime/apptainer.py b/src/polar/runtime/apptainer.py index be85449b..e386bba8 100644 --- a/src/polar/runtime/apptainer.py +++ b/src/polar/runtime/apptainer.py @@ -2,9 +2,11 @@ from __future__ import annotations +import asyncio import hashlib import logging import os +import logging import shlex import shutil from pathlib import Path @@ -23,8 +25,15 @@ def __init__(self, spec: RuntimeSpec, session_id: str, session_dir: Path) -> Non # Use a hash suffix to guarantee uniqueness even when session IDs # share a long prefix (e.g. "sk-polar-...-eval" vs "sk-polar-..."). short_hash = hashlib.sha256(session_id.encode()).hexdigest()[:8] - safe_name = session_id.replace("/", "-")[:30] - self._instance_name = f"polar-{safe_name}-{short_hash}" + safe_name = session_id.replace("/", "-") + if len(safe_name) > 40: + # Keep a recognisable prefix plus a hash suffix to guarantee + # uniqueness even when session IDs share a long common prefix + # (e.g. "sk-polar-" vs "sk-polar--eval"). + prefix = safe_name[:24] + suffix = hashlib.sha256(safe_name.encode()).hexdigest()[:12] + safe_name = f"{prefix}-{suffix}" + self._instance_name = f"polar-{safe_name}" self._binary = self._resolve_binary() @property @@ -42,12 +51,15 @@ def can_disable_internet(self) -> bool: async def start(self) -> None: if self._destroyed: raise RuntimeError("apptainer runtime was already destroyed") - # Use a host-backed overlay directory instead of --writable-tmpfs - # (default tmpfs overlay is only 64 MB, too small for most workloads). 
- self._overlay_dir = self.session_dir / "overlay" - self._overlay_dir.mkdir(parents=True, exist_ok=True) - args = [self._binary, "instance", "start", - "--overlay", str(self._overlay_dir)] + args = [self._binary, "instance", "start"] + # --writable-tmpfs gives a small (64 MB) writable layer on top of the + # read-only SIF for caches/configs. Actual workload data goes through + # the bind mount (session_dir → /polar/session). + # --no-home avoids mounting the host home directory which would leak + # host-specific paths into the container. + args.extend(["--writable-tmpfs", "--no-home"]) + if self.spec.kwargs.get("fakeroot"): + args.append("--fakeroot") if self.spec.gpus > 0: args.append("--nv") network_name: str | None @@ -59,10 +71,26 @@ async def start(self) -> None: args.extend(["--net", "--network", network_name]) args.extend(["--bind", f"{self.session_dir}:{self.runtime_session_dir}"]) args.extend([self.spec.image, self._instance_name]) - rc, _, _ = await self._run_local_command(*args) + # Do NOT use capture=True here. `apptainer instance start` forks a + # daemon that inherits pipe fds; asyncio.communicate() then blocks + # forever waiting for the daemon to close them. We redirect stderr + # to a temp file so we can still report errors on failure. + stderr_path = self.session_dir / "apptainer_start.err" + stderr_fh = stderr_path.open("w") + try: + proc = await asyncio.create_subprocess_exec( + *args, + stdout=asyncio.subprocess.DEVNULL, + stderr=stderr_fh, + ) + rc = await proc.wait() + finally: + stderr_fh.close() if rc != 0: + detail = stderr_path.read_text().strip() raise RuntimeError( f"{self._binary} instance start failed with exit code {rc}" + + (f": {detail}" if detail else "") ) _STOP_TIMEOUT = 30.0 @@ -94,9 +122,14 @@ async def exec( if effective_workdir: wrapped_command = f"cd {shlex.quote(effective_workdir)} && {command}" args = [self._binary, "exec", f"instance://{self._instance_name}"] + # Ensure HOME is set inside the container (--no-home leaves it + # pointing at the non-existent host home) and clear host-specific + # cache dirs that would fail inside a read-only overlay. + effective_env: dict[str, str] = {"HOME": "/root"} if env: - args.append("env") - args.extend(f"{key}={value}" for key, value in env.items()) + effective_env.update(env) + args.append("env") + args.extend(f"{key}={value}" for key, value in effective_env.items()) args.extend(["bash", "-lc", wrapped_command]) rc, stdout, stderr = await self._run_local_command( *args, timeout=timeout_sec, capture=True