From 28301e0d201b8aac7fa57ca948016d68c3f905ac Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 03:46:54 +0000 Subject: [PATCH 01/56] feat: add K8s and Slurm scheduler backends for profiling jobs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a scheduler package with: - ProfileJobSpec dataclass for all profiling parameters - BaseScheduler ABC with render/submit/dry_run interface - K8sScheduler: generates valid K8s Job YAML with GPU resources, PVC/hostPath volumes, nodeSelector, serviceAccount support - SlurmScheduler: generates sbatch scripts with docker/enroot/bare-metal container runtimes, module loading, and custom #SBATCH directives - scripts/submit_profile.py: unified CLI entry point with --scheduler {k8s,slurm}, --dry-run (default) and --submit modes Zero external dependencies — uses only Python stdlib. --- schedulers/__init__.py | 12 ++ schedulers/base.py | 129 +++++++++++++++++++++ schedulers/k8s.py | 140 ++++++++++++++++++++++ schedulers/slurm.py | 141 ++++++++++++++++++++++ scripts/submit_profile.py | 238 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 660 insertions(+) create mode 100644 schedulers/__init__.py create mode 100644 schedulers/base.py create mode 100644 schedulers/k8s.py create mode 100644 schedulers/slurm.py create mode 100644 scripts/submit_profile.py diff --git a/schedulers/__init__.py b/schedulers/__init__.py new file mode 100644 index 0000000..6e1547b --- /dev/null +++ b/schedulers/__init__.py @@ -0,0 +1,12 @@ +"""Scheduler backends for submitting FlowSim profiling jobs to K8s or Slurm.""" + +from schedulers.base import BaseScheduler, ProfileJobSpec +from schedulers.k8s import K8sScheduler +from schedulers.slurm import SlurmScheduler + +__all__ = [ + "BaseScheduler", + "K8sScheduler", + "ProfileJobSpec", + "SlurmScheduler", +] diff --git a/schedulers/base.py b/schedulers/base.py new file mode 100644 index 0000000..df40429 --- /dev/null +++ b/schedulers/base.py @@ 
-0,0 +1,129 @@ +"""Abstract base class for FlowSim job schedulers.""" + +from __future__ import annotations + +import abc +import shlex +from dataclasses import dataclass, field +from typing import Optional + + +@dataclass +class ProfileJobSpec: + """All parameters needed to run a stage-profiling job. + + The scheduler backends render this into a K8s Job YAML or Slurm + sbatch script. + """ + + # -- Profiling workload -- + collect: str # "perf", "shapes", or "all" + model_path: str + tp: int = 1 + dp: int = 1 + bs: int = 1 + input_len: int = 2048 + existing_ctx: int = 0 + decode_tokens: int = 32 + warmup_n: int = 5 + disable_chunked_prefill: bool = False + max_prefill_tokens: int = 131072 + + # -- Infrastructure -- + image: str = "flowsim-image:latest" + gpus: int = 1 # total GPU count (must be >= tp * dp) + host: str = "0.0.0.0" + port: int = 30001 + output_dir: str = "/flowsim/stage_traces" + log_dir: str = "/flowsim/tests/test-artifacts" + job_name: str = "" + + # -- Extra server opts (appended verbatim) -- + extra_server_opts: str = "" + + def build_server_opts(self) -> str: + """Build the ``--server-opts`` string for run_stage_profile.py.""" + parts = [ + f"--model-path {self.model_path}", + f"--tp {self.tp}", + f"--host {self.host}", + f"--port {self.port}", + ] + if self.dp > 1: + parts.append(f"--dp {self.dp}") + if self.extra_server_opts: + parts.append(self.extra_server_opts) + return " ".join(parts) + + def build_profile_command(self) -> list[str]: + """Build the full ``python scripts/run_stage_profile.py ...`` command.""" + cmd = [ + "python", + "scripts/run_stage_profile.py", + "--collect", + self.collect, + "--launch-server", + "--server-opts", + self.build_server_opts(), + "--bs", + str(self.bs), + "--input-len", + str(self.input_len), + "--existing-ctx", + str(self.existing_ctx), + "--decode-tokens", + str(self.decode_tokens), + "--warmup-n", + str(self.warmup_n), + "--host", + self.host, + "--port", + str(self.port), + "--output-dir", + 
self.output_dir, + "--log-dir", + self.log_dir, + ] + if self.disable_chunked_prefill: + cmd.append("--disable-chunked-prefill") + cmd.extend(["--max-prefill-tokens", str(self.max_prefill_tokens)]) + return cmd + + def build_shell_command(self) -> str: + """Build a single shell command string (properly quoted).""" + cmd = self.build_profile_command() + # Quote the --server-opts value since it contains spaces + quoted = [] + i = 0 + while i < len(cmd): + if cmd[i] == "--server-opts" and i + 1 < len(cmd): + quoted.append(cmd[i]) + quoted.append(shlex.quote(cmd[i + 1])) + i += 2 + else: + quoted.append(cmd[i]) + i += 1 + return " ".join(quoted) + + def default_job_name(self) -> str: + """Generate a default job name from workload params.""" + if self.job_name: + return self.job_name + model_short = self.model_path.split("/")[-1].lower().replace(".", "-") + return f"flowsim-{self.collect}-{model_short}-bs{self.bs}-il{self.input_len}" + + +class BaseScheduler(abc.ABC): + """Abstract scheduler backend.""" + + @abc.abstractmethod + def render(self, spec: ProfileJobSpec) -> str: + """Render the job manifest / script as a string.""" + + @abc.abstractmethod + def submit(self, spec: ProfileJobSpec) -> str: + """Submit the job and return a job identifier string.""" + + def dry_run(self, spec: ProfileJobSpec) -> str: + """Render and return the manifest without submitting.""" + return self.render(spec) diff --git a/schedulers/k8s.py b/schedulers/k8s.py new file mode 100644 index 0000000..533967e --- /dev/null +++ b/schedulers/k8s.py @@ -0,0 +1,140 @@ +"""Kubernetes Job scheduler for FlowSim profiling.""" + +from __future__ import annotations + +import subprocess +import tempfile + +from schedulers.base import BaseScheduler, ProfileJobSpec + + +class K8sScheduler(BaseScheduler): + """Generate and optionally submit a Kubernetes Job for profiling. + + Parameters + ---------- + namespace : str + Kubernetes namespace for the Job. 
+ pvc_name : str, optional + Name of a PersistentVolumeClaim to mount for trace output. + If empty, uses ``emptyDir`` (traces are lost when the pod exits). + host_output_dir : str, optional + If set (and *pvc_name* is empty), use a ``hostPath`` volume at + this path instead of a PVC. + node_selector : dict, optional + Kubernetes nodeSelector labels (e.g., ``{"gpu": "a100"}``). + service_account : str, optional + ServiceAccount name for the pod. + shm_size : str + Size of ``/dev/shm`` (shared memory). Defaults to ``"16Gi"``. + """ + + def __init__( + self, + *, + namespace: str = "default", + pvc_name: str = "", + host_output_dir: str = "", + node_selector: dict[str, str] | None = None, + service_account: str = "", + shm_size: str = "16Gi", + ) -> None: + self.namespace = namespace + self.pvc_name = pvc_name + self.host_output_dir = host_output_dir + self.node_selector = node_selector or {} + self.service_account = service_account + self.shm_size = shm_size + + def render(self, spec: ProfileJobSpec) -> str: + job_name = spec.default_job_name()[:63] # K8s name limit + cmd = spec.build_profile_command() + + lines: list[str] = [] + _a = lines.append + + _a("apiVersion: batch/v1") + _a("kind: Job") + _a("metadata:") + _a(f" name: {job_name}") + _a(f" namespace: {self.namespace}") + _a(" labels:") + _a(" app: flowsim") + _a(" component: profiling") + _a(f" collect: {spec.collect}") + _a("spec:") + _a(" backoffLimit: 0") + _a(" ttlSecondsAfterFinished: 86400") + _a(" template:") + _a(" metadata:") + _a(" labels:") + _a(" app: flowsim") + _a(" component: profiling") + _a(" spec:") + if self.service_account: + _a(f" serviceAccountName: {self.service_account}") + if self.node_selector: + _a(" nodeSelector:") + for k, v in self.node_selector.items(): + _a(f" {k}: {v}") + _a(" restartPolicy: Never") + _a(" containers:") + _a(" - name: profiler") + _a(f" image: {spec.image}") + _a(" imagePullPolicy: IfNotPresent") + _a(" workingDir: /flowsim") + _a(" command:") + for c in cmd: 
+ _a(f' - "{c}"') + _a(" env:") + _a(" - name: SGLANG_PROFILE_KERNELS") + _a(' value: "1"') + _a(" resources:") + _a(" limits:") + _a(f' nvidia.com/gpu: "{spec.gpus}"') + _a(" requests:") + _a(f' nvidia.com/gpu: "{spec.gpus}"') + + # volumeMounts + _a(" volumeMounts:") + _a(" - name: dshm") + _a(" mountPath: /dev/shm") + if self.pvc_name or self.host_output_dir: + _a(" - name: output") + _a(f" mountPath: {spec.output_dir}") + + # volumes + _a(" volumes:") + _a(" - name: dshm") + _a(" emptyDir:") + _a(" medium: Memory") + _a(f" sizeLimit: {self.shm_size}") + if self.pvc_name: + _a(" - name: output") + _a(" persistentVolumeClaim:") + _a(f" claimName: {self.pvc_name}") + elif self.host_output_dir: + _a(" - name: output") + _a(" hostPath:") + _a(f" path: {self.host_output_dir}") + _a(" type: DirectoryOrCreate") + + return "\n".join(lines) + "\n" + + def submit(self, spec: ProfileJobSpec) -> str: + manifest = self.render(spec) + with tempfile.NamedTemporaryFile( + mode="w", suffix=".yaml", delete=False + ) as f: + f.write(manifest) + f.flush() + result = subprocess.run( + ["kubectl", "apply", "-f", f.name], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError( + f"kubectl apply failed:\n{result.stderr.strip()}" + ) + return result.stdout.strip() diff --git a/schedulers/slurm.py b/schedulers/slurm.py new file mode 100644 index 0000000..4aa18d8 --- /dev/null +++ b/schedulers/slurm.py @@ -0,0 +1,141 @@ +"""Slurm sbatch scheduler for FlowSim profiling.""" + +from __future__ import annotations + +import subprocess +import tempfile +import textwrap + +from schedulers.base import BaseScheduler, ProfileJobSpec + + +class SlurmScheduler(BaseScheduler): + """Generate and optionally submit an sbatch script for profiling. + + Parameters + ---------- + partition : str + Slurm partition to submit to. + time_limit : str + Wall-clock time limit (e.g., ``"01:00:00"``). + account : str, optional + ``--account`` for which allocation to charge. 
+ constraint : str, optional + ``--constraint`` node feature (e.g., ``"gpu80g"``). + container_runtime : str + How to run the container inside the allocation. + ``"docker"`` -> ``docker run`` + ``"enroot"`` -> ``srun --container-image`` + ``"none"`` -> run bare-metal (no container) + container_mounts : str + Bind-mount string passed to the container runtime + (e.g., ``"/data:/data"``). + modules : list[str] + ``module load`` commands to run before the job + (relevant for ``"none"`` runtime). + extra_sbatch : list[str] + Additional ``#SBATCH`` lines, each *without* the ``#SBATCH`` prefix. + """ + + def __init__( + self, + *, + partition: str = "gpu", + time_limit: str = "02:00:00", + account: str = "", + constraint: str = "", + container_runtime: str = "none", + container_mounts: str = "", + modules: list[str] | None = None, + extra_sbatch: list[str] | None = None, + ) -> None: + self.partition = partition + self.time_limit = time_limit + self.account = account + self.constraint = constraint + self.container_runtime = container_runtime + self.container_mounts = container_mounts + self.modules = modules or [] + self.extra_sbatch = extra_sbatch or [] + + def render(self, spec: ProfileJobSpec) -> str: + job_name = spec.default_job_name() + cmd = spec.build_shell_command() + + lines = [ + "#!/bin/bash", + f"#SBATCH --job-name={job_name}", + f"#SBATCH --partition={self.partition}", + f"#SBATCH --gpus-per-node={spec.gpus}", + f"#SBATCH --ntasks=1", + f"#SBATCH --time={self.time_limit}", + f"#SBATCH --output={spec.output_dir}/{job_name}_%j.log", + ] + + if self.account: + lines.append(f"#SBATCH --account={self.account}") + if self.constraint: + lines.append(f"#SBATCH --constraint={self.constraint}") + for extra in self.extra_sbatch: + lines.append(f"#SBATCH {extra}") + + lines.append("") + lines.append("set -euo pipefail") + lines.append("") + + if self.modules: + for mod in self.modules: + lines.append(f"module load {mod}") + lines.append("") + + lines.append("export 
SGLANG_PROFILE_KERNELS=1") + lines.append("") + + if self.container_runtime == "docker": + mounts = "" + if self.container_mounts: + mounts = f" -v {self.container_mounts}" + lines.append( + f"docker run --gpus all --ipc=host --shm-size=16g" + f"{mounts} -w /flowsim {spec.image} \\" + ) + lines.append(f" {cmd}") + elif self.container_runtime == "enroot": + mounts = "" + if self.container_mounts: + mounts = f" --container-mounts={self.container_mounts}" + lines.append( + f"srun --container-image={spec.image}" + f" --container-workdir=/flowsim" + f"{mounts} \\" + ) + lines.append(f" {cmd}") + elif self.container_runtime == "none": + lines.append(f"cd /flowsim") + lines.append(cmd) + else: + raise ValueError( + f"Unknown container_runtime: {self.container_runtime!r}. " + "Choose from: docker, enroot, none" + ) + + lines.append("") + return "\n".join(lines) + + def submit(self, spec: ProfileJobSpec) -> str: + script = self.render(spec) + with tempfile.NamedTemporaryFile( + mode="w", suffix=".sh", delete=False + ) as f: + f.write(script) + f.flush() + result = subprocess.run( + ["sbatch", f.name], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError( + f"sbatch failed:\n{result.stderr.strip()}" + ) + return result.stdout.strip() diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py new file mode 100644 index 0000000..8e309dd --- /dev/null +++ b/scripts/submit_profile.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 +"""Submit FlowSim profiling jobs to Kubernetes or Slurm. 
+ +Usage examples +-------------- + +Dry-run (print Kubernetes Job YAML to stdout): + + python scripts/submit_profile.py \\ + --scheduler k8s \\ + --collect perf \\ + --model-path Qwen/Qwen3-235B-A22B-FP8 \\ + --tp 4 --gpus 4 \\ + --bs 1 --input-len 2048 --decode-tokens 32 \\ + --image flowsim-image:latest \\ + --k8s-namespace default \\ + --k8s-pvc flowsim-traces \\ + --dry-run + +Dry-run (print Slurm sbatch script to stdout): + + python scripts/submit_profile.py \\ + --scheduler slurm \\ + --collect perf \\ + --model-path Qwen/Qwen3-235B-A22B-FP8 \\ + --tp 4 --gpus 4 \\ + --slurm-partition gpu-a100 \\ + --slurm-time 02:00:00 \\ + --dry-run + +Submit directly to cluster: + + python scripts/submit_profile.py \\ + --scheduler k8s \\ + ... \\ + --submit +""" + +from __future__ import annotations + +import argparse +import sys + +# Allow running from the repo root as ``python scripts/submit_profile.py`` +sys.path.insert(0, ".") + +from schedulers.base import ProfileJobSpec +from schedulers.k8s import K8sScheduler +from schedulers.slurm import SlurmScheduler + + +def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + description="Submit FlowSim profiling jobs to K8s or Slurm.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + # -- Scheduler choice -- + p.add_argument( + "--scheduler", + choices=["k8s", "slurm"], + required=True, + help="Scheduler backend.", + ) + + # -- Profiling workload (mirrors run_stage_profile.py) -- + wl = p.add_argument_group("workload") + wl.add_argument( + "--collect", + choices=["perf", "shapes", "all"], + required=True, + ) + wl.add_argument("--model-path", required=True, help="HF model path") + wl.add_argument("--tp", type=int, default=1) + wl.add_argument("--dp", type=int, default=1) + wl.add_argument("--bs", type=int, default=1, help="Batch size") + wl.add_argument("--input-len", type=int, default=2048) + wl.add_argument("--existing-ctx", type=int, default=0) 
+ wl.add_argument("--decode-tokens", type=int, default=32) + wl.add_argument("--warmup-n", type=int, default=5) + wl.add_argument( + "--disable-chunked-prefill", action="store_true", + ) + wl.add_argument("--max-prefill-tokens", type=int, default=131072) + wl.add_argument( + "--extra-server-opts", + default="", + help="Extra server options appended verbatim", + ) + + # -- Infrastructure -- + infra = p.add_argument_group("infrastructure") + infra.add_argument("--image", default="flowsim-image:latest") + infra.add_argument( + "--gpus", type=int, default=1, help="Total GPU count", + ) + infra.add_argument("--host", default="0.0.0.0") + infra.add_argument("--port", type=int, default=30001) + infra.add_argument("--output-dir", default="/flowsim/stage_traces") + infra.add_argument( + "--log-dir", default="/flowsim/tests/test-artifacts", + ) + infra.add_argument("--job-name", default="") + + # -- Kubernetes-specific -- + k8s = p.add_argument_group("kubernetes options") + k8s.add_argument("--k8s-namespace", default="default") + k8s.add_argument( + "--k8s-pvc", + default="", + help="PVC name for output volume (omit for emptyDir)", + ) + k8s.add_argument( + "--k8s-host-output-dir", + default="", + help="hostPath for output (used when --k8s-pvc is empty)", + ) + k8s.add_argument( + "--k8s-node-selector", + action="append", + default=[], + metavar="KEY=VALUE", + help="Node selector labels (repeatable)", + ) + k8s.add_argument("--k8s-service-account", default="") + k8s.add_argument("--k8s-shm-size", default="16Gi") + + # -- Slurm-specific -- + slurm = p.add_argument_group("slurm options") + slurm.add_argument("--slurm-partition", default="gpu") + slurm.add_argument("--slurm-time", default="02:00:00") + slurm.add_argument("--slurm-account", default="") + slurm.add_argument("--slurm-constraint", default="") + slurm.add_argument( + "--slurm-container-runtime", + choices=["docker", "enroot", "none"], + default="none", + ) + slurm.add_argument("--slurm-container-mounts", default="") 
+ slurm.add_argument( + "--slurm-module", + action="append", + default=[], + help="Modules to load (repeatable)", + ) + slurm.add_argument( + "--slurm-extra-sbatch", + action="append", + default=[], + metavar="DIRECTIVE", + help="Extra #SBATCH directives (repeatable, without prefix)", + ) + + # -- Action -- + action = p.add_mutually_exclusive_group() + action.add_argument( + "--dry-run", + action="store_true", + default=True, + help="Print the rendered manifest to stdout (default)", + ) + action.add_argument( + "--submit", + action="store_true", + help="Actually submit the job to the cluster", + ) + + return p.parse_args(argv) + + +def _build_spec(args: argparse.Namespace) -> ProfileJobSpec: + return ProfileJobSpec( + collect=args.collect, + model_path=args.model_path, + tp=args.tp, + dp=args.dp, + bs=args.bs, + input_len=args.input_len, + existing_ctx=args.existing_ctx, + decode_tokens=args.decode_tokens, + warmup_n=args.warmup_n, + disable_chunked_prefill=args.disable_chunked_prefill, + max_prefill_tokens=args.max_prefill_tokens, + image=args.image, + gpus=args.gpus, + host=args.host, + port=args.port, + output_dir=args.output_dir, + log_dir=args.log_dir, + job_name=args.job_name, + extra_server_opts=args.extra_server_opts, + ) + + +def _build_scheduler(args: argparse.Namespace): + if args.scheduler == "k8s": + node_sel = {} + for item in args.k8s_node_selector: + k, _, v = item.partition("=") + if not v: + sys.exit(f"Bad --k8s-node-selector format: {item!r} (use KEY=VALUE)") + node_sel[k] = v + return K8sScheduler( + namespace=args.k8s_namespace, + pvc_name=args.k8s_pvc, + host_output_dir=args.k8s_host_output_dir, + node_selector=node_sel, + service_account=args.k8s_service_account, + shm_size=args.k8s_shm_size, + ) + else: + return SlurmScheduler( + partition=args.slurm_partition, + time_limit=args.slurm_time, + account=args.slurm_account, + constraint=args.slurm_constraint, + container_runtime=args.slurm_container_runtime, + 
container_mounts=args.slurm_container_mounts, + modules=args.slurm_module, + extra_sbatch=args.slurm_extra_sbatch, + ) + + +def main(argv: list[str] | None = None) -> None: + args = _parse_args(argv) + spec = _build_spec(args) + scheduler = _build_scheduler(args) + + if args.submit: + result = scheduler.submit(spec) + print(result) + else: + print(scheduler.dry_run(spec)) + + +if __name__ == "__main__": + main() From 26c9f476cbe0000281d97ca156d8d327e1a465b7 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 04:00:44 +0000 Subject: [PATCH 02/56] feat: switch to proper API clients for remote submission MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit K8s: - render() now builds a dict and serializes via yaml.safe_dump (falls back to json.dumps if PyYAML is absent). Fixes YAML injection when values contain : # or quotes. - submit() uses the 'kubernetes' Python client (kubeconfig / in-cluster). - New args: --k8s-kubeconfig, --k8s-context. Slurm: - submit() now posts to slurmrestd REST API via urllib.request (stdlib). - Supports JWT auth, configurable API version (v0.0.39–v0.0.41+), and TLS certificate verification toggle. - New args: --slurm-rest-url, --slurm-jwt-token, --slurm-api-version, --slurm-no-verify-ssl. render() / dry-run remain zero-dependency (stdlib only). submit() requires 'kubernetes' package for K8s; Slurm uses stdlib. --- schedulers/k8s.py | 200 ++++++++++++++++++++++---------------- schedulers/slurm.py | 133 +++++++++++++++++++++---- scripts/submit_profile.py | 38 ++++++++ 3 files changed, 266 insertions(+), 105 deletions(-) diff --git a/schedulers/k8s.py b/schedulers/k8s.py index 533967e..9b957a9 100644 --- a/schedulers/k8s.py +++ b/schedulers/k8s.py @@ -1,12 +1,29 @@ -"""Kubernetes Job scheduler for FlowSim profiling.""" +"""Kubernetes Job scheduler for FlowSim profiling. + +Uses the ``kubernetes`` Python client for remote submission. 
+The ``render()`` / ``dry_run()`` path uses stdlib only (json fallback if +PyYAML is not installed — JSON is valid YAML 1.2 and ``kubectl`` accepts it). +""" from __future__ import annotations -import subprocess -import tempfile +import json from schedulers.base import BaseScheduler, ProfileJobSpec +# Optional: nicer YAML output for dry-run. +try: + import yaml as _yaml # type: ignore[import-untyped] + + def _dump(obj: dict) -> str: + return _yaml.safe_dump(obj, default_flow_style=False, sort_keys=False) + +except ImportError: + _yaml = None # type: ignore[assignment] + + def _dump(obj: dict) -> str: # type: ignore[misc] + return json.dumps(obj, indent=2, ensure_ascii=False) + "\n" + class K8sScheduler(BaseScheduler): """Generate and optionally submit a Kubernetes Job for profiling. @@ -15,6 +32,11 @@ class K8sScheduler(BaseScheduler): ---------- namespace : str Kubernetes namespace for the Job. + kubeconfig : str, optional + Path to a kubeconfig file. When empty, the ``kubernetes`` client + tries in-cluster config, then ``~/.kube/config``. + context : str, optional + kubeconfig context to activate. pvc_name : str, optional Name of a PersistentVolumeClaim to mount for trace output. If empty, uses ``emptyDir`` (traces are lost when the pod exits). 
@@ -33,6 +55,8 @@ def __init__( self, *, namespace: str = "default", + kubeconfig: str = "", + context: str = "", pvc_name: str = "", host_output_dir: str = "", node_selector: dict[str, str] | None = None, @@ -40,6 +64,8 @@ def __init__( shm_size: str = "16Gi", ) -> None: self.namespace = namespace + self.kubeconfig = kubeconfig + self.context = context self.pvc_name = pvc_name self.host_output_dir = host_output_dir self.node_selector = node_selector or {} @@ -47,94 +73,96 @@ def __init__( self.shm_size = shm_size def render(self, spec: ProfileJobSpec) -> str: - job_name = spec.default_job_name()[:63] # K8s name limit + return _dump(self._build_job_dict(spec)) + + # ----------------------------------------------------------------- + # Build a plain-dict manifest (used by both render and submit) + # ----------------------------------------------------------------- + def _build_job_dict(self, spec: ProfileJobSpec) -> dict: + """Return the Job manifest as a nested Python dict.""" + job_name = spec.default_job_name()[:63] cmd = spec.build_profile_command() - lines: list[str] = [] - _a = lines.append - - _a("apiVersion: batch/v1") - _a("kind: Job") - _a("metadata:") - _a(f" name: {job_name}") - _a(f" namespace: {self.namespace}") - _a(" labels:") - _a(" app: flowsim") - _a(" component: profiling") - _a(f" collect: {spec.collect}") - _a("spec:") - _a(" backoffLimit: 0") - _a(" ttlSecondsAfterFinished: 86400") - _a(" template:") - _a(" metadata:") - _a(" labels:") - _a(" app: flowsim") - _a(" component: profiling") - _a(" spec:") - if self.service_account: - _a(f" serviceAccountName: {self.service_account}") - if self.node_selector: - _a(" nodeSelector:") - for k, v in self.node_selector.items(): - _a(f" {k}: {v}") - _a(" restartPolicy: Never") - _a(" containers:") - _a(" - name: profiler") - _a(f" image: {spec.image}") - _a(" imagePullPolicy: IfNotPresent") - _a(" workingDir: /flowsim") - _a(" command:") - for c in cmd: - _a(f' - "{c}"') - _a(" env:") - _a(" - name: 
SGLANG_PROFILE_KERNELS") - _a(' value: "1"') - _a(" resources:") - _a(" limits:") - _a(f' nvidia.com/gpu: "{spec.gpus}"') - _a(" requests:") - _a(f' nvidia.com/gpu: "{spec.gpus}"') - - # volumeMounts - _a(" volumeMounts:") - _a(" - name: dshm") - _a(" mountPath: /dev/shm") - if self.pvc_name or self.host_output_dir: - _a(" - name: output") - _a(f" mountPath: {spec.output_dir}") - - # volumes - _a(" volumes:") - _a(" - name: dshm") - _a(" emptyDir:") - _a(" medium: Memory") - _a(f" sizeLimit: {self.shm_size}") + # volumes + mounts + volume_mounts = [{"name": "dshm", "mountPath": "/dev/shm"}] + volumes: list[dict] = [ + {"name": "dshm", "emptyDir": {"medium": "Memory", "sizeLimit": self.shm_size}}, + ] if self.pvc_name: - _a(" - name: output") - _a(" persistentVolumeClaim:") - _a(f" claimName: {self.pvc_name}") + volume_mounts.append({"name": "output", "mountPath": spec.output_dir}) + volumes.append({"name": "output", "persistentVolumeClaim": {"claimName": self.pvc_name}}) elif self.host_output_dir: - _a(" - name: output") - _a(" hostPath:") - _a(f" path: {self.host_output_dir}") - _a(" type: DirectoryOrCreate") + volume_mounts.append({"name": "output", "mountPath": spec.output_dir}) + volumes.append({"name": "output", "hostPath": {"path": self.host_output_dir, "type": "DirectoryOrCreate"}}) - return "\n".join(lines) + "\n" + container = { + "name": "profiler", + "image": spec.image, + "imagePullPolicy": "IfNotPresent", + "workingDir": "/flowsim", + "command": cmd, + "env": [{"name": "SGLANG_PROFILE_KERNELS", "value": "1"}], + "resources": { + "limits": {"nvidia.com/gpu": str(spec.gpus)}, + "requests": {"nvidia.com/gpu": str(spec.gpus)}, + }, + "volumeMounts": volume_mounts, + } + + pod_spec: dict = { + "restartPolicy": "Never", + "containers": [container], + "volumes": volumes, + } + if self.service_account: + pod_spec["serviceAccountName"] = self.service_account + if self.node_selector: + pod_spec["nodeSelector"] = dict(self.node_selector) + + return { + 
"apiVersion": "batch/v1", + "kind": "Job", + "metadata": { + "name": job_name, + "namespace": self.namespace, + "labels": {"app": "flowsim", "component": "profiling", "collect": spec.collect}, + }, + "spec": { + "backoffLimit": 0, + "ttlSecondsAfterFinished": 86400, + "template": { + "metadata": {"labels": {"app": "flowsim", "component": "profiling"}}, + "spec": pod_spec, + }, + }, + } def submit(self, spec: ProfileJobSpec) -> str: - manifest = self.render(spec) - with tempfile.NamedTemporaryFile( - mode="w", suffix=".yaml", delete=False - ) as f: - f.write(manifest) - f.flush() - result = subprocess.run( - ["kubectl", "apply", "-f", f.name], - capture_output=True, - text=True, + """Submit via the ``kubernetes`` Python client (``pip install kubernetes``).""" + try: + from kubernetes import client as k8s_client, config as k8s_config + except ImportError: + raise RuntimeError( + "The 'kubernetes' package is required for --submit. " + "Install it with: pip install kubernetes" ) - if result.returncode != 0: - raise RuntimeError( - f"kubectl apply failed:\n{result.stderr.strip()}" - ) - return result.stdout.strip() + + # Load kubeconfig / in-cluster config + config_kwargs: dict = {} + if self.kubeconfig: + config_kwargs["config_file"] = self.kubeconfig + if self.context: + config_kwargs["context"] = self.context + + try: + k8s_config.load_kube_config(**config_kwargs) + except k8s_config.ConfigException: + k8s_config.load_incluster_config() + + body = self._build_job_dict(spec) + batch_api = k8s_client.BatchV1Api() + resp = batch_api.create_namespaced_job( + namespace=self.namespace, + body=body, + ) + return f"job.batch/{resp.metadata.name} created (namespace={resp.metadata.namespace})" diff --git a/schedulers/slurm.py b/schedulers/slurm.py index 4aa18d8..9261a15 100644 --- a/schedulers/slurm.py +++ b/schedulers/slurm.py @@ -1,14 +1,23 @@ -"""Slurm sbatch scheduler for FlowSim profiling.""" +"""Slurm sbatch scheduler for FlowSim profiling. 
+ +``render()`` / ``dry_run()`` produce a standalone bash script (zero deps). +``submit()`` posts the script to a slurmrestd endpoint via stdlib +``urllib.request`` — no extra packages needed. +""" from __future__ import annotations -import subprocess -import tempfile -import textwrap +import json +import ssl +import urllib.error +import urllib.request from schedulers.base import BaseScheduler, ProfileJobSpec +_DEFAULT_API_VERSION = "v0.0.40" + + class SlurmScheduler(BaseScheduler): """Generate and optionally submit an sbatch script for profiling. @@ -18,6 +27,17 @@ class SlurmScheduler(BaseScheduler): Slurm partition to submit to. time_limit : str Wall-clock time limit (e.g., ``"01:00:00"``). + rest_url : str + Base URL of the slurmrestd daemon + (e.g., ``"https://slurm.example.com:6820"``). + Required only for ``submit()``. + jwt_token : str + JWT/auth token for slurmrestd. Required only for ``submit()``. + api_version : str + slurmrestd OpenAPI version (default: ``"v0.0.40"``). + Adjust to match your cluster (``v0.0.39``, ``v0.0.41``, …). + verify_ssl : bool + Whether to verify the slurmrestd TLS certificate (default True). account : str, optional ``--account`` for which allocation to charge. 
constraint : str, optional @@ -42,6 +62,10 @@ def __init__( *, partition: str = "gpu", time_limit: str = "02:00:00", + rest_url: str = "", + jwt_token: str = "", + api_version: str = _DEFAULT_API_VERSION, + verify_ssl: bool = True, account: str = "", constraint: str = "", container_runtime: str = "none", @@ -51,6 +75,10 @@ def __init__( ) -> None: self.partition = partition self.time_limit = time_limit + self.rest_url = rest_url.rstrip("/") + self.jwt_token = jwt_token + self.api_version = api_version + self.verify_ssl = verify_ssl self.account = account self.constraint = constraint self.container_runtime = container_runtime @@ -123,19 +151,86 @@ def render(self, spec: ProfileJobSpec) -> str: return "\n".join(lines) def submit(self, spec: ProfileJobSpec) -> str: - script = self.render(spec) - with tempfile.NamedTemporaryFile( - mode="w", suffix=".sh", delete=False - ) as f: - f.write(script) - f.flush() - result = subprocess.run( - ["sbatch", f.name], - capture_output=True, - text=True, + """Submit the job via slurmrestd REST API. + + Requires ``rest_url`` and ``jwt_token`` to be set. + Uses only ``urllib.request`` from the standard library. + """ + if not self.rest_url: + raise RuntimeError( + "--slurm-rest-url is required for --submit. " + "Point it at your slurmrestd endpoint " + "(e.g. https://slurm.example.com:6820)." + ) + if not self.jwt_token: + raise RuntimeError( + "--slurm-jwt-token is required for --submit. 
" + "Generate one via: scontrol token lifespan=3600" ) - if result.returncode != 0: - raise RuntimeError( - f"sbatch failed:\n{result.stderr.strip()}" - ) - return result.stdout.strip() + + script = self.render(spec) + job_name = spec.default_job_name() + + url = ( + f"{self.rest_url}/slurm/{self.api_version}/job/submit" + ) + + # slurmrestd job submission payload + payload = { + "script": script, + "job": { + "name": job_name, + "partition": self.partition, + "time_limit": {"number": self._parse_time_minutes(), "set": True}, + "tasks": 1, + "current_working_directory": "/flowsim", + "environment": ["PATH=/usr/local/bin:/usr/bin:/bin"], + }, + } + if self.account: + payload["job"]["account"] = self.account + + data = json.dumps(payload).encode() + headers = { + "Content-Type": "application/json", + "X-SLURM-USER-TOKEN": self.jwt_token, + } + req = urllib.request.Request(url, data=data, headers=headers, method="POST") + + ctx: ssl.SSLContext | None = None + if not self.verify_ssl: + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + + try: + with urllib.request.urlopen(req, context=ctx) as resp: + body = json.loads(resp.read()) + except urllib.error.HTTPError as exc: + detail = exc.read().decode(errors="replace") + raise RuntimeError( + f"slurmrestd returned HTTP {exc.code}:\n{detail}" + ) from exc + except urllib.error.URLError as exc: + raise RuntimeError( + f"Cannot reach slurmrestd at {self.rest_url}: {exc.reason}" + ) from exc + + # Response contains job_id on success, errors array on failure + errors = body.get("errors") or [] + if errors: + msgs = "; ".join(e.get("error", str(e)) for e in errors) + raise RuntimeError(f"slurmrestd job submit failed: {msgs}") + + job_id = body.get("job_id", "unknown") + return f"Submitted batch job {job_id}" + + def _parse_time_minutes(self) -> int: + """Convert HH:MM:SS time_limit to total minutes.""" + parts = self.time_limit.split(":") + if len(parts) == 3: + h, m, s = 
int(parts[0]), int(parts[1]), int(parts[2]) + return h * 60 + m + (1 if s > 0 else 0) + if len(parts) == 2: + return int(parts[0]) * 60 + int(parts[1]) + return int(parts[0]) diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 8e309dd..d4bc47c 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -106,6 +106,16 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: # -- Kubernetes-specific -- k8s = p.add_argument_group("kubernetes options") k8s.add_argument("--k8s-namespace", default="default") + k8s.add_argument( + "--k8s-kubeconfig", + default="", + help="Path to kubeconfig file (empty = default lookup)", + ) + k8s.add_argument( + "--k8s-context", + default="", + help="kubeconfig context to use", + ) k8s.add_argument( "--k8s-pvc", default="", @@ -130,6 +140,28 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: slurm = p.add_argument_group("slurm options") slurm.add_argument("--slurm-partition", default="gpu") slurm.add_argument("--slurm-time", default="02:00:00") + slurm.add_argument( + "--slurm-rest-url", + default="", + help="slurmrestd base URL (e.g. https://slurm.example.com:6820). " + "Required for --submit.", + ) + slurm.add_argument( + "--slurm-jwt-token", + default="", + help="JWT token for slurmrestd auth. 
" + "Generate via: scontrol token lifespan=3600", + ) + slurm.add_argument( + "--slurm-api-version", + default="v0.0.40", + help="slurmrestd OpenAPI version (default: v0.0.40)", + ) + slurm.add_argument( + "--slurm-no-verify-ssl", + action="store_true", + help="Skip TLS certificate verification for slurmrestd", + ) slurm.add_argument("--slurm-account", default="") slurm.add_argument("--slurm-constraint", default="") slurm.add_argument( @@ -203,6 +235,8 @@ def _build_scheduler(args: argparse.Namespace): node_sel[k] = v return K8sScheduler( namespace=args.k8s_namespace, + kubeconfig=args.k8s_kubeconfig, + context=args.k8s_context, pvc_name=args.k8s_pvc, host_output_dir=args.k8s_host_output_dir, node_selector=node_sel, @@ -213,6 +247,10 @@ def _build_scheduler(args: argparse.Namespace): return SlurmScheduler( partition=args.slurm_partition, time_limit=args.slurm_time, + rest_url=args.slurm_rest_url, + jwt_token=args.slurm_jwt_token, + api_version=args.slurm_api_version, + verify_ssl=not args.slurm_no_verify_ssl, account=args.slurm_account, constraint=args.slurm_constraint, container_runtime=args.slurm_container_runtime, From 52294a60dba8f383906e3271724fef2569b17314 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 04:07:00 +0000 Subject: [PATCH 03/56] chore: add proper pyproject.toml with dependency declarations - Core deps: requests, perfetto, numpy, pandas - Optional dependency groups: k8s: kubernetes>=27.0, PyYAML>=6.0 slurm: (stdlib only, no extra deps) sim: scalesim, scipy, torch viz: matplotlib, seaborn api: fastapi, pydantic, uvicorn dev: black, pytest all: everything - Entry point: flowsim-submit -> scripts.submit_profile:main - requires-python >= 3.10 --- pyproject.toml | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 0b237ec..ecf3f9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,66 @@ +[build-system] +requires = ["setuptools>=68.0"] 
+build-backend = "setuptools.build_meta" + +[project] +name = "flowsim" +version = "0.1.0" +description = "Workload simulation pipeline for kernel-level inference profiling" +readme = "README.md" +license = {text = "MIT"} +requires-python = ">=3.10" +dependencies = [ + "requests>=2.28", + "perfetto>=0.7", + "numpy>=1.24", + "pandas>=1.5", +] + +[project.optional-dependencies] +# Scheduler backends ------------------------------------------------------- +k8s = [ + "kubernetes>=27.0", # K8s Python client for remote job submission + "PyYAML>=6.0", # nicer YAML dry-run output (json fallback w/o this) +] +slurm = [] # Slurm REST API uses stdlib urllib only + +# Full simulation stack (matches Dockerfile) -------------------------------- +sim = [ + "scalesim>=2.0", + "scipy>=1.10", + "torch>=2.0", +] + +# Visualization ------------------------------------------------------------- +viz = [ + "matplotlib>=3.7", + "seaborn>=0.12", +] + +# Backend API --------------------------------------------------------------- +api = [ + "fastapi>=0.100", + "pydantic>=2.0", + "uvicorn>=0.23", +] + +# Development --------------------------------------------------------------- +dev = [ + "black>=23.0", + "pytest>=7.0", +] + +# Everything ---------------------------------------------------------------- +all = [ + "flowsim[k8s,sim,viz,api,dev]", +] + +[project.scripts] +flowsim-submit = "scripts.submit_profile:main" + [tool.black] line-length = 80 include = '\.pyi?$' + +[tool.pytest.ini_options] +testpaths = ["tests"] From 54b615210232765964194565d6d3453e890f1f02 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 04:12:29 +0000 Subject: [PATCH 04/56] fix: make pip install -e . 
and entry point actually work - Add scripts/__init__.py so 'scripts' is a findable package - Remove sys.path hack from submit_profile.py (not needed after install) - Add [tool.setuptools.packages.find] with explicit include list (excludes tests/ and backend/ from the installable package) - Improve K8s submit error: catch both kubeconfig and in-cluster failures and show a single clear message with --k8s-kubeconfig hint Verified: pip install -e '.[k8s]' -> flowsim-submit --dry-run works. --- pyproject.toml | 8 ++++++++ schedulers/k8s.py | 12 +++++++++++- scripts/__init__.py | 0 scripts/submit_profile.py | 3 --- 4 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 scripts/__init__.py diff --git a/pyproject.toml b/pyproject.toml index ecf3f9b..f92fad0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,14 @@ all = [ "flowsim[k8s,sim,viz,api,dev]", ] +[tool.setuptools.packages.find] +include = [ + "schedulers*", + "scripts*", + "simulator*", + "utils*", +] + [project.scripts] flowsim-submit = "scripts.submit_profile:main" diff --git a/schedulers/k8s.py b/schedulers/k8s.py index 9b957a9..6b58ea9 100644 --- a/schedulers/k8s.py +++ b/schedulers/k8s.py @@ -157,7 +157,17 @@ def submit(self, spec: ProfileJobSpec) -> str: try: k8s_config.load_kube_config(**config_kwargs) except k8s_config.ConfigException: - k8s_config.load_incluster_config() + try: + k8s_config.load_incluster_config() + except k8s_config.ConfigException: + hint = "" + if not self.kubeconfig: + hint = " Try --k8s-kubeconfig /path/to/kubeconfig." + raise RuntimeError( + "No valid Kubernetes configuration found. " + "Checked kubeconfig file and in-cluster environment." 
+ + hint + ) body = self._build_job_dict(spec) batch_api = k8s_client.BatchV1Api() diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index d4bc47c..2e24318 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -41,9 +41,6 @@ import argparse import sys -# Allow running from the repo root as ``python scripts/submit_profile.py`` -sys.path.insert(0, ".") - from schedulers.base import ProfileJobSpec from schedulers.k8s import K8sScheduler from schedulers.slurm import SlurmScheduler From 9e1c1f49c0db69215d64a28db39f5718e1c96f5b Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 04:16:11 +0000 Subject: [PATCH 05/56] refactor: unified CLI as 'flowsim submit' instead of 'flowsim-submit' - Add scripts/cli.py with subcommand routing (flowsim {submit, ...}) - Entry point changed: flowsim-submit -> flowsim - 'flowsim submit' delegates to submit_profile.main() - Extensible for future subcommands (profile, parse, simulate) --- pyproject.toml | 2 +- scripts/cli.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 scripts/cli.py diff --git a/pyproject.toml b/pyproject.toml index f92fad0..feade94 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,7 +64,7 @@ include = [ ] [project.scripts] -flowsim-submit = "scripts.submit_profile:main" +flowsim = "scripts.cli:main" [tool.black] line-length = 80 diff --git a/scripts/cli.py b/scripts/cli.py new file mode 100644 index 0000000..5ea74bd --- /dev/null +++ b/scripts/cli.py @@ -0,0 +1,44 @@ +"""FlowSim CLI — unified entry point. + +Usage:: + + flowsim submit --scheduler k8s --collect perf --model-path ... --dry-run + flowsim submit --scheduler slurm --collect perf --model-path ... 
--submit +""" + +from __future__ import annotations + +import argparse +import sys + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + prog="flowsim", + description="FlowSim: workload simulation pipeline CLI", + ) + sub = parser.add_subparsers(dest="command") + sub.required = True + + # ---- submit ---- + sub.add_parser( + "submit", + help="Submit a profiling job to K8s or Slurm", + add_help=False, # submit_profile has its own --help + ) + + # Parse only the subcommand, pass the rest through + args, remaining = parser.parse_known_args(argv) + + if args.command == "submit": + from scripts.submit_profile import main as submit_main + + submit_main(remaining) + return 0 + + parser.print_help() + return 1 + + +if __name__ == "__main__": + sys.exit(main()) From d37d8f3a22fd5463691981e0807804eba6878414 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 04:18:20 +0000 Subject: [PATCH 06/56] fix: 'flowsim submit' submits by default, --dry-run to preview Removed the redundant --submit flag. The subcommand name already implies submission; --dry-run is the opt-out. 
--- scripts/submit_profile.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 2e24318..41d6f54 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -182,17 +182,10 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: ) # -- Action -- - action = p.add_mutually_exclusive_group() - action.add_argument( + p.add_argument( "--dry-run", action="store_true", - default=True, - help="Print the rendered manifest to stdout (default)", - ) - action.add_argument( - "--submit", - action="store_true", - help="Actually submit the job to the cluster", + help="Only print the rendered manifest; do not submit", ) return p.parse_args(argv) @@ -262,11 +255,11 @@ def main(argv: list[str] | None = None) -> None: spec = _build_spec(args) scheduler = _build_scheduler(args) - if args.submit: + if args.dry_run: + print(scheduler.dry_run(spec)) + else: result = scheduler.submit(spec) print(result) - else: - print(scheduler.dry_run(spec)) if __name__ == "__main__": From af48f0c0621c5f226c1aff3980a3383a970e94de Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 04:19:40 +0000 Subject: [PATCH 07/56] fix: validate cluster connection params before submit - Slurm: fail fast if --slurm-rest-url or --slurm-jwt-token missing - K8s: warn to stderr when no explicit kubeconfig/context provided - --dry-run skips validation (no cluster needed for manifest preview) --- scripts/submit_profile.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 41d6f54..9654e40 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -253,6 +253,11 @@ def _build_scheduler(args: argparse.Namespace): def main(argv: list[str] | None = None) -> None: args = _parse_args(argv) spec = _build_spec(args) + + # Validate connection params before building the scheduler + if not 
args.dry_run: + _validate_connection(args) + scheduler = _build_scheduler(args) if args.dry_run: @@ -262,5 +267,31 @@ def main(argv: list[str] | None = None) -> None: print(result) +def _validate_connection(args: argparse.Namespace) -> None: + """Fail fast if required cluster connection params are missing.""" + if args.scheduler == "k8s": + # kubernetes client can auto-discover from ~/.kube/config or + # in-cluster env, but warn if nothing explicit is given + if not args.k8s_kubeconfig and not args.k8s_context: + print( + "Note: no --k8s-kubeconfig or --k8s-context specified. " + "Will try ~/.kube/config and in-cluster auto-discovery.", + file=sys.stderr, + ) + elif args.scheduler == "slurm": + missing = [] + if not args.slurm_rest_url: + missing.append("--slurm-rest-url") + if not args.slurm_jwt_token: + missing.append("--slurm-jwt-token") + if missing: + sys.exit( + f"Error: {', '.join(missing)} required for Slurm submission.\n" + f" --slurm-rest-url: slurmrestd endpoint " + f"(e.g. https://slurm.example.com:6820)\n" + f" --slurm-jwt-token: generate via 'scontrol token lifespan=3600'" + ) + + if __name__ == "__main__": main() From 87a2c332c1d73c5460efbb83cb513280a4bb28bc Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 04:21:22 +0000 Subject: [PATCH 08/56] feat: support env vars for cluster connection params Connection params now read from environment variables as defaults, so you don't have to pass them every invocation: K8s: KUBECONFIG -> --k8s-kubeconfig FLOWSIM_K8S_NAMESPACE -> --k8s-namespace FLOWSIM_K8S_CONTEXT -> --k8s-context Slurm: FLOWSIM_SLURM_REST_URL -> --slurm-rest-url FLOWSIM_SLURM_JWT_TOKEN -> --slurm-jwt-token FLOWSIM_SLURM_PARTITION -> --slurm-partition FLOWSIM_SLURM_TIME -> --slurm-time FLOWSIM_SLURM_API_VERSION -> --slurm-api-version CLI flags override env vars. Env var names shown in --help. 
--- scripts/submit_profile.py | 41 +++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 9654e40..0d95efc 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -39,6 +39,7 @@ from __future__ import annotations import argparse +import os import sys from schedulers.base import ProfileJobSpec @@ -102,16 +103,20 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: # -- Kubernetes-specific -- k8s = p.add_argument_group("kubernetes options") - k8s.add_argument("--k8s-namespace", default="default") + k8s.add_argument( + "--k8s-namespace", + default=os.environ.get("FLOWSIM_K8S_NAMESPACE", "default"), + help="K8s namespace (env: FLOWSIM_K8S_NAMESPACE)", + ) k8s.add_argument( "--k8s-kubeconfig", - default="", - help="Path to kubeconfig file (empty = default lookup)", + default=os.environ.get("KUBECONFIG", ""), + help="Path to kubeconfig file (env: KUBECONFIG)", ) k8s.add_argument( "--k8s-context", - default="", - help="kubeconfig context to use", + default=os.environ.get("FLOWSIM_K8S_CONTEXT", ""), + help="kubeconfig context to use (env: FLOWSIM_K8S_CONTEXT)", ) k8s.add_argument( "--k8s-pvc", @@ -135,24 +140,30 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: # -- Slurm-specific -- slurm = p.add_argument_group("slurm options") - slurm.add_argument("--slurm-partition", default="gpu") - slurm.add_argument("--slurm-time", default="02:00:00") + slurm.add_argument( + "--slurm-partition", + default=os.environ.get("FLOWSIM_SLURM_PARTITION", "gpu"), + help="Slurm partition (env: FLOWSIM_SLURM_PARTITION)", + ) + slurm.add_argument( + "--slurm-time", + default=os.environ.get("FLOWSIM_SLURM_TIME", "02:00:00"), + help="Wall time limit (env: FLOWSIM_SLURM_TIME)", + ) slurm.add_argument( "--slurm-rest-url", - default="", - help="slurmrestd base URL (e.g. https://slurm.example.com:6820). 
" - "Required for --submit.", + default=os.environ.get("FLOWSIM_SLURM_REST_URL", ""), + help="slurmrestd base URL (env: FLOWSIM_SLURM_REST_URL)", ) slurm.add_argument( "--slurm-jwt-token", - default="", - help="JWT token for slurmrestd auth. " - "Generate via: scontrol token lifespan=3600", + default=os.environ.get("FLOWSIM_SLURM_JWT_TOKEN", ""), + help="JWT token for slurmrestd (env: FLOWSIM_SLURM_JWT_TOKEN)", ) slurm.add_argument( "--slurm-api-version", - default="v0.0.40", - help="slurmrestd OpenAPI version (default: v0.0.40)", + default=os.environ.get("FLOWSIM_SLURM_API_VERSION", "v0.0.40"), + help="slurmrestd API version (env: FLOWSIM_SLURM_API_VERSION)", ) slurm.add_argument( "--slurm-no-verify-ssl", From 63ab491b0ed5b25c3101e982fd00722993ad4aea Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 04:30:43 +0000 Subject: [PATCH 09/56] feat: config-first approach with flowsim init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No more built-in defaults for cluster connection params. Users must configure before submitting: flowsim init # copies templates to ~/.flowsim/ vim ~/.flowsim/k8s.yaml # fill in kubeconfig, namespace, etc. vim ~/.flowsim/slurm.yaml # fill in rest_url, partition, etc. flowsim submit ... 
# works Changes: - Add 'flowsim init' subcommand (copies templates, --force to overwrite) - Split config into ~/.flowsim/k8s.yaml and ~/.flowsim/slurm.yaml - Templates have empty REQUIRED fields — submit fails if unfilled - Config loader: schedulers/config.py with per-scheduler load functions - Priority: CLI flag > env var > config file (no silent fallbacks) - Slurm jwt_token_cmd: execute a command to get token at submit time - --dry-run skips all validation (no config needed for preview) --- schedulers/config.py | 108 ++++++++++++++++++++++++++++++++ schedulers/templates/k8s.yaml | 27 ++++++++ schedulers/templates/slurm.yaml | 30 +++++++++ scripts/cli.py | 56 +++++++++++++++-- scripts/submit_profile.py | 108 ++++++++++++++++++++++---------- 5 files changed, 292 insertions(+), 37 deletions(-) create mode 100644 schedulers/config.py create mode 100644 schedulers/templates/k8s.yaml create mode 100644 schedulers/templates/slurm.yaml diff --git a/schedulers/config.py b/schedulers/config.py new file mode 100644 index 0000000..011bf42 --- /dev/null +++ b/schedulers/config.py @@ -0,0 +1,108 @@ +"""Load FlowSim scheduler config from per-scheduler YAML files. + +Config file lookup (per scheduler): + +K8s: + 1. ``FLOWSIM_K8S_CONFIG`` env var + 2. ``~/.flowsim/k8s.yaml`` + +Slurm: + 1. ``FLOWSIM_SLURM_CONFIG`` env var + 2. ``~/.flowsim/slurm.yaml`` + +Priority (highest → lowest): + CLI flag > env var > config file > built-in default + +Template files are in ``schedulers/templates/k8s.yaml`` and +``schedulers/templates/slurm.yaml``. Copy to ``~/.flowsim/`` and edit. + +For Slurm, use ``jwt_token_cmd`` instead of ``jwt_token`` to avoid +storing secrets in plaintext. The command is executed at submit time +and its stdout is used as the token. 
+""" + +from __future__ import annotations + +import os +import shlex +import subprocess +from pathlib import Path + +# Optional: try PyYAML, fall back to JSON +try: + import yaml as _yaml + + def _load_yaml(path: Path) -> dict: + with open(path) as f: + return _yaml.safe_load(f) or {} + +except ImportError: + import json as _json + + def _load_yaml(path: Path) -> dict: # type: ignore[misc] + """Fallback: accept JSON (valid YAML 1.2 subset).""" + with open(path) as f: + return _json.load(f) + + +_CONFIG_DIR = Path.home() / ".flowsim" + + +def _resolve_path(env_var: str, filename: str) -> Path | None: + """Return the config file path, or None if it doesn't exist.""" + env = os.environ.get(env_var) + if env: + p = Path(env) + return p if p.is_file() else None + default = _CONFIG_DIR / filename + return default if default.is_file() else None + + +def load_k8s_config() -> dict: + """Load ``~/.flowsim/k8s.yaml`` (or ``FLOWSIM_K8S_CONFIG``).""" + path = _resolve_path("FLOWSIM_K8S_CONFIG", "k8s.yaml") + if path is None: + return {} + try: + return _load_yaml(path) + except Exception: + return {} + + +def load_slurm_config() -> dict: + """Load ``~/.flowsim/slurm.yaml`` (or ``FLOWSIM_SLURM_CONFIG``).""" + path = _resolve_path("FLOWSIM_SLURM_CONFIG", "slurm.yaml") + if path is None: + return {} + try: + return _load_yaml(path) + except Exception: + return {} + + +def resolve_jwt_token(slurm_cfg: dict) -> str: + """Get the JWT token from config, executing jwt_token_cmd if needed.""" + token = slurm_cfg.get("jwt_token", "") + if token: + return str(token) + + cmd = slurm_cfg.get("jwt_token_cmd", "") + if cmd: + result = subprocess.run( + shlex.split(cmd), + capture_output=True, + text=True, + timeout=30, + ) + if result.returncode == 0: + return result.stdout.strip() + + return "" + + +def cfg_get(cfg: dict, key: str, fallback: str = "") -> str: + """Get a value from a flat config dict, or fallback.""" + val = cfg.get(key) + if val is not None: + return str(val) + return 
fallback diff --git a/schedulers/templates/k8s.yaml b/schedulers/templates/k8s.yaml new file mode 100644 index 0000000..bac3a77 --- /dev/null +++ b/schedulers/templates/k8s.yaml @@ -0,0 +1,27 @@ +# FlowSim Kubernetes scheduler config +# +# Created by: flowsim init +# Location: ~/.flowsim/k8s.yaml +# +# Fill in the values below, then submit with: +# flowsim submit --scheduler k8s --collect perf --model-path ... +# +# CLI flags and env vars can override individual values. + +# REQUIRED — path to your kubeconfig file +kubeconfig: "" # e.g. /home/me/.kube/prod.kubeconfig + +# REQUIRED — which context and namespace to use +context: "" # e.g. prod-cluster (empty = current-context) +namespace: "" # e.g. ml-team + +# Output storage (pick one or leave both empty for emptyDir) +pvc: "" # PVC name for trace output +host_output_dir: "" # hostPath alternative to PVC + +# Optional +service_account: "" +shm_size: "16Gi" +# node_selector: +# gpu: a100 +# tier: high diff --git a/schedulers/templates/slurm.yaml b/schedulers/templates/slurm.yaml new file mode 100644 index 0000000..0910f4a --- /dev/null +++ b/schedulers/templates/slurm.yaml @@ -0,0 +1,30 @@ +# FlowSim Slurm scheduler config +# +# Created by: flowsim init +# Location: ~/.flowsim/slurm.yaml +# +# Fill in the values below, then submit with: +# flowsim submit --scheduler slurm --collect perf --model-path ... +# +# CLI flags and env vars can override individual values. + +# REQUIRED — slurmrestd endpoint +rest_url: "" # e.g. https://slurm.corp.com:6820 + +# REQUIRED — authentication (pick one) +# jwt_token: "" # not recommended — stored in plaintext +jwt_token_cmd: "" # e.g. "scontrol token lifespan=3600" + +# REQUIRED — cluster settings +partition: "" # e.g. gpu-h100 +account: "" # e.g. 
my-project + +# Optional +api_version: "v0.0.40" +time: "02:00:00" +constraint: "" +container_runtime: "none" # docker | enroot | none +container_mounts: "" +# modules: +# - cuda/12.6 +# - anaconda3 diff --git a/scripts/cli.py b/scripts/cli.py index 5ea74bd..dd2d825 100644 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -2,14 +2,54 @@ Usage:: - flowsim submit --scheduler k8s --collect perf --model-path ... --dry-run - flowsim submit --scheduler slurm --collect perf --model-path ... --submit + flowsim init # set up ~/.flowsim/ config files + flowsim submit --scheduler k8s ... # submit a profiling job + flowsim submit ... --dry-run # preview manifest without submitting """ from __future__ import annotations import argparse +import shutil import sys +from pathlib import Path + + +_TEMPLATE_DIR = Path(__file__).resolve().parent.parent / "schedulers" / "templates" +_CONFIG_DIR = Path.home() / ".flowsim" + + +def _cmd_init(argv: list[str]) -> int: + """Copy config templates to ~/.flowsim/.""" + parser = argparse.ArgumentParser( + prog="flowsim init", + description="Initialize ~/.flowsim/ with scheduler config templates.", + ) + parser.add_argument( + "--force", + action="store_true", + help="Overwrite existing config files", + ) + args = parser.parse_args(argv) + + _CONFIG_DIR.mkdir(parents=True, exist_ok=True) + + templates = list(_TEMPLATE_DIR.glob("*.yaml")) + if not templates: + print(f"Error: no templates found in {_TEMPLATE_DIR}", file=sys.stderr) + return 1 + + for src in templates: + dst = _CONFIG_DIR / src.name + if dst.exists() and not args.force: + print(f" skip {dst} (already exists, use --force to overwrite)") + else: + shutil.copy2(src, dst) + print(f" wrote {dst}") + + print(f"\nEdit the files in {_CONFIG_DIR}/ to configure your cluster,") + print("then run: flowsim submit --scheduler ...") + return 0 def main(argv: list[str] | None = None) -> int: @@ -20,16 +60,22 @@ def main(argv: list[str] | None = None) -> int: sub = parser.add_subparsers(dest="command") 
sub.required = True - # ---- submit ---- + sub.add_parser( + "init", + help="Initialize ~/.flowsim/ with config templates", + add_help=False, + ) sub.add_parser( "submit", help="Submit a profiling job to K8s or Slurm", - add_help=False, # submit_profile has its own --help + add_help=False, ) - # Parse only the subcommand, pass the rest through args, remaining = parser.parse_known_args(argv) + if args.command == "init": + return _cmd_init(remaining) + if args.command == "submit": from scripts.submit_profile import main as submit_main diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 0d95efc..5a41349 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -43,11 +43,21 @@ import sys from schedulers.base import ProfileJobSpec +from schedulers.config import cfg_get, load_k8s_config, load_slurm_config, resolve_jwt_token from schedulers.k8s import K8sScheduler from schedulers.slurm import SlurmScheduler +def _d(env_var: str, cfg: dict, key: str, fallback: str = "") -> str: + """Resolve default: env var > config file > fallback.""" + return os.environ.get(env_var, "") or cfg_get(cfg, key, fallback) + + def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: + # Load per-scheduler config files for defaults + k8s_cfg = load_k8s_config() + slurm_cfg = load_slurm_config() + p = argparse.ArgumentParser( description="Submit FlowSim profiling jobs to K8s or Slurm.", formatter_class=argparse.RawDescriptionHelpFormatter, @@ -102,30 +112,30 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: infra.add_argument("--job-name", default="") # -- Kubernetes-specific -- - k8s = p.add_argument_group("kubernetes options") + k8s = p.add_argument_group("kubernetes options (config: ~/.flowsim/k8s.yaml)") k8s.add_argument( "--k8s-namespace", - default=os.environ.get("FLOWSIM_K8S_NAMESPACE", "default"), + default=_d("FLOWSIM_K8S_NAMESPACE", k8s_cfg, "namespace", "default"), help="K8s namespace (env: 
FLOWSIM_K8S_NAMESPACE)", ) k8s.add_argument( "--k8s-kubeconfig", - default=os.environ.get("KUBECONFIG", ""), + default=_d("KUBECONFIG", k8s_cfg, "kubeconfig", ""), help="Path to kubeconfig file (env: KUBECONFIG)", ) k8s.add_argument( "--k8s-context", - default=os.environ.get("FLOWSIM_K8S_CONTEXT", ""), - help="kubeconfig context to use (env: FLOWSIM_K8S_CONTEXT)", + default=_d("FLOWSIM_K8S_CONTEXT", k8s_cfg, "context", ""), + help="kubeconfig context (env: FLOWSIM_K8S_CONTEXT)", ) k8s.add_argument( "--k8s-pvc", - default="", + default=cfg_get(k8s_cfg, "pvc", ""), help="PVC name for output volume (omit for emptyDir)", ) k8s.add_argument( "--k8s-host-output-dir", - default="", + default=cfg_get(k8s_cfg, "host_output_dir", ""), help="hostPath for output (used when --k8s-pvc is empty)", ) k8s.add_argument( @@ -135,34 +145,40 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: metavar="KEY=VALUE", help="Node selector labels (repeatable)", ) - k8s.add_argument("--k8s-service-account", default="") - k8s.add_argument("--k8s-shm-size", default="16Gi") + k8s.add_argument( + "--k8s-service-account", + default=cfg_get(k8s_cfg, "service_account", ""), + ) + k8s.add_argument( + "--k8s-shm-size", + default=cfg_get(k8s_cfg, "shm_size", "16Gi"), + ) # -- Slurm-specific -- - slurm = p.add_argument_group("slurm options") + slurm = p.add_argument_group("slurm options (config: ~/.flowsim/slurm.yaml)") slurm.add_argument( "--slurm-partition", - default=os.environ.get("FLOWSIM_SLURM_PARTITION", "gpu"), + default=_d("FLOWSIM_SLURM_PARTITION", slurm_cfg, "partition", ""), help="Slurm partition (env: FLOWSIM_SLURM_PARTITION)", ) slurm.add_argument( "--slurm-time", - default=os.environ.get("FLOWSIM_SLURM_TIME", "02:00:00"), + default=_d("FLOWSIM_SLURM_TIME", slurm_cfg, "time", "02:00:00"), help="Wall time limit (env: FLOWSIM_SLURM_TIME)", ) slurm.add_argument( "--slurm-rest-url", - default=os.environ.get("FLOWSIM_SLURM_REST_URL", ""), + 
default=_d("FLOWSIM_SLURM_REST_URL", slurm_cfg, "rest_url", ""), help="slurmrestd base URL (env: FLOWSIM_SLURM_REST_URL)", ) slurm.add_argument( "--slurm-jwt-token", - default=os.environ.get("FLOWSIM_SLURM_JWT_TOKEN", ""), + default=_d("FLOWSIM_SLURM_JWT_TOKEN", slurm_cfg, "jwt_token", ""), help="JWT token for slurmrestd (env: FLOWSIM_SLURM_JWT_TOKEN)", ) slurm.add_argument( "--slurm-api-version", - default=os.environ.get("FLOWSIM_SLURM_API_VERSION", "v0.0.40"), + default=_d("FLOWSIM_SLURM_API_VERSION", slurm_cfg, "api_version", "v0.0.40"), help="slurmrestd API version (env: FLOWSIM_SLURM_API_VERSION)", ) slurm.add_argument( @@ -170,19 +186,30 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: action="store_true", help="Skip TLS certificate verification for slurmrestd", ) - slurm.add_argument("--slurm-account", default="") - slurm.add_argument("--slurm-constraint", default="") + slurm.add_argument( + "--slurm-account", + default=cfg_get(slurm_cfg, "account", ""), + ) + slurm.add_argument( + "--slurm-constraint", + default=cfg_get(slurm_cfg, "constraint", ""), + ) slurm.add_argument( "--slurm-container-runtime", choices=["docker", "enroot", "none"], - default="none", + default=cfg_get(slurm_cfg, "container_runtime", "none"), + ) + slurm.add_argument( + "--slurm-container-mounts", + default=cfg_get(slurm_cfg, "container_mounts", ""), ) - slurm.add_argument("--slurm-container-mounts", default="") + # Modules from config (list) + CLI (append) + cfg_modules = slurm_cfg.get("modules") if isinstance(slurm_cfg.get("modules"), list) else [] slurm.add_argument( "--slurm-module", action="append", - default=[], - help="Modules to load (repeatable)", + default=[str(m) for m in cfg_modules], + help="Modules to load (repeatable, merged with config)", ) slurm.add_argument( "--slurm-extra-sbatch", @@ -263,12 +290,19 @@ def _build_scheduler(args: argparse.Namespace): def main(argv: list[str] | None = None) -> None: args = _parse_args(argv) - spec = 
_build_spec(args) - # Validate connection params before building the scheduler + # Resolve Slurm JWT token from jwt_token_cmd in config if needed + if args.scheduler == "slurm" and not args.slurm_jwt_token: + slurm_cfg = load_slurm_config() + token = resolve_jwt_token(slurm_cfg) + if token: + args.slurm_jwt_token = token + + # Validate required connection params before submit if not args.dry_run: _validate_connection(args) + spec = _build_spec(args) scheduler = _build_scheduler(args) if args.dry_run: @@ -278,29 +312,39 @@ def main(argv: list[str] | None = None) -> None: print(result) +_INIT_HINT = "Run 'flowsim init' to create config files." + + def _validate_connection(args: argparse.Namespace) -> None: """Fail fast if required cluster connection params are missing.""" if args.scheduler == "k8s": - # kubernetes client can auto-discover from ~/.kube/config or - # in-cluster env, but warn if nothing explicit is given + if not args.k8s_namespace: + sys.exit( + "Error: K8s namespace not set.\n" + "Set it in ~/.flowsim/k8s.yaml, FLOWSIM_K8S_NAMESPACE env var,\n" + f"or --k8s-namespace flag. {_INIT_HINT}" + ) + # kubeconfig is optional (in-cluster auto-discovery), but warn if not args.k8s_kubeconfig and not args.k8s_context: print( - "Note: no --k8s-kubeconfig or --k8s-context specified. " + "Note: no kubeconfig or context specified. " "Will try ~/.kube/config and in-cluster auto-discovery.", file=sys.stderr, ) elif args.scheduler == "slurm": missing = [] if not args.slurm_rest_url: - missing.append("--slurm-rest-url") + missing.append("rest_url (--slurm-rest-url)") if not args.slurm_jwt_token: - missing.append("--slurm-jwt-token") + missing.append("jwt_token/jwt_token_cmd (--slurm-jwt-token)") + if not args.slurm_partition: + missing.append("partition (--slurm-partition)") if missing: sys.exit( - f"Error: {', '.join(missing)} required for Slurm submission.\n" - f" --slurm-rest-url: slurmrestd endpoint " - f"(e.g. 
https://slurm.example.com:6820)\n" - f" --slurm-jwt-token: generate via 'scontrol token lifespan=3600'" + "Error: missing required Slurm config:\n" + + "\n".join(f" - {m}" for m in missing) + + f"\n\nSet them in ~/.flowsim/slurm.yaml or via CLI flags.\n" + + _INIT_HINT ) From 7116fef3de3586e258949bf0b33e0e931ca27fa1 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 04:36:57 +0000 Subject: [PATCH 10/56] refactor: flowsim init takes CLI args instead of interactive prompts - flowsim init k8s --kubeconfig ... --namespace ... - flowsim init slurm --rest-url ... --partition ... --account ... - Required fields enforced by argparse, --help shows everything - --force to overwrite existing config - Demote --dry-run to [debug] in submit help text - Remove template-copy approach, use _save_yaml() directly --- schedulers/config.py | 14 ++++ scripts/cli.py | 136 ++++++++++++++++++++++++++++++-------- scripts/submit_profile.py | 2 +- 3 files changed, 124 insertions(+), 28 deletions(-) diff --git a/schedulers/config.py b/schedulers/config.py index 011bf42..4f35494 100644 --- a/schedulers/config.py +++ b/schedulers/config.py @@ -48,6 +48,20 @@ def _load_yaml(path: Path) -> dict: # type: ignore[misc] _CONFIG_DIR = Path.home() / ".flowsim" +def _save_yaml(path: Path, data: dict) -> None: + """Write a dict to a YAML file (uses PyYAML if available, else JSON).""" + path.parent.mkdir(parents=True, exist_ok=True) + try: + import yaml as _y + with open(path, "w") as f: + _y.safe_dump(data, f, default_flow_style=False, sort_keys=False) + except ImportError: + import json as _j + with open(path, "w") as f: + _j.dump(data, f, indent=2, ensure_ascii=False) + f.write("\n") + + def _resolve_path(env_var: str, filename: str) -> Path | None: """Return the config file path, or None if it doesn't exist.""" env = os.environ.get(env_var) diff --git a/scripts/cli.py b/scripts/cli.py index dd2d825..5cd370a 100644 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -2,53 +2,135 @@ Usage:: - 
flowsim init # set up ~/.flowsim/ config files - flowsim submit --scheduler k8s ... # submit a profiling job - flowsim submit ... --dry-run # preview manifest without submitting + flowsim init k8s --kubeconfig ~/.kube/config --namespace ml-team ... + flowsim init slurm --rest-url https://slurm:6820 --partition gpu ... + flowsim submit --scheduler k8s --collect perf --model-path ... + flowsim submit ... --dry-run # debug: preview manifest """ from __future__ import annotations import argparse -import shutil import sys from pathlib import Path -_TEMPLATE_DIR = Path(__file__).resolve().parent.parent / "schedulers" / "templates" _CONFIG_DIR = Path.home() / ".flowsim" +def _init_k8s_parser(sub: argparse._SubParsersAction) -> None: + p = sub.add_parser("k8s", help="Configure Kubernetes scheduler") + p.add_argument("--kubeconfig", required=True, + help="Path to kubeconfig file (REQUIRED)") + p.add_argument("--context", default="", + help="Kubeconfig context (empty = current-context)") + p.add_argument("--namespace", required=True, + help="Kubernetes namespace (REQUIRED)") + p.add_argument("--pvc", default="", + help="PVC name for trace output") + p.add_argument("--host-output-dir", default="", + help="hostPath alternative to PVC") + p.add_argument("--service-account", default="", + help="Service account for the job pod") + p.add_argument("--shm-size", default="16Gi", + help="Shared memory size (default: 16Gi)") + p.add_argument("--force", action="store_true", + help="Overwrite existing config file") + + +def _init_slurm_parser(sub: argparse._SubParsersAction) -> None: + p = sub.add_parser("slurm", help="Configure Slurm scheduler") + p.add_argument("--rest-url", required=True, + help="slurmrestd endpoint URL (REQUIRED)") + p.add_argument("--partition", required=True, + help="Slurm partition (REQUIRED)") + p.add_argument("--account", required=True, + help="Slurm account (REQUIRED)") + p.add_argument("--jwt-token-cmd", default="", + help='Command to get JWT token, e.g. 
"scontrol token lifespan=3600"') + p.add_argument("--jwt-token", default="", + help="Static JWT token (not recommended)") + p.add_argument("--api-version", default="v0.0.40", + help="slurmrestd API version (default: v0.0.40)") + p.add_argument("--time", default="02:00:00", + help="Job time limit (default: 02:00:00)") + p.add_argument("--constraint", default="", + help="Node constraint") + p.add_argument("--container-runtime", default="none", + choices=["docker", "enroot", "none"], + help="Container runtime (default: none)") + p.add_argument("--container-mounts", default="", + help="Container mount spec") + p.add_argument("--force", action="store_true", + help="Overwrite existing config file") + + def _cmd_init(argv: list[str]) -> int: - """Copy config templates to ~/.flowsim/.""" + """Save scheduler config to ~/.flowsim/ from CLI args.""" + from schedulers.config import _save_yaml + parser = argparse.ArgumentParser( prog="flowsim init", - description="Initialize ~/.flowsim/ with scheduler config templates.", - ) - parser.add_argument( - "--force", - action="store_true", - help="Overwrite existing config files", + description=( + "Configure a scheduler and save to ~/.flowsim/.\n\n" + "Examples:\n" + " flowsim init k8s --kubeconfig ~/.kube/config --namespace ml-team\n" + " flowsim init slurm --rest-url https://slurm:6820 " + "--partition gpu --account proj" + ), + formatter_class=argparse.RawDescriptionHelpFormatter, ) - args = parser.parse_args(argv) + sub = parser.add_subparsers(dest="scheduler") + sub.required = True + _init_k8s_parser(sub) + _init_slurm_parser(sub) - _CONFIG_DIR.mkdir(parents=True, exist_ok=True) + args = parser.parse_args(argv) - templates = list(_TEMPLATE_DIR.glob("*.yaml")) - if not templates: - print(f"Error: no templates found in {_TEMPLATE_DIR}", file=sys.stderr) + if args.scheduler == "k8s": + kube_path = Path(args.kubeconfig).expanduser() + if not kube_path.is_file(): + print(f"Error: kubeconfig not found: {kube_path}", file=sys.stderr) + 
return 1 + cfg = { + "kubeconfig": str(kube_path), + "context": args.context, + "namespace": args.namespace, + "pvc": args.pvc, + "host_output_dir": args.host_output_dir, + "service_account": args.service_account, + "shm_size": args.shm_size, + } + dst = _CONFIG_DIR / "k8s.yaml" + + elif args.scheduler == "slurm": + if not args.jwt_token_cmd and not args.jwt_token: + print("Error: provide --jwt-token-cmd or --jwt-token", file=sys.stderr) + return 1 + cfg = { + "rest_url": args.rest_url, + "jwt_token_cmd": args.jwt_token_cmd, + "jwt_token": args.jwt_token, + "partition": args.partition, + "account": args.account, + "api_version": args.api_version, + "time": args.time, + "constraint": args.constraint, + "container_runtime": args.container_runtime, + "container_mounts": args.container_mounts, + } + dst = _CONFIG_DIR / "slurm.yaml" + else: + parser.print_help() return 1 - for src in templates: - dst = _CONFIG_DIR / src.name - if dst.exists() and not args.force: - print(f" skip {dst} (already exists, use --force to overwrite)") - else: - shutil.copy2(src, dst) - print(f" wrote {dst}") + if dst.exists() and not args.force: + print(f"Error: {dst} already exists (use --force to overwrite)", + file=sys.stderr) + return 1 - print(f"\nEdit the files in {_CONFIG_DIR}/ to configure your cluster,") - print("then run: flowsim submit --scheduler ...") + _save_yaml(dst, cfg) + print(f"Saved {dst}") return 0 @@ -62,7 +144,7 @@ def main(argv: list[str] | None = None) -> int: sub.add_parser( "init", - help="Initialize ~/.flowsim/ with config templates", + help="Configure a scheduler (k8s/slurm) and save to ~/.flowsim/", add_help=False, ) sub.add_parser( diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 5a41349..8400bec 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -223,7 +223,7 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: p.add_argument( "--dry-run", action="store_true", - help="Only print the rendered 
manifest; do not submit", + help="[debug] Print rendered manifest without submitting", ) return p.parse_args(argv) From 8987c389f446754fe42dce116c4843ec1b1d54e6 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 04:50:20 +0000 Subject: [PATCH 11/56] feat: PD disaggregation support + multi-node Docker test infra Docker test environments: - kind-multi-node.yaml: 1 control-plane + 2 workers (GPU 0, GPU 1) - slurm-compose.yaml: slurmctld + 2 slurmd (GPU 0, GPU 1) + slurmrestd - slurm-node.dockerfile + slurm.conf: Slurm 23.11 with JWT auth PD disaggregation: - ProfileJobSpec: disagg_mode, disagg_transfer_backend, disagg_bootstrap_port, disagg_prefill_pp, disagg_ib_device - as_prefill() / as_decode() helpers for creating PD pairs - BaseScheduler: render_pd_pair() and submit_pd_pair() - CLI: --pd flag submits prefill + decode job pair - --disagg-transfer-backend (mooncake/nixl), --disagg-bootstrap-port, etc. Bugfix: - resolve_jwt_token: catch FileNotFoundError when jwt_token_cmd binary missing --- dockerfiles/kind-multi-node.yaml | 64 +++++++++++++ dockerfiles/slurm-compose.yaml | 152 ++++++++++++++++++++++++++++++ dockerfiles/slurm-node.dockerfile | 52 ++++++++++ dockerfiles/slurm.conf | 48 ++++++++++ schedulers/base.py | 42 ++++++++- schedulers/config.py | 19 ++-- scripts/submit_profile.py | 47 ++++++++- 7 files changed, 413 insertions(+), 11 deletions(-) create mode 100644 dockerfiles/kind-multi-node.yaml create mode 100644 dockerfiles/slurm-compose.yaml create mode 100644 dockerfiles/slurm-node.dockerfile create mode 100644 dockerfiles/slurm.conf diff --git a/dockerfiles/kind-multi-node.yaml b/dockerfiles/kind-multi-node.yaml new file mode 100644 index 0000000..c2208c4 --- /dev/null +++ b/dockerfiles/kind-multi-node.yaml @@ -0,0 +1,64 @@ +# kind cluster config — 1 control-plane + 2 GPU worker nodes +# +# Each worker gets one GPU via NVIDIA device plugin. 
+# Requires: kind, kubectl, nvidia-container-toolkit +# +# Usage: +# # Install kind (once) +# curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.27.0/kind-linux-amd64 +# chmod +x ./kind && sudo mv ./kind /usr/local/bin/ +# +# # Create cluster +# kind create cluster --name flowsim --config dockerfiles/kind-multi-node.yaml +# +# # Install NVIDIA device plugin (exposes GPUs to K8s) +# kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.0/deployments/static/nvidia-device-plugin.yml +# +# # Verify +# kubectl get nodes +# kubectl describe node flowsim-worker | grep nvidia.com/gpu +# kubectl describe node flowsim-worker2 | grep nvidia.com/gpu +# +# # Init FlowSim +# flowsim init k8s --kubeconfig ~/.kube/config \ +# --context kind-flowsim --namespace default --force +# +# # Submit a profiling job +# flowsim submit --scheduler k8s --collect perf \ +# --model-path /models/Qwen-7B --gpus 1 +# +# # Teardown +# kind delete cluster --name flowsim + +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 + +nodes: + - role: control-plane + + - role: worker + extraMounts: + # Pass GPU 0 into this node + - hostPath: /dev/nvidia0 + containerPath: /dev/nvidia0 + - hostPath: /dev/nvidiactl + containerPath: /dev/nvidiactl + - hostPath: /dev/nvidia-uvm + containerPath: /dev/nvidia-uvm + # Mount model weights (adjust to your path) + - hostPath: /home/administrator/zhangt + containerPath: /workspace + readOnly: true + + - role: worker + extraMounts: + # Pass GPU 1 into this node + - hostPath: /dev/nvidia1 + containerPath: /dev/nvidia1 + - hostPath: /dev/nvidiactl + containerPath: /dev/nvidiactl + - hostPath: /dev/nvidia-uvm + containerPath: /dev/nvidia-uvm + - hostPath: /home/administrator/zhangt + containerPath: /workspace + readOnly: true diff --git a/dockerfiles/slurm-compose.yaml b/dockerfiles/slurm-compose.yaml new file mode 100644 index 0000000..29f694d --- /dev/null +++ b/dockerfiles/slurm-compose.yaml @@ -0,0 +1,152 @@ +# Slurm test cluster — slurmctld + 2 
compute nodes (GPU 0, GPU 1) + slurmrestd +# +# Usage: +# cd dockerfiles/ +# docker compose -f slurm-compose.yaml up -d +# +# # Wait for cluster to be ready (~30s) +# docker exec slurmctld sinfo +# +# # Get JWT token for REST API +# docker exec slurmctld scontrol token lifespan=3600 +# +# # Init FlowSim +# flowsim init slurm --rest-url http://localhost:6820 \ +# --partition normal --account default \ +# --jwt-token-cmd "docker exec slurmctld scontrol token lifespan=3600" \ +# --force +# +# # Submit a job +# flowsim submit --scheduler slurm --collect perf \ +# --model-path /models/Qwen-7B --gpus 1 +# +# # Teardown +# docker compose -f slurm-compose.yaml down -v + +x-slurm-base: &slurm-base + build: + context: . + dockerfile: slurm-node.dockerfile + volumes: + - slurm-etc:/etc/slurm + - munge-socket:/run/munge + # Share workspace for model weights / traces + - /home/administrator/zhangt:/workspace:ro + networks: + - slurm-net + +services: + # ---- Munge (shared auth daemon) ---- + munge: + <<: *slurm-base + container_name: munge + hostname: munge + command: > + bash -c " + if [ ! 
-f /etc/munge/munge.key ]; then + mungekey --create --force + fi + chown munge:munge /etc/munge/munge.key + chmod 400 /etc/munge/munge.key + gosu munge munged --foreground + " + volumes: + - munge-key:/etc/munge + - munge-socket:/run/munge + + # ---- Controller ---- + slurmctld: + <<: *slurm-base + container_name: slurmctld + hostname: slurmctld + command: > + bash -c " + until [ -S /run/munge/munge.socket.2 ]; do sleep 0.5; done + slurmctld -D -vvv + " + depends_on: + - munge + volumes: + - slurm-etc:/etc/slurm + - munge-key:/etc/munge:ro + - munge-socket:/run/munge + - slurm-state:/var/spool/slurmctld + + # ---- Compute node 0 (GPU 0) ---- + slurmd-0: + <<: *slurm-base + container_name: slurmd-0 + hostname: slurmd-0 + command: > + bash -c " + until [ -S /run/munge/munge.socket.2 ]; do sleep 0.5; done + slurmd -D -vvv + " + depends_on: + - slurmctld + volumes: + - slurm-etc:/etc/slurm:ro + - munge-key:/etc/munge:ro + - munge-socket:/run/munge + - /home/administrator/zhangt:/workspace:ro + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ["0"] + capabilities: [gpu] + + # ---- Compute node 1 (GPU 1) ---- + slurmd-1: + <<: *slurm-base + container_name: slurmd-1 + hostname: slurmd-1 + command: > + bash -c " + until [ -S /run/munge/munge.socket.2 ]; do sleep 0.5; done + slurmd -D -vvv + " + depends_on: + - slurmctld + volumes: + - slurm-etc:/etc/slurm:ro + - munge-key:/etc/munge:ro + - munge-socket:/run/munge + - /home/administrator/zhangt:/workspace:ro + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ["1"] + capabilities: [gpu] + + # ---- REST API ---- + slurmrestd: + <<: *slurm-base + container_name: slurmrestd + hostname: slurmrestd + command: > + bash -c " + until [ -S /run/munge/munge.socket.2 ]; do sleep 0.5; done + slurmrestd -a rest_auth/jwt 0.0.0.0:6820 -vvv + " + depends_on: + - slurmctld + ports: + - "6820:6820" + volumes: + - slurm-etc:/etc/slurm:ro + - munge-key:/etc/munge:ro + - 
munge-socket:/run/munge + +volumes: + slurm-etc: + slurm-state: + munge-key: + munge-socket: + +networks: + slurm-net: + driver: bridge diff --git a/dockerfiles/slurm-node.dockerfile b/dockerfiles/slurm-node.dockerfile new file mode 100644 index 0000000..397284d --- /dev/null +++ b/dockerfiles/slurm-node.dockerfile @@ -0,0 +1,52 @@ +# Slurm node image — controller, compute, and REST API +# +# Based on Ubuntu 22.04 with Slurm 23.11 + munge + JWT support. +# Used by slurm-compose.yaml. + +FROM ubuntu:22.04 + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + gosu \ + libhttp-parser-dev \ + libjson-c-dev \ + libjwt-dev \ + libmunge-dev \ + munge \ + wget \ + && rm -rf /var/lib/apt/lists/* + +# Install Slurm 23.11 from source (includes slurmrestd + JWT auth) +ARG SLURM_VERSION=23.11.10 +RUN cd /tmp && \ + wget -q https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2 && \ + tar xjf slurm-${SLURM_VERSION}.tar.bz2 && \ + cd slurm-${SLURM_VERSION} && \ + ./configure \ + --prefix=/usr \ + --sysconfdir=/etc/slurm \ + --with-jwt \ + --with-http-parser \ + --with-json \ + --enable-slurmrestd && \ + make -j"$(nproc)" && \ + make install && \ + rm -rf /tmp/slurm-* + +# Create required directories and users +RUN useradd -r -s /sbin/nologin slurm && \ + mkdir -p /etc/slurm /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \ + chown slurm:slurm /var/spool/slurmctld /var/spool/slurmd /var/log/slurm + +# Slurm config — 2 compute nodes, 1 GPU each +COPY slurm.conf /etc/slurm/slurm.conf + +# JWT key for REST API auth +RUN dd if=/dev/urandom bs=32 count=1 2>/dev/null | base64 > /etc/slurm/jwt_hs256.key && \ + chown slurm:slurm /etc/slurm/jwt_hs256.key && \ + chmod 0600 /etc/slurm/jwt_hs256.key + +CMD ["bash"] diff --git a/dockerfiles/slurm.conf b/dockerfiles/slurm.conf new file mode 100644 index 0000000..734509d --- /dev/null +++ b/dockerfiles/slurm.conf @@ -0,0 +1,48 @@ +# 
slurm.conf — minimal 2-node cluster for FlowSim testing +# +# Controller: slurmctld +# Compute: slurmd-0 (1 GPU), slurmd-1 (1 GPU) +# REST API: slurmrestd on port 6820 + +ClusterName=flowsim +SlurmctldHost=slurmctld + +# Auth +AuthType=auth/munge +AuthAltTypes=auth/jwt +AuthAltParameters=jwt_key=/etc/slurm/jwt_hs256.key + +# Paths +SlurmctldPidFile=/var/run/slurmctld.pid +SlurmdPidFile=/var/run/slurmd.pid +StateSaveLocation=/var/spool/slurmctld +SlurmdSpoolDir=/var/spool/slurmd +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdLogFile=/var/log/slurm/slurmd.log + +# Scheduling +SchedulerType=sched/backfill +SelectType=select/cons_tres +SelectTypeParameters=CR_Core_Memory + +# GPU support +GresTypes=gpu + +# Accounting (minimal) +AccountingStorageType=accounting_storage/none +JobAcctGatherType=jobacct_gather/none + +# Timeouts +SlurmctldTimeout=30 +SlurmdTimeout=30 +InactiveLimit=0 +MinJobAge=300 +KillWait=30 +Waittime=0 + +# Partitions +PartitionName=normal Nodes=slurmd-[0-1] Default=YES MaxTime=INFINITE State=UP + +# Node definitions — 1 GPU each +NodeName=slurmd-0 CPUs=8 RealMemory=32000 Gres=gpu:1 State=UNKNOWN +NodeName=slurmd-1 CPUs=8 RealMemory=32000 Gres=gpu:1 State=UNKNOWN diff --git a/schedulers/base.py b/schedulers/base.py index df40429..1427e8e 100644 --- a/schedulers/base.py +++ b/schedulers/base.py @@ -38,6 +38,13 @@ class ProfileJobSpec: log_dir: str = "/flowsim/tests/test-artifacts" job_name: str = "" + # -- PD disaggregation -- + disagg_mode: str = "" # "prefill", "decode", or "" (unified) + disagg_transfer_backend: str = "mooncake" # "mooncake" or "nixl" + disagg_bootstrap_port: int = 8998 + disagg_prefill_pp: int = 1 + disagg_ib_device: str = "" + # -- Extra server opts (appended verbatim) -- extra_server_opts: str = "" @@ -51,6 +58,14 @@ def build_server_opts(self) -> str: ] if self.dp > 1: parts.append(f"--dp {self.dp}") + if self.disagg_mode: + parts.append(f"--disaggregation-mode {self.disagg_mode}") + 
parts.append(f"--disaggregation-transfer-backend {self.disagg_transfer_backend}") + parts.append(f"--disaggregation-bootstrap-port {self.disagg_bootstrap_port}") + if self.disagg_prefill_pp > 1: + parts.append(f"--disaggregation-prefill-pp {self.disagg_prefill_pp}") + if self.disagg_ib_device: + parts.append(f"--disaggregation-ib-device {self.disagg_ib_device}") if self.extra_server_opts: parts.append(self.extra_server_opts) return " ".join(parts) @@ -110,7 +125,20 @@ def default_job_name(self) -> str: if self.job_name: return self.job_name model_short = self.model_path.split("/")[-1].lower().replace(".", "-") - return f"flowsim-{self.collect}-{model_short}-bs{self.bs}-il{self.input_len}" + name = f"flowsim-{self.collect}-{model_short}-bs{self.bs}-il{self.input_len}" + if self.disagg_mode: + name += f"-{self.disagg_mode}" + return name + + def as_prefill(self) -> "ProfileJobSpec": + """Return a copy configured as the prefill instance.""" + from dataclasses import replace + return replace(self, disagg_mode="prefill") + + def as_decode(self) -> "ProfileJobSpec": + """Return a copy configured as the decode instance.""" + from dataclasses import replace + return replace(self, disagg_mode="decode") class BaseScheduler(abc.ABC): @@ -127,3 +155,15 @@ def submit(self, spec: ProfileJobSpec) -> str: def dry_run(self, spec: ProfileJobSpec) -> str: """Render and return the manifest without submitting.""" return self.render(spec) + + def render_pd_pair(self, spec: ProfileJobSpec) -> str: + """Render both prefill and decode manifests for PD disaggregation.""" + prefill = self.render(spec.as_prefill()) + decode = self.render(spec.as_decode()) + return f"# === PREFILL INSTANCE ===\n{prefill}\n# === DECODE INSTANCE ===\n{decode}" + + def submit_pd_pair(self, spec: ProfileJobSpec) -> str: + """Submit both prefill and decode jobs.""" + r1 = self.submit(spec.as_prefill()) + r2 = self.submit(spec.as_decode()) + return f"[prefill] {r1}\n[decode] {r2}" diff --git a/schedulers/config.py 
b/schedulers/config.py index 4f35494..185c87f 100644 --- a/schedulers/config.py +++ b/schedulers/config.py @@ -102,14 +102,17 @@ def resolve_jwt_token(slurm_cfg: dict) -> str: cmd = slurm_cfg.get("jwt_token_cmd", "") if cmd: - result = subprocess.run( - shlex.split(cmd), - capture_output=True, - text=True, - timeout=30, - ) - if result.returncode == 0: - return result.stdout.strip() + try: + result = subprocess.run( + shlex.split(cmd), + capture_output=True, + text=True, + timeout=30, + ) + if result.returncode == 0: + return result.stdout.strip() + except (FileNotFoundError, OSError): + pass return "" diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 8400bec..3701892 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -226,6 +226,37 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: help="[debug] Print rendered manifest without submitting", ) + # -- PD disaggregation -- + pd = p.add_argument_group("PD disaggregation") + pd.add_argument( + "--pd", + action="store_true", + help="Submit a prefill + decode job pair (PD disaggregation)", + ) + pd.add_argument( + "--disagg-transfer-backend", + default="mooncake", + choices=["mooncake", "nixl"], + help="KV transfer backend (default: mooncake)", + ) + pd.add_argument( + "--disagg-bootstrap-port", + type=int, + default=8998, + help="Bootstrap port for PD coordination (default: 8998)", + ) + pd.add_argument( + "--disagg-prefill-pp", + type=int, + default=1, + help="Pipeline parallelism for prefill instance (default: 1)", + ) + pd.add_argument( + "--disagg-ib-device", + default="", + help="InfiniBand device for RDMA transfer", + ) + return p.parse_args(argv) @@ -250,6 +281,10 @@ def _build_spec(args: argparse.Namespace) -> ProfileJobSpec: log_dir=args.log_dir, job_name=args.job_name, extra_server_opts=args.extra_server_opts, + disagg_transfer_backend=args.disagg_transfer_backend, + disagg_bootstrap_port=args.disagg_bootstrap_port, + 
disagg_prefill_pp=args.disagg_prefill_pp, + disagg_ib_device=args.disagg_ib_device, ) @@ -305,10 +340,18 @@ def main(argv: list[str] | None = None) -> None: spec = _build_spec(args) scheduler = _build_scheduler(args) + is_pd = args.pd + if args.dry_run: - print(scheduler.dry_run(spec)) + if is_pd: + print(scheduler.render_pd_pair(spec)) + else: + print(scheduler.dry_run(spec)) else: - result = scheduler.submit(spec) + if is_pd: + result = scheduler.submit_pd_pair(spec) + else: + result = scheduler.submit(spec) print(result) From 6f1bda283467885a11972cdac46d14dd8439a796 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 04:59:38 +0000 Subject: [PATCH 12/56] chore: add dev-setup.sh/dev-teardown.sh for one-click test clusters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - dev-setup.sh: auto-installs kind/kubectl, creates kind cluster, starts Slurm compose, runs flowsim init — all in one command - dev-teardown.sh: tears down both clusters cleanly - Supports 'kind', 'slurm', or 'all' (default) targets - Verified: kind cluster creation + K8s Job submit + PD pair submit all work --- dockerfiles/dev-setup.sh | 157 ++++++++++++++++++++++++++++++++++++ dockerfiles/dev-teardown.sh | 44 ++++++++++ 2 files changed, 201 insertions(+) create mode 100755 dockerfiles/dev-setup.sh create mode 100755 dockerfiles/dev-teardown.sh diff --git a/dockerfiles/dev-setup.sh b/dockerfiles/dev-setup.sh new file mode 100755 index 0000000..d948bf0 --- /dev/null +++ b/dockerfiles/dev-setup.sh @@ -0,0 +1,157 @@ +#!/usr/bin/env bash +# dev-setup.sh — one-shot setup for FlowSim test clusters (kind + Slurm) +# +# Usage: +# ./dockerfiles/dev-setup.sh # setup both kind + slurm +# ./dockerfiles/dev-setup.sh kind # kind only +# ./dockerfiles/dev-setup.sh slurm # slurm only +# +# Teardown: +# ./dockerfiles/dev-teardown.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +KIND_VERSION="v0.27.0" +KIND_CLUSTER_NAME="flowsim" 
+KUBECTL_STABLE_URL="https://dl.k8s.io/release/stable.txt" +NVIDIA_DEVICE_PLUGIN="https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.0/deployments/static/nvidia-device-plugin.yml" + +log() { printf "\033[1;32m[setup]\033[0m %s\n" "$*"; } +warn() { printf "\033[1;33m[setup]\033[0m %s\n" "$*"; } +err() { printf "\033[1;31m[setup]\033[0m %s\n" "$*" >&2; exit 1; } + +# ---------------------------------------------------------------- +# Dependency checks & auto-install +# ---------------------------------------------------------------- +ensure_docker() { + command -v docker >/dev/null || err "Docker is required but not installed." + docker info >/dev/null 2>&1 || err "Docker daemon not running." + log "Docker: $(docker --version)" +} + +ensure_kind() { + if command -v kind >/dev/null; then + log "kind already installed: $(kind version)" + return + fi + log "Installing kind ${KIND_VERSION}..." + curl -fsSLo /tmp/kind "https://kind.sigs.k8s.io/dl/${KIND_VERSION}/kind-linux-amd64" + chmod +x /tmp/kind + sudo mv /tmp/kind /usr/local/bin/kind + log "kind installed: $(kind version)" +} + +ensure_kubectl() { + if command -v kubectl >/dev/null; then + log "kubectl already installed" + return + fi + log "Installing kubectl..." 
+ local ver + ver="$(curl -fsSL "${KUBECTL_STABLE_URL}")" + curl -fsSLo /tmp/kubectl "https://dl.k8s.io/release/${ver}/bin/linux/amd64/kubectl" + chmod +x /tmp/kubectl + sudo mv /tmp/kubectl /usr/local/bin/kubectl + log "kubectl installed: $(kubectl version --client --short 2>/dev/null || true)" +} + +# ---------------------------------------------------------------- +# Kind cluster +# ---------------------------------------------------------------- +setup_kind() { + ensure_docker + ensure_kind + ensure_kubectl + + if kind get clusters 2>/dev/null | grep -q "^${KIND_CLUSTER_NAME}$"; then + warn "kind cluster '${KIND_CLUSTER_NAME}' already exists, skipping creation" + else + log "Creating kind cluster '${KIND_CLUSTER_NAME}' (1 control-plane + 2 GPU workers)..." + kind create cluster --name "${KIND_CLUSTER_NAME}" \ + --config "${SCRIPT_DIR}/kind-multi-node.yaml" + log "Installing NVIDIA device plugin..." + kubectl apply -f "${NVIDIA_DEVICE_PLUGIN}" + fi + + log "Cluster nodes:" + kubectl get nodes -o wide + echo + + log "Initializing FlowSim K8s config..." + local kubeconfig + kubeconfig="${HOME}/.kube/config" + flowsim init k8s \ + --kubeconfig "${kubeconfig}" \ + --context "kind-${KIND_CLUSTER_NAME}" \ + --namespace default \ + --force + echo + log "Kind cluster ready. Test with:" + log " flowsim submit --scheduler k8s --collect perf --model-path --dry-run" +} + +# ---------------------------------------------------------------- +# Slurm cluster (docker compose) +# ---------------------------------------------------------------- +setup_slurm() { + ensure_docker + + if ! docker compose version >/dev/null 2>&1; then + err "docker compose v2 is required but not available." + fi + + log "Building and starting Slurm cluster (slurmctld + 2 slurmd + slurmrestd)..." + docker compose -f "${SCRIPT_DIR}/slurm-compose.yaml" up -d --build + + log "Waiting for slurmctld to become ready..." + local retries=30 + while ! 
docker exec slurmctld sinfo >/dev/null 2>&1; do + retries=$((retries - 1)) + if [ "${retries}" -le 0 ]; then + err "slurmctld did not become ready in time" + fi + sleep 2 + done + + log "Slurm cluster status:" + docker exec slurmctld sinfo + echo + + log "Initializing FlowSim Slurm config..." + flowsim init slurm \ + --rest-url "http://localhost:6820" \ + --partition normal \ + --account default \ + --jwt-token-cmd "docker exec slurmctld scontrol token lifespan=3600" \ + --force + echo + log "Slurm cluster ready. Test with:" + log " flowsim submit --scheduler slurm --collect perf --model-path --dry-run" +} + +# ---------------------------------------------------------------- +# Main +# ---------------------------------------------------------------- +target="${1:-all}" + +case "${target}" in + kind) + setup_kind + ;; + slurm) + setup_slurm + ;; + all) + setup_kind + echo + setup_slurm + ;; + *) + echo "Usage: $0 [kind|slurm|all]" + exit 1 + ;; +esac + +echo +log "All done. Teardown with: ./dockerfiles/dev-teardown.sh" diff --git a/dockerfiles/dev-teardown.sh b/dockerfiles/dev-teardown.sh new file mode 100755 index 0000000..154b049 --- /dev/null +++ b/dockerfiles/dev-teardown.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# dev-teardown.sh — tear down FlowSim test clusters +# +# Usage: +# ./dockerfiles/dev-teardown.sh # teardown both +# ./dockerfiles/dev-teardown.sh kind # kind only +# ./dockerfiles/dev-teardown.sh slurm # slurm only + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +KIND_CLUSTER_NAME="flowsim" + +log() { printf "\033[1;32m[teardown]\033[0m %s\n" "$*"; } +warn() { printf "\033[1;33m[teardown]\033[0m %s\n" "$*"; } + +teardown_kind() { + if command -v kind >/dev/null && kind get clusters 2>/dev/null | grep -q "^${KIND_CLUSTER_NAME}$"; then + log "Deleting kind cluster '${KIND_CLUSTER_NAME}'..." 
+ kind delete cluster --name "${KIND_CLUSTER_NAME}" + else + warn "kind cluster '${KIND_CLUSTER_NAME}' not found, skipping" + fi +} + +teardown_slurm() { + if docker compose -f "${SCRIPT_DIR}/slurm-compose.yaml" ps --quiet 2>/dev/null | head -1 | grep -q .; then + log "Stopping Slurm containers..." + docker compose -f "${SCRIPT_DIR}/slurm-compose.yaml" down -v + else + warn "Slurm containers not running, skipping" + fi +} + +target="${1:-all}" + +case "${target}" in + kind) teardown_kind ;; + slurm) teardown_slurm ;; + all) teardown_kind; teardown_slurm ;; + *) echo "Usage: $0 [kind|slurm|all]"; exit 1 ;; +esac + +log "Done." From d2bb08e5cfa48ce974c470d86836e0828a18ac59 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 05:05:24 +0000 Subject: [PATCH 13/56] =?UTF-8?q?feat:=20add=20local=20scheduler=20backend?= =?UTF-8?q?=20=E2=80=94=20flowsim=20submit=20--scheduler=20local?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - LocalScheduler runs profiling via subprocess on this machine - --local-gpus to set CUDA_VISIBLE_DEVICES (e.g. 
'0' or '0,1') - --local-workdir for custom working directory - No cluster config needed; replaces manual 'python scripts/run_stage_profile.py' - Supports --pd for local PD disaggregation testing - Skips cluster connection validation for local scheduler --- schedulers/__init__.py | 4 +- schedulers/local.py | 80 +++++++++++++++++++++++++++++++++++++++ scripts/submit_profile.py | 35 +++++++++++++++-- 3 files changed, 114 insertions(+), 5 deletions(-) create mode 100644 schedulers/local.py diff --git a/schedulers/__init__.py b/schedulers/__init__.py index 6e1547b..fd20eb2 100644 --- a/schedulers/__init__.py +++ b/schedulers/__init__.py @@ -1,12 +1,14 @@ -"""Scheduler backends for submitting FlowSim profiling jobs to K8s or Slurm.""" +"""Scheduler backends for submitting FlowSim profiling jobs.""" from schedulers.base import BaseScheduler, ProfileJobSpec from schedulers.k8s import K8sScheduler +from schedulers.local import LocalScheduler from schedulers.slurm import SlurmScheduler __all__ = [ "BaseScheduler", "K8sScheduler", + "LocalScheduler", "ProfileJobSpec", "SlurmScheduler", ] diff --git a/schedulers/local.py b/schedulers/local.py new file mode 100644 index 0000000..c1cb1fe --- /dev/null +++ b/schedulers/local.py @@ -0,0 +1,80 @@ +"""Local scheduler — run profiling directly on this machine. + +``render()`` returns the shell command string. +``submit()`` executes it as a subprocess. +""" + +from __future__ import annotations + +import os +import subprocess +import sys + +from schedulers.base import BaseScheduler, ProfileJobSpec + + +class LocalScheduler(BaseScheduler): + """Run profiling jobs locally via subprocess. + + Parameters + ---------- + gpus : str + ``CUDA_VISIBLE_DEVICES`` value (e.g., ``"0"`` or ``"0,1"``). + Empty string means use all visible GPUs. + workdir : str + Working directory for the subprocess. + Defaults to the FlowSim project root. 
+ """ + + def __init__( + self, + *, + gpus: str = "", + workdir: str = "", + ) -> None: + self.gpus = gpus + self.workdir = workdir or self._find_project_root() + + @staticmethod + def _find_project_root() -> str: + """Walk up from this file to find the FlowSim project root.""" + d = os.path.dirname(os.path.abspath(__file__)) + # schedulers/ is one level below project root + return os.path.dirname(d) + + def render(self, spec: ProfileJobSpec) -> str: + lines = [] + if self.gpus: + lines.append(f"export CUDA_VISIBLE_DEVICES={self.gpus}") + lines.append("export SGLANG_PROFILE_KERNELS=1") + lines.append(f"cd {self.workdir}") + lines.append(spec.build_shell_command()) + return "\n".join(lines) + + def submit(self, spec: ProfileJobSpec) -> str: + """Run the profiling command locally as a subprocess.""" + cmd = spec.build_shell_command() + + env = os.environ.copy() + env["SGLANG_PROFILE_KERNELS"] = "1" + if self.gpus: + env["CUDA_VISIBLE_DEVICES"] = self.gpus + + job_name = spec.default_job_name() + print(f"[local] Running {job_name}...") + print(f"[local] cmd: {cmd}") + print(f"[local] workdir: {self.workdir}") + if self.gpus: + print(f"[local] CUDA_VISIBLE_DEVICES={self.gpus}") + print() + + result = subprocess.run( + cmd, + shell=True, + cwd=self.workdir, + env=env, + ) + + if result.returncode != 0: + return f"[local] {job_name} FAILED (exit code {result.returncode})" + return f"[local] {job_name} completed successfully" diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 3701892..18c68aa 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -1,9 +1,17 @@ #!/usr/bin/env python3 -"""Submit FlowSim profiling jobs to Kubernetes or Slurm. +"""Submit FlowSim profiling jobs locally, to Kubernetes, or to Slurm. 
Usage examples -------------- +Run locally (no cluster needed): + + flowsim submit \\ + --scheduler local \\ + --collect perf \\ + --model-path Qwen/Qwen3-8B \\ + --tp 1 --local-gpus 0 + Dry-run (print Kubernetes Job YAML to stdout): python scripts/submit_profile.py \\ @@ -45,6 +53,7 @@ from schedulers.base import ProfileJobSpec from schedulers.config import cfg_get, load_k8s_config, load_slurm_config, resolve_jwt_token from schedulers.k8s import K8sScheduler +from schedulers.local import LocalScheduler from schedulers.slurm import SlurmScheduler @@ -67,7 +76,7 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: # -- Scheduler choice -- p.add_argument( "--scheduler", - choices=["k8s", "slurm"], + choices=["local", "k8s", "slurm"], required=True, help="Scheduler backend.", ) @@ -111,6 +120,19 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: ) infra.add_argument("--job-name", default="") + # -- Local options -- + loc = p.add_argument_group("local options") + loc.add_argument( + "--local-gpus", + default="", + help="CUDA_VISIBLE_DEVICES for local execution (e.g. 
'0' or '0,1')", + ) + loc.add_argument( + "--local-workdir", + default="", + help="Working directory for local execution (default: FlowSim project root)", + ) + # -- Kubernetes-specific -- k8s = p.add_argument_group("kubernetes options (config: ~/.flowsim/k8s.yaml)") k8s.add_argument( @@ -289,7 +311,12 @@ def _build_spec(args: argparse.Namespace) -> ProfileJobSpec: def _build_scheduler(args: argparse.Namespace): - if args.scheduler == "k8s": + if args.scheduler == "local": + return LocalScheduler( + gpus=args.local_gpus, + workdir=args.local_workdir, + ) + elif args.scheduler == "k8s": node_sel = {} for item in args.k8s_node_selector: k, _, v = item.partition("=") @@ -334,7 +361,7 @@ def main(argv: list[str] | None = None) -> None: args.slurm_jwt_token = token # Validate required connection params before submit - if not args.dry_run: + if not args.dry_run and args.scheduler not in ("local",): _validate_connection(args) spec = _build_spec(args) From e5e303cf8f688a36f8c89059280a086f67e20330 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 05:12:19 +0000 Subject: [PATCH 14/56] test: add 61 unit tests for scheduler CLI, backends, and config Tests cover: - ProfileJobSpec: job name, server opts, disagg params, as_prefill/decode - K8sScheduler.render: YAML validity, namespace, GPU resources, PVC, hostPath, nodeSelector, serviceAccount, labels, PD pair - SlurmScheduler.render: shebang, sbatch directives, docker/enroot/bare, modules, extra sbatch, constraint, time parsing - LocalScheduler.render: GPU selection, workdir, env vars - CLI init: help, required args, bad kubeconfig, save/load config, overwrite protection, --force - CLI submit: help, dry-run for local/k8s/slurm, PD pair, nixl backend - Config: save/load yaml, jwt_token static/cmd/bad_cmd, cfg_get All tests run inside the FlowSim Docker container. 
--- tests/unit/test_scheduler_cli.py | 580 +++++++++++++++++++++++++++++++ 1 file changed, 580 insertions(+) create mode 100644 tests/unit/test_scheduler_cli.py diff --git a/tests/unit/test_scheduler_cli.py b/tests/unit/test_scheduler_cli.py new file mode 100644 index 0000000..055e117 --- /dev/null +++ b/tests/unit/test_scheduler_cli.py @@ -0,0 +1,580 @@ +"""Unit tests for the scheduler CLI (flowsim init / submit) and backends.""" + +from __future__ import annotations + +import os +import tempfile +from pathlib import Path +from unittest import mock + +import pytest +import yaml + +from schedulers.base import ProfileJobSpec +from schedulers.k8s import K8sScheduler +from schedulers.local import LocalScheduler +from schedulers.slurm import SlurmScheduler + + +# ========================================================================= +# ProfileJobSpec +# ========================================================================= + +class TestProfileJobSpec: + """Tests for ProfileJobSpec dataclass methods.""" + + @pytest.fixture() + def spec(self) -> ProfileJobSpec: + return ProfileJobSpec( + collect="perf", + model_path="Qwen/Qwen3-8B", + tp=2, + bs=4, + input_len=1024, + ) + + def test_default_job_name(self, spec: ProfileJobSpec): + name = spec.default_job_name() + assert name == "flowsim-perf-qwen3-8b-bs4-il1024" + + def test_custom_job_name(self, spec: ProfileJobSpec): + spec.job_name = "my-job" + assert spec.default_job_name() == "my-job" + + def test_job_name_disagg_suffix(self, spec: ProfileJobSpec): + spec.disagg_mode = "prefill" + assert spec.default_job_name().endswith("-prefill") + + def test_build_server_opts_basic(self, spec: ProfileJobSpec): + opts = spec.build_server_opts() + assert "--model-path Qwen/Qwen3-8B" in opts + assert "--tp 2" in opts + assert "--disaggregation" not in opts + + def test_build_server_opts_dp(self, spec: ProfileJobSpec): + spec.dp = 4 + assert "--dp 4" in spec.build_server_opts() + + def test_build_server_opts_disagg(self, spec: 
ProfileJobSpec): + spec.disagg_mode = "prefill" + spec.disagg_transfer_backend = "nixl" + opts = spec.build_server_opts() + assert "--disaggregation-mode prefill" in opts + assert "--disaggregation-transfer-backend nixl" in opts + assert "--disaggregation-bootstrap-port 8998" in opts + + def test_build_server_opts_disagg_pp(self, spec: ProfileJobSpec): + spec.disagg_mode = "prefill" + spec.disagg_prefill_pp = 2 + assert "--disaggregation-prefill-pp 2" in spec.build_server_opts() + + def test_build_server_opts_extra(self, spec: ProfileJobSpec): + spec.extra_server_opts = "--some-flag" + assert "--some-flag" in spec.build_server_opts() + + def test_build_profile_command(self, spec: ProfileJobSpec): + cmd = spec.build_profile_command() + assert cmd[0] == "python" + assert "scripts/run_stage_profile.py" in cmd[1] + assert "--collect" in cmd + assert "perf" in cmd + assert "--bs" in cmd + assert "4" in cmd + + def test_build_shell_command_quotes_server_opts(self, spec: ProfileJobSpec): + shell = spec.build_shell_command() + # server-opts contains spaces, must be quoted + assert "--server-opts '" in shell or '--server-opts "' in shell + + def test_as_prefill(self, spec: ProfileJobSpec): + p = spec.as_prefill() + assert p.disagg_mode == "prefill" + assert spec.disagg_mode == "" # original unchanged + + def test_as_decode(self, spec: ProfileJobSpec): + d = spec.as_decode() + assert d.disagg_mode == "decode" + assert spec.disagg_mode == "" + + +# ========================================================================= +# K8sScheduler.render +# ========================================================================= + +class TestK8sScheduler: + """Tests for K8s Job manifest generation.""" + + @pytest.fixture() + def scheduler(self) -> K8sScheduler: + return K8sScheduler( + namespace="ml-team", + kubeconfig="/fake/kubeconfig", + context="prod", + shm_size="32Gi", + ) + + @pytest.fixture() + def spec(self) -> ProfileJobSpec: + return ProfileJobSpec( + collect="perf", + 
model_path="Qwen/Qwen3-8B", + gpus=2, + ) + + def test_render_valid_yaml(self, scheduler, spec): + rendered = scheduler.render(spec) + doc = yaml.safe_load(rendered) + assert doc["apiVersion"] == "batch/v1" + assert doc["kind"] == "Job" + + def test_render_namespace(self, scheduler, spec): + doc = yaml.safe_load(scheduler.render(spec)) + assert doc["metadata"]["namespace"] == "ml-team" + + def test_render_gpu_resources(self, scheduler, spec): + doc = yaml.safe_load(scheduler.render(spec)) + container = doc["spec"]["template"]["spec"]["containers"][0] + assert container["resources"]["limits"]["nvidia.com/gpu"] == "2" + + def test_render_shm_size(self, scheduler, spec): + doc = yaml.safe_load(scheduler.render(spec)) + volumes = doc["spec"]["template"]["spec"]["volumes"] + dshm = [v for v in volumes if v["name"] == "dshm"][0] + assert dshm["emptyDir"]["sizeLimit"] == "32Gi" + + def test_render_pvc_volume(self, spec): + sched = K8sScheduler(namespace="default", pvc_name="my-pvc") + doc = yaml.safe_load(sched.render(spec)) + volumes = doc["spec"]["template"]["spec"]["volumes"] + pvc_vol = [v for v in volumes if v["name"] == "output"] + assert len(pvc_vol) == 1 + assert pvc_vol[0]["persistentVolumeClaim"]["claimName"] == "my-pvc" + + def test_render_host_output_dir(self, spec): + sched = K8sScheduler(namespace="default", host_output_dir="/data/out") + doc = yaml.safe_load(sched.render(spec)) + volumes = doc["spec"]["template"]["spec"]["volumes"] + host_vol = [v for v in volumes if v["name"] == "output"] + assert len(host_vol) == 1 + assert host_vol[0]["hostPath"]["path"] == "/data/out" + + def test_render_node_selector(self, spec): + sched = K8sScheduler(namespace="default", node_selector={"gpu": "h100"}) + doc = yaml.safe_load(sched.render(spec)) + pod_spec = doc["spec"]["template"]["spec"] + assert pod_spec["nodeSelector"]["gpu"] == "h100" + + def test_render_service_account(self, spec): + sched = K8sScheduler(namespace="default", service_account="runner") + doc = 
yaml.safe_load(sched.render(spec)) + pod_spec = doc["spec"]["template"]["spec"] + assert pod_spec["serviceAccountName"] == "runner" + + def test_render_labels(self, scheduler, spec): + doc = yaml.safe_load(scheduler.render(spec)) + labels = doc["metadata"]["labels"] + assert labels["app"] == "flowsim" + assert labels["collect"] == "perf" + + def test_render_pd_pair(self, scheduler, spec): + output = scheduler.render_pd_pair(spec) + assert "PREFILL INSTANCE" in output + assert "DECODE INSTANCE" in output + # Both should be valid YAML docs + docs = output.split("# === DECODE INSTANCE ===") + assert len(docs) == 2 + + +# ========================================================================= +# SlurmScheduler.render +# ========================================================================= + +class TestSlurmScheduler: + """Tests for Slurm sbatch script generation.""" + + @pytest.fixture() + def scheduler(self) -> SlurmScheduler: + return SlurmScheduler( + partition="gpu-h100", + time_limit="01:00:00", + account="my-proj", + ) + + @pytest.fixture() + def spec(self) -> ProfileJobSpec: + return ProfileJobSpec( + collect="perf", + model_path="Qwen/Qwen3-8B", + gpus=4, + ) + + def test_render_shebang(self, scheduler, spec): + script = scheduler.render(spec) + assert script.startswith("#!/bin/bash\n") + + def test_render_sbatch_directives(self, scheduler, spec): + script = scheduler.render(spec) + assert "#SBATCH --partition=gpu-h100" in script + assert "#SBATCH --gpus-per-node=4" in script + assert "#SBATCH --time=01:00:00" in script + assert "#SBATCH --account=my-proj" in script + + def test_render_env_vars(self, scheduler, spec): + script = scheduler.render(spec) + assert "SGLANG_PROFILE_KERNELS=1" in script + + def test_render_command(self, scheduler, spec): + script = scheduler.render(spec) + assert "scripts/run_stage_profile.py" in script + assert "--collect perf" in script + + def test_render_docker_runtime(self, spec): + sched = SlurmScheduler( + 
partition="gpu", + container_runtime="docker", + container_mounts="/data:/data", + ) + script = sched.render(spec) + assert "docker run" in script + assert "-v /data:/data" in script + + def test_render_enroot_runtime(self, spec): + sched = SlurmScheduler( + partition="gpu", + container_runtime="enroot", + ) + script = sched.render(spec) + assert "srun --container-image" in script + + def test_render_modules(self, spec): + sched = SlurmScheduler( + partition="gpu", + modules=["cuda/12.6", "anaconda3"], + ) + script = sched.render(spec) + assert "module load cuda/12.6" in script + assert "module load anaconda3" in script + + def test_render_extra_sbatch(self, spec): + sched = SlurmScheduler( + partition="gpu", + extra_sbatch=["--mem=64G", "--exclusive"], + ) + script = sched.render(spec) + assert "#SBATCH --mem=64G" in script + assert "#SBATCH --exclusive" in script + + def test_render_constraint(self, spec): + sched = SlurmScheduler(partition="gpu", constraint="gpu80g") + script = sched.render(spec) + assert "#SBATCH --constraint=gpu80g" in script + + def test_time_parse_minutes(self): + sched = SlurmScheduler(partition="gpu", time_limit="02:30:00") + assert sched._parse_time_minutes() == 150 + + +# ========================================================================= +# LocalScheduler.render +# ========================================================================= + +class TestLocalScheduler: + """Tests for local execution backend.""" + + @pytest.fixture() + def spec(self) -> ProfileJobSpec: + return ProfileJobSpec( + collect="perf", + model_path="Qwen/Qwen3-8B", + ) + + def test_render_with_gpus(self, spec): + sched = LocalScheduler(gpus="0,1") + output = sched.render(spec) + assert "CUDA_VISIBLE_DEVICES=0,1" in output + + def test_render_without_gpus(self, spec): + sched = LocalScheduler(gpus="") + output = sched.render(spec) + assert "CUDA_VISIBLE_DEVICES" not in output + + def test_render_has_command(self, spec): + sched = LocalScheduler() + output = 
sched.render(spec) + assert "scripts/run_stage_profile.py" in output + assert "SGLANG_PROFILE_KERNELS=1" in output + + def test_render_workdir(self, spec): + sched = LocalScheduler(workdir="/my/project") + output = sched.render(spec) + assert "cd /my/project" in output + + def test_dry_run_equals_render(self, spec): + sched = LocalScheduler(gpus="0") + assert sched.dry_run(spec) == sched.render(spec) + + +# ========================================================================= +# CLI: flowsim init +# ========================================================================= + +class TestCLIInit: + """Tests for `flowsim init` subcommand.""" + + def test_init_no_args_shows_help(self, capsys): + from scripts.cli import _cmd_init + with pytest.raises(SystemExit) as exc_info: + _cmd_init([]) + assert exc_info.value.code != 0 + + def test_init_k8s_help(self, capsys): + from scripts.cli import _cmd_init + with pytest.raises(SystemExit) as exc_info: + _cmd_init(["k8s", "--help"]) + assert exc_info.value.code == 0 + out = capsys.readouterr().out + assert "--kubeconfig" in out + assert "--namespace" in out + + def test_init_slurm_help(self, capsys): + from scripts.cli import _cmd_init + with pytest.raises(SystemExit) as exc_info: + _cmd_init(["slurm", "--help"]) + assert exc_info.value.code == 0 + out = capsys.readouterr().out + assert "--rest-url" in out + assert "--partition" in out + + def test_init_k8s_missing_required(self): + from scripts.cli import _cmd_init + with pytest.raises(SystemExit) as exc_info: + _cmd_init(["k8s"]) + assert exc_info.value.code != 0 + + def test_init_slurm_missing_required(self): + from scripts.cli import _cmd_init + with pytest.raises(SystemExit) as exc_info: + _cmd_init(["slurm"]) + assert exc_info.value.code != 0 + + def test_init_k8s_bad_kubeconfig(self): + from scripts.cli import _cmd_init + rc = _cmd_init(["k8s", "--kubeconfig", "/nonexistent/path", "--namespace", "ns"]) + assert rc != 0 + + def test_init_k8s_saves_config(self, 
tmp_path: Path): + # Create a fake kubeconfig + kube = tmp_path / "kubeconfig" + kube.write_text("apiVersion: v1\nclusters: []\n") + + config_dir = tmp_path / "flowsim" + with mock.patch("scripts.cli._CONFIG_DIR", config_dir): + from scripts.cli import _cmd_init + rc = _cmd_init([ + "k8s", + "--kubeconfig", str(kube), + "--namespace", "test-ns", + ]) + assert rc == 0 + cfg_file = config_dir / "k8s.yaml" + assert cfg_file.exists() + cfg = yaml.safe_load(cfg_file.read_text()) + assert cfg["namespace"] == "test-ns" + assert cfg["kubeconfig"] == str(kube) + + def test_init_slurm_saves_config(self, tmp_path: Path): + config_dir = tmp_path / "flowsim" + with mock.patch("scripts.cli._CONFIG_DIR", config_dir): + from scripts.cli import _cmd_init + rc = _cmd_init([ + "slurm", + "--rest-url", "http://localhost:6820", + "--partition", "gpu", + "--account", "proj", + "--jwt-token", "fake-token", + ]) + assert rc == 0 + cfg_file = config_dir / "slurm.yaml" + assert cfg_file.exists() + cfg = yaml.safe_load(cfg_file.read_text()) + assert cfg["rest_url"] == "http://localhost:6820" + assert cfg["partition"] == "gpu" + assert cfg["account"] == "proj" + + def test_init_refuses_overwrite(self, tmp_path: Path): + config_dir = tmp_path / "flowsim" + config_dir.mkdir() + (config_dir / "slurm.yaml").write_text("existing: true\n") + + with mock.patch("scripts.cli._CONFIG_DIR", config_dir): + from scripts.cli import _cmd_init + rc = _cmd_init([ + "slurm", + "--rest-url", "http://localhost:6820", + "--partition", "gpu", + "--account", "proj", + "--jwt-token", "tok", + ]) + assert rc != 0 # should refuse + + def test_init_force_overwrite(self, tmp_path: Path): + config_dir = tmp_path / "flowsim" + config_dir.mkdir() + (config_dir / "slurm.yaml").write_text("existing: true\n") + + with mock.patch("scripts.cli._CONFIG_DIR", config_dir): + from scripts.cli import _cmd_init + rc = _cmd_init([ + "slurm", + "--rest-url", "http://localhost:6820", + "--partition", "gpu", + "--account", "proj", + 
"--jwt-token", "tok", + "--force", + ]) + assert rc == 0 + cfg = yaml.safe_load((config_dir / "slurm.yaml").read_text()) + assert cfg["rest_url"] == "http://localhost:6820" + + +# ========================================================================= +# CLI: flowsim submit (parse/dry-run only, no actual submission) +# ========================================================================= + +class TestCLISubmit: + """Tests for `flowsim submit` argument parsing and dry-run.""" + + def _run(self, *args: str, expect_ok: bool = True) -> str: + """Run submit via the Python function, capture stdout.""" + from scripts.submit_profile import main as submit_main + import io + from contextlib import redirect_stdout + buf = io.StringIO() + with redirect_stdout(buf): + submit_main(list(args)) + return buf.getvalue() + + def test_submit_help(self, capsys): + from scripts.submit_profile import main as submit_main + with pytest.raises(SystemExit) as exc_info: + submit_main(["--help"]) + assert exc_info.value.code == 0 + out = capsys.readouterr().out + assert "--scheduler" in out + assert "local" in out + + def test_submit_missing_required(self): + from scripts.submit_profile import main as submit_main + with pytest.raises(SystemExit): + submit_main([]) + + def test_submit_local_dry_run(self): + out = self._run( + "--scheduler", "local", + "--collect", "perf", + "--model-path", "Qwen/Qwen3-8B", + "--dry-run", + ) + assert "scripts/run_stage_profile.py" in out + assert "SGLANG_PROFILE_KERNELS=1" in out + + def test_submit_local_dry_run_with_gpus(self): + out = self._run( + "--scheduler", "local", + "--collect", "perf", + "--model-path", "Qwen/Qwen3-8B", + "--local-gpus", "0,1", + "--dry-run", + ) + assert "CUDA_VISIBLE_DEVICES=0,1" in out + + def test_submit_k8s_dry_run(self): + out = self._run( + "--scheduler", "k8s", + "--collect", "perf", + "--model-path", "Qwen/Qwen3-8B", + "--k8s-namespace", "default", + "--dry-run", + ) + assert "apiVersion: batch/v1" in out + assert 
"kind: Job" in out + + def test_submit_slurm_dry_run(self): + out = self._run( + "--scheduler", "slurm", + "--collect", "perf", + "--model-path", "Qwen/Qwen3-8B", + "--slurm-partition", "gpu", + "--slurm-rest-url", "http://fake:6820", + "--slurm-jwt-token", "fake-token", + "--dry-run", + ) + assert "#!/bin/bash" in out + assert "#SBATCH --partition=gpu" in out + + def test_submit_pd_dry_run(self): + out = self._run( + "--scheduler", "local", + "--collect", "perf", + "--model-path", "Qwen/Qwen3-8B", + "--pd", + "--dry-run", + ) + assert "PREFILL INSTANCE" in out + assert "DECODE INSTANCE" in out + assert "--disaggregation-mode prefill" in out + assert "--disaggregation-mode decode" in out + + def test_submit_pd_nixl_backend(self): + out = self._run( + "--scheduler", "local", + "--collect", "perf", + "--model-path", "Qwen/Qwen3-8B", + "--pd", + "--disagg-transfer-backend", "nixl", + "--dry-run", + ) + assert "--disaggregation-transfer-backend nixl" in out + + +# ========================================================================= +# Config loading +# ========================================================================= + +class TestConfig: + """Tests for config file loading and saving.""" + + def test_save_and_load_yaml(self, tmp_path: Path): + from schedulers.config import _save_yaml, _load_yaml + data = {"rest_url": "http://localhost:6820", "partition": "gpu"} + path = tmp_path / "test.yaml" + _save_yaml(path, data) + loaded = _load_yaml(path) + assert loaded == data + + def test_resolve_jwt_token_static(self): + from schedulers.config import resolve_jwt_token + cfg = {"jwt_token": "my-secret"} + assert resolve_jwt_token(cfg) == "my-secret" + + def test_resolve_jwt_token_cmd(self): + from schedulers.config import resolve_jwt_token + cfg = {"jwt_token_cmd": "echo test-token-123"} + assert resolve_jwt_token(cfg) == "test-token-123" + + def test_resolve_jwt_token_bad_cmd(self): + from schedulers.config import resolve_jwt_token + cfg = {"jwt_token_cmd": 
"/nonexistent/binary"} + # Should not raise, just return empty + assert resolve_jwt_token(cfg) == "" + + def test_resolve_jwt_token_empty(self): + from schedulers.config import resolve_jwt_token + assert resolve_jwt_token({}) == "" + + def test_cfg_get(self): + from schedulers.config import cfg_get + cfg = {"key": "value", "empty": ""} + assert cfg_get(cfg, "key", "default") == "value" + assert cfg_get(cfg, "empty", "default") == "" + assert cfg_get(cfg, "missing", "default") == "default" From c60cd1180f11f412d27f81be94ddced356662369 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 05:39:02 +0000 Subject: [PATCH 15/56] feat: persistent logs under output_dir, flowsim status/logs, refuse K8s submit without PVC - log_dir is now derived as {output_dir}/logs/ (single volume covers both) - LocalScheduler.submit() tees stdout/stderr to log files in real time - K8s submit refuses if no --k8s-pvc or --k8s-host-output-dir (prevents data loss) - Slurm output_dir defaults to ~/flowsim_traces (shared filesystem) - Local output_dir defaults to {project}/stage_traces/ - Add flowsim status/logs subcommands (K8s via API, Slurm via slurmrestd, local via log files) - Submit prints result location + follow-up commands after every job - Add integration tests for local scheduler --- .gitignore | 3 +- schedulers/base.py | 31 ++- schedulers/k8s.py | 103 ++++++++++ schedulers/local.py | 117 ++++++++++- schedulers/slurm.py | 81 ++++++++ scripts/cli.py | 22 +++ scripts/run_stage_profile.py | 8 +- scripts/status_profile.py | 157 +++++++++++++++ scripts/submit_profile.py | 48 ++++- tests/integration/test_scheduler_local.py | 229 ++++++++++++++++++++++ 10 files changed, 780 insertions(+), 19 deletions(-) create mode 100644 scripts/status_profile.py create mode 100644 tests/integration/test_scheduler_local.py diff --git a/.gitignore b/.gitignore index 706276b..b70854b 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ tests/test-artifacts/ unknown_kernels.json 
/artifacts /server_profile -/server_simulate \ No newline at end of file +/server_simulate +/stage_traces/ \ No newline at end of file diff --git a/schedulers/base.py b/schedulers/base.py index 1427e8e..3cbc2e7 100644 --- a/schedulers/base.py +++ b/schedulers/base.py @@ -35,7 +35,6 @@ class ProfileJobSpec: host: str = "0.0.0.0" port: int = 30001 output_dir: str = "/flowsim/stage_traces" - log_dir: str = "/flowsim/tests/test-artifacts" job_name: str = "" # -- PD disaggregation -- @@ -70,6 +69,11 @@ def build_server_opts(self) -> str: parts.append(self.extra_server_opts) return " ".join(parts) + @property + def log_dir(self) -> str: + """Server logs go under ``{output_dir}/logs/``.""" + return self.output_dir + "/logs" + def build_profile_command(self) -> list[str]: """Build the full ``python scripts/run_stage_profile.py ...`` command.""" cmd = [ @@ -152,6 +156,31 @@ def render(self, spec: ProfileJobSpec) -> str: def submit(self, spec: ProfileJobSpec) -> str: """Submit the job and return a job identifier string.""" + def status(self, job_id: str) -> dict: + """Query job status. Returns dict with at least 'state' key. + + Subclasses should return:: + + { + "state": "Pending" | "Running" | "Succeeded" | "Failed" | ..., + "message": "human-readable detail", + "output_hint": "where to find trace files", + } + """ + raise NotImplementedError(f"{type(self).__name__} does not support status queries") + + def logs(self, job_id: str, *, tail: int = 100) -> str: + """Retrieve recent log output for a job. + + Parameters + ---------- + job_id : str + Job name (K8s) or job ID (Slurm) or log prefix (local). + tail : int + Number of lines from the end to return. 
+ """ + raise NotImplementedError(f"{type(self).__name__} does not support log retrieval") + def dry_run(self, spec: ProfileJobSpec) -> str: """Render and return the manifest without submitting.""" return self.render(spec) diff --git a/schedulers/k8s.py b/schedulers/k8s.py index 6b58ea9..1640f5c 100644 --- a/schedulers/k8s.py +++ b/schedulers/k8s.py @@ -176,3 +176,106 @@ def submit(self, spec: ProfileJobSpec) -> str: body=body, ) return f"job.batch/{resp.metadata.name} created (namespace={resp.metadata.namespace})" + + # ----------------------------------------------------------------- + # Helpers shared by status / logs + # ----------------------------------------------------------------- + + def _load_k8s(self): + """Load kubeconfig and return (BatchV1Api, CoreV1Api).""" + from kubernetes import client as k8s_client, config as k8s_config + + config_kwargs: dict = {} + if self.kubeconfig: + config_kwargs["config_file"] = self.kubeconfig + if self.context: + config_kwargs["context"] = self.context + try: + k8s_config.load_kube_config(**config_kwargs) + except k8s_config.ConfigException: + k8s_config.load_incluster_config() + + return k8s_client.BatchV1Api(), k8s_client.CoreV1Api() + + def status(self, job_id: str) -> dict: + """Query K8s Job status by job name.""" + try: + from kubernetes import client as k8s_client + except ImportError: + raise RuntimeError("pip install kubernetes") + + batch_api, core_api = self._load_k8s() + + job = batch_api.read_namespaced_job(name=job_id, namespace=self.namespace) + st = job.status + + # Determine state + if st.succeeded and st.succeeded > 0: + state = "Succeeded" + elif st.failed and st.failed > 0: + state = "Failed" + elif st.active and st.active > 0: + state = "Running" + else: + state = "Pending" + + # Pod info + pods = core_api.list_namespaced_pod( + namespace=self.namespace, + label_selector=f"job-name={job_id}", + ) + pod_statuses = [] + for pod in pods.items: + phase = pod.status.phase + node = pod.spec.node_name or 
"unscheduled" + pod_statuses.append(f"{pod.metadata.name} ({phase}, node={node})") + + output_hint = "" + if self.pvc_name: + output_hint = f"Traces persisted on PVC '{self.pvc_name}'" + elif self.host_output_dir: + output_hint = f"Traces at hostPath {self.host_output_dir} on the scheduled node" + else: + output_hint = "WARNING: no PVC or hostPath configured — traces are lost when pod exits" + + msg_parts = [f"Job: {job_id} Namespace: {self.namespace} State: {state}"] + if pod_statuses: + msg_parts.append("Pods: " + ", ".join(pod_statuses)) + msg_parts.append(output_hint) + + return { + "state": state, + "message": "\n".join(msg_parts), + "output_hint": output_hint, + } + + def logs(self, job_id: str, *, tail: int = 100) -> str: + """Retrieve logs from the pod(s) of a K8s Job.""" + try: + from kubernetes import client as k8s_client + except ImportError: + raise RuntimeError("pip install kubernetes") + + _, core_api = self._load_k8s() + + pods = core_api.list_namespaced_pod( + namespace=self.namespace, + label_selector=f"job-name={job_id}", + ) + if not pods.items: + return f"No pods found for job {job_id} in namespace {self.namespace}" + + parts = [] + for pod in pods.items: + name = pod.metadata.name + try: + log_text = core_api.read_namespaced_pod_log( + name=name, + namespace=self.namespace, + tail_lines=tail, + ) + except Exception as exc: + log_text = f"(error reading logs: {exc})" + parts.append(f"=== {name} ===\n{log_text}") + + return "\n".join(parts) diff --git a/schedulers/local.py b/schedulers/local.py index c1cb1fe..da3b03a 100644 --- a/schedulers/local.py +++ b/schedulers/local.py @@ -1,7 +1,7 @@ """Local scheduler — run profiling directly on this machine. ``render()`` returns the shell command string. -``submit()`` executes it as a subprocess. +``submit()`` executes it as a subprocess, with stdout/stderr tee'd to log files. 
""" from __future__ import annotations @@ -9,6 +9,7 @@ import os import subprocess import sys +import time from schedulers.base import BaseScheduler, ProfileJobSpec @@ -52,7 +53,11 @@ def render(self, spec: ProfileJobSpec) -> str: return "\n".join(lines) def submit(self, spec: ProfileJobSpec) -> str: - """Run the profiling command locally as a subprocess.""" + """Run the profiling command locally as a subprocess. + + stdout and stderr are streamed to the terminal *and* saved to + log files under ``spec.log_dir``. + """ cmd = spec.build_shell_command() env = os.environ.copy() @@ -61,20 +66,112 @@ def submit(self, spec: ProfileJobSpec) -> str: env["CUDA_VISIBLE_DEVICES"] = self.gpus job_name = spec.default_job_name() + log_dir = spec.log_dir + os.makedirs(log_dir, exist_ok=True) + ts = int(time.time()) + stdout_path = os.path.join(log_dir, f"{job_name}_{ts}.stdout.log") + stderr_path = os.path.join(log_dir, f"{job_name}_{ts}.stderr.log") + print(f"[local] Running {job_name}...") print(f"[local] cmd: {cmd}") print(f"[local] workdir: {self.workdir}") if self.gpus: print(f"[local] CUDA_VISIBLE_DEVICES={self.gpus}") + print(f"[local] logs: {stdout_path}") + print(f"[local] {stderr_path}") print() - result = subprocess.run( - cmd, - shell=True, - cwd=self.workdir, - env=env, + with open(stdout_path, "w") as fout, open(stderr_path, "w") as ferr: + proc = subprocess.Popen( + cmd, + shell=True, + cwd=self.workdir, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + # Stream stdout/stderr to terminal + log files in real time. + # Use threads to avoid blocking on one stream while the other + # fills its OS pipe buffer. 
+ import threading + + def _tee(src, dest_file, dest_stream): + for line in src: + dest_stream.buffer.write(line) + dest_stream.buffer.flush() + dest_file.write(line.decode("utf-8", errors="replace")) + dest_file.flush() + + t_out = threading.Thread( + target=_tee, args=(proc.stdout, fout, sys.stdout), daemon=True, + ) + t_err = threading.Thread( + target=_tee, args=(proc.stderr, ferr, sys.stderr), daemon=True, + ) + t_out.start() + t_err.start() + proc.wait() + t_out.join() + t_err.join() + + if proc.returncode != 0: + return ( + f"[local] {job_name} FAILED (exit code {proc.returncode})\n" + f"[local] stdout log: {stdout_path}\n" + f"[local] stderr log: {stderr_path}" + ) + return ( + f"[local] {job_name} completed successfully\n" + f"[local] stdout log: {stdout_path}\n" + f"[local] stderr log: {stderr_path}" ) - if result.returncode != 0: - return f"[local] {job_name} FAILED (exit code {result.returncode})" - return f"[local] {job_name} completed successfully" + def status(self, job_id: str) -> dict: + """Check local job status by looking for log files. + + ``job_id`` is the job name prefix used in log filenames. 
+ """ + import glob + + log_dir = os.path.join(self.workdir, "stage_traces", "logs") + pattern = os.path.join(log_dir, f"{job_id}_*.stdout.log") + matches = sorted(glob.glob(pattern)) + + if not matches: + return { + "state": "NotFound", + "message": f"No logs found matching {pattern}", + "output_hint": "", + } + + latest = matches[-1] + stderr_log = latest.replace(".stdout.log", ".stderr.log") + trace_dir = os.path.join(self.workdir, "stage_traces") + + return { + "state": "Completed", + "message": ( + f"Latest log: {latest}\n" + f"Stderr log: {stderr_log}\n" + f"Traces dir: {trace_dir}" + ), + "output_hint": trace_dir, + } + + def logs(self, job_id: str, *, tail: int = 100) -> str: + """Read the last *tail* lines from the most recent local log file.""" + import glob + + log_dir = os.path.join(self.workdir, "stage_traces", "logs") + pattern = os.path.join(log_dir, f"{job_id}_*.stdout.log") + matches = sorted(glob.glob(pattern)) + + if not matches: + return f"No logs found matching {pattern}" + + latest = matches[-1] + with open(latest) as f: + all_lines = f.readlines() + + header = f"=== {latest} (last {tail} lines) ===\n" + return header + "".join(all_lines[-tail:]) diff --git a/schedulers/slurm.py b/schedulers/slurm.py index 9261a15..9ec84a6 100644 --- a/schedulers/slurm.py +++ b/schedulers/slurm.py @@ -225,6 +225,87 @@ def submit(self, spec: ProfileJobSpec) -> str: job_id = body.get("job_id", "unknown") return f"Submitted batch job {job_id}" + def _rest_get(self, path: str) -> dict: + """GET a slurmrestd endpoint and return parsed JSON.""" + if not self.rest_url: + raise RuntimeError("--slurm-rest-url is required") + if not self.jwt_token: + raise RuntimeError("--slurm-jwt-token is required") + + url = f"{self.rest_url}{path}" + headers = { + "X-SLURM-USER-TOKEN": self.jwt_token, + } + req = urllib.request.Request(url, headers=headers, method="GET") + + ctx: ssl.SSLContext | None = None + if not self.verify_ssl: + ctx = ssl.create_default_context() + 
ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + + try: + with urllib.request.urlopen(req, context=ctx) as resp: + return json.loads(resp.read()) + except urllib.error.HTTPError as exc: + detail = exc.read().decode(errors="replace") + raise RuntimeError(f"slurmrestd returned HTTP {exc.code}:\n{detail}") from exc + except urllib.error.URLError as exc: + raise RuntimeError(f"Cannot reach slurmrestd at {self.rest_url}: {exc.reason}") from exc + + def status(self, job_id: str) -> dict: + """Query Slurm job status via slurmrestd.""" + body = self._rest_get(f"/slurm/{self.api_version}/job/{job_id}") + + errors = body.get("errors") or [] + if errors: + msgs = "; ".join(e.get("error", str(e)) for e in errors) + raise RuntimeError(f"slurmrestd error: {msgs}") + + jobs = body.get("jobs", []) + if not jobs: + return {"state": "Unknown", "message": f"No job found with ID {job_id}", "output_hint": ""} + + job = jobs[0] + state = job.get("job_state", ["UNKNOWN"]) + if isinstance(state, list): + state = state[0] if state else "UNKNOWN" + name = job.get("name", "") + node_list = job.get("nodes", "") + output_file = job.get("standard_output", "") + work_dir = job.get("current_working_directory", "") + + msg_parts = [ + f"Job ID: {job_id} Name: {name} State: {state}", + f"Nodes: {node_list}" if node_list else "Nodes: (not yet assigned)", + ] + if output_file: + msg_parts.append(f"Output log: {output_file}") + if work_dir: + msg_parts.append(f"Working dir: {work_dir}") + + return { + "state": state, + "message": "\n".join(msg_parts), + "output_hint": output_file, + } + + def logs(self, job_id: str, *, tail: int = 100) -> str: + """Retrieve log output for a Slurm job. + + Tries to read the sbatch output file via slurmrestd. + Falls back to showing job info if direct log access isn't available. 
+ """ + info = self.status(job_id) + output_file = info.get("output_hint", "") + lines = [info["message"], ""] + + if output_file: + lines.append(f"To view full logs on the cluster:") + lines.append(f" tail -{tail} {output_file}") + + return "\n".join(lines) + def _parse_time_minutes(self) -> int: """Convert HH:MM:SS time_limit to total minutes.""" parts = self.time_limit.split(":") diff --git a/scripts/cli.py b/scripts/cli.py index 5cd370a..c17796d 100644 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -152,6 +152,16 @@ def main(argv: list[str] | None = None) -> int: help="Submit a profiling job to K8s or Slurm", add_help=False, ) + sub.add_parser( + "status", + help="Query job status (local/k8s/slurm)", + add_help=False, + ) + sub.add_parser( + "logs", + help="Retrieve job logs (local/k8s/slurm)", + add_help=False, + ) args, remaining = parser.parse_known_args(argv) @@ -164,6 +174,18 @@ def main(argv: list[str] | None = None) -> int: submit_main(remaining) return 0 + if args.command == "status": + from scripts.status_profile import main_status + + main_status(remaining) + return 0 + + if args.command == "logs": + from scripts.status_profile import main_logs + + main_logs(remaining) + return 0 + parser.print_help() return 1 diff --git a/scripts/run_stage_profile.py b/scripts/run_stage_profile.py index 8346e3b..c27d6f3 100644 --- a/scripts/run_stage_profile.py +++ b/scripts/run_stage_profile.py @@ -714,8 +714,8 @@ def parse_args(argv: Optional[list] = None) -> argparse.Namespace: ) srv.add_argument( "--log-dir", - default="/flowsim/tests/test-artifacts", - help="Directory for server logs", + default="", + help="Directory for server logs (default: {output-dir}/logs/)", ) return p.parse_args(argv) @@ -873,6 +873,10 @@ def _write_summary(args, summary: list[dict]) -> None: def main(argv: Optional[list] = None) -> int: args = parse_args(argv) + # Default log_dir to {output_dir}/logs/ if not specified + if not args.log_dir: + args.log_dir = os.path.join(args.output_dir, 
"logs") + if args.decode_tokens < 2: print( "[ERROR] --decode-tokens must be >= 2. " diff --git a/scripts/status_profile.py b/scripts/status_profile.py new file mode 100644 index 0000000..bfcce41 --- /dev/null +++ b/scripts/status_profile.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +"""Query FlowSim profiling job status and logs. + +Usage examples +-------------- + +Check K8s job status:: + + flowsim status --scheduler k8s --job flowsim-perf-qwen3-8b-bs1-il2048 + +Get K8s job logs:: + + flowsim logs --scheduler k8s --job flowsim-perf-qwen3-8b-bs1-il2048 + +Check Slurm job status:: + + flowsim status --scheduler slurm --job 12345 + +Check local job status (by job name prefix):: + + flowsim status --scheduler local --job flowsim-perf-qwen3-8b-bs1-il2048 +""" + +from __future__ import annotations + +import argparse +import os +import sys + +from schedulers.config import cfg_get, load_k8s_config, load_slurm_config, resolve_jwt_token +from schedulers.k8s import K8sScheduler +from schedulers.local import LocalScheduler +from schedulers.slurm import SlurmScheduler + + +def _d(env_var: str, cfg: dict, key: str, fallback: str = "") -> str: + return os.environ.get(env_var, "") or cfg_get(cfg, key, fallback) + + +def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: + k8s_cfg = load_k8s_config() + slurm_cfg = load_slurm_config() + + p = argparse.ArgumentParser( + description="Query FlowSim profiling job status or logs.", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + p.add_argument( + "--scheduler", + choices=["local", "k8s", "slurm"], + required=True, + ) + p.add_argument( + "--job", + required=True, + help="Job name (k8s/local) or job ID (slurm)", + ) + p.add_argument( + "--tail", + type=int, + default=100, + help="Number of log lines to show (default: 100)", + ) + + # -- Local options -- + p.add_argument("--local-workdir", default="") + + # -- K8s options -- + p.add_argument( + "--k8s-namespace", + default=_d("FLOWSIM_K8S_NAMESPACE", 
k8s_cfg, "namespace", "default"), + ) + p.add_argument( + "--k8s-kubeconfig", + default=_d("KUBECONFIG", k8s_cfg, "kubeconfig", ""), + ) + p.add_argument( + "--k8s-context", + default=_d("FLOWSIM_K8S_CONTEXT", k8s_cfg, "context", ""), + ) + + # -- Slurm options -- + p.add_argument( + "--slurm-rest-url", + default=_d("FLOWSIM_SLURM_REST_URL", slurm_cfg, "rest_url", ""), + ) + p.add_argument( + "--slurm-jwt-token", + default=_d("FLOWSIM_SLURM_JWT_TOKEN", slurm_cfg, "jwt_token", ""), + ) + p.add_argument( + "--slurm-api-version", + default=_d("FLOWSIM_SLURM_API_VERSION", slurm_cfg, "api_version", "v0.0.40"), + ) + p.add_argument( + "--slurm-no-verify-ssl", + action="store_true", + ) + + return p.parse_args(argv) + + +def _build_scheduler(args: argparse.Namespace): + if args.scheduler == "local": + return LocalScheduler(workdir=args.local_workdir) + elif args.scheduler == "k8s": + return K8sScheduler( + namespace=args.k8s_namespace, + kubeconfig=args.k8s_kubeconfig, + context=args.k8s_context, + ) + else: + return SlurmScheduler( + rest_url=args.slurm_rest_url, + jwt_token=args.slurm_jwt_token, + api_version=args.slurm_api_version, + verify_ssl=not args.slurm_no_verify_ssl, + ) + + +def main_status(argv: list[str] | None = None) -> None: + args = _parse_args(argv) + + # Resolve Slurm JWT if needed + if args.scheduler == "slurm" and not args.slurm_jwt_token: + slurm_cfg = load_slurm_config() + token = resolve_jwt_token(slurm_cfg) + if token: + args.slurm_jwt_token = token + + scheduler = _build_scheduler(args) + try: + info = scheduler.status(args.job) + print(info["message"]) + except Exception as exc: + print(f"Error: {exc}", file=sys.stderr) + sys.exit(1) + + +def main_logs(argv: list[str] | None = None) -> None: + args = _parse_args(argv) + + # Resolve Slurm JWT if needed + if args.scheduler == "slurm" and not args.slurm_jwt_token: + slurm_cfg = load_slurm_config() + token = resolve_jwt_token(slurm_cfg) + if token: + args.slurm_jwt_token = token + + scheduler = 
_build_scheduler(args) + try: + text = scheduler.logs(args.job, tail=args.tail) + print(text) + except Exception as exc: + print(f"Error: {exc}", file=sys.stderr) + sys.exit(1) diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 18c68aa..25061f1 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -114,10 +114,7 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: ) infra.add_argument("--host", default="0.0.0.0") infra.add_argument("--port", type=int, default=30001) - infra.add_argument("--output-dir", default="/flowsim/stage_traces") - infra.add_argument( - "--log-dir", default="/flowsim/tests/test-artifacts", - ) + infra.add_argument("--output-dir", default="") infra.add_argument("--job-name", default="") # -- Local options -- @@ -300,7 +297,6 @@ def _build_spec(args: argparse.Namespace) -> ProfileJobSpec: host=args.host, port=args.port, output_dir=args.output_dir, - log_dir=args.log_dir, job_name=args.job_name, extra_server_opts=args.extra_server_opts, disagg_transfer_backend=args.disagg_transfer_backend, @@ -353,6 +349,18 @@ def _build_scheduler(args: argparse.Namespace): def main(argv: list[str] | None = None) -> None: args = _parse_args(argv) + # Smart defaults for output_dir based on scheduler + if not args.output_dir: + if args.scheduler == "local": + project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + args.output_dir = os.path.join(project_root, "stage_traces") + elif args.scheduler == "slurm": + # Slurm: default to ~/flowsim_traces (shared filesystem) + args.output_dir = os.path.expanduser("~/flowsim_traces") + else: + # K8s: container path (PVC/hostPath mounted here) + args.output_dir = "/flowsim/stage_traces" + # Resolve Slurm JWT token from jwt_token_cmd in config if needed if args.scheduler == "slurm" and not args.slurm_jwt_token: slurm_cfg = load_slurm_config() @@ -380,6 +388,22 @@ def main(argv: list[str] | None = None) -> None: else: result = 
scheduler.submit(spec)
     print(result)
+    # Tell user where to find results
+    print()
+    print(f"Traces: {spec.output_dir}")
+    print(f"Logs:   {spec.log_dir}")
+    if args.scheduler == "k8s":
+        if args.k8s_pvc:
+            print(f"  (persisted on PVC '{args.k8s_pvc}')")
+        else:
+            print(f"  (persisted at hostPath '{args.k8s_host_output_dir}' on the node)")
+        print(f"\nTo check status: flowsim status --scheduler k8s --job {spec.default_job_name()[:63]}")
+        print(f"To view logs:    flowsim logs --scheduler k8s --job {spec.default_job_name()[:63]}")
+    elif args.scheduler == "slurm":
+        print(f"  (on cluster shared filesystem)")
+        print(f"\nTo check status: flowsim status --scheduler slurm --job <JOB_ID>  (ID printed in 'Submitted batch job ...' above)")
+    else:
+        print(f"\nTo view logs: flowsim logs --scheduler local --job {spec.default_job_name()}")
 
 
 _INIT_HINT = "Run 'flowsim init' to create config files."
@@ -394,6 +418,20 @@ def _validate_connection(args: argparse.Namespace) -> None:
             "Set it in ~/.flowsim/k8s.yaml, FLOWSIM_K8S_NAMESPACE env var,\n"
             f"or --k8s-namespace flag. {_INIT_HINT}"
         )
+    # Traces + logs must survive pod termination
+    if not args.k8s_pvc and not args.k8s_host_output_dir:
+        sys.exit(
+            "Error: no persistent storage configured for K8s job output.\n"
+            "Traces and logs are written to output_dir inside the pod —\n"
+            "without a volume mount they are lost when the pod exits.\n\n"
+            "Set one of:\n"
+            "  --k8s-pvc              (PersistentVolumeClaim)\n"
+            "  --k8s-host-output-dir  (hostPath on the node)\n\n"
+            "Or configure in ~/.flowsim/k8s.yaml:\n"
+            "  pvc: my-traces-pvc\n"
+            "  # or\n"
+            "  host_output_dir: /data/flowsim-traces"
+        )
     # kubeconfig is optional (in-cluster auto-discovery), but warn
     if not args.k8s_kubeconfig and not args.k8s_context:
         print(
diff --git a/tests/integration/test_scheduler_local.py b/tests/integration/test_scheduler_local.py
new file mode 100644
index 0000000..062418a
--- /dev/null
+++ b/tests/integration/test_scheduler_local.py
@@ -0,0 +1,229 @@
+"""Integration tests for `flowsim submit --scheduler local`.
+ +Runs actual profiling jobs inside the FlowSim Docker container and verifies +that traces and parsed CSVs are produced. + +Requirements +------------ +* Running inside the ``flowsim`` Docker container with GPUs. +* ``pip install -e .`` done (or schedulers/ available on PYTHONPATH). + +Environment Variables +--------------------- +``MODEL`` + Model path (default: ``/flowsim/workload/models/configs/Qwen3-235B-A22B``). +``LOAD_FORMAT`` + Load format (default: ``dummy``). + +Usage +----- + docker exec flowsim-test python -m pytest tests/integration/test_scheduler_local.py -v -x +""" + +import glob +import os +import subprocess +import sys + +import pytest + +_PROJECT_ROOT = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "..") +) + +MODEL = os.environ.get( + "MODEL", "/flowsim/workload/models/configs/Qwen3-235B-A22B" +) +LOAD_FORMAT = os.environ.get("LOAD_FORMAT", "dummy") +ARTIFACT_DIR = os.environ.get( + "PYTEST_ARTIFACT_DIR", "/flowsim/tests/test-artifacts" +) + + +def _flowsim_submit(*args: str, timeout: int = 1200) -> subprocess.CompletedProcess: + """Run ``flowsim submit`` via Python entry point.""" + cmd = [ + sys.executable, "-u", "-c", + "from scripts.cli import main; main()", + "submit", *args, + ] + env = os.environ.copy() + env["PYTHONPATH"] = _PROJECT_ROOT + ( + ":" + env.get("PYTHONPATH", "") + ) + env["PYTHONUNBUFFERED"] = "1" + result = subprocess.run( + cmd, + capture_output=True, + text=True, + cwd=_PROJECT_ROOT, + env=env, + timeout=timeout, + ) + return result + + +class TestLocalSubmitPerf: + """flowsim submit --scheduler local --collect perf — runs real profiling.""" + + def test_local_perf_tp1(self): + """Single-GPU perf profiling via flowsim submit.""" + output_dir = os.path.join(ARTIFACT_DIR, "local_perf_tp1") + log_dir = os.path.join(ARTIFACT_DIR, "local_perf_tp1_logs") + + r = _flowsim_submit( + "--scheduler", "local", + "--collect", "perf", + "--model-path", MODEL, + "--tp", "1", + "--bs", "1", + "--input-len", "512", + 
"--decode-tokens", "8", + "--warmup-n", "2", + "--gpus", "1", + "--local-gpus", "0", + "--output-dir", output_dir, + "--log-dir", log_dir, + "--extra-server-opts", f"--load-format {LOAD_FORMAT}", + ) + + # Print output for debugging + if r.returncode != 0: + print("STDOUT:", r.stdout[-3000:]) + print("STDERR:", r.stderr[-3000:]) + assert r.returncode == 0, f"flowsim submit failed (exit {r.returncode})" + + # Verify trace files exist + traces = glob.glob( + os.path.join(output_dir, "**/*.trace.json.gz"), recursive=True + ) + assert len(traces) > 0, f"No trace files under {output_dir}" + + extend = [t for t in traces if "EXTEND" in os.path.basename(t)] + decode = [t for t in traces if "DECODE" in os.path.basename(t)] + assert len(extend) > 0, "No EXTEND traces" + assert len(decode) > 0, "No DECODE traces" + + # Verify parsed CSVs + csvs = glob.glob( + os.path.join(output_dir, "**/parsed/*.csv"), recursive=True + ) + assert len(csvs) > 0, f"No parsed CSVs under {output_dir}" + + def test_local_perf_tp2(self): + """Multi-GPU perf profiling (TP=2) via flowsim submit.""" + output_dir = os.path.join(ARTIFACT_DIR, "local_perf_tp2") + log_dir = os.path.join(ARTIFACT_DIR, "local_perf_tp2_logs") + + r = _flowsim_submit( + "--scheduler", "local", + "--collect", "perf", + "--model-path", MODEL, + "--tp", "2", + "--bs", "1", + "--input-len", "1024", + "--decode-tokens", "8", + "--warmup-n", "2", + "--gpus", "2", + "--local-gpus", "0,1", + "--output-dir", output_dir, + "--log-dir", log_dir, + "--extra-server-opts", f"--load-format {LOAD_FORMAT}", + ) + + if r.returncode != 0: + print("STDOUT:", r.stdout[-3000:]) + print("STDERR:", r.stderr[-3000:]) + assert r.returncode == 0, f"flowsim submit failed (exit {r.returncode})" + + traces = glob.glob( + os.path.join(output_dir, "**/*.trace.json.gz"), recursive=True + ) + assert len(traces) > 0, f"No trace files under {output_dir}" + + # TP=2 should produce traces for both ranks + tp0 = [t for t in traces if "TP-0" in 
os.path.basename(t)] + tp1 = [t for t in traces if "TP-1" in os.path.basename(t)] + assert len(tp0) > 0, "No TP-0 traces" + assert len(tp1) > 0, "No TP-1 traces" + + +class TestLocalSubmitDryRun: + """flowsim submit --scheduler local --dry-run — verify command generation.""" + + def test_dry_run_output(self): + r = _flowsim_submit( + "--scheduler", "local", + "--collect", "perf", + "--model-path", MODEL, + "--tp", "2", + "--local-gpus", "0,1", + "--dry-run", + ) + assert r.returncode == 0 + assert "CUDA_VISIBLE_DEVICES=0,1" in r.stdout + assert "scripts/run_stage_profile.py" in r.stdout + assert "--tp 2" in r.stdout + + def test_dry_run_pd(self): + r = _flowsim_submit( + "--scheduler", "local", + "--collect", "perf", + "--model-path", MODEL, + "--pd", + "--dry-run", + ) + assert r.returncode == 0 + assert "PREFILL INSTANCE" in r.stdout + assert "DECODE INSTANCE" in r.stdout + assert "--disaggregation-mode prefill" in r.stdout + assert "--disaggregation-mode decode" in r.stdout + + +class TestK8sSubmitDryRun: + """flowsim submit --scheduler k8s --dry-run — verify YAML generation.""" + + def test_k8s_dry_run(self): + r = _flowsim_submit( + "--scheduler", "k8s", + "--collect", "perf", + "--model-path", MODEL, + "--k8s-namespace", "default", + "--dry-run", + ) + assert r.returncode == 0 + assert "apiVersion: batch/v1" in r.stdout + assert "kind: Job" in r.stdout + assert MODEL in r.stdout + + def test_k8s_pd_dry_run(self): + r = _flowsim_submit( + "--scheduler", "k8s", + "--collect", "perf", + "--model-path", MODEL, + "--k8s-namespace", "default", + "--pd", + "--dry-run", + ) + assert r.returncode == 0 + assert "PREFILL INSTANCE" in r.stdout + assert "DECODE INSTANCE" in r.stdout + + +class TestSlurmSubmitDryRun: + """flowsim submit --scheduler slurm --dry-run — verify sbatch script.""" + + def test_slurm_dry_run(self): + r = _flowsim_submit( + "--scheduler", "slurm", + "--collect", "perf", + "--model-path", MODEL, + "--slurm-partition", "gpu", + "--slurm-rest-url", 
"http://fake:6820", + "--slurm-jwt-token", "fake", + "--dry-run", + ) + assert r.returncode == 0 + assert "#!/bin/bash" in r.stdout + assert "#SBATCH --partition=gpu" in r.stdout + assert MODEL in r.stdout From ea3c27abb0c66790cfac5e943525aa7c2eb2e129 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 05:43:33 +0000 Subject: [PATCH 16/56] fix: flowsim logs shows all log files (stdout + stderr) with listing --- schedulers/k8s.py | 16 ++++++++++++++-- schedulers/local.py | 33 +++++++++++++++++++++++++-------- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/schedulers/k8s.py b/schedulers/k8s.py index 1640f5c..69e13e1 100644 --- a/schedulers/k8s.py +++ b/schedulers/k8s.py @@ -250,7 +250,13 @@ def status(self, job_id: str) -> dict: } def logs(self, job_id: str, *, tail: int = 100) -> str: - """Retrieve logs from the pod(s) of a K8s Job.""" + """Retrieve logs from the pod(s) of a K8s Job. + + Shows the Pod stdout/stderr (profiling script output). + Server log files are persisted on the PVC/hostPath under + ``{output_dir}/logs/`` and can be accessed from the node + or another pod mounting the same volume. 
+ """ try: from kubernetes import client as k8s_client except ImportError: @@ -276,6 +282,12 @@ def logs(self, job_id: str, *, tail: int = 100) -> str: ) except Exception as exc: log_text = f"(error reading logs: {exc})" - parts.append(f"=== {name} ===\n{log_text}") + parts.append(f"=== Pod: {name} (last {tail} lines) ===\n{log_text}") + + # Hint about persistent server logs + if self.pvc_name: + parts.append(f"\nServer logs persisted on PVC '{self.pvc_name}' under {{output_dir}}/logs/") + elif self.host_output_dir: + parts.append(f"\nServer logs at {self.host_output_dir}/logs/ on the scheduled node") return "\n".join(parts) diff --git a/schedulers/local.py b/schedulers/local.py index da3b03a..2d38b0a 100644 --- a/schedulers/local.py +++ b/schedulers/local.py @@ -159,19 +159,36 @@ def status(self, job_id: str) -> dict: } def logs(self, job_id: str, *, tail: int = 100) -> str: - """Read the last *tail* lines from the most recent local log file.""" + """Show log files for a local job. + + Lists all log files matching *job_id*, then prints the last + *tail* lines of the most recent stdout **and** stderr logs. 
+ """ import glob log_dir = os.path.join(self.workdir, "stage_traces", "logs") - pattern = os.path.join(log_dir, f"{job_id}_*.stdout.log") + pattern = os.path.join(log_dir, f"{job_id}_*") matches = sorted(glob.glob(pattern)) if not matches: return f"No logs found matching {pattern}" - latest = matches[-1] - with open(latest) as f: - all_lines = f.readlines() - - header = f"=== {latest} (last {tail} lines) ===\n" - return header + "".join(all_lines[-tail:]) + parts = [f"Log files ({len(matches)}):"] + for p in matches: + size = os.path.getsize(p) + parts.append(f" {p} ({size} bytes)") + parts.append("") + + # Show tail of latest stdout + stderr + stdout_files = sorted(f for f in matches if f.endswith(".stdout.log")) + stderr_files = sorted(f for f in matches if f.endswith(".stderr.log")) + + for label, files in [("stdout", stdout_files), ("stderr", stderr_files)]: + if files: + latest = files[-1] + with open(latest) as fh: + lines = fh.readlines() + parts.append(f"=== {latest} (last {tail} lines) ===") + parts.append("".join(lines[-tail:])) + + return "\n".join(parts) From eb46c36f716f8a7f256c9651ac4d3fcaa64ebe08 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 05:44:51 +0000 Subject: [PATCH 17/56] fix: flowsim logs shows file locations + actionable commands instead of dumping content --- schedulers/k8s.py | 56 +++++++++++++++++++++++++++++---------------- schedulers/local.py | 38 ++++++++++++++++-------------- schedulers/slurm.py | 31 +++++++++++++++++-------- 3 files changed, 79 insertions(+), 46 deletions(-) diff --git a/schedulers/k8s.py b/schedulers/k8s.py index 69e13e1..7d52319 100644 --- a/schedulers/k8s.py +++ b/schedulers/k8s.py @@ -250,13 +250,7 @@ def status(self, job_id: str) -> dict: } def logs(self, job_id: str, *, tail: int = 100) -> str: - """Retrieve logs from the pod(s) of a K8s Job. - - Shows the Pod stdout/stderr (profiling script output). 
- Server log files are persisted on the PVC/hostPath under - ``{output_dir}/logs/`` and can be accessed from the node - or another pod mounting the same volume. - """ + """Show where logs are and how to access them for a K8s Job.""" try: from kubernetes import client as k8s_client except ImportError: @@ -271,23 +265,45 @@ def logs(self, job_id: str, *, tail: int = 100) -> str: if not pods.items: return f"No pods found for job {job_id} in namespace {self.namespace}" - parts = [] + parts: list[str] = [] + + # Pod info for pod in pods.items: name = pod.metadata.name - try: - log_text = core_api.read_namespaced_pod_log( - name=name, - namespace=self.namespace, - tail_lines=tail, - ) - except Exception as exc: - log_text = f"(error reading logs: {exc})" - parts.append(f"=== Pod: {name} (last {tail} lines) ===\n{log_text}") + phase = pod.status.phase + parts.append(f"Pod: {name} ({phase})") + + parts.append("") + + # Commands to view pod stdout + parts.append("View profiling script output:") + for pod in pods.items: + name = pod.metadata.name + parts.append(f" kubectl logs {name} -n {self.namespace}") + parts.append(f" kubectl logs {name} -n {self.namespace} --tail={tail}") + + parts.append("") - # Hint about persistent server logs + # Persistent log files if self.pvc_name: - parts.append(f"\nServer logs persisted on PVC '{self.pvc_name}' under {{output_dir}}/logs/") + parts.append(f"Server logs + traces persisted on PVC '{self.pvc_name}'.") + parts.append("Copy to local machine:") + for pod in pods.items: + name = pod.metadata.name + if pod.status.phase in ("Running", "Succeeded"): + parts.append(f" kubectl cp {self.namespace}/{name}:/flowsim/stage_traces ./stage_traces") + break + else: + parts.append(" (pod not running — mount the PVC in another pod to retrieve files)") elif self.host_output_dir: - parts.append(f"\nServer logs at {self.host_output_dir}/logs/ on the scheduled node") + parts.append(f"Server logs + traces at hostPath on the node:") + parts.append(f" 
{self.host_output_dir}/") + parts.append(f" {self.host_output_dir}/logs/") + # Identify node + for pod in pods.items: + if pod.spec.node_name: + parts.append(f" Node: {pod.spec.node_name}") + parts.append(f" scp {pod.spec.node_name}:{self.host_output_dir}/ ./stage_traces/") + break return "\n".join(parts) diff --git a/schedulers/local.py b/schedulers/local.py index 2d38b0a..67704c5 100644 --- a/schedulers/local.py +++ b/schedulers/local.py @@ -159,11 +159,7 @@ def status(self, job_id: str) -> dict: } def logs(self, job_id: str, *, tail: int = 100) -> str: - """Show log files for a local job. - - Lists all log files matching *job_id*, then prints the last - *tail* lines of the most recent stdout **and** stderr logs. - """ + """List log files for a local job and print access commands.""" import glob log_dir = os.path.join(self.workdir, "stage_traces", "logs") @@ -171,24 +167,32 @@ def logs(self, job_id: str, *, tail: int = 100) -> str: matches = sorted(glob.glob(pattern)) if not matches: - return f"No logs found matching {pattern}" + # Also try wildcard — user may have given a partial name + pattern = os.path.join(log_dir, f"*{job_id}*") + matches = sorted(glob.glob(pattern)) + + if not matches: + return f"No logs found in {log_dir} matching '{job_id}'" - parts = [f"Log files ({len(matches)}):"] + parts = [f"Log directory: {log_dir}", ""] + parts.append(f"Files ({len(matches)}):") for p in matches: size = os.path.getsize(p) - parts.append(f" {p} ({size} bytes)") - parts.append("") + parts.append(f" {os.path.basename(p)} ({size:,} bytes)") - # Show tail of latest stdout + stderr + # Provide commands + parts.append("") + parts.append("View logs:") stdout_files = sorted(f for f in matches if f.endswith(".stdout.log")) stderr_files = sorted(f for f in matches if f.endswith(".stderr.log")) + if stdout_files: + parts.append(f" less {stdout_files[-1]}") + if stderr_files: + parts.append(f" less {stderr_files[-1]}") - for label, files in [("stdout", stdout_files), ("stderr", 
stderr_files)]: - if files: - latest = files[-1] - with open(latest) as fh: - lines = fh.readlines() - parts.append(f"=== {latest} (last {tail} lines) ===") - parts.append("".join(lines[-tail:])) + trace_dir = os.path.join(self.workdir, "stage_traces") + parts.append("") + parts.append(f"Trace files: {trace_dir}") + parts.append(f" ls {trace_dir}") return "\n".join(parts) diff --git a/schedulers/slurm.py b/schedulers/slurm.py index 9ec84a6..3f5d166 100644 --- a/schedulers/slurm.py +++ b/schedulers/slurm.py @@ -291,20 +291,33 @@ def status(self, job_id: str) -> dict: } def logs(self, job_id: str, *, tail: int = 100) -> str: - """Retrieve log output for a Slurm job. - - Tries to read the sbatch output file via slurmrestd. - Falls back to showing job info if direct log access isn't available. - """ + """Show where Slurm job logs are and how to access them.""" info = self.status(job_id) output_file = info.get("output_hint", "") - lines = [info["message"], ""] + state = info.get("state", "UNKNOWN") + + parts = [info["message"], ""] if output_file: - lines.append(f"To view full logs on the cluster:") - lines.append(f" tail -{tail} {output_file}") + parts.append(f"Log file (on cluster shared filesystem):") + parts.append(f" {output_file}") + parts.append("") + parts.append("View on login node:") + parts.append(f" less {output_file}") + parts.append(f" tail -{tail} {output_file}") + parts.append("") + parts.append("Copy to local machine:") + parts.append(f" scp :{output_file} .") + else: + parts.append("No output file path found in job metadata.") - return "\n".join(lines) + # Trace files location + parts.append("") + parts.append("Trace files (on cluster shared filesystem):") + parts.append(" ~/flowsim_traces/") + parts.append(" ls ~/flowsim_traces/") + + return "\n".join(parts) def _parse_time_minutes(self) -> int: """Convert HH:MM:SS time_limit to total minutes.""" From 0e59219ab1b416650b0f732780266a821b9edc1d Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 
Mar 2026 05:56:55 +0000 Subject: [PATCH 18/56] test: integration tests for all 3 scheduler backends (local/k8s/slurm) - TestLocalScheduler: real TP=1 profiling, verify traces + logs + status/logs CLI - TestK8sScheduler: dry-run YAML (PVC mount, hostPath, log paths), refuse without storage, real Job submit to Kind cluster with status/logs verification - TestSlurmScheduler: dry-run sbatch script (output_dir, log_dir, PD pair) Results: 9 passed, 1 skipped (K8s real submit skipped in container, passes on host) --- tests/integration/test_scheduler_local.py | 359 +++++++++++++++------- 1 file changed, 251 insertions(+), 108 deletions(-) diff --git a/tests/integration/test_scheduler_local.py b/tests/integration/test_scheduler_local.py index 062418a..879c00f 100644 --- a/tests/integration/test_scheduler_local.py +++ b/tests/integration/test_scheduler_local.py @@ -1,12 +1,20 @@ -"""Integration tests for `flowsim submit --scheduler local`. +"""Integration tests for ``flowsim submit``, ``flowsim status``, ``flowsim logs``. -Runs actual profiling jobs inside the FlowSim Docker container and verifies -that traces and parsed CSVs are produced. +Tests all three scheduler backends (local, k8s, slurm) end-to-end. + +* **local** — runs real TP=1 profiling and verifies traces, parsed CSVs, + and log files are all produced in the correct locations. +* **k8s** — submits a real Job to a Kind cluster, verifies it was created, + then checks ``flowsim status`` / ``flowsim logs`` output. Also validates + that dry-run YAML has the correct volume mounts and log paths. +* **slurm** — dry-run only; verifies the sbatch script has the correct + ``output_dir``, ``--log-dir``, and ``#SBATCH --output`` directives. Requirements ------------ -* Running inside the ``flowsim`` Docker container with GPUs. -* ``pip install -e .`` done (or schedulers/ available on PYTHONPATH). +* The ``flowsim-test`` container with GPUs (for local tests). +* A Kind cluster named ``flowsim`` (for K8s tests). 
+* ``schedulers/`` available on PYTHONPATH. Environment Variables --------------------- @@ -17,13 +25,20 @@ Usage ----- - docker exec flowsim-test python -m pytest tests/integration/test_scheduler_local.py -v -x + # Inside container (local tests): + docker exec flowsim-test python -m pytest \ + tests/integration/test_scheduler_local.py -v -x + + # On host (k8s tests — needs kubeconfig): + python -m pytest tests/integration/test_scheduler_local.py \ + -v -x -k "k8s" """ import glob import os import subprocess import sys +import time import pytest @@ -39,20 +54,23 @@ "PYTEST_ARTIFACT_DIR", "/flowsim/tests/test-artifacts" ) +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- -def _flowsim_submit(*args: str, timeout: int = 1200) -> subprocess.CompletedProcess: - """Run ``flowsim submit`` via Python entry point.""" +def _flowsim_cli(*args: str, timeout: int = 1200) -> subprocess.CompletedProcess: + """Run a ``flowsim`` subcommand via Python entry point.""" cmd = [ sys.executable, "-u", "-c", "from scripts.cli import main; main()", - "submit", *args, + *args, ] env = os.environ.copy() env["PYTHONPATH"] = _PROJECT_ROOT + ( ":" + env.get("PYTHONPATH", "") ) env["PYTHONUNBUFFERED"] = "1" - result = subprocess.run( + return subprocess.run( cmd, capture_output=True, text=True, @@ -60,18 +78,58 @@ def _flowsim_submit(*args: str, timeout: int = 1200) -> subprocess.CompletedProc env=env, timeout=timeout, ) - return result -class TestLocalSubmitPerf: - """flowsim submit --scheduler local --collect perf — runs real profiling.""" +def _assert_traces(output_dir: str) -> None: + """Assert EXTEND + DECODE traces and parsed CSVs exist.""" + traces = glob.glob( + os.path.join(output_dir, "**/*.trace.json.gz"), recursive=True + ) + assert len(traces) > 0, f"No trace files under {output_dir}" + extend = [t for t in traces if "EXTEND" in os.path.basename(t)] + decode = [t 
for t in traces if "DECODE" in os.path.basename(t)] + assert len(extend) > 0, "No EXTEND traces" + assert len(decode) > 0, "No DECODE traces" + + csvs = glob.glob( + os.path.join(output_dir, "**/parsed/*.csv"), recursive=True + ) + assert len(csvs) > 0, f"No parsed CSVs under {output_dir}" + # At least EXTEND should be parsed; DECODE CSV may be absent for short sequences + extend_csvs = [c for c in csvs if "EXTEND" in os.path.basename(c)] + assert len(extend_csvs) > 0, "No EXTEND parsed CSVs" + + +def _assert_logs(output_dir: str) -> None: + """Assert server log files exist under {output_dir}/logs/.""" + log_dir = os.path.join(output_dir, "logs") + assert os.path.isdir(log_dir), f"Log directory not found: {log_dir}" + log_files = os.listdir(log_dir) + assert len(log_files) > 0, f"No log files in {log_dir}" + stdout_logs = [f for f in log_files if f.endswith(".stdout.log")] + stderr_logs = [f for f in log_files if f.endswith(".stderr.log")] + assert len(stdout_logs) > 0, f"No stdout logs in {log_dir}" + assert len(stderr_logs) > 0, f"No stderr logs in {log_dir}" + # At least one log should be non-empty + sizes = [ + os.path.getsize(os.path.join(log_dir, f)) + for f in stdout_logs + ] + assert max(sizes) > 0, "All stdout logs are empty" + + +# ===================================================================== +# LOCAL SCHEDULER — real profiling +# ===================================================================== +class TestLocalScheduler: + """Run real profiling via ``flowsim submit --scheduler local``.""" def test_local_perf_tp1(self): - """Single-GPU perf profiling via flowsim submit.""" - output_dir = os.path.join(ARTIFACT_DIR, "local_perf_tp1") - log_dir = os.path.join(ARTIFACT_DIR, "local_perf_tp1_logs") + """TP=1 perf profiling: traces + parsed CSVs + log files.""" + output_dir = os.path.join(ARTIFACT_DIR, "sched_local_tp1") - r = _flowsim_submit( + r = _flowsim_cli( + "submit", "--scheduler", "local", "--collect", "perf", "--model-path", MODEL, @@ 
-83,147 +141,232 @@ def test_local_perf_tp1(self): "--gpus", "1", "--local-gpus", "0", "--output-dir", output_dir, - "--log-dir", log_dir, "--extra-server-opts", f"--load-format {LOAD_FORMAT}", ) - # Print output for debugging if r.returncode != 0: print("STDOUT:", r.stdout[-3000:]) print("STDERR:", r.stderr[-3000:]) assert r.returncode == 0, f"flowsim submit failed (exit {r.returncode})" - # Verify trace files exist - traces = glob.glob( - os.path.join(output_dir, "**/*.trace.json.gz"), recursive=True - ) - assert len(traces) > 0, f"No trace files under {output_dir}" - - extend = [t for t in traces if "EXTEND" in os.path.basename(t)] - decode = [t for t in traces if "DECODE" in os.path.basename(t)] - assert len(extend) > 0, "No EXTEND traces" - assert len(decode) > 0, "No DECODE traces" + # Verify traces and parsed CSVs + _assert_traces(output_dir) - # Verify parsed CSVs - csvs = glob.glob( - os.path.join(output_dir, "**/parsed/*.csv"), recursive=True - ) - assert len(csvs) > 0, f"No parsed CSVs under {output_dir}" + # Verify log files under output_dir/logs/ + _assert_logs(output_dir) - def test_local_perf_tp2(self): - """Multi-GPU perf profiling (TP=2) via flowsim submit.""" - output_dir = os.path.join(ARTIFACT_DIR, "local_perf_tp2") - log_dir = os.path.join(ARTIFACT_DIR, "local_perf_tp2_logs") + # Verify submit output mentions log/trace locations + combined = r.stdout + r.stderr + assert "Traces:" in combined, "Submit output should show trace location" + assert "Logs:" in combined, "Submit output should show log location" - r = _flowsim_submit( + def test_local_status(self): + """flowsim status --scheduler local should find logs from the previous run.""" + r = _flowsim_cli( + "status", "--scheduler", "local", - "--collect", "perf", - "--model-path", MODEL, - "--tp", "2", - "--bs", "1", - "--input-len", "1024", - "--decode-tokens", "8", - "--warmup-n", "2", - "--gpus", "2", - "--local-gpus", "0,1", - "--output-dir", output_dir, - "--log-dir", log_dir, - 
"--extra-server-opts", f"--load-format {LOAD_FORMAT}", + "--job", "flowsim-perf", ) + # Should either find logs or say not found — should not crash + assert r.returncode == 0 - if r.returncode != 0: - print("STDOUT:", r.stdout[-3000:]) - print("STDERR:", r.stderr[-3000:]) - assert r.returncode == 0, f"flowsim submit failed (exit {r.returncode})" - - traces = glob.glob( - os.path.join(output_dir, "**/*.trace.json.gz"), recursive=True + def test_local_logs(self): + """flowsim logs --scheduler local should list log files and give paths.""" + r = _flowsim_cli( + "logs", + "--scheduler", "local", + "--job", "flowsim-perf", ) - assert len(traces) > 0, f"No trace files under {output_dir}" - - # TP=2 should produce traces for both ranks - tp0 = [t for t in traces if "TP-0" in os.path.basename(t)] - tp1 = [t for t in traces if "TP-1" in os.path.basename(t)] - assert len(tp0) > 0, "No TP-0 traces" - assert len(tp1) > 0, "No TP-1 traces" + assert r.returncode == 0 + output = r.stdout + # Should contain file listing or "No logs" — not crash + assert "Log directory:" in output or "No logs" in output -class TestLocalSubmitDryRun: - """flowsim submit --scheduler local --dry-run — verify command generation.""" +# ===================================================================== +# K8S SCHEDULER +# ===================================================================== +class TestK8sScheduler: + """K8s scheduler: dry-run validates YAML structure, real submit to Kind.""" - def test_dry_run_output(self): - r = _flowsim_submit( - "--scheduler", "local", + def test_k8s_dry_run_has_volume_and_log_path(self): + """Dry-run YAML should mount output volume and pass --log-dir.""" + r = _flowsim_cli( + "submit", + "--scheduler", "k8s", "--collect", "perf", "--model-path", MODEL, - "--tp", "2", - "--local-gpus", "0,1", + "--k8s-namespace", "default", + "--k8s-pvc", "test-traces", + "--output-dir", "/data/traces", "--dry-run", ) assert r.returncode == 0 - assert "CUDA_VISIBLE_DEVICES=0,1" in 
r.stdout - assert "scripts/run_stage_profile.py" in r.stdout - assert "--tp 2" in r.stdout - - def test_dry_run_pd(self): - r = _flowsim_submit( - "--scheduler", "local", + yaml_output = r.stdout + + # Job structure + assert "apiVersion: batch/v1" in yaml_output + assert "kind: Job" in yaml_output + + # PVC volume mount + assert "test-traces" in yaml_output + assert "persistentVolumeClaim" in yaml_output + + # output_dir and derived log_dir appear in the command + assert "--output-dir" in yaml_output + assert "/data/traces" in yaml_output + assert "--log-dir" in yaml_output + assert "/data/traces/logs" in yaml_output + + def test_k8s_dry_run_hostpath(self): + """Dry-run with hostPath should have hostPath volume.""" + r = _flowsim_cli( + "submit", + "--scheduler", "k8s", "--collect", "perf", "--model-path", MODEL, - "--pd", + "--k8s-namespace", "default", + "--k8s-host-output-dir", "/mnt/traces", "--dry-run", ) assert r.returncode == 0 - assert "PREFILL INSTANCE" in r.stdout - assert "DECODE INSTANCE" in r.stdout - assert "--disaggregation-mode prefill" in r.stdout - assert "--disaggregation-mode decode" in r.stdout - - -class TestK8sSubmitDryRun: - """flowsim submit --scheduler k8s --dry-run — verify YAML generation.""" + assert "hostPath" in r.stdout + assert "/mnt/traces" in r.stdout - def test_k8s_dry_run(self): - r = _flowsim_submit( + def test_k8s_refuses_without_storage(self): + """Submit (not dry-run) without PVC or hostPath should fail.""" + r = _flowsim_cli( + "submit", "--scheduler", "k8s", "--collect", "perf", "--model-path", MODEL, "--k8s-namespace", "default", - "--dry-run", + # Explicitly clear any config defaults + "--k8s-pvc", "", + "--k8s-host-output-dir", "", ) - assert r.returncode == 0 - assert "apiVersion: batch/v1" in r.stdout - assert "kind: Job" in r.stdout - assert MODEL in r.stdout + assert r.returncode != 0 + combined = r.stdout + r.stderr + assert "persistent storage" in combined or "pvc" in combined.lower() - def 
test_k8s_pd_dry_run(self): - r = _flowsim_submit( + @pytest.mark.skipif( + not os.path.exists(os.path.expanduser("~/.kube/config")), + reason="No kubeconfig — skip K8s real submit (run on host with Kind cluster)", + ) + def test_k8s_real_submit_to_kind(self): + """Submit a real Job to Kind cluster, verify status + logs commands work.""" + job_name = f"test-integ-{int(time.time()) % 100000}" + r = _flowsim_cli( + "submit", "--scheduler", "k8s", "--collect", "perf", "--model-path", MODEL, "--k8s-namespace", "default", - "--pd", + "--k8s-host-output-dir", "/tmp/flowsim-test-traces", + "--job-name", job_name, + ) + combined = r.stdout + r.stderr + + if r.returncode != 0: + print("Submit output:", combined[-3000:]) + assert r.returncode == 0, f"K8s submit failed: {combined[-1000:]}" + assert "created" in combined.lower() + + # Verify submit output has location hints + assert "Traces:" in combined + assert "Logs:" in combined + assert "flowsim status" in combined + assert "flowsim logs" in combined + + # Check status + r2 = _flowsim_cli("status", "--scheduler", "k8s", "--job", job_name) + assert r2.returncode == 0 + assert job_name in r2.stdout + + # Check logs (may say "pending" or show pod info) + r3 = _flowsim_cli("logs", "--scheduler", "k8s", "--job", job_name) + assert r3.returncode == 0 + # Should mention kubectl or pod name or "No pods" + assert "kubectl" in r3.stdout or "No pods" in r3.stdout or "Pod:" in r3.stdout + + # Cleanup: delete the K8s job + subprocess.run( + ["kubectl", "--context", "kind-flowsim", "delete", "job", job_name, + "-n", "default", "--ignore-not-found"], + capture_output=True, timeout=30, + ) + + +# ===================================================================== +# SLURM SCHEDULER — dry-run only (no real cluster) +# ===================================================================== +class TestSlurmScheduler: + """Slurm scheduler: verify sbatch script has correct paths.""" + + def test_slurm_dry_run_output_and_log_paths(self): + 
"""Dry-run sbatch script should reference output_dir and log_dir.""" + r = _flowsim_cli( + "submit", + "--scheduler", "slurm", + "--collect", "perf", + "--model-path", MODEL, + "--slurm-partition", "gpu", + "--slurm-rest-url", "http://fake:6820", + "--slurm-jwt-token", "fake-token", + "--output-dir", "/shared/flowsim_traces", "--dry-run", ) assert r.returncode == 0 - assert "PREFILL INSTANCE" in r.stdout - assert "DECODE INSTANCE" in r.stdout + script = r.stdout + # sbatch directives + assert "#!/bin/bash" in script + assert "#SBATCH --job-name=" in script + assert "#SBATCH --partition=gpu" in script -class TestSlurmSubmitDryRun: - """flowsim submit --scheduler slurm --dry-run — verify sbatch script.""" + # output_dir in the profiling command + assert "--output-dir" in script + assert "/shared/flowsim_traces" in script - def test_slurm_dry_run(self): - r = _flowsim_submit( + # log_dir = output_dir + /logs/ + assert "--log-dir" in script + assert "/shared/flowsim_traces/logs" in script + + def test_slurm_dry_run_default_output_dir(self): + """Default output_dir for Slurm should be ~/flowsim_traces.""" + r = _flowsim_cli( + "submit", "--scheduler", "slurm", "--collect", "perf", "--model-path", MODEL, "--slurm-partition", "gpu", "--slurm-rest-url", "http://fake:6820", - "--slurm-jwt-token", "fake", + "--slurm-jwt-token", "fake-token", + "--dry-run", + ) + assert r.returncode == 0 + assert "flowsim_traces" in r.stdout + + def test_slurm_dry_run_pd_pair(self): + """PD disaggregation dry-run should produce both scripts with correct paths.""" + r = _flowsim_cli( + "submit", + "--scheduler", "slurm", + "--collect", "perf", + "--model-path", MODEL, + "--slurm-partition", "gpu", + "--slurm-rest-url", "http://fake:6820", + "--slurm-jwt-token", "fake-token", + "--output-dir", "/shared/traces", + "--pd", "--dry-run", ) assert r.returncode == 0 - assert "#!/bin/bash" in r.stdout - assert "#SBATCH --partition=gpu" in r.stdout - assert MODEL in r.stdout + output = r.stdout + 
assert "PREFILL INSTANCE" in output + assert "DECODE INSTANCE" in output + assert "--disaggregation-mode prefill" in output + assert "--disaggregation-mode decode" in output + # Both scripts should reference the same output_dir + assert output.count("--output-dir") >= 2 + assert output.count("/shared/traces/logs") >= 2 From 2c36af02795e87ecce17f3cd914846722d18cc1a Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 18:06:07 +0000 Subject: [PATCH 19/56] feat: align CLI with standard job platform APIs - Add JobResult dataclass: submit() now returns structured data (job_id, scheduler, state, output_dir, message) instead of string - Add flowsim cancel: K8s (delete_namespaced_job), Slurm (DELETE via slurmrestd), local (no-op for synchronous jobs) - Add flowsim list: list FlowSim jobs with --status filter K8s (label_selector=app=flowsim), Slurm (slurmrestd /jobs), local (scan log files) - Add --follow / -f to flowsim logs: shows tail -f / kubectl logs -f commands for real-time log streaming - submit_pd_pair() now returns list[JobResult] instead of string - Post-submit output shows cancel/list/follow commands --- schedulers/__init__.py | 3 +- schedulers/base.py | 43 ++++++++++++-- schedulers/k8s.py | 75 +++++++++++++++++++++++-- schedulers/local.py | 86 ++++++++++++++++++++++++---- schedulers/slurm.py | 82 +++++++++++++++++++++++---- scripts/cli.py | 22 ++++++++ scripts/status_profile.py | 115 +++++++++++++++++++++++++------------- scripts/submit_profile.py | 33 +++++++---- 8 files changed, 377 insertions(+), 82 deletions(-) diff --git a/schedulers/__init__.py b/schedulers/__init__.py index fd20eb2..7e0df35 100644 --- a/schedulers/__init__.py +++ b/schedulers/__init__.py @@ -1,12 +1,13 @@ """Scheduler backends for submitting FlowSim profiling jobs.""" -from schedulers.base import BaseScheduler, ProfileJobSpec +from schedulers.base import BaseScheduler, JobResult, ProfileJobSpec from schedulers.k8s import K8sScheduler from schedulers.local import 
LocalScheduler from schedulers.slurm import SlurmScheduler __all__ = [ "BaseScheduler", + "JobResult", "K8sScheduler", "LocalScheduler", "ProfileJobSpec", diff --git a/schedulers/base.py b/schedulers/base.py index 3cbc2e7..40d9cea 100644 --- a/schedulers/base.py +++ b/schedulers/base.py @@ -8,6 +8,17 @@ from typing import Optional +@dataclass +class JobResult: + """Structured return value from ``submit()``.""" + + job_id: str + scheduler: str # "local", "k8s", "slurm" + state: str # "Submitted", "Completed", "Failed" + output_dir: str = "" + message: str = "" + + @dataclass class ProfileJobSpec: """All parameters needed to run a stage-profiling job. @@ -153,8 +164,12 @@ def render(self, spec: ProfileJobSpec) -> str: """Render the job manifest / script as a string.""" @abc.abstractmethod - def submit(self, spec: ProfileJobSpec) -> str: - """Submit the job and return a job identifier string.""" + def submit(self, spec: ProfileJobSpec) -> JobResult: + """Submit the job and return a structured :class:`JobResult`.""" + + def cancel(self, job_id: str) -> str: + """Cancel a running or pending job. Returns a status message.""" + raise NotImplementedError(f"{type(self).__name__} does not support cancel") def status(self, job_id: str) -> dict: """Query job status. Returns dict with at least 'state' key. @@ -169,7 +184,7 @@ def status(self, job_id: str) -> dict: """ raise NotImplementedError(f"{type(self).__name__} does not support status queries") - def logs(self, job_id: str, *, tail: int = 100) -> str: + def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: """Retrieve recent log output for a job. Parameters @@ -178,9 +193,27 @@ def logs(self, job_id: str, *, tail: int = 100) -> str: Job name (K8s) or job ID (Slurm) or log prefix (local). tail : int Number of lines from the end to return. + follow : bool + If True, stream logs in real time (blocking). 
""" raise NotImplementedError(f"{type(self).__name__} does not support log retrieval") + def list_jobs(self, *, status_filter: str = "") -> list[dict]: + """List jobs managed by this scheduler. + + Parameters + ---------- + status_filter : str + If non-empty, only return jobs matching this state + (e.g., ``"Running"``, ``"Succeeded"``, ``"PENDING"``). + + Returns + ------- + list[dict] + Each dict has at least ``{"job_id": ..., "state": ..., "name": ...}``. + """ + raise NotImplementedError(f"{type(self).__name__} does not support list") + def dry_run(self, spec: ProfileJobSpec) -> str: """Render and return the manifest without submitting.""" return self.render(spec) @@ -191,8 +224,8 @@ def render_pd_pair(self, spec: ProfileJobSpec) -> str: decode = self.render(spec.as_decode()) return f"# === PREFILL INSTANCE ===\n{prefill}\n# === DECODE INSTANCE ===\n{decode}" - def submit_pd_pair(self, spec: ProfileJobSpec) -> str: + def submit_pd_pair(self, spec: ProfileJobSpec) -> list[JobResult]: """Submit both prefill and decode jobs.""" r1 = self.submit(spec.as_prefill()) r2 = self.submit(spec.as_decode()) - return f"[prefill] {r1}\n[decode] {r2}" + return [r1, r2] diff --git a/schedulers/k8s.py b/schedulers/k8s.py index 7d52319..44c2917 100644 --- a/schedulers/k8s.py +++ b/schedulers/k8s.py @@ -9,7 +9,7 @@ import json -from schedulers.base import BaseScheduler, ProfileJobSpec +from schedulers.base import BaseScheduler, JobResult, ProfileJobSpec # Optional: nicer YAML output for dry-run. 
try: @@ -137,7 +137,7 @@ def _build_job_dict(self, spec: ProfileJobSpec) -> dict: }, } - def submit(self, spec: ProfileJobSpec) -> str: + def submit(self, spec: ProfileJobSpec) -> JobResult: """Submit via the ``kubernetes`` Python client (``pip install kubernetes``).""" try: from kubernetes import client as k8s_client, config as k8s_config @@ -175,7 +175,13 @@ def submit(self, spec: ProfileJobSpec) -> str: namespace=self.namespace, body=body, ) - return f"job.batch/{resp.metadata.name} created (namespace={resp.metadata.namespace})" + return JobResult( + job_id=resp.metadata.name, + scheduler="k8s", + state="Submitted", + output_dir=spec.output_dir, + message=f"job.batch/{resp.metadata.name} created (namespace={resp.metadata.namespace})", + ) # ----------------------------------------------------------------- # Helpers shared by status / logs @@ -197,6 +203,18 @@ def _load_k8s(self): return k8s_client.BatchV1Api(), k8s_client.CoreV1Api() + def cancel(self, job_id: str) -> str: + """Delete a K8s Job (and its pods) by name.""" + from kubernetes import client as k8s_client + + batch_api, _ = self._load_k8s() + batch_api.delete_namespaced_job( + name=job_id, + namespace=self.namespace, + body=k8s_client.V1DeleteOptions(propagation_policy="Foreground"), + ) + return f"job.batch/{job_id} deleted (namespace={self.namespace})" + def status(self, job_id: str) -> dict: """Query K8s Job status by job name.""" try: @@ -249,7 +267,7 @@ def status(self, job_id: str) -> dict: "output_hint": output_hint, } - def logs(self, job_id: str, *, tail: int = 100) -> str: + def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: """Show where logs are and how to access them for a K8s Job.""" try: from kubernetes import client as k8s_client @@ -265,6 +283,19 @@ def logs(self, job_id: str, *, tail: int = 100) -> str: if not pods.items: return f"No pods found for job {job_id} in namespace {self.namespace}" + if follow: + # Stream logs from the first running/succeeded pod 
+ for pod in pods.items: + name = pod.metadata.name + if pod.status.phase in ("Running", "Succeeded"): + # Use kubectl follow since the Python client follow is blocking + return ( + f"Follow logs:\n" + f" kubectl logs -f {name} -n {self.namespace}" + ) + name = pods.items[0].metadata.name + return f"Follow logs:\n kubectl logs -f {name} -n {self.namespace}" + parts: list[str] = [] # Pod info @@ -307,3 +338,39 @@ def logs(self, job_id: str, *, tail: int = 100) -> str: break return "\n".join(parts) + + def list_jobs(self, *, status_filter: str = "") -> list[dict]: + """List FlowSim Jobs in the namespace (label: app=flowsim).""" + batch_api, _ = self._load_k8s() + + jobs = batch_api.list_namespaced_job( + namespace=self.namespace, + label_selector="app=flowsim", + ) + result: list[dict] = [] + for job in jobs.items: + st = job.status + if st.succeeded and st.succeeded > 0: + state = "Succeeded" + elif st.failed and st.failed > 0: + state = "Failed" + elif st.active and st.active > 0: + state = "Running" + else: + state = "Pending" + + if status_filter and state.lower() != status_filter.lower(): + continue + + created = "" + if job.metadata.creation_timestamp: + created = job.metadata.creation_timestamp.strftime("%Y-%m-%d %H:%M:%S") + + result.append({ + "job_id": job.metadata.name, + "name": job.metadata.name, + "state": state, + "namespace": self.namespace, + "created": created, + }) + return result diff --git a/schedulers/local.py b/schedulers/local.py index 67704c5..f23a743 100644 --- a/schedulers/local.py +++ b/schedulers/local.py @@ -11,7 +11,7 @@ import sys import time -from schedulers.base import BaseScheduler, ProfileJobSpec +from schedulers.base import BaseScheduler, JobResult, ProfileJobSpec class LocalScheduler(BaseScheduler): @@ -52,7 +52,7 @@ def render(self, spec: ProfileJobSpec) -> str: lines.append(spec.build_shell_command()) return "\n".join(lines) - def submit(self, spec: ProfileJobSpec) -> str: + def submit(self, spec: ProfileJobSpec) -> JobResult: 
"""Run the profiling command locally as a subprocess. stdout and stderr are streamed to the terminal *and* saved to @@ -115,17 +115,33 @@ def _tee(src, dest_file, dest_stream): t_err.join() if proc.returncode != 0: - return ( - f"[local] {job_name} FAILED (exit code {proc.returncode})\n" - f"[local] stdout log: {stdout_path}\n" - f"[local] stderr log: {stderr_path}" + return JobResult( + job_id=job_name, + scheduler="local", + state="Failed", + output_dir=spec.output_dir, + message=( + f"{job_name} FAILED (exit code {proc.returncode})\n" + f"stdout log: {stdout_path}\n" + f"stderr log: {stderr_path}" + ), ) - return ( - f"[local] {job_name} completed successfully\n" - f"[local] stdout log: {stdout_path}\n" - f"[local] stderr log: {stderr_path}" + return JobResult( + job_id=job_name, + scheduler="local", + state="Completed", + output_dir=spec.output_dir, + message=( + f"{job_name} completed successfully\n" + f"stdout log: {stdout_path}\n" + f"stderr log: {stderr_path}" + ), ) + def cancel(self, job_id: str) -> str: + """Local jobs run synchronously, so cancel is not applicable.""" + return f"Local jobs run synchronously and cannot be cancelled. Job: {job_id}" + def status(self, job_id: str) -> dict: """Check local job status by looking for log files. 
@@ -158,7 +174,7 @@ def status(self, job_id: str) -> dict: "output_hint": trace_dir, } - def logs(self, job_id: str, *, tail: int = 100) -> str: + def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: """List log files for a local job and print access commands.""" import glob @@ -174,6 +190,12 @@ def logs(self, job_id: str, *, tail: int = 100) -> str: if not matches: return f"No logs found in {log_dir} matching '{job_id}'" + if follow: + stdout_files = sorted(f for f in matches if f.endswith(".stdout.log")) + if stdout_files: + return f"Follow logs with:\n tail -f {stdout_files[-1]}" + return f"No stdout log found to follow for '{job_id}'" + parts = [f"Log directory: {log_dir}", ""] parts.append(f"Files ({len(matches)}):") for p in matches: @@ -189,6 +211,10 @@ def logs(self, job_id: str, *, tail: int = 100) -> str: parts.append(f" less {stdout_files[-1]}") if stderr_files: parts.append(f" less {stderr_files[-1]}") + if stdout_files: + parts.append("") + parts.append("Follow logs:") + parts.append(f" tail -f {stdout_files[-1]}") trace_dir = os.path.join(self.workdir, "stage_traces") parts.append("") @@ -196,3 +222,41 @@ def logs(self, job_id: str, *, tail: int = 100) -> str: parts.append(f" ls {trace_dir}") return "\n".join(parts) + + def list_jobs(self, *, status_filter: str = "") -> list[dict]: + """List local jobs by scanning log files.""" + import glob + import re + + log_dir = os.path.join(self.workdir, "stage_traces", "logs") + pattern = os.path.join(log_dir, "*.stdout.log") + matches = sorted(glob.glob(pattern)) + + jobs: list[dict] = [] + for path in matches: + basename = os.path.basename(path) + # Parse: {job_name}_{timestamp}.stdout.log + m = re.match(r"^(.+)_(\d+)\.stdout\.log$", basename) + if not m: + continue + name = m.group(1) + ts = m.group(2) + stderr = path.replace(".stdout.log", ".stderr.log") + stderr_size = os.path.getsize(stderr) if os.path.exists(stderr) else 0 + # If stderr has content, might have failed; otherwise 
completed + state = "Completed" + if stderr_size > 0: + # Check if there's an error indicator in stderr + state = "Completed" # local jobs are synchronous; if log exists, it finished + jobs.append({ + "job_id": name, + "name": name, + "state": state, + "timestamp": ts, + }) + + if status_filter: + filt = status_filter.lower() + jobs = [j for j in jobs if j["state"].lower() == filt] + + return jobs diff --git a/schedulers/slurm.py b/schedulers/slurm.py index 3f5d166..790ade4 100644 --- a/schedulers/slurm.py +++ b/schedulers/slurm.py @@ -12,7 +12,7 @@ import urllib.error import urllib.request -from schedulers.base import BaseScheduler, ProfileJobSpec +from schedulers.base import BaseScheduler, JobResult, ProfileJobSpec _DEFAULT_API_VERSION = "v0.0.40" @@ -150,7 +150,7 @@ def render(self, spec: ProfileJobSpec) -> str: lines.append("") return "\n".join(lines) - def submit(self, spec: ProfileJobSpec) -> str: + def submit(self, spec: ProfileJobSpec) -> JobResult: """Submit the job via slurmrestd REST API. Requires ``rest_url`` and ``jwt_token`` to be set. 
@@ -222,11 +222,17 @@ def submit(self, spec: ProfileJobSpec) -> str: msgs = "; ".join(e.get("error", str(e)) for e in errors) raise RuntimeError(f"slurmrestd job submit failed: {msgs}") - job_id = body.get("job_id", "unknown") - return f"Submitted batch job {job_id}" + job_id = str(body.get("job_id", "unknown")) + return JobResult( + job_id=job_id, + scheduler="slurm", + state="Submitted", + output_dir=spec.output_dir, + message=f"Submitted batch job {job_id}", + ) - def _rest_get(self, path: str) -> dict: - """GET a slurmrestd endpoint and return parsed JSON.""" + def _rest_request(self, path: str, *, method: str = "GET") -> dict: + """Send a request to slurmrestd and return parsed JSON.""" if not self.rest_url: raise RuntimeError("--slurm-rest-url is required") if not self.jwt_token: @@ -236,7 +242,7 @@ def _rest_get(self, path: str) -> dict: headers = { "X-SLURM-USER-TOKEN": self.jwt_token, } - req = urllib.request.Request(url, headers=headers, method="GET") + req = urllib.request.Request(url, headers=headers, method=method) ctx: ssl.SSLContext | None = None if not self.verify_ssl: @@ -253,6 +259,22 @@ def _rest_get(self, path: str) -> dict: except urllib.error.URLError as exc: raise RuntimeError(f"Cannot reach slurmrestd at {self.rest_url}: {exc.reason}") from exc + def _rest_get(self, path: str) -> dict: + """GET a slurmrestd endpoint and return parsed JSON.""" + return self._rest_request(path, method="GET") + + def cancel(self, job_id: str) -> str: + """Cancel a Slurm job via slurmrestd DELETE.""" + body = self._rest_request( + f"/slurm/{self.api_version}/job/{job_id}", + method="DELETE", + ) + errors = body.get("errors") or [] + if errors: + msgs = "; ".join(e.get("error", str(e)) for e in errors) + raise RuntimeError(f"slurmrestd cancel failed: {msgs}") + return f"Cancelled Slurm job {job_id}" + def status(self, job_id: str) -> dict: """Query Slurm job status via slurmrestd.""" body = self._rest_get(f"/slurm/{self.api_version}/job/{job_id}") @@ -290,7 
+312,7 @@ def status(self, job_id: str) -> dict: "output_hint": output_file, } - def logs(self, job_id: str, *, tail: int = 100) -> str: + def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: """Show where Slurm job logs are and how to access them.""" info = self.status(job_id) output_file = info.get("output_hint", "") @@ -302,9 +324,16 @@ def logs(self, job_id: str, *, tail: int = 100) -> str: parts.append(f"Log file (on cluster shared filesystem):") parts.append(f" {output_file}") parts.append("") - parts.append("View on login node:") - parts.append(f" less {output_file}") - parts.append(f" tail -{tail} {output_file}") + if follow: + parts.append("Follow logs:") + parts.append(f" tail -f {output_file}") + else: + parts.append("View on login node:") + parts.append(f" less {output_file}") + parts.append(f" tail -{tail} {output_file}") + parts.append("") + parts.append("Follow logs:") + parts.append(f" tail -f {output_file}") parts.append("") parts.append("Copy to local machine:") parts.append(f" scp :{output_file} .") @@ -319,6 +348,37 @@ def logs(self, job_id: str, *, tail: int = 100) -> str: return "\n".join(parts) + def list_jobs(self, *, status_filter: str = "") -> list[dict]: + """List Slurm jobs via slurmrestd /jobs endpoint.""" + body = self._rest_get(f"/slurm/{self.api_version}/jobs") + errors = body.get("errors") or [] + if errors: + msgs = "; ".join(e.get("error", str(e)) for e in errors) + raise RuntimeError(f"slurmrestd error: {msgs}") + + result: list[dict] = [] + for job in body.get("jobs", []): + name = job.get("name", "") + # Only show flowsim jobs (name starts with "flowsim-") + if not name.startswith("flowsim-"): + continue + + state = job.get("job_state", ["UNKNOWN"]) + if isinstance(state, list): + state = state[0] if state else "UNKNOWN" + + if status_filter and state.upper() != status_filter.upper(): + continue + + result.append({ + "job_id": str(job.get("job_id", "")), + "name": name, + "state": state, + "partition": 
job.get("partition", ""), + "nodes": job.get("nodes", ""), + }) + return result + def _parse_time_minutes(self) -> int: """Convert HH:MM:SS time_limit to total minutes.""" parts = self.time_limit.split(":") diff --git a/scripts/cli.py b/scripts/cli.py index c17796d..b5d2bc3 100644 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -162,6 +162,16 @@ def main(argv: list[str] | None = None) -> int: help="Retrieve job logs (local/k8s/slurm)", add_help=False, ) + sub.add_parser( + "list", + help="List FlowSim jobs (local/k8s/slurm)", + add_help=False, + ) + sub.add_parser( + "cancel", + help="Cancel a running job (k8s/slurm)", + add_help=False, + ) args, remaining = parser.parse_known_args(argv) @@ -186,6 +196,18 @@ def main(argv: list[str] | None = None) -> int: main_logs(remaining) return 0 + if args.command == "list": + from scripts.status_profile import main_list + + main_list(remaining) + return 0 + + if args.command == "cancel": + from scripts.status_profile import main_cancel + + main_cancel(remaining) + return 0 + parser.print_help() return 1 diff --git a/scripts/status_profile.py b/scripts/status_profile.py index bfcce41..15244a4 100644 --- a/scripts/status_profile.py +++ b/scripts/status_profile.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Query FlowSim profiling job status and logs. +"""Query FlowSim profiling job status, logs, list, and cancel. 
Usage examples -------------- @@ -12,13 +12,18 @@ flowsim logs --scheduler k8s --job flowsim-perf-qwen3-8b-bs1-il2048 -Check Slurm job status:: +Follow K8s job logs:: - flowsim status --scheduler slurm --job 12345 + flowsim logs --scheduler k8s --job flowsim-perf-qwen3-8b-bs1-il2048 --follow -Check local job status (by job name prefix):: +List all FlowSim jobs:: - flowsim status --scheduler local --job flowsim-perf-qwen3-8b-bs1-il2048 + flowsim list --scheduler k8s + flowsim list --scheduler k8s --status Running + +Cancel a job:: + + flowsim cancel --scheduler k8s --job flowsim-perf-qwen3-8b-bs1-il2048 """ from __future__ import annotations @@ -37,31 +42,16 @@ def _d(env_var: str, cfg: dict, key: str, fallback: str = "") -> str: return os.environ.get(env_var, "") or cfg_get(cfg, key, fallback) -def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: +def _add_scheduler_args(p: argparse.ArgumentParser) -> None: + """Add common scheduler connection args to a parser.""" k8s_cfg = load_k8s_config() slurm_cfg = load_slurm_config() - p = argparse.ArgumentParser( - description="Query FlowSim profiling job status or logs.", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - p.add_argument( "--scheduler", choices=["local", "k8s", "slurm"], required=True, ) - p.add_argument( - "--job", - required=True, - help="Job name (k8s/local) or job ID (slurm)", - ) - p.add_argument( - "--tail", - type=int, - default=100, - help="Number of log lines to show (default: 100)", - ) # -- Local options -- p.add_argument("--local-workdir", default="") @@ -98,7 +88,14 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: action="store_true", ) - return p.parse_args(argv) + +def _resolve_slurm_jwt(args: argparse.Namespace) -> None: + """Resolve Slurm JWT from config if not provided.""" + if args.scheduler == "slurm" and not args.slurm_jwt_token: + slurm_cfg = load_slurm_config() + token = resolve_jwt_token(slurm_cfg) + if token: + args.slurm_jwt_token 
= token def _build_scheduler(args: argparse.Namespace): @@ -120,15 +117,12 @@ def _build_scheduler(args: argparse.Namespace): def main_status(argv: list[str] | None = None) -> None: - args = _parse_args(argv) - - # Resolve Slurm JWT if needed - if args.scheduler == "slurm" and not args.slurm_jwt_token: - slurm_cfg = load_slurm_config() - token = resolve_jwt_token(slurm_cfg) - if token: - args.slurm_jwt_token = token + p = argparse.ArgumentParser(description="Query FlowSim job status.") + _add_scheduler_args(p) + p.add_argument("--job", required=True, help="Job name or ID") + args = p.parse_args(argv) + _resolve_slurm_jwt(args) scheduler = _build_scheduler(args) try: info = scheduler.status(args.job) @@ -139,19 +133,60 @@ def main_status(argv: list[str] | None = None) -> None: def main_logs(argv: list[str] | None = None) -> None: - args = _parse_args(argv) + p = argparse.ArgumentParser(description="Retrieve FlowSim job logs.") + _add_scheduler_args(p) + p.add_argument("--job", required=True, help="Job name or ID") + p.add_argument("--tail", type=int, default=100, help="Number of log lines (default: 100)") + p.add_argument("--follow", "-f", action="store_true", help="Follow log output") + args = p.parse_args(argv) + + _resolve_slurm_jwt(args) + scheduler = _build_scheduler(args) + try: + text = scheduler.logs(args.job, tail=args.tail, follow=args.follow) + print(text) + except Exception as exc: + print(f"Error: {exc}", file=sys.stderr) + sys.exit(1) - # Resolve Slurm JWT if needed - if args.scheduler == "slurm" and not args.slurm_jwt_token: - slurm_cfg = load_slurm_config() - token = resolve_jwt_token(slurm_cfg) - if token: - args.slurm_jwt_token = token +def main_list(argv: list[str] | None = None) -> None: + p = argparse.ArgumentParser(description="List FlowSim jobs.") + _add_scheduler_args(p) + p.add_argument("--status", default="", help="Filter by job state (e.g. 
Running, Succeeded, PENDING)") + args = p.parse_args(argv) + + _resolve_slurm_jwt(args) scheduler = _build_scheduler(args) try: - text = scheduler.logs(args.job, tail=args.tail) - print(text) + jobs = scheduler.list_jobs(status_filter=args.status) + if not jobs: + print("No jobs found.") + return + # Print table header + headers = list(jobs[0].keys()) + widths = {h: max(len(h), max(len(str(j.get(h, ""))) for j in jobs)) for h in headers} + header_line = " ".join(h.upper().ljust(widths[h]) for h in headers) + print(header_line) + print("-" * len(header_line)) + for job in jobs: + print(" ".join(str(job.get(h, "")).ljust(widths[h]) for h in headers)) + except Exception as exc: + print(f"Error: {exc}", file=sys.stderr) + sys.exit(1) + + +def main_cancel(argv: list[str] | None = None) -> None: + p = argparse.ArgumentParser(description="Cancel a FlowSim job.") + _add_scheduler_args(p) + p.add_argument("--job", required=True, help="Job name or ID to cancel") + args = p.parse_args(argv) + + _resolve_slurm_jwt(args) + scheduler = _build_scheduler(args) + try: + msg = scheduler.cancel(args.job) + print(msg) except Exception as exc: print(f"Error: {exc}", file=sys.stderr) sys.exit(1) diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 25061f1..bcd9c23 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -384,26 +384,39 @@ def main(argv: list[str] | None = None) -> None: print(scheduler.dry_run(spec)) else: if is_pd: - result = scheduler.submit_pd_pair(spec) + results = scheduler.submit_pd_pair(spec) + for r in results: + print(r.message) + # Use the first result for follow-up hints + result = results[0] else: result = scheduler.submit(spec) - print(result) + print(result.message) + # Tell user where to find results print() - print(f"Traces: {spec.output_dir}") - print(f"Logs: {spec.log_dir}") - if args.scheduler == "k8s": + print(f"Traces: {result.output_dir}") + print(f"Logs: {result.output_dir}/logs/") + job_id = result.job_id + 
sched = args.scheduler + + if sched == "k8s": if args.k8s_pvc: print(f" (persisted on PVC '{args.k8s_pvc}')") else: print(f" (persisted at hostPath '{args.k8s_host_output_dir}' on the node)") - print(f"\nTo check status: flowsim status --scheduler k8s --job {spec.default_job_name()[:63]}") - print(f"To view logs: flowsim logs --scheduler k8s --job {spec.default_job_name()[:63]}") - elif args.scheduler == "slurm": + print(f"\nTo check status: flowsim status --scheduler k8s --job {job_id}") + print(f"To view logs: flowsim logs --scheduler k8s --job {job_id}") + print(f"To follow logs: flowsim logs --scheduler k8s --job {job_id} --follow") + print(f"To cancel: flowsim cancel --scheduler k8s --job {job_id}") + elif sched == "slurm": print(f" (on cluster shared filesystem)") - print(f"\nTo check status: flowsim status --scheduler slurm --job ") + print(f"\nTo check status: flowsim status --scheduler slurm --job {job_id}") + print(f"To view logs: flowsim logs --scheduler slurm --job {job_id}") + print(f"To cancel: flowsim cancel --scheduler slurm --job {job_id}") else: - print(f"\nTo view logs: flowsim logs --scheduler local --job {spec.default_job_name()}") + print(f"\nTo view logs: flowsim logs --scheduler local --job {job_id}") + print(f"To list all jobs: flowsim list --scheduler {sched}") _INIT_HINT = "Run 'flowsim init' to create config files." From 8cd62f8ba65b451221d4462da7b3dc743814e075 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 18:13:30 +0000 Subject: [PATCH 20/56] fix: CLI only shows scheduler-specific args based on --scheduler Two-pass argparse: peek --scheduler with a minimal pre-parser, then add only the relevant scheduler's options before full parse. 'flowsim submit --scheduler local --help' no longer shows k8s/slurm args. 
--- scripts/status_profile.py | 94 ++++---- scripts/submit_profile.py | 250 +++++++++++----------- tests/integration/test_scheduler_local.py | 188 +++++++++++++++- 3 files changed, 359 insertions(+), 173 deletions(-) diff --git a/scripts/status_profile.py b/scripts/status_profile.py index 15244a4..2f82ebc 100644 --- a/scripts/status_profile.py +++ b/scripts/status_profile.py @@ -43,50 +43,53 @@ def _d(env_var: str, cfg: dict, key: str, fallback: str = "") -> str: def _add_scheduler_args(p: argparse.ArgumentParser) -> None: - """Add common scheduler connection args to a parser.""" - k8s_cfg = load_k8s_config() - slurm_cfg = load_slurm_config() - + """Add common scheduler choice arg (first pass only).""" p.add_argument( "--scheduler", choices=["local", "k8s", "slurm"], required=True, ) - # -- Local options -- - p.add_argument("--local-workdir", default="") - # -- K8s options -- - p.add_argument( - "--k8s-namespace", - default=_d("FLOWSIM_K8S_NAMESPACE", k8s_cfg, "namespace", "default"), - ) - p.add_argument( - "--k8s-kubeconfig", - default=_d("KUBECONFIG", k8s_cfg, "kubeconfig", ""), - ) - p.add_argument( - "--k8s-context", - default=_d("FLOWSIM_K8S_CONTEXT", k8s_cfg, "context", ""), - ) +def _add_scheduler_specific_args(p: argparse.ArgumentParser, scheduler: str) -> None: + """Add only the args relevant to the chosen scheduler (second pass).""" + k8s_cfg = load_k8s_config() + slurm_cfg = load_slurm_config() - # -- Slurm options -- - p.add_argument( - "--slurm-rest-url", - default=_d("FLOWSIM_SLURM_REST_URL", slurm_cfg, "rest_url", ""), - ) - p.add_argument( - "--slurm-jwt-token", - default=_d("FLOWSIM_SLURM_JWT_TOKEN", slurm_cfg, "jwt_token", ""), - ) - p.add_argument( - "--slurm-api-version", - default=_d("FLOWSIM_SLURM_API_VERSION", slurm_cfg, "api_version", "v0.0.40"), - ) - p.add_argument( - "--slurm-no-verify-ssl", - action="store_true", - ) + if scheduler == "local": + p.add_argument("--local-workdir", default="") + + elif scheduler == "k8s": + p.add_argument( 
+ "--k8s-namespace", + default=_d("FLOWSIM_K8S_NAMESPACE", k8s_cfg, "namespace", "default"), + ) + p.add_argument( + "--k8s-kubeconfig", + default=_d("KUBECONFIG", k8s_cfg, "kubeconfig", ""), + ) + p.add_argument( + "--k8s-context", + default=_d("FLOWSIM_K8S_CONTEXT", k8s_cfg, "context", ""), + ) + + elif scheduler == "slurm": + p.add_argument( + "--slurm-rest-url", + default=_d("FLOWSIM_SLURM_REST_URL", slurm_cfg, "rest_url", ""), + ) + p.add_argument( + "--slurm-jwt-token", + default=_d("FLOWSIM_SLURM_JWT_TOKEN", slurm_cfg, "jwt_token", ""), + ) + p.add_argument( + "--slurm-api-version", + default=_d("FLOWSIM_SLURM_API_VERSION", slurm_cfg, "api_version", "v0.0.40"), + ) + p.add_argument( + "--slurm-no-verify-ssl", + action="store_true", + ) def _resolve_slurm_jwt(args: argparse.Namespace) -> None: @@ -100,7 +103,7 @@ def _resolve_slurm_jwt(args: argparse.Namespace) -> None: def _build_scheduler(args: argparse.Namespace): if args.scheduler == "local": - return LocalScheduler(workdir=args.local_workdir) + return LocalScheduler(workdir=getattr(args, "local_workdir", "")) elif args.scheduler == "k8s": return K8sScheduler( namespace=args.k8s_namespace, @@ -116,11 +119,20 @@ def _build_scheduler(args: argparse.Namespace): ) +def _parse_two_pass(p: argparse.ArgumentParser, argv: list[str] | None = None) -> argparse.Namespace: + """Two-pass parse: peek --scheduler, add scheduler-specific args, full parse.""" + _pre = argparse.ArgumentParser(add_help=False) + _pre.add_argument("--scheduler", choices=["local", "k8s", "slurm"]) + pre, _ = _pre.parse_known_args(argv) + _add_scheduler_specific_args(p, pre.scheduler) + return p.parse_args(argv) + + def main_status(argv: list[str] | None = None) -> None: p = argparse.ArgumentParser(description="Query FlowSim job status.") _add_scheduler_args(p) p.add_argument("--job", required=True, help="Job name or ID") - args = p.parse_args(argv) + args = _parse_two_pass(p, argv) _resolve_slurm_jwt(args) scheduler = _build_scheduler(args) @@ 
-138,7 +150,7 @@ def main_logs(argv: list[str] | None = None) -> None: p.add_argument("--job", required=True, help="Job name or ID") p.add_argument("--tail", type=int, default=100, help="Number of log lines (default: 100)") p.add_argument("--follow", "-f", action="store_true", help="Follow log output") - args = p.parse_args(argv) + args = _parse_two_pass(p, argv) _resolve_slurm_jwt(args) scheduler = _build_scheduler(args) @@ -154,7 +166,7 @@ def main_list(argv: list[str] | None = None) -> None: p = argparse.ArgumentParser(description="List FlowSim jobs.") _add_scheduler_args(p) p.add_argument("--status", default="", help="Filter by job state (e.g. Running, Succeeded, PENDING)") - args = p.parse_args(argv) + args = _parse_two_pass(p, argv) _resolve_slurm_jwt(args) scheduler = _build_scheduler(args) @@ -180,7 +192,7 @@ def main_cancel(argv: list[str] | None = None) -> None: p = argparse.ArgumentParser(description="Cancel a FlowSim job.") _add_scheduler_args(p) p.add_argument("--job", required=True, help="Job name or ID to cancel") - args = p.parse_args(argv) + args = _parse_two_pass(p, argv) _resolve_slurm_jwt(args) scheduler = _build_scheduler(args) diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index bcd9c23..18f7882 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -68,7 +68,7 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: slurm_cfg = load_slurm_config() p = argparse.ArgumentParser( - description="Submit FlowSim profiling jobs to K8s or Slurm.", + description="Submit FlowSim profiling jobs to local, K8s, or Slurm.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__, ) @@ -117,127 +117,6 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: infra.add_argument("--output-dir", default="") infra.add_argument("--job-name", default="") - # -- Local options -- - loc = p.add_argument_group("local options") - loc.add_argument( - "--local-gpus", - default="", - 
help="CUDA_VISIBLE_DEVICES for local execution (e.g. '0' or '0,1')", - ) - loc.add_argument( - "--local-workdir", - default="", - help="Working directory for local execution (default: FlowSim project root)", - ) - - # -- Kubernetes-specific -- - k8s = p.add_argument_group("kubernetes options (config: ~/.flowsim/k8s.yaml)") - k8s.add_argument( - "--k8s-namespace", - default=_d("FLOWSIM_K8S_NAMESPACE", k8s_cfg, "namespace", "default"), - help="K8s namespace (env: FLOWSIM_K8S_NAMESPACE)", - ) - k8s.add_argument( - "--k8s-kubeconfig", - default=_d("KUBECONFIG", k8s_cfg, "kubeconfig", ""), - help="Path to kubeconfig file (env: KUBECONFIG)", - ) - k8s.add_argument( - "--k8s-context", - default=_d("FLOWSIM_K8S_CONTEXT", k8s_cfg, "context", ""), - help="kubeconfig context (env: FLOWSIM_K8S_CONTEXT)", - ) - k8s.add_argument( - "--k8s-pvc", - default=cfg_get(k8s_cfg, "pvc", ""), - help="PVC name for output volume (omit for emptyDir)", - ) - k8s.add_argument( - "--k8s-host-output-dir", - default=cfg_get(k8s_cfg, "host_output_dir", ""), - help="hostPath for output (used when --k8s-pvc is empty)", - ) - k8s.add_argument( - "--k8s-node-selector", - action="append", - default=[], - metavar="KEY=VALUE", - help="Node selector labels (repeatable)", - ) - k8s.add_argument( - "--k8s-service-account", - default=cfg_get(k8s_cfg, "service_account", ""), - ) - k8s.add_argument( - "--k8s-shm-size", - default=cfg_get(k8s_cfg, "shm_size", "16Gi"), - ) - - # -- Slurm-specific -- - slurm = p.add_argument_group("slurm options (config: ~/.flowsim/slurm.yaml)") - slurm.add_argument( - "--slurm-partition", - default=_d("FLOWSIM_SLURM_PARTITION", slurm_cfg, "partition", ""), - help="Slurm partition (env: FLOWSIM_SLURM_PARTITION)", - ) - slurm.add_argument( - "--slurm-time", - default=_d("FLOWSIM_SLURM_TIME", slurm_cfg, "time", "02:00:00"), - help="Wall time limit (env: FLOWSIM_SLURM_TIME)", - ) - slurm.add_argument( - "--slurm-rest-url", - default=_d("FLOWSIM_SLURM_REST_URL", slurm_cfg, "rest_url", 
""), - help="slurmrestd base URL (env: FLOWSIM_SLURM_REST_URL)", - ) - slurm.add_argument( - "--slurm-jwt-token", - default=_d("FLOWSIM_SLURM_JWT_TOKEN", slurm_cfg, "jwt_token", ""), - help="JWT token for slurmrestd (env: FLOWSIM_SLURM_JWT_TOKEN)", - ) - slurm.add_argument( - "--slurm-api-version", - default=_d("FLOWSIM_SLURM_API_VERSION", slurm_cfg, "api_version", "v0.0.40"), - help="slurmrestd API version (env: FLOWSIM_SLURM_API_VERSION)", - ) - slurm.add_argument( - "--slurm-no-verify-ssl", - action="store_true", - help="Skip TLS certificate verification for slurmrestd", - ) - slurm.add_argument( - "--slurm-account", - default=cfg_get(slurm_cfg, "account", ""), - ) - slurm.add_argument( - "--slurm-constraint", - default=cfg_get(slurm_cfg, "constraint", ""), - ) - slurm.add_argument( - "--slurm-container-runtime", - choices=["docker", "enroot", "none"], - default=cfg_get(slurm_cfg, "container_runtime", "none"), - ) - slurm.add_argument( - "--slurm-container-mounts", - default=cfg_get(slurm_cfg, "container_mounts", ""), - ) - # Modules from config (list) + CLI (append) - cfg_modules = slurm_cfg.get("modules") if isinstance(slurm_cfg.get("modules"), list) else [] - slurm.add_argument( - "--slurm-module", - action="append", - default=[str(m) for m in cfg_modules], - help="Modules to load (repeatable, merged with config)", - ) - slurm.add_argument( - "--slurm-extra-sbatch", - action="append", - default=[], - metavar="DIRECTIVE", - help="Extra #SBATCH directives (repeatable, without prefix)", - ) - # -- Action -- p.add_argument( "--dry-run", @@ -276,6 +155,133 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: help="InfiniBand device for RDMA transfer", ) + # ---- Two-pass: peek at --scheduler, then add only relevant args ---- + # Use a minimal pre-parser to avoid required-arg errors during peek. 
+ _pre = argparse.ArgumentParser(add_help=False) + _pre.add_argument("--scheduler", choices=["local", "k8s", "slurm"]) + pre, _ = _pre.parse_known_args(argv) + + if pre.scheduler == "local": + loc = p.add_argument_group("local options") + loc.add_argument( + "--local-gpus", + default="", + help="CUDA_VISIBLE_DEVICES for local execution (e.g. '0' or '0,1')", + ) + loc.add_argument( + "--local-workdir", + default="", + help="Working directory for local execution (default: FlowSim project root)", + ) + + elif pre.scheduler == "k8s": + k8s = p.add_argument_group("kubernetes options (config: ~/.flowsim/k8s.yaml)") + k8s.add_argument( + "--k8s-namespace", + default=_d("FLOWSIM_K8S_NAMESPACE", k8s_cfg, "namespace", "default"), + help="K8s namespace (env: FLOWSIM_K8S_NAMESPACE)", + ) + k8s.add_argument( + "--k8s-kubeconfig", + default=_d("KUBECONFIG", k8s_cfg, "kubeconfig", ""), + help="Path to kubeconfig file (env: KUBECONFIG)", + ) + k8s.add_argument( + "--k8s-context", + default=_d("FLOWSIM_K8S_CONTEXT", k8s_cfg, "context", ""), + help="kubeconfig context (env: FLOWSIM_K8S_CONTEXT)", + ) + k8s.add_argument( + "--k8s-pvc", + default=cfg_get(k8s_cfg, "pvc", ""), + help="PVC name for output volume (omit for emptyDir)", + ) + k8s.add_argument( + "--k8s-host-output-dir", + default=cfg_get(k8s_cfg, "host_output_dir", ""), + help="hostPath for output (used when --k8s-pvc is empty)", + ) + k8s.add_argument( + "--k8s-node-selector", + action="append", + default=[], + metavar="KEY=VALUE", + help="Node selector labels (repeatable)", + ) + k8s.add_argument( + "--k8s-service-account", + default=cfg_get(k8s_cfg, "service_account", ""), + ) + k8s.add_argument( + "--k8s-shm-size", + default=cfg_get(k8s_cfg, "shm_size", "16Gi"), + ) + + elif pre.scheduler == "slurm": + slurm = p.add_argument_group("slurm options (config: ~/.flowsim/slurm.yaml)") + slurm.add_argument( + "--slurm-partition", + default=_d("FLOWSIM_SLURM_PARTITION", slurm_cfg, "partition", ""), + help="Slurm partition (env: 
FLOWSIM_SLURM_PARTITION)", + ) + slurm.add_argument( + "--slurm-time", + default=_d("FLOWSIM_SLURM_TIME", slurm_cfg, "time", "02:00:00"), + help="Wall time limit (env: FLOWSIM_SLURM_TIME)", + ) + slurm.add_argument( + "--slurm-rest-url", + default=_d("FLOWSIM_SLURM_REST_URL", slurm_cfg, "rest_url", ""), + help="slurmrestd base URL (env: FLOWSIM_SLURM_REST_URL)", + ) + slurm.add_argument( + "--slurm-jwt-token", + default=_d("FLOWSIM_SLURM_JWT_TOKEN", slurm_cfg, "jwt_token", ""), + help="JWT token for slurmrestd (env: FLOWSIM_SLURM_JWT_TOKEN)", + ) + slurm.add_argument( + "--slurm-api-version", + default=_d("FLOWSIM_SLURM_API_VERSION", slurm_cfg, "api_version", "v0.0.40"), + help="slurmrestd API version (env: FLOWSIM_SLURM_API_VERSION)", + ) + slurm.add_argument( + "--slurm-no-verify-ssl", + action="store_true", + help="Skip TLS certificate verification for slurmrestd", + ) + slurm.add_argument( + "--slurm-account", + default=cfg_get(slurm_cfg, "account", ""), + ) + slurm.add_argument( + "--slurm-constraint", + default=cfg_get(slurm_cfg, "constraint", ""), + ) + slurm.add_argument( + "--slurm-container-runtime", + choices=["docker", "enroot", "none"], + default=cfg_get(slurm_cfg, "container_runtime", "none"), + ) + slurm.add_argument( + "--slurm-container-mounts", + default=cfg_get(slurm_cfg, "container_mounts", ""), + ) + # Modules from config (list) + CLI (append) + cfg_modules = slurm_cfg.get("modules") if isinstance(slurm_cfg.get("modules"), list) else [] + slurm.add_argument( + "--slurm-module", + action="append", + default=[str(m) for m in cfg_modules], + help="Modules to load (repeatable, merged with config)", + ) + slurm.add_argument( + "--slurm-extra-sbatch", + action="append", + default=[], + metavar="DIRECTIVE", + help="Extra #SBATCH directives (repeatable, without prefix)", + ) + return p.parse_args(argv) diff --git a/tests/integration/test_scheduler_local.py b/tests/integration/test_scheduler_local.py index 879c00f..932ea9e 100644 --- 
a/tests/integration/test_scheduler_local.py +++ b/tests/integration/test_scheduler_local.py @@ -1,11 +1,11 @@ -"""Integration tests for ``flowsim submit``, ``flowsim status``, ``flowsim logs``. +"""Integration tests for the FlowSim scheduler CLI. Tests all three scheduler backends (local, k8s, slurm) end-to-end. * **local** — runs real TP=1 profiling and verifies traces, parsed CSVs, - and log files are all produced in the correct locations. -* **k8s** — submits a real Job to a Kind cluster, verifies it was created, - then checks ``flowsim status`` / ``flowsim logs`` output. Also validates + log files, JobResult return, cancel, list, logs --follow. +* **k8s** — submits a real Job to a Kind cluster, verifies JobResult, + status, logs, list, cancel, logs --follow. Also validates that dry-run YAML has the correct volume mounts and log paths. * **slurm** — dry-run only; verifies the sbatch script has the correct ``output_dir``, ``--log-dir``, and ``#SBATCH --output`` directives. @@ -35,6 +35,7 @@ """ import glob +import json import os import subprocess import sys @@ -42,6 +43,9 @@ import pytest +from schedulers.base import JobResult, ProfileJobSpec +from schedulers.local import LocalScheduler + _PROJECT_ROOT = os.path.abspath( os.path.join(os.path.dirname(__file__), "..", "..") ) @@ -182,6 +186,161 @@ def test_local_logs(self): # Should contain file listing or "No logs" — not crash assert "Log directory:" in output or "No logs" in output + def test_local_logs_follow(self): + """flowsim logs --follow should show tail -f command.""" + r = _flowsim_cli( + "logs", + "--scheduler", "local", + "--job", "flowsim-perf", + "--follow", + ) + assert r.returncode == 0 + output = r.stdout + assert "tail -f" in output or "No logs" in output + + def test_local_cancel(self): + """flowsim cancel --scheduler local should return a message (sync jobs can't be cancelled).""" + r = _flowsim_cli( + "cancel", + "--scheduler", "local", + "--job", "flowsim-perf", + ) + assert r.returncode == 0 
+ assert "cannot be cancelled" in r.stdout.lower() or "synchronous" in r.stdout.lower() + + def test_local_list(self): + """flowsim list --scheduler local should list jobs from log files.""" + r = _flowsim_cli( + "list", + "--scheduler", "local", + ) + assert r.returncode == 0 + output = r.stdout + # Should either show jobs or "No jobs found" + assert "JOB_ID" in output or "No jobs found" in output + + def test_local_list_status_filter(self): + """flowsim list --status Completed should filter.""" + r = _flowsim_cli( + "list", + "--scheduler", "local", + "--status", "Completed", + ) + assert r.returncode == 0 + + +# ===================================================================== +# LOCAL SCHEDULER — unit-level tests for JobResult and list_jobs +# ===================================================================== +class TestLocalSchedulerAPI: + """Test LocalScheduler API directly (no subprocess, no GPU).""" + + def test_submit_returns_job_result(self): + """LocalScheduler.submit() must return a JobResult, not a string.""" + import tempfile + with tempfile.TemporaryDirectory() as tmpdir: + sched = LocalScheduler(workdir=tmpdir) + spec = ProfileJobSpec( + collect="perf", + model_path="Qwen/Qwen3-8B", + output_dir=os.path.join(tmpdir, "traces"), + ) + # Monkey-patch: make build_shell_command return a trivial command + spec.build_shell_command = lambda: "echo hello" + result = sched.submit(spec) + assert isinstance(result, JobResult), f"Expected JobResult, got {type(result)}" + assert result.scheduler == "local" + assert result.state == "Completed" + assert result.job_id != "" + assert result.output_dir == spec.output_dir + + def test_submit_failed_returns_failed_state(self): + """A failing command should return JobResult with state=Failed.""" + import tempfile + with tempfile.TemporaryDirectory() as tmpdir: + sched = LocalScheduler(workdir=tmpdir) + spec = ProfileJobSpec( + collect="perf", + model_path="Qwen/Qwen3-8B", + output_dir=os.path.join(tmpdir, 
"traces"), + ) + spec.build_shell_command = lambda: "exit 1" + result = sched.submit(spec) + assert isinstance(result, JobResult) + assert result.state == "Failed" + + def test_list_jobs_finds_log_files(self): + """list_jobs() should find jobs from log file names.""" + import tempfile + with tempfile.TemporaryDirectory() as tmpdir: + log_dir = os.path.join(tmpdir, "stage_traces", "logs") + os.makedirs(log_dir) + # Create fake log files + for name in [ + "flowsim-perf-qwen3-8b-bs1-il512_1700000001.stdout.log", + "flowsim-perf-qwen3-8b-bs1-il512_1700000001.stderr.log", + "flowsim-perf-qwen3-8b-bs1-il1024_1700000002.stdout.log", + "flowsim-perf-qwen3-8b-bs1-il1024_1700000002.stderr.log", + ]: + open(os.path.join(log_dir, name), "w").close() + + sched = LocalScheduler(workdir=tmpdir) + jobs = sched.list_jobs() + assert len(jobs) == 2 + assert all("job_id" in j and "state" in j for j in jobs) + + def test_list_jobs_status_filter(self): + """list_jobs(status_filter=...) should filter results.""" + import tempfile + with tempfile.TemporaryDirectory() as tmpdir: + log_dir = os.path.join(tmpdir, "stage_traces", "logs") + os.makedirs(log_dir) + open(os.path.join(log_dir, "flowsim-perf-x_100.stdout.log"), "w").close() + open(os.path.join(log_dir, "flowsim-perf-x_100.stderr.log"), "w").close() + + sched = LocalScheduler(workdir=tmpdir) + assert len(sched.list_jobs(status_filter="Completed")) == 1 + assert len(sched.list_jobs(status_filter="Running")) == 0 + + def test_logs_follow_shows_tail_f(self): + """logs(follow=True) should return a tail -f command.""" + import tempfile + with tempfile.TemporaryDirectory() as tmpdir: + log_dir = os.path.join(tmpdir, "stage_traces", "logs") + os.makedirs(log_dir) + open(os.path.join(log_dir, "flowsim-perf-x_100.stdout.log"), "w").close() + + sched = LocalScheduler(workdir=tmpdir) + text = sched.logs("flowsim-perf-x", follow=True) + assert "tail -f" in text + + def test_cancel_returns_message(self): + """cancel() should return a message 
about sync jobs.""" + sched = LocalScheduler() + msg = sched.cancel("some-job") + assert "synchronous" in msg.lower() or "cannot" in msg.lower() + + def test_submit_pd_pair_returns_list(self): + """submit_pd_pair() must return list[JobResult].""" + import tempfile + with tempfile.TemporaryDirectory() as tmpdir: + sched = LocalScheduler(workdir=tmpdir) + spec = ProfileJobSpec( + collect="perf", + model_path="Qwen/Qwen3-8B", + output_dir=os.path.join(tmpdir, "traces"), + ) + # Monkey-patch to avoid real profiling + spec.build_shell_command = lambda: "echo hello" + results = sched.submit_pd_pair(spec) + assert isinstance(results, list) + assert len(results) == 2 + assert all(isinstance(r, JobResult) for r in results) + # One should be prefill, one decode + modes = {r.job_id for r in results} + assert any("prefill" in m for m in modes) + assert any("decode" in m for m in modes) + # ===================================================================== # K8S SCHEDULER @@ -289,12 +448,21 @@ def test_k8s_real_submit_to_kind(self): # Should mention kubectl or pod name or "No pods" assert "kubectl" in r3.stdout or "No pods" in r3.stdout or "Pod:" in r3.stdout - # Cleanup: delete the K8s job - subprocess.run( - ["kubectl", "--context", "kind-flowsim", "delete", "job", job_name, - "-n", "default", "--ignore-not-found"], - capture_output=True, timeout=30, - ) + # Check logs --follow + r3f = _flowsim_cli("logs", "--scheduler", "k8s", "--job", job_name, "--follow") + assert r3f.returncode == 0 + assert "kubectl logs -f" in r3f.stdout + + # Check list + r4 = _flowsim_cli("list", "--scheduler", "k8s") + assert r4.returncode == 0 + # Our job should appear in the listing + assert job_name in r4.stdout or "JOB_ID" in r4.stdout + + # Cancel via flowsim cancel + r5 = _flowsim_cli("cancel", "--scheduler", "k8s", "--job", job_name) + assert r5.returncode == 0 + assert "deleted" in r5.stdout.lower() # ===================================================================== From 
84c895353b43ef050542fcafd4c7b4dce3e67c90 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 18:23:13 +0000 Subject: [PATCH 21/56] fix: use python3 instead of python in profile command Most systems (Ubuntu, Debian) don't have 'python' symlink by default. --- schedulers/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schedulers/base.py b/schedulers/base.py index 40d9cea..3a35682 100644 --- a/schedulers/base.py +++ b/schedulers/base.py @@ -88,7 +88,7 @@ def log_dir(self) -> str: def build_profile_command(self) -> list[str]: """Build the full ``python scripts/run_stage_profile.py ...`` command.""" cmd = [ - "python", + "python3", "scripts/run_stage_profile.py", "--collect", self.collect, From fab6314fed5253f73f49f20be972f2355f7d6600 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Tue, 17 Mar 2026 18:42:43 +0000 Subject: [PATCH 22/56] fix: use YYYYMMDD_HHMMSS timestamp in log filenames Before: flowsim-perf-qwen3-8b_1773771736.stdout.log After: flowsim-perf-qwen3-8b_20260317_184236.stdout.log list_jobs() regex updated to support both old epoch and new formats. --- schedulers/local.py | 123 ++++++++++++++-------- tests/integration/test_scheduler_local.py | 13 ++- tests/unit/test_scheduler_cli.py | 11 +- 3 files changed, 98 insertions(+), 49 deletions(-) diff --git a/schedulers/local.py b/schedulers/local.py index f23a743..4ec94c9 100644 --- a/schedulers/local.py +++ b/schedulers/local.py @@ -1,7 +1,8 @@ -"""Local scheduler — run profiling directly on this machine. +"""Local scheduler — run profiling via Docker on the local machine. -``render()`` returns the shell command string. +``render()`` returns the ``docker run`` command string. ``submit()`` executes it as a subprocess, with stdout/stderr tee'd to log files. +The profiling runs inside the FlowSim Docker image with GPU access. 
""" from __future__ import annotations @@ -14,17 +15,23 @@ from schedulers.base import BaseScheduler, JobResult, ProfileJobSpec +def _shell_quote(s: str) -> str: + """Quote a string for safe embedding in a bash -c '...' invocation.""" + import shlex + return shlex.quote(s) + + class LocalScheduler(BaseScheduler): - """Run profiling jobs locally via subprocess. + """Run profiling jobs locally inside a Docker container. Parameters ---------- gpus : str - ``CUDA_VISIBLE_DEVICES`` value (e.g., ``"0"`` or ``"0,1"``). - Empty string means use all visible GPUs. + GPU device IDs for Docker ``--gpus`` (e.g., ``"0"`` or ``"0,1"``). + Empty string means all GPUs. workdir : str - Working directory for the subprocess. - Defaults to the FlowSim project root. + Host directory to use as the FlowSim project root for log scanning. + Defaults to the FlowSim project root on the host. """ def __init__( @@ -43,56 +50,83 @@ def _find_project_root() -> str: # schedulers/ is one level below project root return os.path.dirname(d) + def _docker_gpu_flag(self) -> str: + """Build the ``--gpus`` flag for ``docker run``.""" + if not self.gpus: + return "--gpus all" + return f"--gpus '\"device={self.gpus}\"'" + + def _build_docker_cmd(self, spec: ProfileJobSpec) -> str: + """Build the full ``docker run`` command.""" + job_name = spec.default_job_name()[:63] + # Container always works with /flowsim/stage_traces internally. + container_output = "/flowsim/stage_traces" + container_log_dir = container_output + "/logs" + host_output = os.path.abspath(spec.output_dir) + host_log_dir = host_output + "/logs" + + # Build the inner command, then replace host paths with container paths. 
+ inner_cmd = spec.build_shell_command() + inner_cmd = inner_cmd.replace(host_log_dir, container_log_dir) + inner_cmd = inner_cmd.replace(host_output, container_output) + + parts = [ + "docker run --rm", + f"--name {job_name}", + self._docker_gpu_flag(), + "--ipc=host --shm-size=16g", + "--network=host", + f"-e SGLANG_PROFILE_KERNELS=1", + f"-v {host_output}:{container_output}", + f"-w /flowsim", + spec.image, + f"bash -c {_shell_quote(inner_cmd)}", + ] + return " \\\n ".join(parts) + def render(self, spec: ProfileJobSpec) -> str: - lines = [] - if self.gpus: - lines.append(f"export CUDA_VISIBLE_DEVICES={self.gpus}") - lines.append("export SGLANG_PROFILE_KERNELS=1") - lines.append(f"cd {self.workdir}") - lines.append(spec.build_shell_command()) - return "\n".join(lines) + return self._build_docker_cmd(spec) def submit(self, spec: ProfileJobSpec) -> JobResult: - """Run the profiling command locally as a subprocess. + """Launch a Docker container for profiling. stdout and stderr are streamed to the terminal *and* saved to - log files under ``spec.log_dir``. + log files under ``spec.output_dir/logs/`` on the host. """ - cmd = spec.build_shell_command() - - env = os.environ.copy() - env["SGLANG_PROFILE_KERNELS"] = "1" - if self.gpus: - env["CUDA_VISIBLE_DEVICES"] = self.gpus + # Ensure host output dir exists before mounting + host_output = os.path.abspath(spec.output_dir) + log_dir = os.path.join(host_output, "logs") + os.makedirs(log_dir, exist_ok=True) + docker_cmd = self._build_docker_cmd(spec) job_name = spec.default_job_name() - log_dir = spec.log_dir - os.makedirs(log_dir, exist_ok=True) - ts = int(time.time()) + ts = time.strftime("%Y%m%d_%H%M%S") + + # Remove stale container with the same name (e.g. 
from a killed run) + subprocess.run( + ["docker", "rm", "-f", job_name[:63]], + capture_output=True, timeout=10, + ) stdout_path = os.path.join(log_dir, f"{job_name}_{ts}.stdout.log") stderr_path = os.path.join(log_dir, f"{job_name}_{ts}.stderr.log") - print(f"[local] Running {job_name}...") - print(f"[local] cmd: {cmd}") - print(f"[local] workdir: {self.workdir}") - if self.gpus: - print(f"[local] CUDA_VISIBLE_DEVICES={self.gpus}") + print(f"[local] Running {job_name} in Docker...") + print(f"[local] image: {spec.image}") + print(f"[local] gpus: {self.gpus or 'all'}") + print(f"[local] host output: {host_output}") print(f"[local] logs: {stdout_path}") print(f"[local] {stderr_path}") + print(f"[local] cmd:\n {docker_cmd}") print() with open(stdout_path, "w") as fout, open(stderr_path, "w") as ferr: proc = subprocess.Popen( - cmd, + docker_cmd, shell=True, cwd=self.workdir, - env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) - # Stream stdout/stderr to terminal + log files in real time. - # Use threads to avoid blocking on one stream while the other - # fills its OS pipe buffer. import threading def _tee(src, dest_file, dest_stream): @@ -119,7 +153,7 @@ def _tee(src, dest_file, dest_stream): job_id=job_name, scheduler="local", state="Failed", - output_dir=spec.output_dir, + output_dir=host_output, message=( f"{job_name} FAILED (exit code {proc.returncode})\n" f"stdout log: {stdout_path}\n" @@ -130,7 +164,7 @@ def _tee(src, dest_file, dest_stream): job_id=job_name, scheduler="local", state="Completed", - output_dir=spec.output_dir, + output_dir=host_output, message=( f"{job_name} completed successfully\n" f"stdout log: {stdout_path}\n" @@ -139,8 +173,14 @@ def _tee(src, dest_file, dest_stream): ) def cancel(self, job_id: str) -> str: - """Local jobs run synchronously, so cancel is not applicable.""" - return f"Local jobs run synchronously and cannot be cancelled. 
Job: {job_id}" + """Stop the Docker container for a local job.""" + proc = subprocess.run( + ["docker", "stop", job_id], + capture_output=True, text=True, timeout=30, + ) + if proc.returncode == 0: + return f"Stopped container {job_id}" + return f"Could not stop container {job_id}: {proc.stderr.strip()}" def status(self, job_id: str) -> dict: """Check local job status by looking for log files. @@ -235,8 +275,9 @@ def list_jobs(self, *, status_filter: str = "") -> list[dict]: jobs: list[dict] = [] for path in matches: basename = os.path.basename(path) - # Parse: {job_name}_{timestamp}.stdout.log - m = re.match(r"^(.+)_(\d+)\.stdout\.log$", basename) + # Parse: {job_name}_{YYYYMMDD_HHMMSS}.stdout.log + # Also support old epoch format {job_name}_{digits}.stdout.log + m = re.match(r"^(.+)_(\d{8}_\d{6}|\d{10,})\.stdout\.log$", basename) if not m: continue name = m.group(1) diff --git a/tests/integration/test_scheduler_local.py b/tests/integration/test_scheduler_local.py index 932ea9e..56cbdde 100644 --- a/tests/integration/test_scheduler_local.py +++ b/tests/integration/test_scheduler_local.py @@ -128,6 +128,10 @@ def _assert_logs(output_dir: str) -> None: class TestLocalScheduler: """Run real profiling via ``flowsim submit --scheduler local``.""" + @pytest.mark.skipif( + not os.path.isdir("/flowsim"), + reason="Local profiling tests must run inside the FlowSim Docker container", + ) def test_local_perf_tp1(self): """TP=1 perf profiling: traces + parsed CSVs + log files.""" output_dir = os.path.join(ARTIFACT_DIR, "sched_local_tp1") @@ -199,14 +203,15 @@ def test_local_logs_follow(self): assert "tail -f" in output or "No logs" in output def test_local_cancel(self): - """flowsim cancel --scheduler local should return a message (sync jobs can't be cancelled).""" + """flowsim cancel --scheduler local should attempt docker stop.""" r = _flowsim_cli( "cancel", "--scheduler", "local", "--job", "flowsim-perf", ) assert r.returncode == 0 - assert "cannot be cancelled" in 
r.stdout.lower() or "synchronous" in r.stdout.lower() + out = r.stdout.lower() + assert "stop" in out or "container" in out def test_local_list(self): """flowsim list --scheduler local should list jobs from log files.""" @@ -315,10 +320,10 @@ def test_logs_follow_shows_tail_f(self): assert "tail -f" in text def test_cancel_returns_message(self): - """cancel() should return a message about sync jobs.""" + """cancel() should attempt docker stop and return a message.""" sched = LocalScheduler() msg = sched.cancel("some-job") - assert "synchronous" in msg.lower() or "cannot" in msg.lower() + assert "stop" in msg.lower() or "container" in msg.lower() def test_submit_pd_pair_returns_list(self): """submit_pd_pair() must return list[JobResult].""" diff --git a/tests/unit/test_scheduler_cli.py b/tests/unit/test_scheduler_cli.py index 055e117..1d50f64 100644 --- a/tests/unit/test_scheduler_cli.py +++ b/tests/unit/test_scheduler_cli.py @@ -74,7 +74,7 @@ def test_build_server_opts_extra(self, spec: ProfileJobSpec): def test_build_profile_command(self, spec: ProfileJobSpec): cmd = spec.build_profile_command() - assert cmd[0] == "python" + assert cmd[0] == "python3" assert "scripts/run_stage_profile.py" in cmd[1] assert "--collect" in cmd assert "perf" in cmd @@ -291,7 +291,8 @@ def spec(self) -> ProfileJobSpec: def test_render_with_gpus(self, spec): sched = LocalScheduler(gpus="0,1") output = sched.render(spec) - assert "CUDA_VISIBLE_DEVICES=0,1" in output + assert "device=0,1" in output + assert "docker run" in output def test_render_without_gpus(self, spec): sched = LocalScheduler(gpus="") @@ -307,7 +308,9 @@ def test_render_has_command(self, spec): def test_render_workdir(self, spec): sched = LocalScheduler(workdir="/my/project") output = sched.render(spec) - assert "cd /my/project" in output + # Docker mode: workdir is used for log scanning, not in the docker command + assert "docker run" in output + assert "scripts/run_stage_profile.py" in output def 
test_dry_run_equals_render(self, spec): sched = LocalScheduler(gpus="0") @@ -486,7 +489,7 @@ def test_submit_local_dry_run_with_gpus(self): "--local-gpus", "0,1", "--dry-run", ) - assert "CUDA_VISIBLE_DEVICES=0,1" in out + assert "device=0,1" in out def test_submit_k8s_dry_run(self): out = self._run( From 8f7605207626d8f7f7d52346a5df9d3e525cd5cc Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Wed, 18 Mar 2026 20:42:50 +0000 Subject: [PATCH 23/56] scheduler: add Slurm CLI mode (sbatch/squeue/scancel) + integration test - Add submit_via='cli' mode to SlurmScheduler, using sbatch/squeue/scancel subprocess calls instead of slurmrestd REST API (which has JWT auth issues in Slurm 23.11 docker containers). - Add cli_prefix param for running commands via docker exec. - Use scontrol show job for status (works without slurmdbd). - Slurm compose: base image on flowsim-image:latest, compile Slurm 23.11 with NVML support, cgroup/v1, explicit GRES config. - Slurm test passes in ~76s (same as K8s test). - K8s test uses host mount for traces (no docker cp). - All three backends (local, k8s, slurm) tested and working. 
--- dockerfiles/cgroup.conf | 3 + dockerfiles/dev-setup.sh | 227 +++++- dockerfiles/dev-teardown.sh | 4 + dockerfiles/gres.conf | 3 + dockerfiles/kind-multi-node.yaml | 67 +- dockerfiles/slurm-compose.yaml | 73 +- dockerfiles/slurm-node.dockerfile | 19 +- dockerfiles/slurm.conf | 23 +- schedulers/k8s.py | 11 +- schedulers/local.py | 106 ++- schedulers/slurm.py | 193 +++++- schedulers/templates/k8s.yaml | 1 + scripts/cli.py | 3 + scripts/status_profile.py | 22 + scripts/submit_profile.py | 52 +- simulator/base_parser.py | 6 +- tests/integration/test_scheduler_local.py | 805 ++++++++++++---------- tests/unit/test_scheduler_cli.py | 10 + 18 files changed, 1088 insertions(+), 540 deletions(-) create mode 100644 dockerfiles/cgroup.conf create mode 100644 dockerfiles/gres.conf diff --git a/dockerfiles/cgroup.conf b/dockerfiles/cgroup.conf new file mode 100644 index 0000000..68de2cc --- /dev/null +++ b/dockerfiles/cgroup.conf @@ -0,0 +1,3 @@ +# cgroup.conf — use cgroup v1 (only v1 plugin available; v2 host is compatible +# via the unified/hybrid hierarchy mount) +CgroupPlugin=cgroup/v1 diff --git a/dockerfiles/dev-setup.sh b/dockerfiles/dev-setup.sh index d948bf0..7cefe05 100755 --- a/dockerfiles/dev-setup.sh +++ b/dockerfiles/dev-setup.sh @@ -14,8 +14,10 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" KIND_VERSION="v0.27.0" KIND_CLUSTER_NAME="flowsim" +KIND_WORKERS=("${KIND_CLUSTER_NAME}-worker") KUBECTL_STABLE_URL="https://dl.k8s.io/release/stable.txt" -NVIDIA_DEVICE_PLUGIN="https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.0/deployments/static/nvidia-device-plugin.yml" +HELM_INSTALL_URL="https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3" +NVIDIA_CTK_KEYRING="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg" log() { printf "\033[1;32m[setup]\033[0m %s\n" "$*"; } warn() { printf "\033[1;33m[setup]\033[0m %s\n" "$*"; } @@ -57,38 +59,237 @@ ensure_kubectl() { } # 
---------------------------------------------------------------- -# Kind cluster +# Kind cluster with NVIDIA GPU via CDI +# (Official approach from NVIDIA k8s-device-plugin demo) +# https://github.com/NVIDIA/k8s-device-plugin/tree/main/demo/clusters/kind # ---------------------------------------------------------------- +ensure_nvidia_runtime() { + # Docker must use nvidia as default runtime so Kind node containers get GPU access + command -v nvidia-ctk >/dev/null || err "nvidia-container-toolkit is required (nvidia-ctk not found)." + command -v nvidia-smi >/dev/null || err "NVIDIA driver not found (nvidia-smi missing)." + log "nvidia-ctk: $(nvidia-ctk --version 2>&1 | head -1)" + + if ! docker info 2>/dev/null | grep -q "Default Runtime: nvidia"; then + log "Setting nvidia as default Docker runtime..." + sudo nvidia-ctk runtime configure --runtime=docker --set-as-default + sudo systemctl restart docker + log "Docker restarted with nvidia runtime as default" + else + log "Docker already using nvidia as default runtime" + fi + + # Required: accept-nvidia-visible-devices-as-volume-mounts must be true + # for Kind GPU passthrough via /var/run/nvidia-container-devices/all + local cfg="/etc/nvidia-container-runtime/config.toml" + if grep -qE '^\s*accept-nvidia-visible-devices-as-volume-mounts\s*=\s*true' "$cfg" 2>/dev/null; then + log "accept-nvidia-visible-devices-as-volume-mounts already enabled" + else + log "Enabling accept-nvidia-visible-devices-as-volume-mounts in $cfg..." 
+ if grep -qE '#?\s*accept-nvidia-visible-devices-as-volume-mounts' "$cfg" 2>/dev/null; then + sudo sed -i 's/#*\s*accept-nvidia-visible-devices-as-volume-mounts.*/accept-nvidia-visible-devices-as-volume-mounts = true/' "$cfg" + else + echo 'accept-nvidia-visible-devices-as-volume-mounts = true' | sudo tee -a "$cfg" >/dev/null + fi + sudo systemctl restart docker + log "Host nvidia-container-runtime config updated and Docker restarted" + fi +} + +ensure_helm() { + if command -v helm >/dev/null; then + log "helm already installed: $(helm version --short 2>/dev/null)" + return + fi + log "Installing helm..." + curl -fsSL "${HELM_INSTALL_URL}" | bash + log "helm installed: $(helm version --short)" +} + setup_kind() { ensure_docker + ensure_nvidia_runtime ensure_kind ensure_kubectl + ensure_helm if kind get clusters 2>/dev/null | grep -q "^${KIND_CLUSTER_NAME}$"; then warn "kind cluster '${KIND_CLUSTER_NAME}' already exists, skipping creation" else - log "Creating kind cluster '${KIND_CLUSTER_NAME}' (1 control-plane + 2 GPU workers)..." + log "Creating kind cluster '${KIND_CLUSTER_NAME}' (1 control-plane + 1 GPU worker)..." kind create cluster --name "${KIND_CLUSTER_NAME}" \ --config "${SCRIPT_DIR}/kind-multi-node.yaml" - log "Installing NVIDIA device plugin..." - kubectl apply -f "${NVIDIA_DEVICE_PLUGIN}" fi - log "Cluster nodes:" - kubectl get nodes -o wide - echo + # ── Post-creation: configure GPU support inside each worker node ── + for worker in "${KIND_WORKERS[@]}"; do + log "=== Configuring ${worker} ===" + + # Step 1: Unmount masked /proc/driver/nvidia + log "Unmounting /proc/driver/nvidia in ${worker}..." + docker exec "${worker}" umount -R /proc/driver/nvidia 2>/dev/null || true + + # Step 2: Install nvidia-container-toolkit inside the worker node + log "Installing nvidia-container-toolkit inside ${worker}..." 
+ docker exec "${worker}" bash -c "apt-get update && apt-get install -y gpg" + docker exec "${worker}" bash -c "\ + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | gpg --dearmor -o ${NVIDIA_CTK_KEYRING} \ + && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=${NVIDIA_CTK_KEYRING}] https://#g' \ + | tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \ + && apt-get update \ + && apt-get install -y nvidia-container-toolkit" + + # Step 3: Configure CDI mode in containerd inside worker + log "Configuring CDI mode for containerd in ${worker}..." + docker exec "${worker}" bash -c "\ + nvidia-ctk config --set nvidia-container-runtime.modes.cdi.annotation-prefixes=nvidia.cdi.k8s.io/ \ + && nvidia-ctk runtime configure --runtime=containerd --cdi.enabled --config-source=command \ + && systemctl restart containerd" + # Step 4: Label worker node for GPU presence + kubectl --context "kind-${KIND_CLUSTER_NAME}" label node "${worker}" \ + --overwrite nvidia.com/gpu.present=true + done + + # Step 5: Create nvidia RuntimeClass + log "Creating nvidia RuntimeClass..." + kubectl --context "kind-${KIND_CLUSTER_NAME}" apply -f - <<'RTEOF' +apiVersion: node.k8s.io/v1 +handler: nvidia +kind: RuntimeClass +metadata: + name: nvidia +RTEOF + + # Step 6: Deploy per-node NVIDIA device plugin DaemonSets + # Each worker gets its own DaemonSet with a specific NVIDIA_VISIBLE_DEVICES + # so the device plugin only discovers/advertises that worker's assigned GPU. + # (Helm's single DaemonSet can't set different env per node.) + log "Deploying NVIDIA device plugin (per-node GPU assignment)..." + local CTX="kind-${KIND_CLUSTER_NAME}" + local PLUGIN_IMAGE="nvcr.io/nvidia/k8s-device-plugin:v0.17.1" + local gpu_idx=0 + for worker in "${KIND_WORKERS[@]}"; do + local ds_name="nvidia-device-plugin-${worker##*-}" # e.g. 
nvidia-device-plugin-worker + kubectl --context "$CTX" apply -f - </dev/null 2>&1; then + for worker in "${KIND_WORKERS[@]}"; do + if docker exec "${worker}" crictl images 2>/dev/null | grep -q "flowsim-image.*latest"; then + log "${FLOWSIM_IMAGE} already loaded in ${worker}, skipping" + else + log "Loading ${FLOWSIM_IMAGE} into ${worker} (~34GB, may take several minutes)..." + if command -v pv >/dev/null; then + docker save "${FLOWSIM_IMAGE}" | pv -f -a -b | \ + docker exec -i "${worker}" ctr -n k8s.io images import - + else + docker save "${FLOWSIM_IMAGE}" | \ + docker exec -i "${worker}" ctr -n k8s.io images import - + fi + log "${FLOWSIM_IMAGE} loaded into ${worker}" + fi + done + else + warn "${FLOWSIM_IMAGE} not found on host, skipping image load (build it first)" + fi + + # Step 9: Wait for GPU resources + log "Waiting for nvidia.com/gpu resources to appear (up to 180s)..." + local gpu_retries=36 + while true; do + gpu_count=$(kubectl --context "kind-${KIND_CLUSTER_NAME}" get nodes \ + -o jsonpath='{range .items[*]}{.status.allocatable.nvidia\.com/gpu}{"\n"}{end}' 2>/dev/null \ + | grep -cE '^[1-9]' || true) + if [ "${gpu_count}" -ge 1 ]; then + log "GPUs registered on ${gpu_count} node(s)" + break + fi + gpu_retries=$((gpu_retries - 1)) + if [ "${gpu_retries}" -le 0 ]; then + warn "GPUs not registered after 180s — debugging info:" + kubectl --context "kind-${KIND_CLUSTER_NAME}" get pods -n nvidia-device-plugin -o wide 2>/dev/null || true + kubectl --context "kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | grep -A5 "Allocatable" || true + break + fi + sleep 5 + done + + # Step 10: Init FlowSim K8s config log "Initializing FlowSim K8s config..." 
- local kubeconfig - kubeconfig="${HOME}/.kube/config" flowsim init k8s \ - --kubeconfig "${kubeconfig}" \ + --kubeconfig "${HOME}/.kube/config" \ --context "kind-${KIND_CLUSTER_NAME}" \ --namespace default \ + --host-output-dir /tmp/flowsim-traces \ + --runtime-class-name nvidia \ --force + + log "Cluster nodes:" + kubectl --context "kind-${KIND_CLUSTER_NAME}" get nodes -o wide + echo + + log "GPU resources:" + kubectl --context "kind-${KIND_CLUSTER_NAME}" get nodes \ + -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.allocatable.nvidia\.com/gpu}{"\n"}{end}' 2>/dev/null || true echo - log "Kind cluster ready. Test with:" - log " flowsim submit --scheduler k8s --collect perf --model-path --dry-run" + + log "Kind cluster with GPU (CDI mode) ready." } # ---------------------------------------------------------------- diff --git a/dockerfiles/dev-teardown.sh b/dockerfiles/dev-teardown.sh index 154b049..dfb1c01 100755 --- a/dockerfiles/dev-teardown.sh +++ b/dockerfiles/dev-teardown.sh @@ -15,6 +15,10 @@ log() { printf "\033[1;32m[teardown]\033[0m %s\n" "$*"; } warn() { printf "\033[1;33m[teardown]\033[0m %s\n" "$*"; } teardown_kind() { + # Delete device plugin namespace (contains per-node DaemonSets) + if command -v kubectl >/dev/null; then + kubectl delete namespace nvidia-device-plugin --ignore-not-found 2>/dev/null || true + fi if command -v kind >/dev/null && kind get clusters 2>/dev/null | grep -q "^${KIND_CLUSTER_NAME}$"; then log "Deleting kind cluster '${KIND_CLUSTER_NAME}'..." 
kind delete cluster --name "${KIND_CLUSTER_NAME}" diff --git a/dockerfiles/gres.conf b/dockerfiles/gres.conf new file mode 100644 index 0000000..745eeac --- /dev/null +++ b/dockerfiles/gres.conf @@ -0,0 +1,3 @@ +# Slurm GRES config — explicit GPU definition (AutoDetect=nvml requires +# cgroup v2 which is not available; define GPU manually) +Name=gpu Type=nvidia File=/dev/nvidia0 Count=1 diff --git a/dockerfiles/kind-multi-node.yaml b/dockerfiles/kind-multi-node.yaml index c2208c4..ddb8cd2 100644 --- a/dockerfiles/kind-multi-node.yaml +++ b/dockerfiles/kind-multi-node.yaml @@ -1,34 +1,22 @@ -# kind cluster config — 1 control-plane + 2 GPU worker nodes +# Kind cluster config — 1 control-plane + 1 GPU worker node # -# Each worker gets one GPU via NVIDIA device plugin. -# Requires: kind, kubectl, nvidia-container-toolkit +# GPU support via CDI mode (NVIDIA k8s-device-plugin official approach). +# See: https://github.com/NVIDIA/k8s-device-plugin/tree/main/demo/clusters/kind # -# Usage: -# # Install kind (once) -# curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.27.0/kind-linux-amd64 -# chmod +x ./kind && sudo mv ./kind /usr/local/bin/ -# -# # Create cluster -# kind create cluster --name flowsim --config dockerfiles/kind-multi-node.yaml -# -# # Install NVIDIA device plugin (exposes GPUs to K8s) -# kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.0/deployments/static/nvidia-device-plugin.yml +# The single worker binds GPU 0. Change the containerPath index to +# assign a different GPU. 
# -# # Verify -# kubectl get nodes -# kubectl describe node flowsim-worker | grep nvidia.com/gpu -# kubectl describe node flowsim-worker2 | grep nvidia.com/gpu +# Pre-requisites (host): +# - Docker with nvidia as default runtime +# - accept-nvidia-visible-devices-as-volume-mounts = true +# in /etc/nvidia-container-runtime/config.toml +# - kind, kubectl, helm # -# # Init FlowSim -# flowsim init k8s --kubeconfig ~/.kube/config \ -# --context kind-flowsim --namespace default --force -# -# # Submit a profiling job -# flowsim submit --scheduler k8s --collect perf \ -# --model-path /models/Qwen-7B --gpus 1 +# Usage: +# ./dockerfiles/dev-setup.sh kind # -# # Teardown -# kind delete cluster --name flowsim +# Teardown: +# ./dockerfiles/dev-teardown.sh kind kind: Cluster apiVersion: kind.x-k8s.io/v1alpha4 @@ -36,29 +24,14 @@ apiVersion: kind.x-k8s.io/v1alpha4 nodes: - role: control-plane + # Worker — GPU 0 only - role: worker extraMounts: - # Pass GPU 0 into this node - - hostPath: /dev/nvidia0 - containerPath: /dev/nvidia0 - - hostPath: /dev/nvidiactl - containerPath: /dev/nvidiactl - - hostPath: /dev/nvidia-uvm - containerPath: /dev/nvidia-uvm - # Mount model weights (adjust to your path) - - hostPath: /home/administrator/zhangt - containerPath: /workspace - readOnly: true - - - role: worker - extraMounts: - # Pass GPU 1 into this node - - hostPath: /dev/nvidia1 - containerPath: /dev/nvidia1 - - hostPath: /dev/nvidiactl - containerPath: /dev/nvidiactl - - hostPath: /dev/nvidia-uvm - containerPath: /dev/nvidia-uvm + - hostPath: /dev/null + containerPath: /var/run/nvidia-container-devices/0 - hostPath: /home/administrator/zhangt containerPath: /workspace readOnly: true + # Writable mount so K8s pods can write traces directly to host + - hostPath: /home/administrator/zhangt/FlowSim/stage_traces + containerPath: /host-stage-traces diff --git a/dockerfiles/slurm-compose.yaml b/dockerfiles/slurm-compose.yaml index 29f694d..ee94656 100644 --- a/dockerfiles/slurm-compose.yaml +++ 
b/dockerfiles/slurm-compose.yaml @@ -48,6 +48,9 @@ services: fi chown munge:munge /etc/munge/munge.key chmod 400 /etc/munge/munge.key + mkdir -p /run/munge + chown munge:munge /run/munge + chmod 755 /run/munge gosu munge munged --foreground " volumes: @@ -61,6 +64,7 @@ services: hostname: slurmctld command: > bash -c " + mkdir -p /run/munge && chown munge:munge /run/munge until [ -S /run/munge/munge.socket.2 ]; do sleep 0.5; done slurmctld -D -vvv " @@ -79,6 +83,7 @@ services: hostname: slurmd-0 command: > bash -c " + mkdir -p /run/munge && chown munge:munge /run/munge until [ -S /run/munge/munge.socket.2 ]; do sleep 0.5; done slurmd -D -vvv " @@ -89,6 +94,10 @@ services: - munge-key:/etc/munge:ro - munge-socket:/run/munge - /home/administrator/zhangt:/workspace:ro + # Writable mount so traces appear on host + - /home/administrator/zhangt/FlowSim/stage_traces:/flowsim/stage_traces + # Cgroup needed by slurmd + - /sys/fs/cgroup:/sys/fs/cgroup:rw deploy: resources: reservations: @@ -97,49 +106,27 @@ services: device_ids: ["0"] capabilities: [gpu] - # ---- Compute node 1 (GPU 1) ---- - slurmd-1: - <<: *slurm-base - container_name: slurmd-1 - hostname: slurmd-1 - command: > - bash -c " - until [ -S /run/munge/munge.socket.2 ]; do sleep 0.5; done - slurmd -D -vvv - " - depends_on: - - slurmctld - volumes: - - slurm-etc:/etc/slurm:ro - - munge-key:/etc/munge:ro - - munge-socket:/run/munge - - /home/administrator/zhangt:/workspace:ro - deploy: - resources: - reservations: - devices: - - driver: nvidia - device_ids: ["1"] - capabilities: [gpu] - - # ---- REST API ---- - slurmrestd: - <<: *slurm-base - container_name: slurmrestd - hostname: slurmrestd - command: > - bash -c " - until [ -S /run/munge/munge.socket.2 ]; do sleep 0.5; done - slurmrestd -a rest_auth/jwt 0.0.0.0:6820 -vvv - " - depends_on: - - slurmctld - ports: - - "6820:6820" - volumes: - - slurm-etc:/etc/slurm:ro - - munge-key:/etc/munge:ro - - munge-socket:/run/munge + # ---- REST API (optional, for REST 
mode) ---- + # slurmrestd: + # <<: *slurm-base + # container_name: slurmrestd + # hostname: slurmrestd + # command: > + # bash -c " + # mkdir -p /run/munge && chown munge:munge /run/munge + # until [ -S /run/munge/munge.socket.2 ]; do sleep 0.5; done + # gosu slurm slurmrestd -a rest_auth/jwt 0.0.0.0:6820 -vvv -s slurmctld + # " + # depends_on: + # - slurmctld + # ports: + # - "6820:6820" + # cap_add: + # - SYS_ADMIN + # volumes: + # - slurm-etc:/etc/slurm:ro + # - munge-key:/etc/munge:ro + # - munge-socket:/run/munge volumes: slurm-etc: diff --git a/dockerfiles/slurm-node.dockerfile b/dockerfiles/slurm-node.dockerfile index 397284d..8b79db0 100644 --- a/dockerfiles/slurm-node.dockerfile +++ b/dockerfiles/slurm-node.dockerfile @@ -1,25 +1,24 @@ # Slurm node image — controller, compute, and REST API # -# Based on Ubuntu 22.04 with Slurm 23.11 + munge + JWT support. +# Based on flowsim-image so compute nodes have the full Python/sglang +# environment. Slurm 23.11 is compiled on top with JWT + NVML GRES. # Used by slurm-compose.yaml. 
-FROM ubuntu:22.04 +FROM flowsim-image:latest ENV DEBIAN_FRONTEND=noninteractive +# Slurm build dependencies + munge RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - curl \ gosu \ libhttp-parser-dev \ libjson-c-dev \ libjwt-dev \ libmunge-dev \ munge \ - wget \ && rm -rf /var/lib/apt/lists/* -# Install Slurm 23.11 from source (includes slurmrestd + JWT auth) +# Install Slurm 23.11 from source (slurmrestd + JWT auth + NVML GRES) ARG SLURM_VERSION=23.11.10 RUN cd /tmp && \ wget -q https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2 && \ @@ -31,22 +30,26 @@ RUN cd /tmp && \ --with-jwt \ --with-http-parser \ --with-json \ + --with-nvml \ --enable-slurmrestd && \ make -j"$(nproc)" && \ make install && \ rm -rf /tmp/slurm-* # Create required directories and users -RUN useradd -r -s /sbin/nologin slurm && \ +RUN useradd -r -s /sbin/nologin slurm 2>/dev/null || true && \ mkdir -p /etc/slurm /var/spool/slurmctld /var/spool/slurmd /var/log/slurm && \ chown slurm:slurm /var/spool/slurmctld /var/spool/slurmd /var/log/slurm -# Slurm config — 2 compute nodes, 1 GPU each +# Slurm config COPY slurm.conf /etc/slurm/slurm.conf +COPY gres.conf /etc/slurm/gres.conf +COPY cgroup.conf /etc/slurm/cgroup.conf # JWT key for REST API auth RUN dd if=/dev/urandom bs=32 count=1 2>/dev/null | base64 > /etc/slurm/jwt_hs256.key && \ chown slurm:slurm /etc/slurm/jwt_hs256.key && \ chmod 0600 /etc/slurm/jwt_hs256.key +WORKDIR /flowsim CMD ["bash"] diff --git a/dockerfiles/slurm.conf b/dockerfiles/slurm.conf index 734509d..7a26d5c 100644 --- a/dockerfiles/slurm.conf +++ b/dockerfiles/slurm.conf @@ -25,13 +25,14 @@ SchedulerType=sched/backfill SelectType=select/cons_tres SelectTypeParameters=CR_Core_Memory -# GPU support -GresTypes=gpu - -# Accounting (minimal) -AccountingStorageType=accounting_storage/none +# Accounting (disabled — no slurmdbd in test cluster) JobAcctGatherType=jobacct_gather/none +# Task management — disable cgroups (not 
available in containers) +TaskPlugin=task/none +ProctrackType=proctrack/linuxproc +JobContainerType=job_container/none + # Timeouts SlurmctldTimeout=30 SlurmdTimeout=30 @@ -40,9 +41,11 @@ MinJobAge=300 KillWait=30 Waittime=0 -# Partitions -PartitionName=normal Nodes=slurmd-[0-1] Default=YES MaxTime=INFINITE State=UP +# GRES (GPU) auto-detection +GresTypes=gpu + +# Partitions — single compute node for testing +PartitionName=normal Nodes=slurmd-0 Default=YES MaxTime=INFINITE State=UP -# Node definitions — 1 GPU each -NodeName=slurmd-0 CPUs=8 RealMemory=32000 Gres=gpu:1 State=UNKNOWN -NodeName=slurmd-1 CPUs=8 RealMemory=32000 Gres=gpu:1 State=UNKNOWN +# Node definition — 1 GPU (CPUs/memory match hardware) +NodeName=slurmd-0 CPUs=112 RealMemory=128000 Gres=gpu:1 State=UNKNOWN diff --git a/schedulers/k8s.py b/schedulers/k8s.py index 44c2917..d29df96 100644 --- a/schedulers/k8s.py +++ b/schedulers/k8s.py @@ -49,6 +49,9 @@ class K8sScheduler(BaseScheduler): ServiceAccount name for the pod. shm_size : str Size of ``/dev/shm`` (shared memory). Defaults to ``"16Gi"``. + runtime_class_name : str, optional + Kubernetes RuntimeClass name for the pod (e.g., ``"nvidia"`` for + CDI-based GPU injection in Kind clusters). 
""" def __init__( @@ -62,6 +65,7 @@ def __init__( node_selector: dict[str, str] | None = None, service_account: str = "", shm_size: str = "16Gi", + runtime_class_name: str = "", ) -> None: self.namespace = namespace self.kubeconfig = kubeconfig @@ -71,6 +75,7 @@ def __init__( self.node_selector = node_selector or {} self.service_account = service_account self.shm_size = shm_size + self.runtime_class_name = runtime_class_name def render(self, spec: ProfileJobSpec) -> str: return _dump(self._build_job_dict(spec)) @@ -92,7 +97,9 @@ def _build_job_dict(self, spec: ProfileJobSpec) -> dict: volume_mounts.append({"name": "output", "mountPath": spec.output_dir}) volumes.append({"name": "output", "persistentVolumeClaim": {"claimName": self.pvc_name}}) elif self.host_output_dir: - volume_mounts.append({"name": "output", "mountPath": spec.output_dir}) + # Mount at base traces dir so the full directory structure + # (e.g. k8s/{timestamp}/bs1_...) is preserved on the host. + volume_mounts.append({"name": "output", "mountPath": "/flowsim/stage_traces"}) volumes.append({"name": "output", "hostPath": {"path": self.host_output_dir, "type": "DirectoryOrCreate"}}) container = { @@ -114,6 +121,8 @@ def _build_job_dict(self, spec: ProfileJobSpec) -> dict: "containers": [container], "volumes": volumes, } + if self.runtime_class_name: + pod_spec["runtimeClassName"] = self.runtime_class_name if self.service_account: pod_spec["serviceAccountName"] = self.service_account if self.node_selector: diff --git a/schedulers/local.py b/schedulers/local.py index 4ec94c9..17dd3a0 100644 --- a/schedulers/local.py +++ b/schedulers/local.py @@ -50,25 +50,53 @@ def _find_project_root() -> str: # schedulers/ is one level below project root return os.path.dirname(d) + @staticmethod + def _check_image_exists(image: str) -> None: + """Raise if the Docker image is not available locally.""" + result = subprocess.run( + ["docker", "image", "inspect", image], + capture_output=True, timeout=10, + ) + if 
result.returncode != 0: + raise SystemExit( + f"[local] Docker image '{image}' not found.\n" + f"Build it first, e.g.:\n" + f" docker build -t {image} -f dockerfiles/cuda12.6.dockerfile ." + ) + def _docker_gpu_flag(self) -> str: """Build the ``--gpus`` flag for ``docker run``.""" if not self.gpus: return "--gpus all" return f"--gpus '\"device={self.gpus}\"'" + def _host_output_dir(self, spec_output_dir: str) -> str: + """Host directory that gets bind-mounted into the container. + + Mirrors the container path structure under the host workdir. + e.g. container ``/flowsim/stage_traces/local/20260317_211318`` + → host ``{workdir}/stage_traces/local/20260317_211318`` + """ + # spec_output_dir is like /flowsim/stage_traces/local/{ts} + # Strip the /flowsim/ prefix to get the relative path + rel = spec_output_dir + if rel.startswith("/flowsim/"): + rel = rel[len("/flowsim/"):] + return os.path.join(self.workdir, rel) + def _build_docker_cmd(self, spec: ProfileJobSpec) -> str: - """Build the full ``docker run`` command.""" + """Build the full ``docker run`` command. + + Paths in *spec* (model_path, output_dir, log_dir) are expected to be + relative to the project root or absolute container paths (``/flowsim/…``). + The container workdir is ``/flowsim``, so relative paths resolve + correctly without any string replacement. + """ job_name = spec.default_job_name()[:63] - # Container always works with /flowsim/stage_traces internally. - container_output = "/flowsim/stage_traces" - container_log_dir = container_output + "/logs" - host_output = os.path.abspath(spec.output_dir) - host_log_dir = host_output + "/logs" + host_output = self._host_output_dir(spec.output_dir) + container_output = spec.output_dir # e.g. /flowsim/stage_traces/local/{ts} - # Build the inner command, then replace host paths with container paths. 
inner_cmd = spec.build_shell_command() - inner_cmd = inner_cmd.replace(host_log_dir, container_log_dir) - inner_cmd = inner_cmd.replace(host_output, container_output) parts = [ "docker run --rm", @@ -78,6 +106,8 @@ def _build_docker_cmd(self, spec: ProfileJobSpec) -> str: "--network=host", f"-e SGLANG_PROFILE_KERNELS=1", f"-v {host_output}:{container_output}", + f"-v {self.workdir}/simulator:/flowsim/simulator", + f"-v {self.workdir}/scripts:/flowsim/scripts", f"-w /flowsim", spec.image, f"bash -c {_shell_quote(inner_cmd)}", @@ -85,6 +115,7 @@ def _build_docker_cmd(self, spec: ProfileJobSpec) -> str: return " \\\n ".join(parts) def render(self, spec: ProfileJobSpec) -> str: + self._check_image_exists(spec.image) return self._build_docker_cmd(spec) def submit(self, spec: ProfileJobSpec) -> JobResult: @@ -93,8 +124,10 @@ def submit(self, spec: ProfileJobSpec) -> JobResult: stdout and stderr are streamed to the terminal *and* saved to log files under ``spec.output_dir/logs/`` on the host. """ + self._check_image_exists(spec.image) + # Ensure host output dir exists before mounting - host_output = os.path.abspath(spec.output_dir) + host_output = self._host_output_dir(spec.output_dir) log_dir = os.path.join(host_output, "logs") os.makedirs(log_dir, exist_ok=True) @@ -182,6 +215,18 @@ def cancel(self, job_id: str) -> str: return f"Stopped container {job_id}" return f"Could not stop container {job_id}: {proc.stderr.strip()}" + def _find_log_dirs(self) -> list[str]: + """Find all log directories under stage_traces/{scheduler}/*/logs/.""" + import glob + base = os.path.join(self.workdir, "stage_traces", "local") + # New layout: stage_traces/local/{ts}/logs/ + dirs = sorted(glob.glob(os.path.join(base, "*/logs"))) + # Also check legacy flat layout: stage_traces/logs/ + legacy = os.path.join(self.workdir, "stage_traces", "logs") + if os.path.isdir(legacy): + dirs.append(legacy) + return dirs + def status(self, job_id: str) -> dict: """Check local job status by looking for log 
files. @@ -189,20 +234,23 @@ def status(self, job_id: str) -> dict: """ import glob - log_dir = os.path.join(self.workdir, "stage_traces", "logs") - pattern = os.path.join(log_dir, f"{job_id}_*.stdout.log") - matches = sorted(glob.glob(pattern)) + matches = [] + for log_dir in self._find_log_dirs(): + matches.extend(sorted(glob.glob( + os.path.join(log_dir, f"{job_id}_*.stdout.log") + ))) if not matches: return { "state": "NotFound", - "message": f"No logs found matching {pattern}", + "message": f"No logs found for job '{job_id}'", "output_hint": "", } latest = matches[-1] stderr_log = latest.replace(".stdout.log", ".stderr.log") - trace_dir = os.path.join(self.workdir, "stage_traces") + # trace_dir is the parent of logs/ + trace_dir = os.path.dirname(os.path.dirname(latest)) return { "state": "Completed", @@ -218,17 +266,20 @@ def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: """List log files for a local job and print access commands.""" import glob - log_dir = os.path.join(self.workdir, "stage_traces", "logs") - pattern = os.path.join(log_dir, f"{job_id}_*") - matches = sorted(glob.glob(pattern)) + matches = [] + for log_dir in self._find_log_dirs(): + matches.extend(sorted(glob.glob( + os.path.join(log_dir, f"{job_id}_*") + ))) if not matches: - # Also try wildcard — user may have given a partial name - pattern = os.path.join(log_dir, f"*{job_id}*") - matches = sorted(glob.glob(pattern)) + for log_dir in self._find_log_dirs(): + matches.extend(sorted(glob.glob( + os.path.join(log_dir, f"*{job_id}*") + ))) if not matches: - return f"No logs found in {log_dir} matching '{job_id}'" + return f"No logs found matching '{job_id}'" if follow: stdout_files = sorted(f for f in matches if f.endswith(".stdout.log")) @@ -236,6 +287,7 @@ def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: return f"Follow logs with:\n tail -f {stdout_files[-1]}" return f"No stdout log found to follow for '{job_id}'" + log_dir = 
os.path.dirname(matches[-1]) parts = [f"Log directory: {log_dir}", ""] parts.append(f"Files ({len(matches)}):") for p in matches: @@ -256,7 +308,7 @@ def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: parts.append("Follow logs:") parts.append(f" tail -f {stdout_files[-1]}") - trace_dir = os.path.join(self.workdir, "stage_traces") + trace_dir = os.path.dirname(log_dir) # parent of logs/ parts.append("") parts.append(f"Trace files: {trace_dir}") parts.append(f" ls {trace_dir}") @@ -268,9 +320,11 @@ def list_jobs(self, *, status_filter: str = "") -> list[dict]: import glob import re - log_dir = os.path.join(self.workdir, "stage_traces", "logs") - pattern = os.path.join(log_dir, "*.stdout.log") - matches = sorted(glob.glob(pattern)) + matches = [] + for log_dir in self._find_log_dirs(): + matches.extend(sorted(glob.glob( + os.path.join(log_dir, "*.stdout.log") + ))) jobs: list[dict] = [] for path in matches: diff --git a/schedulers/slurm.py b/schedulers/slurm.py index 790ade4..6615fad 100644 --- a/schedulers/slurm.py +++ b/schedulers/slurm.py @@ -3,12 +3,22 @@ ``render()`` / ``dry_run()`` produce a standalone bash script (zero deps). ``submit()`` posts the script to a slurmrestd endpoint via stdlib ``urllib.request`` — no extra packages needed. + +Two submission modes are supported: + +* **rest** (default) — POST the script to a slurmrestd endpoint. + Requires ``rest_url`` and ``jwt_token``. +* **cli** — pipe the script to ``sbatch`` via subprocess. + Requires ``sbatch``/``squeue``/``scancel`` on PATH (or reachable + via ``cli_prefix``, e.g. ``"docker exec slurmctld"``). """ from __future__ import annotations import json +import shlex import ssl +import subprocess import urllib.error import urllib.request @@ -55,6 +65,12 @@ class SlurmScheduler(BaseScheduler): (relevant for ``"none"`` runtime). extra_sbatch : list[str] Additional ``#SBATCH`` lines, each *without* the ``#SBATCH`` prefix. 
+ submit_via : str + ``"rest"`` (default) — use slurmrestd REST API. + ``"cli"`` — use ``sbatch`` / ``squeue`` / ``scancel`` subprocess. + cli_prefix : str + Shell prefix for CLI commands (e.g. ``"docker exec -i slurmctld"``). + Only used when ``submit_via="cli"``. """ def __init__( @@ -72,6 +88,8 @@ def __init__( container_mounts: str = "", modules: list[str] | None = None, extra_sbatch: list[str] | None = None, + submit_via: str = "rest", + cli_prefix: str = "", ) -> None: self.partition = partition self.time_limit = time_limit @@ -85,6 +103,8 @@ def __init__( self.container_mounts = container_mounts self.modules = modules or [] self.extra_sbatch = extra_sbatch or [] + self.submit_via = submit_via + self.cli_prefix = cli_prefix def render(self, spec: ProfileJobSpec) -> str: job_name = spec.default_job_name() @@ -111,6 +131,10 @@ def render(self, spec: ProfileJobSpec) -> str: lines.append("set -euo pipefail") lines.append("") + # Ensure output dir exists (needed for #SBATCH --output) + lines.append(f"mkdir -p {spec.output_dir}") + lines.append("") + if self.modules: for mod in self.modules: lines.append(f"module load {mod}") @@ -151,6 +175,61 @@ def render(self, spec: ProfileJobSpec) -> str: return "\n".join(lines) def submit(self, spec: ProfileJobSpec) -> JobResult: + """Submit the job via REST API or CLI, depending on ``submit_via``.""" + if self.submit_via == "cli": + return self._submit_cli(spec) + return self._submit_rest(spec) + + # ------------------------------------------------------------------ + # CLI helpers + # ------------------------------------------------------------------ + + def _cli_cmd(self, *args: str) -> list[str]: + """Build a command list, prepending ``cli_prefix`` if set.""" + prefix = shlex.split(self.cli_prefix) if self.cli_prefix else [] + return prefix + list(args) + + def _cli_run( + self, + *args: str, + input_data: str | None = None, + timeout: int = 60, + ) -> subprocess.CompletedProcess: + """Run a Slurm CLI command and return 
the CompletedProcess.""" + cmd = self._cli_cmd(*args) + return subprocess.run( + cmd, + capture_output=True, + text=True, + input=input_data, + timeout=timeout, + ) + + def _submit_cli(self, spec: ProfileJobSpec) -> JobResult: + """Submit via ``sbatch`` (piping the script on stdin).""" + script = self.render(spec) + job_name = spec.default_job_name() + + r = self._cli_run("sbatch", "--parsable", input_data=script, timeout=30) + if r.returncode != 0: + raise RuntimeError( + f"sbatch failed (exit {r.returncode}):\n{r.stderr}" + ) + + job_id = r.stdout.strip().split(";")[0] # parsable: "jobid" or "jobid;cluster" + return JobResult( + job_id=job_id, + scheduler="slurm", + state="Submitted", + output_dir=spec.output_dir, + message=f"Submitted batch job {job_id}", + ) + + # ------------------------------------------------------------------ + # REST submit + # ------------------------------------------------------------------ + + def _submit_rest(self, spec: ProfileJobSpec) -> JobResult: """Submit the job via slurmrestd REST API. Requires ``rest_url`` and ``jwt_token`` to be set. 
@@ -264,6 +343,112 @@ def _rest_get(self, path: str) -> dict: return self._rest_request(path, method="GET") def cancel(self, job_id: str) -> str: + """Cancel a Slurm job.""" + if self.submit_via == "cli": + return self._cancel_cli(job_id) + return self._cancel_rest(job_id) + + def status(self, job_id: str) -> dict: + """Query Slurm job status.""" + if self.submit_via == "cli": + return self._status_cli(job_id) + return self._status_rest(job_id) + + def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: + """Show Slurm job log information.""" + if self.submit_via == "cli": + return self._logs_cli(job_id, tail=tail, follow=follow) + return self._logs_rest(job_id, tail=tail, follow=follow) + + def list_jobs(self, *, status_filter: str = "") -> list[dict]: + """List Slurm jobs.""" + if self.submit_via == "cli": + return self._list_jobs_cli(status_filter=status_filter) + return self._list_jobs_rest(status_filter=status_filter) + + # ------------------------------------------------------------------ + # CLI implementations + # ------------------------------------------------------------------ + + def _cancel_cli(self, job_id: str) -> str: + r = self._cli_run("scancel", job_id) + if r.returncode != 0: + raise RuntimeError(f"scancel failed: {r.stderr}") + return f"Cancelled Slurm job {job_id}" + + def _status_cli(self, job_id: str) -> dict: + # Use scontrol show job — works for both running and completed jobs + # (completed jobs stay in memory for MinJobAge seconds, default 300s) + r = self._cli_run("scontrol", "show", "job", job_id) + if r.returncode != 0 or not r.stdout.strip(): + return {"state": "Unknown", "message": f"No job found with ID {job_id}", "output_hint": ""} + + # Parse key=value output + fields: dict[str, str] = {} + for token in r.stdout.replace("\n", " ").split(): + if "=" in token: + k, _, v = token.partition("=") + fields[k] = v + + state = fields.get("JobState", "UNKNOWN") + name = fields.get("JobName", "") + nodes = 
fields.get("NodeList", "") + output_file = fields.get("StdOut", "") + + # Normalize to match test expectations + if state == "COMPLETED": + state = "Completed" + elif state == "FAILED": + state = "Failed" + + msg_parts = [ + f"Job ID: {job_id} Name: {name} State: {state}", + f"Nodes: {nodes}" if nodes else "Nodes: (not yet assigned)", + ] + if output_file: + msg_parts.append(f"Output log: {output_file}") + + return { + "state": state, + "message": "\n".join(msg_parts), + "output_hint": output_file, + } + + def _logs_cli(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: + info = self._status_cli(job_id) + return info["message"] + + def _list_jobs_cli(self, *, status_filter: str = "") -> list[dict]: + r = self._cli_run( + "squeue", "-o", "%i|%j|%T|%P|%N", "-h", + ) + if r.returncode != 0: + raise RuntimeError(f"squeue failed: {r.stderr}") + result: list[dict] = [] + for line in r.stdout.strip().splitlines(): + if not line.strip(): + continue + parts = line.split("|", 4) + name = parts[1] if len(parts) > 1 else "" + if not name.startswith("flowsim-"): + continue + state = parts[2] if len(parts) > 2 else "UNKNOWN" + if status_filter and state.upper() != status_filter.upper(): + continue + result.append({ + "job_id": parts[0] if parts else "", + "name": name, + "state": state, + "partition": parts[3] if len(parts) > 3 else "", + "nodes": parts[4] if len(parts) > 4 else "", + }) + return result + + # ------------------------------------------------------------------ + # REST implementations + # ------------------------------------------------------------------ + + def _cancel_rest(self, job_id: str) -> str: """Cancel a Slurm job via slurmrestd DELETE.""" body = self._rest_request( f"/slurm/{self.api_version}/job/{job_id}", @@ -275,7 +460,7 @@ def cancel(self, job_id: str) -> str: raise RuntimeError(f"slurmrestd cancel failed: {msgs}") return f"Cancelled Slurm job {job_id}" - def status(self, job_id: str) -> dict: + def _status_rest(self, job_id: str) 
-> dict: """Query Slurm job status via slurmrestd.""" body = self._rest_get(f"/slurm/{self.api_version}/job/{job_id}") @@ -312,9 +497,9 @@ def status(self, job_id: str) -> dict: "output_hint": output_file, } - def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: + def _logs_rest(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: """Show where Slurm job logs are and how to access them.""" - info = self.status(job_id) + info = self._status_rest(job_id) output_file = info.get("output_hint", "") state = info.get("state", "UNKNOWN") @@ -348,7 +533,7 @@ def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: return "\n".join(parts) - def list_jobs(self, *, status_filter: str = "") -> list[dict]: + def _list_jobs_rest(self, *, status_filter: str = "") -> list[dict]: """List Slurm jobs via slurmrestd /jobs endpoint.""" body = self._rest_get(f"/slurm/{self.api_version}/jobs") errors = body.get("errors") or [] diff --git a/schedulers/templates/k8s.yaml b/schedulers/templates/k8s.yaml index bac3a77..2adb927 100644 --- a/schedulers/templates/k8s.yaml +++ b/schedulers/templates/k8s.yaml @@ -22,6 +22,7 @@ host_output_dir: "" # hostPath alternative to PVC # Optional service_account: "" shm_size: "16Gi" +runtime_class_name: "" # e.g. "nvidia" for CDI-based GPU (Kind clusters) # node_selector: # gpu: a100 # tier: high diff --git a/scripts/cli.py b/scripts/cli.py index b5d2bc3..135ed84 100644 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -34,6 +34,8 @@ def _init_k8s_parser(sub: argparse._SubParsersAction) -> None: help="Service account for the job pod") p.add_argument("--shm-size", default="16Gi", help="Shared memory size (default: 16Gi)") + p.add_argument("--runtime-class-name", default="", + help="RuntimeClass for pod (e.g. 
'nvidia' for CDI mode)") p.add_argument("--force", action="store_true", help="Overwrite existing config file") @@ -100,6 +102,7 @@ def _cmd_init(argv: list[str]) -> int: "host_output_dir": args.host_output_dir, "service_account": args.service_account, "shm_size": args.shm_size, + "runtime_class_name": args.runtime_class_name, } dst = _CONFIG_DIR / "k8s.yaml" diff --git a/scripts/status_profile.py b/scripts/status_profile.py index 2f82ebc..4882d11 100644 --- a/scripts/status_profile.py +++ b/scripts/status_profile.py @@ -72,6 +72,14 @@ def _add_scheduler_specific_args(p: argparse.ArgumentParser, scheduler: str) -> "--k8s-context", default=_d("FLOWSIM_K8S_CONTEXT", k8s_cfg, "context", ""), ) + p.add_argument( + "--k8s-pvc", + default=cfg_get(k8s_cfg, "pvc", ""), + ) + p.add_argument( + "--k8s-host-output-dir", + default=cfg_get(k8s_cfg, "host_output_dir", ""), + ) elif scheduler == "slurm": p.add_argument( @@ -90,6 +98,15 @@ def _add_scheduler_specific_args(p: argparse.ArgumentParser, scheduler: str) -> "--slurm-no-verify-ssl", action="store_true", ) + p.add_argument( + "--slurm-submit-via", + choices=["rest", "cli"], + default=cfg_get(slurm_cfg, "submit_via", "rest"), + ) + p.add_argument( + "--slurm-cli-prefix", + default=cfg_get(slurm_cfg, "cli_prefix", ""), + ) def _resolve_slurm_jwt(args: argparse.Namespace) -> None: @@ -109,6 +126,8 @@ def _build_scheduler(args: argparse.Namespace): namespace=args.k8s_namespace, kubeconfig=args.k8s_kubeconfig, context=args.k8s_context, + pvc_name=getattr(args, "k8s_pvc", "") or "", + host_output_dir=getattr(args, "k8s_host_output_dir", "") or "", ) else: return SlurmScheduler( @@ -116,6 +135,8 @@ def _build_scheduler(args: argparse.Namespace): jwt_token=args.slurm_jwt_token, api_version=args.slurm_api_version, verify_ssl=not args.slurm_no_verify_ssl, + submit_via=args.slurm_submit_via, + cli_prefix=args.slurm_cli_prefix, ) @@ -138,6 +159,7 @@ def main_status(argv: list[str] | None = None) -> None: scheduler = 
_build_scheduler(args) try: info = scheduler.status(args.job) + print(f"State: {info['state']}") print(info["message"]) except Exception as exc: print(f"Error: {exc}", file=sys.stderr) diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 18f7882..23089eb 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -216,6 +216,11 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: "--k8s-shm-size", default=cfg_get(k8s_cfg, "shm_size", "16Gi"), ) + k8s.add_argument( + "--k8s-runtime-class", + default=cfg_get(k8s_cfg, "runtime_class_name", ""), + help="RuntimeClass for pod (e.g. 'nvidia' for CDI mode)", + ) elif pre.scheduler == "slurm": slurm = p.add_argument_group("slurm options (config: ~/.flowsim/slurm.yaml)") @@ -281,6 +286,17 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: metavar="DIRECTIVE", help="Extra #SBATCH directives (repeatable, without prefix)", ) + slurm.add_argument( + "--slurm-submit-via", + choices=["rest", "cli"], + default=cfg_get(slurm_cfg, "submit_via", "rest"), + help="Submission mode: rest (slurmrestd) or cli (sbatch subprocess)", + ) + slurm.add_argument( + "--slurm-cli-prefix", + default=cfg_get(slurm_cfg, "cli_prefix", ""), + help='Shell prefix for CLI mode (e.g. 
"docker exec -i slurmctld")', + ) return p.parse_args(argv) @@ -334,6 +350,7 @@ def _build_scheduler(args: argparse.Namespace): node_selector=node_sel, service_account=args.k8s_service_account, shm_size=args.k8s_shm_size, + runtime_class_name=args.k8s_runtime_class, ) else: return SlurmScheduler( @@ -349,23 +366,27 @@ def _build_scheduler(args: argparse.Namespace): container_mounts=args.slurm_container_mounts, modules=args.slurm_module, extra_sbatch=args.slurm_extra_sbatch, + submit_via=args.slurm_submit_via, + cli_prefix=args.slurm_cli_prefix, ) def main(argv: list[str] | None = None) -> None: args = _parse_args(argv) - # Smart defaults for output_dir based on scheduler + # Smart defaults for output_dir based on scheduler. + # Layout: stage_traces/{scheduler}/{timestamp}/ + import time as _time + _ts = _time.strftime("%Y%m%d_%H%M%S") if not args.output_dir: if args.scheduler == "local": - project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - args.output_dir = os.path.join(project_root, "stage_traces") + args.output_dir = f"/flowsim/stage_traces/local/{_ts}" elif args.scheduler == "slurm": - # Slurm: default to ~/flowsim_traces (shared filesystem) - args.output_dir = os.path.expanduser("~/flowsim_traces") + args.output_dir = os.path.expanduser( + f"~/flowsim_traces/slurm/{_ts}" + ) else: - # K8s: container path (PVC/hostPath mounted here) - args.output_dir = "/flowsim/stage_traces" + args.output_dir = f"/flowsim/stage_traces/k8s/{_ts}" # Resolve Slurm JWT token from jwt_token_cmd in config if needed if args.scheduler == "slurm" and not args.slurm_jwt_token: @@ -378,6 +399,13 @@ def main(argv: list[str] | None = None) -> None: if not args.dry_run and args.scheduler not in ("local",): _validate_connection(args) + # For local scheduler, convert absolute host model_path to relative + # so it resolves correctly inside the container (workdir=/flowsim). 
+ if args.scheduler == "local" and os.path.isabs(args.model_path): + project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + if args.model_path.startswith(project_root): + args.model_path = os.path.relpath(args.model_path, project_root) + spec = _build_spec(args) scheduler = _build_scheduler(args) @@ -459,6 +487,16 @@ def _validate_connection(args: argparse.Namespace) -> None: file=sys.stderr, ) elif args.scheduler == "slurm": + if args.slurm_submit_via == "cli": + # CLI mode only needs partition + if not args.slurm_partition: + sys.exit( + "Error: missing required Slurm config:\n" + " - partition (--slurm-partition)\n\n" + f"Set it in ~/.flowsim/slurm.yaml or via CLI flag.\n" + + _INIT_HINT + ) + return missing = [] if not args.slurm_rest_url: missing.append("rest_url (--slurm-rest-url)") diff --git a/simulator/base_parser.py b/simulator/base_parser.py index ca9cadb..2b77967 100644 --- a/simulator/base_parser.py +++ b/simulator/base_parser.py @@ -319,12 +319,12 @@ def _parse_events(self) -> list[tuple]: else: # Case 2: If no ext_id, we need to find the shape from user annotations # Key Identification Methodology: Annotation is overlapped with kernel + dims_anno = "N/A" + input_type_anno = "N/A" + desc_anno = "" for anno_idx, anno in enumerate(annotation_events): if anno_idx in used_annotations: continue - dims_anno = "N/A" - input_type_anno = "N/A" - desc_anno = "" if "ProfilerStep" in anno.get("name", ""): continue anno_start = anno.get("ts", 0) diff --git a/tests/integration/test_scheduler_local.py b/tests/integration/test_scheduler_local.py index 56cbdde..62f52fb 100644 --- a/tests/integration/test_scheduler_local.py +++ b/tests/integration/test_scheduler_local.py @@ -2,38 +2,46 @@ Tests all three scheduler backends (local, k8s, slurm) end-to-end. -* **local** — runs real TP=1 profiling and verifies traces, parsed CSVs, - log files, JobResult return, cancel, list, logs --follow. 
-* **k8s** — submits a real Job to a Kind cluster, verifies JobResult, - status, logs, list, cancel, logs --follow. Also validates - that dry-run YAML has the correct volume mounts and log paths. -* **slurm** — dry-run only; verifies the sbatch script has the correct - ``output_dir``, ``--log-dir``, and ``#SBATCH --output`` directives. +* **local** — submits jobs via ``flowsim submit --scheduler local`` which + launches Docker containers on the host. Validates job lifecycle (submit, + list, status) and trace CSV correctness (GEMM dim0, FlashAttn seqlen). +* **k8s** — submits a real Job to a Kind cluster, retrieves traces via + ``docker cp``, and validates trace CSVs. Auto-sets up the Kind cluster + via ``dev-setup.sh`` if not already running. +* **slurm** — submits a real job to a local docker-compose Slurm cluster, + retrieves traces via ``docker cp``, and validates trace CSVs. Auto-sets + up the Slurm cluster via ``dev-setup.sh slurm`` if not already running. Requirements ------------ -* The ``flowsim-test`` container with GPUs (for local tests). -* A Kind cluster named ``flowsim`` (for K8s tests). +* Docker with ``flowsim-image:latest`` built (for local tests). +* A GPU-equipped host machine (local tests run on the physical host, + NOT inside a Docker container). +* ``dockerfiles/dev-setup.sh`` available (Kind and Slurm clusters are + automatically created if missing). * ``schedulers/`` available on PYTHONPATH. Environment Variables --------------------- ``MODEL`` - Model path (default: ``/flowsim/workload/models/configs/Qwen3-235B-A22B``). + Model path relative to project root + (default: ``workload/models/configs/Qwen3-235B-A22B``). ``LOAD_FORMAT`` Load format (default: ``dummy``). 
Usage ----- - # Inside container (local tests): - docker exec flowsim-test python -m pytest \ - tests/integration/test_scheduler_local.py -v -x + # On host (local scheduler tests — needs Docker + GPU): + cd FlowSim && python -m pytest \ + tests/integration/test_scheduler_local.py -v -x -k "local" # On host (k8s tests — needs kubeconfig): python -m pytest tests/integration/test_scheduler_local.py \ -v -x -k "k8s" """ +import ast +import csv import glob import json import os @@ -49,14 +57,13 @@ _PROJECT_ROOT = os.path.abspath( os.path.join(os.path.dirname(__file__), "..", "..") ) +_DEV_SETUP = os.path.join(_PROJECT_ROOT, "dockerfiles", "dev-setup.sh") +_DEV_TEARDOWN = os.path.join(_PROJECT_ROOT, "dockerfiles", "dev-teardown.sh") MODEL = os.environ.get( - "MODEL", "/flowsim/workload/models/configs/Qwen3-235B-A22B" + "MODEL", "workload/models/configs/Qwen3-235B-A22B" ) LOAD_FORMAT = os.environ.get("LOAD_FORMAT", "dummy") -ARTIFACT_DIR = os.environ.get( - "PYTEST_ARTIFACT_DIR", "/flowsim/tests/test-artifacts" -) # --------------------------------------------------------------------------- # Helpers @@ -122,424 +129,466 @@ def _assert_logs(output_dir: str) -> None: assert max(sizes) > 0, "All stdout logs are empty" +# --------------------------------------------------------------------------- +# Shape validation helpers (same logic as test_stage_profile_configs.py) +# --------------------------------------------------------------------------- +def _read_csv(path): + with open(path, newline="") as f: + return list(csv.DictReader(f)) + + +_GEMM_NAME_PATTERNS = ("nvjet", "cublasLt", "cublas_", "cutlass_gemm") + + +def _first_matmul_dim0(rows): + """Return dim0 of the first GEMM kernel (the M dimension).""" + for row in rows: + if row.get("op", "") == "matmul": + dims = ast.literal_eval(row["Dims"]) + return dims[0][0] + for row in rows: + name = row["Name"] + dims_str = row.get("Dims", "N/A") + if dims_str == "N/A" or not dims_str: + continue + if any(pat in name for pat in 
_GEMM_NAME_PATTERNS): + dims = ast.literal_eval(dims_str) + if len(dims) >= 2 and len(dims[0]) == 2 and len(dims[1]) == 2: + return dims[0][0] + return None + + +def _attention_seqlen_pair(rows, bs, seq_len): + """Check that [bs, seq_len] (or +1) appears in FlashAttn dims.""" + for row in rows: + name = row["Name"] + if "FlashAttn" not in name: + continue + if "Combine" in name or "prepare" in name: + continue + dims = ast.literal_eval(row["Dims"]) + for d in dims: + if ( + isinstance(d, list) + and len(d) == 2 + and d[0] == bs + and d[1] in (seq_len, seq_len + 1) + ): + return d + return None + return None + + +def _validate_shapes(output_dir, bs, input_len, existing_ctx): + """Validate GEMM dim0 and FlashAttn seqlen in merged/shape_parsed CSVs.""" + tag = f"bs{bs}_input{input_len}_ctx{existing_ctx}" + for csv_subdir in ("merged", "shape_parsed"): + extend_csvs = sorted( + glob.glob(os.path.join(output_dir, tag, csv_subdir, "*TP-0*EXTEND*.csv")) + ) + decode_csvs = sorted( + glob.glob(os.path.join(output_dir, tag, csv_subdir, "*TP-0*DECODE*.csv")) + ) + if extend_csvs and decode_csvs: + break + else: + pytest.fail( + f"No EXTEND+DECODE CSVs for TP-0 in {output_dir}/{tag}/{{merged,shape_parsed}}/" + ) + + extend_rows = _read_csv(extend_csvs[0]) + decode_rows = _read_csv(decode_csvs[0]) + + # EXTEND first GEMM dim0 == bs * input_len + ext_gemm_dim0 = _first_matmul_dim0(extend_rows) + assert ext_gemm_dim0 is not None, "No matmul kernel found in EXTEND CSV" + expected_ext = bs * input_len + assert ext_gemm_dim0 == expected_ext, ( + f"EXTEND first GEMM dim0={ext_gemm_dim0}, expected bs*input_len={expected_ext}" + ) + + # EXTEND FlashAttn dims contain [bs, seq_len] + seq_len = input_len + existing_ctx + attn_pair = _attention_seqlen_pair(extend_rows, bs, seq_len) + assert attn_pair is not None, ( + f"No FlashAttention dim matching [bs={bs}, seqlen={seq_len}(+1)] in EXTEND CSV" + ) + + # DECODE first GEMM dim0 == bs + dec_gemm_dim0 = _first_matmul_dim0(decode_rows) + 
assert dec_gemm_dim0 is not None, "No matmul kernel found in DECODE CSV" + assert dec_gemm_dim0 == bs, ( + f"DECODE first GEMM dim0={dec_gemm_dim0}, expected bs={bs}" + ) + + # ===================================================================== -# LOCAL SCHEDULER — real profiling +# LOCAL SCHEDULER — real profiling (4-step flow) # ===================================================================== class TestLocalScheduler: - """Run real profiling via ``flowsim submit --scheduler local``.""" + """Run real profiling via ``flowsim`` CLI on the local Docker scheduler. + + Flow per test point: + 1. ``flowsim submit`` — submit the job (collect all) + 2. ``flowsim list`` — verify the job appears + 3. ``flowsim status`` — poll until Completed + 4. Validate trace CSVs — GEMM dim0, FlashAttn seqlen for EXTEND & DECODE + """ + + _TP1_POINTS = [ + {"bs": 1, "input_len": 2048, "existing_ctx": 0, "decode_tokens": 2}, + {"bs": 1, "input_len": 2048, "existing_ctx": 2048, "decode_tokens": 2}, + ] - @pytest.mark.skipif( - not os.path.isdir("/flowsim"), - reason="Local profiling tests must run inside the FlowSim Docker container", + @pytest.mark.parametrize( + "point", + _TP1_POINTS, + ids=[f"bs{p['bs']}_il{p['input_len']}_ctx{p['existing_ctx']}" for p in _TP1_POINTS], ) - def test_local_perf_tp1(self): - """TP=1 perf profiling: traces + parsed CSVs + log files.""" - output_dir = os.path.join(ARTIFACT_DIR, "sched_local_tp1") + def test_local_tp1_all(self, point): + bs = point["bs"] + input_len = point["input_len"] + existing_ctx = point["existing_ctx"] + decode_tokens = point["decode_tokens"] + # ── Step 1: submit ── r = _flowsim_cli( "submit", "--scheduler", "local", - "--collect", "perf", + "--collect", "all", "--model-path", MODEL, "--tp", "1", - "--bs", "1", - "--input-len", "512", - "--decode-tokens", "8", + "--bs", str(bs), + "--input-len", str(input_len), + "--existing-ctx", str(existing_ctx), + "--decode-tokens", str(decode_tokens), "--warmup-n", "2", "--gpus", "1", 
"--local-gpus", "0", - "--output-dir", output_dir, "--extra-server-opts", f"--load-format {LOAD_FORMAT}", ) - if r.returncode != 0: print("STDOUT:", r.stdout[-3000:]) print("STDERR:", r.stderr[-3000:]) assert r.returncode == 0, f"flowsim submit failed (exit {r.returncode})" - # Verify traces and parsed CSVs - _assert_traces(output_dir) - - # Verify log files under output_dir/logs/ - _assert_logs(output_dir) - - # Verify submit output mentions log/trace locations + # Extract job_id from output (line like "flowsim-all-... completed successfully") combined = r.stdout + r.stderr - assert "Traces:" in combined, "Submit output should show trace location" - assert "Logs:" in combined, "Submit output should show log location" - - def test_local_status(self): - """flowsim status --scheduler local should find logs from the previous run.""" - r = _flowsim_cli( - "status", - "--scheduler", "local", - "--job", "flowsim-perf", + job_id = None + for line in combined.splitlines(): + if "flowsim-all-" in line: + for word in line.split(): + if word.startswith("flowsim-all-"): + job_id = word.rstrip(".,;:") + break + if job_id: + break + assert job_id, f"Could not find job_id in submit output:\n{combined[-1000:]}" + + # ── Step 2: list — verify job appears ── + r_list = _flowsim_cli("list", "--scheduler", "local") + assert r_list.returncode == 0, "flowsim list failed" + assert job_id in r_list.stdout, ( + f"Job {job_id} not found in list output:\n{r_list.stdout}" ) - # Should either find logs or say not found — should not crash - assert r.returncode == 0 - def test_local_logs(self): - """flowsim logs --scheduler local should list log files and give paths.""" - r = _flowsim_cli( - "logs", - "--scheduler", "local", - "--job", "flowsim-perf", + # ── Step 3: status — should be Completed (submit is synchronous) ── + r_status = _flowsim_cli("status", "--scheduler", "local", "--job", job_id) + assert r_status.returncode == 0, "flowsim status failed" + status_out = r_status.stdout.lower() + 
assert "completed" in status_out, ( + f"Job {job_id} not completed:\n{r_status.stdout}" ) - assert r.returncode == 0 - output = r.stdout - # Should contain file listing or "No logs" — not crash - assert "Log directory:" in output or "No logs" in output - def test_local_logs_follow(self): - """flowsim logs --follow should show tail -f command.""" - r = _flowsim_cli( - "logs", - "--scheduler", "local", - "--job", "flowsim-perf", - "--follow", + # ── Step 4: validate trace CSVs ── + # Extract output_dir from status output (Traces dir: ...) + output_dir = None + for line in r_status.stdout.splitlines(): + if "Traces dir:" in line: + output_dir = line.split("Traces dir:", 1)[1].strip() + break + assert output_dir and os.path.isdir(output_dir), ( + f"Could not find traces dir in status output:\n{r_status.stdout}" ) - assert r.returncode == 0 - output = r.stdout - assert "tail -f" in output or "No logs" in output + _assert_traces(output_dir) + _assert_logs(output_dir) + _validate_shapes(output_dir, bs=bs, input_len=input_len, existing_ctx=existing_ctx) - def test_local_cancel(self): - """flowsim cancel --scheduler local should attempt docker stop.""" - r = _flowsim_cli( - "cancel", - "--scheduler", "local", - "--job", "flowsim-perf", - ) - assert r.returncode == 0 - out = r.stdout.lower() - assert "stop" in out or "container" in out - def test_local_list(self): - """flowsim list --scheduler local should list jobs from log files.""" - r = _flowsim_cli( - "list", - "--scheduler", "local", +# ===================================================================== +# Cluster setup helpers & fixtures +# ===================================================================== + +def _run_dev_setup(target: str) -> None: + """Run ``dockerfiles/dev-setup.sh `` and assert success.""" + r = subprocess.run( + ["bash", _DEV_SETUP, target], + capture_output=True, text=True, cwd=_PROJECT_ROOT, timeout=300, + ) + if r.returncode != 0: + raise RuntimeError( + f"dev-setup.sh {target} failed 
(exit {r.returncode}):\n" + f"stdout: {r.stdout[-2000:]}\nstderr: {r.stderr[-2000:]}" ) - assert r.returncode == 0 - output = r.stdout - # Should either show jobs or "No jobs found" - assert "JOB_ID" in output or "No jobs found" in output - def test_local_list_status_filter(self): - """flowsim list --status Completed should filter.""" - r = _flowsim_cli( - "list", - "--scheduler", "local", - "--status", "Completed", + +def _run_dev_teardown(target: str) -> None: + """Run ``dockerfiles/dev-teardown.sh ``.""" + subprocess.run( + ["bash", _DEV_TEARDOWN, target], + capture_output=True, text=True, cwd=_PROJECT_ROOT, timeout=120, + ) + + +def _kind_cluster_running() -> bool: + """Check if the Kind cluster named 'flowsim' is reachable.""" + try: + r = subprocess.run( + ["kubectl", "--context", "kind-flowsim", "get", "nodes"], + capture_output=True, text=True, timeout=15, ) - assert r.returncode == 0 + return r.returncode == 0 and "Ready" in r.stdout + except Exception: + return False -# ===================================================================== -# LOCAL SCHEDULER — unit-level tests for JobResult and list_jobs -# ===================================================================== -class TestLocalSchedulerAPI: - """Test LocalScheduler API directly (no subprocess, no GPU).""" +@pytest.fixture(scope="session") +def kind_cluster(): + """Ensure Kind cluster is running; auto-setup if needed. 
- def test_submit_returns_job_result(self): - """LocalScheduler.submit() must return a JobResult, not a string.""" - import tempfile - with tempfile.TemporaryDirectory() as tmpdir: - sched = LocalScheduler(workdir=tmpdir) - spec = ProfileJobSpec( - collect="perf", - model_path="Qwen/Qwen3-8B", - output_dir=os.path.join(tmpdir, "traces"), - ) - # Monkey-patch: make build_shell_command return a trivial command - spec.build_shell_command = lambda: "echo hello" - result = sched.submit(spec) - assert isinstance(result, JobResult), f"Expected JobResult, got {type(result)}" - assert result.scheduler == "local" - assert result.state == "Completed" - assert result.job_id != "" - assert result.output_dir == spec.output_dir - - def test_submit_failed_returns_failed_state(self): - """A failing command should return JobResult with state=Failed.""" - import tempfile - with tempfile.TemporaryDirectory() as tmpdir: - sched = LocalScheduler(workdir=tmpdir) - spec = ProfileJobSpec( - collect="perf", - model_path="Qwen/Qwen3-8B", - output_dir=os.path.join(tmpdir, "traces"), - ) - spec.build_shell_command = lambda: "exit 1" - result = sched.submit(spec) - assert isinstance(result, JobResult) - assert result.state == "Failed" + The cluster is kept alive after the test session to avoid + re-loading the 34 GB image every time. Use ``dev-teardown.sh kind`` + to clean up manually. 
+ """ + if not _kind_cluster_running(): + _run_dev_setup("kind") + assert _kind_cluster_running(), "Kind cluster not reachable after setup" + yield - def test_list_jobs_finds_log_files(self): - """list_jobs() should find jobs from log file names.""" - import tempfile - with tempfile.TemporaryDirectory() as tmpdir: - log_dir = os.path.join(tmpdir, "stage_traces", "logs") - os.makedirs(log_dir) - # Create fake log files - for name in [ - "flowsim-perf-qwen3-8b-bs1-il512_1700000001.stdout.log", - "flowsim-perf-qwen3-8b-bs1-il512_1700000001.stderr.log", - "flowsim-perf-qwen3-8b-bs1-il1024_1700000002.stdout.log", - "flowsim-perf-qwen3-8b-bs1-il1024_1700000002.stderr.log", - ]: - open(os.path.join(log_dir, name), "w").close() - - sched = LocalScheduler(workdir=tmpdir) - jobs = sched.list_jobs() - assert len(jobs) == 2 - assert all("job_id" in j and "state" in j for j in jobs) - - def test_list_jobs_status_filter(self): - """list_jobs(status_filter=...) should filter results.""" - import tempfile - with tempfile.TemporaryDirectory() as tmpdir: - log_dir = os.path.join(tmpdir, "stage_traces", "logs") - os.makedirs(log_dir) - open(os.path.join(log_dir, "flowsim-perf-x_100.stdout.log"), "w").close() - open(os.path.join(log_dir, "flowsim-perf-x_100.stderr.log"), "w").close() - - sched = LocalScheduler(workdir=tmpdir) - assert len(sched.list_jobs(status_filter="Completed")) == 1 - assert len(sched.list_jobs(status_filter="Running")) == 0 - - def test_logs_follow_shows_tail_f(self): - """logs(follow=True) should return a tail -f command.""" - import tempfile - with tempfile.TemporaryDirectory() as tmpdir: - log_dir = os.path.join(tmpdir, "stage_traces", "logs") - os.makedirs(log_dir) - open(os.path.join(log_dir, "flowsim-perf-x_100.stdout.log"), "w").close() - - sched = LocalScheduler(workdir=tmpdir) - text = sched.logs("flowsim-perf-x", follow=True) - assert "tail -f" in text - - def test_cancel_returns_message(self): - """cancel() should attempt docker stop and return a 
message.""" - sched = LocalScheduler() - msg = sched.cancel("some-job") - assert "stop" in msg.lower() or "container" in msg.lower() - - def test_submit_pd_pair_returns_list(self): - """submit_pd_pair() must return list[JobResult].""" - import tempfile - with tempfile.TemporaryDirectory() as tmpdir: - sched = LocalScheduler(workdir=tmpdir) - spec = ProfileJobSpec( - collect="perf", - model_path="Qwen/Qwen3-8B", - output_dir=os.path.join(tmpdir, "traces"), - ) - # Monkey-patch to avoid real profiling - spec.build_shell_command = lambda: "echo hello" - results = sched.submit_pd_pair(spec) - assert isinstance(results, list) - assert len(results) == 2 - assert all(isinstance(r, JobResult) for r in results) - # One should be prefill, one decode - modes = {r.job_id for r in results} - assert any("prefill" in m for m in modes) - assert any("decode" in m for m in modes) + +@pytest.fixture(scope="session") +def slurm_cluster(): + """Ensure Slurm cluster is running; auto-setup if needed. + + Cluster is kept alive after tests. Use ``dev-teardown.sh slurm`` + to clean up manually. + """ + if not _slurm_cluster_running(): + _run_dev_setup("slurm") + assert _slurm_cluster_running(), "Slurm cluster not reachable after setup" + yield # ===================================================================== # K8S SCHEDULER # ===================================================================== class TestK8sScheduler: - """K8s scheduler: dry-run validates YAML structure, real submit to Kind.""" + """K8s scheduler: real submit to Kind cluster. 
- def test_k8s_dry_run_has_volume_and_log_path(self): - """Dry-run YAML should mount output volume and pass --log-dir.""" - r = _flowsim_cli( - "submit", - "--scheduler", "k8s", - "--collect", "perf", - "--model-path", MODEL, - "--k8s-namespace", "default", - "--k8s-pvc", "test-traces", - "--output-dir", "/data/traces", - "--dry-run", - ) - assert r.returncode == 0 - yaml_output = r.stdout - - # Job structure - assert "apiVersion: batch/v1" in yaml_output - assert "kind: Job" in yaml_output + Automatically sets up the Kind cluster via ``dev-setup.sh`` if not + already running. + """ - # PVC volume mount - assert "test-traces" in yaml_output - assert "persistentVolumeClaim" in yaml_output + def test_k8s_real_submit_to_kind(self, kind_cluster): + """Submit a real Job to Kind cluster: submit → list → status → retrieve → validate.""" + import shutil + import tempfile - # output_dir and derived log_dir appear in the command - assert "--output-dir" in yaml_output - assert "/data/traces" in yaml_output - assert "--log-dir" in yaml_output - assert "/data/traces/logs" in yaml_output + job_name = f"test-integ-{int(time.time()) % 100000}" + local_traces = tempfile.mkdtemp(prefix="flowsim-k8s-traces-") + + try: + # ── Step 0: clean stale test traces on host ── + host_traces = os.path.join(_PROJECT_ROOT, "stage_traces") + os.makedirs(host_traces, exist_ok=True) + + # ── Step 1: submit (host mount for trace retrieval) ── + r = _flowsim_cli( + "submit", + "--scheduler", "k8s", + "--collect", "all", + "--model-path", MODEL, + "--tp", "1", + "--bs", "1", + "--input-len", "2048", + "--existing-ctx", "0", + "--decode-tokens", "2", + "--warmup-n", "2", + "--gpus", "1", + "--k8s-namespace", "default", + "--k8s-host-output-dir", "/host-stage-traces", + "--job-name", job_name, + "--extra-server-opts", f"--load-format {LOAD_FORMAT}", + ) + combined = r.stdout + r.stderr + if r.returncode != 0: + print("Submit output:", combined[-3000:]) + assert r.returncode == 0, f"K8s submit failed: 
{combined[-1000:]}" + + # ── Step 2: list — verify job appears ── + r_list = _flowsim_cli("list", "--scheduler", "k8s") + assert r_list.returncode == 0 + assert job_name in r_list.stdout, ( + f"Job {job_name} not in list:\n{r_list.stdout}" + ) - def test_k8s_dry_run_hostpath(self): - """Dry-run with hostPath should have hostPath volume.""" - r = _flowsim_cli( - "submit", - "--scheduler", "k8s", - "--collect", "perf", - "--model-path", MODEL, - "--k8s-namespace", "default", - "--k8s-host-output-dir", "/mnt/traces", - "--dry-run", - ) - assert r.returncode == 0 - assert "hostPath" in r.stdout - assert "/mnt/traces" in r.stdout + # ── Step 3: status — poll until Completed/Succeeded (max 20 min) ── + deadline = time.time() + 1200 + state = "" + while time.time() < deadline: + r_status = _flowsim_cli("status", "--scheduler", "k8s", "--job", job_name) + assert r_status.returncode == 0 + state = r_status.stdout.lower() + if "completed" in state or "succeeded" in state: + break + if "failed" in state: + pytest.fail(f"K8s job failed:\n{r_status.stdout}") + time.sleep(15) + assert "completed" in state or "succeeded" in state, ( + f"K8s job did not complete in time:\n{r_status.stdout}" + ) - def test_k8s_refuses_without_storage(self): - """Submit (not dry-run) without PVC or hostPath should fail.""" - r = _flowsim_cli( - "submit", - "--scheduler", "k8s", - "--collect", "perf", - "--model-path", MODEL, - "--k8s-namespace", "default", - # Explicitly clear any config defaults - "--k8s-pvc", "", - "--k8s-host-output-dir", "", - ) - assert r.returncode != 0 - combined = r.stdout + r.stderr - assert "persistent storage" in combined or "pvc" in combined.lower() + # ── Step 4: traces are on host via Kind mount ── + # output_dir inside container: /flowsim/stage_traces/k8s/{ts} + # host_output_dir on worker: /host-stage-traces + # → host: {project}/stage_traces/k8s/{ts}/ + k8s_traces = os.path.join(host_traces, "k8s") + assert os.path.isdir(k8s_traces), ( + f"No k8s traces dir at 
{k8s_traces}" + ) + # Find the latest timestamped subdir + ts_dirs = sorted(os.listdir(k8s_traces)) + assert ts_dirs, f"No timestamp dirs in {k8s_traces}" + local_traces = os.path.join(k8s_traces, ts_dirs[-1]) - @pytest.mark.skipif( - not os.path.exists(os.path.expanduser("~/.kube/config")), - reason="No kubeconfig — skip K8s real submit (run on host with Kind cluster)", - ) - def test_k8s_real_submit_to_kind(self): - """Submit a real Job to Kind cluster, verify status + logs commands work.""" - job_name = f"test-integ-{int(time.time()) % 100000}" - r = _flowsim_cli( - "submit", - "--scheduler", "k8s", - "--collect", "perf", - "--model-path", MODEL, - "--k8s-namespace", "default", - "--k8s-host-output-dir", "/tmp/flowsim-test-traces", - "--job-name", job_name, - ) - combined = r.stdout + r.stderr + # ── Step 5: validate trace CSVs ── + _assert_traces(local_traces) + _assert_logs(local_traces) + _validate_shapes(local_traces, bs=1, input_len=2048, existing_ctx=0) - if r.returncode != 0: - print("Submit output:", combined[-3000:]) - assert r.returncode == 0, f"K8s submit failed: {combined[-1000:]}" - assert "created" in combined.lower() - - # Verify submit output has location hints - assert "Traces:" in combined - assert "Logs:" in combined - assert "flowsim status" in combined - assert "flowsim logs" in combined - - # Check status - r2 = _flowsim_cli("status", "--scheduler", "k8s", "--job", job_name) - assert r2.returncode == 0 - assert job_name in r2.stdout - - # Check logs (may say "pending" or show pod info) - r3 = _flowsim_cli("logs", "--scheduler", "k8s", "--job", job_name) - assert r3.returncode == 0 - # Should mention kubectl or pod name or "No pods" - assert "kubectl" in r3.stdout or "No pods" in r3.stdout or "Pod:" in r3.stdout - - # Check logs --follow - r3f = _flowsim_cli("logs", "--scheduler", "k8s", "--job", job_name, "--follow") - assert r3f.returncode == 0 - assert "kubectl logs -f" in r3f.stdout - - # Check list - r4 = _flowsim_cli("list", 
"--scheduler", "k8s") - assert r4.returncode == 0 - # Our job should appear in the listing - assert job_name in r4.stdout or "JOB_ID" in r4.stdout - - # Cancel via flowsim cancel - r5 = _flowsim_cli("cancel", "--scheduler", "k8s", "--job", job_name) - assert r5.returncode == 0 - assert "deleted" in r5.stdout.lower() + finally: + # Cleanup: cancel job (traces stay on host for inspection) + _flowsim_cli("cancel", "--scheduler", "k8s", "--job", job_name) # ===================================================================== -# SLURM SCHEDULER — dry-run only (no real cluster) +# SLURM SCHEDULER # ===================================================================== -class TestSlurmScheduler: - """Slurm scheduler: verify sbatch script has correct paths.""" - def test_slurm_dry_run_output_and_log_paths(self): - """Dry-run sbatch script should reference output_dir and log_dir.""" - r = _flowsim_cli( - "submit", - "--scheduler", "slurm", - "--collect", "perf", - "--model-path", MODEL, - "--slurm-partition", "gpu", - "--slurm-rest-url", "http://fake:6820", - "--slurm-jwt-token", "fake-token", - "--output-dir", "/shared/flowsim_traces", - "--dry-run", +def _slurm_cluster_running() -> bool: + """Check if local Slurm test cluster (docker compose) is running.""" + try: + r = subprocess.run( + ["docker", "exec", "slurmctld", "sinfo", "-h"], + capture_output=True, text=True, timeout=10, ) - assert r.returncode == 0 - script = r.stdout + return r.returncode == 0 and r.stdout.strip() != "" + except Exception: + return False - # sbatch directives - assert "#!/bin/bash" in script - assert "#SBATCH --job-name=" in script - assert "#SBATCH --partition=gpu" in script - # output_dir in the profiling command - assert "--output-dir" in script - assert "/shared/flowsim_traces" in script +# CLI prefix for running Slurm commands inside the slurmctld container. +# Uses -i so sbatch can read scripts from stdin. 
+_SLURM_CLI_PREFIX = "docker exec -i slurmctld" - # log_dir = output_dir + /logs/ - assert "--log-dir" in script - assert "/shared/flowsim_traces/logs" in script - def test_slurm_dry_run_default_output_dir(self): - """Default output_dir for Slurm should be ~/flowsim_traces.""" - r = _flowsim_cli( - "submit", - "--scheduler", "slurm", - "--collect", "perf", - "--model-path", MODEL, - "--slurm-partition", "gpu", - "--slurm-rest-url", "http://fake:6820", - "--slurm-jwt-token", "fake-token", - "--dry-run", - ) - assert r.returncode == 0 - assert "flowsim_traces" in r.stdout +class TestSlurmScheduler: + """Slurm scheduler: real submit to local docker-compose cluster. + + Uses CLI mode (sbatch/squeue/scancel) — no slurmrestd needed. + Automatically sets up the Slurm cluster via ``dev-setup.sh slurm`` + if not already running. + """ + + def test_slurm_real_submit(self, slurm_cluster): + """Submit to local Slurm cluster: submit → list → status → retrieve → validate.""" + + # Compute node has /flowsim/stage_traces mounted writable to host. + # output_dir inside the container maps directly to the host. 
+ host_traces = os.path.join(_PROJECT_ROOT, "stage_traces") + os.makedirs(host_traces, exist_ok=True) + ts = int(time.time()) + output_dir = f"/flowsim/stage_traces/slurm/test_{ts}" + + job_id = None + try: + # ── Step 1: submit (CLI mode, container_runtime=none) ── + r = _flowsim_cli( + "submit", + "--scheduler", "slurm", + "--collect", "all", + "--model-path", MODEL, + "--tp", "1", + "--bs", "1", + "--input-len", "2048", + "--existing-ctx", "0", + "--decode-tokens", "2", + "--warmup-n", "2", + "--gpus", "1", + "--slurm-partition", "normal", + "--slurm-submit-via", "cli", + "--slurm-cli-prefix", _SLURM_CLI_PREFIX, + "--slurm-container-runtime", "none", + "--output-dir", output_dir, + "--extra-server-opts", f"--load-format {LOAD_FORMAT}", + ) + combined = r.stdout + r.stderr + if r.returncode != 0: + print("Submit output:", combined[-3000:]) + assert r.returncode == 0, f"Slurm submit failed: {combined[-1000:]}" + + # Extract job_id from output (line like "Submitted batch job 123") + for line in combined.splitlines(): + if "submitted" in line.lower(): + for word in line.split(): + if word.isdigit(): + job_id = word + break + if job_id: + break + assert job_id, f"Could not find job_id in submit output:\n{combined[-1000:]}" + + # ── Step 2: status — poll until Completed (max 20 min) ── + deadline = time.time() + 1200 + state = "" + while time.time() < deadline: + r_status = _flowsim_cli( + "status", "--scheduler", "slurm", + "--job", job_id, + "--slurm-submit-via", "cli", + "--slurm-cli-prefix", _SLURM_CLI_PREFIX, + ) + assert r_status.returncode == 0 + state = r_status.stdout.lower() + if "completed" in state or "succeeded" in state: + break + if "failed" in state: + pytest.fail(f"Slurm job failed:\n{r_status.stdout}") + time.sleep(15) + assert "completed" in state or "succeeded" in state, ( + f"Slurm job did not complete in time:\n{r_status.stdout}" + ) - def test_slurm_dry_run_pd_pair(self): - """PD disaggregation dry-run should produce both scripts with correct 
paths.""" - r = _flowsim_cli( - "submit", - "--scheduler", "slurm", - "--collect", "perf", - "--model-path", MODEL, - "--slurm-partition", "gpu", - "--slurm-rest-url", "http://fake:6820", - "--slurm-jwt-token", "fake-token", - "--output-dir", "/shared/traces", - "--pd", - "--dry-run", - ) - assert r.returncode == 0 - output = r.stdout - assert "PREFILL INSTANCE" in output - assert "DECODE INSTANCE" in output - assert "--disaggregation-mode prefill" in output - assert "--disaggregation-mode decode" in output - # Both scripts should reference the same output_dir - assert output.count("--output-dir") >= 2 - assert output.count("/shared/traces/logs") >= 2 + # ── Step 3: traces are on host via mount ── + slurm_traces = os.path.join(host_traces, "slurm") + assert os.path.isdir(slurm_traces), ( + f"No slurm traces dir at {slurm_traces}" + ) + ts_dirs = sorted(os.listdir(slurm_traces)) + assert ts_dirs, f"No test dirs in {slurm_traces}" + local_traces = os.path.join(slurm_traces, ts_dirs[-1]) + + # ── Step 4: validate trace CSVs ── + _assert_traces(local_traces) + _assert_logs(local_traces) + _validate_shapes(local_traces, bs=1, input_len=2048, existing_ctx=0) + + finally: + # Cleanup: cancel job (traces stay on host for inspection) + if job_id: + _flowsim_cli( + "cancel", "--scheduler", "slurm", + "--job", job_id, + "--slurm-submit-via", "cli", + "--slurm-cli-prefix", _SLURM_CLI_PREFIX, + ) diff --git a/tests/unit/test_scheduler_cli.py b/tests/unit/test_scheduler_cli.py index 1d50f64..2bb0dec 100644 --- a/tests/unit/test_scheduler_cli.py +++ b/tests/unit/test_scheduler_cli.py @@ -281,6 +281,11 @@ def test_time_parse_minutes(self): class TestLocalScheduler: """Tests for local execution backend.""" + @pytest.fixture(autouse=True) + def _skip_image_check(self): + with mock.patch.object(LocalScheduler, "_check_image_exists"): + yield + @pytest.fixture() def spec(self) -> ProfileJobSpec: return ProfileJobSpec( @@ -447,6 +452,11 @@ def test_init_force_overwrite(self, tmp_path: 
Path): class TestCLISubmit: """Tests for `flowsim submit` argument parsing and dry-run.""" + @pytest.fixture(autouse=True) + def _skip_image_check(self): + with mock.patch.object(LocalScheduler, "_check_image_exists"): + yield + def _run(self, *args: str, expect_ok: bool = True) -> str: """Run submit via the Python function, capture stdout.""" from scripts.submit_profile import main as submit_main From 3edd5f4261c5b176e512ac8daba603b1e2448916 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Wed, 18 Mar 2026 20:50:50 +0000 Subject: [PATCH 24/56] slurm: use YYYYMMDD_HHMMSS timestamp for output dirs (consistent with local/k8s) --- scripts/submit_profile.py | 4 +--- tests/integration/test_scheduler_local.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 23089eb..57c730c 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -382,9 +382,7 @@ def main(argv: list[str] | None = None) -> None: if args.scheduler == "local": args.output_dir = f"/flowsim/stage_traces/local/{_ts}" elif args.scheduler == "slurm": - args.output_dir = os.path.expanduser( - f"~/flowsim_traces/slurm/{_ts}" - ) + args.output_dir = f"/flowsim/stage_traces/slurm/{_ts}" else: args.output_dir = f"/flowsim/stage_traces/k8s/{_ts}" diff --git a/tests/integration/test_scheduler_local.py b/tests/integration/test_scheduler_local.py index 62f52fb..2ff2e67 100644 --- a/tests/integration/test_scheduler_local.py +++ b/tests/integration/test_scheduler_local.py @@ -507,8 +507,8 @@ def test_slurm_real_submit(self, slurm_cluster): # output_dir inside the container maps directly to the host. 
 host_traces = os.path.join(_PROJECT_ROOT, "stage_traces") os.makedirs(host_traces, exist_ok=True) - ts = int(time.time()) - output_dir = f"/flowsim/stage_traces/slurm/test_{ts}" + ts = time.strftime("%Y%m%d_%H%M%S") + output_dir = f"/flowsim/stage_traces/slurm/{ts}" job_id = None try: From 9bc2d94f4d11b891ec889967d605d516fe9985ae Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Wed, 18 Mar 2026 20:56:38 +0000 Subject: [PATCH 25/56] docs: add scheduler README with CLI usage and architecture overview --- schedulers/README.md | 385 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 385 insertions(+) create mode 100644 schedulers/README.md diff --git a/schedulers/README.md b/schedulers/README.md new file mode 100644 index 0000000..cf14f3c --- /dev/null +++ b/schedulers/README.md @@ -0,0 +1,385 @@ +# FlowSim Schedulers + +FlowSim 支持三种调度器后端,用于提交 GPU profiling 任务: + +| 后端 | 适用场景 | 运行位置 | 依赖 | +|------|----------|----------|------| +| **local** | 单机开发/测试 | 宿主机 Docker 容器 | Docker + NVIDIA GPU | +| **k8s** | Kubernetes 集群 | K8s Job Pod | `kubernetes` Python 包 | +| **slurm** | HPC 集群 | Slurm 计算节点 | Slurm CLI 或 slurmrestd | + +## 快速上手 + +```bash +# 安装(从 FlowSim 项目根目录) +cd FlowSim +pip install -e . # 或确保 PYTHONPATH 包含项目根目录 + +# 查看帮助 +flowsim --help +flowsim submit --help +``` + +## 通用工作流 + +所有调度器共享相同的 CLI 接口: + +```bash +# 1. 提交任务 +flowsim submit --scheduler <scheduler> --collect <perf|shapes|all> \ + --model-path <model> [选项...] + +# 2. 查看任务列表 +flowsim list --scheduler <scheduler> + +# 3. 查看任务状态 +flowsim status --scheduler <scheduler> --job <job_id> + +# 4. 查看日志 +flowsim logs --scheduler <scheduler> --job <job_id> + +# 5. 取消任务 +flowsim cancel --scheduler <scheduler> --job <job_id> + +# 6. Dry-run(仅打印脚本/manifest,不提交) +flowsim submit --scheduler ... 
--dry-run +``` + +### 通用参数 + +| 参数 | 说明 | 默认值 | +|------|------|--------| +| `--collect` | 收集模式:`perf`(性能) / `shapes`(形状) / `all`(两者) | 必填 | +| `--model-path` | HuggingFace 模型路径 | 必填 | +| `--tp` | Tensor parallelism | `1` | +| `--dp` | Data parallelism | `1` | +| `--bs` | Batch size | `1` | +| `--input-len` | 输入序列长度 | `2048` | +| `--existing-ctx` | 已有 KV cache 长度 | `0` | +| `--decode-tokens` | Decode 生成 token 数 | `32` | +| `--warmup-n` | Warmup 迭代数 | `5` | +| `--image` | Docker 镜像 | `flowsim-image:latest` | +| `--gpus` | GPU 数量 | `1` | +| `--output-dir` | 输出目录(自动生成如不指定) | `stage_traces/{scheduler}/{timestamp}/` | +| `--dry-run` | 仅打印脚本,不提交 | `false` | + +--- + +## 1. Local 调度器 + +直接在宿主机上通过 `docker run` 启动容器运行 profiling。最简单的方式,适合单机开发和测试。 + +### 使用 + +```bash +# 最简单的用法 — 使用 GPU 0 运行 +flowsim submit --scheduler local \ + --collect all \ + --model-path workload/models/configs/Qwen3-235B-A22B \ + --tp 1 --bs 1 --input-len 2048 \ + --gpus 1 --local-gpus 0 \ + --extra-server-opts "--load-format dummy" + +# 多 GPU +flowsim submit --scheduler local \ + --collect perf \ + --model-path Qwen/Qwen3-8B \ + --tp 2 --gpus 2 --local-gpus 0,1 +``` + +### 专有参数 + +| 参数 | 说明 | 默认值 | +|------|------|--------| +| `--local-gpus` | `CUDA_VISIBLE_DEVICES`(如 `0` 或 `0,1`) | 空(使用所有 GPU) | +| `--local-workdir` | 主机工作目录 | FlowSim 项目根目录 | + +### 工作原理 + +1. `render()` 生成一条 `docker run --gpus` 命令 +2. `submit()` 在宿主机执行该容器,同步等待完成 +3. Traces 写入宿主机 `stage_traces/local/{YYYYMMDD_HHMMSS}/` +4. `status()` / `logs()` / `list_jobs()` 扫描日志文件 + +--- + +## 2. 
Kubernetes 调度器 + +将 profiling 任务作为 Kubernetes Job 提交到集群。支持 PVC 和 hostPath 两种存储方式。 + +### 首次配置 + +```bash +flowsim init k8s \ + --kubeconfig ~/.kube/config \ + --namespace default \ + --host-output-dir /host-stage-traces \ + --runtime-class-name nvidia \ + --force +``` + +配置保存到 `~/.flowsim/k8s.yaml`,后续提交时自动读取。 + +### 使用 + +```bash +# 提交到 K8s 集群 +flowsim submit --scheduler k8s \ + --collect all \ + --model-path workload/models/configs/Qwen3-235B-A22B \ + --tp 1 --bs 1 --input-len 2048 --gpus 1 \ + --extra-server-opts "--load-format dummy" + +# 覆盖配置文件中的值 +flowsim submit --scheduler k8s \ + --collect perf \ + --model-path Qwen/Qwen3-8B \ + --k8s-namespace ml-team \ + --k8s-pvc my-traces-pvc \ + --gpus 4 --tp 4 + +# Dry-run 查看生成的 YAML +flowsim submit --scheduler k8s ... --dry-run +``` + +### 专有参数 + +| 参数 | 说明 | 默认值 | +|------|------|--------| +| `--k8s-namespace` | K8s 命名空间 | `default` | +| `--k8s-kubeconfig` | kubeconfig 路径 | `~/.kube/config` | +| `--k8s-context` | kubeconfig context | 当前 context | +| `--k8s-pvc` | PVC 名称(持久存储) | 空 | +| `--k8s-host-output-dir` | hostPath 挂载路径(PVC 为空时使用) | 空 | +| `--k8s-node-selector` | 节点选择标签(可重复),格式 `KEY=VALUE` | 空 | +| `--k8s-service-account` | ServiceAccount | 空 | +| `--k8s-shm-size` | 共享内存大小 | `16Gi` | +| `--k8s-runtime-class` | RuntimeClass(如 `nvidia`,用于 CDI 模式) | 空 | + +### 工作原理 + +1. `render()` 生成 Kubernetes Job YAML/JSON manifest +2. `submit()` 通过 `kubernetes` Python 客户端创建 Job +3. Traces 通过 PVC 或 hostPath 持久化 +4. `status()` / `cancel()` / `list_jobs()` 通过 K8s API 操作 + +### Kind 本地测试集群 + +```bash +# 启动 Kind 集群(GPU passthrough + CDI 模式) +bash dockerfiles/dev-setup.sh kind + +# 运行 K8s 集成测试 +python -m pytest tests/integration/test_scheduler_local.py::TestK8sScheduler -v -x + +# 清理 +bash dockerfiles/dev-teardown.sh kind +``` + +--- + +## 3. 
Slurm 调度器 + +生成 sbatch 脚本并提交到 Slurm 集群。支持两种提交模式: + +- **CLI 模式**(推荐):通过 `sbatch`/`squeue`/`scancel` 命令 +- **REST 模式**:通过 slurmrestd REST API + JWT 认证 + +### 首次配置 + +```bash +# CLI 模式(推荐,无需 slurmrestd) +flowsim init slurm \ + --rest-url http://unused \ + --partition gpu \ + --account my-project \ + --container-runtime none \ + --force + +# REST 模式(需要 slurmrestd) +flowsim init slurm \ + --rest-url https://slurm.example.com:6820 \ + --partition gpu \ + --account my-project \ + --jwt-token-cmd "scontrol token lifespan=3600" \ + --force +``` + +### 使用 + +```bash +# CLI 模式 — 直接调用 sbatch(最常用) +flowsim submit --scheduler slurm \ + --collect all \ + --model-path workload/models/configs/Qwen3-235B-A22B \ + --tp 1 --bs 1 --input-len 2048 --gpus 1 \ + --slurm-partition gpu \ + --slurm-submit-via cli \ + --extra-server-opts "--load-format dummy" + +# CLI 模式 + 远程前缀(通过 docker exec 或 ssh) +flowsim submit --scheduler slurm \ + --slurm-submit-via cli \ + --slurm-cli-prefix "docker exec -i slurmctld" \ + --slurm-partition normal \ + --collect perf --model-path Qwen/Qwen3-8B --gpus 1 + +# REST 模式 +flowsim submit --scheduler slurm \ + --slurm-submit-via rest \ + --slurm-rest-url http://localhost:6820 \ + --slurm-jwt-token "$(scontrol token lifespan=3600 | cut -d= -f2)" \ + --collect perf --model-path Qwen/Qwen3-8B --gpus 1 + +# Dry-run 查看生成的 sbatch 脚本 +flowsim submit --scheduler slurm ... 
--dry-run + +# 查看状态(CLI 模式) +flowsim status --scheduler slurm --job 12345 \ + --slurm-submit-via cli \ + --slurm-cli-prefix "docker exec -i slurmctld" + +# 取消任务 +flowsim cancel --scheduler slurm --job 12345 \ + --slurm-submit-via cli +``` + +### 专有参数 + +| 参数 | 说明 | 默认值 | +|------|------|--------| +| `--slurm-submit-via` | 提交模式:`cli`(sbatch)或 `rest`(slurmrestd) | `rest` | +| `--slurm-cli-prefix` | CLI 命令前缀(如 `"docker exec -i slurmctld"`) | 空 | +| `--slurm-partition` | Slurm 分区 | 空 | +| `--slurm-time` | 任务时间限制 | `02:00:00` | +| `--slurm-account` | 计费账户 | 空 | +| `--slurm-constraint` | 节点约束 | 空 | +| `--slurm-container-runtime` | 容器运行时:`docker` / `enroot` / `none` | `none` | +| `--slurm-container-mounts` | 容器挂载 | 空 | +| `--slurm-module` | `module load` 命令(可重复) | 空 | +| `--slurm-extra-sbatch` | 额外 `#SBATCH` 指令(可重复) | 空 | +| `--slurm-rest-url` | slurmrestd URL(REST 模式需要) | 空 | +| `--slurm-jwt-token` | JWT token(REST 模式需要) | 空 | +| `--slurm-api-version` | slurmrestd API 版本 | `v0.0.40` | +| `--slurm-no-verify-ssl` | 跳过 TLS 验证 | `false` | + +### container_runtime 说明 + +| 值 | 说明 | +|----|------| +| `none` | 直接在计算节点上运行(节点已有 Python/sglang 环境)| +| `docker` | 在分配的节点上 `docker run` | +| `enroot` | 使用 `srun --container-image` (NVIDIA enroot) | + +### 工作原理 + +**CLI 模式:** +1. `render()` 生成完整的 sbatch 脚本(含 `#SBATCH` 指令 + profiling 命令) +2. `submit()` 通过 `sbatch --parsable` 提交(脚本通过 stdin 传入) +3. `status()` 通过 `scontrol show job` 查询(无需 slurmdbd) +4. `cancel()` 通过 `scancel` 取消 +5. `list_jobs()` 通过 `squeue` 列出 + +如果 Slurm 命令不在本地 PATH 中,可通过 `--slurm-cli-prefix` 指定前缀,例如: +- `"docker exec -i slurmctld"` — 通过 Docker 容器 +- `"ssh login-node"` — 通过 SSH + +**REST 模式:** +1. 同上生成 sbatch 脚本 +2. `submit()` 通过 HTTP POST 到 slurmrestd 的 `/slurm/{version}/job/submit` +3. 
所有操作通过 slurmrestd REST API + JWT 认证 + +### Docker Compose 本地测试集群 + +```bash +# 启动 Slurm 集群(slurmctld + 1 计算节点 + 1 GPU) +cd dockerfiles/ +docker compose -f slurm-compose.yaml up -d + +# 检查集群状态 +docker exec slurmctld sinfo + +# 运行 Slurm 集成测试 +python -m pytest tests/integration/test_scheduler_local.py::TestSlurmScheduler -v -x + +# 清理 +docker compose -f slurm-compose.yaml down -v +``` + +--- + +## 配置文件 + +配置保存在 `~/.flowsim/` 目录下,通过 `flowsim init` 生成: + +``` +~/.flowsim/ +├── k8s.yaml # K8s 调度器配置 +└── slurm.yaml # Slurm 调度器配置 +``` + +参数优先级(从高到低): +1. CLI flag(`--slurm-partition gpu`) +2. 环境变量(`FLOWSIM_SLURM_PARTITION=gpu`) +3. 配置文件(`~/.flowsim/slurm.yaml`) +4. 内置默认值 + +### 示例 k8s.yaml + +```yaml +kubeconfig: /home/user/.kube/config +namespace: default +host_output_dir: /host-stage-traces +runtime_class_name: nvidia +shm_size: 16Gi +``` + +### 示例 slurm.yaml + +```yaml +partition: gpu +account: my-project +time: "02:00:00" +container_runtime: none +submit_via: cli +cli_prefix: "" +``` + +--- + +## 输出目录结构 + +所有调度器产生统一的 trace 输出结构: + +``` +stage_traces/{scheduler}/{YYYYMMDD_HHMMSS}/ +├── bs1_input2048_ctx0/ +│ ├── *.trace.json.gz # 原始 trace +│ ├── parsed/*.csv # 解析后的 CSV +│ ├── merged/*_merged.trace.csv # 合并的 trace CSV +│ ├── shape_traces/ # Shape trace(collect=shapes/all) +│ ├── shape_parsed/*.csv # Shape 解析 CSV +│ ├── analysis_extend.json # Extend 阶段分析 +│ └── analysis_decode.json # Decode 阶段分析 +├── logs/ +│ ├── server_*.stdout.log +│ └── server_*.stderr.log +└── sweep_summary.json +``` + +--- + +## PD Disaggregation(实验性) + +支持 Prefill-Decode 分离部署: + +```bash +flowsim submit --scheduler k8s \ + --pd \ + --collect perf \ + --model-path Qwen/Qwen3-235B-A22B-FP8 \ + --tp 4 --gpus 8 \ + --disagg-transfer-backend mooncake +``` + +这会生成两个 Job:一个 prefill 实例,一个 decode 实例。 From a92e32ba6280db4710c87170aa04adae31e12f95 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Wed, 18 Mar 2026 21:32:13 +0000 Subject: [PATCH 26/56] profile: add --sweep for multi-point profiling in one job 
Usage: flowsim submit --scheduler local --collect perf --model-path Qwen/Qwen3-8B \ --sweep 1:2048:0 4:8192:0 16:2048:4096 Or from file: flowsim submit --scheduler local --collect perf --model-path Qwen/Qwen3-8B \ --sweep-file sweep_points.txt Each point is a BS:INPUT_LEN:CTX tuple. One server launch, multiple profile points sequentially. Backwards compatible: without --sweep, --bs/--input-len/--existing-ctx still works as single-point. --- schedulers/base.py | 23 +++++---- scripts/run_stage_profile.py | 90 +++++++++++++++++++++++++++++++++--- scripts/submit_profile.py | 50 ++++++++++++++++++++ 3 files changed, 149 insertions(+), 14 deletions(-) diff --git a/schedulers/base.py b/schedulers/base.py index 3a35682..b80aa46 100644 --- a/schedulers/base.py +++ b/schedulers/base.py @@ -5,7 +5,7 @@ import abc import shlex from dataclasses import dataclass, field -from typing import Optional +from typing import Optional, Sequence @dataclass @@ -55,6 +55,9 @@ class ProfileJobSpec: disagg_prefill_pp: int = 1 disagg_ib_device: str = "" + # -- Sweep: explicit list of (bs, input_len, existing_ctx) tuples -- + sweep_points: list[tuple[int, int, int]] = field(default_factory=list) + # -- Extra server opts (appended verbatim) -- extra_server_opts: str = "" @@ -95,12 +98,6 @@ def build_profile_command(self) -> list[str]: "--launch-server", "--server-opts", self.build_server_opts(), - "--bs", - str(self.bs), - "--input-len", - str(self.input_len), - "--existing-ctx", - str(self.existing_ctx), "--decode-tokens", str(self.decode_tokens), "--warmup-n", @@ -114,6 +111,13 @@ def build_profile_command(self) -> list[str]: "--log-dir", self.log_dir, ] + if self.sweep_points: + for bs, il, ctx in self.sweep_points: + cmd.extend(["--sweep", f"{bs}:{il}:{ctx}"]) + else: + cmd.extend(["--bs", str(self.bs)]) + cmd.extend(["--input-len", str(self.input_len)]) + cmd.extend(["--existing-ctx", str(self.existing_ctx)]) if self.disable_chunked_prefill: cmd.append("--disable-chunked-prefill") 
cmd.extend(["--max-prefill-tokens", str(self.max_prefill_tokens)]) @@ -140,7 +144,10 @@ def default_job_name(self) -> str: if self.job_name: return self.job_name model_short = self.model_path.split("/")[-1].lower().replace(".", "-") - name = f"flowsim-{self.collect}-{model_short}-bs{self.bs}-il{self.input_len}" + if self.sweep_points: + name = f"flowsim-{self.collect}-{model_short}-sweep{len(self.sweep_points)}pt" + else: + name = f"flowsim-{self.collect}-{model_short}-bs{self.bs}-il{self.input_len}" if self.disagg_mode: name += f"-{self.disagg_mode}" return name diff --git a/scripts/run_stage_profile.py b/scripts/run_stage_profile.py index c27d6f3..91f9143 100644 --- a/scripts/run_stage_profile.py +++ b/scripts/run_stage_profile.py @@ -700,6 +700,31 @@ def parse_args(argv: Optional[list] = None) -> argparse.Namespace: default="/flowsim/stage_traces", help="Root directory for trace output", ) + + sweep = p.add_argument_group("sweep (multi-point profiling)") + sweep.add_argument( + "--sweep", + type=str, + nargs="+", + default=[], + metavar="BS:INPUT_LEN:CTX", + help=( + "Profile multiple (bs, input_len, existing_ctx) points in one job. " + "Each value is a colon-separated tuple, e.g. --sweep 1:2048:0 4:8192:0 16:2048:4096. " + "Overrides --bs, --input-len, --existing-ctx." + ), + ) + sweep.add_argument( + "--sweep-file", + type=str, + default="", + metavar="FILE", + help=( + "Read sweep points from a file (one BS:INPUT_LEN:CTX per line, " + "# comments allowed). Overrides --bs, --input-len, --existing-ctx." 
+ ), + ) + srv = p.add_argument_group("server launch (optional)") srv.add_argument( "--launch-server", @@ -721,6 +746,45 @@ def parse_args(argv: Optional[list] = None) -> argparse.Namespace: return p.parse_args(argv) +def _parse_sweep_point(s: str) -> tuple[int, int, int]: + """Parse a ``BS:INPUT_LEN:CTX`` string into an int 3-tuple.""" + parts = s.strip().split(":") + if len(parts) != 3: + raise ValueError( + f"Bad sweep point {s!r}: expected BS:INPUT_LEN:CTX " + f"(e.g. 1:2048:0)" + ) + try: + return int(parts[0]), int(parts[1]), int(parts[2]) + except ValueError: + raise ValueError( + f"Bad sweep point {s!r}: all three values must be integers" + ) + + +def _load_sweep_points(args) -> list[tuple[int, int, int]]: + """Resolve sweep points from --sweep, --sweep-file, or single-point args.""" + if args.sweep and args.sweep_file: + print("[ERROR] --sweep and --sweep-file are mutually exclusive") + raise SystemExit(1) + + points: list[tuple[int, int, int]] = [] + if args.sweep: + for s in args.sweep: + points.append(_parse_sweep_point(s)) + elif args.sweep_file: + with open(args.sweep_file) as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + points.append(_parse_sweep_point(line)) + else: + # Single-point from --bs / --input-len / --existing-ctx + points.append((args.bs, args.input_len, args.existing_ctx)) + return points + + # --------------------------------------------------------------------------- # Phase runners # --------------------------------------------------------------------------- @@ -759,11 +823,11 @@ def _start_server( return proc -def _run_perf(args, summary: list[dict]) -> int: +def _run_perf(args, summary: list[dict], *, bs: int = 0, input_len: int = 0, existing_ctx: int = 0) -> int: """Collect traces for a single (bs, input_len, existing_ctx, decode_tokens) point.""" - bs = args.bs - input_len = args.input_len - existing_ctx = args.existing_ctx + bs = bs or args.bs + input_len = input_len or 
args.input_len + existing_ctx = existing_ctx if (bs != 0) else args.existing_ctx tag = f"bs{bs}_input{input_len}_ctx{existing_ctx}" sub_dir = os.path.join(args.output_dir, tag) @@ -887,6 +951,14 @@ def main(argv: Optional[list] = None) -> int: server_proc = None summary: list[dict] = [] + sweep_points = _load_sweep_points(args) + is_sweep = len(sweep_points) > 1 + + if is_sweep: + print(f"\n[sweep] {len(sweep_points)} points to profile:") + for i, (bs, il, ctx) in enumerate(sweep_points): + print(f" [{i+1}] bs={bs} input_len={il} existing_ctx={ctx}") + print() try: # ================================================================== @@ -908,7 +980,10 @@ def main(argv: Optional[list] = None) -> int: print(" PHASE 1 / 2 : PERF COLLECTION") print("=" * 60 + "\n") server_proc = _start_server(args, disable_cuda_graph=False) - _run_perf(args, summary) + for idx, (bs, il, ctx) in enumerate(sweep_points): + if is_sweep: + print(f"\n[sweep] Point {idx+1}/{len(sweep_points)}") + _run_perf(args, summary, bs=bs, input_len=il, existing_ctx=ctx) _write_summary(args, summary) print("\n[server] Shutting down for shape pass …") kill_server(server_proc) @@ -929,7 +1004,10 @@ def main(argv: Optional[list] = None) -> int: if args.collect == "perf": if args.launch_server: server_proc = _start_server(args, disable_cuda_graph=False) - _run_perf(args, summary) + for idx, (bs, il, ctx) in enumerate(sweep_points): + if is_sweep: + print(f"\n[sweep] Point {idx+1}/{len(sweep_points)}") + _run_perf(args, summary, bs=bs, input_len=il, existing_ctx=ctx) _write_summary(args, summary) return 0 diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 57c730c..f1e3051 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -105,6 +105,28 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: default="", help="Extra server options appended verbatim", ) + wl.add_argument( + "--sweep", + type=str, + nargs="+", + default=[], + 
metavar="BS:INPUT_LEN:CTX", + help=( + "Profile multiple (bs, input_len, existing_ctx) points in one job. " + "Each value is a colon-separated tuple, e.g. --sweep 1:2048:0 4:8192:0. " + "Overrides --bs, --input-len, --existing-ctx." + ), + ) + wl.add_argument( + "--sweep-file", + type=str, + default="", + metavar="FILE", + help=( + "Read sweep points from a file (one BS:INPUT_LEN:CTX per line, " + "# comments allowed). Overrides --bs, --input-len, --existing-ctx." + ), + ) # -- Infrastructure -- infra = p.add_argument_group("infrastructure") @@ -301,7 +323,34 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: return p.parse_args(argv) +def _parse_sweep_points(args) -> list[tuple[int, int, int]]: + """Resolve sweep points from --sweep / --sweep-file args.""" + if args.sweep and args.sweep_file: + sys.exit("Error: --sweep and --sweep-file are mutually exclusive") + points: list[tuple[int, int, int]] = [] + raw: list[str] = [] + if args.sweep: + raw = args.sweep + elif args.sweep_file: + with open(args.sweep_file) as f: + raw = [ + line.strip() + for line in f + if line.strip() and not line.strip().startswith("#") + ] + for s in raw: + parts = s.strip().split(":") + if len(parts) != 3: + sys.exit(f"Bad sweep point {s!r}: expected BS:INPUT_LEN:CTX") + try: + points.append((int(parts[0]), int(parts[1]), int(parts[2]))) + except ValueError: + sys.exit(f"Bad sweep point {s!r}: all three values must be integers") + return points + + def _build_spec(args: argparse.Namespace) -> ProfileJobSpec: + sweep_points = _parse_sweep_points(args) return ProfileJobSpec( collect=args.collect, model_path=args.model_path, @@ -325,6 +374,7 @@ def _build_spec(args: argparse.Namespace) -> ProfileJobSpec: disagg_bootstrap_port=args.disagg_bootstrap_port, disagg_prefill_pp=args.disagg_prefill_pp, disagg_ib_device=args.disagg_ib_device, + sweep_points=sweep_points, ) From 0aeff899ec566237b25315a81f13d4510f9a3293 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Wed, 18 
Mar 2026 21:42:08 +0000 Subject: [PATCH 27/56] test: add sweep integration tests (inline + file) Two tests in TestLocalSweep: - test_sweep_inline: --sweep 1:2048:0 1:4096:0 1:2048:2048 - test_sweep_file: same points read from a temp file Also fix: use single --sweep with multiple values (nargs=+) instead of repeated --sweep flags which argparse would override. --- schedulers/base.py | 3 +- tests/integration/test_scheduler_local.py | 122 ++++++++++++++++++++++ 2 files changed, 124 insertions(+), 1 deletion(-) diff --git a/schedulers/base.py b/schedulers/base.py index b80aa46..0641f41 100644 --- a/schedulers/base.py +++ b/schedulers/base.py @@ -112,8 +112,9 @@ def build_profile_command(self) -> list[str]: self.log_dir, ] if self.sweep_points: + cmd.append("--sweep") for bs, il, ctx in self.sweep_points: - cmd.extend(["--sweep", f"{bs}:{il}:{ctx}"]) + cmd.append(f"{bs}:{il}:{ctx}") else: cmd.extend(["--bs", str(self.bs)]) cmd.extend(["--input-len", str(self.input_len)]) diff --git a/tests/integration/test_scheduler_local.py b/tests/integration/test_scheduler_local.py index 2ff2e67..8815250 100644 --- a/tests/integration/test_scheduler_local.py +++ b/tests/integration/test_scheduler_local.py @@ -47,6 +47,7 @@ import os import subprocess import sys +import tempfile import time import pytest @@ -592,3 +593,124 @@ def test_slurm_real_submit(self, slurm_cluster): "--slurm-submit-via", "cli", "--slurm-cli-prefix", _SLURM_CLI_PREFIX, ) + + +# ===================================================================== +# SWEEP — multi-point profiling in a single job +# ===================================================================== + +# Three lightweight points: different (bs, input_len, existing_ctx) +_SWEEP_POINTS = [ + (1, 2048, 0), + (1, 4096, 0), + (1, 2048, 2048), +] + + +def _assert_sweep_output(host_output_dir: str, points: list[tuple[int, int, int]]) -> None: + """Validate that every sweep point produced traces and parsed CSVs.""" + for bs, il, ctx in points: + tag = 
f"bs{bs}_input{il}_ctx{ctx}" + point_dir = os.path.join(host_output_dir, tag) + assert os.path.isdir(point_dir), f"Missing sweep point dir: {point_dir}" + _assert_traces(point_dir) + + # sweep_summary.json should exist at the root + summary_path = os.path.join(host_output_dir, "sweep_summary.json") + assert os.path.isfile(summary_path), f"Missing {summary_path}" + with open(summary_path) as f: + summary = json.load(f) + assert len(summary) == len(points), ( + f"Expected {len(points)} entries in sweep_summary.json, got {len(summary)}" + ) + for entry in summary: + assert entry["traces"] > 0, f"Point {entry} has 0 traces" + + +class TestLocalSweep: + """Multi-point sweep via ``--sweep`` and ``--sweep-file`` on local scheduler. + + Validates that one job profiles all requested points and produces + correct directory structure, traces, and sweep_summary.json. + """ + + def test_sweep_inline(self): + """Submit a 3-point sweep using inline --sweep tuples.""" + sweep_args = [f"{bs}:{il}:{ctx}" for bs, il, ctx in _SWEEP_POINTS] + + r = _flowsim_cli( + "submit", + "--scheduler", "local", + "--collect", "perf", + "--model-path", MODEL, + "--tp", "1", + "--decode-tokens", "2", + "--warmup-n", "2", + "--gpus", "1", + "--local-gpus", "0", + "--extra-server-opts", f"--load-format {LOAD_FORMAT}", + "--sweep", *sweep_args, + ) + combined = r.stdout + r.stderr + if r.returncode != 0: + print("STDOUT:", r.stdout[-3000:]) + print("STDERR:", r.stderr[-3000:]) + assert r.returncode == 0, f"sweep submit failed (exit {r.returncode})" + + # Find host output dir from submit output + output_dir = None + for line in combined.splitlines(): + if "Traces:" in line: + output_dir = line.split("Traces:", 1)[1].strip() + break + assert output_dir and os.path.isdir(output_dir), ( + f"Could not find traces dir in output:\n{combined[-1000:]}" + ) + + _assert_sweep_output(output_dir, _SWEEP_POINTS) + _assert_logs(output_dir) + + def test_sweep_file(self): + """Submit a 3-point sweep reading points from 
a file.""" + with tempfile.NamedTemporaryFile( + mode="w", suffix=".txt", delete=False, prefix="sweep_" + ) as f: + f.write("# bs:input_len:existing_ctx\n") + for bs, il, ctx in _SWEEP_POINTS: + f.write(f"{bs}:{il}:{ctx}\n") + sweep_file = f.name + + try: + r = _flowsim_cli( + "submit", + "--scheduler", "local", + "--collect", "perf", + "--model-path", MODEL, + "--tp", "1", + "--decode-tokens", "2", + "--warmup-n", "2", + "--gpus", "1", + "--local-gpus", "0", + "--extra-server-opts", f"--load-format {LOAD_FORMAT}", + "--sweep-file", sweep_file, + ) + combined = r.stdout + r.stderr + if r.returncode != 0: + print("STDOUT:", r.stdout[-3000:]) + print("STDERR:", r.stderr[-3000:]) + assert r.returncode == 0, f"sweep-file submit failed (exit {r.returncode})" + + # Find host output dir from submit output + output_dir = None + for line in combined.splitlines(): + if "Traces:" in line: + output_dir = line.split("Traces:", 1)[1].strip() + break + assert output_dir and os.path.isdir(output_dir), ( + f"Could not find traces dir in output:\n{combined[-1000:]}" + ) + + _assert_sweep_output(output_dir, _SWEEP_POINTS) + _assert_logs(output_dir) + finally: + os.unlink(sweep_file) From f58d791f01d1f68046b0a01aba20b75b9f901182 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Wed, 18 Mar 2026 22:08:47 +0000 Subject: [PATCH 28/56] refactor: dedup shared utilities, deprecate Slurm REST mode - Extract resolve_default() to config.py (was _d() duplicated in submit/status) - Extract parse_sweep_point()/load_sweep_file() to scripts/__init__.py - K8s: submit() reuses _load_k8s() instead of duplicating kubeconfig logic - K8s: remove unused kubernetes imports in status()/logs() - Local: move inline imports (glob/re/shlex/threading) to module level - Local: remove dead if-branch in list_jobs (always set Completed) - Slurm: default submit_via='cli', deprecate REST mode with DeprecationWarning - Slurm: add TODO for _logs_cli (currently returns status info only) - CLI: flowsim init slurm 
supports --submit-via/--cli-prefix, rest-url optional - Template: slurm.yaml updated for CLI-first workflow - run_stage_profile: fix _run_perf sentinel bs=0 -> Optional[int] --- schedulers/config.py | 5 +++ schedulers/k8s.py | 63 +++++++++++---------------------- schedulers/local.py | 17 +++------ schedulers/slurm.py | 24 +++++++++---- schedulers/templates/slurm.yaml | 14 ++++---- scripts/__init__.py | 36 +++++++++++++++++++ scripts/cli.py | 19 ++++++---- scripts/run_stage_profile.py | 44 ++++++----------------- scripts/status_profile.py | 8 ++--- scripts/submit_profile.py | 40 ++++++++------------- 10 files changed, 131 insertions(+), 139 deletions(-) diff --git a/schedulers/config.py b/schedulers/config.py index 185c87f..e228cb3 100644 --- a/schedulers/config.py +++ b/schedulers/config.py @@ -123,3 +123,8 @@ def cfg_get(cfg: dict, key: str, fallback: str = "") -> str: if val is not None: return str(val) return fallback + + +def resolve_default(env_var: str, cfg: dict, key: str, fallback: str = "") -> str: + """Resolve a config value: env var > config file > fallback.""" + return os.environ.get(env_var, "") or cfg_get(cfg, key, fallback) diff --git a/schedulers/k8s.py b/schedulers/k8s.py index d29df96..741721e 100644 --- a/schedulers/k8s.py +++ b/schedulers/k8s.py @@ -148,38 +148,9 @@ def _build_job_dict(self, spec: ProfileJobSpec) -> dict: def submit(self, spec: ProfileJobSpec) -> JobResult: """Submit via the ``kubernetes`` Python client (``pip install kubernetes``).""" - try: - from kubernetes import client as k8s_client, config as k8s_config - except ImportError: - raise RuntimeError( - "The 'kubernetes' package is required for --submit. 
" - "Install it with: pip install kubernetes" - ) - - # Load kubeconfig / in-cluster config - config_kwargs: dict = {} - if self.kubeconfig: - config_kwargs["config_file"] = self.kubeconfig - if self.context: - config_kwargs["context"] = self.context - - try: - k8s_config.load_kube_config(**config_kwargs) - except k8s_config.ConfigException: - try: - k8s_config.load_incluster_config() - except k8s_config.ConfigException: - hint = "" - if not self.kubeconfig: - hint = " Try --k8s-kubeconfig /path/to/kubeconfig." - raise RuntimeError( - "No valid Kubernetes configuration found. " - "Checked kubeconfig file and in-cluster environment." - + hint - ) + batch_api, _ = self._load_k8s() body = self._build_job_dict(spec) - batch_api = k8s_client.BatchV1Api() resp = batch_api.create_namespaced_job( namespace=self.namespace, body=body, @@ -197,8 +168,17 @@ def submit(self, spec: ProfileJobSpec) -> JobResult: # ----------------------------------------------------------------- def _load_k8s(self): - """Load kubeconfig and return (BatchV1Api, CoreV1Api).""" - from kubernetes import client as k8s_client, config as k8s_config + """Load kubeconfig and return (BatchV1Api, CoreV1Api). + + Raises RuntimeError with actionable message on failure. + """ + try: + from kubernetes import client as k8s_client, config as k8s_config + except ImportError: + raise RuntimeError( + "The 'kubernetes' package is required. " + "Install it with: pip install kubernetes" + ) config_kwargs: dict = {} if self.kubeconfig: @@ -208,7 +188,14 @@ def _load_k8s(self): try: k8s_config.load_kube_config(**config_kwargs) except k8s_config.ConfigException: - k8s_config.load_incluster_config() + try: + k8s_config.load_incluster_config() + except k8s_config.ConfigException: + hint = " Try --k8s-kubeconfig /path/to/kubeconfig." if not self.kubeconfig else "" + raise RuntimeError( + "No valid Kubernetes configuration found. " + "Checked kubeconfig file and in-cluster environment." 
+ hint + ) return k8s_client.BatchV1Api(), k8s_client.CoreV1Api() @@ -226,11 +213,6 @@ def cancel(self, job_id: str) -> str: def status(self, job_id: str) -> dict: """Query K8s Job status by job name.""" - try: - from kubernetes import client as k8s_client - except ImportError: - raise RuntimeError("pip install kubernetes") - batch_api, core_api = self._load_k8s() job = batch_api.read_namespaced_job(name=job_id, namespace=self.namespace) @@ -278,11 +260,6 @@ def status(self, job_id: str) -> dict: def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: """Show where logs are and how to access them for a K8s Job.""" - try: - from kubernetes import client as k8s_client - except ImportError: - raise RuntimeError("pip install kubernetes") - _, core_api = self._load_k8s() pods = core_api.list_namespaced_pod( diff --git a/schedulers/local.py b/schedulers/local.py index 17dd3a0..eeaa020 100644 --- a/schedulers/local.py +++ b/schedulers/local.py @@ -7,9 +7,13 @@ from __future__ import annotations +import glob import os +import re +import shlex import subprocess import sys +import threading import time from schedulers.base import BaseScheduler, JobResult, ProfileJobSpec @@ -17,7 +21,6 @@ def _shell_quote(s: str) -> str: """Quote a string for safe embedding in a bash -c '...' 
invocation.""" - import shlex return shlex.quote(s) @@ -160,7 +163,6 @@ def submit(self, spec: ProfileJobSpec) -> JobResult: stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) - import threading def _tee(src, dest_file, dest_stream): for line in src: @@ -217,7 +219,6 @@ def cancel(self, job_id: str) -> str: def _find_log_dirs(self) -> list[str]: """Find all log directories under stage_traces/{scheduler}/*/logs/.""" - import glob base = os.path.join(self.workdir, "stage_traces", "local") # New layout: stage_traces/local/{ts}/logs/ dirs = sorted(glob.glob(os.path.join(base, "*/logs"))) @@ -232,8 +233,6 @@ def status(self, job_id: str) -> dict: ``job_id`` is the job name prefix used in log filenames. """ - import glob - matches = [] for log_dir in self._find_log_dirs(): matches.extend(sorted(glob.glob( @@ -264,8 +263,6 @@ def status(self, job_id: str) -> dict: def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: """List log files for a local job and print access commands.""" - import glob - matches = [] for log_dir in self._find_log_dirs(): matches.extend(sorted(glob.glob( @@ -317,9 +314,6 @@ def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: def list_jobs(self, *, status_filter: str = "") -> list[dict]: """List local jobs by scanning log files.""" - import glob - import re - matches = [] for log_dir in self._find_log_dirs(): matches.extend(sorted(glob.glob( @@ -340,9 +334,6 @@ def list_jobs(self, *, status_filter: str = "") -> list[dict]: stderr_size = os.path.getsize(stderr) if os.path.exists(stderr) else 0 # If stderr has content, might have failed; otherwise completed state = "Completed" - if stderr_size > 0: - # Check if there's an error indicator in stderr - state = "Completed" # local jobs are synchronous; if log exists, it finished jobs.append({ "job_id": name, "name": name, diff --git a/schedulers/slurm.py b/schedulers/slurm.py index 6615fad..b297fec 100644 --- a/schedulers/slurm.py +++ b/schedulers/slurm.py 
@@ -6,11 +6,11 @@ Two submission modes are supported: -* **rest** (default) — POST the script to a slurmrestd endpoint. - Requires ``rest_url`` and ``jwt_token``. -* **cli** — pipe the script to ``sbatch`` via subprocess. +* **cli** (default) — pipe the script to ``sbatch`` via subprocess. Requires ``sbatch``/``squeue``/``scancel`` on PATH (or reachable via ``cli_prefix``, e.g. ``"docker exec slurmctld"``). +* **rest** (deprecated) — POST the script to a slurmrestd endpoint. + Requires ``rest_url`` and ``jwt_token``. """ from __future__ import annotations @@ -66,8 +66,8 @@ class SlurmScheduler(BaseScheduler): extra_sbatch : list[str] Additional ``#SBATCH`` lines, each *without* the ``#SBATCH`` prefix. submit_via : str - ``"rest"`` (default) — use slurmrestd REST API. - ``"cli"`` — use ``sbatch`` / ``squeue`` / ``scancel`` subprocess. + ``"cli"`` (default) — use ``sbatch`` / ``squeue`` / ``scancel`` subprocess. + ``"rest"`` (deprecated) — use slurmrestd REST API. cli_prefix : str Shell prefix for CLI commands (e.g. ``"docker exec -i slurmctld"``). Only used when ``submit_via="cli"``. @@ -88,7 +88,7 @@ def __init__( container_mounts: str = "", modules: list[str] | None = None, extra_sbatch: list[str] | None = None, - submit_via: str = "rest", + submit_via: str = "cli", cli_prefix: str = "", ) -> None: self.partition = partition @@ -106,6 +106,16 @@ def __init__( self.submit_via = submit_via self.cli_prefix = cli_prefix + if self.submit_via != "cli": + import warnings + warnings.warn( + "Slurm REST mode (slurmrestd) is deprecated and will be " + "removed in a future release. 
Use submit_via='cli' " + "(sbatch) instead.", + DeprecationWarning, + stacklevel=2, + ) + def render(self, spec: ProfileJobSpec) -> str: job_name = spec.default_job_name() cmd = spec.build_shell_command() @@ -415,6 +425,8 @@ def _status_cli(self, job_id: str) -> dict: } def _logs_cli(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: + # TODO: read actual Slurm log file (StdOut from scontrol) + # and support tail/follow properly. info = self._status_cli(job_id) return info["message"] diff --git a/schedulers/templates/slurm.yaml b/schedulers/templates/slurm.yaml index 0910f4a..3ff2d0a 100644 --- a/schedulers/templates/slurm.yaml +++ b/schedulers/templates/slurm.yaml @@ -8,17 +8,19 @@ # # CLI flags and env vars can override individual values. -# REQUIRED — slurmrestd endpoint -rest_url: "" # e.g. https://slurm.corp.com:6820 - -# REQUIRED — authentication (pick one) -# jwt_token: "" # not recommended — stored in plaintext -jwt_token_cmd: "" # e.g. "scontrol token lifespan=3600" +# REQUIRED — submission mode +submit_via: "cli" # "cli" (sbatch, default) or "rest" (deprecated) +cli_prefix: "" # e.g. "docker exec -i slurmctld" (optional) # REQUIRED — cluster settings partition: "" # e.g. gpu-h100 account: "" # e.g. my-project +# REST mode (deprecated) — only needed if submit_via: rest +rest_url: "" # e.g. https://slurm.corp.com:6820 +# jwt_token: "" # not recommended — stored in plaintext +jwt_token_cmd: "" # e.g. "scontrol token lifespan=3600" + # Optional api_version: "v0.0.40" time: "02:00:00" diff --git a/scripts/__init__.py b/scripts/__init__.py index e69de29..e785b75 100644 --- a/scripts/__init__.py +++ b/scripts/__init__.py @@ -0,0 +1,36 @@ +"""Shared utilities for FlowSim CLI scripts.""" + + +def parse_sweep_point(s: str) -> tuple[int, int, int]: + """Parse a ``BS:INPUT_LEN:CTX`` string into an int 3-tuple. + + Raises :class:`ValueError` on bad input. 
+ """ + parts = s.strip().split(":") + if len(parts) != 3: + raise ValueError( + f"Bad sweep point {s!r}: expected BS:INPUT_LEN:CTX " + f"(e.g. 1:2048:0)" + ) + try: + return int(parts[0]), int(parts[1]), int(parts[2]) + except ValueError: + raise ValueError( + f"Bad sweep point {s!r}: all three values must be integers" + ) + + +def load_sweep_file(path: str) -> list[tuple[int, int, int]]: + """Read sweep points from a file (one ``BS:INPUT_LEN:CTX`` per line). + + Blank lines and ``#`` comments are skipped. + Raises :class:`ValueError` on bad entries. + """ + points: list[tuple[int, int, int]] = [] + with open(path) as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + points.append(parse_sweep_point(line)) + return points diff --git a/scripts/cli.py b/scripts/cli.py index 135ed84..fc4d0cb 100644 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -42,12 +42,17 @@ def _init_k8s_parser(sub: argparse._SubParsersAction) -> None: def _init_slurm_parser(sub: argparse._SubParsersAction) -> None: p = sub.add_parser("slurm", help="Configure Slurm scheduler") - p.add_argument("--rest-url", required=True, - help="slurmrestd endpoint URL (REQUIRED)") p.add_argument("--partition", required=True, help="Slurm partition (REQUIRED)") - p.add_argument("--account", required=True, - help="Slurm account (REQUIRED)") + p.add_argument("--account", default="", + help="Slurm account") + p.add_argument("--submit-via", default="cli", + choices=["cli", "rest"], + help="Submission mode (default: cli)") + p.add_argument("--cli-prefix", default="", + help='CLI mode prefix, e.g. "docker exec -i slurmctld"') + p.add_argument("--rest-url", default="", + help="slurmrestd endpoint URL (REST mode only, deprecated)") p.add_argument("--jwt-token-cmd", default="", help='Command to get JWT token, e.g. 
"scontrol token lifespan=3600"') p.add_argument("--jwt-token", default="", @@ -107,10 +112,12 @@ def _cmd_init(argv: list[str]) -> int: dst = _CONFIG_DIR / "k8s.yaml" elif args.scheduler == "slurm": - if not args.jwt_token_cmd and not args.jwt_token: - print("Error: provide --jwt-token-cmd or --jwt-token", file=sys.stderr) + if args.submit_via == "rest" and not args.jwt_token_cmd and not args.jwt_token: + print("Error: REST mode requires --jwt-token-cmd or --jwt-token", file=sys.stderr) return 1 cfg = { + "submit_via": args.submit_via, + "cli_prefix": args.cli_prefix, "rest_url": args.rest_url, "jwt_token_cmd": args.jwt_token_cmd, "jwt_token": args.jwt_token, diff --git a/scripts/run_stage_profile.py b/scripts/run_stage_profile.py index 91f9143..00dce4b 100644 --- a/scripts/run_stage_profile.py +++ b/scripts/run_stage_profile.py @@ -107,6 +107,7 @@ ) from utils.net import wait_for_port from utils.shape_merge import merge_shapes_dir +from scripts import load_sweep_file, parse_sweep_point # --------------------------------------------------------------------------- # Defaults @@ -746,43 +747,18 @@ def parse_args(argv: Optional[list] = None) -> argparse.Namespace: return p.parse_args(argv) -def _parse_sweep_point(s: str) -> tuple[int, int, int]: - """Parse a ``BS:INPUT_LEN:CTX`` string into an int 3-tuple.""" - parts = s.strip().split(":") - if len(parts) != 3: - raise ValueError( - f"Bad sweep point {s!r}: expected BS:INPUT_LEN:CTX " - f"(e.g. 
1:2048:0)" - ) - try: - return int(parts[0]), int(parts[1]), int(parts[2]) - except ValueError: - raise ValueError( - f"Bad sweep point {s!r}: all three values must be integers" - ) - - def _load_sweep_points(args) -> list[tuple[int, int, int]]: """Resolve sweep points from --sweep, --sweep-file, or single-point args.""" if args.sweep and args.sweep_file: print("[ERROR] --sweep and --sweep-file are mutually exclusive") raise SystemExit(1) - points: list[tuple[int, int, int]] = [] if args.sweep: - for s in args.sweep: - points.append(_parse_sweep_point(s)) - elif args.sweep_file: - with open(args.sweep_file) as f: - for line in f: - line = line.strip() - if not line or line.startswith("#"): - continue - points.append(_parse_sweep_point(line)) - else: - # Single-point from --bs / --input-len / --existing-ctx - points.append((args.bs, args.input_len, args.existing_ctx)) - return points + return [parse_sweep_point(s) for s in args.sweep] + if args.sweep_file: + return load_sweep_file(args.sweep_file) + # Single-point from --bs / --input-len / --existing-ctx + return [(args.bs, args.input_len, args.existing_ctx)] # --------------------------------------------------------------------------- @@ -823,11 +799,11 @@ def _start_server( return proc -def _run_perf(args, summary: list[dict], *, bs: int = 0, input_len: int = 0, existing_ctx: int = 0) -> int: +def _run_perf(args, summary: list[dict], *, bs: Optional[int] = None, input_len: Optional[int] = None, existing_ctx: Optional[int] = None) -> int: """Collect traces for a single (bs, input_len, existing_ctx, decode_tokens) point.""" - bs = bs or args.bs - input_len = input_len or args.input_len - existing_ctx = existing_ctx if (bs != 0) else args.existing_ctx + bs = bs if bs is not None else args.bs + input_len = input_len if input_len is not None else args.input_len + existing_ctx = existing_ctx if existing_ctx is not None else args.existing_ctx tag = f"bs{bs}_input{input_len}_ctx{existing_ctx}" sub_dir = 
os.path.join(args.output_dir, tag) diff --git a/scripts/status_profile.py b/scripts/status_profile.py index 4882d11..085d79e 100644 --- a/scripts/status_profile.py +++ b/scripts/status_profile.py @@ -29,17 +29,15 @@ from __future__ import annotations import argparse -import os import sys -from schedulers.config import cfg_get, load_k8s_config, load_slurm_config, resolve_jwt_token +from schedulers.config import cfg_get, load_k8s_config, load_slurm_config, resolve_default, resolve_jwt_token from schedulers.k8s import K8sScheduler from schedulers.local import LocalScheduler from schedulers.slurm import SlurmScheduler -def _d(env_var: str, cfg: dict, key: str, fallback: str = "") -> str: - return os.environ.get(env_var, "") or cfg_get(cfg, key, fallback) +_d = resolve_default def _add_scheduler_args(p: argparse.ArgumentParser) -> None: @@ -101,7 +99,7 @@ def _add_scheduler_specific_args(p: argparse.ArgumentParser, scheduler: str) -> p.add_argument( "--slurm-submit-via", choices=["rest", "cli"], - default=cfg_get(slurm_cfg, "submit_via", "rest"), + default=cfg_get(slurm_cfg, "submit_via", "cli"), ) p.add_argument( "--slurm-cli-prefix", diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index f1e3051..8999212 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -51,15 +51,15 @@ import sys from schedulers.base import ProfileJobSpec -from schedulers.config import cfg_get, load_k8s_config, load_slurm_config, resolve_jwt_token +from schedulers.config import cfg_get, load_k8s_config, load_slurm_config, resolve_default, resolve_jwt_token from schedulers.k8s import K8sScheduler from schedulers.local import LocalScheduler from schedulers.slurm import SlurmScheduler +from scripts import load_sweep_file, parse_sweep_point -def _d(env_var: str, cfg: dict, key: str, fallback: str = "") -> str: - """Resolve default: env var > config file > fallback.""" - return os.environ.get(env_var, "") or cfg_get(cfg, key, fallback) +# Short alias for argparse 
default= expressions +_d = resolve_default def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: @@ -311,8 +311,8 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: slurm.add_argument( "--slurm-submit-via", choices=["rest", "cli"], - default=cfg_get(slurm_cfg, "submit_via", "rest"), - help="Submission mode: rest (slurmrestd) or cli (sbatch subprocess)", + default=cfg_get(slurm_cfg, "submit_via", "cli"), + help="Submission mode: cli (sbatch subprocess) or rest (slurmrestd, deprecated)", ) slurm.add_argument( "--slurm-cli-prefix", @@ -327,26 +327,14 @@ def _parse_sweep_points(args) -> list[tuple[int, int, int]]: """Resolve sweep points from --sweep / --sweep-file args.""" if args.sweep and args.sweep_file: sys.exit("Error: --sweep and --sweep-file are mutually exclusive") - points: list[tuple[int, int, int]] = [] - raw: list[str] = [] - if args.sweep: - raw = args.sweep - elif args.sweep_file: - with open(args.sweep_file) as f: - raw = [ - line.strip() - for line in f - if line.strip() and not line.strip().startswith("#") - ] - for s in raw: - parts = s.strip().split(":") - if len(parts) != 3: - sys.exit(f"Bad sweep point {s!r}: expected BS:INPUT_LEN:CTX") - try: - points.append((int(parts[0]), int(parts[1]), int(parts[2]))) - except ValueError: - sys.exit(f"Bad sweep point {s!r}: all three values must be integers") - return points + try: + if args.sweep: + return [parse_sweep_point(s) for s in args.sweep] + if args.sweep_file: + return load_sweep_file(args.sweep_file) + except ValueError as e: + sys.exit(str(e)) + return [] def _build_spec(args: argparse.Namespace) -> ProfileJobSpec: From 3152a72383acc3870804b7326529e2e72236c7b3 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Wed, 18 Mar 2026 22:15:23 +0000 Subject: [PATCH 29/56] review: fix remaining issues (stale docstring, unused vars, README defaults) - slurm.py: fix module docstring (no longer says 'posts to slurmrestd') - local.py: remove unused stderr/stderr_size vars 
in list_jobs() - k8s.py: extract _k8s_job_state() helper (was duplicated in status+list_jobs) - README: update Slurm default to cli, mark REST as deprecated, fix init example --- schedulers/README.md | 13 ++++++------- schedulers/k8s.py | 31 +++++++++++++------------------ schedulers/local.py | 3 --- schedulers/slurm.py | 3 +-- 4 files changed, 20 insertions(+), 30 deletions(-) diff --git a/schedulers/README.md b/schedulers/README.md index cf14f3c..7f99f4e 100644 --- a/schedulers/README.md +++ b/schedulers/README.md @@ -182,21 +182,20 @@ bash dockerfiles/dev-teardown.sh kind 生成 sbatch 脚本并提交到 Slurm 集群。支持两种提交模式: -- **CLI 模式**(推荐):通过 `sbatch`/`squeue`/`scancel` 命令 -- **REST 模式**:通过 slurmrestd REST API + JWT 认证 +- **CLI 模式**(推荐,默认):通过 `sbatch`/`squeue`/`scancel` 命令 +- **REST 模式**(已弃用):通过 slurmrestd REST API + JWT 认证 ### 首次配置 ```bash # CLI 模式(推荐,无需 slurmrestd) flowsim init slurm \ - --rest-url http://unused \ --partition gpu \ --account my-project \ --container-runtime none \ --force -# REST 模式(需要 slurmrestd) +# REST 模式(已弃用,需要 slurmrestd) flowsim init slurm \ --rest-url https://slurm.example.com:6820 \ --partition gpu \ @@ -224,7 +223,7 @@ flowsim submit --scheduler slurm \ --slurm-partition normal \ --collect perf --model-path Qwen/Qwen3-8B --gpus 1 -# REST 模式 +# REST 模式(已弃用) flowsim submit --scheduler slurm \ --slurm-submit-via rest \ --slurm-rest-url http://localhost:6820 \ @@ -248,7 +247,7 @@ flowsim cancel --scheduler slurm --job 12345 \ | 参数 | 说明 | 默认值 | |------|------|--------| -| `--slurm-submit-via` | 提交模式:`cli`(sbatch)或 `rest`(slurmrestd) | `rest` | +| `--slurm-submit-via` | 提交模式:`cli`(sbatch)或 `rest`(slurmrestd,已弃用) | `cli` | | `--slurm-cli-prefix` | CLI 命令前缀(如 `"docker exec -i slurmctld"`) | 空 | | `--slurm-partition` | Slurm 分区 | 空 | | `--slurm-time` | 任务时间限制 | `02:00:00` | @@ -284,7 +283,7 @@ flowsim cancel --scheduler slurm --job 12345 \ - `"docker exec -i slurmctld"` — 通过 Docker 容器 - `"ssh login-node"` — 通过 SSH -**REST 模式:** +**REST 模式(已弃用):** 1. 
同上生成 sbatch 脚本 2. `submit()` 通过 HTTP POST 到 slurmrestd 的 `/slurm/{version}/job/submit` 3. 所有操作通过 slurmrestd REST API + JWT 认证 diff --git a/schedulers/k8s.py b/schedulers/k8s.py index 741721e..83b991c 100644 --- a/schedulers/k8s.py +++ b/schedulers/k8s.py @@ -11,6 +11,17 @@ from schedulers.base import BaseScheduler, JobResult, ProfileJobSpec + +def _k8s_job_state(status) -> str: + """Derive a human-readable state string from a K8s Job status object.""" + if status.succeeded and status.succeeded > 0: + return "Succeeded" + if status.failed and status.failed > 0: + return "Failed" + if status.active and status.active > 0: + return "Running" + return "Pending" + # Optional: nicer YAML output for dry-run. try: import yaml as _yaml # type: ignore[import-untyped] @@ -216,17 +227,9 @@ def status(self, job_id: str) -> dict: batch_api, core_api = self._load_k8s() job = batch_api.read_namespaced_job(name=job_id, namespace=self.namespace) - st = job.status # Determine state - if st.succeeded and st.succeeded > 0: - state = "Succeeded" - elif st.failed and st.failed > 0: - state = "Failed" - elif st.active and st.active > 0: - state = "Running" - else: - state = "Pending" + state = _k8s_job_state(job.status) # Pod info pods = core_api.list_namespaced_pod( @@ -335,15 +338,7 @@ def list_jobs(self, *, status_filter: str = "") -> list[dict]: ) result: list[dict] = [] for job in jobs.items: - st = job.status - if st.succeeded and st.succeeded > 0: - state = "Succeeded" - elif st.failed and st.failed > 0: - state = "Failed" - elif st.active and st.active > 0: - state = "Running" - else: - state = "Pending" + state = _k8s_job_state(job.status) if status_filter and state.lower() != status_filter.lower(): continue diff --git a/schedulers/local.py b/schedulers/local.py index eeaa020..4c61865 100644 --- a/schedulers/local.py +++ b/schedulers/local.py @@ -330,9 +330,6 @@ def list_jobs(self, *, status_filter: str = "") -> list[dict]: continue name = m.group(1) ts = m.group(2) - stderr = 
path.replace(".stdout.log", ".stderr.log") - stderr_size = os.path.getsize(stderr) if os.path.exists(stderr) else 0 - # If stderr has content, might have failed; otherwise completed state = "Completed" jobs.append({ "job_id": name, diff --git a/schedulers/slurm.py b/schedulers/slurm.py index b297fec..6d2880b 100644 --- a/schedulers/slurm.py +++ b/schedulers/slurm.py @@ -1,8 +1,7 @@ """Slurm sbatch scheduler for FlowSim profiling. ``render()`` / ``dry_run()`` produce a standalone bash script (zero deps). -``submit()`` posts the script to a slurmrestd endpoint via stdlib -``urllib.request`` — no extra packages needed. +``submit()`` pipes the script to ``sbatch`` by default (CLI mode). Two submission modes are supported: From 19973cb732bea0bec8de43e8cabd485c51981b1f Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Wed, 18 Mar 2026 22:28:59 +0000 Subject: [PATCH 30/56] remove Slurm REST dead code, rewrite README in English - Delete all slurmrestd REST methods (submit/cancel/status/logs/list) - Remove ssl, urllib, json imports from slurm.py - Remove REST constructor params (rest_url, jwt_token, api_version, verify_ssl, submit_via) - Remove resolve_jwt_token() from config.py - Remove REST CLI args from submit_profile.py, status_profile.py, cli.py - Strip REST fields from slurm.yaml template - Remove JWT-related tests, update init/submit tests - Rewrite schedulers/README.md entirely in English, no REST references - 56 unit tests pass, net -524 lines --- schedulers/README.md | 331 ++++++++++++++----------------- schedulers/config.py | 29 --- schedulers/slurm.py | 322 +----------------------------- schedulers/templates/slurm.yaml | 11 +- scripts/cli.py | 24 +-- scripts/status_profile.py | 41 +--- scripts/submit_profile.py | 61 +----- tests/unit/test_scheduler_cli.py | 39 +--- 8 files changed, 167 insertions(+), 691 deletions(-) diff --git a/schedulers/README.md b/schedulers/README.md index 7f99f4e..48e6b0b 100644 --- a/schedulers/README.md +++ b/schedulers/README.md @@ 
-1,78 +1,78 @@ # FlowSim Schedulers -FlowSim 支持三种调度器后端,用于提交 GPU profiling 任务: +FlowSim supports three scheduler backends for submitting GPU profiling jobs: -| 后端 | 适用场景 | 运行位置 | 依赖 | -|------|----------|----------|------| -| **local** | 单机开发/测试 | 宿主机 Docker 容器 | Docker + NVIDIA GPU | -| **k8s** | Kubernetes 集群 | K8s Job Pod | `kubernetes` Python 包 | -| **slurm** | HPC 集群 | Slurm 计算节点 | Slurm CLI 或 slurmrestd | +| Backend | Use Case | Runs On | Dependencies | +|---------|----------|---------|--------------| +| **local** | Single-machine dev/testing | Host Docker container | Docker + NVIDIA GPU | +| **k8s** | Kubernetes cluster | K8s Job Pod | `kubernetes` Python package | +| **slurm** | HPC cluster | Slurm compute node | Slurm CLI (`sbatch`/`squeue`/`scancel`) | -## 快速上手 +## Quick Start ```bash -# 安装(从 FlowSim 项目根目录) +# Install (from FlowSim project root) cd FlowSim -pip install -e . # 或确保 PYTHONPATH 包含项目根目录 +pip install -e . # or ensure PYTHONPATH includes the project root -# 查看帮助 +# Show help flowsim --help flowsim submit --help ``` -## 通用工作流 +## Common Workflow -所有调度器共享相同的 CLI 接口: +All schedulers share the same CLI interface: ```bash -# 1. 提交任务 +# 1. Submit a job flowsim submit --scheduler --collect \ - --model-path [选项...] + --model-path [options...] -# 2. 查看任务列表 +# 2. List jobs flowsim list --scheduler -# 3. 查看任务状态 +# 3. Check job status flowsim status --scheduler --job -# 4. 查看日志 +# 4. View logs flowsim logs --scheduler --job -# 5. 取消任务 +# 5. Cancel a job flowsim cancel --scheduler --job -# 6. Dry-run(仅打印脚本/manifest,不提交) +# 6. Dry-run (print script/manifest without submitting) flowsim submit --scheduler ... 
--dry-run ``` -### 通用参数 +### Common Parameters -| 参数 | 说明 | 默认值 | -|------|------|--------| -| `--collect` | 收集模式:`perf`(性能) / `shapes`(形状) / `all`(两者) | 必填 | -| `--model-path` | HuggingFace 模型路径 | 必填 | +| Parameter | Description | Default | +|-----------|-------------|---------| +| `--collect` | Collection mode: `perf` / `shapes` / `all` | required | +| `--model-path` | HuggingFace model path | required | | `--tp` | Tensor parallelism | `1` | | `--dp` | Data parallelism | `1` | | `--bs` | Batch size | `1` | -| `--input-len` | 输入序列长度 | `2048` | -| `--existing-ctx` | 已有 KV cache 长度 | `0` | -| `--decode-tokens` | Decode 生成 token 数 | `32` | -| `--warmup-n` | Warmup 迭代数 | `5` | -| `--image` | Docker 镜像 | `flowsim-image:latest` | -| `--gpus` | GPU 数量 | `1` | -| `--output-dir` | 输出目录(自动生成如不指定) | `stage_traces/{scheduler}/{timestamp}/` | -| `--dry-run` | 仅打印脚本,不提交 | `false` | +| `--input-len` | Input sequence length | `2048` | +| `--existing-ctx` | Existing KV cache length | `0` | +| `--decode-tokens` | Decode token count | `32` | +| `--warmup-n` | Warmup iterations | `5` | +| `--image` | Docker image | `flowsim-image:latest` | +| `--gpus` | GPU count | `1` | +| `--output-dir` | Output directory (auto-generated if omitted) | `stage_traces/{scheduler}/{timestamp}/` | +| `--dry-run` | Print script only, do not submit | `false` | --- -## 1. Local 调度器 +## 1. Local Scheduler -直接在宿主机上通过 `docker run` 启动容器运行 profiling。最简单的方式,适合单机开发和测试。 +Runs profiling directly on the host via `docker run`. The simplest option, suitable for single-machine development and testing. 
-### 使用 +### Usage ```bash -# 最简单的用法 — 使用 GPU 0 运行 +# Simplest usage — run on GPU 0 flowsim submit --scheduler local \ --collect all \ --model-path workload/models/configs/Qwen3-235B-A22B \ @@ -80,34 +80,34 @@ flowsim submit --scheduler local \ --gpus 1 --local-gpus 0 \ --extra-server-opts "--load-format dummy" -# 多 GPU +# Multi-GPU flowsim submit --scheduler local \ --collect perf \ --model-path Qwen/Qwen3-8B \ --tp 2 --gpus 2 --local-gpus 0,1 ``` -### 专有参数 +### Parameters -| 参数 | 说明 | 默认值 | -|------|------|--------| -| `--local-gpus` | `CUDA_VISIBLE_DEVICES`(如 `0` 或 `0,1`) | 空(使用所有 GPU) | -| `--local-workdir` | 主机工作目录 | FlowSim 项目根目录 | +| Parameter | Description | Default | +|-----------|-------------|---------| +| `--local-gpus` | `CUDA_VISIBLE_DEVICES` (e.g. `0` or `0,1`) | empty (all GPUs) | +| `--local-workdir` | Host working directory | FlowSim project root | -### 工作原理 +### How It Works -1. `render()` 生成一条 `docker run --gpus` 命令 -2. `submit()` 在宿主机执行该容器,同步等待完成 -3. Traces 写入宿主机 `stage_traces/local/{YYYYMMDD_HHMMSS}/` -4. `status()` / `logs()` / `list_jobs()` 扫描日志文件 +1. `render()` generates a `docker run --gpus` command +2. `submit()` runs the container on the host, waits for completion +3. Traces are written to `stage_traces/local/{YYYYMMDD_HHMMSS}/` +4. `status()` / `logs()` / `list_jobs()` scan log files --- -## 2. Kubernetes 调度器 +## 2. Kubernetes Scheduler -将 profiling 任务作为 Kubernetes Job 提交到集群。支持 PVC 和 hostPath 两种存储方式。 +Submits profiling jobs as Kubernetes Jobs to a cluster. Supports both PVC and hostPath storage. -### 首次配置 +### First-Time Setup ```bash flowsim init k8s \ @@ -118,19 +118,19 @@ flowsim init k8s \ --force ``` -配置保存到 `~/.flowsim/k8s.yaml`,后续提交时自动读取。 +Config is saved to `~/.flowsim/k8s.yaml` and automatically loaded on subsequent submissions. 
-### 使用 +### Usage ```bash -# 提交到 K8s 集群 +# Submit to K8s cluster flowsim submit --scheduler k8s \ --collect all \ --model-path workload/models/configs/Qwen3-235B-A22B \ --tp 1 --bs 1 --input-len 2048 --gpus 1 \ --extra-server-opts "--load-format dummy" -# 覆盖配置文件中的值 +# Override config file values flowsim submit --scheduler k8s \ --collect perf \ --model-path Qwen/Qwen3-8B \ @@ -138,192 +138,158 @@ flowsim submit --scheduler k8s \ --k8s-pvc my-traces-pvc \ --gpus 4 --tp 4 -# Dry-run 查看生成的 YAML +# Dry-run to preview the generated YAML flowsim submit --scheduler k8s ... --dry-run ``` -### 专有参数 +### Parameters -| 参数 | 说明 | 默认值 | -|------|------|--------| -| `--k8s-namespace` | K8s 命名空间 | `default` | -| `--k8s-kubeconfig` | kubeconfig 路径 | `~/.kube/config` | -| `--k8s-context` | kubeconfig context | 当前 context | -| `--k8s-pvc` | PVC 名称(持久存储) | 空 | -| `--k8s-host-output-dir` | hostPath 挂载路径(PVC 为空时使用) | 空 | -| `--k8s-node-selector` | 节点选择标签(可重复),格式 `KEY=VALUE` | 空 | -| `--k8s-service-account` | ServiceAccount | 空 | -| `--k8s-shm-size` | 共享内存大小 | `16Gi` | -| `--k8s-runtime-class` | RuntimeClass(如 `nvidia`,用于 CDI 模式) | 空 | +| Parameter | Description | Default | +|-----------|-------------|---------| +| `--k8s-namespace` | K8s namespace | `default` | +| `--k8s-kubeconfig` | kubeconfig path | `~/.kube/config` | +| `--k8s-context` | kubeconfig context | current context | +| `--k8s-pvc` | PVC name (persistent storage) | empty | +| `--k8s-host-output-dir` | hostPath mount (used when PVC is empty) | empty | +| `--k8s-node-selector` | Node selector labels (repeatable), format `KEY=VALUE` | empty | +| `--k8s-service-account` | ServiceAccount | empty | +| `--k8s-shm-size` | Shared memory size | `16Gi` | +| `--k8s-runtime-class` | RuntimeClass (e.g. `nvidia` for CDI mode) | empty | -### 工作原理 +### How It Works -1. `render()` 生成 Kubernetes Job YAML/JSON manifest -2. `submit()` 通过 `kubernetes` Python 客户端创建 Job -3. Traces 通过 PVC 或 hostPath 持久化 -4. 
`status()` / `cancel()` / `list_jobs()` 通过 K8s API 操作 +1. `render()` generates a Kubernetes Job YAML/JSON manifest +2. `submit()` creates the Job via the `kubernetes` Python client +3. Traces are persisted via PVC or hostPath +4. `status()` / `cancel()` / `list_jobs()` operate via the K8s API -### Kind 本地测试集群 +### Kind Local Test Cluster ```bash -# 启动 Kind 集群(GPU passthrough + CDI 模式) +# Start a Kind cluster (GPU passthrough + CDI mode) bash dockerfiles/dev-setup.sh kind -# 运行 K8s 集成测试 +# Run K8s integration tests python -m pytest tests/integration/test_scheduler_local.py::TestK8sScheduler -v -x -# 清理 +# Teardown bash dockerfiles/dev-teardown.sh kind ``` --- -## 3. Slurm 调度器 +## 3. Slurm Scheduler -生成 sbatch 脚本并提交到 Slurm 集群。支持两种提交模式: +Generates sbatch scripts and submits them to a Slurm cluster via `sbatch`/`squeue`/`scancel`. -- **CLI 模式**(推荐,默认):通过 `sbatch`/`squeue`/`scancel` 命令 -- **REST 模式**(已弃用):通过 slurmrestd REST API + JWT 认证 - -### 首次配置 +### First-Time Setup ```bash -# CLI 模式(推荐,无需 slurmrestd) flowsim init slurm \ --partition gpu \ --account my-project \ --container-runtime none \ --force - -# REST 模式(已弃用,需要 slurmrestd) -flowsim init slurm \ - --rest-url https://slurm.example.com:6820 \ - --partition gpu \ - --account my-project \ - --jwt-token-cmd "scontrol token lifespan=3600" \ - --force ``` -### 使用 +### Usage ```bash -# CLI 模式 — 直接调用 sbatch(最常用) +# Submit via sbatch flowsim submit --scheduler slurm \ --collect all \ --model-path workload/models/configs/Qwen3-235B-A22B \ --tp 1 --bs 1 --input-len 2048 --gpus 1 \ --slurm-partition gpu \ - --slurm-submit-via cli \ --extra-server-opts "--load-format dummy" -# CLI 模式 + 远程前缀(通过 docker exec 或 ssh) +# CLI prefix (e.g. 
via docker exec or ssh) flowsim submit --scheduler slurm \ - --slurm-submit-via cli \ --slurm-cli-prefix "docker exec -i slurmctld" \ --slurm-partition normal \ --collect perf --model-path Qwen/Qwen3-8B --gpus 1 -# REST 模式(已弃用) -flowsim submit --scheduler slurm \ - --slurm-submit-via rest \ - --slurm-rest-url http://localhost:6820 \ - --slurm-jwt-token "$(scontrol token lifespan=3600 | cut -d= -f2)" \ - --collect perf --model-path Qwen/Qwen3-8B --gpus 1 - -# Dry-run 查看生成的 sbatch 脚本 +# Dry-run to preview the generated sbatch script flowsim submit --scheduler slurm ... --dry-run -# 查看状态(CLI 模式) +# Check status flowsim status --scheduler slurm --job 12345 \ - --slurm-submit-via cli \ --slurm-cli-prefix "docker exec -i slurmctld" -# 取消任务 -flowsim cancel --scheduler slurm --job 12345 \ - --slurm-submit-via cli +# Cancel a job +flowsim cancel --scheduler slurm --job 12345 ``` -### 专有参数 - -| 参数 | 说明 | 默认值 | -|------|------|--------| -| `--slurm-submit-via` | 提交模式:`cli`(sbatch)或 `rest`(slurmrestd,已弃用) | `cli` | -| `--slurm-cli-prefix` | CLI 命令前缀(如 `"docker exec -i slurmctld"`) | 空 | -| `--slurm-partition` | Slurm 分区 | 空 | -| `--slurm-time` | 任务时间限制 | `02:00:00` | -| `--slurm-account` | 计费账户 | 空 | -| `--slurm-constraint` | 节点约束 | 空 | -| `--slurm-container-runtime` | 容器运行时:`docker` / `enroot` / `none` | `none` | -| `--slurm-container-mounts` | 容器挂载 | 空 | -| `--slurm-module` | `module load` 命令(可重复) | 空 | -| `--slurm-extra-sbatch` | 额外 `#SBATCH` 指令(可重复) | 空 | -| `--slurm-rest-url` | slurmrestd URL(REST 模式需要) | 空 | -| `--slurm-jwt-token` | JWT token(REST 模式需要) | 空 | -| `--slurm-api-version` | slurmrestd API 版本 | `v0.0.40` | -| `--slurm-no-verify-ssl` | 跳过 TLS 验证 | `false` | - -### container_runtime 说明 - -| 值 | 说明 | -|----|------| -| `none` | 直接在计算节点上运行(节点已有 Python/sglang 环境)| -| `docker` | 在分配的节点上 `docker run` | -| `enroot` | 使用 `srun --container-image` (NVIDIA enroot) | - -### 工作原理 - -**CLI 模式:** -1. `render()` 生成完整的 sbatch 脚本(含 `#SBATCH` 指令 + profiling 命令) -2. 
`submit()` 通过 `sbatch --parsable` 提交(脚本通过 stdin 传入) -3. `status()` 通过 `scontrol show job` 查询(无需 slurmdbd) -4. `cancel()` 通过 `scancel` 取消 -5. `list_jobs()` 通过 `squeue` 列出 - -如果 Slurm 命令不在本地 PATH 中,可通过 `--slurm-cli-prefix` 指定前缀,例如: -- `"docker exec -i slurmctld"` — 通过 Docker 容器 -- `"ssh login-node"` — 通过 SSH - -**REST 模式(已弃用):** -1. 同上生成 sbatch 脚本 -2. `submit()` 通过 HTTP POST 到 slurmrestd 的 `/slurm/{version}/job/submit` -3. 所有操作通过 slurmrestd REST API + JWT 认证 - -### Docker Compose 本地测试集群 +### Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `--slurm-partition` | Slurm partition | empty | +| `--slurm-time` | Job time limit | `02:00:00` | +| `--slurm-account` | Billing account | empty | +| `--slurm-constraint` | Node constraint | empty | +| `--slurm-cli-prefix` | Shell prefix for CLI commands (e.g. `"docker exec -i slurmctld"`) | empty | +| `--slurm-container-runtime` | Container runtime: `docker` / `enroot` / `none` | `none` | +| `--slurm-container-mounts` | Container mounts | empty | +| `--slurm-module` | `module load` commands (repeatable) | empty | +| `--slurm-extra-sbatch` | Extra `#SBATCH` directives (repeatable) | empty | + +### container_runtime Options + +| Value | Description | +|-------|-------------| +| `none` | Run directly on compute node (Python/sglang must be installed) | +| `docker` | Run via `docker run` on the allocated node | +| `enroot` | Run via `srun --container-image` (NVIDIA enroot) | + +### How It Works + +1. `render()` generates a complete sbatch script (`#SBATCH` directives + profiling command) +2. `submit()` pipes the script to `sbatch --parsable` +3. `status()` queries via `scontrol show job` +4. `cancel()` runs `scancel` +5. 
`list_jobs()` runs `squeue` + +If Slurm commands are not on the local PATH, use `--slurm-cli-prefix` to specify a prefix, e.g.: +- `"docker exec -i slurmctld"` — via Docker container +- `"ssh login-node"` — via SSH + +### Docker Compose Local Test Cluster ```bash -# 启动 Slurm 集群(slurmctld + 1 计算节点 + 1 GPU) +# Start Slurm cluster (slurmctld + 1 compute node + 1 GPU) cd dockerfiles/ docker compose -f slurm-compose.yaml up -d -# 检查集群状态 +# Check cluster status docker exec slurmctld sinfo -# 运行 Slurm 集成测试 +# Run Slurm integration tests python -m pytest tests/integration/test_scheduler_local.py::TestSlurmScheduler -v -x -# 清理 +# Teardown docker compose -f slurm-compose.yaml down -v ``` --- -## 配置文件 +## Configuration -配置保存在 `~/.flowsim/` 目录下,通过 `flowsim init` 生成: +Config files are stored in `~/.flowsim/` and generated via `flowsim init`: ``` ~/.flowsim/ -├── k8s.yaml # K8s 调度器配置 -└── slurm.yaml # Slurm 调度器配置 +├── k8s.yaml # K8s scheduler config +└── slurm.yaml # Slurm scheduler config ``` -参数优先级(从高到低): -1. CLI flag(`--slurm-partition gpu`) -2. 环境变量(`FLOWSIM_SLURM_PARTITION=gpu`) -3. 配置文件(`~/.flowsim/slurm.yaml`) -4. 内置默认值 +Parameter priority (highest to lowest): +1. CLI flag (`--slurm-partition gpu`) +2. Environment variable (`FLOWSIM_SLURM_PARTITION=gpu`) +3. Config file (`~/.flowsim/slurm.yaml`) +4. 
Built-in default -### 示例 k8s.yaml +### Example k8s.yaml ```yaml kubeconfig: /home/user/.kube/config @@ -333,33 +299,32 @@ runtime_class_name: nvidia shm_size: 16Gi ``` -### 示例 slurm.yaml +### Example slurm.yaml ```yaml partition: gpu account: my-project time: "02:00:00" container_runtime: none -submit_via: cli cli_prefix: "" ``` --- -## 输出目录结构 +## Output Directory Structure -所有调度器产生统一的 trace 输出结构: +All schedulers produce a unified trace output structure: ``` stage_traces/{scheduler}/{YYYYMMDD_HHMMSS}/ ├── bs1_input2048_ctx0/ -│ ├── *.trace.json.gz # 原始 trace -│ ├── parsed/*.csv # 解析后的 CSV -│ ├── merged/*_merged.trace.csv # 合并的 trace CSV -│ ├── shape_traces/ # Shape trace(collect=shapes/all) -│ ├── shape_parsed/*.csv # Shape 解析 CSV -│ ├── analysis_extend.json # Extend 阶段分析 -│ └── analysis_decode.json # Decode 阶段分析 +│ ├── *.trace.json.gz # Raw traces +│ ├── parsed/*.csv # Parsed CSVs +│ ├── merged/*_merged.trace.csv # Merged trace CSV +│ ├── shape_traces/ # Shape traces (collect=shapes/all) +│ ├── shape_parsed/*.csv # Shape parsed CSVs +│ ├── analysis_extend.json # Extend stage analysis +│ └── analysis_decode.json # Decode stage analysis ├── logs/ │ ├── server_*.stdout.log │ └── server_*.stderr.log @@ -368,9 +333,9 @@ stage_traces/{scheduler}/{YYYYMMDD_HHMMSS}/ --- -## PD Disaggregation(实验性) +## PD Disaggregation (Experimental) -支持 Prefill-Decode 分离部署: +Supports Prefill-Decode disaggregated deployment: ```bash flowsim submit --scheduler k8s \ @@ -381,4 +346,4 @@ flowsim submit --scheduler k8s \ --disagg-transfer-backend mooncake ``` -这会生成两个 Job:一个 prefill 实例,一个 decode 实例。 +This generates two Jobs: one prefill instance and one decode instance. diff --git a/schedulers/config.py b/schedulers/config.py index e228cb3..723dfc2 100644 --- a/schedulers/config.py +++ b/schedulers/config.py @@ -15,17 +15,11 @@ Template files are in ``schedulers/templates/k8s.yaml`` and ``schedulers/templates/slurm.yaml``. Copy to ``~/.flowsim/`` and edit. 
- -For Slurm, use ``jwt_token_cmd`` instead of ``jwt_token`` to avoid -storing secrets in plaintext. The command is executed at submit time -and its stdout is used as the token. """ from __future__ import annotations import os -import shlex -import subprocess from pathlib import Path # Optional: try PyYAML, fall back to JSON @@ -94,29 +88,6 @@ def load_slurm_config() -> dict: return {} -def resolve_jwt_token(slurm_cfg: dict) -> str: - """Get the JWT token from config, executing jwt_token_cmd if needed.""" - token = slurm_cfg.get("jwt_token", "") - if token: - return str(token) - - cmd = slurm_cfg.get("jwt_token_cmd", "") - if cmd: - try: - result = subprocess.run( - shlex.split(cmd), - capture_output=True, - text=True, - timeout=30, - ) - if result.returncode == 0: - return result.stdout.strip() - except (FileNotFoundError, OSError): - pass - - return "" - - def cfg_get(cfg: dict, key: str, fallback: str = "") -> str: """Get a value from a flat config dict, or fallback.""" val = cfg.get(key) diff --git a/schedulers/slurm.py b/schedulers/slurm.py index 6d2880b..67e954b 100644 --- a/schedulers/slurm.py +++ b/schedulers/slurm.py @@ -1,32 +1,20 @@ """Slurm sbatch scheduler for FlowSim profiling. ``render()`` / ``dry_run()`` produce a standalone bash script (zero deps). -``submit()`` pipes the script to ``sbatch`` by default (CLI mode). +``submit()`` pipes the script to ``sbatch`` via subprocess (CLI mode). -Two submission modes are supported: - -* **cli** (default) — pipe the script to ``sbatch`` via subprocess. - Requires ``sbatch``/``squeue``/``scancel`` on PATH (or reachable - via ``cli_prefix``, e.g. ``"docker exec slurmctld"``). -* **rest** (deprecated) — POST the script to a slurmrestd endpoint. - Requires ``rest_url`` and ``jwt_token``. +Requires ``sbatch``/``squeue``/``scancel`` on PATH (or reachable +via ``cli_prefix``, e.g. ``"docker exec slurmctld"``). 
""" from __future__ import annotations -import json import shlex -import ssl import subprocess -import urllib.error -import urllib.request from schedulers.base import BaseScheduler, JobResult, ProfileJobSpec -_DEFAULT_API_VERSION = "v0.0.40" - - class SlurmScheduler(BaseScheduler): """Generate and optionally submit an sbatch script for profiling. @@ -36,17 +24,6 @@ class SlurmScheduler(BaseScheduler): Slurm partition to submit to. time_limit : str Wall-clock time limit (e.g., ``"01:00:00"``). - rest_url : str - Base URL of the slurmrestd daemon - (e.g., ``"https://slurm.example.com:6820"``). - Required only for ``submit()``. - jwt_token : str - JWT/auth token for slurmrestd. Required only for ``submit()``. - api_version : str - slurmrestd OpenAPI version (default: ``"v0.0.40"``). - Adjust to match your cluster (``v0.0.39``, ``v0.0.41``, …). - verify_ssl : bool - Whether to verify the slurmrestd TLS certificate (default True). account : str, optional ``--account`` for which allocation to charge. constraint : str, optional @@ -64,12 +41,8 @@ class SlurmScheduler(BaseScheduler): (relevant for ``"none"`` runtime). extra_sbatch : list[str] Additional ``#SBATCH`` lines, each *without* the ``#SBATCH`` prefix. - submit_via : str - ``"cli"`` (default) — use ``sbatch`` / ``squeue`` / ``scancel`` subprocess. - ``"rest"`` (deprecated) — use slurmrestd REST API. cli_prefix : str Shell prefix for CLI commands (e.g. ``"docker exec -i slurmctld"``). - Only used when ``submit_via="cli"``. 
""" def __init__( @@ -77,44 +50,24 @@ def __init__( *, partition: str = "gpu", time_limit: str = "02:00:00", - rest_url: str = "", - jwt_token: str = "", - api_version: str = _DEFAULT_API_VERSION, - verify_ssl: bool = True, account: str = "", constraint: str = "", container_runtime: str = "none", container_mounts: str = "", modules: list[str] | None = None, extra_sbatch: list[str] | None = None, - submit_via: str = "cli", cli_prefix: str = "", ) -> None: self.partition = partition self.time_limit = time_limit - self.rest_url = rest_url.rstrip("/") - self.jwt_token = jwt_token - self.api_version = api_version - self.verify_ssl = verify_ssl self.account = account self.constraint = constraint self.container_runtime = container_runtime self.container_mounts = container_mounts self.modules = modules or [] self.extra_sbatch = extra_sbatch or [] - self.submit_via = submit_via self.cli_prefix = cli_prefix - if self.submit_via != "cli": - import warnings - warnings.warn( - "Slurm REST mode (slurmrestd) is deprecated and will be " - "removed in a future release. 
Use submit_via='cli' " - "(sbatch) instead.", - DeprecationWarning, - stacklevel=2, - ) - def render(self, spec: ProfileJobSpec) -> str: job_name = spec.default_job_name() cmd = spec.build_shell_command() @@ -184,10 +137,8 @@ def render(self, spec: ProfileJobSpec) -> str: return "\n".join(lines) def submit(self, spec: ProfileJobSpec) -> JobResult: - """Submit the job via REST API or CLI, depending on ``submit_via``.""" - if self.submit_via == "cli": - return self._submit_cli(spec) - return self._submit_rest(spec) + """Submit the job via ``sbatch``.""" + return self._submit_cli(spec) # ------------------------------------------------------------------ # CLI helpers @@ -234,146 +185,21 @@ def _submit_cli(self, spec: ProfileJobSpec) -> JobResult: message=f"Submitted batch job {job_id}", ) - # ------------------------------------------------------------------ - # REST submit - # ------------------------------------------------------------------ - - def _submit_rest(self, spec: ProfileJobSpec) -> JobResult: - """Submit the job via slurmrestd REST API. - - Requires ``rest_url`` and ``jwt_token`` to be set. - Uses only ``urllib.request`` from the standard library. - """ - if not self.rest_url: - raise RuntimeError( - "--slurm-rest-url is required for --submit. " - "Point it at your slurmrestd endpoint " - "(e.g. https://slurm.example.com:6820)." - ) - if not self.jwt_token: - raise RuntimeError( - "--slurm-jwt-token is required for --submit. 
" - "Generate one via: scontrol token lifespan=3600" - ) - - script = self.render(spec) - job_name = spec.default_job_name() - - url = ( - f"{self.rest_url}/slurm/{self.api_version}/job/submit" - ) - - # slurmrestd job submission payload - payload = { - "script": script, - "job": { - "name": job_name, - "partition": self.partition, - "time_limit": {"number": self._parse_time_minutes(), "set": True}, - "tasks": 1, - "current_working_directory": "/flowsim", - "environment": ["PATH=/usr/local/bin:/usr/bin:/bin"], - }, - } - if self.account: - payload["job"]["account"] = self.account - - data = json.dumps(payload).encode() - headers = { - "Content-Type": "application/json", - "X-SLURM-USER-TOKEN": self.jwt_token, - } - req = urllib.request.Request(url, data=data, headers=headers, method="POST") - - ctx: ssl.SSLContext | None = None - if not self.verify_ssl: - ctx = ssl.create_default_context() - ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE - - try: - with urllib.request.urlopen(req, context=ctx) as resp: - body = json.loads(resp.read()) - except urllib.error.HTTPError as exc: - detail = exc.read().decode(errors="replace") - raise RuntimeError( - f"slurmrestd returned HTTP {exc.code}:\n{detail}" - ) from exc - except urllib.error.URLError as exc: - raise RuntimeError( - f"Cannot reach slurmrestd at {self.rest_url}: {exc.reason}" - ) from exc - - # Response contains job_id on success, errors array on failure - errors = body.get("errors") or [] - if errors: - msgs = "; ".join(e.get("error", str(e)) for e in errors) - raise RuntimeError(f"slurmrestd job submit failed: {msgs}") - - job_id = str(body.get("job_id", "unknown")) - return JobResult( - job_id=job_id, - scheduler="slurm", - state="Submitted", - output_dir=spec.output_dir, - message=f"Submitted batch job {job_id}", - ) - - def _rest_request(self, path: str, *, method: str = "GET") -> dict: - """Send a request to slurmrestd and return parsed JSON.""" - if not self.rest_url: - raise 
RuntimeError("--slurm-rest-url is required") - if not self.jwt_token: - raise RuntimeError("--slurm-jwt-token is required") - - url = f"{self.rest_url}{path}" - headers = { - "X-SLURM-USER-TOKEN": self.jwt_token, - } - req = urllib.request.Request(url, headers=headers, method=method) - - ctx: ssl.SSLContext | None = None - if not self.verify_ssl: - ctx = ssl.create_default_context() - ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE - - try: - with urllib.request.urlopen(req, context=ctx) as resp: - return json.loads(resp.read()) - except urllib.error.HTTPError as exc: - detail = exc.read().decode(errors="replace") - raise RuntimeError(f"slurmrestd returned HTTP {exc.code}:\n{detail}") from exc - except urllib.error.URLError as exc: - raise RuntimeError(f"Cannot reach slurmrestd at {self.rest_url}: {exc.reason}") from exc - - def _rest_get(self, path: str) -> dict: - """GET a slurmrestd endpoint and return parsed JSON.""" - return self._rest_request(path, method="GET") - def cancel(self, job_id: str) -> str: """Cancel a Slurm job.""" - if self.submit_via == "cli": - return self._cancel_cli(job_id) - return self._cancel_rest(job_id) + return self._cancel_cli(job_id) def status(self, job_id: str) -> dict: """Query Slurm job status.""" - if self.submit_via == "cli": - return self._status_cli(job_id) - return self._status_rest(job_id) + return self._status_cli(job_id) def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: """Show Slurm job log information.""" - if self.submit_via == "cli": - return self._logs_cli(job_id, tail=tail, follow=follow) - return self._logs_rest(job_id, tail=tail, follow=follow) + return self._logs_cli(job_id, tail=tail, follow=follow) def list_jobs(self, *, status_filter: str = "") -> list[dict]: """List Slurm jobs.""" - if self.submit_via == "cli": - return self._list_jobs_cli(status_filter=status_filter) - return self._list_jobs_rest(status_filter=status_filter) + return 
self._list_jobs_cli(status_filter=status_filter) # ------------------------------------------------------------------ # CLI implementations @@ -454,133 +280,3 @@ def _list_jobs_cli(self, *, status_filter: str = "") -> list[dict]: "nodes": parts[4] if len(parts) > 4 else "", }) return result - - # ------------------------------------------------------------------ - # REST implementations - # ------------------------------------------------------------------ - - def _cancel_rest(self, job_id: str) -> str: - """Cancel a Slurm job via slurmrestd DELETE.""" - body = self._rest_request( - f"/slurm/{self.api_version}/job/{job_id}", - method="DELETE", - ) - errors = body.get("errors") or [] - if errors: - msgs = "; ".join(e.get("error", str(e)) for e in errors) - raise RuntimeError(f"slurmrestd cancel failed: {msgs}") - return f"Cancelled Slurm job {job_id}" - - def _status_rest(self, job_id: str) -> dict: - """Query Slurm job status via slurmrestd.""" - body = self._rest_get(f"/slurm/{self.api_version}/job/{job_id}") - - errors = body.get("errors") or [] - if errors: - msgs = "; ".join(e.get("error", str(e)) for e in errors) - raise RuntimeError(f"slurmrestd error: {msgs}") - - jobs = body.get("jobs", []) - if not jobs: - return {"state": "Unknown", "message": f"No job found with ID {job_id}", "output_hint": ""} - - job = jobs[0] - state = job.get("job_state", ["UNKNOWN"]) - if isinstance(state, list): - state = state[0] if state else "UNKNOWN" - name = job.get("name", "") - node_list = job.get("nodes", "") - output_file = job.get("standard_output", "") - work_dir = job.get("current_working_directory", "") - - msg_parts = [ - f"Job ID: {job_id} Name: {name} State: {state}", - f"Nodes: {node_list}" if node_list else "Nodes: (not yet assigned)", - ] - if output_file: - msg_parts.append(f"Output log: {output_file}") - if work_dir: - msg_parts.append(f"Working dir: {work_dir}") - - return { - "state": state, - "message": "\n".join(msg_parts), - "output_hint": output_file, - } 
- - def _logs_rest(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: - """Show where Slurm job logs are and how to access them.""" - info = self._status_rest(job_id) - output_file = info.get("output_hint", "") - state = info.get("state", "UNKNOWN") - - parts = [info["message"], ""] - - if output_file: - parts.append(f"Log file (on cluster shared filesystem):") - parts.append(f" {output_file}") - parts.append("") - if follow: - parts.append("Follow logs:") - parts.append(f" tail -f {output_file}") - else: - parts.append("View on login node:") - parts.append(f" less {output_file}") - parts.append(f" tail -{tail} {output_file}") - parts.append("") - parts.append("Follow logs:") - parts.append(f" tail -f {output_file}") - parts.append("") - parts.append("Copy to local machine:") - parts.append(f" scp :{output_file} .") - else: - parts.append("No output file path found in job metadata.") - - # Trace files location - parts.append("") - parts.append("Trace files (on cluster shared filesystem):") - parts.append(" ~/flowsim_traces/") - parts.append(" ls ~/flowsim_traces/") - - return "\n".join(parts) - - def _list_jobs_rest(self, *, status_filter: str = "") -> list[dict]: - """List Slurm jobs via slurmrestd /jobs endpoint.""" - body = self._rest_get(f"/slurm/{self.api_version}/jobs") - errors = body.get("errors") or [] - if errors: - msgs = "; ".join(e.get("error", str(e)) for e in errors) - raise RuntimeError(f"slurmrestd error: {msgs}") - - result: list[dict] = [] - for job in body.get("jobs", []): - name = job.get("name", "") - # Only show flowsim jobs (name starts with "flowsim-") - if not name.startswith("flowsim-"): - continue - - state = job.get("job_state", ["UNKNOWN"]) - if isinstance(state, list): - state = state[0] if state else "UNKNOWN" - - if status_filter and state.upper() != status_filter.upper(): - continue - - result.append({ - "job_id": str(job.get("job_id", "")), - "name": name, - "state": state, - "partition": job.get("partition", 
""), - "nodes": job.get("nodes", ""), - }) - return result - - def _parse_time_minutes(self) -> int: - """Convert HH:MM:SS time_limit to total minutes.""" - parts = self.time_limit.split(":") - if len(parts) == 3: - h, m, s = int(parts[0]), int(parts[1]), int(parts[2]) - return h * 60 + m + (1 if s > 0 else 0) - if len(parts) == 2: - return int(parts[0]) * 60 + int(parts[1]) - return int(parts[0]) diff --git a/schedulers/templates/slurm.yaml b/schedulers/templates/slurm.yaml index 3ff2d0a..b4d77a1 100644 --- a/schedulers/templates/slurm.yaml +++ b/schedulers/templates/slurm.yaml @@ -8,21 +8,12 @@ # # CLI flags and env vars can override individual values. -# REQUIRED — submission mode -submit_via: "cli" # "cli" (sbatch, default) or "rest" (deprecated) -cli_prefix: "" # e.g. "docker exec -i slurmctld" (optional) - # REQUIRED — cluster settings partition: "" # e.g. gpu-h100 account: "" # e.g. my-project -# REST mode (deprecated) — only needed if submit_via: rest -rest_url: "" # e.g. https://slurm.corp.com:6820 -# jwt_token: "" # not recommended — stored in plaintext -jwt_token_cmd: "" # e.g. "scontrol token lifespan=3600" - # Optional -api_version: "v0.0.40" +cli_prefix: "" # e.g. "docker exec -i slurmctld" time: "02:00:00" constraint: "" container_runtime: "none" # docker | enroot | none diff --git a/scripts/cli.py b/scripts/cli.py index fc4d0cb..00409fb 100644 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -3,7 +3,7 @@ Usage:: flowsim init k8s --kubeconfig ~/.kube/config --namespace ml-team ... - flowsim init slurm --rest-url https://slurm:6820 --partition gpu ... + flowsim init slurm --partition gpu --account proj ... flowsim submit --scheduler k8s --collect perf --model-path ... flowsim submit ... 
--dry-run # debug: preview manifest """ @@ -46,19 +46,8 @@ def _init_slurm_parser(sub: argparse._SubParsersAction) -> None: help="Slurm partition (REQUIRED)") p.add_argument("--account", default="", help="Slurm account") - p.add_argument("--submit-via", default="cli", - choices=["cli", "rest"], - help="Submission mode (default: cli)") p.add_argument("--cli-prefix", default="", help='CLI mode prefix, e.g. "docker exec -i slurmctld"') - p.add_argument("--rest-url", default="", - help="slurmrestd endpoint URL (REST mode only, deprecated)") - p.add_argument("--jwt-token-cmd", default="", - help='Command to get JWT token, e.g. "scontrol token lifespan=3600"') - p.add_argument("--jwt-token", default="", - help="Static JWT token (not recommended)") - p.add_argument("--api-version", default="v0.0.40", - help="slurmrestd API version (default: v0.0.40)") p.add_argument("--time", default="02:00:00", help="Job time limit (default: 02:00:00)") p.add_argument("--constraint", default="", @@ -82,8 +71,7 @@ def _cmd_init(argv: list[str]) -> int: "Configure a scheduler and save to ~/.flowsim/.\n\n" "Examples:\n" " flowsim init k8s --kubeconfig ~/.kube/config --namespace ml-team\n" - " flowsim init slurm --rest-url https://slurm:6820 " - "--partition gpu --account proj" + " flowsim init slurm --partition gpu --account proj" ), formatter_class=argparse.RawDescriptionHelpFormatter, ) @@ -112,18 +100,10 @@ def _cmd_init(argv: list[str]) -> int: dst = _CONFIG_DIR / "k8s.yaml" elif args.scheduler == "slurm": - if args.submit_via == "rest" and not args.jwt_token_cmd and not args.jwt_token: - print("Error: REST mode requires --jwt-token-cmd or --jwt-token", file=sys.stderr) - return 1 cfg = { - "submit_via": args.submit_via, "cli_prefix": args.cli_prefix, - "rest_url": args.rest_url, - "jwt_token_cmd": args.jwt_token_cmd, - "jwt_token": args.jwt_token, "partition": args.partition, "account": args.account, - "api_version": args.api_version, "time": args.time, "constraint": args.constraint, 
"container_runtime": args.container_runtime, diff --git a/scripts/status_profile.py b/scripts/status_profile.py index 085d79e..5d10f84 100644 --- a/scripts/status_profile.py +++ b/scripts/status_profile.py @@ -31,7 +31,7 @@ import argparse import sys -from schedulers.config import cfg_get, load_k8s_config, load_slurm_config, resolve_default, resolve_jwt_token +from schedulers.config import cfg_get, load_k8s_config, load_slurm_config, resolve_default from schedulers.k8s import K8sScheduler from schedulers.local import LocalScheduler from schedulers.slurm import SlurmScheduler @@ -80,42 +80,12 @@ def _add_scheduler_specific_args(p: argparse.ArgumentParser, scheduler: str) -> ) elif scheduler == "slurm": - p.add_argument( - "--slurm-rest-url", - default=_d("FLOWSIM_SLURM_REST_URL", slurm_cfg, "rest_url", ""), - ) - p.add_argument( - "--slurm-jwt-token", - default=_d("FLOWSIM_SLURM_JWT_TOKEN", slurm_cfg, "jwt_token", ""), - ) - p.add_argument( - "--slurm-api-version", - default=_d("FLOWSIM_SLURM_API_VERSION", slurm_cfg, "api_version", "v0.0.40"), - ) - p.add_argument( - "--slurm-no-verify-ssl", - action="store_true", - ) - p.add_argument( - "--slurm-submit-via", - choices=["rest", "cli"], - default=cfg_get(slurm_cfg, "submit_via", "cli"), - ) p.add_argument( "--slurm-cli-prefix", default=cfg_get(slurm_cfg, "cli_prefix", ""), ) -def _resolve_slurm_jwt(args: argparse.Namespace) -> None: - """Resolve Slurm JWT from config if not provided.""" - if args.scheduler == "slurm" and not args.slurm_jwt_token: - slurm_cfg = load_slurm_config() - token = resolve_jwt_token(slurm_cfg) - if token: - args.slurm_jwt_token = token - - def _build_scheduler(args: argparse.Namespace): if args.scheduler == "local": return LocalScheduler(workdir=getattr(args, "local_workdir", "")) @@ -129,11 +99,6 @@ def _build_scheduler(args: argparse.Namespace): ) else: return SlurmScheduler( - rest_url=args.slurm_rest_url, - jwt_token=args.slurm_jwt_token, - api_version=args.slurm_api_version, - 
verify_ssl=not args.slurm_no_verify_ssl, - submit_via=args.slurm_submit_via, cli_prefix=args.slurm_cli_prefix, ) @@ -153,7 +118,6 @@ def main_status(argv: list[str] | None = None) -> None: p.add_argument("--job", required=True, help="Job name or ID") args = _parse_two_pass(p, argv) - _resolve_slurm_jwt(args) scheduler = _build_scheduler(args) try: info = scheduler.status(args.job) @@ -172,7 +136,6 @@ def main_logs(argv: list[str] | None = None) -> None: p.add_argument("--follow", "-f", action="store_true", help="Follow log output") args = _parse_two_pass(p, argv) - _resolve_slurm_jwt(args) scheduler = _build_scheduler(args) try: text = scheduler.logs(args.job, tail=args.tail, follow=args.follow) @@ -188,7 +151,6 @@ def main_list(argv: list[str] | None = None) -> None: p.add_argument("--status", default="", help="Filter by job state (e.g. Running, Succeeded, PENDING)") args = _parse_two_pass(p, argv) - _resolve_slurm_jwt(args) scheduler = _build_scheduler(args) try: jobs = scheduler.list_jobs(status_filter=args.status) @@ -214,7 +176,6 @@ def main_cancel(argv: list[str] | None = None) -> None: p.add_argument("--job", required=True, help="Job name or ID to cancel") args = _parse_two_pass(p, argv) - _resolve_slurm_jwt(args) scheduler = _build_scheduler(args) try: msg = scheduler.cancel(args.job) diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 8999212..150116e 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -51,7 +51,7 @@ import sys from schedulers.base import ProfileJobSpec -from schedulers.config import cfg_get, load_k8s_config, load_slurm_config, resolve_default, resolve_jwt_token +from schedulers.config import cfg_get, load_k8s_config, load_slurm_config, resolve_default from schedulers.k8s import K8sScheduler from schedulers.local import LocalScheduler from schedulers.slurm import SlurmScheduler @@ -256,26 +256,6 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: 
default=_d("FLOWSIM_SLURM_TIME", slurm_cfg, "time", "02:00:00"), help="Wall time limit (env: FLOWSIM_SLURM_TIME)", ) - slurm.add_argument( - "--slurm-rest-url", - default=_d("FLOWSIM_SLURM_REST_URL", slurm_cfg, "rest_url", ""), - help="slurmrestd base URL (env: FLOWSIM_SLURM_REST_URL)", - ) - slurm.add_argument( - "--slurm-jwt-token", - default=_d("FLOWSIM_SLURM_JWT_TOKEN", slurm_cfg, "jwt_token", ""), - help="JWT token for slurmrestd (env: FLOWSIM_SLURM_JWT_TOKEN)", - ) - slurm.add_argument( - "--slurm-api-version", - default=_d("FLOWSIM_SLURM_API_VERSION", slurm_cfg, "api_version", "v0.0.40"), - help="slurmrestd API version (env: FLOWSIM_SLURM_API_VERSION)", - ) - slurm.add_argument( - "--slurm-no-verify-ssl", - action="store_true", - help="Skip TLS certificate verification for slurmrestd", - ) slurm.add_argument( "--slurm-account", default=cfg_get(slurm_cfg, "account", ""), @@ -308,12 +288,6 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: metavar="DIRECTIVE", help="Extra #SBATCH directives (repeatable, without prefix)", ) - slurm.add_argument( - "--slurm-submit-via", - choices=["rest", "cli"], - default=cfg_get(slurm_cfg, "submit_via", "cli"), - help="Submission mode: cli (sbatch subprocess) or rest (slurmrestd, deprecated)", - ) slurm.add_argument( "--slurm-cli-prefix", default=cfg_get(slurm_cfg, "cli_prefix", ""), @@ -394,17 +368,12 @@ def _build_scheduler(args: argparse.Namespace): return SlurmScheduler( partition=args.slurm_partition, time_limit=args.slurm_time, - rest_url=args.slurm_rest_url, - jwt_token=args.slurm_jwt_token, - api_version=args.slurm_api_version, - verify_ssl=not args.slurm_no_verify_ssl, account=args.slurm_account, constraint=args.slurm_constraint, container_runtime=args.slurm_container_runtime, container_mounts=args.slurm_container_mounts, modules=args.slurm_module, extra_sbatch=args.slurm_extra_sbatch, - submit_via=args.slurm_submit_via, cli_prefix=args.slurm_cli_prefix, ) @@ -424,13 +393,6 @@ def main(argv: 
list[str] | None = None) -> None: else: args.output_dir = f"/flowsim/stage_traces/k8s/{_ts}" - # Resolve Slurm JWT token from jwt_token_cmd in config if needed - if args.scheduler == "slurm" and not args.slurm_jwt_token: - slurm_cfg = load_slurm_config() - token = resolve_jwt_token(slurm_cfg) - if token: - args.slurm_jwt_token = token - # Validate required connection params before submit if not args.dry_run and args.scheduler not in ("local",): _validate_connection(args) @@ -523,28 +485,11 @@ def _validate_connection(args: argparse.Namespace) -> None: file=sys.stderr, ) elif args.scheduler == "slurm": - if args.slurm_submit_via == "cli": - # CLI mode only needs partition - if not args.slurm_partition: - sys.exit( - "Error: missing required Slurm config:\n" - " - partition (--slurm-partition)\n\n" - f"Set it in ~/.flowsim/slurm.yaml or via CLI flag.\n" - + _INIT_HINT - ) - return - missing = [] - if not args.slurm_rest_url: - missing.append("rest_url (--slurm-rest-url)") - if not args.slurm_jwt_token: - missing.append("jwt_token/jwt_token_cmd (--slurm-jwt-token)") if not args.slurm_partition: - missing.append("partition (--slurm-partition)") - if missing: sys.exit( "Error: missing required Slurm config:\n" - + "\n".join(f" - {m}" for m in missing) - + f"\n\nSet them in ~/.flowsim/slurm.yaml or via CLI flags.\n" + " - partition (--slurm-partition)\n\n" + f"Set it in ~/.flowsim/slurm.yaml or via CLI flag.\n" + _INIT_HINT ) diff --git a/tests/unit/test_scheduler_cli.py b/tests/unit/test_scheduler_cli.py index 2bb0dec..9968ea1 100644 --- a/tests/unit/test_scheduler_cli.py +++ b/tests/unit/test_scheduler_cli.py @@ -269,10 +269,6 @@ def test_render_constraint(self, spec): script = sched.render(spec) assert "#SBATCH --constraint=gpu80g" in script - def test_time_parse_minutes(self): - sched = SlurmScheduler(partition="gpu", time_limit="02:30:00") - assert sched._parse_time_minutes() == 150 - # ========================================================================= # 
LocalScheduler.render @@ -350,7 +346,7 @@ def test_init_slurm_help(self, capsys): _cmd_init(["slurm", "--help"]) assert exc_info.value.code == 0 out = capsys.readouterr().out - assert "--rest-url" in out + assert "--cli-prefix" in out assert "--partition" in out def test_init_k8s_missing_required(self): @@ -396,16 +392,13 @@ def test_init_slurm_saves_config(self, tmp_path: Path): from scripts.cli import _cmd_init rc = _cmd_init([ "slurm", - "--rest-url", "http://localhost:6820", "--partition", "gpu", "--account", "proj", - "--jwt-token", "fake-token", ]) assert rc == 0 cfg_file = config_dir / "slurm.yaml" assert cfg_file.exists() cfg = yaml.safe_load(cfg_file.read_text()) - assert cfg["rest_url"] == "http://localhost:6820" assert cfg["partition"] == "gpu" assert cfg["account"] == "proj" @@ -418,10 +411,8 @@ def test_init_refuses_overwrite(self, tmp_path: Path): from scripts.cli import _cmd_init rc = _cmd_init([ "slurm", - "--rest-url", "http://localhost:6820", "--partition", "gpu", "--account", "proj", - "--jwt-token", "tok", ]) assert rc != 0 # should refuse @@ -434,15 +425,13 @@ def test_init_force_overwrite(self, tmp_path: Path): from scripts.cli import _cmd_init rc = _cmd_init([ "slurm", - "--rest-url", "http://localhost:6820", "--partition", "gpu", "--account", "proj", - "--jwt-token", "tok", "--force", ]) assert rc == 0 cfg = yaml.safe_load((config_dir / "slurm.yaml").read_text()) - assert cfg["rest_url"] == "http://localhost:6820" + assert cfg["partition"] == "gpu" # ========================================================================= @@ -518,8 +507,6 @@ def test_submit_slurm_dry_run(self): "--collect", "perf", "--model-path", "Qwen/Qwen3-8B", "--slurm-partition", "gpu", - "--slurm-rest-url", "http://fake:6820", - "--slurm-jwt-token", "fake-token", "--dry-run", ) assert "#!/bin/bash" in out @@ -559,32 +546,12 @@ class TestConfig: def test_save_and_load_yaml(self, tmp_path: Path): from schedulers.config import _save_yaml, _load_yaml - data = {"rest_url": 
"http://localhost:6820", "partition": "gpu"} + data = {"partition": "gpu", "account": "proj"} path = tmp_path / "test.yaml" _save_yaml(path, data) loaded = _load_yaml(path) assert loaded == data - def test_resolve_jwt_token_static(self): - from schedulers.config import resolve_jwt_token - cfg = {"jwt_token": "my-secret"} - assert resolve_jwt_token(cfg) == "my-secret" - - def test_resolve_jwt_token_cmd(self): - from schedulers.config import resolve_jwt_token - cfg = {"jwt_token_cmd": "echo test-token-123"} - assert resolve_jwt_token(cfg) == "test-token-123" - - def test_resolve_jwt_token_bad_cmd(self): - from schedulers.config import resolve_jwt_token - cfg = {"jwt_token_cmd": "/nonexistent/binary"} - # Should not raise, just return empty - assert resolve_jwt_token(cfg) == "" - - def test_resolve_jwt_token_empty(self): - from schedulers.config import resolve_jwt_token - assert resolve_jwt_token({}) == "" - def test_cfg_get(self): from schedulers.config import cfg_get cfg = {"key": "value", "empty": ""} From 892eeacf80e07e9068c6f45a069ebc1902cb18b9 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Wed, 18 Mar 2026 22:47:16 +0000 Subject: [PATCH 31/56] review: normalize Slurm states, implement _logs_cli, dedup image check, add env var docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Normalize all Slurm job states (PENDING→Pending, RUNNING→Running, etc.) 
- Implement _logs_cli: read log file via tail, fallback to hints - Remove duplicate _check_image_exists() call from local.py render() - Add supported environment variables table to README - 56 unit tests pass --- schedulers/README.md | 14 ++++++++++++- schedulers/local.py | 1 - schedulers/slurm.py | 48 ++++++++++++++++++++++++++++++++++++-------- 3 files changed, 53 insertions(+), 10 deletions(-) diff --git a/schedulers/README.md b/schedulers/README.md index 48e6b0b..2ab2b0e 100644 --- a/schedulers/README.md +++ b/schedulers/README.md @@ -285,10 +285,22 @@ Config files are stored in `~/.flowsim/` and generated via `flowsim init`: Parameter priority (highest to lowest): 1. CLI flag (`--slurm-partition gpu`) -2. Environment variable (`FLOWSIM_SLURM_PARTITION=gpu`) +2. Environment variable (see table below) 3. Config file (`~/.flowsim/slurm.yaml`) 4. Built-in default +### Supported Environment Variables + +| Variable | Overrides | Example | +|----------|-----------|--------| +| `KUBECONFIG` | `--k8s-kubeconfig` | `/home/user/.kube/config` | +| `FLOWSIM_K8S_NAMESPACE` | `--k8s-namespace` | `ml-team` | +| `FLOWSIM_K8S_CONTEXT` | `--k8s-context` | `kind-flowsim` | +| `FLOWSIM_K8S_CONFIG` | Config file path | `/etc/flowsim/k8s.yaml` | +| `FLOWSIM_SLURM_PARTITION` | `--slurm-partition` | `gpu-h100` | +| `FLOWSIM_SLURM_TIME` | `--slurm-time` | `04:00:00` | +| `FLOWSIM_SLURM_CONFIG` | Config file path | `/etc/flowsim/slurm.yaml` | + ### Example k8s.yaml ```yaml diff --git a/schedulers/local.py b/schedulers/local.py index 4c61865..673acac 100644 --- a/schedulers/local.py +++ b/schedulers/local.py @@ -118,7 +118,6 @@ def _build_docker_cmd(self, spec: ProfileJobSpec) -> str: return " \\\n ".join(parts) def render(self, spec: ProfileJobSpec) -> str: - self._check_image_exists(spec.image) return self._build_docker_cmd(spec) def submit(self, spec: ProfileJobSpec) -> JobResult: diff --git a/schedulers/slurm.py b/schedulers/slurm.py index 67e954b..265a725 100644 --- 
a/schedulers/slurm.py +++ b/schedulers/slurm.py @@ -230,11 +230,20 @@ def _status_cli(self, job_id: str) -> dict: nodes = fields.get("NodeList", "") output_file = fields.get("StdOut", "") - # Normalize to match test expectations - if state == "COMPLETED": - state = "Completed" - elif state == "FAILED": - state = "Failed" + # Normalize Slurm uppercase states to capitalized format + _STATE_MAP = { + "PENDING": "Pending", + "RUNNING": "Running", + "SUSPENDED": "Suspended", + "COMPLETED": "Completed", + "CANCELLED": "Cancelled", + "FAILED": "Failed", + "TIMEOUT": "Timeout", + "NODE_FAIL": "Failed", + "PREEMPTED": "Preempted", + "OUT_OF_MEMORY": "Failed", + } + state = _STATE_MAP.get(state, state) msg_parts = [ f"Job ID: {job_id} Name: {name} State: {state}", @@ -250,10 +259,33 @@ def _status_cli(self, job_id: str) -> dict: } def _logs_cli(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: - # TODO: read actual Slurm log file (StdOut from scontrol) - # and support tail/follow properly. 
info = self._status_cli(job_id) - return info["message"] + output_file = info.get("output_hint", "") + + if not output_file: + return info["message"] + "\n(no log file path found)" + + # Try to read the log file via CLI prefix (handles remote Slurm) + if follow: + return ( + f"{info['message']}\n\n" + f"Follow logs:\n" + f" tail -f {output_file}" + ) + + r = self._cli_run("tail", f"-{tail}", output_file, timeout=15) + if r.returncode == 0 and r.stdout.strip(): + return r.stdout + + # Fallback: file may not exist yet or be on a remote node + return ( + f"{info['message']}\n\n" + f"Log file: {output_file}\n" + f"View on login node:\n" + f" tail -{tail} {output_file}\n" + f"Follow:\n" + f" tail -f {output_file}" + ) def _list_jobs_cli(self, *, status_filter: str = "") -> list[dict]: r = self._cli_run( From 73cedbed78c7df5ca83e8c1b5f1ea19c3d0e205a Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Wed, 18 Mar 2026 22:59:49 +0000 Subject: [PATCH 32/56] fix: remove --slurm-submit-via from integration tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Missed in REST removal — integration tests still passed the deleted CLI flag. 
All tests pass: - 56 unit tests - K8s integration (77s) - Slurm integration (76s) --- tests/integration/test_scheduler_local.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/integration/test_scheduler_local.py b/tests/integration/test_scheduler_local.py index 8815250..5f592a8 100644 --- a/tests/integration/test_scheduler_local.py +++ b/tests/integration/test_scheduler_local.py @@ -527,7 +527,6 @@ def test_slurm_real_submit(self, slurm_cluster): "--warmup-n", "2", "--gpus", "1", "--slurm-partition", "normal", - "--slurm-submit-via", "cli", "--slurm-cli-prefix", _SLURM_CLI_PREFIX, "--slurm-container-runtime", "none", "--output-dir", output_dir, @@ -556,7 +555,6 @@ def test_slurm_real_submit(self, slurm_cluster): r_status = _flowsim_cli( "status", "--scheduler", "slurm", "--job", job_id, - "--slurm-submit-via", "cli", "--slurm-cli-prefix", _SLURM_CLI_PREFIX, ) assert r_status.returncode == 0 @@ -590,7 +588,6 @@ def test_slurm_real_submit(self, slurm_cluster): _flowsim_cli( "cancel", "--scheduler", "slurm", "--job", job_id, - "--slurm-submit-via", "cli", "--slurm-cli-prefix", _SLURM_CLI_PREFIX, ) From 0a30f7fc446141325131e5cbe3dd13588057fbb5 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Wed, 18 Mar 2026 23:48:45 +0000 Subject: [PATCH 33/56] refactor: move test infra to tests/integration/infra/, delete unused templates - Move dev-setup.sh, dev-teardown.sh, slurm-compose.yaml, slurm-node.dockerfile, kind-multi-node.yaml, slurm.conf, cgroup.conf, gres.conf from dockerfiles/ to tests/integration/infra/ - Delete schedulers/templates/ (unused by code; flowsim init generates config directly from CLI args) - Update all path references in README, config.py, test files, and shell script comments - dockerfiles/ now contains only cuda12.6.dockerfile (app image) --- schedulers/README.md | 6 ++-- schedulers/config.py | 4 +-- schedulers/templates/k8s.yaml | 28 ------------------- schedulers/templates/slurm.yaml | 23 --------------- .../integration/infra}/cgroup.conf 
| 0 .../integration/infra}/dev-setup.sh | 10 +++---- .../integration/infra}/dev-teardown.sh | 6 ++-- .../integration/infra}/gres.conf | 0 .../integration/infra}/kind-multi-node.yaml | 4 +-- .../integration/infra}/slurm-compose.yaml | 4 ++- .../integration/infra}/slurm-node.dockerfile | 0 .../integration/infra}/slurm.conf | 0 tests/integration/test_scheduler_local.py | 10 +++---- 13 files changed, 23 insertions(+), 72 deletions(-) delete mode 100644 schedulers/templates/k8s.yaml delete mode 100644 schedulers/templates/slurm.yaml rename {dockerfiles => tests/integration/infra}/cgroup.conf (100%) rename {dockerfiles => tests/integration/infra}/dev-setup.sh (97%) rename {dockerfiles => tests/integration/infra}/dev-teardown.sh (86%) rename {dockerfiles => tests/integration/infra}/gres.conf (100%) rename {dockerfiles => tests/integration/infra}/kind-multi-node.yaml (91%) rename {dockerfiles => tests/integration/infra}/slurm-compose.yaml (96%) rename {dockerfiles => tests/integration/infra}/slurm-node.dockerfile (100%) rename {dockerfiles => tests/integration/infra}/slurm.conf (100%) diff --git a/schedulers/README.md b/schedulers/README.md index 2ab2b0e..5b1ebf5 100644 --- a/schedulers/README.md +++ b/schedulers/README.md @@ -167,13 +167,13 @@ flowsim submit --scheduler k8s ... 
--dry-run ```bash # Start a Kind cluster (GPU passthrough + CDI mode) -bash dockerfiles/dev-setup.sh kind +bash tests/integration/infra/dev-setup.sh kind # Run K8s integration tests python -m pytest tests/integration/test_scheduler_local.py::TestK8sScheduler -v -x # Teardown -bash dockerfiles/dev-teardown.sh kind +bash tests/integration/infra/dev-teardown.sh kind ``` --- @@ -258,7 +258,7 @@ If Slurm commands are not on the local PATH, use `--slurm-cli-prefix` to specify ```bash # Start Slurm cluster (slurmctld + 1 compute node + 1 GPU) -cd dockerfiles/ +cd tests/integration/infra/ docker compose -f slurm-compose.yaml up -d # Check cluster status diff --git a/schedulers/config.py b/schedulers/config.py index 723dfc2..18ab55e 100644 --- a/schedulers/config.py +++ b/schedulers/config.py @@ -13,8 +13,8 @@ Priority (highest → lowest): CLI flag > env var > config file > built-in default -Template files are in ``schedulers/templates/k8s.yaml`` and -``schedulers/templates/slurm.yaml``. Copy to ``~/.flowsim/`` and edit. +Run ``flowsim init k8s`` or ``flowsim init slurm`` to generate +a config under ``~/.flowsim/``. """ from __future__ import annotations diff --git a/schedulers/templates/k8s.yaml b/schedulers/templates/k8s.yaml deleted file mode 100644 index 2adb927..0000000 --- a/schedulers/templates/k8s.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# FlowSim Kubernetes scheduler config -# -# Created by: flowsim init -# Location: ~/.flowsim/k8s.yaml -# -# Fill in the values below, then submit with: -# flowsim submit --scheduler k8s --collect perf --model-path ... -# -# CLI flags and env vars can override individual values. - -# REQUIRED — path to your kubeconfig file -kubeconfig: "" # e.g. /home/me/.kube/prod.kubeconfig - -# REQUIRED — which context and namespace to use -context: "" # e.g. prod-cluster (empty = current-context) -namespace: "" # e.g. 
ml-team - -# Output storage (pick one or leave both empty for emptyDir) -pvc: "" # PVC name for trace output -host_output_dir: "" # hostPath alternative to PVC - -# Optional -service_account: "" -shm_size: "16Gi" -runtime_class_name: "" # e.g. "nvidia" for CDI-based GPU (Kind clusters) -# node_selector: -# gpu: a100 -# tier: high diff --git a/schedulers/templates/slurm.yaml b/schedulers/templates/slurm.yaml deleted file mode 100644 index b4d77a1..0000000 --- a/schedulers/templates/slurm.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# FlowSim Slurm scheduler config -# -# Created by: flowsim init -# Location: ~/.flowsim/slurm.yaml -# -# Fill in the values below, then submit with: -# flowsim submit --scheduler slurm --collect perf --model-path ... -# -# CLI flags and env vars can override individual values. - -# REQUIRED — cluster settings -partition: "" # e.g. gpu-h100 -account: "" # e.g. my-project - -# Optional -cli_prefix: "" # e.g. "docker exec -i slurmctld" -time: "02:00:00" -constraint: "" -container_runtime: "none" # docker | enroot | none -container_mounts: "" -# modules: -# - cuda/12.6 -# - anaconda3 diff --git a/dockerfiles/cgroup.conf b/tests/integration/infra/cgroup.conf similarity index 100% rename from dockerfiles/cgroup.conf rename to tests/integration/infra/cgroup.conf diff --git a/dockerfiles/dev-setup.sh b/tests/integration/infra/dev-setup.sh similarity index 97% rename from dockerfiles/dev-setup.sh rename to tests/integration/infra/dev-setup.sh index 7cefe05..afbb9f7 100755 --- a/dockerfiles/dev-setup.sh +++ b/tests/integration/infra/dev-setup.sh @@ -2,12 +2,12 @@ # dev-setup.sh — one-shot setup for FlowSim test clusters (kind + Slurm) # # Usage: -# ./dockerfiles/dev-setup.sh # setup both kind + slurm -# ./dockerfiles/dev-setup.sh kind # kind only -# ./dockerfiles/dev-setup.sh slurm # slurm only +# ./tests/integration/infra/dev-setup.sh # setup both kind + slurm +# ./tests/integration/infra/dev-setup.sh kind # kind only +# 
./tests/integration/infra/dev-setup.sh slurm # slurm only # # Teardown: -# ./dockerfiles/dev-teardown.sh +# ./tests/integration/infra/dev-teardown.sh set -euo pipefail @@ -355,4 +355,4 @@ case "${target}" in esac echo -log "All done. Teardown with: ./dockerfiles/dev-teardown.sh" +log "All done. Teardown with: ./tests/integration/infra/dev-teardown.sh" diff --git a/dockerfiles/dev-teardown.sh b/tests/integration/infra/dev-teardown.sh similarity index 86% rename from dockerfiles/dev-teardown.sh rename to tests/integration/infra/dev-teardown.sh index dfb1c01..c5e74ee 100755 --- a/dockerfiles/dev-teardown.sh +++ b/tests/integration/infra/dev-teardown.sh @@ -2,9 +2,9 @@ # dev-teardown.sh — tear down FlowSim test clusters # # Usage: -# ./dockerfiles/dev-teardown.sh # teardown both -# ./dockerfiles/dev-teardown.sh kind # kind only -# ./dockerfiles/dev-teardown.sh slurm # slurm only +# ./tests/integration/infra/dev-teardown.sh # teardown both +# ./tests/integration/infra/dev-teardown.sh kind # kind only +# ./tests/integration/infra/dev-teardown.sh slurm # slurm only set -euo pipefail diff --git a/dockerfiles/gres.conf b/tests/integration/infra/gres.conf similarity index 100% rename from dockerfiles/gres.conf rename to tests/integration/infra/gres.conf diff --git a/dockerfiles/kind-multi-node.yaml b/tests/integration/infra/kind-multi-node.yaml similarity index 91% rename from dockerfiles/kind-multi-node.yaml rename to tests/integration/infra/kind-multi-node.yaml index ddb8cd2..90b4e6f 100644 --- a/dockerfiles/kind-multi-node.yaml +++ b/tests/integration/infra/kind-multi-node.yaml @@ -13,10 +13,10 @@ # - kind, kubectl, helm # # Usage: -# ./dockerfiles/dev-setup.sh kind +# ./tests/integration/infra/dev-setup.sh kind # # Teardown: -# ./dockerfiles/dev-teardown.sh kind +# ./tests/integration/infra/dev-teardown.sh kind kind: Cluster apiVersion: kind.x-k8s.io/v1alpha4 diff --git a/dockerfiles/slurm-compose.yaml b/tests/integration/infra/slurm-compose.yaml similarity index 96% 
rename from dockerfiles/slurm-compose.yaml rename to tests/integration/infra/slurm-compose.yaml index ee94656..c2369ba 100644 --- a/dockerfiles/slurm-compose.yaml +++ b/tests/integration/infra/slurm-compose.yaml @@ -1,7 +1,7 @@ # Slurm test cluster — slurmctld + 2 compute nodes (GPU 0, GPU 1) + slurmrestd # # Usage: -# cd dockerfiles/ +# cd tests/integration/infra/ # docker compose -f slurm-compose.yaml up -d # # # Wait for cluster to be ready (~30s) @@ -22,6 +22,8 @@ # # # Teardown # docker compose -f slurm-compose.yaml down -v +# # Or from project root: +# docker compose -f tests/integration/infra/slurm-compose.yaml down -v x-slurm-base: &slurm-base build: diff --git a/dockerfiles/slurm-node.dockerfile b/tests/integration/infra/slurm-node.dockerfile similarity index 100% rename from dockerfiles/slurm-node.dockerfile rename to tests/integration/infra/slurm-node.dockerfile diff --git a/dockerfiles/slurm.conf b/tests/integration/infra/slurm.conf similarity index 100% rename from dockerfiles/slurm.conf rename to tests/integration/infra/slurm.conf diff --git a/tests/integration/test_scheduler_local.py b/tests/integration/test_scheduler_local.py index 5f592a8..a2086f1 100644 --- a/tests/integration/test_scheduler_local.py +++ b/tests/integration/test_scheduler_local.py @@ -17,7 +17,7 @@ * Docker with ``flowsim-image:latest`` built (for local tests). * A GPU-equipped host machine (local tests run on the physical host, NOT inside a Docker container). -* ``dockerfiles/dev-setup.sh`` available (Kind and Slurm clusters are +* ``tests/integration/infra/dev-setup.sh`` available (Kind and Slurm clusters are automatically created if missing). * ``schedulers/`` available on PYTHONPATH. 
@@ -58,8 +58,8 @@ _PROJECT_ROOT = os.path.abspath( os.path.join(os.path.dirname(__file__), "..", "..") ) -_DEV_SETUP = os.path.join(_PROJECT_ROOT, "dockerfiles", "dev-setup.sh") -_DEV_TEARDOWN = os.path.join(_PROJECT_ROOT, "dockerfiles", "dev-teardown.sh") +_DEV_SETUP = os.path.join(_PROJECT_ROOT, "tests", "integration", "infra", "dev-setup.sh") +_DEV_TEARDOWN = os.path.join(_PROJECT_ROOT, "tests", "integration", "infra", "dev-teardown.sh") MODEL = os.environ.get( "MODEL", "workload/models/configs/Qwen3-235B-A22B" @@ -321,7 +321,7 @@ def test_local_tp1_all(self, point): # ===================================================================== def _run_dev_setup(target: str) -> None: - """Run ``dockerfiles/dev-setup.sh `` and assert success.""" + """Run ``tests/integration/infra/dev-setup.sh `` and assert success.""" r = subprocess.run( ["bash", _DEV_SETUP, target], capture_output=True, text=True, cwd=_PROJECT_ROOT, timeout=300, @@ -334,7 +334,7 @@ def _run_dev_setup(target: str) -> None: def _run_dev_teardown(target: str) -> None: - """Run ``dockerfiles/dev-teardown.sh ``.""" + """Run ``tests/integration/infra/dev-teardown.sh ``.""" subprocess.run( ["bash", _DEV_TEARDOWN, target], capture_output=True, text=True, cwd=_PROJECT_ROOT, timeout=120, From 7831272a824f99c6456a0bedc13b22922272f481 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Wed, 18 Mar 2026 23:53:44 +0000 Subject: [PATCH 34/56] remove untested PD disaggregation code - Remove disagg_mode, disagg_transfer_backend, disagg_bootstrap_port, disagg_prefill_pp, disagg_ib_device fields from ProfileJobSpec - Remove as_prefill(), as_decode(), render_pd_pair(), submit_pd_pair() - Remove --pd, --disagg-* CLI args from submit_profile.py - Remove PD branch from main() submit/dry-run logic - Remove 8 PD-related unit tests - Remove PD Disaggregation section from README - 48 unit tests pass --- schedulers/README.md | 16 --------- schedulers/base.py | 39 --------------------- scripts/submit_profile.py | 53 
++-------------------------- tests/unit/test_scheduler_cli.py | 59 -------------------------------- 4 files changed, 3 insertions(+), 164 deletions(-) diff --git a/schedulers/README.md b/schedulers/README.md index 5b1ebf5..6069f2c 100644 --- a/schedulers/README.md +++ b/schedulers/README.md @@ -343,19 +343,3 @@ stage_traces/{scheduler}/{YYYYMMDD_HHMMSS}/ └── sweep_summary.json ``` ---- - -## PD Disaggregation (Experimental) - -Supports Prefill-Decode disaggregated deployment: - -```bash -flowsim submit --scheduler k8s \ - --pd \ - --collect perf \ - --model-path Qwen/Qwen3-235B-A22B-FP8 \ - --tp 4 --gpus 8 \ - --disagg-transfer-backend mooncake -``` - -This generates two Jobs: one prefill instance and one decode instance. diff --git a/schedulers/base.py b/schedulers/base.py index 0641f41..a47ac1f 100644 --- a/schedulers/base.py +++ b/schedulers/base.py @@ -48,13 +48,6 @@ class ProfileJobSpec: output_dir: str = "/flowsim/stage_traces" job_name: str = "" - # -- PD disaggregation -- - disagg_mode: str = "" # "prefill", "decode", or "" (unified) - disagg_transfer_backend: str = "mooncake" # "mooncake" or "nixl" - disagg_bootstrap_port: int = 8998 - disagg_prefill_pp: int = 1 - disagg_ib_device: str = "" - # -- Sweep: explicit list of (bs, input_len, existing_ctx) tuples -- sweep_points: list[tuple[int, int, int]] = field(default_factory=list) @@ -71,14 +64,6 @@ def build_server_opts(self) -> str: ] if self.dp > 1: parts.append(f"--dp {self.dp}") - if self.disagg_mode: - parts.append(f"--disaggregation-mode {self.disagg_mode}") - parts.append(f"--disaggregation-transfer-backend {self.disagg_transfer_backend}") - parts.append(f"--disaggregation-bootstrap-port {self.disagg_bootstrap_port}") - if self.disagg_prefill_pp > 1: - parts.append(f"--disaggregation-prefill-pp {self.disagg_prefill_pp}") - if self.disagg_ib_device: - parts.append(f"--disaggregation-ib-device {self.disagg_ib_device}") if self.extra_server_opts: parts.append(self.extra_server_opts) return " 
".join(parts) @@ -149,20 +134,8 @@ def default_job_name(self) -> str: name = f"flowsim-{self.collect}-{model_short}-sweep{len(self.sweep_points)}pt" else: name = f"flowsim-{self.collect}-{model_short}-bs{self.bs}-il{self.input_len}" - if self.disagg_mode: - name += f"-{self.disagg_mode}" return name - def as_prefill(self) -> "ProfileJobSpec": - """Return a copy configured as the prefill instance.""" - from dataclasses import replace - return replace(self, disagg_mode="prefill") - - def as_decode(self) -> "ProfileJobSpec": - """Return a copy configured as the decode instance.""" - from dataclasses import replace - return replace(self, disagg_mode="decode") - class BaseScheduler(abc.ABC): """Abstract scheduler backend.""" @@ -225,15 +198,3 @@ def list_jobs(self, *, status_filter: str = "") -> list[dict]: def dry_run(self, spec: ProfileJobSpec) -> str: """Render and return the manifest without submitting.""" return self.render(spec) - - def render_pd_pair(self, spec: ProfileJobSpec) -> str: - """Render both prefill and decode manifests for PD disaggregation.""" - prefill = self.render(spec.as_prefill()) - decode = self.render(spec.as_decode()) - return f"# === PREFILL INSTANCE ===\n{prefill}\n# === DECODE INSTANCE ===\n{decode}" - - def submit_pd_pair(self, spec: ProfileJobSpec) -> list[JobResult]: - """Submit both prefill and decode jobs.""" - r1 = self.submit(spec.as_prefill()) - r2 = self.submit(spec.as_decode()) - return [r1, r2] diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 150116e..747b9b3 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -146,37 +146,6 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: help="[debug] Print rendered manifest without submitting", ) - # -- PD disaggregation -- - pd = p.add_argument_group("PD disaggregation") - pd.add_argument( - "--pd", - action="store_true", - help="Submit a prefill + decode job pair (PD disaggregation)", - ) - pd.add_argument( - 
"--disagg-transfer-backend", - default="mooncake", - choices=["mooncake", "nixl"], - help="KV transfer backend (default: mooncake)", - ) - pd.add_argument( - "--disagg-bootstrap-port", - type=int, - default=8998, - help="Bootstrap port for PD coordination (default: 8998)", - ) - pd.add_argument( - "--disagg-prefill-pp", - type=int, - default=1, - help="Pipeline parallelism for prefill instance (default: 1)", - ) - pd.add_argument( - "--disagg-ib-device", - default="", - help="InfiniBand device for RDMA transfer", - ) - # ---- Two-pass: peek at --scheduler, then add only relevant args ---- # Use a minimal pre-parser to avoid required-arg errors during peek. _pre = argparse.ArgumentParser(add_help=False) @@ -332,10 +301,6 @@ def _build_spec(args: argparse.Namespace) -> ProfileJobSpec: output_dir=args.output_dir, job_name=args.job_name, extra_server_opts=args.extra_server_opts, - disagg_transfer_backend=args.disagg_transfer_backend, - disagg_bootstrap_port=args.disagg_bootstrap_port, - disagg_prefill_pp=args.disagg_prefill_pp, - disagg_ib_device=args.disagg_ib_device, sweep_points=sweep_points, ) @@ -407,23 +372,11 @@ def main(argv: list[str] | None = None) -> None: spec = _build_spec(args) scheduler = _build_scheduler(args) - is_pd = args.pd - if args.dry_run: - if is_pd: - print(scheduler.render_pd_pair(spec)) - else: - print(scheduler.dry_run(spec)) + print(scheduler.dry_run(spec)) else: - if is_pd: - results = scheduler.submit_pd_pair(spec) - for r in results: - print(r.message) - # Use the first result for follow-up hints - result = results[0] - else: - result = scheduler.submit(spec) - print(result.message) + result = scheduler.submit(spec) + print(result.message) # Tell user where to find results print() diff --git a/tests/unit/test_scheduler_cli.py b/tests/unit/test_scheduler_cli.py index 9968ea1..0bdfe9d 100644 --- a/tests/unit/test_scheduler_cli.py +++ b/tests/unit/test_scheduler_cli.py @@ -41,33 +41,15 @@ def test_custom_job_name(self, spec: 
ProfileJobSpec): spec.job_name = "my-job" assert spec.default_job_name() == "my-job" - def test_job_name_disagg_suffix(self, spec: ProfileJobSpec): - spec.disagg_mode = "prefill" - assert spec.default_job_name().endswith("-prefill") - def test_build_server_opts_basic(self, spec: ProfileJobSpec): opts = spec.build_server_opts() assert "--model-path Qwen/Qwen3-8B" in opts assert "--tp 2" in opts - assert "--disaggregation" not in opts def test_build_server_opts_dp(self, spec: ProfileJobSpec): spec.dp = 4 assert "--dp 4" in spec.build_server_opts() - def test_build_server_opts_disagg(self, spec: ProfileJobSpec): - spec.disagg_mode = "prefill" - spec.disagg_transfer_backend = "nixl" - opts = spec.build_server_opts() - assert "--disaggregation-mode prefill" in opts - assert "--disaggregation-transfer-backend nixl" in opts - assert "--disaggregation-bootstrap-port 8998" in opts - - def test_build_server_opts_disagg_pp(self, spec: ProfileJobSpec): - spec.disagg_mode = "prefill" - spec.disagg_prefill_pp = 2 - assert "--disaggregation-prefill-pp 2" in spec.build_server_opts() - def test_build_server_opts_extra(self, spec: ProfileJobSpec): spec.extra_server_opts = "--some-flag" assert "--some-flag" in spec.build_server_opts() @@ -86,16 +68,6 @@ def test_build_shell_command_quotes_server_opts(self, spec: ProfileJobSpec): # server-opts contains spaces, must be quoted assert "--server-opts '" in shell or '--server-opts "' in shell - def test_as_prefill(self, spec: ProfileJobSpec): - p = spec.as_prefill() - assert p.disagg_mode == "prefill" - assert spec.disagg_mode == "" # original unchanged - - def test_as_decode(self, spec: ProfileJobSpec): - d = spec.as_decode() - assert d.disagg_mode == "decode" - assert spec.disagg_mode == "" - # ========================================================================= # K8sScheduler.render @@ -176,14 +148,6 @@ def test_render_labels(self, scheduler, spec): assert labels["app"] == "flowsim" assert labels["collect"] == "perf" - def 
test_render_pd_pair(self, scheduler, spec): - output = scheduler.render_pd_pair(spec) - assert "PREFILL INSTANCE" in output - assert "DECODE INSTANCE" in output - # Both should be valid YAML docs - docs = output.split("# === DECODE INSTANCE ===") - assert len(docs) == 2 - # ========================================================================= # SlurmScheduler.render @@ -512,29 +476,6 @@ def test_submit_slurm_dry_run(self): assert "#!/bin/bash" in out assert "#SBATCH --partition=gpu" in out - def test_submit_pd_dry_run(self): - out = self._run( - "--scheduler", "local", - "--collect", "perf", - "--model-path", "Qwen/Qwen3-8B", - "--pd", - "--dry-run", - ) - assert "PREFILL INSTANCE" in out - assert "DECODE INSTANCE" in out - assert "--disaggregation-mode prefill" in out - assert "--disaggregation-mode decode" in out - - def test_submit_pd_nixl_backend(self): - out = self._run( - "--scheduler", "local", - "--collect", "perf", - "--model-path", "Qwen/Qwen3-8B", - "--pd", - "--disagg-transfer-backend", "nixl", - "--dry-run", - ) - assert "--disaggregation-transfer-backend nixl" in out # ========================================================================= From b6dbbbb12c38cb4ec4436dc242cf69730c531dbe Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Thu, 19 Mar 2026 00:47:46 +0000 Subject: [PATCH 35/56] simplify flowsim init: write annotated template instead of argparse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - flowsim init k8s → writes commented k8s.yaml template to ~/.flowsim/ - flowsim init slurm → writes commented slurm.yaml template - Users edit the file directly (comments explain each field) - Removed ~60 lines of argparse init code - Kept --force overwrite logic - Updated README examples and tests (43 pass) --- schedulers/README.md | 22 ++-- scripts/cli.py | 167 +++++++++++++++---------------- tests/unit/test_scheduler_cli.py | 97 +++++------------- 3 files changed, 114 insertions(+), 172 deletions(-) 
diff --git a/schedulers/README.md b/schedulers/README.md index 6069f2c..520559b 100644 --- a/schedulers/README.md +++ b/schedulers/README.md @@ -110,16 +110,11 @@ Submits profiling jobs as Kubernetes Jobs to a cluster. Supports both PVC and ho ### First-Time Setup ```bash -flowsim init k8s \ - --kubeconfig ~/.kube/config \ - --namespace default \ - --host-output-dir /host-stage-traces \ - --runtime-class-name nvidia \ - --force +# Generate an annotated config template +flowsim init k8s +# Edit ~/.flowsim/k8s.yaml with your cluster details ``` -Config is saved to `~/.flowsim/k8s.yaml` and automatically loaded on subsequent submissions. - ### Usage ```bash @@ -185,11 +180,9 @@ Generates sbatch scripts and submits them to a Slurm cluster via `sbatch`/`squeu ### First-Time Setup ```bash -flowsim init slurm \ - --partition gpu \ - --account my-project \ - --container-runtime none \ - --force +# Generate an annotated config template +flowsim init slurm +# Edit ~/.flowsim/slurm.yaml with your cluster details ``` ### Usage @@ -275,7 +268,8 @@ docker compose -f slurm-compose.yaml down -v ## Configuration -Config files are stored in `~/.flowsim/` and generated via `flowsim init`: +Config files are stored in `~/.flowsim/` and generated via `flowsim init`. +Templates include comments explaining each field — edit to match your cluster: ``` ~/.flowsim/ diff --git a/scripts/cli.py b/scripts/cli.py index 00409fb..78a3912 100644 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -2,8 +2,8 @@ Usage:: - flowsim init k8s --kubeconfig ~/.kube/config --namespace ml-team ... - flowsim init slurm --partition gpu --account proj ... + flowsim init k8s # create ~/.flowsim/k8s.yaml template + flowsim init slurm # create ~/.flowsim/slurm.yaml template flowsim submit --scheduler k8s --collect perf --model-path ... flowsim submit ... 
--dry-run # debug: preview manifest """ @@ -17,110 +17,103 @@ _CONFIG_DIR = Path.home() / ".flowsim" +# ---- Annotated config templates (written by `flowsim init`) ---- -def _init_k8s_parser(sub: argparse._SubParsersAction) -> None: - p = sub.add_parser("k8s", help="Configure Kubernetes scheduler") - p.add_argument("--kubeconfig", required=True, - help="Path to kubeconfig file (REQUIRED)") - p.add_argument("--context", default="", - help="Kubeconfig context (empty = current-context)") - p.add_argument("--namespace", required=True, - help="Kubernetes namespace (REQUIRED)") - p.add_argument("--pvc", default="", - help="PVC name for trace output") - p.add_argument("--host-output-dir", default="", - help="hostPath alternative to PVC") - p.add_argument("--service-account", default="", - help="Service account for the job pod") - p.add_argument("--shm-size", default="16Gi", - help="Shared memory size (default: 16Gi)") - p.add_argument("--runtime-class-name", default="", - help="RuntimeClass for pod (e.g. 'nvidia' for CDI mode)") - p.add_argument("--force", action="store_true", - help="Overwrite existing config file") - - -def _init_slurm_parser(sub: argparse._SubParsersAction) -> None: - p = sub.add_parser("slurm", help="Configure Slurm scheduler") - p.add_argument("--partition", required=True, - help="Slurm partition (REQUIRED)") - p.add_argument("--account", default="", - help="Slurm account") - p.add_argument("--cli-prefix", default="", - help='CLI mode prefix, e.g. 
"docker exec -i slurmctld"') - p.add_argument("--time", default="02:00:00", - help="Job time limit (default: 02:00:00)") - p.add_argument("--constraint", default="", - help="Node constraint") - p.add_argument("--container-runtime", default="none", - choices=["docker", "enroot", "none"], - help="Container runtime (default: none)") - p.add_argument("--container-mounts", default="", - help="Container mount spec") - p.add_argument("--force", action="store_true", - help="Overwrite existing config file") +_K8S_TEMPLATE = """\ +# FlowSim Kubernetes scheduler config +# Edit this file, then run: flowsim submit --scheduler k8s ... +# Path to kubeconfig file (required) +kubeconfig: ~/.kube/config -def _cmd_init(argv: list[str]) -> int: - """Save scheduler config to ~/.flowsim/ from CLI args.""" - from schedulers.config import _save_yaml +# Kubeconfig context (empty = current-context) +context: "" + +# Kubernetes namespace (required) +namespace: default + +# Persistent storage for trace output (set one): +# pvc: my-traces-pvc +# host_output_dir: /data/flowsim-traces +pvc: "" +host_output_dir: "" + +# Service account for the job pod (empty = default) +service_account: "" + +# Shared memory size (for /dev/shm in the pod) +shm_size: "16Gi" + +# RuntimeClass (e.g. "nvidia" for CDI GPU passthrough) +runtime_class_name: "" +""" + +_SLURM_TEMPLATE = """\ +# FlowSim Slurm scheduler config +# Edit this file, then run: flowsim submit --scheduler slurm ... + +# Slurm partition (required) +partition: gpu + +# Billing account (empty = default) +account: "" + +# Job time limit +time: "02:00:00" + +# Node constraint (e.g. 
"h100") +constraint: "" +# CLI prefix for remote sbatch/squeue/scancel +# Examples: +# "docker exec -i slurmctld" (via Docker container) +# "ssh login-node" (via SSH) +cli_prefix: "" + +# Container runtime: docker | enroot | none +container_runtime: none + +# Container mount spec (for enroot/docker) +container_mounts: "" +""" + + +def _cmd_init(argv: list[str]) -> int: + """Copy an annotated config template to ~/.flowsim/.""" parser = argparse.ArgumentParser( prog="flowsim init", description=( - "Configure a scheduler and save to ~/.flowsim/.\n\n" + "Generate a scheduler config template under ~/.flowsim/.\n\n" "Examples:\n" - " flowsim init k8s --kubeconfig ~/.kube/config --namespace ml-team\n" - " flowsim init slurm --partition gpu --account proj" + " flowsim init k8s # creates ~/.flowsim/k8s.yaml\n" + " flowsim init slurm # creates ~/.flowsim/slurm.yaml\n" + " flowsim init slurm --force # overwrite existing" ), formatter_class=argparse.RawDescriptionHelpFormatter, ) - sub = parser.add_subparsers(dest="scheduler") - sub.required = True - _init_k8s_parser(sub) - _init_slurm_parser(sub) - + parser.add_argument( + "scheduler", choices=["k8s", "slurm"], + help="Scheduler type", + ) + parser.add_argument( + "--force", action="store_true", + help="Overwrite existing config file", + ) args = parser.parse_args(argv) - if args.scheduler == "k8s": - kube_path = Path(args.kubeconfig).expanduser() - if not kube_path.is_file(): - print(f"Error: kubeconfig not found: {kube_path}", file=sys.stderr) - return 1 - cfg = { - "kubeconfig": str(kube_path), - "context": args.context, - "namespace": args.namespace, - "pvc": args.pvc, - "host_output_dir": args.host_output_dir, - "service_account": args.service_account, - "shm_size": args.shm_size, - "runtime_class_name": args.runtime_class_name, - } - dst = _CONFIG_DIR / "k8s.yaml" - - elif args.scheduler == "slurm": - cfg = { - "cli_prefix": args.cli_prefix, - "partition": args.partition, - "account": args.account, - "time": args.time, 
- "constraint": args.constraint, - "container_runtime": args.container_runtime, - "container_mounts": args.container_mounts, - } - dst = _CONFIG_DIR / "slurm.yaml" - else: - parser.print_help() - return 1 + templates = {"k8s": _K8S_TEMPLATE, "slurm": _SLURM_TEMPLATE} + dst = _CONFIG_DIR / f"{args.scheduler}.yaml" if dst.exists() and not args.force: print(f"Error: {dst} already exists (use --force to overwrite)", file=sys.stderr) return 1 - _save_yaml(dst, cfg) - print(f"Saved {dst}") + _CONFIG_DIR.mkdir(parents=True, exist_ok=True) + dst.write_text(templates[args.scheduler]) + print(f"Created {dst}") + print("Edit the file, then run: flowsim submit --scheduler " + f"{args.scheduler} ...") return 0 diff --git a/tests/unit/test_scheduler_cli.py b/tests/unit/test_scheduler_cli.py index 0bdfe9d..5e07fca 100644 --- a/tests/unit/test_scheduler_cli.py +++ b/tests/unit/test_scheduler_cli.py @@ -295,76 +295,39 @@ def test_init_no_args_shows_help(self, capsys): _cmd_init([]) assert exc_info.value.code != 0 - def test_init_k8s_help(self, capsys): - from scripts.cli import _cmd_init - with pytest.raises(SystemExit) as exc_info: - _cmd_init(["k8s", "--help"]) - assert exc_info.value.code == 0 - out = capsys.readouterr().out - assert "--kubeconfig" in out - assert "--namespace" in out - - def test_init_slurm_help(self, capsys): - from scripts.cli import _cmd_init - with pytest.raises(SystemExit) as exc_info: - _cmd_init(["slurm", "--help"]) - assert exc_info.value.code == 0 - out = capsys.readouterr().out - assert "--cli-prefix" in out - assert "--partition" in out - - def test_init_k8s_missing_required(self): - from scripts.cli import _cmd_init - with pytest.raises(SystemExit) as exc_info: - _cmd_init(["k8s"]) - assert exc_info.value.code != 0 - - def test_init_slurm_missing_required(self): - from scripts.cli import _cmd_init - with pytest.raises(SystemExit) as exc_info: - _cmd_init(["slurm"]) - assert exc_info.value.code != 0 - - def test_init_k8s_bad_kubeconfig(self): - from 
scripts.cli import _cmd_init - rc = _cmd_init(["k8s", "--kubeconfig", "/nonexistent/path", "--namespace", "ns"]) - assert rc != 0 - - def test_init_k8s_saves_config(self, tmp_path: Path): - # Create a fake kubeconfig - kube = tmp_path / "kubeconfig" - kube.write_text("apiVersion: v1\nclusters: []\n") - + def test_init_k8s_creates_template(self, tmp_path: Path): config_dir = tmp_path / "flowsim" with mock.patch("scripts.cli._CONFIG_DIR", config_dir): from scripts.cli import _cmd_init - rc = _cmd_init([ - "k8s", - "--kubeconfig", str(kube), - "--namespace", "test-ns", - ]) + rc = _cmd_init(["k8s"]) assert rc == 0 cfg_file = config_dir / "k8s.yaml" assert cfg_file.exists() - cfg = yaml.safe_load(cfg_file.read_text()) - assert cfg["namespace"] == "test-ns" - assert cfg["kubeconfig"] == str(kube) - - def test_init_slurm_saves_config(self, tmp_path: Path): + content = cfg_file.read_text() + assert "kubeconfig:" in content + assert "namespace:" in content + # Template should have comments + assert content.startswith("#") + # Should be valid YAML + cfg = yaml.safe_load(content) + assert "kubeconfig" in cfg + assert "namespace" in cfg + + def test_init_slurm_creates_template(self, tmp_path: Path): config_dir = tmp_path / "flowsim" with mock.patch("scripts.cli._CONFIG_DIR", config_dir): from scripts.cli import _cmd_init - rc = _cmd_init([ - "slurm", - "--partition", "gpu", - "--account", "proj", - ]) + rc = _cmd_init(["slurm"]) assert rc == 0 cfg_file = config_dir / "slurm.yaml" assert cfg_file.exists() - cfg = yaml.safe_load(cfg_file.read_text()) - assert cfg["partition"] == "gpu" - assert cfg["account"] == "proj" + content = cfg_file.read_text() + assert "partition:" in content + assert "cli_prefix:" in content + # Template should have comments + assert content.startswith("#") + cfg = yaml.safe_load(content) + assert "partition" in cfg def test_init_refuses_overwrite(self, tmp_path: Path): config_dir = tmp_path / "flowsim" @@ -373,11 +336,7 @@ def 
test_init_refuses_overwrite(self, tmp_path: Path): with mock.patch("scripts.cli._CONFIG_DIR", config_dir): from scripts.cli import _cmd_init - rc = _cmd_init([ - "slurm", - "--partition", "gpu", - "--account", "proj", - ]) + rc = _cmd_init(["slurm"]) assert rc != 0 # should refuse def test_init_force_overwrite(self, tmp_path: Path): @@ -387,15 +346,11 @@ def test_init_force_overwrite(self, tmp_path: Path): with mock.patch("scripts.cli._CONFIG_DIR", config_dir): from scripts.cli import _cmd_init - rc = _cmd_init([ - "slurm", - "--partition", "gpu", - "--account", "proj", - "--force", - ]) + rc = _cmd_init(["slurm", "--force"]) assert rc == 0 - cfg = yaml.safe_load((config_dir / "slurm.yaml").read_text()) - assert cfg["partition"] == "gpu" + content = (config_dir / "slurm.yaml").read_text() + assert "partition:" in content + assert "existing" not in content # ========================================================================= From 95028db75328b335e72ab0329c2f1817ca912450 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Thu, 19 Mar 2026 00:52:11 +0000 Subject: [PATCH 36/56] add --config flag to flowsim init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - flowsim init k8s --config my.yaml → installs user file to ~/.flowsim/ - flowsim init k8s → writes annotated template (unchanged) - Added 2 tests: config copy + missing file error --- scripts/cli.py | 38 +++++++++++++++++++++++--------- tests/unit/test_scheduler_cli.py | 20 +++++++++++++++++ 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/scripts/cli.py b/scripts/cli.py index 78a3912..c123421 100644 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -79,15 +79,19 @@ def _cmd_init(argv: list[str]) -> int: - """Copy an annotated config template to ~/.flowsim/.""" + """Set up scheduler config under ~/.flowsim/. + + Without --config: writes an annotated template. + With --config: copies the user-provided file. 
+ """ parser = argparse.ArgumentParser( prog="flowsim init", description=( - "Generate a scheduler config template under ~/.flowsim/.\n\n" + "Set up scheduler config under ~/.flowsim/.\n\n" "Examples:\n" - " flowsim init k8s # creates ~/.flowsim/k8s.yaml\n" - " flowsim init slurm # creates ~/.flowsim/slurm.yaml\n" - " flowsim init slurm --force # overwrite existing" + " flowsim init k8s # write template\n" + " flowsim init k8s --config my.yaml # use existing file\n" + " flowsim init slurm --force # overwrite existing" ), formatter_class=argparse.RawDescriptionHelpFormatter, ) @@ -95,13 +99,16 @@ def _cmd_init(argv: list[str]) -> int: "scheduler", choices=["k8s", "slurm"], help="Scheduler type", ) + parser.add_argument( + "--config", "-c", default="", + help="Path to an existing config YAML to install", + ) parser.add_argument( "--force", action="store_true", help="Overwrite existing config file", ) args = parser.parse_args(argv) - templates = {"k8s": _K8S_TEMPLATE, "slurm": _SLURM_TEMPLATE} dst = _CONFIG_DIR / f"{args.scheduler}.yaml" if dst.exists() and not args.force: @@ -110,10 +117,21 @@ def _cmd_init(argv: list[str]) -> int: return 1 _CONFIG_DIR.mkdir(parents=True, exist_ok=True) - dst.write_text(templates[args.scheduler]) - print(f"Created {dst}") - print("Edit the file, then run: flowsim submit --scheduler " - f"{args.scheduler} ...") + + if args.config: + src = Path(args.config).expanduser() + if not src.is_file(): + print(f"Error: config file not found: {src}", file=sys.stderr) + return 1 + import shutil + shutil.copy2(src, dst) + print(f"Installed {src} → {dst}") + else: + templates = {"k8s": _K8S_TEMPLATE, "slurm": _SLURM_TEMPLATE} + dst.write_text(templates[args.scheduler]) + print(f"Created {dst}") + print("Edit the file, then run: flowsim submit --scheduler " + f"{args.scheduler} ...") return 0 diff --git a/tests/unit/test_scheduler_cli.py b/tests/unit/test_scheduler_cli.py index 5e07fca..08e7146 100644 --- a/tests/unit/test_scheduler_cli.py +++ 
b/tests/unit/test_scheduler_cli.py @@ -352,6 +352,26 @@ def test_init_force_overwrite(self, tmp_path: Path): assert "partition:" in content assert "existing" not in content + def test_init_config_copies_file(self, tmp_path: Path): + # User has an existing config + user_cfg = tmp_path / "my-k8s.yaml" + user_cfg.write_text("namespace: prod\nkubeconfig: /etc/kube\n") + + config_dir = tmp_path / "flowsim" + with mock.patch("scripts.cli._CONFIG_DIR", config_dir): + from scripts.cli import _cmd_init + rc = _cmd_init(["k8s", "--config", str(user_cfg)]) + assert rc == 0 + installed = config_dir / "k8s.yaml" + assert installed.exists() + cfg = yaml.safe_load(installed.read_text()) + assert cfg["namespace"] == "prod" + + def test_init_config_missing_file(self): + from scripts.cli import _cmd_init + rc = _cmd_init(["k8s", "--config", "/nonexistent/path.yaml"]) + assert rc != 0 + # ========================================================================= # CLI: flowsim submit (parse/dry-run only, no actual submission) From ac41690600818afdd633f8b1531bdbd8a8db0363 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Thu, 19 Mar 2026 00:59:44 +0000 Subject: [PATCH 37/56] use template files for flowsim init instead of inline strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move annotated templates to schedulers/templates/{k8s,slurm}.yaml - flowsim init k8s → copies bundled template to ~/.flowsim/ - flowsim init k8s --config my.yaml → copies user file instead - Remove inline template strings from cli.py --- schedulers/config.py | 5 +- schedulers/templates/k8s.yaml | 27 +++++++++ schedulers/templates/slurm.yaml | 27 +++++++++ scripts/cli.py | 100 +++++++------------------------- 4 files changed, 77 insertions(+), 82 deletions(-) create mode 100644 schedulers/templates/k8s.yaml create mode 100644 schedulers/templates/slurm.yaml diff --git a/schedulers/config.py b/schedulers/config.py index 18ab55e..3b2d2fd 100644 --- 
a/schedulers/config.py +++ b/schedulers/config.py @@ -13,8 +13,9 @@ Priority (highest → lowest): CLI flag > env var > config file > built-in default -Run ``flowsim init k8s`` or ``flowsim init slurm`` to generate -a config under ``~/.flowsim/``. +Run ``flowsim init k8s`` or ``flowsim init slurm`` to install +a config template under ``~/.flowsim/``. Templates are in +``schedulers/templates/``. """ from __future__ import annotations diff --git a/schedulers/templates/k8s.yaml b/schedulers/templates/k8s.yaml new file mode 100644 index 0000000..8f548de --- /dev/null +++ b/schedulers/templates/k8s.yaml @@ -0,0 +1,27 @@ +# FlowSim Kubernetes scheduler config +# Copy to ~/.flowsim/k8s.yaml and edit: +# flowsim init k8s --config schedulers/templates/k8s.yaml + +# Path to kubeconfig file (required) +kubeconfig: ~/.kube/config + +# Kubeconfig context (empty = current-context) +context: "" + +# Kubernetes namespace (required) +namespace: default + +# Persistent storage for trace output (set one): +# pvc: my-traces-pvc +# host_output_dir: /data/flowsim-traces +pvc: "" +host_output_dir: "" + +# Service account for the job pod (empty = default) +service_account: "" + +# Shared memory size (for /dev/shm in the pod) +shm_size: "16Gi" + +# RuntimeClass (e.g. "nvidia" for CDI GPU passthrough) +runtime_class_name: "" diff --git a/schedulers/templates/slurm.yaml b/schedulers/templates/slurm.yaml new file mode 100644 index 0000000..5f27328 --- /dev/null +++ b/schedulers/templates/slurm.yaml @@ -0,0 +1,27 @@ +# FlowSim Slurm scheduler config +# Copy to ~/.flowsim/slurm.yaml and edit: +# flowsim init slurm --config schedulers/templates/slurm.yaml + +# Slurm partition (required) +partition: gpu + +# Billing account (empty = default) +account: "" + +# Job time limit +time: "02:00:00" + +# Node constraint (e.g. 
"h100") +constraint: "" + +# CLI prefix for remote sbatch/squeue/scancel +# Examples: +# "docker exec -i slurmctld" (via Docker container) +# "ssh login-node" (via SSH) +cli_prefix: "" + +# Container runtime: docker | enroot | none +container_runtime: none + +# Container mount spec (for enroot/docker) +container_mounts: "" diff --git a/scripts/cli.py b/scripts/cli.py index c123421..ba0a65e 100644 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -16,81 +16,22 @@ _CONFIG_DIR = Path.home() / ".flowsim" - -# ---- Annotated config templates (written by `flowsim init`) ---- - -_K8S_TEMPLATE = """\ -# FlowSim Kubernetes scheduler config -# Edit this file, then run: flowsim submit --scheduler k8s ... - -# Path to kubeconfig file (required) -kubeconfig: ~/.kube/config - -# Kubeconfig context (empty = current-context) -context: "" - -# Kubernetes namespace (required) -namespace: default - -# Persistent storage for trace output (set one): -# pvc: my-traces-pvc -# host_output_dir: /data/flowsim-traces -pvc: "" -host_output_dir: "" - -# Service account for the job pod (empty = default) -service_account: "" - -# Shared memory size (for /dev/shm in the pod) -shm_size: "16Gi" - -# RuntimeClass (e.g. "nvidia" for CDI GPU passthrough) -runtime_class_name: "" -""" - -_SLURM_TEMPLATE = """\ -# FlowSim Slurm scheduler config -# Edit this file, then run: flowsim submit --scheduler slurm ... - -# Slurm partition (required) -partition: gpu - -# Billing account (empty = default) -account: "" - -# Job time limit -time: "02:00:00" - -# Node constraint (e.g. 
"h100") -constraint: "" - -# CLI prefix for remote sbatch/squeue/scancel -# Examples: -# "docker exec -i slurmctld" (via Docker container) -# "ssh login-node" (via SSH) -cli_prefix: "" - -# Container runtime: docker | enroot | none -container_runtime: none - -# Container mount spec (for enroot/docker) -container_mounts: "" -""" +_TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "schedulers" / "templates" def _cmd_init(argv: list[str]) -> int: - """Set up scheduler config under ~/.flowsim/. + """Install a scheduler config to ~/.flowsim/. - Without --config: writes an annotated template. - With --config: copies the user-provided file. + Without --config: copies the bundled template from schedulers/templates/. + With --config: copies the specified file. """ parser = argparse.ArgumentParser( prog="flowsim init", description=( - "Set up scheduler config under ~/.flowsim/.\n\n" + "Install scheduler config under ~/.flowsim/.\n\n" "Examples:\n" - " flowsim init k8s # write template\n" - " flowsim init k8s --config my.yaml # use existing file\n" + " flowsim init k8s # install bundled template\n" + " flowsim init k8s --config my.yaml # install your own file\n" " flowsim init slurm --force # overwrite existing" ), formatter_class=argparse.RawDescriptionHelpFormatter, @@ -101,7 +42,7 @@ def _cmd_init(argv: list[str]) -> int: ) parser.add_argument( "--config", "-c", default="", - help="Path to an existing config YAML to install", + help="Path to a config YAML to install (default: bundled template)", ) parser.add_argument( "--force", action="store_true", @@ -116,22 +57,21 @@ def _cmd_init(argv: list[str]) -> int: file=sys.stderr) return 1 - _CONFIG_DIR.mkdir(parents=True, exist_ok=True) - if args.config: src = Path(args.config).expanduser() - if not src.is_file(): - print(f"Error: config file not found: {src}", file=sys.stderr) - return 1 - import shutil - shutil.copy2(src, dst) - print(f"Installed {src} → {dst}") else: - templates = {"k8s": _K8S_TEMPLATE, "slurm": 
_SLURM_TEMPLATE} - dst.write_text(templates[args.scheduler]) - print(f"Created {dst}") - print("Edit the file, then run: flowsim submit --scheduler " - f"{args.scheduler} ...") + src = _TEMPLATES_DIR / f"{args.scheduler}.yaml" + + if not src.is_file(): + print(f"Error: config file not found: {src}", file=sys.stderr) + return 1 + + import shutil + _CONFIG_DIR.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dst) + print(f"Installed {src} → {dst}") + print(f"Edit {dst}, then run: flowsim submit --scheduler " + f"{args.scheduler} ...") return 0 From da8ab00589911ee3bafd53567118f9a4a538a6ba Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Thu, 19 Mar 2026 01:01:28 +0000 Subject: [PATCH 38/56] update README: reflect template-file init with --config option --- schedulers/README.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/schedulers/README.md b/schedulers/README.md index 520559b..8f9d8e1 100644 --- a/schedulers/README.md +++ b/schedulers/README.md @@ -110,9 +110,12 @@ Submits profiling jobs as Kubernetes Jobs to a cluster. Supports both PVC and ho ### First-Time Setup ```bash -# Generate an annotated config template +# Install the bundled config template flowsim init k8s # Edit ~/.flowsim/k8s.yaml with your cluster details + +# Or install your own config file +flowsim init k8s --config my-cluster.yaml ``` ### Usage @@ -180,9 +183,12 @@ Generates sbatch scripts and submits them to a Slurm cluster via `sbatch`/`squeu ### First-Time Setup ```bash -# Generate an annotated config template +# Install the bundled config template flowsim init slurm # Edit ~/.flowsim/slurm.yaml with your cluster details + +# Or install your own config file +flowsim init slurm --config my-slurm.yaml ``` ### Usage @@ -268,8 +274,8 @@ docker compose -f slurm-compose.yaml down -v ## Configuration -Config files are stored in `~/.flowsim/` and generated via `flowsim init`. 
-Templates include comments explaining each field — edit to match your cluster: +Config files are stored in `~/.flowsim/` and installed via `flowsim init`. +Templates are in `schedulers/templates/` with comments explaining each field: ``` ~/.flowsim/ From 059f3eaf4535d7b9429b5d582810ce1d3e83b75b Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Thu, 19 Mar 2026 19:09:17 +0000 Subject: [PATCH 39/56] docs: streamline READMEs, unify examples, remove legacy manual workflow - Root README: replace manual docker run profile/parse with flowsim submit - Schedulers README: remove redundant How It Works, inline YAML examples, scattered test sections - Unify model/params across both READMEs (Qwen3-235B-A22B, tp=1, gpus=1, --load-format dummy) - Add Scheduler Backends section to root README linking to schedulers/README.md --- README.md | 215 +++++++++----------------------- schedulers/README.md | 291 ++++++++++++++----------------------------- 2 files changed, 151 insertions(+), 355 deletions(-) diff --git a/README.md b/README.md index c4a674e..604c2a5 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,8 @@ The project supports rapid deployment using Docker, includes scripts for environ ## Table of Contents - [Getting Started](#getting-started) +- [Stage Profiling](#stage-profiling) +- [Scheduler Backends](#scheduler-backends) - [For Developers](#for-developers) - [Risks and limitations](#risks-and-limitations) - [License](#license) @@ -49,208 +51,112 @@ make build-docker This creates a local image named `flowsim-image` with FlowSim patches already applied to sglang. -### 2. Run Profile → Parse → Simulate +### 2. Profile (Generate Traces) -Create workspace directories on your host for storing traces and results: +Use `flowsim submit` to capture stage-separated traces (EXTEND + DECODE), parse them, and run cross-rank analysis — all in one step. See [Stage Profiling](#stage-profiling) for how stages and collection modes work. 
```bash -mkdir -p /data/flowsim-profile -mkdir -p /data/flowsim-simulate -``` - -#### Step 1: Profile (Generate Traces) - -```bash -sudo docker run --gpus=all \ - -v /data/flowsim-profile:/workspace/profile \ - -v /data/flowsim-simulate:/workspace/simulate \ - -w /flowsim \ - --cap-add=SYS_ADMIN \ - --network=host \ - --shm-size 911G \ - flowsim-image \ - python scripts/run_profile.py \ - --profile-dir /workspace/profile \ - --log-dir /workspace/profile/logs \ - --bench-timeout 3600 \ - --server-opts "--model-path /flowsim/workload/models/configs/deepseek/ --load-format dummy --tp 4 --ep 4 --host 0.0.0.0 --port 30001 --attention-backend flashinfer --disable-cuda-graph" \ - --bench-opts "--backend sglang --host 0.0.0.0 --port 30001 --dataset-name defined-len --prefill-decode-lens 1024:8 --num-prompts 1 --profile" -``` - -**What this does:** -- Starts an sglang server with profiling enabled -- Runs benchmark requests against it -- Generates `*.trace.json.gz` files in `/data/flowsim-profile` (mounted as `/workspace/profile`) - -**Note:** The first run will be slow (~10 minutes) due to DeepGEMM kernel warmup and compilation. For stable performance, avoid using `--rm` flag and reuse the same container using `sudo docker exec -it bash`. Subsequent runs with similar configurations will be faster. - -**Tip:** -- Adjust `--server-opts` and `--bench-opts` to match your model, parallelism (TP/DP/EP), and workload requirements. All `sglang.launch_server` and `bench_serving.py` parameters are supported. -- Trace files can be visualized using [Perfetto UI](https://ui.perfetto.dev/) by uploading the `.trace.json.gz` files directly. -- For multi-GPU profiling (TP > 1), merge individual traces into a single file for a global view: - ```bash - python /flowsim/utils/merge_trace.py \ - --trace_dir /data/flowsim-profile \ - --output /data/flowsim-profile/merged_trace.json - ``` - Then visualize the merged trace at [Perfetto UI](https://ui.perfetto.dev/). 
- -#### Step 2: Parse (Convert Trace to CSV) - -```bash -sudo docker run --rm \ - -v /data/flowsim-profile:/workspace/profile \ - -v /data/flowsim-simulate:/workspace/simulate \ - -w /flowsim \ - flowsim-image \ - python -m scripts.run_parse \ - --trace-file /workspace/profile/your-trace-name-TP-0.trace.json.gz \ - --output-dir /workspace/simulate +pip install -e . +flowsim submit --scheduler local \ + --collect all \ + --model-path workload/models/configs/Qwen3-235B-A22B \ + --tp 1 --bs 1 --input-len 2048 --gpus 1 \ + --extra-server-opts "--load-format dummy" ``` -Replace `your-trace-name-TP-0.trace.json.gz` with the actual filename from step 1. - -**What this does:** -- Parses the trace file -- Extracts kernel-level information (operator, shapes, dtypes) -- Generates a CSV file and JSON summary in `/data/flowsim-simulate` (mounted as `/workspace/simulate`) - -**Fallback:** If you don't have a GPU or can't run profiling, use the demo trace shipped with the repo: +For K8s / Slurm clusters, see [Scheduler Backends](#scheduler-backends). +**Tip:** Trace files can be visualized at [Perfetto UI](https://ui.perfetto.dev/). For multi-GPU traces, merge them first: ```bash -sudo docker run --rm \ - -v /data/flowsim-simulate:/workspace/simulate \ - -w /flowsim \ - flowsim-image \ - python -m scripts.run_parse \ - --trace-file /flowsim/demo/deepseekv3-TP-0.trace.json.gz \ - --output-dir /workspace/simulate +python utils/merge_trace.py --trace_dir stage_traces/local/*/bs1_input2048_ctx0 --output merged.json ``` -#### Step 3: Simulate (Run Hardware Simulation) +### 3. Simulate (Run Hardware Simulation) -This step requires a running LLMCompass backend. 
First, build the backend image: +Build and start the LLMCompass backend, then submit parsed traces for kernel-level simulation: ```bash +# Build backend image sudo docker build -t llmcompass-backend -f backend/LLMCompass/Dockerfile backend/LLMCompass/ -``` -Then start the backend: - -```bash -# Terminal 1: Start LLMCompass backend +# Terminal 1: Start backend sudo docker run --rm -p 8000:8000 llmcompass-backend -``` -Then in another terminal, run the simulation: - -```bash # Terminal 2: Run simulation -sudo docker run --rm \ - --network=host \ - -v /data/flowsim-profile:/workspace/profile \ - -v /data/flowsim-simulate:/workspace/simulate \ +sudo docker run --rm --network=host \ + -v /data/flowsim:/workspace \ flowsim-image \ python -m scripts.run_simulate \ - --trace-file /workspace/profile/your-trace-name-TP-0.trace.json.gz \ + --trace-file /workspace/traces/bs1_input2048_ctx0/*-TP-0-EXTEND.trace.json.gz \ --api-url http://127.0.0.1:8000 \ --artifact-dir /workspace/simulate/llmcompass ``` -**What this does:** -- Parses the trace into kernels -- Submits each kernel to the LLMCompass backend `/tasks` API -- Polls until all tasks complete -- Writes request/response artifacts to `/workspace/simulate/llmcompass` - -### 3. Inspect Results - -All generated files are available on your host at `/data/`: +### 4. Inspect Results ```bash -ls -lh /data/flowsim-profile/ # Raw trace files -ls -lh /data/flowsim-simulate/ # Parsed CSV, summary, simulation artifacts +ls -lh /data/flowsim/traces/ # Stage-separated traces + parsed CSVs +ls -lh /data/flowsim/simulate/ # Simulation artifacts ``` --- -## Stage Profiling (`run_stage_profile.py`) +## Stage Profiling -`scripts/run_stage_profile.py` is the single entry-point for **stage-separated** profiling: it captures prefill (EXTEND) and decode traces independently, parses them, runs cross-rank kernel analysis, and optionally collects kernel input shapes. 
+FlowSim performs **stage-separated** profiling: it captures prefill (EXTEND) and decode traces independently, parses them, runs cross-rank kernel analysis, and optionally collects kernel input shapes. -### Quick reference +### How stages work Each profiling request produces **two** stage-separated traces: - **EXTEND** (prefill) — processes `input_len` new tokens (with optional `existing_ctx` tokens already in KV cache) -- **DECODE** — profiler captures `decode-tokens` decode batch steps - -The profiler captures exactly **one** EXTEND batch and **decode-tokens** DECODE batches per run. +- **DECODE** — captures `decode-tokens` decode batch steps -| Flag | Description | Default | -|---|---|---| -| `--input-len` | Number of new prefill tokens per request (EXTEND) | 2048 | -| `--existing-ctx` | Tokens already in KV cache from a prior request (0 = cold prefill) | 0 | -| `--bs` | Batch size (concurrent requests) | 1 | -| `--decode-tokens` | Number of decode tokens to generate (= number of decode batches profiled) | 32 | +### Collection modes | Mode | What it does | |---|---| -| `--collect perf` | Profile a single (bs, input_len, existing_ctx) point → trace (EXTEND + DECODE) → parse → cross-rank analysis | -| `--collect shapes` | Re-run **without CUDA graph** to capture kernel input shapes, then merge into timing CSVs (both EXTEND and DECODE) | -| `--collect all` | Both phases back-to-back (auto-restarts the server in between). Requires `--launch-server`. | - -`--collect` is required. Use `perf`, `shapes`, or `all`. 
+| `--collect perf` | Profile a single (bs, input_len, existing_ctx) point → trace → parse → cross-rank analysis | +| `--collect shapes` | Re-run **without CUDA graph** to capture kernel input shapes, then merge into timing CSVs | +| `--collect all` | Both phases back-to-back (auto-restarts the server in between) | ### Examples -**Cold prefill** (server already running): - ```bash -python3 scripts/run_stage_profile.py \ +# Basic profiling +flowsim submit --scheduler local \ --collect perf \ - --bs 1 --input-len 2048 --decode-tokens 32 \ - --output-dir /workspace/traces \ - --host 0.0.0.0 --port 30001 -``` + --model-path workload/models/configs/Qwen3-235B-A22B \ + --tp 1 --bs 1 --input-len 2048 --gpus 1 \ + --extra-server-opts "--load-format dummy" -**With existing KV cache context:** - -```bash -python3 scripts/run_stage_profile.py \ +# With existing KV cache context +flowsim submit --scheduler local \ --collect perf \ - --bs 4 --input-len 512 --existing-ctx 4096 --decode-tokens 32 \ - --output-dir /workspace/traces \ - --launch-server \ - --server-opts "--model-path Qwen/Qwen3-235B-A22B-FP8 --tp 4 --host 0.0.0.0 --port 30001" -``` + --model-path workload/models/configs/Qwen3-235B-A22B \ + --tp 1 --bs 4 --input-len 512 --existing-ctx 4096 --gpus 1 \ + --extra-server-opts "--load-format dummy" -**Collect shapes only** (requires a no-CUDA-graph server): - -```bash -python3 scripts/run_stage_profile.py \ - --collect shapes \ - --output-dir /workspace/sweep_P1_tp4 \ - --launch-server \ - --server-opts "--model-path Qwen/Qwen3-235B-A22B-FP8 --tp 4 --host 0.0.0.0 --port 30001" -``` - -When `--collect shapes` is used with `--launch-server`, the server is automatically started with `--disable-cuda-graph --disable-cuda-graph-padding`. 
- -**Full pipeline** (perf → auto-restart → shapes → merge): +# Full pipeline (perf + shapes) +flowsim submit --scheduler local \ + --collect all \ + --model-path workload/models/configs/Qwen3-235B-A22B \ + --tp 1 --bs 1 --input-len 2048 --gpus 1 \ + --extra-server-opts "--load-format dummy" -```bash -python3 scripts/run_stage_profile.py \ +# Multi-point sweep +flowsim submit --scheduler local \ --collect all \ - --output-dir /workspace/sweep_P1_tp4 \ - --launch-server \ - --server-opts "--model-path Qwen/Qwen3-235B-A22B-FP8 --tp 4 --host 0.0.0.0 --port 30001" + --model-path workload/models/configs/Qwen3-235B-A22B \ + --sweep 1:2048:0 4:2048:0 8:2048:0 --gpus 1 \ + --extra-server-opts "--load-format dummy" ``` +For K8s / Slurm clusters, replace `--scheduler local` with `k8s` or `slurm`. See [schedulers/README.md](schedulers/README.md) for full scheduler documentation. ### Output structure ``` -sweep_P1_tp4/ +stage_traces/{scheduler}/{YYYYMMDD_HHMMSS}/ ├── sweep_summary.json ├── bs1_input2048_ctx0/ │ ├── *-TP-*-EXTEND.trace.json.gz @@ -266,23 +172,22 @@ sweep_P1_tp4/ After `--collect shapes`, each `parsed/TP-*-DECODE.csv` gains a `Dims` column with kernel tensor shapes. -### Helper scripts - -| Script | Purpose | -|---|---| -| `tests/integration/test_stage_profile_configs.py` | Integration tests for `--collect {perf,shapes,all}` across parallelism configs. Run with `pytest` inside Docker. Filter with `RUN_CONFIGS=P1`. | - ### Utilities (`utils/`) | File | Purpose | |---|---| | `utils/cross_rank_agg.py` | Cross-rank kernel aggregation (symmetric collectives → min, asymmetric → max, compute → mean) | | `utils/shape_merge.py` | Merge kernel shape data into timing CSVs | -| `utils/net.py` | Shared networking helpers (`wait_for_port`) | | `utils/merge_trace.py` | Merge multi-rank traces into a single Perfetto-compatible file | --- +## Scheduler Backends + +For submitting profiling jobs to **local Docker**, **Kubernetes**, or **Slurm** clusters, use the `flowsim` CLI. 
See [schedulers/README.md](schedulers/README.md) for full documentation including per-scheduler parameters, configuration, and environment variables. + +--- + ## For Developers ### Customizing Profiling Workloads diff --git a/schedulers/README.md b/schedulers/README.md index 8f9d8e1..d0835e7 100644 --- a/schedulers/README.md +++ b/schedulers/README.md @@ -11,133 +11,92 @@ FlowSim supports three scheduler backends for submitting GPU profiling jobs: ## Quick Start ```bash -# Install (from FlowSim project root) -cd FlowSim -pip install -e . # or ensure PYTHONPATH includes the project root - -# Show help +pip install -e . flowsim --help -flowsim submit --help ``` ## Common Workflow -All schedulers share the same CLI interface: - ```bash -# 1. Submit a job -flowsim submit --scheduler --collect \ - --model-path [options...] - -# 2. List jobs -flowsim list --scheduler - -# 3. Check job status -flowsim status --scheduler --job - -# 4. View logs -flowsim logs --scheduler --job - -# 5. Cancel a job -flowsim cancel --scheduler --job - -# 6. Dry-run (print script/manifest without submitting) -flowsim submit --scheduler ... --dry-run +# Submit a job (same interface for all backends) +flowsim submit --scheduler \ + --collect \ + --model-path \ + --tp 1 --bs 1 --input-len 2048 --gpus 1 + +# Job lifecycle +flowsim list --scheduler +flowsim status --scheduler --job +flowsim logs --scheduler --job +flowsim cancel --scheduler --job + +# Preview without submitting +flowsim submit --scheduler ... 
--dry-run + +# Multi-point sweep +flowsim submit --scheduler \ + --collect all --model-path workload/models/configs/Qwen3-235B-A22B \ + --sweep 1:2048:0 4:2048:0 8:2048:0 --gpus 1 ``` ### Common Parameters | Parameter | Description | Default | |-----------|-------------|---------| -| `--collect` | Collection mode: `perf` / `shapes` / `all` | required | +| `--collect` | `perf` / `shapes` / `all` | required | | `--model-path` | HuggingFace model path | required | | `--tp` | Tensor parallelism | `1` | | `--dp` | Data parallelism | `1` | | `--bs` | Batch size | `1` | | `--input-len` | Input sequence length | `2048` | | `--existing-ctx` | Existing KV cache length | `0` | -| `--decode-tokens` | Decode token count | `32` | -| `--warmup-n` | Warmup iterations | `5` | -| `--image` | Docker image | `flowsim-image:latest` | | `--gpus` | GPU count | `1` | -| `--output-dir` | Output directory (auto-generated if omitted) | `stage_traces/{scheduler}/{timestamp}/` | -| `--dry-run` | Print script only, do not submit | `false` | +| `--image` | Docker image | `flowsim-image:latest` | +| `--output-dir` | Output directory | `stage_traces/{scheduler}/{timestamp}/` | +| `--dry-run` | Print script only | `false` | --- ## 1. Local Scheduler -Runs profiling directly on the host via `docker run`. The simplest option, suitable for single-machine development and testing. - -### Usage +Runs profiling via `docker run` on the host machine. 
```bash -# Simplest usage — run on GPU 0 flowsim submit --scheduler local \ --collect all \ --model-path workload/models/configs/Qwen3-235B-A22B \ - --tp 1 --bs 1 --input-len 2048 \ - --gpus 1 --local-gpus 0 \ + --tp 1 --bs 1 --input-len 2048 --gpus 1 \ + --local-gpus 0 \ --extra-server-opts "--load-format dummy" - -# Multi-GPU -flowsim submit --scheduler local \ - --collect perf \ - --model-path Qwen/Qwen3-8B \ - --tp 2 --gpus 2 --local-gpus 0,1 ``` -### Parameters - | Parameter | Description | Default | |-----------|-------------|---------| -| `--local-gpus` | `CUDA_VISIBLE_DEVICES` (e.g. `0` or `0,1`) | empty (all GPUs) | +| `--local-gpus` | `CUDA_VISIBLE_DEVICES` (e.g. `0` or `0,1`) | all GPUs | | `--local-workdir` | Host working directory | FlowSim project root | -### How It Works - -1. `render()` generates a `docker run --gpus` command -2. `submit()` runs the container on the host, waits for completion -3. Traces are written to `stage_traces/local/{YYYYMMDD_HHMMSS}/` -4. `status()` / `logs()` / `list_jobs()` scan log files - --- ## 2. Kubernetes Scheduler -Submits profiling jobs as Kubernetes Jobs to a cluster. Supports both PVC and hostPath storage. +Submits profiling jobs as Kubernetes Jobs. Supports PVC and hostPath storage. 
-### First-Time Setup +### Setup ```bash -# Install the bundled config template -flowsim init k8s -# Edit ~/.flowsim/k8s.yaml with your cluster details - -# Or install your own config file -flowsim init k8s --config my-cluster.yaml +flowsim init k8s # install bundled template +flowsim init k8s --config my-cluster.yaml # or use your own +# Edit ~/.flowsim/k8s.yaml ``` ### Usage ```bash -# Submit to K8s cluster flowsim submit --scheduler k8s \ --collect all \ --model-path workload/models/configs/Qwen3-235B-A22B \ --tp 1 --bs 1 --input-len 2048 --gpus 1 \ --extra-server-opts "--load-format dummy" - -# Override config file values -flowsim submit --scheduler k8s \ - --collect perf \ - --model-path Qwen/Qwen3-8B \ - --k8s-namespace ml-team \ - --k8s-pvc my-traces-pvc \ - --gpus 4 --tp 4 - -# Dry-run to preview the generated YAML -flowsim submit --scheduler k8s ... --dry-run ``` ### Parameters @@ -147,76 +106,43 @@ flowsim submit --scheduler k8s ... --dry-run | `--k8s-namespace` | K8s namespace | `default` | | `--k8s-kubeconfig` | kubeconfig path | `~/.kube/config` | | `--k8s-context` | kubeconfig context | current context | -| `--k8s-pvc` | PVC name (persistent storage) | empty | -| `--k8s-host-output-dir` | hostPath mount (used when PVC is empty) | empty | -| `--k8s-node-selector` | Node selector labels (repeatable), format `KEY=VALUE` | empty | +| `--k8s-pvc` | PVC name for traces | empty | +| `--k8s-host-output-dir` | hostPath (when no PVC) | empty | +| `--k8s-node-selector` | Node selector `KEY=VALUE` (repeatable) | empty | | `--k8s-service-account` | ServiceAccount | empty | | `--k8s-shm-size` | Shared memory size | `16Gi` | -| `--k8s-runtime-class` | RuntimeClass (e.g. `nvidia` for CDI mode) | empty | - -### How It Works - -1. `render()` generates a Kubernetes Job YAML/JSON manifest -2. `submit()` creates the Job via the `kubernetes` Python client -3. Traces are persisted via PVC or hostPath -4. 
`status()` / `cancel()` / `list_jobs()` operate via the K8s API - -### Kind Local Test Cluster - -```bash -# Start a Kind cluster (GPU passthrough + CDI mode) -bash tests/integration/infra/dev-setup.sh kind - -# Run K8s integration tests -python -m pytest tests/integration/test_scheduler_local.py::TestK8sScheduler -v -x - -# Teardown -bash tests/integration/infra/dev-teardown.sh kind -``` +| `--k8s-runtime-class` | RuntimeClass (e.g. `nvidia`) | empty | --- ## 3. Slurm Scheduler -Generates sbatch scripts and submits them to a Slurm cluster via `sbatch`/`squeue`/`scancel`. +Generates sbatch scripts and submits via `sbatch`/`squeue`/`scancel`. -### First-Time Setup +### Setup ```bash -# Install the bundled config template -flowsim init slurm -# Edit ~/.flowsim/slurm.yaml with your cluster details - -# Or install your own config file -flowsim init slurm --config my-slurm.yaml +flowsim init slurm # install bundled template +flowsim init slurm --config my-slurm.yaml # or use your own +# Edit ~/.flowsim/slurm.yaml ``` ### Usage ```bash -# Submit via sbatch flowsim submit --scheduler slurm \ --collect all \ --model-path workload/models/configs/Qwen3-235B-A22B \ --tp 1 --bs 1 --input-len 2048 --gpus 1 \ --slurm-partition gpu \ --extra-server-opts "--load-format dummy" +``` -# CLI prefix (e.g. via docker exec or ssh) -flowsim submit --scheduler slurm \ - --slurm-cli-prefix "docker exec -i slurmctld" \ - --slurm-partition normal \ - --collect perf --model-path Qwen/Qwen3-8B --gpus 1 - -# Dry-run to preview the generated sbatch script -flowsim submit --scheduler slurm ... --dry-run - -# Check status -flowsim status --scheduler slurm --job 12345 \ +For remote clusters, use `--slurm-cli-prefix`: +```bash +flowsim submit --scheduler slurm ... 
\ --slurm-cli-prefix "docker exec -i slurmctld" - -# Cancel a job -flowsim cancel --scheduler slurm --job 12345 +# or: --slurm-cli-prefix "ssh login-node" ``` ### Parameters @@ -227,69 +153,29 @@ flowsim cancel --scheduler slurm --job 12345 | `--slurm-time` | Job time limit | `02:00:00` | | `--slurm-account` | Billing account | empty | | `--slurm-constraint` | Node constraint | empty | -| `--slurm-cli-prefix` | Shell prefix for CLI commands (e.g. `"docker exec -i slurmctld"`) | empty | -| `--slurm-container-runtime` | Container runtime: `docker` / `enroot` / `none` | `none` | +| `--slurm-cli-prefix` | Shell prefix for remote CLI | empty | +| `--slurm-container-runtime` | `docker` / `enroot` / `none` | `none` | | `--slurm-container-mounts` | Container mounts | empty | | `--slurm-module` | `module load` commands (repeatable) | empty | | `--slurm-extra-sbatch` | Extra `#SBATCH` directives (repeatable) | empty | -### container_runtime Options - -| Value | Description | -|-------|-------------| -| `none` | Run directly on compute node (Python/sglang must be installed) | -| `docker` | Run via `docker run` on the allocated node | -| `enroot` | Run via `srun --container-image` (NVIDIA enroot) | - -### How It Works - -1. `render()` generates a complete sbatch script (`#SBATCH` directives + profiling command) -2. `submit()` pipes the script to `sbatch --parsable` -3. `status()` queries via `scontrol show job` -4. `cancel()` runs `scancel` -5. 
`list_jobs()` runs `squeue` - -If Slurm commands are not on the local PATH, use `--slurm-cli-prefix` to specify a prefix, e.g.: -- `"docker exec -i slurmctld"` — via Docker container -- `"ssh login-node"` — via SSH - -### Docker Compose Local Test Cluster - -```bash -# Start Slurm cluster (slurmctld + 1 compute node + 1 GPU) -cd tests/integration/infra/ -docker compose -f slurm-compose.yaml up -d - -# Check cluster status -docker exec slurmctld sinfo - -# Run Slurm integration tests -python -m pytest tests/integration/test_scheduler_local.py::TestSlurmScheduler -v -x - -# Teardown -docker compose -f slurm-compose.yaml down -v -``` - --- ## Configuration -Config files are stored in `~/.flowsim/` and installed via `flowsim init`. -Templates are in `schedulers/templates/` with comments explaining each field: +Config files live in `~/.flowsim/` and are installed via `flowsim init`. +Templates with comments are in `schedulers/templates/`. ``` ~/.flowsim/ -├── k8s.yaml # K8s scheduler config -└── slurm.yaml # Slurm scheduler config +├── k8s.yaml +└── slurm.yaml ``` -Parameter priority (highest to lowest): -1. CLI flag (`--slurm-partition gpu`) -2. Environment variable (see table below) -3. Config file (`~/.flowsim/slurm.yaml`) -4. 
Built-in default +**Priority** (highest to lowest): +CLI flag → environment variable → config file → built-in default -### Supported Environment Variables +### Environment Variables | Variable | Overrides | Example | |----------|-----------|--------| @@ -301,45 +187,50 @@ Parameter priority (highest to lowest): | `FLOWSIM_SLURM_TIME` | `--slurm-time` | `04:00:00` | | `FLOWSIM_SLURM_CONFIG` | Config file path | `/etc/flowsim/slurm.yaml` | -### Example k8s.yaml - -```yaml -kubeconfig: /home/user/.kube/config -namespace: default -host_output_dir: /host-stage-traces -runtime_class_name: nvidia -shm_size: 16Gi -``` - -### Example slurm.yaml - -```yaml -partition: gpu -account: my-project -time: "02:00:00" -container_runtime: none -cli_prefix: "" -``` - --- -## Output Directory Structure - -All schedulers produce a unified trace output structure: +## Output Structure ``` stage_traces/{scheduler}/{YYYYMMDD_HHMMSS}/ ├── bs1_input2048_ctx0/ -│ ├── *.trace.json.gz # Raw traces -│ ├── parsed/*.csv # Parsed CSVs -│ ├── merged/*_merged.trace.csv # Merged trace CSV -│ ├── shape_traces/ # Shape traces (collect=shapes/all) -│ ├── shape_parsed/*.csv # Shape parsed CSVs -│ ├── analysis_extend.json # Extend stage analysis -│ └── analysis_decode.json # Decode stage analysis +│ ├── *.trace.json.gz +│ ├── parsed/*.csv +│ ├── merged/*_merged.trace.csv +│ ├── shape_traces/ + shape_parsed/ +│ ├── analysis_extend.json +│ └── analysis_decode.json ├── logs/ │ ├── server_*.stdout.log │ └── server_*.stderr.log └── sweep_summary.json ``` +--- + +## Development + +### Test Clusters + +```bash +# Kind (K8s) — GPU passthrough via CDI +bash tests/integration/infra/dev-setup.sh kind +bash tests/integration/infra/dev-teardown.sh kind + +# Slurm — Docker Compose cluster +cd tests/integration/infra/ +docker compose -f slurm-compose.yaml up -d +docker compose -f slurm-compose.yaml down -v +``` + +### Running Tests + +```bash +# Unit tests +python -m pytest tests/unit/test_scheduler_cli.py -v + +# 
Integration tests +python -m pytest tests/integration/test_scheduler_local.py::TestK8sScheduler -v -x +python -m pytest tests/integration/test_scheduler_local.py::TestSlurmScheduler -v -x +``` + From b0dfdd50659d9c9947c808a32cdd4e4995619ff3 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Thu, 19 Mar 2026 19:12:43 +0000 Subject: [PATCH 40/56] format: fix with black --- schedulers/base.py | 24 +- schedulers/config.py | 6 +- schedulers/k8s.py | 103 ++++-- schedulers/local.py | 72 +++-- schedulers/slurm.py | 39 ++- scripts/cli.py | 28 +- scripts/run_stage_profile.py | 13 +- scripts/status_profile.py | 46 ++- scripts/submit_profile.py | 79 +++-- tests/integration/test_scheduler_local.py | 378 ++++++++++++++-------- tests/unit/test_scheduler_cli.py | 66 +++- 11 files changed, 588 insertions(+), 266 deletions(-) diff --git a/schedulers/base.py b/schedulers/base.py index a47ac1f..d3b32c4 100644 --- a/schedulers/base.py +++ b/schedulers/base.py @@ -13,8 +13,8 @@ class JobResult: """Structured return value from ``submit()``.""" job_id: str - scheduler: str # "local", "k8s", "slurm" - state: str # "Submitted", "Completed", "Failed" + scheduler: str # "local", "k8s", "slurm" + state: str # "Submitted", "Completed", "Failed" output_dir: str = "" message: str = "" @@ -150,7 +150,9 @@ def submit(self, spec: ProfileJobSpec) -> JobResult: def cancel(self, job_id: str) -> str: """Cancel a running or pending job. Returns a status message.""" - raise NotImplementedError(f"{type(self).__name__} does not support cancel") + raise NotImplementedError( + f"{type(self).__name__} does not support cancel" + ) def status(self, job_id: str) -> dict: """Query job status. Returns dict with at least 'state' key. 
@@ -163,9 +165,13 @@ def status(self, job_id: str) -> dict: "output_hint": "where to find trace files", } """ - raise NotImplementedError(f"{type(self).__name__} does not support status queries") + raise NotImplementedError( + f"{type(self).__name__} does not support status queries" + ) - def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: + def logs( + self, job_id: str, *, tail: int = 100, follow: bool = False + ) -> str: """Retrieve recent log output for a job. Parameters @@ -177,7 +183,9 @@ def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: follow : bool If True, stream logs in real time (blocking). """ - raise NotImplementedError(f"{type(self).__name__} does not support log retrieval") + raise NotImplementedError( + f"{type(self).__name__} does not support log retrieval" + ) def list_jobs(self, *, status_filter: str = "") -> list[dict]: """List jobs managed by this scheduler. @@ -193,7 +201,9 @@ def list_jobs(self, *, status_filter: str = "") -> list[dict]: list[dict] Each dict has at least ``{"job_id": ..., "state": ..., "name": ...}``. 
""" - raise NotImplementedError(f"{type(self).__name__} does not support list") + raise NotImplementedError( + f"{type(self).__name__} does not support list" + ) def dry_run(self, spec: ProfileJobSpec) -> str: """Render and return the manifest without submitting.""" diff --git a/schedulers/config.py b/schedulers/config.py index 3b2d2fd..433c87b 100644 --- a/schedulers/config.py +++ b/schedulers/config.py @@ -48,10 +48,12 @@ def _save_yaml(path: Path, data: dict) -> None: path.parent.mkdir(parents=True, exist_ok=True) try: import yaml as _y + with open(path, "w") as f: _y.safe_dump(data, f, default_flow_style=False, sort_keys=False) except ImportError: import json as _j + with open(path, "w") as f: _j.dump(data, f, indent=2, ensure_ascii=False) f.write("\n") @@ -97,6 +99,8 @@ def cfg_get(cfg: dict, key: str, fallback: str = "") -> str: return fallback -def resolve_default(env_var: str, cfg: dict, key: str, fallback: str = "") -> str: +def resolve_default( + env_var: str, cfg: dict, key: str, fallback: str = "" +) -> str: """Resolve a config value: env var > config file > fallback.""" return os.environ.get(env_var, "") or cfg_get(cfg, key, fallback) diff --git a/schedulers/k8s.py b/schedulers/k8s.py index 83b991c..e75e7f8 100644 --- a/schedulers/k8s.py +++ b/schedulers/k8s.py @@ -22,6 +22,7 @@ def _k8s_job_state(status) -> str: return "Running" return "Pending" + # Optional: nicer YAML output for dry-run. 
try: import yaml as _yaml # type: ignore[import-untyped] @@ -102,16 +103,36 @@ def _build_job_dict(self, spec: ProfileJobSpec) -> dict: # volumes + mounts volume_mounts = [{"name": "dshm", "mountPath": "/dev/shm"}] volumes: list[dict] = [ - {"name": "dshm", "emptyDir": {"medium": "Memory", "sizeLimit": self.shm_size}}, + { + "name": "dshm", + "emptyDir": {"medium": "Memory", "sizeLimit": self.shm_size}, + }, ] if self.pvc_name: - volume_mounts.append({"name": "output", "mountPath": spec.output_dir}) - volumes.append({"name": "output", "persistentVolumeClaim": {"claimName": self.pvc_name}}) + volume_mounts.append( + {"name": "output", "mountPath": spec.output_dir} + ) + volumes.append( + { + "name": "output", + "persistentVolumeClaim": {"claimName": self.pvc_name}, + } + ) elif self.host_output_dir: # Mount at base traces dir so the full directory structure # (e.g. k8s/{timestamp}/bs1_...) is preserved on the host. - volume_mounts.append({"name": "output", "mountPath": "/flowsim/stage_traces"}) - volumes.append({"name": "output", "hostPath": {"path": self.host_output_dir, "type": "DirectoryOrCreate"}}) + volume_mounts.append( + {"name": "output", "mountPath": "/flowsim/stage_traces"} + ) + volumes.append( + { + "name": "output", + "hostPath": { + "path": self.host_output_dir, + "type": "DirectoryOrCreate", + }, + } + ) container = { "name": "profiler", @@ -145,13 +166,19 @@ def _build_job_dict(self, spec: ProfileJobSpec) -> dict: "metadata": { "name": job_name, "namespace": self.namespace, - "labels": {"app": "flowsim", "component": "profiling", "collect": spec.collect}, + "labels": { + "app": "flowsim", + "component": "profiling", + "collect": spec.collect, + }, }, "spec": { "backoffLimit": 0, "ttlSecondsAfterFinished": 86400, "template": { - "metadata": {"labels": {"app": "flowsim", "component": "profiling"}}, + "metadata": { + "labels": {"app": "flowsim", "component": "profiling"} + }, "spec": pod_spec, }, }, @@ -202,7 +229,11 @@ def _load_k8s(self): try: 
k8s_config.load_incluster_config() except k8s_config.ConfigException: - hint = " Try --k8s-kubeconfig /path/to/kubeconfig." if not self.kubeconfig else "" + hint = ( + " Try --k8s-kubeconfig /path/to/kubeconfig." + if not self.kubeconfig + else "" + ) raise RuntimeError( "No valid Kubernetes configuration found. " "Checked kubeconfig file and in-cluster environment." + hint @@ -226,7 +257,9 @@ def status(self, job_id: str) -> dict: """Query K8s Job status by job name.""" batch_api, core_api = self._load_k8s() - job = batch_api.read_namespaced_job(name=job_id, namespace=self.namespace) + job = batch_api.read_namespaced_job( + name=job_id, namespace=self.namespace + ) # Determine state state = _k8s_job_state(job.status) @@ -250,7 +283,9 @@ def status(self, job_id: str) -> dict: else: output_hint = "WARNING: no PVC or hostPath configured — traces are lost when pod exits" - msg_parts = [f"Job: {job_id} Namespace: {self.namespace} State: {state}"] + msg_parts = [ + f"Job: {job_id} Namespace: {self.namespace} State: {state}" + ] if pod_statuses: msg_parts.append("Pods: " + ", ".join(pod_statuses)) msg_parts.append(output_hint) @@ -261,7 +296,9 @@ def status(self, job_id: str) -> dict: "output_hint": output_hint, } - def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: + def logs( + self, job_id: str, *, tail: int = 100, follow: bool = False + ) -> str: """Show where logs are and how to access them for a K8s Job.""" _, core_api = self._load_k8s() @@ -270,7 +307,9 @@ def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: label_selector=f"job-name={job_id}", ) if not pods.items: - return f"No pods found for job {job_id} in namespace {self.namespace}" + return ( + f"No pods found for job {job_id} in namespace {self.namespace}" + ) if follow: # Stream logs from the first running/succeeded pod @@ -300,21 +339,29 @@ def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: for pod in pods.items: name = 
pod.metadata.name parts.append(f" kubectl logs {name} -n {self.namespace}") - parts.append(f" kubectl logs {name} -n {self.namespace} --tail={tail}") + parts.append( + f" kubectl logs {name} -n {self.namespace} --tail={tail}" + ) parts.append("") # Persistent log files if self.pvc_name: - parts.append(f"Server logs + traces persisted on PVC '{self.pvc_name}'.") + parts.append( + f"Server logs + traces persisted on PVC '{self.pvc_name}'." + ) parts.append("Copy to local machine:") for pod in pods.items: name = pod.metadata.name if pod.status.phase in ("Running", "Succeeded"): - parts.append(f" kubectl cp {self.namespace}/{name}:/flowsim/stage_traces ./stage_traces") + parts.append( + f" kubectl cp {self.namespace}/{name}:/flowsim/stage_traces ./stage_traces" + ) break else: - parts.append(" (pod not running — mount the PVC in another pod to retrieve files)") + parts.append( + " (pod not running — mount the PVC in another pod to retrieve files)" + ) elif self.host_output_dir: parts.append(f"Server logs + traces at hostPath on the node:") parts.append(f" {self.host_output_dir}/") @@ -323,7 +370,9 @@ def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: for pod in pods.items: if pod.spec.node_name: parts.append(f" Node: {pod.spec.node_name}") - parts.append(f" scp {pod.spec.node_name}:{self.host_output_dir}/ ./stage_traces/") + parts.append( + f" scp {pod.spec.node_name}:{self.host_output_dir}/ ./stage_traces/" + ) break return "\n".join(parts) @@ -345,13 +394,17 @@ def list_jobs(self, *, status_filter: str = "") -> list[dict]: created = "" if job.metadata.creation_timestamp: - created = job.metadata.creation_timestamp.strftime("%Y-%m-%d %H:%M:%S") + created = job.metadata.creation_timestamp.strftime( + "%Y-%m-%d %H:%M:%S" + ) - result.append({ - "job_id": job.metadata.name, - "name": job.metadata.name, - "state": state, - "namespace": self.namespace, - "created": created, - }) + result.append( + { + "job_id": job.metadata.name, + "name": 
job.metadata.name, + "state": state, + "namespace": self.namespace, + "created": created, + } + ) return result diff --git a/schedulers/local.py b/schedulers/local.py index 673acac..f9c2aa8 100644 --- a/schedulers/local.py +++ b/schedulers/local.py @@ -58,7 +58,8 @@ def _check_image_exists(image: str) -> None: """Raise if the Docker image is not available locally.""" result = subprocess.run( ["docker", "image", "inspect", image], - capture_output=True, timeout=10, + capture_output=True, + timeout=10, ) if result.returncode != 0: raise SystemExit( @@ -84,7 +85,7 @@ def _host_output_dir(self, spec_output_dir: str) -> str: # Strip the /flowsim/ prefix to get the relative path rel = spec_output_dir if rel.startswith("/flowsim/"): - rel = rel[len("/flowsim/"):] + rel = rel[len("/flowsim/") :] return os.path.join(self.workdir, rel) def _build_docker_cmd(self, spec: ProfileJobSpec) -> str: @@ -97,7 +98,9 @@ def _build_docker_cmd(self, spec: ProfileJobSpec) -> str: """ job_name = spec.default_job_name()[:63] host_output = self._host_output_dir(spec.output_dir) - container_output = spec.output_dir # e.g. /flowsim/stage_traces/local/{ts} + container_output = ( + spec.output_dir + ) # e.g. /flowsim/stage_traces/local/{ts} inner_cmd = spec.build_shell_command() @@ -140,7 +143,8 @@ def submit(self, spec: ProfileJobSpec) -> JobResult: # Remove stale container with the same name (e.g. 
from a killed run) subprocess.run( ["docker", "rm", "-f", job_name[:63]], - capture_output=True, timeout=10, + capture_output=True, + timeout=10, ) stdout_path = os.path.join(log_dir, f"{job_name}_{ts}.stdout.log") stderr_path = os.path.join(log_dir, f"{job_name}_{ts}.stderr.log") @@ -171,10 +175,14 @@ def _tee(src, dest_file, dest_stream): dest_file.flush() t_out = threading.Thread( - target=_tee, args=(proc.stdout, fout, sys.stdout), daemon=True, + target=_tee, + args=(proc.stdout, fout, sys.stdout), + daemon=True, ) t_err = threading.Thread( - target=_tee, args=(proc.stderr, ferr, sys.stderr), daemon=True, + target=_tee, + args=(proc.stderr, ferr, sys.stderr), + daemon=True, ) t_out.start() t_err.start() @@ -210,7 +218,9 @@ def cancel(self, job_id: str) -> str: """Stop the Docker container for a local job.""" proc = subprocess.run( ["docker", "stop", job_id], - capture_output=True, text=True, timeout=30, + capture_output=True, + text=True, + timeout=30, ) if proc.returncode == 0: return f"Stopped container {job_id}" @@ -234,9 +244,11 @@ def status(self, job_id: str) -> dict: """ matches = [] for log_dir in self._find_log_dirs(): - matches.extend(sorted(glob.glob( - os.path.join(log_dir, f"{job_id}_*.stdout.log") - ))) + matches.extend( + sorted( + glob.glob(os.path.join(log_dir, f"{job_id}_*.stdout.log")) + ) + ) if not matches: return { @@ -260,25 +272,29 @@ def status(self, job_id: str) -> dict: "output_hint": trace_dir, } - def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: + def logs( + self, job_id: str, *, tail: int = 100, follow: bool = False + ) -> str: """List log files for a local job and print access commands.""" matches = [] for log_dir in self._find_log_dirs(): - matches.extend(sorted(glob.glob( - os.path.join(log_dir, f"{job_id}_*") - ))) + matches.extend( + sorted(glob.glob(os.path.join(log_dir, f"{job_id}_*"))) + ) if not matches: for log_dir in self._find_log_dirs(): - matches.extend(sorted(glob.glob( - 
os.path.join(log_dir, f"*{job_id}*") - ))) + matches.extend( + sorted(glob.glob(os.path.join(log_dir, f"*{job_id}*"))) + ) if not matches: return f"No logs found matching '{job_id}'" if follow: - stdout_files = sorted(f for f in matches if f.endswith(".stdout.log")) + stdout_files = sorted( + f for f in matches if f.endswith(".stdout.log") + ) if stdout_files: return f"Follow logs with:\n tail -f {stdout_files[-1]}" return f"No stdout log found to follow for '{job_id}'" @@ -315,9 +331,9 @@ def list_jobs(self, *, status_filter: str = "") -> list[dict]: """List local jobs by scanning log files.""" matches = [] for log_dir in self._find_log_dirs(): - matches.extend(sorted(glob.glob( - os.path.join(log_dir, "*.stdout.log") - ))) + matches.extend( + sorted(glob.glob(os.path.join(log_dir, "*.stdout.log"))) + ) jobs: list[dict] = [] for path in matches: @@ -330,12 +346,14 @@ def list_jobs(self, *, status_filter: str = "") -> list[dict]: name = m.group(1) ts = m.group(2) state = "Completed" - jobs.append({ - "job_id": name, - "name": name, - "state": state, - "timestamp": ts, - }) + jobs.append( + { + "job_id": name, + "name": name, + "state": state, + "timestamp": ts, + } + ) if status_filter: filt = status_filter.lower() diff --git a/schedulers/slurm.py b/schedulers/slurm.py index 265a725..ad12e75 100644 --- a/schedulers/slurm.py +++ b/schedulers/slurm.py @@ -176,7 +176,9 @@ def _submit_cli(self, spec: ProfileJobSpec) -> JobResult: f"sbatch failed (exit {r.returncode}):\n{r.stderr}" ) - job_id = r.stdout.strip().split(";")[0] # parsable: "jobid" or "jobid;cluster" + job_id = r.stdout.strip().split(";")[ + 0 + ] # parsable: "jobid" or "jobid;cluster" return JobResult( job_id=job_id, scheduler="slurm", @@ -193,7 +195,9 @@ def status(self, job_id: str) -> dict: """Query Slurm job status.""" return self._status_cli(job_id) - def logs(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: + def logs( + self, job_id: str, *, tail: int = 100, follow: bool = False 
+ ) -> str: """Show Slurm job log information.""" return self._logs_cli(job_id, tail=tail, follow=follow) @@ -216,7 +220,11 @@ def _status_cli(self, job_id: str) -> dict: # (completed jobs stay in memory for MinJobAge seconds, default 300s) r = self._cli_run("scontrol", "show", "job", job_id) if r.returncode != 0 or not r.stdout.strip(): - return {"state": "Unknown", "message": f"No job found with ID {job_id}", "output_hint": ""} + return { + "state": "Unknown", + "message": f"No job found with ID {job_id}", + "output_hint": "", + } # Parse key=value output fields: dict[str, str] = {} @@ -258,7 +266,9 @@ def _status_cli(self, job_id: str) -> dict: "output_hint": output_file, } - def _logs_cli(self, job_id: str, *, tail: int = 100, follow: bool = False) -> str: + def _logs_cli( + self, job_id: str, *, tail: int = 100, follow: bool = False + ) -> str: info = self._status_cli(job_id) output_file = info.get("output_hint", "") @@ -289,7 +299,10 @@ def _logs_cli(self, job_id: str, *, tail: int = 100, follow: bool = False) -> st def _list_jobs_cli(self, *, status_filter: str = "") -> list[dict]: r = self._cli_run( - "squeue", "-o", "%i|%j|%T|%P|%N", "-h", + "squeue", + "-o", + "%i|%j|%T|%P|%N", + "-h", ) if r.returncode != 0: raise RuntimeError(f"squeue failed: {r.stderr}") @@ -304,11 +317,13 @@ def _list_jobs_cli(self, *, status_filter: str = "") -> list[dict]: state = parts[2] if len(parts) > 2 else "UNKNOWN" if status_filter and state.upper() != status_filter.upper(): continue - result.append({ - "job_id": parts[0] if parts else "", - "name": name, - "state": state, - "partition": parts[3] if len(parts) > 3 else "", - "nodes": parts[4] if len(parts) > 4 else "", - }) + result.append( + { + "job_id": parts[0] if parts else "", + "name": name, + "state": state, + "partition": parts[3] if len(parts) > 3 else "", + "nodes": parts[4] if len(parts) > 4 else "", + } + ) return result diff --git a/scripts/cli.py b/scripts/cli.py index ba0a65e..04d0c84 100644 --- 
a/scripts/cli.py +++ b/scripts/cli.py @@ -14,9 +14,10 @@ import sys from pathlib import Path - _CONFIG_DIR = Path.home() / ".flowsim" -_TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "schedulers" / "templates" +_TEMPLATES_DIR = ( + Path(__file__).resolve().parent.parent / "schedulers" / "templates" +) def _cmd_init(argv: list[str]) -> int: @@ -37,15 +38,19 @@ def _cmd_init(argv: list[str]) -> int: formatter_class=argparse.RawDescriptionHelpFormatter, ) parser.add_argument( - "scheduler", choices=["k8s", "slurm"], + "scheduler", + choices=["k8s", "slurm"], help="Scheduler type", ) parser.add_argument( - "--config", "-c", default="", + "--config", + "-c", + default="", help="Path to a config YAML to install (default: bundled template)", ) parser.add_argument( - "--force", action="store_true", + "--force", + action="store_true", help="Overwrite existing config file", ) args = parser.parse_args(argv) @@ -53,8 +58,10 @@ def _cmd_init(argv: list[str]) -> int: dst = _CONFIG_DIR / f"{args.scheduler}.yaml" if dst.exists() and not args.force: - print(f"Error: {dst} already exists (use --force to overwrite)", - file=sys.stderr) + print( + f"Error: {dst} already exists (use --force to overwrite)", + file=sys.stderr, + ) return 1 if args.config: @@ -67,11 +74,14 @@ def _cmd_init(argv: list[str]) -> int: return 1 import shutil + _CONFIG_DIR.mkdir(parents=True, exist_ok=True) shutil.copy2(src, dst) print(f"Installed {src} → {dst}") - print(f"Edit {dst}, then run: flowsim submit --scheduler " - f"{args.scheduler} ...") + print( + f"Edit {dst}, then run: flowsim submit --scheduler " + f"{args.scheduler} ..." 
+ ) return 0 diff --git a/scripts/run_stage_profile.py b/scripts/run_stage_profile.py index 00dce4b..3c739ef 100644 --- a/scripts/run_stage_profile.py +++ b/scripts/run_stage_profile.py @@ -799,11 +799,20 @@ def _start_server( return proc -def _run_perf(args, summary: list[dict], *, bs: Optional[int] = None, input_len: Optional[int] = None, existing_ctx: Optional[int] = None) -> int: +def _run_perf( + args, + summary: list[dict], + *, + bs: Optional[int] = None, + input_len: Optional[int] = None, + existing_ctx: Optional[int] = None, +) -> int: """Collect traces for a single (bs, input_len, existing_ctx, decode_tokens) point.""" bs = bs if bs is not None else args.bs input_len = input_len if input_len is not None else args.input_len - existing_ctx = existing_ctx if existing_ctx is not None else args.existing_ctx + existing_ctx = ( + existing_ctx if existing_ctx is not None else args.existing_ctx + ) tag = f"bs{bs}_input{input_len}_ctx{existing_ctx}" sub_dir = os.path.join(args.output_dir, tag) diff --git a/scripts/status_profile.py b/scripts/status_profile.py index 5d10f84..bf389ab 100644 --- a/scripts/status_profile.py +++ b/scripts/status_profile.py @@ -31,12 +31,16 @@ import argparse import sys -from schedulers.config import cfg_get, load_k8s_config, load_slurm_config, resolve_default +from schedulers.config import ( + cfg_get, + load_k8s_config, + load_slurm_config, + resolve_default, +) from schedulers.k8s import K8sScheduler from schedulers.local import LocalScheduler from schedulers.slurm import SlurmScheduler - _d = resolve_default @@ -49,7 +53,9 @@ def _add_scheduler_args(p: argparse.ArgumentParser) -> None: ) -def _add_scheduler_specific_args(p: argparse.ArgumentParser, scheduler: str) -> None: +def _add_scheduler_specific_args( + p: argparse.ArgumentParser, scheduler: str +) -> None: """Add only the args relevant to the chosen scheduler (second pass).""" k8s_cfg = load_k8s_config() slurm_cfg = load_slurm_config() @@ -60,7 +66,9 @@ def 
_add_scheduler_specific_args(p: argparse.ArgumentParser, scheduler: str) -> elif scheduler == "k8s": p.add_argument( "--k8s-namespace", - default=_d("FLOWSIM_K8S_NAMESPACE", k8s_cfg, "namespace", "default"), + default=_d( + "FLOWSIM_K8S_NAMESPACE", k8s_cfg, "namespace", "default" + ), ) p.add_argument( "--k8s-kubeconfig", @@ -103,7 +111,9 @@ def _build_scheduler(args: argparse.Namespace): ) -def _parse_two_pass(p: argparse.ArgumentParser, argv: list[str] | None = None) -> argparse.Namespace: +def _parse_two_pass( + p: argparse.ArgumentParser, argv: list[str] | None = None +) -> argparse.Namespace: """Two-pass parse: peek --scheduler, add scheduler-specific args, full parse.""" _pre = argparse.ArgumentParser(add_help=False) _pre.add_argument("--scheduler", choices=["local", "k8s", "slurm"]) @@ -132,8 +142,15 @@ def main_logs(argv: list[str] | None = None) -> None: p = argparse.ArgumentParser(description="Retrieve FlowSim job logs.") _add_scheduler_args(p) p.add_argument("--job", required=True, help="Job name or ID") - p.add_argument("--tail", type=int, default=100, help="Number of log lines (default: 100)") - p.add_argument("--follow", "-f", action="store_true", help="Follow log output") + p.add_argument( + "--tail", + type=int, + default=100, + help="Number of log lines (default: 100)", + ) + p.add_argument( + "--follow", "-f", action="store_true", help="Follow log output" + ) args = _parse_two_pass(p, argv) scheduler = _build_scheduler(args) @@ -148,7 +165,11 @@ def main_logs(argv: list[str] | None = None) -> None: def main_list(argv: list[str] | None = None) -> None: p = argparse.ArgumentParser(description="List FlowSim jobs.") _add_scheduler_args(p) - p.add_argument("--status", default="", help="Filter by job state (e.g. Running, Succeeded, PENDING)") + p.add_argument( + "--status", + default="", + help="Filter by job state (e.g. 
Running, Succeeded, PENDING)", + ) args = _parse_two_pass(p, argv) scheduler = _build_scheduler(args) @@ -159,12 +180,17 @@ def main_list(argv: list[str] | None = None) -> None: return # Print table header headers = list(jobs[0].keys()) - widths = {h: max(len(h), max(len(str(j.get(h, ""))) for j in jobs)) for h in headers} + widths = { + h: max(len(h), max(len(str(j.get(h, ""))) for j in jobs)) + for h in headers + } header_line = " ".join(h.upper().ljust(widths[h]) for h in headers) print(header_line) print("-" * len(header_line)) for job in jobs: - print(" ".join(str(job.get(h, "")).ljust(widths[h]) for h in headers)) + print( + " ".join(str(job.get(h, "")).ljust(widths[h]) for h in headers) + ) except Exception as exc: print(f"Error: {exc}", file=sys.stderr) sys.exit(1) diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 747b9b3..5e1021a 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -51,13 +51,17 @@ import sys from schedulers.base import ProfileJobSpec -from schedulers.config import cfg_get, load_k8s_config, load_slurm_config, resolve_default +from schedulers.config import ( + cfg_get, + load_k8s_config, + load_slurm_config, + resolve_default, +) from schedulers.k8s import K8sScheduler from schedulers.local import LocalScheduler from schedulers.slurm import SlurmScheduler from scripts import load_sweep_file, parse_sweep_point - # Short alias for argparse default= expressions _d = resolve_default @@ -97,7 +101,8 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: wl.add_argument("--decode-tokens", type=int, default=32) wl.add_argument("--warmup-n", type=int, default=5) wl.add_argument( - "--disable-chunked-prefill", action="store_true", + "--disable-chunked-prefill", + action="store_true", ) wl.add_argument("--max-prefill-tokens", type=int, default=131072) wl.add_argument( @@ -132,7 +137,10 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: infra = 
p.add_argument_group("infrastructure") infra.add_argument("--image", default="flowsim-image:latest") infra.add_argument( - "--gpus", type=int, default=1, help="Total GPU count", + "--gpus", + type=int, + default=1, + help="Total GPU count", ) infra.add_argument("--host", default="0.0.0.0") infra.add_argument("--port", type=int, default=30001) @@ -166,10 +174,14 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: ) elif pre.scheduler == "k8s": - k8s = p.add_argument_group("kubernetes options (config: ~/.flowsim/k8s.yaml)") + k8s = p.add_argument_group( + "kubernetes options (config: ~/.flowsim/k8s.yaml)" + ) k8s.add_argument( "--k8s-namespace", - default=_d("FLOWSIM_K8S_NAMESPACE", k8s_cfg, "namespace", "default"), + default=_d( + "FLOWSIM_K8S_NAMESPACE", k8s_cfg, "namespace", "default" + ), help="K8s namespace (env: FLOWSIM_K8S_NAMESPACE)", ) k8s.add_argument( @@ -214,7 +226,9 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: ) elif pre.scheduler == "slurm": - slurm = p.add_argument_group("slurm options (config: ~/.flowsim/slurm.yaml)") + slurm = p.add_argument_group( + "slurm options (config: ~/.flowsim/slurm.yaml)" + ) slurm.add_argument( "--slurm-partition", default=_d("FLOWSIM_SLURM_PARTITION", slurm_cfg, "partition", ""), @@ -243,7 +257,11 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: default=cfg_get(slurm_cfg, "container_mounts", ""), ) # Modules from config (list) + CLI (append) - cfg_modules = slurm_cfg.get("modules") if isinstance(slurm_cfg.get("modules"), list) else [] + cfg_modules = ( + slurm_cfg.get("modules") + if isinstance(slurm_cfg.get("modules"), list) + else [] + ) slurm.add_argument( "--slurm-module", action="append", @@ -316,7 +334,9 @@ def _build_scheduler(args: argparse.Namespace): for item in args.k8s_node_selector: k, _, v = item.partition("=") if not v: - sys.exit(f"Bad --k8s-node-selector format: {item!r} (use KEY=VALUE)") + sys.exit( + f"Bad --k8s-node-selector 
format: {item!r} (use KEY=VALUE)" + ) node_sel[k] = v return K8sScheduler( namespace=args.k8s_namespace, @@ -349,6 +369,7 @@ def main(argv: list[str] | None = None) -> None: # Smart defaults for output_dir based on scheduler. # Layout: stage_traces/{scheduler}/{timestamp}/ import time as _time + _ts = _time.strftime("%Y%m%d_%H%M%S") if not args.output_dir: if args.scheduler == "local": @@ -365,7 +386,9 @@ def main(argv: list[str] | None = None) -> None: # For local scheduler, convert absolute host model_path to relative # so it resolves correctly inside the container (workdir=/flowsim). if args.scheduler == "local" and os.path.isabs(args.model_path): - project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + project_root = os.path.dirname( + os.path.dirname(os.path.abspath(__file__)) + ) if args.model_path.startswith(project_root): args.model_path = os.path.relpath(args.model_path, project_root) @@ -389,18 +412,36 @@ def main(argv: list[str] | None = None) -> None: if args.k8s_pvc: print(f" (persisted on PVC '{args.k8s_pvc}')") else: - print(f" (persisted at hostPath '{args.k8s_host_output_dir}' on the node)") - print(f"\nTo check status: flowsim status --scheduler k8s --job {job_id}") - print(f"To view logs: flowsim logs --scheduler k8s --job {job_id}") - print(f"To follow logs: flowsim logs --scheduler k8s --job {job_id} --follow") - print(f"To cancel: flowsim cancel --scheduler k8s --job {job_id}") + print( + f" (persisted at hostPath '{args.k8s_host_output_dir}' on the node)" + ) + print( + f"\nTo check status: flowsim status --scheduler k8s --job {job_id}" + ) + print( + f"To view logs: flowsim logs --scheduler k8s --job {job_id}" + ) + print( + f"To follow logs: flowsim logs --scheduler k8s --job {job_id} --follow" + ) + print( + f"To cancel: flowsim cancel --scheduler k8s --job {job_id}" + ) elif sched == "slurm": print(f" (on cluster shared filesystem)") - print(f"\nTo check status: flowsim status --scheduler slurm --job {job_id}") - 
print(f"To view logs: flowsim logs --scheduler slurm --job {job_id}") - print(f"To cancel: flowsim cancel --scheduler slurm --job {job_id}") + print( + f"\nTo check status: flowsim status --scheduler slurm --job {job_id}" + ) + print( + f"To view logs: flowsim logs --scheduler slurm --job {job_id}" + ) + print( + f"To cancel: flowsim cancel --scheduler slurm --job {job_id}" + ) else: - print(f"\nTo view logs: flowsim logs --scheduler local --job {job_id}") + print( + f"\nTo view logs: flowsim logs --scheduler local --job {job_id}" + ) print(f"To list all jobs: flowsim list --scheduler {sched}") diff --git a/tests/integration/test_scheduler_local.py b/tests/integration/test_scheduler_local.py index a2086f1..a6bc416 100644 --- a/tests/integration/test_scheduler_local.py +++ b/tests/integration/test_scheduler_local.py @@ -58,29 +58,34 @@ _PROJECT_ROOT = os.path.abspath( os.path.join(os.path.dirname(__file__), "..", "..") ) -_DEV_SETUP = os.path.join(_PROJECT_ROOT, "tests", "integration", "infra", "dev-setup.sh") -_DEV_TEARDOWN = os.path.join(_PROJECT_ROOT, "tests", "integration", "infra", "dev-teardown.sh") - -MODEL = os.environ.get( - "MODEL", "workload/models/configs/Qwen3-235B-A22B" +_DEV_SETUP = os.path.join( + _PROJECT_ROOT, "tests", "integration", "infra", "dev-setup.sh" +) +_DEV_TEARDOWN = os.path.join( + _PROJECT_ROOT, "tests", "integration", "infra", "dev-teardown.sh" ) + +MODEL = os.environ.get("MODEL", "workload/models/configs/Qwen3-235B-A22B") LOAD_FORMAT = os.environ.get("LOAD_FORMAT", "dummy") # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- -def _flowsim_cli(*args: str, timeout: int = 1200) -> subprocess.CompletedProcess: + +def _flowsim_cli( + *args: str, timeout: int = 1200 +) -> subprocess.CompletedProcess: """Run a ``flowsim`` subcommand via Python entry point.""" cmd = [ - sys.executable, "-u", "-c", + sys.executable, + "-u", + "-c", 
"from scripts.cli import main; main()", *args, ] env = os.environ.copy() - env["PYTHONPATH"] = _PROJECT_ROOT + ( - ":" + env.get("PYTHONPATH", "") - ) + env["PYTHONPATH"] = _PROJECT_ROOT + (":" + env.get("PYTHONPATH", "")) env["PYTHONUNBUFFERED"] = "1" return subprocess.run( cmd, @@ -123,10 +128,7 @@ def _assert_logs(output_dir: str) -> None: assert len(stdout_logs) > 0, f"No stdout logs in {log_dir}" assert len(stderr_logs) > 0, f"No stderr logs in {log_dir}" # At least one log should be non-empty - sizes = [ - os.path.getsize(os.path.join(log_dir, f)) - for f in stdout_logs - ] + sizes = [os.path.getsize(os.path.join(log_dir, f)) for f in stdout_logs] assert max(sizes) > 0, "All stdout logs are empty" @@ -185,10 +187,14 @@ def _validate_shapes(output_dir, bs, input_len, existing_ctx): tag = f"bs{bs}_input{input_len}_ctx{existing_ctx}" for csv_subdir in ("merged", "shape_parsed"): extend_csvs = sorted( - glob.glob(os.path.join(output_dir, tag, csv_subdir, "*TP-0*EXTEND*.csv")) + glob.glob( + os.path.join(output_dir, tag, csv_subdir, "*TP-0*EXTEND*.csv") + ) ) decode_csvs = sorted( - glob.glob(os.path.join(output_dir, tag, csv_subdir, "*TP-0*DECODE*.csv")) + glob.glob( + os.path.join(output_dir, tag, csv_subdir, "*TP-0*DECODE*.csv") + ) ) if extend_csvs and decode_csvs: break @@ -204,23 +210,23 @@ def _validate_shapes(output_dir, bs, input_len, existing_ctx): ext_gemm_dim0 = _first_matmul_dim0(extend_rows) assert ext_gemm_dim0 is not None, "No matmul kernel found in EXTEND CSV" expected_ext = bs * input_len - assert ext_gemm_dim0 == expected_ext, ( - f"EXTEND first GEMM dim0={ext_gemm_dim0}, expected bs*input_len={expected_ext}" - ) + assert ( + ext_gemm_dim0 == expected_ext + ), f"EXTEND first GEMM dim0={ext_gemm_dim0}, expected bs*input_len={expected_ext}" # EXTEND FlashAttn dims contain [bs, seq_len] seq_len = input_len + existing_ctx attn_pair = _attention_seqlen_pair(extend_rows, bs, seq_len) - assert attn_pair is not None, ( - f"No FlashAttention dim matching 
[bs={bs}, seqlen={seq_len}(+1)] in EXTEND CSV" - ) + assert ( + attn_pair is not None + ), f"No FlashAttention dim matching [bs={bs}, seqlen={seq_len}(+1)] in EXTEND CSV" # DECODE first GEMM dim0 == bs dec_gemm_dim0 = _first_matmul_dim0(decode_rows) assert dec_gemm_dim0 is not None, "No matmul kernel found in DECODE CSV" - assert dec_gemm_dim0 == bs, ( - f"DECODE first GEMM dim0={dec_gemm_dim0}, expected bs={bs}" - ) + assert ( + dec_gemm_dim0 == bs + ), f"DECODE first GEMM dim0={dec_gemm_dim0}, expected bs={bs}" # ===================================================================== @@ -244,7 +250,10 @@ class TestLocalScheduler: @pytest.mark.parametrize( "point", _TP1_POINTS, - ids=[f"bs{p['bs']}_il{p['input_len']}_ctx{p['existing_ctx']}" for p in _TP1_POINTS], + ids=[ + f"bs{p['bs']}_il{p['input_len']}_ctx{p['existing_ctx']}" + for p in _TP1_POINTS + ], ) def test_local_tp1_all(self, point): bs = point["bs"] @@ -255,18 +264,30 @@ def test_local_tp1_all(self, point): # ── Step 1: submit ── r = _flowsim_cli( "submit", - "--scheduler", "local", - "--collect", "all", - "--model-path", MODEL, - "--tp", "1", - "--bs", str(bs), - "--input-len", str(input_len), - "--existing-ctx", str(existing_ctx), - "--decode-tokens", str(decode_tokens), - "--warmup-n", "2", - "--gpus", "1", - "--local-gpus", "0", - "--extra-server-opts", f"--load-format {LOAD_FORMAT}", + "--scheduler", + "local", + "--collect", + "all", + "--model-path", + MODEL, + "--tp", + "1", + "--bs", + str(bs), + "--input-len", + str(input_len), + "--existing-ctx", + str(existing_ctx), + "--decode-tokens", + str(decode_tokens), + "--warmup-n", + "2", + "--gpus", + "1", + "--local-gpus", + "0", + "--extra-server-opts", + f"--load-format {LOAD_FORMAT}", ) if r.returncode != 0: print("STDOUT:", r.stdout[-3000:]) @@ -284,22 +305,26 @@ def test_local_tp1_all(self, point): break if job_id: break - assert job_id, f"Could not find job_id in submit output:\n{combined[-1000:]}" + assert ( + job_id + ), f"Could not find 
job_id in submit output:\n{combined[-1000:]}" # ── Step 2: list — verify job appears ── r_list = _flowsim_cli("list", "--scheduler", "local") assert r_list.returncode == 0, "flowsim list failed" - assert job_id in r_list.stdout, ( - f"Job {job_id} not found in list output:\n{r_list.stdout}" - ) + assert ( + job_id in r_list.stdout + ), f"Job {job_id} not found in list output:\n{r_list.stdout}" # ── Step 3: status — should be Completed (submit is synchronous) ── - r_status = _flowsim_cli("status", "--scheduler", "local", "--job", job_id) + r_status = _flowsim_cli( + "status", "--scheduler", "local", "--job", job_id + ) assert r_status.returncode == 0, "flowsim status failed" status_out = r_status.stdout.lower() - assert "completed" in status_out, ( - f"Job {job_id} not completed:\n{r_status.stdout}" - ) + assert ( + "completed" in status_out + ), f"Job {job_id} not completed:\n{r_status.stdout}" # ── Step 4: validate trace CSVs ── # Extract output_dir from status output (Traces dir: ...) @@ -308,23 +333,29 @@ def test_local_tp1_all(self, point): if "Traces dir:" in line: output_dir = line.split("Traces dir:", 1)[1].strip() break - assert output_dir and os.path.isdir(output_dir), ( - f"Could not find traces dir in status output:\n{r_status.stdout}" - ) + assert output_dir and os.path.isdir( + output_dir + ), f"Could not find traces dir in status output:\n{r_status.stdout}" _assert_traces(output_dir) _assert_logs(output_dir) - _validate_shapes(output_dir, bs=bs, input_len=input_len, existing_ctx=existing_ctx) + _validate_shapes( + output_dir, bs=bs, input_len=input_len, existing_ctx=existing_ctx + ) # ===================================================================== # Cluster setup helpers & fixtures # ===================================================================== + def _run_dev_setup(target: str) -> None: """Run ``tests/integration/infra/dev-setup.sh `` and assert success.""" r = subprocess.run( ["bash", _DEV_SETUP, target], - capture_output=True, 
text=True, cwd=_PROJECT_ROOT, timeout=300, + capture_output=True, + text=True, + cwd=_PROJECT_ROOT, + timeout=300, ) if r.returncode != 0: raise RuntimeError( @@ -337,7 +368,10 @@ def _run_dev_teardown(target: str) -> None: """Run ``tests/integration/infra/dev-teardown.sh ``.""" subprocess.run( ["bash", _DEV_TEARDOWN, target], - capture_output=True, text=True, cwd=_PROJECT_ROOT, timeout=120, + capture_output=True, + text=True, + cwd=_PROJECT_ROOT, + timeout=120, ) @@ -346,7 +380,9 @@ def _kind_cluster_running() -> bool: try: r = subprocess.run( ["kubectl", "--context", "kind-flowsim", "get", "nodes"], - capture_output=True, text=True, timeout=15, + capture_output=True, + text=True, + timeout=15, ) return r.returncode == 0 and "Ready" in r.stdout except Exception: @@ -406,20 +442,34 @@ def test_k8s_real_submit_to_kind(self, kind_cluster): # ── Step 1: submit (host mount for trace retrieval) ── r = _flowsim_cli( "submit", - "--scheduler", "k8s", - "--collect", "all", - "--model-path", MODEL, - "--tp", "1", - "--bs", "1", - "--input-len", "2048", - "--existing-ctx", "0", - "--decode-tokens", "2", - "--warmup-n", "2", - "--gpus", "1", - "--k8s-namespace", "default", - "--k8s-host-output-dir", "/host-stage-traces", - "--job-name", job_name, - "--extra-server-opts", f"--load-format {LOAD_FORMAT}", + "--scheduler", + "k8s", + "--collect", + "all", + "--model-path", + MODEL, + "--tp", + "1", + "--bs", + "1", + "--input-len", + "2048", + "--existing-ctx", + "0", + "--decode-tokens", + "2", + "--warmup-n", + "2", + "--gpus", + "1", + "--k8s-namespace", + "default", + "--k8s-host-output-dir", + "/host-stage-traces", + "--job-name", + job_name, + "--extra-server-opts", + f"--load-format {LOAD_FORMAT}", ) combined = r.stdout + r.stderr if r.returncode != 0: @@ -429,15 +479,17 @@ def test_k8s_real_submit_to_kind(self, kind_cluster): # ── Step 2: list — verify job appears ── r_list = _flowsim_cli("list", "--scheduler", "k8s") assert r_list.returncode == 0 - assert job_name in 
r_list.stdout, ( - f"Job {job_name} not in list:\n{r_list.stdout}" - ) + assert ( + job_name in r_list.stdout + ), f"Job {job_name} not in list:\n{r_list.stdout}" # ── Step 3: status — poll until Completed/Succeeded (max 20 min) ── deadline = time.time() + 1200 state = "" while time.time() < deadline: - r_status = _flowsim_cli("status", "--scheduler", "k8s", "--job", job_name) + r_status = _flowsim_cli( + "status", "--scheduler", "k8s", "--job", job_name + ) assert r_status.returncode == 0 state = r_status.stdout.lower() if "completed" in state or "succeeded" in state: @@ -445,18 +497,18 @@ def test_k8s_real_submit_to_kind(self, kind_cluster): if "failed" in state: pytest.fail(f"K8s job failed:\n{r_status.stdout}") time.sleep(15) - assert "completed" in state or "succeeded" in state, ( - f"K8s job did not complete in time:\n{r_status.stdout}" - ) + assert ( + "completed" in state or "succeeded" in state + ), f"K8s job did not complete in time:\n{r_status.stdout}" # ── Step 4: traces are on host via Kind mount ── # output_dir inside container: /flowsim/stage_traces/k8s/{ts} # host_output_dir on worker: /host-stage-traces # → host: {project}/stage_traces/k8s/{ts}/ k8s_traces = os.path.join(host_traces, "k8s") - assert os.path.isdir(k8s_traces), ( - f"No k8s traces dir at {k8s_traces}" - ) + assert os.path.isdir( + k8s_traces + ), f"No k8s traces dir at {k8s_traces}" # Find the latest timestamped subdir ts_dirs = sorted(os.listdir(k8s_traces)) assert ts_dirs, f"No timestamp dirs in {k8s_traces}" @@ -476,12 +528,15 @@ def test_k8s_real_submit_to_kind(self, kind_cluster): # SLURM SCHEDULER # ===================================================================== + def _slurm_cluster_running() -> bool: """Check if local Slurm test cluster (docker compose) is running.""" try: r = subprocess.run( ["docker", "exec", "slurmctld", "sinfo", "-h"], - capture_output=True, text=True, timeout=10, + capture_output=True, + text=True, + timeout=10, ) return r.returncode == 0 and 
r.stdout.strip() != "" except Exception: @@ -516,21 +571,36 @@ def test_slurm_real_submit(self, slurm_cluster): # ── Step 1: submit (CLI mode, container_runtime=none) ── r = _flowsim_cli( "submit", - "--scheduler", "slurm", - "--collect", "all", - "--model-path", MODEL, - "--tp", "1", - "--bs", "1", - "--input-len", "2048", - "--existing-ctx", "0", - "--decode-tokens", "2", - "--warmup-n", "2", - "--gpus", "1", - "--slurm-partition", "normal", - "--slurm-cli-prefix", _SLURM_CLI_PREFIX, - "--slurm-container-runtime", "none", - "--output-dir", output_dir, - "--extra-server-opts", f"--load-format {LOAD_FORMAT}", + "--scheduler", + "slurm", + "--collect", + "all", + "--model-path", + MODEL, + "--tp", + "1", + "--bs", + "1", + "--input-len", + "2048", + "--existing-ctx", + "0", + "--decode-tokens", + "2", + "--warmup-n", + "2", + "--gpus", + "1", + "--slurm-partition", + "normal", + "--slurm-cli-prefix", + _SLURM_CLI_PREFIX, + "--slurm-container-runtime", + "none", + "--output-dir", + output_dir, + "--extra-server-opts", + f"--load-format {LOAD_FORMAT}", ) combined = r.stdout + r.stderr if r.returncode != 0: @@ -546,16 +616,22 @@ def test_slurm_real_submit(self, slurm_cluster): break if job_id: break - assert job_id, f"Could not find job_id in submit output:\n{combined[-1000:]}" + assert ( + job_id + ), f"Could not find job_id in submit output:\n{combined[-1000:]}" # ── Step 2: status — poll until Completed (max 20 min) ── deadline = time.time() + 1200 state = "" while time.time() < deadline: r_status = _flowsim_cli( - "status", "--scheduler", "slurm", - "--job", job_id, - "--slurm-cli-prefix", _SLURM_CLI_PREFIX, + "status", + "--scheduler", + "slurm", + "--job", + job_id, + "--slurm-cli-prefix", + _SLURM_CLI_PREFIX, ) assert r_status.returncode == 0 state = r_status.stdout.lower() @@ -564,15 +640,15 @@ def test_slurm_real_submit(self, slurm_cluster): if "failed" in state: pytest.fail(f"Slurm job failed:\n{r_status.stdout}") time.sleep(15) - assert "completed" in state 
or "succeeded" in state, ( - f"Slurm job did not complete in time:\n{r_status.stdout}" - ) + assert ( + "completed" in state or "succeeded" in state + ), f"Slurm job did not complete in time:\n{r_status.stdout}" # ── Step 3: traces are on host via mount ── slurm_traces = os.path.join(host_traces, "slurm") - assert os.path.isdir(slurm_traces), ( - f"No slurm traces dir at {slurm_traces}" - ) + assert os.path.isdir( + slurm_traces + ), f"No slurm traces dir at {slurm_traces}" ts_dirs = sorted(os.listdir(slurm_traces)) assert ts_dirs, f"No test dirs in {slurm_traces}" local_traces = os.path.join(slurm_traces, ts_dirs[-1]) @@ -586,9 +662,13 @@ def test_slurm_real_submit(self, slurm_cluster): # Cleanup: cancel job (traces stay on host for inspection) if job_id: _flowsim_cli( - "cancel", "--scheduler", "slurm", - "--job", job_id, - "--slurm-cli-prefix", _SLURM_CLI_PREFIX, + "cancel", + "--scheduler", + "slurm", + "--job", + job_id, + "--slurm-cli-prefix", + _SLURM_CLI_PREFIX, ) @@ -604,7 +684,9 @@ def test_slurm_real_submit(self, slurm_cluster): ] -def _assert_sweep_output(host_output_dir: str, points: list[tuple[int, int, int]]) -> None: +def _assert_sweep_output( + host_output_dir: str, points: list[tuple[int, int, int]] +) -> None: """Validate that every sweep point produced traces and parsed CSVs.""" for bs, il, ctx in points: tag = f"bs{bs}_input{il}_ctx{ctx}" @@ -617,9 +699,9 @@ def _assert_sweep_output(host_output_dir: str, points: list[tuple[int, int, int] assert os.path.isfile(summary_path), f"Missing {summary_path}" with open(summary_path) as f: summary = json.load(f) - assert len(summary) == len(points), ( - f"Expected {len(points)} entries in sweep_summary.json, got {len(summary)}" - ) + assert len(summary) == len( + points + ), f"Expected {len(points)} entries in sweep_summary.json, got {len(summary)}" for entry in summary: assert entry["traces"] > 0, f"Point {entry} has 0 traces" @@ -637,16 +719,26 @@ def test_sweep_inline(self): r = _flowsim_cli( "submit", 
- "--scheduler", "local", - "--collect", "perf", - "--model-path", MODEL, - "--tp", "1", - "--decode-tokens", "2", - "--warmup-n", "2", - "--gpus", "1", - "--local-gpus", "0", - "--extra-server-opts", f"--load-format {LOAD_FORMAT}", - "--sweep", *sweep_args, + "--scheduler", + "local", + "--collect", + "perf", + "--model-path", + MODEL, + "--tp", + "1", + "--decode-tokens", + "2", + "--warmup-n", + "2", + "--gpus", + "1", + "--local-gpus", + "0", + "--extra-server-opts", + f"--load-format {LOAD_FORMAT}", + "--sweep", + *sweep_args, ) combined = r.stdout + r.stderr if r.returncode != 0: @@ -660,9 +752,9 @@ def test_sweep_inline(self): if "Traces:" in line: output_dir = line.split("Traces:", 1)[1].strip() break - assert output_dir and os.path.isdir(output_dir), ( - f"Could not find traces dir in output:\n{combined[-1000:]}" - ) + assert output_dir and os.path.isdir( + output_dir + ), f"Could not find traces dir in output:\n{combined[-1000:]}" _assert_sweep_output(output_dir, _SWEEP_POINTS) _assert_logs(output_dir) @@ -680,22 +772,34 @@ def test_sweep_file(self): try: r = _flowsim_cli( "submit", - "--scheduler", "local", - "--collect", "perf", - "--model-path", MODEL, - "--tp", "1", - "--decode-tokens", "2", - "--warmup-n", "2", - "--gpus", "1", - "--local-gpus", "0", - "--extra-server-opts", f"--load-format {LOAD_FORMAT}", - "--sweep-file", sweep_file, + "--scheduler", + "local", + "--collect", + "perf", + "--model-path", + MODEL, + "--tp", + "1", + "--decode-tokens", + "2", + "--warmup-n", + "2", + "--gpus", + "1", + "--local-gpus", + "0", + "--extra-server-opts", + f"--load-format {LOAD_FORMAT}", + "--sweep-file", + sweep_file, ) combined = r.stdout + r.stderr if r.returncode != 0: print("STDOUT:", r.stdout[-3000:]) print("STDERR:", r.stderr[-3000:]) - assert r.returncode == 0, f"sweep-file submit failed (exit {r.returncode})" + assert ( + r.returncode == 0 + ), f"sweep-file submit failed (exit {r.returncode})" # Find host output dir from submit output output_dir = 
None @@ -703,9 +807,9 @@ def test_sweep_file(self): if "Traces:" in line: output_dir = line.split("Traces:", 1)[1].strip() break - assert output_dir and os.path.isdir(output_dir), ( - f"Could not find traces dir in output:\n{combined[-1000:]}" - ) + assert output_dir and os.path.isdir( + output_dir + ), f"Could not find traces dir in output:\n{combined[-1000:]}" _assert_sweep_output(output_dir, _SWEEP_POINTS) _assert_logs(output_dir) diff --git a/tests/unit/test_scheduler_cli.py b/tests/unit/test_scheduler_cli.py index 08e7146..b6c1afb 100644 --- a/tests/unit/test_scheduler_cli.py +++ b/tests/unit/test_scheduler_cli.py @@ -15,11 +15,11 @@ from schedulers.local import LocalScheduler from schedulers.slurm import SlurmScheduler - # ========================================================================= # ProfileJobSpec # ========================================================================= + class TestProfileJobSpec: """Tests for ProfileJobSpec dataclass methods.""" @@ -73,6 +73,7 @@ def test_build_shell_command_quotes_server_opts(self, spec: ProfileJobSpec): # K8sScheduler.render # ========================================================================= + class TestK8sScheduler: """Tests for K8s Job manifest generation.""" @@ -153,6 +154,7 @@ def test_render_labels(self, scheduler, spec): # SlurmScheduler.render # ========================================================================= + class TestSlurmScheduler: """Tests for Slurm sbatch script generation.""" @@ -238,6 +240,7 @@ def test_render_constraint(self, spec): # LocalScheduler.render # ========================================================================= + class TestLocalScheduler: """Tests for local execution backend.""" @@ -286,11 +289,13 @@ def test_dry_run_equals_render(self, spec): # CLI: flowsim init # ========================================================================= + class TestCLIInit: """Tests for `flowsim init` subcommand.""" def test_init_no_args_shows_help(self, capsys): from 
scripts.cli import _cmd_init + with pytest.raises(SystemExit) as exc_info: _cmd_init([]) assert exc_info.value.code != 0 @@ -299,6 +304,7 @@ def test_init_k8s_creates_template(self, tmp_path: Path): config_dir = tmp_path / "flowsim" with mock.patch("scripts.cli._CONFIG_DIR", config_dir): from scripts.cli import _cmd_init + rc = _cmd_init(["k8s"]) assert rc == 0 cfg_file = config_dir / "k8s.yaml" @@ -317,6 +323,7 @@ def test_init_slurm_creates_template(self, tmp_path: Path): config_dir = tmp_path / "flowsim" with mock.patch("scripts.cli._CONFIG_DIR", config_dir): from scripts.cli import _cmd_init + rc = _cmd_init(["slurm"]) assert rc == 0 cfg_file = config_dir / "slurm.yaml" @@ -336,6 +343,7 @@ def test_init_refuses_overwrite(self, tmp_path: Path): with mock.patch("scripts.cli._CONFIG_DIR", config_dir): from scripts.cli import _cmd_init + rc = _cmd_init(["slurm"]) assert rc != 0 # should refuse @@ -346,6 +354,7 @@ def test_init_force_overwrite(self, tmp_path: Path): with mock.patch("scripts.cli._CONFIG_DIR", config_dir): from scripts.cli import _cmd_init + rc = _cmd_init(["slurm", "--force"]) assert rc == 0 content = (config_dir / "slurm.yaml").read_text() @@ -360,6 +369,7 @@ def test_init_config_copies_file(self, tmp_path: Path): config_dir = tmp_path / "flowsim" with mock.patch("scripts.cli._CONFIG_DIR", config_dir): from scripts.cli import _cmd_init + rc = _cmd_init(["k8s", "--config", str(user_cfg)]) assert rc == 0 installed = config_dir / "k8s.yaml" @@ -369,6 +379,7 @@ def test_init_config_copies_file(self, tmp_path: Path): def test_init_config_missing_file(self): from scripts.cli import _cmd_init + rc = _cmd_init(["k8s", "--config", "/nonexistent/path.yaml"]) assert rc != 0 @@ -377,6 +388,7 @@ def test_init_config_missing_file(self): # CLI: flowsim submit (parse/dry-run only, no actual submission) # ========================================================================= + class TestCLISubmit: """Tests for `flowsim submit` argument parsing and dry-run.""" @@ 
-390,6 +402,7 @@ def _run(self, *args: str, expect_ok: bool = True) -> str: from scripts.submit_profile import main as submit_main import io from contextlib import redirect_stdout + buf = io.StringIO() with redirect_stdout(buf): submit_main(list(args)) @@ -397,6 +410,7 @@ def _run(self, *args: str, expect_ok: bool = True) -> str: def test_submit_help(self, capsys): from scripts.submit_profile import main as submit_main + with pytest.raises(SystemExit) as exc_info: submit_main(["--help"]) assert exc_info.value.code == 0 @@ -406,14 +420,18 @@ def test_submit_help(self, capsys): def test_submit_missing_required(self): from scripts.submit_profile import main as submit_main + with pytest.raises(SystemExit): submit_main([]) def test_submit_local_dry_run(self): out = self._run( - "--scheduler", "local", - "--collect", "perf", - "--model-path", "Qwen/Qwen3-8B", + "--scheduler", + "local", + "--collect", + "perf", + "--model-path", + "Qwen/Qwen3-8B", "--dry-run", ) assert "scripts/run_stage_profile.py" in out @@ -421,20 +439,28 @@ def test_submit_local_dry_run(self): def test_submit_local_dry_run_with_gpus(self): out = self._run( - "--scheduler", "local", - "--collect", "perf", - "--model-path", "Qwen/Qwen3-8B", - "--local-gpus", "0,1", + "--scheduler", + "local", + "--collect", + "perf", + "--model-path", + "Qwen/Qwen3-8B", + "--local-gpus", + "0,1", "--dry-run", ) assert "device=0,1" in out def test_submit_k8s_dry_run(self): out = self._run( - "--scheduler", "k8s", - "--collect", "perf", - "--model-path", "Qwen/Qwen3-8B", - "--k8s-namespace", "default", + "--scheduler", + "k8s", + "--collect", + "perf", + "--model-path", + "Qwen/Qwen3-8B", + "--k8s-namespace", + "default", "--dry-run", ) assert "apiVersion: batch/v1" in out @@ -442,26 +468,31 @@ def test_submit_k8s_dry_run(self): def test_submit_slurm_dry_run(self): out = self._run( - "--scheduler", "slurm", - "--collect", "perf", - "--model-path", "Qwen/Qwen3-8B", - "--slurm-partition", "gpu", + "--scheduler", + "slurm", 
+ "--collect", + "perf", + "--model-path", + "Qwen/Qwen3-8B", + "--slurm-partition", + "gpu", "--dry-run", ) assert "#!/bin/bash" in out assert "#SBATCH --partition=gpu" in out - # ========================================================================= # Config loading # ========================================================================= + class TestConfig: """Tests for config file loading and saving.""" def test_save_and_load_yaml(self, tmp_path: Path): from schedulers.config import _save_yaml, _load_yaml + data = {"partition": "gpu", "account": "proj"} path = tmp_path / "test.yaml" _save_yaml(path, data) @@ -470,6 +501,7 @@ def test_save_and_load_yaml(self, tmp_path: Path): def test_cfg_get(self): from schedulers.config import cfg_get + cfg = {"key": "value", "empty": ""} assert cfg_get(cfg, "key", "default") == "value" assert cfg_get(cfg, "empty", "default") == "" From 9e2541a7fb18f97cb5d88736f884243456a8c694 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Thu, 19 Mar 2026 19:24:36 +0000 Subject: [PATCH 41/56] docs: add --existing-ctx and --decode-tokens to all examples, default decode-tokens to 2 --- README.md | 12 ++++++------ schedulers/README.md | 9 +++++---- scripts/run_stage_profile.py | 6 +++--- scripts/submit_profile.py | 4 ++-- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 604c2a5..6f1842b 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ pip install -e . 
flowsim submit --scheduler local \ --collect all \ --model-path workload/models/configs/Qwen3-235B-A22B \ - --tp 1 --bs 1 --input-len 2048 --gpus 1 \ + --tp 1 --bs 1 --input-len 2048 --existing-ctx 0 --decode-tokens 2 --gpus 1 \ --extra-server-opts "--load-format dummy" ``` @@ -109,7 +109,7 @@ FlowSim performs **stage-separated** profiling: it captures prefill (EXTEND) and Each profiling request produces **two** stage-separated traces: - **EXTEND** (prefill) — processes `input_len` new tokens (with optional `existing_ctx` tokens already in KV cache) -- **DECODE** — captures `decode-tokens` decode batch steps +- **DECODE** — captures `decode-tokens` decode batch steps (default 2) ### Collection modes @@ -126,28 +126,28 @@ Each profiling request produces **two** stage-separated traces: flowsim submit --scheduler local \ --collect perf \ --model-path workload/models/configs/Qwen3-235B-A22B \ - --tp 1 --bs 1 --input-len 2048 --gpus 1 \ + --tp 1 --bs 1 --input-len 2048 --existing-ctx 0 --decode-tokens 2 --gpus 1 \ --extra-server-opts "--load-format dummy" # With existing KV cache context flowsim submit --scheduler local \ --collect perf \ --model-path workload/models/configs/Qwen3-235B-A22B \ - --tp 1 --bs 4 --input-len 512 --existing-ctx 4096 --gpus 1 \ + --tp 1 --bs 4 --input-len 512 --existing-ctx 4096 --decode-tokens 2 --gpus 1 \ --extra-server-opts "--load-format dummy" # Full pipeline (perf + shapes) flowsim submit --scheduler local \ --collect all \ --model-path workload/models/configs/Qwen3-235B-A22B \ - --tp 1 --bs 1 --input-len 2048 --gpus 1 \ + --tp 1 --bs 1 --input-len 2048 --existing-ctx 0 --decode-tokens 2 --gpus 1 \ --extra-server-opts "--load-format dummy" # Multi-point sweep flowsim submit --scheduler local \ --collect all \ --model-path workload/models/configs/Qwen3-235B-A22B \ - --sweep 1:2048:0 4:2048:0 8:2048:0 --gpus 1 \ + --sweep 1:2048:0 4:2048:0 8:2048:0 --decode-tokens 2 --gpus 1 \ --extra-server-opts "--load-format dummy" ``` diff --git 
a/schedulers/README.md b/schedulers/README.md index d0835e7..86892bb 100644 --- a/schedulers/README.md +++ b/schedulers/README.md @@ -22,7 +22,7 @@ flowsim --help flowsim submit --scheduler \ --collect \ --model-path \ - --tp 1 --bs 1 --input-len 2048 --gpus 1 + --tp 1 --bs 1 --input-len 2048 --decode-tokens 2 --gpus 1 # Job lifecycle flowsim list --scheduler @@ -50,6 +50,7 @@ flowsim submit --scheduler \ | `--bs` | Batch size | `1` | | `--input-len` | Input sequence length | `2048` | | `--existing-ctx` | Existing KV cache length | `0` | +| `--decode-tokens` | Decode batches to profile | `2` | | `--gpus` | GPU count | `1` | | `--image` | Docker image | `flowsim-image:latest` | | `--output-dir` | Output directory | `stage_traces/{scheduler}/{timestamp}/` | @@ -65,7 +66,7 @@ Runs profiling via `docker run` on the host machine. flowsim submit --scheduler local \ --collect all \ --model-path workload/models/configs/Qwen3-235B-A22B \ - --tp 1 --bs 1 --input-len 2048 --gpus 1 \ + --tp 1 --bs 1 --input-len 2048 --existing-ctx 0 --decode-tokens 2 --gpus 1 \ --local-gpus 0 \ --extra-server-opts "--load-format dummy" ``` @@ -95,7 +96,7 @@ flowsim init k8s --config my-cluster.yaml # or use your own flowsim submit --scheduler k8s \ --collect all \ --model-path workload/models/configs/Qwen3-235B-A22B \ - --tp 1 --bs 1 --input-len 2048 --gpus 1 \ + --tp 1 --bs 1 --input-len 2048 --existing-ctx 0 --decode-tokens 2 --gpus 1 \ --extra-server-opts "--load-format dummy" ``` @@ -133,7 +134,7 @@ flowsim init slurm --config my-slurm.yaml # or use your own flowsim submit --scheduler slurm \ --collect all \ --model-path workload/models/configs/Qwen3-235B-A22B \ - --tp 1 --bs 1 --input-len 2048 --gpus 1 \ + --tp 1 --bs 1 --input-len 2048 --existing-ctx 0 --decode-tokens 2 --gpus 1 \ --slurm-partition gpu \ --extra-server-opts "--load-format dummy" ``` diff --git a/scripts/run_stage_profile.py b/scripts/run_stage_profile.py index 3c739ef..36505ec 100644 --- a/scripts/run_stage_profile.py 
+++ b/scripts/run_stage_profile.py @@ -61,14 +61,14 @@ python scripts/run_stage_profile.py \\ --collect perf \\ --host 0.0.0.0 --port 30001 \\ - --bs 1 --input-len 2048 --decode-tokens 32 \\ + --bs 1 --input-len 2048 --decode-tokens 2 \\ --output-dir /flowsim/stage_traces Example — with existing KV cache context python scripts/run_stage_profile.py \\ --collect perf \\ --host 0.0.0.0 --port 30001 \\ - --bs 4 --input-len 512 --existing-ctx 4096 --decode-tokens 32 \\ + --bs 4 --input-len 512 --existing-ctx 4096 --decode-tokens 2 \\ --output-dir /flowsim/stage_traces Example — launch server + full pipeline (perf → shapes) @@ -113,7 +113,7 @@ # Defaults # --------------------------------------------------------------------------- DEFAULT_WARMUP_N = 5 -DEFAULT_DECODE_TOKENS = 32 +DEFAULT_DECODE_TOKENS = 2 DEFAULT_MAX_PREFILL_TOKENS = 131072 diff --git a/scripts/submit_profile.py b/scripts/submit_profile.py index 5e1021a..0c3074c 100644 --- a/scripts/submit_profile.py +++ b/scripts/submit_profile.py @@ -19,7 +19,7 @@ --collect perf \\ --model-path Qwen/Qwen3-235B-A22B-FP8 \\ --tp 4 --gpus 4 \\ - --bs 1 --input-len 2048 --decode-tokens 32 \\ + --bs 1 --input-len 2048 --decode-tokens 2 \\ --image flowsim-image:latest \\ --k8s-namespace default \\ --k8s-pvc flowsim-traces \\ @@ -98,7 +98,7 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: wl.add_argument("--bs", type=int, default=1, help="Batch size") wl.add_argument("--input-len", type=int, default=2048) wl.add_argument("--existing-ctx", type=int, default=0) - wl.add_argument("--decode-tokens", type=int, default=32) + wl.add_argument("--decode-tokens", type=int, default=2) wl.add_argument("--warmup-n", type=int, default=5) wl.add_argument( "--disable-chunked-prefill", From 880fe055c117a1f5a184c5f6fc8605ac0d3f668d Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Thu, 19 Mar 2026 19:31:46 +0000 Subject: [PATCH 42/56] refactor: remove PyYAML fallback, make it a core dependency --- pyproject.toml | 2 
+- schedulers/config.py | 32 +++++++------------------------- 2 files changed, 8 insertions(+), 26 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index feade94..c91de8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,13 +14,13 @@ dependencies = [ "perfetto>=0.7", "numpy>=1.24", "pandas>=1.5", + "PyYAML>=6.0", ] [project.optional-dependencies] # Scheduler backends ------------------------------------------------------- k8s = [ "kubernetes>=27.0", # K8s Python client for remote job submission - "PyYAML>=6.0", # nicer YAML dry-run output (json fallback w/o this) ] slurm = [] # Slurm REST API uses stdlib urllib only diff --git a/schedulers/config.py b/schedulers/config.py index 433c87b..10c7f8d 100644 --- a/schedulers/config.py +++ b/schedulers/config.py @@ -23,40 +23,22 @@ import os from pathlib import Path -# Optional: try PyYAML, fall back to JSON -try: - import yaml as _yaml +import yaml as _yaml - def _load_yaml(path: Path) -> dict: - with open(path) as f: - return _yaml.safe_load(f) or {} -except ImportError: - import json as _json - - def _load_yaml(path: Path) -> dict: # type: ignore[misc] - """Fallback: accept JSON (valid YAML 1.2 subset).""" - with open(path) as f: - return _json.load(f) +def _load_yaml(path: Path) -> dict: + with open(path) as f: + return _yaml.safe_load(f) or {} _CONFIG_DIR = Path.home() / ".flowsim" def _save_yaml(path: Path, data: dict) -> None: - """Write a dict to a YAML file (uses PyYAML if available, else JSON).""" + """Write a dict to a YAML file.""" path.parent.mkdir(parents=True, exist_ok=True) - try: - import yaml as _y - - with open(path, "w") as f: - _y.safe_dump(data, f, default_flow_style=False, sort_keys=False) - except ImportError: - import json as _j - - with open(path, "w") as f: - _j.dump(data, f, indent=2, ensure_ascii=False) - f.write("\n") + with open(path, "w") as f: + _yaml.safe_dump(data, f, default_flow_style=False, sort_keys=False) def _resolve_path(env_var: str, filename: str) -> Path | None: 
From 236548a277b6a9d933d0649e85488bfd98d87b82 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Thu, 19 Mar 2026 21:03:25 +0000 Subject: [PATCH 43/56] fix: reject k8s submit when no PVC or hostPath configured --- schedulers/k8s.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/schedulers/k8s.py b/schedulers/k8s.py index e75e7f8..8c07771 100644 --- a/schedulers/k8s.py +++ b/schedulers/k8s.py @@ -186,6 +186,11 @@ def _build_job_dict(self, spec: ProfileJobSpec) -> dict: def submit(self, spec: ProfileJobSpec) -> JobResult: """Submit via the ``kubernetes`` Python client (``pip install kubernetes``).""" + if not self.pvc_name and not self.host_output_dir: + raise ValueError( + "No persistent storage configured. " + "Set --k8s-pvc or --k8s-host-output-dir to avoid losing traces when the pod exits." + ) batch_api, _ = self._load_k8s() body = self._build_job_dict(spec) @@ -281,7 +286,7 @@ def status(self, job_id: str) -> dict: elif self.host_output_dir: output_hint = f"Traces at hostPath {self.host_output_dir} on the scheduled node" else: - output_hint = "WARNING: no PVC or hostPath configured — traces are lost when pod exits" + output_hint = "WARNING: no PVC or hostPath configured — traces will be lost when pod exits" msg_parts = [ f"Job: {job_id} Namespace: {self.namespace} State: {state}" From 9daee82717c0d423822230cc3989e1ca6df76b17 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Thu, 19 Mar 2026 21:27:00 +0000 Subject: [PATCH 44/56] docs: add missing parameters --- schedulers/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/schedulers/README.md b/schedulers/README.md index 86892bb..703ee9c 100644 --- a/schedulers/README.md +++ b/schedulers/README.md @@ -54,6 +54,9 @@ flowsim submit --scheduler \ | `--gpus` | GPU count | `1` | | `--image` | Docker image | `flowsim-image:latest` | | `--output-dir` | Output directory | `stage_traces/{scheduler}/{timestamp}/` | +| `--extra-server-opts` | Extra sglang server flags (quoted string) 
| `""` | +| `--sweep` | Multi-point sweep `BS:INPUT_LEN:CTX` (repeatable) | empty | +| `--job-name` | Custom job name | auto-generated | | `--dry-run` | Print script only | `false` | --- From 5e3d1bb12ffa3fe6e6ab8227d38350b7cf6ac00b Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Thu, 19 Mar 2026 21:52:02 +0000 Subject: [PATCH 45/56] fix: unique job names, Slurm exclusive GPU, remove list_jobs prefix filter - Add timestamp suffix (-MMDD-HHMMSS) to auto-generated job names for uniqueness - Add #SBATCH --exclusive to Slurm scripts for profiling GPU isolation - Remove flowsim- prefix filter from Slurm list_jobs (let users filter) - Add --sweep-file to scheduler README Common Parameters table --- schedulers/README.md | 1 + schedulers/base.py | 14 +++++++++++--- schedulers/slurm.py | 3 +-- tests/unit/test_scheduler_cli.py | 3 ++- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/schedulers/README.md b/schedulers/README.md index 703ee9c..6e7600f 100644 --- a/schedulers/README.md +++ b/schedulers/README.md @@ -56,6 +56,7 @@ flowsim submit --scheduler \ | `--output-dir` | Output directory | `stage_traces/{scheduler}/{timestamp}/` | | `--extra-server-opts` | Extra sglang server flags (quoted string) | `""` | | `--sweep` | Multi-point sweep `BS:INPUT_LEN:CTX` (repeatable) | empty | +| `--sweep-file` | File with one `BS:INPUT_LEN:CTX` per line (mutually exclusive with `--sweep`) | none | | `--job-name` | Custom job name | auto-generated | | `--dry-run` | Print script only | `false` | diff --git a/schedulers/base.py b/schedulers/base.py index d3b32c4..ac71548 100644 --- a/schedulers/base.py +++ b/schedulers/base.py @@ -4,6 +4,7 @@ import abc import shlex +import time from dataclasses import dataclass, field from typing import Optional, Sequence @@ -126,14 +127,21 @@ def build_shell_command(self) -> str: return " ".join(quoted) def default_job_name(self) -> str: - """Generate a default job name from workload params.""" + """Generate a default job name from 
workload params. + + Auto-generated names include a short timestamp suffix + (``-MMDD-HHMMSS``) so repeated submissions of the same + workload get distinct names. User-supplied ``--job-name`` + values are returned as-is. + """ if self.job_name: return self.job_name model_short = self.model_path.split("/")[-1].lower().replace(".", "-") + ts = time.strftime("%m%d-%H%M%S") if self.sweep_points: - name = f"flowsim-{self.collect}-{model_short}-sweep{len(self.sweep_points)}pt" + name = f"flowsim-{self.collect}-{model_short}-sweep{len(self.sweep_points)}pt-{ts}" else: - name = f"flowsim-{self.collect}-{model_short}-bs{self.bs}-il{self.input_len}" + name = f"flowsim-{self.collect}-{model_short}-bs{self.bs}-il{self.input_len}-{ts}" return name diff --git a/schedulers/slurm.py b/schedulers/slurm.py index ad12e75..55b194d 100644 --- a/schedulers/slurm.py +++ b/schedulers/slurm.py @@ -78,6 +78,7 @@ def render(self, spec: ProfileJobSpec) -> str: f"#SBATCH --partition={self.partition}", f"#SBATCH --gpus-per-node={spec.gpus}", f"#SBATCH --ntasks=1", + f"#SBATCH --exclusive", f"#SBATCH --time={self.time_limit}", f"#SBATCH --output={spec.output_dir}/{job_name}_%j.log", ] @@ -312,8 +313,6 @@ def _list_jobs_cli(self, *, status_filter: str = "") -> list[dict]: continue parts = line.split("|", 4) name = parts[1] if len(parts) > 1 else "" - if not name.startswith("flowsim-"): - continue state = parts[2] if len(parts) > 2 else "UNKNOWN" if status_filter and state.upper() != status_filter.upper(): continue diff --git a/tests/unit/test_scheduler_cli.py b/tests/unit/test_scheduler_cli.py index b6c1afb..c6f329b 100644 --- a/tests/unit/test_scheduler_cli.py +++ b/tests/unit/test_scheduler_cli.py @@ -35,7 +35,7 @@ def spec(self) -> ProfileJobSpec: def test_default_job_name(self, spec: ProfileJobSpec): name = spec.default_job_name() - assert name == "flowsim-perf-qwen3-8b-bs4-il1024" + assert name.startswith("flowsim-perf-qwen3-8b-bs4-il1024-") def test_custom_job_name(self, spec: 
ProfileJobSpec): spec.job_name = "my-job" @@ -182,6 +182,7 @@ def test_render_sbatch_directives(self, scheduler, spec): script = scheduler.render(spec) assert "#SBATCH --partition=gpu-h100" in script assert "#SBATCH --gpus-per-node=4" in script + assert "#SBATCH --exclusive" in script assert "#SBATCH --time=01:00:00" in script assert "#SBATCH --account=my-proj" in script From 2a718a7228249ee9b69c8c7561003b3f6a39e0b1 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Thu, 19 Mar 2026 22:03:28 +0000 Subject: [PATCH 46/56] refactor: restructure CLI into scripts/cli/ subpackage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - cli.py → cli/__init__.py (entry point + init command) - submit_profile.py → cli/submit.py (flowsim submit) - status_profile.py → cli/manage.py (flowsim status/logs/list/cancel) - Update all import paths in tests --- scripts/{cli.py => cli/__init__.py} | 12 ++++++------ scripts/{status_profile.py => cli/manage.py} | 0 scripts/{submit_profile.py => cli/submit.py} | 6 +++--- tests/unit/test_scheduler_cli.py | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) rename scripts/{cli.py => cli/__init__.py} (91%) rename scripts/{status_profile.py => cli/manage.py} (100%) rename scripts/{submit_profile.py => cli/submit.py} (99%) diff --git a/scripts/cli.py b/scripts/cli/__init__.py similarity index 91% rename from scripts/cli.py rename to scripts/cli/__init__.py index 04d0c84..9d4755e 100644 --- a/scripts/cli.py +++ b/scripts/cli/__init__.py @@ -16,7 +16,7 @@ _CONFIG_DIR = Path.home() / ".flowsim" _TEMPLATES_DIR = ( - Path(__file__).resolve().parent.parent / "schedulers" / "templates" + Path(__file__).resolve().parent.parent.parent / "schedulers" / "templates" ) @@ -130,31 +130,31 @@ def main(argv: list[str] | None = None) -> int: return _cmd_init(remaining) if args.command == "submit": - from scripts.submit_profile import main as submit_main + from scripts.cli.submit import main as submit_main 
submit_main(remaining) return 0 if args.command == "status": - from scripts.status_profile import main_status + from scripts.cli.manage import main_status main_status(remaining) return 0 if args.command == "logs": - from scripts.status_profile import main_logs + from scripts.cli.manage import main_logs main_logs(remaining) return 0 if args.command == "list": - from scripts.status_profile import main_list + from scripts.cli.manage import main_list main_list(remaining) return 0 if args.command == "cancel": - from scripts.status_profile import main_cancel + from scripts.cli.manage import main_cancel main_cancel(remaining) return 0 diff --git a/scripts/status_profile.py b/scripts/cli/manage.py similarity index 100% rename from scripts/status_profile.py rename to scripts/cli/manage.py diff --git a/scripts/submit_profile.py b/scripts/cli/submit.py similarity index 99% rename from scripts/submit_profile.py rename to scripts/cli/submit.py index 0c3074c..a6370f8 100644 --- a/scripts/submit_profile.py +++ b/scripts/cli/submit.py @@ -14,7 +14,7 @@ Dry-run (print Kubernetes Job YAML to stdout): - python scripts/submit_profile.py \\ + flowsim submit \\ --scheduler k8s \\ --collect perf \\ --model-path Qwen/Qwen3-235B-A22B-FP8 \\ @@ -27,7 +27,7 @@ Dry-run (print Slurm sbatch script to stdout): - python scripts/submit_profile.py \\ + flowsim submit \\ --scheduler slurm \\ --collect perf \\ --model-path Qwen/Qwen3-235B-A22B-FP8 \\ @@ -38,7 +38,7 @@ Submit directly to cluster: - python scripts/submit_profile.py \\ + flowsim submit \\ --scheduler k8s \\ ... 
\\ --submit diff --git a/tests/unit/test_scheduler_cli.py b/tests/unit/test_scheduler_cli.py index c6f329b..2491b7e 100644 --- a/tests/unit/test_scheduler_cli.py +++ b/tests/unit/test_scheduler_cli.py @@ -400,7 +400,7 @@ def _skip_image_check(self): def _run(self, *args: str, expect_ok: bool = True) -> str: """Run submit via the Python function, capture stdout.""" - from scripts.submit_profile import main as submit_main + from scripts.cli.submit import main as submit_main import io from contextlib import redirect_stdout @@ -410,7 +410,7 @@ def _run(self, *args: str, expect_ok: bool = True) -> str: return buf.getvalue() def test_submit_help(self, capsys): - from scripts.submit_profile import main as submit_main + from scripts.cli.submit import main as submit_main with pytest.raises(SystemExit) as exc_info: submit_main(["--help"]) @@ -420,7 +420,7 @@ def test_submit_help(self, capsys): assert "local" in out def test_submit_missing_required(self): - from scripts.submit_profile import main as submit_main + from scripts.cli.submit import main as submit_main with pytest.raises(SystemExit): submit_main([]) From 31dc15ba9d1ce76a193384173057ecdbd588fedd Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Thu, 19 Mar 2026 22:22:41 +0000 Subject: [PATCH 47/56] fix: use runtime:nvidia for slurm compute node GPU access Replace deploy.resources.reservations with runtime:nvidia + NVIDIA_VISIBLE_DEVICES to fix NVML initialization failure in slurmd-0. 
--- tests/integration/infra/slurm-compose.yaml | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/integration/infra/slurm-compose.yaml b/tests/integration/infra/slurm-compose.yaml index c2369ba..d24ab6f 100644 --- a/tests/integration/infra/slurm-compose.yaml +++ b/tests/integration/infra/slurm-compose.yaml @@ -83,6 +83,9 @@ services: <<: *slurm-base container_name: slurmd-0 hostname: slurmd-0 + runtime: nvidia + environment: + NVIDIA_VISIBLE_DEVICES: "0" command: > bash -c " mkdir -p /run/munge && chown munge:munge /run/munge @@ -100,13 +103,6 @@ services: - /home/administrator/zhangt/FlowSim/stage_traces:/flowsim/stage_traces # Cgroup needed by slurmd - /sys/fs/cgroup:/sys/fs/cgroup:rw - deploy: - resources: - reservations: - devices: - - driver: nvidia - device_ids: ["0"] - capabilities: [gpu] # ---- REST API (optional, for REST mode) ---- # slurmrestd: From b7ec2cb5b963fc4cd9690b7a1914a2d1f2fbc0b6 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Thu, 19 Mar 2026 22:57:12 +0000 Subject: [PATCH 48/56] =?UTF-8?q?refactor:=20rename=20test=5Fscheduler=5Fl?= =?UTF-8?q?ocal.py=20=E2=86=92=20test=5Fscheduler.py,=20rewrite=20header?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove misleading 'local' suffix (file tests all 3 backends) - Add test methodology (How It Works) and Pass Criteria to docstring - Update file references in schedulers/README.md --- schedulers/README.md | 4 +- ...t_scheduler_local.py => test_scheduler.py} | 65 +++++++++++-------- 2 files changed, 41 insertions(+), 28 deletions(-) rename tests/integration/{test_scheduler_local.py => test_scheduler.py} (93%) diff --git a/schedulers/README.md b/schedulers/README.md index 6e7600f..dc91616 100644 --- a/schedulers/README.md +++ b/schedulers/README.md @@ -235,7 +235,7 @@ docker compose -f slurm-compose.yaml down -v python -m pytest tests/unit/test_scheduler_cli.py -v # Integration tests -python -m pytest 
tests/integration/test_scheduler_local.py::TestK8sScheduler -v -x -python -m pytest tests/integration/test_scheduler_local.py::TestSlurmScheduler -v -x +python -m pytest tests/integration/test_scheduler.py::TestK8sScheduler -v -x +python -m pytest tests/integration/test_scheduler.py::TestSlurmScheduler -v -x ``` diff --git a/tests/integration/test_scheduler_local.py b/tests/integration/test_scheduler.py similarity index 93% rename from tests/integration/test_scheduler_local.py rename to tests/integration/test_scheduler.py index a6bc416..7ecaf9b 100644 --- a/tests/integration/test_scheduler_local.py +++ b/tests/integration/test_scheduler.py @@ -1,43 +1,56 @@ -"""Integration tests for the FlowSim scheduler CLI. +"""Integration tests for all FlowSim scheduler backends. -Tests all three scheduler backends (local, k8s, slurm) end-to-end. - -* **local** — submits jobs via ``flowsim submit --scheduler local`` which - launches Docker containers on the host. Validates job lifecycle (submit, - list, status) and trace CSV correctness (GEMM dim0, FlashAttn seqlen). -* **k8s** — submits a real Job to a Kind cluster, retrieves traces via - ``docker cp``, and validates trace CSVs. Auto-sets up the Kind cluster - via ``dev-setup.sh`` if not already running. -* **slurm** — submits a real job to a local docker-compose Slurm cluster, - retrieves traces via ``docker cp``, and validates trace CSVs. Auto-sets - up the Slurm cluster via ``dev-setup.sh slurm`` if not already running. +How It Works +------------ +Each test class exercises one scheduler backend end-to-end through the +``flowsim`` CLI (the same commands a user would run). The flow is: + +1. ``flowsim submit`` — submit a ``--collect all`` profiling job. +2. ``flowsim list`` — verify the job appears in the listing. +3. ``flowsim status`` — poll until Completed / Succeeded (up to 20 min). +4. Validate outputs on the host file system. 
+ +Infrastructure is auto-provisioned by session-scoped fixtures: + +* **Local** — uses Docker on the host directly (no extra infra). +* **K8s** — spins up a Kind cluster via ``dev-setup.sh kind``. +* **Slurm** — spins up a docker-compose Slurm cluster via + ``dev-setup.sh slurm`` (slurmctld + slurmd-0 with GPU 0). + +Pass Criteria +------------- +* Job reaches Completed/Succeeded within the timeout. +* Stage-separated trace files exist (EXTEND + DECODE ``.trace.json.gz``). +* Parsed CSVs exist under ``parsed/`` with non-zero rows. +* GEMM kernels: EXTEND ``dim0 == bs * input_len``, DECODE ``dim0 == bs``. +* FlashAttn kernels: EXTEND dims contain ``[bs, input_len + existing_ctx]`` (±1). +* ``analysis_extend.json`` and ``analysis_decode.json`` are valid JSON. +* After ``--collect shapes``, ``Dims`` column is present in merged CSVs. +* Sweep jobs produce per-point subdirs + ``sweep_summary.json``. +* Log files (stdout/stderr) exist under ``logs/``. Requirements ------------ -* Docker with ``flowsim-image:latest`` built (for local tests). -* A GPU-equipped host machine (local tests run on the physical host, - NOT inside a Docker container). -* ``tests/integration/infra/dev-setup.sh`` available (Kind and Slurm clusters are - automatically created if missing). -* ``schedulers/`` available on PYTHONPATH. +* Docker with ``flowsim-image:latest`` built. +* GPU-equipped host machine. +* ``tests/integration/infra/dev-setup.sh`` available. Environment Variables --------------------- ``MODEL`` - Model path relative to project root - (default: ``workload/models/configs/Qwen3-235B-A22B``). + Model path (default: ``workload/models/configs/Qwen3-235B-A22B``). ``LOAD_FORMAT`` Load format (default: ``dummy``). 
Usage ----- - # On host (local scheduler tests — needs Docker + GPU): - cd FlowSim && python -m pytest \ - tests/integration/test_scheduler_local.py -v -x -k "local" + # All scheduler tests: + python -m pytest tests/integration/test_scheduler.py -v -x - # On host (k8s tests — needs kubeconfig): - python -m pytest tests/integration/test_scheduler_local.py \ - -v -x -k "k8s" + # Single backend: + python -m pytest tests/integration/test_scheduler.py -v -x -k "local" + python -m pytest tests/integration/test_scheduler.py -v -x -k "k8s" + python -m pytest tests/integration/test_scheduler.py -v -x -k "slurm" """ import ast From 7d4088944bc204f04ede0131954dc7cd346e53e0 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Thu, 19 Mar 2026 23:55:51 +0000 Subject: [PATCH 49/56] docs: update output structure to include logs, merged, and shape dirs Sync the output tree in both README.md and schedulers/README.md to reflect the actual directory layout produced by profiling jobs: - Add logs/ with server, shape_server, and job log entries - Add merged/ and shape_traces/ + shape_parsed/ inside point dirs - Add brief descriptions of each subdirectory in root README --- README.md | 22 +++++++++++++--------- schedulers/README.md | 5 +++-- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 6f1842b..6fa4289 100644 --- a/README.md +++ b/README.md @@ -157,20 +157,24 @@ For K8s / Slurm clusters, replace `--scheduler local` with `k8s` or `slurm`. See ``` stage_traces/{scheduler}/{YYYYMMDD_HHMMSS}/ -├── sweep_summary.json ├── bs1_input2048_ctx0/ -│ ├── *-TP-*-EXTEND.trace.json.gz -│ ├── *-TP-*-DECODE.trace.json.gz -│ ├── parsed/ -│ │ ├── TP-0-EXTEND.csv -│ │ ├── TP-0-DECODE.csv -│ │ └── ... +│ ├── *.trace.json.gz +│ ├── parsed/*.csv +│ ├── merged/*_merged.trace.csv +│ ├── shape_traces/ + shape_parsed/ │ ├── analysis_extend.json │ └── analysis_decode.json -└── ... 
+├── logs/ +│ ├── server_*.{stdout,stderr}.log +│ ├── shape_server_*.{stdout,stderr}.log +│ └── {job_name}_*.{stdout,stderr}.log +└── sweep_summary.json ``` -After `--collect shapes`, each `parsed/TP-*-DECODE.csv` gains a `Dims` column with kernel tensor shapes. +- `parsed/`: Per-rank timing CSVs extracted from traces. +- `merged/`: Timing + shape columns joined into a single CSV per rank/stage. +- `shape_traces/` / `shape_parsed/`: Raw and parsed shape-profiling traces (generated by `--collect shapes` or `--collect all`). +- `logs/`: Server, shape-server, and job stdout/stderr logs. ### Utilities (`utils/`) diff --git a/schedulers/README.md b/schedulers/README.md index dc91616..3994cb6 100644 --- a/schedulers/README.md +++ b/schedulers/README.md @@ -206,8 +206,9 @@ stage_traces/{scheduler}/{YYYYMMDD_HHMMSS}/ │ ├── analysis_extend.json │ └── analysis_decode.json ├── logs/ -│ ├── server_*.stdout.log -│ └── server_*.stderr.log +│ ├── server_*.{stdout,stderr}.log +│ ├── shape_server_*.{stdout,stderr}.log +│ └── {job_name}_*.{stdout,stderr}.log └── sweep_summary.json ``` From 86dd517fce4b00c5454e4fd938b2510349ff14a9 Mon Sep 17 00:00:00 2001 From: Terrence <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 19 Mar 2026 19:09:38 -0700 Subject: [PATCH 50/56] Update scripts/cli/submit.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- scripts/cli/submit.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/cli/submit.py b/scripts/cli/submit.py index a6370f8..e59d697 100644 --- a/scripts/cli/submit.py +++ b/scripts/cli/submit.py @@ -36,14 +36,12 @@ --slurm-time 02:00:00 \\ --dry-run -Submit directly to cluster: +Submit directly to cluster (omit --dry-run): flowsim submit \\ --scheduler k8s \\ - ... \\ - --submit + ... 
""" - from __future__ import annotations import argparse From 2dbb8966857d51dbc370c0c2bbb4386438563e4a Mon Sep 17 00:00:00 2001 From: Terrence <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 19 Mar 2026 19:11:42 -0700 Subject: [PATCH 51/56] Update schedulers/local.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- schedulers/local.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/schedulers/local.py b/schedulers/local.py index f9c2aa8..7015d28 100644 --- a/schedulers/local.py +++ b/schedulers/local.py @@ -215,16 +215,23 @@ def _tee(src, dest_file, dest_stream): ) def cancel(self, job_id: str) -> str: - """Stop the Docker container for a local job.""" + """Stop the Docker container for a local job. + + The Docker container name is truncated to 63 characters when created. + To ensure we stop the correct container even if a longer job id is + provided (for example, the full job name), apply the same truncation + here before calling ``docker stop``. 
+ """ + container_name = job_id[:63] proc = subprocess.run( - ["docker", "stop", job_id], + ["docker", "stop", container_name], capture_output=True, text=True, timeout=30, ) if proc.returncode == 0: - return f"Stopped container {job_id}" - return f"Could not stop container {job_id}: {proc.stderr.strip()}" + return f"Stopped container {container_name}" + return f"Could not stop container {container_name}: {proc.stderr.strip()}" def _find_log_dirs(self) -> list[str]: """Find all log directories under stage_traces/{scheduler}/*/logs/.""" From f30329a08f3157cc8cc356e6e2f2cf395dabb23f Mon Sep 17 00:00:00 2001 From: Terrence <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 19 Mar 2026 19:12:01 -0700 Subject: [PATCH 52/56] Update tests/integration/infra/kind-multi-node.yaml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/integration/infra/kind-multi-node.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/infra/kind-multi-node.yaml b/tests/integration/infra/kind-multi-node.yaml index 90b4e6f..7ec1b68 100644 --- a/tests/integration/infra/kind-multi-node.yaml +++ b/tests/integration/infra/kind-multi-node.yaml @@ -29,9 +29,9 @@ nodes: extraMounts: - hostPath: /dev/null containerPath: /var/run/nvidia-container-devices/0 - - hostPath: /home/administrator/zhangt + - hostPath: /path/to/host/workspace containerPath: /workspace readOnly: true # Writable mount so K8s pods can write traces directly to host - - hostPath: /home/administrator/zhangt/FlowSim/stage_traces + - hostPath: /path/to/host/stage_traces containerPath: /host-stage-traces From 471876428bb6c8ee8adc930341ef046d6a84647b Mon Sep 17 00:00:00 2001 From: Terrence <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 19 Mar 2026 19:12:30 -0700 Subject: [PATCH 53/56] Update tests/integration/infra/slurm-compose.yaml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/integration/infra/slurm-compose.yaml | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/infra/slurm-compose.yaml b/tests/integration/infra/slurm-compose.yaml index d24ab6f..b772eb9 100644 --- a/tests/integration/infra/slurm-compose.yaml +++ b/tests/integration/infra/slurm-compose.yaml @@ -1,4 +1,4 @@ -# Slurm test cluster — slurmctld + 2 compute nodes (GPU 0, GPU 1) + slurmrestd +# Slurm test cluster — slurmctld + 1 compute node (GPU 0) # # Usage: # cd tests/integration/infra/ From 8f790542fb20556eef4bf7737035bf805012628c Mon Sep 17 00:00:00 2001 From: Terrence <39916879+TerrenceZhangX@users.noreply.github.com> Date: Thu, 19 Mar 2026 19:15:48 -0700 Subject: [PATCH 54/56] Update tests/integration/infra/slurm.conf Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/integration/infra/slurm.conf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/infra/slurm.conf b/tests/integration/infra/slurm.conf index 7a26d5c..ea7611b 100644 --- a/tests/integration/infra/slurm.conf +++ b/tests/integration/infra/slurm.conf @@ -1,8 +1,8 @@ -# slurm.conf — minimal 2-node cluster for FlowSim testing +# slurm.conf — minimal single-node cluster for FlowSim testing # # Controller: slurmctld -# Compute: slurmd-0 (1 GPU), slurmd-1 (1 GPU) -# REST API: slurmrestd on port 6820 +# Compute: slurmd-0 (1 GPU) +# REST API: not provisioned in this test configuration ClusterName=flowsim SlurmctldHost=slurmctld From 7d64cfce41c42c866efd65ed15941f6ce42c3cb9 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Fri, 20 Mar 2026 02:44:35 +0000 Subject: [PATCH 55/56] fix: parameterize hardcoded host paths in slurm-compose MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace absolute /home/administrator/… bind mounts with: - ${HOST_WORKSPACE} env var for the read-only /workspace mount - Relative path ../../../stage_traces for the writable traces mount dev-setup.sh now exports HOST_WORKSPACE (defaults to 
parent of repo root) before invoking docker compose. --- tests/integration/infra/dev-setup.sh | 5 +++++ tests/integration/infra/slurm-compose.yaml | 10 +++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/integration/infra/dev-setup.sh b/tests/integration/infra/dev-setup.sh index afbb9f7..02e447f 100755 --- a/tests/integration/infra/dev-setup.sh +++ b/tests/integration/infra/dev-setup.sh @@ -302,7 +302,12 @@ setup_slurm() { err "docker compose v2 is required but not available." fi + # HOST_WORKSPACE is used by slurm-compose.yaml for the read-only /workspace mount. + REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" + export HOST_WORKSPACE="${HOST_WORKSPACE:-$(dirname "${REPO_ROOT}")}" + log "Building and starting Slurm cluster (slurmctld + 2 slurmd + slurmrestd)..." + log " HOST_WORKSPACE=${HOST_WORKSPACE}" docker compose -f "${SCRIPT_DIR}/slurm-compose.yaml" up -d --build log "Waiting for slurmctld to become ready..." diff --git a/tests/integration/infra/slurm-compose.yaml b/tests/integration/infra/slurm-compose.yaml index b772eb9..b9ba09a 100644 --- a/tests/integration/infra/slurm-compose.yaml +++ b/tests/integration/infra/slurm-compose.yaml @@ -1,6 +1,10 @@ # Slurm test cluster — slurmctld + 1 compute node (GPU 0) # +# Requires HOST_WORKSPACE env var pointing to the directory containing +# model weights (mounted read-only into containers as /workspace). 
+# # Usage: +# export HOST_WORKSPACE=/path/to/workspace # cd tests/integration/infra/ # docker compose -f slurm-compose.yaml up -d # @@ -33,7 +37,7 @@ x-slurm-base: &slurm-base - slurm-etc:/etc/slurm - munge-socket:/run/munge # Share workspace for model weights / traces - - /home/administrator/zhangt:/workspace:ro + - ${HOST_WORKSPACE:?set HOST_WORKSPACE to the directory containing model weights}:/workspace:ro networks: - slurm-net @@ -98,9 +102,9 @@ services: - slurm-etc:/etc/slurm:ro - munge-key:/etc/munge:ro - munge-socket:/run/munge - - /home/administrator/zhangt:/workspace:ro + - ${HOST_WORKSPACE:?set HOST_WORKSPACE}:/workspace:ro # Writable mount so traces appear on host - - /home/administrator/zhangt/FlowSim/stage_traces:/flowsim/stage_traces + - ../../../stage_traces:/flowsim/stage_traces # Cgroup needed by slurmd - /sys/fs/cgroup:/sys/fs/cgroup:rw From 76e75bd91b9c90b71ae659bb9ca0b7502b4214e0 Mon Sep 17 00:00:00 2001 From: Terrence Zhang Date: Fri, 20 Mar 2026 02:51:08 +0000 Subject: [PATCH 56/56] fix: auto-mount output_dir in docker/enroot container modes Without mounting spec.output_dir into the container, traces and logs are written to the ephemeral container filesystem and lost on exit. Docker mode: prepend -v output_dir:output_dir to the mount list. Enroot mode: append output_dir:output_dir to --container-mounts. --- schedulers/slurm.py | 14 +++++++++----- tests/unit/test_scheduler_cli.py | 4 ++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/schedulers/slurm.py b/schedulers/slurm.py index 55b194d..543b22f 100644 --- a/schedulers/slurm.py +++ b/schedulers/slurm.py @@ -107,22 +107,26 @@ def render(self, spec: ProfileJobSpec) -> str: lines.append("") if self.container_runtime == "docker": - mounts = "" + # Always mount output_dir so traces/logs persist on the host. 
+ mounts = f" -v {spec.output_dir}:{spec.output_dir}" if self.container_mounts: - mounts = f" -v {self.container_mounts}" + mounts += f" -v {self.container_mounts}" lines.append( f"docker run --gpus all --ipc=host --shm-size=16g" f"{mounts} -w /flowsim {spec.image} \\" ) lines.append(f" {cmd}") elif self.container_runtime == "enroot": - mounts = "" + # Always mount output_dir so traces/logs persist on the host. + out_mount = f"{spec.output_dir}:{spec.output_dir}" if self.container_mounts: - mounts = f" --container-mounts={self.container_mounts}" + all_mounts = f"{self.container_mounts},{out_mount}" + else: + all_mounts = out_mount lines.append( f"srun --container-image={spec.image}" f" --container-workdir=/flowsim" - f"{mounts} \\" + f" --container-mounts={all_mounts} \\" ) lines.append(f" {cmd}") elif self.container_runtime == "none": diff --git a/tests/unit/test_scheduler_cli.py b/tests/unit/test_scheduler_cli.py index 2491b7e..9f9c5ab 100644 --- a/tests/unit/test_scheduler_cli.py +++ b/tests/unit/test_scheduler_cli.py @@ -204,6 +204,8 @@ def test_render_docker_runtime(self, spec): script = sched.render(spec) assert "docker run" in script assert "-v /data:/data" in script + # output_dir is always auto-mounted + assert f"-v {spec.output_dir}:{spec.output_dir}" in script def test_render_enroot_runtime(self, spec): sched = SlurmScheduler( @@ -212,6 +214,8 @@ def test_render_enroot_runtime(self, spec): ) script = sched.render(spec) assert "srun --container-image" in script + # output_dir is always auto-mounted + assert f"{spec.output_dir}:{spec.output_dir}" in script def test_render_modules(self, spec): sched = SlurmScheduler(