-
Notifications
You must be signed in to change notification settings - Fork 45
Add Primus pretrain integration (primus_pretrain) #146
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| [submodule "scripts/Primus"] | ||
| path = scripts/Primus | ||
| url = https://github.com/AMD-AGI/Primus |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,42 @@ | ||
| # CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} | ||
| # Primus launcher for MAD: one image for all Primus pretrain configs (torchtitan, megatron, MaxText, …). | ||
| # | ||
| # Build context must be the repo root (so COPY scripts/Primus works). Manual: | ||
| # docker build -f docker/primus.ubuntu.amd.Dockerfile . | ||
| # `madengine build` uses context `.` for models whose dockerfile path contains "primus" | ||
| # (see DockerBuilder.get_context_path in madengine). | ||
| # | ||
| # PRIMUS_ROOT is /workspace/Primus (Primus repo root: examples/run_pretrain.sh, examples/<backend>/…). | ||
| # WORKSPACE_DIR is the generic working directory /workspace; madengine places manifests and | ||
| # run_directory there. Do not set PRIMUS_ROOT=/workspace — that would collide with those files. | ||
| # | ||
| # Kubernetes: the Job mounts an emptyDir on /workspace, so image layers under /workspace are not | ||
| # visible in the pod. Madengine bundles `scripts/Primus/examples/...` into the ConfigMap as | ||
| # `Primus/examples/...` so the init container recreates /workspace/Primus (see madengine k8s). | ||
| # | ||
| # Local Docker / SLURM: bind-mount or shared filesystem provides scripts/Primus; run.sh prefers | ||
| # that checkout when present, else uses PRIMUS_ROOT from this image. | ||
| ARG BASE_DOCKER=docker.io/rocm/primus:v26.1 | ||
|
|
||
| FROM $BASE_DOCKER | ||
|
|
||
| USER root | ||
|
|
||
| ENV WORKSPACE_DIR=/workspace | ||
| ENV PRIMUS_ROOT=/workspace/Primus | ||
|
|
||
| RUN mkdir -p "$WORKSPACE_DIR" | ||
| WORKDIR $WORKSPACE_DIR | ||
|
|
||
| LABEL mad.launcher=primus | ||
|
|
||
| # rocm/primus base often has /workspace/Primus as a full git clone (.git is a directory). | ||
| # A submodule checkout uses .git as a file (gitlink). COPY cannot replace that tree — remove first. | ||
| RUN rm -rf /workspace/Primus | ||
|
|
||
| # Bake Primus from the build context (submodule). No git clone — matches CI and local builds. | ||
| COPY scripts/Primus/ /workspace/Primus/ | ||
|
|
||
| RUN test -f /workspace/Primus/examples/run_pretrain.sh | ||
|
|
||
| RUN pip3 list 2>/dev/null || true | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| #!/usr/bin/env python3 | ||
| """ | ||
| Extract Primus/Torchtitan performance metrics from training log and write | ||
| madengine multiple_results CSV (one row per metric). | ||
|
|
||
| Expected log line (last step): | ||
| step: 50 loss: ... tps: 1,444 tflops: 300.32 mfu: 23.10% | ||
|
|
||
| Output CSV format (model, performance, metric) — one row per metric: | ||
| model,performance,metric | ||
| primus_run,1444,tokens_per_second | ||
| primus_run,300.32,tflops | ||
| primus_run,23.10,model_flops_utilization | ||
| """ | ||
| import argparse | ||
| import csv | ||
| import re | ||
| import sys | ||
|
|
||
|
|
||
def extract_metrics(log_path: str) -> dict:
    """Parse a training log and return the latest tps, tflops and mfu values.

    Only lines containing all three labels ``tps:``, ``tflops:`` and ``mfu:``
    are inspected. Each metric found on such a line overwrites the value kept
    from earlier lines, so the result reflects the last matching line.

    Args:
        log_path: Path to the training log file.

    Returns:
        A dict with keys ``"tps"``, ``"tflops"`` and ``"mfu"``. Each value is
        the numeric text from the log with thousands separators removed, or
        ``None`` if that metric never matched. An empty dict is returned when
        the log cannot be opened or read.
    """
    # tflops accepts "," just like tps so that grouped values ("1,300.32")
    # are captured whole instead of being cut at the first separator.
    patterns = {
        "tps": re.compile(r"tps:\s*([0-9][0-9.,eE+-]*)"),
        "tflops": re.compile(r"tflops:\s*([0-9][0-9.,eE+-]*)"),
        "mfu": re.compile(r"mfu:\s*([0-9][0-9.]*)%?"),
    }
    metrics = {"tps": None, "tflops": None, "mfu": None}

    try:
        # errors="ignore" keeps parsing going past undecodable bytes in the log.
        with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
            for line in f:
                if not ("tps:" in line and "tflops:" in line and "mfu:" in line):
                    continue
                for key, pattern in patterns.items():
                    m = pattern.search(line)
                    if m:
                        # Drop thousands separators (and any trailing comma
                        # that was swept into the capture).
                        metrics[key] = m.group(1).replace(",", "").strip()
    except OSError as e:
        print(f"Error reading log {log_path}: {e}", file=sys.stderr)
        return {}

    return metrics
|
|
||
|
|
||
def main():
    """Command-line entry point: turn a training log into the metrics CSV.

    Takes the log path and the output CSV path as positional arguments, plus
    an optional ``--model-id`` used for the ``model`` column. Three rows are
    always written (tokens_per_second, tflops, model_flops_utilization); when
    no tps value was extracted, a warning goes to stderr and the
    ``performance`` cells are left empty.
    """
    cli = argparse.ArgumentParser(description="Extract Primus perf metrics to multiple_results CSV")
    cli.add_argument("log_path", help="Path to training log (e.g. output/log_mp_pretrain_*.txt)")
    cli.add_argument("output_csv", help="Path to output CSV (e.g. run_directory/primus_perf_output.csv)")
    cli.add_argument("--model-id", default="primus_run", help="Model id for the CSV rows")
    opts = cli.parse_args()

    found = extract_metrics(opts.log_path)
    if not found or found.get("tps") is None:
        print("Warning: No tps/tflops/mfu found in log; writing empty rows.", file=sys.stderr)
        found = {"tps": "", "tflops": "", "mfu": ""}

    # (key in the extracted metrics, name written to the "metric" column)
    labels = (
        ("tps", "tokens_per_second"),
        ("tflops", "tflops"),
        ("mfu", "model_flops_utilization"),
    )
    columns = ["model", "performance", "metric"]
    rows = [
        {"model": opts.model_id, "performance": found.get(key) or "", "metric": label}
        for key, label in labels
    ]

    with open(opts.output_csv, "w", newline="") as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)

    tps_value, tflops_value, mfu_value = (row["performance"] for row in rows)
    print(
        f"Wrote {opts.output_csv}: {len(rows)} rows (tokens_per_second={tps_value}, "
        f"tflops={tflops_value}, model_flops_utilization={mfu_value})"
    )
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| main() |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,56 @@ | ||||||
| """ | ||||||
| Discover Primus example configs as madengine models (optional). | ||||||
| Convention-based: globs examples/*/configs/**/*.yaml from the Primus submodule (scripts/Primus), | ||||||
| so all launchers (megatron, megatron_bridge, torchtitan, maxtext, moe_package, etc.) are | ||||||
| discovered. New launchers added under examples/<name>/configs/ are picked up automatically. | ||||||
| All discovered models use the same dockerfile and run.sh; args pass --config_path <relpath>. | ||||||
| For SLURM/K8s, supply distributed (launcher, nnodes, primus.config_path) via additional_context. | ||||||
| """ | ||||||
| import os | ||||||
| import glob | ||||||
|
|
||||||
| try: | ||||||
| from madengine.utils.discover_models import CustomModel # madengine v2 | ||||||
| except ImportError: | ||||||
| from madengine.tools.discover_models import CustomModel # madengine v1 | ||||||
|
|
||||||
| # This file lives in scripts/primus_pretrain; Primus submodule is scripts/Primus | ||||||
| THIS_DIR = os.path.dirname(os.path.abspath(__file__)) | ||||||
| PRIMUS_ROOT = os.path.normpath(os.path.join(THIS_DIR, "..", "Primus")) | ||||||
| # One glob for all launchers: examples/<launcher>/configs/**/*.yaml | ||||||
| CONFIGS_GLOB = os.path.join(PRIMUS_ROOT, "examples", "*", "configs", "**", "*.yaml") | ||||||
|
|
||||||
|
|
||||||
| def list_models(): | ||||||
| models = [] | ||||||
| if not os.path.isdir(PRIMUS_ROOT): | ||||||
| return models | ||||||
| for yaml_path in sorted(glob.glob(CONFIGS_GLOB)): | ||||||
|
||||||
| for yaml_path in sorted(glob.glob(CONFIGS_GLOB)): | |
| for yaml_path in sorted(glob.glob(CONFIGS_GLOB, recursive=True)): |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,88 @@ | ||||||||||||||||||
| #!/usr/bin/env bash | ||||||||||||||||||
| # Wrapper for Primus pretrain when run via madengine (local, SLURM, or K8s). | ||||||||||||||||||
| # Sets EXP from PRIMUS_CONFIG_PATH or --config_path, infers BACKEND from path, | ||||||||||||||||||
| # then runs Primus examples/run_pretrain.sh. For HF-backed configs set HF_TOKEN | ||||||||||||||||||
| # or MAD_SECRET_HFTOKEN (e.g. via additional_context.docker_env_vars in madengine v2). | ||||||||||||||||||
| # Primus root: set PRIMUS_ROOT to override; else auto-detect. | ||||||||||||||||||
| # After training, extracts tps/tflops/mfu from log and writes primus_perf_output.csv for madengine multiple_results. | ||||||||||||||||||
| set -e | ||||||||||||||||||
|
|
||||||||||||||||||
| # run_directory when invoked by madengine (cd run_directory && bash run.sh ...); used for output CSV | ||||||||||||||||||
| RUN_DIR="$(pwd)" | ||||||||||||||||||
|
|
||||||||||||||||||
| # Primus root resolution (local bind-mount, K8s ConfigMap extract, image ENV, legacy paths): | ||||||||||||||||||
| # 1) Repo submodule scripts/Primus (local Docker / SLURM with project layout) | ||||||||||||||||||
| # 2) /workspace/Primus — Dockerfile COPY and madengine K8s init (keys Primus/examples/…) | ||||||||||||||||||
| # 3) PRIMUS_ROOT from environment (image default) | ||||||||||||||||||
| # 4) Legacy /opt/primus images | ||||||||||||||||||
| script_dir="$(cd "$(dirname "$0")" && pwd)" | ||||||||||||||||||
| if [[ -f "$script_dir/../Primus/examples/run_pretrain.sh" ]]; then | ||||||||||||||||||
| export PRIMUS_ROOT="$(cd "$script_dir/../Primus" && pwd)" | ||||||||||||||||||
| elif [[ -f "/workspace/Primus/examples/run_pretrain.sh" ]]; then | ||||||||||||||||||
| export PRIMUS_ROOT="/workspace/Primus" | ||||||||||||||||||
| elif [[ -n "${PRIMUS_ROOT:-}" ]]; then | ||||||||||||||||||
| : | ||||||||||||||||||
| elif [[ -f "/opt/primus/examples/run_pretrain.sh" ]]; then | ||||||||||||||||||
| export PRIMUS_ROOT="/opt/primus" | ||||||||||||||||||
| elif [[ -f "/workspace/examples/run_pretrain.sh" ]]; then | ||||||||||||||||||
| export PRIMUS_ROOT="/workspace" | ||||||||||||||||||
| else | ||||||||||||||||||
| echo "ERROR: Could not find Primus run_pretrain.sh. Set PRIMUS_ROOT or use a repo with scripts/Primus submodule." >&2 | ||||||||||||||||||
| exit 1 | ||||||||||||||||||
| fi | ||||||||||||||||||
|
|
||||||||||||||||||
| # EXP (required by Primus run_pretrain.sh): prefer PRIMUS_CONFIG_PATH (SLURM/K8s), else --config_path in args | ||||||||||||||||||
| if [[ -n "${PRIMUS_CONFIG_PATH:-}" ]]; then | ||||||||||||||||||
| export EXP="$PRIMUS_CONFIG_PATH" | ||||||||||||||||||
| else | ||||||||||||||||||
| export EXP="examples/megatron/exp_pretrain.yaml" | ||||||||||||||||||
| args=("$@") | ||||||||||||||||||
| for i in "${!args[@]}"; do | ||||||||||||||||||
| if [[ "${args[i]}" == "--config_path" && -n "${args[i+1]:-}" ]]; then | ||||||||||||||||||
| export EXP="${args[i+1]}" | ||||||||||||||||||
| break | ||||||||||||||||||
| fi | ||||||||||||||||||
| done | ||||||||||||||||||
| fi | ||||||||||||||||||
|
|
||||||||||||||||||
| # Infer BACKEND from EXP path so run_pretrain.sh uses correct runner (torchtitan, megatron, maxtext, etc.) | ||||||||||||||||||
| # Primus expects BACKEND=MaxText for Jax/MaxText; lowercase for others. | ||||||||||||||||||
| exp_lower="$(echo "$EXP" | tr '[:upper:]' '[:lower:]')" | ||||||||||||||||||
| if [[ "$exp_lower" == *"/maxtext/"* ]]; then | ||||||||||||||||||
| export BACKEND="MaxText" | ||||||||||||||||||
| elif [[ "$exp_lower" == *"/torchtitan/"* ]]; then | ||||||||||||||||||
| export BACKEND="torchtitan" | ||||||||||||||||||
| elif [[ "$exp_lower" == *"/megatron_bridge/"* ]]; then | ||||||||||||||||||
| export BACKEND="megatron_bridge" | ||||||||||||||||||
| elif [[ "$exp_lower" == *"/moe_package/"* ]]; then | ||||||||||||||||||
| export BACKEND="moe_package" | ||||||||||||||||||
| else | ||||||||||||||||||
| export BACKEND="megatron" | ||||||||||||||||||
| fi | ||||||||||||||||||
|
|
||||||||||||||||||
| # HF_TOKEN for Primus prepare (HF-backed configs): use MAD_SECRET_HFTOKEN from madengine v2 | ||||||||||||||||||
| # (set via additional_context.docker_env_vars) if HF_TOKEN not already set | ||||||||||||||||||
| if [[ -n "${HF_TOKEN:-}" ]]; then | ||||||||||||||||||
| export HF_TOKEN | ||||||||||||||||||
| elif [[ -n "${MAD_SECRET_HFTOKEN:-}" ]]; then | ||||||||||||||||||
| export HF_TOKEN="$MAD_SECRET_HFTOKEN" | ||||||||||||||||||
| fi | ||||||||||||||||||
|
|
||||||||||||||||||
| # Redirect Primus output/outputs to run_directory (workspace root when run via madengine). | ||||||||||||||||||
| # No changes to Primus: we set env vars that run_pretrain.sh already honors (TRAIN_LOG, DUMP_HLO_DIR) | ||||||||||||||||||
| # and pass --job.dump_folder so Torchtitan writes checkpoints here. output/ = logs; outputs/ = checkpoints. | ||||||||||||||||||
| mkdir -p "$RUN_DIR/output" "$RUN_DIR/outputs" | ||||||||||||||||||
| export TRAIN_LOG="$RUN_DIR/output/log_mp_pretrain_$(basename "$EXP" .yaml).txt" | ||||||||||||||||||
| export DUMP_HLO_DIR="${DUMP_HLO_DIR:-$RUN_DIR/output/xla_dump_hlo}" | ||||||||||||||||||
|
|
||||||||||||||||||
| # Run from PRIMUS_ROOT so EXP path (e.g. examples/torchtitan/configs/...) resolves correctly. | ||||||||||||||||||
| # Do not use exec so we can run the perf extractor after training for madengine multiple_results. | ||||||||||||||||||
| # Pass --job.dump_folder so Torchtitan writes checkpoints to RUN_DIR/outputs (not scripts/Primus/outputs). | ||||||||||||||||||
| cd "$PRIMUS_ROOT" && bash "$PRIMUS_ROOT/examples/run_pretrain.sh" "$@" --job.dump_folder "$RUN_DIR/outputs" | ||||||||||||||||||
| exitcode=$? | ||||||||||||||||||
|
Comment on lines
+80
to
+82
|
||||||||||||||||||
| # Pass --job.dump_folder so Torchtitan writes checkpoints to RUN_DIR/outputs (not scripts/Primus/outputs). | |
| cd "$PRIMUS_ROOT" && bash "$PRIMUS_ROOT/examples/run_pretrain.sh" "$@" --job.dump_folder "$RUN_DIR/outputs" | |
| exitcode=$? | |
| # Pass --job.dump_folder so Torchtitan writes checkpoints to RUN_DIR/outputs (not scripts/Primus/outputs). | |
| set +e | |
| cd "$PRIMUS_ROOT" && bash "$PRIMUS_ROOT/examples/run_pretrain.sh" "$@" --job.dump_folder "$RUN_DIR/outputs" | |
| exitcode=$? | |
| set -e |
Copilot
AI
Apr 15, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This comment describes the output as a single row with extra columns ("tflops, model_flops_utilization"), but extract_primus_perf.py writes a 3-column CSV with one row per metric (model,performance,metric). Please update the comment to match the actual CSV schema so consumers (madengine multiple_results) aren’t misled.
| # Extract tps/tflops/mfu from training log into primus_perf_output.csv (one row: model, performance, metric, tflops, model_flops_utilization) | |
| # Extract tps/tflops/mfu from training log into primus_perf_output.csv (CSV rows: model, performance, metric; one row per metric) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`COPY scripts/Primus/ /workspace/Primus/` requires the repo root as the Docker build context, but the in-repo runner (`tools/run_models.py`) builds images with context `./docker` (`docker_context = "./docker"`). With that context, `scripts/Primus` is outside the build context and the build will fail. Either update the build tooling to use context `.` for this model, or move/bundle the Primus sources under `docker/` so they’re within the build context.