4 changes: 4 additions & 0 deletions .gitignore
@@ -69,6 +69,10 @@ venv.bak/
# Results
*.csv
*.json
!models.json
!perf_super.json
!perf_entry_super.json
!perf_entry.json
*.log
*.out
*.html
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "scripts/Primus"]
path = scripts/Primus
url = https://github.com/AMD-AGI/Primus
23 changes: 21 additions & 2 deletions README.md
@@ -24,11 +24,13 @@ Below are blueprints of supported models along with their documentation.
| [**vLLM disaggregated P/D inference**](scripts/vllm_dissag/README.MD) | Distributed Inference P/D disaggregation with vLLM | DeepSeek-V3, Llama-3.3-70B-Instruct-FP8-KV, Llama-3.1-405B-Instruct-FP8-KV, gpt-oss-120b |
| [**SGLang disaggregated P/D inference**](scripts/sglang_disagg/README.MD) | Distributed Inference P/D disaggregation with SGLang | Qwen3-32B, Llama-3.1-8B-Instruct, Llama-3.3-70B-Instruct-FP8-KV, Llama-3.1-405B-Instruct-FP8-KV, DeepSeek-V3, Mixtral-8x7B-v0.1 |
| [**KVCache Transfer Bench**](scripts/kvcache_transfer_bench/README.md) | Inter-node Transfer Benchmark | no specific models |
| [**Primus pretrain**](#primus-pretrain) | LLM pretraining through the [Primus](https://github.com/AMD-AGI/Primus) launcher (Megatron, TorchTitan, MaxText, and other backends) | Config-driven; see `scripts/Primus/examples/` |

## Table of Contents

- [Prerequisites](#prerequisites)
- [Quick Start](#quick-start)
- [Primus pretrain](#primus-pretrain)
- [Usage Guide](#usage-guide)
- [Running Models](#running-models)
- [Tag Functionality](#tag-functionality)
@@ -50,11 +52,15 @@ Below are blueprints of supported models along with their documentation.

## Quick Start

1. **Clone the repository**:
1. **Clone the repository** (include the Primus submodule if you use [Primus pretrain](#primus-pretrain)):
```bash
git clone <repository-url>
git clone --recurse-submodules <repository-url>
cd MAD
```
If you already cloned without submodules, initialize Primus with:
```bash
git submodule update --init scripts/Primus
```

2. **Install dependencies**:
```bash
@@ -66,7 +72,20 @@ Below are blueprints of supported models along with their documentation.
madengine run --tags pyt_huggingface_bert
```

## Primus pretrain

MAD integrates [AMD-AGI/Primus](https://github.com/AMD-AGI/Primus) as a Git submodule at **`scripts/Primus`**. The **`primus_pretrain`** entry in `models.json` uses **`docker/primus.ubuntu.amd.Dockerfile`** and **`scripts/primus_pretrain/`**: its `run.sh` wraps Primus `examples/run_pretrain.sh`, copies logs under the madengine run directory, and writes **`primus_perf_output.csv`** with throughput, TFLOPs, and MFU whenever the training log includes those metrics.

- **Run with madengine** (tags include `primus`, `training`, `pretrain`):
```bash
madengine run --tags primus_pretrain
```
- **Choose a config**: pass Primus YAML via script args, e.g. `--config_path examples/torchtitan/configs/MI300X/your_config.yaml` (path is relative to the Primus repo root). For SLURM or Kubernetes, you can set **`PRIMUS_CONFIG_PATH`** to the same path instead.
- **Hugging Face–backed configs**: set **`HF_TOKEN`** or **`MAD_SECRET_HFTOKEN`** (madengine v2 can inject the latter via `additional_context.docker_env_vars`).
- **Docker build**: build from the **repository root** so `COPY scripts/Primus/` in `docker/primus.ubuntu.amd.Dockerfile` resolves; `madengine build` uses repo context for Dockerfiles whose path contains `primus`.
- **Optional discovery**: `scripts/primus_pretrain/get_models_json.py` can expose individual Primus example YAMLs as separate models when used with madengine’s discover-models flow.

For more detail, see comments in `docker/primus.ubuntu.amd.Dockerfile` and `scripts/primus_pretrain/run.sh`.
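The metric extraction that `run.sh` delegates to can be sketched in a few lines. This is a minimal illustration, not the full script: the regexes are copied from `scripts/primus_pretrain/extract_primus_perf.py` in this PR, and the sample line comes from that file's docstring.

```python
import re

# Pull tps/tflops/mfu from a Torchtitan/Primus-style step line, as
# extract_primus_perf.py does for the last such line in the training log.
line = "step: 50 loss: 2.31 tps: 1,444 tflops: 300.32 mfu: 23.10%"

tps = re.search(r"tps:\s*([0-9][0-9.,eE+-]*)", line).group(1).replace(",", "")
tflops = re.search(r"tflops:\s*([0-9][0-9.eE+-]*)", line).group(1)
mfu = re.search(r"mfu:\s*([0-9][0-9.]*)%?", line).group(1)
print(tps, tflops, mfu)  # 1444 300.32 23.10
```

Each value then becomes one `model,performance,metric` row in `primus_perf_output.csv`.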

## Usage Guide

42 changes: 42 additions & 0 deletions docker/primus.ubuntu.amd.Dockerfile
@@ -0,0 +1,42 @@
# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'}
# Primus launcher for MAD: one image for all Primus pretrain configs (torchtitan, megatron, MaxText, …).
#
# Build context must be the repo root (so COPY scripts/Primus works). Manual:
# docker build -f docker/primus.ubuntu.amd.Dockerfile .
# `madengine build` uses context `.` for models whose dockerfile path contains "primus"
# (see DockerBuilder.get_context_path in madengine).
#
# PRIMUS_ROOT is /workspace/Primus (Primus repo root: examples/run_pretrain.sh, examples/<backend>/…).
# WORKSPACE_DIR is the generic working directory /workspace; madengine places manifests and
# run_directory there. Do not set PRIMUS_ROOT=/workspace — that would collide with those files.
#
# Kubernetes: the Job mounts an emptyDir on /workspace, so image layers under /workspace are not
# visible in the pod. Madengine bundles `scripts/Primus/examples/...` into the ConfigMap as
# `Primus/examples/...` so the init container recreates /workspace/Primus (see madengine k8s).
#
# Local Docker / SLURM: bind-mount or shared filesystem provides scripts/Primus; run.sh prefers
# that checkout when present, else uses PRIMUS_ROOT from this image.
ARG BASE_DOCKER=docker.io/rocm/primus:v26.1

FROM $BASE_DOCKER

USER root

ENV WORKSPACE_DIR=/workspace
ENV PRIMUS_ROOT=/workspace/Primus

RUN mkdir -p "$WORKSPACE_DIR"
WORKDIR $WORKSPACE_DIR

LABEL mad.launcher=primus

# rocm/primus base often has /workspace/Primus as a full git clone (.git is a directory).
# A submodule checkout uses .git as a file (gitlink). COPY cannot replace that tree — remove first.
RUN rm -rf /workspace/Primus

# Bake Primus from the build context (submodule). No git clone — matches CI and local builds.
COPY scripts/Primus/ /workspace/Primus/

Comment on lines +33 to +39 (Copilot AI, Apr 15, 2026):

`COPY scripts/Primus/ /workspace/Primus/` requires the repo root as the Docker build context, but the in-repo runner (tools/run_models.py) builds images with context `./docker` (`docker_context = "./docker"`). With that context, `scripts/Primus` is outside the build context and the build will fail. Either update the build tooling to use context `.` for this model, or move/bundle the Primus sources under `docker/` so they're within the build context.

Suggested change:
- # rocm/primus base often has /workspace/Primus as a full git clone (.git is a directory).
- # A submodule checkout uses .git as a file (gitlink). COPY cannot replace that tree — remove first.
- RUN rm -rf /workspace/Primus
- # Bake Primus from the build context (submodule). No git clone — matches CI and local builds.
- COPY scripts/Primus/ /workspace/Primus/
+ # The in-repo runner builds Dockerfiles from the ./docker context, so sources under
+ # scripts/Primus are not available to COPY here. Keep the Primus checkout provided by
+ # the base image and validate that the expected launcher entrypoint exists.
RUN test -f /workspace/Primus/examples/run_pretrain.sh

RUN pip3 list 2>/dev/null || true
10 changes: 10 additions & 0 deletions models.json
@@ -1052,6 +1052,16 @@
"args":
"--model_repo pyt_train_qwen3-32b"
},
{
"name": "primus_pretrain",
"dockerfile": "docker/primus",
"scripts": "scripts/primus_pretrain",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "bf16",
"tags": ["training", "primus", "megatron", "pretrain"],
"args": ""
},
{
"name": "primus_pyt_train_llama-3.1-8b",
"url": "",
1 change: 1 addition & 0 deletions scripts/Primus
Submodule Primus added at e50a78
80 changes: 80 additions & 0 deletions scripts/primus_pretrain/extract_primus_perf.py
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""
Extract Primus/Torchtitan performance metrics from training log and write
madengine multiple_results CSV (one row per metric).

Expected log line (last step):
step: 50 loss: ... tps: 1,444 tflops: 300.32 mfu: 23.10%

Output CSV format (model, performance, metric) — one row per metric:
model,performance,metric
primus_run,1444,tokens_per_second
primus_run,300.32,tflops
primus_run,23.10,model_flops_utilization
"""
import argparse
import csv
import re
import sys


def extract_metrics(log_path: str) -> dict:
"""Parse log file and return tps, tflops, mfu from the last step line."""
tps = tflops = mfu = None
# Match lines containing step, tps, tflops, mfu (e.g. Torchtitan/Primus format)
tps_re = re.compile(r"tps:\s*([0-9][0-9.,eE+-]*)")
tflops_re = re.compile(r"tflops:\s*([0-9][0-9.eE+-]*)")
mfu_re = re.compile(r"mfu:\s*([0-9][0-9.]*)%?")

try:
with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
for line in f:
if "tps:" in line and "tflops:" in line and "mfu:" in line:
m = tps_re.search(line)
if m:
tps = m.group(1).replace(",", "").strip()
m = tflops_re.search(line)
if m:
tflops = m.group(1).strip()
m = mfu_re.search(line)
if m:
mfu = m.group(1).strip()
except OSError as e:
print(f"Error reading log {log_path}: {e}", file=sys.stderr)
return {}

return {"tps": tps, "tflops": tflops, "mfu": mfu}


def main():
parser = argparse.ArgumentParser(description="Extract Primus perf metrics to multiple_results CSV")
parser.add_argument("log_path", help="Path to training log (e.g. output/log_mp_pretrain_*.txt)")
parser.add_argument("output_csv", help="Path to output CSV (e.g. run_directory/primus_perf_output.csv)")
parser.add_argument("--model-id", default="primus_run", help="Model id for the CSV rows")
args = parser.parse_args()

metrics = extract_metrics(args.log_path)
if not metrics or metrics.get("tps") is None:
print("Warning: No tps/tflops/mfu found in log; writing empty rows.", file=sys.stderr)
metrics = {"tps": "", "tflops": "", "mfu": ""}

# One row per metric: model, performance, metric
rows = [
{"model": args.model_id, "performance": metrics.get("tps") or "", "metric": "tokens_per_second"},
{"model": args.model_id, "performance": metrics.get("tflops") or "", "metric": "tflops"},
{"model": args.model_id, "performance": metrics.get("mfu") or "", "metric": "model_flops_utilization"},
]

with open(args.output_csv, "w", newline="") as f:
writer = csv.DictWriter(f, fieldnames=["model", "performance", "metric"])
writer.writeheader()
writer.writerows(rows)

print(
f"Wrote {args.output_csv}: {len(rows)} rows (tokens_per_second={rows[0]['performance']}, "
f"tflops={rows[1]['performance']}, model_flops_utilization={rows[2]['performance']})"
)


if __name__ == "__main__":
main()
56 changes: 56 additions & 0 deletions scripts/primus_pretrain/get_models_json.py
@@ -0,0 +1,56 @@
"""
Discover Primus example configs as madengine models (optional).
Convention-based: globs examples/*/configs/**/*.yaml from the Primus submodule (scripts/Primus),
so all launchers (megatron, megatron_bridge, torchtitan, maxtext, moe_package, etc.) are
discovered. New launchers added under examples/<name>/configs/ are picked up automatically.
All discovered models use the same dockerfile and run.sh; args pass --config_path <relpath>.
For SLURM/K8s, supply distributed (launcher, nnodes, primus.config_path) via additional_context.
"""
import os
import glob

try:
from madengine.utils.discover_models import CustomModel # madengine v2
except ImportError:
from madengine.tools.discover_models import CustomModel # madengine v1

# This file lives in scripts/primus_pretrain; Primus submodule is scripts/Primus
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
PRIMUS_ROOT = os.path.normpath(os.path.join(THIS_DIR, "..", "Primus"))
# One glob for all launchers: examples/<launcher>/configs/**/*.yaml
CONFIGS_GLOB = os.path.join(PRIMUS_ROOT, "examples", "*", "configs", "**", "*.yaml")


def list_models():
models = []
if not os.path.isdir(PRIMUS_ROOT):
return models
for yaml_path in sorted(glob.glob(CONFIGS_GLOB)):
Comment (Copilot AI, Apr 15, 2026):

CONFIGS_GLOB uses `**`, but `glob.glob()` only treats `**` as "match any directories" when called with `recursive=True`. As written, this will typically discover zero YAMLs, so `list_models()` returns an empty list even when configs exist. Use `glob.glob(CONFIGS_GLOB, recursive=True)` (or avoid `**`) to make discovery work as intended.

Suggested change:
- for yaml_path in sorted(glob.glob(CONFIGS_GLOB)):
+ for yaml_path in sorted(glob.glob(CONFIGS_GLOB, recursive=True)):
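The behavior the review flags is easy to reproduce. Note the nuance: without `recursive=True`, CPython's glob degrades `**` to `*` (exactly one path component), so YAMLs exactly one directory below `configs/` are still found, while YAMLs directly under `configs/` or nested deeper are silently missed. A self-contained sketch (the directory names are illustrative only):

```python
import glob
import os
import tempfile

# Build a throwaway tree mimicking examples/<launcher>/configs/... and
# compare non-recursive vs recursive glob on the same '**' pattern.
with tempfile.TemporaryDirectory() as root:
    base = os.path.join(root, "examples", "torchtitan", "configs")
    os.makedirs(os.path.join(base, "MI300X", "deep"))
    for rel in ("top.yaml",                                   # zero dirs below configs/
                os.path.join("MI300X", "one.yaml"),           # one dir below
                os.path.join("MI300X", "deep", "two.yaml")):  # two dirs below
        open(os.path.join(base, rel), "w").close()

    pattern = os.path.join(root, "examples", "*", "configs", "**", "*.yaml")
    print(len(glob.glob(pattern)))                  # 1 ('**' acts like '*': only one.yaml)
    print(len(glob.glob(pattern, recursive=True)))  # 3 ('**' spans zero or more dirs)
```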
rel_path = os.path.relpath(yaml_path, PRIMUS_ROOT)
# Path shape: examples/<launcher>/configs/<arch>/<file>.yaml
parts = rel_path.split(os.sep)
if len(parts) < 5:
continue
launcher = parts[1] # megatron, torchtitan, megatron_bridge, etc.
arch = parts[3] # MI300X, MI355X, etc.
short_name = os.path.splitext(os.path.basename(yaml_path))[0]
# discover_models prefixes with dirname (primus_pretrain/), so no prefix here
name = f"{launcher}_{arch}_{short_name}"
tags = ["primus", launcher, arch, short_name]
models.append(
CustomModel(
name=name,
dockerfile="../../docker/primus",
scripts="run.sh",
data="",
n_gpus="8",
owner="mad.support@amd.com",
timeout=86400,
training_precision="bf16",
tags=tags,
args=f"--config_path {rel_path}",
multiple_results="primus_perf_output.csv",
)
)
return models
88 changes: 88 additions & 0 deletions scripts/primus_pretrain/run.sh
@@ -0,0 +1,88 @@
#!/usr/bin/env bash
# Wrapper for Primus pretrain when run via madengine (local, SLURM, or K8s).
# Sets EXP from PRIMUS_CONFIG_PATH or --config_path, infers BACKEND from path,
# then runs Primus examples/run_pretrain.sh. For HF-backed configs set HF_TOKEN
# or MAD_SECRET_HFTOKEN (e.g. via additional_context.docker_env_vars in madengine v2).
# Primus root: set PRIMUS_ROOT to override; else auto-detect.
# After training, extracts tps/tflops/mfu from log and writes primus_perf_output.csv for madengine multiple_results.
set -e

# run_directory when invoked by madengine (cd run_directory && bash run.sh ...); used for output CSV
RUN_DIR="$(pwd)"

# Primus root resolution (local bind-mount, K8s ConfigMap extract, image ENV, legacy paths):
# 1) Repo submodule scripts/Primus (local Docker / SLURM with project layout)
# 2) /workspace/Primus — Dockerfile COPY and madengine K8s init (keys Primus/examples/…)
# 3) PRIMUS_ROOT from environment (image default)
# 4) Legacy /opt/primus images
script_dir="$(cd "$(dirname "$0")" && pwd)"
if [[ -f "$script_dir/../Primus/examples/run_pretrain.sh" ]]; then
export PRIMUS_ROOT="$(cd "$script_dir/../Primus" && pwd)"
elif [[ -f "/workspace/Primus/examples/run_pretrain.sh" ]]; then
export PRIMUS_ROOT="/workspace/Primus"
elif [[ -n "${PRIMUS_ROOT:-}" ]]; then
:
elif [[ -f "/opt/primus/examples/run_pretrain.sh" ]]; then
export PRIMUS_ROOT="/opt/primus"
elif [[ -f "/workspace/examples/run_pretrain.sh" ]]; then
export PRIMUS_ROOT="/workspace"
else
echo "ERROR: Could not find Primus run_pretrain.sh. Set PRIMUS_ROOT or use a repo with scripts/Primus submodule." >&2
exit 1
fi

# EXP (required by Primus run_pretrain.sh): prefer PRIMUS_CONFIG_PATH (SLURM/K8s), else --config_path in args
if [[ -n "${PRIMUS_CONFIG_PATH:-}" ]]; then
export EXP="$PRIMUS_CONFIG_PATH"
else
export EXP="examples/megatron/exp_pretrain.yaml"
args=("$@")
for i in "${!args[@]}"; do
if [[ "${args[i]}" == "--config_path" && -n "${args[i+1]:-}" ]]; then
export EXP="${args[i+1]}"
break
fi
done
fi

# Infer BACKEND from EXP path so run_pretrain.sh uses correct runner (torchtitan, megatron, maxtext, etc.)
# Primus expects BACKEND=MaxText for Jax/MaxText; lowercase for others.
exp_lower="$(echo "$EXP" | tr '[:upper:]' '[:lower:]')"
if [[ "$exp_lower" == *"/maxtext/"* ]]; then
export BACKEND="MaxText"
elif [[ "$exp_lower" == *"/torchtitan/"* ]]; then
export BACKEND="torchtitan"
elif [[ "$exp_lower" == *"/megatron_bridge/"* ]]; then
export BACKEND="megatron_bridge"
elif [[ "$exp_lower" == *"/moe_package/"* ]]; then
export BACKEND="moe_package"
else
export BACKEND="megatron"
fi

# HF_TOKEN for Primus prepare (HF-backed configs): use MAD_SECRET_HFTOKEN from madengine v2
# (set via additional_context.docker_env_vars) if HF_TOKEN not already set
if [[ -n "${HF_TOKEN:-}" ]]; then
export HF_TOKEN
elif [[ -n "${MAD_SECRET_HFTOKEN:-}" ]]; then
export HF_TOKEN="$MAD_SECRET_HFTOKEN"
fi

# Redirect Primus output/outputs to run_directory (workspace root when run via madengine).
# No changes to Primus: we set env vars that run_pretrain.sh already honors (TRAIN_LOG, DUMP_HLO_DIR)
# and pass --job.dump_folder so Torchtitan writes checkpoints here. output/ = logs; outputs/ = checkpoints.
mkdir -p "$RUN_DIR/output" "$RUN_DIR/outputs"
export TRAIN_LOG="$RUN_DIR/output/log_mp_pretrain_$(basename "$EXP" .yaml).txt"
export DUMP_HLO_DIR="${DUMP_HLO_DIR:-$RUN_DIR/output/xla_dump_hlo}"

# Run from PRIMUS_ROOT so EXP path (e.g. examples/torchtitan/configs/...) resolves correctly.
# Do not use exec so we can run the perf extractor after training for madengine multiple_results.
# Pass --job.dump_folder so Torchtitan writes checkpoints to RUN_DIR/outputs (not scripts/Primus/outputs).
cd "$PRIMUS_ROOT" && bash "$PRIMUS_ROOT/examples/run_pretrain.sh" "$@" --job.dump_folder "$RUN_DIR/outputs"
exitcode=$?
Comment on lines +80 to +82 (Copilot AI, Apr 15, 2026):

Because the script uses `set -e`, if examples/run_pretrain.sh exits non-zero the wrapper will terminate immediately and never reach `exitcode=$?` or the perf extraction. If you want to always capture the exit code and run the extractor, temporarily disable `-e` around the training invocation (or otherwise guard the command) and then re-enable it as needed.

Suggested change:
  # Pass --job.dump_folder so Torchtitan writes checkpoints to RUN_DIR/outputs (not scripts/Primus/outputs).
+ set +e
  cd "$PRIMUS_ROOT" && bash "$PRIMUS_ROOT/examples/run_pretrain.sh" "$@" --job.dump_folder "$RUN_DIR/outputs"
  exitcode=$?
+ set -e
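The `set -e` pitfall described above can be verified directly; this sketch drives two tiny bash scripts from Python (`false` stands in for a failing `run_pretrain.sh`):

```python
import subprocess

# Without the guard, 'set -e' aborts the script at the failing command,
# so 'ec=$?' and anything after it (e.g. the perf extractor) never run.
unguarded = "set -e; false; ec=$?; echo ec=$ec"
# Suspending '-e' around the command captures the exit status as intended.
guarded = "set -e; set +e; false; ec=$?; set -e; echo ec=$ec"

r1 = subprocess.run(["bash", "-c", unguarded], capture_output=True, text=True)
r2 = subprocess.run(["bash", "-c", guarded], capture_output=True, text=True)
print(repr(r1.stdout))    # '' (script exited at 'false'; echo never ran)
print(r2.stdout.strip())  # ec=1 (status captured; extraction would still run)
```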
# Extract tps/tflops/mfu from training log into primus_perf_output.csv (one row: model, performance, metric, tflops, model_flops_utilization)
Comment (Copilot AI, Apr 15, 2026):

This comment describes the output as a single row with extra columns ("tflops, model_flops_utilization"), but extract_primus_perf.py writes a 3-column CSV with one row per metric (model,performance,metric). Please update the comment to match the actual CSV schema so consumers (madengine multiple_results) aren't misled.

Suggested change:
- # Extract tps/tflops/mfu from training log into primus_perf_output.csv (one row: model, performance, metric, tflops, model_flops_utilization)
+ # Extract tps/tflops/mfu from training log into primus_perf_output.csv (CSV rows: model, performance, metric; one row per metric)
LOG_PATH="$RUN_DIR/output/log_mp_pretrain_$(basename "$EXP" .yaml).txt"
if [[ -f "$LOG_PATH" ]]; then
python3 "$RUN_DIR/extract_primus_perf.py" "$LOG_PATH" "$RUN_DIR/primus_perf_output.csv" || true
fi
exit "$exitcode"