mlcommons · karverma-amd · Jun 2, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 8, 2026
@@ -0,0 +1,50 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Shared Docker helpers for DeepSeek-V4-Pro example scripts.
+
# Size, in GB, used for the optional container --storage-opt (see
# docker_storage_args). Logs live in a writable host directory that is mounted
# into containers at /workspace; accuracy and server logs can grow large, so the
# default leaves headroom. A non-empty value already present in the environment
# is kept.
: "${DOCKER_LOG_STORAGE_GB:=16}"
+
# Resolve, export and create the host log directory.
# Globals:
#   LOG_DIR       - kept as-is when already non-empty; otherwise set to
#                   <ENDPOINTS_DIR>/results/docker_logs/<subdir>. Always exported.
#   ENDPOINTS_DIR - base directory (read only); "." when unset or empty.
# Args:
#   $1 - subdirectory name under results/docker_logs (default: misc). Ignored
#        when LOG_DIR is already set.
# Returns:
#   0 when the directory exists or was created, otherwise mkdir's status.
ensure_docker_log_dir() {
  local subdir="${1:-misc}"
  LOG_DIR="${LOG_DIR:-${ENDPOINTS_DIR:-.}/results/docker_logs/${subdir}}"
  export LOG_DIR
  # mkdir runs last so that its status is the function's status. With `export`
  # as the final command the function always returned 0, which hid an unusable
  # log directory from callers that do not run under `set -e`.
  mkdir -p -- "${LOG_DIR}"
}
+
# Print extra `docker run` arguments for a larger container writable layer.
# Opt-in only: nothing is printed unless DOCKER_USE_LOG_STORAGE_OPT is exactly
# "true". Logs are written to the mounted host LOG_DIR, and most hosts use
# overlay2 without xfs pquota and reject --storage-opt.
# Output: "--storage-opt size=<DOCKER_LOG_STORAGE_GB>G" on one line, or nothing.
docker_storage_args() {
  [[ "${DOCKER_USE_LOG_STORAGE_OPT:-false}" == "true" ]] || return 0
  printf '%s %s\n' --storage-opt "size=${DOCKER_LOG_STORAGE_GB}G"
}
+
# Wait for an OpenAI-compatible or SGLang HTTP server (example script preflight).
# Each round tries GET /health (SGLang native, vLLM), then GET /v1/models
# (OpenAI compatibility); the first endpoint that answers successfully wins.
# Args:
#   $1 - base URL (required); one trailing "/" is stripped.
#   $2 - max wait in seconds (default 0). A value <= 0 means a single round of
#        probes with no retry; otherwise rounds repeat every 2 seconds until
#        the elapsed time reaches the limit.
# Outputs:
#   "Inference server ready (<url>)" on stdout on success.
# Returns:
#   0 when an endpoint responded, 1 on timeout or when the base URL is missing.
wait_openai_compatible_server() {
  local base="${1:-}"
  local max_wait="${2:-0}"
  # `path` must be local: this file is sourced, so an undeclared loop variable
  # would overwrite a global `path` in the calling script.
  local path start
  base="${base%/}"
  if [[ -z "${base}" ]]; then
    echo "wait_openai_compatible_server: base_url is required" >&2
    return 1
  fi
  start=$(date +%s)
  while true; do
    for path in /health /v1/models; do
      if curl --output /dev/null --silent --fail --max-time 5 "${base}${path}"; then
        echo "Inference server ready (${base}${path})"
        return 0
      fi
    done
    # No wait budget: one round of probes only.
    if (( max_wait <= 0 )); then
      return 1
    fi
    if (( $(date +%s) - start >= max_wait )); then
      return 1
    fi
    sleep 2
  done
}
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Start run_sglang_accuracy_benchmark.sh under nohup with a stable log path.
# (Avoids `VAR=... && nohup ... &` where bash backgrounds the whole `&&` chain so
# follow-up commands in the parent shell lose VAR.)
#
# Environment:
#   LOG_DIR / ENDPOINTS_DIR - log directory selection, see ensure_docker_log_dir
#   WAIT_FOR_SGLANG_S       - exported for the benchmark script (default 120)
#   PYTHONUNBUFFERED        - exported for the benchmark script (default 1)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# shellcheck source=docker_common.sh
source "${SCRIPT_DIR}/docker_common.sh"

BENCH_SCRIPT="${SCRIPT_DIR}/run_sglang_accuracy_benchmark.sh"
# The launch below happens in a background job, so a missing or non-executable
# script would only be visible inside the wrapper log while this script still
# printed "Started ...". Fail up front instead.
if [[ ! -x "${BENCH_SCRIPT}" ]]; then
  echo "ERROR: benchmark script not found or not executable: ${BENCH_SCRIPT}" >&2
  exit 1
fi

ensure_docker_log_dir "accuracy"
LOGF="${LOG_DIR}/nohup_accuracy_$(date +%Y%m%d_%H%M%S).log"
export WAIT_FOR_SGLANG_S="${WAIT_FOR_SGLANG_S:-120}"
export PYTHONUNBUFFERED="${PYTHONUNBUFFERED:-1}"

nohup "${BENCH_SCRIPT}" >"${LOGF}" 2>&1 &
echo "Started accuracy benchmark PID=$!"
echo "Wrapper log: ${LOGF}"
echo "Tee log (inside run): ${LOG_DIR}/accuracy_from_config.log"
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+# Monitor the running SGLang DeepSeek-V4-Pro accuracy benchmark.
+# Emits sentinel lines (ALERT_HANG / RUN_FAILED / RUN_FINISHED_OK) so an external
+# watcher can be notified, and logs periodic status to a monitor log.
# Unset variables are errors and a pipeline reports any failing stage; errexit
# (-e) is not enabled.
set -u -o pipefail

# Settings, each overridable from the environment.
# Pattern passed to `pgrep -f` to find the benchmark process.
: "${PROC_PAT:=from-config.*sglang_deepseek_v4_pro_accuracy}"
# Required: path of the nohup wrapper log; the script aborts when it is unset.
: "${WRAPPER_LOG:?set WRAPPER_LOG to the nohup wrapper log path}"
# Report directory whose results.json "Saved:" line marks a successful run.
: "${REPORT_DIR:=results/sglang_deepseek_v4_pro_accuracy}"
# Seconds to sleep between checks.
: "${INTERVAL_S:=300}"
# Consecutive idle (no new completes + GPU idle) checks before declaring a hang.
: "${HANG_STRIKES:=3}"
# GPU utilisation (percent) below which the GPUs count as idle.
: "${GPU_IDLE_PCT:=5}"
+
# Print the current local time as "YYYY-MM-DD HH:MM:SS" (log line prefix).
ts() {
  date +'%Y-%m-%d %H:%M:%S'
}
+
# Print the path of the running event logger's events.jsonl, derived from the
# --log-dir argument on its command line (as listed by `pgrep -af`).
# Both "--log-dir PATH" and "--log-dir=PATH" are recognised; the first match
# wins. Paths containing spaces are not supported.
# Returns: 0 and prints "<log-dir>/events.jsonl"; 1 with no output when no
# event logger process with a --log-dir argument is found.
live_events_file() {
  local match logdir
  match=$(pgrep -af "services.event_logger" \
    | grep -oE -- "--log-dir[= ][^ ]+" | head -n 1)
  # Strip the flag name and its one-character separator (space or "=").
  logdir="${match#--log-dir?}"
  [[ -n "${logdir}" ]] || return 1
  printf '%s\n' "${logdir}/events.jsonl"
}
+
# Print the highest "GPU use (%)" value found in `rocm-smi --showuse` output.
# Prints nothing when rocm-smi fails or its output has no such line; errors
# from rocm-smi are discarded.
gpu_busy_pct() {
  rocm-smi --showuse 2>/dev/null |
    grep -o -E 'GPU use \(%\): [0-9]+' |
    grep -o -E '[0-9]+$' |
    sort -n -r |
    head -n 1
}
+
# Print the number of lines in an events file that record a completed sample,
# i.e. contain "event_type":"sample.complete" (whitespace after the colon is
# tolerated so both compact and spaced JSON serialisations are counted).
# Args:
#   $1 - path to the events file; a missing argument or file counts as 0.
# Outputs: the count on stdout (always a number).
completed_count() {
  local events_file="${1:-}" count
  if [[ ! -f "${events_file}" ]]; then
    echo 0
    return 0
  fi
  # grep -c always prints a count (0 when no match) but exits non-zero on 0
  # matches, so the output is captured instead of using an `|| echo 0` fallback
  # that would print a second number. Only an empty capture falls back to 0.
  count=$(grep -cE -- '"event_type":[[:space:]]*"sample\.complete"' \
    "${events_file}" 2>/dev/null)
  echo "${count:-0}"
}
+
# Print a header plus the last $1 lines of the wrapper log, with carriage
# returns turned into newlines so \r-overwritten output shows as separate lines.
print_wrapper_tail() {
  local lines="$1"
  echo "---- wrapper tail ----"
  # 2>/dev/null comes before the input redirection: redirections are applied
  # left to right, so this also silences the error for a missing log file.
  tr '\r' '\n' 2>/dev/null < "${WRAPPER_LOG}" | tail -n "${lines}"
}

# Completed-sample count seen on the previous check; -1 so the first check
# never counts as "unchanged".
prev_completed=-1
# Consecutive checks with no new completions and idle GPUs.
strikes=0

echo "[$(ts)] monitor start: pat='${PROC_PAT}' interval=${INTERVAL_S}s wrapper=${WRAPPER_LOG}"

while true; do
  if ! pgrep -f "${PROC_PAT}" >/dev/null 2>&1; then
    # Process gone — classify success vs failure from the wrapper log.
    # -F: match the markers literally; REPORT_DIR is a path, not a regex.
    if grep -qF -- "Score for livecodebench" "${WRAPPER_LOG}" 2>/dev/null \
       || grep -qF -- "Saved: ${REPORT_DIR}/results.json" "${WRAPPER_LOG}" 2>/dev/null; then
      echo "[$(ts)] RUN_FINISHED_OK"
    else
      echo "[$(ts)] RUN_FAILED (process exited without final score)"
      print_wrapper_tail 25
    fi
    # The outcome is reported through the sentinel line; the exit status is 0
    # in both cases.
    exit 0
  fi

  ev=$(live_events_file)
  done_n=$(completed_count "${ev:-/nonexistent}")
  # An empty reading (no value from gpu_busy_pct) is treated as 0% busy.
  gpu=$(gpu_busy_pct); gpu="${gpu:-0}"
  echo "[$(ts)] alive completed=${done_n} gpu_busy=${gpu}% events=${ev:-none}"

  # Hang heuristic: no new completions AND GPUs idle for HANG_STRIKES intervals.
  if [[ "${done_n}" == "${prev_completed}" && "${gpu}" -lt "${GPU_IDLE_PCT}" ]]; then
    strikes=$((strikes + 1))
    echo "[$(ts)] no-progress strike ${strikes}/${HANG_STRIKES} (completed unchanged at ${done_n}, gpu ${gpu}%)"
    # The alert repeats on every further idle check; monitoring continues.
    if [[ "${strikes}" -ge "${HANG_STRIKES}" ]]; then
      echo "[$(ts)] ALERT_HANG completed stuck at ${done_n}, gpu ${gpu}% for ${strikes} checks"
      print_wrapper_tail 15
    fi
  else
    strikes=0
  fi
  prev_completed="${done_n}"
  sleep "${INTERVAL_S}"
done
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Re-score accuracy datasets from an existing benchmark report directory.
+
+Use after `inference-endpoint benchmark from-config` when inference completed but
+scoring failed (e.g. LiveCodeBench container was not running). Dataset names must
+match the YAML preset suffixes (e.g. `gpqa::deepseek_v4`).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+from inference_endpoint.dataset_manager.predefined.aime25 import AIME25
+from inference_endpoint.dataset_manager.predefined.gpqa import GPQA
+from inference_endpoint.dataset_manager.predefined.livecodebench import LiveCodeBench
+from inference_endpoint.evaluation.extractor import (
+    ABCDExtractor,
+    BoxedMathExtractor,
+    PythonCodeExtractor,
+)
+from inference_endpoint.evaluation.scoring import LiveCodeBenchScorer, PassAt1Scorer
+
# Root directory of the cached dataset parquet files. The path is relative, so
# it is interpreted against the current working directory at load time.
DATASET_CACHE: Path = Path("dataset_cache")
+
+
def score_gpqa(report_dir: Path) -> tuple[str, float | None, int]:
    """Score the GPQA diamond responses found in ``report_dir``.

    Loads the dataset from ``DATASET_CACHE / "gpqa/diamond/gpqa_diamond.parquet"``
    and scores it with ``PassAt1Scorer`` using ``ABCDExtractor``.

    Args:
        report_dir: Benchmark report directory handed to the scorer.

    Returns:
        ``(dataset_name, score, n_repeats)`` as produced by ``scorer.score()``.
        ``score`` is annotated as optional because the caller in this file
        handles ``None`` (reported as "scoring failed").
    """
    name = "gpqa::deepseek_v4"
    dataset = GPQA.load_from_file(DATASET_CACHE / "gpqa/diamond/gpqa_diamond.parquet")
    dataset.load()
    score, n_repeats = PassAt1Scorer(
        name, dataset, report_dir, extractor=ABCDExtractor
    ).score()
    return name, score, n_repeats
+
+
def score_aime25(report_dir: Path) -> tuple[str, float | None, int]:
    """Score the AIME25 responses found in ``report_dir``.

    Loads the dataset from ``DATASET_CACHE / "aime25/aime25.parquet"`` and scores
    it with ``PassAt1Scorer`` using ``BoxedMathExtractor``, taking the ground
    truth from the ``answer`` column.

    Args:
        report_dir: Benchmark report directory handed to the scorer.

    Returns:
        ``(dataset_name, score, n_repeats)`` as produced by ``scorer.score()``.
        ``score`` is annotated as optional because the caller in this file
        handles ``None`` (reported as "scoring failed").
    """
    name = "aime25::deepseek_v4"
    dataset = AIME25.load_from_file(DATASET_CACHE / "aime25/aime25.parquet")
    dataset.load()
    score, n_repeats = PassAt1Scorer(
        name,
        dataset,
        report_dir,
        extractor=BoxedMathExtractor,
        ground_truth_column="answer",
    ).score()
    return name, score, n_repeats
+
+
def score_livecodebench(
    report_dir: Path, lcb_version: str, timeout: int
) -> tuple[str, float | None, int]:
    """Score the LiveCodeBench responses found in ``report_dir``.

    Loads the dataset from
    ``DATASET_CACHE / "livecodebench/<lcb_version>/livecodebench_<lcb_version>.parquet"``
    and scores it with ``LiveCodeBenchScorer`` using ``PythonCodeExtractor``.

    Args:
        report_dir: Benchmark report directory handed to the scorer.
        lcb_version: Dataset version tag; selects the parquet file and is
            forwarded to the scorer.
        timeout: Forwarded to the scorer as ``timeout`` (the CLI describes it
            as the per-test timeout in seconds).

    Returns:
        ``(dataset_name, score, n_repeats)`` as produced by ``scorer.score()``.
        ``score`` is annotated as optional because the caller in this file
        handles ``None`` (reported as "scoring failed").
    """
    name = "livecodebench::deepseek_v4"
    parquet_path = (
        DATASET_CACHE
        / f"livecodebench/{lcb_version}/livecodebench_{lcb_version}.parquet"
    )
    dataset = LiveCodeBench.load_from_file(parquet_path)
    dataset.load()
    score, n_repeats = LiveCodeBenchScorer(
        name,
        dataset,
        report_dir,
        extractor=PythonCodeExtractor,
        lcb_version=lcb_version,
        timeout=timeout,
    ).score()
    return name, score, n_repeats
+
+
def main() -> None:
    """Parse the CLI options, re-score the datasets, and optionally save them.

    GPQA and AIME25 are always scored; LiveCodeBench is scored unless
    ``--skip-lcb`` is given. One ``<label> Pass@1 (<n> repeats): <score>`` line
    is printed per dataset. With ``--write-results-json`` the scores are written
    to ``<report-dir>/results.json`` under the ``accuracy_scores`` key,
    overwriting any existing file.

    Exits through ``parser.error`` (status 2) when ``--report-dir`` is not an
    existing directory, before any dataset is loaded.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--report-dir",
        type=Path,
        required=True,
        help="Benchmark report directory (contains events.jsonl and sample_idx_map.json)",
    )
    parser.add_argument(
        "--lcb-version",
        default="release_v6",
        help="LiveCodeBench dataset version tag",
    )
    parser.add_argument(
        "--lcb-timeout",
        type=int,
        default=60,
        help="Per-test timeout for LiveCodeBench evaluation (seconds)",
    )
    parser.add_argument(
        "--skip-lcb",
        action="store_true",
        help="Skip LiveCodeBench (score GPQA and AIME25 only)",
    )
    parser.add_argument(
        "--write-results-json",
        action="store_true",
        help="Write results.json in the same format as benchmark finalize",
    )
    args = parser.parse_args()

    report_dir = args.report_dir.resolve()
    # Fail fast with a usage error rather than deep inside a scorer after the
    # datasets have already been loaded.
    if not report_dir.is_dir():
        parser.error(f"--report-dir is not an existing directory: {report_dir}")

    # (label, zero-argument scorer) pairs, in output order. Every entry is
    # printed and recorded by the same loop below.
    jobs = [
        ("GPQA", lambda: score_gpqa(report_dir)),
        ("AIME25", lambda: score_aime25(report_dir)),
    ]
    if not args.skip_lcb:
        jobs.append(
            (
                "LiveCodeBench",
                lambda: score_livecodebench(
                    report_dir, args.lcb_version, args.lcb_timeout
                ),
            )
        )

    accuracy_scores: dict[str, dict] = {}
    for label, run in jobs:
        name, score, n_repeats = run()
        # A None score is reported explicitly instead of failing on formatting.
        shown = f"{score:.4f}" if score is not None else "None (scoring failed)"
        print(f"{label} Pass@1 ({n_repeats} repeats): {shown}")
        accuracy_scores[name] = {
            "dataset_name": name,
            "score": score,
            "n_repeats": n_repeats,
        }

    if args.write_results_json:
        out = report_dir / "results.json"
        payload = {"accuracy_scores": accuracy_scores}
        out.write_text(json.dumps(payload, indent=2))
        print(f"Wrote {out}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,17 @@
+{
+  "accuracy_scores": {
+    "gpqa::deepseek_v4": {
+      "dataset_name": "gpqa::deepseek_v4",
+      "score": 0.7765567765567766,
+      "n_repeats": 1
+    },
+    "aime25::deepseek_v4": {
+      "dataset_name": "aime25::deepseek_v4",
+      "score": 0.9545454545454546,
+      "n_repeats": 0
+    },
+    "livecodebench::deepseek_v4": {
+      "dataset_name": "livecodebench::deepseek_v4",
+      "score": 0.7559760956175299,
+      "n_repeats": 2
+    }