Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
420 changes: 420 additions & 0 deletions examples/10_DeepSeekV4Pro_Example/README.md

Large diffs are not rendered by default.

50 changes: 50 additions & 0 deletions examples/10_DeepSeekV4Pro_Example/docker_common.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Shared Docker helpers for DeepSeek-V4-Pro example scripts.

# Writable log directory on the host (mounted into containers at /workspace).
# Accuracy + server logs can grow large; default leaves headroom on the host.
# Size passed to `--storage-opt size=<N>G` by docker_storage_args.
# A non-empty value already present in the environment is kept; otherwise 16.
: "${DOCKER_LOG_STORAGE_GB:=16}"

#######################################
# Create the host log directory and export it as LOG_DIR.
# Globals:
#   LOG_DIR       (read/written) kept as-is when already non-empty
#   ENDPOINTS_DIR (read) base directory; falls back to "."
# Arguments:
#   $1 - sub-directory name under results/docker_logs (default: misc)
#######################################
ensure_docker_log_dir() {
  local category="${1:-misc}"
  # Only derive a path when the caller has not already chosen one.
  if [[ -z "${LOG_DIR:-}" ]]; then
    LOG_DIR="${ENDPOINTS_DIR:-.}/results/docker_logs/${category}"
  fi
  mkdir -p "${LOG_DIR}"
  export LOG_DIR
}

# Extra docker run args for a larger container writable layer (opt-in only).
# Logs are written to the mounted host LOG_DIR; most hosts use overlay2 without xfs
# pquota and reject --storage-opt.
#######################################
# Print extra `docker run` arguments for a larger writable layer.
# Emits nothing unless DOCKER_USE_LOG_STORAGE_OPT is exactly "true".
# Globals:
#   DOCKER_USE_LOG_STORAGE_OPT (read) opt-in switch, default "false"
#   DOCKER_LOG_STORAGE_GB      (read) size used in `size=<N>G`
# Outputs:
#   "--storage-opt size=<N>G" on stdout when enabled
#######################################
docker_storage_args() {
  [[ "${DOCKER_USE_LOG_STORAGE_OPT:-false}" == "true" ]] || return 0
  printf '%s %s\n' --storage-opt "size=${DOCKER_LOG_STORAGE_GB}G"
}

# Wait for an OpenAI-compatible or SGLang HTTP server (example script preflight).
# Tries GET /health (SGLang native, vLLM), then GET /v1/models (OpenAI compatibility).
# Args: base_url [max_wait_seconds]
#######################################
# Poll an HTTP inference server until one of its readiness endpoints answers.
# Each round tries GET <base>/health, then GET <base>/v1/models (5 s per
# request); rounds are 2 s apart.
# Arguments:
#   $1 - base URL (one trailing "/" is stripped)
#   $2 - max seconds to keep retrying; 0 or negative means a single round
#        (default: 0)
# Outputs:
#   "Inference server ready (<url>)" on stdout on success
# Returns:
#   0 when an endpoint responded, 1 when it did not within the budget,
#   2 when max_wait is not an integer
#######################################
wait_openai_compatible_server() {
  local base="${1%/}"
  local max_wait="${2:-0}"
  # `path` must be local: this file is sourced, and a leaked loop variable
  # would overwrite a caller's `path`.
  local start path

  # A non-integer budget (e.g. "1.5") makes every arithmetic test below fail
  # with an error, which would turn the loop into an endless retry.
  if [[ ! "${max_wait}" =~ ^-?[0-9]+$ ]]; then
    printf 'wait_openai_compatible_server: max_wait must be an integer, got %s\n' \
      "${max_wait}" >&2
    return 2
  fi
  if [[ "${max_wait}" == -* ]]; then
    max_wait=0
  else
    # Force base 10 so values like "08" are not parsed as invalid octal.
    max_wait=$((10#${max_wait}))
  fi

  start=$(date +%s)
  while true; do
    for path in /health /v1/models; do
      if curl --output /dev/null --silent --fail --max-time 5 "${base}${path}"; then
        echo "Inference server ready (${base}${path})"
        return 0
      fi
    done
    if ((max_wait <= 0)); then
      return 1
    fi
    if (($(date +%s) - start >= max_wait)); then
      return 1
    fi
    sleep 2
  done
}
19 changes: 19 additions & 0 deletions examples/10_DeepSeekV4Pro_Example/launch_accuracy_background.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Launch run_sglang_accuracy_benchmark.sh in the background under nohup,
# redirecting its output to a timestamped log file inside LOG_DIR.
# (A dedicated script avoids `VAR=... && nohup ... &`, where bash backgrounds
# the whole `&&` chain and the parent shell never sees VAR.)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# shellcheck source=docker_common.sh
source "${SCRIPT_DIR}/docker_common.sh"

# Sets and exports LOG_DIR (creating the directory if needed).
ensure_docker_log_dir "accuracy"

launch_stamp="$(date +%Y%m%d_%H%M%S)"
wrapper_log="${LOG_DIR}/nohup_accuracy_${launch_stamp}.log"

# Defaults handed to the benchmark script; values already set by the caller win.
: "${WAIT_FOR_SGLANG_S:=120}"
: "${PYTHONUNBUFFERED:=1}"
export WAIT_FOR_SGLANG_S PYTHONUNBUFFERED

nohup "${SCRIPT_DIR}/run_sglang_accuracy_benchmark.sh" >"${wrapper_log}" 2>&1 &
printf '%s\n' "Started accuracy benchmark PID=$!"
printf '%s\n' "Wrapper log: ${wrapper_log}"
printf '%s\n' "Tee log (inside run): ${LOG_DIR}/accuracy_from_config.log"
81 changes: 81 additions & 0 deletions examples/10_DeepSeekV4Pro_Example/monitor_accuracy_run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env bash
# Monitor the running SGLang DeepSeek-V4-Pro accuracy benchmark.
# Emits sentinel lines (ALERT_HANG / RUN_FAILED / RUN_FINISHED_OK) so an external
# watcher can be notified, and logs periodic status to a monitor log.
# -e is intentionally not enabled here: helpers below return non-zero during
# normal operation (e.g. grep finding no matches).
set -u -o pipefail

# Pattern given to `pgrep -f` to decide whether the benchmark is still running.
: "${PROC_PAT:=from-config.*sglang_deepseek_v4_pro_accuracy}"
# Required: path of the nohup wrapper log; the script aborts when it is unset.
: "${WRAPPER_LOG:?set WRAPPER_LOG to the nohup wrapper log path}"
# Report directory; "<REPORT_DIR>/results.json" is searched for in the wrapper log.
: "${REPORT_DIR:=results/sglang_deepseek_v4_pro_accuracy}"
# Seconds to sleep between checks.
: "${INTERVAL_S:=300}"
# Consecutive idle (no new completes + GPU idle) checks before declaring a hang.
: "${HANG_STRIKES:=3}"
# GPU utilisation (%) below which the GPUs are considered idle.
: "${GPU_IDLE_PCT:=5}"

# Print the current local time as "YYYY-MM-DD HH:MM:SS" (log line prefix).
ts() {
  date +'%F %T'
}

#######################################
# Locate the event logger's events.jsonl from its `--log-dir <path>` argument.
# Uses the first `--log-dir` value found in the command lines of processes
# matching "services.event_logger".
# Outputs:
#   "<log-dir>/events.jsonl" on stdout when a log dir was found
# Returns:
#   0 when a path was printed, 1 when no log dir could be determined
#######################################
live_events_file() {
  local events_dir
  events_dir=$(
    pgrep -af "services.event_logger" \
      | grep -oE -- "--log-dir [^ ]+" \
      | head -1 \
      | awk '{print $2}'
  )
  if [[ -z "${events_dir}" ]]; then
    return 1
  fi
  echo "${events_dir}/events.jsonl"
}

#######################################
# Print the highest per-GPU utilisation percentage currently reported.
# Reads `rocm-smi --showuse` first. When that tool is unavailable or yields
# no number, falls back to `nvidia-smi`; without the fallback such a host
# reports nothing, which the caller treats as 0% busy and therefore as idle.
# Outputs:
#   a single integer on stdout, or nothing when no tool produced a value
#######################################
gpu_busy_pct() {
  local pct=""

  if command -v rocm-smi >/dev/null 2>&1; then
    pct=$(rocm-smi --showuse 2>/dev/null \
      | grep -oE "GPU use \(%\): [0-9]+" | grep -oE "[0-9]+$" \
      | sort -rn | head -1)
  fi

  if [[ -z "${pct}" ]] && command -v nvidia-smi >/dev/null 2>&1; then
    # One bare number per GPU; non-numeric cells (e.g. "[N/A]") are dropped.
    pct=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits \
      2>/dev/null \
      | grep -oE "[0-9]+" \
      | sort -rn | head -1)
  fi

  if [[ -n "${pct}" ]]; then
    printf '%s\n' "${pct}"
  fi
}

#######################################
# Count lines in an events file that record a "sample.complete" event.
# Arguments:
#   $1 - path to the events file (may be missing)
# Outputs:
#   the count on stdout; 0 when the file does not exist or nothing matches
#######################################
completed_count() {
  local f="${1:-}" c
  if [[ -f "${f}" ]]; then
    # grep -c always prints a count (0 when no match); capture it so a non-zero
    # exit status on 0 matches does not also trigger a fallback echo.
    # The dot is escaped so only the literal event name matches, and optional
    # whitespace around ':' is accepted so both compact and spaced JSON count.
    c=$(grep -cE -- '"event_type"[[:space:]]*:[[:space:]]*"sample\.complete"' \
      "${f}" 2>/dev/null)
    echo "${c:-0}"
  else
    echo 0
  fi
}

# Completion count seen on the previous check; -1 so the first check can
# never register as "unchanged".
prev_completed=-1
# Consecutive checks with no new completions and idle GPUs.
strikes=0

echo "[$(ts)] monitor start: pat='${PROC_PAT}' interval=${INTERVAL_S}s wrapper=${WRAPPER_LOG}"

while true; do
  if ! pgrep -f "${PROC_PAT}" >/dev/null 2>&1; then
    # Process gone — classify success vs failure from the wrapper log.
    # -F: both needles are literal text; REPORT_DIR is a path and must not be
    # interpreted as a regular expression.
    if grep -qF -- "Score for livecodebench" "${WRAPPER_LOG}" 2>/dev/null \
      || grep -qF -- "Saved: ${REPORT_DIR}/results.json" "${WRAPPER_LOG}" 2>/dev/null; then
      echo "[$(ts)] RUN_FINISHED_OK"
    else
      echo "[$(ts)] RUN_FAILED (process exited without final score)"
      echo "---- wrapper tail ----"
      # Grouping makes 2>/dev/null also cover a failed `<` redirection when the
      # wrapper log is missing. Carriage returns are turned into newlines so
      # tail sees separate lines.
      { tr '\r' '\n' < "${WRAPPER_LOG}"; } 2>/dev/null | tail -25
    fi
    # Exit status is 0 either way; the outcome is carried by the sentinel line.
    exit 0
  fi

  ev=$(live_events_file)
  done_n=$(completed_count "${ev:-/nonexistent}")
  gpu=$(gpu_busy_pct)
  gpu="${gpu:-0}"
  echo "[$(ts)] alive completed=${done_n} gpu_busy=${gpu}% events=${ev:-none}"

  # Hang heuristic: no new completions AND GPUs idle for HANG_STRIKES intervals.
  if [[ "${done_n}" == "${prev_completed}" && "${gpu}" -lt "${GPU_IDLE_PCT}" ]]; then
    strikes=$((strikes + 1))
    echo "[$(ts)] no-progress strike ${strikes}/${HANG_STRIKES} (completed unchanged at ${done_n}, gpu ${gpu}%)"
    if [[ "${strikes}" -ge "${HANG_STRIKES}" ]]; then
      echo "[$(ts)] ALERT_HANG completed stuck at ${done_n}, gpu ${gpu}% for ${strikes} checks"
      echo "---- wrapper tail ----"
      { tr '\r' '\n' < "${WRAPPER_LOG}"; } 2>/dev/null | tail -15
    fi
  else
    # Any progress or GPU activity resets the streak.
    strikes=0
  fi
  prev_completed="${done_n}"
  sleep "${INTERVAL_S}"
done
144 changes: 144 additions & 0 deletions examples/10_DeepSeekV4Pro_Example/rescore_accuracy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Re-score accuracy datasets from an existing benchmark report directory.

Use after `inference-endpoint benchmark from-config` when inference completed but
scoring failed (e.g. LiveCodeBench container was not running). Dataset names must
match the YAML preset suffixes (e.g. `gpqa::deepseek_v4`).
"""

from __future__ import annotations

import argparse
import json
from pathlib import Path

from inference_endpoint.dataset_manager.predefined.aime25 import AIME25
from inference_endpoint.dataset_manager.predefined.gpqa import GPQA
from inference_endpoint.dataset_manager.predefined.livecodebench import LiveCodeBench
from inference_endpoint.evaluation.extractor import (
ABCDExtractor,
BoxedMathExtractor,
PythonCodeExtractor,
)
from inference_endpoint.evaluation.scoring import LiveCodeBenchScorer, PassAt1Scorer

DATASET_CACHE = Path("dataset_cache")


def score_gpqa(report_dir: Path) -> tuple[str, float, int]:
    """Score the GPQA run recorded in ``report_dir``.

    Loads the dataset from ``DATASET_CACHE/gpqa/diamond/gpqa_diamond.parquet``
    and scores it with ``PassAt1Scorer`` using ``ABCDExtractor``.

    Args:
        report_dir: Benchmark report directory handed to the scorer.

    Returns:
        ``(dataset_name, score, n_repeats)``, the last two as returned by
        ``scorer.score()``.
    """
    dataset_name = "gpqa::deepseek_v4"
    parquet_path = DATASET_CACHE / "gpqa/diamond/gpqa_diamond.parquet"

    dataset = GPQA.load_from_file(parquet_path)
    dataset.load()

    score, n_repeats = PassAt1Scorer(
        dataset_name, dataset, report_dir, extractor=ABCDExtractor
    ).score()
    return dataset_name, score, n_repeats


def score_aime25(report_dir: Path) -> tuple[str, float, int]:
    """Score the AIME25 run recorded in ``report_dir``.

    Loads the dataset from ``DATASET_CACHE/aime25/aime25.parquet`` and scores
    it with ``PassAt1Scorer`` using ``BoxedMathExtractor``, taking the ground
    truth from the ``answer`` column.

    Args:
        report_dir: Benchmark report directory handed to the scorer.

    Returns:
        ``(dataset_name, score, n_repeats)``, the last two as returned by
        ``scorer.score()``.
    """
    dataset_name = "aime25::deepseek_v4"
    parquet_path = DATASET_CACHE / "aime25/aime25.parquet"

    dataset = AIME25.load_from_file(parquet_path)
    dataset.load()

    scorer_kwargs = {
        "extractor": BoxedMathExtractor,
        "ground_truth_column": "answer",
    }
    score, n_repeats = PassAt1Scorer(
        dataset_name, dataset, report_dir, **scorer_kwargs
    ).score()
    return dataset_name, score, n_repeats


def score_livecodebench(
    report_dir: Path, lcb_version: str, timeout: int
) -> tuple[str, float, int]:
    """Score the LiveCodeBench run recorded in ``report_dir``.

    Loads the dataset from
    ``DATASET_CACHE/livecodebench/<lcb_version>/livecodebench_<lcb_version>.parquet``
    and scores it with ``LiveCodeBenchScorer`` using ``PythonCodeExtractor``.

    Args:
        report_dir: Benchmark report directory handed to the scorer.
        lcb_version: Dataset version tag; selects the parquet file and is
            forwarded to the scorer.
        timeout: Forwarded to the scorer as ``timeout``.

    Returns:
        ``(dataset_name, score, n_repeats)``, the last two as returned by
        ``scorer.score()``.
    """
    dataset_name = "livecodebench::deepseek_v4"
    relative_parquet = (
        f"livecodebench/{lcb_version}/livecodebench_{lcb_version}.parquet"
    )

    dataset = LiveCodeBench.load_from_file(DATASET_CACHE / relative_parquet)
    dataset.load()

    score, n_repeats = LiveCodeBenchScorer(
        dataset_name,
        dataset,
        report_dir,
        extractor=PythonCodeExtractor,
        lcb_version=lcb_version,
        timeout=timeout,
    ).score()
    return dataset_name, score, n_repeats


def main() -> None:
    """Parse the command line, re-score each dataset, and report the results.

    GPQA and AIME25 are always scored; LiveCodeBench is scored unless
    ``--skip-lcb`` is given. One line per dataset is printed, and with
    ``--write-results-json`` the collected scores are also written to
    ``results.json`` inside the report directory.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--report-dir",
        type=Path,
        required=True,
        help="Benchmark report directory (contains events.jsonl and sample_idx_map.json)",
    )
    parser.add_argument(
        "--lcb-version",
        default="release_v6",
        help="LiveCodeBench dataset version tag",
    )
    parser.add_argument(
        "--lcb-timeout",
        type=int,
        default=60,
        help="Per-test timeout for LiveCodeBench evaluation (seconds)",
    )
    parser.add_argument(
        "--skip-lcb",
        action="store_true",
        help="Skip LiveCodeBench (score GPQA and AIME25 only)",
    )
    parser.add_argument(
        "--write-results-json",
        action="store_true",
        help="Write results.json in the same format as benchmark finalize",
    )
    args = parser.parse_args()

    report_dir = args.report_dir.resolve()

    def fmt_score(score: float | None) -> str:
        # A missing score is reported explicitly instead of failing to format.
        if score is None:
            return "None (scoring failed)"
        return f"{score:.4f}"

    # (display label, zero-argument scorer) pairs, run in this order.
    jobs = [
        ("GPQA", lambda: score_gpqa(report_dir)),
        ("AIME25", lambda: score_aime25(report_dir)),
    ]
    if not args.skip_lcb:
        jobs.append(
            (
                "LiveCodeBench",
                lambda: score_livecodebench(
                    report_dir, args.lcb_version, args.lcb_timeout
                ),
            )
        )

    accuracy_scores: dict[str, dict] = {}
    for label, run_scorer in jobs:
        name, score, n_repeats = run_scorer()
        print(f"{label} Pass@1 ({n_repeats} repeats): {fmt_score(score)}")
        accuracy_scores[name] = {
            "dataset_name": name,
            "score": score,
            "n_repeats": n_repeats,
        }

    if not args.write_results_json:
        return

    out = report_dir / "results.json"
    out.write_text(json.dumps({"accuracy_scores": accuracy_scores}, indent=2))
    print(f"Wrote {out}")


if __name__ == "__main__":
    main()
17 changes: 17 additions & 0 deletions examples/10_DeepSeekV4Pro_Example/result.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"accuracy_scores": {
"gpqa::deepseek_v4": {
"dataset_name": "gpqa::deepseek_v4",
"score": 0.7765567765567766,
"n_repeats": 1
},
"aime25::deepseek_v4": {
"dataset_name": "aime25::deepseek_v4",
"score": 0.9545454545454546,
"n_repeats": 0
},
"livecodebench::deepseek_v4": {
"dataset_name": "livecodebench::deepseek_v4",
"score": 0.7559760956175299,
"n_repeats": 2
}
Loading
Loading