From d2f48dc52ccd4703a18bc9d0f75b078556172d2f Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 22 Apr 2026 09:21:56 -0700 Subject: [PATCH] [gpt-oss-120b] Add --enable_speculative_decode flag for Interactive scenario Adds an opt-in EAGLE3 speculative decoding config path to run_server.sh, mirroring the --enable_speculative_decode flag in DeepSeek-R1. Defaults are tuned for the gpt-oss-120b Interactive scenario on GB200 using the nvidia/gpt-oss-120b-Eagle3-long-context head (num_steps=3, topk=1). Preserves backward compatibility: existing --eagle_path callers are unchanged, and the default server command still matches the previous behavior when the flag is not passed. Co-Authored-By: Claude Opus 4.7 (1M context) --- language/gpt-oss-120b/README.md | 12 ++++++++++++ language/gpt-oss-120b/sglang/run_server.sh | 22 ++++++++++++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/language/gpt-oss-120b/README.md b/language/gpt-oss-120b/README.md index 4971d5c531..72ac7fe93b 100644 --- a/language/gpt-oss-120b/README.md +++ b/language/gpt-oss-120b/README.md @@ -55,6 +55,18 @@ Use [`./sglang/run_server.sh`](./sglang/run_server.sh) to launch an SGLang serve The script uses `python3 -m sglang.launch_server` tp instantiate the model, with `tp=pp=ep=1`, and `dp` as specified. +#### Speculative Decoding (Interactive scenario) + +Pass `--enable_speculative_decode` to enable EAGLE3 speculative decoding for +the Interactive scenario: + +```bash +./run_server.sh --enable_speculative_decode +``` + +Per the Interactive scenario policy, `num_steps=3` and `topk=1` are fixed; the +draft head is [`nvidia/gpt-oss-120b-Eagle3-long-context`](https://huggingface.co/nvidia/gpt-oss-120b-Eagle3-long-context). + You may also use docker: ```bash diff --git a/language/gpt-oss-120b/sglang/run_server.sh b/language/gpt-oss-120b/sglang/run_server.sh index dd6c7e3895..9723acb2d1 100755 --- a/language/gpt-oss-120b/sglang/run_server.sh +++ b/language/gpt-oss-120b/sglang/run_server.sh @@ -8,6 +8,13 @@ eagle_path="" stream_interval=500 extra_args="" +# Interactive scenario: EAGLE3 speculative decoding with the long-context head. +# num_steps=3 and topk=1 are fixed per the Interactive scenario policy. +enable_speculative_decode=false +SPECULATIVE_DRAFT_MODEL_PATH="nvidia/gpt-oss-120b-Eagle3-long-context" +SPECULATIVE_NUM_STEPS=3 +SPECULATIVE_TOPK=1 + while [[ $# -gt 0 ]]; do case $1 in --dp) @@ -26,6 +33,10 @@ while [[ $# -gt 0 ]]; do stream_interval=$2 shift 2 ;; + --enable_speculative_decode) + enable_speculative_decode=true + shift 1 + ;; *) extra_args="$extra_args $1" shift 1 @@ -43,14 +54,17 @@ args=" --model-path $model_path \ --enable-metrics \ --stream-interval $stream_interval " +# Explicit --eagle_path wins over the --enable_speculative_decode defaults so +# existing callers keep working unchanged. if [ -n "$eagle_path" ]; then args="$args --speculative-draft-model-path $eagle_path \ --speculative-algorithm EAGLE3" +elif [ "$enable_speculative_decode" = "true" ]; then + args="$args --speculative-draft-model-path $SPECULATIVE_DRAFT_MODEL_PATH \ + --speculative-algorithm EAGLE3 \ + --speculative-num-steps $SPECULATIVE_NUM_STEPS \ + --speculative-eagle-topk $SPECULATIVE_TOPK" fi -# --speculative-num-steps 1 \ -# --speculative-eagle-topk 1 \ -# --speculative-num-draft-tokens 3 \ - set -x python3 -m sglang.launch_server "$args" "$extra_args"