From d2f48dc52ccd4703a18bc9d0f75b078556172d2f Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Wed, 22 Apr 2026 09:21:56 -0700
Subject: [PATCH] [gpt-oss-120b] Add --enable_speculative_decode flag for
 Interactive scenario

Adds an opt-in EAGLE3 speculative decoding config path to run_server.sh,
mirroring the --enable_speculative_decode flag in DeepSeek-R1. Defaults are
tuned for the gpt-oss-120b Interactive scenario on GB200 using the
nvidia/gpt-oss-120b-Eagle3-long-context head (num_steps=3, topk=1).

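Preserves backward compatibility: an explicit --eagle_path takes precedence
over --enable_speculative_decode and behaves exactly as before, and the
default server command is unchanged when neither flag is passed. Also removes
the commented-out speculative-decoding flags at the end of the script.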

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 language/gpt-oss-120b/README.md            | 12 ++++++++++++
 language/gpt-oss-120b/sglang/run_server.sh | 22 ++++++++++++++++++----
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/language/gpt-oss-120b/README.md b/language/gpt-oss-120b/README.md
index 4971d5c531..72ac7fe93b 100644
--- a/language/gpt-oss-120b/README.md
+++ b/language/gpt-oss-120b/README.md
@@ -55,6 +55,18 @@ Use [`./sglang/run_server.sh`](./sglang/run_server.sh) to launch an SGLang serve
 
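-The script uses `python3 -m sglang.launch_server` tp instantiate the model, with `tp=pp=ep=1`, and `dp` as specified.
+The script uses `python3 -m sglang.launch_server` to instantiate the model, with `tp=pp=ep=1`, and `dp` as specified.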
 
+#### Speculative Decoding (Interactive scenario)
+
+Pass `--enable_speculative_decode` to enable EAGLE3 speculative decoding for
+the Interactive scenario:
+
+```bash
+./run_server.sh --enable_speculative_decode
+```
+
+With this flag, `num_steps=3` and `topk=1` are fixed per the Interactive scenario policy, and the draft head is
+[`nvidia/gpt-oss-120b-Eagle3-long-context`](https://huggingface.co/nvidia/gpt-oss-120b-Eagle3-long-context). An explicit `--eagle_path` takes precedence and does not apply these defaults.
+
 You may also use docker:
 
 ```bash
diff --git a/language/gpt-oss-120b/sglang/run_server.sh b/language/gpt-oss-120b/sglang/run_server.sh
index dd6c7e3895..9723acb2d1 100755
--- a/language/gpt-oss-120b/sglang/run_server.sh
+++ b/language/gpt-oss-120b/sglang/run_server.sh
@@ -8,6 +8,13 @@ eagle_path=""
 stream_interval=500
 extra_args=""
 
+# Interactive scenario: EAGLE3 speculative decoding with the long-context head.
+# num_steps=3 and topk=1 are fixed per the Interactive scenario policy.
+enable_speculative_decode=false
+SPECULATIVE_DRAFT_MODEL_PATH="nvidia/gpt-oss-120b-Eagle3-long-context"
+SPECULATIVE_NUM_STEPS=3
+SPECULATIVE_TOPK=1
+
 while [[ $# -gt 0 ]]; do
 	case $1 in
 	--dp)
@@ -26,6 +33,10 @@ while [[ $# -gt 0 ]]; do
 		stream_interval=$2
 		shift 2
 		;;
+	--enable_speculative_decode)
+		enable_speculative_decode=true
+		shift 1
+		;;
 	*)
 		extra_args="$extra_args $1"
 		shift 1
@@ -43,14 +54,20 @@ args=" --model-path $model_path \
     --enable-metrics \
     --stream-interval $stream_interval "
 
+# Explicit --eagle_path wins over the --enable_speculative_decode defaults so
+# existing callers keep working unchanged.
 if [ -n "$eagle_path" ]; then
 	args="$args --speculative-draft-model-path $eagle_path \
         --speculative-algorithm EAGLE3"
+elif [ "$enable_speculative_decode" = "true" ]; then
+	args="$args --speculative-draft-model-path $SPECULATIVE_DRAFT_MODEL_PATH \
+        --speculative-algorithm EAGLE3 \
+        --speculative-num-steps $SPECULATIVE_NUM_STEPS \
+        --speculative-eagle-topk $SPECULATIVE_TOPK"
 fi
 
-# --speculative-num-steps 1 \
-# --speculative-eagle-topk 1 \
-# --speculative-num-draft-tokens 3 \
-
 set -x
-python3 -m sglang.launch_server "$args" "$extra_args"
+# $args and $extra_args are flat, space-separated flag strings, so they must be
+# word-split into separate argv entries; quoting them would hand each string to
+# the launcher as a single argument (and pass "" when extra_args is empty).
+# shellcheck disable=SC2086
+python3 -m sglang.launch_server $args $extra_args