diff --git a/language/gpt-oss-120b/README.md b/language/gpt-oss-120b/README.md index 4971d5c531..72ac7fe93b 100644 --- a/language/gpt-oss-120b/README.md +++ b/language/gpt-oss-120b/README.md @@ -55,6 +55,18 @@ Use [`./sglang/run_server.sh`](./sglang/run_server.sh) to launch an SGLang serve -The script uses `python3 -m sglang.launch_server` tp instantiate the model, with `tp=pp=ep=1`, and `dp` as specified. +The script uses `python3 -m sglang.launch_server` to instantiate the model, with `tp=pp=ep=1`, and `dp` as specified. +#### Speculative Decoding (Interactive scenario) + +Pass `--enable_speculative_decode` to enable EAGLE3 speculative decoding for +the Interactive scenario: + +```bash +./run_server.sh --enable_speculative_decode +``` + +Per the Interactive scenario policy, `num_steps=3` and `topk=1` are fixed; the +draft head is [`nvidia/gpt-oss-120b-Eagle3-long-context`](https://huggingface.co/nvidia/gpt-oss-120b-Eagle3-long-context). + You may also use docker: ```bash diff --git a/language/gpt-oss-120b/sglang/run_server.sh b/language/gpt-oss-120b/sglang/run_server.sh index dd6c7e3895..9723acb2d1 100755 --- a/language/gpt-oss-120b/sglang/run_server.sh +++ b/language/gpt-oss-120b/sglang/run_server.sh @@ -8,6 +8,13 @@ eagle_path="" stream_interval=500 extra_args="" +# Interactive scenario: EAGLE3 speculative decoding with the long-context head. +# num_steps=3 and topk=1 are fixed per the Interactive scenario policy. +enable_speculative_decode=false +SPECULATIVE_DRAFT_MODEL_PATH="nvidia/gpt-oss-120b-Eagle3-long-context" +SPECULATIVE_NUM_STEPS=3 +SPECULATIVE_TOPK=1 + while [[ $# -gt 0 ]]; do case $1 in --dp) @@ -26,6 +33,10 @@ while [[ $# -gt 0 ]]; do stream_interval=$2 shift 2 ;; + --enable_speculative_decode) + enable_speculative_decode=true + shift 1 + ;; *) extra_args="$extra_args $1" shift 1 @@ -43,14 +54,17 @@ args=" --model-path $model_path \ --enable-metrics \ --stream-interval $stream_interval " +# Explicit --eagle_path wins over the --enable_speculative_decode defaults so +# existing callers keep working unchanged. 
if [ -n "$eagle_path" ]; then args="$args --speculative-draft-model-path $eagle_path \ --speculative-algorithm EAGLE3" +elif [ "$enable_speculative_decode" = "true" ]; then + args="$args --speculative-draft-model-path $SPECULATIVE_DRAFT_MODEL_PATH \ + --speculative-algorithm EAGLE3 \ + --speculative-num-steps $SPECULATIVE_NUM_STEPS \ + --speculative-eagle-topk $SPECULATIVE_TOPK" fi -# --speculative-num-steps 1 \ -# --speculative-eagle-topk 1 \ -# --speculative-num-draft-tokens 3 \ - set -x -python3 -m sglang.launch_server "$args" "$extra_args" +python3 -m sglang.launch_server $args $extra_args # unquoted on purpose: args/extra_args are space-separated flag strings that must word-split into separate argv entries