From 907322979e75f6854d13d7604b304257efbbac4d Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Wed, 18 Jun 2025 11:13:01 +0545 Subject: [PATCH 1/2] Add Open-R1 Example --- docs/examples.md | 10 ++ .../distributed-training/open-r1/index.md | 0 .../distributed-training/open-r1/.dstack.yml | 83 ++++++++++ .../distributed-training/open-r1/README.md | 155 ++++++++++++++++++ mkdocs.yml | 1 + 5 files changed, 249 insertions(+) create mode 100644 docs/examples/distributed-training/open-r1/index.md create mode 100644 examples/distributed-training/open-r1/.dstack.yml create mode 100644 examples/distributed-training/open-r1/README.md diff --git a/docs/examples.md b/docs/examples.md index 17df886a9f..d83ccb7783 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -74,6 +74,16 @@ hide: with RAGEN, verl, and Ray.

+ +

+ Open-R1 +

+ +

+ Fine-tune to reproduce DeepSeek-R1 with separate inference and training nodes.

+
diff --git a/docs/examples/distributed-training/open-r1/index.md b/docs/examples/distributed-training/open-r1/index.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/distributed-training/open-r1/.dstack.yml b/examples/distributed-training/open-r1/.dstack.yml new file mode 100644 index 0000000000..3c3ca3f278 --- /dev/null +++ b/examples/distributed-training/open-r1/.dstack.yml @@ -0,0 +1,83 @@ +type: task +name: open-r1-grpo + +# Size of the cluster +nodes: 2 + +python: 3.12 + +nvcc: true + +# Required environment variables +env: + - HF_TOKEN + - WANDB_API_KEY + - NCCL_DEBUG=INFO + # VLLM configuration + - USE_VLLM=true + - MODEL=Qwen/Qwen2.5-Coder-7B-Instruct + # Qwen2.5-Coder-7B-Instruct has 28 attention heads and should be divisible by TP and DP + - TP=4 + - DP=2 + +# Commands of the task +commands: + - uv pip install vllm==0.8.5.post1 + - uv pip install setuptools + - uv pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl + - git clone https://github.com/huggingface/open-r1.git + - cd open-r1 + - uv pip install . 
+ - | + # Get the last IP from DSTACK_NODES_IPS for vLLM node + VLLM_HOST=$(echo $DSTACK_NODES_IPS | tr ' ' '\n' | tail -n 1) + echo "VLLM host IP (last node): $VLLM_HOST" + + if [ "$USE_VLLM" = "true" ]; then + if [ "$DSTACK_NODE_RANK" -eq $(($DSTACK_NODES_NUM - 1)) ]; then + # Last Node runs VLLM server + echo "Starting VLLM server on Last Node (IP: $VLLM_HOST)" + trl vllm-serve --model $MODEL --tensor_parallel_size $TP --data_parallel_size $DP --host 0.0.0.0 + else + # Training node - adjust world size and nodes count for training + GPUS_PER_NODE=$(($DSTACK_GPUS_NUM / $DSTACK_NODES_NUM)) + ADJUSTED_NODES_NUM=$(($DSTACK_NODES_NUM - 1)) + ADJUSTED_GPUS_TOTAL=$(($GPUS_PER_NODE * $ADJUSTED_NODES_NUM)) + # Other nodes run training + echo "Starting training with VLLM on $VLLM_HOST" + accelerate launch --config_file recipes/accelerate_configs/zero3.yaml \ + --num_processes=$ADJUSTED_GPUS_TOTAL \ + --num_machines=$ADJUSTED_NODES_NUM \ + --machine_rank=$DSTACK_NODE_RANK \ + --main_process_ip=$DSTACK_MASTER_NODE_IP \ + --main_process_port=8008 \ + src/open_r1/grpo.py \ + --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml \ + --model_name_or_path $MODEL \ + --output_dir /checkpoints/Qwen2.5-Coder-7B-Instruct-GRPO \ + --hub_model_id sjbbihan/Qwen2.5-Coder-7B-Instruct \ + --vllm_server_host=$VLLM_HOST + fi + else + # Standard training mode without VLLM + echo "Running standard training without VLLM" + accelerate launch --config_file recipes/accelerate_configs/zero3.yaml \ + --num_processes=$DSTACK_GPUS_NUM \ + --num_machines=$DSTACK_NODES_NUM \ + --machine_rank=$DSTACK_NODE_RANK \ + --main_process_ip=$DSTACK_MASTER_NODE_IP \ + --main_process_port=8008 \ + src/open_r1/grpo.py \ + --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml \ + --model_name_or_path $MODEL \ + --output_dir /checkpoints/Qwen2.5-Coder-7B-Instruct-GRPO \ + --hub_model_id sjbbihan/Qwen2.5-Coder-7B-Instruct \ + --use_vllm false + fi + +resources: + gpu: 80GB:8 + shm_size: 128GB + +volumes: 
+ - /checkpoints:/checkpoints diff --git a/examples/distributed-training/open-r1/README.md b/examples/distributed-training/open-r1/README.md new file mode 100644 index 0000000000..92aff2a9a9 --- /dev/null +++ b/examples/distributed-training/open-r1/README.md @@ -0,0 +1,155 @@ +# Open-R1 + +This example demonstrates how to use `dstack` and [open-r1](https://github.com/huggingface/open-r1) to run GRPO training across distributed nodes, dedicating one node for [vLLM](https://github.com/vllm-project/vllm) inference while using the remaining nodes for training. + +??? info "Prerequisites" + Once `dstack` is [installed](https://dstack.ai/docs/installation), go ahead clone the repo, and run `dstack init`. + +
+ + ```shell + $ git clone https://github.com/dstackai/dstack + $ cd dstack + $ dstack init + ``` +
+ +## Create fleet + +Before submitting distributed training runs, make sure to create a fleet with a `placement` set to `cluster`. + +> For more detials on how to use clusters with `dstack`, check the [Clusters](https://dstack.ai/docs/guides/clusters) guide. + +## Define a configurtation + +Once the fleet is created, define a distributed task configuration. + +
+ +```yaml +type: task +name: open-r1-grpo + +# Size of the cluster +nodes: 2 + +python: 3.12 + +nvcc: true + +# Required environment variables +env: + - HF_TOKEN + - HUB_MODEL_ID + - WANDB_API_KEY + - NCCL_DEBUG=INFO + # VLLM configuration + - USE_VLLM=true + - MODEL=Qwen/Qwen2.5-Coder-7B-Instruct + - TP=4 + - DP=2 + +# Commands of the task +commands: + - uv pip install vllm==0.8.5.post1 + - uv pip install setuptools + - uv pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl + - git clone https://github.com/huggingface/open-r1.git + - cd open-r1 + - uv pip install . + - | + # Get the last IP from DSTACK_NODES_IPS for vLLM node + VLLM_HOST=$(echo $DSTACK_NODES_IPS | tr ' ' '\n' | tail -n 1) + echo "VLLM host IP (last node): $VLLM_HOST" + + if [ "$USE_VLLM" = "true" ]; then + if [ "$DSTACK_NODE_RANK" -eq $(($DSTACK_NODES_NUM - 1)) ]; then + # Last Node runs VLLM server + echo "Starting VLLM server on Last Node (IP: $VLLM_HOST)" + trl vllm-serve --model $MODEL --tensor_parallel_size $TP --data_parallel_size $DP --host 0.0.0.0 + else + # Training node - adjust world size and nodes count for training + GPUS_PER_NODE=$(($DSTACK_GPUS_NUM / $DSTACK_NODES_NUM)) + ADJUSTED_NODES_NUM=$(($DSTACK_NODES_NUM - 1)) + ADJUSTED_GPUS_TOTAL=$(($GPUS_PER_NODE * $ADJUSTED_NODES_NUM)) + # Other nodes run training + echo "Starting training with VLLM on $VLLM_HOST" + accelerate launch --config_file recipes/accelerate_configs/zero3.yaml \ + --num_processes=$ADJUSTED_GPUS_TOTAL \ + --num_machines=$ADJUSTED_NODES_NUM \ + --machine_rank=$DSTACK_NODE_RANK \ + --main_process_ip=$DSTACK_MASTER_NODE_IP \ + --main_process_port=8008 \ + src/open_r1/grpo.py \ + --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml \ + --model_name_or_path $MODEL \ + --output_dir /checkpoints/Qwen2.5-Coder-7B-Instruct-GRPO \ + --hub_model_id $HUB_MODEL_ID \ + --vllm_server_host=$VLLM_HOST + fi + 
else + # Standard training mode without VLLM + echo "Running standard training without VLLM" + accelerate launch --config_file recipes/accelerate_configs/zero3.yaml \ + --num_processes=$DSTACK_GPUS_NUM \ + --num_machines=$DSTACK_NODES_NUM \ + --machine_rank=$DSTACK_NODE_RANK \ + --main_process_ip=$DSTACK_MASTER_NODE_IP \ + --main_process_port=8008 \ + src/open_r1/grpo.py \ + --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml \ + --model_name_or_path $MODEL \ + --output_dir /checkpoints/Qwen2.5-Coder-7B-Instruct-GRPO \ + --hub_model_id $HUB_MODEL_ID \ + --use_vllm false + fi + +resources: + gpu: 80GB:8 + shm_size: 128GB + +volumes: + - /checkpoints:/checkpoints +``` +
+ +!!! info "NOTE:" + 1. When `nodes: N` and `USE_VLLM=true`, (N-1) nodes will be used for distributed training, while last node will be used for vLLM inference. + 2. When `USE_VLLM=false`, training happens across all nodes. + 3. The number of `attention heads` of the model should be divisible by `TP` and `DP` values. + 4. We pin the `flash-attn` to `2.7.4.post1` for compatibility with `vllm=0.8.5.post1`, as latest vLLM causes TRL's GRPO to fail. See [issue #3608](https://github.com/huggingface/trl/issues/3608) for details. + + +### Apply the configuration + +To run a configuration, use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command. + +
+ +```shell +$ HF_TOKEN=... +$ HUB_MODEL_ID=... +$ WANDB_API_KEY=... +$ dstack apply -f examples/distributed-training/open-r1/.dstack.yml + + # BACKEND RESOURCES INSTANCE TYPE PRICE + 1 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle + 2 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle + +Submit the run open-r1-grpo? [y/n]: y + +Provisioning... +---> 100% +``` +
+ +## Source code + +The source-code of this example can be found in +[`examples/distributed-training/open-r1` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/distributed-training/open-r1){:target="_blank"}. + +!!! info "What's next?" + 1. Read the [clusters](https://dstack.ai/docs/guides/clusters) guide + 2. Check [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), + [services](https://dstack.ai/docs/concepts/services), and [fleets](https://dstack.ai/docs/concepts/fleets) + diff --git a/mkdocs.yml b/mkdocs.yml index 5bb5718c67..aa70e85db0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -268,6 +268,7 @@ nav: - TRL: examples/distributed-training/trl/index.md - Axolotl: examples/distributed-training/axolotl/index.md - Ray+RAGEN: examples/distributed-training/ray-ragen/index.md + - Open-R1: examples/distributed-training/open-r1/index.md - Clusters: - NCCL tests: examples/clusters/nccl-tests/index.md - RCCL tests: examples/clusters/rccl-tests/index.md From c88164f199a93bceac537dfaf6eb776a278c51a4 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Mon, 30 Jun 2025 12:12:49 +0545 Subject: [PATCH 2/2] Resolve Review Comments --- examples/distributed-training/open-r1/.dstack.yml | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/distributed-training/open-r1/.dstack.yml b/examples/distributed-training/open-r1/.dstack.yml index 3c3ca3f278..dda383a3d6 100644 --- a/examples/distributed-training/open-r1/.dstack.yml +++ b/examples/distributed-training/open-r1/.dstack.yml @@ -29,22 +29,17 @@ commands: - cd open-r1 - uv pip install . 
- | - # Get the last IP from DSTACK_NODES_IPS for vLLM node - VLLM_HOST=$(echo $DSTACK_NODES_IPS | tr ' ' '\n' | tail -n 1) - echo "VLLM host IP (last node): $VLLM_HOST" - if [ "$USE_VLLM" = "true" ]; then + # Get the last IP from DSTACK_NODES_IPS for vLLM node + VLLM_HOST=$(echo $DSTACK_NODES_IPS | tr ' ' '\n' | tail -n 1) if [ "$DSTACK_NODE_RANK" -eq $(($DSTACK_NODES_NUM - 1)) ]; then # Last Node runs VLLM server - echo "Starting VLLM server on Last Node (IP: $VLLM_HOST)" trl vllm-serve --model $MODEL --tensor_parallel_size $TP --data_parallel_size $DP --host 0.0.0.0 else # Training node - adjust world size and nodes count for training - GPUS_PER_NODE=$(($DSTACK_GPUS_NUM / $DSTACK_NODES_NUM)) ADJUSTED_NODES_NUM=$(($DSTACK_NODES_NUM - 1)) - ADJUSTED_GPUS_TOTAL=$(($GPUS_PER_NODE * $ADJUSTED_NODES_NUM)) + ADJUSTED_GPUS_TOTAL=$(($DSTACK_GPUS_PER_NODE * $ADJUSTED_NODES_NUM)) # Other nodes run training - echo "Starting training with VLLM on $VLLM_HOST" accelerate launch --config_file recipes/accelerate_configs/zero3.yaml \ --num_processes=$ADJUSTED_GPUS_TOTAL \ --num_machines=$ADJUSTED_NODES_NUM \