Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

- [ ] Search for similar PRs. Paste at least one query link here: ...
- [ ] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI)
- `{modules}` include `fsdp`, `megatron`, `veomni`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward`, `fully_async`, `one_step_off`
- `{modules}` include `fsdp`, `megatron`, `veomni`, `sglang`, `vllm`, `vllm_omni`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward`, `fully_async`, `one_step_off`
- If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]`
- `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test`
- If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
Expand Down
17 changes: 16 additions & 1 deletion .github/workflows/vllm_omni.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,12 @@ on:
- ".github/workflows/vllm_omni.yml"
- "tests/workers/rollout/rollout_vllm/test_vllm_omni_generate.py"
- "tests/experimental/agent_loop/test_diffusion_agent_loop.py"
- "tests/special_e2e/run_flowgrpo_diffusion.sh"
- "tests/special_e2e/create_dummy_diffusion_data.py"
- "verl/workers/rollout/vllm_rollout/vllm_omni_async_server.py"
- "verl/trainer/diffusion/ray_diffusion_trainer.py"
- "verl/trainer/main_flowgrpo.py"
- "verl/trainer/config/diffusion_trainer.yaml"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
Expand Down Expand Up @@ -95,7 +100,7 @@ jobs:
vllm_omni:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 35 # Increase this timeout value as needed
timeout-minutes: 50 # Increased to accommodate e2e FlowGRPO training
env:
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
Expand All @@ -120,6 +125,16 @@ jobs:
run: |
ray stop --force
pytest tests/experimental/agent_loop/test_diffusion_agent_loop.py -v -s
- name: Install diffusers for e2e training
run: |
pip3 install diffusers==0.37.0
- name: Prepare dummy diffusion dataset
run: |
python3 tests/special_e2e/create_dummy_diffusion_data.py
- name: E2E FlowGRPO diffusion training
run: |
ray stop --force
bash tests/special_e2e/run_flowgrpo_diffusion.sh

cleanup:
runs-on: ubuntu-latest
Expand Down
103 changes: 103 additions & 0 deletions examples/flowgrpo_trainer/data_process/qwenimage_ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Copyright 2026 Bytedance Ltd. and/or its affiliates
Comment thread
AndyZhou952 marked this conversation as resolved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the OCR dataset to parquet format (for Qwen-Image training).
You can obtain the raw dataset from https://github.com/yifan123/flow_grpo/tree/main/dataset/ocr
"""

import argparse
import os

import datasets

from verl.utils.hdfs_io import copy, makedirs


def extract_solution(solution_str):
    """Return the text inside the first pair of double quotes in *solution_str*.

    Raw samples are stored in the form ``The image displays "xxx".``; the
    quoted part (``xxx``) is the OCR ground truth.

    Args:
        solution_str: The raw caption text of one sample.

    Returns:
        The substring between the first and the second double quote. If the
        opening quote is never closed, everything after it is returned.

    Raises:
        IndexError: If *solution_str* contains no double quote at all. The
            message includes the offending text so the bad sample can be found.
    """
    _, opening_quote, remainder = solution_str.partition('"')
    if not opening_quote:
        # Same exception type a bare ``split('"')[1]`` would raise, but with a
        # message that identifies which sample is malformed.
        raise IndexError(f"no double-quoted ground truth found in: {solution_str!r}")
    return remainder.partition('"')[0]


if __name__ == "__main__":
    # Command-line interface. ``--local_dir`` is deprecated in favour of
    # ``--local_save_dir`` (a warning is printed below when it is used).
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None)
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument(
        "--local_dataset_path", default="~/dataset/ocr/", help="The local path to the raw dataset, if it exists."
    )
    parser.add_argument(
        "--local_save_dir", default="~/data/ocr", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()

    data_source = "flow_grpo/ocr"

    # Always bind ``local_dataset_path`` so that a missing path reaches the
    # NotImplementedError below instead of failing with a NameError.
    local_dataset_path = None
    if args.local_dataset_path is not None:
        local_dataset_path = os.path.expanduser(args.local_dataset_path)

    if local_dataset_path is None:
        raise NotImplementedError(
            "It is not existed in huggingface hub. "
            "Please get dataset from https://github.com/yifan123/flow_grpo/tree/main/dataset/ocr"
        )
    dataset = datasets.load_dataset(local_dataset_path)

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    system_prompt = (
        "Describe the image by detailing the color, shape, size, "
        "texture, quantity, text, spatial relationships of the objects and background:"
    )
    negative_user_prompt = " "

    def make_map_fn(split):
        """Build the per-example mapper for the given split name."""

        def process_fn(example, idx):
            # The raw caption is used as the user prompt; the quoted part of
            # it is the ground truth handed to the reward model.
            text = example.pop("text")
            solution = extract_solution(text)
            data = {
                "data_source": data_source,
                "prompt": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": text},
                ],
                "negative_prompt": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": negative_user_prompt},
                ],
                "ability": "ocr",
                "reward_model": {"style": "model", "ground_truth": solution},
                "extra_info": {"split": split, "index": idx},
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    # Resolve "~" ourselves (the default is "~/data/ocr") and make sure the
    # directory exists, rather than relying on downstream libraries to
    # interpret a tilde path or create missing parents.
    local_save_dir = os.path.expanduser(local_save_dir)
    os.makedirs(local_save_dir, exist_ok=True)

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

    # Optionally mirror the local output directory to HDFS.
    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_save_dir, dst=hdfs_dir)
73 changes: 73 additions & 0 deletions examples/flowgrpo_trainer/run_flowgrpo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env bash
# FlowGRPO training of Qwen-Image with LoRA, using vllm_omni for rollout and a
# vLLM-served Qwen3-VL-8B-Instruct reward model (OCR reward).
#
# This variant enables the KL loss (use_kl_loss=True, kl_loss_coef=0.04) and
# samples with guidance_scale=4.0.
#
# Expects:
#   - $HOME/data/ocr/{train,test}.parquet
#   - $HOME/models/Qwen/Qwen-Image and $HOME/models/Qwen/Qwen3-VL-8B-Instruct
#   - to be launched from the repository root (reward_path is relative)
#
# Extra Hydra overrides can be appended on the command line; they are forwarded
# verbatim to the trainer.
set -x

ocr_train_path=$HOME/data/ocr/train.parquet
ocr_test_path=$HOME/data/ocr/test.parquet

ENGINE=vllm_omni
REWARD_ENGINE=vllm

reward_path=tests/experimental/reward_loop/reward_fn.py
reward_model_name=$HOME/models/Qwen/Qwen3-VL-8B-Instruct


python3 -m verl.trainer.main_flowgrpo \
    algorithm.adv_estimator=flow_grpo \
    data.train_files="$ocr_train_path" \
    data.val_files="$ocr_test_path" \
    data.train_batch_size=32 \
    data.max_prompt_length=1058 \
    data.filter_overlong_prompts=True \
    +data.apply_chat_template_kwargs.max_length=1058 \
    +data.apply_chat_template_kwargs.padding=True \
    +data.apply_chat_template_kwargs.truncation=True \
    actor_rollout_ref.model.path="$HOME/models/Qwen/Qwen-Image" \
    actor_rollout_ref.model.tokenizer_path="$HOME/models/Qwen/Qwen-Image/tokenizer" \
    actor_rollout_ref.model.lora_rank=64 \
    actor_rollout_ref.model.lora_alpha=128 \
    actor_rollout_ref.model.target_modules="['to_q','to_k','to_v','to_out.0','add_q_proj','add_k_proj','add_v_proj','to_add_out','img_mlp.net.0.proj','img_mlp.net.2','txt_mlp.net.0.proj','txt_mlp.net.2']" \
    actor_rollout_ref.actor.optim.lr=3e-4 \
    actor_rollout_ref.actor.optim.weight_decay=0.0001 \
    actor_rollout_ref.actor.ppo_mini_batch_size=16 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.actor.fsdp_config.param_offload=True \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
    actor_rollout_ref.actor.fsdp_config.model_dtype=bfloat16 \
    actor_rollout_ref.actor.policy_loss.loss_mode=flow_grpo \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.04 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.name="$ENGINE" \
    actor_rollout_ref.rollout.n=16 \
    actor_rollout_ref.rollout.guidance_scale=4.0 \
    actor_rollout_ref.rollout.agent.default_agent_loop=diffusion_single_turn_agent \
    actor_rollout_ref.rollout.agent.num_workers=4 \
    actor_rollout_ref.rollout.load_format=safetensors \
    actor_rollout_ref.rollout.layered_summon=True \
    actor_rollout_ref.rollout.max_model_len=1058 \
    actor_rollout_ref.rollout.noise_level=1.2 \
    actor_rollout_ref.rollout.sde_window_size=2 \
    actor_rollout_ref.rollout.sde_window_range="[0,5]" \
    actor_rollout_ref.rollout.val_kwargs.num_inference_steps=50 \
    +actor_rollout_ref.rollout.engine_kwargs.vllm_omni.custom_pipeline=verl.utils.vllm_omni.pipelines.QwenImagePipelineWithLogProb \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
    reward.num_workers=4 \
    reward.reward_manager.name=image \
    reward.reward_model.enable=True \
    reward.reward_model.model_path="$reward_model_name" \
    reward.reward_model.rollout.name="$REWARD_ENGINE" \
    reward.reward_model.rollout.tensor_model_parallel_size=4 \
    reward.custom_reward_function.path="$reward_path" \
    reward.custom_reward_function.name=compute_score_ocr \
    trainer.use_legacy_worker_impl=disable \
    trainer.logger='["console", "wandb"]' \
    trainer.project_name=flow_grpo \
    trainer.experiment_name=qwen_image_ocr \
    trainer.log_val_generations=8 \
    trainer.val_before_train=False \
    trainer.n_gpus_per_node=4 \
    trainer.nnodes=1 \
    trainer.save_freq=30 \
    trainer.test_freq=30 \
    trainer.total_epochs=15 "$@"
77 changes: 77 additions & 0 deletions examples/flowgrpo_trainer/run_flowgrpo_async_reward.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env bash
# FlowGRPO training of Qwen-Image with LoRA, using vllm_omni for rollout and a
# vLLM-served Qwen3-VL-8B-Instruct reward model (OCR reward).
#
# This variant turns on reward.reward_model.enable_resource_pool with
# nnodes=1 / n_gpus_per_node=1 for the reward model, and samples with
# guidance_scale=1.0.
# NOTE(review): presumably this gives the reward model its own 1-GPU pool next
# to the 4 trainer GPUs (5 GPUs in total) - confirm against the trainer config.
#
# Expects:
#   - $HOME/data/ocr/{train,test}.parquet
#   - $HOME/models/Qwen/Qwen-Image and $HOME/models/Qwen/Qwen3-VL-8B-Instruct
#   - to be launched from the repository root (reward_path is relative)
#
# Extra Hydra overrides can be appended on the command line; they are forwarded
# verbatim to the trainer.
set -x

ocr_train_path=$HOME/data/ocr/train.parquet
ocr_test_path=$HOME/data/ocr/test.parquet

ENGINE=vllm_omni
REWARD_ENGINE=vllm

reward_path=tests/experimental/reward_loop/reward_fn.py
reward_model_name=$HOME/models/Qwen/Qwen3-VL-8B-Instruct


python3 -m verl.trainer.main_flowgrpo \
    algorithm.adv_estimator=flow_grpo \
    data.train_files="$ocr_train_path" \
    data.val_files="$ocr_test_path" \
    data.train_batch_size=32 \
    data.max_prompt_length=1058 \
    data.filter_overlong_prompts=True \
    +data.apply_chat_template_kwargs.max_length=1058 \
    +data.apply_chat_template_kwargs.padding=True \
    +data.apply_chat_template_kwargs.truncation=True \
    actor_rollout_ref.model.path="$HOME/models/Qwen/Qwen-Image" \
    actor_rollout_ref.model.tokenizer_path="$HOME/models/Qwen/Qwen-Image/tokenizer" \
    actor_rollout_ref.model.lora_rank=64 \
    actor_rollout_ref.model.lora_alpha=128 \
    actor_rollout_ref.model.target_modules="['to_q','to_k','to_v','to_out.0','add_q_proj','add_k_proj','add_v_proj','to_add_out','img_mlp.net.0.proj','img_mlp.net.2','txt_mlp.net.0.proj','txt_mlp.net.2']" \
    actor_rollout_ref.actor.optim.lr=3e-4 \
    actor_rollout_ref.actor.optim.weight_decay=0.0001 \
    actor_rollout_ref.actor.ppo_mini_batch_size=16 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.actor.fsdp_config.param_offload=True \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
    actor_rollout_ref.actor.fsdp_config.model_dtype=bfloat16 \
    actor_rollout_ref.actor.policy_loss.loss_mode=flow_grpo \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.name="$ENGINE" \
    actor_rollout_ref.rollout.n=16 \
    actor_rollout_ref.rollout.guidance_scale=1.0 \
    actor_rollout_ref.rollout.agent.default_agent_loop=diffusion_single_turn_agent \
    actor_rollout_ref.rollout.agent.num_workers=4 \
    actor_rollout_ref.rollout.load_format=safetensors \
    actor_rollout_ref.rollout.layered_summon=True \
    actor_rollout_ref.rollout.max_model_len=1058 \
    actor_rollout_ref.rollout.noise_level=1.2 \
    actor_rollout_ref.rollout.sde_window_size=2 \
    actor_rollout_ref.rollout.sde_window_range="[0,5]" \
    actor_rollout_ref.rollout.val_kwargs.num_inference_steps=50 \
    +actor_rollout_ref.rollout.engine_kwargs.vllm_omni.custom_pipeline=verl.utils.vllm_omni.pipelines.QwenImagePipelineWithLogProb \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
    reward.num_workers=4 \
    reward.reward_manager.name=image \
    reward.reward_model.enable=True \
    reward.reward_model.model_path="$reward_model_name" \
    reward.reward_model.rollout.name="$REWARD_ENGINE" \
    reward.reward_model.enable_resource_pool=True \
    reward.reward_model.nnodes=1 \
    reward.reward_model.n_gpus_per_node=1 \
    reward.reward_model.rollout.gpu_memory_utilization=0.9 \
    reward.reward_model.rollout.free_cache_engine=False \
    reward.reward_model.rollout.tensor_model_parallel_size=1 \
    reward.reward_model.rollout.enforce_eager=False \
    reward.custom_reward_function.path="$reward_path" \
    reward.custom_reward_function.name=compute_score_ocr \
    trainer.use_legacy_worker_impl=disable \
    trainer.logger='["console", "wandb"]' \
    trainer.project_name=flow_grpo \
    trainer.experiment_name=qwen_image_ocr \
    trainer.log_val_generations=8 \
    trainer.val_before_train=False \
    trainer.n_gpus_per_node=4 \
    trainer.nnodes=1 \
    trainer.save_freq=30 \
    trainer.test_freq=30 \
    trainer.total_epochs=15 "$@"
71 changes: 71 additions & 0 deletions examples/flowgrpo_trainer/run_flowgrpo_fast.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/usr/bin/env bash
# FlowGRPO training of Qwen-Image with LoRA, using vllm_omni for rollout and a
# vLLM-served Qwen3-VL-8B-Instruct reward model (OCR reward).
#
# This variant sets no KL-loss options and samples with guidance_scale=1.0.
#
# Expects:
#   - $HOME/data/ocr/{train,test}.parquet
#   - $HOME/models/Qwen/Qwen-Image and $HOME/models/Qwen/Qwen3-VL-8B-Instruct
#   - to be launched from the repository root (reward_path is relative)
#
# Extra Hydra overrides can be appended on the command line; they are forwarded
# verbatim to the trainer.
set -x

ocr_train_path=$HOME/data/ocr/train.parquet
ocr_test_path=$HOME/data/ocr/test.parquet

ENGINE=vllm_omni
REWARD_ENGINE=vllm

reward_path=tests/experimental/reward_loop/reward_fn.py
reward_model_name=$HOME/models/Qwen/Qwen3-VL-8B-Instruct


python3 -m verl.trainer.main_flowgrpo \
    algorithm.adv_estimator=flow_grpo \
    data.train_files="$ocr_train_path" \
    data.val_files="$ocr_test_path" \
    data.train_batch_size=32 \
    data.max_prompt_length=1058 \
    data.filter_overlong_prompts=True \
    +data.apply_chat_template_kwargs.max_length=1058 \
    +data.apply_chat_template_kwargs.padding=True \
    +data.apply_chat_template_kwargs.truncation=True \
    actor_rollout_ref.model.path="$HOME/models/Qwen/Qwen-Image" \
    actor_rollout_ref.model.tokenizer_path="$HOME/models/Qwen/Qwen-Image/tokenizer" \
    actor_rollout_ref.model.lora_rank=64 \
    actor_rollout_ref.model.lora_alpha=128 \
    actor_rollout_ref.model.target_modules="['to_q','to_k','to_v','to_out.0','add_q_proj','add_k_proj','add_v_proj','to_add_out','img_mlp.net.0.proj','img_mlp.net.2','txt_mlp.net.0.proj','txt_mlp.net.2']" \
    actor_rollout_ref.actor.optim.lr=3e-4 \
    actor_rollout_ref.actor.optim.weight_decay=0.0001 \
    actor_rollout_ref.actor.ppo_mini_batch_size=16 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.actor.fsdp_config.param_offload=True \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
    actor_rollout_ref.actor.fsdp_config.model_dtype=bfloat16 \
    actor_rollout_ref.actor.policy_loss.loss_mode=flow_grpo \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.name="$ENGINE" \
    actor_rollout_ref.rollout.n=16 \
    actor_rollout_ref.rollout.guidance_scale=1.0 \
    actor_rollout_ref.rollout.agent.default_agent_loop=diffusion_single_turn_agent \
    actor_rollout_ref.rollout.agent.num_workers=4 \
    actor_rollout_ref.rollout.load_format=safetensors \
    actor_rollout_ref.rollout.layered_summon=True \
    actor_rollout_ref.rollout.max_model_len=1058 \
    actor_rollout_ref.rollout.noise_level=1.2 \
    actor_rollout_ref.rollout.sde_window_size=2 \
    actor_rollout_ref.rollout.sde_window_range="[0,5]" \
    actor_rollout_ref.rollout.val_kwargs.num_inference_steps=50 \
    +actor_rollout_ref.rollout.engine_kwargs.vllm_omni.custom_pipeline=verl.utils.vllm_omni.pipelines.QwenImagePipelineWithLogProb \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
    reward.num_workers=4 \
    reward.reward_manager.name=image \
    reward.reward_model.enable=True \
    reward.reward_model.model_path="$reward_model_name" \
    reward.reward_model.rollout.name="$REWARD_ENGINE" \
    reward.reward_model.rollout.tensor_model_parallel_size=4 \
    reward.custom_reward_function.path="$reward_path" \
    reward.custom_reward_function.name=compute_score_ocr \
    trainer.use_legacy_worker_impl=disable \
    trainer.logger='["console", "wandb"]' \
    trainer.project_name=flow_grpo \
    trainer.experiment_name=qwen_image_ocr \
    trainer.log_val_generations=8 \
    trainer.val_before_train=False \
    trainer.n_gpus_per_node=4 \
    trainer.nnodes=1 \
    trainer.save_freq=30 \
    trainer.test_freq=30 \
    trainer.total_epochs=15 "$@"
Loading