Merged
110 commits
8c1516c
runs but not limited master_port
bowenyang008 Jun 25, 2025
b0b57d9
hack local rank
bowenyang008 Jun 25, 2025
9e70dbb
composer launch works; torchrun somehow only allows use the same init…
bowenyang008 Jun 25, 2025
bc72f3e
clean up
bowenyang008 Jun 25, 2025
461c8c4
timeout to 30s
bowenyang008 Jun 25, 2025
e7803bb
update script
bowenyang008 Jun 25, 2025
feb62d1
None evn
bowenyang008 Jun 25, 2025
237373c
change internval
bowenyang008 Jun 25, 2025
4193ee3
try break and nodes instead
bowenyang008 Jun 25, 2025
fd63dcb
condition
bowenyang008 Jun 25, 2025
5c3f61d
rm resources
bowenyang008 Jun 25, 2025
ff227c4
sleep it
bowenyang008 Jun 25, 2025
172c676
use dist barrier to block
bowenyang008 Jun 26, 2025
f417e5b
context manager
bowenyang008 Jun 26, 2025
b29ba44
try to not release port
bowenyang008 Jun 26, 2025
9a30028
use ray actor
bowenyang008 Jun 26, 2025
12fbcef
another way to get ip address
bowenyang008 Jun 26, 2025
4ad2e65
ray init
bowenyang008 Jun 26, 2025
8df20ec
ray remote
bowenyang008 Jun 26, 2025
76c73a3
barrier
bowenyang008 Jun 26, 2025
8f51968
half subprocess
bowenyang008 Jun 26, 2025
a00369c
try w/o port
bowenyang008 Jun 26, 2025
a777c54
claude fix; questionable
bowenyang008 Jun 26, 2025
dd97cf2
do not raise
bowenyang008 Jun 26, 2025
09c92ed
revert back to subprocess
bowenyang008 Jun 27, 2025
cef2b9e
mix init
bowenyang008 Jun 27, 2025
9d93001
manual stop
bowenyang008 Jun 27, 2025
9dc7a74
two gpus runs
bowenyang008 Jun 27, 2025
c3db295
tensor parallel size 8
bowenyang008 Jun 30, 2025
824f086
update world size
bowenyang008 Jun 30, 2025
443d639
try all gpus for train again
bowenyang008 Jun 30, 2025
fde2960
use old port assignment
bowenyang008 Jun 30, 2025
8b625aa
get freeport again
bowenyang008 Jul 1, 2025
52bb8aa
change order again
bowenyang008 Jul 1, 2025
934ef7a
rank 0 again
bowenyang008 Jul 1, 2025
04217ab
half trainers
bowenyang008 Jul 1, 2025
f68c703
import; new method
bowenyang008 Jul 1, 2025
1fda4e3
no import
bowenyang008 Jul 1, 2025
6daa40c
import seems the killer
bowenyang008 Jul 1, 2025
42ab521
import vllm
bowenyang008 Jul 1, 2025
c86b2df
change rend time to 30s
bowenyang008 Jul 1, 2025
160d861
def method
bowenyang008 Jul 1, 2025
8eafdc6
add back vllm init
bowenyang008 Jul 1, 2025
0dc9f95
rm dup code
bowenyang008 Jul 1, 2025
d3c1a7c
comment out worker node
bowenyang008 Jul 1, 2025
8eb0116
running generations
bowenyang008 Jul 1, 2025
231674d
weight update donw
bowenyang008 Jul 2, 2025
d296b4e
not sync weight
bowenyang008 Jul 2, 2025
fe582a6
revert back
bowenyang008 Jul 2, 2025
f44f469
call master only
bowenyang008 Jul 2, 2025
01862b9
update pyproject for pyright filtering
bowenyang008 Jul 4, 2025
b7ebe7d
temp single controller
bowenyang008 Jul 4, 2025
1916fcd
separate out roundtrip code
bowenyang008 Jul 11, 2025
d8d4816
relocate to use test
bowenyang008 Jul 11, 2025
9d9850b
ref built
bowenyang008 Jul 11, 2025
8352a08
build ppo trainer
bowenyang008 Jul 11, 2025
4f093a4
weight update name mismatch; inference and prompts/gen exchange and t…
bowenyang008 Jul 11, 2025
88758c9
update ref model build
bowenyang008 Jul 12, 2025
1871b12
trains e2e!
bowenyang008 Jul 12, 2025
d2eb8f4
clean up imports
bowenyang008 Jul 15, 2025
567e37b
recover assert
bowenyang008 Jul 17, 2025
eb5b5c6
Merge remote-tracking branch 'origin/main' into boweny/playground
bowenyang008 Jul 17, 2025
f06dcfa
add test files
bowenyang008 Jul 18, 2025
133dac2
Merge branch 'main' into boweny/single-controller-composer
bowenyang008 Jul 22, 2025
da377bd
clean up dir
bowenyang008 Jul 22, 2025
3240715
refactor base actor
bowenyang008 Jul 22, 2025
861768b
rel method
bowenyang008 Jul 22, 2025
d0b5a44
callback fails
bowenyang008 Jul 23, 2025
ec7e550
revive ppo training again
bowenyang008 Jul 23, 2025
f7952b4
clean up
bowenyang008 Jul 23, 2025
95b96ef
first round code reduction done
bowenyang008 Jul 23, 2025
dee910b
another rm
bowenyang008 Jul 23, 2025
1e0b07a
train again; trim down
bowenyang008 Jul 23, 2025
b23b80e
recover weight update
bowenyang008 Jul 23, 2025
2f8c946
share method
bowenyang008 Jul 23, 2025
2966ccf
Revert "share method"
bowenyang008 Jul 23, 2025
fb98225
rm files
bowenyang008 Jul 23, 2025
be08a75
docs
bowenyang008 Jul 24, 2025
7dac060
doc
bowenyang008 Jul 24, 2025
d4a76e1
actor group
bowenyang008 Jul 25, 2025
24d2508
no more explicit master addr
bowenyang008 Jul 25, 2025
e37d2ec
all class ready
bowenyang008 Jul 25, 2025
7feb36c
run cmd
bowenyang008 Jul 25, 2025
c25bebc
use device direclty
bowenyang008 Jul 25, 2025
7195e4e
clean up to last method
bowenyang008 Jul 25, 2025
f28b8c8
clean up
bowenyang008 Jul 25, 2025
d180533
mv vllm engines out
bowenyang008 Jul 25, 2025
5c18c99
yeah works
bowenyang008 Jul 25, 2025
369e58e
relocate file and pytest works
bowenyang008 Jul 25, 2025
5a69a15
rm file
bowenyang008 Jul 25, 2025
5e2ebc2
format
bowenyang008 Jul 26, 2025
9e21f3f
Merge remote-tracking branch 'origin/main' into boweny/single-control…
bowenyang008 Jul 26, 2025
470bef9
format
bowenyang008 Jul 26, 2025
44b6609
type ignore
bowenyang008 Jul 28, 2025
0662810
todo
bowenyang008 Jul 28, 2025
ab4faad
different type fix
bowenyang008 Jul 28, 2025
7ff5867
todos
bowenyang008 Jul 28, 2025
092da43
doc fix
bowenyang008 Jul 28, 2025
6c5f216
format
bowenyang008 Jul 28, 2025
00f32f3
change gpu test to 2
bowenyang008 Jul 28, 2025
7094033
format
bowenyang008 Jul 28, 2025
59305a6
revert 4 gpu for now, regression does not like this test
bowenyang008 Jul 28, 2025
47d9b5f
revert 2 gpu for now
bowenyang008 Jul 28, 2025
9cb6e98
todo
bowenyang008 Jul 28, 2025
0b5d911
use 3.11 and update doc formatter
bowenyang008 Jul 28, 2025
e268626
revert change
bowenyang008 Jul 28, 2025
e9e9b3c
try diff
bowenyang008 Jul 29, 2025
6a513e4
yapf again
bowenyang008 Jul 29, 2025
81fce8c
todo
bowenyang008 Jul 29, 2025
eb889f1
revert
bowenyang008 Jul 29, 2025
3 changes: 3 additions & 0 deletions compose_rl/algorithms/online/__init__.py
@@ -19,6 +19,8 @@
     HFPolicyConfig,
     MPTPolicyConfig,
 )
+from compose_rl.algorithms.online.single_controller_callback import \
+    SingleControllerOnPolicyCallback
 from compose_rl.registry import kl_controllers
 
 kl_controllers.register('adaptive', func=AdaptiveKLController)
@@ -28,6 +30,7 @@
 
 __all__ = [
     'OnPolicyCallback',
+    'SingleControllerOnPolicyCallback',
     'ComposerMPTPolicyLM',
     'ComposerHFPolicyLM',
     'ComposerHFCriticFreePolicyLM',
20 changes: 16 additions & 4 deletions compose_rl/algorithms/online/callback.py
@@ -764,7 +764,6 @@ def _interact_with_env(self, batch: dict[str, torch.Tensor]):
         # When we hit this function, we should already have all the prompts we need per iteration.
         num_gen_calls = bs // self.device_generate_batch_size
 
-        gen_batch_partial_outputs = []
         all_sequences = []
         for i in range(num_gen_calls):
             gen_batch = self._extract_minibatch(
@@ -796,6 +795,15 @@ def _interact_with_env(self, batch: dict[str, torch.Tensor]):
         # Add the prepared sequences to the batch again
         batch['sequences'] = sequences
 
+        # Compute rewards and populate buffer
+        self._get_reward(batch)
+
+    def _get_reward(self, batch: dict[str, torch.Tensor]):
+        """Compute rewards for a batch of generated sequences.
+
+        Args:
+            batch (dict): The batch containing generated sequences to compute rewards for.
+        """
         env_outputs, prompts_and_gens, ref_outputs, all_rewards_dict = env_reward(
             actor_critic=self.actor_critic,  # pyright: ignore
             reward_manager=self.reward_manager,
@@ -825,7 +833,9 @@ def _interact_with_env(self, batch: dict[str, torch.Tensor]):
             del resolved_outputs[key]
 
         # We need to split the resolved outputs into minibatches
-        for idx in range(bs // self.device_train_batch_size):
+        for idx in range(
+            batch['prompt_id'].shape[0] // self.device_train_batch_size,
+        ):
             minibatch = self._extract_minibatch(
                 resolved_outputs,
                 idx,
@@ -834,7 +844,9 @@ def _interact_with_env(self, batch: dict[str, torch.Tensor]):
             self.buffer.add(minibatch)
 
         # Making sure we correctly parsed the minibatches
-        assert len(self.buffer) == self.num_batches_per_update
+        assert len(
+            self.buffer,
+        ) == self.num_batches_per_update, f'{len(self.buffer)} != {self.num_batches_per_update}'
 
         self.actor_critic.train()

@@ -1149,7 +1161,7 @@ def _update_inference_model(self, batch: dict[str, torch.Tensor]):
             model=self.actor_critic,
             vllm_engines=self.vllm_engines,
             model_update_group=self.model_update_group,
-            batch=batch,
+            device=batch['prompt'].device,
             loss_type=self.actor_critic.loss_type,  # type: ignore
             enable_prefix_caching=self.vllm_enable_prefix_caching,
         )
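Because `_get_reward` now runs outside `_interact_with_env` (where the local `bs` lived), the minibatch count is derived from `batch['prompt_id'].shape[0]` instead. A minimal sketch of that slicing, with hypothetical shapes and a stand-in for `_extract_minibatch`:

```python
import torch

device_train_batch_size = 2
batch = {'prompt_id': torch.arange(8)}  # hypothetical: 8 rollouts on this device

# One minibatch per device_train_batch_size-sized slice of the rollouts.
num_minibatches = batch['prompt_id'].shape[0] // device_train_batch_size
for idx in range(num_minibatches):
    start = idx * device_train_batch_size
    # Stand-in for self._extract_minibatch(resolved_outputs, idx, ...)
    minibatch = {
        k: v[start:start + device_train_batch_size] for k, v in batch.items()
    }

assert num_minibatches == 4  # matches len(buffer) after all adds
```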
19 changes: 8 additions & 11 deletions compose_rl/algorithms/online/generation_utils/vllm_utils.py
@@ -381,7 +381,7 @@ def broadcast_to_vllm(
     model: nn.Module,
     vllm_engines: list,
     model_update_group: Optional[torch.distributed.ProcessGroup],
-    batch: dict[str, torch.Tensor],
+    device: torch.device,
     loss_type: OnPolicyEnum = OnPolicyEnum.PPO,
     enable_prefix_caching: bool = False,
 ):
@@ -391,7 +391,7 @@
         model (nn.Module): The model to broadcast
         vllm_engines (list): List of vllm engines
         model_update_group (torch.distributed.ProcessGroup): The process group for model updates
-        batch (dict[str, torch.Tensor]): The batch to use for the forward pass
+        device (torch.device): The device to use for the forward pass
         loss_type (str): The loss type which decides whether to use critic-free or not. Defaults to `ppo`.
         enable_prefix_caching (bool): Whether to enable prefix caching. Defaults to `False`.
     """
@@ -419,9 +419,6 @@
         engine.reset_prefix_cache.remote() for engine in vllm_engines
     ]
 
-    # This is needed to get the correct model device
-    cur_device = batch['prompt'].device
-
     # These apply to llama modules, it might change for other modules
     valid_non_leaf_module_names = [
         'model.embed_tokens.weight',
@@ -438,17 +435,17 @@
     # We need this otherwise FSDP throws an error during a standard forward pass.
     dummy_batch = {
         'obs':
-            torch.tensor([[0]], dtype=torch.long, device=cur_device),
+            torch.tensor([[0]], dtype=torch.long, device=device),
         'right_padded_attn_mask':
-            torch.tensor([[1]], dtype=torch.bool, device=cur_device),
+            torch.tensor([[1]], dtype=torch.bool, device=device),
         'actions':
-            torch.tensor([[0]], dtype=torch.long, device=cur_device),
+            torch.tensor([[0]], dtype=torch.long, device=device),
         'prompt_len':
-            torch.tensor([1], device=cur_device),
+            torch.tensor([1], device=device),
         'max_gen_len':
-            torch.tensor([1], device=cur_device),
+            torch.tensor([1], device=device),
         'action_mask':
-            torch.tensor([[0]], dtype=torch.long, device=cur_device),
+            torch.tensor([[0]], dtype=torch.long, device=device),
     }
     model(dummy_batch)
     start_time = time.time()
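The old signature took the whole batch only to read a device off one tensor; narrowing it to `device` makes the real dependency explicit. A call-site sketch mirroring the change, assuming `batch`, `actor_critic`, and the vLLM handles are in scope as in callback.py:

```python
device = batch['prompt'].device  # previously the whole batch was passed in

broadcast_to_vllm(
    model=actor_critic,
    vllm_engines=vllm_engines,
    model_update_group=model_update_group,
    device=device,
    loss_type=actor_critic.loss_type,
    enable_prefix_caching=vllm_enable_prefix_caching,
)
```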
14 changes: 12 additions & 2 deletions compose_rl/algorithms/online/model.py
@@ -100,17 +100,22 @@ def eval_forward(self, batch: MutableMapping, outputs: MutableMapping):
         )
 
     def loss(self, outputs: MutableMapping, batch: MutableMapping):
+        # Get beta from config if available, otherwise use default
+        additional_kwargs = {}
+        if hasattr(self.config, 'beta'):
+            additional_kwargs['beta'] = self.config.beta
+
         return_dict = online_rl_loss(
             outputs=outputs,
             batch=batch,
             loss_type=OnPolicyEnum.PPO,
             value_clip_range=self.config.value_clip_range,
             value_loss_weight=self.config.value_loss_weight,
             policy_clip_ratio=self.config.policy_clip_ratio,
-            beta=self.config.beta,
             add_direct_kl_loss=self.config.compute_kl_loss,
             kl_estimator=self.config.kl_estimator,
             kl_clip_range=self.config.kl_clip_range,
+            **additional_kwargs,
         )
 
         self.policy_kl.append(return_dict['kl/policy_kl'])
@@ -217,17 +222,22 @@ def eval_forward(self, batch: MutableMapping, outputs: MutableMapping):
         )
 
     def loss(self, outputs: MutableMapping, batch: MutableMapping):
+        # Get beta from config if available, otherwise use default
+        additional_kwargs = {}
+        if hasattr(self.config, 'beta'):
+            additional_kwargs['beta'] = self.config.beta
+
         return_dict = online_rl_loss(
             outputs=outputs,
             batch=batch,
             loss_type=self.loss_type,  # pyright: ignore
             value_clip_range=self.config.value_clip_range,
             value_loss_weight=self.config.value_loss_weight,
             policy_clip_ratio=self.config.policy_clip_ratio,
-            beta = self.config.beta,
             add_direct_kl_loss=self.config.compute_kl_loss,
             kl_estimator=self.config.kl_estimator,
             kl_clip_range=self.config.kl_clip_range,
+            **additional_kwargs,
         )
 
         self.policy_kl.append(return_dict['kl/policy_kl'])
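The `hasattr` guard forwards `beta` only when the config defines it, so the callee's own default applies otherwise instead of raising `AttributeError`. A self-contained illustration of the pattern (the `0.1` default is an assumed stand-in, not the library's value):

```python
class Config:
    pass

def online_rl_loss_stub(*, beta: float = 0.1) -> float:
    # Stand-in for online_rl_loss; returns beta so the fallback is observable.
    return beta

config = Config()

additional_kwargs = {}
if hasattr(config, 'beta'):
    additional_kwargs['beta'] = config.beta
assert online_rl_loss_stub(**additional_kwargs) == 0.1  # callee default wins

config.beta = 0.5  # now the config defines beta
additional_kwargs = {}
if hasattr(config, 'beta'):
    additional_kwargs['beta'] = config.beta
assert online_rl_loss_stub(**additional_kwargs) == 0.5  # config value forwarded
```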
58 changes: 58 additions & 0 deletions compose_rl/algorithms/online/single_controller_callback.py
@@ -0,0 +1,58 @@
# Copyright 2024 MosaicML ComposeRL authors
# SPDX-License-Identifier: Apache-2.0

"""Online On-Policy RL callback."""

from __future__ import annotations

import logging
from typing import Union

from composer.core import State
from composer.loggers import Logger
from composer.trainer.trainer import _get_initial_device_train_microbatch_size
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

# Import the base class
from compose_rl.algorithms.online.callback import OnPolicyCallback
from compose_rl.algorithms.online.model import (
    ComposerHFPolicyLM,
    ComposerMPTPolicyLM,
)

Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
Policy = Union[ComposerHFPolicyLM, ComposerMPTPolicyLM]

__all__ = ['SingleControllerOnPolicyCallback']

log = logging.getLogger(__name__)


class SingleControllerOnPolicyCallback(OnPolicyCallback):
    """Callback for managing on-policy training in an RLHF loop.

    Ideally, all the overridden methods below would be implemented in the
    trainer actor instead of the callback; we keep them here for now to avoid
    a drastic refactor of the PPO callback code.
    """

    def iteration_start(self, state: State, logger: Logger):
        del logger  # unused

        self._get_reward(self.batch_rollouts)  # type: ignore

        # Reset and initialize state train dataloader
        log.warning(
            'trainer._train_data_spec should be updated whenever the dataloader is updated',
        )
        # Train Dataloader
        state.set_dataloader(self.buffer, 'ep')
        state.train_dataloader = state.dataloader
        state.device_train_microbatch_size = _get_initial_device_train_microbatch_size(
            state.device_train_microbatch_size,
            state.auto_microbatching,
            state.train_dataloader,
        )

        # Update IFT KL
        self._update_ift_kl()
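The override moves rollout and reward work to the iteration boundary so an external controller can stage rollouts onto the callback between iterations. A hypothetical wiring sketch — the callback's constructor arguments and the full Trainer config are not shown in this PR, so the elided pieces are placeholders:

```python
from composer import Trainer

callback = SingleControllerOnPolicyCallback()  # real config omitted

trainer = Trainer(
    model=policy_model,  # placeholder: a ComposerHFPolicyLM / ComposerMPTPolicyLM
    callbacks=[callback],
)

# Per iteration, a single controller would then:
#   1. run generation on the vLLM engines,
#   2. stash the rollouts on the callback (read back here as self.batch_rollouts),
#   3. advance training, so iteration_start computes rewards and installs the
#      replay buffer as the train dataloader.
```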
2 changes: 2 additions & 0 deletions tests/common/__init__.py
@@ -1,6 +1,7 @@
 # Copyright 2024 MosaicML ComposeRL authors
 # SPDX-License-Identifier: Apache-2.0
 
+from tests.common.actor import BaseDistributedGPUActor
 from tests.common.datasets import (
     FineGrainedPreference,
     PairwisePreference,
@@ -11,6 +12,7 @@
 from tests.common.markers import device, world_size
 
 __all__ = [
+    'BaseDistributedGPUActor',
     'PairwisePreference',
     'FineGrainedPreference',
     'PromptDataset',
101 changes: 101 additions & 0 deletions tests/common/actor.py
@@ -0,0 +1,101 @@
# Copyright 2024 MosaicML ComposeRL authors
# SPDX-License-Identifier: Apache-2.0

import os
from datetime import timedelta
from typing import Optional

import ray
import torch.distributed as dist

from compose_rl.algorithms.online.generation_utils import init_process_group
from compose_rl.utils.ray_utils import (
    get_free_port,
    get_node_ip,
    is_cuda_visible_devices_set,
)


class BaseDistributedGPUActor:

    def __init__(
        self,
        rank: int,
        world_size: int,
        master_addr: Optional[str] = None,
        master_port: Optional[int] = None,
    ):
        """Initialize the distributed GPU actor for Ray.

        Args:
            rank: The rank of this process in the distributed group
            world_size: Total number of processes in the distributed group
            master_addr: Master node address. If None, it will be allocated dynamically for rank 0
            master_port: Master node port. If None, it will be allocated dynamically for rank 0
        """
        self.rank = rank
        self.world_size = world_size
        self.master_addr = master_addr
        self.master_port = master_port

        # Set up basic environment variables
        # TODO: may need to handle LOCAL_WORLD_SIZE as used in callback.py
        os.environ['WORLD_SIZE'] = str(world_size)
        os.environ['RANK'] = str(rank)

        # Set LOCAL_RANK based on Ray GPU allocation
        os.environ['LOCAL_RANK'] = '0' if is_cuda_visible_devices_set(
        ) else str(ray.get_gpu_ids()[0])

        # If this is rank 0 and no master_addr/master_port provided, allocate them
        if rank == 0 and (master_addr is None or master_port is None):
            self._allocate_master_address()

        os.environ['MASTER_ADDR'] = self.master_addr  # type: ignore
        os.environ['MASTER_PORT'] = str(self.master_port)  # type: ignore

        self.model = None
        self.model_update_group = None

    def _allocate_master_address(self):
        """Allocate master address and port for rank 0."""
        if self.master_addr is None:
            # Get the local IP address
            self.master_addr = get_node_ip()

        if self.master_port is None:
            # Allocate a free port
            self.master_port = get_free_port()

    def get_master_address(self) -> tuple[Optional[str], Optional[int]]:
        """Return the master address and port as a tuple."""
        return (self.master_addr, self.master_port)

    def get_free_port(self):
        return get_free_port()

    def init_train_process_group(self):
        """Initialize the distributed process group."""
        # Initialize process group
        dist.init_process_group(timeout=timedelta(seconds=30))

    def add_process_group(
        self,
        backend: str,
        master_addr: str,
        master_port: int,
        world_size: int,
        rank: int,
        group_name: str,
    ):
        """Initialize the process group on trainer rank 0 and vLLM engines."""
        # NOTE: vLLM seems to have a safer implementation of init_process_group:
        # https://github.com/vllm-project/vllm/blob/v0.9.1/examples/offline_inference/rlhf.py#L105
        # we should look into using that instead
        self.model_update_group = init_process_group(
            backend=backend,
            init_method=f'tcp://{master_addr}:{master_port}',
            world_size=world_size,
            rank=rank,
            group_name=group_name,
        )
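A minimal usage sketch under the assumption of a Ray cluster with at least two GPUs: rank 0 allocates the rendezvous address and port in its constructor, and the remaining ranks join with them before all ranks enter the process group together.

```python
import ray

from tests.common import BaseDistributedGPUActor

ray.init()
RemoteActor = ray.remote(num_gpus=1)(BaseDistributedGPUActor)

world_size = 2
# Rank 0 allocates MASTER_ADDR / MASTER_PORT dynamically in its constructor.
rank0 = RemoteActor.remote(rank=0, world_size=world_size)
master_addr, master_port = ray.get(rank0.get_master_address.remote())

# Remaining ranks join using rank 0's rendezvous address.
others = [
    RemoteActor.remote(
        rank=r,
        world_size=world_size,
        master_addr=master_addr,
        master_port=master_port,
    ) for r in range(1, world_size)
]

# All ranks must call init_process_group collectively.
ray.get([
    actor.init_train_process_group.remote() for actor in [rank0, *others]
])
```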
2 changes: 2 additions & 0 deletions tests/common/datasets.py
@@ -71,6 +71,7 @@ def __getitem__(self, index: int):
         return {
             'prompt': torch.ones((self.prompt_len,)).int(),
             'prompt_len': torch.Tensor([self.prompt_len]).to(torch.int64),
+            'prompt_id': torch.Tensor([index]).int(),
         }
 
 
@@ -87,6 +88,7 @@ def __getitem__(self, index: int):
         return {
             'prompt': torch.ones((self.prompt_len,)).int(),
             'prompt_len': torch.Tensor([self.prompt_len]).to(torch.int64),
+            'prompt_id': torch.Tensor([index]).int(),
             'verified_answer': '1',
         }
 