From 8c1516c99a0e41bf9f022c19d0e1496c185a7142 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 25 Jun 2025 08:57:19 +0000
Subject: [PATCH 001/107] runs but not limited master_port

---
 ray_test/RAY_GETTING_STARTED.md         | 324 +++++++++++++++++++++
 ray_test/RAY_GPU_EXAMPLES.md            | 202 +++++++++++++
 ray_test/ray_distributed_simulation.py  | 369 ++++++++++++++++++++++++
 ray_test/ray_gpu_basic.py               |  51 ++++
 ray_test/ray_gpu_patterns.py            | 131 +++++++++
 ray_test/ray_learning_guide.py          | 248 ++++++++++++++++
 ray_test/ray_scheduling_demo.py         | 194 +++++++++++++
 ray_test/ray_single_server_multi_gpu.py | 247 ++++++++++++++++
 ray_test/test_ray.py                    |  77 +++++
 ray_test/test_ray_distributed.py        |  61 ++++
 ray_test/test_ray_init.py               |  23 ++
 ray_test/test_torch_ray_distributed.py  | 124 ++++++++
 12 files changed, 2051 insertions(+)
 create mode 100644 ray_test/RAY_GETTING_STARTED.md
 create mode 100644 ray_test/RAY_GPU_EXAMPLES.md
 create mode 100644 ray_test/ray_distributed_simulation.py
 create mode 100644 ray_test/ray_gpu_basic.py
 create mode 100644 ray_test/ray_gpu_patterns.py
 create mode 100644 ray_test/ray_learning_guide.py
 create mode 100644 ray_test/ray_scheduling_demo.py
 create mode 100644 ray_test/ray_single_server_multi_gpu.py
 create mode 100644 ray_test/test_ray.py
 create mode 100644 ray_test/test_ray_distributed.py
 create mode 100644 ray_test/test_ray_init.py
 create mode 100644 ray_test/test_torch_ray_distributed.py

diff --git a/ray_test/RAY_GETTING_STARTED.md b/ray_test/RAY_GETTING_STARTED.md
new file mode 100644
index 00000000..274aab46
--- /dev/null
+++ b/ray_test/RAY_GETTING_STARTED.md
@@ -0,0 +1,324 @@
+# Ray GPU Management - Complete Learning Guide
+
+Welcome to Ray GPU management! This guide provides everything you need to learn Ray from scratch, with hands-on examples for both single-server multi-GPU setups and distributed computing simulation.
+
+## 🎯 Learning Path Overview
+
+**You are here:** Complete beginner → Ray GPU expert
+
+```
+Step 1: Setup & Verification
+    ↓
+Step 2: Interactive Learning (Basics)  
+    ↓
+Step 3: Single Server Multi-GPU Patterns
+    ↓  
+Step 4: Distributed Simulation
+    ↓
+Step 5: Real-World Applications
+```
+
+## 📋 Prerequisites
+
+- ✅ Linux system with NVIDIA GPUs
+- ✅ CUDA toolkit installed
+- ✅ PyTorch with CUDA support
+- ✅ Ray installed (`pip install "ray[default]"` — quote the extras so shells such as zsh do not glob the brackets)
+
+## 🚀 Quick Start (3 Commands)
+
+```bash
+# 1. Verify your setup
+python check_gpu_setup.py
+
+# 2. Learn interactively  
+python ray_learning_guide.py
+
+# 3. Try advanced patterns
+python ray_single_server_multi_gpu.py
+```
+
+## 📚 Detailed Learning Steps
+
+### Step 1: Verify Your Setup
+
+First, ensure everything is working:
+
+```bash
+python check_gpu_setup.py
+```
+
+This checks:
+- ✅ CUDA availability in PyTorch  
+- ✅ nvidia-smi functionality
+- ✅ Ray GPU detection
+- ✅ Basic GPU operations
+
+**Expected output:** All checks should pass with "🎉 All checks passed!"
+
+### Step 2: Interactive Learning (START HERE!)
+
+Perfect for complete beginners:
+
+```bash
+python ray_learning_guide.py
+```
+
+**What you'll learn:**
+- Ray basic concepts (remote functions, actors)
+- GPU resource allocation (full vs fractional)
+- Tasks vs Actors differences  
+- Resource monitoring
+
+**Duration:** 10-15 minutes (interactive)
+
+### Step 3: Single Server Multi-GPU Patterns
+
+Advanced patterns on your single server with 2 GPUs:
+
+```bash
+python ray_single_server_multi_gpu.py
+```
+
+**What you'll see:**
+- 🚀 GPU Actors (long-lived workers)
+- 🔄 Fractional GPU allocation (0.5 GPU per task)
+- 🔀 Mixed CPU/GPU workloads
+- ⚡ Dynamic scheduling patterns
+
+**Duration:** 5-10 minutes (automated demos)
+
+### Step 4: Distributed Simulation  
+
+Simulate multi-server setup on localhost:
+
+```bash
+python ray_distributed_simulation.py
+```
+
+**What you'll learn:**
+- Starting Ray head and worker nodes
+- Connecting to distributed clusters
+- Task distribution across nodes
+- Node-level resource management
+
+**Note:** This simulates "Server 1" (head + GPUs) and "Server 2" (worker + CPUs)
+
+## 🔍 Monitoring Your Ray Cluster
+
+### During Learning
+
+While running examples, monitor GPU usage:
+
+```bash
+# Terminal 1: Run your Ray script
+python ray_learning_guide.py
+
+# Terminal 2: Monitor GPUs  
+watch -n 1 nvidia-smi
+```
+
+### Ray Dashboard
+
+When Ray is running, visit the dashboard:
+```
+http://localhost:8265
+```
+
+Shows:
+- 📊 Resource utilization
+- 🎯 Task execution timeline
+- 🖥️  Node status and health
+- 📈 Performance metrics
+
+## 📖 Core Concepts Reference
+
+### GPU Resource Allocation
+
+```python
+# Full GPU (exclusive access)
+@ray.remote(num_gpus=1)
+def gpu_task():
+    pass
+
+# Fractional GPU (shared access)
+@ray.remote(num_gpus=0.5)  # 2 tasks per GPU
+def light_gpu_task():
+    pass
+```
+
+### Tasks vs Actors
+
+```python
+# TASK: Stateless function
+@ray.remote(num_gpus=1)
+def process_data(data):
+    return result
+
+# ACTOR: Stateful class  
+@ray.remote(num_gpus=1)
+class DataProcessor:
+    def __init__(self):
+        self.model = load_model()
+    
+    def process(self, data):
+        return self.model(data)
+```
+
+### Key Ray Functions
+
+```python
+# Submit work
+future = task.remote(data)
+actor = Actor.remote()
+
+# Get results
+result = ray.get(future)
+results = ray.get([future1, future2])
+
+# Monitor resources
+ray.cluster_resources()    # Total
+ray.available_resources()  # Available now
+```
+
+## 🛠️ Common Patterns
+
+### Pattern 1: Parallel GPU Processing
+```python
+@ray.remote(num_gpus=1)
+def train_model(config):
+    # Your GPU training code
+    pass
+
+# Train multiple models in parallel
+configs = [config1, config2]  
+futures = [train_model.remote(c) for c in configs]
+results = ray.get(futures)
+```
+
+### Pattern 2: Mixed Workloads
+```python
+# Mix CPU preprocessing with GPU training
+cpu_tasks = [preprocess.remote(data) for data in dataset]
+processed_data = ray.get(cpu_tasks)
+
+gpu_tasks = [train.remote(data) for data in processed_data]  
+models = ray.get(gpu_tasks)
+```
+
+### Pattern 3: Pipeline Processing
+```python
+@ray.remote(num_cpus=1)
+def preprocess(data):
+    return cleaned_data
+
+@ray.remote(num_gpus=0.5)  
+def inference(data):
+    return predictions
+
+results = []  # Pipeline: preprocess → inference
+for data_batch in dataset:
+    clean_data = preprocess.remote(data_batch)
+    predictions = inference.remote(clean_data)
+    results.append(predictions)
+```
+
+## 🚨 Troubleshooting
+
+### Common Issues & Solutions
+
+**Issue:** Ray doesn't detect GPUs
+```python
+# Solution: Force GPU detection
+ray.init(num_gpus=2)
+
+# Or check GPU visibility by running this in a shell:
+#   nvidia-smi
+```
+
+**Issue:** CUDA out of memory
+```python
+# Solution: Give each task a whole GPU so tasks stop sharing its memory
+@ray.remote(num_gpus=1)  # Instead of 0.5 (fractions do not cap memory use)
+
+# Or reduce tensor sizes
+x = torch.randn(1000, 1000)  # Instead of 5000x5000
+```
+
+**Issue:** Tasks not running in parallel
+```python
+# Solution: Check available resources
+print(ray.available_resources())
+
+# Don't block waiting for results too early
+futures = [task.remote(i) for i in range(10)]
+# Do other work here...
+results = ray.get(futures)  # Wait at the end
+```
+
+**Issue:** Ray processes hanging
+```bash
+# Solution: Clean shutdown
+ray stop --force
+pkill -f ray  # Last resort: this matches ANY process whose command line contains "ray"
+```
+
+## 🎯 What's Next?
+
+After completing this guide, explore:
+
+### Advanced Ray Features
+- **Ray Tune:** Hyperparameter optimization
+- **Ray Train:** Distributed training  
+- **Ray Serve:** Model serving
+- **Ray Data:** Large-scale data processing
+
+### Real Distributed Setup
+Once comfortable with localhost simulation:
+
+```bash
+# Server 1 (head node); 6379 is Ray's default GCS port (10001 is reserved for the Ray Client server)
+ray start --head --port=6379 --num-gpus=2
+
+# Server 2 (worker node)
+ray start --address=192.168.1.100:6379 --num-gpus=1
+```
+
+### Production Considerations
+- Resource management policies
+- Fault tolerance and recovery
+- Monitoring and logging
+- Auto-scaling strategies
+
+## 📁 Files in This Learning Package
+
+| File | Purpose | When to Use |
+|------|---------|-------------|  
+| `check_gpu_setup.py` | Verify system setup | **Start here** - before anything else |
+| `ray_learning_guide.py` | Interactive beginner tutorial | **Step 2** - core concepts |
+| `ray_single_server_multi_gpu.py` | Advanced single-server patterns | **Step 3** - practical patterns |  
+| `ray_distributed_simulation.py` | Localhost distributed simulation | **Step 4** - distributed concepts |
+| `ray_gpu_basic.py` | Simple working example | Reference/quick test |
+| `RAY_GPU_EXAMPLES.md` | Original documentation | Additional reference |
+
+## 🎉 Success Metrics
+
+You'll know you've mastered Ray GPU management when you can:
+
+✅ Set up Ray clusters (single and distributed)  
+✅ Choose between tasks and actors appropriately
+✅ Allocate GPU resources efficiently (full vs fractional)
+✅ Monitor and debug resource usage
+✅ Design efficient parallel workflows
+✅ Handle mixed CPU/GPU workloads
+
+## 🆘 Getting Help
+
+- 📖 [Official Ray Documentation](https://docs.ray.io/)
+- 💬 [Ray Discourse Forum](https://discuss.ray.io/)  
+- 🐛 [Ray GitHub Issues](https://github.com/ray-project/ray/issues)
+- 📺 [Ray YouTube Tutorials](https://www.youtube.com/c/RayProjectIO)
+
+---
+
+**Happy learning!** 🚀 Start with `python ray_learning_guide.py` and work your way through the examples. 
\ No newline at end of file
diff --git a/ray_test/RAY_GPU_EXAMPLES.md b/ray_test/RAY_GPU_EXAMPLES.md
new file mode 100644
index 00000000..4a7f5969
--- /dev/null
+++ b/ray_test/RAY_GPU_EXAMPLES.md
@@ -0,0 +1,202 @@
+# Ray GPU Management Examples
+
+This directory contains examples to help you learn Ray for GPU workload management on a single server with 2 GPUs.
+
+## Files Overview
+
+1. **`check_gpu_setup.py`** - Verify your GPU setup
+2. **`ray_gpu_basic.py`** - Minimal Ray GPU example
+3. **`ray_gpu_patterns.py`** - Advanced GPU management patterns
+4. **`test_ray.py`** - Your existing comprehensive example
+
+## Getting Started
+
+### Step 1: Check Your Setup
+
+Before running any examples, verify your GPU setup:
+
+```bash
+python check_gpu_setup.py
+```
+
+This will check:
+- CUDA availability in PyTorch
+- nvidia-smi functionality
+- Ray GPU detection
+- Basic GPU operations
+
+### Step 2: Run the Basic Example
+
+Start with the simplest example:
+
+```bash
+python ray_gpu_basic.py
+```
+
+This demonstrates:
+- Ray initialization
+- Basic GPU task creation
+- Resource allocation
+- Simple parallel execution
+
+### Step 3: Try Advanced Patterns
+
+Explore more sophisticated GPU management:
+
+```bash
+python ray_gpu_patterns.py
+```
+
+This shows:
+- Fractional GPU allocation (0.5 GPU per task)
+- Mixed CPU/GPU workloads
+- Resource monitoring
+- Dynamic scheduling
+
+### Step 4: Study the Complete Example
+
+Your existing `test_ray.py` provides a comprehensive example with:
+- Detailed GPU assignment tracking
+- Resource visualization
+- Error handling
+- Best practices
+
+## Key Ray GPU Concepts
+
+### 1. GPU Resource Allocation
+
+```python
+# Request 1 full GPU
+@ray.remote(num_gpus=1)
+def gpu_task():
+    pass
+
+# Request 0.5 GPU (allows 2 tasks per GPU)
+@ray.remote(num_gpus=0.5)
+def light_gpu_task():
+    pass
+```
+
+### 2. GPU Assignment
+
+Ray automatically:
+- Sets `CUDA_VISIBLE_DEVICES` for each task
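+- Does **not** isolate GPU memory: `num_gpus` is only a scheduling quantity, so tasks sharing a GPU must fit in its memory together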
+- Schedules tasks based on available GPUs
+
+```python
+# Inside a Ray task
+gpu_ids = ray.get_gpu_ids()  # Get assigned GPU IDs
+device = torch.device("cuda")  # PyTorch sees only assigned GPUs
+```
+
+### 3. Resource Monitoring
+
+```python
+# Check available resources
+ray.cluster_resources()    # Total resources
+ray.available_resources()  # Currently available
+```
+
+## Common Patterns
+
+### Pattern 1: Parallel GPU Tasks
+```python
+# Launch multiple tasks in parallel
+tasks = [gpu_task.remote(i) for i in range(4)]
+results = ray.get(tasks)  # Wait for all to complete
+```
+
+### Pattern 2: Mixed Workloads
+```python
+# CPU and GPU tasks running together
+cpu_tasks = [cpu_task.remote(i) for i in range(4)]
+gpu_tasks = [gpu_task.remote(i) for i in range(2)]
+all_results = ray.get(cpu_tasks + gpu_tasks)
+```
+
+### Pattern 3: Dynamic Scheduling
+```python
+# Submit tasks as resources become available
+futures = []
+for i in range(10):
+    future = gpu_task.remote(i)
+    futures.append(future)
+    if len(futures) >= 2:  # Don't overwhelm the queue
+        done, futures = ray.wait(futures, num_returns=1)  # Wait for any one to complete
+        ray.get(done)  # Re-raise the finished task's error, if any
+```
+
+## Monitoring GPU Usage
+
+While running examples, monitor GPU usage:
+
+```bash
+# In another terminal
+watch -n 1 nvidia-smi
+```
+
+You should see:
+- GPU utilization changes as tasks start/finish
+- Memory allocation per GPU
+- Process assignments
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Ray doesn't detect GPUs**
+   - Check `nvidia-smi` works
+   - Verify CUDA installation
+   - Try `ray.init(num_gpus=2)` to force detection
+
+2. **CUDA out of memory**
+   - Reduce tensor sizes in examples
+   - Avoid packing several tasks onto one GPU (fractional `num_gpus` does not cap memory use)
+   - Monitor memory with `nvidia-smi`
+
+3. **Tasks not running in parallel**
+   - Check available resources with `ray.available_resources()`
+   - Verify you have enough GPUs for your tasks
+   - Use `ray.get()` wisely to avoid blocking
+
+### Debug Tips
+
+```python
+# Check Ray status
+ray.cluster_resources()
+ray.available_resources()
+
+# Monitor task execution
+import time
+start = time.time()
+results = ray.get(tasks)
+print(f"Execution time: {time.time() - start:.2f}s")
+```
+
+## Next Steps
+
+1. **Experiment** with different GPU allocations (0.25, 0.5, 1.0)
+2. **Try** mixing CPU and GPU tasks
+3. **Monitor** resource usage patterns
+4. **Scale up** to more complex workloads
+5. **Learn** about Ray Tune for hyperparameter optimization
+6. **Explore** Ray Train for distributed training
+
+## Useful Commands
+
+```bash
+# Check GPU status
+nvidia-smi
+
+# Monitor Ray cluster
+ray status
+
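+# Ray dashboard (if enabled): open http://localhost:8265 in a browser;
+# `ray dashboard <cluster.yaml>` only port-forwards the dashboard of a cluster started with `ray up`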
+
+# Kill Ray processes
+ray stop
+```
+
+Happy learning with Ray! 🚀 
\ No newline at end of file
diff --git a/ray_test/ray_distributed_simulation.py b/ray_test/ray_distributed_simulation.py
new file mode 100644
index 00000000..79986266
--- /dev/null
+++ b/ray_test/ray_distributed_simulation.py
@@ -0,0 +1,369 @@
+#!/usr/bin/env python3
+"""
+Ray Distributed Setup Simulation
+
+This example shows how to simulate a distributed Ray cluster on localhost.
+We'll create multiple Ray nodes on the same machine to simulate a multi-server setup.
+"""
+
+import ray
+import torch
+import time
+import subprocess
+import signal
+import sys
+import os
+import threading
+from typing import Dict, Any, List
+import psutil
+
+# Configuration
+HEAD_PORT = 10001
+WORKER_PORT_START = 10002
+REDIS_PASSWORD = "ray_demo_password"
+
+class RayClusterManager:
+    """Start, extend and tear down a simulated multi-node Ray cluster on localhost."""
+
+    def __init__(self):
+        self.head_process = None  # Popen handle of the "ray start --head" CLI call
+        self.worker_processes = []  # Popen handles of the worker "ray start" CLI calls
+        self.head_address = None  # ray:// URL string, set by start_head_node()
+
+    def start_head_node(self, num_gpus: int = 2, num_cpus: int = 8) -> str:
+        """Launch the head node through the ray CLI and return its ray:// address string."""
+        print("🚀 Starting Ray head node...")
+
+        # Kill any existing Ray processes
+        self._cleanup_existing_ray()
+
+        head_cmd = [
+            "ray", "start", "--head",
+            f"--port={HEAD_PORT}",
+            f"--num-gpus={num_gpus}",
+            f"--num-cpus={num_cpus}",
+            f"--redis-password={REDIS_PASSWORD}",
+            "--include-dashboard=true",
+            "--dashboard-port=8265"
+        ]
+
+        print(f"Command: {' '.join(head_cmd)}")
+
+        # Start head node; its output is forwarded to this process's stdout/stderr
+        self.head_process = subprocess.Popen(
+            head_cmd,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+            text=True
+        )
+
+        # Fixed grace period; readiness of the head node is not actually verified
+        time.sleep(3)
+        # NOTE(review): ray:// is the Ray Client scheme, yet HEAD_PORT is passed as --port above - confirm.
+        self.head_address = f"ray://127.0.0.1:{HEAD_PORT}"
+        print(f"✅ Head node started at {self.head_address}")
+
+        return self.head_address
+
+    def add_worker_node(self, node_id: int, num_gpus: int = 0, num_cpus: int = 4) -> bool:
+        """Launch a worker node that joins the head node; always returns True."""
+        print(f"🔧 Adding worker node {node_id}...")
+        # "ray start --address" expects the GCS host:port, not the ray:// client URL.
+        gcs_address = (self.head_address or "").split("://", 1)[-1]
+        worker_cmd = [
+            "ray", "start", f"--address={gcs_address}",
+            f"--num-gpus={num_gpus}",
+            f"--num-cpus={num_cpus}",
+            f"--redis-password={REDIS_PASSWORD}"
+        ]
+
+        print(f"Command: {' '.join(worker_cmd)}")
+
+        worker_process = subprocess.Popen(
+            worker_cmd,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+            text=True
+        )
+
+        self.worker_processes.append(worker_process)
+
+        # Wait for worker to connect (fixed delay, the connection is not verified)
+        time.sleep(2)
+
+        print(f"✅ Worker node {node_id} added")
+        return True
+
+    def _cleanup_existing_ray(self):
+        """Best-effort "ray stop --force"; a failure is reported but never raised."""
+        try:
+            subprocess.run(["ray", "stop", "--force"],
+                         capture_output=True, timeout=10)
+            time.sleep(1)
+        except (OSError, subprocess.SubprocessError) as exc:
+            print(f"⚠️  'ray stop' failed during cleanup (continuing): {exc}")
+
+    def shutdown(self):
+        """Stop all Ray processes, then terminate the CLI handles this manager created."""
+        print("🛑 Shutting down Ray cluster...")
+
+        # Stop all processes; narrow except so Ctrl-C is not swallowed
+        try:
+            subprocess.run(["ray", "stop", "--force"],
+                         capture_output=True, timeout=10)
+        except (OSError, subprocess.SubprocessError) as exc:
+            print(f"⚠️  'ray stop' failed during shutdown (continuing): {exc}")
+
+        # Kill processes if still running
+        if self.head_process:
+            self.head_process.terminate()
+
+        for worker in self.worker_processes:
+            worker.terminate()
+
+        print("✅ Cluster shutdown complete")
+
+@ray.remote(num_gpus=1)
+class DistributedGPUWorker:
+    """A distributed GPU worker that reports its location."""
+    
+    def __init__(self, worker_id: str):
+        self.worker_id = worker_id
+        self.node_id = ray.get_runtime_context().get_node_id()
+        self.gpu_ids = ray.get_gpu_ids()
+        self.hostname = os.uname().nodename
+        
+    def get_worker_info(self) -> Dict[str, Any]:
+        """Get information about this worker."""
+        return {
+            "worker_id": self.worker_id,
+            "node_id": self.node_id,
+            "hostname": self.hostname,
+            "gpu_ids": self.gpu_ids,
+            "cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES", "Not set")
+        }
+    
+    def distributed_computation(self, matrix_size: int = 1000) -> Dict[str, Any]:
+        """Perform computation and return node information."""
+        start_time = time.time()
+        
+        # GPU computation
+        device = torch.device("cuda")
+        A = torch.randn(matrix_size, matrix_size, device=device)
+        B = torch.randn(matrix_size, matrix_size, device=device)
+        C = torch.mm(A, B)
+        result = torch.trace(C).item()
+        
+        execution_time = time.time() - start_time
+        
+        return {
+            "worker_id": self.worker_id,
+            "node_id": self.node_id,
+            "hostname": self.hostname,
+            "gpu_ids": self.gpu_ids,
+            "result": result,
+            "execution_time": execution_time,
+            "matrix_size": matrix_size
+        }
+
+@ray.remote(num_cpus=1)
+def distributed_cpu_task(task_id: int) -> Dict[str, Any]:
+    """A CPU task that reports which node it's running on."""
+    import numpy as np
+    
+    start_time = time.time()
+    node_id = ray.get_runtime_context().get_node_id()
+    hostname = os.uname().nodename
+    
+    # CPU computation
+    result = np.sum(np.random.randn(500, 500) ** 2)
+    
+    execution_time = time.time() - start_time
+    
+    return {
+        "task_id": task_id,
+        "node_id": node_id,
+        "hostname": hostname,
+        "result": result,
+        "execution_time": execution_time,
+        "resource_type": "CPU"
+    }
+
+def demonstrate_cluster_info():
+    """Show cluster information and resource distribution."""
+    print("\n📊 CLUSTER INFORMATION")
+    print("=" * 50)
+    
+    # Get cluster resources
+    cluster_resources = ray.cluster_resources()
+    available_resources = ray.available_resources()
+    
+    print("Total Cluster Resources:")
+    for resource, amount in cluster_resources.items():
+        print(f"  {resource}: {amount}")
+    
+    print("\nAvailable Resources:")
+    for resource, amount in available_resources.items():
+        print(f"  {resource}: {amount}")
+    
+    # Get node information
+    print("\nNodes in Cluster:")
+    nodes = ray.nodes()
+    for i, node in enumerate(nodes):
+        print(f"  Node {i+1}:")
+        print(f"    ID: {node['NodeID']}")
+        print(f"    Alive: {node['Alive']}")
+        print(f"    Resources: {node['Resources']}")
+
+def demonstrate_distributed_gpu_work():
+    """Demonstrate distributed GPU work across simulated nodes."""
+    print("\n🖥️  DEMO: Distributed GPU Work")
+    print("-" * 50)
+    
+    # Create GPU workers
+    workers = [DistributedGPUWorker.remote(f"gpu_worker_{i}") for i in range(2)]
+    
+    # Get worker information
+    print("Created GPU workers:")
+    worker_info_futures = [worker.get_worker_info.remote() for worker in workers]
+    worker_infos = ray.get(worker_info_futures)
+    
+    for info in worker_infos:
+        print(f"  {info['worker_id']}: Node {info['node_id'][:8]}, GPU {info['gpu_ids']}")
+    
+    # Submit distributed computation
+    print("\nSubmitting distributed GPU computations...")
+    computation_futures = [
+        worker.distributed_computation.remote(matrix_size=1200) 
+        for worker in workers
+    ]
+    
+    results = ray.get(computation_futures)
+    
+    print("Results:")
+    for result in results:
+        print(f"  {result['worker_id']}: "
+              f"Node {result['node_id'][:8]}, "
+              f"GPU {result['gpu_ids']}, "
+              f"Result: {result['result']:.2f}, "
+              f"Time: {result['execution_time']:.2f}s")
+
+def demonstrate_mixed_distributed_work():
+    """Demonstrate mixed CPU/GPU work across nodes."""
+    print("\n🔄 DEMO: Mixed Distributed Workload")
+    print("-" * 50)
+    
+    # Submit a mix of CPU and GPU tasks
+    cpu_tasks = [distributed_cpu_task.remote(i) for i in range(4)]
+    
+    # Create lightweight GPU tasks
+    @ray.remote(num_gpus=0.5)
+    def light_gpu_task(task_id: int):
+        node_id = ray.get_runtime_context().get_node_id()
+        gpu_ids = ray.get_gpu_ids()
+        
+        device = torch.device("cuda")
+        x = torch.randn(500, 500, device=device)
+        result = torch.sum(x * x).item()
+        
+        return {
+            "task_id": task_id, 
+            "node_id": node_id,
+            "gpu_ids": gpu_ids,
+            "result": result
+        }
+    
+    gpu_tasks = [light_gpu_task.remote(i+10) for i in range(3)]
+    
+    all_tasks = cpu_tasks + gpu_tasks
+    print(f"Submitted {len(cpu_tasks)} CPU tasks and {len(gpu_tasks)} GPU tasks")
+    
+    start_time = time.time()
+    results = ray.get(all_tasks)
+    total_time = time.time() - start_time
+    
+    print(f"All tasks completed in {total_time:.2f}s")
+    
+    # Group results by node
+    node_results = {}
+    for result in results:
+        node_id = result['node_id'][:8]  # Short node ID
+        if node_id not in node_results:
+            node_results[node_id] = []
+        node_results[node_id].append(result)
+    
+    print("\nResults by Node:")
+    for node_id, node_tasks in node_results.items():
+        print(f"  Node {node_id}: {len(node_tasks)} tasks")
+
+def simulate_two_server_setup():
+    """Simulate a two-server setup using localhost."""
+    print("\n🌐 SIMULATING TWO-SERVER SETUP")
+    print("=" * 60)
+    print("This simulates Server 1 (head + GPU) and Server 2 (worker + CPU)")
+    
+    cluster_manager = RayClusterManager()
+    
+    try:
+        # Start head node (simulates Server 1 with GPUs)
+        head_address = cluster_manager.start_head_node(num_gpus=2, num_cpus=4)
+
+        # Connect Ray client
+        print(f"\n🔗 Connecting to distributed cluster at {head_address}")
+        ray.init(address=head_address, _redis_password=REDIS_PASSWORD)
+
+        demonstrate_cluster_info()
+        
+        # Add worker node (simulates Server 2 with only CPUs)
+        cluster_manager.add_worker_node(node_id=1, num_gpus=0, num_cpus=6)
+        
+        # Demonstrate distributed functionality
+        demonstrate_cluster_info()
+        demonstrate_distributed_gpu_work()
+        demonstrate_mixed_distributed_work()
+        
+        print("\n✨ Distributed simulation completed successfully!")
+        
+    except Exception as e:
+        print(f"❌ Error in distributed simulation: {e}")
+        
+    finally:
+        try:
+            ray.shutdown()
+        except:
+            pass
+        cluster_manager.shutdown()
+
+def main():
+    """Main function to demonstrate distributed Ray setup."""
+    print("🎯 Ray Distributed Setup Simulation")
+    print("=" * 60)
+    print("This example simulates a distributed Ray cluster on a single machine")
+    print("to help you understand distributed Ray concepts.")
+    
+    # Check if Ray is already running
+    try:
+        ray.init(address='auto')
+        print("⚠️  Ray is already running. Shutting down first...")
+        ray.shutdown()
+        time.sleep(2)
+    except:
+        pass
+    
+    simulate_two_server_setup()
+    
+    print("\n📚 What you learned:")
+    print("1. How to start Ray head and worker nodes")
+    print("2. How to connect to a distributed Ray cluster") 
+    print("3. How tasks are distributed across nodes")
+    print("4. How to monitor cluster resources and node distribution")
+    print("5. How GPU and CPU resources are managed in a distributed setup")
+    
+    print("\n🚀 Next steps:")
+    print("- Try this on actual multiple servers")
+    print("- Experiment with different resource configurations")
+    print("- Use Ray Tune for distributed hyperparameter tuning")
+    print("- Explore Ray Train for distributed training")
+
+if __name__ == "__main__":
+    main() 
\ No newline at end of file
diff --git a/ray_test/ray_gpu_basic.py b/ray_test/ray_gpu_basic.py
new file mode 100644
index 00000000..9a3d625f
--- /dev/null
+++ b/ray_test/ray_gpu_basic.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""
+Ray GPU Management - Basic Example
+
+This is the simplest possible example of using Ray to manage GPU workloads.
+Perfect for someone new to Ray who wants to understand the core concepts.
+"""
+
+import ray
+import torch
+import time
+import os
+
+
+@ray.remote(num_gpus=1)
+def simple_gpu_task(task_id: int):
+    """A minimal GPU task that just creates a tensor and does basic operations."""
+    
+    # Ray automatically manages which GPU this task gets
+    gpu_ids = ray.get_gpu_ids()
+    print(f"Task {task_id}: Using GPU {gpu_ids[0]}")
+    
+    # Create a tensor on the GPU
+    device = torch.device("cuda")
+    x = torch.randn(1000, 1000, device=device)
+    
+    # Do some computation
+    for i in range(3):
+        x = x * 2
+        time.sleep(0.5)  # Simulate work
+        print(f"  Step {i+1}: tensor shape {x.shape}")
+    
+    return f"Task {task_id} completed on GPU {gpu_ids[0]}"
+
+if __name__ == "__main__":
+    # Print the current process ID (PID)
+    print(f"Current process ID: {os.getpid()}")
+
+    # Initialize Ray
+    ray.init()
+    
+    print("Available resources:", ray.cluster_resources())
+    
+    # Launch 2 tasks (one per GPU)
+    tasks = [simple_gpu_task.remote(i) for i in range(2)]
+    
+    # Wait for results
+    results = ray.get(tasks)
+    
+    print("Results:", results)
+    ray.shutdown() 
\ No newline at end of file
diff --git a/ray_test/ray_gpu_patterns.py b/ray_test/ray_gpu_patterns.py
new file mode 100644
index 00000000..071bf165
--- /dev/null
+++ b/ray_test/ray_gpu_patterns.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""
+Ray GPU Management - Advanced Patterns
+
+This example demonstrates different GPU management patterns in Ray:
+1. Fractional GPU allocation
+2. Dynamic task scheduling
+3. Resource monitoring
+4. Error handling
+"""
+
+import ray
+import torch
+import time
+import psutil
+
+# Pattern 1: Fractional GPU usage (0.5 GPU per task)
+@ray.remote(num_gpus=0.5)
+def light_gpu_task(task_id: int):
+    """Task that only needs half a GPU - allows 4 tasks on 2 GPUs."""
+    gpu_ids = ray.get_gpu_ids()
+    device = torch.device("cuda")
+    
+    print(f"Light task {task_id}: Using GPU fraction on {gpu_ids}")
+    
+    # Lighter computation
+    x = torch.randn(500, 500, device=device)
+    x = torch.mm(x, x.T)
+    time.sleep(1)
+    
+    return f"Light task {task_id} done"
+
+# Pattern 2: Full GPU usage
+@ray.remote(num_gpus=1)
+def heavy_gpu_task(task_id: int):
+    """Task that needs a full GPU."""
+    gpu_ids = ray.get_gpu_ids()
+    device = torch.device("cuda")
+    
+    print(f"Heavy task {task_id}: Using full GPU {gpu_ids[0]}")
+    
+    # Heavier computation
+    x = torch.randn(2000, 2000, device=device)
+    for _ in range(5):
+        x = torch.mm(x, x.T)
+    time.sleep(2)
+    
+    return f"Heavy task {task_id} done on GPU {gpu_ids[0]}"
+
+# Pattern 3: CPU task for comparison
+@ray.remote
+def cpu_task(task_id: int):
+    """Task that runs on CPU only."""
+    print(f"CPU task {task_id}: Running on CPU")
+    
+    # CPU computation
+    x = torch.randn(1000, 1000)
+    x = torch.mm(x, x.T)
+    time.sleep(1)
+    
+    return f"CPU task {task_id} done"
+
+# Pattern 4: Resource monitoring task
+@ray.remote
+def monitor_resources():
+    """Monitor system resources while tasks are running."""
+    resources = ray.cluster_resources()
+    available = ray.available_resources()
+    
+    return {
+        "total_gpus": resources.get("GPU", 0),
+        "available_gpus": available.get("GPU", 0),
+        "total_cpus": resources.get("CPU", 0),
+        "available_cpus": available.get("CPU", 0),
+        "memory_usage": psutil.virtual_memory().percent
+    }
+
+def demonstrate_gpu_patterns():
+    """Demonstrate different GPU allocation patterns."""
+    
+    print("=== Ray GPU Patterns Demo ===\n")
+    
+    # Initialize Ray
+    ray.init()
+    
+    # Check available resources
+    print("Initial resources:", ray.cluster_resources())
+    print("Available resources:", ray.available_resources())
+    print()
+    
+    # Pattern 1: Run multiple light tasks (fractional GPU)
+    print("1. Running 4 light tasks (0.5 GPU each) - should run 4 concurrent on 2 GPUs")
+    light_tasks = [light_gpu_task.remote(i) for i in range(4)]
+    
+    # Pattern 2: Run heavy tasks (full GPU)
+    print("2. Running 2 heavy tasks (1 GPU each)")
+    heavy_tasks = [heavy_gpu_task.remote(i) for i in range(2)]
+    
+    # Pattern 3: Run CPU tasks alongside
+    print("3. Running CPU tasks in parallel")
+    cpu_tasks = [cpu_task.remote(i) for i in range(3)]
+    
+    # Pattern 4: Monitor resources while tasks run
+    monitor_task = monitor_resources.remote()
+    
+    # Wait for light tasks
+    print("\nWaiting for light tasks...")
+    light_results = ray.get(light_tasks)
+    print("Light tasks results:", light_results)
+    
+    # Check resources mid-execution
+    mid_resources = ray.get(monitor_task)
+    print("Mid-execution resources:", mid_resources)
+    
+    # Wait for remaining tasks
+    print("\nWaiting for heavy and CPU tasks...")
+    heavy_results = ray.get(heavy_tasks)
+    cpu_results = ray.get(cpu_tasks)
+    
+    print("Heavy tasks results:", heavy_results)
+    print("CPU tasks results:", cpu_results)
+    
+    # Final resource check
+    final_monitor = monitor_resources.remote()
+    final_resources = ray.get(final_monitor)
+    print("Final resources:", final_resources)
+    
+    ray.shutdown()
+
+if __name__ == "__main__":
+    demonstrate_gpu_patterns() 
\ No newline at end of file
diff --git a/ray_test/ray_learning_guide.py b/ray_test/ray_learning_guide.py
new file mode 100644
index 00000000..066dedc5
--- /dev/null
+++ b/ray_test/ray_learning_guide.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+"""
+Ray GPU Learning Guide - Getting Started Script
+
+This script helps beginners understand Ray GPU management concepts
+through interactive examples and clear explanations.
+"""
+
+import ray
+import torch
+import time
+import os
+from typing import Dict, Any
+
+def step_1_basic_concepts():
+    """Step 1: Understanding Ray basic concepts."""
+    print("\n" + "="*60)
+    print("🎓 STEP 1: RAY BASIC CONCEPTS")
+    print("="*60)
+    
+    print("""
+Ray is a distributed computing framework that helps you:
+1. Parallelize your Python code across multiple cores/machines
+2. Manage GPU resources automatically  
+3. Scale from single machine to clusters seamlessly
+
+Key concepts:
+- @ray.remote: Decorator to make functions/classes distributed
+- ray.get(): Wait for and retrieve results from remote tasks
+- ray.put(): Store large objects in shared memory
+- Actors: Stateful workers that persist across tasks
+""")
+    
+    # Simple example
+    @ray.remote
+    def simple_task(x):
+        return x * x
+    
+    print("Example: Simple remote function")
+    print("@ray.remote")
+    print("def simple_task(x):")
+    print("    return x * x")
+    
+    # Execute
+    future = simple_task.remote(5)
+    result = ray.get(future)
+    print(f"\nResult: simple_task.remote(5) = {result}")
+
+def step_2_gpu_resource_management():
+    """Step 2: Understanding GPU resource management."""
+    print("\n" + "="*60)
+    print("🎮 STEP 2: GPU RESOURCE MANAGEMENT")
+    print("="*60)
+    
+    print("""
+Ray automatically manages GPU allocation:
+
+1. Full GPU allocation: @ray.remote(num_gpus=1)
+   - Task gets exclusive access to 1 GPU
+   - Ray sets CUDA_VISIBLE_DEVICES automatically
+   
+2. Fractional GPU allocation: @ray.remote(num_gpus=0.5)  
+   - Multiple tasks can share the same GPU
+   - Useful for lightweight GPU work
+   
+3. Ray handles scheduling based on available resources
+""")
+    
+    @ray.remote(num_gpus=1)
+    def gpu_task(task_id):
+        gpu_ids = ray.get_gpu_ids()
+        device = torch.device("cuda")
+        x = torch.randn(100, 100, device=device)
+        return {"task_id": task_id, "gpu_ids": gpu_ids, "shape": list(x.shape)}
+    
+    print("Example: GPU task")
+    print("@ray.remote(num_gpus=1)")
+    print("def gpu_task(task_id):")
+    print("    gpu_ids = ray.get_gpu_ids()")
+    print("    device = torch.device('cuda')")
+    print("    x = torch.randn(100, 100, device=device)")
+    print("    return {'task_id': task_id, 'gpu_ids': gpu_ids}")
+    
+    # Execute on both GPUs
+    tasks = [gpu_task.remote(i) for i in range(2)]
+    results = ray.get(tasks)
+    
+    print(f"\nResults from 2 GPU tasks:")
+    for result in results:
+        print(f"  Task {result['task_id']}: GPU {result['gpu_ids']}, Tensor {result['shape']}")
+
+def step_3_actors_vs_tasks():
+    """Step 3: Understanding the difference between actors and tasks."""
+    print("\n" + "="*60)
+    print("🎭 STEP 3: ACTORS VS TASKS")
+    print("="*60)
+    
+    print("""
+Tasks vs Actors:
+
+TASKS (@ray.remote functions):
+- Stateless and lightweight
+- Good for simple computations
+- GPU allocated only during execution
+- No memory between calls
+
+ACTORS (@ray.remote classes):
+- Stateful workers with persistent memory
+- Good for complex workflows  
+- GPU held for the lifetime of the actor
+- Can maintain state between method calls
+""")
+    
+    @ray.remote(num_gpus=0.5)
+    class GPUActor:
+        def __init__(self):
+            self.gpu_ids = ray.get_gpu_ids()
+            self.device = torch.device("cuda")
+            self.counter = 0
+            
+        def process(self, data_size=500):
+            self.counter += 1
+            x = torch.randn(data_size, data_size, device=self.device)
+            y = torch.mm(x, x.T)
+            return {
+                "call_count": self.counter,
+                "gpu_ids": self.gpu_ids,
+                "result": torch.trace(y).item()
+            }
+    
+    print("Example: GPU Actor")
+    print("@ray.remote(num_gpus=0.5)")
+    print("class GPUActor:")
+    print("    def __init__(self):")
+    print("        self.gpu_ids = ray.get_gpu_ids()")
+    print("        self.device = torch.device('cuda')")
+    print("        self.counter = 0")
+    
+    # Create actors (4 actors, 2 per GPU with 0.5 GPU each)
+    actors = [GPUActor.remote() for _ in range(4)]
+    
+    # Call methods multiple times
+    futures = []
+    for actor in actors:
+        for _ in range(2):  # 2 calls per actor
+            futures.append(actor.process.remote())
+    
+    results = ray.get(futures)
+    
+    print(f"\nResults from {len(actors)} actors, each called twice:")
+    for i, result in enumerate(results):
+        print(f"  Call {i+1}: GPU {result['gpu_ids']}, Count: {result['call_count']}, Result: {result['result']:.2f}")
+
+def step_4_monitoring_resources():
+    """Step 4: Understanding resource monitoring."""
+    print("\n" + "="*60)
+    print("📊 STEP 4: MONITORING RESOURCES")
+    print("="*60)
+    
+    print("""
+Ray provides several ways to monitor resources:
+
+1. ray.cluster_resources() - Total resources in cluster
+2. ray.available_resources() - Currently available resources  
+3. ray.nodes() - Information about cluster nodes
+4. Ray Dashboard - Web UI for monitoring (http://localhost:8265)
+""")
+    
+    print("Current cluster state:")
+    print(f"  Total resources: {ray.cluster_resources()}")
+    print(f"  Available resources: {ray.available_resources()}")
+    
+    # Show how resources change during execution
+    @ray.remote(num_gpus=1)  
+    def blocking_gpu_task():
+        print(f"  📍 Task started on GPU {ray.get_gpu_ids()}")
+        time.sleep(3)  # Hold GPU for 3 seconds
+        return "done"
+    
+    print("\nWatching resources during task execution...")
+    print("Available before task:", ray.available_resources().get('GPU', 0))
+    
+    future = blocking_gpu_task.remote()
+    time.sleep(0.5)  # Give task time to start
+    print("Available during task:", ray.available_resources().get('GPU', 0))
+    
+    ray.get(future)
+    print("Available after task: ", ray.available_resources().get('GPU', 0))
+
+def interactive_learning_session():
+    """Run an interactive learning session."""
+    print("🎯 RAY GPU MANAGEMENT - INTERACTIVE LEARNING")
+    print("=" * 70)
+    
+    print("""
+Welcome to Ray GPU Management Learning!
+
+This script will teach you Ray concepts step by step.
+Each step builds on the previous one.
+
+You have 2 NVIDIA A100 GPUs available for learning.
+""")
+    
+    # Initialize Ray
+    print("🚀 Initializing Ray...")
+    ray.init(num_gpus=2)
+    print(f"✅ Ray initialized with resources: {ray.cluster_resources()}")
+    
+    try:
+        step_1_basic_concepts()
+        
+        input("\nPress Enter to continue to Step 2...")
+        step_2_gpu_resource_management()
+        
+        input("\nPress Enter to continue to Step 3...")
+        step_3_actors_vs_tasks()
+        
+        input("\nPress Enter to continue to Step 4...")
+        step_4_monitoring_resources()
+        
+        print("\n" + "="*70)
+        print("🎉 CONGRATULATIONS!")
+        print("="*70)
+        print("""
+You've learned the fundamentals of Ray GPU management:
+
+✅ Basic Ray concepts (remote functions, ray.get)
+✅ GPU resource allocation (full and fractional)  
+✅ Difference between tasks and actors
+✅ Resource monitoring and observation
+
+Next steps to continue learning:
+1. Run 'python ray_single_server_multi_gpu.py' for advanced patterns
+2. Run 'python ray_distributed_simulation.py' for distributed concepts
+3. Try the Ray dashboard at http://localhost:8265
+4. Explore Ray Tune for hyperparameter optimization
+5. Look into Ray Train for distributed training
+
+Happy learning! 🚀
+""")
+        
+    except KeyboardInterrupt:
+        print("\n\n👋 Learning session interrupted. Thanks for trying Ray!")
+    finally:
+        ray.shutdown()
+
+if __name__ == "__main__":
+    interactive_learning_session() 
\ No newline at end of file
diff --git a/ray_test/ray_scheduling_demo.py b/ray_test/ray_scheduling_demo.py
new file mode 100644
index 00000000..725eb9a2
--- /dev/null
+++ b/ray_test/ray_scheduling_demo.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+"""
+Ray GPU Scheduling Demo
+
+This demonstrates how Ray schedules tasks based on GPU resource availability.
+Key question: Can heavy_gpu_tasks (1.0 GPU) start when light_gpu_tasks (0.5 GPU each) are running?
+
+Answer: NO - Ray waits until sufficient resources are available.
+"""
+
+import ray
+import torch
+import time
+import os
+from datetime import datetime
+
+def timestamp():
+    """Get current timestamp for logging."""
+    return datetime.now().strftime("%H:%M:%S.%f")[:-3]
+
+@ray.remote(num_gpus=0.5)
+def light_gpu_task(task_id: int, duration: int = 10):
+    """Light task that uses 0.5 GPU and runs for specified duration."""
+    gpu_ids = ray.get_gpu_ids()
+    pid = os.getpid()
+    
+    print(f"[{timestamp()}] 🟡 Light task {task_id} STARTED (PID: {pid}, GPU: {gpu_ids})")
+    
+    # Create some GPU work
+    device = torch.device("cuda")
+    x = torch.randn(1000, 1000, device=device)
+    
+    # Simulate work for the specified duration
+    for i in range(duration):
+        x = torch.mm(x, x.T)
+        time.sleep(1)
+        if i % 3 == 0:  # Progress update every 3 seconds
+            print(f"[{timestamp()}] 🟡 Light task {task_id} working... ({i+1}/{duration}s)")
+    
+    print(f"[{timestamp()}] 🟡 Light task {task_id} FINISHED")
+    return f"Light task {task_id} completed"
+
+@ray.remote(num_gpus=1.0)
+def heavy_gpu_task(task_id: int, duration: int = 5):
+    """Heavy task that needs full GPU and runs for specified duration."""
+    gpu_ids = ray.get_gpu_ids()
+    pid = os.getpid()
+    
+    print(f"[{timestamp()}] 🔴 Heavy task {task_id} STARTED (PID: {pid}, GPU: {gpu_ids[0]})")
+    
+    # Create heavier GPU work
+    device = torch.device("cuda")
+    x = torch.randn(2000, 2000, device=device)
+    
+    # Simulate work
+    for i in range(duration):
+        x = torch.mm(x, x.T)
+        time.sleep(1)
+        print(f"[{timestamp()}] 🔴 Heavy task {task_id} working... ({i+1}/{duration}s)")
+    
+    print(f"[{timestamp()}] 🔴 Heavy task {task_id} FINISHED")
+    return f"Heavy task {task_id} completed"
+
+@ray.remote
+def resource_monitor():
+    """Monitor available resources."""
+    total = ray.cluster_resources()
+    available = ray.available_resources()
+    
+    return {
+        "timestamp": timestamp(),
+        "total_gpus": total.get("GPU", 0),
+        "available_gpus": available.get("GPU", 0),
+        "available_cpus": available.get("CPU", 0)
+    }
+
+def demonstrate_scheduling():
+    """Demonstrate Ray's scheduling behavior."""
+    
+    print("=" * 60)
+    print("RAY GPU SCHEDULING DEMONSTRATION")
+    print("=" * 60)
+    print()
+    
+    ray.init()
+    
+    # Check initial resources
+    initial_resources = ray.get(resource_monitor.remote())
+    print(f"Initial resources: {initial_resources}")
+    print()
+    
+    print("SCENARIO: Testing if heavy tasks can start while light tasks are running")
+    print("- Light tasks: 0.5 GPU each, 10 seconds duration")
+    print("- Heavy tasks: 1.0 GPU each, 5 seconds duration")
+    print("- With 2 GPUs: 4 light tasks should fill both GPUs (2 per GPU)")
+    print("- Heavy tasks should WAIT until light tasks finish")
+    print()
+    
+    # Launch tasks in specific order to demonstrate scheduling
+    print(f"[{timestamp()}] 🚀 Launching 4 light tasks (should fill both GPUs)...")
+    
+    light_tasks = []
+    for i in range(4):
+        task = light_gpu_task.remote(i, duration=10)
+        light_tasks.append(task)
+        time.sleep(0.5)  # Small delay to see launch order
+    
+    # Wait a moment for light tasks to start
+    time.sleep(2)
+    
+    # Check resources after light tasks start
+    mid_resources = ray.get(resource_monitor.remote())
+    print(f"[{timestamp()}] Resources after light tasks start: {mid_resources}")
+    print()
+    
+    # Now launch heavy tasks - these should be QUEUED
+    print(f"[{timestamp()}] 🚀 Launching 2 heavy tasks (should be QUEUED)...")
+    
+    heavy_tasks = []
+    for i in range(2):
+        task = heavy_gpu_task.remote(i, duration=5)
+        heavy_tasks.append(task)
+        time.sleep(0.5)
+    
+    print()
+    print("⏳ OBSERVATION: Heavy tasks will wait until sufficient GPU resources are free!")
+    print("   - Each light task uses 0.5 GPU")
+    print("   - Each heavy task needs 1.0 GPU")
+    print("   - Heavy tasks must wait for 2 light tasks to finish to get 1.0 GPU")
+    print()
+    
+    # Monitor resources periodically
+    for i in range(3):
+        time.sleep(3)
+        current_resources = ray.get(resource_monitor.remote())
+        print(f"[{timestamp()}] Current resources: {current_resources}")
+    
+    # Wait for all tasks to complete
+    print(f"\n[{timestamp()}] ⏳ Waiting for all tasks to complete...")
+    
+    light_results = ray.get(light_tasks)
+    heavy_results = ray.get(heavy_tasks)
+    
+    print(f"\n[{timestamp()}] ✅ All tasks completed!")
+    print("Light task results:", light_results)
+    print("Heavy task results:", heavy_results)
+    
+    # Final resource check
+    final_resources = ray.get(resource_monitor.remote())
+    print(f"Final resources: {final_resources}")
+    
+    ray.shutdown()
+
+def explain_scheduling():
+    """Explain Ray's scheduling algorithm."""
+    print("\n" + "=" * 60)
+    print("RAY SCHEDULING EXPLAINED")
+    print("=" * 60)
+    print("""
+Ray's resource scheduler works like this:
+
+1. RESOURCE TRACKING:
+   - Ray tracks total and available resources (GPUs, CPUs, memory)
+   - Each task declares its resource requirements (@ray.remote(num_gpus=X))
+
+2. TASK QUEUE:
+   - Tasks are queued when submitted with .remote()
+   - Ray maintains a queue of pending tasks
+
+3. SCHEDULING DECISIONS:
+   - Ray checks if enough resources are available for each queued task
+   - Tasks only start when their FULL resource requirements can be met
+   - No partial allocation - if task needs 1.0 GPU, it waits for 1.0 GPU
+
+4. FRACTIONAL RESOURCES:
+   - 0.5 GPU tasks: 2 can run on same physical GPU
+   - 1.0 GPU tasks: Need exclusive access to 1 physical GPU
+   - If 2×0.5 GPU tasks are running, 1.0 GPU task must WAIT
+
+5. SCHEDULING ORDER:
+   - Generally FIFO (first-in-first-out)
+   - But resource availability affects actual execution order
+   - Tasks with available resources start first
+
+KEY INSIGHT: 
+Heavy tasks (1.0 GPU) CANNOT start while light tasks (0.5 GPU each) 
+occupy all GPU resources, even if the physical GPU isn't fully utilized.
+
+This ensures predictable resource allocation and prevents resource conflicts.
+""")
+
+if __name__ == "__main__":
+    demonstrate_scheduling()
+    explain_scheduling() 
\ No newline at end of file
diff --git a/ray_test/ray_single_server_multi_gpu.py b/ray_test/ray_single_server_multi_gpu.py
new file mode 100644
index 00000000..8c6c2e0c
--- /dev/null
+++ b/ray_test/ray_single_server_multi_gpu.py
@@ -0,0 +1,247 @@
+#!/usr/bin/env python3
+"""
+Ray Single Server Multi-GPU Example
+
+This example demonstrates Ray GPU management on a single server with multiple GPUs.
+Shows various patterns: full GPU allocation, fractional allocation, and mixed workloads.
+"""
+
+import ray
+import torch
+import time
+import numpy as np
+from typing import List, Dict, Any
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+@ray.remote(num_gpus=1)
+class GPUWorker:
+    """A Ray actor that holds a full GPU for the duration of its lifetime."""
+    
+    def __init__(self, worker_id: int):
+        self.worker_id = worker_id
+        self.gpu_ids = ray.get_gpu_ids()
+        self.device = torch.device("cuda")
+        logger.info(f"Worker {worker_id} initialized on GPU {self.gpu_ids}")
+    
+    def matrix_multiply(self, size: int = 2000, iterations: int = 5) -> Dict[str, Any]:
+        """Perform matrix multiplication to simulate GPU work."""
+        start_time = time.time()
+        
+        # Create random matrices on GPU
+        A = torch.randn(size, size, device=self.device)
+        B = torch.randn(size, size, device=self.device)
+        
+        results = []
+        for i in range(iterations):
+            C = torch.mm(A, B)
+            results.append(torch.trace(C).item())
+            
+        end_time = time.time()
+        
+        return {
+            "worker_id": self.worker_id,
+            "gpu_ids": self.gpu_ids,
+            "execution_time": end_time - start_time,
+            "results": results[:3],  # Just first 3 for brevity
+            "tensor_shape": list(C.shape)
+        }
+    
+    def get_gpu_memory_usage(self) -> Dict[str, float]:
+        """Get current GPU memory usage."""
+        if torch.cuda.is_available():
+            gpu_id = self.gpu_ids[0]
+            allocated = torch.cuda.memory_allocated(gpu_id) / 1024**3  # GB
+            cached = torch.cuda.memory_reserved(gpu_id) / 1024**3  # GB
+            return {
+                "gpu_id": gpu_id,
+                "allocated_gb": allocated,
+                "cached_gb": cached
+            }
+        return {}
+
+@ray.remote(num_gpus=0.5)
+def lightweight_gpu_task(task_id: int, work_size: int = 1000) -> Dict[str, Any]:
+    """A task that uses half a GPU - allows 2 tasks per GPU."""
+    start_time = time.time()
+    gpu_ids = ray.get_gpu_ids()
+    
+    device = torch.device("cuda")
+    x = torch.randn(work_size, work_size, device=device)
+    
+    # Simulate some computation
+    for _ in range(3):
+        x = torch.relu(x @ x.T)
+    
+    end_time = time.time()
+    
+    return {
+        "task_id": task_id,
+        "gpu_ids": gpu_ids,
+        "execution_time": end_time - start_time,
+        "final_mean": x.mean().item()
+    }
+
+@ray.remote(num_cpus=1)
+def cpu_task(task_id: int) -> Dict[str, Any]:
+    """A CPU-only task to demonstrate mixed workloads."""
+    start_time = time.time()
+    
+    # CPU computation
+    result = np.sum(np.random.randn(1000, 1000) ** 2)
+    time.sleep(1)  # Simulate work
+    
+    end_time = time.time()
+    
+    return {
+        "task_id": task_id,
+        "execution_time": end_time - start_time,
+        "result": result,
+        "resource_type": "CPU"
+    }
+
+def print_resources():
+    """Print current Ray cluster resources."""
+    print("\n" + "="*50)
+    print("RAY CLUSTER RESOURCES")
+    print("="*50)
+    print(f"Total resources: {ray.cluster_resources()}")
+    print(f"Available resources: {ray.available_resources()}")
+    print("="*50)
+
+def demo_gpu_actors():
+    """Demonstrate GPU actors (long-lived GPU workers)."""
+    print("\n🚀 DEMO 1: GPU Actors (Long-lived Workers)")
+    print("-" * 50)
+    
+    # Create 2 GPU workers (one per GPU)
+    workers = [GPUWorker.remote(i) for i in range(2)]
+    
+    # Submit work to both workers
+    futures = []
+    for i, worker in enumerate(workers):
+        future = worker.matrix_multiply.remote(size=1500, iterations=3)
+        futures.append(future)
+    
+    print("Submitted work to GPU actors...")
+    results = ray.get(futures)
+    
+    for result in results:
+        print(f"  Worker {result['worker_id']}: GPU {result['gpu_ids']}, "
+              f"Time: {result['execution_time']:.2f}s")
+    
+    # Check memory usage
+    memory_futures = [worker.get_gpu_memory_usage.remote() for worker in workers]
+    memory_results = ray.get(memory_futures)
+    
+    for mem in memory_results:
+        print(f"  GPU {mem['gpu_id']}: {mem['allocated_gb']:.2f}GB allocated, "
+              f"{mem['cached_gb']:.2f}GB cached")
+    
+    return workers
+
+def demo_fractional_gpu():
+    """Demonstrate fractional GPU allocation."""
+    print("\n🔄 DEMO 2: Fractional GPU Tasks (0.5 GPU each)")
+    print("-" * 50)
+    
+    # Launch 4 tasks with 0.5 GPU each (2 per GPU)
+    tasks = [lightweight_gpu_task.remote(i, work_size=800) for i in range(4)]
+    
+    print("Submitted 4 tasks with 0.5 GPU each...")
+    results = ray.get(tasks)
+    
+    for result in results:
+        print(f"  Task {result['task_id']}: GPU {result['gpu_ids']}, "
+              f"Time: {result['execution_time']:.2f}s")
+
+def demo_mixed_workload():
+    """Demonstrate mixed CPU and GPU workloads."""
+    print("\n🔀 DEMO 3: Mixed CPU and GPU Workloads")
+    print("-" * 50)
+    
+    # Mix of CPU and GPU tasks
+    cpu_tasks = [cpu_task.remote(i) for i in range(3)]
+    gpu_tasks = [lightweight_gpu_task.remote(i+10, work_size=600) for i in range(3)]
+    
+    all_tasks = cpu_tasks + gpu_tasks
+    print(f"Submitted {len(cpu_tasks)} CPU tasks and {len(gpu_tasks)} GPU tasks...")
+    
+    start_time = time.time()
+    results = ray.get(all_tasks)
+    total_time = time.time() - start_time
+    
+    print(f"All tasks completed in {total_time:.2f}s")
+    
+    # Separate results
+    cpu_results = [r for r in results if r.get('resource_type') == 'CPU']
+    gpu_results = [r for r in results if 'gpu_ids' in r]
+    
+    print(f"  CPU tasks: {len(cpu_results)} completed")
+    print(f"  GPU tasks: {len(gpu_results)} completed")
+
+def demo_dynamic_scheduling():
+    """Demonstrate dynamic task scheduling based on resource availability."""
+    print("\n⚡ DEMO 4: Dynamic Scheduling")
+    print("-" * 50)
+    
+    # Submit tasks gradually and monitor resource usage
+    completed_tasks = []
+    pending_tasks = []
+    
+    for i in range(8):
+        task = lightweight_gpu_task.remote(i, work_size=500)
+        pending_tasks.append(task)
+        
+        # Check if we should wait for some tasks to complete
+        if len(pending_tasks) >= 4:  # Don't overwhelm the queue
+            # Wait for at least one task to complete
+            ready, pending_tasks = ray.wait(pending_tasks, num_returns=1)
+            completed_tasks.extend(ray.get(ready))
+            print(f"  Completed {len(completed_tasks)} tasks, "
+                  f"{len(pending_tasks)} still pending")
+    
+    # Wait for remaining tasks
+    if pending_tasks:
+        completed_tasks.extend(ray.get(pending_tasks))
+    
+    print(f"Dynamic scheduling completed: {len(completed_tasks)} total tasks")
+
+def main():
+    """Main function demonstrating various Ray GPU patterns."""
+    print("🎯 Ray Single Server Multi-GPU Demo")
+    print("=" * 60)
+    
+    # Initialize Ray
+    ray.init(num_gpus=2)  # Explicitly specify 2 GPUs
+    
+    print_resources()
+    
+    # Run all demos
+    workers = demo_gpu_actors()
+    print_resources()
+    
+    demo_fractional_gpu()
+    print_resources()
+    
+    demo_mixed_workload()
+    print_resources()
+    
+    demo_dynamic_scheduling()
+    print_resources()
+    
+    print("\n🎉 All demos completed!")
+    print("\nKey Takeaways:")
+    print("1. GPU Actors: Long-lived workers for persistent GPU allocation")
+    print("2. Fractional GPUs: Share GPUs between multiple light tasks")
+    print("3. Mixed Workloads: Combine CPU and GPU tasks efficiently")
+    print("4. Dynamic Scheduling: Adapt to resource availability")
+    
+    # Cleanup
+    ray.shutdown()
+
+if __name__ == "__main__":
+    main() 
\ No newline at end of file
diff --git a/ray_test/test_ray.py b/ray_test/test_ray.py
new file mode 100644
index 00000000..31655796
--- /dev/null
+++ b/ray_test/test_ray.py
@@ -0,0 +1,77 @@
+import ray
+import torch
+import time
+
+
+# =================================================================
+# 1. THE RAY TASK: A function that will run on a single GPU
+# =================================================================
+# The decorator is the key: it tells Ray this task requires 1 GPU.
+@ray.remote(num_gpus=1)
+def use_gpu_task(task_id: int):
+    """
+    A simple Ray task that simulates work on a GPU.
+    """
+    # Ray automatically sets the CUDA_VISIBLE_DEVICES environment variable
+    # for this worker process, so torch.cuda.current_device() will
+    # correspond to the GPU Ray assigned.
+    
+    # Let's get the physical GPU ID that Ray assigned to this task.
+    gpu_ids = ray.get_gpu_ids()
+    physical_gpu_id = gpu_ids[0]
+
+    print(f"-> Task {task_id} starting. Ray assigned me physical GPU: {physical_gpu_id}")
+
+    # Create a tensor and move it to the assigned GPU.
+    # PyTorch will only see the single GPU that Ray allocated.
+    device = torch.device("cuda")
+    tensor = torch.randn(2000, 2000, device=device)
+
+    # Perform some work to make the GPU busy.
+    for i in range(5):
+        tensor = tensor @ tensor
+        time.sleep(0.5) # Sleep to make it easier to see in nvidia-smi
+        print(f"   Task {task_id}, iteration {i+1}, on device: {tensor.device}")
+
+    print(f"<- Task {task_id} finished on GPU {physical_gpu_id}.")
+    
+    # Return the ID of the GPU we used.
+    return f"Task {task_id} ran on GPU {physical_gpu_id}"
+
+
+# =================================================================
+# 2. MAIN SCRIPT: Initialize Ray and launch the tasks
+# =================================================================
+if __name__ == "__main__":
+    # Start Ray. Ray will automatically detect the 2 GPUs.
+    # You could also be explicit: ray.init(num_gpus=2)
+    ray.init()
+
+    print("Ray Initialized.")
+    print("Cluster resources:", ray.cluster_resources())
+
+    # Verify that Ray sees our GPUs
+    if ray.cluster_resources().get("GPU", 0) < 2:
+        print("!!! WARNING: Ray did not detect 2 GPUs. Exiting.")
+        ray.shutdown()
+        exit()
+
+    # We have 2 GPUs, so let's launch 4 tasks.
+    # Ray will run 2 tasks concurrently, and queue the other 2
+    # until the first ones finish.
+    print("\nLaunching 4 GPU tasks on 2 available GPUs...")
+    task_refs = []
+    for i in range(4):
+        # .remote() immediately returns a future (a reference to the result)
+        # and executes the task in the background.
+        ref = use_gpu_task.remote(i)
+        task_refs.append(ref)
+
+    # Block until all tasks are complete and get the results.
+    results = ray.get(task_refs)
+
+    print("\n--- All tasks completed! ---")
+    print("Results:", results)
+
+    # Shut down Ray
+    ray.shutdown()
\ No newline at end of file
diff --git a/ray_test/test_ray_distributed.py b/ray_test/test_ray_distributed.py
new file mode 100644
index 00000000..e7f027cc
--- /dev/null
+++ b/ray_test/test_ray_distributed.py
@@ -0,0 +1,61 @@
+import ray
+import torch
+import time
+import socket # Import socket to see which node we're on
+
+# The task definition is IDENTICAL to the previous example.
+# No changes are needed here.
+@ray.remote(num_gpus=1)
+def use_gpu_task(task_id: int):
+    gpu_ids = ray.get_gpu_ids()
+    physical_gpu_id = gpu_ids[0]
+    
+    # Let's also get the hostname of the node Ray scheduled us on.
+    # In this simulation, it will be the same hostname, but Ray
+    # internally treats them as distinct nodes.
+    node_id = ray.get_runtime_context().get_node_id()
+    
+    print(
+        f"-> Task {task_id} starting."
+        f" Ray assigned me physical GPU: {physical_gpu_id}"
+        f" on Node ID: {node_id}"
+    )
+
+    device = torch.device("cuda")
+    tensor = torch.randn(2000, 2000, device=device)
+
+    for i in range(5):
+        tensor = tensor @ tensor
+        time.sleep(0.5)
+
+    print(f"<- Task {task_id} finished on GPU {physical_gpu_id}.")
+    return f"Task {task_id} ran on GPU {physical_gpu_id} on Node {node_id}"
+
+
+# =================================================================
+# MAIN SCRIPT
+# =================================================================
+if __name__ == "__main__":
+    # CRITICAL CHANGE: Connect to the existing Ray cluster.
+    # 'auto' tells Ray to find the running cluster from environment variables
+    # that `ray start` sets up.
+    ray.init()
+
+    print("Python script connected to Ray Cluster.")
+    print("Cluster resources:", ray.cluster_resources())
+
+    # The rest of the logic is the same.
+    print("\nLaunching 4 GPU tasks on our 2-node, 2-GPU cluster...")
+    task_refs = []
+    for i in range(4):
+        ref = use_gpu_task.remote(i)
+        task_refs.append(ref)
+
+    results = ray.get(task_refs)
+
+    print("\n--- All tasks completed! ---")
+    print("Results:", results)
+
+    # We don't call ray.shutdown() here, because we want to leave the
+    # cluster running. We will stop it manually from the terminal.
+    print("\nScript finished. The Ray cluster is still running.")
\ No newline at end of file
diff --git a/ray_test/test_ray_init.py b/ray_test/test_ray_init.py
new file mode 100644
index 00000000..53ba10a0
--- /dev/null
+++ b/ray_test/test_ray_init.py
@@ -0,0 +1,23 @@
+import ray
+import subprocess
+
+
+def ray_start():
+    subprocess.run(['ray', 'start', '--head', '--port=6379'], check=True)
+
+
+def ray_stop():
+    subprocess.run(['ray', 'stop'], check=True)
+
+def ray_init():
+    ray.init()
+
+
+def ray_shutdown():
+    ray.shutdown()
+
+if __name__ == '__main__':
+    ray_start()
+    ray_init()
+    ray_shutdown()
+    ray_stop()
diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
new file mode 100644
index 00000000..699b001c
--- /dev/null
+++ b/ray_test/test_torch_ray_distributed.py
@@ -0,0 +1,124 @@
+import ray
+import torch
+import torch.distributed as dist
+import os
+import subprocess
+import socket
+import time
+
+from datetime import timedelta
+
+def ray_noset_visible_devices():
+    return os.environ.get('RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES', '0') == '1'
+
+
+def get_ranks():
+    # get envs set by torchrun
+    world_size = os.environ.get('WORLD_SIZE', None)
+    rank = os.environ.get('RANK', None)
+    local_rank = os.environ.get('LOCAL_RANK', None)
+    node_rank = os.environ.get('NODE_RANK', None)
+    master_addr = os.environ.get('MASTER_ADDR', None)
+    master_port = os.environ.get('MASTER_PORT', None)
+
+    return world_size, rank, local_rank, node_rank, master_addr, master_port
+
+
+def init_ray(rank: str):
+    dist.init_process_group(backend='gloo')
+
+    # init ray on master node, rank 0
+    if rank == '0':
+        subprocess.run(['ray', 'start', '--head', '--port=6379', '--num-gpus=1'], check=True)
+        ray.init(address='auto')
+        # get existing ray ip and port
+        ctx = ray.get_runtime_context()
+        address = ctx.gcs_address
+        print(f'available gpus: {ray.available_resources()}')
+
+    else:
+        address = None
+    address_list = [address]
+    # broadcast address to all other ranks
+    dist.broadcast_object_list(address_list, src=0)
+    if rank != '0':
+        address = address_list[0]
+        subprocess.run(['ray', 'start', f'--address={address}', '--num-gpus=1'], check=True)
+        ray.init(address='auto')
+    # wait until num of gpus reach world_size
+    while ray.cluster_resources().get('GPU', 0) < dist.get_world_size():
+        time.sleep(1)
+    if rank == '0':
+        print(f'available gpus: {ray.available_resources()}')
+    else:
+        ray.shutdown()
+    dist.destroy_process_group()
+    return address
+
+
+@ray.remote(num_gpus=1)
+def simple_gpu_task(master_addr: str, master_port: int, rank: int, node_rank: int, world_size: int):
+    """A minimal GPU task that just creates a tensor and does basic operations."""
+    os.environ["MASTER_ADDR"] = master_addr
+    os.environ["MASTER_PORT"] = str(master_port)
+    os.environ["WORLD_SIZE"] = str(world_size)
+    os.environ["RANK"] = str(rank)
+    os.environ["NODE_RANK"] = str(node_rank)
+    # # NOTE: Ray will automatically set the *_VISIBLE_DEVICES
+    # # environment variable for each actor, unless
+    # # RAY_EXPERIMENTAL_NOSET_*_VISIBLE_DEVICES is set, so
+    # # set local rank to 0 when the flag is not applicable.
+    # # os.environ["LOCAL_RANK"] = str(ray.get_gpu_ids()[0]) if ray_noset_visible_devices() else "0"
+    os.environ["LOCAL_RANK"] = str(rank)
+    # # number of visible devices
+    num_visible_devices = torch.cuda.device_count()
+    print(f'num_visible_devices: {num_visible_devices}')
+    print('ray run init envs:')
+    world_size, rank, local_rank, node_rank, master_addr, master_port = get_ranks()
+    print(f'rank: {rank}')
+    print(f'node_rank: {node_rank}')
+    print(f'world_size: {world_size}')
+    print(f'local_rank: {local_rank}')
+    print(f'master_addr: {master_addr}')
+    print(f'master_port: {master_port}')
+    # init_process_group(backend='nccl', init_method=f'tcp://{master_addr}:{master_port}', world_size=int(world_size), rank=int(rank), timeout=timedelta(seconds=10))
+    dist.init_process_group(timeout=timedelta(seconds=10))
+    print(f'is distributed initialized: {dist.is_initialized()}')
+
+    # Create a tensor on the GPU
+    device = torch.device(f"cuda:{local_rank}")
+    x = torch.randn(1000, 1000, device=device).sum()
+    dist.all_reduce(x)
+    dist.destroy_process_group()
+    return x.item()
+
+
+if __name__ == '__main__':
+    world_size, rank, local_rank, node_rank, master_addr, master_port = get_ranks()
+    print('torch run init envs:')
+    print(f'world_size: {world_size}')
+    print(f'rank: {rank}')
+    print(f'local_rank: {local_rank}')
+    print(f'node_rank: {node_rank}')
+    print(f'master_addr: {master_addr}')
+    print(f'master_port: {master_port}')
+    address = init_ray(rank)
+    if rank == '0':
+        try:
+            master_addr, _ = address.split(':')
+            # if I uncomment this, dist.init_process_group will timeout
+            # with socket.socket() as sock:
+            #     sock.bind(("", 0))
+            #     master_port = sock.getsockname()[1]
+
+            print(f"\n=== STARTING DISTRIBUTED TRAINING ===")
+            tasks = [simple_gpu_task.remote(master_addr, master_port, rank, node_rank, world_size) for rank in range(int(world_size))]
+            results = ray.get(tasks)
+            print(results)
+            ray.shutdown()
+            subprocess.run(['ray', 'stop'], check=True)
+        except Exception as e:
+            print(f'Error: {e}')
+        finally:
+            ray.shutdown()
+            subprocess.run(['ray', 'stop'], check=True)

From b0b57d90c64f8d6beec358cdeb3e72a652e29478 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 25 Jun 2025 09:24:56 +0000
Subject: [PATCH 002/107] hack local rank

---
 ray_test/test_torch_ray_distributed.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 699b001c..7924e5c1 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -64,12 +64,17 @@ def simple_gpu_task(master_addr: str, master_port: int, rank: int, node_rank: in
     os.environ["WORLD_SIZE"] = str(world_size)
     os.environ["RANK"] = str(rank)
     os.environ["NODE_RANK"] = str(node_rank)
+    local_rank = rank % 8
+    os.environ["LOCAL_RANK"] = str(local_rank)
+    os.environ["CUDA_VISIBLE_DEVICES"] = str(local_rank)
     # # NOTE: Ray will automatically set the *_VISIBLE_DEVICES
     # # environment variable for each actor, unless
     # # RAY_EXPERIMENTAL_NOSET_*_VISIBLE_DEVICES is set, so
     # # set local rank to 0 when the flag is not applicable.
-    # # os.environ["LOCAL_RANK"] = str(ray.get_gpu_ids()[0]) if ray_noset_visible_devices() else "0"
-    os.environ["LOCAL_RANK"] = str(rank)
+    # print(f'CUDA_VISIBLE_DEVICES: {os.environ["CUDA_VISIBLE_DEVICES"]}')
+    # print(f'ray.get_gpu_ids(): {ray.get_gpu_ids()}')
+    # os.environ["LOCAL_RANK"] = str(ray.get_gpu_ids()[0]) if ray_noset_visible_devices() else "0"
+
     # # number of visible devices
     num_visible_devices = torch.cuda.device_count()
     print(f'num_visible_devices: {num_visible_devices}')
@@ -86,7 +91,7 @@ def simple_gpu_task(master_addr: str, master_port: int, rank: int, node_rank: in
     print(f'is distributed initialized: {dist.is_initialized()}')
 
     # Create a tensor on the GPU
-    device = torch.device(f"cuda:{local_rank}")
+    device = torch.device(f"cuda")
     x = torch.randn(1000, 1000, device=device).sum()
     dist.all_reduce(x)
     dist.destroy_process_group()

From 9e70dbba557843a3ff6af3269bc30a810718a204 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 25 Jun 2025 17:27:27 +0000
Subject: [PATCH 003/107] composer launch works; torchrun somehow only allows
 using the same init port

---
 ray_test/test_torch_ray_distributed.py | 36 ++++++++++++++------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 7924e5c1..86622faf 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -14,12 +14,12 @@ def ray_noset_visible_devices():
 
 def get_ranks():
     # get envs set by torchrun
-    world_size = os.environ.get('WORLD_SIZE', None)
-    rank = os.environ.get('RANK', None)
-    local_rank = os.environ.get('LOCAL_RANK', None)
-    node_rank = os.environ.get('NODE_RANK', None)
-    master_addr = os.environ.get('MASTER_ADDR', None)
-    master_port = os.environ.get('MASTER_PORT', None)
+    world_size = os.environ.get('WORLD_SIZE', '1')
+    rank = os.environ.get('RANK', '0')
+    local_rank = os.environ.get('LOCAL_RANK', '0')
+    node_rank = os.environ.get('NODE_RANK', '0')
+    master_addr = os.environ.get('MASTER_ADDR', '127.0.0.1')
+    master_port = os.environ.get('MASTER_PORT', '8265')
 
     return world_size, rank, local_rank, node_rank, master_addr, master_port
 
@@ -29,7 +29,8 @@ def init_ray(rank: str):
 
     # init ray on master node, rank 0
     if rank == '0':
-        subprocess.run(['ray', 'start', '--head', '--port=6379', '--num-gpus=1'], check=True)
+        # subprocess.run(['ray', 'start', '--head', '--port=6379', '--num-gpus=1'], check=True)
+        subprocess.run(['ray', 'start', '--head', '--port=6379'], check=True)
         ray.init(address='auto')
         # get existing ray ip and port
         ctx = ray.get_runtime_context()
@@ -43,7 +44,8 @@ def init_ray(rank: str):
     dist.broadcast_object_list(address_list, src=0)
     if rank != '0':
         address = address_list[0]
-        subprocess.run(['ray', 'start', f'--address={address}', '--num-gpus=1'], check=True)
+        # subprocess.run(['ray', 'start', f'--address={address}', '--num-gpus=1'], check=True)
+        subprocess.run(['ray', 'start', f'--address={address}'], check=True)
         ray.init(address='auto')
     # wait until num of gpus reach world_size
     while ray.cluster_resources().get('GPU', 0) < dist.get_world_size():
@@ -64,16 +66,16 @@ def simple_gpu_task(master_addr: str, master_port: int, rank: int, node_rank: in
     os.environ["WORLD_SIZE"] = str(world_size)
     os.environ["RANK"] = str(rank)
     os.environ["NODE_RANK"] = str(node_rank)
-    local_rank = rank % 8
-    os.environ["LOCAL_RANK"] = str(local_rank)
-    os.environ["CUDA_VISIBLE_DEVICES"] = str(local_rank)
+    # local_rank = rank % 8
+    # os.environ["LOCAL_RANK"] = str(local_rank)
+    # os.environ["CUDA_VISIBLE_DEVICES"] = str(local_rank)
     # # NOTE: Ray will automatically set the *_VISIBLE_DEVICES
     # # environment variable for each actor, unless
     # # RAY_EXPERIMENTAL_NOSET_*_VISIBLE_DEVICES is set, so
     # # set local rank to 0 when the flag is not applicable.
-    # print(f'CUDA_VISIBLE_DEVICES: {os.environ["CUDA_VISIBLE_DEVICES"]}')
-    # print(f'ray.get_gpu_ids(): {ray.get_gpu_ids()}')
-    # os.environ["LOCAL_RANK"] = str(ray.get_gpu_ids()[0]) if ray_noset_visible_devices() else "0"
+    print(f'CUDA_VISIBLE_DEVICES: {os.environ["CUDA_VISIBLE_DEVICES"]}')
+    print(f'ray.get_gpu_ids(): {ray.get_gpu_ids()}')
+    os.environ["LOCAL_RANK"] = str(ray.get_gpu_ids()[0]) if ray_noset_visible_devices() else "0"
 
     # # number of visible devices
     num_visible_devices = torch.cuda.device_count()
@@ -112,9 +114,9 @@ def simple_gpu_task(master_addr: str, master_port: int, rank: int, node_rank: in
         try:
             master_addr, _ = address.split(':')
             # if I uncomment this, dist.init_process_group will timeout
-            # with socket.socket() as sock:
-            #     sock.bind(("", 0))
-            #     master_port = sock.getsockname()[1]
+            with socket.socket() as sock:
+                sock.bind(("", 0))
+                master_port = sock.getsockname()[1]
 
             print(f"\n=== STARTING DISTRIBUTED TRAINING ===")
             tasks = [simple_gpu_task.remote(master_addr, master_port, rank, node_rank, world_size) for rank in range(int(world_size))]

From bc72f3e2f15131a7c3000dcbe3c1ed836c60dcf4 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 25 Jun 2025 20:27:26 +0000
Subject: [PATCH 004/107] clean up

---
 ray_test/test_torch_ray_distributed.py | 69 +++++++++++---------------
 1 file changed, 30 insertions(+), 39 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 86622faf..d5054922 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -14,25 +14,26 @@ def ray_noset_visible_devices():
 
 def get_ranks():
     # get envs set by torchrun
-    world_size = os.environ.get('WORLD_SIZE', '1')
-    rank = os.environ.get('RANK', '0')
-    local_rank = os.environ.get('LOCAL_RANK', '0')
-    node_rank = os.environ.get('NODE_RANK', '0')
+    world_size = int(os.environ.get('WORLD_SIZE', '1'))
+    rank = int(os.environ.get('RANK', '0'))
+    local_rank = int(os.environ.get('LOCAL_RANK', '0'))
+    node_rank = int(os.environ.get('NODE_RANK', '0'))
     master_addr = os.environ.get('MASTER_ADDR', '127.0.0.1')
-    master_port = os.environ.get('MASTER_PORT', '8265')
+    master_port = int(os.environ.get('MASTER_PORT', '8265'))
 
     return world_size, rank, local_rank, node_rank, master_addr, master_port
 
 
-def init_ray(rank: str):
+def init_ray():
+    _, rank, local_rank, *_ = get_ranks()
     dist.init_process_group(backend='gloo')
 
     # init ray on master node, rank 0
-    if rank == '0':
+    if rank == 0:
         # subprocess.run(['ray', 'start', '--head', '--port=6379', '--num-gpus=1'], check=True)
         subprocess.run(['ray', 'start', '--head', '--port=6379'], check=True)
         ray.init(address='auto')
-        # get existing ray ip and port
+        # get existing ray ip and port 
         ctx = ray.get_runtime_context()
         address = ctx.gcs_address
         print(f'available gpus: {ray.available_resources()}')
@@ -42,42 +43,40 @@ def init_ray(rank: str):
     address_list = [address]
     # broadcast address to all other ranks
     dist.broadcast_object_list(address_list, src=0)
-    if rank != '0':
+    if rank != 0 and local_rank == 0:
         address = address_list[0]
         # subprocess.run(['ray', 'start', f'--address={address}', '--num-gpus=1'], check=True)
         subprocess.run(['ray', 'start', f'--address={address}'], check=True)
-        ray.init(address='auto')
-    # wait until num of gpus reach world_size
-    while ray.cluster_resources().get('GPU', 0) < dist.get_world_size():
-        time.sleep(1)
-    if rank == '0':
-        print(f'available gpus: {ray.available_resources()}')
-    else:
-        ray.shutdown()
+    if rank == 0:
+        # wait until num of gpus reach world_size
+        cluster_gpus = ray.cluster_resources().get('GPU', 0)
+        while cluster_gpus < dist.get_world_size():
+            print(f'waiting for {dist.get_world_size() - cluster_gpus} gpus to be available')
+            time.sleep(1)
+        print(f'Total available gpus: {ray.available_resources()}')
     dist.destroy_process_group()
     return address
 
 
 @ray.remote(num_gpus=1)
-def simple_gpu_task(master_addr: str, master_port: int, rank: int, node_rank: int, world_size: int):
+def simple_gpu_task(master_addr: str, master_port: int, rank: int, world_size: int):
     """A minimal GPU task that just creates a tensor and does basic operations."""
     os.environ["MASTER_ADDR"] = master_addr
     os.environ["MASTER_PORT"] = str(master_port)
     os.environ["WORLD_SIZE"] = str(world_size)
     os.environ["RANK"] = str(rank)
-    os.environ["NODE_RANK"] = str(node_rank)
     # local_rank = rank % 8
     # os.environ["LOCAL_RANK"] = str(local_rank)
     # os.environ["CUDA_VISIBLE_DEVICES"] = str(local_rank)
-    # # NOTE: Ray will automatically set the *_VISIBLE_DEVICES
-    # # environment variable for each actor, unless
-    # # RAY_EXPERIMENTAL_NOSET_*_VISIBLE_DEVICES is set, so
-    # # set local rank to 0 when the flag is not applicable.
-    print(f'CUDA_VISIBLE_DEVICES: {os.environ["CUDA_VISIBLE_DEVICES"]}')
-    print(f'ray.get_gpu_ids(): {ray.get_gpu_ids()}')
+    # NOTE: Ray will automatically set the *_VISIBLE_DEVICES
+    # environment variable for each actor, unless
+    # RAY_EXPERIMENTAL_NOSET_*_VISIBLE_DEVICES is set, so
+    # set local rank to 0 when the flag is not applicable.
+    # print(f'CUDA_VISIBLE_DEVICES: {os.environ["CUDA_VISIBLE_DEVICES"]}')
+    # print(f'ray.get_gpu_ids(): {ray.get_gpu_ids()}')
     os.environ["LOCAL_RANK"] = str(ray.get_gpu_ids()[0]) if ray_noset_visible_devices() else "0"
 
-    # # number of visible devices
+    # number of visible devices
     num_visible_devices = torch.cuda.device_count()
     print(f'num_visible_devices: {num_visible_devices}')
     print('ray run init envs:')
@@ -88,29 +87,21 @@ def simple_gpu_task(master_addr: str, master_port: int, rank: int, node_rank: in
     print(f'local_rank: {local_rank}')
     print(f'master_addr: {master_addr}')
     print(f'master_port: {master_port}')
-    # init_process_group(backend='nccl', init_method=f'tcp://{master_addr}:{master_port}', world_size=int(world_size), rank=int(rank), timeout=timedelta(seconds=10))
     dist.init_process_group(timeout=timedelta(seconds=10))
     print(f'is distributed initialized: {dist.is_initialized()}')
 
     # Create a tensor on the GPU
     device = torch.device(f"cuda")
-    x = torch.randn(1000, 1000, device=device).sum()
+    x = torch.ones(1, device=device)
     dist.all_reduce(x)
     dist.destroy_process_group()
     return x.item()
 
 
 if __name__ == '__main__':
-    world_size, rank, local_rank, node_rank, master_addr, master_port = get_ranks()
-    print('torch run init envs:')
-    print(f'world_size: {world_size}')
-    print(f'rank: {rank}')
-    print(f'local_rank: {local_rank}')
-    print(f'node_rank: {node_rank}')
-    print(f'master_addr: {master_addr}')
-    print(f'master_port: {master_port}')
-    address = init_ray(rank)
-    if rank == '0':
+    world_size, rank, *_ = get_ranks()
+    address = init_ray()
+    if rank == 0:
         try:
             master_addr, _ = address.split(':')
             # if I uncomment this, dist.init_process_group will timeout
@@ -119,7 +110,7 @@ def simple_gpu_task(master_addr: str, master_port: int, rank: int, node_rank: in
                 master_port = sock.getsockname()[1]
 
             print(f"\n=== STARTING DISTRIBUTED TRAINING ===")
-            tasks = [simple_gpu_task.remote(master_addr, master_port, rank, node_rank, world_size) for rank in range(int(world_size))]
+            tasks = [simple_gpu_task.remote(master_addr, master_port, i, world_size) for i in range(int(world_size))]
             results = ray.get(tasks)
             print(results)
             ray.shutdown()

From 461c8c49ef134c5a9967323f798b033db336cb3a Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 25 Jun 2025 20:43:45 +0000
Subject: [PATCH 005/107] timeout to 30s

---
 ray_test/test_torch_ray_distributed.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index d5054922..6d042ed2 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -50,9 +50,15 @@ def init_ray():
     if rank == 0:
         # wait until num of gpus reach world_size
         cluster_gpus = ray.cluster_resources().get('GPU', 0)
+        counter = 0
         while cluster_gpus < dist.get_world_size():
             print(f'waiting for {dist.get_world_size() - cluster_gpus} gpus to be available')
             time.sleep(1)
+            counter += 1
+            if counter > 30:
+                ray.shutdown()
+                subprocess.run(['ray', 'stop'], check=True)
+                raise RuntimeError('Timeout waiting for gpus to be available')
         print(f'Total available gpus: {ray.available_resources()}')
     dist.destroy_process_group()
     return address

From e7803bb8f7493a91ede19961fb30d5f9b124be5a Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 25 Jun 2025 20:59:27 +0000
Subject: [PATCH 006/107] update script

---
 ray_test/test_torch_ray_distributed.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 6d042ed2..1b696404 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -30,8 +30,7 @@ def init_ray():
 
     # init ray on master node, rank 0
     if rank == 0:
-        # subprocess.run(['ray', 'start', '--head', '--port=6379', '--num-gpus=1'], check=True)
-        subprocess.run(['ray', 'start', '--head', '--port=6379'], check=True)
+        subprocess.run(['ray', 'start', '--head'], check=True)
         ray.init(address='auto')
         # get existing ray ip and port 
         ctx = ray.get_runtime_context()
@@ -45,7 +44,7 @@ def init_ray():
     dist.broadcast_object_list(address_list, src=0)
     if rank != 0 and local_rank == 0:
         address = address_list[0]
-        # subprocess.run(['ray', 'start', f'--address={address}', '--num-gpus=1'], check=True)
+        print(f'rank: {rank} connecting to address: {address}')
         subprocess.run(['ray', 'start', f'--address={address}'], check=True)
     if rank == 0:
         # wait until num of gpus reach world_size

From feb62d188e128cbd41c4214d39d088a85bde3a2f Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 25 Jun 2025 21:08:15 +0000
Subject: [PATCH 007/107] None env

---
 ray_test/test_torch_ray_distributed.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 1b696404..3722f660 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -14,10 +14,10 @@ def ray_noset_visible_devices():
 
 def get_ranks():
     # get envs set by torchrun
-    world_size = int(os.environ.get('WORLD_SIZE', '1'))
-    rank = int(os.environ.get('RANK', '0'))
-    local_rank = int(os.environ.get('LOCAL_RANK', '0'))
-    node_rank = int(os.environ.get('NODE_RANK', '0'))
+    world_size = int(os.environ.get('WORLD_SIZE', None))
+    rank = int(os.environ.get('RANK', None))
+    local_rank = int(os.environ.get('LOCAL_RANK', None))
+    node_rank = int(os.environ.get('NODE_RANK', None))
     master_addr = os.environ.get('MASTER_ADDR', '127.0.0.1')
     master_port = int(os.environ.get('MASTER_PORT', '8265'))
 
@@ -45,7 +45,7 @@ def init_ray():
     if rank != 0 and local_rank == 0:
         address = address_list[0]
         print(f'rank: {rank} connecting to address: {address}')
-        subprocess.run(['ray', 'start', f'--address={address}'], check=True)
+        subprocess.run(['ray', 'start', f'--address={address}', '--resources={"worker_node": 8, "accelerator_type:H100":8}'], check=True)
     if rank == 0:
         # wait until num of gpus reach world_size
         cluster_gpus = ray.cluster_resources().get('GPU', 0)

From 237373caf0c10cacaa6a35bdbd68dc3ba5c02261 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 25 Jun 2025 21:13:41 +0000
Subject: [PATCH 008/107] change interval

---
 ray_test/test_torch_ray_distributed.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 3722f660..5b29b157 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -52,9 +52,9 @@ def init_ray():
         counter = 0
         while cluster_gpus < dist.get_world_size():
             print(f'waiting for {dist.get_world_size() - cluster_gpus} gpus to be available')
-            time.sleep(1)
+            time.sleep(3)
             counter += 1
-            if counter > 30:
+            if counter > 20:
                 ray.shutdown()
                 subprocess.run(['ray', 'stop'], check=True)
                 raise RuntimeError('Timeout waiting for gpus to be available')

From 4193ee3d36350c50af6bc23e353ff2ad0cb5e803 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 25 Jun 2025 21:20:21 +0000
Subject: [PATCH 009/107] try break and nodes instead

---
 ray_test/test_torch_ray_distributed.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 5b29b157..520ef4b6 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -48,16 +48,14 @@ def init_ray():
         subprocess.run(['ray', 'start', f'--address={address}', '--resources={"worker_node": 8, "accelerator_type:H100":8}'], check=True)
     if rank == 0:
         # wait until num of gpus reach world_size
-        cluster_gpus = ray.cluster_resources().get('GPU', 0)
+        num_nodes = ray.nodes()
         counter = 0
-        while cluster_gpus < dist.get_world_size():
-            print(f'waiting for {dist.get_world_size() - cluster_gpus} gpus to be available')
-            time.sleep(3)
+        while len(num_nodes) < dist.get_world_size() // 8:
+            print(f'waiting for {dist.get_world_size() // 8 - len(num_nodes)} nodes to be available')
+            time.sleep(5)
             counter += 1
-            if counter > 20:
-                ray.shutdown()
-                subprocess.run(['ray', 'stop'], check=True)
-                raise RuntimeError('Timeout waiting for gpus to be available')
+            if counter > 4:
+                break
         print(f'Total available gpus: {ray.available_resources()}')
     dist.destroy_process_group()
     return address

From fd63dcb1eed3c9778d435ac327cbb8f36dbcc499 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 25 Jun 2025 21:24:52 +0000
Subject: [PATCH 010/107] condition

---
 ray_test/test_torch_ray_distributed.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 520ef4b6..547778d5 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -46,12 +46,14 @@ def init_ray():
         address = address_list[0]
         print(f'rank: {rank} connecting to address: {address}')
         subprocess.run(['ray', 'start', f'--address={address}', '--resources={"worker_node": 8, "accelerator_type:H100":8}'], check=True)
+    dist.barrier()
     if rank == 0:
         # wait until num of gpus reach world_size
         num_nodes = ray.nodes()
         counter = 0
         while len(num_nodes) < dist.get_world_size() // 8:
             print(f'waiting for {dist.get_world_size() // 8 - len(num_nodes)} nodes to be available')
+            num_nodes = ray.nodes()
             time.sleep(5)
             counter += 1
             if counter > 4:

From 5c3f61dd22e07b198cefa67103ce1cf5fa154e8d Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 25 Jun 2025 21:30:21 +0000
Subject: [PATCH 011/107] rm resources

---
 ray_test/test_torch_ray_distributed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 547778d5..766b2996 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -45,7 +45,7 @@ def init_ray():
     if rank != 0 and local_rank == 0:
         address = address_list[0]
         print(f'rank: {rank} connecting to address: {address}')
-        subprocess.run(['ray', 'start', f'--address={address}', '--resources={"worker_node": 8, "accelerator_type:H100":8}'], check=True)
+        subprocess.run(['ray', 'start', f'--address={address}'], check=True)
     dist.barrier()
     if rank == 0:
         # wait until num of gpus reach world_size

From ff227c4cd543047c94b4ea5ac1219e401bbc1747 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 25 Jun 2025 21:31:49 +0000
Subject: [PATCH 012/107] sleep it

---
 ray_test/test_torch_ray_distributed.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 766b2996..4e17cb09 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -125,3 +125,5 @@ def simple_gpu_task(master_addr: str, master_port: int, rank: int, world_size: i
         finally:
             ray.shutdown()
             subprocess.run(['ray', 'stop'], check=True)
+    else:
+        time.sleep(60)

From 172c6762f4dc70f35bf1e40da43294156ced9042 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Thu, 26 Jun 2025 18:42:18 +0000
Subject: [PATCH 013/107] use dist barrier to block

---
 ray_test/test_torch_ray_distributed.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 4e17cb09..20278408 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -26,8 +26,6 @@ def get_ranks():
 
 def init_ray():
     _, rank, local_rank, *_ = get_ranks()
-    dist.init_process_group(backend='gloo')
-
     # init ray on master node, rank 0
     if rank == 0:
         subprocess.run(['ray', 'start', '--head'], check=True)
@@ -59,7 +57,6 @@ def init_ray():
             if counter > 4:
                 break
         print(f'Total available gpus: {ray.available_resources()}')
-    dist.destroy_process_group()
     return address
 
 
@@ -104,6 +101,7 @@ def simple_gpu_task(master_addr: str, master_port: int, rank: int, world_size: i
 
 
 if __name__ == '__main__':
+    dist.init_process_group(backend='gloo')
     world_size, rank, *_ = get_ranks()
     address = init_ray()
     if rank == 0:
@@ -125,5 +123,5 @@ def simple_gpu_task(master_addr: str, master_port: int, rank: int, world_size: i
         finally:
             ray.shutdown()
             subprocess.run(['ray', 'stop'], check=True)
-    else:
-        time.sleep(60)
+    dist.barrier()
+    dist.destroy_process_group()

From f417e5b764e478912854eeeac2aa494707a28d01 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Thu, 26 Jun 2025 19:32:14 +0000
Subject: [PATCH 014/107] context manager

---
 ray_test/test_torch_ray_distributed.py | 68 +++++++++++---------------
 1 file changed, 28 insertions(+), 40 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 20278408..9dde5733 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -5,6 +5,7 @@
 import subprocess
 import socket
 import time
+from contextlib import contextmanager
 
 from datetime import timedelta
 
@@ -12,22 +13,9 @@ def ray_noset_visible_devices():
     return os.environ.get('RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES', '0') == '1'
 
 
-def get_ranks():
-    # get envs set by torchrun
-    world_size = int(os.environ.get('WORLD_SIZE', None))
-    rank = int(os.environ.get('RANK', None))
-    local_rank = int(os.environ.get('LOCAL_RANK', None))
-    node_rank = int(os.environ.get('NODE_RANK', None))
-    master_addr = os.environ.get('MASTER_ADDR', '127.0.0.1')
-    master_port = int(os.environ.get('MASTER_PORT', '8265'))
-
-    return world_size, rank, local_rank, node_rank, master_addr, master_port
-
-
 def init_ray():
-    _, rank, local_rank, *_ = get_ranks()
     # init ray on master node, rank 0
-    if rank == 0:
+    if dist.get_rank() == 0:
         subprocess.run(['ray', 'start', '--head'], check=True)
         ray.init(address='auto')
         # get existing ray ip and port 
@@ -40,12 +28,12 @@ def init_ray():
     address_list = [address]
     # broadcast address to all other ranks
     dist.broadcast_object_list(address_list, src=0)
-    if rank != 0 and local_rank == 0:
+    if dist.get_rank() != 0 and os.environ.get('LOCAL_RANK', None) == '0':
         address = address_list[0]
-        print(f'rank: {rank} connecting to address: {address}')
+        print(f'rank: {dist.get_rank()} connecting to address: {address}')
         subprocess.run(['ray', 'start', f'--address={address}'], check=True)
     dist.barrier()
-    if rank == 0:
+    if dist.get_rank() == 0:
         # wait until num of gpus reach world_size
         num_nodes = ray.nodes()
         counter = 0
@@ -80,16 +68,15 @@ def simple_gpu_task(master_addr: str, master_port: int, rank: int, world_size: i
 
     # number of visible devices
     num_visible_devices = torch.cuda.device_count()
+    dist.init_process_group(timeout=timedelta(seconds=10))
     print(f'num_visible_devices: {num_visible_devices}')
     print('ray run init envs:')
-    world_size, rank, local_rank, node_rank, master_addr, master_port = get_ranks()
-    print(f'rank: {rank}')
-    print(f'node_rank: {node_rank}')
-    print(f'world_size: {world_size}')
-    print(f'local_rank: {local_rank}')
+    print(f'rank: {dist.get_rank()}')
+    print(f'node_rank: {dist.get_rank() // 8}')
+    print(f'world_size: {dist.get_world_size()}')
+    print(f'local_rank: {dist.get_rank() % 8}')
     print(f'master_addr: {master_addr}')
     print(f'master_port: {master_port}')
-    dist.init_process_group(timeout=timedelta(seconds=10))
     print(f'is distributed initialized: {dist.is_initialized()}')
 
     # Create a tensor on the GPU
@@ -99,29 +86,30 @@ def simple_gpu_task(master_addr: str, master_port: int, rank: int, world_size: i
     dist.destroy_process_group()
     return x.item()
 
-
-if __name__ == '__main__':
+@contextmanager
+def start_ray_server():
     dist.init_process_group(backend='gloo')
-    world_size, rank, *_ = get_ranks()
     address = init_ray()
-    if rank == 0:
-        try:
+    try:
+        yield address
+        dist.barrier()
+    finally:
+        if dist.get_rank() == 0:
+            ray.shutdown()
+            subprocess.run(['ray', 'stop'], check=True)
+        dist.destroy_process_group()
+
+def run():
+    with start_ray_server() as address:
+        if dist.get_rank() == 0:
             master_addr, _ = address.split(':')
-            # if I uncomment this, dist.init_process_group will timeout
             with socket.socket() as sock:
                 sock.bind(("", 0))
                 master_port = sock.getsockname()[1]
-
             print(f"\n=== STARTING DISTRIBUTED TRAINING ===")
-            tasks = [simple_gpu_task.remote(master_addr, master_port, i, world_size) for i in range(int(world_size))]
+            tasks = [simple_gpu_task.remote(master_addr, master_port, i, dist.get_world_size()) for i in range(int(dist.get_world_size()))]
             results = ray.get(tasks)
             print(results)
-            ray.shutdown()
-            subprocess.run(['ray', 'stop'], check=True)
-        except Exception as e:
-            print(f'Error: {e}')
-        finally:
-            ray.shutdown()
-            subprocess.run(['ray', 'stop'], check=True)
-    dist.barrier()
-    dist.destroy_process_group()
+
+if __name__ == '__main__':
+    run()

From b29ba4441208f67a213be765693f8ba79e3f77e0 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Thu, 26 Jun 2025 20:52:55 +0000
Subject: [PATCH 015/107] try to not release port

---
 ray_test/test_torch_ray_distributed.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 9dde5733..6193c018 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -106,8 +106,8 @@ def run():
             with socket.socket() as sock:
                 sock.bind(("", 0))
                 master_port = sock.getsockname()[1]
-            print(f"\n=== STARTING DISTRIBUTED TRAINING ===")
-            tasks = [simple_gpu_task.remote(master_addr, master_port, i, dist.get_world_size()) for i in range(int(dist.get_world_size()))]
+                print(f"\n=== STARTING DISTRIBUTED TRAINING ===")
+                tasks = [simple_gpu_task.remote(master_addr, master_port, i, dist.get_world_size()) for i in range(int(dist.get_world_size()))]
             results = ray.get(tasks)
             print(results)
 

From 9a300281b1fa1cd213dc51faf8962af6ec68e4ea Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Thu, 26 Jun 2025 22:31:15 +0000
Subject: [PATCH 016/107] use ray actor

---
 ray_test/test_torch_ray_distributed.py | 151 ++++++++++++++++++-------
 1 file changed, 110 insertions(+), 41 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 6193c018..407da10a 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -6,6 +6,7 @@
 import socket
 import time
 from contextlib import contextmanager
+from typing import Optional, Tuple
 
 from datetime import timedelta
 
@@ -49,42 +50,86 @@ def init_ray():
 
 
 @ray.remote(num_gpus=1)
-def simple_gpu_task(master_addr: str, master_port: int, rank: int, world_size: int):
-    """A minimal GPU task that just creates a tensor and does basic operations."""
-    os.environ["MASTER_ADDR"] = master_addr
-    os.environ["MASTER_PORT"] = str(master_port)
-    os.environ["WORLD_SIZE"] = str(world_size)
-    os.environ["RANK"] = str(rank)
-    # local_rank = rank % 8
-    # os.environ["LOCAL_RANK"] = str(local_rank)
-    # os.environ["CUDA_VISIBLE_DEVICES"] = str(local_rank)
-    # NOTE: Ray will automatically set the *_VISIBLE_DEVICES
-    # environment variable for each actor, unless
-    # RAY_EXPERIMENTAL_NOSET_*_VISIBLE_DEVICES is set, so
-    # set local rank to 0 when the flag is not applicable.
-    # print(f'CUDA_VISIBLE_DEVICES: {os.environ["CUDA_VISIBLE_DEVICES"]}')
-    # print(f'ray.get_gpu_ids(): {ray.get_gpu_ids()}')
-    os.environ["LOCAL_RANK"] = str(ray.get_gpu_ids()[0]) if ray_noset_visible_devices() else "0"
+class DistributedGPUActor:
+    def __init__(self, rank: int, world_size: int, master_addr: Optional[str] = None, master_port: Optional[int] = None):
+        """Initialize the distributed GPU actor.
+        
+        Args:
+            rank: The rank of this process in the distributed group
+            world_size: Total number of processes in the distributed group
+            master_addr: Master node address. If None, will allocate dynamically for rank 0
+            master_port: Master node port. If None, will allocate dynamically for rank 0
+        """
+        self.rank = rank
+        self.world_size = world_size
+        self.master_addr = master_addr
+        self.master_port = master_port
+        
+        # Set up basic environment variables
+        os.environ["WORLD_SIZE"] = str(world_size)
+        os.environ["RANK"] = str(rank)
+        
+        # Set LOCAL_RANK based on Ray GPU allocation
+        os.environ["LOCAL_RANK"] = str(ray.get_gpu_ids()[0]) if ray_noset_visible_devices() else "0"
+        
+        # If this is rank 0 and no master_addr/master_port provided, allocate them
+        if rank == 0 and (master_addr is None or master_port is None):
+            self._allocate_master_address()
+
+        os.environ["MASTER_ADDR"] = self.master_addr
+        os.environ["MASTER_PORT"] = str(self.master_port)
+    
+    def _allocate_master_address(self):
+        """Allocate master address and port for rank 0."""
+        if self.master_addr is None:
+            # Get the local IP address
+            hostname = socket.gethostname()
+            self.master_addr = socket.gethostbyname(hostname)
+        
+        if self.master_port is None:
+            # Allocate a free port
+            with socket.socket() as sock:
+                sock.bind(("", 0))
+                self.master_port = sock.getsockname()[1]
+    
+    def get_master_address(self) -> Tuple[Optional[str], Optional[int]]:
+        """Return the master address and port as a tuple."""
+        return (self.master_addr, self.master_port)
+    
+    def init_process_group(self) -> bool:
+        """Initialize the distributed process group."""
+        try:
+            
+            # Initialize process group
+            dist.init_process_group(timeout=timedelta(seconds=10))
+            
+            # Print debug information
+            num_visible_devices = torch.cuda.device_count()
+            print(f'num_visible_devices: {num_visible_devices}')
+            print('Ray actor init envs:')
+            print(f'rank: {dist.get_rank()}')
+            print(f'node_rank: {dist.get_rank() // 8}')
+            print(f'world_size: {dist.get_world_size()}')
+            print(f'local_rank: {dist.get_rank() % 8}')
+            print(f'master_addr: {self.master_addr}')
+            print(f'master_port: {self.master_port}')
+            print(f'is distributed initialized: {dist.is_initialized()}')
+            
+            return True
+        except Exception as e:
+            print(f"Failed to initialize process group: {e}")
+            return False
+    
+    def tensor_all_reduce(self) -> float:
+        """Perform a simple tensor all_reduce operation."""
+        # Create a tensor on the GPU and perform all_reduce
+        device = torch.device("cuda")
+        x = torch.ones(1, device=device)
+        dist.all_reduce(x)
+        
+        return x.item()
 
-    # number of visible devices
-    num_visible_devices = torch.cuda.device_count()
-    dist.init_process_group(timeout=timedelta(seconds=10))
-    print(f'num_visible_devices: {num_visible_devices}')
-    print('ray run init envs:')
-    print(f'rank: {dist.get_rank()}')
-    print(f'node_rank: {dist.get_rank() // 8}')
-    print(f'world_size: {dist.get_world_size()}')
-    print(f'local_rank: {dist.get_rank() % 8}')
-    print(f'master_addr: {master_addr}')
-    print(f'master_port: {master_port}')
-    print(f'is distributed initialized: {dist.is_initialized()}')
 
-    # Create a tensor on the GPU
-    device = torch.device(f"cuda")
-    x = torch.ones(1, device=device)
-    dist.all_reduce(x)
-    dist.destroy_process_group()
-    return x.item()
 
 @contextmanager
 def start_ray_server():
@@ -103,13 +148,37 @@ def run():
     with start_ray_server() as address:
         if dist.get_rank() == 0:
             master_addr, _ = address.split(':')
-            with socket.socket() as sock:
-                sock.bind(("", 0))
-                master_port = sock.getsockname()[1]
-                print(f"\n=== STARTING DISTRIBUTED TRAINING ===")
-                tasks = [simple_gpu_task.remote(master_addr, master_port, i, dist.get_world_size()) for i in range(int(dist.get_world_size()))]
-            results = ray.get(tasks)
-            print(results)
+            
+            print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
+            
+            # Create actors - rank 0 will allocate master address/port
+            actors = []
+
+            # master actor will allocate master_addr and master_port
+            master_actor = DistributedGPUActor.remote(0, dist.get_world_size())
+            actors.append(master_actor)
+            
+            # Get master address from rank 0 actor
+            master_info = ray.get(master_actor.get_master_address.remote())
+            master_addr, master_port = master_info
+            print(f"Master address allocated: {master_addr}:{master_port}")
+            
+            # Create remaining actors with the master address/port
+            for i in range(1, dist.get_world_size()):
+                actor = DistributedGPUActor.remote(i, dist.get_world_size(), master_addr, master_port)
+                actors.append(actor)
+            
+            # Initialize process groups for all actors
+            init_tasks = [actor.init_process_group.remote() for actor in actors]
+            init_results = ray.get(init_tasks)
+            print(f"Process group initialization results: {init_results}")
+            
+            # Perform tensor all_reduce on all actors
+            reduce_tasks = [actor.tensor_all_reduce.remote() for actor in actors]
+            results = ray.get(reduce_tasks)
+            print(f"All-reduce results: {results}")
+            
+
 
 if __name__ == '__main__':
     run()

From 12fbcef8639cb8717543fc1ff1c9b80d6c986614 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Thu, 26 Jun 2025 22:55:44 +0000
Subject: [PATCH 017/107] another way to get ip address

---
 ray_test/test_torch_ray_distributed.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 407da10a..9271ea10 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -83,8 +83,7 @@ def _allocate_master_address(self):
         """Allocate master address and port for rank 0."""
         if self.master_addr is None:
             # Get the local IP address
-            hostname = socket.gethostname()
-            self.master_addr = socket.gethostbyname(hostname)
+            self.master_addr = ray.util.get_node_ip_address().strip('[]')
         
         if self.master_port is None:
             # Allocate a free port

From 4ad2e65f64af3101cf342accc26033f8df83c263 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Thu, 26 Jun 2025 23:12:55 +0000
Subject: [PATCH 018/107] ray init

---
 ray_test/test_torch_ray_distributed.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 9271ea10..1bb4fce1 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -2,7 +2,6 @@
 import torch
 import torch.distributed as dist
 import os
-import subprocess
 import socket
 import time
 from contextlib import contextmanager
@@ -17,8 +16,8 @@ def ray_noset_visible_devices():
 def init_ray():
     # init ray on master node, rank 0
     if dist.get_rank() == 0:
-        subprocess.run(['ray', 'start', '--head'], check=True)
-        ray.init(address='auto')
+        # Start head node - Ray will auto-detect available GPUs
+        ray.init()
         # get existing ray ip and port 
         ctx = ray.get_runtime_context()
         address = ctx.gcs_address
@@ -32,19 +31,20 @@ def init_ray():
     if dist.get_rank() != 0 and os.environ.get('LOCAL_RANK', None) == '0':
         address = address_list[0]
         print(f'rank: {dist.get_rank()} connecting to address: {address}')
-        subprocess.run(['ray', 'start', f'--address={address}'], check=True)
+        # Connect to head node - Ray will auto-detect local GPUs and contribute them
+        ray.init(address=address)
     dist.barrier()
     if dist.get_rank() == 0:
         # wait until num of gpus reach world_size
-        num_nodes = ray.nodes()
+        num_gpus = int(ray.cluster_resources()['GPU'])
         counter = 0
-        while len(num_nodes) < dist.get_world_size() // 8:
-            print(f'waiting for {dist.get_world_size() // 8 - len(num_nodes)} nodes to be available')
-            num_nodes = ray.nodes()
+        while num_gpus < dist.get_world_size():
+            print(f'waiting for {dist.get_world_size() - num_gpus} gpus to be available')
+            num_gpus = int(ray.cluster_resources()['GPU'])
             time.sleep(5)
             counter += 1
             if counter > 4:
-                break
+                raise RuntimeError(f'Failed to start {dist.get_world_size()} gpus')
         print(f'Total available gpus: {ray.available_resources()}')
     return address
 
@@ -138,9 +138,6 @@ def start_ray_server():
         yield address
         dist.barrier()
     finally:
-        if dist.get_rank() == 0:
-            ray.shutdown()
-            subprocess.run(['ray', 'stop'], check=True)
         dist.destroy_process_group()
 
 def run():

From 8df20ec5e77dfcaffc8a0026de87a797067a2518 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Thu, 26 Jun 2025 23:20:46 +0000
Subject: [PATCH 019/107] ray remote

---
 ray_test/test_torch_ray_distributed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 1bb4fce1..c81bb5b4 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -32,7 +32,7 @@ def init_ray():
         address = address_list[0]
         print(f'rank: {dist.get_rank()} connecting to address: {address}')
         # Connect to head node - Ray will auto-detect local GPUs and contribute them
-        ray.init(address=address)
+        ray.init(f'ray://{address}')
     dist.barrier()
     if dist.get_rank() == 0:
         # wait until num of gpus reach world_size

From 76c73a31daf002e75622d00df8d640816494bf70 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Thu, 26 Jun 2025 23:27:51 +0000
Subject: [PATCH 020/107] barrier

---
 ray_test/test_torch_ray_distributed.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index c81bb5b4..2d78441c 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -22,9 +22,9 @@ def init_ray():
         ctx = ray.get_runtime_context()
         address = ctx.gcs_address
         print(f'available gpus: {ray.available_resources()}')
-
     else:
-        address = None
+        address = ''
+    dist.barrier()
     address_list = [address]
     # broadcast address to all other ranks
     dist.broadcast_object_list(address_list, src=0)

From 8f519684b0725b463bcbeb9056012768eac4c556 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Thu, 26 Jun 2025 23:36:28 +0000
Subject: [PATCH 021/107] half subprocess

---
 ray_test/test_torch_ray_distributed.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 2d78441c..898c55a6 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -3,6 +3,7 @@
 import torch.distributed as dist
 import os
 import socket
+import subprocess
 import time
 from contextlib import contextmanager
 from typing import Optional, Tuple
@@ -24,7 +25,6 @@ def init_ray():
         print(f'available gpus: {ray.available_resources()}')
     else:
         address = ''
-    dist.barrier()
     address_list = [address]
     # broadcast address to all other ranks
     dist.broadcast_object_list(address_list, src=0)
@@ -32,7 +32,8 @@ def init_ray():
         address = address_list[0]
         print(f'rank: {dist.get_rank()} connecting to address: {address}')
         # Connect to head node - Ray will auto-detect local GPUs and contribute them
-        ray.init(f'ray://{address}')
+        # ray.init(f'ray://{address}')
+        subprocess.run(['ray', 'start', f'--address={address}'], check=True)
     dist.barrier()
     if dist.get_rank() == 0:
         # wait until num of gpus reach world_size

From a00369cf82ab468d76bcc47e8c5dde24e2e4e323 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Thu, 26 Jun 2025 23:38:50 +0000
Subject: [PATCH 022/107] try w/o port

---
 ray_test/test_torch_ray_distributed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 898c55a6..c8996f80 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -33,7 +33,7 @@ def init_ray():
         print(f'rank: {dist.get_rank()} connecting to address: {address}')
         # Connect to head node - Ray will auto-detect local GPUs and contribute them
         # ray.init(f'ray://{address}')
-        subprocess.run(['ray', 'start', f'--address={address}'], check=True)
+        subprocess.run(['ray', 'start', f'--address={os.environ["MASTER_ADDR"]}'], check=True)
     dist.barrier()
     if dist.get_rank() == 0:
         # wait until num of gpus reach world_size

From a777c542a5ba9b858b1472025609071e4ac99649 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Thu, 26 Jun 2025 23:44:49 +0000
Subject: [PATCH 023/107] claude fix; questionable

---
 ray_test/test_torch_ray_distributed.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index c8996f80..864a637c 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -3,7 +3,6 @@
 import torch.distributed as dist
 import os
 import socket
-import subprocess
 import time
 from contextlib import contextmanager
 from typing import Optional, Tuple
@@ -22,18 +21,23 @@ def init_ray():
         # get existing ray ip and port 
         ctx = ray.get_runtime_context()
         address = ctx.gcs_address
+        print(f'Head node Ray address: {address}')
         print(f'available gpus: {ray.available_resources()}')
     else:
-        address = ''
+        address = None
     address_list = [address]
     # broadcast address to all other ranks
     dist.broadcast_object_list(address_list, src=0)
     if dist.get_rank() != 0 and os.environ.get('LOCAL_RANK', None) == '0':
         address = address_list[0]
         print(f'rank: {dist.get_rank()} connecting to address: {address}')
-        # Connect to head node - Ray will auto-detect local GPUs and contribute them
-        # ray.init(f'ray://{address}')
-        subprocess.run(['ray', 'start', f'--address={os.environ["MASTER_ADDR"]}'], check=True)
+        # Connect to head node - use the address directly without ray:// prefix
+        try:
+            ray.init(address=address)
+            print(f'rank: {dist.get_rank()} successfully connected to Ray cluster')
+        except Exception as e:
+            print(f'rank: {dist.get_rank()} failed to connect to Ray: {e}')
+            raise
     dist.barrier()
     if dist.get_rank() == 0:
         # wait until num of gpus reach world_size

From dd97cf299cfa5461daa1f3fee1fdcdd83d82de74 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Thu, 26 Jun 2025 23:51:03 +0000
Subject: [PATCH 024/107] do not raise

---
 ray_test/test_torch_ray_distributed.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 864a637c..fb2e5e3f 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -37,7 +37,6 @@ def init_ray():
             print(f'rank: {dist.get_rank()} successfully connected to Ray cluster')
         except Exception as e:
             print(f'rank: {dist.get_rank()} failed to connect to Ray: {e}')
-            raise
     dist.barrier()
     if dist.get_rank() == 0:
         # wait until num of gpus reach world_size

From 09c92ed305a6d61bb21ad2ada1b3b2820ec58d6a Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 27 Jun 2025 00:02:07 +0000
Subject: [PATCH 025/107] revert back to subprocess

---
 ray_test/test_torch_ray_distributed.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index fb2e5e3f..c1f1639c 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -3,6 +3,7 @@
 import torch.distributed as dist
 import os
 import socket
+import subprocess
 import time
 from contextlib import contextmanager
 from typing import Optional, Tuple
@@ -17,26 +18,21 @@ def init_ray():
     # init ray on master node, rank 0
     if dist.get_rank() == 0:
         # Start head node - Ray will auto-detect available GPUs
-        ray.init()
+        subprocess.run(['ray', 'start', '--head'], check=True)
+        ray.init('auto')
         # get existing ray ip and port 
         ctx = ray.get_runtime_context()
         address = ctx.gcs_address
-        print(f'Head node Ray address: {address}')
         print(f'available gpus: {ray.available_resources()}')
     else:
-        address = None
+        address = ''
     address_list = [address]
     # broadcast address to all other ranks
     dist.broadcast_object_list(address_list, src=0)
     if dist.get_rank() != 0 and os.environ.get('LOCAL_RANK', None) == '0':
         address = address_list[0]
         print(f'rank: {dist.get_rank()} connecting to address: {address}')
-        # Connect to head node - use the address directly without ray:// prefix
-        try:
-            ray.init(address=address)
-            print(f'rank: {dist.get_rank()} successfully connected to Ray cluster')
-        except Exception as e:
-            print(f'rank: {dist.get_rank()} failed to connect to Ray: {e}')
+        subprocess.run(['ray', 'start', f'--address={address}'], check=True)
     dist.barrier()
     if dist.get_rank() == 0:
         # wait until num of gpus reach world_size

From cef2b9ed516105598c1c910b304f2335756008b1 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 27 Jun 2025 00:28:48 +0000
Subject: [PATCH 026/107] mix init

---
 ray_test/test_torch_ray_distributed.py | 46 +++++++++++---------------
 1 file changed, 19 insertions(+), 27 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index c1f1639c..3e430936 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -17,9 +17,8 @@ def ray_noset_visible_devices():
 def init_ray():
     # init ray on master node, rank 0
     if dist.get_rank() == 0:
-        # Start head node - Ray will auto-detect available GPUs
-        subprocess.run(['ray', 'start', '--head'], check=True)
-        ray.init('auto')
+        # Start head node
+        ray.init()
         # get existing ray ip and port 
         ctx = ray.get_runtime_context()
         address = ctx.gcs_address
@@ -96,28 +95,22 @@ def get_master_address(self) -> Tuple[Optional[str], Optional[int]]:
         return (self.master_addr, self.master_port)
     
     def init_process_group(self) -> bool:
-        """Initialize the distributed process group."""
-        try:
-            
-            # Initialize process group
-            dist.init_process_group(timeout=timedelta(seconds=10))
-            
-            # Print debug information
-            num_visible_devices = torch.cuda.device_count()
-            print(f'num_visible_devices: {num_visible_devices}')
-            print('Ray actor init envs:')
-            print(f'rank: {dist.get_rank()}')
-            print(f'node_rank: {dist.get_rank() // 8}')
-            print(f'world_size: {dist.get_world_size()}')
-            print(f'local_rank: {dist.get_rank() % 8}')
-            print(f'master_addr: {self.master_addr}')
-            print(f'master_port: {self.master_port}')
-            print(f'is distributed initialized: {dist.is_initialized()}')
-            
-            return True
-        except Exception as e:
-            print(f"Failed to initialize process group: {e}")
-            return False
+        """Initialize the distributed process group."""       
+        # Initialize process group
+        dist.init_process_group(timeout=timedelta(seconds=10))
+        
+        # Print debug information
+        num_visible_devices = torch.cuda.device_count()
+        print(f'num_visible_devices: {num_visible_devices}')
+        print('Ray actor init envs:')
+        print(f'rank: {dist.get_rank()}')
+        print(f'node_rank: {dist.get_rank() // 8}')
+        print(f'world_size: {dist.get_world_size()}')
+        print(f'local_rank: {dist.get_rank() % 8}')
+        print(f'master_addr: {self.master_addr}')
+        print(f'master_port: {self.master_port}')
+        print(f'is distributed initialized: {dist.is_initialized()}')
+
     
     def tensor_all_reduce(self) -> float:
         """Perform a simple tensor all_reduce operation."""
@@ -166,8 +159,7 @@ def run():
             
             # Initialize process groups for all actors
             init_tasks = [actor.init_process_group.remote() for actor in actors]
-            init_results = ray.get(init_tasks)
-            print(f"Process group initialization results: {init_results}")
+            ray.get(init_tasks)
             
             # Perform tensor all_reduce on all actors
             reduce_tasks = [actor.tensor_all_reduce.remote() for actor in actors]

From 9d9300146dc1e6ad0a010e4b781054b921b9e969 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 27 Jun 2025 00:36:21 +0000
Subject: [PATCH 027/107] manual stop

---
 ray_test/test_torch_ray_distributed.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 3e430936..fa7d4244 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -18,7 +18,8 @@ def init_ray():
     # init ray on master node, rank 0
     if dist.get_rank() == 0:
         # Start head node
-        ray.init()
+        subprocess.run(['ray', 'start', '--head'], check=True)
+        ray.init('auto')
         # get existing ray ip and port 
         ctx = ray.get_runtime_context()
         address = ctx.gcs_address
@@ -95,7 +96,8 @@ def get_master_address(self) -> Tuple[Optional[str], Optional[int]]:
         return (self.master_addr, self.master_port)
     
     def init_process_group(self) -> bool:
-        """Initialize the distributed process group."""       
+        """Initialize the distributed process group."""
+            
         # Initialize process group
         dist.init_process_group(timeout=timedelta(seconds=10))
         
@@ -131,6 +133,9 @@ def start_ray_server():
         yield address
         dist.barrier()
     finally:
+        if dist.get_rank() == 0:
+            ray.shutdown()
+            subprocess.run(['ray', 'stop'], check=True)
         dist.destroy_process_group()
 
 def run():

From 9dc7a74e98cb087107623f856f6a0ef49848de65 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 27 Jun 2025 07:38:40 +0000
Subject: [PATCH 028/107] two gpus runs

---
 ray_test/test_torch_ray_distributed.py | 85 ++++++++++++++++++++------
 1 file changed, 66 insertions(+), 19 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index fa7d4244..fcb18a7e 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -10,6 +10,9 @@
 
 from datetime import timedelta
 
+from compose_rl.algorithms.online.generation_utils import create_vllm_engines, init_process_group
+
+
 def ray_noset_visible_devices():
     return os.environ.get('RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES', '0') == '1'
 
@@ -79,23 +82,29 @@ def __init__(self, rank: int, world_size: int, master_addr: Optional[str] = None
         os.environ["MASTER_ADDR"] = self.master_addr
         os.environ["MASTER_PORT"] = str(self.master_port)
     
+    def get_node_ip(self):
+        return ray.util.get_node_ip_address().strip('[]')
+    
+    def get_free_port(self):
+        with socket.socket() as sock:
+            sock.bind(("", 0))
+            return sock.getsockname()[1]
+    
     def _allocate_master_address(self):
         """Allocate master address and port for rank 0."""
         if self.master_addr is None:
             # Get the local IP address
-            self.master_addr = ray.util.get_node_ip_address().strip('[]')
+            self.master_addr = self.get_node_ip()
         
         if self.master_port is None:
             # Allocate a free port
-            with socket.socket() as sock:
-                sock.bind(("", 0))
-                self.master_port = sock.getsockname()[1]
+            self.master_port = self.get_free_port()
     
     def get_master_address(self) -> Tuple[Optional[str], Optional[int]]:
         """Return the master address and port as a tuple."""
         return (self.master_addr, self.master_port)
     
-    def init_process_group(self) -> bool:
+    def init_default_process_group(self) -> bool:
         """Initialize the distributed process group."""
             
         # Initialize process group
@@ -113,6 +122,12 @@ def init_process_group(self) -> bool:
         print(f'master_port: {self.master_port}')
         print(f'is distributed initialized: {dist.is_initialized()}')
 
+    def init_vllm_process_group(self, backend: str, master_addr: str, master_port: int, world_size: int, rank: int, group_name: str):
+        """Initialize the vLLM process group."""
+        group = init_process_group(backend=backend, init_method=f'tcp://{master_addr}:{master_port}', world_size=world_size, rank=rank, group_name=group_name)
+        return dist.get_world_size(group)
+
+
     
     def tensor_all_reduce(self) -> float:
         """Perform a simple tensor all_reduce operation."""
@@ -124,7 +139,6 @@ def tensor_all_reduce(self) -> float:
         return x.item()
 
 
-
 @contextmanager
 def start_ray_server():
     dist.init_process_group(backend='gloo')
@@ -144,13 +158,13 @@ def run():
             master_addr, _ = address.split(':')
             
             print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
-            
+            num_train_actors = dist.get_world_size() // 2
             # Create actors - rank 0 will allocate master address/port
-            actors = []
+            train_actors = []
 
             # master actor will allocate master_addr and master_port
-            master_actor = DistributedGPUActor.remote(0, dist.get_world_size())
-            actors.append(master_actor)
+            master_actor = DistributedGPUActor.remote(0, num_train_actors)
+            train_actors.append(master_actor)
             
             # Get master address from rank 0 actor
             master_info = ray.get(master_actor.get_master_address.remote())
@@ -158,19 +172,52 @@ def run():
             print(f"Master address allocated: {master_addr}:{master_port}")
             
             # Create remaining actors with the master address/port
-            for i in range(1, dist.get_world_size()):
-                actor = DistributedGPUActor.remote(i, dist.get_world_size(), master_addr, master_port)
-                actors.append(actor)
+            for i in range(1, num_train_actors):
+                actor = DistributedGPUActor.remote(i, num_train_actors, master_addr, master_port)
+                train_actors.append(actor)
             
             # Initialize process groups for all actors
-            init_tasks = [actor.init_process_group.remote() for actor in actors]
+            init_tasks = [actor.init_default_process_group.remote() for actor in train_actors]
             ray.get(init_tasks)
             
-            # Perform tensor all_reduce on all actors
-            reduce_tasks = [actor.tensor_all_reduce.remote() for actor in actors]
-            results = ray.get(reduce_tasks)
-            print(f"All-reduce results: {results}")
-            
+            # # Perform tensor all_reduce on all actors
+            # reduce_tasks = [actor.tensor_all_reduce.remote() for actor in train_actors]
+            # results = ray.get(reduce_tasks)
+            # print(f"All-reduce results: {results}")
+
+            vllm_tensor_parallel_size = 1
+            num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size
+            vllm_engines = create_vllm_engines(
+                num_engines=num_vllm_engines,
+                tensor_parallel_size=vllm_tensor_parallel_size,
+                enforce_eager=True,
+                pretrain='meta-llama/Llama-3.2-1B-Instruct',
+                revision=None,
+                seed=1,
+                enable_prefix_caching=False,
+                max_model_len=2048,
+            )
+
+            new_port = ray.get(master_actor.get_free_port.remote())
+            refs = [
+                engine.init_process_group.remote(
+                    master_addr,
+                    new_port,
+                    i * vllm_tensor_parallel_size + 1,
+                    dist.get_world_size(),
+                    'weight-update',
+                    backend='nccl',
+                ) for i, engine in enumerate(vllm_engines)
+            ]
+            refs.append(master_actor.init_vllm_process_group.remote(
+                backend='nccl',
+                master_addr=master_addr,
+                master_port=new_port,
+                world_size=dist.get_world_size(),
+                rank=0,
+                group_name='weight-update',
+            ))
+            print(ray.get(refs))
 
 
 if __name__ == '__main__':

From c3db295d9c88dbc0de5fae399ac267f531e5fc06 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 30 Jun 2025 20:36:20 +0000
Subject: [PATCH 029/107] tensor parallel size 8

---
 ray_test/test_torch_ray_distributed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index fcb18a7e..9f0f59a2 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -185,7 +185,7 @@ def run():
             # results = ray.get(reduce_tasks)
             # print(f"All-reduce results: {results}")
 
-            vllm_tensor_parallel_size = 1
+            vllm_tensor_parallel_size = 8
             num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size
             vllm_engines = create_vllm_engines(
                 num_engines=num_vllm_engines,

From 824f08631e483c4c8e808c510e62e249e459239c Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 30 Jun 2025 22:47:11 +0000
Subject: [PATCH 030/107] update world size

---
 ray_test/test_torch_ray_distributed.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 9f0f59a2..21a9a526 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -204,7 +204,7 @@ def run():
                     master_addr,
                     new_port,
                     i * vllm_tensor_parallel_size + 1,
-                    dist.get_world_size(),
+                    dist.get_world_size() // 2 + 1,
                     'weight-update',
                     backend='nccl',
                 ) for i, engine in enumerate(vllm_engines)
@@ -213,7 +213,7 @@ def run():
                 backend='nccl',
                 master_addr=master_addr,
                 master_port=new_port,
-                world_size=dist.get_world_size(),
+                world_size=dist.get_world_size() // 2 + 1,
                 rank=0,
                 group_name='weight-update',
             ))

From 443d6393e1ec583e579c7929f8589d82687547b5 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 30 Jun 2025 23:06:07 +0000
Subject: [PATCH 031/107] try all gpus for train again

---
 ray_test/test_torch_ray_distributed.py | 68 +++++++++++++-------------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 21a9a526..543329ff 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -158,7 +158,7 @@ def run():
             master_addr, _ = address.split(':')
             
             print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
-            num_train_actors = dist.get_world_size() // 2
+            num_train_actors = dist.get_world_size()
             # Create actors - rank 0 will allocate master address/port
             train_actors = []
 
@@ -185,39 +185,39 @@ def run():
             # results = ray.get(reduce_tasks)
             # print(f"All-reduce results: {results}")
 
-            vllm_tensor_parallel_size = 8
-            num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size
-            vllm_engines = create_vllm_engines(
-                num_engines=num_vllm_engines,
-                tensor_parallel_size=vllm_tensor_parallel_size,
-                enforce_eager=True,
-                pretrain='meta-llama/Llama-3.2-1B-Instruct',
-                revision=None,
-                seed=1,
-                enable_prefix_caching=False,
-                max_model_len=2048,
-            )
-
-            new_port = ray.get(master_actor.get_free_port.remote())
-            refs = [
-                engine.init_process_group.remote(
-                    master_addr,
-                    new_port,
-                    i * vllm_tensor_parallel_size + 1,
-                    dist.get_world_size() // 2 + 1,
-                    'weight-update',
-                    backend='nccl',
-                ) for i, engine in enumerate(vllm_engines)
-            ]
-            refs.append(master_actor.init_vllm_process_group.remote(
-                backend='nccl',
-                master_addr=master_addr,
-                master_port=new_port,
-                world_size=dist.get_world_size() // 2 + 1,
-                rank=0,
-                group_name='weight-update',
-            ))
-            print(ray.get(refs))
+            # vllm_tensor_parallel_size = 8
+            # num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size
+            # vllm_engines = create_vllm_engines(
+            #     num_engines=num_vllm_engines,
+            #     tensor_parallel_size=vllm_tensor_parallel_size,
+            #     enforce_eager=True,
+            #     pretrain='meta-llama/Llama-3.2-1B-Instruct',
+            #     revision=None,
+            #     seed=1,
+            #     enable_prefix_caching=False,
+            #     max_model_len=2048,
+            # )
+
+            # new_port = ray.get(master_actor.get_free_port.remote())
+            # refs = [
+            #     engine.init_process_group.remote(
+            #         master_addr,
+            #         new_port,
+            #         i * vllm_tensor_parallel_size + 1,
+            #         dist.get_world_size() // 2 + 1,
+            #         'weight-update',
+            #         backend='nccl',
+            #     ) for i, engine in enumerate(vllm_engines)
+            # ]
+            # refs.append(master_actor.init_vllm_process_group.remote(
+            #     backend='nccl',
+            #     master_addr=master_addr,
+            #     master_port=new_port,
+            #     world_size=dist.get_world_size() // 2 + 1,
+            #     rank=0,
+            #     group_name='weight-update',
+            # ))
+            # print(ray.get(refs))
 
 
 if __name__ == '__main__':

From fde296005e00771b5fca10c24c88b8941edee91f Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 30 Jun 2025 23:29:47 +0000
Subject: [PATCH 032/107] use old port assignment

---
 ray_test/test_torch_ray_distributed.py | 58 +++++---------------------
 1 file changed, 10 insertions(+), 48 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 543329ff..dd3be33c 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -10,8 +10,6 @@
 
 from datetime import timedelta
 
-from compose_rl.algorithms.online.generation_utils import create_vllm_engines, init_process_group
-
 
 def ray_noset_visible_devices():
     return os.environ.get('RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES', '0') == '1'
@@ -94,11 +92,13 @@ def _allocate_master_address(self):
         """Allocate master address and port for rank 0."""
         if self.master_addr is None:
             # Get the local IP address
-            self.master_addr = self.get_node_ip()
-        
+            self.master_addr = ray.util.get_node_ip_address().strip('[]')
+
         if self.master_port is None:
             # Allocate a free port
-            self.master_port = self.get_free_port()
+            with socket.socket() as sock:
+                sock.bind(("", 0))
+                self.master_port = sock.getsockname()[1]
     
     def get_master_address(self) -> Tuple[Optional[str], Optional[int]]:
         """Return the master address and port as a tuple."""
@@ -122,11 +122,6 @@ def init_default_process_group(self) -> bool:
         print(f'master_port: {self.master_port}')
         print(f'is distributed initialized: {dist.is_initialized()}')
 
-    def init_vllm_process_group(self, backend: str, master_addr: str, master_port: int, world_size: int, rank: int, group_name: str):
-        """Initialize the vLLM process group."""
-        group = init_process_group(backend=backend, init_method=f'tcp://{master_addr}:{master_port}', world_size=world_size, rank=rank, group_name=group_name)
-        return dist.get_world_size(group)
-
 
     
     def tensor_all_reduce(self) -> float:
@@ -180,44 +175,11 @@ def run():
             init_tasks = [actor.init_default_process_group.remote() for actor in train_actors]
             ray.get(init_tasks)
             
-            # # Perform tensor all_reduce on all actors
-            # reduce_tasks = [actor.tensor_all_reduce.remote() for actor in train_actors]
-            # results = ray.get(reduce_tasks)
-            # print(f"All-reduce results: {results}")
-
-            # vllm_tensor_parallel_size = 8
-            # num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size
-            # vllm_engines = create_vllm_engines(
-            #     num_engines=num_vllm_engines,
-            #     tensor_parallel_size=vllm_tensor_parallel_size,
-            #     enforce_eager=True,
-            #     pretrain='meta-llama/Llama-3.2-1B-Instruct',
-            #     revision=None,
-            #     seed=1,
-            #     enable_prefix_caching=False,
-            #     max_model_len=2048,
-            # )
-
-            # new_port = ray.get(master_actor.get_free_port.remote())
-            # refs = [
-            #     engine.init_process_group.remote(
-            #         master_addr,
-            #         new_port,
-            #         i * vllm_tensor_parallel_size + 1,
-            #         dist.get_world_size() // 2 + 1,
-            #         'weight-update',
-            #         backend='nccl',
-            #     ) for i, engine in enumerate(vllm_engines)
-            # ]
-            # refs.append(master_actor.init_vllm_process_group.remote(
-            #     backend='nccl',
-            #     master_addr=master_addr,
-            #     master_port=new_port,
-            #     world_size=dist.get_world_size() // 2 + 1,
-            #     rank=0,
-            #     group_name='weight-update',
-            # ))
-            # print(ray.get(refs))
+            # Perform tensor all_reduce on all actors
+            reduce_tasks = [actor.tensor_all_reduce.remote() for actor in train_actors]
+            results = ray.get(reduce_tasks)
+            print(f"All-reduce results: {results}")
+
 
 
 if __name__ == '__main__':

From 8b625aa15891720fd478a511df7512de9da48c9b Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 1 Jul 2025 00:09:54 +0000
Subject: [PATCH 033/107] get freeport again

---
 ray_test/test_torch_ray_distributed.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index dd3be33c..3e3dd27a 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -96,9 +96,10 @@ def _allocate_master_address(self):
 
         if self.master_port is None:
             # Allocate a free port
-            with socket.socket() as sock:
-                sock.bind(("", 0))
-                self.master_port = sock.getsockname()[1]
+            # with socket.socket() as sock:
+            #     sock.bind(("", 0))
+            #     self.master_port = sock.getsockname()[1]
+            self.master_port = self.get_free_port()
     
     def get_master_address(self) -> Tuple[Optional[str], Optional[int]]:
         """Return the master address and port as a tuple."""
@@ -106,10 +107,6 @@ def get_master_address(self) -> Tuple[Optional[str], Optional[int]]:
     
     def init_default_process_group(self) -> bool:
         """Initialize the distributed process group."""
-            
-        # Initialize process group
-        dist.init_process_group(timeout=timedelta(seconds=10))
-        
         # Print debug information
         num_visible_devices = torch.cuda.device_count()
         print(f'num_visible_devices: {num_visible_devices}')
@@ -120,10 +117,12 @@ def init_default_process_group(self) -> bool:
         print(f'local_rank: {dist.get_rank() % 8}')
         print(f'master_addr: {self.master_addr}')
         print(f'master_port: {self.master_port}')
+         
+        # Initialize process group
+        dist.init_process_group(timeout=timedelta(seconds=10))
         print(f'is distributed initialized: {dist.is_initialized()}')
 
 
-    
     def tensor_all_reduce(self) -> float:
         """Perform a simple tensor all_reduce operation."""
         # Create a tensor on the GPU and perform all_reduce

From 52bb8aaaf278bb80e457d70e77e5e4e7990c4602 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 1 Jul 2025 00:20:59 +0000
Subject: [PATCH 034/107] change order again

---
 ray_test/test_torch_ray_distributed.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 3e3dd27a..b122abfa 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -106,7 +106,10 @@ def get_master_address(self) -> Tuple[Optional[str], Optional[int]]:
         return (self.master_addr, self.master_port)
     
     def init_default_process_group(self) -> bool:
-        """Initialize the distributed process group."""
+        """Initialize the distributed process group."""         
+        # Initialize process group
+        dist.init_process_group(timeout=timedelta(seconds=10))
+        print(f'is distributed initialized: {dist.is_initialized()}')
         # Print debug information
         num_visible_devices = torch.cuda.device_count()
         print(f'num_visible_devices: {num_visible_devices}')
@@ -117,11 +120,6 @@ def init_default_process_group(self) -> bool:
         print(f'local_rank: {dist.get_rank() % 8}')
         print(f'master_addr: {self.master_addr}')
         print(f'master_port: {self.master_port}')
-         
-        # Initialize process group
-        dist.init_process_group(timeout=timedelta(seconds=10))
-        print(f'is distributed initialized: {dist.is_initialized()}')
-
 
     def tensor_all_reduce(self) -> float:
         """Perform a simple tensor all_reduce operation."""
@@ -141,9 +139,9 @@ def start_ray_server():
         yield address
         dist.barrier()
     finally:
-        if dist.get_rank() == 0:
-            ray.shutdown()
-            subprocess.run(['ray', 'stop'], check=True)
+        # if dist.get_rank() == 0:
+        ray.shutdown()
+        subprocess.run(['ray', 'stop'], check=True)
         dist.destroy_process_group()
 
 def run():

From 934ef7a3524d8fb53fb3a789a9212771d4d25d36 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 1 Jul 2025 00:23:10 +0000
Subject: [PATCH 035/107] rank 0 again

---
 ray_test/test_torch_ray_distributed.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index b122abfa..f2db3540 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -139,9 +139,10 @@ def start_ray_server():
         yield address
         dist.barrier()
     finally:
-        # if dist.get_rank() == 0:
-        ray.shutdown()
-        subprocess.run(['ray', 'stop'], check=True)
+        if dist.get_rank() == 0:
+            ray.shutdown()
+            subprocess.run(['ray', 'stop'], check=True)
+        dist.barrier()
         dist.destroy_process_group()
 
 def run():

From 04217ab148bcc4d0c66e284b63bbd81b571f9c80 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 1 Jul 2025 00:25:55 +0000
Subject: [PATCH 036/107] half trainers

---
 ray_test/test_torch_ray_distributed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index f2db3540..08dc3617 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -151,7 +151,7 @@ def run():
             master_addr, _ = address.split(':')
             
             print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
-            num_train_actors = dist.get_world_size()
+            num_train_actors = dist.get_world_size() // 2
             # Create actors - rank 0 will allocate master address/port
             train_actors = []
 

From f68c703d4ee77907cfb353c7fb2e0d1a20847b7b Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 1 Jul 2025 00:33:23 +0000
Subject: [PATCH 037/107] import; new method

---
 ray_test/test_torch_ray_distributed.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 08dc3617..9934d5fe 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -10,6 +10,8 @@
 
 from datetime import timedelta
 
+from compose_rl.algorithms.online.generation_utils import create_vllm_engines, init_process_group
+
 
 def ray_noset_visible_devices():
     return os.environ.get('RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES', '0') == '1'
@@ -96,9 +98,6 @@ def _allocate_master_address(self):
 
         if self.master_port is None:
             # Allocate a free port
-            # with socket.socket() as sock:
-            #     sock.bind(("", 0))
-            #     self.master_port = sock.getsockname()[1]
             self.master_port = self.get_free_port()
     
     def get_master_address(self) -> Tuple[Optional[str], Optional[int]]:
@@ -130,6 +129,10 @@ def tensor_all_reduce(self) -> float:
         
         return x.item()
 
+    def init_vllm_process_group(self, backend: str, master_addr: str, master_port: int, world_size: int, rank: int, group_name: str):
+        """Initialize the vLLM process group."""
+        group = init_process_group(backend=backend, init_method=f'tcp://{master_addr}:{master_port}', world_size=world_size, rank=rank, group_name=group_name)
+        return dist.get_world_size(group)
 
 @contextmanager
 def start_ray_server():

From 1fda4e38aaab6410247c261afdeffece9f270e9c Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 1 Jul 2025 00:35:58 +0000
Subject: [PATCH 038/107] no import

---
 ray_test/test_torch_ray_distributed.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 9934d5fe..9083309c 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -10,7 +10,7 @@
 
 from datetime import timedelta
 
-from compose_rl.algorithms.online.generation_utils import create_vllm_engines, init_process_group
+# from compose_rl.algorithms.online.generation_utils import create_vllm_engines, init_process_group
 
 
 def ray_noset_visible_devices():
@@ -129,10 +129,10 @@ def tensor_all_reduce(self) -> float:
         
         return x.item()
 
-    def init_vllm_process_group(self, backend: str, master_addr: str, master_port: int, world_size: int, rank: int, group_name: str):
-        """Initialize the vLLM process group."""
-        group = init_process_group(backend=backend, init_method=f'tcp://{master_addr}:{master_port}', world_size=world_size, rank=rank, group_name=group_name)
-        return dist.get_world_size(group)
+    # def init_vllm_process_group(self, backend: str, master_addr: str, master_port: int, world_size: int, rank: int, group_name: str):
+    #     """Initialize the vLLM process group."""
+    #     group = init_process_group(backend=backend, init_method=f'tcp://{master_addr}:{master_port}', world_size=world_size, rank=rank, group_name=group_name)
+    #     return dist.get_world_size(group)
 
 @contextmanager
 def start_ray_server():

From 6daa40c10fa0b369923fd11b2f004133c97ae45c Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 1 Jul 2025 00:40:11 +0000
Subject: [PATCH 039/107] import seems the killer

---
 ray_test/test_torch_ray_distributed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 9083309c..8f3dc356 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -10,7 +10,7 @@
 
 from datetime import timedelta
 
-# from compose_rl.algorithms.online.generation_utils import create_vllm_engines, init_process_group
+from compose_rl.algorithms.online.generation_utils import init_process_group
 
 
 def ray_noset_visible_devices():

From 42ab5219ddb3f364c6199e042171749ee752c2e3 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 1 Jul 2025 00:49:41 +0000
Subject: [PATCH 040/107] import vllm

---
 ray_test/test_torch_ray_distributed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 8f3dc356..37d62d12 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -10,7 +10,7 @@
 
 from datetime import timedelta
 
-from compose_rl.algorithms.online.generation_utils import init_process_group
+from compose_rl.algorithms.online.generation_utils import init_process_group, create_vllm_engines
 
 
 def ray_noset_visible_devices():

From c86b2dfcfe1609faa687042ac020b2a0a5da67b3 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 1 Jul 2025 00:52:21 +0000
Subject: [PATCH 041/107] change process group init timeout to 30s

---
 ray_test/test_torch_ray_distributed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 37d62d12..b0f34747 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -107,7 +107,7 @@ def get_master_address(self) -> Tuple[Optional[str], Optional[int]]:
     def init_default_process_group(self) -> bool:
         """Initialize the distributed process group."""         
         # Initialize process group
-        dist.init_process_group(timeout=timedelta(seconds=10))
+        dist.init_process_group(timeout=timedelta(seconds=30))
         print(f'is distributed initialized: {dist.is_initialized()}')
         # Print debug information
         num_visible_devices = torch.cuda.device_count()

From 160d86175c27bf4351d972cc3ad02c3469adbfd3 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 1 Jul 2025 00:54:16 +0000
Subject: [PATCH 042/107] def method

---
 ray_test/test_torch_ray_distributed.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index b0f34747..94ffb95a 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -129,10 +129,10 @@ def tensor_all_reduce(self) -> float:
         
         return x.item()
 
-    # def init_vllm_process_group(self, backend: str, master_addr: str, master_port: int, world_size: int, rank: int, group_name: str):
-    #     """Initialize the vLLM process group."""
-    #     group = init_process_group(backend=backend, init_method=f'tcp://{master_addr}:{master_port}', world_size=world_size, rank=rank, group_name=group_name)
-    #     return dist.get_world_size(group)
+    def init_vllm_process_group(self, backend: str, master_addr: str, master_port: int, world_size: int, rank: int, group_name: str):
+        """Initialize the vLLM process group."""
+        group = init_process_group(backend=backend, init_method=f'tcp://{master_addr}:{master_port}', world_size=world_size, rank=rank, group_name=group_name)
+        return dist.get_world_size(group)
 
 @contextmanager
 def start_ray_server():

From 8eafdc6dd4846d6cdf4eaf104a3f86432b7efb57 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 1 Jul 2025 07:42:37 +0000
Subject: [PATCH 043/107] add back vllm init

---
 ray_test/test_torch_ray_distributed.py | 41 +++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 94ffb95a..879241ca 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -94,7 +94,7 @@ def _allocate_master_address(self):
         """Allocate master address and port for rank 0."""
         if self.master_addr is None:
             # Get the local IP address
-            self.master_addr = ray.util.get_node_ip_address().strip('[]')
+            self.master_addr = self.get_node_ip()
 
         if self.master_port is None:
             # Allocate a free port
@@ -181,7 +181,46 @@ def run():
             results = ray.get(reduce_tasks)
             print(f"All-reduce results: {results}")
 
+            # Perform tensor all_reduce on all actors
+            reduce_tasks = [actor.tensor_all_reduce.remote() for actor in train_actors]
+            results = ray.get(reduce_tasks)
+            print(f"All-reduce results: {results}")
 
+            vllm_tensor_parallel_size = 8
+            num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size
+            print(f'num_vllm_engines: {num_vllm_engines}')
+            vllm_engines = create_vllm_engines(
+                num_engines=num_vllm_engines,
+                tensor_parallel_size=vllm_tensor_parallel_size,
+                enforce_eager=True,
+                pretrain='meta-llama/Llama-3.2-1B-Instruct',
+                revision=None,
+                seed=1,
+                enable_prefix_caching=False,
+                max_model_len=2048,
+            )
+
+            new_port = ray.get(master_actor.get_free_port.remote())
+            print(f'new_port: {new_port}')
+            refs = [
+                engine.init_process_group.remote(
+                    master_addr,
+                    new_port,
+                    i * vllm_tensor_parallel_size + 1,
+                    dist.get_world_size() // 2 + 1,
+                    'weight-update',
+                    backend='nccl',
+                ) for i, engine in enumerate(vllm_engines)
+            ]
+            refs.append(master_actor.init_vllm_process_group.remote(
+                backend='nccl',
+                master_addr=master_addr,
+                master_port=new_port,
+                world_size=dist.get_world_size() // 2 + 1,
+                rank=0,
+                group_name='weight-update',
+            ))
+            print(ray.get(refs))
 
 if __name__ == '__main__':
     run()

From 0dc9f9583811567a5b3bbfb4231572c714603326 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 1 Jul 2025 07:43:56 +0000
Subject: [PATCH 044/107] rm dup code

---
 ray_test/test_torch_ray_distributed.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 879241ca..45e86ef4 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -181,11 +181,6 @@ def run():
             results = ray.get(reduce_tasks)
             print(f"All-reduce results: {results}")
 
-            # Perform tensor all_reduce on all actors
-            reduce_tasks = [actor.tensor_all_reduce.remote() for actor in train_actors]
-            results = ray.get(reduce_tasks)
-            print(f"All-reduce results: {results}")
-
             vllm_tensor_parallel_size = 8
             num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size
             print(f'num_vllm_engines: {num_vllm_engines}')

From d3c1a7c091efaa07070e8b94c25108e9d651fe3b Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 1 Jul 2025 07:49:21 +0000
Subject: [PATCH 045/107] comment out worker node

---
 compose_rl/algorithms/online/generation_utils/vllm_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compose_rl/algorithms/online/generation_utils/vllm_utils.py b/compose_rl/algorithms/online/generation_utils/vllm_utils.py
index b844480d..80c6703d 100644
--- a/compose_rl/algorithms/online/generation_utils/vllm_utils.py
+++ b/compose_rl/algorithms/online/generation_utils/vllm_utils.py
@@ -205,7 +205,7 @@ def create_vllm_engines(
     bundles = [{
         'GPU': 1,
         'CPU': 1,
-        'worker_node': 1,
+        # 'worker_node': 1,
     }] * tensor_parallel_size * num_engines
     pg = placement_group(bundles, strategy='PACK')  # type: ignore
 

From 8eb01160047adf4dfc8793278b27ff8d699e3acc Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 1 Jul 2025 19:40:51 +0000
Subject: [PATCH 046/107] running generations

---
 ray_test/test_torch_ray_distributed.py | 35 ++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 45e86ef4..775a4710 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -7,9 +7,10 @@
 import time
 from contextlib import contextmanager
 from typing import Optional, Tuple
-
+import argparse
 from datetime import timedelta
-
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from vllm import LLM
 from compose_rl.algorithms.online.generation_utils import init_process_group, create_vllm_engines
 
 
@@ -81,6 +82,8 @@ def __init__(self, rank: int, world_size: int, master_addr: Optional[str] = None
 
         os.environ["MASTER_ADDR"] = self.master_addr
         os.environ["MASTER_PORT"] = str(self.master_port)
+
+        self.model = None
     
     def get_node_ip(self):
         return ray.util.get_node_ip_address().strip('[]')
@@ -119,6 +122,13 @@ def init_default_process_group(self) -> bool:
         print(f'local_rank: {dist.get_rank() % 8}')
         print(f'master_addr: {self.master_addr}')
         print(f'master_port: {self.master_port}')
+    
+    def init_tokenizer_and_llm(self, model_name: str):
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
+        embedding_layer = transformers_model.get_input_embeddings()
+        llm = LLM(model=model_name, enable_prompt_embeds=True)
+        return tokenizer, embedding_layer, llm
 
     def tensor_all_reduce(self) -> float:
         """Perform a simple tensor all_reduce operation."""
@@ -148,7 +158,12 @@ def start_ray_server():
         dist.barrier()
         dist.destroy_process_group()
 
-def run():
+
+def run(tp_size: int = 8):
+    prompts = [
+        "what is RAY?",
+        "what is vLLM?",
+    ]
     with start_ray_server() as address:
         if dist.get_rank() == 0:
             master_addr, _ = address.split(':')
@@ -181,7 +196,7 @@ def run():
             results = ray.get(reduce_tasks)
             print(f"All-reduce results: {results}")
 
-            vllm_tensor_parallel_size = 8
+            vllm_tensor_parallel_size = tp_size
             num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size
             print(f'num_vllm_engines: {num_vllm_engines}')
             vllm_engines = create_vllm_engines(
@@ -217,5 +232,15 @@ def run():
             ))
             print(ray.get(refs))
 
+            ref = vllm_engines[0].generate.remote(prompts)
+            gen_results = ray.get(ref)
+            for output in gen_results:
+                prompt = output.prompt
+                generated_text = output.outputs[0].text
+                print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
 if __name__ == '__main__':
-    run()
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--tp_size', type=int, default=8)
+    args = parser.parse_args()
+    run(tp_size=args.tp_size)

From 231674deccdf36ef13ae5644011edc32862f0c00 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 2 Jul 2025 00:33:55 +0000
Subject: [PATCH 047/107] weight update done

---
 ray_test/test_torch_ray_distributed.py | 35 ++++++++++++++++++--------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 775a4710..0bd20690 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -9,8 +9,8 @@
 from typing import Optional, Tuple
 import argparse
 from datetime import timedelta
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from vllm import LLM
+from transformers import AutoModelForCausalLM
+from compose_rl.algorithms.online.generation_utils.vllm_actor import LLMRayActor
 from compose_rl.algorithms.online.generation_utils import init_process_group, create_vllm_engines
 
 
@@ -84,6 +84,7 @@ def __init__(self, rank: int, world_size: int, master_addr: Optional[str] = None
         os.environ["MASTER_PORT"] = str(self.master_port)
 
         self.model = None
+        self.model_update_group = None
     
     def get_node_ip(self):
         return ray.util.get_node_ip_address().strip('[]')
@@ -123,12 +124,15 @@ def init_default_process_group(self) -> bool:
         print(f'master_addr: {self.master_addr}')
         print(f'master_port: {self.master_port}')
     
-    def init_tokenizer_and_llm(self, model_name: str):
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
-        embedding_layer = transformers_model.get_input_embeddings()
-        llm = LLM(model=model_name, enable_prompt_embeds=True)
-        return tokenizer, embedding_layer, llm
+    def init_model(self, model_name: str):
+        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype='auto')
+        self.model.to('cuda')
+
+    def sync_weights(self, vllm_engines: list[LLMRayActor]):
+        for name, p in self.model.named_parameters():
+            refs = [engine.update_weight.remote(name, p.dtype, p.shape, empty_cache=False) for engine in vllm_engines]
+            dist.broadcast(p, src=0, group=self.model_update_group)
+            ray.get(refs)
 
     def tensor_all_reduce(self) -> float:
         """Perform a simple tensor all_reduce operation."""
@@ -141,8 +145,8 @@ def tensor_all_reduce(self) -> float:
 
     def init_vllm_process_group(self, backend: str, master_addr: str, master_port: int, world_size: int, rank: int, group_name: str):
         """Initialize the vLLM process group."""
-        group = init_process_group(backend=backend, init_method=f'tcp://{master_addr}:{master_port}', world_size=world_size, rank=rank, group_name=group_name)
-        return dist.get_world_size(group)
+        self.model_update_group = init_process_group(backend=backend, init_method=f'tcp://{master_addr}:{master_port}', world_size=world_size, rank=rank, group_name=group_name)
+        return dist.get_world_size(self.model_update_group)
 
 @contextmanager
 def start_ray_server():
@@ -164,6 +168,7 @@ def run(tp_size: int = 8):
         "what is RAY?",
         "what is vLLM?",
     ]
+    pretrain_model_name = 'meta-llama/Llama-3.2-1B-Instruct'
     with start_ray_server() as address:
         if dist.get_rank() == 0:
             master_addr, _ = address.split(':')
@@ -203,7 +208,7 @@ def run(tp_size: int = 8):
                 num_engines=num_vllm_engines,
                 tensor_parallel_size=vllm_tensor_parallel_size,
                 enforce_eager=True,
-                pretrain='meta-llama/Llama-3.2-1B-Instruct',
+                pretrain=pretrain_model_name,
                 revision=None,
                 seed=1,
                 enable_prefix_caching=False,
@@ -232,6 +237,14 @@ def run(tp_size: int = 8):
             ))
             print(ray.get(refs))
 
+            refs = [actor.init_model.remote(pretrain_model_name) for actor in train_actors]
+            ray.get(refs)
+            print('init model done')
+
+            refs = [actor.sync_weights.remote(vllm_engines) for actor in train_actors]
+            ray.get(refs)
+            print('sync weights done')
+
             ref = vllm_engines[0].generate.remote(prompts)
             gen_results = ray.get(ref)
             for output in gen_results:

From d296b4ef7ab2411e89c1290a3f40bb415485e352 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 2 Jul 2025 06:14:07 +0000
Subject: [PATCH 048/107] not sync weight

---
 ray_test/test_torch_ray_distributed.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 0bd20690..d93aa906 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -241,9 +241,9 @@ def run(tp_size: int = 8):
             ray.get(refs)
             print('init model done')
 
-            refs = [actor.sync_weights.remote(vllm_engines) for actor in train_actors]
-            ray.get(refs)
-            print('sync weights done')
+            # refs = [actor.sync_weights.remote(vllm_engines) for actor in train_actors]
+            # ray.get(refs)
+            # print('sync weights done')
 
             ref = vllm_engines[0].generate.remote(prompts)
             gen_results = ray.get(ref)

From fe582a64e03cf354c3b35c1fb3838eb861eaf8fe Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 2 Jul 2025 06:20:18 +0000
Subject: [PATCH 049/107] revert back

---
 ray_test/test_torch_ray_distributed.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index d93aa906..0bd20690 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -241,9 +241,9 @@ def run(tp_size: int = 8):
             ray.get(refs)
             print('init model done')
 
-            # refs = [actor.sync_weights.remote(vllm_engines) for actor in train_actors]
-            # ray.get(refs)
-            # print('sync weights done')
+            refs = [actor.sync_weights.remote(vllm_engines) for actor in train_actors]
+            ray.get(refs)
+            print('sync weights done')
 
             ref = vllm_engines[0].generate.remote(prompts)
             gen_results = ray.get(ref)

From f44f4691f3df9c37358b3aae7863b56f3d7a090e Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 2 Jul 2025 06:33:34 +0000
Subject: [PATCH 050/107] call master only

---
 ray_test/test_torch_ray_distributed.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 0bd20690..61a73350 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -241,8 +241,7 @@ def run(tp_size: int = 8):
             ray.get(refs)
             print('init model done')
 
-            refs = [actor.sync_weights.remote(vllm_engines) for actor in train_actors]
-            ray.get(refs)
+            ray.get(master_actor.sync_weights.remote(vllm_engines))
             print('sync weights done')
 
             ref = vllm_engines[0].generate.remote(prompts)

From 01862b931d620f11079449f6b5a7567027e91ff9 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 4 Jul 2025 01:02:28 +0000
Subject: [PATCH 051/107] update pyproject for pyright filtering

---
 pyproject.toml                         | 5 +++++
 ray_test/test_torch_ray_distributed.py | 7 ++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 038ff73b..64761916 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -122,6 +122,11 @@ include = [
 exclude = ['env-**', 'venv*', '.venv']
 stubPath = ""  # suppress useless 'stubPath is not a valid directory' errors
 
+typeCheckingMode = "standard"
+reportArgumentType = "none"
+reportAttributeAccessIssue = "none"
+reportOptionalMemberAccess = "none"
+reportCallIssue = "none"
 reportUnnecessaryIsInstance = "none" # it is ok to do this for clarity or safety
 reportMissingTypeStubs = "none"
 reportIncompatibleMethodOverride = "none"
diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
index 61a73350..e8926f5e 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/ray_test/test_torch_ray_distributed.py
@@ -10,9 +10,10 @@
 import argparse
 from datetime import timedelta
 from transformers import AutoModelForCausalLM
-from compose_rl.algorithms.online.generation_utils.vllm_actor import LLMRayActor
 from compose_rl.algorithms.online.generation_utils import init_process_group, create_vllm_engines
 
+from typing import Any
+
 
 def ray_noset_visible_devices():
     return os.environ.get('RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES', '0') == '1'
@@ -108,7 +109,7 @@ def get_master_address(self) -> Tuple[Optional[str], Optional[int]]:
         """Return the master address and port as a tuple."""
         return (self.master_addr, self.master_port)
     
-    def init_default_process_group(self) -> bool:
+    def init_default_process_group(self):
         """Initialize the distributed process group."""         
         # Initialize process group
         dist.init_process_group(timeout=timedelta(seconds=30))
@@ -128,7 +129,7 @@ def init_model(self, model_name: str):
         self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype='auto')
         self.model.to('cuda')
 
-    def sync_weights(self, vllm_engines: list[LLMRayActor]):
+    def sync_weights(self, vllm_engines: list[Any]):
         for name, p in self.model.named_parameters():
             refs = [engine.update_weight.remote(name, p.dtype, p.shape, empty_cache=False) for engine in vllm_engines]
             dist.broadcast(p, src=0, group=self.model_update_group)

From b7ebe7df961cdbd01ce699da2a3f7ff69029e06b Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 4 Jul 2025 01:03:36 +0000
Subject: [PATCH 052/107] temp single controller

---
 .../online/single_controller_callback.py      | 996 ++++++++++++++++++
 1 file changed, 996 insertions(+)
 create mode 100644 compose_rl/algorithms/online/single_controller_callback.py

diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
new file mode 100644
index 00000000..07bb9eb9
--- /dev/null
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -0,0 +1,996 @@
+# Copyright 2024 MosaicML ComposeRL authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Online On-Policy RL callback."""
+
+from __future__ import annotations
+
+import logging
+import time
+from itertools import chain
+from typing import Any, Optional, Union
+
+import torch
+import wandb
+from composer.core import (
+    Precision,
+    State,
+    TimeUnit,
+    ensure_time,
+    get_precision_context,
+)
+from composer.core.data_spec import _default_split_batch
+from composer.loggers import Logger, MLFlowLogger, WandBLogger
+from composer.trainer.trainer import _get_initial_device_train_microbatch_size
+from composer.utils import dist, ensure_tuple
+from llmfoundry.interfaces import CallbackWithConfig
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+
+from compose_rl.algorithms.online.generation_utils import (
+    broadcast_to_vllm,
+    vllm_generate,
+)
+from compose_rl.algorithms.online.model import (
+    ComposerHFPolicyLM,
+    ComposerMPTPolicyLM,
+)
+from compose_rl.algorithms.online.model_methods import (
+    OnPolicyEnum,
+)
+from compose_rl.algorithms.online.reward_manager import (
+    ReferenceOutput,
+    RewardManager,
+    RewardOutput,
+)
+from compose_rl.data.buffer import MinibatchRolloutBuffer
+from compose_rl.registry_builders import build_kl_controller
+from compose_rl.utils import (
+    compute_advantages,
+    dist_compute_masked_mean_and_var,
+    flatten,
+    get_decoded_sequence,
+    get_entropies,
+    get_log_probs,
+    mask_eos,
+    masked_mean,
+    masked_sum,
+    switch_left_to_right_padding,
+)
+
+Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
+Policy = Union[ComposerHFPolicyLM, ComposerMPTPolicyLM]
+
+__all__ = ['OnPolicyCallback', 'env_reward']
+
+log = logging.getLogger(__name__)
+
+
+def env_reward(
+    actor_critic: Policy,
+    reward_manager: RewardManager,
+    batch: dict,
+    max_gen_len: int,
+    precision: Precision,
+    device_train_microbatch_size: int,
+    tokenizer: Tokenizer,
+    eos_token_ids: list[int],
+    kl_estimator: Optional[str] = 'k1',
+    kl_clip_range: Optional[float] = 40.0,
+) -> tuple[
+    dict[str, torch.Tensor],
+    list[tuple[str, str]],
+    ReferenceOutput,
+    RewardOutput,
+]:
+    """Run reward on the model generated responses.
+
+    Runs reward over a set of sequences in the batch. It also does extra computation
+    that is required for later loss computation.
+
+    Args:
+        actor_critic (Policy): Actor critic model to run reward over.
+        reward_manager (RewardManager): Composes the reference IFT model and all reward models.
+        batch (dict): The batch of data to run reward over.
+        max_gen_len (int): Maximum generation length.
+        precision (Precision): Precision to run computation.
+        device_train_microbatch_size (int): Device train microbatch size for the training job.
+            We need to do all log_prob computation with this in order to maintain numerics.
+        tokenizer (Tokenizer): The actor critic's tokenizer.
+        eos_token_ids (list[int]): A list of eos token ids.
+        kl_estimator (str): Which kl estimator to use. Options are 'k1', 'k2', 'k3' and 'k3_offpolicy'.
+        kl_clip_range (float): The clip range for the KL divergence.
+
+    Returns:
+        partial_env_output (dict[str, tensor]): Partially complete dictionary of return elements suitable
+            for PPO training
+        untokenized_prompt_and_responses (list): List of (str, str) tuples, each containing the decoded
+            prompt and response token sequences, respectively
+        ref_output (ReferenceOutput): Pair of tensors corresponding to the KL penalty and
+            log prob sequences obtained from the reference model. If the reference model is non-blocking,
+            this will be an AsyncResult object that will resolve to the described output.
+        all_rewards (RewardOutput): Dictionary of tensors containing the reward output
+            from each reward model managed by the reward manager. If reward model "X" is non-blocking,
+            then all_rewards["X"] will be an AsyncResult object that will resolve to associated reward tensor.
+
+    Note:
+        Use the .get() method on an AsyncResult object (see Returns, above) to resolve it.
+    """
+    prompt_tokens = batch['prompt']
+
+    batch_size, _ = prompt_tokens.shape
+
+    pad_token_id = tokenizer.pad_token_id
+
+    if pad_token_id is None:
+        raise ValueError(
+            'Tokenizer does not have a pad token id. Please use a different tokenizer or add a pad token id.',
+        )
+
+    with get_precision_context(precision), torch.no_grad():
+        prompt_len = batch['prompt_len']
+        verified_answers = batch.get('verified_answer', None)
+        prompt_id = batch['prompt_id']
+        cur_device = prompt_tokens.device
+        prompt_dtype = prompt_tokens.dtype
+
+        assert 'sequences' in batch, f'sequences is not in batch {batch.keys()=}'
+
+        sequences = batch['sequences']
+        generated_len = torch.ones(
+            batch_size,
+            device=cur_device,
+            dtype=prompt_dtype,
+        ) * max_gen_len
+
+        # If all the processes early exit generate, then we need to manually pad everything
+        # we can pad this with pad tokens, since we switch the padding between left and right
+        # padding based on the sequence length + max_sequence_length.
+        if prompt_tokens.size(1) + max_gen_len > sequences.size(1):
+            len_to_pad = max_gen_len - (
+                sequences.size(1) - prompt_tokens.size(1)
+            )
+
+            extra_padding = torch.ones(
+                (batch_size, len_to_pad),
+                device=cur_device,
+                dtype=prompt_dtype,
+            ) * pad_token_id
+            sequences = torch.cat(
+                [sequences, extra_padding],  # type: ignore
+                dim=-1,  # type: ignore
+            )
+
+        # Sanity checking we're adding max_gen_len to prompt_tokens
+        if prompt_tokens.size(1) + max_gen_len != sequences.size(1):
+            raise ValueError(
+                f'Prompts {prompt_tokens.size(1)} + max_gen_len {max_gen_len} != sequences {sequences.size(1)}',
+            )
+
+        # Actions are what tokens the current policy would generate.
+        actions = sequences[:, -max_gen_len:]
+
+        right_padded_obs = switch_left_to_right_padding(
+            sequences,
+            prompt_len,
+            max_gen_len,
+            pad_token_id,  # type: ignore
+        )
+        right_padded_attn_mask = torch.logical_not(
+            torch.eq(right_padded_obs, pad_token_id),  # type: ignore
+        )
+
+        (
+            right_padded_obs,
+            right_padded_attn_mask,
+            generated_len,
+            action_mask,
+        ) = mask_eos(
+            actions=actions,
+            right_padded_obs=right_padded_obs,
+            right_padded_attn_mask=right_padded_attn_mask,
+            prompt_len=prompt_len,
+            generated_len=generated_len,
+            max_gen_len=max_gen_len,
+            eos_token_ids=eos_token_ids,  # type: ignore
+            pad_token=pad_token_id,  # type: ignore
+        )
+
+        untokenized_prompt_and_responses = []
+        for i in range(batch_size):
+            prompt = tokenizer.decode(  # type: ignore
+                right_padded_obs[i, :prompt_len[i]])
+            generated_text = tokenizer.decode(  # type:  ignore
+                get_decoded_sequence(actions[i], generated_len[i],
+                                            max_gen_len))
+            untokenized_prompt_and_responses.append((prompt, generated_text),)
+
+        # Making logits [batch_size, generated_len, vocab_size]
+        # We need to recompute the logits here. Otherwise there are numerical differences
+        # We also need to do it on the size of `device_train_microbatch_size` otherwise
+        # there are numerical differences at training time.
+        # log probs will be [batch_size, generated_len]
+        log_probs = []
+        entropies = []
+        values = []
+
+        input_model_kwargs = {
+            'obs': right_padded_obs,
+            'right_padded_attn_mask': right_padded_attn_mask,
+            'prompt_len': prompt_len,
+            'max_gen_len': max_gen_len,
+            'action_mask': action_mask,
+            'actions': actions,
+        }
+
+        microbatch_splits = _default_split_batch(
+            batch=input_model_kwargs,
+            microbatch_size=device_train_microbatch_size,
+        )
+        # Compute the device_train_microbatch_log_probs inside the for loop to reduce the softmax overhead
+        for split in microbatch_splits:
+            curr_kwargs = split
+
+            cur_output = actor_critic(curr_kwargs)
+            cur_logits = cur_output['logits']
+            # need to pull out current actions and prompt len
+            cur_actions = curr_kwargs['actions']
+            cur_action_mask = curr_kwargs['action_mask']
+            cur_prompt_len = curr_kwargs['prompt_len']
+
+            cur_log_probs = get_log_probs(
+                logits=cur_logits,
+                actions=cur_actions,
+                prompt_len=cur_prompt_len,
+                max_gen_len=max_gen_len,
+            )
+            cur_entropies = get_entropies(
+                logits=cur_logits,
+                action_mask=cur_action_mask,
+                prompt_len=cur_prompt_len,
+                max_gen_len=max_gen_len,
+            )
+            log_probs.append(cur_log_probs)
+            entropies.append(cur_entropies)
+            # Ignore values when the model doesn't have a value head
+            if 'values' in cur_output:
+                cur_values = cur_output['values']
+                values.append(cur_values)
+
+        device_train_microbatch_log_probs = torch.cat(log_probs)
+        device_train_microbatch_entropies = torch.cat(entropies)
+
+        partial_env_output = {
+            'prompt_id': prompt_id,
+            'actions': actions,
+            'old_log_probs': device_train_microbatch_log_probs,
+            'old_entropies': device_train_microbatch_entropies,
+            'obs': right_padded_obs,
+            'generated_len': generated_len,
+            'action_mask': action_mask,
+        }
+        if len(values) > 0:
+            device_train_microbatch_values = torch.cat(values)
+
+            # Need to add in the padding for the value function
+            value_action_mask = torch.cat([
+                action_mask,
+                torch.zeros((batch_size, 1), device=cur_device),
+            ],
+                                          dim=-1)
+            device_train_microbatch_values *= value_action_mask
+            partial_env_output['values'] = device_train_microbatch_values
+        # Future implementations may change the way reward_seq_len is defined
+        # e.g., if special formatting is applied
+        reward_seq_len = prompt_len + generated_len
+
+        ref_output, all_rewards = reward_manager(
+            raw_untokenized_texts=untokenized_prompt_and_responses,
+            right_padded_obses=right_padded_obs,
+            attention_masks=right_padded_attn_mask,
+            seq_lens=reward_seq_len,
+            generated_lens=generated_len,
+            prompt_lens=prompt_len,
+            max_gen_length=max_gen_len,
+            actions=actions,
+            action_log_probs=device_train_microbatch_log_probs,
+            device_train_microbatch_size=device_train_microbatch_size,
+            kl_estimator=kl_estimator,
+            kl_clip_range=kl_clip_range,
+            verified_answers=verified_answers,
+        )
+
+    return (
+        partial_env_output,
+        untokenized_prompt_and_responses,
+        ref_output,
+        all_rewards,
+    )
+
+
+class OnPolicyCallback(CallbackWithConfig):
+    """Callback for managing on-policy training in an RLHF loop.
+
+    Args:
+        train_config (dict): Training config passed to callback via foundry train.py as
+            callback is registered under callbacks_with_config registry.
+    """
+
+    def __init__(
+        self,
+        train_config: dict,
+    ):
+        var_config = train_config['variables']
+
+        # The maximum generation length.
+        self.max_gen_len: int = var_config.get('max_gen_len', 32)
+        # Gamma discounting for computing returns.
+        self.gamma = var_config.get('gamma', 1.0)
+        # Value used in the generalized advantage estimate calculation.
+        self.lambda_gae = var_config.get('lambda_gae', 1.0)
+
+        # Other algo specific hparams
+
+        # Which kl estimator to use
+        if 'kl_estimator' not in train_config['model']:
+            # TODO: Modify PPO to nuke config_overrides in the future
+            # Check in model's config_overrides
+            kl_estimator = train_config['model']['config_overrides'].get(
+                'kl_estimator',
+                'k1',
+            )
+        else:
+            kl_estimator = train_config['model'].get('kl_estimator', 'k1')
+        if kl_estimator not in ['k1', 'k2', 'k3', 'k3_offpolicy']:
+            raise ValueError(
+                f'Invalid kl estimator: {kl_estimator}. ' +
+                'Valid options are: k1, k2, k3, k3_offpolicy.',
+            )
+        self.kl_estimator = kl_estimator
+
+        if 'kl_clip_range' not in train_config['model']:
+            # TODO: Modify PPO to nuke config_overrides in the future
+            # Check in model's config_overrides
+            kl_clip_range = train_config['model']['config_overrides'].get(
+                'kl_clip_range',
+                40.0,
+            )
+        else:
+            kl_clip_range = train_config['model'].get('kl_clip_range', 40.0)
+        if kl_clip_range <= 0:
+            raise ValueError(
+                f'Invalid kl clip range: {kl_clip_range}. ' +
+                'Must be greater than 0.',
+            )
+        # check for precision and clip range
+        precision = train_config['precision']
+        if precision != 'fp32':
+            if kl_clip_range > 50.0:
+                log.warning(
+                    f'Clip value of {kl_clip_range=} will not be effective with {precision=} as range for tensors is too small',
+                )
+        self.kl_clip_range = kl_clip_range
+
+        # Generation keyword arguments.
+        self.generation_kwargs = var_config.get('generation_kwargs')
+        # The value to center the reward mean around.
+        self.center_reward_mean = var_config.get('center_reward_mean', None)
+
+        # The reward config which we will use to make the RewardManager.
+        self.reward_cfg = var_config['rewards']
+        self.max_seq_len = train_config['max_seq_len']
+        self.non_train_fsdp_config = var_config.get(
+            'non_train_fsdp_config',
+            train_config['fsdp_config'],
+        )
+        self.ref_config = var_config['reference_model']
+
+        # Per-device generate size.
+        self.device_generate_batch_size: int = var_config.get(
+            'device_generate_batch_size',
+            1,
+        )
+        self.device_train_batch_size: int = train_config.get(
+            'device_train_batch_size',
+            None,
+        )
+        assert self.device_train_batch_size is not None
+
+        # Number of batches to use for a single PPO epoch.
+        self.num_batches_per_update = var_config.get(
+            'num_batches_per_update',
+            1,
+        )
+        # Number of generations per prompt for a single PPO epoch.
+        self.generations_per_prompt: int = var_config.get(
+            'generations_per_prompt',
+            1,
+        )
+
+        if self.num_batches_per_update % self.generations_per_prompt != 0:
+            raise ValueError(
+                f'{self.num_batches_per_update=} must be divisible by {self.generations_per_prompt=}',
+            )
+
+        self.epochs_per_iteration = ensure_time(
+            var_config.get('epoch_per_iteration', 1),
+            TimeUnit.EPOCH,
+        )
+        assert self.epochs_per_iteration.unit == TimeUnit.EPOCH
+
+        # Programmatically setting the max buffer size instead of the yaml
+        var_config['buffer']['max_buffer_size'] = self.num_batches_per_update
+        self.buffer = MinibatchRolloutBuffer(var_config['buffer'])
+
+        # Build the KL controller through registries
+        kl_ctl_name = var_config['kl_controller'].pop('kl_ctl_type')
+        self.kl_ctl = build_kl_controller(
+            name=kl_ctl_name,
+            kwargs=var_config['kl_controller'],
+        )
+
+        self.kl_ift = []
+
+        self.wandb_logger = None
+        self.mlflow_logger = None
+        self.prompts_and_gens = []
+        self.prompt_ids_rewards_and_answers = []
+        self.iter_num = 0
+        self.train_prompt_loader_state_dict = None
+        self.train_prompt_loader = None
+
+        self.input_eos_token_ids = var_config.get('eos_token_ids', None)
+
+        if train_config.get('python_log_level', None) is not None:
+            logging.getLogger('compose_rl').setLevel(
+                train_config['python_log_level'].upper(),
+            )
+            logging.getLogger(__name__).setLevel(
+                train_config['python_log_level'].upper(),
+            )
+
+
+    def init(self, state: State, logger: Logger):
+        self.pad_token_idx = state.model.tokenizer.pad_token_id  # type: ignore
+        self.actor_critic = state.model
+
+        if self.actor_critic.loss_type == OnPolicyEnum.GRPO:
+            assert self.generations_per_prompt > 1, \
+                'GRPO requires multiple generations per prompt. ' + \
+                f'Current generations_per_prompt is: {self.generations_per_prompt}.'
+
+        # TODO (#158): do this through composer.
+        for destination in ensure_tuple(logger.destinations):
+            if isinstance(destination, WandBLogger):
+                self.wandb_logger = destination
+            if isinstance(destination, MLFlowLogger):
+                self.mlflow_logger = destination
+
+        # Set iteration_length
+        state._iteration_length = self.epochs_per_iteration
+
+        self.precision = state.precision
+        self.device_train_microbatch_size: int = state.device_train_microbatch_size  # type: ignore
+        if self.device_train_microbatch_size == 'auto':  # type: ignore
+            raise ValueError('auto microbatching is not supported for PPO')
+
+        self.iter_batch_size = self.num_batches_per_update * self.device_train_batch_size
+
+        # The KL penalty in the reward should only exist if we aren't minimizing
+        # the KL directly in the loss.
+        kl_penalty_in_reward = True
+
+        if hasattr(self.actor_critic, 'compute_kl_loss'):
+            kl_penalty_in_reward = not self.actor_critic.compute_kl_loss
+
+        self.reward_manager = RewardManager(
+            config=self.reward_cfg,
+            ref_config=self.ref_config,
+            tokenizer=self.actor_critic.tokenizer, # type: ignore
+            max_seq_len=self.max_seq_len,
+            fsdp_config=self.non_train_fsdp_config,
+            precision=state.precision,
+            kl_penalty_in_reward=kl_penalty_in_reward,
+        )
+
+        # This is needed to ensure PyTorch 2.4 checkpointing doesn't break
+        self.actor_critic.tokenizer.batch_encode_plus( # type: ignore
+            batch_text_or_text_pairs=['Dummy input'],
+            padding='longest',
+            truncation=True,
+            return_attention_mask=True,
+        )
+
+
+    def before_load(self, state: State, logger: Logger):
+        del logger
+        self.train_prompt_loader = state.train_dataloader
+
+    def after_load(self, state: State, logger: Logger):
+        del logger  # unused
+        # This needs to be done here because callbacks are init'd before we attach
+        # the dataloader as a property to state
+        self.tokenizer = state.model.tokenizer
+        self.eos_token_ids = [self.tokenizer.eos_token_id]  # type: ignore
+        if self.input_eos_token_ids is not None:
+            self.eos_token_ids = self.input_eos_token_ids
+            log.info(
+                f'The online RL loop will assume the following eos token ids {self.eos_token_ids}',
+            )
+            for eos_token_id in self.eos_token_ids:
+                log.info(
+                    f'Token {eos_token_id} is {self.tokenizer.decode([eos_token_id])}.',  # type: ignore
+                )
+
+        if self.pad_token_idx in self.eos_token_ids:
+            log.warning(
+                'pad_token_id is in eos_token_ids list. Be careful with any data processing going forward!',
+            )
+
+        self.train_prompt_loader_iter = iter(
+            self.train_prompt_loader,  # pyright: ignore
+        )
+
+        if self.train_prompt_loader_state_dict is not None:
+            self.train_prompt_loader.load_state_dict( # pyright: ignore
+                self.train_prompt_loader_state_dict,
+            )
+
+    def iteration_start(self, state: State, logger: Logger):
+        del logger  # unused
+
+        # batch = self._get_next_iter_prompts()
+        # batch = state.device.batch_to_device(batch)
+
+        # self._update_inference_model(batch)
+
+        # batch = self._interact_with_env(batch)
+
+        # self._get_reward(batch)
+
+        # Reset and initialize state train dataloader
+        log.warning(
+            'trainer._train_data_spec should be updated whenever the dataloader is updated',
+        )
+        # Train Dataloader: the rollout buffer becomes the dataloader for this iteration
+        state.set_dataloader(self.buffer, 'ep')
+        state.train_dataloader = state.dataloader
+        state.device_train_microbatch_size = _get_initial_device_train_microbatch_size(
+            state.device_train_microbatch_size,
+            state.auto_microbatching,
+            state.train_dataloader,
+        )
+
+        # Update IFT KL: feed the KL controller and clear the collected values
+        self._update_ift_kl()
+
+    def epoch_end(self, state: State, logger: Logger):
+        del logger  # unused
+        assert self.epochs_per_iteration == state._iteration_length
+        if self.actor_critic.determine_early_stop():  # type: ignore
+            state.timestamp.epoch_in_iteration = self.epochs_per_iteration  # presumably ends the iteration early - confirm against the trainer loop
+
+    def iteration_end(self, state: State, logger: Logger):
+        del logger  # unused
+        self._log_generations_to_logger(state)  # logs, then clears, this iteration's generations
+        self._increment_rl_iter()
+        self.buffer.reset()
+        self.buffer.set_state_dict(
+            self.train_prompt_loader.state_dict(), # pyright: ignore
+            0,
+        )
+
+    def _get_next_iter_prompts(self):
+        """Gets the next iteration's batch of prompts."""
+        # Sample fewer batches for the Online RL iteration depending on the number of generations per prompt
+        n_unique_batches = self.num_batches_per_update // self.generations_per_prompt
+        batches = [
+            self._get_single_batch_prompts() for _ in range(n_unique_batches)
+        ]
+
+        ret_batch = {}
+        for key in batches[0].keys():
+            curr_values = []
+
+            max_len = 0
+            if isinstance(batches[0][key], torch.Tensor):
+                max_len = max([batch[key].shape[-1] for batch in batches])
+
+            padding_key = None  # NOTE(review): stays None for tensor keys other than the two handled below
+            for batch in batches:
+                # Explode the batch into multiple batches for each generation
+                for _ in range(self.generations_per_prompt):
+                    # For keys that do not require additional processing
+                    if key in ['prompt_len', 'verified_answer', 'prompt_id']:
+                        curr_values.append(batch[key])
+                        continue
+
+                    bs, seq_len = batch[key].shape
+
+                    if key == 'prompt':
+                        padding_key = self.pad_token_idx
+                        if (batch[key][:, -1] == padding_key).any():
+                            raise ValueError(
+                                'The last token in the prompt should not be the pad token. Please double '
+                                +
+                                'check the dataloader and prompt and dataloader.',
+                            )
+                    elif key == 'prompt_attention_mask':
+                        padding_key = False
+
+                    # Left-pad to max_len: the pad block is concatenated in front of the batch tensor
+                    pad = torch.ones(
+                        (bs, max_len - seq_len),
+                        dtype=batch[key].dtype,
+                    ) * padding_key  # type: ignore
+                    curr_values.append(torch.cat([pad, batch[key]], dim=-1))
+
+            # For tensor fields, use torch.cat to combine the values; for string fields, just use the list
+            if isinstance(curr_values[0], torch.Tensor):
+                ret_batch[key] = torch.cat(curr_values)
+            else:
+                if key == 'verified_answer':
+                    ret_batch[key] = list(flatten(curr_values))
+                else:
+                    # this is an edge case that we will not hit currently, but just handling it as needed
+                    ret_batch[key] = curr_values
+
+        return ret_batch
+
+    def _get_single_batch_prompts(self):
+        """Gets a single batch of prompts from the dataloader."""
+        try:
+            return next(self.train_prompt_loader_iter)
+        except StopIteration:
+            # Reset the iterator to the beginning of the dataloader
+            self.train_prompt_loader_iter = iter(
+                self.train_prompt_loader,  # pyright: ignore
+            )
+            # Return the first batch from the restarted iterator
+            return next(self.train_prompt_loader_iter)
+
+    def _interact_with_env(self, batch: dict[str, torch.Tensor], vllm_engines: list[Any]):
+        """Have the policy interact with the environment.
+
+        Runs `vllm_generate` on the batch, stores the result under `batch['sequences']` and returns it.
+
+        Args:
+            batch (dict): the iteration level batch we want to interact with the environment.
+            vllm_engines (list[Any]): the vLLM engines passed to `vllm_generate`.
+        """
+        max_gen_len = self.max_gen_len
+        generation_kwargs = self.generation_kwargs
+        with get_precision_context(self.precision), torch.no_grad():
+            # Generate the sequences for the whole batch in one go with the vLLM engines
+            sequences = vllm_generate(
+                vllm_engines=vllm_engines,
+                batch=batch,
+                max_gen_len=max_gen_len,
+                generation_kwargs=generation_kwargs,
+                tokenizer=self.tokenizer,  # type: ignore
+            )
+        # Add the prepared sequences to the batch again
+        batch['sequences'] = sequences
+        return batch
+
+    def _get_reward(self, batch: dict[str, torch.Tensor]):
+        log.debug('Beginning reward computation for the rollout.')
+        start_reward_time = time.time()
+        env_outputs, prompts_and_gens, ref_outputs, all_rewards_dict = env_reward(
+            actor_critic=self.actor_critic,  # pyright: ignore
+            reward_manager=self.reward_manager,
+            batch=batch,
+            max_gen_len=self.max_gen_len,
+            precision=self.precision,
+            device_train_microbatch_size=self.device_train_microbatch_size,
+            tokenizer=self.tokenizer,  # type: ignore
+            eos_token_ids=self.eos_token_ids,  # type: ignore
+            kl_estimator=self.kl_estimator,
+            kl_clip_range=self.kl_clip_range,
+        )
+
+        end_reward_time = time.time()
+        total_reward_time = end_reward_time - start_reward_time
+        log.debug(
+            f'Finished reward computation for the rollout in {total_reward_time:.4f} seconds.',
+        )
+
+        self.prompts_and_gens.extend(prompts_and_gens)  # consumed by _log_generations_to_logger
+
+        gen_batch_partial_outputs = (env_outputs, ref_outputs, all_rewards_dict)
+        # For every partial output we want to resolve them together
+        # And compute the global per iteration batch advantage's mean and variance
+        resolved_outputs = self._resolve_outputs(
+            batch,
+            gen_batch_partial_outputs,
+        )
+
+        # Split the resolved outputs into minibatches of device_train_batch_size and buffer each one
+        for idx in range(self.iter_batch_size // self.device_train_batch_size):
+            minibatch = self._extract_minibatch(
+                resolved_outputs,
+                idx,
+                self.device_train_batch_size,
+            )
+            self.buffer.add(minibatch)
+
+        # Making sure we correctly parsed the minibatches
+        assert len(self.buffer) == self.num_batches_per_update
+
+        self.actor_critic.train()
+
+    def _extract_minibatch(
+        self,
+        batch: dict[str, torch.Tensor],
+        idx: int,
+        minibatch_size: int,
+    ) -> dict[str, torch.Tensor]:
+        """Slice the ``idx``-th minibatch out of a larger composite batch.
+
+        Every value in ``batch`` is sliced along its first dimension with the
+        same window, so the entries of the result stay aligned row for row.
+
+        Args:
+            batch (dict[str, torch.Tensor]): a batch whose entries are the
+                concatenation of one or more minibatches.
+            idx (int): zero-based position of the minibatch to extract.
+            minibatch_size (int): number of rows in each minibatch.
+
+        Returns:
+            dict[str, torch.Tensor]: the same keys as ``batch``, each holding
+                rows ``idx * minibatch_size`` up to, but excluding,
+                ``(idx + 1) * minibatch_size``.
+        """
+        # One slice object is shared by all entries.
+        offset = idx * minibatch_size
+        window = slice(offset, offset + minibatch_size)
+        minibatch = {}
+        for name, values in batch.items():
+            minibatch[name] = values[window]
+        return minibatch
+
+    def _resolve_outputs(
+        self,
+        iter_batch: dict[str, torch.Tensor],
+        partial_outputs: tuple[dict, ReferenceOutput, RewardOutput],
+    ) -> dict[str, torch.Tensor]:
+        """Resolve env/reference/reward outputs into a PPO minibatch.
+
+        Args:
+            iter_batch (dict): The batch for the current iteration; it is updated in place.
+            partial_outputs (tuple): A tuple of (env_output, reference_output, reward_output),
+                one tuple for entire ppo iter batch. This tuple is created from `env_reward`.
+
+        Returns:
+            iter_batch (dict): The input batch merged with the resolved env outputs, advantages and
+                per-sample logging scalars; values exposing `.cpu()` are moved to CPU.
+        """
+        env_outs, ref_outs, rew_dict = partial_outputs
+        rew_outs = self.reward_manager.resolve_outputs(
+            ref_output=ref_outs,
+            reward_output=rew_dict,
+            kl_ctl=self.kl_ctl,
+            action_mask=env_outs['action_mask'],
+            center_reward_mean=self.center_reward_mean,
+        )
+        env_outs.update(rew_outs)
+
+        # Keep track of prompt ids, rewards and verified answers for logging
+        prompt_ids = env_outs['prompt_id'].detach().cpu().tolist()
+        rewards = env_outs['rewards'].sum(dim=-1).detach().cpu().tolist()
+        self.prompt_ids_rewards_and_answers.extend(
+            list(zip(prompt_ids, rewards, iter_batch['verified_answer'])),
+        )
+
+        # Adding the right_padded_attn_mask to the env_outputs
+        env_outs['right_padded_attn_mask'] = torch.logical_not(
+            torch.eq(env_outs['obs'], self.pad_token_idx),  # type: ignore
+        )
+
+        # Now that rewards are resolved, we can compute advantages
+        if self.actor_critic.loss_type == OnPolicyEnum.PPO:
+            env_outs['advantages'] = compute_advantages(
+                rewards=env_outs['rewards'],
+                values=env_outs['values'],
+                gamma=self.gamma,
+                lambda_gae=self.lambda_gae,
+            )
+        elif self.actor_critic.loss_type == OnPolicyEnum.GRPO:
+            # compute GRPO advantages
+            prompt_id = env_outs['prompt_id']
+            rewards = env_outs['rewards']
+
+            # Flatten the rewards by summing on sequence length/action_mask
+            flat_rewards = masked_sum(
+                rewards,
+                env_outs['action_mask'],
+                dim=-1,
+            )
+
+            # Get unique prompt IDs and their indices
+            unique_prompt_ids, inverse_indices = torch.unique(
+                prompt_id,
+                return_inverse=True,
+            )
+
+            # Use scatter to compute means and standard deviations
+            # First, we'll create a tensor to track counts, sums, and sum of squares
+            n_unique = len(unique_prompt_ids)
+            counts = torch.zeros(n_unique, device=prompt_id.device)
+            sums = torch.zeros(n_unique, device=prompt_id.device)
+            sum_squares = torch.zeros(n_unique, device=prompt_id.device)
+
+            # Use scatter_add to accumulate values
+            counts.scatter_add_(
+                0,
+                inverse_indices,
+                torch.ones_like(flat_rewards),
+            )
+            sums.scatter_add_(0, inverse_indices, flat_rewards)
+            sum_squares.scatter_add_(0, inverse_indices, flat_rewards**2)
+
+            # Compute means and stds; E[x^2] - E[x]^2 can round below zero, so clamp before the sqrt (avoids NaN)
+            means = sums / counts
+            variances = (sum_squares / counts) - (means**2)
+            stds = torch.sqrt(torch.clamp(variances, min=0.0))
+
+            # Map back to original tensor shape
+            mean_rewards = means[inverse_indices]
+            std_rewards = stds[inverse_indices]
+
+            # Calculate GRPO advantage
+            grpo_advantage = (flat_rewards - mean_rewards)
+            # Only normalize the advantage if flag is set
+            if self.actor_critic.normalize_advantage:
+                grpo_advantage /= (std_rewards + 1e-4)
+
+            # Create advantages of the same shape as original rewards
+            advantages = torch.zeros_like(rewards)
+            # Copy the flat grpo_advantage according to action_mask
+            expanded_advantages = grpo_advantage.unsqueeze(1).expand_as(
+                env_outs['action_mask'],
+            )
+            advantages = torch.where(
+                env_outs['action_mask'].bool(),
+                expanded_advantages,
+                advantages,
+            )
+            env_outs['advantages'] = advantages
+        else:
+            raise ValueError(
+                f'Invalid loss type: {self.actor_critic.loss_type}. ' +
+                'Valid options are: ppo, grpo.',
+            )
+
+        batch_adv_mean, batch_adv_var = dist_compute_masked_mean_and_var(
+            env_outs['advantages'],
+            env_outs['action_mask'],
+        )
+
+        mean_ift = masked_mean(
+            env_outs['ift_kl'],
+            env_outs['action_mask'],
+        )
+        self.kl_ift.append(mean_ift.cpu())
+
+        iter_batch.update(env_outs)
+
+        iter_batch.update({
+            'max_gen_len':
+                torch.ones(self.iter_batch_size).to(torch.int32) *
+                self.max_gen_len,
+            'adv_masked_mean':
+                torch.ones(self.iter_batch_size) * batch_adv_mean.cpu(),
+            'adv_masked_var':
+                torch.ones(self.iter_batch_size) * batch_adv_var.cpu(),
+            'ift_kl_scalar':
+                torch.ones(self.iter_batch_size) * self.kl_ctl.value,
+            'reward_std':
+                torch.ones(self.iter_batch_size) *
+                env_outs['rewards'].std().to('cpu'),
+        })
+
+        # Moving minibatches to CPU to not take additional GPU memory
+        for k, v in iter_batch.items():
+            if hasattr(v, 'cpu'):
+                iter_batch[k] = v.cpu()
+
+        return iter_batch
+
+    def _log_generations_to_logger(self, state: State):
+        # Gather all prompts, generations, prompt_ids and rewards from all ranks
+        prompts_and_gens = list(
+            chain(*dist.all_gather_object(self.prompts_and_gens)),
+        )
+        prompt_ids_rewards_and_answers = list(
+            chain(*dist.all_gather_object(self.prompt_ids_rewards_and_answers)),
+        )
+        # Build one row (a list) per sample in column order; the two gathered lists are paired by position
+        columns = [
+            'prompt_id',
+            'reward',
+            'prompt',
+            'generation',
+            'verified_answer',
+        ]
+        save_data = [[prompt_id, reward, prompt, generation, verified_answer]
+                     for (prompt_id, reward,
+                          verified_answer), (prompt, generation) in zip(
+                              prompt_ids_rewards_and_answers,
+                              prompts_and_gens,
+                          )]
+        # Sort the save_data by reward in descending order
+        save_data = sorted(save_data, key=lambda x: x[1], reverse=True)
+
+        if dist.get_global_rank() == 0:
+            if self.wandb_logger is not None:
+                assert wandb.run is not None, 'wandb should have started the run'
+
+                artifact = wandb.Artifact(
+                    'generate_samples_' + str(wandb.run.id),
+                    type='predictions',
+                )
+
+                text_table = wandb.Table(
+                    data=save_data,
+                    columns=columns,
+                )
+
+                artifact.add(text_table, 'predictions')
+                wandb.log_artifact(artifact)
+                wandb.log({'generations': text_table},
+                          step=state.timestamp.batch.value)
+
+            if self.mlflow_logger is not None:
+                self.mlflow_logger.log_table(
+                    columns=columns,
+                    rows=save_data,
+                    name=f'Prompt_generations_{self.iter_num}',
+                )
+
+        self.prompts_and_gens = []  # reset on every rank, not only on rank 0
+        self.prompt_ids_rewards_and_answers = []
+
+    def _update_ift_kl(self):
+        local_kl = torch.stack(self.kl_ift)  # NOTE(review): torch.stack needs a non-empty list - confirm kl_ift is filled before this runs
+
+        global_ift_kl = torch.cat(dist.all_gather_object(local_kl))
+        ift_kl_update = torch.mean(global_ift_kl)
+
+        self.kl_ctl.update(
+            ift_kl_update,
+            self.num_batches_per_update * self.device_train_batch_size *
+            dist.get_world_size(),
+        )
+
+        self.kl_ift = []  # start collecting afresh for the next update
+
+    def _increment_rl_iter(self):
+        self.iter_num += 1
+
+    def _update_inference_model(self, batch: dict[str, torch.Tensor], vllm_engines: list[Any]):
+        start_time = time.time()
+        log.info('Before broadcast to vLLM')
+        broadcast_to_vllm(
+            self.actor_critic,
+            vllm_engines,
+            self.model_update_group,
+            batch,
+            #loss_type=self.actor_critic.loss_type.value,  # type: ignore
+            loss_type=self.actor_critic.loss_type,  # type: ignore
+        )
+        log.info('Finished broadcasting to vLLM')
+        log.info(f'Took: {time.time() - start_time} to broadcast to vllm.')
+        dist.barrier()  # ranks synchronize here once the broadcast call has returned
+
+    def state_dict(self):
+        return {  # the three keys below are the ones load_state_dict reads back
+            'KL_ctl_state_dict': self.kl_ctl.state_dict(),
+            'iter_num': self.iter_num,
+            'train_prompt_loader':
+                self.train_prompt_loader.state_dict(),  # pyright: ignore
+        }
+
+    def load_state_dict(self, state_dict: dict[str, Any]):
+        self.kl_ctl.load_state_dict(state_dict['KL_ctl_state_dict'])
+        self.iter_num = state_dict['iter_num']
+        self.train_prompt_loader_state_dict = state_dict['train_prompt_loader']  # only stored here; after_load applies it to the loader

From 1916fcdd9e7cd7b09a031b91941bf458106aed05 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 11 Jul 2025 00:43:12 +0000
Subject: [PATCH 053/107] separate out roundtrip code

---
 .../online/single_controller_callback.py      | 25 +++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index 07bb9eb9..3020a196 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -448,6 +448,8 @@ def __init__(
                 train_config['python_log_level'].upper(),
             )
 
+        self.batch_rollouts = None
+
 
     def init(self, state: State, logger: Logger):
         self.pad_token_idx = state.model.tokenizer.pad_token_id  # type: ignore
@@ -535,17 +537,20 @@ def after_load(self, state: State, logger: Logger):
                 self.train_prompt_loader_state_dict,
             )
 
+    def round_trip_to_inference_engines(self, device: Any, vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
+        """Round trip to inference engines; the rollouts end up in `self.batch_rollouts`.
+
+        Args:
+            vllm_engines (list[Any]): The vllm engines to round trip to.
+        """
+        batch = device.batch_to_device(self._get_next_iter_prompts())  # device: only its batch_to_device is used
+        self._update_inference_model(batch, vllm_engines, model_update_group)  # model_update_group is passed straight through
+        self.batch_rollouts = self._interact_with_env(batch, vllm_engines)
+
     def iteration_start(self, state: State, logger: Logger):
         del logger  # unused
 
-        # batch = self._get_next_iter_prompts()
-        # batch = state.device.batch_to_device(batch)
-
-        # self._update_inference_model(batch)
-
-        # batch = self._interact_with_env(batch)
-
-        # self._get_reward(batch)
+        self._get_reward(self.batch_rollouts)
 
         # Reset and initialize state train dataloader
         log.warning(
@@ -967,13 +972,13 @@ def _update_ift_kl(self):
     def _increment_rl_iter(self):
         self.iter_num += 1
 
-    def _update_inference_model(self, batch: dict[str, torch.Tensor], vllm_engines: list[Any]):
+    def _update_inference_model(self, batch: dict[str, torch.Tensor], vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
         start_time = time.time()
         log.info('Before broadcast to vLLM')
         broadcast_to_vllm(
             self.actor_critic,
             vllm_engines,
-            self.model_update_group,
+            model_update_group,
             batch,
             #loss_type=self.actor_critic.loss_type.value,  # type: ignore
             loss_type=self.actor_critic.loss_type,  # type: ignore

From d8d48167a4d47c0e553a9fe10750e0a8bafedcca Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 11 Jul 2025 08:05:19 +0000
Subject: [PATCH 054/107] relocate to use test

---
 ...ibuted.py => test_torch_ray_distributed.py | 181 +++++++++++++-----
 1 file changed, 132 insertions(+), 49 deletions(-)
 rename ray_test/test_torch_ray_distributed.py => test_torch_ray_distributed.py (62%)

diff --git a/ray_test/test_torch_ray_distributed.py b/test_torch_ray_distributed.py
similarity index 62%
rename from ray_test/test_torch_ray_distributed.py
rename to test_torch_ray_distributed.py
index e8926f5e..49a1f457 100644
--- a/ray_test/test_torch_ray_distributed.py
+++ b/test_torch_ray_distributed.py
@@ -9,7 +9,28 @@
 from typing import Optional, Tuple
 import argparse
 from datetime import timedelta
-from transformers import AutoModelForCausalLM
+
+from functools import partial
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from composer.utils import dist as composer_dist
+from composer import Trainer
+from composer.optim import DecoupledAdamW
+from llmfoundry.models import ComposerHFCausalLM, ComposerMPTCausalLM
+from torch.utils.data import DataLoader
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+from transformers.models.gpt2 import GPT2LMHeadModel
+
+from compose_rl.algorithms.online import (
+    ComposerHFPolicyLM,
+    ComposerMPTPolicyLM,
+    OnPolicyCallback,
+)
+from compose_rl.algorithms.online.model_methods import OnPolicyEnum
+from compose_rl.algorithms.online.modeling_hf import ComposerHFPolicy
+from compose_rl.data import prompt_dataset_collate_fn
+from tests.common import PromptDataset, VerifiablePromptDataset, world_size
+
 from compose_rl.algorithms.online.generation_utils import init_process_group, create_vllm_engines
 
 from typing import Any
@@ -86,7 +107,65 @@ def __init__(self, rank: int, world_size: int, master_addr: Optional[str] = None
 
         self.model = None
         self.model_update_group = None
-    
+
+    def build_ref_model(self):
+        max_seq_len = 32
+        prompt_len = 10
+
+        model_name = 'gpt2'
+        tiny_gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')
+
+        dataset = PromptDataset(prompt_len=prompt_len)
+        dataloader = DataLoader(
+            dataset,
+            collate_fn=partial(
+                prompt_dataset_collate_fn,
+                tiny_gpt2_tokenizer,
+                max_seq_len,
+            ),
+            sampler=composer_dist.get_sampler(dataset),
+            batch_size=4,
+        )
+
+        # We need to mock this method, since our dataset isn't a StreamingDataset
+        dataloader.state_dict = lambda: {}
+        dataloader.load_state_dict = lambda x: None
+
+        model_config = {
+            'tokenizer': tiny_gpt2_tokenizer,
+            'pretrained_model_name_or_path': model_name,
+            'pretrained': True,
+            'use_flash_attention_2': True,
+            'allow_embedding_resizing': True,
+        }
+        tmp_model = ComposerHFCausalLM(**model_config)
+
+        tmp_optimizer = DecoupledAdamW(tmp_model.parameters(), lr=1e-6)
+
+        tmp_ref_path = str('./ref_checkpoints')
+
+        temp_dataloader = [{
+            'input_ids': torch.ones((2, 15)).to(dtype=torch.int64),
+            'attention_mask': torch.ones((2, 15)),
+            'labels': torch.ones((2, 15)).to(dtype=torch.int64),
+        }]
+
+        temp_trainer = Trainer(
+            model=tmp_model,
+            train_dataloader=temp_dataloader,
+            optimizers=tmp_optimizer,
+            max_duration='1ba',
+            parallelism_config={'fsdp': {}},
+            save_folder=tmp_ref_path,
+            save_weights_only=True,
+            device_train_microbatch_size=2,
+        )
+
+        temp_trainer.fit()
+
+        # After making the reference model, we can proceed with the PPO training
+        tmp_ref_path = os.path.join(tmp_ref_path, 'latest-rank0.pt')
+
     def get_node_ip(self):
         return ray.util.get_node_ip_address().strip('[]')
     
@@ -202,55 +281,59 @@ def run(tp_size: int = 8):
             results = ray.get(reduce_tasks)
             print(f"All-reduce results: {results}")
 
-            vllm_tensor_parallel_size = tp_size
-            num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size
-            print(f'num_vllm_engines: {num_vllm_engines}')
-            vllm_engines = create_vllm_engines(
-                num_engines=num_vllm_engines,
-                tensor_parallel_size=vllm_tensor_parallel_size,
-                enforce_eager=True,
-                pretrain=pretrain_model_name,
-                revision=None,
-                seed=1,
-                enable_prefix_caching=False,
-                max_model_len=2048,
-            )
-
-            new_port = ray.get(master_actor.get_free_port.remote())
-            print(f'new_port: {new_port}')
-            refs = [
-                engine.init_process_group.remote(
-                    master_addr,
-                    new_port,
-                    i * vllm_tensor_parallel_size + 1,
-                    dist.get_world_size() // 2 + 1,
-                    'weight-update',
-                    backend='nccl',
-                ) for i, engine in enumerate(vllm_engines)
-            ]
-            refs.append(master_actor.init_vllm_process_group.remote(
-                backend='nccl',
-                master_addr=master_addr,
-                master_port=new_port,
-                world_size=dist.get_world_size() // 2 + 1,
-                rank=0,
-                group_name='weight-update',
-            ))
-            print(ray.get(refs))
-
-            refs = [actor.init_model.remote(pretrain_model_name) for actor in train_actors]
-            ray.get(refs)
-            print('init model done')
+            build_ref_model_tasks = [actor.build_ref_model.remote() for actor in train_actors]
+            ray.get(build_ref_model_tasks)
+            print('build ref model done')
+
+            # vllm_tensor_parallel_size = tp_size
+            # num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size
+            # print(f'num_vllm_engines: {num_vllm_engines}')
+            # vllm_engines = create_vllm_engines(
+            #     num_engines=num_vllm_engines,
+            #     tensor_parallel_size=vllm_tensor_parallel_size,
+            #     enforce_eager=True,
+            #     pretrain=pretrain_model_name,
+            #     revision=None,
+            #     seed=1,
+            #     enable_prefix_caching=False,
+            #     max_model_len=2048,
+            # )
+
+            # new_port = ray.get(master_actor.get_free_port.remote())
+            # print(f'new_port: {new_port}')
+            # refs = [
+            #     engine.init_process_group.remote(
+            #         master_addr,
+            #         new_port,
+            #         i * vllm_tensor_parallel_size + 1,
+            #         dist.get_world_size() // 2 + 1,
+            #         'weight-update',
+            #         backend='nccl',
+            #     ) for i, engine in enumerate(vllm_engines)
+            # ]
+            # refs.append(master_actor.init_vllm_process_group.remote(
+            #     backend='nccl',
+            #     master_addr=master_addr,
+            #     master_port=new_port,
+            #     world_size=dist.get_world_size() // 2 + 1,
+            #     rank=0,
+            #     group_name='weight-update',
+            # ))
+            # print(ray.get(refs))
+
+            # refs = [actor.init_model.remote(pretrain_model_name) for actor in train_actors]
+            # ray.get(refs)
+            # print('init model done')
 
-            ray.get(master_actor.sync_weights.remote(vllm_engines))
-            print('sync weights done')
+            # ray.get(master_actor.sync_weights.remote(vllm_engines))
+            # print('sync weights done')
 
-            ref = vllm_engines[0].generate.remote(prompts)
-            gen_results = ray.get(ref)
-            for output in gen_results:
-                prompt = output.prompt
-                generated_text = output.outputs[0].text
-                print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+            # ref = vllm_engines[0].generate.remote(prompts)
+            # gen_results = ray.get(ref)
+            # for output in gen_results:
+            #     prompt = output.prompt
+            #     generated_text = output.outputs[0].text
+            #     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()

From 9d9850b66c4d3889735164a2c6b4a77617fd9e13 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 11 Jul 2025 18:45:13 +0000
Subject: [PATCH 055/107] ref built

---
 test_torch_ray_distributed.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/test_torch_ray_distributed.py b/test_torch_ray_distributed.py
index 49a1f457..8c83b4d5 100644
--- a/test_torch_ray_distributed.py
+++ b/test_torch_ray_distributed.py
@@ -109,6 +109,7 @@ def __init__(self, rank: int, world_size: int, master_addr: Optional[str] = None
         self.model_update_group = None
 
     def build_ref_model(self):
+        composer_dist.initialize_dist('gpu')
         max_seq_len = 32
         prompt_len = 10
 
@@ -272,14 +273,14 @@ def run(tp_size: int = 8):
                 actor = DistributedGPUActor.remote(i, num_train_actors, master_addr, master_port)
                 train_actors.append(actor)
             
-            # Initialize process groups for all actors
-            init_tasks = [actor.init_default_process_group.remote() for actor in train_actors]
-            ray.get(init_tasks)
+            # # Initialize process groups for all actors
+            # init_tasks = [actor.init_default_process_group.remote() for actor in train_actors]
+            # ray.get(init_tasks)
             
-            # Perform tensor all_reduce on all actors
-            reduce_tasks = [actor.tensor_all_reduce.remote() for actor in train_actors]
-            results = ray.get(reduce_tasks)
-            print(f"All-reduce results: {results}")
+            # # Perform tensor all_reduce on all actors
+            # reduce_tasks = [actor.tensor_all_reduce.remote() for actor in train_actors]
+            # results = ray.get(reduce_tasks)
+            # print(f"All-reduce results: {results}")
 
             build_ref_model_tasks = [actor.build_ref_model.remote() for actor in train_actors]
             ray.get(build_ref_model_tasks)

From 8352a08e0368992eabc28c06498410567a1ce887 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 11 Jul 2025 21:38:12 +0000
Subject: [PATCH 056/107] build ppo trainer

---
 compose_rl/algorithms/online/__init__.py      |   2 +
 .../online/single_controller_callback.py      |   4 +-
 test_torch_ray_distributed.py                 | 166 ++++++++++++++++--
 3 files changed, 154 insertions(+), 18 deletions(-)

diff --git a/compose_rl/algorithms/online/__init__.py b/compose_rl/algorithms/online/__init__.py
index 84efe4ce..92ab35d3 100644
--- a/compose_rl/algorithms/online/__init__.py
+++ b/compose_rl/algorithms/online/__init__.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from compose_rl.algorithms.online.callback import OnPolicyCallback
+from compose_rl.algorithms.online.single_controller_callback import SingleControllerOnPolicyCallback
 from compose_rl.algorithms.online.kl_controller import (
     AdaptiveKLController,
     BallKLController,
@@ -28,6 +29,7 @@
 
 __all__ = [
     'OnPolicyCallback',
+    'SingleControllerOnPolicyCallback',
     'ComposerMPTPolicyLM',
     'ComposerHFPolicyLM',
     'ComposerHFCriticFreePolicyLM',
diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index 3020a196..6703af41 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -60,7 +60,7 @@
 Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
 Policy = Union[ComposerHFPolicyLM, ComposerMPTPolicyLM]
 
-__all__ = ['OnPolicyCallback', 'env_reward']
+__all__ = ['SingleControllerOnPolicyCallback', 'env_reward']
 
 log = logging.getLogger(__name__)
 
@@ -307,7 +307,7 @@ def env_reward(
     )
 
 
-class OnPolicyCallback(CallbackWithConfig):
+class SingleControllerOnPolicyCallback(CallbackWithConfig):
     """Callback for managing on-policy training in an RLHF loop.
 
     Args:
diff --git a/test_torch_ray_distributed.py b/test_torch_ray_distributed.py
index 8c83b4d5..5e964d92 100644
--- a/test_torch_ray_distributed.py
+++ b/test_torch_ray_distributed.py
@@ -24,12 +24,13 @@
 from compose_rl.algorithms.online import (
     ComposerHFPolicyLM,
     ComposerMPTPolicyLM,
-    OnPolicyCallback,
+    SingleControllerOnPolicyCallback,
 )
 from compose_rl.algorithms.online.model_methods import OnPolicyEnum
 from compose_rl.algorithms.online.modeling_hf import ComposerHFPolicy
 from compose_rl.data import prompt_dataset_collate_fn
 from tests.common import PromptDataset, VerifiablePromptDataset, world_size
+from tests.fixtures.fixtures import assets_tokenizer_helper
 
 from compose_rl.algorithms.online.generation_utils import init_process_group, create_vllm_engines
 
@@ -108,12 +109,16 @@ def __init__(self, rank: int, world_size: int, master_addr: Optional[str] = None
         self.model = None
         self.model_update_group = None
 
-    def build_ref_model(self):
-        composer_dist.initialize_dist('gpu')
+        self.model_name = 'gpt2'
+        self.ref_path = None
+        self._dataloader = None
+        self._tokenizer = None
+        self.ppo_callback = None
+        self.ppo_trainer: Trainer = None
+
+    def build_dataloader(self):
         max_seq_len = 32
         prompt_len = 10
-
-        model_name = 'gpt2'
         tiny_gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')
 
         dataset = PromptDataset(prompt_len=prompt_len)
@@ -127,19 +132,46 @@ def build_ref_model(self):
             sampler=composer_dist.get_sampler(dataset),
             batch_size=4,
         )
-
         # We need to mock this method, since our dataset isn't a StreamingDataset
         dataloader.state_dict = lambda: {}
         dataloader.load_state_dict = lambda x: None
-
-        model_config = {
-            'tokenizer': tiny_gpt2_tokenizer,
-            'pretrained_model_name_or_path': model_name,
+        return dataloader
+
+    @property
+    def dataloader(self):
+        if self._dataloader is None:
+            self._dataloader = self.build_dataloader()
+        return self._dataloader
+
+    def build_tokenizer(self):
+        tokenizer = assets_tokenizer_helper('gpt2')
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        return tokenizer
+
+    @property
+    def tokenizer(self):
+        if self._tokenizer is None:
+            self._tokenizer = self.build_tokenizer()
+        return self._tokenizer
+
+    @property
+    def model_config(self):
+        return {
+            'tokenizer': self.tokenizer,
+            'pretrained_model_name_or_path': self.model_name,
             'pretrained': True,
             'use_flash_attention_2': True,
             'allow_embedding_resizing': True,
         }
-        tmp_model = ComposerHFCausalLM(**model_config)
+
+    @property
+    def fsdp_config(self):
+        return dict()
+
+    def build_ref_model(self):
+        composer_dist.initialize_dist('gpu')
+
+        tmp_model = ComposerHFCausalLM(**self.model_config)
 
         tmp_optimizer = DecoupledAdamW(tmp_model.parameters(), lr=1e-6)
 
@@ -156,7 +188,7 @@ def build_ref_model(self):
             train_dataloader=temp_dataloader,
             optimizers=tmp_optimizer,
             max_duration='1ba',
-            parallelism_config={'fsdp': {}},
+            parallelism_config={'fsdp': self.fsdp_config},
             save_folder=tmp_ref_path,
             save_weights_only=True,
             device_train_microbatch_size=2,
@@ -165,7 +197,105 @@ def build_ref_model(self):
         temp_trainer.fit()
 
         # After making the reference model, we can proceed with the PPO training
-        tmp_ref_path = os.path.join(tmp_ref_path, 'latest-rank0.pt')
+        self.ref_path = os.path.join(tmp_ref_path, 'latest-rank0.pt')
+
+    def build_ppo_trainer(self):
+        composer_dist.initialize_dist('gpu')
+
+        max_seq_len = 32
+        precision = 'amp_bf16'
+
+        model = ComposerHFPolicyLM(**self.model_config)
+
+        optimizer = DecoupledAdamW(model.parameters(), lr=1e-8)
+
+        num_batches_per_update = 2
+
+        # ref_model_config = copy.deepcopy(self.model_config)
+        ref_model_config = {**self.model_config, 'name': 'hf_causal_lm'}
+
+        variables = {
+            'buffer': {
+                'name': 'MinibatchRolloutBuffer',
+                'max_buffer_size': num_batches_per_update,
+            },
+            'max_gen_len': 8,
+            'gamma': 0.99,
+            'lambda_gae': 0.95,
+            'generation_kwargs': {
+                'use_cache': True,
+                'do_sample': False,
+            },
+            'kl_controller': {
+                'init_kl_coef': 0.2,
+                'target': 0.01,
+                'horizon': 12800,
+                'kl_ctl_type': 'adaptive',
+            },
+            'reference_model': {
+                'model_config': ref_model_config,
+                'precision': precision,
+                'load_path': self.ref_path,
+                'non_train_fsdp_config': self.fsdp_config,
+            },
+            'device_generate_batch_size': 2,
+            'epoch_per_iteration': 1,
+            'num_batches_per_update': num_batches_per_update,
+            'rewards': {
+                'output_length': {
+                    'reward_type': 'output_length',
+                    'max_gen_len': 10,
+                },
+            },
+        }
+        train_config = {
+            'model': {**self.model_config, 'kl_estimator': 'k1', 'kl_clip_range': 40.0},
+            'fsdp_config': self.fsdp_config,
+            'seed': 17,
+            'precision': precision,
+            'variables': variables,
+            'max_seq_len': max_seq_len,
+            'global_train_batch_size': 2,
+            'device_train_batch_size': 2,
+            'device_train_microbatch_size': 1,
+        }
+
+        tmp_save_path = str('./checkpoints')
+        self.ppo_callback = SingleControllerOnPolicyCallback(train_config=train_config)
+        self.ppo_trainer = Trainer(
+            model=model,
+            optimizers=optimizer,
+            callbacks=self.ppo_callback,
+            train_dataloader=self.dataloader,
+            precision=precision,
+            parallelism_config={'fsdp': self.fsdp_config},
+            max_duration='3iter',
+            device_train_microbatch_size=1,
+            load_path=self.ref_path,
+            save_folder=tmp_save_path,
+            save_interval='1iter',
+        )
+
+        # trainer.fit(duration='1iter')
+
+        # This is the KL assert that must be true if we are truly loading from the same model.
+        # This is only true on the first iteration
+        # assert torch.allclose(
+        #     trainer.state.loss['kl/ift_kl'], # pyright: ignore
+        #     torch.tensor(0.0),
+        #     atol=5e-5,
+        # )
+
+    def train_1_iter(self):
+        self.ppo_trainer.fit(duration='1iter')
+
+    def sync_weight_and_gen(self, vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
+        self.ppo_callback.round_trip_to_inference_engines(
+            device=self.ppo_trainer.state.device,
+            vllm_engines=vllm_engines,
+            model_update_group=self.model_update_group,
+        )
+
 
     def get_node_ip(self):
         return ray.util.get_node_ip_address().strip('[]')
@@ -282,9 +412,13 @@ def run(tp_size: int = 8):
             # results = ray.get(reduce_tasks)
             # print(f"All-reduce results: {results}")
 
-            build_ref_model_tasks = [actor.build_ref_model.remote() for actor in train_actors]
-            ray.get(build_ref_model_tasks)
-            print('build ref model done')
+            # build_ref_model_tasks = [actor.build_ref_model.remote() for actor in train_actors]
+            # ray.get(build_ref_model_tasks)
+            # print('build ref model done')
+
+            build_ppo_trainer_tasks = [actor.build_ppo_trainer.remote() for actor in train_actors]
+            ray.get(build_ppo_trainer_tasks)
+            print('build ppo trainer done')
 
             # vllm_tensor_parallel_size = tp_size
             # num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size

From 4f093a43df0cc791af55e46c108200ce9794c80c Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 11 Jul 2025 23:17:26 +0000
Subject: [PATCH 057/107] weight update name mismatch; inference and
 prompts/gen exchange and train works

---
 .../online/single_controller_callback.py      |   3 +-
 test_torch_ray_distributed.py                 | 103 ++++++++++--------
 tests/common/datasets.py                      |   2 +
 3 files changed, 59 insertions(+), 49 deletions(-)

diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index 6703af41..b07fa172 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -544,7 +544,7 @@ def round_trip_to_inference_engines(self, device: Any, vllm_engines: list[Any],
             vllm_engines (list[Any]): The vllm engines to round trip to.
         """
         batch = device.batch_to_device(self._get_next_iter_prompts())
-        self._update_inference_model(batch, vllm_engines, model_update_group)
+        # self._update_inference_model(batch, vllm_engines, model_update_group)
         self.batch_rollouts = self._interact_with_env(batch, vllm_engines)
 
     def iteration_start(self, state: State, logger: Logger):
@@ -593,6 +593,7 @@ def _get_next_iter_prompts(self):
         ]
 
         ret_batch = {}
+        assert 'prompt_id' in batches[0], 'prompt_id must be in the batch'
         for key in batches[0].keys():
             curr_values = []
 
diff --git a/test_torch_ray_distributed.py b/test_torch_ray_distributed.py
index 5e964d92..440ba537 100644
--- a/test_torch_ray_distributed.py
+++ b/test_torch_ray_distributed.py
@@ -109,7 +109,7 @@ def __init__(self, rank: int, world_size: int, master_addr: Optional[str] = None
         self.model = None
         self.model_update_group = None
 
-        self.model_name = 'gpt2'
+        self.pretrain_model_name = None
         self.ref_path = None
         self._dataloader = None
         self._tokenizer = None
@@ -119,14 +119,13 @@ def __init__(self, rank: int, world_size: int, master_addr: Optional[str] = None
     def build_dataloader(self):
         max_seq_len = 32
         prompt_len = 10
-        tiny_gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')
 
-        dataset = PromptDataset(prompt_len=prompt_len)
+        dataset = VerifiablePromptDataset(prompt_len=prompt_len)
         dataloader = DataLoader(
             dataset,
             collate_fn=partial(
                 prompt_dataset_collate_fn,
-                tiny_gpt2_tokenizer,
+                self.tokenizer,
                 max_seq_len,
             ),
             sampler=composer_dist.get_sampler(dataset),
@@ -144,7 +143,7 @@ def dataloader(self):
         return self._dataloader
 
     def build_tokenizer(self):
-        tokenizer = assets_tokenizer_helper('gpt2')
+        tokenizer = assets_tokenizer_helper(self.pretrain_model_name)
         tokenizer.add_special_tokens({'pad_token': '[PAD]'})
         return tokenizer
 
@@ -158,7 +157,7 @@ def tokenizer(self):
     def model_config(self):
         return {
             'tokenizer': self.tokenizer,
-            'pretrained_model_name_or_path': self.model_name,
+            'pretrained_model_name_or_path': self.pretrain_model_name,
             'pretrained': True,
             'use_flash_attention_2': True,
             'allow_embedding_resizing': True,
@@ -199,9 +198,9 @@ def build_ref_model(self):
         # After making the reference model, we can proceed with the PPO training
         self.ref_path = os.path.join(tmp_ref_path, 'latest-rank0.pt')
 
-    def build_ppo_trainer(self):
+    def build_ppo_trainer(self, pretrain_model_name: str):
+        self.pretrain_model_name = pretrain_model_name
         composer_dist.initialize_dist('gpu')
-
         max_seq_len = 32
         precision = 'amp_bf16'
 
@@ -272,8 +271,8 @@ def build_ppo_trainer(self):
             max_duration='3iter',
             device_train_microbatch_size=1,
             load_path=self.ref_path,
-            save_folder=tmp_save_path,
-            save_interval='1iter',
+            # save_folder=tmp_save_path,
+            # save_interval='1iter',
         )
 
         # trainer.fit(duration='1iter')
@@ -289,7 +288,7 @@ def build_ppo_trainer(self):
     def train_1_iter(self):
         self.ppo_trainer.fit(duration='1iter')
 
-    def sync_weight_and_gen(self, vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
+    def sync_weight_and_gen(self, vllm_engines: list[Any]):
         self.ppo_callback.round_trip_to_inference_engines(
             device=self.ppo_trainer.state.device,
             vllm_engines=vllm_engines,
@@ -379,7 +378,7 @@ def run(tp_size: int = 8):
         "what is RAY?",
         "what is vLLM?",
     ]
-    pretrain_model_name = 'meta-llama/Llama-3.2-1B-Instruct'
+    pretrain_model_name = 'gpt2'
     with start_ray_server() as address:
         if dist.get_rank() == 0:
             master_addr, _ = address.split(':')
@@ -416,46 +415,54 @@ def run(tp_size: int = 8):
             # ray.get(build_ref_model_tasks)
             # print('build ref model done')
 
-            build_ppo_trainer_tasks = [actor.build_ppo_trainer.remote() for actor in train_actors]
+            build_ppo_trainer_tasks = [actor.build_ppo_trainer.remote(pretrain_model_name) for actor in train_actors]
             ray.get(build_ppo_trainer_tasks)
             print('build ppo trainer done')
 
-            # vllm_tensor_parallel_size = tp_size
-            # num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size
-            # print(f'num_vllm_engines: {num_vllm_engines}')
-            # vllm_engines = create_vllm_engines(
-            #     num_engines=num_vllm_engines,
-            #     tensor_parallel_size=vllm_tensor_parallel_size,
-            #     enforce_eager=True,
-            #     pretrain=pretrain_model_name,
-            #     revision=None,
-            #     seed=1,
-            #     enable_prefix_caching=False,
-            #     max_model_len=2048,
-            # )
-
-            # new_port = ray.get(master_actor.get_free_port.remote())
-            # print(f'new_port: {new_port}')
-            # refs = [
-            #     engine.init_process_group.remote(
-            #         master_addr,
-            #         new_port,
-            #         i * vllm_tensor_parallel_size + 1,
-            #         dist.get_world_size() // 2 + 1,
-            #         'weight-update',
-            #         backend='nccl',
-            #     ) for i, engine in enumerate(vllm_engines)
-            # ]
-            # refs.append(master_actor.init_vllm_process_group.remote(
-            #     backend='nccl',
-            #     master_addr=master_addr,
-            #     master_port=new_port,
-            #     world_size=dist.get_world_size() // 2 + 1,
-            #     rank=0,
-            #     group_name='weight-update',
-            # ))
-            # print(ray.get(refs))
+            vllm_tensor_parallel_size = min(tp_size, dist.get_world_size() - num_train_actors)
+            num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size
+            print(f'num_vllm_engines: {num_vllm_engines}')
+            vllm_engines = create_vllm_engines(
+                num_engines=num_vllm_engines,
+                tensor_parallel_size=vllm_tensor_parallel_size,
+                enforce_eager=True,
+                pretrain=pretrain_model_name,
+                revision=None,
+                seed=1,
+                enable_prefix_caching=False,
+                max_model_len=512,
+            )
+
+            new_port = ray.get(master_actor.get_free_port.remote())
+            print(f'new_port: {new_port}')
+            refs = [
+                engine.init_process_group.remote(
+                    master_addr,
+                    new_port,
+                    i * vllm_tensor_parallel_size + 1,
+                    dist.get_world_size() // 2 + 1,
+                    'weight-update',
+                    backend='nccl',
+                ) for i, engine in enumerate(vllm_engines)
+            ]
+            refs.append(master_actor.init_vllm_process_group.remote(
+                backend='nccl',
+                master_addr=master_addr,
+                master_port=new_port,
+                world_size=dist.get_world_size() // 2 + 1,
+                rank=0,
+                group_name='weight-update',
+            ))
+            # ray.get() the master and vLLM-engine refs together in one call; waiting on either set alone will hang
+            print(ray.get(refs))
+
+            refs = [actor.sync_weight_and_gen.remote(vllm_engines) for actor in train_actors]
+            ray.get(refs)
+            print('sync weight and gen done')
 
+            refs = [actor.train_1_iter.remote() for actor in train_actors]
+            ray.get(refs)
+            print('train 1 iter done')
             # refs = [actor.init_model.remote(pretrain_model_name) for actor in train_actors]
             # ray.get(refs)
             # print('init model done')
diff --git a/tests/common/datasets.py b/tests/common/datasets.py
index 98da559e..0bb525cf 100644
--- a/tests/common/datasets.py
+++ b/tests/common/datasets.py
@@ -71,6 +71,7 @@ def __getitem__(self, index: int):
         return {
             'prompt': torch.ones((self.prompt_len,)).int(),
             'prompt_len': torch.Tensor([self.prompt_len]).to(torch.int64),
+            'prompt_id': torch.Tensor([index]).int(),
         }
 
 
@@ -87,5 +88,6 @@ def __getitem__(self, index: int):
         return {
             'prompt': torch.ones((self.prompt_len,)).int(),
             'prompt_len': torch.Tensor([self.prompt_len]).to(torch.int64),
+            'prompt_id': torch.Tensor([index]).int(),
             'verified_answer': '1',
         }

From 88758c9938b6284b1a1a0c3cb366d825dc34a70d Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Sat, 12 Jul 2025 00:02:04 +0000
Subject: [PATCH 058/107] update ref model build

---
 test_torch_ray_distributed.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/test_torch_ray_distributed.py b/test_torch_ray_distributed.py
index 440ba537..3ca1eaeb 100644
--- a/test_torch_ray_distributed.py
+++ b/test_torch_ray_distributed.py
@@ -143,7 +143,8 @@ def dataloader(self):
         return self._dataloader
 
     def build_tokenizer(self):
-        tokenizer = assets_tokenizer_helper(self.pretrain_model_name)
+        # tokenizer = assets_tokenizer_helper(self.pretrain_model_name)
+        tokenizer = AutoTokenizer.from_pretrained(self.pretrain_model_name)
         tokenizer.add_special_tokens({'pad_token': '[PAD]'})
         return tokenizer
 
@@ -167,7 +168,8 @@ def model_config(self):
     def fsdp_config(self):
         return dict()
 
-    def build_ref_model(self):
+    def build_ref_model(self, pretrain_model_name: str):
+        self.pretrain_model_name = pretrain_model_name
         composer_dist.initialize_dist('gpu')
 
         tmp_model = ComposerHFCausalLM(**self.model_config)
@@ -259,7 +261,7 @@ def build_ppo_trainer(self, pretrain_model_name: str):
             'device_train_microbatch_size': 1,
         }
 
-        tmp_save_path = str('./checkpoints')
+        # tmp_save_path = str('./checkpoints')
         self.ppo_callback = SingleControllerOnPolicyCallback(train_config=train_config)
         self.ppo_trainer = Trainer(
             model=model,
@@ -378,7 +380,8 @@ def run(tp_size: int = 8):
         "what is RAY?",
         "what is vLLM?",
     ]
-    pretrain_model_name = 'gpt2'
+    # pretrain_model_name = 'gpt2'
+    pretrain_model_name = 'meta-llama/Llama-3.2-1B-Instruct'
     with start_ray_server() as address:
         if dist.get_rank() == 0:
             master_addr, _ = address.split(':')
@@ -411,9 +414,11 @@ def run(tp_size: int = 8):
             # results = ray.get(reduce_tasks)
             # print(f"All-reduce results: {results}")
 
-            # build_ref_model_tasks = [actor.build_ref_model.remote() for actor in train_actors]
-            # ray.get(build_ref_model_tasks)
-            # print('build ref model done')
+
+
+            build_ref_model_tasks = [actor.build_ref_model.remote(pretrain_model_name) for actor in train_actors]
+            ray.get(build_ref_model_tasks)
+            print('build ref model done')
 
             build_ppo_trainer_tasks = [actor.build_ppo_trainer.remote(pretrain_model_name) for actor in train_actors]
             ray.get(build_ppo_trainer_tasks)
@@ -463,6 +468,9 @@ def run(tp_size: int = 8):
             refs = [actor.train_1_iter.remote() for actor in train_actors]
             ray.get(refs)
             print('train 1 iter done')
+
+
+
             # refs = [actor.init_model.remote(pretrain_model_name) for actor in train_actors]
             # ray.get(refs)
             # print('init model done')

From 1871b12e57e1dff7db50a8ce1b92b42c605bd60c Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Sat, 12 Jul 2025 00:06:58 +0000
Subject: [PATCH 059/107] trains e2e!

---
 test_torch_ray_distributed.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/test_torch_ray_distributed.py b/test_torch_ray_distributed.py
index 3ca1eaeb..c1502330 100644
--- a/test_torch_ray_distributed.py
+++ b/test_torch_ray_distributed.py
@@ -169,6 +169,12 @@ def fsdp_config(self):
         return dict()
 
     def build_ref_model(self, pretrain_model_name: str):
+        tmp_ref_path = str('./ref_checkpoints')
+        ref_path = os.path.join(tmp_ref_path, 'latest-rank0.pt')
+        if os.path.exists(ref_path):
+            self.ref_path = ref_path
+            return
+
         self.pretrain_model_name = pretrain_model_name
         composer_dist.initialize_dist('gpu')
 
@@ -176,8 +182,6 @@ def build_ref_model(self, pretrain_model_name: str):
 
         tmp_optimizer = DecoupledAdamW(tmp_model.parameters(), lr=1e-6)
 
-        tmp_ref_path = str('./ref_checkpoints')
-
         temp_dataloader = [{
             'input_ids': torch.ones((2, 15)).to(dtype=torch.int64),
             'attention_mask': torch.ones((2, 15)),
@@ -198,7 +202,7 @@ def build_ref_model(self, pretrain_model_name: str):
         temp_trainer.fit()
 
         # After making the reference model, we can proceed with the PPO training
-        self.ref_path = os.path.join(tmp_ref_path, 'latest-rank0.pt')
+        self.ref_path = ref_path
 
     def build_ppo_trainer(self, pretrain_model_name: str):
         self.pretrain_model_name = pretrain_model_name

From d2eb8f403f142a3fa5ca2861f1acbd5a0ffe4dd6 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 15 Jul 2025 18:44:33 +0000
Subject: [PATCH 060/107] clean up imports

---
 test_torch_ray_distributed.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/test_torch_ray_distributed.py b/test_torch_ray_distributed.py
index c1502330..49f0125b 100644
--- a/test_torch_ray_distributed.py
+++ b/test_torch_ray_distributed.py
@@ -16,21 +16,15 @@
 from composer.utils import dist as composer_dist
 from composer import Trainer
 from composer.optim import DecoupledAdamW
-from llmfoundry.models import ComposerHFCausalLM, ComposerMPTCausalLM
+from llmfoundry.models import ComposerHFCausalLM
 from torch.utils.data import DataLoader
-from transformers import PreTrainedModel, PreTrainedTokenizerBase
-from transformers.models.gpt2 import GPT2LMHeadModel
 
 from compose_rl.algorithms.online import (
     ComposerHFPolicyLM,
-    ComposerMPTPolicyLM,
     SingleControllerOnPolicyCallback,
 )
-from compose_rl.algorithms.online.model_methods import OnPolicyEnum
-from compose_rl.algorithms.online.modeling_hf import ComposerHFPolicy
 from compose_rl.data import prompt_dataset_collate_fn
-from tests.common import PromptDataset, VerifiablePromptDataset, world_size
-from tests.fixtures.fixtures import assets_tokenizer_helper
+from tests.common import VerifiablePromptDataset
 
 from compose_rl.algorithms.online.generation_utils import init_process_group, create_vllm_engines
 
@@ -41,6 +35,10 @@ def ray_noset_visible_devices():
     return os.environ.get('RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES', '0') == '1'
 
 
+# TODO: 1. Work out how to enable the tests with Llama 3.2 1B.
+# TODO: 2. Find out why there is an authorization issue with the Composer wrapper.
+
+
 def init_ray():
     # init ray on master node, rank 0
     if dist.get_rank() == 0:

From 567e37bb943fe42d63bd155c5127b80f6008412e Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Thu, 17 Jul 2025 07:25:24 +0000
Subject: [PATCH 061/107] recover assert

---
 test_torch_ray_distributed.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/test_torch_ray_distributed.py b/test_torch_ray_distributed.py
index 49f0125b..0f6ca384 100644
--- a/test_torch_ray_distributed.py
+++ b/test_torch_ray_distributed.py
@@ -176,7 +176,7 @@ def build_ref_model(self, pretrain_model_name: str):
         self.pretrain_model_name = pretrain_model_name
         composer_dist.initialize_dist('gpu')
 
-        tmp_model = ComposerHFCausalLM(**self.model_config)
+        tmp_model = ComposerHFCausalLM(**self.model_config, use_auth_token=True)
 
         tmp_optimizer = DecoupledAdamW(tmp_model.parameters(), lr=1e-6)
 
@@ -208,7 +208,7 @@ def build_ppo_trainer(self, pretrain_model_name: str):
         max_seq_len = 32
         precision = 'amp_bf16'
 
-        model = ComposerHFPolicyLM(**self.model_config)
+        model = ComposerHFPolicyLM(**self.model_config, use_auth_token=True)
 
         optimizer = DecoupledAdamW(model.parameters(), lr=1e-8)
 
@@ -291,6 +291,13 @@ def build_ppo_trainer(self, pretrain_model_name: str):
 
     def train_1_iter(self):
         self.ppo_trainer.fit(duration='1iter')
+        # This is the KL assert that must be true if we are truly loading from the same model.
+        # This is only true on the first iteration
+        assert torch.allclose(
+            self.ppo_trainer.state.loss['kl/ift_kl'], # pyright: ignore
+            torch.tensor(0.0),
+            atol=5e-5,
+        )
 
     def sync_weight_and_gen(self, vllm_engines: list[Any]):
         self.ppo_callback.round_trip_to_inference_engines(
@@ -480,12 +487,12 @@ def run(tp_size: int = 8):
             # ray.get(master_actor.sync_weights.remote(vllm_engines))
             # print('sync weights done')
 
-            # ref = vllm_engines[0].generate.remote(prompts)
-            # gen_results = ray.get(ref)
-            # for output in gen_results:
-            #     prompt = output.prompt
-            #     generated_text = output.outputs[0].text
-            #     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+            ref = vllm_engines[0].generate.remote(prompts)
+            gen_results = ray.get(ref)
+            for output in gen_results:
+                prompt = output.prompt
+                generated_text = output.outputs[0].text
+                print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()

From f06dcfa6f8bbfb44d7a14fa44dc28ee95bcebdca Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 18 Jul 2025 20:52:14 +0000
Subject: [PATCH 062/107] add test files

---
 check_gpu_setup.py                      | 132 +++++++++
 ray_test/RAY_GETTING_STARTED.md         |  38 +--
 ray_test/RAY_GPU_EXAMPLES.md            |   2 +-
 ray_test/ray_distributed_simulation.py  | 355 +++++++++++++-----------
 ray_test/ray_gpu_basic.py               |  36 ++-
 ray_test/ray_gpu_patterns.py            | 105 ++++---
 ray_test/ray_learning_guide.py          | 240 +++++++++-------
 ray_test/ray_scheduling_demo.py         | 158 ++++++-----
 ray_test/ray_single_server_multi_gpu.py | 241 +++++++++-------
 ray_test/test_ray.py                    |  36 ++-
 ray_test/test_ray_chain.py              |  51 ++++
 ray_test/test_ray_distributed.py        |  30 +-
 ray_test/test_ray_init.py               |   8 +-
 ray_test/test_socket.py                 |  45 +++
 ray_test/test_torch_ray_distributed.py  | 342 +++++++++++++++++++++++
 test_async_better.py                    |  38 +++
 test_original_trace.py                  |  41 +++
 vllm_test/test_vllm.py                  |  32 +++
 18 files changed, 1384 insertions(+), 546 deletions(-)
 create mode 100644 check_gpu_setup.py
 create mode 100644 ray_test/test_ray_chain.py
 create mode 100644 ray_test/test_socket.py
 create mode 100644 ray_test/test_torch_ray_distributed.py
 create mode 100644 test_async_better.py
 create mode 100644 test_original_trace.py
 create mode 100644 vllm_test/test_vllm.py

diff --git a/check_gpu_setup.py b/check_gpu_setup.py
new file mode 100644
index 00000000..bdc92090
--- /dev/null
+++ b/check_gpu_setup.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""
+GPU Setup Checker
+
+Run this script first to verify your GPU setup is working correctly with Ray.
+"""
+
+import ray
+import torch
+import subprocess
+import sys
+
+def check_cuda_available():
+    """Print PyTorch/CUDA details and return True when CUDA is usable."""
+    print("=== CUDA Check ===")
+    print(f"PyTorch version: {torch.__version__}")
+    cuda_ok = torch.cuda.is_available()
+    print(f"CUDA available: {cuda_ok}")
+
+    # Guard clause: without CUDA there are no devices to enumerate.
+    if not cuda_ok:
+        print("❌ CUDA not available! Check your PyTorch installation.")
+        return False
+
+    print(f"CUDA version: {torch.version.cuda}")
+    num_gpus = torch.cuda.device_count()
+    print(f"Number of GPUs: {num_gpus}")
+    for idx in range(num_gpus):
+        print(f"  GPU {idx}: {torch.cuda.get_device_name(idx)}")
+    return True
+
+def check_nvidia_smi():
+    """Run an nvidia-smi GPU query, echo its CSV output, return success."""
+    print("\n=== nvidia-smi Check ===")
+    query = ['nvidia-smi', '--query-gpu=index,name,memory.total', '--format=csv']
+    try:
+        completed = subprocess.run(query, capture_output=True, text=True, check=True)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        print("❌ nvidia-smi not found or failed")
+        return False
+    print("nvidia-smi output:")
+    print(completed.stdout)
+    return True
+
+def check_ray_gpu_detection():
+    """Start Ray locally and report how many GPUs it detects.
+
+    Returns True only when Ray reports at least 2 GPUs; any error gives False.
+    """
+    print("\n=== Ray GPU Detection ===")
+    try:
+        ray.init()
+        resources = ray.cluster_resources()
+        print(f"Ray cluster resources: {resources}")
+        gpu_count = resources.get("GPU", 0)
+        if gpu_count >= 2:
+            print(f"✅ Ray detected {gpu_count} GPUs")
+        elif gpu_count == 1:
+            print(f"⚠️  Ray detected only {gpu_count} GPU (expected 2)")
+        else:
+            print("❌ Ray detected no GPUs")
+        return gpu_count >= 2
+    except Exception as e:
+        print(f"❌ Ray initialization failed: {e}")
+        return False
+    finally:
+        # Runs on every path so a failure after init cannot leave Ray running.
+        ray.shutdown()
+
+def run_simple_gpu_test():
+    """Multiply a small random matrix on every visible GPU.
+
+    Returns True when all devices succeed; False when CUDA is unavailable or
+    any tensor operation raises.
+    """
+    print("\n=== Simple GPU Test ===")
+    if not torch.cuda.is_available():
+        print("❌ Skipping GPU test - CUDA not available")
+        return False
+    try:
+        # Exercise allocation and a matmul on each device in turn.
+        for gpu_id in range(torch.cuda.device_count()):
+            device = torch.device(f"cuda:{gpu_id}")
+            mat = torch.randn(100, 100, device=device)
+            product = torch.mm(mat, mat.T)
+            print(f"✅ GPU {gpu_id} test passed - tensor shape: {product.shape} on {device}")
+    except Exception as e:
+        print(f"❌ GPU test failed: {e}")
+        return False
+    return True
+
+def main():
+    """Run every setup check and print a pass/fail summary.
+
+    Returns:
+        bool: True when all checks passed, otherwise False.
+    """
+    print("Ray + GPU Setup Checker")
+    print("=" * 40)
+    checks = [
+        ("CUDA/PyTorch", check_cuda_available),
+        ("nvidia-smi", check_nvidia_smi),
+        ("Ray GPU Detection", check_ray_gpu_detection),
+        ("Simple GPU Test", run_simple_gpu_test),
+    ]
+
+    results = []
+    for name, check_func in checks:
+        # A check that raises is recorded as a failure; later checks still run.
+        try:
+            outcome = check_func()
+        except Exception as e:
+            print(f"❌ {name} check failed with error: {e}")
+            outcome = False
+        results.append((name, outcome))
+
+    print("\n" + "=" * 40)
+    print("SUMMARY:")
+    for name, passed in results:
+        print(f"  {name}: {'✅ PASS' if passed else '❌ FAIL'}")
+    all_passed = all(passed for _, passed in results)
+
+    if all_passed:
+        print("\n🎉 All checks passed! You're ready to use Ray with GPUs.")
+        print("Try running: python ray_gpu_basic.py")
+    else:
+        print("\n⚠️  Some checks failed. Please fix the issues above before proceeding.")
+    return all_passed
+
+if __name__ == "__main__":
+    # Exit status mirrors the overall result: 0 when main() is truthy, else 1.
+    sys.exit(0 if main() else 1)
\ No newline at end of file
diff --git a/ray_test/RAY_GETTING_STARTED.md b/ray_test/RAY_GETTING_STARTED.md
index 274aab46..17b4085d 100644
--- a/ray_test/RAY_GETTING_STARTED.md
+++ b/ray_test/RAY_GETTING_STARTED.md
@@ -9,10 +9,10 @@ Welcome to Ray GPU management! This guide provides everything you need to learn
 ```
 Step 1: Setup & Verification
     ↓
-Step 2: Interactive Learning (Basics)  
+Step 2: Interactive Learning (Basics)
     ↓
 Step 3: Single Server Multi-GPU Patterns
-    ↓  
+    ↓
 Step 4: Distributed Simulation
     ↓
 Step 5: Real-World Applications
@@ -31,7 +31,7 @@ Step 5: Real-World Applications
 # 1. Verify your setup
 python check_gpu_setup.py
 
-# 2. Learn interactively  
+# 2. Learn interactively
 python ray_learning_guide.py
 
 # 3. Try advanced patterns
@@ -49,7 +49,7 @@ python check_gpu_setup.py
 ```
 
 This checks:
-- ✅ CUDA availability in PyTorch  
+- ✅ CUDA availability in PyTorch
 - ✅ nvidia-smi functionality
 - ✅ Ray GPU detection
 - ✅ Basic GPU operations
@@ -67,7 +67,7 @@ python ray_learning_guide.py
 **What you'll learn:**
 - Ray basic concepts (remote functions, actors)
 - GPU resource allocation (full vs fractional)
-- Tasks vs Actors differences  
+- Tasks vs Actors differences
 - Resource monitoring
 
 **Duration:** 10-15 minutes (interactive)
@@ -88,7 +88,7 @@ python ray_single_server_multi_gpu.py
 
 **Duration:** 5-10 minutes (automated demos)
 
-### Step 4: Distributed Simulation  
+### Step 4: Distributed Simulation
 
 Simulate multi-server setup on localhost:
 
@@ -114,7 +114,7 @@ While running examples, monitor GPU usage:
 # Terminal 1: Run your Ray script
 python ray_learning_guide.py
 
-# Terminal 2: Monitor GPUs  
+# Terminal 2: Monitor GPUs
 watch -n 1 nvidia-smi
 ```
 
@@ -155,12 +155,12 @@ def light_gpu_task():
 def process_data(data):
     return result
 
-# ACTOR: Stateful class  
+# ACTOR: Stateful class
 @ray.remote(num_gpus=1)
 class DataProcessor:
     def __init__(self):
         self.model = load_model()
-    
+
     def process(self, data):
         return self.model(data)
 ```
@@ -191,7 +191,7 @@ def train_model(config):
     pass
 
 # Train multiple models in parallel
-configs = [config1, config2]  
+configs = [config1, config2]
 futures = [train_model.remote(c) for c in configs]
 results = ray.get(futures)
 ```
@@ -202,7 +202,7 @@ results = ray.get(futures)
 cpu_tasks = [preprocess.remote(data) for data in dataset]
 processed_data = ray.get(cpu_tasks)
 
-gpu_tasks = [train.remote(data) for data in processed_data]  
+gpu_tasks = [train.remote(data) for data in processed_data]
 models = ray.get(gpu_tasks)
 ```
 
@@ -212,7 +212,7 @@ models = ray.get(gpu_tasks)
 def preprocess(data):
     return cleaned_data
 
-@ray.remote(num_gpus=0.5)  
+@ray.remote(num_gpus=0.5)
 def inference(data):
     return predictions
 
@@ -269,7 +269,7 @@ After completing this guide, explore:
 
 ### Advanced Ray Features
 - **Ray Tune:** Hyperparameter optimization
-- **Ray Train:** Distributed training  
+- **Ray Train:** Distributed training
 - **Ray Serve:** Model serving
 - **Ray Data:** Large-scale data processing
 
@@ -280,7 +280,7 @@ Once comfortable with localhost simulation:
 # Server 1 (head node)
 ray start --head --port=10001 --num-gpus=2
 
-# Server 2 (worker node) 
+# Server 2 (worker node)
 ray start --address=192.168.1.100:10001 --num-gpus=1
 ```
 
@@ -293,10 +293,10 @@ ray start --address=192.168.1.100:10001 --num-gpus=1
 ## 📁 Files in This Learning Package
 
 | File | Purpose | When to Use |
-|------|---------|-------------|  
+|------|---------|-------------|
 | `check_gpu_setup.py` | Verify system setup | **Start here** - before anything else |
 | `ray_learning_guide.py` | Interactive beginner tutorial | **Step 2** - core concepts |
-| `ray_single_server_multi_gpu.py` | Advanced single-server patterns | **Step 3** - practical patterns |  
+| `ray_single_server_multi_gpu.py` | Advanced single-server patterns | **Step 3** - practical patterns |
 | `ray_distributed_simulation.py` | Localhost distributed simulation | **Step 4** - distributed concepts |
 | `ray_gpu_basic.py` | Simple working example | Reference/quick test |
 | `RAY_GPU_EXAMPLES.md` | Original documentation | Additional reference |
@@ -305,7 +305,7 @@ ray start --address=192.168.1.100:10001 --num-gpus=1
 
 You'll know you've mastered Ray GPU management when you can:
 
-✅ Set up Ray clusters (single and distributed)  
+✅ Set up Ray clusters (single and distributed)
 ✅ Choose between tasks and actors appropriately
 ✅ Allocate GPU resources efficiently (full vs fractional)
 ✅ Monitor and debug resource usage
@@ -315,10 +315,10 @@ You'll know you've mastered Ray GPU management when you can:
 ## 🆘 Getting Help
 
 - 📖 [Official Ray Documentation](https://docs.ray.io/)
-- 💬 [Ray Discourse Forum](https://discuss.ray.io/)  
+- 💬 [Ray Discourse Forum](https://discuss.ray.io/)
 - 🐛 [Ray GitHub Issues](https://github.com/ray-project/ray/issues)
 - 📺 [Ray YouTube Tutorials](https://www.youtube.com/c/RayProjectIO)
 
 ---
 
-**Happy learning!** 🚀 Start with `python ray_learning_guide.py` and work your way through the examples. 
\ No newline at end of file
+**Happy learning!** 🚀 Start with `python ray_learning_guide.py` and work your way through the examples.
diff --git a/ray_test/RAY_GPU_EXAMPLES.md b/ray_test/RAY_GPU_EXAMPLES.md
index 4a7f5969..02845bb5 100644
--- a/ray_test/RAY_GPU_EXAMPLES.md
+++ b/ray_test/RAY_GPU_EXAMPLES.md
@@ -199,4 +199,4 @@ ray dashboard
 ray stop
 ```
 
-Happy learning with Ray! 🚀 
\ No newline at end of file
+Happy learning with Ray! 🚀
diff --git a/ray_test/ray_distributed_simulation.py b/ray_test/ray_distributed_simulation.py
index 79986266..baf9f343 100644
--- a/ray_test/ray_distributed_simulation.py
+++ b/ray_test/ray_distributed_simulation.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python3
+
+# Copyright 2024 MosaicML ComposeRL authors
+# SPDX-License-Identifier: Apache-2.0
+
 """
 Ray Distributed Setup Simulation
 
@@ -6,208 +10,224 @@
 We'll create multiple Ray nodes on the same machine to simulate a multi-server setup.
 """
 
-import ray
-import torch
-import time
+import os
 import subprocess
-import signal
 import sys
-import os
-import threading
-from typing import Dict, Any, List
+import time
+from typing import Any
+
 import psutil
+import ray
+import torch
 
 # Configuration
 HEAD_PORT = 10001
 WORKER_PORT_START = 10002
-REDIS_PASSWORD = "ray_demo_password"
+REDIS_PASSWORD = 'ray_demo_password'
+
 
 class RayClusterManager:
     """Manages a simulated distributed Ray cluster on localhost."""
-    
+
     def __init__(self):
         self.head_process = None
         self.worker_processes = []
         self.head_address = None
-        
+
     def start_head_node(self, num_gpus: int = 2, num_cpus: int = 8) -> str:
         """Start the head node."""
-        print("🚀 Starting Ray head node...")
-        
+        print('🚀 Starting Ray head node...')
+
         # Kill any existing Ray processes
         self._cleanup_existing_ray()
-        
+
         head_cmd = [
-            "ray", "start", "--head",
+            'ray',
+            'start',
+            '--head',
             f"--port={HEAD_PORT}",
             f"--num-gpus={num_gpus}",
             f"--num-cpus={num_cpus}",
             f"--redis-password={REDIS_PASSWORD}",
-            "--include-dashboard=true",
-            "--dashboard-port=8265"
+            '--include-dashboard=true',
+            '--dashboard-port=8265',
         ]
-        
+
         print(f"Command: {' '.join(head_cmd)}")
-        
+
         # Start head node
         self.head_process = subprocess.Popen(
             head_cmd,
             stdout=sys.stdout,
             stderr=sys.stderr,
-            text=True
+            text=True,
         )
-        
+
         # Wait a bit for head to start
         time.sleep(3)
-        
+
         self.head_address = f"ray://127.0.0.1:{HEAD_PORT}"
         print(f"✅ Head node started at {self.head_address}")
-        
+
         return self.head_address
-    
-    def add_worker_node(self, node_id: int, num_gpus: int = 0, num_cpus: int = 4) -> bool:
+
+    def add_worker_node(
+        self, node_id: int, num_gpus: int = 0, num_cpus: int = 4
+    ) -> bool:
         """Add a worker node to the cluster."""
         print(f"🔧 Adding worker node {node_id}...")
-        
+
         worker_cmd = [
-            "ray", "start",
+            'ray',
+            'start',
             f"--address={self.head_address}",
             f"--num-gpus={num_gpus}",
             f"--num-cpus={num_cpus}",
-            f"--redis-password={REDIS_PASSWORD}"
+            f"--redis-password={REDIS_PASSWORD}",
         ]
-        
+
         print(f"Command: {' '.join(worker_cmd)}")
-        
+
         worker_process = subprocess.Popen(
             worker_cmd,
             stdout=sys.stdout,
             stderr=sys.stderr,
-            text=True
+            text=True,
         )
-        
+
         self.worker_processes.append(worker_process)
-        
+
         # Wait for worker to connect
         time.sleep(2)
-        
+
         print(f"✅ Worker node {node_id} added")
         return True
-    
+
     def _cleanup_existing_ray(self):
         """Clean up any existing Ray processes."""
         try:
-            subprocess.run(["ray", "stop", "--force"], 
-                         capture_output=True, timeout=10)
+            subprocess.run(['ray', 'stop', '--force'],
+                           capture_output=True,
+                           timeout=10)
             time.sleep(1)
         except:
             pass
-    
+
     def shutdown(self):
         """Shutdown the entire cluster."""
-        print("🛑 Shutting down Ray cluster...")
-        
+        print('🛑 Shutting down Ray cluster...')
+
         # Stop all processes
         try:
-            subprocess.run(["ray", "stop", "--force"], 
-                         capture_output=True, timeout=10)
+            subprocess.run(['ray', 'stop', '--force'],
+                           capture_output=True,
+                           timeout=10)
         except:
             pass
-        
+
         # Kill processes if still running
         if self.head_process:
             self.head_process.terminate()
-        
+
         for worker in self.worker_processes:
             worker.terminate()
-        
-        print("✅ Cluster shutdown complete")
+
+        print('✅ Cluster shutdown complete')
+
 
 @ray.remote(num_gpus=1)
 class DistributedGPUWorker:
     """A distributed GPU worker that reports its location."""
-    
+
     def __init__(self, worker_id: str):
         self.worker_id = worker_id
         self.node_id = ray.get_runtime_context().get_node_id()
         self.gpu_ids = ray.get_gpu_ids()
         self.hostname = os.uname().nodename
-        
-    def get_worker_info(self) -> Dict[str, Any]:
+
+    def get_worker_info(self) -> dict[str, Any]:
         """Get information about this worker."""
         return {
-            "worker_id": self.worker_id,
-            "node_id": self.node_id,
-            "hostname": self.hostname,
-            "gpu_ids": self.gpu_ids,
-            "cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES", "Not set")
+            'worker_id':
+                self.worker_id,
+            'node_id':
+                self.node_id,
+            'hostname':
+                self.hostname,
+            'gpu_ids':
+                self.gpu_ids,
+            'cuda_visible_devices':
+                os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set'),
         }
-    
-    def distributed_computation(self, matrix_size: int = 1000) -> Dict[str, Any]:
+
+    def distributed_computation(self,
+                                matrix_size: int = 1000) -> dict[str, Any]:
         """Perform computation and return node information."""
         start_time = time.time()
-        
+
         # GPU computation
-        device = torch.device("cuda")
+        device = torch.device('cuda')
         A = torch.randn(matrix_size, matrix_size, device=device)
         B = torch.randn(matrix_size, matrix_size, device=device)
         C = torch.mm(A, B)
         result = torch.trace(C).item()
-        
+
         execution_time = time.time() - start_time
-        
+
         return {
-            "worker_id": self.worker_id,
-            "node_id": self.node_id,
-            "hostname": self.hostname,
-            "gpu_ids": self.gpu_ids,
-            "result": result,
-            "execution_time": execution_time,
-            "matrix_size": matrix_size
+            'worker_id': self.worker_id,
+            'node_id': self.node_id,
+            'hostname': self.hostname,
+            'gpu_ids': self.gpu_ids,
+            'result': result,
+            'execution_time': execution_time,
+            'matrix_size': matrix_size,
         }
 
+
 @ray.remote(num_cpus=1)
-def distributed_cpu_task(task_id: int) -> Dict[str, Any]:
+def distributed_cpu_task(task_id: int) -> dict[str, Any]:
     """A CPU task that reports which node it's running on."""
     import numpy as np
-    
+
     start_time = time.time()
     node_id = ray.get_runtime_context().get_node_id()
     hostname = os.uname().nodename
-    
+
     # CPU computation
-    result = np.sum(np.random.randn(500, 500) ** 2)
-    
+    result = np.sum(np.random.randn(500, 500)**2)
+
     execution_time = time.time() - start_time
-    
+
     return {
-        "task_id": task_id,
-        "node_id": node_id,
-        "hostname": hostname,
-        "result": result,
-        "execution_time": execution_time,
-        "resource_type": "CPU"
+        'task_id': task_id,
+        'node_id': node_id,
+        'hostname': hostname,
+        'result': result,
+        'execution_time': execution_time,
+        'resource_type': 'CPU',
     }
 
+
 def demonstrate_cluster_info():
     """Show cluster information and resource distribution."""
-    print("\n📊 CLUSTER INFORMATION")
-    print("=" * 50)
-    
+    print('\n📊 CLUSTER INFORMATION')
+    print('=' * 50)
+
     # Get cluster resources
     cluster_resources = ray.cluster_resources()
     available_resources = ray.available_resources()
-    
-    print("Total Cluster Resources:")
+
+    print('Total Cluster Resources:')
     for resource, amount in cluster_resources.items():
         print(f"  {resource}: {amount}")
-    
-    print("\nAvailable Resources:")
+
+    print('\nAvailable Resources:')
     for resource, amount in available_resources.items():
         print(f"  {resource}: {amount}")
-    
+
     # Get node information
-    print("\nNodes in Cluster:")
+    print('\nNodes in Cluster:')
     nodes = ray.nodes()
     for i, node in enumerate(nodes):
         print(f"  Node {i+1}:")
@@ -215,75 +235,85 @@ def demonstrate_cluster_info():
         print(f"    Alive: {node['Alive']}")
         print(f"    Resources: {node['Resources']}")
 
+
 def demonstrate_distributed_gpu_work():
     """Demonstrate distributed GPU work across simulated nodes."""
-    print("\n🖥️  DEMO: Distributed GPU Work")
-    print("-" * 50)
-    
+    print('\n🖥️  DEMO: Distributed GPU Work')
+    print('-' * 50)
+
     # Create GPU workers
     workers = [DistributedGPUWorker.remote(f"gpu_worker_{i}") for i in range(2)]
-    
+
     # Get worker information
-    print("Created GPU workers:")
-    worker_info_futures = [worker.get_worker_info.remote() for worker in workers]
+    print('Created GPU workers:')
+    worker_info_futures = [
+        worker.get_worker_info.remote() for worker in workers
+    ]
     worker_infos = ray.get(worker_info_futures)
-    
+
     for info in worker_infos:
-        print(f"  {info['worker_id']}: Node {info['node_id'][:8]}, GPU {info['gpu_ids']}")
-    
+        print(
+            f"  {info['worker_id']}: Node {info['node_id'][:8]}, GPU {info['gpu_ids']}"
+        )
+
     # Submit distributed computation
-    print("\nSubmitting distributed GPU computations...")
+    print('\nSubmitting distributed GPU computations...')
     computation_futures = [
-        worker.distributed_computation.remote(matrix_size=1200) 
+        worker.distributed_computation.remote(matrix_size=1200)
         for worker in workers
     ]
-    
+
     results = ray.get(computation_futures)
-    
-    print("Results:")
+
+    print('Results:')
     for result in results:
-        print(f"  {result['worker_id']}: "
-              f"Node {result['node_id'][:8]}, "
-              f"GPU {result['gpu_ids']}, "
-              f"Result: {result['result']:.2f}, "
-              f"Time: {result['execution_time']:.2f}s")
+        print(
+            f"  {result['worker_id']}: "
+            f"Node {result['node_id'][:8]}, "
+            f"GPU {result['gpu_ids']}, "
+            f"Result: {result['result']:.2f}, "
+            f"Time: {result['execution_time']:.2f}s"
+        )
+
 
 def demonstrate_mixed_distributed_work():
     """Demonstrate mixed CPU/GPU work across nodes."""
-    print("\n🔄 DEMO: Mixed Distributed Workload")
-    print("-" * 50)
-    
+    print('\n🔄 DEMO: Mixed Distributed Workload')
+    print('-' * 50)
+
     # Submit a mix of CPU and GPU tasks
     cpu_tasks = [distributed_cpu_task.remote(i) for i in range(4)]
-    
+
     # Create lightweight GPU tasks
     @ray.remote(num_gpus=0.5)
     def light_gpu_task(task_id: int):
         node_id = ray.get_runtime_context().get_node_id()
         gpu_ids = ray.get_gpu_ids()
-        
-        device = torch.device("cuda")
+
+        device = torch.device('cuda')
         x = torch.randn(500, 500, device=device)
         result = torch.sum(x * x).item()
-        
+
         return {
-            "task_id": task_id, 
-            "node_id": node_id,
-            "gpu_ids": gpu_ids,
-            "result": result
+            'task_id': task_id,
+            'node_id': node_id,
+            'gpu_ids': gpu_ids,
+            'result': result,
         }
-    
-    gpu_tasks = [light_gpu_task.remote(i+10) for i in range(3)]
-    
+
+    gpu_tasks = [light_gpu_task.remote(i + 10) for i in range(3)]
+
     all_tasks = cpu_tasks + gpu_tasks
-    print(f"Submitted {len(cpu_tasks)} CPU tasks and {len(gpu_tasks)} GPU tasks")
-    
+    print(
+        f"Submitted {len(cpu_tasks)} CPU tasks and {len(gpu_tasks)} GPU tasks"
+    )
+
     start_time = time.time()
     results = ray.get(all_tasks)
     total_time = time.time() - start_time
-    
+
     print(f"All tasks completed in {total_time:.2f}s")
-    
+
     # Group results by node
     node_results = {}
     for result in results:
@@ -291,19 +321,20 @@ def light_gpu_task(task_id: int):
         if node_id not in node_results:
             node_results[node_id] = []
         node_results[node_id].append(result)
-    
-    print("\nResults by Node:")
+
+    print('\nResults by Node:')
     for node_id, node_tasks in node_results.items():
         print(f"  Node {node_id}: {len(node_tasks)} tasks")
 
+
 def simulate_two_server_setup():
     """Simulate a two-server setup using localhost."""
-    print("\n🌐 SIMULATING TWO-SERVER SETUP")
-    print("=" * 60)
-    print("This simulates Server 1 (head + GPU) and Server 2 (worker + CPU)")
-    
+    print('\n🌐 SIMULATING TWO-SERVER SETUP')
+    print('=' * 60)
+    print('This simulates Server 1 (head + GPU) and Server 2 (worker + CPU)')
+
     cluster_manager = RayClusterManager()
-    
+
     try:
         # Start head node (simulates Server 1 with GPUs)
         head_address = cluster_manager.start_head_node(num_gpus=2, num_cpus=4)
@@ -313,20 +344,20 @@ def simulate_two_server_setup():
         ray.init(address=head_address, _redis_password=REDIS_PASSWORD)
 
         demonstrate_cluster_info()
-        
+
         # Add worker node (simulates Server 2 with only CPUs)
         cluster_manager.add_worker_node(node_id=1, num_gpus=0, num_cpus=6)
-        
+
         # Demonstrate distributed functionality
         demonstrate_cluster_info()
         demonstrate_distributed_gpu_work()
         demonstrate_mixed_distributed_work()
-        
-        print("\n✨ Distributed simulation completed successfully!")
-        
+
+        print('\n✨ Distributed simulation completed successfully!')
+
     except Exception as e:
         print(f"❌ Error in distributed simulation: {e}")
-        
+
     finally:
         try:
             ray.shutdown()
@@ -334,36 +365,40 @@ def simulate_two_server_setup():
             pass
         cluster_manager.shutdown()
 
+
 def main():
     """Main function to demonstrate distributed Ray setup."""
-    print("🎯 Ray Distributed Setup Simulation")
-    print("=" * 60)
-    print("This example simulates a distributed Ray cluster on a single machine")
-    print("to help you understand distributed Ray concepts.")
-    
+    print('🎯 Ray Distributed Setup Simulation')
+    print('=' * 60)
+    print(
+        'This example simulates a distributed Ray cluster on a single machine'
+    )
+    print('to help you understand distributed Ray concepts.')
+
     # Check if Ray is already running
     try:
         ray.init(address='auto')
-        print("⚠️  Ray is already running. Shutting down first...")
+        print('⚠️  Ray is already running. Shutting down first...')
         ray.shutdown()
         time.sleep(2)
     except:
         pass
-    
+
     simulate_two_server_setup()
-    
-    print("\n📚 What you learned:")
-    print("1. How to start Ray head and worker nodes")
-    print("2. How to connect to a distributed Ray cluster") 
-    print("3. How tasks are distributed across nodes")
-    print("4. How to monitor cluster resources and node distribution")
-    print("5. How GPU and CPU resources are managed in a distributed setup")
-    
-    print("\n🚀 Next steps:")
-    print("- Try this on actual multiple servers")
-    print("- Experiment with different resource configurations")
-    print("- Use Ray Tune for distributed hyperparameter tuning")
-    print("- Explore Ray Train for distributed training")
-
-if __name__ == "__main__":
-    main() 
\ No newline at end of file
+
+    print('\n📚 What you learned:')
+    print('1. How to start Ray head and worker nodes')
+    print('2. How to connect to a distributed Ray cluster')
+    print('3. How tasks are distributed across nodes')
+    print('4. How to monitor cluster resources and node distribution')
+    print('5. How GPU and CPU resources are managed in a distributed setup')
+
+    print('\n🚀 Next steps:')
+    print('- Try this on actual multiple servers')
+    print('- Experiment with different resource configurations')
+    print('- Use Ray Tune for distributed hyperparameter tuning')
+    print('- Explore Ray Train for distributed training')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/ray_test/ray_gpu_basic.py b/ray_test/ray_gpu_basic.py
index 9a3d625f..8c0aa10e 100644
--- a/ray_test/ray_gpu_basic.py
+++ b/ray_test/ray_gpu_basic.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python3
+
+# Copyright 2024 MosaicML ComposeRL authors
+# SPDX-License-Identifier: Apache-2.0
+
 """
 Ray GPU Management - Basic Example
 
@@ -6,46 +10,48 @@
 Perfect for someone new to Ray who wants to understand the core concepts.
 """
 
+import os
+import time
+
 import ray
 import torch
-import time
-import os
 
 
 @ray.remote(num_gpus=1)
 def simple_gpu_task(task_id: int):
     """A minimal GPU task that just creates a tensor and does basic operations."""
-    
+
     # Ray automatically manages which GPU this task gets
     gpu_ids = ray.get_gpu_ids()
     print(f"Task {task_id}: Using GPU {gpu_ids[0]}")
-    
+
     # Create a tensor on the GPU
-    device = torch.device("cuda")
+    device = torch.device('cuda')
     x = torch.randn(1000, 1000, device=device)
-    
+
     # Do some computation
     for i in range(3):
         x = x * 2
         time.sleep(0.5)  # Simulate work
         print(f"  Step {i+1}: tensor shape {x.shape}")
-    
+
     return f"Task {task_id} completed on GPU {gpu_ids[0]}"
 
-if __name__ == "__main__":
+
+if __name__ == '__main__':
     # print current pic
     print(f"Current process ID: {os.getpid()}")
 
     # Initialize Ray
     ray.init()
-    
-    print("Available resources:", ray.cluster_resources())
-    
+
+    print('Available resources:', ray.cluster_resources())
+
     # Launch 2 tasks (one per GPU)
     tasks = [simple_gpu_task.remote(i) for i in range(2)]
-    
+
     # Wait for results
     results = ray.get(tasks)
-    
-    print("Results:", results)
-    ray.shutdown() 
\ No newline at end of file
+
+    print('Results:', results)
+    ray.shutdown()
diff --git a/ray_test/ray_gpu_patterns.py b/ray_test/ray_gpu_patterns.py
index 071bf165..c3119378 100644
--- a/ray_test/ray_gpu_patterns.py
+++ b/ray_test/ray_gpu_patterns.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python3
+
+# Copyright 2024 MosaicML ComposeRL authors
+# SPDX-License-Identifier: Apache-2.0
+
 """
 Ray GPU Management - Advanced Patterns
 
@@ -9,123 +13,132 @@
 4. Error handling
 """
 
-import ray
-import torch
 import time
+
 import psutil
+import ray
+import torch
+
 
 # Pattern 1: Fractional GPU usage (0.5 GPU per task)
 @ray.remote(num_gpus=0.5)
 def light_gpu_task(task_id: int):
     """Task that only needs half a GPU - allows 4 tasks on 2 GPUs."""
     gpu_ids = ray.get_gpu_ids()
-    device = torch.device("cuda")
-    
+    device = torch.device('cuda')
+
     print(f"Light task {task_id}: Using GPU fraction on {gpu_ids}")
-    
+
     # Lighter computation
     x = torch.randn(500, 500, device=device)
     x = torch.mm(x, x.T)
     time.sleep(1)
-    
+
     return f"Light task {task_id} done"
 
+
 # Pattern 2: Full GPU usage
 @ray.remote(num_gpus=1)
 def heavy_gpu_task(task_id: int):
     """Task that needs a full GPU."""
     gpu_ids = ray.get_gpu_ids()
-    device = torch.device("cuda")
-    
+    device = torch.device('cuda')
+
     print(f"Heavy task {task_id}: Using full GPU {gpu_ids[0]}")
-    
+
     # Heavier computation
     x = torch.randn(2000, 2000, device=device)
     for _ in range(5):
         x = torch.mm(x, x.T)
     time.sleep(2)
-    
+
     return f"Heavy task {task_id} done on GPU {gpu_ids[0]}"
 
+
 # Pattern 3: CPU task for comparison
 @ray.remote
 def cpu_task(task_id: int):
     """Task that runs on CPU only."""
     print(f"CPU task {task_id}: Running on CPU")
-    
+
     # CPU computation
     x = torch.randn(1000, 1000)
     x = torch.mm(x, x.T)
     time.sleep(1)
-    
+
     return f"CPU task {task_id} done"
 
+
 # Pattern 4: Resource monitoring task
 @ray.remote
 def monitor_resources():
     """Monitor system resources while tasks are running."""
     resources = ray.cluster_resources()
     available = ray.available_resources()
-    
+
     return {
-        "total_gpus": resources.get("GPU", 0),
-        "available_gpus": available.get("GPU", 0),
-        "total_cpus": resources.get("CPU", 0),
-        "available_cpus": available.get("CPU", 0),
-        "memory_usage": psutil.virtual_memory().percent
+        'total_gpus': resources.get('GPU', 0),
+        'available_gpus': available.get('GPU', 0),
+        'total_cpus': resources.get('CPU', 0),
+        'available_cpus': available.get('CPU', 0),
+        'memory_usage': psutil.virtual_memory().percent,
     }
 
+
 def demonstrate_gpu_patterns():
     """Demonstrate different GPU allocation patterns."""
-    
-    print("=== Ray GPU Patterns Demo ===\n")
-    
+
+    print('=== Ray GPU Patterns Demo ===\n')
+
     # Initialize Ray
     ray.init()
-    
+
     # Check available resources
-    print("Initial resources:", ray.cluster_resources())
-    print("Available resources:", ray.available_resources())
+    print('Initial resources:', ray.cluster_resources())
+    print('Available resources:', ray.available_resources())
     print()
-    
+
     # Pattern 1: Run multiple light tasks (fractional GPU)
-    print("1. Running 4 light tasks (0.5 GPU each) - should run 4 concurrent on 2 GPUs")
+    print(
+        '1. Running 4 light tasks (0.5 GPU each) - should run 4 concurrent on 2 GPUs'
+    )
     light_tasks = [light_gpu_task.remote(i) for i in range(4)]
-    
+
     # Pattern 2: Run heavy tasks (full GPU)
-    print("2. Running 2 heavy tasks (1 GPU each)")
+    print('2. Running 2 heavy tasks (1 GPU each)')
     heavy_tasks = [heavy_gpu_task.remote(i) for i in range(2)]
-    
+
     # Pattern 3: Run CPU tasks alongside
-    print("3. Running CPU tasks in parallel")
+    print('3. Running CPU tasks in parallel')
     cpu_tasks = [cpu_task.remote(i) for i in range(3)]
-    
+
     # Pattern 4: Monitor resources while tasks run
     monitor_task = monitor_resources.remote()
-    
+
     # Wait for light tasks
-    print("\nWaiting for light tasks...")
+    print('\nWaiting for light tasks...')
     light_results = ray.get(light_tasks)
-    print("Light tasks results:", light_results)
-    
+    print('Light tasks results:', light_results)
+
     # Check resources mid-execution
     mid_resources = ray.get(monitor_task)
-    print("Mid-execution resources:", mid_resources)
-    
+    print('Mid-execution resources:', mid_resources)
+
     # Wait for remaining tasks
-    print("\nWaiting for heavy and CPU tasks...")
+    print('\nWaiting for heavy and CPU tasks...')
     heavy_results = ray.get(heavy_tasks)
     cpu_results = ray.get(cpu_tasks)
-    
-    print("Heavy tasks results:", heavy_results)
-    print("CPU tasks results:", cpu_results)
-    
+
+    print('Heavy tasks results:', heavy_results)
+    print('CPU tasks results:', cpu_results)
+
     # Final resource check
     final_monitor = monitor_resources.remote()
     final_resources = ray.get(final_monitor)
-    print("Final resources:", final_resources)
-    
+    print('Final resources:', final_resources)
+
     ray.shutdown()
 
-if __name__ == "__main__":
-    demonstrate_gpu_patterns() 
\ No newline at end of file
+
+if __name__ == '__main__':
+    demonstrate_gpu_patterns()
diff --git a/ray_test/ray_learning_guide.py b/ray_test/ray_learning_guide.py
index 066dedc5..0d12deda 100644
--- a/ray_test/ray_learning_guide.py
+++ b/ray_test/ray_learning_guide.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python3
+
+# Copyright 2024 MosaicML ComposeRL authors
+# SPDX-License-Identifier: Apache-2.0
+
 """
 Ray GPU Learning Guide - Getting Started Script
 
@@ -6,22 +10,23 @@
 through interactive examples and clear explanations.
 """
 
+import time
+
 import ray
 import torch
-import time
-import os
-from typing import Dict, Any
+
 
 def step_1_basic_concepts():
     """Step 1: Understanding Ray basic concepts."""
-    print("\n" + "="*60)
-    print("🎓 STEP 1: RAY BASIC CONCEPTS")
-    print("="*60)
-    
-    print("""
+    print('\n' + '=' * 60)
+    print('🎓 STEP 1: RAY BASIC CONCEPTS')
+    print('=' * 60)
+
+    print(
+        """
 Ray is a distributed computing framework that helps you:
 1. Parallelize your Python code across multiple cores/machines
-2. Manage GPU resources automatically  
+2. Manage GPU resources automatically
 3. Scale from single machine to clusters seamlessly
 
 Key concepts:
@@ -29,73 +34,81 @@ def step_1_basic_concepts():
 - ray.get(): Wait for and retrieve results from remote tasks
 - ray.put(): Store large objects in shared memory
 - Actors: Stateful workers that persist across tasks
-""")
-    
+"""
+    )
+
     # Simple example
     @ray.remote
     def simple_task(x):
         return x * x
-    
-    print("Example: Simple remote function")
-    print("@ray.remote")
-    print("def simple_task(x):")
-    print("    return x * x")
-    
+
+    print('Example: Simple remote function')
+    print('@ray.remote')
+    print('def simple_task(x):')
+    print('    return x * x')
+
     # Execute
     future = simple_task.remote(5)
     result = ray.get(future)
     print(f"\nResult: simple_task.remote(5) = {result}")
 
+
 def step_2_gpu_resource_management():
     """Step 2: Understanding GPU resource management."""
-    print("\n" + "="*60)
-    print("🎮 STEP 2: GPU RESOURCE MANAGEMENT")
-    print("="*60)
-    
-    print("""
+    print('\n' + '=' * 60)
+    print('🎮 STEP 2: GPU RESOURCE MANAGEMENT')
+    print('=' * 60)
+
+    print(
+        """
 Ray automatically manages GPU allocation:
 
 1. Full GPU allocation: @ray.remote(num_gpus=1)
    - Task gets exclusive access to 1 GPU
    - Ray sets CUDA_VISIBLE_DEVICES automatically
-   
-2. Fractional GPU allocation: @ray.remote(num_gpus=0.5)  
+
+2. Fractional GPU allocation: @ray.remote(num_gpus=0.5)
    - Multiple tasks can share the same GPU
    - Useful for lightweight GPU work
-   
+
 3. Ray handles scheduling based on available resources
-""")
-    
+"""
+    )
+
     @ray.remote(num_gpus=1)
     def gpu_task(task_id):
         gpu_ids = ray.get_gpu_ids()
-        device = torch.device("cuda")
+        device = torch.device('cuda')
         x = torch.randn(100, 100, device=device)
-        return {"task_id": task_id, "gpu_ids": gpu_ids, "shape": list(x.shape)}
-    
-    print("Example: GPU task")
-    print("@ray.remote(num_gpus=1)")
-    print("def gpu_task(task_id):")
-    print("    gpu_ids = ray.get_gpu_ids()")
+        return {'task_id': task_id, 'gpu_ids': gpu_ids, 'shape': list(x.shape)}
+
+    print('Example: GPU task')
+    print('@ray.remote(num_gpus=1)')
+    print('def gpu_task(task_id):')
+    print('    gpu_ids = ray.get_gpu_ids()')
     print("    device = torch.device('cuda')")
-    print("    x = torch.randn(100, 100, device=device)")
+    print('    x = torch.randn(100, 100, device=device)')
     print("    return {'task_id': task_id, 'gpu_ids': gpu_ids}")
-    
+
     # Execute on both GPUs
     tasks = [gpu_task.remote(i) for i in range(2)]
     results = ray.get(tasks)
-    
+
     print(f"\nResults from 2 GPU tasks:")
     for result in results:
-        print(f"  Task {result['task_id']}: GPU {result['gpu_ids']}, Tensor {result['shape']}")
+        print(
+            f"  Task {result['task_id']}: GPU {result['gpu_ids']}, Tensor {result['shape']}"
+        )
+
 
 def step_3_actors_vs_tasks():
     """Step 3: Understanding the difference between actors and tasks."""
-    print("\n" + "="*60)
-    print("🎭 STEP 3: ACTORS VS TASKS")
-    print("="*60)
-    
-    print("""
+    print('\n' + '=' * 60)
+    print('🎭 STEP 3: ACTORS VS TASKS')
+    print('=' * 60)
+
+    print(
+        """
 Tasks vs Actors:
 
 TASKS (@ray.remote functions):
@@ -106,126 +119,137 @@ def step_3_actors_vs_tasks():
 
 ACTORS (@ray.remote classes):
 - Stateful workers with persistent memory
-- Good for complex workflows  
+- Good for complex workflows
 - GPU held for the lifetime of the actor
 - Can maintain state between method calls
-""")
-    
+"""
+    )
+
     @ray.remote(num_gpus=0.5)
     class GPUActor:
+
         def __init__(self):
             self.gpu_ids = ray.get_gpu_ids()
-            self.device = torch.device("cuda")
+            self.device = torch.device('cuda')
             self.counter = 0
-            
+
         def process(self, data_size=500):
             self.counter += 1
             x = torch.randn(data_size, data_size, device=self.device)
             y = torch.mm(x, x.T)
             return {
-                "call_count": self.counter,
-                "gpu_ids": self.gpu_ids,
-                "result": torch.trace(y).item()
+                'call_count': self.counter,
+                'gpu_ids': self.gpu_ids,
+                'result': torch.trace(y).item(),
             }
-    
-    print("Example: GPU Actor")
-    print("@ray.remote(num_gpus=0.5)")
-    print("class GPUActor:")
-    print("    def __init__(self):")
-    print("        self.gpu_ids = ray.get_gpu_ids()")
+
+    print('Example: GPU Actor')
+    print('@ray.remote(num_gpus=0.5)')
+    print('class GPUActor:')
+    print('    def __init__(self):')
+    print('        self.gpu_ids = ray.get_gpu_ids()')
     print("        self.device = torch.device('cuda')")
-    print("        self.counter = 0")
-    
+    print('        self.counter = 0')
+
     # Create actors (4 actors, 2 per GPU with 0.5 GPU each)
     actors = [GPUActor.remote() for _ in range(4)]
-    
+
     # Call methods multiple times
     futures = []
     for actor in actors:
         for _ in range(2):  # 2 calls per actor
             futures.append(actor.process.remote())
-    
+
     results = ray.get(futures)
-    
+
     print(f"\nResults from {len(actors)} actors, each called twice:")
     for i, result in enumerate(results):
-        print(f"  Call {i+1}: GPU {result['gpu_ids']}, Count: {result['call_count']}, Result: {result['result']:.2f}")
+        print(
+            f"  Call {i+1}: GPU {result['gpu_ids']}, Count: {result['call_count']}, Result: {result['result']:.2f}"
+        )
+
 
 def step_4_monitoring_resources():
     """Step 4: Understanding resource monitoring."""
-    print("\n" + "="*60)
-    print("📊 STEP 4: MONITORING RESOURCES")
-    print("="*60)
-    
-    print("""
+    print('\n' + '=' * 60)
+    print('📊 STEP 4: MONITORING RESOURCES')
+    print('=' * 60)
+
+    print(
+        """
 Ray provides several ways to monitor resources:
 
 1. ray.cluster_resources() - Total resources in cluster
-2. ray.available_resources() - Currently available resources  
+2. ray.available_resources() - Currently available resources
 3. ray.nodes() - Information about cluster nodes
 4. Ray Dashboard - Web UI for monitoring (http://localhost:8265)
-""")
-    
-    print("Current cluster state:")
+"""
+    )
+
+    print('Current cluster state:')
     print(f"  Total resources: {ray.cluster_resources()}")
     print(f"  Available resources: {ray.available_resources()}")
-    
+
     # Show how resources change during execution
-    @ray.remote(num_gpus=1)  
+    @ray.remote(num_gpus=1)
     def blocking_gpu_task():
         print(f"  📍 Task started on GPU {ray.get_gpu_ids()}")
         time.sleep(3)  # Hold GPU for 3 seconds
-        return "done"
-    
-    print("\nWatching resources during task execution...")
-    print("Available before task:", ray.available_resources().get('GPU', 0))
-    
+        return 'done'
+
+    print('\nWatching resources during task execution...')
+    print('Available before task:', ray.available_resources().get('GPU', 0))
+
     future = blocking_gpu_task.remote()
     time.sleep(0.5)  # Give task time to start
-    print("Available during task:", ray.available_resources().get('GPU', 0))
-    
+    print('Available during task:', ray.available_resources().get('GPU', 0))
+
     ray.get(future)
-    print("Available after task: ", ray.available_resources().get('GPU', 0))
+    print('Available after task: ', ray.available_resources().get('GPU', 0))
+
 
 def interactive_learning_session():
     """Run an interactive learning session."""
-    print("🎯 RAY GPU MANAGEMENT - INTERACTIVE LEARNING")
-    print("=" * 70)
-    
-    print("""
+    print('🎯 RAY GPU MANAGEMENT - INTERACTIVE LEARNING')
+    print('=' * 70)
+
+    print(
+        """
 Welcome to Ray GPU Management Learning!
 
 This script will teach you Ray concepts step by step.
 Each step builds on the previous one.
 
 You have 2 NVIDIA A100 GPUs available for learning.
-""")
-    
+"""
+    )
+
     # Initialize Ray
-    print("🚀 Initializing Ray...")
+    print('🚀 Initializing Ray...')
     ray.init(num_gpus=2)
     print(f"✅ Ray initialized with resources: {ray.cluster_resources()}")
-    
+
     try:
         step_1_basic_concepts()
-        
-        input("\nPress Enter to continue to Step 2...")
+
+        input('\nPress Enter to continue to Step 2...')
         step_2_gpu_resource_management()
-        
-        input("\nPress Enter to continue to Step 3...")
+
+        input('\nPress Enter to continue to Step 3...')
         step_3_actors_vs_tasks()
-        
-        input("\nPress Enter to continue to Step 4...")
+
+        input('\nPress Enter to continue to Step 4...')
         step_4_monitoring_resources()
-        
-        print("\n" + "="*70)
-        print("🎉 CONGRATULATIONS!")
-        print("="*70)
-        print("""
+
+        print('\n' + '=' * 70)
+        print('🎉 CONGRATULATIONS!')
+        print('=' * 70)
+        print(
+            """
 You've learned the fundamentals of Ray GPU management:
 
 ✅ Basic Ray concepts (remote functions, ray.get)
-✅ GPU resource allocation (full and fractional)  
+✅ GPU resource allocation (full and fractional)
 ✅ Difference between tasks and actors
 ✅ Resource monitoring and observation
 
@@ -237,12 +261,14 @@ def interactive_learning_session():
 5. Look into Ray Train for distributed training
 
 Happy learning! 🚀
-""")
-        
+"""
+        )
+
     except KeyboardInterrupt:
-        print("\n\n👋 Learning session interrupted. Thanks for trying Ray!")
+        print('\n\n👋 Learning session interrupted. Thanks for trying Ray!')
     finally:
         ray.shutdown()
 
-if __name__ == "__main__":
-    interactive_learning_session() 
\ No newline at end of file
+
+if __name__ == '__main__':
+    interactive_learning_session()
diff --git a/ray_test/ray_scheduling_demo.py b/ray_test/ray_scheduling_demo.py
index 725eb9a2..1e194b42 100644
--- a/ray_test/ray_scheduling_demo.py
+++ b/ray_test/ray_scheduling_demo.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python3
+
+# Copyright 2024 MosaicML ComposeRL authors
+# SPDX-License-Identifier: Apache-2.0
+
 """
 Ray GPU Scheduling Demo
 
@@ -8,155 +12,179 @@
 Answer: NO - Ray waits until sufficient resources are available.
 """
 
-import ray
-import torch
-import time
 import os
+import time
 from datetime import datetime
 
+import ray
+import torch
+
+
 def timestamp():
     """Get current timestamp for logging."""
-    return datetime.now().strftime("%H:%M:%S.%f")[:-3]
+    return datetime.now().strftime('%H:%M:%S.%f')[:-3]
+
 
 @ray.remote(num_gpus=0.5)
 def light_gpu_task(task_id: int, duration: int = 10):
     """Light task that uses 0.5 GPU and runs for specified duration."""
     gpu_ids = ray.get_gpu_ids()
     pid = os.getpid()
-    
-    print(f"[{timestamp()}] 🟡 Light task {task_id} STARTED (PID: {pid}, GPU: {gpu_ids})")
-    
+
+    print(
+        f"[{timestamp()}] 🟡 Light task {task_id} STARTED (PID: {pid}, GPU: {gpu_ids})"
+    )
+
     # Create some GPU work
-    device = torch.device("cuda")
+    device = torch.device('cuda')
     x = torch.randn(1000, 1000, device=device)
-    
+
     # Simulate work for the specified duration
     for i in range(duration):
         x = torch.mm(x, x.T)
         time.sleep(1)
         if i % 3 == 0:  # Progress update every 3 seconds
-            print(f"[{timestamp()}] 🟡 Light task {task_id} working... ({i+1}/{duration}s)")
-    
+            print(
+                f"[{timestamp()}] 🟡 Light task {task_id} working... ({i+1}/{duration}s)"
+            )
+
     print(f"[{timestamp()}] 🟡 Light task {task_id} FINISHED")
     return f"Light task {task_id} completed"
 
+
 @ray.remote(num_gpus=1.0)
 def heavy_gpu_task(task_id: int, duration: int = 5):
     """Heavy task that needs full GPU and runs for specified duration."""
     gpu_ids = ray.get_gpu_ids()
     pid = os.getpid()
-    
-    print(f"[{timestamp()}] 🔴 Heavy task {task_id} STARTED (PID: {pid}, GPU: {gpu_ids[0]})")
-    
+
+    print(
+        f"[{timestamp()}] 🔴 Heavy task {task_id} STARTED (PID: {pid}, GPU: {gpu_ids[0]})"
+    )
+
     # Create heavier GPU work
-    device = torch.device("cuda")
+    device = torch.device('cuda')
     x = torch.randn(2000, 2000, device=device)
-    
+
     # Simulate work
     for i in range(duration):
         x = torch.mm(x, x.T)
         time.sleep(1)
-        print(f"[{timestamp()}] 🔴 Heavy task {task_id} working... ({i+1}/{duration}s)")
-    
+        print(
+            f"[{timestamp()}] 🔴 Heavy task {task_id} working... ({i+1}/{duration}s)"
+        )
+
     print(f"[{timestamp()}] 🔴 Heavy task {task_id} FINISHED")
     return f"Heavy task {task_id} completed"
 
+
 @ray.remote
 def resource_monitor():
     """Monitor available resources."""
     total = ray.cluster_resources()
     available = ray.available_resources()
-    
+
     return {
-        "timestamp": timestamp(),
-        "total_gpus": total.get("GPU", 0),
-        "available_gpus": available.get("GPU", 0),
-        "available_cpus": available.get("CPU", 0)
+        'timestamp': timestamp(),
+        'total_gpus': total.get('GPU', 0),
+        'available_gpus': available.get('GPU', 0),
+        'available_cpus': available.get('CPU', 0),
     }
 
+
 def demonstrate_scheduling():
     """Demonstrate Ray's scheduling behavior."""
-    
-    print("=" * 60)
-    print("RAY GPU SCHEDULING DEMONSTRATION")
-    print("=" * 60)
+
+    print('=' * 60)
+    print('RAY GPU SCHEDULING DEMONSTRATION')
+    print('=' * 60)
     print()
-    
+
     ray.init()
-    
+
     # Check initial resources
     initial_resources = ray.get(resource_monitor.remote())
     print(f"Initial resources: {initial_resources}")
     print()
-    
-    print("SCENARIO: Testing if heavy tasks can start while light tasks are running")
-    print("- Light tasks: 0.5 GPU each, 10 seconds duration")
-    print("- Heavy tasks: 1.0 GPU each, 5 seconds duration")
-    print("- With 2 GPUs: 4 light tasks should fill both GPUs (2 per GPU)")
-    print("- Heavy tasks should WAIT until light tasks finish")
+
+    print(
+        'SCENARIO: Testing if heavy tasks can start while light tasks are running'
+    )
+    print('- Light tasks: 0.5 GPU each, 10 seconds duration')
+    print('- Heavy tasks: 1.0 GPU each, 5 seconds duration')
+    print('- With 2 GPUs: 4 light tasks should fill both GPUs (2 per GPU)')
+    print('- Heavy tasks should WAIT until light tasks finish')
     print()
-    
+
     # Launch tasks in specific order to demonstrate scheduling
-    print(f"[{timestamp()}] 🚀 Launching 4 light tasks (should fill both GPUs)...")
-    
+    print(
+        f"[{timestamp()}] 🚀 Launching 4 light tasks (should fill both GPUs)..."
+    )
+
     light_tasks = []
     for i in range(4):
         task = light_gpu_task.remote(i, duration=10)
         light_tasks.append(task)
         time.sleep(0.5)  # Small delay to see launch order
-    
+
     # Wait a moment for light tasks to start
     time.sleep(2)
-    
+
     # Check resources after light tasks start
     mid_resources = ray.get(resource_monitor.remote())
     print(f"[{timestamp()}] Resources after light tasks start: {mid_resources}")
     print()
-    
+
     # Now launch heavy tasks - these should be QUEUED
     print(f"[{timestamp()}] 🚀 Launching 2 heavy tasks (should be QUEUED)...")
-    
+
     heavy_tasks = []
     for i in range(2):
         task = heavy_gpu_task.remote(i, duration=5)
         heavy_tasks.append(task)
         time.sleep(0.5)
-    
+
     print()
-    print("⏳ OBSERVATION: Heavy tasks will wait until sufficient GPU resources are free!")
-    print("   - Each light task uses 0.5 GPU")
-    print("   - Each heavy task needs 1.0 GPU")
-    print("   - Heavy tasks must wait for 2 light tasks to finish to get 1.0 GPU")
+    print(
+        '⏳ OBSERVATION: Heavy tasks will wait until sufficient GPU resources are free!'
+    )
+    print('   - Each light task uses 0.5 GPU')
+    print('   - Each heavy task needs 1.0 GPU')
+    print(
+        '   - Heavy tasks must wait for 2 light tasks to finish to get 1.0 GPU'
+    )
     print()
-    
+
     # Monitor resources periodically
     for i in range(3):
         time.sleep(3)
         current_resources = ray.get(resource_monitor.remote())
         print(f"[{timestamp()}] Current resources: {current_resources}")
-    
+
     # Wait for all tasks to complete
     print(f"\n[{timestamp()}] ⏳ Waiting for all tasks to complete...")
-    
+
     light_results = ray.get(light_tasks)
     heavy_results = ray.get(heavy_tasks)
-    
+
     print(f"\n[{timestamp()}] ✅ All tasks completed!")
-    print("Light task results:", light_results)
-    print("Heavy task results:", heavy_results)
-    
+    print('Light task results:', light_results)
+    print('Heavy task results:', heavy_results)
+
     # Final resource check
     final_resources = ray.get(resource_monitor.remote())
     print(f"Final resources: {final_resources}")
-    
+
     ray.shutdown()
 
+
 def explain_scheduling():
     """Explain Ray's scheduling algorithm."""
-    print("\n" + "=" * 60)
-    print("RAY SCHEDULING EXPLAINED")
-    print("=" * 60)
-    print("""
+    print('\n' + '=' * 60)
+    print('RAY SCHEDULING EXPLAINED')
+    print('=' * 60)
+    print(
+        """
 Ray's resource scheduler works like this:
 
 1. RESOURCE TRACKING:
@@ -182,13 +210,15 @@ def explain_scheduling():
    - But resource availability affects actual execution order
    - Tasks with available resources start first
 
-KEY INSIGHT: 
-Heavy tasks (1.0 GPU) CANNOT start while light tasks (0.5 GPU each) 
+KEY INSIGHT:
+Heavy tasks (1.0 GPU) CANNOT start while light tasks (0.5 GPU each)
 occupy all GPU resources, even if the physical GPU isn't fully utilized.
 
 This ensures predictable resource allocation and prevents resource conflicts.
-""")
+"""
+    )
+
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demonstrate_scheduling()
-    explain_scheduling() 
\ No newline at end of file
+    explain_scheduling()
diff --git a/ray_test/ray_single_server_multi_gpu.py b/ray_test/ray_single_server_multi_gpu.py
index 8c6c2e0c..50974f27 100644
--- a/ray_test/ray_single_server_multi_gpu.py
+++ b/ray_test/ray_single_server_multi_gpu.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python3
+
+# Copyright 2024 MosaicML ComposeRL authors
+# SPDX-License-Identifier: Apache-2.0
+
 """
 Ray Single Server Multi-GPU Example
 
@@ -6,242 +10,269 @@
 Shows various patterns: full GPU allocation, fractional allocation, and mixed workloads.
 """
 
-import ray
-import torch
+import logging
 import time
+from typing import Any
+
 import numpy as np
-from typing import List, Dict, Any
-import logging
+import ray
+import torch
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+
 @ray.remote(num_gpus=1)
 class GPUWorker:
     """A Ray actor that holds a full GPU for the duration of its lifetime."""
-    
+
     def __init__(self, worker_id: int):
         self.worker_id = worker_id
         self.gpu_ids = ray.get_gpu_ids()
-        self.device = torch.device("cuda")
+        self.device = torch.device('cuda')
         logger.info(f"Worker {worker_id} initialized on GPU {self.gpu_ids}")
-    
-    def matrix_multiply(self, size: int = 2000, iterations: int = 5) -> Dict[str, Any]:
+
+    def matrix_multiply(self,
+                        size: int = 2000,
+                        iterations: int = 5) -> dict[str, Any]:
         """Perform matrix multiplication to simulate GPU work."""
         start_time = time.time()
-        
+
         # Create random matrices on GPU
         A = torch.randn(size, size, device=self.device)
         B = torch.randn(size, size, device=self.device)
-        
+
         results = []
         for i in range(iterations):
             C = torch.mm(A, B)
             results.append(torch.trace(C).item())
-            
+
         end_time = time.time()
-        
+
         return {
-            "worker_id": self.worker_id,
-            "gpu_ids": self.gpu_ids,
-            "execution_time": end_time - start_time,
-            "results": results[:3],  # Just first 3 for brevity
-            "tensor_shape": list(C.shape)
+            'worker_id': self.worker_id,
+            'gpu_ids': self.gpu_ids,
+            'execution_time': end_time - start_time,
+            'results': results[:3],  # Just first 3 for brevity
+            'tensor_shape': list(C.shape),
         }
-    
-    def get_gpu_memory_usage(self) -> Dict[str, float]:
+
+    def get_gpu_memory_usage(self) -> dict[str, float]:
         """Get current GPU memory usage."""
         if torch.cuda.is_available():
             gpu_id = self.gpu_ids[0]
             allocated = torch.cuda.memory_allocated(gpu_id) / 1024**3  # GB
             cached = torch.cuda.memory_reserved(gpu_id) / 1024**3  # GB
             return {
-                "gpu_id": gpu_id,
-                "allocated_gb": allocated,
-                "cached_gb": cached
+                'gpu_id': gpu_id,
+                'allocated_gb': allocated,
+                'cached_gb': cached,
             }
         return {}
 
+
 @ray.remote(num_gpus=0.5)
-def lightweight_gpu_task(task_id: int, work_size: int = 1000) -> Dict[str, Any]:
+def lightweight_gpu_task(task_id: int, work_size: int = 1000) -> dict[str, Any]:
     """A task that uses half a GPU - allows 2 tasks per GPU."""
     start_time = time.time()
     gpu_ids = ray.get_gpu_ids()
-    
-    device = torch.device("cuda")
+
+    device = torch.device('cuda')
     x = torch.randn(work_size, work_size, device=device)
-    
+
     # Simulate some computation
     for _ in range(3):
         x = torch.relu(x @ x.T)
-    
+
     end_time = time.time()
-    
+
     return {
-        "task_id": task_id,
-        "gpu_ids": gpu_ids,
-        "execution_time": end_time - start_time,
-        "final_mean": x.mean().item()
+        'task_id': task_id,
+        'gpu_ids': gpu_ids,
+        'execution_time': end_time - start_time,
+        'final_mean': x.mean().item(),
     }
 
+
 @ray.remote(num_cpus=1)
-def cpu_task(task_id: int) -> Dict[str, Any]:
+def cpu_task(task_id: int) -> dict[str, Any]:
     """A CPU-only task to demonstrate mixed workloads."""
     start_time = time.time()
-    
+
     # CPU computation
-    result = np.sum(np.random.randn(1000, 1000) ** 2)
+    result = np.sum(np.random.randn(1000, 1000)**2)
     time.sleep(1)  # Simulate work
-    
+
     end_time = time.time()
-    
+
     return {
-        "task_id": task_id,
-        "execution_time": end_time - start_time,
-        "result": result,
-        "resource_type": "CPU"
+        'task_id': task_id,
+        'execution_time': end_time - start_time,
+        'result': result,
+        'resource_type': 'CPU',
     }
 
+
 def print_resources():
     """Print current Ray cluster resources."""
-    print("\n" + "="*50)
-    print("RAY CLUSTER RESOURCES")
-    print("="*50)
+    print('\n' + '=' * 50)
+    print('RAY CLUSTER RESOURCES')
+    print('=' * 50)
     print(f"Total resources: {ray.cluster_resources()}")
     print(f"Available resources: {ray.available_resources()}")
-    print("="*50)
+    print('=' * 50)
+
 
 def demo_gpu_actors():
     """Demonstrate GPU actors (long-lived GPU workers)."""
-    print("\n🚀 DEMO 1: GPU Actors (Long-lived Workers)")
-    print("-" * 50)
-    
+    print('\n🚀 DEMO 1: GPU Actors (Long-lived Workers)')
+    print('-' * 50)
+
     # Create 2 GPU workers (one per GPU)
     workers = [GPUWorker.remote(i) for i in range(2)]
-    
+
     # Submit work to both workers
     futures = []
     for i, worker in enumerate(workers):
         future = worker.matrix_multiply.remote(size=1500, iterations=3)
         futures.append(future)
-    
-    print("Submitted work to GPU actors...")
+
+    print('Submitted work to GPU actors...')
     results = ray.get(futures)
-    
+
     for result in results:
-        print(f"  Worker {result['worker_id']}: GPU {result['gpu_ids']}, "
-              f"Time: {result['execution_time']:.2f}s")
-    
+        print(
+            f"  Worker {result['worker_id']}: GPU {result['gpu_ids']}, "
+            f"Time: {result['execution_time']:.2f}s"
+        )
+
     # Check memory usage
-    memory_futures = [worker.get_gpu_memory_usage.remote() for worker in workers]
+    memory_futures = [
+        worker.get_gpu_memory_usage.remote() for worker in workers
+    ]
     memory_results = ray.get(memory_futures)
-    
+
     for mem in memory_results:
-        print(f"  GPU {mem['gpu_id']}: {mem['allocated_gb']:.2f}GB allocated, "
-              f"{mem['cached_gb']:.2f}GB cached")
-    
+        print(
+            f"  GPU {mem['gpu_id']}: {mem['allocated_gb']:.2f}GB allocated, "
+            f"{mem['cached_gb']:.2f}GB cached"
+        )
+
     return workers
 
+
 def demo_fractional_gpu():
     """Demonstrate fractional GPU allocation."""
-    print("\n🔄 DEMO 2: Fractional GPU Tasks (0.5 GPU each)")
-    print("-" * 50)
-    
+    print('\n🔄 DEMO 2: Fractional GPU Tasks (0.5 GPU each)')
+    print('-' * 50)
+
     # Launch 4 tasks with 0.5 GPU each (2 per GPU)
     tasks = [lightweight_gpu_task.remote(i, work_size=800) for i in range(4)]
-    
-    print("Submitted 4 tasks with 0.5 GPU each...")
+
+    print('Submitted 4 tasks with 0.5 GPU each...')
     results = ray.get(tasks)
-    
+
     for result in results:
-        print(f"  Task {result['task_id']}: GPU {result['gpu_ids']}, "
-              f"Time: {result['execution_time']:.2f}s")
+        print(
+            f"  Task {result['task_id']}: GPU {result['gpu_ids']}, "
+            f"Time: {result['execution_time']:.2f}s"
+        )
+
 
 def demo_mixed_workload():
     """Demonstrate mixed CPU and GPU workloads."""
-    print("\n🔀 DEMO 3: Mixed CPU and GPU Workloads")
-    print("-" * 50)
-    
+    print('\n🔀 DEMO 3: Mixed CPU and GPU Workloads')
+    print('-' * 50)
+
     # Mix of CPU and GPU tasks
     cpu_tasks = [cpu_task.remote(i) for i in range(3)]
-    gpu_tasks = [lightweight_gpu_task.remote(i+10, work_size=600) for i in range(3)]
-    
+    gpu_tasks = [
+        lightweight_gpu_task.remote(i + 10, work_size=600) for i in range(3)
+    ]
+
     all_tasks = cpu_tasks + gpu_tasks
-    print(f"Submitted {len(cpu_tasks)} CPU tasks and {len(gpu_tasks)} GPU tasks...")
-    
+    print(
+        f"Submitted {len(cpu_tasks)} CPU tasks and {len(gpu_tasks)} GPU tasks..."
+    )
+
     start_time = time.time()
     results = ray.get(all_tasks)
     total_time = time.time() - start_time
-    
+
     print(f"All tasks completed in {total_time:.2f}s")
-    
+
     # Separate results
     cpu_results = [r for r in results if r.get('resource_type') == 'CPU']
     gpu_results = [r for r in results if 'gpu_ids' in r]
-    
+
     print(f"  CPU tasks: {len(cpu_results)} completed")
     print(f"  GPU tasks: {len(gpu_results)} completed")
 
+
 def demo_dynamic_scheduling():
     """Demonstrate dynamic task scheduling based on resource availability."""
-    print("\n⚡ DEMO 4: Dynamic Scheduling")
-    print("-" * 50)
-    
+    print('\n⚡ DEMO 4: Dynamic Scheduling')
+    print('-' * 50)
+
     # Submit tasks gradually and monitor resource usage
     completed_tasks = []
     pending_tasks = []
-    
+
     for i in range(8):
         task = lightweight_gpu_task.remote(i, work_size=500)
         pending_tasks.append(task)
-        
+
         # Check if we should wait for some tasks to complete
         if len(pending_tasks) >= 4:  # Don't overwhelm the queue
             # Wait for at least one task to complete
             ready, pending_tasks = ray.wait(pending_tasks, num_returns=1)
             completed_tasks.extend(ray.get(ready))
-            print(f"  Completed {len(completed_tasks)} tasks, "
-                  f"{len(pending_tasks)} still pending")
-    
+            print(
+                f"  Completed {len(completed_tasks)} tasks, "
+                f"{len(pending_tasks)} still pending"
+            )
+
     # Wait for remaining tasks
     if pending_tasks:
         completed_tasks.extend(ray.get(pending_tasks))
-    
+
     print(f"Dynamic scheduling completed: {len(completed_tasks)} total tasks")
 
+
 def main():
     """Main function demonstrating various Ray GPU patterns."""
-    print("🎯 Ray Single Server Multi-GPU Demo")
-    print("=" * 60)
-    
+    print('🎯 Ray Single Server Multi-GPU Demo')
+    print('=' * 60)
+
     # Initialize Ray
     ray.init(num_gpus=2)  # Explicitly specify 2 GPUs
-    
+
     print_resources()
-    
+
     # Run all demos
     workers = demo_gpu_actors()
     print_resources()
-    
+
     demo_fractional_gpu()
     print_resources()
-    
+
     demo_mixed_workload()
     print_resources()
-    
+
     demo_dynamic_scheduling()
     print_resources()
-    
-    print("\n🎉 All demos completed!")
-    print("\nKey Takeaways:")
-    print("1. GPU Actors: Long-lived workers for persistent GPU allocation")
-    print("2. Fractional GPUs: Share GPUs between multiple light tasks")
-    print("3. Mixed Workloads: Combine CPU and GPU tasks efficiently")
-    print("4. Dynamic Scheduling: Adapt to resource availability")
-    
+
+    print('\n🎉 All demos completed!')
+    print('\nKey Takeaways:')
+    print('1. GPU Actors: Long-lived workers for persistent GPU allocation')
+    print('2. Fractional GPUs: Share GPUs between multiple light tasks')
+    print('3. Mixed Workloads: Combine CPU and GPU tasks efficiently')
+    print('4. Dynamic Scheduling: Adapt to resource availability')
+
     # Cleanup
     ray.shutdown()
 
-if __name__ == "__main__":
-    main() 
\ No newline at end of file
+
+if __name__ == '__main__':
+    main()
diff --git a/ray_test/test_ray.py b/ray_test/test_ray.py
index 31655796..e9e5b2f0 100644
--- a/ray_test/test_ray.py
+++ b/ray_test/test_ray.py
@@ -1,6 +1,10 @@
+# Copyright 2024 MosaicML ComposeRL authors
+# SPDX-License-Identifier: Apache-2.0
+
+import time
+
 import ray
 import torch
-import time
 
 
 # =================================================================
@@ -15,26 +19,28 @@ def use_gpu_task(task_id: int):
     # Ray automatically sets the CUDA_VISIBLE_DEVICES environment variable
     # for this worker process, so torch.cuda.current_device() will
     # correspond to the GPU Ray assigned.
-    
+
     # Let's get the physical GPU ID that Ray assigned to this task.
     gpu_ids = ray.get_gpu_ids()
     physical_gpu_id = gpu_ids[0]
 
-    print(f"-> Task {task_id} starting. Ray assigned me physical GPU: {physical_gpu_id}")
+    print(
+        f"-> Task {task_id} starting. Ray assigned me physical GPU: {physical_gpu_id}"
+    )
 
     # Create a tensor and move it to the assigned GPU.
     # PyTorch will only see the single GPU that Ray allocated.
-    device = torch.device("cuda")
+    device = torch.device('cuda')
     tensor = torch.randn(2000, 2000, device=device)
 
     # Perform some work to make the GPU busy.
     for i in range(5):
         tensor = tensor @ tensor
-        time.sleep(0.5) # Sleep to make it easier to see in nvidia-smi
+        time.sleep(0.5)  # Sleep to make it easier to see in nvidia-smi
         print(f"   Task {task_id}, iteration {i+1}, on device: {tensor.device}")
 
     print(f"<- Task {task_id} finished on GPU {physical_gpu_id}.")
-    
+
     # Return the ID of the GPU we used.
     return f"Task {task_id} ran on GPU {physical_gpu_id}"
 
@@ -42,24 +48,24 @@ def use_gpu_task(task_id: int):
 # =================================================================
 # 2. MAIN SCRIPT: Initialize Ray and launch the tasks
 # =================================================================
-if __name__ == "__main__":
+if __name__ == '__main__':
     # Start Ray. Ray will automatically detect the 2 GPUs.
     # You could also be explicit: ray.init(num_gpus=2)
     ray.init()
 
-    print("Ray Initialized.")
-    print("Cluster resources:", ray.cluster_resources())
+    print('Ray Initialized.')
+    print('Cluster resources:', ray.cluster_resources())
 
     # Verify that Ray sees our GPUs
-    if ray.cluster_resources().get("GPU", 0) < 2:
-        print("!!! WARNING: Ray did not detect 2 GPUs. Exiting.")
+    if ray.cluster_resources().get('GPU', 0) < 2:
+        print('!!! WARNING: Ray did not detect 2 GPUs. Exiting.')
         ray.shutdown()
         exit()
 
     # We have 2 GPUs, so let's launch 4 tasks.
     # Ray will run 2 tasks concurrently, and queue the other 2
     # until the first ones finish.
-    print("\nLaunching 4 GPU tasks on 2 available GPUs...")
+    print('\nLaunching 4 GPU tasks on 2 available GPUs...')
     task_refs = []
     for i in range(4):
         # .remote() immediately returns a future (a reference to the result)
@@ -70,8 +76,8 @@ def use_gpu_task(task_id: int):
     # Block until all tasks are complete and get the results.
     results = ray.get(task_refs)
 
-    print("\n--- All tasks completed! ---")
-    print("Results:", results)
+    print('\n--- All tasks completed! ---')
+    print('Results:', results)
 
     # Shut down Ray
-    ray.shutdown()
\ No newline at end of file
+    ray.shutdown()
diff --git a/ray_test/test_ray_chain.py b/ray_test/test_ray_chain.py
new file mode 100644
index 00000000..44e21d32
--- /dev/null
+++ b/ray_test/test_ray_chain.py
@@ -0,0 +1,51 @@
+import ray
+
+
+@ray.remote
+class ChainActor:
+    def __init__(self):
+        pass
+
+    def get(self, obj: ray.ObjectRef):
+        actual = ray.get(obj)
+        return actual + " Actor"
+
+
+@ray.remote
+def task1():
+    return "Task 1"
+
+@ray.remote
+def task2():
+    return task1.remote()
+
+
+@ray.remote
+class Foo:
+    def __init__(self):
+        print("Foo")
+
+    def print(self, actor):
+        print('I am in Foo', actor)
+        actor.print.remote()
+
+@ray.remote
+class Bar:
+    def __init__(self):
+        print("Bar")
+
+    def print(self):
+        print('I am in Bar')
+
+def main():
+    ray.init()
+    actor = ChainActor.remote()
+    res = actor.get.remote(task2.remote())
+    print(ray.get(res))
+
+    foo = Foo.remote()
+    bar = Bar.remote()
+    foo.print.remote(bar)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/ray_test/test_ray_distributed.py b/ray_test/test_ray_distributed.py
index e7f027cc..c6bc4662 100644
--- a/ray_test/test_ray_distributed.py
+++ b/ray_test/test_ray_distributed.py
@@ -1,7 +1,11 @@
+# Copyright 2024 MosaicML ComposeRL authors
+# SPDX-License-Identifier: Apache-2.0
+
+import time
+
 import ray
 import torch
-import time
-import socket # Import socket to see which node we're on
+
 
 # The task definition is IDENTICAL to the previous example.
 # No changes are needed here.
@@ -9,19 +13,19 @@
 def use_gpu_task(task_id: int):
     gpu_ids = ray.get_gpu_ids()
     physical_gpu_id = gpu_ids[0]
-    
+
     # Let's also get the hostname of the node Ray scheduled us on.
     # In this simulation, it will be the same hostname, but Ray
     # internally treats them as distinct nodes.
     node_id = ray.get_runtime_context().get_node_id()
-    
+
     print(
         f"-> Task {task_id} starting."
         f" Ray assigned me physical GPU: {physical_gpu_id}"
-        f" on Node ID: {node_id}"
+        f" on Node ID: {node_id}",
     )
 
-    device = torch.device("cuda")
+    device = torch.device('cuda')
     tensor = torch.randn(2000, 2000, device=device)
 
     for i in range(5):
@@ -35,17 +39,17 @@ def use_gpu_task(task_id: int):
 # =================================================================
 # MAIN SCRIPT
 # =================================================================
-if __name__ == "__main__":
+if __name__ == '__main__':
     # CRITICAL CHANGE: Connect to the existing Ray cluster.
     # 'auto' tells Ray to find the running cluster from environment variables
     # that `ray start` sets up.
     ray.init()
 
-    print("Python script connected to Ray Cluster.")
-    print("Cluster resources:", ray.cluster_resources())
+    print('Python script connected to Ray Cluster.')
+    print('Cluster resources:', ray.cluster_resources())
 
     # The rest of the logic is the same.
-    print("\nLaunching 4 GPU tasks on our 2-node, 2-GPU cluster...")
+    print('\nLaunching 4 GPU tasks on our 2-node, 2-GPU cluster...')
     task_refs = []
     for i in range(4):
         ref = use_gpu_task.remote(i)
@@ -53,9 +57,9 @@ def use_gpu_task(task_id: int):
 
     results = ray.get(task_refs)
 
-    print("\n--- All tasks completed! ---")
-    print("Results:", results)
+    print('\n--- All tasks completed! ---')
+    print('Results:', results)
 
     # We don't call ray.shutdown() here, because we want to leave the
     # cluster running. We will stop it manually from the terminal.
-    print("\nScript finished. The Ray cluster is still running.")
\ No newline at end of file
+    print('\nScript finished. The Ray cluster is still running.')
diff --git a/ray_test/test_ray_init.py b/ray_test/test_ray_init.py
index 53ba10a0..ac5d2738 100644
--- a/ray_test/test_ray_init.py
+++ b/ray_test/test_ray_init.py
@@ -1,6 +1,10 @@
-import ray
+# Copyright 2024 MosaicML ComposeRL authors
+# SPDX-License-Identifier: Apache-2.0
+
 import subprocess
 
+import ray
+
 
 def ray_start():
     subprocess.run(['ray', 'start', '--head', '--port=6379'], check=True)
@@ -9,6 +13,7 @@ def ray_start():
 def ray_stop():
     subprocess.run(['ray', 'stop'], check=True)
 
+
 def ray_init():
     ray.init()
 
@@ -16,6 +21,7 @@ def ray_init():
 def ray_shutdown():
     ray.shutdown()
 
+
 if __name__ == '__main__':
     ray_start()
     ray_init()
diff --git a/ray_test/test_socket.py b/ray_test/test_socket.py
new file mode 100644
index 00000000..5bada510
--- /dev/null
+++ b/ray_test/test_socket.py
@@ -0,0 +1,45 @@
+import os
+import socket
+import time
+
+def start_server_on_inherited_socket(fd):
+    """This function runs in the child process."""
+    print(f"[Child PID {os.getpid()}] Inherited file descriptor: {fd}")
+    
+    # Re-create the socket object from the file descriptor
+    server_socket = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
+    
+    # Get the address it's already bound to
+    host, port = server_socket.getsockname()
+    print(f"[Child PID {os.getpid()}] Socket is bound to {host}:{port}")
+
+    # Now the child can use the socket
+    server_socket.listen(1)
+    print("[Child PID {os.getpid()}] Server is listening...")
+    # ... accept connections, etc. ...
+    time.sleep(5) # Simulate server work
+    server_socket.close()
+    print("[Child PID {os.getpid()}] Server finished.")
+
+
+# --- Parent Process Logic ---
+if __name__ == "__main__":
+    # 1. Parent creates and binds the socket
+    parent_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    parent_socket.bind(('', 0)) # Bind to a free port
+    host, port = parent_socket.getsockname()
+    fd = parent_socket.fileno() # Get the integer file descriptor
+
+    print(f"[Parent PID {os.getpid()}] Bound to {host}:{port}, file descriptor: {fd}")
+
+    start_server_on_inherited_socket(fd)
+
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server_socket:
+        server_socket.bind(('', port))
+        host, port = server_socket.getsockname()
+        print(f"[Parent PID {os.getpid()}] Bound to {host}:{port}")
+        server_socket.listen(1)
+        print("[Parent PID {os.getpid()}] Server is listening...")
+        time.sleep(5) # Simulate server work
+        server_socket.close()
+        print("[Parent PID {os.getpid()}] Server finished.")
\ No newline at end of file
diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
new file mode 100644
index 00000000..39912d0a
--- /dev/null
+++ b/ray_test/test_torch_ray_distributed.py
@@ -0,0 +1,342 @@
+import ray
+import torch
+import torch.distributed as dist
+import os
+import socket
+import subprocess
+import time
+from contextlib import contextmanager
+from typing import Optional, Tuple
+import argparse
+from datetime import timedelta
+
+from functools import partial
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from composer.utils import dist as composer_dist
+from composer import Trainer
+from composer.optim import DecoupledAdamW
+from llmfoundry.models import ComposerHFCausalLM, ComposerMPTCausalLM
+from torch.utils.data import DataLoader
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+from transformers.models.gpt2 import GPT2LMHeadModel
+
+from compose_rl.algorithms.online import (
+    ComposerHFPolicyLM,
+    ComposerMPTPolicyLM,
+    OnPolicyCallback,
+)
+from compose_rl.algorithms.online.model_methods import OnPolicyEnum
+from compose_rl.algorithms.online.modeling_hf import ComposerHFPolicy
+from compose_rl.data import prompt_dataset_collate_fn
+from tests.common import PromptDataset, VerifiablePromptDataset, world_size
+
+from compose_rl.algorithms.online.generation_utils import init_process_group, create_vllm_engines
+
+from typing import Any
+
+
+def ray_noset_visible_devices():
+    return os.environ.get('RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES', '0') == '1'
+
+
+def init_ray():
+    # init ray on master node, rank 0
+    if dist.get_rank() == 0:
+        # Start head node
+        subprocess.run(['ray', 'start', '--head'], check=True)
+        ray.init('auto')
+        # get existing ray ip and port 
+        ctx = ray.get_runtime_context()
+        address = ctx.gcs_address
+        print(f'available gpus: {ray.available_resources()}')
+    else:
+        address = ''
+    address_list = [address]
+    # broadcast address to all other ranks
+    dist.broadcast_object_list(address_list, src=0)
+    if dist.get_rank() != 0 and os.environ.get('LOCAL_RANK', None) == '0':
+        address = address_list[0]
+        print(f'rank: {dist.get_rank()} connecting to address: {address}')
+        subprocess.run(['ray', 'start', f'--address={address}'], check=True)
+    dist.barrier()
+    if dist.get_rank() == 0:
+        # wait until num of gpus reach world_size
+        num_gpus = int(ray.cluster_resources()['GPU'])
+        counter = 0
+        while num_gpus < dist.get_world_size():
+            print(f'waiting for {dist.get_world_size() - num_gpus} gpus to be available')
+            num_gpus = int(ray.cluster_resources()['GPU'])
+            time.sleep(5)
+            counter += 1
+            if counter > 4:
+                raise RuntimeError(f'Failed to start {dist.get_world_size()} gpus')
+        print(f'Total available gpus: {ray.available_resources()}')
+    return address
+
+
+@ray.remote(num_gpus=1)
+class DistributedGPUActor:
+    def __init__(self, rank: int, world_size: int, master_addr: Optional[str] = None, master_port: Optional[int] = None):
+        """Initialize the distributed GPU actor.
+        
+        Args:
+            rank: The rank of this process in the distributed group
+            world_size: Total number of processes in the distributed group
+            master_addr: Master node address. If None, will allocate dynamically for rank 0
+            master_port: Master node port. If None, will allocate dynamically for rank 0
+        """
+        self.rank = rank
+        self.world_size = world_size
+        self.master_addr = master_addr
+        self.master_port = master_port
+        
+        # Set up basic environment variables
+        os.environ["WORLD_SIZE"] = str(world_size)
+        os.environ["RANK"] = str(rank)
+        
+        # Set LOCAL_RANK based on Ray GPU allocation
+        os.environ["LOCAL_RANK"] = str(ray.get_gpu_ids()[0]) if ray_noset_visible_devices() else "0"
+        
+        # If this is rank 0 and no master_addr/master_port provided, allocate them
+        if rank == 0 and (master_addr is None or master_port is None):
+            self._allocate_master_address()
+
+        os.environ["MASTER_ADDR"] = self.master_addr
+        os.environ["MASTER_PORT"] = str(self.master_port)
+
+        self.model = None
+        self.model_update_group = None
+
+    def build_ref_model(self):
+        max_seq_len = 32
+        prompt_len = 10
+
+        model_name = 'gpt2'
+        tiny_gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')
+
+        dataset = PromptDataset(prompt_len=prompt_len)
+        dataloader = DataLoader(
+            dataset,
+            collate_fn=partial(
+                prompt_dataset_collate_fn,
+                tiny_gpt2_tokenizer,
+                max_seq_len,
+            ),
+            sampler=composer_dist.get_sampler(dataset),
+            batch_size=4,
+        )
+
+        # We need to mock this method, since our dataset isn't a StreamingDataset
+        dataloader.state_dict = lambda: {}
+        dataloader.load_state_dict = lambda x: None
+
+        model_config = {
+            'tokenizer': tiny_gpt2_tokenizer,
+            'pretrained_model_name_or_path': model_name,
+            'pretrained': True,
+            'use_flash_attention_2': True,
+            'allow_embedding_resizing': True,
+        }
+        tmp_model = ComposerHFCausalLM(**model_config)
+
+        tmp_optimizer = DecoupledAdamW(tmp_model.parameters(), lr=1e-6)
+
+        tmp_ref_path = str('./ref_checkpoints')
+
+        temp_dataloader = [{
+            'input_ids': torch.ones((2, 15)).to(dtype=torch.int64),
+            'attention_mask': torch.ones((2, 15)),
+            'labels': torch.ones((2, 15)).to(dtype=torch.int64),
+        }]
+
+        temp_trainer = Trainer(
+            model=tmp_model,
+            train_dataloader=temp_dataloader,
+            optimizers=tmp_optimizer,
+            max_duration='1ba',
+            parallelism_config={'fsdp': {}},
+            save_folder=tmp_ref_path,
+            save_weights_only=True,
+            device_train_microbatch_size=2,
+        )
+
+        temp_trainer.fit()
+
+        # After making the reference model, we can proceed with the PPO training
+        tmp_ref_path = os.path.join(tmp_ref_path, 'latest-rank0.pt')
+
+    def get_node_ip(self):
+        return ray.util.get_node_ip_address().strip('[]')
+    
+    def get_free_port(self):
+        with socket.socket() as sock:
+            sock.bind(("", 0))
+            return sock.getsockname()[1]
+    
+    def _allocate_master_address(self):
+        """Allocate master address and port for rank 0."""
+        if self.master_addr is None:
+            # Get the local IP address
+            self.master_addr = self.get_node_ip()
+
+        if self.master_port is None:
+            # Allocate a free port
+            self.master_port = self.get_free_port()
+    
+    def get_master_address(self) -> Tuple[Optional[str], Optional[int]]:
+        """Return the master address and port as a tuple."""
+        return (self.master_addr, self.master_port)
+    
+    def init_default_process_group(self):
+        """Initialize the distributed process group."""         
+        # Initialize process group
+        dist.init_process_group(timeout=timedelta(seconds=30), backend='nccl')
+        print(f'is distributed initialized: {dist.is_initialized()}')
+        # Print debug information
+        num_visible_devices = torch.cuda.device_count()
+        print(f'num_visible_devices: {num_visible_devices}')
+        print('Ray actor init envs:')
+        print(f'rank: {dist.get_rank()}')
+        print(f'node_rank: {dist.get_rank() // 8}')
+        print(f'world_size: {dist.get_world_size()}')
+        print(f'local_rank: {dist.get_rank() % 8}')
+        print(f'master_addr: {self.master_addr}')
+        print(f'master_port: {self.master_port}')
+    
+    def init_model(self, model_name: str):
+        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype='auto')
+        self.model.to('cuda')
+
+    def sync_weights(self, vllm_engines: list[Any]):
+        for name, p in self.model.named_parameters():
+            refs = [engine.update_weight.remote(name, p.dtype, p.shape, empty_cache=False) for engine in vllm_engines]
+            dist.broadcast(p, src=0, group=self.model_update_group)
+            ray.get(refs)
+
+    def tensor_all_reduce(self) -> float:
+        """Perform a simple tensor all_reduce operation."""
+        # Create a tensor on the GPU and perform all_reduce
+        device = torch.device("cuda")
+        x = torch.ones(1, device=device)
+        dist.all_reduce(x)
+        
+        return x.item()
+
+    def init_vllm_process_group(self, backend: str, master_addr: str, master_port: int, world_size: int, rank: int, group_name: str):
+        """Initialize the vLLM process group."""
+        self.model_update_group = init_process_group(backend=backend, init_method=f'tcp://{master_addr}:{master_port}', world_size=world_size, rank=rank, group_name=group_name)
+        return dist.get_world_size(self.model_update_group)
+
+@contextmanager
+def start_ray_server():
+    dist.init_process_group(backend='gloo')
+    address = init_ray()
+    try:
+        yield address
+        dist.barrier()
+    finally:
+        if dist.get_rank() == 0:
+            ray.shutdown()
+            subprocess.run(['ray', 'stop'], check=True)
+        dist.barrier()
+        dist.destroy_process_group()
+
+
+def run(tp_size: int = 8):
+    prompts = [
+        "what is RAY?",
+        "what is vLLM?",
+    ]
+    pretrain_model_name = 'meta-llama/Llama-3.2-1B-Instruct'
+    with start_ray_server() as address:
+        if dist.get_rank() == 0:
+            master_addr, _ = address.split(':')
+            
+            print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
+            num_train_actors = dist.get_world_size() // 2
+            # Create actors - rank 0 will allocate master address/port
+            train_actors = []
+
+            # master actor will allocate master_addr and master_port
+            master_actor = DistributedGPUActor.remote(0, num_train_actors)
+            train_actors.append(master_actor)
+            
+            # Get master address from rank 0 actor
+            master_info = ray.get(master_actor.get_master_address.remote())
+            master_addr, master_port = master_info
+            print(f"Master address allocated: {master_addr}:{master_port}")
+            
+            # Create remaining actors with the master address/port
+            for i in range(1, num_train_actors):
+                actor = DistributedGPUActor.remote(i, num_train_actors, master_addr, master_port)
+                train_actors.append(actor)
+            
+            # Initialize process groups for all actors
+            init_tasks = [actor.init_default_process_group.remote() for actor in train_actors]
+            ray.get(init_tasks)
+            
+            # Perform tensor all_reduce on all actors
+            reduce_tasks = [actor.tensor_all_reduce.remote() for actor in train_actors]
+            results = ray.get(reduce_tasks)
+            print(f"All-reduce results: {results}")
+
+            build_ref_model_tasks = [actor.build_ref_model.remote() for actor in train_actors]
+            ray.get(build_ref_model_tasks)
+            print('build ref model done')
+
+            # vllm_tensor_parallel_size = tp_size
+            # num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size
+            # print(f'num_vllm_engines: {num_vllm_engines}')
+            # vllm_engines = create_vllm_engines(
+            #     num_engines=num_vllm_engines,
+            #     tensor_parallel_size=vllm_tensor_parallel_size,
+            #     enforce_eager=True,
+            #     pretrain=pretrain_model_name,
+            #     revision=None,
+            #     seed=1,
+            #     enable_prefix_caching=False,
+            #     max_model_len=2048,
+            # )
+
+            # new_port = ray.get(master_actor.get_free_port.remote())
+            # print(f'new_port: {new_port}')
+            # refs = [
+            #     engine.init_process_group.remote(
+            #         master_addr,
+            #         new_port,
+            #         i * vllm_tensor_parallel_size + 1,
+            #         dist.get_world_size() // 2 + 1,
+            #         'weight-update',
+            #         backend='nccl',
+            #     ) for i, engine in enumerate(vllm_engines)
+            # ]
+            # refs.append(master_actor.init_vllm_process_group.remote(
+            #     backend='nccl',
+            #     master_addr=master_addr,
+            #     master_port=new_port,
+            #     world_size=dist.get_world_size() // 2 + 1,
+            #     rank=0,
+            #     group_name='weight-update',
+            # ))
+            # print(ray.get(refs))
+
+            # refs = [actor.init_model.remote(pretrain_model_name) for actor in train_actors]
+            # ray.get(refs)
+            # print('init model done')
+
+            # ray.get(master_actor.sync_weights.remote(vllm_engines))
+            # print('sync weights done')
+
+            # ref = vllm_engines[0].generate.remote(prompts)
+            # gen_results = ray.get(ref)
+            # for output in gen_results:
+            #     prompt = output.prompt
+            #     generated_text = output.outputs[0].text
+            #     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--tp_size', type=int, default=8)
+    args = parser.parse_args()
+    run(tp_size=args.tp_size)
diff --git a/test_async_better.py b/test_async_better.py
new file mode 100644
index 00000000..d4914d18
--- /dev/null
+++ b/test_async_better.py
@@ -0,0 +1,38 @@
+import asyncio
+import time
+
+program_start_time = time.time()
+
+async def async_function(name: str):
+    print(f"Async function {name} started at {time.time() - program_start_time:.2f}s")
+    await asyncio.sleep(1)
+    print(f"Async function {name} finished at {time.time() - program_start_time:.2f}s")
+
+# Better semaphore pattern
+total_tasks = 4
+concurrent_tasks = 2
+semaphore = asyncio.Semaphore(concurrent_tasks)
+
+async def async_function_with_semaphore_better(name: str):
+    async with semaphore:  # This acquires and releases automatically
+        await async_function(name)
+
+# Alternative explicit version:
+async def async_function_with_semaphore_explicit(name: str):
+    await semaphore.acquire()
+    try:
+        await async_function(name)
+    finally:
+        semaphore.release()
+
+async def run_tasks_better():
+    # Create all tasks at once - semaphore limiting happens inside each task
+    tasks = [
+        asyncio.create_task(async_function_with_semaphore_better(f"Task {i}"))
+        for i in range(total_tasks)
+    ]
+    await asyncio.gather(*tasks)
+
+if __name__ == "__main__":
+    print("Running better pattern:")
+    asyncio.run(run_tasks_better()) 
\ No newline at end of file
diff --git a/test_original_trace.py b/test_original_trace.py
new file mode 100644
index 00000000..b686f867
--- /dev/null
+++ b/test_original_trace.py
@@ -0,0 +1,41 @@
+import asyncio
+import time
+
+program_start_time = time.time()
+
+def log(message: str):
+    print(f"{time.time() - program_start_time:.3f}s: {message}")
+
+async def async_function(name: str):
+    log(f"  → {name} STARTED")
+    await asyncio.sleep(1)
+    log(f"  ← {name} FINISHED")
+
+total_tasks = 4
+concurrent_tasks = 2
+semaphore = asyncio.Semaphore(concurrent_tasks)
+
+async def async_function_with_semaphore(name: str):
+    cor = async_function(name)
+    await asyncio.sleep(1)
+    await cor
+    log(f"  {name} calling semaphore.release()")
+    semaphore.release()
+
+async def run_tasks_with_trace():
+    log("Starting run_tasks()")
+    tasks = []
+    
+    for i in range(total_tasks):
+        log(f"Iteration {i}: About to acquire semaphore")
+        await semaphore.acquire()  # This is where the magic happens!
+        log(f"Iteration {i}: Acquired semaphore, creating task")
+        tasks.append(asyncio.create_task(async_function_with_semaphore(f"Task {i}")))
+        log(f"Iteration {i}: Task created")
+    
+    log("All tasks created, calling gather()")
+    await asyncio.gather(*tasks)
+    log("gather() completed")
+
+if __name__ == "__main__":
+    asyncio.run(run_tasks_with_trace()) 
\ No newline at end of file
diff --git a/vllm_test/test_vllm.py b/vllm_test/test_vllm.py
new file mode 100644
index 00000000..9d40e87a
--- /dev/null
+++ b/vllm_test/test_vllm.py
@@ -0,0 +1,32 @@
+import torch
+from vllm import LLM, SamplingParams
+from transformers import AutoModelForCausalLM
+
+def load_weights(self, weights: list[tuple[str, torch.Tensor]]):
+    self.model_runner.model.load_weights( # type: ignore
+        weights=weights,
+    )
+    
+if __name__ == '__main__':
+    prompts = [
+        "what is RAY?",
+        "what is vLLM?",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    model_name = "facebook/opt-125m"
+    print(f'loading model {model_name}...')
+    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype='auto')
+    print('load model done')
+    llm = LLM(model=model_name)
+    for name, p in model.named_parameters():
+        llm.collective_rpc(
+            load_weights,
+            args=([(name, p)],),
+        )
+    print('load weights done')
+    outputs = llm.generate(prompts, sampling_params)
+
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

From da377bddc99a1c4ba2c3ca9c4d5429ea86037aea Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 22 Jul 2025 20:43:29 +0000
Subject: [PATCH 063/107] clean up dir

---
 check_gpu_setup.py                            | 132 ------
 ray_test/RAY_GETTING_STARTED.md               | 324 --------------
 ray_test/RAY_GPU_EXAMPLES.md                  | 202 ---------
 ray_test/ray_distributed_simulation.py        | 404 ------------------
 ray_test/ray_gpu_basic.py                     |  57 ---
 ray_test/ray_gpu_patterns.py                  | 144 -------
 ray_test/ray_learning_guide.py                | 274 ------------
 ray_test/ray_scheduling_demo.py               | 224 ----------
 ray_test/ray_single_server_multi_gpu.py       | 278 ------------
 ray_test/test_ray.py                          |  83 ----
 ray_test/test_ray_chain.py                    |  51 ---
 ray_test/test_ray_distributed.py              |  65 ---
 ray_test/test_ray_init.py                     |  29 --
 ray_test/test_socket.py                       |  45 --
 ray_test/test_torch_ray_distributed.py        | 342 ---------------
 test_async_better.py                          |  38 --
 test_original_trace.py                        |  41 --
 .../test_torch_ray_distributed.py             |   0
 18 files changed, 2733 deletions(-)
 delete mode 100644 check_gpu_setup.py
 delete mode 100644 ray_test/RAY_GETTING_STARTED.md
 delete mode 100644 ray_test/RAY_GPU_EXAMPLES.md
 delete mode 100644 ray_test/ray_distributed_simulation.py
 delete mode 100644 ray_test/ray_gpu_basic.py
 delete mode 100644 ray_test/ray_gpu_patterns.py
 delete mode 100644 ray_test/ray_learning_guide.py
 delete mode 100644 ray_test/ray_scheduling_demo.py
 delete mode 100644 ray_test/ray_single_server_multi_gpu.py
 delete mode 100644 ray_test/test_ray.py
 delete mode 100644 ray_test/test_ray_chain.py
 delete mode 100644 ray_test/test_ray_distributed.py
 delete mode 100644 ray_test/test_ray_init.py
 delete mode 100644 ray_test/test_socket.py
 delete mode 100644 ray_test/test_torch_ray_distributed.py
 delete mode 100644 test_async_better.py
 delete mode 100644 test_original_trace.py
 rename test_torch_ray_distributed.py => tests/test_torch_ray_distributed.py (100%)

diff --git a/check_gpu_setup.py b/check_gpu_setup.py
deleted file mode 100644
index bdc92090..00000000
--- a/check_gpu_setup.py
+++ /dev/null
@@ -1,132 +0,0 @@
-#!/usr/bin/env python3
-"""
-GPU Setup Checker
-
-Run this script first to verify your GPU setup is working correctly with Ray.
-"""
-
-import ray
-import torch
-import subprocess
-import sys
-
-def check_cuda_available():
-    """Check if CUDA is available in PyTorch."""
-    print("=== CUDA Check ===")
-    print(f"PyTorch version: {torch.__version__}")
-    print(f"CUDA available: {torch.cuda.is_available()}")
-    
-    if torch.cuda.is_available():
-        print(f"CUDA version: {torch.version.cuda}")
-        print(f"Number of GPUs: {torch.cuda.device_count()}")
-        
-        for i in range(torch.cuda.device_count()):
-            gpu_name = torch.cuda.get_device_name(i)
-            print(f"  GPU {i}: {gpu_name}")
-    else:
-        print("❌ CUDA not available! Check your PyTorch installation.")
-        return False
-    
-    return True
-
-def check_nvidia_smi():
-    """Check if nvidia-smi is available."""
-    print("\n=== nvidia-smi Check ===")
-    try:
-        result = subprocess.run(['nvidia-smi', '--query-gpu=index,name,memory.total', '--format=csv'], 
-                              capture_output=True, text=True, check=True)
-        print("nvidia-smi output:")
-        print(result.stdout)
-        return True
-    except (subprocess.CalledProcessError, FileNotFoundError):
-        print("❌ nvidia-smi not found or failed")
-        return False
-
-def check_ray_gpu_detection():
-    """Check if Ray can detect GPUs."""
-    print("\n=== Ray GPU Detection ===")
-    
-    try:
-        ray.init()
-        
-        resources = ray.cluster_resources()
-        print(f"Ray cluster resources: {resources}")
-        
-        gpu_count = resources.get("GPU", 0)
-        if gpu_count >= 2:
-            print(f"✅ Ray detected {gpu_count} GPUs")
-        elif gpu_count == 1:
-            print(f"⚠️  Ray detected only {gpu_count} GPU (expected 2)")
-        else:
-            print("❌ Ray detected no GPUs")
-            
-        ray.shutdown()
-        return gpu_count >= 2
-        
-    except Exception as e:
-        print(f"❌ Ray initialization failed: {e}")
-        return False
-
-def run_simple_gpu_test():
-    """Run a simple GPU test to verify everything works."""
-    print("\n=== Simple GPU Test ===")
-    
-    if not torch.cuda.is_available():
-        print("❌ Skipping GPU test - CUDA not available")
-        return False
-    
-    try:
-        # Test each GPU
-        for gpu_id in range(torch.cuda.device_count()):
-            device = torch.device(f"cuda:{gpu_id}")
-            x = torch.randn(100, 100, device=device)
-            y = torch.mm(x, x.T)
-            print(f"✅ GPU {gpu_id} test passed - tensor shape: {y.shape} on {device}")
-        
-        return True
-        
-    except Exception as e:
-        print(f"❌ GPU test failed: {e}")
-        return False
-
-def main():
-    """Run all checks."""
-    print("Ray + GPU Setup Checker")
-    print("=" * 40)
-    
-    checks = [
-        ("CUDA/PyTorch", check_cuda_available),
-        ("nvidia-smi", check_nvidia_smi),
-        ("Ray GPU Detection", check_ray_gpu_detection),
-        ("Simple GPU Test", run_simple_gpu_test),
-    ]
-    
-    results = []
-    for name, check_func in checks:
-        try:
-            result = check_func()
-            results.append((name, result))
-        except Exception as e:
-            print(f"❌ {name} check failed with error: {e}")
-            results.append((name, False))
-    
-    print("\n" + "=" * 40)
-    print("SUMMARY:")
-    all_passed = True
-    for name, passed in results:
-        status = "✅ PASS" if passed else "❌ FAIL"
-        print(f"  {name}: {status}")
-        if not passed:
-            all_passed = False
-    
-    if all_passed:
-        print("\n🎉 All checks passed! You're ready to use Ray with GPUs.")
-        print("Try running: python ray_gpu_basic.py")
-    else:
-        print("\n⚠️  Some checks failed. Please fix the issues above before proceeding.")
-    
-    return all_passed
-
-if __name__ == "__main__":
-    success = main()
-    sys.exit(0 if success else 1) 
\ No newline at end of file
diff --git a/ray_test/RAY_GETTING_STARTED.md b/ray_test/RAY_GETTING_STARTED.md
deleted file mode 100644
index 17b4085d..00000000
--- a/ray_test/RAY_GETTING_STARTED.md
+++ /dev/null
@@ -1,324 +0,0 @@
-# Ray GPU Management - Complete Learning Guide
-
-Welcome to Ray GPU management! This guide provides everything you need to learn Ray from scratch, with hands-on examples for both single-server multi-GPU setups and distributed computing simulation.
-
-## 🎯 Learning Path Overview
-
-**You are here:** Complete beginner → Ray GPU expert
-
-```
-Step 1: Setup & Verification
-    ↓
-Step 2: Interactive Learning (Basics)
-    ↓
-Step 3: Single Server Multi-GPU Patterns
-    ↓
-Step 4: Distributed Simulation
-    ↓
-Step 5: Real-World Applications
-```
-
-## 📋 Prerequisites
-
-- ✅ Linux system with NVIDIA GPUs
-- ✅ CUDA toolkit installed
-- ✅ PyTorch with CUDA support
-- ✅ Ray installed (`pip install ray[default]`)
-
-## 🚀 Quick Start (3 Commands)
-
-```bash
-# 1. Verify your setup
-python check_gpu_setup.py
-
-# 2. Learn interactively
-python ray_learning_guide.py
-
-# 3. Try advanced patterns
-python ray_single_server_multi_gpu.py
-```
-
-## 📚 Detailed Learning Steps
-
-### Step 1: Verify Your Setup
-
-First, ensure everything is working:
-
-```bash
-python check_gpu_setup.py
-```
-
-This checks:
-- ✅ CUDA availability in PyTorch
-- ✅ nvidia-smi functionality
-- ✅ Ray GPU detection
-- ✅ Basic GPU operations
-
-**Expected output:** All checks should pass with "🎉 All checks passed!"
-
-### Step 2: Interactive Learning (START HERE!)
-
-Perfect for complete beginners:
-
-```bash
-python ray_learning_guide.py
-```
-
-**What you'll learn:**
-- Ray basic concepts (remote functions, actors)
-- GPU resource allocation (full vs fractional)
-- Tasks vs Actors differences
-- Resource monitoring
-
-**Duration:** 10-15 minutes (interactive)
-
-### Step 3: Single Server Multi-GPU Patterns
-
-Advanced patterns on your single server with 2 GPUs:
-
-```bash
-python ray_single_server_multi_gpu.py
-```
-
-**What you'll see:**
-- 🚀 GPU Actors (long-lived workers)
-- 🔄 Fractional GPU allocation (0.5 GPU per task)
-- 🔀 Mixed CPU/GPU workloads
-- ⚡ Dynamic scheduling patterns
-
-**Duration:** 5-10 minutes (automated demos)
-
-### Step 4: Distributed Simulation
-
-Simulate multi-server setup on localhost:
-
-```bash
-python ray_distributed_simulation.py
-```
-
-**What you'll learn:**
-- Starting Ray head and worker nodes
-- Connecting to distributed clusters
-- Task distribution across nodes
-- Node-level resource management
-
-**Note:** This simulates "Server 1" (head + GPUs) and "Server 2" (worker + CPUs)
-
-## 🔍 Monitoring Your Ray Cluster
-
-### During Learning
-
-While running examples, monitor GPU usage:
-
-```bash
-# Terminal 1: Run your Ray script
-python ray_learning_guide.py
-
-# Terminal 2: Monitor GPUs
-watch -n 1 nvidia-smi
-```
-
-### Ray Dashboard
-
-When Ray is running, visit the dashboard:
-```
-http://localhost:8265
-```
-
-Shows:
-- 📊 Resource utilization
-- 🎯 Task execution timeline
-- 🖥️  Node status and health
-- 📈 Performance metrics
-
-## 📖 Core Concepts Reference
-
-### GPU Resource Allocation
-
-```python
-# Full GPU (exclusive access)
-@ray.remote(num_gpus=1)
-def gpu_task():
-    pass
-
-# Fractional GPU (shared access)
-@ray.remote(num_gpus=0.5)  # 2 tasks per GPU
-def light_gpu_task():
-    pass
-```
-
-### Tasks vs Actors
-
-```python
-# TASK: Stateless function
-@ray.remote(num_gpus=1)
-def process_data(data):
-    return result
-
-# ACTOR: Stateful class
-@ray.remote(num_gpus=1)
-class DataProcessor:
-    def __init__(self):
-        self.model = load_model()
-
-    def process(self, data):
-        return self.model(data)
-```
-
-### Key Ray Functions
-
-```python
-# Submit work
-future = task.remote(data)
-actor = Actor.remote()
-
-# Get results
-result = ray.get(future)
-results = ray.get([future1, future2])
-
-# Monitor resources
-ray.cluster_resources()    # Total
-ray.available_resources()  # Available now
-```
-
-## 🛠️ Common Patterns
-
-### Pattern 1: Parallel GPU Processing
-```python
-@ray.remote(num_gpus=1)
-def train_model(config):
-    # Your GPU training code
-    pass
-
-# Train multiple models in parallel
-configs = [config1, config2]
-futures = [train_model.remote(c) for c in configs]
-results = ray.get(futures)
-```
-
-### Pattern 2: Mixed Workloads
-```python
-# Mix CPU preprocessing with GPU training
-cpu_tasks = [preprocess.remote(data) for data in dataset]
-processed_data = ray.get(cpu_tasks)
-
-gpu_tasks = [train.remote(data) for data in processed_data]
-models = ray.get(gpu_tasks)
-```
-
-### Pattern 3: Pipeline Processing
-```python
-@ray.remote(num_cpus=1)
-def preprocess(data):
-    return cleaned_data
-
-@ray.remote(num_gpus=0.5)
-def inference(data):
-    return predictions
-
-# Pipeline: preprocess → inference
-for data_batch in dataset:
-    clean_data = preprocess.remote(data_batch)
-    predictions = inference.remote(clean_data)
-    results.append(predictions)
-```
-
-## 🚨 Troubleshooting
-
-### Common Issues & Solutions
-
-**Issue:** Ray doesn't detect GPUs
-```bash
-# Solution: Force GPU detection
-ray.init(num_gpus=2)
-
-# Or check GPU visibility
-nvidia-smi
-```
-
-**Issue:** CUDA out of memory
-```bash
-# Solution: Use fractional GPUs
-@ray.remote(num_gpus=0.5)  # Instead of 1.0
-
-# Or reduce tensor sizes
-x = torch.randn(1000, 1000)  # Instead of 5000x5000
-```
-
-**Issue:** Tasks not running in parallel
-```python
-# Solution: Check available resources
-print(ray.available_resources())
-
-# Don't block waiting for results too early
-futures = [task.remote(i) for i in range(10)]
-# Do other work here...
-results = ray.get(futures)  # Wait at the end
-```
-
-**Issue:** Ray processes hanging
-```bash
-# Solution: Clean shutdown
-ray stop --force
-pkill -f ray
-```
-
-## 🎯 What's Next?
-
-After completing this guide, explore:
-
-### Advanced Ray Features
-- **Ray Tune:** Hyperparameter optimization
-- **Ray Train:** Distributed training
-- **Ray Serve:** Model serving
-- **Ray Data:** Large-scale data processing
-
-### Real Distributed Setup
-Once comfortable with localhost simulation:
-
-```bash
-# Server 1 (head node)
-ray start --head --port=10001 --num-gpus=2
-
-# Server 2 (worker node)
-ray start --address=192.168.1.100:10001 --num-gpus=1
-```
-
-### Production Considerations
-- Resource management policies
-- Fault tolerance and recovery
-- Monitoring and logging
-- Auto-scaling strategies
-
-## 📁 Files in This Learning Package
-
-| File | Purpose | When to Use |
-|------|---------|-------------|
-| `check_gpu_setup.py` | Verify system setup | **Start here** - before anything else |
-| `ray_learning_guide.py` | Interactive beginner tutorial | **Step 2** - core concepts |
-| `ray_single_server_multi_gpu.py` | Advanced single-server patterns | **Step 3** - practical patterns |
-| `ray_distributed_simulation.py` | Localhost distributed simulation | **Step 4** - distributed concepts |
-| `ray_gpu_basic.py` | Simple working example | Reference/quick test |
-| `RAY_GPU_EXAMPLES.md` | Original documentation | Additional reference |
-
-## 🎉 Success Metrics
-
-You'll know you've mastered Ray GPU management when you can:
-
-✅ Set up Ray clusters (single and distributed)
-✅ Choose between tasks and actors appropriately
-✅ Allocate GPU resources efficiently (full vs fractional)
-✅ Monitor and debug resource usage
-✅ Design efficient parallel workflows
-✅ Handle mixed CPU/GPU workloads
-
-## 🆘 Getting Help
-
-- 📖 [Official Ray Documentation](https://docs.ray.io/)
-- 💬 [Ray Discourse Forum](https://discuss.ray.io/)
-- 🐛 [Ray GitHub Issues](https://github.com/ray-project/ray/issues)
-- 📺 [Ray YouTube Tutorials](https://www.youtube.com/c/RayProjectIO)
-
----
-
-**Happy learning!** 🚀 Start with `python ray_learning_guide.py` and work your way through the examples.
diff --git a/ray_test/RAY_GPU_EXAMPLES.md b/ray_test/RAY_GPU_EXAMPLES.md
deleted file mode 100644
index 02845bb5..00000000
--- a/ray_test/RAY_GPU_EXAMPLES.md
+++ /dev/null
@@ -1,202 +0,0 @@
-# Ray GPU Management Examples
-
-This directory contains examples to help you learn Ray for GPU workload management on a single server with 2 GPUs.
-
-## Files Overview
-
-1. **`check_gpu_setup.py`** - Verify your GPU setup
-2. **`ray_gpu_basic.py`** - Minimal Ray GPU example
-3. **`ray_gpu_patterns.py`** - Advanced GPU management patterns
-4. **`test_ray.py`** - Your existing comprehensive example
-
-## Getting Started
-
-### Step 1: Check Your Setup
-
-Before running any examples, verify your GPU setup:
-
-```bash
-python check_gpu_setup.py
-```
-
-This will check:
-- CUDA availability in PyTorch
-- nvidia-smi functionality
-- Ray GPU detection
-- Basic GPU operations
-
-### Step 2: Run the Basic Example
-
-Start with the simplest example:
-
-```bash
-python ray_gpu_basic.py
-```
-
-This demonstrates:
-- Ray initialization
-- Basic GPU task creation
-- Resource allocation
-- Simple parallel execution
-
-### Step 3: Try Advanced Patterns
-
-Explore more sophisticated GPU management:
-
-```bash
-python ray_gpu_patterns.py
-```
-
-This shows:
-- Fractional GPU allocation (0.5 GPU per task)
-- Mixed CPU/GPU workloads
-- Resource monitoring
-- Dynamic scheduling
-
-### Step 4: Study the Complete Example
-
-Your existing `test_ray.py` provides a comprehensive example with:
-- Detailed GPU assignment tracking
-- Resource visualization
-- Error handling
-- Best practices
-
-## Key Ray GPU Concepts
-
-### 1. GPU Resource Allocation
-
-```python
-# Request 1 full GPU
-@ray.remote(num_gpus=1)
-def gpu_task():
-    pass
-
-# Request 0.5 GPU (allows 2 tasks per GPU)
-@ray.remote(num_gpus=0.5)
-def light_gpu_task():
-    pass
-```
-
-### 2. GPU Assignment
-
-Ray automatically:
-- Sets `CUDA_VISIBLE_DEVICES` for each task
-- Manages GPU memory isolation
-- Schedules tasks based on available GPUs
-
-```python
-# Inside a Ray task
-gpu_ids = ray.get_gpu_ids()  # Get assigned GPU IDs
-device = torch.device("cuda")  # PyTorch sees only assigned GPUs
-```
-
-### 3. Resource Monitoring
-
-```python
-# Check available resources
-ray.cluster_resources()    # Total resources
-ray.available_resources()  # Currently available
-```
-
-## Common Patterns
-
-### Pattern 1: Parallel GPU Tasks
-```python
-# Launch multiple tasks in parallel
-tasks = [gpu_task.remote(i) for i in range(4)]
-results = ray.get(tasks)  # Wait for all to complete
-```
-
-### Pattern 2: Mixed Workloads
-```python
-# CPU and GPU tasks running together
-cpu_tasks = [cpu_task.remote(i) for i in range(4)]
-gpu_tasks = [gpu_task.remote(i) for i in range(2)]
-all_results = ray.get(cpu_tasks + gpu_tasks)
-```
-
-### Pattern 3: Dynamic Scheduling
-```python
-# Submit tasks as resources become available
-futures = []
-for i in range(10):
-    future = gpu_task.remote(i)
-    futures.append(future)
-    if len(futures) >= 2:  # Don't overwhelm the queue
-        ray.get(futures[:1])  # Wait for one to complete
-        futures = futures[1:]
-```
-
-## Monitoring GPU Usage
-
-While running examples, monitor GPU usage:
-
-```bash
-# In another terminal
-watch -n 1 nvidia-smi
-```
-
-You should see:
-- GPU utilization changes as tasks start/finish
-- Memory allocation per GPU
-- Process assignments
-
-## Troubleshooting
-
-### Common Issues
-
-1. **Ray doesn't detect GPUs**
-   - Check `nvidia-smi` works
-   - Verify CUDA installation
-   - Try `ray.init(num_gpus=2)` to force detection
-
-2. **CUDA out of memory**
-   - Reduce tensor sizes in examples
-   - Use fractional GPU allocation
-   - Monitor memory with `nvidia-smi`
-
-3. **Tasks not running in parallel**
-   - Check available resources with `ray.available_resources()`
-   - Verify you have enough GPUs for your tasks
-   - Use `ray.get()` wisely to avoid blocking
-
-### Debug Tips
-
-```python
-# Check Ray status
-ray.cluster_resources()
-ray.available_resources()
-
-# Monitor task execution
-import time
-start = time.time()
-results = ray.get(tasks)
-print(f"Execution time: {time.time() - start:.2f}s")
-```
-
-## Next Steps
-
-1. **Experiment** with different GPU allocations (0.25, 0.5, 1.0)
-2. **Try** mixing CPU and GPU tasks
-3. **Monitor** resource usage patterns
-4. **Scale up** to more complex workloads
-5. **Learn** about Ray Tune for hyperparameter optimization
-6. **Explore** Ray Train for distributed training
-
-## Useful Commands
-
-```bash
-# Check GPU status
-nvidia-smi
-
-# Monitor Ray cluster
-ray status
-
-# Ray dashboard (if enabled)
-ray dashboard
-
-# Kill Ray processes
-ray stop
-```
-
-Happy learning with Ray! 🚀
diff --git a/ray_test/ray_distributed_simulation.py b/ray_test/ray_distributed_simulation.py
deleted file mode 100644
index baf9f343..00000000
--- a/ray_test/ray_distributed_simulation.py
+++ /dev/null
@@ -1,404 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2024 MosaicML ComposeRL authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""
-Ray Distributed Setup Simulation
-
-This example shows how to simulate a distributed Ray cluster on localhost.
-We'll create multiple Ray nodes on the same machine to simulate a multi-server setup.
-"""
-
-import os
-import subprocess
-import sys
-import time
-from typing import Any
-
-import psutil
-import ray
-import torch
-
-# Configuration
-HEAD_PORT = 10001
-WORKER_PORT_START = 10002
-REDIS_PASSWORD = 'ray_demo_password'
-
-
-class RayClusterManager:
-    """Manages a simulated distributed Ray cluster on localhost."""
-
-    def __init__(self):
-        self.head_process = None
-        self.worker_processes = []
-        self.head_address = None
-
-    def start_head_node(self, num_gpus: int = 2, num_cpus: int = 8) -> str:
-        """Start the head node."""
-        print('🚀 Starting Ray head node...')
-
-        # Kill any existing Ray processes
-        self._cleanup_existing_ray()
-
-        head_cmd = [
-            'ray',
-            'start',
-            '--head',
-            f"--port={HEAD_PORT}",
-            f"--num-gpus={num_gpus}",
-            f"--num-cpus={num_cpus}",
-            f"--redis-password={REDIS_PASSWORD}",
-            '--include-dashboard=true',
-            '--dashboard-port=8265',
-        ]
-
-        print(f"Command: {' '.join(head_cmd)}")
-
-        # Start head node
-        self.head_process = subprocess.Popen(
-            head_cmd,
-            stdout=sys.stdout,
-            stderr=sys.stderr,
-            text=True,
-        )
-
-        # Wait a bit for head to start
-        time.sleep(3)
-
-        self.head_address = f"ray://127.0.0.1:{HEAD_PORT}"
-        print(f"✅ Head node started at {self.head_address}")
-
-        return self.head_address
-
-    def add_worker_node(
-        self, node_id: int, num_gpus: int = 0, num_cpus: int = 4
-    ) -> bool:
-        """Add a worker node to the cluster."""
-        print(f"🔧 Adding worker node {node_id}...")
-
-        worker_cmd = [
-            'ray',
-            'start',
-            f"--address={self.head_address}",
-            f"--num-gpus={num_gpus}",
-            f"--num-cpus={num_cpus}",
-            f"--redis-password={REDIS_PASSWORD}",
-        ]
-
-        print(f"Command: {' '.join(worker_cmd)}")
-
-        worker_process = subprocess.Popen(
-            worker_cmd,
-            stdout=sys.stdout,
-            stderr=sys.stderr,
-            text=True,
-        )
-
-        self.worker_processes.append(worker_process)
-
-        # Wait for worker to connect
-        time.sleep(2)
-
-        print(f"✅ Worker node {node_id} added")
-        return True
-
-    def _cleanup_existing_ray(self):
-        """Clean up any existing Ray processes."""
-        try:
-            subprocess.run(['ray', 'stop', '--force'],
-                           capture_output=True,
-                           timeout=10)
-            time.sleep(1)
-        except:
-            pass
-
-    def shutdown(self):
-        """Shutdown the entire cluster."""
-        print('🛑 Shutting down Ray cluster...')
-
-        # Stop all processes
-        try:
-            subprocess.run(['ray', 'stop', '--force'],
-                           capture_output=True,
-                           timeout=10)
-        except:
-            pass
-
-        # Kill processes if still running
-        if self.head_process:
-            self.head_process.terminate()
-
-        for worker in self.worker_processes:
-            worker.terminate()
-
-        print('✅ Cluster shutdown complete')
-
-
-@ray.remote(num_gpus=1)
-class DistributedGPUWorker:
-    """A distributed GPU worker that reports its location."""
-
-    def __init__(self, worker_id: str):
-        self.worker_id = worker_id
-        self.node_id = ray.get_runtime_context().get_node_id()
-        self.gpu_ids = ray.get_gpu_ids()
-        self.hostname = os.uname().nodename
-
-    def get_worker_info(self) -> dict[str, Any]:
-        """Get information about this worker."""
-        return {
-            'worker_id':
-                self.worker_id,
-            'node_id':
-                self.node_id,
-            'hostname':
-                self.hostname,
-            'gpu_ids':
-                self.gpu_ids,
-            'cuda_visible_devices':
-                os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set'),
-        }
-
-    def distributed_computation(self,
-                                matrix_size: int = 1000) -> dict[str, Any]:
-        """Perform computation and return node information."""
-        start_time = time.time()
-
-        # GPU computation
-        device = torch.device('cuda')
-        A = torch.randn(matrix_size, matrix_size, device=device)
-        B = torch.randn(matrix_size, matrix_size, device=device)
-        C = torch.mm(A, B)
-        result = torch.trace(C).item()
-
-        execution_time = time.time() - start_time
-
-        return {
-            'worker_id': self.worker_id,
-            'node_id': self.node_id,
-            'hostname': self.hostname,
-            'gpu_ids': self.gpu_ids,
-            'result': result,
-            'execution_time': execution_time,
-            'matrix_size': matrix_size,
-        }
-
-
-@ray.remote(num_cpus=1)
-def distributed_cpu_task(task_id: int) -> dict[str, Any]:
-    """A CPU task that reports which node it's running on."""
-    import numpy as np
-
-    start_time = time.time()
-    node_id = ray.get_runtime_context().get_node_id()
-    hostname = os.uname().nodename
-
-    # CPU computation
-    result = np.sum(np.random.randn(500, 500)**2)
-
-    execution_time = time.time() - start_time
-
-    return {
-        'task_id': task_id,
-        'node_id': node_id,
-        'hostname': hostname,
-        'result': result,
-        'execution_time': execution_time,
-        'resource_type': 'CPU',
-    }
-
-
-def demonstrate_cluster_info():
-    """Show cluster information and resource distribution."""
-    print('\n📊 CLUSTER INFORMATION')
-    print('=' * 50)
-
-    # Get cluster resources
-    cluster_resources = ray.cluster_resources()
-    available_resources = ray.available_resources()
-
-    print('Total Cluster Resources:')
-    for resource, amount in cluster_resources.items():
-        print(f"  {resource}: {amount}")
-
-    print('\nAvailable Resources:')
-    for resource, amount in available_resources.items():
-        print(f"  {resource}: {amount}")
-
-    # Get node information
-    print('\nNodes in Cluster:')
-    nodes = ray.nodes()
-    for i, node in enumerate(nodes):
-        print(f"  Node {i+1}:")
-        print(f"    ID: {node['NodeID']}")
-        print(f"    Alive: {node['Alive']}")
-        print(f"    Resources: {node['Resources']}")
-
-
-def demonstrate_distributed_gpu_work():
-    """Demonstrate distributed GPU work across simulated nodes."""
-    print('\n🖥️  DEMO: Distributed GPU Work')
-    print('-' * 50)
-
-    # Create GPU workers
-    workers = [DistributedGPUWorker.remote(f"gpu_worker_{i}") for i in range(2)]
-
-    # Get worker information
-    print('Created GPU workers:')
-    worker_info_futures = [
-        worker.get_worker_info.remote() for worker in workers
-    ]
-    worker_infos = ray.get(worker_info_futures)
-
-    for info in worker_infos:
-        print(
-            f"  {info['worker_id']}: Node {info['node_id'][:8]}, GPU {info['gpu_ids']}"
-        )
-
-    # Submit distributed computation
-    print('\nSubmitting distributed GPU computations...')
-    computation_futures = [
-        worker.distributed_computation.remote(matrix_size=1200)
-        for worker in workers
-    ]
-
-    results = ray.get(computation_futures)
-
-    print('Results:')
-    for result in results:
-        print(
-            f"  {result['worker_id']}: "
-            f"Node {result['node_id'][:8]}, "
-            f"GPU {result['gpu_ids']}, "
-            f"Result: {result['result']:.2f}, "
-            f"Time: {result['execution_time']:.2f}s"
-        )
-
-
-def demonstrate_mixed_distributed_work():
-    """Demonstrate mixed CPU/GPU work across nodes."""
-    print('\n🔄 DEMO: Mixed Distributed Workload')
-    print('-' * 50)
-
-    # Submit a mix of CPU and GPU tasks
-    cpu_tasks = [distributed_cpu_task.remote(i) for i in range(4)]
-
-    # Create lightweight GPU tasks
-    @ray.remote(num_gpus=0.5)
-    def light_gpu_task(task_id: int):
-        node_id = ray.get_runtime_context().get_node_id()
-        gpu_ids = ray.get_gpu_ids()
-
-        device = torch.device('cuda')
-        x = torch.randn(500, 500, device=device)
-        result = torch.sum(x * x).item()
-
-        return {
-            'task_id': task_id,
-            'node_id': node_id,
-            'gpu_ids': gpu_ids,
-            'result': result,
-        }
-
-    gpu_tasks = [light_gpu_task.remote(i + 10) for i in range(3)]
-
-    all_tasks = cpu_tasks + gpu_tasks
-    print(
-        f"Submitted {len(cpu_tasks)} CPU tasks and {len(gpu_tasks)} GPU tasks"
-    )
-
-    start_time = time.time()
-    results = ray.get(all_tasks)
-    total_time = time.time() - start_time
-
-    print(f"All tasks completed in {total_time:.2f}s")
-
-    # Group results by node
-    node_results = {}
-    for result in results:
-        node_id = result['node_id'][:8]  # Short node ID
-        if node_id not in node_results:
-            node_results[node_id] = []
-        node_results[node_id].append(result)
-
-    print('\nResults by Node:')
-    for node_id, node_tasks in node_results.items():
-        print(f"  Node {node_id}: {len(node_tasks)} tasks")
-
-
-def simulate_two_server_setup():
-    """Simulate a two-server setup using localhost."""
-    print('\n🌐 SIMULATING TWO-SERVER SETUP')
-    print('=' * 60)
-    print('This simulates Server 1 (head + GPU) and Server 2 (worker + CPU)')
-
-    cluster_manager = RayClusterManager()
-
-    try:
-        # Start head node (simulates Server 1 with GPUs)
-        head_address = cluster_manager.start_head_node(num_gpus=2, num_cpus=4)
-
-        # Connect Ray client
-        print(f"\n🔗 Connecting to distributed cluster at {head_address}")
-        ray.init(address=head_address, _redis_password=REDIS_PASSWORD)
-
-        demonstrate_cluster_info()
-
-        # Add worker node (simulates Server 2 with only CPUs)
-        cluster_manager.add_worker_node(node_id=1, num_gpus=0, num_cpus=6)
-
-        # Demonstrate distributed functionality
-        demonstrate_cluster_info()
-        demonstrate_distributed_gpu_work()
-        demonstrate_mixed_distributed_work()
-
-        print('\n✨ Distributed simulation completed successfully!')
-
-    except Exception as e:
-        print(f"❌ Error in distributed simulation: {e}")
-
-    finally:
-        try:
-            ray.shutdown()
-        except:
-            pass
-        cluster_manager.shutdown()
-
-
-def main():
-    """Main function to demonstrate distributed Ray setup."""
-    print('🎯 Ray Distributed Setup Simulation')
-    print('=' * 60)
-    print(
-        'This example simulates a distributed Ray cluster on a single machine'
-    )
-    print('to help you understand distributed Ray concepts.')
-
-    # Check if Ray is already running
-    try:
-        ray.init(address='auto')
-        print('⚠️  Ray is already running. Shutting down first...')
-        ray.shutdown()
-        time.sleep(2)
-    except:
-        pass
-
-    simulate_two_server_setup()
-
-    print('\n📚 What you learned:')
-    print('1. How to start Ray head and worker nodes')
-    print('2. How to connect to a distributed Ray cluster')
-    print('3. How tasks are distributed across nodes')
-    print('4. How to monitor cluster resources and node distribution')
-    print('5. How GPU and CPU resources are managed in a distributed setup')
-
-    print('\n🚀 Next steps:')
-    print('- Try this on actual multiple servers')
-    print('- Experiment with different resource configurations')
-    print('- Use Ray Tune for distributed hyperparameter tuning')
-    print('- Explore Ray Train for distributed training')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/ray_test/ray_gpu_basic.py b/ray_test/ray_gpu_basic.py
deleted file mode 100644
index 8c0aa10e..00000000
--- a/ray_test/ray_gpu_basic.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2024 MosaicML ComposeRL authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""
-Ray GPU Management - Basic Example
-
-This is the simplest possible example of using Ray to manage GPU workloads.
-Perfect for someone new to Ray who wants to understand the core concepts.
-"""
-
-import os
-import time
-
-import ray
-import torch
-
-
-@ray.remote(num_gpus=1)
-def simple_gpu_task(task_id: int):
-    """A minimal GPU task that just creates a tensor and does basic operations."""
-
-    # Ray automatically manages which GPU this task gets
-    gpu_ids = ray.get_gpu_ids()
-    print(f"Task {task_id}: Using GPU {gpu_ids[0]}")
-
-    # Create a tensor on the GPU
-    device = torch.device('cuda')
-    x = torch.randn(1000, 1000, device=device)
-
-    # Do some computation
-    for i in range(3):
-        x = x * 2
-        time.sleep(0.5)  # Simulate work
-        print(f"  Step {i+1}: tensor shape {x.shape}")
-
-    return f"Task {task_id} completed on GPU {gpu_ids[0]}"
-
-
-if __name__ == '__main__':
-    # print current pic
-    print(f"Current process ID: {os.getpid()}")
-
-    # Initialize Ray
-    ray.init()
-
-    print('Available resources:', ray.cluster_resources())
-
-    # Launch 2 tasks (one per GPU)
-    tasks = [simple_gpu_task.remote(i) for i in range(2)]
-
-    # Wait for results
-    results = ray.get(tasks)
-
-    print('Results:', results)
-    ray.shutdown()
diff --git a/ray_test/ray_gpu_patterns.py b/ray_test/ray_gpu_patterns.py
deleted file mode 100644
index c3119378..00000000
--- a/ray_test/ray_gpu_patterns.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2024 MosaicML ComposeRL authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""
-Ray GPU Management - Advanced Patterns
-
-This example demonstrates different GPU management patterns in Ray:
-1. Fractional GPU allocation
-2. Dynamic task scheduling
-3. Resource monitoring
-4. Error handling
-"""
-
-import time
-
-import psutil
-import ray
-import torch
-
-
-# Pattern 1: Fractional GPU usage (0.5 GPU per task)
-@ray.remote(num_gpus=0.5)
-def light_gpu_task(task_id: int):
-    """Task that only needs half a GPU - allows 4 tasks on 2 GPUs."""
-    gpu_ids = ray.get_gpu_ids()
-    device = torch.device('cuda')
-
-    print(f"Light task {task_id}: Using GPU fraction on {gpu_ids}")
-
-    # Lighter computation
-    x = torch.randn(500, 500, device=device)
-    x = torch.mm(x, x.T)
-    time.sleep(1)
-
-    return f"Light task {task_id} done"
-
-
-# Pattern 2: Full GPU usage
-@ray.remote(num_gpus=1)
-def heavy_gpu_task(task_id: int):
-    """Task that needs a full GPU."""
-    gpu_ids = ray.get_gpu_ids()
-    device = torch.device('cuda')
-
-    print(f"Heavy task {task_id}: Using full GPU {gpu_ids[0]}")
-
-    # Heavier computation
-    x = torch.randn(2000, 2000, device=device)
-    for _ in range(5):
-        x = torch.mm(x, x.T)
-    time.sleep(2)
-
-    return f"Heavy task {task_id} done on GPU {gpu_ids[0]}"
-
-
-# Pattern 3: CPU task for comparison
-@ray.remote
-def cpu_task(task_id: int):
-    """Task that runs on CPU only."""
-    print(f"CPU task {task_id}: Running on CPU")
-
-    # CPU computation
-    x = torch.randn(1000, 1000)
-    x = torch.mm(x, x.T)
-    time.sleep(1)
-
-    return f"CPU task {task_id} done"
-
-
-# Pattern 4: Resource monitoring task
-@ray.remote
-def monitor_resources():
-    """Monitor system resources while tasks are running."""
-    resources = ray.cluster_resources()
-    available = ray.available_resources()
-
-    return {
-        'total_gpus': resources.get('GPU', 0),
-        'available_gpus': available.get('GPU', 0),
-        'total_cpus': resources.get('CPU', 0),
-        'available_cpus': available.get('CPU', 0),
-        'memory_usage': psutil.virtual_memory().percent,
-    }
-
-
-def demonstrate_gpu_patterns():
-    """Demonstrate different GPU allocation patterns."""
-
-    print('=== Ray GPU Patterns Demo ===\n')
-
-    # Initialize Ray
-    ray.init()
-
-    # Check available resources
-    print('Initial resources:', ray.cluster_resources())
-    print('Available resources:', ray.available_resources())
-    print()
-
-    # Pattern 1: Run multiple light tasks (fractional GPU)
-    print(
-        '1. Running 4 light tasks (0.5 GPU each) - should run 4 concurrent on 2 GPUs'
-    )
-    light_tasks = [light_gpu_task.remote(i) for i in range(4)]
-
-    # Pattern 2: Run heavy tasks (full GPU)
-    print('2. Running 2 heavy tasks (1 GPU each)')
-    heavy_tasks = [heavy_gpu_task.remote(i) for i in range(2)]
-
-    # Pattern 3: Run CPU tasks alongside
-    print('3. Running CPU tasks in parallel')
-    cpu_tasks = [cpu_task.remote(i) for i in range(3)]
-
-    # Pattern 4: Monitor resources while tasks run
-    monitor_task = monitor_resources.remote()
-
-    # Wait for light tasks
-    print('\nWaiting for light tasks...')
-    light_results = ray.get(light_tasks)
-    print('Light tasks results:', light_results)
-
-    # Check resources mid-execution
-    mid_resources = ray.get(monitor_task)
-    print('Mid-execution resources:', mid_resources)
-
-    # Wait for remaining tasks
-    print('\nWaiting for heavy and CPU tasks...')
-    heavy_results = ray.get(heavy_tasks)
-    cpu_results = ray.get(cpu_tasks)
-
-    print('Heavy tasks results:', heavy_results)
-    print('CPU tasks results:', cpu_results)
-
-    # Final resource check
-    final_monitor = monitor_resources.remote()
-    final_resources = ray.get(final_monitor)
-    print('Final resources:', final_resources)
-
-    ray.shutdown()
-
-
-if __name__ == '__main__':
-    demonstrate_gpu_patterns()
diff --git a/ray_test/ray_learning_guide.py b/ray_test/ray_learning_guide.py
deleted file mode 100644
index 0d12deda..00000000
--- a/ray_test/ray_learning_guide.py
+++ /dev/null
@@ -1,274 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2024 MosaicML ComposeRL authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""
-Ray GPU Learning Guide - Getting Started Script
-
-This script helps beginners understand Ray GPU management concepts
-through interactive examples and clear explanations.
-"""
-
-import time
-
-import ray
-import torch
-
-
-def step_1_basic_concepts():
-    """Step 1: Understanding Ray basic concepts."""
-    print('\n' + '=' * 60)
-    print('🎓 STEP 1: RAY BASIC CONCEPTS')
-    print('=' * 60)
-
-    print(
-        """
-Ray is a distributed computing framework that helps you:
-1. Parallelize your Python code across multiple cores/machines
-2. Manage GPU resources automatically
-3. Scale from single machine to clusters seamlessly
-
-Key concepts:
-- @ray.remote: Decorator to make functions/classes distributed
-- ray.get(): Wait for and retrieve results from remote tasks
-- ray.put(): Store large objects in shared memory
-- Actors: Stateful workers that persist across tasks
-"""
-    )
-
-    # Simple example
-    @ray.remote
-    def simple_task(x):
-        return x * x
-
-    print('Example: Simple remote function')
-    print('@ray.remote')
-    print('def simple_task(x):')
-    print('    return x * x')
-
-    # Execute
-    future = simple_task.remote(5)
-    result = ray.get(future)
-    print(f"\nResult: simple_task.remote(5) = {result}")
-
-
-def step_2_gpu_resource_management():
-    """Step 2: Understanding GPU resource management."""
-    print('\n' + '=' * 60)
-    print('🎮 STEP 2: GPU RESOURCE MANAGEMENT')
-    print('=' * 60)
-
-    print(
-        """
-Ray automatically manages GPU allocation:
-
-1. Full GPU allocation: @ray.remote(num_gpus=1)
-   - Task gets exclusive access to 1 GPU
-   - Ray sets CUDA_VISIBLE_DEVICES automatically
-
-2. Fractional GPU allocation: @ray.remote(num_gpus=0.5)
-   - Multiple tasks can share the same GPU
-   - Useful for lightweight GPU work
-
-3. Ray handles scheduling based on available resources
-"""
-    )
-
-    @ray.remote(num_gpus=1)
-    def gpu_task(task_id):
-        gpu_ids = ray.get_gpu_ids()
-        device = torch.device('cuda')
-        x = torch.randn(100, 100, device=device)
-        return {'task_id': task_id, 'gpu_ids': gpu_ids, 'shape': list(x.shape)}
-
-    print('Example: GPU task')
-    print('@ray.remote(num_gpus=1)')
-    print('def gpu_task(task_id):')
-    print('    gpu_ids = ray.get_gpu_ids()')
-    print("    device = torch.device('cuda')")
-    print('    x = torch.randn(100, 100, device=device)')
-    print("    return {'task_id': task_id, 'gpu_ids': gpu_ids}")
-
-    # Execute on both GPUs
-    tasks = [gpu_task.remote(i) for i in range(2)]
-    results = ray.get(tasks)
-
-    print(f"\nResults from 2 GPU tasks:")
-    for result in results:
-        print(
-            f"  Task {result['task_id']}: GPU {result['gpu_ids']}, Tensor {result['shape']}"
-        )
-
-
-def step_3_actors_vs_tasks():
-    """Step 3: Understanding the difference between actors and tasks."""
-    print('\n' + '=' * 60)
-    print('🎭 STEP 3: ACTORS VS TASKS')
-    print('=' * 60)
-
-    print(
-        """
-Tasks vs Actors:
-
-TASKS (@ray.remote functions):
-- Stateless and lightweight
-- Good for simple computations
-- GPU allocated only during execution
-- No memory between calls
-
-ACTORS (@ray.remote classes):
-- Stateful workers with persistent memory
-- Good for complex workflows
-- GPU held for the lifetime of the actor
-- Can maintain state between method calls
-"""
-    )
-
-    @ray.remote(num_gpus=0.5)
-    class GPUActor:
-
-        def __init__(self):
-            self.gpu_ids = ray.get_gpu_ids()
-            self.device = torch.device('cuda')
-            self.counter = 0
-
-        def process(self, data_size=500):
-            self.counter += 1
-            x = torch.randn(data_size, data_size, device=self.device)
-            y = torch.mm(x, x.T)
-            return {
-                'call_count': self.counter,
-                'gpu_ids': self.gpu_ids,
-                'result': torch.trace(y).item(),
-            }
-
-    print('Example: GPU Actor')
-    print('@ray.remote(num_gpus=0.5)')
-    print('class GPUActor:')
-    print('    def __init__(self):')
-    print('        self.gpu_ids = ray.get_gpu_ids()')
-    print("        self.device = torch.device('cuda')")
-    print('        self.counter = 0')
-
-    # Create actors (4 actors, 2 per GPU with 0.5 GPU each)
-    actors = [GPUActor.remote() for _ in range(4)]
-
-    # Call methods multiple times
-    futures = []
-    for actor in actors:
-        for _ in range(2):  # 2 calls per actor
-            futures.append(actor.process.remote())
-
-    results = ray.get(futures)
-
-    print(f"\nResults from {len(actors)} actors, each called twice:")
-    for i, result in enumerate(results):
-        print(
-            f"  Call {i+1}: GPU {result['gpu_ids']}, Count: {result['call_count']}, Result: {result['result']:.2f}"
-        )
-
-
-def step_4_monitoring_resources():
-    """Step 4: Understanding resource monitoring."""
-    print('\n' + '=' * 60)
-    print('📊 STEP 4: MONITORING RESOURCES')
-    print('=' * 60)
-
-    print(
-        """
-Ray provides several ways to monitor resources:
-
-1. ray.cluster_resources() - Total resources in cluster
-2. ray.available_resources() - Currently available resources
-3. ray.nodes() - Information about cluster nodes
-4. Ray Dashboard - Web UI for monitoring (http://localhost:8265)
-"""
-    )
-
-    print('Current cluster state:')
-    print(f"  Total resources: {ray.cluster_resources()}")
-    print(f"  Available resources: {ray.available_resources()}")
-
-    # Show how resources change during execution
-    @ray.remote(num_gpus=1)
-    def blocking_gpu_task():
-        print(f"  📍 Task started on GPU {ray.get_gpu_ids()}")
-        time.sleep(3)  # Hold GPU for 3 seconds
-        return 'done'
-
-    print('\nWatching resources during task execution...')
-    print('Available before task:', ray.available_resources().get('GPU', 0))
-
-    future = blocking_gpu_task.remote()
-    time.sleep(0.5)  # Give task time to start
-    print('Available during task:', ray.available_resources().get('GPU', 0))
-
-    ray.get(future)
-    print('Available after task: ', ray.available_resources().get('GPU', 0))
-
-
-def interactive_learning_session():
-    """Run an interactive learning session."""
-    print('🎯 RAY GPU MANAGEMENT - INTERACTIVE LEARNING')
-    print('=' * 70)
-
-    print(
-        """
-Welcome to Ray GPU Management Learning!
-
-This script will teach you Ray concepts step by step.
-Each step builds on the previous one.
-
-You have 2 NVIDIA A100 GPUs available for learning.
-"""
-    )
-
-    # Initialize Ray
-    print('🚀 Initializing Ray...')
-    ray.init(num_gpus=2)
-    print(f"✅ Ray initialized with resources: {ray.cluster_resources()}")
-
-    try:
-        step_1_basic_concepts()
-
-        input('\nPress Enter to continue to Step 2...')
-        step_2_gpu_resource_management()
-
-        input('\nPress Enter to continue to Step 3...')
-        step_3_actors_vs_tasks()
-
-        input('\nPress Enter to continue to Step 4...')
-        step_4_monitoring_resources()
-
-        print('\n' + '=' * 70)
-        print('🎉 CONGRATULATIONS!')
-        print('=' * 70)
-        print(
-            """
-You've learned the fundamentals of Ray GPU management:
-
-✅ Basic Ray concepts (remote functions, ray.get)
-✅ GPU resource allocation (full and fractional)
-✅ Difference between tasks and actors
-✅ Resource monitoring and observation
-
-Next steps to continue learning:
-1. Run 'python ray_single_server_multi_gpu.py' for advanced patterns
-2. Run 'python ray_distributed_simulation.py' for distributed concepts
-3. Try the Ray dashboard at http://localhost:8265
-4. Explore Ray Tune for hyperparameter optimization
-5. Look into Ray Train for distributed training
-
-Happy learning! 🚀
-"""
-        )
-
-    except KeyboardInterrupt:
-        print('\n\n👋 Learning session interrupted. Thanks for trying Ray!')
-    finally:
-        ray.shutdown()
-
-
-if __name__ == '__main__':
-    interactive_learning_session()
diff --git a/ray_test/ray_scheduling_demo.py b/ray_test/ray_scheduling_demo.py
deleted file mode 100644
index 1e194b42..00000000
--- a/ray_test/ray_scheduling_demo.py
+++ /dev/null
@@ -1,224 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2024 MosaicML ComposeRL authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""
-Ray GPU Scheduling Demo
-
-This demonstrates how Ray schedules tasks based on GPU resource availability.
-Key question: Can heavy_gpu_tasks (1.0 GPU) start when light_gpu_tasks (0.5 GPU each) are running?
-
-Answer: NO - Ray waits until sufficient resources are available.
-"""
-
-import os
-import time
-from datetime import datetime
-
-import ray
-import torch
-
-
-def timestamp():
-    """Get current timestamp for logging."""
-    return datetime.now().strftime('%H:%M:%S.%f')[:-3]
-
-
-@ray.remote(num_gpus=0.5)
-def light_gpu_task(task_id: int, duration: int = 10):
-    """Light task that uses 0.5 GPU and runs for specified duration."""
-    gpu_ids = ray.get_gpu_ids()
-    pid = os.getpid()
-
-    print(
-        f"[{timestamp()}] 🟡 Light task {task_id} STARTED (PID: {pid}, GPU: {gpu_ids})"
-    )
-
-    # Create some GPU work
-    device = torch.device('cuda')
-    x = torch.randn(1000, 1000, device=device)
-
-    # Simulate work for the specified duration
-    for i in range(duration):
-        x = torch.mm(x, x.T)
-        time.sleep(1)
-        if i % 3 == 0:  # Progress update every 3 seconds
-            print(
-                f"[{timestamp()}] 🟡 Light task {task_id} working... ({i+1}/{duration}s)"
-            )
-
-    print(f"[{timestamp()}] 🟡 Light task {task_id} FINISHED")
-    return f"Light task {task_id} completed"
-
-
-@ray.remote(num_gpus=1.0)
-def heavy_gpu_task(task_id: int, duration: int = 5):
-    """Heavy task that needs full GPU and runs for specified duration."""
-    gpu_ids = ray.get_gpu_ids()
-    pid = os.getpid()
-
-    print(
-        f"[{timestamp()}] 🔴 Heavy task {task_id} STARTED (PID: {pid}, GPU: {gpu_ids[0]})"
-    )
-
-    # Create heavier GPU work
-    device = torch.device('cuda')
-    x = torch.randn(2000, 2000, device=device)
-
-    # Simulate work
-    for i in range(duration):
-        x = torch.mm(x, x.T)
-        time.sleep(1)
-        print(
-            f"[{timestamp()}] 🔴 Heavy task {task_id} working... ({i+1}/{duration}s)"
-        )
-
-    print(f"[{timestamp()}] 🔴 Heavy task {task_id} FINISHED")
-    return f"Heavy task {task_id} completed"
-
-
-@ray.remote
-def resource_monitor():
-    """Monitor available resources."""
-    total = ray.cluster_resources()
-    available = ray.available_resources()
-
-    return {
-        'timestamp': timestamp(),
-        'total_gpus': total.get('GPU', 0),
-        'available_gpus': available.get('GPU', 0),
-        'available_cpus': available.get('CPU', 0),
-    }
-
-
-def demonstrate_scheduling():
-    """Demonstrate Ray's scheduling behavior."""
-
-    print('=' * 60)
-    print('RAY GPU SCHEDULING DEMONSTRATION')
-    print('=' * 60)
-    print()
-
-    ray.init()
-
-    # Check initial resources
-    initial_resources = ray.get(resource_monitor.remote())
-    print(f"Initial resources: {initial_resources}")
-    print()
-
-    print(
-        'SCENARIO: Testing if heavy tasks can start while light tasks are running'
-    )
-    print('- Light tasks: 0.5 GPU each, 10 seconds duration')
-    print('- Heavy tasks: 1.0 GPU each, 5 seconds duration')
-    print('- With 2 GPUs: 4 light tasks should fill both GPUs (2 per GPU)')
-    print('- Heavy tasks should WAIT until light tasks finish')
-    print()
-
-    # Launch tasks in specific order to demonstrate scheduling
-    print(
-        f"[{timestamp()}] 🚀 Launching 4 light tasks (should fill both GPUs)..."
-    )
-
-    light_tasks = []
-    for i in range(4):
-        task = light_gpu_task.remote(i, duration=10)
-        light_tasks.append(task)
-        time.sleep(0.5)  # Small delay to see launch order
-
-    # Wait a moment for light tasks to start
-    time.sleep(2)
-
-    # Check resources after light tasks start
-    mid_resources = ray.get(resource_monitor.remote())
-    print(f"[{timestamp()}] Resources after light tasks start: {mid_resources}")
-    print()
-
-    # Now launch heavy tasks - these should be QUEUED
-    print(f"[{timestamp()}] 🚀 Launching 2 heavy tasks (should be QUEUED)...")
-
-    heavy_tasks = []
-    for i in range(2):
-        task = heavy_gpu_task.remote(i, duration=5)
-        heavy_tasks.append(task)
-        time.sleep(0.5)
-
-    print()
-    print(
-        '⏳ OBSERVATION: Heavy tasks will wait until sufficient GPU resources are free!'
-    )
-    print('   - Each light task uses 0.5 GPU')
-    print('   - Each heavy task needs 1.0 GPU')
-    print(
-        '   - Heavy tasks must wait for 2 light tasks to finish to get 1.0 GPU'
-    )
-    print()
-
-    # Monitor resources periodically
-    for i in range(3):
-        time.sleep(3)
-        current_resources = ray.get(resource_monitor.remote())
-        print(f"[{timestamp()}] Current resources: {current_resources}")
-
-    # Wait for all tasks to complete
-    print(f"\n[{timestamp()}] ⏳ Waiting for all tasks to complete...")
-
-    light_results = ray.get(light_tasks)
-    heavy_results = ray.get(heavy_tasks)
-
-    print(f"\n[{timestamp()}] ✅ All tasks completed!")
-    print('Light task results:', light_results)
-    print('Heavy task results:', heavy_results)
-
-    # Final resource check
-    final_resources = ray.get(resource_monitor.remote())
-    print(f"Final resources: {final_resources}")
-
-    ray.shutdown()
-
-
-def explain_scheduling():
-    """Explain Ray's scheduling algorithm."""
-    print('\n' + '=' * 60)
-    print('RAY SCHEDULING EXPLAINED')
-    print('=' * 60)
-    print(
-        """
-Ray's resource scheduler works like this:
-
-1. RESOURCE TRACKING:
-   - Ray tracks total and available resources (GPUs, CPUs, memory)
-   - Each task declares its resource requirements (@ray.remote(num_gpus=X))
-
-2. TASK QUEUE:
-   - Tasks are queued when submitted with .remote()
-   - Ray maintains a queue of pending tasks
-
-3. SCHEDULING DECISIONS:
-   - Ray checks if enough resources are available for each queued task
-   - Tasks only start when their FULL resource requirements can be met
-   - No partial allocation - if task needs 1.0 GPU, it waits for 1.0 GPU
-
-4. FRACTIONAL RESOURCES:
-   - 0.5 GPU tasks: 2 can run on same physical GPU
-   - 1.0 GPU tasks: Need exclusive access to 1 physical GPU
-   - If 2×0.5 GPU tasks are running, 1.0 GPU task must WAIT
-
-5. SCHEDULING ORDER:
-   - Generally FIFO (first-in-first-out)
-   - But resource availability affects actual execution order
-   - Tasks with available resources start first
-
-KEY INSIGHT:
-Heavy tasks (1.0 GPU) CANNOT start while light tasks (0.5 GPU each)
-occupy all GPU resources, even if the physical GPU isn't fully utilized.
-
-This ensures predictable resource allocation and prevents resource conflicts.
-"""
-    )
-
-
-if __name__ == '__main__':
-    demonstrate_scheduling()
-    explain_scheduling()
diff --git a/ray_test/ray_single_server_multi_gpu.py b/ray_test/ray_single_server_multi_gpu.py
deleted file mode 100644
index 50974f27..00000000
--- a/ray_test/ray_single_server_multi_gpu.py
+++ /dev/null
@@ -1,278 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2024 MosaicML ComposeRL authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""
-Ray Single Server Multi-GPU Example
-
-This example demonstrates Ray GPU management on a single server with multiple GPUs.
-Shows various patterns: full GPU allocation, fractional allocation, and mixed workloads.
-"""
-
-import logging
-import time
-from typing import Any
-
-import numpy as np
-import ray
-import torch
-
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-@ray.remote(num_gpus=1)
-class GPUWorker:
-    """A Ray actor that holds a full GPU for the duration of its lifetime."""
-
-    def __init__(self, worker_id: int):
-        self.worker_id = worker_id
-        self.gpu_ids = ray.get_gpu_ids()
-        self.device = torch.device('cuda')
-        logger.info(f"Worker {worker_id} initialized on GPU {self.gpu_ids}")
-
-    def matrix_multiply(self,
-                        size: int = 2000,
-                        iterations: int = 5) -> dict[str, Any]:
-        """Perform matrix multiplication to simulate GPU work."""
-        start_time = time.time()
-
-        # Create random matrices on GPU
-        A = torch.randn(size, size, device=self.device)
-        B = torch.randn(size, size, device=self.device)
-
-        results = []
-        for i in range(iterations):
-            C = torch.mm(A, B)
-            results.append(torch.trace(C).item())
-
-        end_time = time.time()
-
-        return {
-            'worker_id': self.worker_id,
-            'gpu_ids': self.gpu_ids,
-            'execution_time': end_time - start_time,
-            'results': results[:3],  # Just first 3 for brevity
-            'tensor_shape': list(C.shape),
-        }
-
-    def get_gpu_memory_usage(self) -> dict[str, float]:
-        """Get current GPU memory usage."""
-        if torch.cuda.is_available():
-            gpu_id = self.gpu_ids[0]
-            allocated = torch.cuda.memory_allocated(gpu_id) / 1024**3  # GB
-            cached = torch.cuda.memory_reserved(gpu_id) / 1024**3  # GB
-            return {
-                'gpu_id': gpu_id,
-                'allocated_gb': allocated,
-                'cached_gb': cached,
-            }
-        return {}
-
-
-@ray.remote(num_gpus=0.5)
-def lightweight_gpu_task(task_id: int, work_size: int = 1000) -> dict[str, Any]:
-    """A task that uses half a GPU - allows 2 tasks per GPU."""
-    start_time = time.time()
-    gpu_ids = ray.get_gpu_ids()
-
-    device = torch.device('cuda')
-    x = torch.randn(work_size, work_size, device=device)
-
-    # Simulate some computation
-    for _ in range(3):
-        x = torch.relu(x @ x.T)
-
-    end_time = time.time()
-
-    return {
-        'task_id': task_id,
-        'gpu_ids': gpu_ids,
-        'execution_time': end_time - start_time,
-        'final_mean': x.mean().item(),
-    }
-
-
-@ray.remote(num_cpus=1)
-def cpu_task(task_id: int) -> dict[str, Any]:
-    """A CPU-only task to demonstrate mixed workloads."""
-    start_time = time.time()
-
-    # CPU computation
-    result = np.sum(np.random.randn(1000, 1000)**2)
-    time.sleep(1)  # Simulate work
-
-    end_time = time.time()
-
-    return {
-        'task_id': task_id,
-        'execution_time': end_time - start_time,
-        'result': result,
-        'resource_type': 'CPU',
-    }
-
-
-def print_resources():
-    """Print current Ray cluster resources."""
-    print('\n' + '=' * 50)
-    print('RAY CLUSTER RESOURCES')
-    print('=' * 50)
-    print(f"Total resources: {ray.cluster_resources()}")
-    print(f"Available resources: {ray.available_resources()}")
-    print('=' * 50)
-
-
-def demo_gpu_actors():
-    """Demonstrate GPU actors (long-lived GPU workers)."""
-    print('\n🚀 DEMO 1: GPU Actors (Long-lived Workers)')
-    print('-' * 50)
-
-    # Create 2 GPU workers (one per GPU)
-    workers = [GPUWorker.remote(i) for i in range(2)]
-
-    # Submit work to both workers
-    futures = []
-    for i, worker in enumerate(workers):
-        future = worker.matrix_multiply.remote(size=1500, iterations=3)
-        futures.append(future)
-
-    print('Submitted work to GPU actors...')
-    results = ray.get(futures)
-
-    for result in results:
-        print(
-            f"  Worker {result['worker_id']}: GPU {result['gpu_ids']}, "
-            f"Time: {result['execution_time']:.2f}s"
-        )
-
-    # Check memory usage
-    memory_futures = [
-        worker.get_gpu_memory_usage.remote() for worker in workers
-    ]
-    memory_results = ray.get(memory_futures)
-
-    for mem in memory_results:
-        print(
-            f"  GPU {mem['gpu_id']}: {mem['allocated_gb']:.2f}GB allocated, "
-            f"{mem['cached_gb']:.2f}GB cached"
-        )
-
-    return workers
-
-
-def demo_fractional_gpu():
-    """Demonstrate fractional GPU allocation."""
-    print('\n🔄 DEMO 2: Fractional GPU Tasks (0.5 GPU each)')
-    print('-' * 50)
-
-    # Launch 4 tasks with 0.5 GPU each (2 per GPU)
-    tasks = [lightweight_gpu_task.remote(i, work_size=800) for i in range(4)]
-
-    print('Submitted 4 tasks with 0.5 GPU each...')
-    results = ray.get(tasks)
-
-    for result in results:
-        print(
-            f"  Task {result['task_id']}: GPU {result['gpu_ids']}, "
-            f"Time: {result['execution_time']:.2f}s"
-        )
-
-
-def demo_mixed_workload():
-    """Demonstrate mixed CPU and GPU workloads."""
-    print('\n🔀 DEMO 3: Mixed CPU and GPU Workloads')
-    print('-' * 50)
-
-    # Mix of CPU and GPU tasks
-    cpu_tasks = [cpu_task.remote(i) for i in range(3)]
-    gpu_tasks = [
-        lightweight_gpu_task.remote(i + 10, work_size=600) for i in range(3)
-    ]
-
-    all_tasks = cpu_tasks + gpu_tasks
-    print(
-        f"Submitted {len(cpu_tasks)} CPU tasks and {len(gpu_tasks)} GPU tasks..."
-    )
-
-    start_time = time.time()
-    results = ray.get(all_tasks)
-    total_time = time.time() - start_time
-
-    print(f"All tasks completed in {total_time:.2f}s")
-
-    # Separate results
-    cpu_results = [r for r in results if r.get('resource_type') == 'CPU']
-    gpu_results = [r for r in results if 'gpu_ids' in r]
-
-    print(f"  CPU tasks: {len(cpu_results)} completed")
-    print(f"  GPU tasks: {len(gpu_results)} completed")
-
-
-def demo_dynamic_scheduling():
-    """Demonstrate dynamic task scheduling based on resource availability."""
-    print('\n⚡ DEMO 4: Dynamic Scheduling')
-    print('-' * 50)
-
-    # Submit tasks gradually and monitor resource usage
-    completed_tasks = []
-    pending_tasks = []
-
-    for i in range(8):
-        task = lightweight_gpu_task.remote(i, work_size=500)
-        pending_tasks.append(task)
-
-        # Check if we should wait for some tasks to complete
-        if len(pending_tasks) >= 4:  # Don't overwhelm the queue
-            # Wait for at least one task to complete
-            ready, pending_tasks = ray.wait(pending_tasks, num_returns=1)
-            completed_tasks.extend(ray.get(ready))
-            print(
-                f"  Completed {len(completed_tasks)} tasks, "
-                f"{len(pending_tasks)} still pending"
-            )
-
-    # Wait for remaining tasks
-    if pending_tasks:
-        completed_tasks.extend(ray.get(pending_tasks))
-
-    print(f"Dynamic scheduling completed: {len(completed_tasks)} total tasks")
-
-
-def main():
-    """Main function demonstrating various Ray GPU patterns."""
-    print('🎯 Ray Single Server Multi-GPU Demo')
-    print('=' * 60)
-
-    # Initialize Ray
-    ray.init(num_gpus=2)  # Explicitly specify 2 GPUs
-
-    print_resources()
-
-    # Run all demos
-    workers = demo_gpu_actors()
-    print_resources()
-
-    demo_fractional_gpu()
-    print_resources()
-
-    demo_mixed_workload()
-    print_resources()
-
-    demo_dynamic_scheduling()
-    print_resources()
-
-    print('\n🎉 All demos completed!')
-    print('\nKey Takeaways:')
-    print('1. GPU Actors: Long-lived workers for persistent GPU allocation')
-    print('2. Fractional GPUs: Share GPUs between multiple light tasks')
-    print('3. Mixed Workloads: Combine CPU and GPU tasks efficiently')
-    print('4. Dynamic Scheduling: Adapt to resource availability')
-
-    # Cleanup
-    ray.shutdown()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/ray_test/test_ray.py b/ray_test/test_ray.py
deleted file mode 100644
index e9e5b2f0..00000000
--- a/ray_test/test_ray.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright 2024 MosaicML ComposeRL authors
-# SPDX-License-Identifier: Apache-2.0
-
-import time
-
-import ray
-import torch
-
-
-# =================================================================
-# 1. THE RAY TASK: A function that will run on a single GPU
-# =================================================================
-# The decorator is the key: it tells Ray this task requires 1 GPU.
-@ray.remote(num_gpus=1)
-def use_gpu_task(task_id: int):
-    """
-    A simple Ray task that simulates work on a GPU.
-    """
-    # Ray automatically sets the CUDA_VISIBLE_DEVICES environment variable
-    # for this worker process, so torch.cuda.current_device() will
-    # correspond to the GPU Ray assigned.
-
-    # Let's get the physical GPU ID that Ray assigned to this task.
-    gpu_ids = ray.get_gpu_ids()
-    physical_gpu_id = gpu_ids[0]
-
-    print(
-        f"-> Task {task_id} starting. Ray assigned me physical GPU: {physical_gpu_id}"
-    )
-
-    # Create a tensor and move it to the assigned GPU.
-    # PyTorch will only see the single GPU that Ray allocated.
-    device = torch.device('cuda')
-    tensor = torch.randn(2000, 2000, device=device)
-
-    # Perform some work to make the GPU busy.
-    for i in range(5):
-        tensor = tensor @ tensor
-        time.sleep(0.5)  # Sleep to make it easier to see in nvidia-smi
-        print(f"   Task {task_id}, iteration {i+1}, on device: {tensor.device}")
-
-    print(f"<- Task {task_id} finished on GPU {physical_gpu_id}.")
-
-    # Return the ID of the GPU we used.
-    return f"Task {task_id} ran on GPU {physical_gpu_id}"
-
-
-# =================================================================
-# 2. MAIN SCRIPT: Initialize Ray and launch the tasks
-# =================================================================
-if __name__ == '__main__':
-    # Start Ray. Ray will automatically detect the 2 GPUs.
-    # You could also be explicit: ray.init(num_gpus=2)
-    ray.init()
-
-    print('Ray Initialized.')
-    print('Cluster resources:', ray.cluster_resources())
-
-    # Verify that Ray sees our GPUs
-    if ray.cluster_resources().get('GPU', 0) < 2:
-        print('!!! WARNING: Ray did not detect 2 GPUs. Exiting.')
-        ray.shutdown()
-        exit()
-
-    # We have 2 GPUs, so let's launch 4 tasks.
-    # Ray will run 2 tasks concurrently, and queue the other 2
-    # until the first ones finish.
-    print('\nLaunching 4 GPU tasks on 2 available GPUs...')
-    task_refs = []
-    for i in range(4):
-        # .remote() immediately returns a future (a reference to the result)
-        # and executes the task in the background.
-        ref = use_gpu_task.remote(i)
-        task_refs.append(ref)
-
-    # Block until all tasks are complete and get the results.
-    results = ray.get(task_refs)
-
-    print('\n--- All tasks completed! ---')
-    print('Results:', results)
-
-    # Shut down Ray
-    ray.shutdown()
diff --git a/ray_test/test_ray_chain.py b/ray_test/test_ray_chain.py
deleted file mode 100644
index 44e21d32..00000000
--- a/ray_test/test_ray_chain.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import ray
-
-
-@ray.remote
-class ChainActor:
-    def __init__(self):
-        pass
-
-    def get(self, obj: ray.ObjectRef):
-        actual = ray.get(obj)
-        return actual + " Actor"
-
-
-@ray.remote
-def task1():
-    return "Task 1"
-
-@ray.remote
-def task2():
-    return task1.remote()
-
-
-@ray.remote
-class Foo:
-    def __init__(self):
-        print("Foo")
-
-    def print(self, actor):
-        print('I am in Foo', actor)
-        actor.print.remote()
-
-@ray.remote
-class Bar:
-    def __init__(self):
-        print("Bar")
-
-    def print(self):
-        print('I am in Bar')
-
-def main():
-    ray.init()
-    actor = ChainActor.remote()
-    res = actor.get.remote(task2.remote())
-    print(ray.get(res))
-
-    foo = Foo.remote()
-    bar = Bar.remote()
-    foo.print.remote(bar)
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/ray_test/test_ray_distributed.py b/ray_test/test_ray_distributed.py
deleted file mode 100644
index c6bc4662..00000000
--- a/ray_test/test_ray_distributed.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright 2024 MosaicML ComposeRL authors
-# SPDX-License-Identifier: Apache-2.0
-
-import time
-
-import ray
-import torch
-
-
-# The task definition is IDENTICAL to the previous example.
-# No changes are needed here.
-@ray.remote(num_gpus=1)
-def use_gpu_task(task_id: int):
-    gpu_ids = ray.get_gpu_ids()
-    physical_gpu_id = gpu_ids[0]
-
-    # Let's also get the hostname of the node Ray scheduled us on.
-    # In this simulation, it will be the same hostname, but Ray
-    # internally treats them as distinct nodes.
-    node_id = ray.get_runtime_context().get_node_id()
-
-    print(
-        f"-> Task {task_id} starting."
-        f" Ray assigned me physical GPU: {physical_gpu_id}"
-        f" on Node ID: {node_id}",
-    )
-
-    device = torch.device('cuda')
-    tensor = torch.randn(2000, 2000, device=device)
-
-    for i in range(5):
-        tensor = tensor @ tensor
-        time.sleep(0.5)
-
-    print(f"<- Task {task_id} finished on GPU {physical_gpu_id}.")
-    return f"Task {task_id} ran on GPU {physical_gpu_id} on Node {node_id}"
-
-
-# =================================================================
-# MAIN SCRIPT
-# =================================================================
-if __name__ == '__main__':
-    # CRITICAL CHANGE: Connect to the existing Ray cluster.
-    # 'auto' tells Ray to find the running cluster from environment variables
-    # that `ray start` sets up.
-    ray.init()
-
-    print('Python script connected to Ray Cluster.')
-    print('Cluster resources:', ray.cluster_resources())
-
-    # The rest of the logic is the same.
-    print('\nLaunching 4 GPU tasks on our 2-node, 2-GPU cluster...')
-    task_refs = []
-    for i in range(4):
-        ref = use_gpu_task.remote(i)
-        task_refs.append(ref)
-
-    results = ray.get(task_refs)
-
-    print('\n--- All tasks completed! ---')
-    print('Results:', results)
-
-    # We don't call ray.shutdown() here, because we want to leave the
-    # cluster running. We will stop it manually from the terminal.
-    print('\nScript finished. The Ray cluster is still running.')
diff --git a/ray_test/test_ray_init.py b/ray_test/test_ray_init.py
deleted file mode 100644
index ac5d2738..00000000
--- a/ray_test/test_ray_init.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright 2024 MosaicML ComposeRL authors
-# SPDX-License-Identifier: Apache-2.0
-
-import subprocess
-
-import ray
-
-
-def ray_start():
-    subprocess.run(['ray', 'start', '--head', '--port=6379'], check=True)
-
-
-def ray_stop():
-    subprocess.run(['ray', 'stop'], check=True)
-
-
-def ray_init():
-    ray.init()
-
-
-def ray_shutdown():
-    ray.shutdown()
-
-
-if __name__ == '__main__':
-    ray_start()
-    ray_init()
-    ray_shutdown()
-    ray_stop()
diff --git a/ray_test/test_socket.py b/ray_test/test_socket.py
deleted file mode 100644
index 5bada510..00000000
--- a/ray_test/test_socket.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import os
-import socket
-import time
-
-def start_server_on_inherited_socket(fd):
-    """This function runs in the child process."""
-    print(f"[Child PID {os.getpid()}] Inherited file descriptor: {fd}")
-    
-    # Re-create the socket object from the file descriptor
-    server_socket = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
-    
-    # Get the address it's already bound to
-    host, port = server_socket.getsockname()
-    print(f"[Child PID {os.getpid()}] Socket is bound to {host}:{port}")
-
-    # Now the child can use the socket
-    server_socket.listen(1)
-    print("[Child PID {os.getpid()}] Server is listening...")
-    # ... accept connections, etc. ...
-    time.sleep(5) # Simulate server work
-    server_socket.close()
-    print("[Child PID {os.getpid()}] Server finished.")
-
-
-# --- Parent Process Logic ---
-if __name__ == "__main__":
-    # 1. Parent creates and binds the socket
-    parent_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    parent_socket.bind(('', 0)) # Bind to a free port
-    host, port = parent_socket.getsockname()
-    fd = parent_socket.fileno() # Get the integer file descriptor
-
-    print(f"[Parent PID {os.getpid()}] Bound to {host}:{port}, file descriptor: {fd}")
-
-    start_server_on_inherited_socket(fd)
-
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server_socket:
-        server_socket.bind(('', port))
-        host, port = server_socket.getsockname()
-        print(f"[Parent PID {os.getpid()}] Bound to {host}:{port}")
-        server_socket.listen(1)
-        print("[Parent PID {os.getpid()}] Server is listening...")
-        time.sleep(5) # Simulate server work
-        server_socket.close()
-        print("[Parent PID {os.getpid()}] Server finished.")
\ No newline at end of file
diff --git a/ray_test/test_torch_ray_distributed.py b/ray_test/test_torch_ray_distributed.py
deleted file mode 100644
index 39912d0a..00000000
--- a/ray_test/test_torch_ray_distributed.py
+++ /dev/null
@@ -1,342 +0,0 @@
-import ray
-import torch
-import torch.distributed as dist
-import os
-import socket
-import subprocess
-import time
-from contextlib import contextmanager
-from typing import Optional, Tuple
-import argparse
-from datetime import timedelta
-
-from functools import partial
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from composer.utils import dist as composer_dist
-from composer import Trainer
-from composer.optim import DecoupledAdamW
-from llmfoundry.models import ComposerHFCausalLM, ComposerMPTCausalLM
-from torch.utils.data import DataLoader
-from transformers import PreTrainedModel, PreTrainedTokenizerBase
-from transformers.models.gpt2 import GPT2LMHeadModel
-
-from compose_rl.algorithms.online import (
-    ComposerHFPolicyLM,
-    ComposerMPTPolicyLM,
-    OnPolicyCallback,
-)
-from compose_rl.algorithms.online.model_methods import OnPolicyEnum
-from compose_rl.algorithms.online.modeling_hf import ComposerHFPolicy
-from compose_rl.data import prompt_dataset_collate_fn
-from tests.common import PromptDataset, VerifiablePromptDataset, world_size
-
-from compose_rl.algorithms.online.generation_utils import init_process_group, create_vllm_engines
-
-from typing import Any
-
-
-def ray_noset_visible_devices():
-    return os.environ.get('RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES', '0') == '1'
-
-
-def init_ray():
-    # init ray on master node, rank 0
-    if dist.get_rank() == 0:
-        # Start head node
-        subprocess.run(['ray', 'start', '--head'], check=True)
-        ray.init('auto')
-        # get existing ray ip and port 
-        ctx = ray.get_runtime_context()
-        address = ctx.gcs_address
-        print(f'available gpus: {ray.available_resources()}')
-    else:
-        address = ''
-    address_list = [address]
-    # broadcast address to all other ranks
-    dist.broadcast_object_list(address_list, src=0)
-    if dist.get_rank() != 0 and os.environ.get('LOCAL_RANK', None) == '0':
-        address = address_list[0]
-        print(f'rank: {dist.get_rank()} connecting to address: {address}')
-        subprocess.run(['ray', 'start', f'--address={address}'], check=True)
-    dist.barrier()
-    if dist.get_rank() == 0:
-        # wait until num of gpus reach world_size
-        num_gpus = int(ray.cluster_resources()['GPU'])
-        counter = 0
-        while num_gpus < dist.get_world_size():
-            print(f'waiting for {dist.get_world_size() - num_gpus} gpus to be available')
-            num_gpus = int(ray.cluster_resources()['GPU'])
-            time.sleep(5)
-            counter += 1
-            if counter > 4:
-                raise RuntimeError(f'Failed to start {dist.get_world_size()} gpus')
-        print(f'Total available gpus: {ray.available_resources()}')
-    return address
-
-
-@ray.remote(num_gpus=1)
-class DistributedGPUActor:
-    def __init__(self, rank: int, world_size: int, master_addr: Optional[str] = None, master_port: Optional[int] = None):
-        """Initialize the distributed GPU actor.
-        
-        Args:
-            rank: The rank of this process in the distributed group
-            world_size: Total number of processes in the distributed group
-            master_addr: Master node address. If None, will allocate dynamically for rank 0
-            master_port: Master node port. If None, will allocate dynamically for rank 0
-        """
-        self.rank = rank
-        self.world_size = world_size
-        self.master_addr = master_addr
-        self.master_port = master_port
-        
-        # Set up basic environment variables
-        os.environ["WORLD_SIZE"] = str(world_size)
-        os.environ["RANK"] = str(rank)
-        
-        # Set LOCAL_RANK based on Ray GPU allocation
-        os.environ["LOCAL_RANK"] = str(ray.get_gpu_ids()[0]) if ray_noset_visible_devices() else "0"
-        
-        # If this is rank 0 and no master_addr/master_port provided, allocate them
-        if rank == 0 and (master_addr is None or master_port is None):
-            self._allocate_master_address()
-
-        os.environ["MASTER_ADDR"] = self.master_addr
-        os.environ["MASTER_PORT"] = str(self.master_port)
-
-        self.model = None
-        self.model_update_group = None
-
-    def build_ref_model(self):
-        max_seq_len = 32
-        prompt_len = 10
-
-        model_name = 'gpt2'
-        tiny_gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')
-
-        dataset = PromptDataset(prompt_len=prompt_len)
-        dataloader = DataLoader(
-            dataset,
-            collate_fn=partial(
-                prompt_dataset_collate_fn,
-                tiny_gpt2_tokenizer,
-                max_seq_len,
-            ),
-            sampler=composer_dist.get_sampler(dataset),
-            batch_size=4,
-        )
-
-        # We need to mock this method, since our dataset isn't a StreamingDataset
-        dataloader.state_dict = lambda: {}
-        dataloader.load_state_dict = lambda x: None
-
-        model_config = {
-            'tokenizer': tiny_gpt2_tokenizer,
-            'pretrained_model_name_or_path': model_name,
-            'pretrained': True,
-            'use_flash_attention_2': True,
-            'allow_embedding_resizing': True,
-        }
-        tmp_model = ComposerHFCausalLM(**model_config)
-
-        tmp_optimizer = DecoupledAdamW(tmp_model.parameters(), lr=1e-6)
-
-        tmp_ref_path = str('./ref_checkpoints')
-
-        temp_dataloader = [{
-            'input_ids': torch.ones((2, 15)).to(dtype=torch.int64),
-            'attention_mask': torch.ones((2, 15)),
-            'labels': torch.ones((2, 15)).to(dtype=torch.int64),
-        }]
-
-        temp_trainer = Trainer(
-            model=tmp_model,
-            train_dataloader=temp_dataloader,
-            optimizers=tmp_optimizer,
-            max_duration='1ba',
-            parallelism_config={'fsdp': {}},
-            save_folder=tmp_ref_path,
-            save_weights_only=True,
-            device_train_microbatch_size=2,
-        )
-
-        temp_trainer.fit()
-
-        # After making the reference model, we can proceed with the PPO training
-        tmp_ref_path = os.path.join(tmp_ref_path, 'latest-rank0.pt')
-
-    def get_node_ip(self):
-        return ray.util.get_node_ip_address().strip('[]')
-    
-    def get_free_port(self):
-        with socket.socket() as sock:
-            sock.bind(("", 0))
-            return sock.getsockname()[1]
-    
-    def _allocate_master_address(self):
-        """Allocate master address and port for rank 0."""
-        if self.master_addr is None:
-            # Get the local IP address
-            self.master_addr = self.get_node_ip()
-
-        if self.master_port is None:
-            # Allocate a free port
-            self.master_port = self.get_free_port()
-    
-    def get_master_address(self) -> Tuple[Optional[str], Optional[int]]:
-        """Return the master address and port as a tuple."""
-        return (self.master_addr, self.master_port)
-    
-    def init_default_process_group(self):
-        """Initialize the distributed process group."""         
-        # Initialize process group
-        dist.init_process_group(timeout=timedelta(seconds=30), backend='nccl')
-        print(f'is distributed initialized: {dist.is_initialized()}')
-        # Print debug information
-        num_visible_devices = torch.cuda.device_count()
-        print(f'num_visible_devices: {num_visible_devices}')
-        print('Ray actor init envs:')
-        print(f'rank: {dist.get_rank()}')
-        print(f'node_rank: {dist.get_rank() // 8}')
-        print(f'world_size: {dist.get_world_size()}')
-        print(f'local_rank: {dist.get_rank() % 8}')
-        print(f'master_addr: {self.master_addr}')
-        print(f'master_port: {self.master_port}')
-    
-    def init_model(self, model_name: str):
-        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype='auto')
-        self.model.to('cuda')
-
-    def sync_weights(self, vllm_engines: list[Any]):
-        for name, p in self.model.named_parameters():
-            refs = [engine.update_weight.remote(name, p.dtype, p.shape, empty_cache=False) for engine in vllm_engines]
-            dist.broadcast(p, src=0, group=self.model_update_group)
-            ray.get(refs)
-
-    def tensor_all_reduce(self) -> float:
-        """Perform a simple tensor all_reduce operation."""
-        # Create a tensor on the GPU and perform all_reduce
-        device = torch.device("cuda")
-        x = torch.ones(1, device=device)
-        dist.all_reduce(x)
-        
-        return x.item()
-
-    def init_vllm_process_group(self, backend: str, master_addr: str, master_port: int, world_size: int, rank: int, group_name: str):
-        """Initialize the vLLM process group."""
-        self.model_update_group = init_process_group(backend=backend, init_method=f'tcp://{master_addr}:{master_port}', world_size=world_size, rank=rank, group_name=group_name)
-        return dist.get_world_size(self.model_update_group)
-
-@contextmanager
-def start_ray_server():
-    dist.init_process_group(backend='gloo')
-    address = init_ray()
-    try:
-        yield address
-        dist.barrier()
-    finally:
-        if dist.get_rank() == 0:
-            ray.shutdown()
-            subprocess.run(['ray', 'stop'], check=True)
-        dist.barrier()
-        dist.destroy_process_group()
-
-
-def run(tp_size: int = 8):
-    prompts = [
-        "what is RAY?",
-        "what is vLLM?",
-    ]
-    pretrain_model_name = 'meta-llama/Llama-3.2-1B-Instruct'
-    with start_ray_server() as address:
-        if dist.get_rank() == 0:
-            master_addr, _ = address.split(':')
-            
-            print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
-            num_train_actors = dist.get_world_size() // 2
-            # Create actors - rank 0 will allocate master address/port
-            train_actors = []
-
-            # master actor will allocate master_addr and master_port
-            master_actor = DistributedGPUActor.remote(0, num_train_actors)
-            train_actors.append(master_actor)
-            
-            # Get master address from rank 0 actor
-            master_info = ray.get(master_actor.get_master_address.remote())
-            master_addr, master_port = master_info
-            print(f"Master address allocated: {master_addr}:{master_port}")
-            
-            # Create remaining actors with the master address/port
-            for i in range(1, num_train_actors):
-                actor = DistributedGPUActor.remote(i, num_train_actors, master_addr, master_port)
-                train_actors.append(actor)
-            
-            # Initialize process groups for all actors
-            init_tasks = [actor.init_default_process_group.remote() for actor in train_actors]
-            ray.get(init_tasks)
-            
-            # Perform tensor all_reduce on all actors
-            reduce_tasks = [actor.tensor_all_reduce.remote() for actor in train_actors]
-            results = ray.get(reduce_tasks)
-            print(f"All-reduce results: {results}")
-
-            build_ref_model_tasks = [actor.build_ref_model.remote() for actor in train_actors]
-            ray.get(build_ref_model_tasks)
-            print('build ref model done')
-
-            # vllm_tensor_parallel_size = tp_size
-            # num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size
-            # print(f'num_vllm_engines: {num_vllm_engines}')
-            # vllm_engines = create_vllm_engines(
-            #     num_engines=num_vllm_engines,
-            #     tensor_parallel_size=vllm_tensor_parallel_size,
-            #     enforce_eager=True,
-            #     pretrain=pretrain_model_name,
-            #     revision=None,
-            #     seed=1,
-            #     enable_prefix_caching=False,
-            #     max_model_len=2048,
-            # )
-
-            # new_port = ray.get(master_actor.get_free_port.remote())
-            # print(f'new_port: {new_port}')
-            # refs = [
-            #     engine.init_process_group.remote(
-            #         master_addr,
-            #         new_port,
-            #         i * vllm_tensor_parallel_size + 1,
-            #         dist.get_world_size() // 2 + 1,
-            #         'weight-update',
-            #         backend='nccl',
-            #     ) for i, engine in enumerate(vllm_engines)
-            # ]
-            # refs.append(master_actor.init_vllm_process_group.remote(
-            #     backend='nccl',
-            #     master_addr=master_addr,
-            #     master_port=new_port,
-            #     world_size=dist.get_world_size() // 2 + 1,
-            #     rank=0,
-            #     group_name='weight-update',
-            # ))
-            # print(ray.get(refs))
-
-            # refs = [actor.init_model.remote(pretrain_model_name) for actor in train_actors]
-            # ray.get(refs)
-            # print('init model done')
-
-            # ray.get(master_actor.sync_weights.remote(vllm_engines))
-            # print('sync weights done')
-
-            # ref = vllm_engines[0].generate.remote(prompts)
-            # gen_results = ray.get(ref)
-            # for output in gen_results:
-            #     prompt = output.prompt
-            #     generated_text = output.outputs[0].text
-            #     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--tp_size', type=int, default=8)
-    args = parser.parse_args()
-    run(tp_size=args.tp_size)
diff --git a/test_async_better.py b/test_async_better.py
deleted file mode 100644
index d4914d18..00000000
--- a/test_async_better.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import asyncio
-import time
-
-program_start_time = time.time()
-
-async def async_function(name: str):
-    print(f"Async function {name} started at {time.time() - program_start_time:.2f}s")
-    await asyncio.sleep(1)
-    print(f"Async function {name} finished at {time.time() - program_start_time:.2f}s")
-
-# Better semaphore pattern
-total_tasks = 4
-concurrent_tasks = 2
-semaphore = asyncio.Semaphore(concurrent_tasks)
-
-async def async_function_with_semaphore_better(name: str):
-    async with semaphore:  # This acquires and releases automatically
-        await async_function(name)
-
-# Alternative explicit version:
-async def async_function_with_semaphore_explicit(name: str):
-    await semaphore.acquire()
-    try:
-        await async_function(name)
-    finally:
-        semaphore.release()
-
-async def run_tasks_better():
-    # Create all tasks at once - semaphore limiting happens inside each task
-    tasks = [
-        asyncio.create_task(async_function_with_semaphore_better(f"Task {i}"))
-        for i in range(total_tasks)
-    ]
-    await asyncio.gather(*tasks)
-
-if __name__ == "__main__":
-    print("Running better pattern:")
-    asyncio.run(run_tasks_better()) 
\ No newline at end of file
diff --git a/test_original_trace.py b/test_original_trace.py
deleted file mode 100644
index b686f867..00000000
--- a/test_original_trace.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import asyncio
-import time
-
-program_start_time = time.time()
-
-def log(message: str):
-    print(f"{time.time() - program_start_time:.3f}s: {message}")
-
-async def async_function(name: str):
-    log(f"  → {name} STARTED")
-    await asyncio.sleep(1)
-    log(f"  ← {name} FINISHED")
-
-total_tasks = 4
-concurrent_tasks = 2
-semaphore = asyncio.Semaphore(concurrent_tasks)
-
-async def async_function_with_semaphore(name: str):
-    cor = async_function(name)
-    await asyncio.sleep(1)
-    await cor
-    log(f"  {name} calling semaphore.release()")
-    semaphore.release()
-
-async def run_tasks_with_trace():
-    log("Starting run_tasks()")
-    tasks = []
-    
-    for i in range(total_tasks):
-        log(f"Iteration {i}: About to acquire semaphore")
-        await semaphore.acquire()  # This is where the magic happens!
-        log(f"Iteration {i}: Acquired semaphore, creating task")
-        tasks.append(asyncio.create_task(async_function_with_semaphore(f"Task {i}")))
-        log(f"Iteration {i}: Task created")
-    
-    log("All tasks created, calling gather()")
-    await asyncio.gather(*tasks)
-    log("gather() completed")
-
-if __name__ == "__main__":
-    asyncio.run(run_tasks_with_trace()) 
\ No newline at end of file
diff --git a/test_torch_ray_distributed.py b/tests/test_torch_ray_distributed.py
similarity index 100%
rename from test_torch_ray_distributed.py
rename to tests/test_torch_ray_distributed.py

From 3240715ed2669549840982cf6c66a6bbef458389 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 22 Jul 2025 22:46:50 +0000
Subject: [PATCH 064/107] refactor base actor

---
 tests/common/__init__.py        |   2 +
 tests/common/actor.py           | 107 ++++++++++++++++++++++++++++
 tests/test_single_controller.py | 121 ++------------------------------
 3 files changed, 115 insertions(+), 115 deletions(-)
 create mode 100644 tests/common/actor.py

diff --git a/tests/common/__init__.py b/tests/common/__init__.py
index a815cb38..79b2fba9 100644
--- a/tests/common/__init__.py
+++ b/tests/common/__init__.py
@@ -9,8 +9,10 @@
     VerifiablePromptDataset,
 )
 from tests.common.markers import device, world_size
+from tests.common.actor import BaseDistributedGPUActor
 
 __all__ = [
+    'BaseDistributedGPUActor',
     'PairwisePreference',
     'FineGrainedPreference',
     'PromptDataset',
diff --git a/tests/common/actor.py b/tests/common/actor.py
new file mode 100644
index 00000000..8ad45de7
--- /dev/null
+++ b/tests/common/actor.py
@@ -0,0 +1,107 @@
+import os
+from datetime import timedelta
+from typing import Optional
+
+import ray
+import torch
+import torch.distributed as dist
+
+from compose_rl.algorithms.online.generation_utils import init_process_group
+from compose_rl.utils.ray_utils import (
+    get_free_port,
+    get_node_ip,
+    is_cuda_visible_devices_set,
+)
+
+
+class BaseDistributedGPUActor:
+
+    def __init__(
+        self,
+        rank: int,
+        world_size: int,
+        master_addr: Optional[str] = None,
+        master_port: Optional[int] = None,
+    ):
+        """Initialize the distributed GPU actor for RAY.
+
+        Args:
+            rank: The rank of this process in the distributed group
+            world_size: Total number of processes in the distributed group
+            master_addr: Master node address. If None, will allocate dynamically for rank 0
+            master_port: Master node port. If None, will allocate dynamically for rank 0
+        """
+        self.rank = rank
+        self.world_size = world_size
+        self.master_addr = master_addr
+        self.master_port = master_port
+
+        # Set up basic environment variables
+        os.environ['WORLD_SIZE'] = str(world_size)
+        os.environ['RANK'] = str(rank)
+
+        # Set LOCAL_RANK based on Ray GPU allocation
+        os.environ['LOCAL_RANK'] = '0' if is_cuda_visible_devices_set(
+        ) else str(ray.get_gpu_ids()[0])
+
+        # If this is rank 0 and no master_addr/master_port provided, allocate them
+        if rank == 0 and (master_addr is None or master_port is None):
+            self._allocate_master_address()
+
+        os.environ['MASTER_ADDR'] = self.master_addr  # type: ignore
+        os.environ['MASTER_PORT'] = str(self.master_port)  # type: ignore
+
+        self.model = None
+        self.model_update_group = None
+
+    def _allocate_master_address(self):
+        """Allocate master address and port for rank 0."""
+        if self.master_addr is None:
+            # Get the local IP address
+            self.master_addr = get_node_ip()
+
+        if self.master_port is None:
+            # Allocate a free port
+            self.master_port = get_free_port()
+
+    def get_master_address(self) -> tuple[Optional[str], Optional[int]]:
+        """Return the master address and port as a tuple."""
+        return (self.master_addr, self.master_port)
+
+    def get_free_port(self):
+        return get_free_port()
+
+    def init_train_process_group(self):
+        """Initialize the distributed process group."""
+        # Initialize process group
+        dist.init_process_group(timeout=timedelta(seconds=30))
+
+    def test_tensor_all_reduce(self) -> float:
+        """Perform a simple tensor all_reduce operation."""
+        # Create a tensor on the GPU and perform all_reduce
+        device = torch.device('cuda')
+        x = torch.ones(1, device=device, dtype=torch.int32)
+        dist.all_reduce(x)
+
+        return x.item()
+
+    def add_process_group(
+        self,
+        backend: str,
+        master_addr: str,
+        master_port: int,
+        world_size: int,
+        rank: int,
+        group_name: str,
+    ):
+        """Initialize the process group on trainer rank 0 and e.g., vllm engines."""
+        # NOTE vLLM seems to have a safer implementation of init_process_group:
+        # https://github.com/vllm-project/vllm/blob/v0.9.1/examples/offline_inference/rlhf.py#L105
+        # we should look into using that instead
+        self.model_update_group = init_process_group(
+            backend=backend,
+            init_method=f'tcp://{master_addr}:{master_port}',
+            world_size=world_size,
+            rank=rank,
+            group_name=group_name,
+        )
diff --git a/tests/test_single_controller.py b/tests/test_single_controller.py
index efb2d41e..5bd3784b 100644
--- a/tests/test_single_controller.py
+++ b/tests/test_single_controller.py
@@ -4,12 +4,9 @@
 import logging
 import os
 import pathlib
-from datetime import timedelta
-from typing import Optional
 
 import pytest
 import ray
-import torch
 import torch.distributed as dist
 from transformers import (
     AutoModelForCausalLM,
@@ -19,93 +16,17 @@
 
 from compose_rl.algorithms.online.generation_utils import (
     create_vllm_engines,
-    init_process_group,
 )
-from compose_rl.utils.ray_utils import (
-    get_free_port,
-    get_node_ip,
-    is_cuda_visible_devices_set,
-    start_ray_server,
-)
-from tests.common import world_size
+from compose_rl.utils.ray_utils import start_ray_server
+from tests.common import world_size, BaseDistributedGPUActor
 
 # Set up logging
 logger = logging.getLogger(__name__)
 
 
 @ray.remote(num_gpus=1)
-class DistributedGPUActor:
-
-    def __init__(
-        self,
-        rank: int,
-        world_size: int,
-        master_addr: Optional[str] = None,
-        master_port: Optional[int] = None,
-    ):
-        """Initialize the distributed GPU actor.
-
-        Args:
-            rank: The rank of this process in the distributed group
-            world_size: Total number of processes in the distributed group
-            master_addr: Master node address. If None, will allocate dynamically for rank 0
-            master_port: Master node port. If None, will allocate dynamically for rank 0
-        """
-        self.rank = rank
-        self.world_size = world_size
-        self.master_addr = master_addr
-        self.master_port = master_port
-
-        # Set up basic environment variables
-        os.environ['WORLD_SIZE'] = str(world_size)
-        os.environ['RANK'] = str(rank)
-
-        # Set LOCAL_RANK based on Ray GPU allocation
-        os.environ['LOCAL_RANK'] = '0' if is_cuda_visible_devices_set(
-        ) else str(ray.get_gpu_ids()[0])
-
-        # If this is rank 0 and no master_addr/master_port provided, allocate them
-        if rank == 0 and (master_addr is None or master_port is None):
-            self._allocate_master_address()
-
-        os.environ['MASTER_ADDR'] = self.master_addr  # type: ignore
-        os.environ['MASTER_PORT'] = str(self.master_port)  # type: ignore
-
-        self.model = None
-        self.model_update_group = None
-
-    def _allocate_master_address(self):
-        """Allocate master address and port for rank 0."""
-        if self.master_addr is None:
-            # Get the local IP address
-            self.master_addr = get_node_ip()
-
-        if self.master_port is None:
-            # Allocate a free port
-            self.master_port = get_free_port()
-
-    def get_master_address(self) -> tuple[Optional[str], Optional[int]]:
-        """Return the master address and port as a tuple."""
-        return (self.master_addr, self.master_port)
-
-    def get_free_port(self):
-        return get_free_port()
-
-    def init_train_process_group(self):
-        """Initialize the distributed process group."""
-        # Initialize process group
-        dist.init_process_group(timeout=timedelta(seconds=30))
-        logger.info(f'is distributed initialized: {dist.is_initialized()}')
-        # Print debug information
-        num_visible_devices = torch.cuda.device_count()
-        logger.info(f'num_visible_devices: {num_visible_devices}')
-        logger.info('Ray actor init envs:')
-        logger.info(f'rank: {dist.get_rank()}')
-        logger.info(f'node_rank: {dist.get_rank() // 8}')
-        logger.info(f'world_size: {dist.get_world_size()}')
-        logger.info(f'local_rank: {dist.get_rank() % 8}')
-        logger.info(f'master_addr: {self.master_addr}')
-        logger.info(f'master_port: {self.master_port}')
+class DistributedGPUActor(BaseDistributedGPUActor):
+    """Distributed GPU actor for testing."""
 
     def init_model(self, model_name: str):
         """Initialize the model."""
@@ -132,36 +53,6 @@ def sync_weights(self, vllm_engines: list):
             dist.broadcast(p, src=0, group=self.model_update_group)
             ray.get(refs)
 
-    def tensor_all_reduce(self) -> float:
-        """Perform a simple tensor all_reduce operation."""
-        # Create a tensor on the GPU and perform all_reduce
-        device = torch.device('cuda')
-        x = torch.ones(1, device=device, dtype=torch.int32)
-        dist.all_reduce(x)
-
-        return x.item()
-
-    def init_vllm_process_group(
-        self,
-        backend: str,
-        master_addr: str,
-        master_port: int,
-        world_size: int,
-        rank: int,
-        group_name: str,
-    ):
-        """Initialize the process group on trainer rank 0 and vllm engines."""
-        # NOTE vLLM seems to have a safer implementation of init_process_group:
-        # https://github.com/vllm-project/vllm/blob/v0.9.1/examples/offline_inference/rlhf.py#L105
-        # we should look into using that instead
-        self.model_update_group = init_process_group(
-            backend=backend,
-            init_method=f'tcp://{master_addr}:{master_port}',
-            world_size=world_size,
-            rank=rank,
-            group_name=group_name,
-        )
-
 
 @pytest.mark.gpu
 @world_size(4)
@@ -228,7 +119,7 @@ def test_distributed_ray_actors(
 
             # Perform tensor all_reduce on all actors
             reduce_tasks = [
-                actor.tensor_all_reduce.remote()  # type: ignore
+                actor.test_tensor_all_reduce.remote()  # type: ignore
                 for actor in train_actors
             ]
             results = ray.get(reduce_tasks)
@@ -275,7 +166,7 @@ def test_distributed_ray_actors(
                 ) for i, engine in enumerate(vllm_engines)
             ]
             refs.append(
-                master_actor.init_vllm_process_group.remote(  # type: ignore
+                master_actor.add_process_group.remote(  # type: ignore
                     backend='nccl',
                     master_addr=master_addr,
                     master_port=new_port,

From 861768b64fe2a7e4170b002af447ef8c57495be3 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 22 Jul 2025 23:47:36 +0000
Subject: [PATCH 065/107] rel method

---
 tests/common/actor.py           | 10 ----------
 tests/test_single_controller.py | 10 ++++++++++
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/tests/common/actor.py b/tests/common/actor.py
index 8ad45de7..1632a8a7 100644
--- a/tests/common/actor.py
+++ b/tests/common/actor.py
@@ -3,7 +3,6 @@
 from typing import Optional
 
 import ray
-import torch
 import torch.distributed as dist
 
 from compose_rl.algorithms.online.generation_utils import init_process_group
@@ -76,15 +75,6 @@ def init_train_process_group(self):
         # Initialize process group
         dist.init_process_group(timeout=timedelta(seconds=30))
 
-    def test_tensor_all_reduce(self) -> float:
-        """Perform a simple tensor all_reduce operation."""
-        # Create a tensor on the GPU and perform all_reduce
-        device = torch.device('cuda')
-        x = torch.ones(1, device=device, dtype=torch.int32)
-        dist.all_reduce(x)
-
-        return x.item()
-
     def add_process_group(
         self,
         backend: str,
diff --git a/tests/test_single_controller.py b/tests/test_single_controller.py
index 5bd3784b..c74cc9e5 100644
--- a/tests/test_single_controller.py
+++ b/tests/test_single_controller.py
@@ -7,6 +7,7 @@
 
 import pytest
 import ray
+import torch
 import torch.distributed as dist
 from transformers import (
     AutoModelForCausalLM,
@@ -53,6 +54,15 @@ def sync_weights(self, vllm_engines: list):
             dist.broadcast(p, src=0, group=self.model_update_group)
             ray.get(refs)
 
+    def test_tensor_all_reduce(self) -> float:
+        """Perform a simple tensor all_reduce operation."""
+        # Create a tensor on the GPU and perform all_reduce
+        device = torch.device('cuda')
+        x = torch.ones(1, device=device, dtype=torch.int32)
+        dist.all_reduce(x)
+
+        return x.item()
+
 
 @pytest.mark.gpu
 @world_size(4)

From d0b5a445dc99331b69f472c7fc5989a8aa244738 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 23 Jul 2025 00:09:40 +0000
Subject: [PATCH 066/107] callback fails

---
 test_single_controller_ppo.py | 358 ++++++++++++++++++++++++++++++++++
 1 file changed, 358 insertions(+)
 create mode 100644 test_single_controller_ppo.py

diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
new file mode 100644
index 00000000..b1d17671
--- /dev/null
+++ b/test_single_controller_ppo.py
@@ -0,0 +1,358 @@
+# Copyright 2024 MosaicML ComposeRL authors
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import os
+import pathlib
+from functools import partial
+from typing import Any, Optional
+
+import pytest
+import ray
+import torch
+import torch.distributed as dist
+from torch.utils.data import DataLoader
+from transformers import (
+    AutoTokenizer,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+)
+
+from composer import Trainer
+from composer.optim import DecoupledAdamW
+from composer.utils import dist as composer_dist
+from llmfoundry.models import ComposerHFCausalLM
+
+from compose_rl.algorithms.online import (
+    ComposerHFPolicyLM,
+    SingleControllerOnPolicyCallback,
+)
+from compose_rl.algorithms.online.generation_utils import create_vllm_engines
+from compose_rl.data import prompt_dataset_collate_fn
+from compose_rl.utils.ray_utils import start_ray_server
+# add a PYTHONPATH to the tests
+# import sys
+# sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from tests.common import (
+    BaseDistributedGPUActor,
+    VerifiablePromptDataset,
+    world_size,
+)
+
+# Set up logging
+logger = logging.getLogger(__name__)
+
+
+@ray.remote(num_gpus=1)
+class DistributedGPUActor(BaseDistributedGPUActor):
+    """Distributed GPU actor for testing."""
+
+    def __init__(self,
+        rank: int,
+        world_size: int,
+        master_addr: Optional[str] = None,
+        master_port: Optional[int] = None):
+        super().__init__(rank, world_size, master_addr, master_port)
+        self.model = None
+        self.model_update_group = None
+        self.pretrain_model_name = None
+        self.ref_path = None
+        self._dataloader = None
+        self._tokenizer = None
+        self.ppo_callback = None
+        self.ppo_trainer: Trainer = None
+
+    def build_dataloader(self):
+        max_seq_len = 32
+        prompt_len = 10
+
+        dataset = VerifiablePromptDataset(prompt_len=prompt_len)
+        dataloader = DataLoader(
+            dataset,
+            collate_fn=partial(
+                prompt_dataset_collate_fn,
+                self.tokenizer,
+                max_seq_len,
+            ),
+            sampler=composer_dist.get_sampler(dataset),
+            batch_size=4,
+        )
+        # We need to mock this method, since our dataset isn't a StreamingDataset
+        dataloader.state_dict = lambda: {}
+        dataloader.load_state_dict = lambda x: None
+        return dataloader
+
+    @property
+    def dataloader(self):
+        if self._dataloader is None:
+            self._dataloader = self.build_dataloader()
+        return self._dataloader
+
+    def build_tokenizer(self):
+        # tokenizer = assets_tokenizer_helper(self.pretrain_model_name)
+        tokenizer = AutoTokenizer.from_pretrained(self.pretrain_model_name)
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        return tokenizer
+
+    @property
+    def tokenizer(self):
+        if self._tokenizer is None:
+            self._tokenizer = self.build_tokenizer()
+        return self._tokenizer
+
+    @property
+    def model_config(self):
+        return {
+            'tokenizer': self.tokenizer,
+            'pretrained_model_name_or_path': self.pretrain_model_name,
+            'pretrained': True,
+            'use_flash_attention_2': True,
+            'allow_embedding_resizing': True,
+        }
+
+    @property
+    def fsdp_config(self):
+        return dict()
+
+    def build_ref_model(self, pretrain_model_name: str):
+        tmp_ref_path = str('./ref_checkpoints')
+        ref_path = os.path.join(tmp_ref_path, 'latest-rank0.pt')
+        if os.path.exists(ref_path):
+            self.ref_path = ref_path
+            return
+
+        self.pretrain_model_name = pretrain_model_name
+        composer_dist.initialize_dist('gpu')
+
+        tmp_model = ComposerHFCausalLM(**self.model_config, use_auth_token=True)
+
+        tmp_optimizer = DecoupledAdamW(tmp_model.parameters(), lr=1e-6)
+
+        temp_dataloader = [{
+            'input_ids': torch.ones((2, 15)).to(dtype=torch.int64),
+            'attention_mask': torch.ones((2, 15)),
+            'labels': torch.ones((2, 15)).to(dtype=torch.int64),
+        }]
+
+        temp_trainer = Trainer(
+            model=tmp_model,
+            train_dataloader=temp_dataloader,
+            optimizers=tmp_optimizer,
+            max_duration='1ba',
+            parallelism_config={'fsdp': self.fsdp_config},
+            save_folder=tmp_ref_path,
+            save_weights_only=True,
+            device_train_microbatch_size=2,
+        )
+
+        temp_trainer.fit()
+
+        # After making the reference model, we can proceed with the PPO training
+        self.ref_path = ref_path
+
+    def build_ppo_trainer(self, pretrain_model_name: str):
+        self.pretrain_model_name = pretrain_model_name
+        composer_dist.initialize_dist('gpu')
+        max_seq_len = 32
+        precision = 'amp_bf16'
+
+        model = ComposerHFPolicyLM(**self.model_config, use_auth_token=True)
+
+        optimizer = DecoupledAdamW(model.parameters(), lr=1e-8)
+
+        num_batches_per_update = 2
+
+        # ref_model_config = copy.deepcopy(self.model_config)
+        ref_model_config = {**self.model_config, 'name': 'hf_causal_lm'}
+
+        variables = {
+            'buffer': {
+                'name': 'MinibatchRolloutBuffer',
+                'max_buffer_size': num_batches_per_update,
+            },
+            'max_gen_len': 8,
+            'gamma': 0.99,
+            'lambda_gae': 0.95,
+            'generation_kwargs': {
+                'use_cache': True,
+                'do_sample': False,
+            },
+            'kl_controller': {
+                'init_kl_coef': 0.2,
+                'target': 0.01,
+                'horizon': 12800,
+                'kl_ctl_type': 'adaptive',
+            },
+            'reference_model': {
+                'model_config': ref_model_config,
+                'precision': precision,
+                'load_path': self.ref_path,
+                'non_train_fsdp_config': self.fsdp_config,
+            },
+            'device_generate_batch_size': 2,
+            'epoch_per_iteration': 1,
+            'num_batches_per_update': num_batches_per_update,
+            'rewards': {
+                'output_length': {
+                    'reward_type': 'output_length',
+                    'max_gen_len': 10,
+                },
+            },
+        }
+        train_config = {
+            'model': {**self.model_config, 'kl_estimator': 'k1', 'kl_clip_range': 40.0},
+            'fsdp_config': self.fsdp_config,
+            'seed': 17,
+            'precision': precision,
+            'variables': variables,
+            'max_seq_len': max_seq_len,
+            'global_train_batch_size': 2,
+            'device_train_batch_size': 2,
+            'device_train_microbatch_size': 1,
+        }
+
+        # tmp_save_path = str('./checkpoints')
+        self.ppo_callback = SingleControllerOnPolicyCallback(train_config=train_config)
+        self.ppo_trainer = Trainer(
+            model=model,
+            optimizers=optimizer,
+            callbacks=self.ppo_callback,
+            train_dataloader=self.dataloader,
+            precision=precision,
+            parallelism_config={'fsdp': self.fsdp_config},
+            max_duration='3iter',
+            device_train_microbatch_size=1,
+            load_path=self.ref_path,
+            # save_folder=tmp_save_path,
+            # save_interval='1iter',
+        )
+
+        # trainer.fit(duration='1iter')
+
+        # This is the KL assert that must be true if we are truly loading from the same model.
+        # This is only true on the first iteration
+        # assert torch.allclose(
+        #     trainer.state.loss['kl/ift_kl'], # pyright: ignore
+        #     torch.tensor(0.0),
+        #     atol=5e-5,
+        # )
+
+    def train_1_iter(self):
+        self.ppo_trainer.fit(duration='1iter')
+        # This is the KL assert that must be true if we are truly loading from the same model.
+        # This is only true on the first iteration
+        assert torch.allclose(
+            self.ppo_trainer.state.loss['kl/ift_kl'], # pyright: ignore
+            torch.tensor(0.0),
+            atol=5e-5,
+        )
+
+    def sync_weight_and_gen(self, vllm_engines: list[Any]):
+        self.ppo_callback.round_trip_to_inference_engines(
+            device=self.ppo_trainer.state.device,
+            vllm_engines=vllm_engines,
+            model_update_group=self.model_update_group,
+        )
+
+
+def run():
+    prompts = [
+        "what is RAY?",
+        "what is vLLM?",
+    ]
+    # pretrain_model_name = 'gpt2'
+    pretrain_model_name = 'meta-llama/Llama-3.2-1B-Instruct'
+    with start_ray_server() as address:
+        if dist.get_rank() == 0:
+            master_addr, _ = address.split(':')
+            
+            print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
+            num_train_actors = dist.get_world_size() // 2
+            # Create actors - rank 0 will allocate master address/port
+            train_actors = []
+
+            # master actor will allocate master_addr and master_port
+            master_actor = DistributedGPUActor.remote(0, num_train_actors)
+            train_actors.append(master_actor)
+            
+            # Get master address from rank 0 actor
+            master_info = ray.get(master_actor.get_master_address.remote())
+            master_addr, master_port = master_info
+            print(f"Master address allocated: {master_addr}:{master_port}")
+            
+            # Create remaining actors with the master address/port
+            for i in range(1, num_train_actors):
+                actor = DistributedGPUActor.remote(i, num_train_actors, master_addr, master_port)
+                train_actors.append(actor)
+
+            # composer will initialize the process group for each actor, so no need to initialize them explicitly
+            build_ref_model_tasks = [actor.build_ref_model.remote(pretrain_model_name) for actor in train_actors]
+            ray.get(build_ref_model_tasks)
+            print('build ref model done')
+
+            build_ppo_trainer_tasks = [actor.build_ppo_trainer.remote(pretrain_model_name) for actor in train_actors]
+            ray.get(build_ppo_trainer_tasks)
+            print('build ppo trainer done')
+
+            world_size = dist.get_world_size()
+            vllm_tensor_parallel_size = world_size - num_train_actors
+            num_vllm_engines = (
+                world_size - num_train_actors
+            ) // vllm_tensor_parallel_size
+            print(f'num_vllm_engines: {num_vllm_engines}')
+            vllm_engines = create_vllm_engines(
+                num_engines=num_vllm_engines,
+                tensor_parallel_size=vllm_tensor_parallel_size,
+                enforce_eager=True,
+                pretrain=pretrain_model_name,
+                revision=None,
+                seed=1,
+                enable_prefix_caching=False,
+                max_model_len=512,
+                device_bundle={
+                    'GPU': 1,
+                    'CPU': 1,
+                    'worker_node': 0,
+                },
+            )
+
+            new_port = ray.get(master_actor.get_free_port.remote())
+            print(f'new_port: {new_port}')
+            refs = [
+                engine.init_process_group.remote(
+                    master_addr,
+                    new_port,
+                    i * vllm_tensor_parallel_size + 1,
+                    dist.get_world_size() // 2 + 1,
+                    'weight-update',
+                    backend='nccl',
+                ) for i, engine in enumerate(vllm_engines)
+            ]
+            refs.append(master_actor.add_process_group.remote(
+                backend='nccl',
+                master_addr=master_addr,
+                master_port=new_port,
+                world_size=dist.get_world_size() // 2 + 1,
+                rank=0,
+                group_name='weight-update',
+            ))
+            # should only get refs of both master and vllm_engines together, otherwise it will hang
+            print(ray.get(refs))
+
+            refs = [actor.sync_weight_and_gen.remote(vllm_engines) for actor in train_actors]
+            ray.get(refs)
+            print('sync weight and gen done')
+
+            refs = [actor.train_1_iter.remote() for actor in train_actors]
+            ray.get(refs)
+            print('train 1 iter done')
+
+            ref = vllm_engines[0].generate.remote(prompts)
+            gen_results = ray.get(ref)
+            for output in gen_results:
+                prompt = output.prompt
+                generated_text = output.outputs[0].text
+                print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+if __name__ == '__main__':
+    run()

From ec7e550c6e512717592382ecac17379d522a4981 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 23 Jul 2025 05:21:59 +0000
Subject: [PATCH 067/107] revive ppo training again

---
 compose_rl/algorithms/online/model.py              | 14 ++++++++++++--
 .../online/single_controller_callback.py           |  1 +
 test_single_controller_ppo.py                      |  2 --
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/compose_rl/algorithms/online/model.py b/compose_rl/algorithms/online/model.py
index efe5eeda..fdda1363 100644
--- a/compose_rl/algorithms/online/model.py
+++ b/compose_rl/algorithms/online/model.py
@@ -100,6 +100,11 @@ def eval_forward(self, batch: MutableMapping, outputs: MutableMapping):
         )
 
     def loss(self, outputs: MutableMapping, batch: MutableMapping):
+        # Get beta from config if available, otherwise use default
+        additional_kwargs = {}
+        if hasattr(self.config, 'beta'):
+            additional_kwargs['beta'] = self.config.beta
+        
         return_dict = online_rl_loss(
             outputs=outputs,
             batch=batch,
@@ -107,10 +112,10 @@ def loss(self, outputs: MutableMapping, batch: MutableMapping):
             value_clip_range=self.config.value_clip_range,
             value_loss_weight=self.config.value_loss_weight,
             policy_clip_ratio=self.config.policy_clip_ratio,
-            beta=self.config.beta,
             add_direct_kl_loss=self.config.compute_kl_loss,
             kl_estimator=self.config.kl_estimator,
             kl_clip_range=self.config.kl_clip_range,
+            **additional_kwargs,
         )
 
         self.policy_kl.append(return_dict['kl/policy_kl'])
@@ -217,6 +222,11 @@ def eval_forward(self, batch: MutableMapping, outputs: MutableMapping):
         )
 
     def loss(self, outputs: MutableMapping, batch: MutableMapping):
+        # Get beta from config if available, otherwise use default
+        additional_kwargs = {}
+        if hasattr(self.config, 'beta'):
+            additional_kwargs['beta'] = self.config.beta
+        
         return_dict = online_rl_loss(
             outputs=outputs,
             batch=batch,
@@ -224,10 +234,10 @@ def loss(self, outputs: MutableMapping, batch: MutableMapping):
             value_clip_range=self.config.value_clip_range,
             value_loss_weight=self.config.value_loss_weight,
             policy_clip_ratio=self.config.policy_clip_ratio,
-            beta = self.config.beta,
             add_direct_kl_loss=self.config.compute_kl_loss,
             kl_estimator=self.config.kl_estimator,
             kl_clip_range=self.config.kl_clip_range,
+            **additional_kwargs,
         )
 
         self.policy_kl.append(return_dict['kl/policy_kl'])
diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index b07fa172..76f78ce0 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -673,6 +673,7 @@ def _interact_with_env(self, batch: dict[str, torch.Tensor], vllm_engines: list[
                 max_gen_len=max_gen_len,
                 generation_kwargs=generation_kwargs,
                 tokenizer=self.tokenizer,  # type: ignore
+                vllm_generate_function='generate',
             )
         # Add the prepared sequences to the batch again
         batch['sequences'] = sequences
diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
index b1d17671..fd174890 100644
--- a/test_single_controller_ppo.py
+++ b/test_single_controller_ppo.py
@@ -14,8 +14,6 @@
 from torch.utils.data import DataLoader
 from transformers import (
     AutoTokenizer,
-    PreTrainedModel,
-    PreTrainedTokenizerBase,
 )
 
 from composer import Trainer

From f7952b4898c37fee28ddccb072f02fbb1229a837 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 23 Jul 2025 05:24:42 +0000
Subject: [PATCH 068/107] clean up

---
 test_single_controller_ppo.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
index fd174890..dd083525 100644
--- a/test_single_controller_ppo.py
+++ b/test_single_controller_ppo.py
@@ -3,7 +3,6 @@
 
 import logging
 import os
-import pathlib
 from functools import partial
 from typing import Any, Optional
 
@@ -209,7 +208,6 @@ def build_ppo_trainer(self, pretrain_model_name: str):
             'device_train_microbatch_size': 1,
         }
 
-        # tmp_save_path = str('./checkpoints')
         self.ppo_callback = SingleControllerOnPolicyCallback(train_config=train_config)
         self.ppo_trainer = Trainer(
             model=model,
@@ -221,20 +219,8 @@ def build_ppo_trainer(self, pretrain_model_name: str):
             max_duration='3iter',
             device_train_microbatch_size=1,
             load_path=self.ref_path,
-            # save_folder=tmp_save_path,
-            # save_interval='1iter',
         )
 
-        # trainer.fit(duration='1iter')
-
-        # This is the KL assert that must be true if we are truly loading from the same model.
-        # This is only true on the first iteration
-        # assert torch.allclose(
-        #     trainer.state.loss['kl/ift_kl'], # pyright: ignore
-        #     torch.tensor(0.0),
-        #     atol=5e-5,
-        # )
-
     def train_1_iter(self):
         self.ppo_trainer.fit(duration='1iter')
         # This is the KL assert that must be true if we are truly loading from the same model.

From 95b96ef6aa5074b4e60ea7632b40e4d48df91b05 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 23 Jul 2025 06:04:18 +0000
Subject: [PATCH 069/107] first round code reduction done

---
 .../online/single_controller_callback.py      | 423 +-----------------
 test_single_controller_ppo.py                 |   4 +-
 2 files changed, 10 insertions(+), 417 deletions(-)

diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index 76f78ce0..e3e61cbf 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -57,6 +57,9 @@
     switch_left_to_right_padding,
 )
 
+# Import the base class
+from compose_rl.algorithms.online.callback import OnPolicyCallback, env_reward
+
 Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
 Policy = Union[ComposerHFPolicyLM, ComposerMPTPolicyLM]
 
@@ -65,249 +68,7 @@
 log = logging.getLogger(__name__)
 
 
-def env_reward(
-    actor_critic: Policy,
-    reward_manager: RewardManager,
-    batch: dict,
-    max_gen_len: int,
-    precision: Precision,
-    device_train_microbatch_size: int,
-    tokenizer: Tokenizer,
-    eos_token_ids: list[int],
-    kl_estimator: Optional[str] = 'k1',
-    kl_clip_range: Optional[float] = 40.0,
-) -> tuple[
-    dict[str, torch.Tensor],
-    list[tuple[str, str]],
-    ReferenceOutput,
-    RewardOutput,
-]:
-    """Run reward on the model generated responses.
-
-    Runs reward over a set of sequences in the batch. It also does extra computation
-    that is required for later loss computation.
-
-    Args:
-        actor_critic (ComposerMosaicPolicy): Actor critic model to run reward over.
-        reward_manager (RewardManager): Composes the reference IFT model and all reward models.
-        batch (dict): The batch of data to run reward over.
-        max_gen_len (int): Maximum generation length.
-        precision (Precision): Precision to run computation.
-        device_train_microbatch_size (int): Device train microbatch size for the training job.
-            We need to do all log_prob computation with this in order to maintain numerics.
-        tokenizer (Tokenizer): The actor critic's tokenizer.
-        eos_token_ids (list[int]): A list of eos token ids.
-        kl_estimator (str): Which kl estimator to use. Options are 'k1', 'k2', 'k3' and 'k3_offpolicy'.
-        kl_clip_range (float): The clip range for the KL divergence.
-
-    Returns:
-        partial_env_output (dict[str, tensor]): Partially complete dictionary of return elements suitable
-            for PPO training
-        untokenized_prompt_and_responses (list): List of [str, str] tuples, each containing the decoded
-            prompt and responses tokens sequences, respectively
-        ref_output (ReferenceOutput): Pair of tensors corresponding to the KL penalty and
-            log prob sequences obtained from the reference model. If the reference model is non-blocking,
-            this will be an AsyncResult object that will resolve to the described output.
-        all_rewards (RewardOutput): Dictionary of tensors containing the reward output
-            from each reward model managed by the reward manager. If reward model "X" is non-blocking,
-            then all_rewards["X"] will be an AsyncResult object that will resolve to associated reward tensor.
-
-    Note:
-        Use the .get() method on an AsyncResult object (see Returns, above) to resolve it.
-    """
-    prompt_tokens = batch['prompt']
-
-    batch_size, _ = prompt_tokens.shape
-
-    pad_token_id = tokenizer.pad_token_id
-
-    if pad_token_id is None:
-        raise ValueError(
-            'Tokenizer does not have a pad token id. Please use a different tokenizer or add a pad token id.',
-        )
-
-    with get_precision_context(precision), torch.no_grad():
-        prompt_len = batch['prompt_len']
-        verified_answers = batch.get('verified_answer', None)
-        prompt_id = batch['prompt_id']
-        cur_device = prompt_tokens.device
-        prompt_dtype = prompt_tokens.dtype
-
-        assert 'sequences' in batch, f'sequences is not in batch {batch.keys()=}'
-
-        sequences = batch['sequences']
-        generated_len = torch.ones(
-            batch_size,
-            device=cur_device,
-            dtype=prompt_dtype,
-        ) * max_gen_len
-
-        # If all the processes early exit generate, then we need to manually pad everything
-        # we can pad this with pad tokens, since we switch the padding between left and right
-        # padding based on the sequence length + max_sequence_length.
-        if prompt_tokens.size(1) + max_gen_len > sequences.size(1):
-            len_to_pad = max_gen_len - (
-                sequences.size(1) - prompt_tokens.size(1)
-            )
-
-            extra_padding = torch.ones(
-                (batch_size, len_to_pad),
-                device=cur_device,
-                dtype=prompt_dtype,
-            ) * pad_token_id
-            sequences = torch.cat(
-                [sequences, extra_padding],  # type: ignore
-                dim=-1,  # type: ignore
-            )
-
-        # Sanity checking we're adding max_gen_len to prompt_tokens
-        if prompt_tokens.size(1) + max_gen_len != sequences.size(1):
-            raise ValueError(
-                f'Prompts {prompt_tokens.size(1)} + max_gen_len {max_gen_len} != sequences {sequences.size(1)}',
-            )
-
-        # Actions are what tokens the current policy would generate.
-        actions = sequences[:, -max_gen_len:]
-
-        right_padded_obs = switch_left_to_right_padding(
-            sequences,
-            prompt_len,
-            max_gen_len,
-            pad_token_id,  # type: ignore
-        )
-        right_padded_attn_mask = torch.logical_not(
-            torch.eq(right_padded_obs, pad_token_id),  # type: ignore
-        )
-
-        (
-            right_padded_obs,
-            right_padded_attn_mask,
-            generated_len,
-            action_mask,
-        ) = mask_eos(
-            actions=actions,
-            right_padded_obs=right_padded_obs,
-            right_padded_attn_mask=right_padded_attn_mask,
-            prompt_len=prompt_len,
-            generated_len=generated_len,
-            max_gen_len=max_gen_len,
-            eos_token_ids=eos_token_ids,  # type: ignore
-            pad_token=pad_token_id,  # type: ignore
-        )
-
-        untokenized_prompt_and_responses = []
-        for i in range(batch_size):
-            prompt = tokenizer.decode(  # type: ignore
-                right_padded_obs[i, :prompt_len[i]])
-            generated_text = tokenizer.decode(  # type:  ignore
-                get_decoded_sequence(actions[i], generated_len[i],
-                                            max_gen_len))
-            untokenized_prompt_and_responses.append((prompt, generated_text),)
-
-        # Making logits [batch_size, generated_len, vocab_size]
-        # We need to recompute the logits here. Otherwise there are numerical differences
-        # We also need to do it on the size of `device_train_microbatch_size` otherwise
-        # there are numerical differences at training time.
-        # log probs will be [batch_size, generated_len]
-        log_probs = []
-        entropies = []
-        values = []
-
-        input_model_kwargs = {
-            'obs': right_padded_obs,
-            'right_padded_attn_mask': right_padded_attn_mask,
-            'prompt_len': prompt_len,
-            'max_gen_len': max_gen_len,
-            'action_mask': action_mask,
-            'actions': actions,
-        }
-
-        microbatch_splits = _default_split_batch(
-            batch=input_model_kwargs,
-            microbatch_size=device_train_microbatch_size,
-        )
-        # Compute the device_train_microbatch_log_probs inside the for loop to reduce the softmax overhead
-        for split in microbatch_splits:
-            curr_kwargs = split
-
-            cur_output = actor_critic(curr_kwargs)
-            cur_logits = cur_output['logits']
-            # need to pull out current actions and prompt len
-            cur_actions = curr_kwargs['actions']
-            cur_action_mask = curr_kwargs['action_mask']
-            cur_prompt_len = curr_kwargs['prompt_len']
-
-            cur_log_probs = get_log_probs(
-                logits=cur_logits,
-                actions=cur_actions,
-                prompt_len=cur_prompt_len,
-                max_gen_len=max_gen_len,
-            )
-            cur_entropies = get_entropies(
-                logits=cur_logits,
-                action_mask=cur_action_mask,
-                prompt_len=cur_prompt_len,
-                max_gen_len=max_gen_len,
-            )
-            log_probs.append(cur_log_probs)
-            entropies.append(cur_entropies)
-            # Ignore values when the model doesn't have a value head
-            if 'values' in cur_output:
-                cur_values = cur_output['values']
-                values.append(cur_values)
-
-        device_train_microbatch_log_probs = torch.cat(log_probs)
-        device_train_microbatch_entropies = torch.cat(entropies)
-
-        partial_env_output = {
-            'prompt_id': prompt_id,
-            'actions': actions,
-            'old_log_probs': device_train_microbatch_log_probs,
-            'old_entropies': device_train_microbatch_entropies,
-            'obs': right_padded_obs,
-            'generated_len': generated_len,
-            'action_mask': action_mask,
-        }
-        if len(values) > 0:
-            device_train_microbatch_values = torch.cat(values)
-
-            # Need to add in the padding for the value function
-            value_action_mask = torch.cat([
-                action_mask,
-                torch.zeros((batch_size, 1), device=cur_device),
-            ],
-                                          dim=-1)
-            device_train_microbatch_values *= value_action_mask
-            partial_env_output['values'] = device_train_microbatch_values
-        # Future implementations may change the way reward_seq_len is defined
-        # e.g., if special formatting is applied
-        reward_seq_len = prompt_len + generated_len
-
-        ref_output, all_rewards = reward_manager(
-            raw_untokenized_texts=untokenized_prompt_and_responses,
-            right_padded_obses=right_padded_obs,
-            attention_masks=right_padded_attn_mask,
-            seq_lens=reward_seq_len,
-            generated_lens=generated_len,
-            prompt_lens=prompt_len,
-            max_gen_length=max_gen_len,
-            actions=actions,
-            action_log_probs=device_train_microbatch_log_probs,
-            device_train_microbatch_size=device_train_microbatch_size,
-            kl_estimator=kl_estimator,
-            kl_clip_range=kl_clip_range,
-            verified_answers=verified_answers,
-        )
-
-    return (
-        partial_env_output,
-        untokenized_prompt_and_responses,
-        ref_output,
-        all_rewards,
-    )
-
-
-class SingleControllerOnPolicyCallback(CallbackWithConfig):
+class SingleControllerOnPolicyCallback(OnPolicyCallback):
     """Callback for managing on-policy training in an RLHF loop.
 
     Args:
@@ -503,40 +264,6 @@ def init(self, state: State, logger: Logger):
         )
 
 
-    def before_load(self, state: State, logger: Logger):
-        del logger
-        self.train_prompt_loader = state.train_dataloader
-
-    def after_load(self, state: State, logger: Logger):
-        del logger  # unused
-        # This needs to be done here becuase callbacks are init'd before we attach
-        # the dataloader as a property to state
-        self.tokenizer = state.model.tokenizer
-        self.eos_token_ids = [self.tokenizer.eos_token_id]  # type: ignore
-        if self.input_eos_token_ids is not None:
-            self.eos_token_ids = self.input_eos_token_ids
-            log.info(
-                f'The online RL loop will assume the following eos token ids {self.eos_token_ids}',
-            )
-            for eos_token_id in self.eos_token_ids:
-                log.info(
-                    f'Token {eos_token_id} is {self.tokenizer.decode([eos_token_id])}.',  # type: ignore
-                )
-
-        if self.pad_token_idx in self.eos_token_ids:
-            log.warning(
-                'pad_token_id is in eos_token_ids list. Be careful with any data processing going forward!',
-            )
-
-        self.train_prompt_loader_iter = iter(
-            self.train_prompt_loader,  # pyright: ignore
-        )
-
-        if self.train_prompt_loader_state_dict is not None:
-            self.train_prompt_loader.load_state_dict( # pyright: ignore
-                self.train_prompt_loader_state_dict,
-            )
-
     def round_trip_to_inference_engines(self, device: Any, vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
         """Round trip to inference engines.
         
@@ -568,21 +295,7 @@ def iteration_start(self, state: State, logger: Logger):
         # Update IFT KL
         self._update_ift_kl()
 
-    def epoch_end(self, state: State, logger: Logger):
-        del logger  # unused
-        assert self.epochs_per_iteration == state._iteration_length
-        if self.actor_critic.determine_early_stop():  # type: ignore
-            state.timestamp.epoch_in_iteration = self.epochs_per_iteration
-
-    def iteration_end(self, state: State, logger: Logger):
-        del logger  # unused
-        self._log_generations_to_logger(state)
-        self._increment_rl_iter()
-        self.buffer.reset()
-        self.buffer.set_state_dict(
-            self.train_prompt_loader.state_dict(), # pyright: ignore
-            0,
-        )
+    # epoch_end and iteration_end methods are inherited from OnPolicyCallback
 
     def _get_next_iter_prompts(self):
         """Gets the next iteration's batch of prompts."""
@@ -642,17 +355,7 @@ def _get_next_iter_prompts(self):
 
         return ret_batch
 
-    def _get_single_batch_prompts(self):
-        """Gets a single batch of prompts from the dataloader."""
-        try:
-            return next(self.train_prompt_loader_iter)
-        except StopIteration:
-            # Reset the iterator to the beginning of the dataloader
-            self.train_prompt_loader_iter = iter(
-                self.train_prompt_loader,  # pyright: ignore
-            )
-            # Get the first sample from the dataloader
-            return next(self.train_prompt_loader_iter)
+    # _get_single_batch_prompts method is inherited from OnPolicyCallback
 
     def _interact_with_env(self, batch: dict[str, torch.Tensor], vllm_engines: list[Any]):
         """Have the policy interact with the environment.
@@ -725,35 +428,7 @@ def _get_reward(self, batch: dict[str, torch.Tensor]):
 
         self.actor_critic.train()
 
-    def _extract_minibatch(
-        self,
-        batch: dict[str, torch.Tensor],
-        idx: int,
-        minibatch_size: int,
-    ) -> dict[str, torch.Tensor]:
-        """Extracts a minibatch from a composite batch.
-
-        This helper is used to extract a particular minibatch of size
-        minibatch_size from `batch`, where `batch` may
-        have a batch size that exceeds the minibatch size.
-
-        Args:
-            batch (dict[str, torch.Tensor]): an arbitrary batch, where
-                each entry has batch size >= minibatch_size,
-                representing the concatenation of >= 1 minibatches.
-            idx (int): The index of the batch (see above description) to extract.
-
-        Returns:
-            curr_gen_batch (dict[str, torch.Tensor]): The gen_batch_idx'th
-                gen_batch extracted from the batch input.
-        """
-        start_idx = idx * minibatch_size
-        end_idx = (idx + 1) * minibatch_size
-        curr_gen_batch = {
-            batch_key: tensor[start_idx:end_idx]
-            for batch_key, tensor in batch.items()
-        }
-        return curr_gen_batch
+    # _extract_minibatch method is inherited from OnPolicyCallback
 
     def _resolve_outputs(
         self,
@@ -903,76 +578,9 @@ def _resolve_outputs(
 
         return iter_batch
 
-    def _log_generations_to_logger(self, state: State):
-        # Gather all prompts, generations, prompt_ids and rewards from all ranks
-        prompts_and_gens = list(
-            chain(*dist.all_gather_object(self.prompts_and_gens)),
-        )
-        prompt_ids_rewards_and_answers = list(
-            chain(*dist.all_gather_object(self.prompt_ids_rewards_and_answers)),
-        )
-        # Make a final list of tuple in the format: (prompt_id, reward, prompt, generation, verified_answer)
-        columns = [
-            'prompt_id',
-            'reward',
-            'prompt',
-            'generation',
-            'verified_answer',
-        ]
-        save_data = [[prompt_id, reward, prompt, generation, verified_answer]
-                     for (prompt_id, reward,
-                          verified_answer), (prompt, generation) in zip(
-                              prompt_ids_rewards_and_answers,
-                              prompts_and_gens,
-                          )]
-        # Sort the save_data by reward in descending order
-        save_data = sorted(save_data, key=lambda x: x[1], reverse=True)
-
-        if dist.get_global_rank() == 0:
-            if self.wandb_logger is not None:
-                assert wandb.run is not None, 'wandb should have started the run'
-
-                artifact = wandb.Artifact(
-                    'generate_samples_' + str(wandb.run.id),
-                    type='predictions',
-                )
-
-                text_table = wandb.Table(
-                    data=save_data,
-                    columns=columns,
-                )
-
-                artifact.add(text_table, 'predictions')
-                wandb.log_artifact(artifact)
-                wandb.log({'generations': text_table},
-                          step=state.timestamp.batch.value)
+    # _log_generations_to_logger method is inherited from OnPolicyCallback
 
-            if self.mlflow_logger is not None:
-                self.mlflow_logger.log_table(
-                    columns=columns,
-                    rows=save_data,
-                    name=f'Prompt_generations_{self.iter_num}',
-                )
-
-        self.prompts_and_gens = []
-        self.prompt_ids_rewards_and_answers = []
-
-    def _update_ift_kl(self):
-        local_kl = torch.stack(self.kl_ift)
-
-        global_ift_kl = torch.cat(dist.all_gather_object(local_kl))
-        ift_kl_update = torch.mean(global_ift_kl)
-
-        self.kl_ctl.update(
-            ift_kl_update,
-            self.num_batches_per_update * self.device_train_batch_size *
-            dist.get_world_size(),
-        )
-
-        self.kl_ift = []
-
-    def _increment_rl_iter(self):
-        self.iter_num += 1
+    # _update_ift_kl, _increment_rl_iter, state_dict, and load_state_dict methods are inherited from OnPolicyCallback
 
     def _update_inference_model(self, batch: dict[str, torch.Tensor], vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
         start_time = time.time()
@@ -988,16 +596,3 @@ def _update_inference_model(self, batch: dict[str, torch.Tensor], vllm_engines:
         log.info('Finished broadcasting to vLLM')
         log.info(f'Took: {time.time() - start_time} to broadcast to vllm.')
         dist.barrier()
-
-    def state_dict(self):
-        return {
-            'KL_ctl_state_dict': self.kl_ctl.state_dict(),
-            'iter_num': self.iter_num,
-            'train_prompt_loader':
-                self.train_prompt_loader.state_dict(),  # pyright: ignore
-        }
-
-    def load_state_dict(self, state_dict: dict[str, Any]):
-        self.kl_ctl.load_state_dict(state_dict['KL_ctl_state_dict'])
-        self.iter_num = state_dict['iter_num']
-        self.train_prompt_loader_state_dict = state_dict['train_prompt_loader']
diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
index dd083525..1cb77767 100644
--- a/test_single_controller_ppo.py
+++ b/test_single_controller_ppo.py
@@ -27,9 +27,7 @@
 from compose_rl.algorithms.online.generation_utils import create_vllm_engines
 from compose_rl.data import prompt_dataset_collate_fn
 from compose_rl.utils.ray_utils import start_ray_server
-# add a PYTHONPATH tot he tests
-# import sys
-# sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 from tests.common import (
     BaseDistributedGPUActor,
     VerifiablePromptDataset,

From dee910ba9d4e9b0930260e7cb96486ad51b19724 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 23 Jul 2025 06:16:21 +0000
Subject: [PATCH 070/107] another rm

---
 .../online/single_controller_callback.py      | 165 +-----------------
 1 file changed, 1 insertion(+), 164 deletions(-)

diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index e3e61cbf..337a1f69 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -7,23 +7,18 @@
 
 import logging
 import time
-from itertools import chain
-from typing import Any, Optional, Union
+from typing import Any, Union
 
 import torch
-import wandb
 from composer.core import (
-    Precision,
     State,
     TimeUnit,
     ensure_time,
     get_precision_context,
 )
-from composer.core.data_spec import _default_split_batch
 from composer.loggers import Logger, MLFlowLogger, WandBLogger
 from composer.trainer.trainer import _get_initial_device_train_microbatch_size
 from composer.utils import dist, ensure_tuple
-from llmfoundry.interfaces import CallbackWithConfig
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
 from compose_rl.algorithms.online.generation_utils import (
@@ -48,13 +43,8 @@
     compute_advantages,
     dist_compute_masked_mean_and_var,
     flatten,
-    get_decoded_sequence,
-    get_entropies,
-    get_log_probs,
-    mask_eos,
     masked_mean,
     masked_sum,
-    switch_left_to_right_padding,
 )
 
 # Import the base class
@@ -428,159 +418,6 @@ def _get_reward(self, batch: dict[str, torch.Tensor]):
 
         self.actor_critic.train()
 
-    # _extract_minibatch method is inherited from OnPolicyCallback
-
-    def _resolve_outputs(
-        self,
-        iter_batch: dict[str, torch.Tensor],
-        partial_outputs: tuple[dict, ReferenceOutput, RewardOutput],
-    ) -> dict[str, torch.Tensor]:
-        """Resolve env/reference/reward outputs into a PPO minibatch.
-
-        Args:
-            iter_batch (dict): The batch for the current iteration.
-            partial_outputs (tuple): A tuple of (env_output, reference_output, reward_output),
-                one tuple for entire ppo iter batch. This tuple is created from `env_reward`.
-
-        Returns:
-            output_minibatch (dict): The final minibatch from the environment, with all AsyncResult
-                objects resolved and outputs processed for PPO training.
-        """
-        env_outs, ref_outs, rew_dict = partial_outputs
-        rew_outs = self.reward_manager.resolve_outputs(
-            ref_output=ref_outs,
-            reward_output=rew_dict,
-            kl_ctl=self.kl_ctl,
-            action_mask=env_outs['action_mask'],
-            center_reward_mean=self.center_reward_mean,
-        )
-        env_outs.update(rew_outs)
-
-        # Keep track of prompt ids, rewards and verified answers for logging
-        prompt_ids = env_outs['prompt_id'].detach().cpu().tolist()
-        rewards = env_outs['rewards'].sum(dim=-1).detach().cpu().tolist()
-        self.prompt_ids_rewards_and_answers.extend(
-            list(zip(prompt_ids, rewards, iter_batch['verified_answer'])),
-        )
-
-        # Adding the right_padded_attn_mask to the env_outputs
-        env_outs['right_padded_attn_mask'] = torch.logical_not(
-            torch.eq(env_outs['obs'], self.pad_token_idx),  # type: ignore
-        )
-
-        # Now that rewards are resolved, we can compute advantages
-        if self.actor_critic.loss_type == OnPolicyEnum.PPO:
-            env_outs['advantages'] = compute_advantages(
-                rewards=env_outs['rewards'],
-                values=env_outs['values'],
-                gamma=self.gamma,
-                lambda_gae=self.lambda_gae,
-            )
-        elif self.actor_critic.loss_type == OnPolicyEnum.GRPO:
-            # compute GRPO advantages
-            prompt_id = env_outs['prompt_id']
-            rewards = env_outs['rewards']
-
-            # Flatten the rewards by summing on sequence length/action_mask
-            flat_rewards = masked_sum(
-                rewards,
-                env_outs['action_mask'],
-                dim=-1,
-            )
-
-            # Get unique prompt IDs and their indices
-            unique_prompt_ids, inverse_indices = torch.unique(
-                prompt_id,
-                return_inverse=True,
-            )
-
-            # Use scatter to compute means and standard deviations
-            # First, we'll create a tensor to track counts, sums, and sum of squares
-            n_unique = len(unique_prompt_ids)
-            counts = torch.zeros(n_unique, device=prompt_id.device)
-            sums = torch.zeros(n_unique, device=prompt_id.device)
-            sum_squares = torch.zeros(n_unique, device=prompt_id.device)
-
-            # Use scatter_add to accumulate values
-            counts.scatter_add_(
-                0,
-                inverse_indices,
-                torch.ones_like(flat_rewards),
-            )
-            sums.scatter_add_(0, inverse_indices, flat_rewards)
-            sum_squares.scatter_add_(0, inverse_indices, flat_rewards**2)
-
-            # Compute means and standard deviations
-            means = sums / counts
-            variances = (sum_squares / counts) - (means**2)
-            stds = torch.sqrt(variances)
-
-            # Map back to original tensor shape
-            mean_rewards = means[inverse_indices]
-            std_rewards = stds[inverse_indices]
-
-            # Calculate GRPO advantage
-            grpo_advantage = (flat_rewards - mean_rewards)
-            # Only normalize the advantage if flag is set
-            if self.actor_critic.normalize_advantage:
-                grpo_advantage /= (std_rewards + 1e-4)
-
-            # Create advantages of the same shape as original rewards
-            advantages = torch.zeros_like(rewards)
-            # Copy the flat grpo_advantage according to action_mask
-            expanded_advantages = grpo_advantage.unsqueeze(1).expand_as(
-                env_outs['action_mask'],
-            )
-            advantages = torch.where(
-                env_outs['action_mask'].bool(),
-                expanded_advantages,
-                advantages,
-            )
-            env_outs['advantages'] = advantages
-        else:
-            raise ValueError(
-                f'Invalid loss type: {self.actor_critic.loss_type}. ' +
-                'Valid options are: ppo, grpo.',
-            )
-
-        batch_adv_mean, batch_adv_var = dist_compute_masked_mean_and_var(
-            env_outs['advantages'],
-            env_outs['action_mask'],
-        )
-
-        mean_ift = masked_mean(
-            env_outs['ift_kl'],
-            env_outs['action_mask'],
-        )
-        self.kl_ift.append(mean_ift.cpu())
-
-        iter_batch.update(env_outs)
-
-        iter_batch.update({
-            'max_gen_len':
-                torch.ones(self.iter_batch_size).to(torch.int32) *
-                self.max_gen_len,
-            'adv_masked_mean':
-                torch.ones(self.iter_batch_size) * batch_adv_mean.cpu(),
-            'adv_masked_var':
-                torch.ones(self.iter_batch_size) * batch_adv_var.cpu(),
-            'ift_kl_scalar':
-                torch.ones(self.iter_batch_size) * self.kl_ctl.value,
-            'reward_std':
-                torch.ones(self.iter_batch_size) *
-                env_outs['rewards'].std().to('cpu'),
-        })
-
-        # Moving minibatches to CPU to not take additional GPU memory
-        for k, v in iter_batch.items():
-            if hasattr(v, 'cpu'):
-                iter_batch[k] = v.cpu()
-
-        return iter_batch
-
-    # _log_generations_to_logger method is inherited from OnPolicyCallback
-
-    # _update_ift_kl, _increment_rl_iter, state_dict, and load_state_dict methods are inherited from OnPolicyCallback
 
     def _update_inference_model(self, batch: dict[str, torch.Tensor], vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
         start_time = time.time()

From 1e0b07aba9de234bf8ff2760a37b7b75706cda58 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 23 Jul 2025 08:24:17 +0000
Subject: [PATCH 071/107] train again; trim down

---
 compose_rl/algorithms/online/callback.py      |  14 +-
 .../online/single_controller_callback.py      | 324 +-----------------
 test_single_controller_ppo.py                 |  21 +-
 3 files changed, 26 insertions(+), 333 deletions(-)

diff --git a/compose_rl/algorithms/online/callback.py b/compose_rl/algorithms/online/callback.py
index d32b5795..1a97eb0a 100644
--- a/compose_rl/algorithms/online/callback.py
+++ b/compose_rl/algorithms/online/callback.py
@@ -764,7 +764,6 @@ def _interact_with_env(self, batch: dict[str, torch.Tensor]):
                 # When we hit this function, we should already have all the prompts we need per iteration.
                 num_gen_calls = bs // self.device_generate_batch_size
 
-                gen_batch_partial_outputs = []
                 all_sequences = []
                 for i in range(num_gen_calls):
                     gen_batch = self._extract_minibatch(
@@ -796,6 +795,15 @@ def _interact_with_env(self, batch: dict[str, torch.Tensor]):
         # Add the prepared sequences to the batch again
         batch['sequences'] = sequences
 
+        # Compute rewards and populate buffer
+        self._get_reward(batch)
+
+    def _get_reward(self, batch: dict[str, torch.Tensor]):
+        """Compute rewards for a batch of generated sequences.
+        
+        Args:
+            batch (dict): The batch containing generated sequences to compute rewards for.
+        """
         env_outputs, prompts_and_gens, ref_outputs, all_rewards_dict = env_reward(
             actor_critic=self.actor_critic,  # pyright: ignore
             reward_manager=self.reward_manager,
@@ -825,7 +833,7 @@ def _interact_with_env(self, batch: dict[str, torch.Tensor]):
                 del resolved_outputs[key]
 
         # We need to split the resolved outputs into minibatches
-        for idx in range(bs // self.device_train_batch_size):
+        for idx in range(batch['prompt_id'].shape[0] // self.device_train_batch_size):
             minibatch = self._extract_minibatch(
                 resolved_outputs,
                 idx,
@@ -834,7 +842,7 @@ def _interact_with_env(self, batch: dict[str, torch.Tensor]):
             self.buffer.add(minibatch)
 
         # Making sure we correctly parsed the minibatches
-        assert len(self.buffer) == self.num_batches_per_update
+        assert len(self.buffer) == self.num_batches_per_update, f'{len(self.buffer)} != {self.num_batches_per_update}'
 
         self.actor_critic.train()
 
diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index 337a1f69..94ed62e2 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -12,13 +12,11 @@
 import torch
 from composer.core import (
     State,
-    TimeUnit,
-    ensure_time,
     get_precision_context,
 )
-from composer.loggers import Logger, MLFlowLogger, WandBLogger
+from composer.loggers import Logger
 from composer.trainer.trainer import _get_initial_device_train_microbatch_size
-from composer.utils import dist, ensure_tuple
+from composer.utils import dist
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
 from compose_rl.algorithms.online.generation_utils import (
@@ -29,31 +27,14 @@
     ComposerHFPolicyLM,
     ComposerMPTPolicyLM,
 )
-from compose_rl.algorithms.online.model_methods import (
-    OnPolicyEnum,
-)
-from compose_rl.algorithms.online.reward_manager import (
-    ReferenceOutput,
-    RewardManager,
-    RewardOutput,
-)
-from compose_rl.data.buffer import MinibatchRolloutBuffer
-from compose_rl.registry_builders import build_kl_controller
-from compose_rl.utils import (
-    compute_advantages,
-    dist_compute_masked_mean_and_var,
-    flatten,
-    masked_mean,
-    masked_sum,
-)
 
 # Import the base class
-from compose_rl.algorithms.online.callback import OnPolicyCallback, env_reward
+from compose_rl.algorithms.online.callback import OnPolicyCallback
 
 Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
 Policy = Union[ComposerHFPolicyLM, ComposerMPTPolicyLM]
 
-__all__ = ['SingleControllerOnPolicyCallback', 'env_reward']
+__all__ = ['SingleControllerOnPolicyCallback']
 
 log = logging.getLogger(__name__)
 
@@ -66,194 +47,6 @@ class SingleControllerOnPolicyCallback(OnPolicyCallback):
             callback is registered under callbacks_with_config registry.
     """
 
-    def __init__(
-        self,
-        train_config: dict,
-    ):
-        var_config = train_config['variables']
-
-        # The maximum generation length.
-        self.max_gen_len: int = var_config.get('max_gen_len', 32)
-        # Gamma discounting for computing returns.
-        self.gamma = var_config.get('gamma', 1.0)
-        # Value used in the generalized advantage estimate calculation.
-        self.lambda_gae = var_config.get('lambda_gae', 1.0)
-
-        # Other algo specific hparams
-
-        # Which kl estimator to use
-        if 'kl_estimator' not in train_config['model']:
-            # TODO: Modify PPO to nuke config_overrides in the future
-            # Check in model's config_overrides
-            kl_estimator = train_config['model']['config_overrides'].get(
-                'kl_estimator',
-                'k1',
-            )
-        else:
-            kl_estimator = train_config['model'].get('kl_estimator', 'k1')
-        if kl_estimator not in ['k1', 'k2', 'k3', 'k3_offpolicy']:
-            raise ValueError(
-                f'Invalid kl estimator: {kl_estimator}. ' +
-                'Valid options are: k1, k2, k3, k3_offpolicy.',
-            )
-        self.kl_estimator = kl_estimator
-
-        if 'kl_clip_range' not in train_config['model']:
-            # TODO: Modify PPO to nuke config_overrides in the future
-            # Check in model's config_overrides
-            kl_clip_range = train_config['model']['config_overrides'].get(
-                'kl_clip_range',
-                40.0,
-            )
-        else:
-            kl_clip_range = train_config['model'].get('kl_clip_range', 40.0)
-        if kl_clip_range <= 0:
-            raise ValueError(
-                f'Invalid kl clip range: {kl_clip_range}. ' +
-                'Must be greater than 0.',
-            )
-        # check for precision and clip range
-        precision = train_config['precision']
-        if precision != 'fp32':
-            if kl_clip_range > 50.0:
-                log.warning(
-                    f'Clip value of {kl_clip_range=} will not be effective with {precision=} as range for tensors is too small',
-                )
-        self.kl_clip_range = kl_clip_range
-
-        # Generation keyword arguments.
-        self.generation_kwargs = var_config.get('generation_kwargs')
-        # The value to center the reward mean around.
-        self.center_reward_mean = var_config.get('center_reward_mean', None)
-
-        # The reward config which we will use to make the RewardManager.
-        self.reward_cfg = var_config['rewards']
-        self.max_seq_len = train_config['max_seq_len']
-        self.non_train_fsdp_config = var_config.get(
-            'non_train_fsdp_config',
-            train_config['fsdp_config'],
-        )
-        self.ref_config = var_config['reference_model']
-
-        # Per-device generate size.
-        self.device_generate_batch_size: int = var_config.get(
-            'device_generate_batch_size',
-            1,
-        )
-        self.device_train_batch_size: int = train_config.get(
-            'device_train_batch_size',
-            None,
-        )
-        assert self.device_train_batch_size is not None
-
-        # Number of batches to use for a single PPO epoch.
-        self.num_batches_per_update = var_config.get(
-            'num_batches_per_update',
-            1,
-        )
-        # Number of generations per prompt for a single PPO epoch.
-        self.generations_per_prompt: int = var_config.get(
-            'generations_per_prompt',
-            1,
-        )
-
-        if self.num_batches_per_update % self.generations_per_prompt != 0:
-            raise ValueError(
-                f'{self.num_batches_per_update=} must be divisible by {self.generations_per_prompt=}',
-            )
-
-        self.epochs_per_iteration = ensure_time(
-            var_config.get('epoch_per_iteration', 1),
-            TimeUnit.EPOCH,
-        )
-        assert self.epochs_per_iteration.unit == TimeUnit.EPOCH
-
-        # Programmatically setting the max buffer size instead of the yaml
-        var_config['buffer']['max_buffer_size'] = self.num_batches_per_update
-        self.buffer = MinibatchRolloutBuffer(var_config['buffer'])
-
-        # Build the KL controller through registries
-        kl_ctl_name = var_config['kl_controller'].pop('kl_ctl_type')
-        self.kl_ctl = build_kl_controller(
-            name=kl_ctl_name,
-            kwargs=var_config['kl_controller'],
-        )
-
-        self.kl_ift = []
-
-        self.wandb_logger = None
-        self.mlflow_logger = None
-        self.prompts_and_gens = []
-        self.prompt_ids_rewards_and_answers = []
-        self.iter_num = 0
-        self.train_prompt_loader_state_dict = None
-        self.train_prompt_loader = None
-
-        self.input_eos_token_ids = var_config.get('eos_token_ids', None)
-
-        if train_config.get('python_log_level', None) is not None:
-            logging.getLogger('compose_rl').setLevel(
-                train_config['python_log_level'].upper(),
-            )
-            logging.getLogger(__name__).setLevel(
-                train_config['python_log_level'].upper(),
-            )
-
-        self.batch_rollouts = None
-
-
-    def init(self, state: State, logger: Logger):
-        self.pad_token_idx = state.model.tokenizer.pad_token_id  # type: ignore
-        self.actor_critic = state.model
-
-        if self.actor_critic.loss_type == OnPolicyEnum.GRPO:
-            assert self.generations_per_prompt > 1, \
-                'GRPO requires multiple generations per prompt. ' + \
-                f'Current generations_per_prompt is: {self.generations_per_prompt}.'
-
-        # TODO (#158): do this through composer.
-        for destination in ensure_tuple(logger.destinations):
-            if isinstance(destination, WandBLogger):
-                self.wandb_logger = destination
-            if isinstance(destination, MLFlowLogger):
-                self.mlflow_logger = destination
-
-        # Set iteration_length
-        state._iteration_length = self.epochs_per_iteration
-
-        self.precision = state.precision
-        self.device_train_microbatch_size: int = state.device_train_microbatch_size  # type: ignore
-        if self.device_train_microbatch_size == 'auto':  # type: ignore
-            raise ValueError('auto microbatching is not supported for PPO')
-
-        self.iter_batch_size = self.num_batches_per_update * self.device_train_batch_size
-
-        # The KL penalty in the reward should only exist if we aren't minimizing
-        # the KL directly in the loss.
-        kl_penalty_in_reward = True
-
-        if hasattr(self.actor_critic, 'compute_kl_loss'):
-            kl_penalty_in_reward = not self.actor_critic.compute_kl_loss
-
-        self.reward_manager = RewardManager(
-            config=self.reward_cfg,
-            ref_config=self.ref_config,
-            tokenizer=self.actor_critic.tokenizer, # type: ignore
-            max_seq_len=self.max_seq_len,
-            fsdp_config=self.non_train_fsdp_config,
-            precision=state.precision,
-            kl_penalty_in_reward=kl_penalty_in_reward,
-        )
-
-        # This is needed to ensure PyTorch 2.4 checkpointing doesn't break
-        self.actor_critic.tokenizer.batch_encode_plus( # type: ignore
-            batch_text_or_text_pairs=['Dummy input'],
-            padding='longest',
-            truncation=True,
-            return_attention_mask=True,
-        )
-
-
     def round_trip_to_inference_engines(self, device: Any, vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
         """Round trip to inference engines.
         
@@ -285,68 +78,6 @@ def iteration_start(self, state: State, logger: Logger):
         # Update IFT KL
         self._update_ift_kl()
 
-    # epoch_end and iteration_end methods are inherited from OnPolicyCallback
-
-    def _get_next_iter_prompts(self):
-        """Gets the next iteration's batch of prompts."""
-        # Sample fewer batches for the Online RL interation depending on the number of generations per prompt
-        n_unique_batches = self.num_batches_per_update // self.generations_per_prompt
-        batches = [
-            self._get_single_batch_prompts() for _ in range(n_unique_batches)
-        ]
-
-        ret_batch = {}
-        assert 'prompt_id' in batches[0], 'prompt_id must be in the batch'
-        for key in batches[0].keys():
-            curr_values = []
-
-            max_len = 0
-            if isinstance(batches[0][key], torch.Tensor):
-                max_len = max([batch[key].shape[-1] for batch in batches])
-
-            padding_key = None
-            for batch in batches:
-                # Explode the batch into multiple batches for each generation
-                for _ in range(self.generations_per_prompt):
-                    # For keys that do not require additional processing
-                    if key in ['prompt_len', 'verified_answer', 'prompt_id']:
-                        curr_values.append(batch[key])
-                        continue
-
-                    bs, seq_len = batch[key].shape
-
-                    if key == 'prompt':
-                        padding_key = self.pad_token_idx
-                        if (batch[key][:, -1] == padding_key).any():
-                            raise ValueError(
-                                'The last token in the prompt should not be the pad token. Please double '
-                                +
-                                'check the dataloader and prompt and dataloader.',
-                            )
-                    elif key == 'prompt_attention_mask':
-                        padding_key = False
-
-                    # Compute the required padding and concatenate with the batch tensor
-                    pad = torch.ones(
-                        (bs, max_len - seq_len),
-                        dtype=batch[key].dtype,
-                    ) * padding_key  # type: ignore
-                    curr_values.append(torch.cat([pad, batch[key]], dim=-1))
-
-            # For tensor fields, use torch.cat to combine the values; for string fields, just use the list
-            if isinstance(curr_values[0], torch.Tensor):
-                ret_batch[key] = torch.cat(curr_values)
-            else:
-                if key == 'verified_answer':
-                    ret_batch[key] = list(flatten(curr_values))
-                else:
-                    # this is an edge case that we will not hit currently, but just handling it as needed
-                    ret_batch[key] = curr_values
-
-        return ret_batch
-
-    # _get_single_batch_prompts method is inherited from OnPolicyCallback
-
     def _interact_with_env(self, batch: dict[str, torch.Tensor], vllm_engines: list[Any]):
         """Have the policy interact with the environment.
 
@@ -372,52 +103,6 @@ def _interact_with_env(self, batch: dict[str, torch.Tensor], vllm_engines: list[
         batch['sequences'] = sequences
         return batch
 
-    def _get_reward(self, batch: dict[str, torch.Tensor]):
-        log.debug('Beginning reward computation for the rollout.')
-        start_reward_time = time.time()
-        env_outputs, prompts_and_gens, ref_outputs, all_rewards_dict = env_reward(
-            actor_critic=self.actor_critic,  # pyright: ignore
-            reward_manager=self.reward_manager,
-            batch=batch,
-            max_gen_len=self.max_gen_len,
-            precision=self.precision,
-            device_train_microbatch_size=self.device_train_microbatch_size,
-            tokenizer=self.tokenizer,  # type: ignore
-            eos_token_ids=self.eos_token_ids,  # type: ignore
-            kl_estimator=self.kl_estimator,
-            kl_clip_range=self.kl_clip_range,
-        )
-
-        end_reward_time = time.time()
-        total_reward_time = end_reward_time - start_reward_time
-        log.debug(
-            f'Finished reward computation for the rollout in {total_reward_time:.4f} seconds.',
-        )
-
-        self.prompts_and_gens.extend(prompts_and_gens)
-
-        gen_batch_partial_outputs = (env_outputs, ref_outputs, all_rewards_dict)
-        # For every partial output we want to resolve them together
-        # And compute the global per iteration batch advantage's mean and variance
-        resolved_outputs = self._resolve_outputs(
-            batch,
-            gen_batch_partial_outputs,
-        )
-
-        # We need to split the resolved outputs into minibatches
-        for idx in range(self.iter_batch_size // self.device_train_batch_size):
-            minibatch = self._extract_minibatch(
-                resolved_outputs,
-                idx,
-                self.device_train_batch_size,
-            )
-            self.buffer.add(minibatch)
-
-        # Making sure we correctly parsed the minibatches
-        assert len(self.buffer) == self.num_batches_per_update
-
-        self.actor_critic.train()
-
 
     def _update_inference_model(self, batch: dict[str, torch.Tensor], vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
         start_time = time.time()
@@ -427,7 +112,6 @@ def _update_inference_model(self, batch: dict[str, torch.Tensor], vllm_engines:
             vllm_engines,
             model_update_group,
             batch,
-            #loss_type=self.actor_critic.loss_type.value,  # type: ignore
             loss_type=self.actor_critic.loss_type,  # type: ignore
         )
         log.info('Finished broadcasting to vLLM')
diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
index 1cb77767..835e0881 100644
--- a/test_single_controller_ppo.py
+++ b/test_single_controller_ppo.py
@@ -57,6 +57,9 @@ def __init__(self,
         self.ppo_callback = None
         self.ppo_trainer: Trainer = None
 
+        self.device_train_batch_size = 4
+        self.num_batches_per_update = 2
+
     def build_dataloader(self):
         max_seq_len = 32
         prompt_len = 10
@@ -70,7 +73,7 @@ def build_dataloader(self):
                 max_seq_len,
             ),
             sampler=composer_dist.get_sampler(dataset),
-            batch_size=4,
+            batch_size=self.device_train_batch_size,
         )
         # We need to mock this method, since our dataset isn't a StreamingDataset
         dataloader.state_dict = lambda: {}
@@ -137,7 +140,7 @@ def build_ref_model(self, pretrain_model_name: str):
             parallelism_config={'fsdp': self.fsdp_config},
             save_folder=tmp_ref_path,
             save_weights_only=True,
-            device_train_microbatch_size=2,
+            device_train_microbatch_size=self.device_train_microbatch_size,
         )
 
         temp_trainer.fit()
@@ -155,15 +158,13 @@ def build_ppo_trainer(self, pretrain_model_name: str):
 
         optimizer = DecoupledAdamW(model.parameters(), lr=1e-8)
 
-        num_batches_per_update = 2
-
         # ref_model_config = copy.deepcopy(self.model_config)
         ref_model_config = {**self.model_config, 'name': 'hf_causal_lm'}
 
         variables = {
             'buffer': {
                 'name': 'MinibatchRolloutBuffer',
-                'max_buffer_size': num_batches_per_update,
+                'max_buffer_size': self.num_batches_per_update,
             },
             'max_gen_len': 8,
             'gamma': 0.99,
@@ -171,6 +172,7 @@ def build_ppo_trainer(self, pretrain_model_name: str):
             'generation_kwargs': {
                 'use_cache': True,
                 'do_sample': False,
+                'temperature': 1.0,
             },
             'kl_controller': {
                 'init_kl_coef': 0.2,
@@ -184,9 +186,8 @@ def build_ppo_trainer(self, pretrain_model_name: str):
                 'load_path': self.ref_path,
                 'non_train_fsdp_config': self.fsdp_config,
             },
-            'device_generate_batch_size': 2,
             'epoch_per_iteration': 1,
-            'num_batches_per_update': num_batches_per_update,
+            'num_batches_per_update': self.num_batches_per_update,
             'rewards': {
                 'output_length': {
                     'reward_type': 'output_length',
@@ -201,9 +202,9 @@ def build_ppo_trainer(self, pretrain_model_name: str):
             'precision': precision,
             'variables': variables,
             'max_seq_len': max_seq_len,
-            'global_train_batch_size': 2,
-            'device_train_batch_size': 2,
-            'device_train_microbatch_size': 1,
+            'global_train_batch_size': self.device_train_batch_size * self.world_size,
+            'device_train_batch_size': self.device_train_batch_size,
+            'device_train_microbatch_size': self.device_train_batch_size,
         }
 
         self.ppo_callback = SingleControllerOnPolicyCallback(train_config=train_config)

From b23b80e1d91be611905b737bea105a00423cb88d Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 23 Jul 2025 08:27:30 +0000
Subject: [PATCH 072/107] Re-enable _update_inference_model call in round_trip_to_inference_engines

---
 compose_rl/algorithms/online/single_controller_callback.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index 94ed62e2..e0e4967f 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -54,7 +54,7 @@ def round_trip_to_inference_engines(self, device: Any, vllm_engines: list[Any],
             vllm_engines (list[Any]): The vllm engines to round trip to.
         """
         batch = device.batch_to_device(self._get_next_iter_prompts())
-        # self._update_inference_model(batch, vllm_engines, model_update_group)
+        self._update_inference_model(batch, vllm_engines, model_update_group)
         self.batch_rollouts = self._interact_with_env(batch, vllm_engines)
 
     def iteration_start(self, state: State, logger: Logger):

From 2f8c9460845dc04fa8c5e4b9c6aad294b0844b90 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 23 Jul 2025 08:38:54 +0000
Subject: [PATCH 073/107] share method

---
 compose_rl/algorithms/online/callback.py       |  9 ++++-----
 .../online/single_controller_callback.py       | 18 +-----------------
 2 files changed, 5 insertions(+), 22 deletions(-)

diff --git a/compose_rl/algorithms/online/callback.py b/compose_rl/algorithms/online/callback.py
index 1a97eb0a..f013e599 100644
--- a/compose_rl/algorithms/online/callback.py
+++ b/compose_rl/algorithms/online/callback.py
@@ -623,7 +623,7 @@ def iteration_start(self, state: State, logger: Logger):
         batch = state.device.batch_to_device(batch)
 
         if self.vllm_engines is not None:
-            self._update_inference_model(batch)
+            self._update_inference_model(batch, self.vllm_engines, self.model_update_group)
 
         self._interact_with_env(batch)
         # Reset and initialize state train dataloader
@@ -1149,14 +1149,13 @@ def _create_vllm_engines(self):
         dist.barrier()
         log.info('All ranks have completed the vLLM engine create function.')
 
-    def _update_inference_model(self, batch: dict[str, torch.Tensor]):
+    def _update_inference_model(self, batch: dict[str, torch.Tensor], vllm_engines: list, model_update_group: dist.ProcessGroup):
         start_time = time.time()
         log.info('Before broadcast to vLLM')
-        assert self.vllm_engines is not None
         broadcast_to_vllm(
             model=self.actor_critic,
-            vllm_engines=self.vllm_engines,
-            model_update_group=self.model_update_group,
+            vllm_engines=vllm_engines,
+            model_update_group=model_update_group,
             batch=batch,
             loss_type=self.actor_critic.loss_type,  # type: ignore
             enable_prefix_caching=self.vllm_enable_prefix_caching,
diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index e0e4967f..5490f4bc 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -20,7 +20,6 @@
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
 from compose_rl.algorithms.online.generation_utils import (
-    broadcast_to_vllm,
     vllm_generate,
 )
 from compose_rl.algorithms.online.model import (
@@ -101,19 +100,4 @@ def _interact_with_env(self, batch: dict[str, torch.Tensor], vllm_engines: list[
             )
         # Add the prepared sequences to the batch again
         batch['sequences'] = sequences
-        return batch
-
-
-    def _update_inference_model(self, batch: dict[str, torch.Tensor], vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
-        start_time = time.time()
-        log.info('Before broadcast to vLLM')
-        broadcast_to_vllm(
-            self.actor_critic,
-            vllm_engines,
-            model_update_group,
-            batch,
-            loss_type=self.actor_critic.loss_type,  # type: ignore
-        )
-        log.info('Finished broadcasting to vLLM')
-        log.info(f'Took: {time.time() - start_time} to broadcast to vllm.')
-        dist.barrier()
+        return batch
\ No newline at end of file

From 2966ccf06d7be2b035775788eeaba8f14c97907c Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 23 Jul 2025 08:38:59 +0000
Subject: [PATCH 074/107] Revert "share method"

This reverts commit 2f8c9460845dc04fa8c5e4b9c6aad294b0844b90.
---
 compose_rl/algorithms/online/callback.py       |  9 +++++----
 .../online/single_controller_callback.py       | 18 +++++++++++++++++-
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/compose_rl/algorithms/online/callback.py b/compose_rl/algorithms/online/callback.py
index f013e599..1a97eb0a 100644
--- a/compose_rl/algorithms/online/callback.py
+++ b/compose_rl/algorithms/online/callback.py
@@ -623,7 +623,7 @@ def iteration_start(self, state: State, logger: Logger):
         batch = state.device.batch_to_device(batch)
 
         if self.vllm_engines is not None:
-            self._update_inference_model(batch, self.vllm_engines, self.model_update_group)
+            self._update_inference_model(batch)
 
         self._interact_with_env(batch)
         # Reset and initialize state train dataloader
@@ -1149,13 +1149,14 @@ def _create_vllm_engines(self):
         dist.barrier()
         log.info('All ranks have completed the vLLM engine create function.')
 
-    def _update_inference_model(self, batch: dict[str, torch.Tensor], vllm_engines: list, model_update_group: dist.ProcessGroup):
+    def _update_inference_model(self, batch: dict[str, torch.Tensor]):
         start_time = time.time()
         log.info('Before broadcast to vLLM')
+        assert self.vllm_engines is not None
         broadcast_to_vllm(
             model=self.actor_critic,
-            vllm_engines=vllm_engines,
-            model_update_group=model_update_group,
+            vllm_engines=self.vllm_engines,
+            model_update_group=self.model_update_group,
             batch=batch,
             loss_type=self.actor_critic.loss_type,  # type: ignore
             enable_prefix_caching=self.vllm_enable_prefix_caching,
diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index 5490f4bc..e0e4967f 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -20,6 +20,7 @@
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
 from compose_rl.algorithms.online.generation_utils import (
+    broadcast_to_vllm,
     vllm_generate,
 )
 from compose_rl.algorithms.online.model import (
@@ -100,4 +101,19 @@ def _interact_with_env(self, batch: dict[str, torch.Tensor], vllm_engines: list[
             )
         # Add the prepared sequences to the batch again
         batch['sequences'] = sequences
-        return batch
\ No newline at end of file
+        return batch
+
+
+    def _update_inference_model(self, batch: dict[str, torch.Tensor], vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
+        start_time = time.time()
+        log.info('Before broadcast to vLLM')
+        broadcast_to_vllm(
+            self.actor_critic,
+            vllm_engines,
+            model_update_group,
+            batch,
+            loss_type=self.actor_critic.loss_type,  # type: ignore
+        )
+        log.info('Finished broadcasting to vLLM')
+        log.info(f'Took: {time.time() - start_time} to broadcast to vllm.')
+        dist.barrier()

From fb9822546cc1423cf211f7afd4325d485b1e1ce3 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Wed, 23 Jul 2025 08:40:29 +0000
Subject: [PATCH 075/107] Remove tests/test_torch_ray_distributed.py and vllm_test/test_vllm.py

---
 tests/test_torch_ray_distributed.py | 501 ----------------------------
 vllm_test/test_vllm.py              |  32 --
 2 files changed, 533 deletions(-)
 delete mode 100644 tests/test_torch_ray_distributed.py
 delete mode 100644 vllm_test/test_vllm.py

diff --git a/tests/test_torch_ray_distributed.py b/tests/test_torch_ray_distributed.py
deleted file mode 100644
index 0f6ca384..00000000
--- a/tests/test_torch_ray_distributed.py
+++ /dev/null
@@ -1,501 +0,0 @@
-import ray
-import torch
-import torch.distributed as dist
-import os
-import socket
-import subprocess
-import time
-from contextlib import contextmanager
-from typing import Optional, Tuple
-import argparse
-from datetime import timedelta
-
-from functools import partial
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from composer.utils import dist as composer_dist
-from composer import Trainer
-from composer.optim import DecoupledAdamW
-from llmfoundry.models import ComposerHFCausalLM
-from torch.utils.data import DataLoader
-
-from compose_rl.algorithms.online import (
-    ComposerHFPolicyLM,
-    SingleControllerOnPolicyCallback,
-)
-from compose_rl.data import prompt_dataset_collate_fn
-from tests.common import VerifiablePromptDataset
-
-from compose_rl.algorithms.online.generation_utils import init_process_group, create_vllm_engines
-
-from typing import Any
-
-
-def ray_noset_visible_devices():
-    return os.environ.get('RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES', '0') == '1'
-
-
-# 1. how to enable tests with llama 3.2 1b
-# 2. why is there authorization issue with composer wrapper
-
-
-def init_ray():
-    # init ray on master node, rank 0
-    if dist.get_rank() == 0:
-        # Start head node
-        subprocess.run(['ray', 'start', '--head'], check=True)
-        ray.init('auto')
-        # get existing ray ip and port 
-        ctx = ray.get_runtime_context()
-        address = ctx.gcs_address
-        print(f'available gpus: {ray.available_resources()}')
-    else:
-        address = ''
-    address_list = [address]
-    # broadcast address to all other ranks
-    dist.broadcast_object_list(address_list, src=0)
-    if dist.get_rank() != 0 and os.environ.get('LOCAL_RANK', None) == '0':
-        address = address_list[0]
-        print(f'rank: {dist.get_rank()} connecting to address: {address}')
-        subprocess.run(['ray', 'start', f'--address={address}'], check=True)
-    dist.barrier()
-    if dist.get_rank() == 0:
-        # wait until num of gpus reach world_size
-        num_gpus = int(ray.cluster_resources()['GPU'])
-        counter = 0
-        while num_gpus < dist.get_world_size():
-            print(f'waiting for {dist.get_world_size() - num_gpus} gpus to be available')
-            num_gpus = int(ray.cluster_resources()['GPU'])
-            time.sleep(5)
-            counter += 1
-            if counter > 4:
-                raise RuntimeError(f'Failed to start {dist.get_world_size()} gpus')
-        print(f'Total available gpus: {ray.available_resources()}')
-    return address
-
-
-@ray.remote(num_gpus=1)
-class DistributedGPUActor:
-    def __init__(self, rank: int, world_size: int, master_addr: Optional[str] = None, master_port: Optional[int] = None):
-        """Initialize the distributed GPU actor.
-        
-        Args:
-            rank: The rank of this process in the distributed group
-            world_size: Total number of processes in the distributed group
-            master_addr: Master node address. If None, will allocate dynamically for rank 0
-            master_port: Master node port. If None, will allocate dynamically for rank 0
-        """
-        self.rank = rank
-        self.world_size = world_size
-        self.master_addr = master_addr
-        self.master_port = master_port
-        
-        # Set up basic environment variables
-        os.environ["WORLD_SIZE"] = str(world_size)
-        os.environ["RANK"] = str(rank)
-        
-        # Set LOCAL_RANK based on Ray GPU allocation
-        os.environ["LOCAL_RANK"] = str(ray.get_gpu_ids()[0]) if ray_noset_visible_devices() else "0"
-        
-        # If this is rank 0 and no master_addr/master_port provided, allocate them
-        if rank == 0 and (master_addr is None or master_port is None):
-            self._allocate_master_address()
-
-        os.environ["MASTER_ADDR"] = self.master_addr
-        os.environ["MASTER_PORT"] = str(self.master_port)
-
-        self.model = None
-        self.model_update_group = None
-
-        self.pretrain_model_name = None
-        self.ref_path = None
-        self._dataloader = None
-        self._tokenizer = None
-        self.ppo_callback = None
-        self.ppo_trainer: Trainer = None
-
-    def build_dataloader(self):
-        max_seq_len = 32
-        prompt_len = 10
-
-        dataset = VerifiablePromptDataset(prompt_len=prompt_len)
-        dataloader = DataLoader(
-            dataset,
-            collate_fn=partial(
-                prompt_dataset_collate_fn,
-                self.tokenizer,
-                max_seq_len,
-            ),
-            sampler=composer_dist.get_sampler(dataset),
-            batch_size=4,
-        )
-        # We need to mock this method, since our dataset isn't a StreamingDataset
-        dataloader.state_dict = lambda: {}
-        dataloader.load_state_dict = lambda x: None
-        return dataloader
-
-    @property
-    def dataloader(self):
-        if self._dataloader is None:
-            self._dataloader = self.build_dataloader()
-        return self._dataloader
-
-    def build_tokenizer(self):
-        # tokenizer = assets_tokenizer_helper(self.pretrain_model_name)
-        tokenizer = AutoTokenizer.from_pretrained(self.pretrain_model_name)
-        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-        return tokenizer
-
-    @property
-    def tokenizer(self):
-        if self._tokenizer is None:
-            self._tokenizer = self.build_tokenizer()
-        return self._tokenizer
-
-    @property
-    def model_config(self):
-        return {
-            'tokenizer': self.tokenizer,
-            'pretrained_model_name_or_path': self.pretrain_model_name,
-            'pretrained': True,
-            'use_flash_attention_2': True,
-            'allow_embedding_resizing': True,
-        }
-
-    @property
-    def fsdp_config(self):
-        return dict()
-
-    def build_ref_model(self, pretrain_model_name: str):
-        tmp_ref_path = str('./ref_checkpoints')
-        ref_path = os.path.join(tmp_ref_path, 'latest-rank0.pt')
-        if os.path.exists(ref_path):
-            self.ref_path = ref_path
-            return
-
-        self.pretrain_model_name = pretrain_model_name
-        composer_dist.initialize_dist('gpu')
-
-        tmp_model = ComposerHFCausalLM(**self.model_config, use_auth_token=True)
-
-        tmp_optimizer = DecoupledAdamW(tmp_model.parameters(), lr=1e-6)
-
-        temp_dataloader = [{
-            'input_ids': torch.ones((2, 15)).to(dtype=torch.int64),
-            'attention_mask': torch.ones((2, 15)),
-            'labels': torch.ones((2, 15)).to(dtype=torch.int64),
-        }]
-
-        temp_trainer = Trainer(
-            model=tmp_model,
-            train_dataloader=temp_dataloader,
-            optimizers=tmp_optimizer,
-            max_duration='1ba',
-            parallelism_config={'fsdp': self.fsdp_config},
-            save_folder=tmp_ref_path,
-            save_weights_only=True,
-            device_train_microbatch_size=2,
-        )
-
-        temp_trainer.fit()
-
-        # After making the reference model, we can proceed with the PPO training
-        self.ref_path = ref_path
-
-    def build_ppo_trainer(self, pretrain_model_name: str):
-        self.pretrain_model_name = pretrain_model_name
-        composer_dist.initialize_dist('gpu')
-        max_seq_len = 32
-        precision = 'amp_bf16'
-
-        model = ComposerHFPolicyLM(**self.model_config, use_auth_token=True)
-
-        optimizer = DecoupledAdamW(model.parameters(), lr=1e-8)
-
-        num_batches_per_update = 2
-
-        # ref_model_config = copy.deepcopy(self.model_config)
-        ref_model_config = {**self.model_config, 'name': 'hf_causal_lm'}
-
-        variables = {
-            'buffer': {
-                'name': 'MinibatchRolloutBuffer',
-                'max_buffer_size': num_batches_per_update,
-            },
-            'max_gen_len': 8,
-            'gamma': 0.99,
-            'lambda_gae': 0.95,
-            'generation_kwargs': {
-                'use_cache': True,
-                'do_sample': False,
-            },
-            'kl_controller': {
-                'init_kl_coef': 0.2,
-                'target': 0.01,
-                'horizon': 12800,
-                'kl_ctl_type': 'adaptive',
-            },
-            'reference_model': {
-                'model_config': ref_model_config,
-                'precision': precision,
-                'load_path': self.ref_path,
-                'non_train_fsdp_config': self.fsdp_config,
-            },
-            'device_generate_batch_size': 2,
-            'epoch_per_iteration': 1,
-            'num_batches_per_update': num_batches_per_update,
-            'rewards': {
-                'output_length': {
-                    'reward_type': 'output_length',
-                    'max_gen_len': 10,
-                },
-            },
-        }
-        train_config = {
-            'model': {**self.model_config, 'kl_estimator': 'k1', 'kl_clip_range': 40.0},
-            'fsdp_config': self.fsdp_config,
-            'seed': 17,
-            'precision': precision,
-            'variables': variables,
-            'max_seq_len': max_seq_len,
-            'global_train_batch_size': 2,
-            'device_train_batch_size': 2,
-            'device_train_microbatch_size': 1,
-        }
-
-        # tmp_save_path = str('./checkpoints')
-        self.ppo_callback = SingleControllerOnPolicyCallback(train_config=train_config)
-        self.ppo_trainer = Trainer(
-            model=model,
-            optimizers=optimizer,
-            callbacks=self.ppo_callback,
-            train_dataloader=self.dataloader,
-            precision=precision,
-            parallelism_config={'fsdp': self.fsdp_config},
-            max_duration='3iter',
-            device_train_microbatch_size=1,
-            load_path=self.ref_path,
-            # save_folder=tmp_save_path,
-            # save_interval='1iter',
-        )
-
-        # trainer.fit(duration='1iter')
-
-        # This is the KL assert that must be true if we are truly loading from the same model.
-        # This is only true on the first iteration
-        # assert torch.allclose(
-        #     trainer.state.loss['kl/ift_kl'], # pyright: ignore
-        #     torch.tensor(0.0),
-        #     atol=5e-5,
-        # )
-
-    def train_1_iter(self):
-        self.ppo_trainer.fit(duration='1iter')
-        # This is the KL assert that must be true if we are truly loading from the same model.
-        # This is only true on the first iteration
-        assert torch.allclose(
-            self.ppo_trainer.state.loss['kl/ift_kl'], # pyright: ignore
-            torch.tensor(0.0),
-            atol=5e-5,
-        )
-
-    def sync_weight_and_gen(self, vllm_engines: list[Any]):
-        self.ppo_callback.round_trip_to_inference_engines(
-            device=self.ppo_trainer.state.device,
-            vllm_engines=vllm_engines,
-            model_update_group=self.model_update_group,
-        )
-
-
-    def get_node_ip(self):
-        return ray.util.get_node_ip_address().strip('[]')
-    
-    def get_free_port(self):
-        with socket.socket() as sock:
-            sock.bind(("", 0))
-            return sock.getsockname()[1]
-    
-    def _allocate_master_address(self):
-        """Allocate master address and port for rank 0."""
-        if self.master_addr is None:
-            # Get the local IP address
-            self.master_addr = self.get_node_ip()
-
-        if self.master_port is None:
-            # Allocate a free port
-            self.master_port = self.get_free_port()
-    
-    def get_master_address(self) -> Tuple[Optional[str], Optional[int]]:
-        """Return the master address and port as a tuple."""
-        return (self.master_addr, self.master_port)
-    
-    def init_default_process_group(self):
-        """Initialize the distributed process group."""         
-        # Initialize process group
-        dist.init_process_group(timeout=timedelta(seconds=30))
-        print(f'is distributed initialized: {dist.is_initialized()}')
-        # Print debug information
-        num_visible_devices = torch.cuda.device_count()
-        print(f'num_visible_devices: {num_visible_devices}')
-        print('Ray actor init envs:')
-        print(f'rank: {dist.get_rank()}')
-        print(f'node_rank: {dist.get_rank() // 8}')
-        print(f'world_size: {dist.get_world_size()}')
-        print(f'local_rank: {dist.get_rank() % 8}')
-        print(f'master_addr: {self.master_addr}')
-        print(f'master_port: {self.master_port}')
-    
-    def init_model(self, model_name: str):
-        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype='auto')
-        self.model.to('cuda')
-
-    def sync_weights(self, vllm_engines: list[Any]):
-        for name, p in self.model.named_parameters():
-            refs = [engine.update_weight.remote(name, p.dtype, p.shape, empty_cache=False) for engine in vllm_engines]
-            dist.broadcast(p, src=0, group=self.model_update_group)
-            ray.get(refs)
-
-    def tensor_all_reduce(self) -> float:
-        """Perform a simple tensor all_reduce operation."""
-        # Create a tensor on the GPU and perform all_reduce
-        device = torch.device("cuda")
-        x = torch.ones(1, device=device)
-        dist.all_reduce(x)
-        
-        return x.item()
-
-    def init_vllm_process_group(self, backend: str, master_addr: str, master_port: int, world_size: int, rank: int, group_name: str):
-        """Initialize the vLLM process group."""
-        self.model_update_group = init_process_group(backend=backend, init_method=f'tcp://{master_addr}:{master_port}', world_size=world_size, rank=rank, group_name=group_name)
-        return dist.get_world_size(self.model_update_group)
-
-@contextmanager
-def start_ray_server():
-    dist.init_process_group(backend='gloo')
-    address = init_ray()
-    try:
-        yield address
-        dist.barrier()
-    finally:
-        if dist.get_rank() == 0:
-            ray.shutdown()
-            subprocess.run(['ray', 'stop'], check=True)
-        dist.barrier()
-        dist.destroy_process_group()
-
-
-def run(tp_size: int = 8):
-    prompts = [
-        "what is RAY?",
-        "what is vLLM?",
-    ]
-    # pretrain_model_name = 'gpt2'
-    pretrain_model_name = 'meta-llama/Llama-3.2-1B-Instruct'
-    with start_ray_server() as address:
-        if dist.get_rank() == 0:
-            master_addr, _ = address.split(':')
-            
-            print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
-            num_train_actors = dist.get_world_size() // 2
-            # Create actors - rank 0 will allocate master address/port
-            train_actors = []
-
-            # master actor will allocate master_addr and master_port
-            master_actor = DistributedGPUActor.remote(0, num_train_actors)
-            train_actors.append(master_actor)
-            
-            # Get master address from rank 0 actor
-            master_info = ray.get(master_actor.get_master_address.remote())
-            master_addr, master_port = master_info
-            print(f"Master address allocated: {master_addr}:{master_port}")
-            
-            # Create remaining actors with the master address/port
-            for i in range(1, num_train_actors):
-                actor = DistributedGPUActor.remote(i, num_train_actors, master_addr, master_port)
-                train_actors.append(actor)
-            
-            # # Initialize process groups for all actors
-            # init_tasks = [actor.init_default_process_group.remote() for actor in train_actors]
-            # ray.get(init_tasks)
-            
-            # # Perform tensor all_reduce on all actors
-            # reduce_tasks = [actor.tensor_all_reduce.remote() for actor in train_actors]
-            # results = ray.get(reduce_tasks)
-            # print(f"All-reduce results: {results}")
-
-
-
-            build_ref_model_tasks = [actor.build_ref_model.remote(pretrain_model_name) for actor in train_actors]
-            ray.get(build_ref_model_tasks)
-            print('build ref model done')
-
-            build_ppo_trainer_tasks = [actor.build_ppo_trainer.remote(pretrain_model_name) for actor in train_actors]
-            ray.get(build_ppo_trainer_tasks)
-            print('build ppo trainer done')
-
-            vllm_tensor_parallel_size = min(tp_size, dist.get_world_size() - num_train_actors)
-            num_vllm_engines = dist.get_world_size() // 2 // vllm_tensor_parallel_size
-            print(f'num_vllm_engines: {num_vllm_engines}')
-            vllm_engines = create_vllm_engines(
-                num_engines=num_vllm_engines,
-                tensor_parallel_size=vllm_tensor_parallel_size,
-                enforce_eager=True,
-                pretrain=pretrain_model_name,
-                revision=None,
-                seed=1,
-                enable_prefix_caching=False,
-                max_model_len=512,
-            )
-
-            new_port = ray.get(master_actor.get_free_port.remote())
-            print(f'new_port: {new_port}')
-            refs = [
-                engine.init_process_group.remote(
-                    master_addr,
-                    new_port,
-                    i * vllm_tensor_parallel_size + 1,
-                    dist.get_world_size() // 2 + 1,
-                    'weight-update',
-                    backend='nccl',
-                ) for i, engine in enumerate(vllm_engines)
-            ]
-            refs.append(master_actor.init_vllm_process_group.remote(
-                backend='nccl',
-                master_addr=master_addr,
-                master_port=new_port,
-                world_size=dist.get_world_size() // 2 + 1,
-                rank=0,
-                group_name='weight-update',
-            ))
-            # should only get refs of both master and vllm_engines together, otherwise it will hang
-            print(ray.get(refs))
-
-            refs = [actor.sync_weight_and_gen.remote(vllm_engines) for actor in train_actors]
-            ray.get(refs)
-            print('sync weight and gen done')
-
-            refs = [actor.train_1_iter.remote() for actor in train_actors]
-            ray.get(refs)
-            print('train 1 iter done')
-
-
-
-            # refs = [actor.init_model.remote(pretrain_model_name) for actor in train_actors]
-            # ray.get(refs)
-            # print('init model done')
-
-            # ray.get(master_actor.sync_weights.remote(vllm_engines))
-            # print('sync weights done')
-
-            ref = vllm_engines[0].generate.remote(prompts)
-            gen_results = ray.get(ref)
-            for output in gen_results:
-                prompt = output.prompt
-                generated_text = output.outputs[0].text
-                print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--tp_size', type=int, default=8)
-    args = parser.parse_args()
-    run(tp_size=args.tp_size)
diff --git a/vllm_test/test_vllm.py b/vllm_test/test_vllm.py
deleted file mode 100644
index 9d40e87a..00000000
--- a/vllm_test/test_vllm.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import torch
-from vllm import LLM, SamplingParams
-from transformers import AutoModelForCausalLM
-
-def load_weights(self, weights: list[tuple[str, torch.Tensor]]):
-    self.model_runner.model.load_weights( # type: ignore
-        weights=weights,
-    )
-    
-if __name__ == '__main__':
-    prompts = [
-        "what is RAY?",
-        "what is vLLM?",
-    ]
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-    model_name = "facebook/opt-125m"
-    print(f'loading model {model_name}...')
-    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype='auto')
-    print('load model done')
-    llm = LLM(model=model_name)
-    for name, p in model.named_parameters():
-        llm.collective_rpc(
-            load_weights,
-            args=([(name, p)],),
-        )
-    print('load weights done')
-    outputs = llm.generate(prompts, sampling_params)
-
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

From be08a75dd4d921e379b79f2557693ea979615f49 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Thu, 24 Jul 2025 07:19:44 +0000
Subject: [PATCH 076/107] docs

---
 .../online/single_controller_callback.py      |  2 +-
 test_single_controller_ppo.py                 | 24 ++++++++++++++-----
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index e0e4967f..cd1d8921 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -47,7 +47,7 @@ class SingleControllerOnPolicyCallback(OnPolicyCallback):
             callback is registered under callbacks_with_config registry.
     """
 
-    def round_trip_to_inference_engines(self, device: Any, vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
+    def update_and_query_inference_engines(self, device: Any, vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
         """Round trip to inference engines.
         
         Args:
diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
index 835e0881..f6d9d720 100644
--- a/test_single_controller_ppo.py
+++ b/test_single_controller_ppo.py
@@ -40,7 +40,7 @@
 
 @ray.remote(num_gpus=1)
 class DistributedGPUActor(BaseDistributedGPUActor):
-    """Distributed GPU actor for testing."""
+    """Distributed GPU actor for testing. Moved part of controller logic from PPO Callback to here."""
 
     def __init__(self,
         rank: int,
@@ -61,6 +61,8 @@ def __init__(self,
         self.num_batches_per_update = 2
 
     def build_dataloader(self):
+        # dataloader should be built with inference agent instead with this trainer actor,
+        # it is still attached to trainer actor here to avoid a full refactor to PPO Callback code
         max_seq_len = 32
         prompt_len = 10
 
@@ -87,7 +89,6 @@ def dataloader(self):
         return self._dataloader
 
     def build_tokenizer(self):
-        # tokenizer = assets_tokenizer_helper(self.pretrain_model_name)
         tokenizer = AutoTokenizer.from_pretrained(self.pretrain_model_name)
         tokenizer.add_special_tokens({'pad_token': '[PAD]'})
         return tokenizer
@@ -113,6 +114,9 @@ def fsdp_config(self):
         return dict()
 
     def build_ref_model(self, pretrain_model_name: str):
+        # train a reference model for the PPO training
+        # The key observation here is that we should construct our high level model training logic in the actor instead of the callback
+        # e.g., we can build ref/reward/policy/value model and create/colocate multiple trainers all in this class 
         tmp_ref_path = str('./ref_checkpoints')
         ref_path = os.path.join(tmp_ref_path, 'latest-rank0.pt')
         if os.path.exists(ref_path):
@@ -207,6 +211,8 @@ def build_ppo_trainer(self, pretrain_model_name: str):
             'device_train_microbatch_size': self.device_train_batch_size,
         }
 
+        # ideally we should pull the rest of the training logic from the callback to this class as well,
+        # e.g, how to interact with env, calculate rewards etc
         self.ppo_callback = SingleControllerOnPolicyCallback(train_config=train_config)
         self.ppo_trainer = Trainer(
             model=model,
@@ -221,6 +227,8 @@ def build_ppo_trainer(self, pretrain_model_name: str):
         )
 
     def train_1_iter(self):
+        # we should implement the top level PPO algo here instead of the callback
+        # algorithmic researchers are expected to implement this function along with above policy/value/reward/ref trainers or models
         self.ppo_trainer.fit(duration='1iter')
         # This is the KL assert that must be true if we are truly loading from the same model.
         # This is only true on the first iteration
@@ -230,8 +238,8 @@ def train_1_iter(self):
             atol=5e-5,
         )
 
-    def sync_weight_and_gen(self, vllm_engines: list[Any]):
-        self.ppo_callback.round_trip_to_inference_engines(
+    def update_and_query_inference_engines(self, vllm_engines: list[Any]):
+        self.ppo_callback.update_and_query_inference_engines(
             device=self.ppo_trainer.state.device,
             vllm_engines=vllm_engines,
             model_update_group=self.model_update_group,
@@ -239,16 +247,18 @@ def sync_weight_and_gen(self, vllm_engines: list[Any]):
 
 
 def run():
+    # This is an example of how to move the controller logic from PPO Callback to a separate trainer actor above and this main single controller function,
     prompts = [
         "what is RAY?",
         "what is vLLM?",
     ]
-    # pretrain_model_name = 'gpt2'
     pretrain_model_name = 'meta-llama/Llama-3.2-1B-Instruct'
     with start_ray_server() as address:
         if dist.get_rank() == 0:
+            # only rank 0 is the master controller
             master_addr, _ = address.split(':')
             
+            # Init all actors (training, inference, env, etc) of the system
             print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
             num_train_actors = dist.get_world_size() // 2
             # Create actors - rank 0 will allocate master address/port
@@ -299,6 +309,7 @@ def run():
                 },
             )
 
+            # init additional process groups for vllm_engines and master actor
             new_port = ray.get(master_actor.get_free_port.remote())
             print(f'new_port: {new_port}')
             refs = [
@@ -322,7 +333,8 @@ def run():
             # should only get refs of both master and vllm_engines together, otherwise it will hang
             print(ray.get(refs))
 
-            refs = [actor.sync_weight_and_gen.remote(vllm_engines) for actor in train_actors]
+            # core controller logic, should be implemented according to the algorithm (ppo, multi-turn, etc)
+            refs = [actor.update_and_query_inference_engines.remote(vllm_engines) for actor in train_actors]
             ray.get(refs)
             print('sync weight and gen done')
 

From 7dac06044d2641489fa0740daef908dfd3657edf Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Thu, 24 Jul 2025 07:22:33 +0000
Subject: [PATCH 077/107] doc

---
 compose_rl/algorithms/online/single_controller_callback.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index cd1d8921..85cf8dba 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -42,9 +42,7 @@
 class SingleControllerOnPolicyCallback(OnPolicyCallback):
     """Callback for managing on-policy training in an RLHF loop.
 
-    Args:
-        train_config (dict): Training config passed to callback via foundry train.py as
-            callback is registered under callbacks_with_config registry.
+    Ideally all the overwritten methods below should be implemented in the trainer actor instead of the callback, we kept them here for now to minimize a drastic refactor to PPO Callback code
     """
 
     def update_and_query_inference_engines(self, device: Any, vllm_engines: list[Any], model_update_group: dist.ProcessGroup):

From d4a76e1ce9c5fd718d0686aee6f6a57383302c16 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 25 Jul 2025 06:07:54 +0000
Subject: [PATCH 078/107] actor group

---
 test_single_controller_ppo.py | 157 ++++++++++++++++++++++------------
 1 file changed, 102 insertions(+), 55 deletions(-)

diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
index f6d9d720..6e1ed884 100644
--- a/test_single_controller_ppo.py
+++ b/test_single_controller_ppo.py
@@ -6,7 +6,6 @@
 from functools import partial
 from typing import Any, Optional
 
-import pytest
 import ray
 import torch
 import torch.distributed as dist
@@ -31,7 +30,6 @@
 from tests.common import (
     BaseDistributedGPUActor,
     VerifiablePromptDataset,
-    world_size,
 )
 
 # Set up logging
@@ -245,6 +243,99 @@ def update_and_query_inference_engines(self, vllm_engines: list[Any]):
             model_update_group=self.model_update_group,
         )
 
+def setup_process_groups(master_actor: Any, vllm_engines: list[Any], master_addr: str, num_train_actors: int):
+    """Initialize process groups for vLLM engines and master actor."""
+    # Get a new port for the weight-update process group
+    new_port = ray.get(master_actor.get_free_port.remote())
+    print(f'new_port: {new_port}')
+    
+    world_size = dist.get_world_size()
+    vllm_tensor_parallel_size = world_size - num_train_actors
+    
+    # Initialize process groups for vLLM engines
+    refs = [
+        engine.init_process_group.remote(
+            master_addr,
+            new_port,
+            i * vllm_tensor_parallel_size + 1,
+            world_size // 2 + 1,
+            'weight-update',
+            backend='nccl',
+        ) for i, engine in enumerate(vllm_engines)
+    ]
+    
+    # Add master actor to the process group
+    refs.append(master_actor.add_process_group.remote(
+        backend='nccl',
+        master_addr=master_addr,
+        master_port=new_port,
+        world_size=world_size // 2 + 1,
+        rank=0,
+        group_name='weight-update',
+    ))
+    
+    # Wait for all process groups to be initialized
+    print(ray.get(refs))
+
+
+class DistributedActorGroup:
+    def __init__(self, num_train_actors: int, master_addr: str, master_port: int):
+        self.num_train_actors = num_train_actors
+        self.master_addr = master_addr
+        self.master_port = master_port
+        self._master_actor = None
+        self._train_actors = []
+
+    def create_actors(self, pretrain_model_name: str):
+        """Create and initialize all training actors."""
+        print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
+        
+        # Create master actor first
+        self._master_actor = DistributedGPUActor.remote(0, self.num_train_actors)
+        self._train_actors.append(self._master_actor)
+        
+        # Get master address from rank 0 actor
+        master_info = ray.get(self._master_actor.get_master_address.remote())
+        self.master_addr, self.master_port = master_info
+        print(f"Master address allocated: {self.master_addr}:{self.master_port}")
+        
+        # Create remaining actors with the master address/port
+        for i in range(1, self.num_train_actors):
+            actor = DistributedGPUActor.remote(i, self.num_train_actors, self.master_addr, self.master_port)
+            self._train_actors.append(actor)
+
+    def build_models(self, pretrain_model_name: str):
+        """Build reference models and PPO trainers for all actors."""
+        # Build reference models
+        build_ref_model_tasks = [actor.build_ref_model.remote(pretrain_model_name) for actor in self._train_actors]
+        ray.get(build_ref_model_tasks)
+        print('build ref model done')
+
+        # Build PPO trainers
+        build_ppo_trainer_tasks = [actor.build_ppo_trainer.remote(pretrain_model_name) for actor in self._train_actors]
+        ray.get(build_ppo_trainer_tasks)
+        print('build ppo trainer done')
+
+    def sync_weights_and_generate(self, vllm_engines: list[Any]):
+        """Sync weights and generate with inference engines."""
+        refs = [actor.update_and_query_inference_engines.remote(vllm_engines) for actor in self._train_actors]
+        ray.get(refs)
+        print('sync weight and gen done')
+
+    def train_iteration(self):
+        """Run one training iteration on all actors."""
+        refs = [actor.train_1_iter.remote() for actor in self._train_actors]
+        ray.get(refs)
+        print('train 1 iter done')
+
+    @property
+    def train_actors(self):
+        return self._train_actors
+
+    @property
+    def master_actor(self):
+        return self._master_actor
+
 
 def run():
     # This is an example of how to move the controller logic from PPO Callback to a separate trainer actor above and this main single controller function,
@@ -259,40 +350,21 @@ def run():
             master_addr, _ = address.split(':')
             
             # Init all actors (training, inference, env, etc) of the system
-            print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
             num_train_actors = dist.get_world_size() // 2
-            # Create actors - rank 0 will allocate master address/port
-            train_actors = []
-
-            # master actor will allocate master_addr and master_port
-            master_actor = DistributedGPUActor.remote(0, num_train_actors)
-            train_actors.append(master_actor)
-            
-            # Get master address from rank 0 actor
-            master_info = ray.get(master_actor.get_master_address.remote())
-            master_addr, master_port = master_info
-            print(f"Master address allocated: {master_addr}:{master_port}")
-            
-            # Create remaining actors with the master address/port
-            for i in range(1, num_train_actors):
-                actor = DistributedGPUActor.remote(i, num_train_actors, master_addr, master_port)
-                train_actors.append(actor)
+            actor_group = DistributedActorGroup(num_train_actors, master_addr, 0)  # master_port will be updated by create_actors
+            actor_group.create_actors(pretrain_model_name)
 
             # composer will initialize the process group for each actor, so no need to initialize them explicitly
-            build_ref_model_tasks = [actor.build_ref_model.remote(pretrain_model_name) for actor in train_actors]
-            ray.get(build_ref_model_tasks)
-            print('build ref model done')
-
-            build_ppo_trainer_tasks = [actor.build_ppo_trainer.remote(pretrain_model_name) for actor in train_actors]
-            ray.get(build_ppo_trainer_tasks)
-            print('build ppo trainer done')
+            actor_group.build_models(pretrain_model_name)
 
+            # Create vLLM engines
             world_size = dist.get_world_size()
             vllm_tensor_parallel_size = world_size - num_train_actors
             num_vllm_engines = (
                 world_size - num_train_actors
             ) // vllm_tensor_parallel_size
             print(f'num_vllm_engines: {num_vllm_engines}')
+            
             vllm_engines = create_vllm_engines(
                 num_engines=num_vllm_engines,
                 tensor_parallel_size=vllm_tensor_parallel_size,
@@ -309,39 +381,14 @@ def run():
                 },
             )
 
-            # init additional process groups for vllm_engines and master actor
-            new_port = ray.get(master_actor.get_free_port.remote())
-            print(f'new_port: {new_port}')
-            refs = [
-                engine.init_process_group.remote(
-                    master_addr,
-                    new_port,
-                    i * vllm_tensor_parallel_size + 1,
-                    dist.get_world_size() // 2 + 1,
-                    'weight-update',
-                    backend='nccl',
-                ) for i, engine in enumerate(vllm_engines)
-            ]
-            refs.append(master_actor.add_process_group.remote(
-                backend='nccl',
-                master_addr=master_addr,
-                master_port=new_port,
-                world_size=dist.get_world_size() // 2 + 1,
-                rank=0,
-                group_name='weight-update',
-            ))
-            # should only get refs of both master and vllm_engines together, otherwise it will hang
-            print(ray.get(refs))
+            setup_process_groups(actor_group.master_actor, vllm_engines, actor_group.master_addr, num_train_actors)
 
             # core controller logic, should be implemented according to the algorithm (ppo, multi-turn, etc)
-            refs = [actor.update_and_query_inference_engines.remote(vllm_engines) for actor in train_actors]
-            ray.get(refs)
-            print('sync weight and gen done')
+            actor_group.sync_weights_and_generate(vllm_engines)
 
-            refs = [actor.train_1_iter.remote() for actor in train_actors]
-            ray.get(refs)
-            print('train 1 iter done')
+            actor_group.train_iteration()
 
+            # Generate text using the first vLLM engine
             ref = vllm_engines[0].generate.remote(prompts)
             gen_results = ray.get(ref)
             for output in gen_results:

From 24d250854782adbca094f6f767bcc8e32d5277db Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 25 Jul 2025 06:27:00 +0000
Subject: [PATCH 079/107] no more explicit master addr

---
 test_single_controller_ppo.py | 62 +++++++++++++++++------------------
 1 file changed, 30 insertions(+), 32 deletions(-)

diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
index 6e1ed884..ab461b1f 100644
--- a/test_single_controller_ppo.py
+++ b/test_single_controller_ppo.py
@@ -111,6 +111,9 @@ def model_config(self):
     def fsdp_config(self):
         return dict()
 
+    def init_composer_dist(self):
+        composer_dist.initialize_dist('gpu')
+
     def build_ref_model(self, pretrain_model_name: str):
         # train a reference model for the PPO training
         # The key observation here is that we should construct our high level model training logic in the actor instead of the callback
@@ -122,8 +125,6 @@ def build_ref_model(self, pretrain_model_name: str):
             return
 
         self.pretrain_model_name = pretrain_model_name
-        composer_dist.initialize_dist('gpu')
-
         tmp_model = ComposerHFCausalLM(**self.model_config, use_auth_token=True)
 
         tmp_optimizer = DecoupledAdamW(tmp_model.parameters(), lr=1e-6)
@@ -243,14 +244,14 @@ def update_and_query_inference_engines(self, vllm_engines: list[Any]):
             model_update_group=self.model_update_group,
         )
 
-def setup_process_groups(master_actor: Any, vllm_engines: list[Any], master_addr: str, num_train_actors: int):
+def setup_process_groups(master_actor: Any, vllm_engines: list[Any], vllm_tensor_parallel_size: int):
     """Initialize process groups for vLLM engines and master actor."""
     # Get a new port for the weight-update process group
+    master_addr, _ = ray.get(master_actor.get_master_address.remote())
     new_port = ray.get(master_actor.get_free_port.remote())
     print(f'new_port: {new_port}')
     
     world_size = dist.get_world_size()
-    vllm_tensor_parallel_size = world_size - num_train_actors
     
     # Initialize process groups for vLLM engines
     refs = [
@@ -278,15 +279,11 @@ def setup_process_groups(master_actor: Any, vllm_engines: list[Any], master_addr
     print(ray.get(refs))
 
 
-class DistributedActorGroup:
-    def __init__(self, num_train_actors: int, master_addr: str, master_port: int):
+class SPMDActorGroup:
+    def __init__(self, num_train_actors: int):
         self.num_train_actors = num_train_actors
-        self.master_addr = master_addr
-        self.master_port = master_port
-        self._master_actor = None
-        self._train_actors = []
 
-    def create_actors(self, pretrain_model_name: str):
+        self._train_actors = []
         """Create and initialize all training actors."""
         print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
         
@@ -295,17 +292,30 @@ def create_actors(self, pretrain_model_name: str):
         self._train_actors.append(self._master_actor)
         
         # Get master address from rank 0 actor
-        master_info = ray.get(self._master_actor.get_master_address.remote())
-        self.master_addr, self.master_port = master_info
-        print(f"Master address allocated: {self.master_addr}:{self.master_port}")
+        master_addr, master_port = ray.get(self._master_actor.get_master_address.remote())
+        print(f"Master address allocated: {master_addr}:{master_port}")
         
         # Create remaining actors with the master address/port
         for i in range(1, self.num_train_actors):
-            actor = DistributedGPUActor.remote(i, self.num_train_actors, self.master_addr, self.master_port)
+            actor = DistributedGPUActor.remote(i, self.num_train_actors, master_addr, master_port)
             self._train_actors.append(actor)
 
+    @property
+    def train_actors(self):
+        return self._train_actors
+
+    @property
+    def master_actor(self):
+        return self._master_actor
+
+class TrainActorGroup(SPMDActorGroup):
+
     def build_models(self, pretrain_model_name: str):
         """Build reference models and PPO trainers for all actors."""
+        init_task = [actor.init_composer_dist.remote() for actor in self._train_actors]
+        ray.get(init_task)
+        print('init composer dist done')
+
         # Build reference models
         build_ref_model_tasks = [actor.build_ref_model.remote(pretrain_model_name) for actor in self._train_actors]
         ray.get(build_ref_model_tasks)
@@ -328,14 +338,6 @@ def train_iteration(self):
         ray.get(refs)
         print('train 1 iter done')
 
-    @property
-    def train_actors(self):
-        return self._train_actors
-
-    @property
-    def master_actor(self):
-        return self._master_actor
-
 
 def run():
     # This is an example of how to move the controller logic from PPO Callback to a separate trainer actor above and this main single controller function,
@@ -344,20 +346,16 @@ def run():
         "what is vLLM?",
     ]
     pretrain_model_name = 'meta-llama/Llama-3.2-1B-Instruct'
-    with start_ray_server() as address:
+    with start_ray_server():
         if dist.get_rank() == 0:
             # only rank 0 is the master controller
-            master_addr, _ = address.split(':')
             
-            # Init all actors (training, inference, env, etc) of the system
+            # create SPMD training actors of the system
             num_train_actors = dist.get_world_size() // 2
-            actor_group = DistributedActorGroup(num_train_actors, master_addr, 0)  # master_port will be updated by create_actors
-            actor_group.create_actors(pretrain_model_name)
-
-            # composer will initialize the process group for each actor, so no need to initialize them explicitly
+            actor_group = TrainActorGroup(num_train_actors)
             actor_group.build_models(pretrain_model_name)
 
-            # Create vLLM engines
+            # Create vLLM engines (or inference actors)
             world_size = dist.get_world_size()
             vllm_tensor_parallel_size = world_size - num_train_actors
             num_vllm_engines = (
@@ -381,7 +379,7 @@ def run():
                 },
             )
 
-            setup_process_groups(actor_group.master_actor, vllm_engines, actor_group.master_addr, num_train_actors)
+            setup_process_groups(actor_group.master_actor, vllm_engines, vllm_tensor_parallel_size)
 
             # core controller logic, should be implemented according to the algorithm (ppo, multi-turn, etc)
             actor_group.sync_weights_and_generate(vllm_engines)

From e37d2ec10041346752442a05e9def1cede6f295c Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 25 Jul 2025 06:49:04 +0000
Subject: [PATCH 080/107] all class ready

---
 test_single_controller_ppo.py | 81 +++++++++++++++++++++--------------
 1 file changed, 50 insertions(+), 31 deletions(-)

diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
index ab461b1f..05a234c7 100644
--- a/test_single_controller_ppo.py
+++ b/test_single_controller_ppo.py
@@ -339,6 +339,51 @@ def train_iteration(self):
         print('train 1 iter done')
 
 
+class InferenceAgent:
+
+    def __init__(self, num_vllm_engines: int, vllm_tensor_parallel_size: int, pretrain_model_name: str):
+        self.num_vllm_engines = num_vllm_engines
+        self.vllm_tensor_parallel_size = vllm_tensor_parallel_size
+        self.vllm_engines = create_vllm_engines(
+            num_engines=num_vllm_engines,
+            tensor_parallel_size=vllm_tensor_parallel_size,
+            enforce_eager=True,
+            pretrain=pretrain_model_name,
+            revision=None,
+            seed=1,
+            enable_prefix_caching=False,
+            max_model_len=512,
+            device_bundle={
+                'GPU': 1,
+                'CPU': 1,
+                'worker_node': 0,
+            },
+        )
+
+    def generate(self, prompts: list[str]):
+        ref = self.vllm_engines[0].generate.remote(prompts)
+        gen_results = ray.get(ref)
+        for output in gen_results:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+class PPOController:
+
+    def __init__(self, train_actor: TrainActorGroup, inference_client: InferenceAgent, pretrain_model_name: str):
+        self.train_actor = train_actor
+        self.inference_client = inference_client
+
+        self.train_actor.build_models(pretrain_model_name)
+        setup_process_groups(self.train_actor.master_actor, self.inference_client.vllm_engines, self.inference_client.vllm_tensor_parallel_size)
+
+    
+    def train(self):
+        self.train_actor.sync_weights_and_generate(self.inference_client.vllm_engines)
+        self.train_actor.train_iteration()
+
+
 def run():
     # This is an example of how to move the controller logic from PPO Callback to a separate trainer actor above and this main single controller function,
     prompts = [
@@ -352,8 +397,7 @@ def run():
             
             # create SPMD training actors of the system
             num_train_actors = dist.get_world_size() // 2
-            actor_group = TrainActorGroup(num_train_actors)
-            actor_group.build_models(pretrain_model_name)
+            train_actor = TrainActorGroup(num_train_actors)
 
             # Create vLLM engines (or inference actors)
             world_size = dist.get_world_size()
@@ -361,38 +405,13 @@ def run():
             num_vllm_engines = (
                 world_size - num_train_actors
             ) // vllm_tensor_parallel_size
-            print(f'num_vllm_engines: {num_vllm_engines}')
-            
-            vllm_engines = create_vllm_engines(
-                num_engines=num_vllm_engines,
-                tensor_parallel_size=vllm_tensor_parallel_size,
-                enforce_eager=True,
-                pretrain=pretrain_model_name,
-                revision=None,
-                seed=1,
-                enable_prefix_caching=False,
-                max_model_len=512,
-                device_bundle={
-                    'GPU': 1,
-                    'CPU': 1,
-                    'worker_node': 0,
-                },
-            )
-
-            setup_process_groups(actor_group.master_actor, vllm_engines, vllm_tensor_parallel_size)
 
-            # core controller logic, should be implemented according to the algorithm (ppo, multi-turn, etc)
-            actor_group.sync_weights_and_generate(vllm_engines)
+            inference_client = InferenceAgent(num_vllm_engines, vllm_tensor_parallel_size, pretrain_model_name)
 
-            actor_group.train_iteration()
+            ppo_controller = PPOController(train_actor, inference_client, pretrain_model_name)
+            ppo_controller.train()
 
-            # Generate text using the first vLLM engine
-            ref = vllm_engines[0].generate.remote(prompts)
-            gen_results = ray.get(ref)
-            for output in gen_results:
-                prompt = output.prompt
-                generated_text = output.outputs[0].text
-                print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+            inference_client.generate(prompts)
 
 if __name__ == '__main__':
     run()

From 7feb36c0b2d6d96bfc2e7fa9ecfbf2809d82f571 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 25 Jul 2025 06:52:38 +0000
Subject: [PATCH 081/107] run cmd

---
 test_single_controller_ppo.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
index 05a234c7..8fd6bb2f 100644
--- a/test_single_controller_ppo.py
+++ b/test_single_controller_ppo.py
@@ -1,6 +1,8 @@
 # Copyright 2024 MosaicML ComposeRL authors
 # SPDX-License-Identifier: Apache-2.0
 
+# run cmd: VLLM_ATTENTION_BACKEND=FLASH_ATTN composer test_single_controller_ppo.py
+
 import logging
 import os
 from functools import partial
@@ -339,7 +341,7 @@ def train_iteration(self):
         print('train 1 iter done')
 
 
-class InferenceAgent:
+class RolloutAgent:
 
     def __init__(self, num_vllm_engines: int, vllm_tensor_parallel_size: int, pretrain_model_name: str):
         self.num_vllm_engines = num_vllm_engines
@@ -371,7 +373,7 @@ def generate(self, prompts: list[str]):
 
 class PPOController:
 
-    def __init__(self, train_actor: TrainActorGroup, inference_client: InferenceAgent, pretrain_model_name: str):
+    def __init__(self, train_actor: TrainActorGroup, inference_client: RolloutAgent, pretrain_model_name: str):
         self.train_actor = train_actor
         self.inference_client = inference_client
 
@@ -406,7 +408,7 @@ def run():
                 world_size - num_train_actors
             ) // vllm_tensor_parallel_size
 
-            inference_client = InferenceAgent(num_vllm_engines, vllm_tensor_parallel_size, pretrain_model_name)
+            inference_client = RolloutAgent(num_vllm_engines, vllm_tensor_parallel_size, pretrain_model_name)
 
             ppo_controller = PPOController(train_actor, inference_client, pretrain_model_name)
             ppo_controller.train()

From c25bebc7e7ca1685836778c8a38347d56d0de19f Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 25 Jul 2025 07:19:49 +0000
Subject: [PATCH 082/107] use device directly

---
 compose_rl/algorithms/online/callback.py      |  2 +-
 .../online/generation_utils/vllm_utils.py     | 19 +++++-------
 .../online/single_controller_callback.py      |  6 ++--
 test_single_controller_ppo.py                 | 29 ++++++++++++++++++-
 4 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/compose_rl/algorithms/online/callback.py b/compose_rl/algorithms/online/callback.py
index 1a97eb0a..81deb765 100644
--- a/compose_rl/algorithms/online/callback.py
+++ b/compose_rl/algorithms/online/callback.py
@@ -1157,7 +1157,7 @@ def _update_inference_model(self, batch: dict[str, torch.Tensor]):
             model=self.actor_critic,
             vllm_engines=self.vllm_engines,
             model_update_group=self.model_update_group,
-            batch=batch,
+            device=batch['prompt'].device,
             loss_type=self.actor_critic.loss_type,  # type: ignore
             enable_prefix_caching=self.vllm_enable_prefix_caching,
         )
diff --git a/compose_rl/algorithms/online/generation_utils/vllm_utils.py b/compose_rl/algorithms/online/generation_utils/vllm_utils.py
index 62c86d41..4da62390 100644
--- a/compose_rl/algorithms/online/generation_utils/vllm_utils.py
+++ b/compose_rl/algorithms/online/generation_utils/vllm_utils.py
@@ -381,7 +381,7 @@ def broadcast_to_vllm(
     model: nn.Module,
     vllm_engines: list,
     model_update_group: Optional[torch.distributed.ProcessGroup],
-    batch: dict[str, torch.Tensor],
+    device: torch.device,
     loss_type: OnPolicyEnum = OnPolicyEnum.PPO,
     enable_prefix_caching: bool = False,
 ):
@@ -391,7 +391,7 @@ def broadcast_to_vllm(
         model (nn.Module): The model to broadcast
         vllm_engines (list): List of vllm engines
         model_update_group (torch.distributed.ProcessGroup): The process group for model updates
-        batch (dict[str, torch.Tensor]): The batch to use for the forward pass
+        device (torch.device): The device to use for the forward pass
         loss_type (str): The loss type which decides whether to use critic-free or not. Defaults to `ppo`.
         enable_prefix_caching (bool): Whether to enable prefix caching. Defaults to `False`.
     """
@@ -419,9 +419,6 @@ def broadcast_to_vllm(
             engine.reset_prefix_cache.remote() for engine in vllm_engines
         ]
 
-    # This is needed to get the correct model device
-    cur_device = batch['prompt'].device
-
     # These apply to llama modules, it might change for other modules
     valid_non_leaf_module_names = [
         'model.embed_tokens.weight',
@@ -438,17 +435,17 @@ def broadcast_to_vllm(
         # We need this otherwise FSDP throws an error during a standard forward pass.
         dummy_batch = {
             'obs':
-                torch.tensor([[0]], dtype=torch.long, device=cur_device),
+                torch.tensor([[0]], dtype=torch.long, device=device),
             'right_padded_attn_mask':
-                torch.tensor([[1]], dtype=torch.bool, device=cur_device),
+                torch.tensor([[1]], dtype=torch.bool, device=device),
             'actions':
-                torch.tensor([[0]], dtype=torch.long, device=cur_device),
+                torch.tensor([[0]], dtype=torch.long, device=device),
             'prompt_len':
-                torch.tensor([1], device=cur_device),
+                torch.tensor([1], device=device),
             'max_gen_len':
-                torch.tensor([1], device=cur_device),
+                torch.tensor([1], device=device),
             'action_mask':
-                torch.tensor([[0]], dtype=torch.long, device=cur_device),
+                torch.tensor([[0]], dtype=torch.long, device=device),
         }
         model(dummy_batch)
     start_time = time.time()
diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index 85cf8dba..a6c31b29 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -52,7 +52,7 @@ def update_and_query_inference_engines(self, device: Any, vllm_engines: list[Any
             vllm_engines (list[Any]): The vllm engines to round trip to.
         """
         batch = device.batch_to_device(self._get_next_iter_prompts())
-        self._update_inference_model(batch, vllm_engines, model_update_group)
+        self._update_inference_model(vllm_engines, model_update_group)
         self.batch_rollouts = self._interact_with_env(batch, vllm_engines)
 
     def iteration_start(self, state: State, logger: Logger):
@@ -102,14 +102,14 @@ def _interact_with_env(self, batch: dict[str, torch.Tensor], vllm_engines: list[
         return batch
 
 
-    def _update_inference_model(self, batch: dict[str, torch.Tensor], vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
+    def _update_inference_model(self, vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
         start_time = time.time()
         log.info('Before broadcast to vLLM')
         broadcast_to_vllm(
             self.actor_critic,
             vllm_engines,
             model_update_group,
-            batch,
+            device=torch.device('cuda'),
             loss_type=self.actor_critic.loss_type,  # type: ignore
         )
         log.info('Finished broadcasting to vLLM')
diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
index 8fd6bb2f..2f7e6d2e 100644
--- a/test_single_controller_ppo.py
+++ b/test_single_controller_ppo.py
@@ -5,6 +5,7 @@
 
 import logging
 import os
+import time
 from functools import partial
 from typing import Any, Optional
 
@@ -25,7 +26,10 @@
     ComposerHFPolicyLM,
     SingleControllerOnPolicyCallback,
 )
-from compose_rl.algorithms.online.generation_utils import create_vllm_engines
+from compose_rl.algorithms.online.generation_utils import (
+    broadcast_to_vllm,
+    create_vllm_engines,
+)
 from compose_rl.data import prompt_dataset_collate_fn
 from compose_rl.utils.ray_utils import start_ray_server
 
@@ -239,6 +243,29 @@ def train_1_iter(self):
             atol=5e-5,
         )
 
+    def update_inference_model(self, batch: dict[str, torch.Tensor], vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
+        start_time = time.time()
+        print('Before broadcast to vLLM')
+        broadcast_to_vllm(
+            self.ppo_callback.actor_critic,
+            vllm_engines,
+            model_update_group,
+            device=batch['prompt'].device,
+            loss_type=self.ppo_callback.actor_critic.loss_type,  # type: ignore
+        )
+        print('Finished broadcasting to vLLM')
+        print(f'Took: {time.time() - start_time} to broadcast to vllm.')
+        dist.barrier()
+
+    def query_inference_engines(self, device: Any, vllm_engines: list[Any]):
+        """Round trip to inference engines.
+        
+        Args:
+            vllm_engines (list[Any]): The vllm engines to round trip to.
+        """
+        batch = device.batch_to_device(self.ppo_callback._get_next_iter_prompts())
+        self.ppo_callback.batch_rollouts = self.ppo_callback._interact_with_env(batch, vllm_engines)
+    
     def update_and_query_inference_engines(self, vllm_engines: list[Any]):
         self.ppo_callback.update_and_query_inference_engines(
             device=self.ppo_trainer.state.device,

From 7195e4e295c5e67ab0612456105dbb0d20d4d9d8 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 25 Jul 2025 07:54:48 +0000
Subject: [PATCH 083/107] clean up to last method

---
 .../online/single_controller_callback.py      |  49 -----
 test_single_controller_ppo.py                 | 184 ++++++++++--------
 2 files changed, 103 insertions(+), 130 deletions(-)

diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index a6c31b29..e376bf12 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -45,16 +45,6 @@ class SingleControllerOnPolicyCallback(OnPolicyCallback):
     Ideally all the overwritten methods below should be implemented in the trainer actor instead of the callback, we kept them here for now to minimize a drastic refactor to PPO Callback code
     """
 
-    def update_and_query_inference_engines(self, device: Any, vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
-        """Round trip to inference engines.
-        
-        Args:
-            vllm_engines (list[Any]): The vllm engines to round trip to.
-        """
-        batch = device.batch_to_device(self._get_next_iter_prompts())
-        self._update_inference_model(vllm_engines, model_update_group)
-        self.batch_rollouts = self._interact_with_env(batch, vllm_engines)
-
     def iteration_start(self, state: State, logger: Logger):
         del logger  # unused
 
@@ -76,42 +66,3 @@ def iteration_start(self, state: State, logger: Logger):
         # Update IFT KL
         self._update_ift_kl()
 
-    def _interact_with_env(self, batch: dict[str, torch.Tensor], vllm_engines: list[Any]):
-        """Have the policy interact with the environment.
-
-        Here, we redo microbatching, and run generate appropriately. We add the environment
-        interactions to the buffer.
-
-        Args:
-            batch (dict): the iteration level batch we want to interact with the environment.
-        """
-        max_gen_len = self.max_gen_len
-        generation_kwargs = self.generation_kwargs
-        with get_precision_context(self.precision), torch.no_grad():
-            # If vllm engines are available, we use them to generate sequences in one go
-            sequences = vllm_generate(
-                vllm_engines=vllm_engines,
-                batch=batch,
-                max_gen_len=max_gen_len,
-                generation_kwargs=generation_kwargs,
-                tokenizer=self.tokenizer,  # type: ignore
-                vllm_generate_function='generate',
-            )
-        # Add the prepared sequences to the batch again
-        batch['sequences'] = sequences
-        return batch
-
-
-    def _update_inference_model(self, vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
-        start_time = time.time()
-        log.info('Before broadcast to vLLM')
-        broadcast_to_vllm(
-            self.actor_critic,
-            vllm_engines,
-            model_update_group,
-            device=torch.device('cuda'),
-            loss_type=self.actor_critic.loss_type,  # type: ignore
-        )
-        log.info('Finished broadcasting to vLLM')
-        log.info(f'Took: {time.time() - start_time} to broadcast to vllm.')
-        dist.barrier()
diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
index 2f7e6d2e..c49aacb7 100644
--- a/test_single_controller_ppo.py
+++ b/test_single_controller_ppo.py
@@ -20,6 +20,7 @@
 from composer import Trainer
 from composer.optim import DecoupledAdamW
 from composer.utils import dist as composer_dist
+from composer.core import get_precision_context
 from llmfoundry.models import ComposerHFCausalLM
 
 from compose_rl.algorithms.online import (
@@ -29,6 +30,7 @@
 from compose_rl.algorithms.online.generation_utils import (
     broadcast_to_vllm,
     create_vllm_engines,
+    vllm_generate,
 )
 from compose_rl.data import prompt_dataset_collate_fn
 from compose_rl.utils.ray_utils import start_ray_server
@@ -50,19 +52,77 @@ def __init__(self,
         rank: int,
         world_size: int,
         master_addr: Optional[str] = None,
-        master_port: Optional[int] = None):
+        master_port: Optional[int] = None,):
         super().__init__(rank, world_size, master_addr, master_port)
         self.model = None
         self.model_update_group = None
-        self.pretrain_model_name = None
         self.ref_path = None
         self._dataloader = None
         self._tokenizer = None
         self.ppo_callback = None
         self.ppo_trainer: Trainer = None
 
+        self.pretrain_model_name = None
+        self.device_train_batch_size = None
+        self.num_batches_per_update = None
+        self.max_seq_len = None
+        self.precision = None
+        self.train_config: dict = None
+
+    def build_train_config(self, pretrain_model_name: str):
+        self.pretrain_model_name = pretrain_model_name
         self.device_train_batch_size = 4
         self.num_batches_per_update = 2
+        self.max_seq_len = 32
+        self.precision = 'amp_bf16'
+
+        ref_model_config = {**self.model_config, 'name': 'hf_causal_lm'}
+
+        variables = {
+            'buffer': {
+                'name': 'MinibatchRolloutBuffer',
+                'max_buffer_size': self.num_batches_per_update,
+            },
+            'max_gen_len': 8,
+            'gamma': 0.99,
+            'lambda_gae': 0.95,
+            'generation_kwargs': {
+                'use_cache': True,
+                'do_sample': False,
+                'temperature': 1.0,
+            },
+            'kl_controller': {
+                'init_kl_coef': 0.2,
+                'target': 0.01,
+                'horizon': 12800,
+                'kl_ctl_type': 'adaptive',
+            },
+            'reference_model': {
+                'model_config': ref_model_config,
+                'precision': self.precision,
+                'load_path': self.ref_path,
+                'non_train_fsdp_config': self.fsdp_config,
+            },
+            'epoch_per_iteration': 1,
+            'num_batches_per_update': self.num_batches_per_update,
+            'rewards': {
+                'output_length': {
+                    'reward_type': 'output_length',
+                    'max_gen_len': 10,
+                },
+            },
+        }
+        self.train_config = {
+            'model': {**self.model_config, 'kl_estimator': 'k1', 'kl_clip_range': 40.0},
+            'fsdp_config': self.fsdp_config,
+            'seed': 17,
+            'precision': self.precision,
+            'variables': variables,
+            'max_seq_len': self.max_seq_len,
+            'global_train_batch_size': self.device_train_batch_size * self.world_size,
+            'device_train_batch_size': self.device_train_batch_size,
+            'device_train_microbatch_size': self.device_train_batch_size,
+        }
 
     def build_dataloader(self):
         # dataloader should be built with inference agent instead with this trainer actor,
@@ -120,7 +180,7 @@ def fsdp_config(self):
     def init_composer_dist(self):
         composer_dist.initialize_dist('gpu')
 
-    def build_ref_model(self, pretrain_model_name: str):
+    def build_ref_model(self):
         # train a reference model for the PPO training
         # The key observation here is that we should construct our high level model training logic in the actor instead of the callback
         # e.g., we can build ref/reward/policy/value model and create/colocate multiple trainers all in this class 
@@ -130,7 +190,6 @@ def build_ref_model(self, pretrain_model_name: str):
             self.ref_path = ref_path
             return
 
-        self.pretrain_model_name = pretrain_model_name
         tmp_model = ComposerHFCausalLM(**self.model_config, use_auth_token=True)
 
         tmp_optimizer = DecoupledAdamW(tmp_model.parameters(), lr=1e-6)
@@ -157,74 +216,22 @@ def build_ref_model(self, pretrain_model_name: str):
         # After making the reference model, we can proceed with the PPO training
         self.ref_path = ref_path
 
-    def build_ppo_trainer(self, pretrain_model_name: str):
-        self.pretrain_model_name = pretrain_model_name
+    def build_ppo_trainer(self):
         composer_dist.initialize_dist('gpu')
-        max_seq_len = 32
-        precision = 'amp_bf16'
 
         model = ComposerHFPolicyLM(**self.model_config, use_auth_token=True)
 
         optimizer = DecoupledAdamW(model.parameters(), lr=1e-8)
 
-        # ref_model_config = copy.deepcopy(self.model_config)
-        ref_model_config = {**self.model_config, 'name': 'hf_causal_lm'}
-
-        variables = {
-            'buffer': {
-                'name': 'MinibatchRolloutBuffer',
-                'max_buffer_size': self.num_batches_per_update,
-            },
-            'max_gen_len': 8,
-            'gamma': 0.99,
-            'lambda_gae': 0.95,
-            'generation_kwargs': {
-                'use_cache': True,
-                'do_sample': False,
-                'temperature': 1.0,
-            },
-            'kl_controller': {
-                'init_kl_coef': 0.2,
-                'target': 0.01,
-                'horizon': 12800,
-                'kl_ctl_type': 'adaptive',
-            },
-            'reference_model': {
-                'model_config': ref_model_config,
-                'precision': precision,
-                'load_path': self.ref_path,
-                'non_train_fsdp_config': self.fsdp_config,
-            },
-            'epoch_per_iteration': 1,
-            'num_batches_per_update': self.num_batches_per_update,
-            'rewards': {
-                'output_length': {
-                    'reward_type': 'output_length',
-                    'max_gen_len': 10,
-                },
-            },
-        }
-        train_config = {
-            'model': {**self.model_config, 'kl_estimator': 'k1', 'kl_clip_range': 40.0},
-            'fsdp_config': self.fsdp_config,
-            'seed': 17,
-            'precision': precision,
-            'variables': variables,
-            'max_seq_len': max_seq_len,
-            'global_train_batch_size': self.device_train_batch_size * self.world_size,
-            'device_train_batch_size': self.device_train_batch_size,
-            'device_train_microbatch_size': self.device_train_batch_size,
-        }
-
         # ideally we should pull the rest of the training logic from the callback to this class as well,
         # e.g, how to interact with env, calculate rewards etc
-        self.ppo_callback = SingleControllerOnPolicyCallback(train_config=train_config)
+        self.ppo_callback = SingleControllerOnPolicyCallback(train_config=self.train_config)
         self.ppo_trainer = Trainer(
             model=model,
             optimizers=optimizer,
             callbacks=self.ppo_callback,
             train_dataloader=self.dataloader,
-            precision=precision,
+            precision=self.precision,
             parallelism_config={'fsdp': self.fsdp_config},
             max_duration='3iter',
             device_train_microbatch_size=1,
@@ -243,35 +250,43 @@ def train_1_iter(self):
             atol=5e-5,
         )
 
-    def update_inference_model(self, batch: dict[str, torch.Tensor], vllm_engines: list[Any], model_update_group: dist.ProcessGroup):
+    def update_inference_model(self, vllm_engines: list[Any]):
         start_time = time.time()
         print('Before broadcast to vLLM')
         broadcast_to_vllm(
             self.ppo_callback.actor_critic,
             vllm_engines,
-            model_update_group,
-            device=batch['prompt'].device,
+            self.model_update_group,
+            device=torch.device('cuda'),
             loss_type=self.ppo_callback.actor_critic.loss_type,  # type: ignore
         )
         print('Finished broadcasting to vLLM')
         print(f'Took: {time.time() - start_time} to broadcast to vllm.')
         dist.barrier()
 
-    def query_inference_engines(self, device: Any, vllm_engines: list[Any]):
+    def query_inference_engines(self, vllm_engines: list[Any]):
         """Round trip to inference engines.
         
         Args:
             vllm_engines (list[Any]): The vllm engines to round trip to.
         """
-        batch = device.batch_to_device(self.ppo_callback._get_next_iter_prompts())
-        self.ppo_callback.batch_rollouts = self.ppo_callback._interact_with_env(batch, vllm_engines)
-    
-    def update_and_query_inference_engines(self, vllm_engines: list[Any]):
-        self.ppo_callback.update_and_query_inference_engines(
-            device=self.ppo_trainer.state.device,
-            vllm_engines=vllm_engines,
-            model_update_group=self.model_update_group,
-        )
+        batch = self.ppo_trainer.state.device.batch_to_device(self.ppo_callback._get_next_iter_prompts())
+        max_gen_len = self.train_config['variables']['max_gen_len']
+        generation_kwargs = self.train_config['variables']['generation_kwargs']
+        with get_precision_context(self.precision), torch.no_grad():
+            # If vllm engines are available, we use them to generate sequences in one go
+            sequences = vllm_generate(
+                vllm_engines=vllm_engines,
+                batch=batch,
+                max_gen_len=max_gen_len,
+                generation_kwargs=generation_kwargs,
+                tokenizer=self.tokenizer,  # type: ignore
+                vllm_generate_function='generate',
+            )
+        # Add the prepared sequences to the batch again
+        batch['sequences'] = sequences
+        self.ppo_callback.batch_rollouts = batch
+
 
 def setup_process_groups(master_actor: Any, vllm_engines: list[Any], vllm_tensor_parallel_size: int):
     """Initialize process groups for vLLM engines and master actor."""
@@ -341,25 +356,31 @@ class TrainActorGroup(SPMDActorGroup):
 
     def build_models(self, pretrain_model_name: str):
         """Build reference models and PPO trainers for all actors."""
+        build_train_config_tasks = [actor.build_train_config.remote(pretrain_model_name) for actor in self._train_actors]
+        ray.get(build_train_config_tasks)
+
         init_task = [actor.init_composer_dist.remote() for actor in self._train_actors]
         ray.get(init_task)
-        print('init composer dist done')
 
         # Build reference models
-        build_ref_model_tasks = [actor.build_ref_model.remote(pretrain_model_name) for actor in self._train_actors]
+        build_ref_model_tasks = [actor.build_ref_model.remote() for actor in self._train_actors]
         ray.get(build_ref_model_tasks)
         print('build ref model done')
 
         # Build PPO trainers
-        build_ppo_trainer_tasks = [actor.build_ppo_trainer.remote(pretrain_model_name) for actor in self._train_actors]
+        build_ppo_trainer_tasks = [actor.build_ppo_trainer.remote() for actor in self._train_actors]
         ray.get(build_ppo_trainer_tasks)
         print('build ppo trainer done')
-
-    def sync_weights_and_generate(self, vllm_engines: list[Any]):
-        """Sync weights and generate with inference engines."""
-        refs = [actor.update_and_query_inference_engines.remote(vllm_engines) for actor in self._train_actors]
+    
+    def update_inference_model(self, vllm_engines: list[Any]):
+        refs = [actor.update_inference_model.remote(vllm_engines) for actor in self._train_actors]
+        ray.get(refs)
+        print('update inference model done')
+    
+    def query_inference_engines(self, vllm_engines: list[Any]):
+        refs = [actor.query_inference_engines.remote(vllm_engines) for actor in self._train_actors]
         ray.get(refs)
-        print('sync weight and gen done')
+        print('query inference engines done')
 
     def train_iteration(self):
         """Run one training iteration on all actors."""
@@ -409,7 +430,8 @@ def __init__(self, train_actor: TrainActorGroup, inference_client: RolloutAgent,
 
     
     def train(self):
-        self.train_actor.sync_weights_and_generate(self.inference_client.vllm_engines)
+        self.train_actor.update_inference_model(self.inference_client.vllm_engines)
+        self.train_actor.query_inference_engines(self.inference_client.vllm_engines)
         self.train_actor.train_iteration()
 
 

From f28b8c809a4eb6aa55d8d497e530759245929f16 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 25 Jul 2025 07:56:02 +0000
Subject: [PATCH 084/107] clean up

---
 .../online/single_controller_callback.py           | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index e376bf12..a2267aaf 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -6,23 +6,13 @@
 from __future__ import annotations
 
 import logging
-import time
-from typing import Any, Union
+from typing import Union
 
-import torch
-from composer.core import (
-    State,
-    get_precision_context,
-)
+from composer.core import State
 from composer.loggers import Logger
 from composer.trainer.trainer import _get_initial_device_train_microbatch_size
-from composer.utils import dist
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
-from compose_rl.algorithms.online.generation_utils import (
-    broadcast_to_vllm,
-    vllm_generate,
-)
 from compose_rl.algorithms.online.model import (
     ComposerHFPolicyLM,
     ComposerMPTPolicyLM,

From d18053360b1ee54de43e18152654fb4ab6fc7e65 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 25 Jul 2025 20:50:47 +0000
Subject: [PATCH 085/107] mv vllm engines out

---
 test_single_controller_ppo.py | 42 +++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
index c49aacb7..02df5c19 100644
--- a/test_single_controller_ppo.py
+++ b/test_single_controller_ppo.py
@@ -391,24 +391,13 @@ def train_iteration(self):
 
 class RolloutAgent:
 
-    def __init__(self, num_vllm_engines: int, vllm_tensor_parallel_size: int, pretrain_model_name: str):
-        self.num_vllm_engines = num_vllm_engines
+    def __init__(self, vllm_engines: list, vllm_tensor_parallel_size: int):
+        self.vllm_engines = vllm_engines
         self.vllm_tensor_parallel_size = vllm_tensor_parallel_size
-        self.vllm_engines = create_vllm_engines(
-            num_engines=num_vllm_engines,
-            tensor_parallel_size=vllm_tensor_parallel_size,
-            enforce_eager=True,
-            pretrain=pretrain_model_name,
-            revision=None,
-            seed=1,
-            enable_prefix_caching=False,
-            max_model_len=512,
-            device_bundle={
-                'GPU': 1,
-                'CPU': 1,
-                'worker_node': 0,
-            },
-        )
+    
+    @property
+    def num_vllm_engines(self):
+        return len(self.vllm_engines)
 
     def generate(self, prompts: list[str]):
         ref = self.vllm_engines[0].generate.remote(prompts)
@@ -456,8 +445,23 @@ def run():
             num_vllm_engines = (
                 world_size - num_train_actors
             ) // vllm_tensor_parallel_size
-
-            inference_client = RolloutAgent(num_vllm_engines, vllm_tensor_parallel_size, pretrain_model_name)
+            # TODO: Encapsulate this into a inference server manager class
+            vllm_engines = create_vllm_engines(
+                        num_engines=num_vllm_engines,
+                        tensor_parallel_size=vllm_tensor_parallel_size,
+                        enforce_eager=True,
+                        pretrain=pretrain_model_name,
+                        revision=None,
+                        seed=1,
+                        enable_prefix_caching=False,
+                        max_model_len=512,
+                        device_bundle={
+                            'GPU': 1,
+                            'CPU': 1,
+                            'worker_node': 0,
+                        },
+                    )
+            inference_client = RolloutAgent(vllm_engines, vllm_tensor_parallel_size)
 
             ppo_controller = PPOController(train_actor, inference_client, pretrain_model_name)
             ppo_controller.train()

From 5c18c99804a359e3a96c26dbe97728f79f6f6925 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 25 Jul 2025 23:41:51 +0000
Subject: [PATCH 086/107] yeah works

---
 test_single_controller_ppo.py | 472 ----------------------------------
 tests/fixtures/fixtures.py    |  42 +++
 2 files changed, 42 insertions(+), 472 deletions(-)
 delete mode 100644 test_single_controller_ppo.py

diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
deleted file mode 100644
index 02df5c19..00000000
--- a/test_single_controller_ppo.py
+++ /dev/null
@@ -1,472 +0,0 @@
-# Copyright 2024 MosaicML ComposeRL authors
-# SPDX-License-Identifier: Apache-2.0
-
-# run cmd: VLLM_ATTENTION_BACKEND=FLASH_ATTN composer test_single_controller_ppo.py
-
-import logging
-import os
-import time
-from functools import partial
-from typing import Any, Optional
-
-import ray
-import torch
-import torch.distributed as dist
-from torch.utils.data import DataLoader
-from transformers import (
-    AutoTokenizer,
-)
-
-from composer import Trainer
-from composer.optim import DecoupledAdamW
-from composer.utils import dist as composer_dist
-from composer.core import get_precision_context
-from llmfoundry.models import ComposerHFCausalLM
-
-from compose_rl.algorithms.online import (
-    ComposerHFPolicyLM,
-    SingleControllerOnPolicyCallback,
-)
-from compose_rl.algorithms.online.generation_utils import (
-    broadcast_to_vllm,
-    create_vllm_engines,
-    vllm_generate,
-)
-from compose_rl.data import prompt_dataset_collate_fn
-from compose_rl.utils.ray_utils import start_ray_server
-
-from tests.common import (
-    BaseDistributedGPUActor,
-    VerifiablePromptDataset,
-)
-
-# Set up logging
-logger = logging.getLogger(__name__)
-
-
-@ray.remote(num_gpus=1)
-class DistributedGPUActor(BaseDistributedGPUActor):
-    """Distributed GPU actor for testing. Moved part of controller logic from PPO Callback to here."""
-
-    def __init__(self,
-        rank: int,
-        world_size: int,
-        master_addr: Optional[str] = None,
-        master_port: Optional[int] = None,):
-        super().__init__(rank, world_size, master_addr, master_port)
-        self.model = None
-        self.model_update_group = None
-        self.ref_path = None
-        self._dataloader = None
-        self._tokenizer = None
-        self.ppo_callback = None
-        self.ppo_trainer: Trainer = None
-
-        self.pretrain_model_name = None
-        self.device_train_batch_size = None
-        self.num_batches_per_update = None
-        self.max_seq_len = None
-        self.precision = None
-        self.train_config: dict = None
-
-    def build_train_config(self, pretrain_model_name: str):
-        self.pretrain_model_name = pretrain_model_name
-        self.device_train_batch_size = 4
-        self.num_batches_per_update = 2
-        self.max_seq_len = 32
-        self.precision = 'amp_bf16'
-
-        ref_model_config = {**self.model_config, 'name': 'hf_causal_lm'}
-
-        variables = {
-            'buffer': {
-                'name': 'MinibatchRolloutBuffer',
-                'max_buffer_size': self.num_batches_per_update,
-            },
-            'max_gen_len': 8,
-            'gamma': 0.99,
-            'lambda_gae': 0.95,
-            'generation_kwargs': {
-                'use_cache': True,
-                'do_sample': False,
-                'temperature': 1.0,
-            },
-            'kl_controller': {
-                'init_kl_coef': 0.2,
-                'target': 0.01,
-                'horizon': 12800,
-                'kl_ctl_type': 'adaptive',
-            },
-            'reference_model': {
-                'model_config': ref_model_config,
-                'precision': self.precision,
-                'load_path': self.ref_path,
-                'non_train_fsdp_config': self.fsdp_config,
-            },
-            'epoch_per_iteration': 1,
-            'num_batches_per_update': self.num_batches_per_update,
-            'rewards': {
-                'output_length': {
-                    'reward_type': 'output_length',
-                    'max_gen_len': 10,
-                },
-            },
-        }
-        self.train_config = {
-            'model': {**self.model_config, 'kl_estimator': 'k1', 'kl_clip_range': 40.0},
-            'fsdp_config': self.fsdp_config,
-            'seed': 17,
-            'precision': self.precision,
-            'variables': variables,
-            'max_seq_len': self.max_seq_len,
-            'global_train_batch_size': self.device_train_batch_size * self.world_size,
-            'device_train_batch_size': self.device_train_batch_size,
-            'device_train_microbatch_size': self.device_train_batch_size,
-        }
-
-    def build_dataloader(self):
-        # dataloader should be built with inference agent instead with this trainer actor,
-        # it is still attached to trainer actor here to avoid a full refactor to PPO Callback code
-        max_seq_len = 32
-        prompt_len = 10
-
-        dataset = VerifiablePromptDataset(prompt_len=prompt_len)
-        dataloader = DataLoader(
-            dataset,
-            collate_fn=partial(
-                prompt_dataset_collate_fn,
-                self.tokenizer,
-                max_seq_len,
-            ),
-            sampler=composer_dist.get_sampler(dataset),
-            batch_size=self.device_train_batch_size,
-        )
-        # We need to mock this method, since our dataset isn't a StreamingDataset
-        dataloader.state_dict = lambda: {}
-        dataloader.load_state_dict = lambda x: None
-        return dataloader
-
-    @property
-    def dataloader(self):
-        if self._dataloader is None:
-            self._dataloader = self.build_dataloader()
-        return self._dataloader
-
-    def build_tokenizer(self):
-        tokenizer = AutoTokenizer.from_pretrained(self.pretrain_model_name)
-        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-        return tokenizer
-
-    @property
-    def tokenizer(self):
-        if self._tokenizer is None:
-            self._tokenizer = self.build_tokenizer()
-        return self._tokenizer
-
-    @property
-    def model_config(self):
-        return {
-            'tokenizer': self.tokenizer,
-            'pretrained_model_name_or_path': self.pretrain_model_name,
-            'pretrained': True,
-            'use_flash_attention_2': True,
-            'allow_embedding_resizing': True,
-        }
-
-    @property
-    def fsdp_config(self):
-        return dict()
-
-    def init_composer_dist(self):
-        composer_dist.initialize_dist('gpu')
-
-    def build_ref_model(self):
-        # train a reference model for the PPO training
-        # The key observation here is that we should construct our high level model training logic in the actor instead of the callback
-        # e.g., we can build ref/reward/policy/value model and create/colocate multiple trainers all in this class 
-        tmp_ref_path = str('./ref_checkpoints')
-        ref_path = os.path.join(tmp_ref_path, 'latest-rank0.pt')
-        if os.path.exists(ref_path):
-            self.ref_path = ref_path
-            return
-
-        tmp_model = ComposerHFCausalLM(**self.model_config, use_auth_token=True)
-
-        tmp_optimizer = DecoupledAdamW(tmp_model.parameters(), lr=1e-6)
-
-        temp_dataloader = [{
-            'input_ids': torch.ones((2, 15)).to(dtype=torch.int64),
-            'attention_mask': torch.ones((2, 15)),
-            'labels': torch.ones((2, 15)).to(dtype=torch.int64),
-        }]
-
-        temp_trainer = Trainer(
-            model=tmp_model,
-            train_dataloader=temp_dataloader,
-            optimizers=tmp_optimizer,
-            max_duration='1ba',
-            parallelism_config={'fsdp': self.fsdp_config},
-            save_folder=tmp_ref_path,
-            save_weights_only=True,
-            device_train_microbatch_size=self.device_train_microbatch_size,
-        )
-
-        temp_trainer.fit()
-
-        # After making the reference model, we can proceed with the PPO training
-        self.ref_path = ref_path
-
-    def build_ppo_trainer(self):
-        composer_dist.initialize_dist('gpu')
-
-        model = ComposerHFPolicyLM(**self.model_config, use_auth_token=True)
-
-        optimizer = DecoupledAdamW(model.parameters(), lr=1e-8)
-
-        # ideally we should pull the rest of the training logic from the callback to this class as well,
-        # e.g, how to interact with env, calculate rewards etc
-        self.ppo_callback = SingleControllerOnPolicyCallback(train_config=self.train_config)
-        self.ppo_trainer = Trainer(
-            model=model,
-            optimizers=optimizer,
-            callbacks=self.ppo_callback,
-            train_dataloader=self.dataloader,
-            precision=self.precision,
-            parallelism_config={'fsdp': self.fsdp_config},
-            max_duration='3iter',
-            device_train_microbatch_size=1,
-            load_path=self.ref_path,
-        )
-
-    def train_1_iter(self):
-        # we should implement the top level PPO algo here instead of the callback
-        # algorithmic researchers are expected to implement this function along with above policy/value/reward/ref trainers or models
-        self.ppo_trainer.fit(duration='1iter')
-        # This is the KL assert that must be true if we are truly loading from the same model.
-        # This is only true on the first iteration
-        assert torch.allclose(
-            self.ppo_trainer.state.loss['kl/ift_kl'], # pyright: ignore
-            torch.tensor(0.0),
-            atol=5e-5,
-        )
-
-    def update_inference_model(self, vllm_engines: list[Any]):
-        start_time = time.time()
-        print('Before broadcast to vLLM')
-        broadcast_to_vllm(
-            self.ppo_callback.actor_critic,
-            vllm_engines,
-            self.model_update_group,
-            device=torch.device('cuda'),
-            loss_type=self.ppo_callback.actor_critic.loss_type,  # type: ignore
-        )
-        print('Finished broadcasting to vLLM')
-        print(f'Took: {time.time() - start_time} to broadcast to vllm.')
-        dist.barrier()
-
-    def query_inference_engines(self, vllm_engines: list[Any]):
-        """Round trip to inference engines.
-        
-        Args:
-            vllm_engines (list[Any]): The vllm engines to round trip to.
-        """
-        batch = self.ppo_trainer.state.device.batch_to_device(self.ppo_callback._get_next_iter_prompts())
-        max_gen_len = self.train_config['variables']['max_gen_len']
-        generation_kwargs = self.train_config['variables']['generation_kwargs']
-        with get_precision_context(self.precision), torch.no_grad():
-            # If vllm engines are available, we use them to generate sequences in one go
-            sequences = vllm_generate(
-                vllm_engines=vllm_engines,
-                batch=batch,
-                max_gen_len=max_gen_len,
-                generation_kwargs=generation_kwargs,
-                tokenizer=self.tokenizer,  # type: ignore
-                vllm_generate_function='generate',
-            )
-        # Add the prepared sequences to the batch again
-        batch['sequences'] = sequences
-        self.ppo_callback.batch_rollouts = batch
-
-
-def setup_process_groups(master_actor: Any, vllm_engines: list[Any], vllm_tensor_parallel_size: int):
-    """Initialize process groups for vLLM engines and master actor."""
-    # Get a new port for the weight-update process group
-    master_addr, _ = ray.get(master_actor.get_master_address.remote())
-    new_port = ray.get(master_actor.get_free_port.remote())
-    print(f'new_port: {new_port}')
-    
-    world_size = dist.get_world_size()
-    
-    # Initialize process groups for vLLM engines
-    refs = [
-        engine.init_process_group.remote(
-            master_addr,
-            new_port,
-            i * vllm_tensor_parallel_size + 1,
-            world_size // 2 + 1,
-            'weight-update',
-            backend='nccl',
-        ) for i, engine in enumerate(vllm_engines)
-    ]
-    
-    # Add master actor to the process group
-    refs.append(master_actor.add_process_group.remote(
-        backend='nccl',
-        master_addr=master_addr,
-        master_port=new_port,
-        world_size=world_size // 2 + 1,
-        rank=0,
-        group_name='weight-update',
-    ))
-    
-    # Wait for all process groups to be initialized
-    print(ray.get(refs))
-
-
-class SPMDActorGroup:
-    def __init__(self, num_train_actors: int):
-        self.num_train_actors = num_train_actors
-
-        self._train_actors = []
-        """Create and initialize all training actors."""
-        print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
-        
-        # Create master actor first
-        self._master_actor = DistributedGPUActor.remote(0, self.num_train_actors)
-        self._train_actors.append(self._master_actor)
-        
-        # Get master address from rank 0 actor
-        master_addr, master_port = ray.get(self._master_actor.get_master_address.remote())
-        print(f"Master address allocated: {master_addr}:{master_port}")
-        
-        # Create remaining actors with the master address/port
-        for i in range(1, self.num_train_actors):
-            actor = DistributedGPUActor.remote(i, self.num_train_actors, master_addr, master_port)
-            self._train_actors.append(actor)
-
-    @property
-    def train_actors(self):
-        return self._train_actors
-
-    @property
-    def master_actor(self):
-        return self._master_actor
-
-class TrainActorGroup(SPMDActorGroup):
-
-    def build_models(self, pretrain_model_name: str):
-        """Build reference models and PPO trainers for all actors."""
-        build_train_config_tasks = [actor.build_train_config.remote(pretrain_model_name) for actor in self._train_actors]
-        ray.get(build_train_config_tasks)
-
-        init_task = [actor.init_composer_dist.remote() for actor in self._train_actors]
-        ray.get(init_task)
-
-        # Build reference models
-        build_ref_model_tasks = [actor.build_ref_model.remote() for actor in self._train_actors]
-        ray.get(build_ref_model_tasks)
-        print('build ref model done')
-
-        # Build PPO trainers
-        build_ppo_trainer_tasks = [actor.build_ppo_trainer.remote() for actor in self._train_actors]
-        ray.get(build_ppo_trainer_tasks)
-        print('build ppo trainer done')
-    
-    def update_inference_model(self, vllm_engines: list[Any]):
-        refs = [actor.update_inference_model.remote(vllm_engines) for actor in self._train_actors]
-        ray.get(refs)
-        print('update inference model done')
-    
-    def query_inference_engines(self, vllm_engines: list[Any]):
-        refs = [actor.query_inference_engines.remote(vllm_engines) for actor in self._train_actors]
-        ray.get(refs)
-        print('query inference engines done')
-
-    def train_iteration(self):
-        """Run one training iteration on all actors."""
-        refs = [actor.train_1_iter.remote() for actor in self._train_actors]
-        ray.get(refs)
-        print('train 1 iter done')
-
-
-class RolloutAgent:
-
-    def __init__(self, vllm_engines: list, vllm_tensor_parallel_size: int):
-        self.vllm_engines = vllm_engines
-        self.vllm_tensor_parallel_size = vllm_tensor_parallel_size
-    
-    @property
-    def num_vllm_engines(self):
-        return len(self.vllm_engines)
-
-    def generate(self, prompts: list[str]):
-        ref = self.vllm_engines[0].generate.remote(prompts)
-        gen_results = ray.get(ref)
-        for output in gen_results:
-            prompt = output.prompt
-            generated_text = output.outputs[0].text
-            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-class PPOController:
-
-    def __init__(self, train_actor: TrainActorGroup, inference_client: RolloutAgent, pretrain_model_name: str):
-        self.train_actor = train_actor
-        self.inference_client = inference_client
-
-        self.train_actor.build_models(pretrain_model_name)
-        setup_process_groups(self.train_actor.master_actor, self.inference_client.vllm_engines, self.inference_client.vllm_tensor_parallel_size)
-
-    
-    def train(self):
-        self.train_actor.update_inference_model(self.inference_client.vllm_engines)
-        self.train_actor.query_inference_engines(self.inference_client.vllm_engines)
-        self.train_actor.train_iteration()
-
-
-def run():
-    # This is an example of how to move the controller logic from PPO Callback to a separate trainer actor above and this main single controller function,
-    prompts = [
-        "what is RAY?",
-        "what is vLLM?",
-    ]
-    pretrain_model_name = 'meta-llama/Llama-3.2-1B-Instruct'
-    with start_ray_server():
-        if dist.get_rank() == 0:
-            # only rank 0 is the master controller
-            
-            # create SPMD training actors of the system
-            num_train_actors = dist.get_world_size() // 2
-            train_actor = TrainActorGroup(num_train_actors)
-
-            # Create vLLM engines (or inference actors)
-            world_size = dist.get_world_size()
-            vllm_tensor_parallel_size = world_size - num_train_actors
-            num_vllm_engines = (
-                world_size - num_train_actors
-            ) // vllm_tensor_parallel_size
-            # TODO: Encapsulate this into a inference server manager class
-            vllm_engines = create_vllm_engines(
-                        num_engines=num_vllm_engines,
-                        tensor_parallel_size=vllm_tensor_parallel_size,
-                        enforce_eager=True,
-                        pretrain=pretrain_model_name,
-                        revision=None,
-                        seed=1,
-                        enable_prefix_caching=False,
-                        max_model_len=512,
-                        device_bundle={
-                            'GPU': 1,
-                            'CPU': 1,
-                            'worker_node': 0,
-                        },
-                    )
-            inference_client = RolloutAgent(vllm_engines, vllm_tensor_parallel_size)
-
-            ppo_controller = PPOController(train_actor, inference_client, pretrain_model_name)
-            ppo_controller.train()
-
-            inference_client.generate(prompts)
-
-if __name__ == '__main__':
-    run()
diff --git a/tests/fixtures/fixtures.py b/tests/fixtures/fixtures.py
index 6271d071..409203cf 100644
--- a/tests/fixtures/fixtures.py
+++ b/tests/fixtures/fixtures.py
@@ -58,6 +58,33 @@ def tiny_gpt2_config_helper():
     return config_object
 
 
+def tiny_llama_config_helper():
+    """Return a small two-layer ``LlamaConfig`` for tests.
+
+    The test is skipped when ``transformers`` cannot be imported.
+    """
+    pytest.importorskip('transformers')
+    from transformers.models.llama.configuration_llama import LlamaConfig
+    llama_kwargs = {
+        'architectures': ['LlamaForCausalLM'],
+        'bos_token_id': 1,
+        'eos_token_id': 2,
+        'hidden_act': 'silu',
+        'hidden_size': 128,
+        'intermediate_size': 256,
+        'max_position_embeddings': 2048,
+        'model_type': 'llama',
+        'num_attention_heads': 4,
+        'num_hidden_layers': 2,
+        'num_key_value_heads': 4,
+        'rms_norm_eps': 1e-06,
+        'rope_theta': 10000.0,
+        'use_cache': True,
+        'vocab_size': 50258,  # Match GPT-2 tokenizer vocabulary size
+    }
+    return LlamaConfig(**llama_kwargs)
+
+
 def assets_path():
     rank = os.environ.get('RANK', '0')
     folder_name = 'tokenizers' + (f'_{rank}' if rank != '0' else '')
@@ -144,12 +171,22 @@ def _session_tiny_gpt2_model(_session_tiny_gpt2_config):  # type: ignore
     return causal_lm_model_helper(_session_tiny_gpt2_config)
 
 
+@pytest.fixture(scope='session')
+def _session_tiny_llama_model(_session_tiny_llama_config):  # type: ignore
+    return causal_lm_model_helper(_session_tiny_llama_config)
+
+
 ## SESSION CONFIGS ##
 @pytest.fixture(scope='session')
 def _session_tiny_gpt2_config():  # type: ignore
     return tiny_gpt2_config_helper()
 
 
+@pytest.fixture(scope='session')
+def _session_tiny_llama_config():  # type: ignore
+    return tiny_llama_config_helper()
+
+
 ## SESSION TOKENIZERS ##
 @pytest.fixture(scope='session')
 def _session_tiny_gpt2_tokenizer(tokenizers_assets):  # type: ignore
@@ -164,6 +201,11 @@ def tiny_gpt2_model(_session_tiny_gpt2_model):  # type: ignore
     return copy.deepcopy(_session_tiny_gpt2_model)
 
 
+@pytest.fixture
+def tiny_llama_model(_session_tiny_llama_model):  # type: ignore
+    return copy.deepcopy(_session_tiny_llama_model)
+
+
 ## TOKENIZER FIXTURES ##
 @pytest.fixture
 def tiny_gpt2_tokenizer(_session_tiny_gpt2_tokenizer):  # type: ignore

From 369e58e3d9a3c3f2ba9f555964a91093f6bff861 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 25 Jul 2025 23:56:59 +0000
Subject: [PATCH 087/107] relocate file and pytest works

---
 test_single_controller_ppo.py       |   1 +
 tests/test_single_controller_ppo.py | 511 ++++++++++++++++++++++++++++
 2 files changed, 512 insertions(+)
 create mode 120000 test_single_controller_ppo.py
 create mode 100644 tests/test_single_controller_ppo.py

diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
new file mode 120000
index 00000000..77eee340
--- /dev/null
+++ b/test_single_controller_ppo.py
@@ -0,0 +1 @@
+tests/test_single_controller_ppo.py
\ No newline at end of file
diff --git a/tests/test_single_controller_ppo.py b/tests/test_single_controller_ppo.py
new file mode 100644
index 00000000..078e7f1b
--- /dev/null
+++ b/tests/test_single_controller_ppo.py
@@ -0,0 +1,511 @@
+# Copyright 2024 MosaicML ComposeRL authors
+# SPDX-License-Identifier: Apache-2.0
+
+# run cmd: cd compose-rl && cp tests/test_single_controller_ppo.py . && composer test_single_controller_ppo.py
+
+import os
+import pathlib
+import time
+from functools import partial
+from typing import Any, Optional
+
+import pytest
+import ray
+import torch
+import torch.distributed as dist
+from torch.utils.data import DataLoader
+from transformers import (
+    AutoTokenizer,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+)
+
+from composer import Trainer
+from composer.optim import DecoupledAdamW
+from composer.utils import dist as composer_dist
+from composer.core import get_precision_context
+from llmfoundry.models import ComposerHFCausalLM
+
+from compose_rl.algorithms.online import (
+    ComposerHFPolicyLM,
+    SingleControllerOnPolicyCallback,
+)
+from compose_rl.algorithms.online.generation_utils import (
+    broadcast_to_vllm,
+    create_vllm_engines,
+    vllm_generate,
+)
+from compose_rl.data import prompt_dataset_collate_fn
+from compose_rl.utils.ray_utils import start_ray_server
+
+from tests.common import (
+    BaseDistributedGPUActor,
+    VerifiablePromptDataset,
+    world_size,
+)
+
+
+@ray.remote(num_gpus=1)
+class DistributedGPUActor(BaseDistributedGPUActor):
+    """Distributed GPU actor for testing. Moved part of controller logic from PPO Callback to here."""
+
+    def __init__(self,
+        rank: int,
+        world_size: int,
+        master_addr: Optional[str] = None,
+        master_port: Optional[int] = None,) -> None:
+        super().__init__(rank, world_size, master_addr, master_port)
+        self.model = None
+        self.model_update_group = None
+        self.ref_path = None
+        self._dataloader = None
+        self._tokenizer = None
+        self.ppo_callback = None
+        self.ppo_trainer: Trainer = None
+        # The training settings below stay None until build_train_config() fills them in
+        self.pretrain_model_name = None
+        self.device_train_batch_size = None
+        self.num_batches_per_update = None
+        self.max_seq_len = None
+        self.precision = None
+        self.train_config: dict = None
+
+    def build_train_config(self, pretrain_model_name: str):
+        self.pretrain_model_name = pretrain_model_name
+        self.device_train_batch_size = 4
+        self.num_batches_per_update = 2
+        self.max_seq_len = 32
+        self.precision = 'amp_bf16'
+
+        ref_model_config = {**self.model_config, 'name': 'hf_causal_lm'}
+
+        variables = {
+            'buffer': {
+                'name': 'MinibatchRolloutBuffer',
+                'max_buffer_size': self.num_batches_per_update,
+            },
+            'max_gen_len': 8,
+            'gamma': 0.99,
+            'lambda_gae': 0.95,
+            'generation_kwargs': {
+                'use_cache': True,
+                'do_sample': False,
+                'temperature': 1.0,
+            },
+            'kl_controller': {
+                'init_kl_coef': 0.2,
+                'target': 0.01,
+                'horizon': 12800,
+                'kl_ctl_type': 'adaptive',
+            },
+            'reference_model': {
+                'model_config': ref_model_config,
+                'precision': self.precision,
+                'load_path': self.ref_path,
+                'non_train_fsdp_config': self.fsdp_config,
+            },
+            'epoch_per_iteration': 1,
+            'num_batches_per_update': self.num_batches_per_update,
+            'rewards': {
+                'output_length': {
+                    'reward_type': 'output_length',
+                    'max_gen_len': 10,
+                },
+            },
+        }
+        self.train_config = {
+            'model': {**self.model_config, 'kl_estimator': 'k1', 'kl_clip_range': 40.0},
+            'fsdp_config': self.fsdp_config,
+            'seed': 17,
+            'precision': self.precision,
+            'variables': variables,
+            'max_seq_len': self.max_seq_len,
+            'global_train_batch_size': self.device_train_batch_size * self.world_size,
+            'device_train_batch_size': self.device_train_batch_size,
+            'device_train_microbatch_size': self.device_train_batch_size,
+        }
+
+    def build_dataloader(self) -> DataLoader:
+        # The dataloader should be built with the inference agent instead of with this trainer actor;
+        # it is still attached to the trainer actor here to avoid a full refactor of the PPO Callback code
+        max_seq_len = 32
+        prompt_len = 10
+
+        dataset = VerifiablePromptDataset(prompt_len=prompt_len)
+        dataloader = DataLoader(
+            dataset,
+            collate_fn=partial(
+                prompt_dataset_collate_fn,
+                self.tokenizer,
+                max_seq_len,
+            ),
+            sampler=composer_dist.get_sampler(dataset),
+            batch_size=self.device_train_batch_size,
+        )
+        # We need to mock these two methods, since our dataset isn't a StreamingDataset
+        dataloader.state_dict = lambda: {}
+        dataloader.load_state_dict = lambda x: None
+        return dataloader
+
+    @property
+    def dataloader(self):
+        if self._dataloader is None:
+            self._dataloader = self.build_dataloader()
+        return self._dataloader
+
+    def build_tokenizer(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.pretrain_model_name)
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        return tokenizer
+
+    @property
+    def tokenizer(self):
+        if self._tokenizer is None:
+            self._tokenizer = self.build_tokenizer()
+        return self._tokenizer
+
+    @property
+    def model_config(self):
+        return {
+            'tokenizer': self.tokenizer,
+            'pretrained_model_name_or_path': self.pretrain_model_name,
+            'pretrained': True,
+            'use_flash_attention_2': True,
+            'allow_embedding_resizing': True,
+        }
+
+    @property
+    def fsdp_config(self):
+        return dict()
+
+    def init_composer_dist(self):
+        composer_dist.initialize_dist('gpu')
+
+    def build_ref_model(self):
+        """Reuse ./ref_checkpoints/latest-rank0.pt if it exists, else train one batch to create it; sets self.ref_path."""
+        # Design note: the high level model training logic should be constructed in the actor instead of the callback,
+        # e.g., we can build ref/reward/policy/value model and create/colocate multiple trainers all in this class
+        tmp_ref_path = './ref_checkpoints'
+        ref_path = os.path.join(tmp_ref_path, 'latest-rank0.pt')
+        if os.path.exists(ref_path):
+            self.ref_path = ref_path
+            return
+
+        tmp_model = ComposerHFCausalLM(**self.model_config, use_auth_token=True)
+
+        tmp_optimizer = DecoupledAdamW(tmp_model.parameters(), lr=1e-6)
+
+        temp_dataloader = [{
+            'input_ids': torch.ones((2, 15)).to(dtype=torch.int64),
+            'attention_mask': torch.ones((2, 15)),
+            'labels': torch.ones((2, 15)).to(dtype=torch.int64),
+        }]
+        # NOTE: this actor only sets device_train_batch_size, so it doubles as the microbatch size (as in train_config)
+        temp_trainer = Trainer(
+            model=tmp_model,
+            train_dataloader=temp_dataloader,
+            optimizers=tmp_optimizer,
+            max_duration='1ba',
+            parallelism_config={'fsdp': self.fsdp_config},
+            save_folder=tmp_ref_path,
+            save_weights_only=True,
+            device_train_microbatch_size=self.device_train_batch_size,
+        )
+
+        temp_trainer.fit()
+
+        # After making the reference model, we can proceed with the PPO training
+        self.ref_path = ref_path
+
+    def build_ppo_trainer(self):
+        composer_dist.initialize_dist('gpu')
+
+        model = ComposerHFPolicyLM(**self.model_config, use_auth_token=True)
+
+        optimizer = DecoupledAdamW(model.parameters(), lr=1e-8)
+
+        # ideally we should pull the rest of the training logic from the callback to this class as well,
+        # e.g, how to interact with env, calculate rewards etc
+        self.ppo_callback = SingleControllerOnPolicyCallback(train_config=self.train_config)
+        self.ppo_trainer = Trainer(
+            model=model,
+            optimizers=optimizer,
+            callbacks=self.ppo_callback,
+            train_dataloader=self.dataloader,
+            precision=self.precision,
+            parallelism_config={'fsdp': self.fsdp_config},
+            max_duration='3iter',
+            device_train_microbatch_size=1,
+            load_path=self.ref_path,
+        )
+
+    def train_1_iter(self):
+        # we should implement the top level PPO algo here instead of the callback
+        # algorithmic researchers are expected to implement this function along with above policy/value/reward/ref trainers or models
+        self.ppo_trainer.fit(duration='1iter')
+        # This is the KL assert that must be true if we are truly loading from the same model.
+        # This is only true on the first iteration
+        assert torch.allclose(
+            self.ppo_trainer.state.loss['kl/ift_kl'], # pyright: ignore
+            torch.tensor(0.0),
+            atol=5e-5,
+        )
+
+    def update_inference_model(self, vllm_engines: list[Any]):
+        start_time = time.time()
+        print('Before broadcast to vLLM')
+        broadcast_to_vllm(
+            self.ppo_callback.actor_critic,
+            vllm_engines,
+            self.model_update_group,
+            device=torch.device('cuda'),
+            loss_type=self.ppo_callback.actor_critic.loss_type,  # type: ignore
+        )
+        print('Finished broadcasting to vLLM')
+        print(f'Took: {time.time() - start_time} to broadcast to vllm.')
+        dist.barrier()
+
+    def query_inference_engines(self, vllm_engines: list[Any]):
+        """Round trip to inference engines.
+        
+        Args:
+            vllm_engines (list[Any]): The vllm engines to round trip to.
+        """
+        batch = self.ppo_trainer.state.device.batch_to_device(self.ppo_callback._get_next_iter_prompts())
+        max_gen_len = self.train_config['variables']['max_gen_len']
+        generation_kwargs = self.train_config['variables']['generation_kwargs']
+        with get_precision_context(self.precision), torch.no_grad():
+            # If vllm engines are available, we use them to generate sequences in one go
+            sequences = vllm_generate(
+                vllm_engines=vllm_engines,
+                batch=batch,
+                max_gen_len=max_gen_len,
+                generation_kwargs=generation_kwargs,
+                tokenizer=self.tokenizer,  # type: ignore
+                vllm_generate_function='generate',
+            )
+        # Add the prepared sequences to the batch again
+        batch['sequences'] = sequences
+        self.ppo_callback.batch_rollouts = batch
+
+
+def setup_process_groups(master_actor: Any, vllm_engines: list[Any], vllm_tensor_parallel_size: int):
+    """Initialize process groups for vLLM engines and master actor."""
+    # Get a new port for the weight-update process group
+    master_addr, _ = ray.get(master_actor.get_master_address.remote())
+    new_port = ray.get(master_actor.get_free_port.remote())
+    print(f'new_port: {new_port}')
+    
+    world_size = dist.get_world_size()
+    
+    # Initialize process groups for vLLM engines
+    refs = [
+        engine.init_process_group.remote(
+            master_addr,
+            new_port,
+            i * vllm_tensor_parallel_size + 1,
+            world_size // 2 + 1,
+            'weight-update',
+            backend='nccl',
+        ) for i, engine in enumerate(vllm_engines)
+    ]
+    
+    # Add master actor to the process group
+    refs.append(master_actor.add_process_group.remote(
+        backend='nccl',
+        master_addr=master_addr,
+        master_port=new_port,
+        world_size=world_size // 2 + 1,
+        rank=0,
+        group_name='weight-update',
+    ))
+    
+    # Wait for all process groups to be initialized
+    print(ray.get(refs))
+
+
+class SPMDActorGroup:
+    def __init__(self, num_train_actors: int):
+        self.num_train_actors = num_train_actors
+
+        self._train_actors = []
+        """Create and initialize all training actors."""
+        print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
+        
+        # Create master actor first
+        self._master_actor = DistributedGPUActor.remote(0, self.num_train_actors)
+        self._train_actors.append(self._master_actor)
+        
+        # Get master address from rank 0 actor
+        master_addr, master_port = ray.get(self._master_actor.get_master_address.remote())
+        print(f"Master address allocated: {master_addr}:{master_port}")
+        
+        # Create remaining actors with the master address/port
+        for i in range(1, self.num_train_actors):
+            actor = DistributedGPUActor.remote(i, self.num_train_actors, master_addr, master_port)
+            self._train_actors.append(actor)
+
+    @property
+    def train_actors(self):
+        return self._train_actors
+
+    @property
+    def master_actor(self):
+        return self._master_actor
+
+class TrainActorGroup(SPMDActorGroup):
+
+    def build_models(self, pretrain_model_name: str):
+        """Build reference models and PPO trainers for all actors."""
+        build_train_config_tasks = [actor.build_train_config.remote(pretrain_model_name) for actor in self._train_actors]
+        ray.get(build_train_config_tasks)
+
+        init_task = [actor.init_composer_dist.remote() for actor in self._train_actors]
+        ray.get(init_task)
+
+        # Build reference models
+        build_ref_model_tasks = [actor.build_ref_model.remote() for actor in self._train_actors]
+        ray.get(build_ref_model_tasks)
+        print('build ref model done')
+
+        # Build PPO trainers
+        build_ppo_trainer_tasks = [actor.build_ppo_trainer.remote() for actor in self._train_actors]
+        ray.get(build_ppo_trainer_tasks)
+        print('build ppo trainer done')
+    
+    def update_inference_model(self, vllm_engines: list[Any]):
+        refs = [actor.update_inference_model.remote(vllm_engines) for actor in self._train_actors]
+        ray.get(refs)
+        print('update inference model done')
+    
+    def query_inference_engines(self, vllm_engines: list[Any]):
+        refs = [actor.query_inference_engines.remote(vllm_engines) for actor in self._train_actors]
+        ray.get(refs)
+        print('query inference engines done')
+
+    def train_iteration(self):
+        """Run one training iteration on all actors."""
+        refs = [actor.train_1_iter.remote() for actor in self._train_actors]
+        ray.get(refs)
+        print('train 1 iter done')
+
+
+class RolloutAgent:
+
+    def __init__(self, vllm_engines: list, vllm_tensor_parallel_size: int):
+        self.vllm_engines = vllm_engines
+        self.vllm_tensor_parallel_size = vllm_tensor_parallel_size
+    
+    @property
+    def num_vllm_engines(self):
+        return len(self.vllm_engines)
+
+    def generate(self, prompts: list[str]):
+        ref = self.vllm_engines[0].generate.remote(prompts)
+        gen_results = ray.get(ref)
+        for output in gen_results:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+class PPOController:
+
+    def __init__(self, train_actor: TrainActorGroup, inference_client: RolloutAgent, pretrain_model_name: str):
+        self.train_actor = train_actor
+        self.inference_client = inference_client
+
+        self.train_actor.build_models(pretrain_model_name)
+        setup_process_groups(self.train_actor.master_actor, self.inference_client.vllm_engines, self.inference_client.vllm_tensor_parallel_size)
+
+    
+    def train(self):
+        self.train_actor.update_inference_model(self.inference_client.vllm_engines)
+        self.train_actor.query_inference_engines(self.inference_client.vllm_engines)
+        self.train_actor.train_iteration()
+
+
+def _run_single_controller_ppo(
+    pretrain_model_path: str,
+    world_size: int = 0,
+):
+    """Shared function for running single controller PPO with Ray actors and vLLM engines.
+    
+    Args:
+        pretrain_model_path: Path to the pretrained model (either local path or model name)
+        world_size: Number of distributed processes
+        prompts: List of prompts to test generation with
+    """
+    # Set vLLM attention backend to FLASH_ATTN otherwise FlashInfer backend takes too long to jit compile
+    os.environ['VLLM_ATTENTION_BACKEND'] = 'FLASH_ATTN'
+    
+    prompts = [
+        "what is RAY?",
+        "what is vLLM?",
+    ]
+
+    with start_ray_server() as _address:
+        if dist.get_rank() == 0:
+            # only rank 0 is the master controller
+            
+            # create SPMD training actors of the system
+            if world_size == 0:
+                world_size = dist.get_world_size()
+            num_train_actors = world_size // 2
+            train_actor = TrainActorGroup(num_train_actors)
+
+            # Create vLLM engines (or inference actors)
+            vllm_tensor_parallel_size = world_size - num_train_actors
+            num_vllm_engines = (
+                world_size - num_train_actors
+            ) // vllm_tensor_parallel_size
+            # TODO: Encapsulate this into a inference server manager class
+            vllm_engines = create_vllm_engines(
+                        num_engines=num_vllm_engines,
+                        tensor_parallel_size=vllm_tensor_parallel_size,
+                        enforce_eager=True,
+                        pretrain=pretrain_model_path,
+                        revision=None,
+                        seed=1,
+                        enable_prefix_caching=False,
+                        max_model_len=512,
+                        device_bundle={
+                            'GPU': 1,
+                            'CPU': 1,
+                            'worker_node': 0,
+                        },
+                    )
+            inference_client = RolloutAgent(vllm_engines, vllm_tensor_parallel_size)
+
+            ppo_controller = PPOController(train_actor, inference_client, pretrain_model_path)
+            ppo_controller.train()
+
+            inference_client.generate(prompts)
+
+
+@pytest.mark.gpu
+@world_size(4)
+def test_single_controller_ppo(
+    world_size: int,
+    tiny_llama_model: PreTrainedModel,
+    tiny_gpt2_tokenizer: PreTrainedTokenizerBase,
+    tmp_path: pathlib.Path,
+):
+    """Test single controller PPO with Ray actors and vLLM engines."""
+    
+    # Save the model and tokenizer to a temporary directory
+    local_save_path = str(tmp_path / 'llama_model')
+    tiny_llama_model.save_pretrained(local_save_path)
+    tiny_gpt2_tokenizer.save_pretrained(local_save_path)
+
+    _run_single_controller_ppo(
+        pretrain_model_path=local_save_path,
+        world_size=world_size,
+    )
+
+
+if __name__ == '__main__':
+    # This is an example of how to move the controller logic from PPO Callback to a separate trainer actor above and this main single controller function,    
+    _run_single_controller_ppo(
+        pretrain_model_path='meta-llama/Llama-3.2-1B-Instruct',
+    )

From 5a69a1552435dd632995e95af7145be774f8e4e2 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Fri, 25 Jul 2025 23:58:04 +0000
Subject: [PATCH 088/107] rm file

---
 test_single_controller_ppo.py | 1 -
 1 file changed, 1 deletion(-)
 delete mode 120000 test_single_controller_ppo.py

diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py
deleted file mode 120000
index 77eee340..00000000
--- a/test_single_controller_ppo.py
+++ /dev/null
@@ -1 +0,0 @@
-tests/test_single_controller_ppo.py
\ No newline at end of file

From 5e2ebc26f0867ee83ebc6e0dc7eefaf552eb6920 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Sat, 26 Jul 2025 00:00:12 +0000
Subject: [PATCH 089/107] format

---
 compose_rl/algorithms/online/__init__.py      |   3 +-
 compose_rl/algorithms/online/callback.py      |  10 +-
 compose_rl/algorithms/online/model.py         |   4 +-
 compose_rl/algorithms/online/model_methods.py |   2 +-
 .../online/single_controller_callback.py      |   6 +-
 tests/common/__init__.py                      |   2 +-
 tests/common/actor.py                         |   3 +
 tests/test_single_controller.py               |   2 +-
 tests/test_single_controller_ppo.py           | 227 ++++++++++++------
 9 files changed, 166 insertions(+), 93 deletions(-)

diff --git a/compose_rl/algorithms/online/__init__.py b/compose_rl/algorithms/online/__init__.py
index 92ab35d3..3857e92f 100644
--- a/compose_rl/algorithms/online/__init__.py
+++ b/compose_rl/algorithms/online/__init__.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from compose_rl.algorithms.online.callback import OnPolicyCallback
-from compose_rl.algorithms.online.single_controller_callback import SingleControllerOnPolicyCallback
 from compose_rl.algorithms.online.kl_controller import (
     AdaptiveKLController,
     BallKLController,
@@ -20,6 +19,8 @@
     HFPolicyConfig,
     MPTPolicyConfig,
 )
+from compose_rl.algorithms.online.single_controller_callback import \
+    SingleControllerOnPolicyCallback
 from compose_rl.registry import kl_controllers
 
 kl_controllers.register('adaptive', func=AdaptiveKLController)
diff --git a/compose_rl/algorithms/online/callback.py b/compose_rl/algorithms/online/callback.py
index 81deb765..41d96d59 100644
--- a/compose_rl/algorithms/online/callback.py
+++ b/compose_rl/algorithms/online/callback.py
@@ -800,7 +800,7 @@ def _interact_with_env(self, batch: dict[str, torch.Tensor]):
 
     def _get_reward(self, batch: dict[str, torch.Tensor]):
         """Compute rewards for a batch of generated sequences.
-        
+
         Args:
             batch (dict): The batch containing generated sequences to compute rewards for.
         """
@@ -833,7 +833,9 @@ def _get_reward(self, batch: dict[str, torch.Tensor]):
                 del resolved_outputs[key]
 
         # We need to split the resolved outputs into minibatches
-        for idx in range(batch['prompt_id'].shape[0] // self.device_train_batch_size):
+        for idx in range(
+            batch['prompt_id'].shape[0] // self.device_train_batch_size,
+        ):
             minibatch = self._extract_minibatch(
                 resolved_outputs,
                 idx,
@@ -842,7 +844,9 @@ def _get_reward(self, batch: dict[str, torch.Tensor]):
             self.buffer.add(minibatch)
 
         # Making sure we correctly parsed the minibatches
-        assert len(self.buffer) == self.num_batches_per_update, f'{len(self.buffer)} != {self.num_batches_per_update}'
+        assert len(
+            self.buffer,
+        ) == self.num_batches_per_update, f'{len(self.buffer)} != {self.num_batches_per_update}'
 
         self.actor_critic.train()
 
diff --git a/compose_rl/algorithms/online/model.py b/compose_rl/algorithms/online/model.py
index fdda1363..842fc36d 100644
--- a/compose_rl/algorithms/online/model.py
+++ b/compose_rl/algorithms/online/model.py
@@ -104,7 +104,7 @@ def loss(self, outputs: MutableMapping, batch: MutableMapping):
         additional_kwargs = {}
         if hasattr(self.config, 'beta'):
             additional_kwargs['beta'] = self.config.beta
-        
+
         return_dict = online_rl_loss(
             outputs=outputs,
             batch=batch,
@@ -226,7 +226,7 @@ def loss(self, outputs: MutableMapping, batch: MutableMapping):
         additional_kwargs = {}
         if hasattr(self.config, 'beta'):
             additional_kwargs['beta'] = self.config.beta
-        
+
         return_dict = online_rl_loss(
             outputs=outputs,
             batch=batch,
diff --git a/compose_rl/algorithms/online/model_methods.py b/compose_rl/algorithms/online/model_methods.py
index acd00f4e..57615322 100644
--- a/compose_rl/algorithms/online/model_methods.py
+++ b/compose_rl/algorithms/online/model_methods.py
@@ -247,7 +247,7 @@ def policy_loss(
             logits=gen_logits,
         )
         assert token_entropies.shape == batch['action_mask'].shape, (
-            f'Token entropies shape {token_entropies.shape} does not match action mask shape {batch["action_mask"].shape}.',
+            f'Token entropies shape {token_entropies.shape} does not match action mask shape {batch['action_mask'].shape}.',
         )
         seq_entropies = utils.get_sequence_entropies(
             token_entropies=token_entropies,
diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index a2267aaf..33db74a2 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -13,14 +13,13 @@
 from composer.trainer.trainer import _get_initial_device_train_microbatch_size
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
+# Import the base class
+from compose_rl.algorithms.online.callback import OnPolicyCallback
 from compose_rl.algorithms.online.model import (
     ComposerHFPolicyLM,
     ComposerMPTPolicyLM,
 )
 
-# Import the base class
-from compose_rl.algorithms.online.callback import OnPolicyCallback
-
 Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
 Policy = Union[ComposerHFPolicyLM, ComposerMPTPolicyLM]
 
@@ -55,4 +54,3 @@ def iteration_start(self, state: State, logger: Logger):
 
         # Update IFT KL
         self._update_ift_kl()
-
diff --git a/tests/common/__init__.py b/tests/common/__init__.py
index 79b2fba9..9d71832c 100644
--- a/tests/common/__init__.py
+++ b/tests/common/__init__.py
@@ -1,6 +1,7 @@
 # Copyright 2024 MosaicML ComposeRL authors
 # SPDX-License-Identifier: Apache-2.0
 
+from tests.common.actor import BaseDistributedGPUActor
 from tests.common.datasets import (
     FineGrainedPreference,
     PairwisePreference,
@@ -9,7 +10,6 @@
     VerifiablePromptDataset,
 )
 from tests.common.markers import device, world_size
-from tests.common.actor import BaseDistributedGPUActor
 
 __all__ = [
     'BaseDistributedGPUActor',
diff --git a/tests/common/actor.py b/tests/common/actor.py
index 1632a8a7..0190f05a 100644
--- a/tests/common/actor.py
+++ b/tests/common/actor.py
@@ -1,3 +1,6 @@
+# Copyright 2024 MosaicML ComposeRL authors
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from datetime import timedelta
 from typing import Optional
diff --git a/tests/test_single_controller.py b/tests/test_single_controller.py
index c74cc9e5..1df09c74 100644
--- a/tests/test_single_controller.py
+++ b/tests/test_single_controller.py
@@ -19,7 +19,7 @@
     create_vllm_engines,
 )
 from compose_rl.utils.ray_utils import start_ray_server
-from tests.common import world_size, BaseDistributedGPUActor
+from tests.common import BaseDistributedGPUActor, world_size
 
 # Set up logging
 logger = logging.getLogger(__name__)
diff --git a/tests/test_single_controller_ppo.py b/tests/test_single_controller_ppo.py
index 078e7f1b..a2e0ef9e 100644
--- a/tests/test_single_controller_ppo.py
+++ b/tests/test_single_controller_ppo.py
@@ -13,6 +13,11 @@
 import ray
 import torch
 import torch.distributed as dist
+from composer import Trainer
+from composer.core import get_precision_context
+from composer.optim import DecoupledAdamW
+from composer.utils import dist as composer_dist
+from llmfoundry.models import ComposerHFCausalLM
 from torch.utils.data import DataLoader
 from transformers import (
     AutoTokenizer,
@@ -20,12 +25,6 @@
     PreTrainedTokenizerBase,
 )
 
-from composer import Trainer
-from composer.optim import DecoupledAdamW
-from composer.utils import dist as composer_dist
-from composer.core import get_precision_context
-from llmfoundry.models import ComposerHFCausalLM
-
 from compose_rl.algorithms.online import (
     ComposerHFPolicyLM,
     SingleControllerOnPolicyCallback,
@@ -37,7 +36,6 @@
 )
 from compose_rl.data import prompt_dataset_collate_fn
 from compose_rl.utils.ray_utils import start_ray_server
-
 from tests.common import (
     BaseDistributedGPUActor,
     VerifiablePromptDataset,
@@ -49,11 +47,13 @@
 class DistributedGPUActor(BaseDistributedGPUActor):
     """Distributed GPU actor for testing. Moved part of controller logic from PPO Callback to here."""
 
-    def __init__(self,
+    def __init__(
+        self,
         rank: int,
         world_size: int,
         master_addr: Optional[str] = None,
-        master_port: Optional[int] = None,):
+        master_port: Optional[int] = None,
+    ):
         super().__init__(rank, world_size, master_addr, master_port)
         self.model = None
         self.model_update_group = None
@@ -114,15 +114,27 @@ def build_train_config(self, pretrain_model_name: str):
             },
         }
         self.train_config = {
-            'model': {**self.model_config, 'kl_estimator': 'k1', 'kl_clip_range': 40.0},
-            'fsdp_config': self.fsdp_config,
-            'seed': 17,
-            'precision': self.precision,
-            'variables': variables,
-            'max_seq_len': self.max_seq_len,
-            'global_train_batch_size': self.device_train_batch_size * self.world_size,
-            'device_train_batch_size': self.device_train_batch_size,
-            'device_train_microbatch_size': self.device_train_batch_size,
+            'model': {
+                **self.model_config,
+                'kl_estimator': 'k1',
+                'kl_clip_range': 40.0,
+            },
+            'fsdp_config':
+                self.fsdp_config,
+            'seed':
+                17,
+            'precision':
+                self.precision,
+            'variables':
+                variables,
+            'max_seq_len':
+                self.max_seq_len,
+            'global_train_batch_size':
+                self.device_train_batch_size * self.world_size,
+            'device_train_batch_size':
+                self.device_train_batch_size,
+            'device_train_microbatch_size':
+                self.device_train_batch_size,
         }
 
     def build_dataloader(self):
@@ -176,7 +188,7 @@ def model_config(self):
 
     @property
     def fsdp_config(self):
-        return dict()
+        return {}
 
     def init_composer_dist(self):
         composer_dist.initialize_dist('gpu')
@@ -184,7 +196,7 @@ def init_composer_dist(self):
     def build_ref_model(self):
         # train a reference model for the PPO training
         # The key observation here is that we should construct our high level model training logic in the actor instead of the callback
-        # e.g., we can build ref/reward/policy/value model and create/colocate multiple trainers all in this class 
+        # e.g., we can build ref/reward/policy/value model and create/colocate multiple trainers all in this class
         tmp_ref_path = str('./ref_checkpoints')
         ref_path = os.path.join(tmp_ref_path, 'latest-rank0.pt')
         if os.path.exists(ref_path):
@@ -226,7 +238,9 @@ def build_ppo_trainer(self):
 
         # ideally we should pull the rest of the training logic from the callback to this class as well,
         # e.g, how to interact with env, calculate rewards etc
-        self.ppo_callback = SingleControllerOnPolicyCallback(train_config=self.train_config)
+        self.ppo_callback = SingleControllerOnPolicyCallback(
+            train_config=self.train_config,
+        )
         self.ppo_trainer = Trainer(
             model=model,
             optimizers=optimizer,
@@ -267,11 +281,13 @@ def update_inference_model(self, vllm_engines: list[Any]):
 
     def query_inference_engines(self, vllm_engines: list[Any]):
         """Round trip to inference engines.
-        
+
         Args:
             vllm_engines (list[Any]): The vllm engines to round trip to.
         """
-        batch = self.ppo_trainer.state.device.batch_to_device(self.ppo_callback._get_next_iter_prompts())
+        batch = self.ppo_trainer.state.device.batch_to_device(
+            self.ppo_callback._get_next_iter_prompts(),
+        )
         max_gen_len = self.train_config['variables']['max_gen_len']
         generation_kwargs = self.train_config['variables']['generation_kwargs']
         with get_precision_context(self.precision), torch.no_grad():
@@ -289,15 +305,19 @@ def query_inference_engines(self, vllm_engines: list[Any]):
         self.ppo_callback.batch_rollouts = batch
 
 
-def setup_process_groups(master_actor: Any, vllm_engines: list[Any], vllm_tensor_parallel_size: int):
+def setup_process_groups(
+    master_actor: Any,
+    vllm_engines: list[Any],
+    vllm_tensor_parallel_size: int,
+):
     """Initialize process groups for vLLM engines and master actor."""
     # Get a new port for the weight-update process group
     master_addr, _ = ray.get(master_actor.get_master_address.remote())
     new_port = ray.get(master_actor.get_free_port.remote())
     print(f'new_port: {new_port}')
-    
+
     world_size = dist.get_world_size()
-    
+
     # Initialize process groups for vLLM engines
     refs = [
         engine.init_process_group.remote(
@@ -309,40 +329,53 @@ def setup_process_groups(master_actor: Any, vllm_engines: list[Any], vllm_tensor
             backend='nccl',
         ) for i, engine in enumerate(vllm_engines)
     ]
-    
+
     # Add master actor to the process group
-    refs.append(master_actor.add_process_group.remote(
-        backend='nccl',
-        master_addr=master_addr,
-        master_port=new_port,
-        world_size=world_size // 2 + 1,
-        rank=0,
-        group_name='weight-update',
-    ))
-    
+    refs.append(
+        master_actor.add_process_group.remote(
+            backend='nccl',
+            master_addr=master_addr,
+            master_port=new_port,
+            world_size=world_size // 2 + 1,
+            rank=0,
+            group_name='weight-update',
+        ),
+    )
+
     # Wait for all process groups to be initialized
     print(ray.get(refs))
 
 
 class SPMDActorGroup:
+
     def __init__(self, num_train_actors: int):
         self.num_train_actors = num_train_actors
 
         self._train_actors = []
         """Create and initialize all training actors."""
         print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
-        
+
         # Create master actor first
-        self._master_actor = DistributedGPUActor.remote(0, self.num_train_actors)
+        self._master_actor = DistributedGPUActor.remote(
+            0,
+            self.num_train_actors,
+        )
         self._train_actors.append(self._master_actor)
-        
+
         # Get master address from rank 0 actor
-        master_addr, master_port = ray.get(self._master_actor.get_master_address.remote())
+        master_addr, master_port = ray.get(
+            self._master_actor.get_master_address.remote(),
+        )
         print(f"Master address allocated: {master_addr}:{master_port}")
-        
+
         # Create remaining actors with the master address/port
         for i in range(1, self.num_train_actors):
-            actor = DistributedGPUActor.remote(i, self.num_train_actors, master_addr, master_port)
+            actor = DistributedGPUActor.remote(
+                i,
+                self.num_train_actors,
+                master_addr,
+                master_port,
+            )
             self._train_actors.append(actor)
 
     @property
@@ -353,33 +386,49 @@ def train_actors(self):
     def master_actor(self):
         return self._master_actor
 
+
 class TrainActorGroup(SPMDActorGroup):
 
     def build_models(self, pretrain_model_name: str):
         """Build reference models and PPO trainers for all actors."""
-        build_train_config_tasks = [actor.build_train_config.remote(pretrain_model_name) for actor in self._train_actors]
+        build_train_config_tasks = [
+            actor.build_train_config.remote(pretrain_model_name)
+            for actor in self._train_actors
+        ]
         ray.get(build_train_config_tasks)
 
-        init_task = [actor.init_composer_dist.remote() for actor in self._train_actors]
+        init_task = [
+            actor.init_composer_dist.remote() for actor in self._train_actors
+        ]
         ray.get(init_task)
 
         # Build reference models
-        build_ref_model_tasks = [actor.build_ref_model.remote() for actor in self._train_actors]
+        build_ref_model_tasks = [
+            actor.build_ref_model.remote() for actor in self._train_actors
+        ]
         ray.get(build_ref_model_tasks)
         print('build ref model done')
 
         # Build PPO trainers
-        build_ppo_trainer_tasks = [actor.build_ppo_trainer.remote() for actor in self._train_actors]
+        build_ppo_trainer_tasks = [
+            actor.build_ppo_trainer.remote() for actor in self._train_actors
+        ]
         ray.get(build_ppo_trainer_tasks)
         print('build ppo trainer done')
-    
+
     def update_inference_model(self, vllm_engines: list[Any]):
-        refs = [actor.update_inference_model.remote(vllm_engines) for actor in self._train_actors]
+        refs = [
+            actor.update_inference_model.remote(vllm_engines)
+            for actor in self._train_actors
+        ]
         ray.get(refs)
         print('update inference model done')
-    
+
     def query_inference_engines(self, vllm_engines: list[Any]):
-        refs = [actor.query_inference_engines.remote(vllm_engines) for actor in self._train_actors]
+        refs = [
+            actor.query_inference_engines.remote(vllm_engines)
+            for actor in self._train_actors
+        ]
         ray.get(refs)
         print('query inference engines done')
 
@@ -395,7 +444,7 @@ class RolloutAgent:
     def __init__(self, vllm_engines: list, vllm_tensor_parallel_size: int):
         self.vllm_engines = vllm_engines
         self.vllm_tensor_parallel_size = vllm_tensor_parallel_size
-    
+
     @property
     def num_vllm_engines(self):
         return len(self.vllm_engines)
@@ -411,17 +460,29 @@ def generate(self, prompts: list[str]):
 
 class PPOController:
 
-    def __init__(self, train_actor: TrainActorGroup, inference_client: RolloutAgent, pretrain_model_name: str):
+    def __init__(
+        self,
+        train_actor: TrainActorGroup,
+        inference_client: RolloutAgent,
+        pretrain_model_name: str,
+    ):
         self.train_actor = train_actor
         self.inference_client = inference_client
 
         self.train_actor.build_models(pretrain_model_name)
-        setup_process_groups(self.train_actor.master_actor, self.inference_client.vllm_engines, self.inference_client.vllm_tensor_parallel_size)
+        setup_process_groups(
+            self.train_actor.master_actor,
+            self.inference_client.vllm_engines,
+            self.inference_client.vllm_tensor_parallel_size,
+        )
 
-    
     def train(self):
-        self.train_actor.update_inference_model(self.inference_client.vllm_engines)
-        self.train_actor.query_inference_engines(self.inference_client.vllm_engines)
+        self.train_actor.update_inference_model(
+            self.inference_client.vllm_engines,
+        )
+        self.train_actor.query_inference_engines(
+            self.inference_client.vllm_engines,
+        )
         self.train_actor.train_iteration()
 
 
@@ -430,7 +491,7 @@ def _run_single_controller_ppo(
     world_size: int = 0,
 ):
     """Shared function for running single controller PPO with Ray actors and vLLM engines.
-    
+
     Args:
         pretrain_model_path: Path to the pretrained model (either local path or model name)
         world_size: Number of distributed processes
@@ -438,16 +499,16 @@ def _run_single_controller_ppo(
     """
     # Set vLLM attention backend to FLASH_ATTN otherwise FlashInfer backend takes too long to jit compile
     os.environ['VLLM_ATTENTION_BACKEND'] = 'FLASH_ATTN'
-    
+
     prompts = [
-        "what is RAY?",
-        "what is vLLM?",
+        'what is RAY?',
+        'what is vLLM?',
     ]
 
     with start_ray_server() as _address:
         if dist.get_rank() == 0:
             # only rank 0 is the master controller
-            
+
             # create SPMD training actors of the system
             if world_size == 0:
                 world_size = dist.get_world_size()
@@ -461,23 +522,30 @@ def _run_single_controller_ppo(
             ) // vllm_tensor_parallel_size
             # TODO: Encapsulate this into a inference server manager class
             vllm_engines = create_vllm_engines(
-                        num_engines=num_vllm_engines,
-                        tensor_parallel_size=vllm_tensor_parallel_size,
-                        enforce_eager=True,
-                        pretrain=pretrain_model_path,
-                        revision=None,
-                        seed=1,
-                        enable_prefix_caching=False,
-                        max_model_len=512,
-                        device_bundle={
-                            'GPU': 1,
-                            'CPU': 1,
-                            'worker_node': 0,
-                        },
-                    )
-            inference_client = RolloutAgent(vllm_engines, vllm_tensor_parallel_size)
-
-            ppo_controller = PPOController(train_actor, inference_client, pretrain_model_path)
+                num_engines=num_vllm_engines,
+                tensor_parallel_size=vllm_tensor_parallel_size,
+                enforce_eager=True,
+                pretrain=pretrain_model_path,
+                revision=None,
+                seed=1,
+                enable_prefix_caching=False,
+                max_model_len=512,
+                device_bundle={
+                    'GPU': 1,
+                    'CPU': 1,
+                    'worker_node': 0,
+                },
+            )
+            inference_client = RolloutAgent(
+                vllm_engines,
+                vllm_tensor_parallel_size,
+            )
+
+            ppo_controller = PPOController(
+                train_actor,
+                inference_client,
+                pretrain_model_path,
+            )
             ppo_controller.train()
 
             inference_client.generate(prompts)
@@ -492,7 +560,6 @@ def test_single_controller_ppo(
     tmp_path: pathlib.Path,
 ):
     """Test single controller PPO with Ray actors and vLLM engines."""
-    
     # Save the model and tokenizer to a temporary directory
     local_save_path = str(tmp_path / 'llama_model')
     tiny_llama_model.save_pretrained(local_save_path)
@@ -505,7 +572,7 @@ def test_single_controller_ppo(
 
 
 if __name__ == '__main__':
-    # This is an example of how to move the controller logic from PPO Callback to a separate trainer actor above and this main single controller function,    
+    # This is an example of how to move the controller logic from PPO Callback to a separate trainer actor above and this main single controller function,
     _run_single_controller_ppo(
         pretrain_model_path='meta-llama/Llama-3.2-1B-Instruct',
     )

From 470bef9f31c00602f9b9c3950d0e94231d69f56d Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Sat, 26 Jul 2025 00:21:48 +0000
Subject: [PATCH 090/107] format

---
 compose_rl/algorithms/online/model_methods.py |  2 +-
 tests/common/actor.py                         |  2 +-
 tests/test_single_controller_ppo.py           | 10 +++++-----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/compose_rl/algorithms/online/model_methods.py b/compose_rl/algorithms/online/model_methods.py
index 57615322..acd00f4e 100644
--- a/compose_rl/algorithms/online/model_methods.py
+++ b/compose_rl/algorithms/online/model_methods.py
@@ -247,7 +247,7 @@ def policy_loss(
             logits=gen_logits,
         )
         assert token_entropies.shape == batch['action_mask'].shape, (
-            f'Token entropies shape {token_entropies.shape} does not match action mask shape {batch['action_mask'].shape}.',
+            f'Token entropies shape {token_entropies.shape} does not match action mask shape {batch["action_mask"].shape}.',
         )
         seq_entropies = utils.get_sequence_entropies(
             token_entropies=token_entropies,
diff --git a/tests/common/actor.py b/tests/common/actor.py
index 0190f05a..156347de 100644
--- a/tests/common/actor.py
+++ b/tests/common/actor.py
@@ -87,7 +87,7 @@ def add_process_group(
         rank: int,
         group_name: str,
     ):
-        """Initialize the process group on trainer rank 0 and e.g., vllm engines."""
+        """Initialize the process group on trainer rank 0 and vllm engines."""
         # NOTE vLLM seems to have a safer implementation of init_process_group:
         # https://github.com/vllm-project/vllm/blob/v0.9.1/examples/offline_inference/rlhf.py#L105
         # we should look into using that instead
diff --git a/tests/test_single_controller_ppo.py b/tests/test_single_controller_ppo.py
index a2e0ef9e..aadd4919 100644
--- a/tests/test_single_controller_ppo.py
+++ b/tests/test_single_controller_ppo.py
@@ -353,7 +353,7 @@ def __init__(self, num_train_actors: int):
 
         self._train_actors = []
         """Create and initialize all training actors."""
-        print(f"\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===")
+        print(f'\n=== STARTING DISTRIBUTED TRAINING WITH RAY ACTORS ===')
 
         # Create master actor first
         self._master_actor = DistributedGPUActor.remote(
@@ -366,7 +366,7 @@ def __init__(self, num_train_actors: int):
         master_addr, master_port = ray.get(
             self._master_actor.get_master_address.remote(),
         )
-        print(f"Master address allocated: {master_addr}:{master_port}")
+        print(f'Master address allocated: {master_addr}:{master_port}')
 
         # Create remaining actors with the master address/port
         for i in range(1, self.num_train_actors):
@@ -455,7 +455,7 @@ def generate(self, prompts: list[str]):
         for output in gen_results:
             prompt = output.prompt
             generated_text = output.outputs[0].text
-            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+            print(f'Prompt: {prompt!r}, Generated text: {generated_text!r}')
 
 
 class PPOController:
@@ -490,10 +490,10 @@ def _run_single_controller_ppo(
     pretrain_model_path: str,
     world_size: int = 0,
 ):
-    """Shared function for running single controller PPO with Ray actors and vLLM engines.
+    """Shared function for running single controller PPO.
 
     Args:
-        pretrain_model_path: Path to the pretrained model (either local path or model name)
+        pretrain_model_path: Path to the pretrained model
         world_size: Number of distributed processes
         prompts: List of prompts to test generation with
     """

From 44b66098606f5f820dec1f28561c9a3ff5efc4e3 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 28 Jul 2025 07:08:35 +0000
Subject: [PATCH 091/107] type ignore

---
 .../online/single_controller_callback.py       |  2 +-
 tests/test_single_controller_ppo.py            | 18 +++++++++---------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index 33db74a2..a19fdf85 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -37,7 +37,7 @@ class SingleControllerOnPolicyCallback(OnPolicyCallback):
     def iteration_start(self, state: State, logger: Logger):
         del logger  # unused
 
-        self._get_reward(self.batch_rollouts)
+        self._get_reward(self.batch_rollouts)  # type: ignore
 
         # Reset and initialize state train dataloader
         log.warning(
diff --git a/tests/test_single_controller_ppo.py b/tests/test_single_controller_ppo.py
index aadd4919..6de2d88f 100644
--- a/tests/test_single_controller_ppo.py
+++ b/tests/test_single_controller_ppo.py
@@ -61,14 +61,14 @@ def __init__(
         self._dataloader = None
         self._tokenizer = None
         self.ppo_callback = None
-        self.ppo_trainer: Trainer = None
+        self.ppo_trainer: Trainer = None  # type: ignore
 
         self.pretrain_model_name = None
         self.device_train_batch_size = None
         self.num_batches_per_update = None
         self.max_seq_len = None
         self.precision = None
-        self.train_config: dict = None
+        self.train_config: dict = None  # type: ignore
 
     def build_train_config(self, pretrain_model_name: str):
         self.pretrain_model_name = pretrain_model_name
@@ -221,7 +221,7 @@ def build_ref_model(self):
             parallelism_config={'fsdp': self.fsdp_config},
             save_folder=tmp_ref_path,
             save_weights_only=True,
-            device_train_microbatch_size=self.device_train_microbatch_size,
+            device_train_microbatch_size=self.device_train_microbatch_size,  # type: ignore
         )
 
         temp_trainer.fit()
@@ -290,7 +290,7 @@ def query_inference_engines(self, vllm_engines: list[Any]):
         )
         max_gen_len = self.train_config['variables']['max_gen_len']
         generation_kwargs = self.train_config['variables']['generation_kwargs']
-        with get_precision_context(self.precision), torch.no_grad():
+        with get_precision_context(self.precision), torch.no_grad():  # type: ignore
             # If vllm engines are available, we use them to generate sequences in one go
             sequences = vllm_generate(
                 vllm_engines=vllm_engines,
@@ -302,7 +302,7 @@ def query_inference_engines(self, vllm_engines: list[Any]):
             )
         # Add the prepared sequences to the batch again
         batch['sequences'] = sequences
-        self.ppo_callback.batch_rollouts = batch
+        self.ppo_callback.batch_rollouts = batch  # type: ignore
 
 
 def setup_process_groups(
@@ -312,8 +312,8 @@ def setup_process_groups(
 ):
     """Initialize process groups for vLLM engines and master actor."""
     # Get a new port for the weight-update process group
-    master_addr, _ = ray.get(master_actor.get_master_address.remote())
-    new_port = ray.get(master_actor.get_free_port.remote())
+    master_addr, _ = ray.get(master_actor.get_master_address.remote())  # type: ignore
+    new_port = ray.get(master_actor.get_free_port.remote())  # type: ignore
     print(f'new_port: {new_port}')
 
     world_size = dist.get_world_size()
@@ -364,7 +364,7 @@ def __init__(self, num_train_actors: int):
 
         # Get master address from rank 0 actor
         master_addr, master_port = ray.get(
-            self._master_actor.get_master_address.remote(),
+            self._master_actor.get_master_address.remote(),  # type: ignore
         )
         print(f'Master address allocated: {master_addr}:{master_port}')
 
@@ -373,7 +373,7 @@ def __init__(self, num_train_actors: int):
             actor = DistributedGPUActor.remote(
                 i,
                 self.num_train_actors,
-                master_addr,
+                master_addr,  # type: ignore
                 master_port,
             )
             self._train_actors.append(actor)

From 06628107f9b7fe3c47a79070bd2bffbd3d7c1397 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 28 Jul 2025 08:02:44 +0000
Subject: [PATCH 092/107] todo

---
 tests/test_single_controller_ppo.py | 67 ++++++++++++++++++++---------
 1 file changed, 47 insertions(+), 20 deletions(-)

diff --git a/tests/test_single_controller_ppo.py b/tests/test_single_controller_ppo.py
index 6de2d88f..cc889936 100644
--- a/tests/test_single_controller_ppo.py
+++ b/tests/test_single_controller_ppo.py
@@ -45,7 +45,7 @@
 
 @ray.remote(num_gpus=1)
 class DistributedGPUActor(BaseDistributedGPUActor):
-    """Distributed GPU actor for testing. Moved part of controller logic from PPO Callback to here."""
+    """Distributed GPU actor for testing"""
 
     def __init__(
         self,
@@ -138,8 +138,8 @@ def build_train_config(self, pretrain_model_name: str):
         }
 
     def build_dataloader(self):
-        # dataloader should be built with inference agent instead with this trainer actor,
-        # it is still attached to trainer actor here to avoid a full refactor to PPO Callback code
+        # TODO (infra): build prompt dataloader with rollout agent instead of
+        # trainer actor
         max_seq_len = 32
         prompt_len = 10
 
@@ -166,6 +166,8 @@ def dataloader(self):
         return self._dataloader
 
     def build_tokenizer(self):
+        # TODO (algo): decide if we should use tokens or messages given
+        # we may need token level log prob
         tokenizer = AutoTokenizer.from_pretrained(self.pretrain_model_name)
         tokenizer.add_special_tokens({'pad_token': '[PAD]'})
         return tokenizer
@@ -188,15 +190,18 @@ def model_config(self):
 
     @property
     def fsdp_config(self):
+        # TODO (infra): use actual fsdp2 config
         return {}
 
     def init_composer_dist(self):
         composer_dist.initialize_dist('gpu')
 
     def build_ref_model(self):
-        # train a reference model for the PPO training
-        # The key observation here is that we should construct our high level model training logic in the actor instead of the callback
-        # e.g., we can build ref/reward/policy/value model and create/colocate multiple trainers all in this class
+        # pre-train a reference model for the PPO training
+        # The key observation here is that we should construct model
+        # training pipeline in the actor instead of the callback
+        # e.g., we can build ref/reward/policy/value model and create/colocate
+        # multiple trainers all in this class
         tmp_ref_path = str('./ref_checkpoints')
         ref_path = os.path.join(tmp_ref_path, 'latest-rank0.pt')
         if os.path.exists(ref_path):
@@ -221,12 +226,11 @@ def build_ref_model(self):
             parallelism_config={'fsdp': self.fsdp_config},
             save_folder=tmp_ref_path,
             save_weights_only=True,
-            device_train_microbatch_size=self.device_train_microbatch_size,  # type: ignore
+            device_train_microbatch_size=self.
+            device_train_microbatch_size,  # type: ignore
         )
 
         temp_trainer.fit()
-
-        # After making the reference model, we can proceed with the PPO training
         self.ref_path = ref_path
 
     def build_ppo_trainer(self):
@@ -236,8 +240,8 @@ def build_ppo_trainer(self):
 
         optimizer = DecoupledAdamW(model.parameters(), lr=1e-8)
 
-        # ideally we should pull the rest of the training logic from the callback to this class as well,
-        # e.g, how to interact with env, calculate rewards etc
+        # TODO (infra): pull the rest of the training logic from the callback
+        # to this class, e.g, how to interact with env, calculate rewards etc
         self.ppo_callback = SingleControllerOnPolicyCallback(
             train_config=self.train_config,
         )
@@ -254,11 +258,12 @@ def build_ppo_trainer(self):
         )
 
     def train_1_iter(self):
-        # we should implement the top level PPO algo here instead of the callback
-        # algorithmic researchers are expected to implement this function along with above policy/value/reward/ref trainers or models
+        # TODO (algo): implement the top level PPO algo here instead of the
+        # callback. Algorithmic researchers are expected to implement this
+        # function along with above policy/value/reward/ref trainers or models
         self.ppo_trainer.fit(duration='1iter')
-        # This is the KL assert that must be true if we are truly loading from the same model.
-        # This is only true on the first iteration
+        # This is the KL assert that must be true if we are truly loading
+        # from the same model. This is only true on the first iteration
         assert torch.allclose(
             self.ppo_trainer.state.loss['kl/ift_kl'], # pyright: ignore
             torch.tensor(0.0),
@@ -268,6 +273,9 @@ def train_1_iter(self):
     def update_inference_model(self, vllm_engines: list[Any]):
         start_time = time.time()
         print('Before broadcast to vLLM')
+        # TODO (infra) instead of directly broadcasting to vllm, we should
+        # push the model parameters to a parameter buffer manager and have
+        # the buffer manager initiate broadcast of parameters to vllm engines
         broadcast_to_vllm(
             self.ppo_callback.actor_critic,
             vllm_engines,
@@ -285,13 +293,20 @@ def query_inference_engines(self, vllm_engines: list[Any]):
         Args:
             vllm_engines (list[Any]): The vllm engines to round trip to.
         """
+        # TODO (infra): we should use the rollout agent to generate sequences
+        # instead of the trainer actor, e.g., reimplement _get_next_iter_prompts
+        # in the rollout agent
         batch = self.ppo_trainer.state.device.batch_to_device(
             self.ppo_callback._get_next_iter_prompts(),
         )
         max_gen_len = self.train_config['variables']['max_gen_len']
         generation_kwargs = self.train_config['variables']['generation_kwargs']
-        with get_precision_context(self.precision), torch.no_grad():  # type: ignore
-            # If vllm engines are available, we use them to generate sequences in one go
+        with get_precision_context(self.precision
+                                  ), torch.no_grad():  # type: ignore
+            # TODO (infra): refactor this code to isolate gather of
+            # prompts on the trainer actor and gather/scatter of sequences
+            # on the trainer actor, the first half is uesless while
+            # the second half should be managed through an experience manager
             sequences = vllm_generate(
                 vllm_engines=vllm_engines,
                 batch=batch,
@@ -312,7 +327,9 @@ def setup_process_groups(
 ):
     """Initialize process groups for vLLM engines and master actor."""
     # Get a new port for the weight-update process group
-    master_addr, _ = ray.get(master_actor.get_master_address.remote())  # type: ignore
+    master_addr, _ = ray.get(
+        master_actor.get_master_address.remote()
+    )  # type: ignore
     new_port = ray.get(master_actor.get_free_port.remote())  # type: ignore
     print(f'new_port: {new_port}')
 
@@ -347,6 +364,7 @@ def setup_process_groups(
 
 
 class SPMDActorGroup:
+    # TODO (infra): refactor this to a proper base class
 
     def __init__(self, num_train_actors: int):
         self.num_train_actors = num_train_actors
@@ -389,6 +407,9 @@ def master_actor(self):
 
 class TrainActorGroup(SPMDActorGroup):
 
+    # TODO: this class is mainly pass through gang scheduler,
+    # we should refactor this class to be more generic and reusable
+
     def build_models(self, pretrain_model_name: str):
         """Build reference models and PPO trainers for all actors."""
         build_train_config_tasks = [
@@ -450,6 +471,8 @@ def num_vllm_engines(self):
         return len(self.vllm_engines)
 
     def generate(self, prompts: list[str]):
+        # TODO (infra): try integrate this with the multi-turn rollout
+        # repo
         ref = self.vllm_engines[0].generate.remote(prompts)
         gen_results = ray.get(ref)
         for output in gen_results:
@@ -458,6 +481,7 @@ def generate(self, prompts: list[str]):
             print(f'Prompt: {prompt!r}, Generated text: {generated_text!r}')
 
 
+# TODO (infra): implement parameter buffer manager and experience manager
 class PPOController:
 
     def __init__(
@@ -497,7 +521,8 @@ def _run_single_controller_ppo(
         world_size: Number of distributed processes
         prompts: List of prompts to test generation with
     """
-    # Set vLLM attention backend to FLASH_ATTN otherwise FlashInfer backend takes too long to jit compile
+    # Set vLLM attention backend to FLASH_ATTN otherwise FlashInfer backend
+    # takes too long to jit compile
     os.environ['VLLM_ATTENTION_BACKEND'] = 'FLASH_ATTN'
 
     prompts = [
@@ -572,7 +597,9 @@ def test_single_controller_ppo(
 
 
 if __name__ == '__main__':
-    # This is an example of how to move the controller logic from PPO Callback to a separate trainer actor above and this main single controller function,
+    # This is an example of how to move the controller logic from PPO Callback
+    # to a separate trainer actor above and this main single controller
+    # function.
     _run_single_controller_ppo(
         pretrain_model_path='meta-llama/Llama-3.2-1B-Instruct',
     )

From ab4faadcd8b6061572e9421f36e7476e31f6be83 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 28 Jul 2025 18:02:50 +0000
Subject: [PATCH 093/107] different type fix

---
 tests/test_single_controller_ppo.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/test_single_controller_ppo.py b/tests/test_single_controller_ppo.py
index cc889936..cc4f116d 100644
--- a/tests/test_single_controller_ppo.py
+++ b/tests/test_single_controller_ppo.py
@@ -67,7 +67,7 @@ def __init__(
         self.device_train_batch_size = None
         self.num_batches_per_update = None
         self.max_seq_len = None
-        self.precision = None
+        self.precision: str = None # type: ignore
         self.train_config: dict = None  # type: ignore
 
     def build_train_config(self, pretrain_model_name: str):
@@ -301,8 +301,7 @@ def query_inference_engines(self, vllm_engines: list[Any]):
         )
         max_gen_len = self.train_config['variables']['max_gen_len']
         generation_kwargs = self.train_config['variables']['generation_kwargs']
-        with get_precision_context(self.precision
-                                  ), torch.no_grad():  # type: ignore
+        with get_precision_context(self.precision), torch.no_grad():
             # TODO (infra): refactor this code to isolate gather of
             # prompts on the trainer actor and gather/scatter of sequences
             # on the trainer actor, the first half is uesless while

From 7ff58679cf6956e34d491c9e2a7d31c64c6e1238 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 28 Jul 2025 20:16:50 +0000
Subject: [PATCH 094/107] todos

---
 tests/test_single_controller_ppo.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/tests/test_single_controller_ppo.py b/tests/test_single_controller_ppo.py
index cc4f116d..68b9519e 100644
--- a/tests/test_single_controller_ppo.py
+++ b/tests/test_single_controller_ppo.py
@@ -1,7 +1,8 @@
 # Copyright 2024 MosaicML ComposeRL authors
 # SPDX-License-Identifier: Apache-2.0
 
-# run cmd: cd compose-rl && cp tests/test_single_controller_ppo.py . && composer test_single_controller_ppo.py
+# run cmd: `cd compose-rl && cp tests/test_single_controller_ppo.py .
+# && composer test_single_controller_ppo.py`
 
 import os
 import pathlib
@@ -154,7 +155,8 @@ def build_dataloader(self):
             sampler=composer_dist.get_sampler(dataset),
             batch_size=self.device_train_batch_size,
         )
-        # We need to mock this method, since our dataset isn't a StreamingDataset
+        # We need to mock this method, since our dataset isn't a
+        # StreamingDataset
         dataloader.state_dict = lambda: {}
         dataloader.load_state_dict = lambda x: None
         return dataloader
@@ -168,6 +170,8 @@ def dataloader(self):
     def build_tokenizer(self):
         # TODO (algo): decide if we should use tokens or messages given
         # we may need token level log prob
+        # TODO (infra): use the tokenizer/texts for prompt dataloader but
+        # token (ids) for the experience buffer/manager
         tokenizer = AutoTokenizer.from_pretrained(self.pretrain_model_name)
         tokenizer.add_special_tokens({'pad_token': '[PAD]'})
         return tokenizer
@@ -190,7 +194,7 @@ def model_config(self):
 
     @property
     def fsdp_config(self):
-        # TODO (infra): use actual fsdp2 config
+        # TODO (infra): use actual fsdp1 config
         return {}
 
     def init_composer_dist(self):
@@ -208,7 +212,10 @@ def build_ref_model(self):
             self.ref_path = ref_path
             return
 
-        tmp_model = ComposerHFCausalLM(**self.model_config, use_auth_token=True)
+        tmp_model = ComposerHFCausalLM(
+            **self.model_config,
+            use_auth_token=True,
+        )
 
         tmp_optimizer = DecoupledAdamW(tmp_model.parameters(), lr=1e-6)
 
@@ -261,6 +268,9 @@ def train_1_iter(self):
         # TODO (algo): implement the top level PPO algo here instead of the
         # callback. Algorithmic researchers are expected to implement this
         # function along with above policy/value/reward/ref trainers or models
+        # TODO (infra): try multiple fit to see if the (mlflow) logger, etc
+        # TODO (infra): fault tolerance at iteration level first
+        # TODO (infra): enable batch level control
         self.ppo_trainer.fit(duration='1iter')
         # This is the KL assert that must be true if we are truly loading
         # from the same model. This is only true on the first iteration

From 092da43e2073b2ffcf7d9c1d86e548e217b70e7f Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 28 Jul 2025 20:46:04 +0000
Subject: [PATCH 095/107] doc fix

---
 .../algorithms/online/single_controller_callback.py      | 4 +++-
 tests/test_single_controller_ppo.py                      | 9 +++------
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/compose_rl/algorithms/online/single_controller_callback.py b/compose_rl/algorithms/online/single_controller_callback.py
index a19fdf85..b5b4cd6b 100644
--- a/compose_rl/algorithms/online/single_controller_callback.py
+++ b/compose_rl/algorithms/online/single_controller_callback.py
@@ -31,7 +31,9 @@
 class SingleControllerOnPolicyCallback(OnPolicyCallback):
     """Callback for managing on-policy training in an RLHF loop.
 
-    Ideally all the overwritten methods below should be implemented in the trainer actor instead of the callback, we kept them here for now to minimize a drastic refactor to PPO Callback code
+    Ideally all the overwritten methods below should be implemented in the
+    trainer actor instead of the callback, we kept them here for now to minimize
+    a drastic refactor to PPO Callback code
     """
 
     def iteration_start(self, state: State, logger: Logger):
diff --git a/tests/test_single_controller_ppo.py b/tests/test_single_controller_ppo.py
index 68b9519e..82864407 100644
--- a/tests/test_single_controller_ppo.py
+++ b/tests/test_single_controller_ppo.py
@@ -46,7 +46,7 @@
 
 @ray.remote(num_gpus=1)
 class DistributedGPUActor(BaseDistributedGPUActor):
-    """Distributed GPU actor for testing"""
+    """Distributed GPU actor for testing."""
 
     def __init__(
         self,
@@ -68,7 +68,7 @@ def __init__(
         self.device_train_batch_size = None
         self.num_batches_per_update = None
         self.max_seq_len = None
-        self.precision: str = None # type: ignore
+        self.precision: str = None  # type: ignore
         self.train_config: dict = None  # type: ignore
 
     def build_train_config(self, pretrain_model_name: str):
@@ -337,7 +337,7 @@ def setup_process_groups(
     """Initialize process groups for vLLM engines and master actor."""
     # Get a new port for the weight-update process group
     master_addr, _ = ray.get(
-        master_actor.get_master_address.remote()
+        master_actor.get_master_address.remote(),
     )  # type: ignore
     new_port = ray.get(master_actor.get_free_port.remote())  # type: ignore
     print(f'new_port: {new_port}')
@@ -415,7 +415,6 @@ def master_actor(self):
 
 
 class TrainActorGroup(SPMDActorGroup):
-
     # TODO: this class is mainly pass through gang scheduler,
     # we should refactor this class to be more generic and reusable
 
@@ -470,7 +469,6 @@ def train_iteration(self):
 
 
 class RolloutAgent:
-
     def __init__(self, vllm_engines: list, vllm_tensor_parallel_size: int):
         self.vllm_engines = vllm_engines
         self.vllm_tensor_parallel_size = vllm_tensor_parallel_size
@@ -492,7 +490,6 @@ def generate(self, prompts: list[str]):
 
 # TODO (infra): implement parameter buffer manager and experience manager
 class PPOController:
-
     def __init__(
         self,
         train_actor: TrainActorGroup,

From 6c5f216c99f1357c327009f8b6a75c4a85c702c8 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 28 Jul 2025 20:57:56 +0000
Subject: [PATCH 096/107] format

---
 tests/common/actor.py    | 1 -
 tests/common/datasets.py | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/tests/common/actor.py b/tests/common/actor.py
index 156347de..d25bff9b 100644
--- a/tests/common/actor.py
+++ b/tests/common/actor.py
@@ -17,7 +17,6 @@
 
 
 class BaseDistributedGPUActor:
-
     def __init__(
         self,
         rank: int,
diff --git a/tests/common/datasets.py b/tests/common/datasets.py
index 795d264c..d1792644 100644
--- a/tests/common/datasets.py
+++ b/tests/common/datasets.py
@@ -59,7 +59,6 @@ def __getitem__(self, index: int):
 
 
 class PromptDataset(Dataset):
-
     def __init__(self, size: int = 8, prompt_len: int = 5):
         self.size = size
         self.prompt_len = prompt_len
@@ -76,7 +75,6 @@ def __getitem__(self, index: int):
 
 
 class VerifiablePromptDataset(Dataset):
-
     def __init__(self, size: int = 8, prompt_len: int = 5):
         self.size = size
         self.prompt_len = prompt_len
@@ -94,7 +92,6 @@ def __getitem__(self, index: int):
 
 
 class VerifiableMessagesDataset(Dataset):
-
     def __init__(self, size: int = 8, prompt_len: int = 5):
         self.size = size
         self.prompt_len = prompt_len

From 00f32f348d60005115fe78eb149d438e76a6880e Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 28 Jul 2025 21:04:07 +0000
Subject: [PATCH 097/107] change gpu test to 2

---
 tests/test_single_controller.py     | 2 +-
 tests/test_single_controller_ppo.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_single_controller.py b/tests/test_single_controller.py
index 1df09c74..67a92c47 100644
--- a/tests/test_single_controller.py
+++ b/tests/test_single_controller.py
@@ -65,7 +65,7 @@ def test_tensor_all_reduce(self) -> float:
 
 
 @pytest.mark.gpu
-@world_size(4)
+@world_size(2)
 def test_distributed_ray_actors(
     world_size: int,
     tiny_gpt2_model: PreTrainedModel,
diff --git a/tests/test_single_controller_ppo.py b/tests/test_single_controller_ppo.py
index 82864407..14770c3a 100644
--- a/tests/test_single_controller_ppo.py
+++ b/tests/test_single_controller_ppo.py
@@ -583,7 +583,7 @@ def _run_single_controller_ppo(
 
 
 @pytest.mark.gpu
-@world_size(4)
+@world_size(2)
 def test_single_controller_ppo(
     world_size: int,
     tiny_llama_model: PreTrainedModel,

From 7094033b0f94f7546604b34ca5f14f3f867f37d2 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 28 Jul 2025 21:16:56 +0000
Subject: [PATCH 098/107] format

---
 compose_rl/algorithms/online/generation_utils/vllm_utils.py | 1 -
 compose_rl/algorithms/online/model.py                       | 2 --
 2 files changed, 3 deletions(-)

diff --git a/compose_rl/algorithms/online/generation_utils/vllm_utils.py b/compose_rl/algorithms/online/generation_utils/vllm_utils.py
index 4da62390..0930cdec 100644
--- a/compose_rl/algorithms/online/generation_utils/vllm_utils.py
+++ b/compose_rl/algorithms/online/generation_utils/vllm_utils.py
@@ -113,7 +113,6 @@ def init_process_group(
 
 
 class WorkerWrap:
-
     def init_process_group(
         self,
         master_address: str,
diff --git a/compose_rl/algorithms/online/model.py b/compose_rl/algorithms/online/model.py
index 842fc36d..6df0fcea 100644
--- a/compose_rl/algorithms/online/model.py
+++ b/compose_rl/algorithms/online/model.py
@@ -35,7 +35,6 @@
 
 
 class ComposerMPTPolicyLM(HuggingFaceModel):
-
     def __init__(
         self,
         tokenizer: Tokenizer,
@@ -140,7 +139,6 @@ def set_batch_stats(self, batch_stats: dict[str, Any]):
 
 
 class ComposerHFPolicyLM(ComposerHFPolicy):
-
     def __init__(
         self,
         *,

From 59305a6aca1edd394c65951eabd7b46d8531608c Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 28 Jul 2025 21:22:54 +0000
Subject: [PATCH 099/107] revert 4 gpu for now, regression does not like this
 test

---
 tests/test_single_controller.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_single_controller.py b/tests/test_single_controller.py
index 67a92c47..1df09c74 100644
--- a/tests/test_single_controller.py
+++ b/tests/test_single_controller.py
@@ -65,7 +65,7 @@ def test_tensor_all_reduce(self) -> float:
 
 
 @pytest.mark.gpu
-@world_size(2)
+@world_size(4)
 def test_distributed_ray_actors(
     world_size: int,
     tiny_gpt2_model: PreTrainedModel,

From 47d9b5f9fc7e4c60674855b7bd69932c10df174e Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 28 Jul 2025 22:04:25 +0000
Subject: [PATCH 100/107] revert 2 gpu for now

---
 tests/test_single_controller_ppo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_single_controller_ppo.py b/tests/test_single_controller_ppo.py
index 14770c3a..82864407 100644
--- a/tests/test_single_controller_ppo.py
+++ b/tests/test_single_controller_ppo.py
@@ -583,7 +583,7 @@ def _run_single_controller_ppo(
 
 
 @pytest.mark.gpu
-@world_size(2)
+@world_size(4)
 def test_single_controller_ppo(
     world_size: int,
     tiny_llama_model: PreTrainedModel,

From 9cb6e987904a43eda3696aea3fb26b31fda5478f Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 28 Jul 2025 22:14:28 +0000
Subject: [PATCH 101/107] todo

---
 tests/test_single_controller_ppo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_single_controller_ppo.py b/tests/test_single_controller_ppo.py
index 82864407..17a75dfb 100644
--- a/tests/test_single_controller_ppo.py
+++ b/tests/test_single_controller_ppo.py
@@ -583,7 +583,7 @@ def _run_single_controller_ppo(
 
 
 @pytest.mark.gpu
-@world_size(4)
+@world_size(4)  # TODO change this to 2 for CI testing (hit fatal python error)
 def test_single_controller_ppo(
     world_size: int,
     tiny_llama_model: PreTrainedModel,

From 0b5d91137b58d3b7f0e27e4a59fa51e00c52aee6 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 28 Jul 2025 22:15:13 +0000
Subject: [PATCH 102/107] use Python 3.12 and update docformatter

---
 .github/workflows/code-quality.yaml | 2 +-
 .pre-commit-config.yaml             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml
index bf01044e..1a0ac267 100644
--- a/.github/workflows/code-quality.yaml
+++ b/.github/workflows/code-quality.yaml
@@ -24,7 +24,7 @@ jobs:
     strategy:
       matrix:
         python_version:
-        - "3.11"
+        - "3.12"
         pip_deps:
         - "[dev]"
     steps:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a67d723a..ed9e8cb0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -71,7 +71,7 @@ repos:
     - --allow-past-years
     types: [python]
 - repo: https://github.com/PyCQA/docformatter
-  rev: v1.5.0
+  rev: v1.5.1
   hooks:
   - id: docformatter
     args: [--in-place, --wrap-summaries=80, --wrap-descriptions=80]

From e2686265d6304bbcb788351511168a6701a0d270 Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Mon, 28 Jul 2025 22:27:28 +0000
Subject: [PATCH 103/107] revert change

---
 .github/workflows/code-quality.yaml | 2 +-
 .pre-commit-config.yaml             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml
index 1a0ac267..bf01044e 100644
--- a/.github/workflows/code-quality.yaml
+++ b/.github/workflows/code-quality.yaml
@@ -24,7 +24,7 @@ jobs:
     strategy:
       matrix:
         python_version:
-        - "3.12"
+        - "3.11"
         pip_deps:
         - "[dev]"
     steps:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ed9e8cb0..a67d723a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -71,7 +71,7 @@ repos:
     - --allow-past-years
     types: [python]
 - repo: https://github.com/PyCQA/docformatter
-  rev: v1.5.1
+  rev: v1.5.0
   hooks:
   - id: docformatter
     args: [--in-place, --wrap-summaries=80, --wrap-descriptions=80]

From e9e9b3c1ec9c1c6266c8fc7713913a42cf4f78ad Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 29 Jul 2025 00:17:43 +0000
Subject: [PATCH 104/107] try diff

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a67d723a..d236ae46 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,7 +14,7 @@ repos:
     name: yapf
     description: A formatter for Python files.
     entry: yapf
-    args: [-i, -vv, -p]       # inplace
+    args: [--diff]       # inplace
     language: python
     types: [python]
     additional_dependencies:

From 6a513e4018e66dec0faae1be940ee0e3a5d8efad Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 29 Jul 2025 00:30:48 +0000
Subject: [PATCH 105/107] yapf again

---
 .pre-commit-config.yaml                                     | 2 +-
 compose_rl/algorithms/online/generation_utils/vllm_utils.py | 1 +
 compose_rl/algorithms/online/model.py                       | 2 ++
 tests/common/actor.py                                       | 1 +
 tests/common/datasets.py                                    | 3 +++
 tests/test_single_controller_ppo.py                         | 2 ++
 6 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d236ae46..a67d723a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,7 +14,7 @@ repos:
     name: yapf
     description: A formatter for Python files.
     entry: yapf
-    args: [--diff]       # inplace
+    args: [-i, -vv, -p]       # inplace
     language: python
     types: [python]
     additional_dependencies:
diff --git a/compose_rl/algorithms/online/generation_utils/vllm_utils.py b/compose_rl/algorithms/online/generation_utils/vllm_utils.py
index 0930cdec..4da62390 100644
--- a/compose_rl/algorithms/online/generation_utils/vllm_utils.py
+++ b/compose_rl/algorithms/online/generation_utils/vllm_utils.py
@@ -113,6 +113,7 @@ def init_process_group(
 
 
 class WorkerWrap:
+
     def init_process_group(
         self,
         master_address: str,
diff --git a/compose_rl/algorithms/online/model.py b/compose_rl/algorithms/online/model.py
index 6df0fcea..842fc36d 100644
--- a/compose_rl/algorithms/online/model.py
+++ b/compose_rl/algorithms/online/model.py
@@ -35,6 +35,7 @@
 
 
 class ComposerMPTPolicyLM(HuggingFaceModel):
+
     def __init__(
         self,
         tokenizer: Tokenizer,
@@ -139,6 +140,7 @@ def set_batch_stats(self, batch_stats: dict[str, Any]):
 
 
 class ComposerHFPolicyLM(ComposerHFPolicy):
+
     def __init__(
         self,
         *,
diff --git a/tests/common/actor.py b/tests/common/actor.py
index d25bff9b..156347de 100644
--- a/tests/common/actor.py
+++ b/tests/common/actor.py
@@ -17,6 +17,7 @@
 
 
 class BaseDistributedGPUActor:
+
     def __init__(
         self,
         rank: int,
diff --git a/tests/common/datasets.py b/tests/common/datasets.py
index d1792644..795d264c 100644
--- a/tests/common/datasets.py
+++ b/tests/common/datasets.py
@@ -59,6 +59,7 @@ def __getitem__(self, index: int):
 
 
 class PromptDataset(Dataset):
+
     def __init__(self, size: int = 8, prompt_len: int = 5):
         self.size = size
         self.prompt_len = prompt_len
@@ -75,6 +76,7 @@ def __getitem__(self, index: int):
 
 
 class VerifiablePromptDataset(Dataset):
+
     def __init__(self, size: int = 8, prompt_len: int = 5):
         self.size = size
         self.prompt_len = prompt_len
@@ -92,6 +94,7 @@ def __getitem__(self, index: int):
 
 
 class VerifiableMessagesDataset(Dataset):
+
     def __init__(self, size: int = 8, prompt_len: int = 5):
         self.size = size
         self.prompt_len = prompt_len
diff --git a/tests/test_single_controller_ppo.py b/tests/test_single_controller_ppo.py
index 17a75dfb..401683cf 100644
--- a/tests/test_single_controller_ppo.py
+++ b/tests/test_single_controller_ppo.py
@@ -469,6 +469,7 @@ def train_iteration(self):
 
 
 class RolloutAgent:
+
     def __init__(self, vllm_engines: list, vllm_tensor_parallel_size: int):
         self.vllm_engines = vllm_engines
         self.vllm_tensor_parallel_size = vllm_tensor_parallel_size
@@ -490,6 +491,7 @@ def generate(self, prompts: list[str]):
 
 # TODO (infra): implement parameter buffer manager and experience manager
 class PPOController:
+
     def __init__(
         self,
         train_actor: TrainActorGroup,

From 81fce8c1a69230476b0d9e1354f151160eb5f35e Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 29 Jul 2025 06:39:19 +0000
Subject: [PATCH 106/107] todo

---
 compose_rl/algorithms/online/model_methods.py | 2 +-
 tests/common/actor.py                         | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/compose_rl/algorithms/online/model_methods.py b/compose_rl/algorithms/online/model_methods.py
index acd00f4e..57615322 100644
--- a/compose_rl/algorithms/online/model_methods.py
+++ b/compose_rl/algorithms/online/model_methods.py
@@ -247,7 +247,7 @@ def policy_loss(
             logits=gen_logits,
         )
         assert token_entropies.shape == batch['action_mask'].shape, (
-            f'Token entropies shape {token_entropies.shape} does not match action mask shape {batch["action_mask"].shape}.',
+            f'Token entropies shape {token_entropies.shape} does not match action mask shape {batch['action_mask'].shape}.',
         )
         seq_entropies = utils.get_sequence_entropies(
             token_entropies=token_entropies,
diff --git a/tests/common/actor.py b/tests/common/actor.py
index 156347de..ff22e4d3 100644
--- a/tests/common/actor.py
+++ b/tests/common/actor.py
@@ -39,6 +39,7 @@ def __init__(
         self.master_port = master_port
 
         # Set up basic environment variables
+        # TODO: may need to handle 'LOCAL_WORLD_SIZE' as used in callback.py
         os.environ['WORLD_SIZE'] = str(world_size)
         os.environ['RANK'] = str(rank)
 

From eb889f1d706a6cd73cbbcf457153116de9d01baf Mon Sep 17 00:00:00 2001
From: bowenyang008 <byang008@ucr.edu>
Date: Tue, 29 Jul 2025 06:47:59 +0000
Subject: [PATCH 107/107] revert

---
 compose_rl/algorithms/online/model_methods.py | 2 +-
 tests/common/actor.py                         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/compose_rl/algorithms/online/model_methods.py b/compose_rl/algorithms/online/model_methods.py
index 57615322..acd00f4e 100644
--- a/compose_rl/algorithms/online/model_methods.py
+++ b/compose_rl/algorithms/online/model_methods.py
@@ -247,7 +247,7 @@ def policy_loss(
             logits=gen_logits,
         )
         assert token_entropies.shape == batch['action_mask'].shape, (
-            f'Token entropies shape {token_entropies.shape} does not match action mask shape {batch['action_mask'].shape}.',
+            f'Token entropies shape {token_entropies.shape} does not match action mask shape {batch["action_mask"].shape}.',
         )
         seq_entropies = utils.get_sequence_entropies(
             token_entropies=token_entropies,
diff --git a/tests/common/actor.py b/tests/common/actor.py
index ff22e4d3..a2eab75f 100644
--- a/tests/common/actor.py
+++ b/tests/common/actor.py
@@ -39,7 +39,7 @@ def __init__(
         self.master_port = master_port
 
         # Set up basic environment variables
-        # TODO: may need to handle 'LOCAL_WORLD_SIZE' as used in callback.py
+        # TODO: may need to handle LOCAL_WORLD_SIZE as used in callback.py
         os.environ['WORLD_SIZE'] = str(world_size)
         os.environ['RANK'] = str(rank)