1 change: 1 addition & 0 deletions .gitignore
@@ -236,6 +236,7 @@ logs
/_test_workspace
/debug
cache
results

# configuration
config.toml
Binary file modified NVIDIA_Assets/swe-bench.png
27 changes: 25 additions & 2 deletions README.md
@@ -73,6 +73,7 @@ This command starts the FastAPI-based async evaluation server and listens on the
It exposes /start, /process, and /status endpoints, and uses --max-init-workers/--max-run-workers and --timeout to control concurrency and time limits.

```bash
export OH_RUNTIME_SINGULARITY_IMAGE_REPO=/path/to/singularity_images
python scripts/start_server.py --host 0.0.0.0 --port 8006 --max-init-workers 64 --max-run-workers 64 --timeout 300
```
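As a hypothetical client-side sketch, the endpoints listed above could be queried like this; the endpoint paths come from this README, but the helper names and the assumption that `/status` returns JSON are illustrative, not part of the server's documented contract:

```python
# Hypothetical client sketch for the server started above. The endpoint
# paths come from this README; the JSON response shape is an assumption.
import json
import urllib.request

def status_url(host: str, port: int) -> str:
    # Build the /status URL for a server started with start_server.py.
    return f"http://{host}:{port}/status"

def fetch_status(host: str = "0.0.0.0", port: int = 8006, timeout: float = 5.0):
    # Return the parsed JSON body of GET /status.
    with urllib.request.urlopen(status_url(host, port), timeout=timeout) as resp:
        return json.loads(resp.read().decode("utf-8"))
```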

@@ -154,6 +155,28 @@ Output (response body):
}
```

## 💻 Do RL Training with verl
1) Clone [verl](https://github.com/verl-project/verl) and check out the specific commit
```shell
cd /path/to/verl
git checkout 60138ebd
```
2) Install verl following [verl](https://github.com/verl-project/verl)'s instructions
3) Install our verl patch
```shell
cd ProRL-Agent-Server/trainer_integration/verl
pip install -e .
```
4) Start the agent server
```shell
export OH_RUNTIME_SINGULARITY_IMAGE_REPO=/path/to/singularity_images
python scripts/start_server.py --host 0.0.0.0 --port 8006 --max-init-workers 64 --max-run-workers 64 --timeout 1000
```
5) Run the training script
```shell
bash trainer_integration/verl/verl_custom/nvidia/scripts/run_proagent_qwn3_4B_instruct.sh
```

## 💻 Add a New Task/Handler

To add a new task:
@@ -195,7 +218,7 @@ TEST_RUNTIME=singularity RUN_AS_OPENHANDS=False PYTHONPATH='.' pytest tests/runt
#### Image Storage Location
**`OH_RUNTIME_SINGULARITY_IMAGE_REPO`** - Specifies the directory where Singularity runtime images will be stored.
```bash
OH_RUNTIME_SINGULARITY_IMAGE_REPO=/path/to/singularity_images
export OH_RUNTIME_SINGULARITY_IMAGE_REPO=/path/to/singularity_images
```
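For illustration, runtime code might resolve this variable roughly as follows; this is a minimal sketch, and the fallback default and directory creation are assumptions, not the repository's actual behavior:

```python
import os
from pathlib import Path

def singularity_image_repo(default: str = "/tmp/singularity_images") -> Path:
    # Read the image-repo directory from the environment, fall back to a
    # hypothetical default, and make sure the directory exists.
    repo = Path(os.environ.get("OH_RUNTIME_SINGULARITY_IMAGE_REPO", default))
    repo.mkdir(parents=True, exist_ok=True)
    return repo
```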

## 📄 Documentation
@@ -211,6 +234,6 @@ More module READMEs (click to open):
## 💡 Current Results


To validate the functionality of the ProRLAgent servers, we conducted experiments on software engineering (SWE) tasks by integrating the server with our ProRLAgent training framework based on verl. We ran initial RL training on the Qwen3-4B-Instruct-2507 model using 32 A100 GPUs. Our training data is a 293-example subset of [SWE-GYM](https://huggingface.co/datasets/NovaSky-AI/SkyRL-v0-293-data). Around 66 training steps improved Pass@1 on SWE-Bench-Verified from 14.8% to 21.2%; the chart below shows the test results on SWE-Bench-Verified rising over the course of training.
<img src="NVIDIA_Assets/swe-bench.png" alt="swe-bench curve" width="600" />

52 changes: 38 additions & 14 deletions openhands/llm/llm.py
@@ -228,22 +228,46 @@ def __init__(
kwargs['chat_template_kwargs'] = {'enable_thinking': False}

if self.token_level_generation:
if 'VL' in self.config.model or 'vl' in self.config.model:
from openhands.llm.nvidia.qwen2_5_vl import request_response_tokens
if self.config.use_gym_api:
from openhands.llm.nvidia.gym_vllm import (
request_response_tokens,
)

logger.info(
f'Using Gym-compatible vLLM API for token-level generation '
f'(model={self.config.model}, base_url={self.config.base_url})'
)
self._completion = partial(
request_response_tokens,
model=self.config.model,
tokenizer=self.tokenizer,
base_url=self.config.base_url,
timeout=self.config.timeout,
top_p=self.config.top_p,
seed=self.config.seed,
max_model_len=self.config.max_model_len,
api_key=self.config.api_key.get_secret_value()
if self.config.api_key
else None,
**kwargs,
)
else:
from openhands.llm.nvidia.qwen3 import request_response_tokens
if 'VL' in self.config.model or 'vl' in self.config.model:
from openhands.llm.nvidia.qwen2_5_vl import request_response_tokens
else:
from openhands.llm.nvidia.qwen3 import request_response_tokens

self._completion = partial(
request_response_tokens,
model=self.config.model,
tokenizer=self.tokenizer,
base_url=self.config.base_url,
timeout=self.config.timeout,
top_p=self.config.top_p,
seed=self.config.seed,
max_model_len=self.config.max_model_len,
**kwargs,
)
self._completion = partial(
request_response_tokens,
model=self.config.model,
tokenizer=self.tokenizer,
base_url=self.config.base_url,
timeout=self.config.timeout,
top_p=self.config.top_p,
seed=self.config.seed,
max_model_len=self.config.max_model_len,
**kwargs,
)
else:
self._completion = partial(
litellm_completion,
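The configuration-binding pattern used throughout this diff, pre-binding model and sampling settings with `functools.partial` so callers only supply messages, can be sketched in isolation. The `request_response_tokens` stub below is a stand-in for illustration, not the real backend:

```python
from functools import partial

def request_response_tokens(messages, *, model, top_p, seed, max_model_len, **kwargs):
    # Stand-in for a real token-level backend call: it just echoes the
    # configuration that was pre-bound plus the per-call messages.
    return {
        "model": model,
        "top_p": top_p,
        "seed": seed,
        "max_model_len": max_model_len,
        "num_messages": len(messages),
    }

# Bind configuration once, as LLM.__init__ does above, so callers
# only need to supply the messages for each request.
completion = partial(
    request_response_tokens,
    model="Qwen3-4B-Instruct-2507",
    top_p=0.95,
    seed=42,
    max_model_len=32768,
)

result = completion([{"role": "user", "content": "hi"}])
```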