Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 137 additions & 0 deletions tests/test_verifier_advantages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
from unittest.mock import MagicMock

import pytest

from verifiers import Rubric
from verifiers.types import (
AssistantMessage,
RolloutInput,
State,
TrajectoryStep,
TrajectoryStepTokens,
UserMessage,
)
from verifiers.utils.save_utils import states_to_outputs


def _make_step(
*,
completion_ids: list[int],
advantage: float | None,
) -> TrajectoryStep:
completion_length = len(completion_ids)
return TrajectoryStep(
prompt=[UserMessage(content="q")],
completion=[AssistantMessage(content="a")],
response=MagicMock(),
tokens=TrajectoryStepTokens(
prompt_ids=[1],
prompt_mask=[0],
completion_ids=completion_ids,
completion_mask=[1] * completion_length,
completion_logprobs=[-0.1] * completion_length,
overlong_prompt=False,
is_truncated=False,
routed_experts=None,
),
reward=None,
advantage=advantage,
is_truncated=False,
trajectory_id="trajectory-1",
extras={},
)


@pytest.mark.asyncio
async def test_score_group_populates_step_completion_advantages_from_state_advantage():
def reward_by_length(completion, **kwargs):
return float(len(str(completion)))

rubric = Rubric(funcs=[reward_by_length], weights=[1.0])

state_a = State(
input=RolloutInput(
prompt=[UserMessage(content="prompt-a")],
answer="",
task="task",
example_id=0,
)
)
state_a["completion"] = "a"
state_a["trajectory"] = [_make_step(completion_ids=[11, 12], advantage=None)]
state_a["timing"] = {
"generation_ms": 0.0,
"scoring_ms": 0.0,
"total_ms": 0.0,
"start_time": 0.0,
}

state_b = State(
input=RolloutInput(
prompt=[UserMessage(content="prompt-b")],
answer="",
task="task",
example_id=1,
)
)
state_b["completion"] = "bbb"
state_b["trajectory"] = [_make_step(completion_ids=[21, 22, 23], advantage=None)]
state_b["timing"] = {
"generation_ms": 0.0,
"scoring_ms": 0.0,
"total_ms": 0.0,
"start_time": 0.0,
}

await rubric.score_group([state_a, state_b])

step_a = state_a["trajectory"][0]
step_b = state_b["trajectory"][0]

assert state_a["advantage"] == -1.0
assert state_b["advantage"] == 1.0
assert step_a["advantage"] == state_a["advantage"]
assert step_b["advantage"] == state_b["advantage"]
assert step_a["completion_advantages"] == [-1.0, -1.0]
assert step_b["completion_advantages"] == [1.0, 1.0, 1.0]


@pytest.mark.asyncio
async def test_score_group_preserves_existing_step_advantage_for_completion_advantages():
def constant_reward(completion, **kwargs):
return 1.0

rubric = Rubric(funcs=[constant_reward], weights=[1.0])

state = State(
input=RolloutInput(
prompt=[UserMessage(content="prompt")],
answer="",
task="task",
example_id=0,
)
)
state["completion"] = "answer"
state["trajectory"] = [_make_step(completion_ids=[31, 32], advantage=0.25)]
state["timing"] = {
"generation_ms": 0.0,
"scoring_ms": 0.0,
"total_ms": 0.0,
"start_time": 0.0,
}

await rubric.score_group([state])

step = state["trajectory"][0]
assert state["advantage"] == 0.0
assert step["advantage"] == 0.25
assert step["completion_advantages"] == [0.25, 0.25]


def test_states_to_outputs_includes_use_verifiers_advantages(make_state):
state = make_state()
state["use_verifiers_advantages"] = True

output = states_to_outputs([state], state_columns=["use_verifiers_advantages"])[0]

assert output["use_verifiers_advantages"] is True
3 changes: 3 additions & 0 deletions verifiers/envs/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def __init__(
map_kwargs: dict = {},
max_seq_len: int | None = None,
score_rollouts: bool = True,
use_verifiers_advantages: bool = False,
**kwargs,
):
if message_type is _MESSAGE_TYPE_UNSET:
Expand Down Expand Up @@ -143,6 +144,7 @@ def __init__(
self.env_args = env_args or {}
self.max_seq_len = max_seq_len
self.map_kwargs = map_kwargs
self.use_verifiers_advantages = use_verifiers_advantages

self.set_score_rollouts(score_rollouts)

Expand Down Expand Up @@ -627,6 +629,7 @@ async def init_state(
state["reward"] = None
state["metrics"] = None
state["error"] = None
state["use_verifiers_advantages"] = self.use_verifiers_advantages
state["final_env_response"] = None
state["timing"] = RolloutTiming(
generation_ms=0.0,
Expand Down
31 changes: 26 additions & 5 deletions verifiers/rubrics/rubric.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,27 @@ async def dummy_score_group(self, states: list[State]):
for state in states:
await self.dummy_score_rollout(state)

def _populate_step_completion_advantages(
self,
state: State,
fallback_reward: float | None,
fallback_advantage: float | None,
) -> None:
for step in state["trajectory"]:
if step["advantage"] is None:
step["advantage"] = fallback_advantage
if step["reward"] is None:
step["reward"] = fallback_reward

tokens = step.get("tokens")
step_advantage = step.get("advantage")
if tokens is None or step_advantage is None:
step["completion_advantages"] = None
continue
step["completion_advantages"] = [step_advantage] * len(
tokens["completion_ids"]
)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RubricGroup produces incorrect completion_advantages after aggregation

Medium Severity

When RubricGroup.score_group is used, each sub-rubric's score_group calls _populate_step_completion_advantages with that sub-rubric's partial advantage. Since step advantages are only set when None, the first sub-rubric's partial advantage "sticks" on the steps and completion_advantages, while RubricGroup never recomputes them from the final aggregated reward. The result is completion_advantages reflecting only one sub-rubric's contribution rather than the total.

Fix in Cursor Fix in Web

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this was already the implementation before, the rubric group only does aggregated trajectory level rewards, if someone wants to do step level advantages they will probably have a single reward function in a rubric that does this. probably a cleaner way to get around this but not sure if this is worth fixing right now


async def score_group(self, states: list[State]):
"""
Score a group of rollouts together.
Expand Down Expand Up @@ -312,11 +333,11 @@ async def score_group(self, states: list[State]):
for i, state in enumerate(states):
state["reward"] = aggregated_rewards[i]
state["advantage"] = aggregated_rewards[i] - avg_reward
for t in state["trajectory"]:
if t["advantage"] is None:
t["advantage"] = state["advantage"]
if t["reward"] is None:
t["reward"] = state["reward"]
self._populate_step_completion_advantages(
state=state,
fallback_reward=state["reward"],
fallback_advantage=state["advantage"],
)
state["metrics"] = {
func_name: values[i] for func_name, values in aggregated_metrics.items()
}
Expand Down
3 changes: 2 additions & 1 deletion verifiers/scripts/gepa.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,8 @@ def run_gepa_optimization(
logger.debug(f"Results saved to {run_dir}")

# Set result info for final summary
best_prompt = result.best_candidate.get("system_prompt", "")
bc = result.best_candidate
best_prompt = bc.get("system_prompt", "") if isinstance(bc, dict) else bc
display.set_result(best_prompt=best_prompt, save_path=save_path)

return result
Expand Down
3 changes: 3 additions & 0 deletions verifiers/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@ class TrajectoryStep(TypedDict):
tokens: TrajectoryStepTokens | None
reward: float | None
advantage: float | None
completion_advantages: NotRequired[list[float] | None]
is_truncated: bool
trajectory_id: str
extras: dict[str, Any]
Expand Down Expand Up @@ -282,6 +283,7 @@ class RolloutOutput(dict):
trajectory: list["TrajectoryStep"]
tool_defs: list[Tool]
token_usage: TokenUsage
use_verifiers_advantages: bool


class State(dict):
Expand All @@ -300,6 +302,7 @@ class State(dict):
completion: Messages | None
reward: float | None
advantage: float | None
use_verifiers_advantages: bool | None
metrics: dict[str, float] | None
timing: RolloutTiming | None
error: Error | None
Expand Down
Loading