Keep intermediate LLM usage stats even for failed trajectories

mayinghan · mayinghan · commit b6f6b7fda5db · 2025-08-08T12:23:28.000-07:00
diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
@@ -260,8 +260,6 @@ async def _execute_rollout(
                 {"role": "user", "content": user_prompt},
             ]
 
-            usage_stats_list: List[CompletionUsage] = []
-
             logger.info(f"🎯 Starting rollout {rollout_idx} in thread {threading.current_thread().name}")
 
             # Run rollout loop for this specific environment
@@ -375,7 +373,9 @@ async def _execute_rollout(
 
                     # calc llm usage stats happened in this turn if there is aany
                     if usage_stats:
-                        usage_stats_list.append(usage_stats)
+                        trajectory.usage["prompt_tokens"] += usage_stats.prompt_tokens
+                        trajectory.usage["completion_tokens"] += usage_stats.completion_tokens
+                        trajectory.usage["total_tokens"] += usage_stats.total_tokens
 
                 # With user simulator, increment step after an entire conversation step
                 if user_simulator is not None:
@@ -409,7 +409,9 @@ async def _execute_rollout(
                     # tool indicates rollout should be terminated, call policy one last time to get the final response
                     _, usage_stats = await policy(tool_schema, rollout_idx, conversation_history)
                     if usage_stats:
-                        usage_stats_list.append(usage_stats)
+                        trajectory.usage["prompt_tokens"] += usage_stats.prompt_tokens
+                        trajectory.usage["completion_tokens"] += usage_stats.completion_tokens
+                        trajectory.usage["total_tokens"] += usage_stats.total_tokens
 
                     # Add final control plane summary
                     trajectory.control_plane_summary.update(
@@ -460,11 +462,6 @@ async def _execute_rollout(
                     msg["control_plane_step"]["termination_reason"] = trajectory.termination_reason
                     break
 
-            for usage_stats in usage_stats_list:
-                trajectory.usage["prompt_tokens"] += usage_stats.prompt_tokens
-                trajectory.usage["completion_tokens"] += usage_stats.completion_tokens
-                trajectory.usage["total_tokens"] += usage_stats.total_tokens
-
             logger.info(
                 f"✅ Rollout {rollout_idx} completed: {trajectory.steps} steps, reward: {trajectory.total_reward:.2f}, termination: {trajectory.termination_reason}, in thread {threading.current_thread().name}"
             )