record rollout status

mayinghan · mayinghan · commit 86294ed77d9d · 2025-08-08T00:39:29.000-07:00
diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
@@ -103,8 +103,7 @@ async def _execute_with_semaphore(idx):
                 )
 
         tasks = [_execute_with_semaphore(i) for i in range(envs.n)]
-        # exceptions should be try catched inside single _execute_rollout
-        # exceptions should be try catched inside single _execute_rollout
+        # exceptions are caught inside each individual _execute_rollout call
         trajectories = await asyncio.gather(*tasks)
 
         # Calculate durations
@@ -171,6 +170,21 @@ async def _execute_with_semaphore(idx):
                 max_tokens=getattr(policy, "max_tokens", None),
                 max_tool_calls=getattr(policy, "max_tools_per_turn", None),
             )
+            if trajectory.terminated:
+                if trajectory.termination_reason in {
+                    TerminationReason.CONTROL_PLANE_SIGNAL,
+                    TerminationReason.USER_STOP,
+                }:
+                    evaluation_rows[idx].rollout_status.status = "finished"
+                elif trajectory.termination_reason == TerminationReason.MAX_STEPS:
+                    evaluation_rows[idx].rollout_status.status = "stopped"
+                else:
+                    evaluation_rows[idx].rollout_status.status = "error"
+                    evaluation_rows[idx].rollout_status.error_message = trajectory.control_plane_summary.get(
+                        "error_message", None
+                    )
+            else:
+                evaluation_rows[idx].rollout_status.status = "running"
 
         return evaluation_rows
 
@@ -458,8 +472,7 @@ async def _execute_rollout(
             logger.error(f"🚨 Error in rollout {rollout_idx}: {e}", exc_info=True)
             trajectory.terminated = True
             trajectory.termination_reason = TerminationReason.ERROR
-            trajectory.input_metadata.session_data["error"] = True
-            trajectory.input_metadata.session_data["error_message"] = str(e)
+            trajectory.control_plane_summary.update({"error_message": str(e)})
         return trajectory
 
     async def _get_control_plane_status(self, session) -> Optional[Dict[str, Any]]:
diff --git a/eval_protocol/models.py b/eval_protocol/models.py
@@ -220,6 +220,21 @@ class EvalMetadata(BaseModel):
     passed: Optional[bool] = Field(None, description="Whether the evaluation passed based on the threshold")
 
 
+class RolloutStatus(BaseModel):
+    """Status of the rollout."""
+
+    """
+    running: Unfinished rollout which is still in progress.
+    finished: Rollout finished successfully (e.g. control plane signal, user stop).
+    error: Rollout failed.
+    stopped: Rollout was cut off before finishing (e.g. max steps reached).
+    """
+    status: Literal["running", "finished", "error", "stopped"] = Field(
+        "finished", description="Status of the rollout."
+    )
+    error_message: Optional[str] = Field(None, description="Error message if the rollout failed.")
+
+
 class EvaluationRow(BaseModel):
     """
     Unified data structure for a single evaluation unit that contains messages,
@@ -244,6 +259,11 @@ class EvaluationRow(BaseModel):
         description="Metadata related to the input (dataset info, model config, session data, etc.).",
     )
 
+    rollout_status: RolloutStatus = Field(
+        default_factory=RolloutStatus,
+        description="The status of the rollout.",
+    )
+
     # Ground truth reference (moved from EvaluateResult to top level)
     ground_truth: Optional[str] = Field(
         default=None, description="Optional ground truth reference for this evaluation."