|
21 | 21 | from eval_protocol.human_id import generate_id, num_combinations |
22 | 22 | from eval_protocol.models import ( |
23 | 23 | CompletionParams, |
| 24 | + ErrorInfo, |
24 | 25 | EvalMetadata, |
25 | 26 | EvaluationRow, |
26 | 27 | EvaluationThreshold, |
27 | 28 | InputMetadata, |
28 | 29 | Message, |
| 30 | + Status, |
29 | 31 | ) |
30 | 32 | from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter |
31 | 33 | from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor |
|
57 | 59 | ) |
58 | 60 | from eval_protocol.pytest.exception_config import ExceptionHandlerConfig |
59 | 61 | from eval_protocol.stats.confidence_intervals import compute_fixed_set_mu_ci |
| 62 | +from eval_protocol.types.types import TerminationReason |
60 | 63 |
|
61 | 64 | from ..common_utils import load_jsonl |
62 | 65 |
|
@@ -419,7 +422,7 @@ async def execute_with_params( |
419 | 422 | if mode == "groupwise": |
420 | 423 | combinations = generate_parameter_combinations( |
421 | 424 | input_dataset, |
422 | | - None, |
| 425 | + completion_params, |
423 | 426 | input_messages, |
424 | 427 | input_rows, |
425 | 428 | evaluation_test_kwargs, |
@@ -482,9 +485,7 @@ async def wrapper_body(**kwargs): |
482 | 485 |
|
483 | 486 | experiment_id = generate_id() |
484 | 487 |
|
485 | | - def _log_eval_error( |
486 | | - status: Literal["finished", "error"], rows: Optional[List[EvaluationRow]] | None, passed: bool |
487 | | - ) -> None: |
| 488 | + def _log_eval_error(status: Status, rows: Optional[List[EvaluationRow]] | None, passed: bool) -> None: |
488 | 489 | log_eval_status_and_rows(eval_metadata, rows, status, passed, active_logger) |
489 | 490 |
|
490 | 491 | try: |
@@ -556,7 +557,7 @@ def _log_eval_error( |
556 | 557 | eval_metadata = EvalMetadata( |
557 | 558 | name=test_func.__name__, |
558 | 559 | description=test_func.__doc__, |
559 | | - status="running", |
| 560 | + status=Status.eval_running(), |
560 | 561 | num_runs=num_runs, |
561 | 562 | aggregation_method=aggregation_method, |
562 | 563 | passed_threshold=threshold, |
@@ -727,9 +728,11 @@ async def _collect_result(config, lst): |
727 | 728 | for r in results: |
728 | 729 | if r.eval_metadata is not None: |
729 | 730 | if r.rollout_status.is_error(): |
730 | | - r.eval_metadata.status = "error" |
| 731 | + r.eval_metadata.status = Status.error( |
| 732 | + r.rollout_status.message, r.rollout_status.details |
| 733 | + ) |
731 | 734 | else: |
732 | | - r.eval_metadata.status = "finished" |
| 735 | + r.eval_metadata.status = Status.eval_finished() |
733 | 736 | active_logger.log(r) |
734 | 737 |
|
735 | 738 | # for groupwise mode, the result contains eval output from multiple completion_params, we need to differentiate them |
@@ -767,14 +770,16 @@ async def _collect_result(config, lst): |
767 | 770 |
|
768 | 771 | except AssertionError: |
769 | 772 | _log_eval_error( |
770 | | - "finished", |
| 773 | + Status.eval_finished(), |
771 | 774 | processed_rows_in_run if "processed_rows_in_run" in locals() else None, |
772 | 775 | passed=False, |
773 | 776 | ) |
774 | 777 | raise |
775 | | - except Exception: |
| 778 | + except Exception as e: |
776 | 779 | _log_eval_error( |
777 | | - "error", processed_rows_in_run if "processed_rows_in_run" in locals() else None, passed=False |
| 780 | + Status.error(str(e)), |
| 781 | + processed_rows_in_run if "processed_rows_in_run" in locals() else None, |
| 782 | + passed=False, |
778 | 783 | ) |
779 | 784 | raise |
780 | 785 |
|
|
0 commit comments