Skip to content

Commit 67b03c6

Browse files
author
Dylan Huang
committed
vite build
use model_dump(mode="json")

Add run_id to EvalMetadataSchema for unique run identification
- Introduced run_id as an optional string in EvalMetadataSchema to uniquely identify evaluation runs.
- Updated the description to clarify the purpose of the run_id field.

Add run_id field to EvalMetadata for unique run identification
- Added run_id as an optional string to the EvalMetadata class to uniquely identify groups of evaluation rows.
- Updated the field description to clarify its purpose in relation to evaluation tests.

Fix evaluation result assignment in markdown highlighting test
- Updated the test_markdown_highlighting_evaluation function to assign the evaluation result directly to the row when no assistant response is found, ensuring proper handling of evaluation results.

Add run_id generation in evaluation_test for unique identification
- Integrated the generate_id function to create a run_id within the evaluation_test function.
- Passed the generated run_id to the evaluation function, ensuring unique identification of evaluation runs.
1 parent b9c88a1 commit 67b03c6

12 files changed

Lines changed: 30 additions & 14 deletions

File tree

eval_protocol/models.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,14 @@ class EvalMetadata(BaseModel):
214214
status: Optional[Literal["running", "finished", "error", "stopped"]] = Field(
215215
None, description="Status of the evaluation"
216216
)
217+
run_id: Optional[str] = Field(
218+
None,
219+
description=(
220+
"Unique identifier for the run. A 'run' is a group of rows "
221+
"that were evaluated together in a single configuration of a @evaluation_test."
222+
" This means that running the same @evaluation_test with "
223+
),
224+
)
217225
num_runs: int = Field(..., description="Number of times the evaluation was repeated")
218226
aggregation_method: str = Field(..., description="Method used to aggregate scores across runs")
219227
threshold_of_success: Optional[float] = Field(None, description="Threshold score for test success")

eval_protocol/pytest/evaluation_test.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pytest
66

77
from eval_protocol.dataset_logger import default_logger
8+
from eval_protocol.human_id import generate_id
89
from eval_protocol.models import CompletionParams, EvalMetadata, EvaluationRow, InputMetadata
910
from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
1011
from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
@@ -191,6 +192,8 @@ def generate_combinations():
191192
# Create wrapper function with exact signature that pytest expects
192193
def create_wrapper_with_signature() -> Callable:
193194
# Create the function body that will be used
195+
run_id = generate_id()
196+
194197
def wrapper_body(**kwargs):
195198
model_name = kwargs["model"]
196199
eval_metadata = None
@@ -220,6 +223,7 @@ def wrapper_body(**kwargs):
220223
aggregation_method=aggregation_method,
221224
threshold_of_success=threshold_of_success,
222225
passed=None,
226+
run_id=run_id,
223227
)
224228

225229
# Populate completion_params in input_metadata for all rows and initialize eval_metadata BEFORE rollouts

eval_protocol/utils/logs_server.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@ async def connect(self, websocket: WebSocket):
4040
logger.info(f"WebSocket connected. Total connections: {connection_count}")
4141
logs = default_logger.read()
4242
await websocket.send_text(
43-
json.dumps({"type": "initialize_logs", "logs": [log.model_dump_json(exclude_none=True) for log in logs]})
43+
json.dumps(
44+
{"type": "initialize_logs", "logs": [log.model_dump(exclude_none=True, mode="json") for log in logs]}
45+
)
4446
)
4547

4648
def disconnect(self, websocket: WebSocket):
@@ -57,7 +59,7 @@ def broadcast_row_upserted(self, row: "EvaluationRow"):
5759
"""
5860
try:
5961
# Serialize pydantic model
60-
json_message = json.dumps({"type": "log", "row": json.loads(row.model_dump_json(exclude_none=True))})
62+
json_message = json.dumps({"type": "log", "row": row.model_dump(exclude_none=True, mode="json")})
6163
# Queue the message for broadcasting in the main event loop
6264
self._broadcast_queue.put(json_message)
6365
except Exception as e:

tests/pytest/test_markdown_highlighting.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"""
66

77
import re
8-
from typing import Any, Dict, List, Optional
8+
from typing import Any, Dict, List
99

1010
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
1111
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
@@ -39,7 +39,8 @@ def test_markdown_highlighting_evaluation(row: EvaluationRow) -> EvaluationRow:
3939
assistant_response = row.messages[-1].content
4040

4141
if not assistant_response:
42-
return EvaluateResult(score=0.0, reason="❌ No assistant response found")
42+
row.evaluation_result = EvaluateResult(score=0.0, reason="❌ No assistant response found")
43+
return row
4344

4445
required_highlights = int(row.ground_truth)
4546

vite-app/dist/assets/index-D9iVTBbF.css

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vite-app/dist/assets/index-CmKCiozr.js.map renamed to vite-app/dist/assets/index-DiF_B1x_.js.map

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vite-app/dist/assets/index-Dp7ms4NJ.css

Lines changed: 0 additions & 1 deletion
This file was deleted.

vite-app/dist/index.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
66
<title>EP | Log Viewer</title>
77
<link rel="icon" href="/assets/favicon-BkAAWQga.png" />
8-
<script type="module" crossorigin src="/assets/index-CmKCiozr.js"></script>
9-
<link rel="stylesheet" crossorigin href="/assets/index-Dp7ms4NJ.css">
8+
<script type="module" crossorigin src="/assets/index-DiF_B1x_.js"></script>
9+
<link rel="stylesheet" crossorigin href="/assets/index-D9iVTBbF.css">
1010
</head>
1111
<body>
1212
<div id="root"></div>

vite-app/src/App.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ const App = observer(() => {
3939
);
4040
if (update.type === "initialize_logs") {
4141
const rows: EvaluationRow[] = update.logs.map((log) => {
42-
return EvaluationRowSchema.parse(JSON.parse(log));
42+
return EvaluationRowSchema.parse(log);
4343
});
4444
console.log("initialize_logs", rows);
4545
state.setDataset(rows);

0 commit comments

Comments (0)