Skip to content

Commit 67b03c6

Browse files
author
Dylan Huang
committed
vite build
use model_dump(mode="json")

Add run_id to EvalMetadataSchema for unique run identification
- Introduced run_id as an optional string in EvalMetadataSchema to uniquely identify evaluation runs.
- Updated the description to clarify the purpose of the run_id field.

Add run_id field to EvalMetadata for unique run identification
- Added run_id as an optional string to the EvalMetadata class to uniquely identify groups of evaluation rows.
- Updated the field description to clarify its purpose in relation to evaluation tests.

Fix evaluation result assignment in markdown highlighting test
- Updated the test_markdown_highlighting_evaluation function to assign the evaluation result directly to the row when no assistant response is found, ensuring proper handling of evaluation results.

Add run_id generation in evaluation_test for unique identification
- Integrated the generate_id function to create a run_id within the evaluation_test function.
- Passed the generated run_id to the evaluation function, ensuring unique identification of evaluation runs.
1 parent b9c88a1 commit 67b03c6

12 files changed

Lines changed: 30 additions & 14 deletions

File tree

eval_protocol/models.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,14 @@ class EvalMetadata(BaseModel):
214214
status: Optional[Literal["running", "finished", "error", "stopped"]] = Field(
215215
None, description="Status of the evaluation"
216216
)
217+
run_id: Optional[str] = Field(
218+
None,
219+
description=(
220+
"Unique identifier for the run. A 'run' is a group of rows "
221+
"that were evaluated together in a single configuration of a @evaluation_test."
222+
" This means that running the same @evaluation_test with "
223+
),
224+
)
217225
num_runs: int = Field(..., description="Number of times the evaluation was repeated")
218226
aggregation_method: str = Field(..., description="Method used to aggregate scores across runs")
219227
threshold_of_success: Optional[float] = Field(None, description="Threshold score for test success")

eval_protocol/pytest/evaluation_test.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pytest
66

77
from eval_protocol.dataset_logger import default_logger
8+
from eval_protocol.human_id import generate_id
89
from eval_protocol.models import CompletionParams, EvalMetadata, EvaluationRow, InputMetadata
910
from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
1011
from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
@@ -191,6 +192,8 @@ def generate_combinations():
191192
# Create wrapper function with exact signature that pytest expects
192193
def create_wrapper_with_signature() -> Callable:
193194
# Create the function body that will be used
195+
run_id = generate_id()
196+
194197
def wrapper_body(**kwargs):
195198
model_name = kwargs["model"]
196199
eval_metadata = None
@@ -220,6 +223,7 @@ def wrapper_body(**kwargs):
220223
aggregation_method=aggregation_method,
221224
threshold_of_success=threshold_of_success,
222225
passed=None,
226+
run_id=run_id,
223227
)
224228

225229
# Populate completion_params in input_metadata for all rows and initialize eval_metadata BEFORE rollouts

eval_protocol/utils/logs_server.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@ async def connect(self, websocket: WebSocket):
4040
logger.info(f"WebSocket connected. Total connections: {connection_count}")
4141
logs = default_logger.read()
4242
await websocket.send_text(
43-
json.dumps({"type": "initialize_logs", "logs": [log.model_dump_json(exclude_none=True) for log in logs]})
43+
json.dumps(
44+
{"type": "initialize_logs", "logs": [log.model_dump(exclude_none=True, mode="json") for log in logs]}
45+
)
4446
)
4547

4648
def disconnect(self, websocket: WebSocket):
@@ -57,7 +59,7 @@ def broadcast_row_upserted(self, row: "EvaluationRow"):
5759
"""
5860
try:
5961
# Serialize pydantic model
60-
json_message = json.dumps({"type": "log", "row": json.loads(row.model_dump_json(exclude_none=True))})
62+
json_message = json.dumps({"type": "log", "row": row.model_dump(exclude_none=True, mode="json")})
6163
# Queue the message for broadcasting in the main event loop
6264
self._broadcast_queue.put(json_message)
6365
except Exception as e:

tests/pytest/test_markdown_highlighting.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"""
66

77
import re
8-
from typing import Any, Dict, List, Optional
8+
from typing import Any, Dict, List
99

1010
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
1111
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
@@ -39,7 +39,8 @@ def test_markdown_highlighting_evaluation(row: EvaluationRow) -> EvaluationRow:
3939
assistant_response = row.messages[-1].content
4040

4141
if not assistant_response:
42-
return EvaluateResult(score=0.0, reason="❌ No assistant response found")
42+
row.evaluation_result = EvaluateResult(score=0.0, reason="❌ No assistant response found")
43+
return row
4344

4445
required_highlights = int(row.ground_truth)
4546

vite-app/dist/assets/index-D9iVTBbF.css

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vite-app/dist/assets/index-CmKCiozr.js.map renamed to vite-app/dist/assets/index-DiF_B1x_.js.map

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vite-app/dist/assets/index-Dp7ms4NJ.css

Lines changed: 0 additions & 1 deletion
This file was deleted.

vite-app/dist/index.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
66
<title>EP | Log Viewer</title>
77
<link rel="icon" href="/assets/favicon-BkAAWQga.png" />
8-
<script type="module" crossorigin src="/assets/index-CmKCiozr.js"></script>
9-
<link rel="stylesheet" crossorigin href="/assets/index-Dp7ms4NJ.css">
8+
<script type="module" crossorigin src="/assets/index-DiF_B1x_.js"></script>
9+
<link rel="stylesheet" crossorigin href="/assets/index-D9iVTBbF.css">
1010
</head>
1111
<body>
1212
<div id="root"></div>

vite-app/src/App.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ const App = observer(() => {
3939
);
4040
if (update.type === "initialize_logs") {
4141
const rows: EvaluationRow[] = update.logs.map((log) => {
42-
return EvaluationRowSchema.parse(JSON.parse(log));
42+
return EvaluationRowSchema.parse(log);
4343
});
4444
console.log("initialize_logs", rows);
4545
state.setDataset(rows);

0 commit comments

Comments (0)