Skip to content

Commit 0b637de

Browse files
committed
failing test
1 parent b1eaf1e commit 0b637de

2 files changed

Lines changed: 37 additions & 44 deletions

File tree

eval_protocol/pytest/evaluation_test.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -471,11 +471,10 @@ async def _execute_with_semaphore(row):
471471

472472
passed = success_passed and std_passed
473473

474-
# Update eval metadata status and passed field for all results
474+
# Update eval metadata passed field for all results
475475
for result in all_results:
476476
for r in result:
477477
if r.eval_metadata is not None:
478-
r.eval_metadata.status = "finished" # TODO: might not be needed
479478
r.eval_metadata.passed = passed
480479
active_logger.log(r)
481480

Lines changed: 36 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,6 @@
11
import os
22
from unittest.mock import Mock, patch
33

4-
import eval_protocol.dataset_logger as dataset_logger
5-
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
6-
from eval_protocol.dataset_logger.sqlite_evaluation_row_store import SqliteEvaluationRowStore
7-
from eval_protocol.models import EvaluationRow
8-
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
9-
from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row
10-
114

125
async def test_ensure_logging(monkeypatch):
136
"""
@@ -25,41 +18,42 @@ async def test_ensure_logging(monkeypatch):
2518
"eval_protocol.dataset_logger.sqlite_dataset_logger_adapter.SqliteEvaluationRowStore", return_value=mock_store
2619
):
2720
from eval_protocol.models import EvaluationRow
21+
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
2822
from eval_protocol.pytest.evaluation_test import evaluation_test
2923
from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row
3024

31-
@evaluation_test(
32-
input_dataset=[
33-
"tests/pytest/data/markdown_dataset.jsonl",
34-
],
35-
completion_params=[{"temperature": 0.0, "model": "dummy/local-model"}],
36-
dataset_adapter=markdown_dataset_to_evaluation_row,
37-
rollout_processor=NoOpRolloutProcessor(),
38-
mode="pointwise",
39-
combine_datasets=False,
40-
num_runs=2,
41-
# Don't pass logger parameter - let it use the default_logger (which we've replaced)
42-
)
43-
def eval_fn(row: EvaluationRow) -> EvaluationRow:
44-
return row
45-
46-
await eval_fn(
47-
dataset_path=["tests/pytest/data/markdown_dataset.jsonl"],
48-
completion_params={"temperature": 0.0, "model": "dummy/local-model"},
49-
)
50-
51-
# Verify that the store's upsert_row method was called
52-
assert mock_store.upsert_row.called, "SqliteEvaluationRowStore.upsert_row should have been called"
53-
54-
# Check that it was called multiple times (once for each row)
55-
call_count = mock_store.upsert_row.call_count
56-
assert call_count > 0, f"Expected upsert_row to be called at least once, but it was called {call_count} times"
57-
58-
# Verify the calls were made with proper data structure
59-
for call in mock_store.upsert_row.call_args_list:
60-
args, kwargs = call
61-
data = args[0] if args else kwargs.get("data")
62-
assert data is not None, "upsert_row should be called with data parameter"
63-
assert isinstance(data, dict), "data should be a dictionary"
64-
assert "execution_metadata" in data, "data should contain execution_metadata"
65-
assert "rollout_id" in data["execution_metadata"], "data should contain rollout_id in execution_metadata"
25+
@evaluation_test(
26+
input_dataset=[
27+
"tests/pytest/data/markdown_dataset.jsonl",
28+
],
29+
completion_params=[{"temperature": 0.0, "model": "dummy/local-model"}],
30+
dataset_adapter=markdown_dataset_to_evaluation_row,
31+
rollout_processor=NoOpRolloutProcessor(),
32+
mode="pointwise",
33+
combine_datasets=False,
34+
num_runs=2,
35+
# Don't pass logger parameter - let it use the default_logger (which we've replaced)
36+
)
37+
def eval_fn(row: EvaluationRow) -> EvaluationRow:
38+
return row
39+
40+
await eval_fn(
41+
dataset_path=["tests/pytest/data/markdown_dataset.jsonl"],
42+
completion_params={"temperature": 0.0, "model": "dummy/local-model"},
43+
)
44+
45+
# Verify that the store's upsert_row method was called
46+
assert mock_store.upsert_row.called, "SqliteEvaluationRowStore.upsert_row should have been called"
47+
48+
# Check that it was called multiple times (once for each row)
49+
call_count = mock_store.upsert_row.call_count
50+
assert call_count > 0, f"Expected upsert_row to be called at least once, but it was called {call_count} times"
51+
52+
# Verify the calls were made with proper data structure
53+
for call in mock_store.upsert_row.call_args_list:
54+
args, kwargs = call
55+
data = args[0] if args else kwargs.get("data")
56+
assert data is not None, "upsert_row should be called with data parameter"
57+
assert isinstance(data, dict), "data should be a dictionary"
58+
assert "execution_metadata" in data, "data should contain execution_metadata"
59+
assert "rollout_id" in data["execution_metadata"], "data should contain rollout_id in execution_metadata"

0 commit comments

Comments
 (0)