Skip to content

Commit c436eaf

Browse files
author
Dylan Huang
committed
fix excessive logging
1 parent 77f6318 commit c436eaf

File tree

2 files changed

+50
-12
lines changed

2 files changed

+50
-12
lines changed

eval_protocol/pytest/evaluation_test.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -374,7 +374,6 @@ def _log_eval_error(
374374
# has to be done in the pytest main process since it's
375375
# used to determine whether this eval has stopped
376376
row.pid = os.getpid()
377-
active_logger.log(row)
378377

379378
# Prepare rollout processor config once; we will generate fresh outputs per run
380379
config = RolloutProcessorConfig(
@@ -401,6 +400,10 @@ def _log_eval_error(
401400
for row in fresh_dataset:
402401
row.rollout_id = generate_id()
403402

403+
# log the fresh_dataset
404+
for row in fresh_dataset:
405+
active_logger.log(row)
406+
404407
processed_dataset = execute_function(rollout_processor, rows=fresh_dataset, config=config)
405408

406409
if mode == "pointwise":

tests/pytest/test_pytest_ids.py

Lines changed: 46 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,67 @@
1-
import eval_protocol.pytest.evaluation_test as evaluation_test_module
1+
from typing import List
2+
3+
import eval_protocol.dataset_logger as dataset_logger
4+
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
25
from eval_protocol.models import EvaluationRow
36
from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
4-
from eval_protocol.pytest.evaluation_test import evaluation_test as evaluation_decorator
57
from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row
68

79

8-
class InMemoryLogger:
10+
class InMemoryLogger(DatasetLogger):
911
def __init__(self):
10-
self._rows = []
12+
self._rows: dict[str, EvaluationRow] = {}
1113

12-
def log(self, row):
13-
self._rows.append(row)
14+
def log(self, row: EvaluationRow):
15+
print(row.run_id, row.rollout_id)
16+
self._rows[row.rollout_id] = row
1417

1518
def read(self):
16-
return list(self._rows)
19+
return list(self._rows.values())
20+
21+
22+
def test_evaluation_test_decorator(monkeypatch):
23+
from eval_protocol.pytest.evaluation_test import evaluation_test
24+
25+
logger = InMemoryLogger()
26+
27+
@evaluation_test(
28+
input_dataset=[
29+
"tests/pytest/data/markdown_dataset.jsonl",
30+
],
31+
model=["dummy/local-model"],
32+
dataset_adapter=markdown_dataset_to_evaluation_row,
33+
rollout_processor=default_no_op_rollout_processor,
34+
mode="pointwise",
35+
combine_datasets=False,
36+
num_runs=2,
37+
logger=logger,
38+
)
39+
def eval_fn(row: EvaluationRow) -> EvaluationRow:
40+
return row
41+
42+
dataset_paths = [
43+
"tests/pytest/data/markdown_dataset.jsonl",
44+
]
45+
46+
# Manually invoke all parameter combinations within a single test
47+
for ds_path in dataset_paths:
48+
eval_fn(model="dummy/local-model", dataset_path=[ds_path])
49+
50+
# Assertions on IDs generated by the decorator logic
51+
assert len(logger.read()) == 38
1752

1853

1954
def test_evaluation_test_decorator_ids_single(monkeypatch):
20-
# Use an in-memory logger to avoid sqlite side effects
2155
in_memory_logger = InMemoryLogger()
22-
monkeypatch.setattr(evaluation_test_module, "default_logger", in_memory_logger, raising=False)
23-
2456
unique_run_ids = set()
2557
unique_cohort_ids = set()
2658
unique_rollout_ids = set()
2759
unique_invocation_ids = set()
2860
unique_row_ids = set()
2961

30-
@evaluation_decorator(
62+
from eval_protocol.pytest.evaluation_test import evaluation_test
63+
64+
@evaluation_test(
3165
input_dataset=[
3266
"tests/pytest/data/markdown_dataset.jsonl",
3367
"tests/pytest/data/markdown_dataset.jsonl",
@@ -39,6 +73,7 @@ def test_evaluation_test_decorator_ids_single(monkeypatch):
3973
mode="pointwise",
4074
combine_datasets=False,
4175
num_runs=5,
76+
logger=InMemoryLogger(),
4277
)
4378
def eval_fn(row: EvaluationRow) -> EvaluationRow:
4479
unique_run_ids.add(row.run_id)

0 commit comments

Comments
 (0)