1- import eval_protocol .pytest .evaluation_test as evaluation_test_module
1+ from typing import List
2+
3+ import eval_protocol .dataset_logger as dataset_logger
4+ from eval_protocol .dataset_logger .dataset_logger import DatasetLogger
25from eval_protocol .models import EvaluationRow
36from eval_protocol .pytest .default_no_op_rollout_process import default_no_op_rollout_processor
4- from eval_protocol .pytest .evaluation_test import evaluation_test as evaluation_decorator
57from tests .pytest .test_markdown_highlighting import markdown_dataset_to_evaluation_row
68
79
8- class InMemoryLogger :
10+ class InMemoryLogger ( DatasetLogger ) :
911 def __init__ (self ):
10- self ._rows = []
12+ self ._rows : dict [ str , EvaluationRow ] = {}
1113
12- def log (self , row ):
13- self ._rows .append (row )
14+ def log (self , row : EvaluationRow ):
15+ print (row .run_id , row .rollout_id )
16+ self ._rows [row .rollout_id ] = row
1417
1518 def read (self ):
16- return list (self ._rows )
19+ return list (self ._rows .values ())
20+
21+
22+ def test_evaluation_test_decorator (monkeypatch ):
23+ from eval_protocol .pytest .evaluation_test import evaluation_test
24+
25+ logger = InMemoryLogger ()
26+
27+ @evaluation_test (
28+ input_dataset = [
29+ "tests/pytest/data/markdown_dataset.jsonl" ,
30+ ],
31+ model = ["dummy/local-model" ],
32+ dataset_adapter = markdown_dataset_to_evaluation_row ,
33+ rollout_processor = default_no_op_rollout_processor ,
34+ mode = "pointwise" ,
35+ combine_datasets = False ,
36+ num_runs = 2 ,
37+ logger = logger ,
38+ )
39+ def eval_fn (row : EvaluationRow ) -> EvaluationRow :
40+ return row
41+
42+ dataset_paths = [
43+ "tests/pytest/data/markdown_dataset.jsonl" ,
44+ ]
45+
46+ # Manually invoke all parameter combinations within a single test
47+ for ds_path in dataset_paths :
48+ eval_fn (model = "dummy/local-model" , dataset_path = [ds_path ])
49+
50+ # Assertions on IDs generated by the decorator logic
51+ assert len (logger .read ()) == 38
1752
1853
1954def test_evaluation_test_decorator_ids_single (monkeypatch ):
20- # Use an in-memory logger to avoid sqlite side effects
2155 in_memory_logger = InMemoryLogger ()
22- monkeypatch .setattr (evaluation_test_module , "default_logger" , in_memory_logger , raising = False )
23-
2456 unique_run_ids = set ()
2557 unique_cohort_ids = set ()
2658 unique_rollout_ids = set ()
2759 unique_invocation_ids = set ()
2860 unique_row_ids = set ()
2961
30- @evaluation_decorator (
62+ from eval_protocol .pytest .evaluation_test import evaluation_test
63+
64+ @evaluation_test (
3165 input_dataset = [
3266 "tests/pytest/data/markdown_dataset.jsonl" ,
3367 "tests/pytest/data/markdown_dataset.jsonl" ,
@@ -39,6 +73,7 @@ def test_evaluation_test_decorator_ids_single(monkeypatch):
3973 mode = "pointwise" ,
4074 combine_datasets = False ,
4175 num_runs = 5 ,
76+ logger = InMemoryLogger (),
4277 )
4378 def eval_fn (row : EvaluationRow ) -> EvaluationRow :
4479 unique_run_ids .add (row .run_id )
0 commit comments