@@ -63,7 +63,11 @@ def evaluation_test( # noqa: C901
6363
6464 Here are some key concepts to understand the terminology in EP:
6565
66- - "cohort" is a group of runs with a static set of parameters. A single
66+ - "invocation" is a single execution of a test function. An invocation can
67+ generate 1 or more cohorts. Grouping by invocation can be useful to
68+ aggregate eval scores across multiple cohorts, for example when you want
69+ to aggregate scores across multiple datasets.
70+ - "cohort" is a group of runs for a combination of parameters. A single
6771 cohort will have multiple runs if num_runs > 1.
6872 1. If your evaluation_test has combinations of parameters, it will generate
6973 multiple cohorts per combination of parameters.
@@ -85,8 +89,8 @@ def evaluation_test( # noqa: C901
8589 decorated test. It simply produces a score from 0 to 1 and attaches it
8690 to the row as the "evaluation_result" field.
8791
88- A "cohort", "run", "rollout", and "row" each have a unique ID which can be
89- used to easily group and identify them .
92+ "invocation", "cohort", "run", "rollout", and "row" each have a unique ID
93+ which can be used to easily group and identify your dataset.
9094
9195 Args:
9296 model: Model identifiers to query.
@@ -205,7 +209,7 @@ def generate_combinations():
205209 datasets = [[input_dataset ]] # type: ignore
206210 else :
207211 datasets = [None ]
208- params : List [Optional [RolloutInputParam ]] = rollout_input_params if rollout_input_params is not None else [None ] # type: ignore
212+ rips : List [Optional [RolloutInputParam ]] = rollout_input_params if rollout_input_params is not None else [None ] # type: ignore
209213 # Apply EP_MAX_DATASET_ROWS to input_messages, but do NOT parameterize over
210214 # each row. Instead, pass the entire sliced list through in a single test run
211215 # so summaries aggregate all rows together (AIME-style behavior).
@@ -224,15 +228,15 @@ def generate_combinations():
224228 # Generate all combinations
225229 for m in model :
226230 for ds in datasets :
227- for ip in params :
231+ for rip in rips :
228232 for im in messages :
229233 for etk in kwargs :
230234 # if no dataset and no messages, raise an error
231235 if ds is None and im is None :
232236 raise ValueError (
233237 "No dataset or messages provided. Please provide at least one of input_dataset or input_messages."
234238 )
235- combinations .append ((m , ds , ip , im , etk ))
239+ combinations .append ((m , ds , rip , im , etk ))
236240
237241 return combinations
238242
@@ -245,12 +249,12 @@ def generate_combinations():
245249 # Create parameter tuples for pytest.mark.parametrize
246250 param_tuples = []
247251 for combo in combinations :
248- model_name , dataset , params , messages , etk = combo
252+ model_name , dataset , rip , messages , etk = combo
249253 param_tuple = [model_name ]
250254 if input_dataset is not None :
251255 param_tuple .append (dataset )
252256 if rollout_input_params is not None :
253- param_tuple .append (params )
257+ param_tuple .append (rip )
254258 if input_messages is not None :
255259 param_tuple .append (messages )
256260 if evaluation_test_kwargs is not None :
@@ -271,13 +275,15 @@ def generate_combinations():
271275 # Create wrapper function with exact signature that pytest expects
272276 def create_wrapper_with_signature () -> Callable :
273277 # Create the function body that will be used
274- cohort_id = generate_id ()
278+ invocation_id = generate_id ()
275279
276280 def wrapper_body (** kwargs ):
277281 model_name = kwargs ["model" ]
278282 eval_metadata = None
279283 all_results : List [EvaluationRow ] = []
280284
285+ cohort_id = generate_id ()
286+
281287 def _log_eval_error (
282288 status : Literal ["finished" , "error" ], rows : Optional [List [EvaluationRow ]] | None , passed : bool
283289 ) -> None :
@@ -358,6 +364,7 @@ def _log_eval_error(
358364 # Initialize eval_metadata for each row
359365 row .eval_metadata = eval_metadata
360366 row .cohort_id = cohort_id
367+ row .invocation_id = invocation_id
361368
362369 # has to be done in the pytest main process since it's
363370 # used to determine whether this eval has stopped
0 commit comments