Adding pass thresholds for the AIME and GPQA benchmarks

xzrderek · xzrderek · commit e1062278895d · 2025-08-21T10:45:23.000-07:00
diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
@@ -72,7 +72,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
-    passed_threshold=None,
+    passed_threshold=0.8,
     num_runs=8,
     max_dataset_rows=2,
     max_concurrent_rollouts=4,
diff --git a/eval_protocol/benchmarks/test_gpqa.py b/eval_protocol/benchmarks/test_gpqa.py
@@ -96,7 +96,7 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
     ],
     rollout_processor=GPQAStripGTRolloutProcessor(),
     aggregation_method="mean",
-    passed_threshold=None,
+    passed_threshold=0.6,
     num_runs=8,
     mode="pointwise",
 )
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -81,14 +81,16 @@ def postprocess(
     if aggregation_method == "mean":
         try:
             result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist])
-            _, mu_ci_low, mu_ci_high, standard_error = result_ci
-            if mu_ci_low is not None and mu_ci_high is not None:
+            _, mu_ci_low, mu_ci_high, se = result_ci
+            if mu_ci_low is not None and mu_ci_high is not None and se is not None:
                 ci_low = float(mu_ci_low)
                 ci_high = float(mu_ci_high)
+                standard_error = float(se)
                 # Keep agg_score as-is (mean over scores). For equal repeats per question these match.
         except Exception:
             ci_low = None
             ci_high = None
+            standard_error = None
 
     # Determine if the evaluation passed based on threshold
     passed = None
@@ -127,9 +129,10 @@ def postprocess(
             "num_runs": num_runs,
             "rows": total_rows,
         }
-        if ci_low is not None and ci_high is not None:
+        if ci_low is not None and ci_high is not None and standard_error is not None:
             summary_obj["agg_ci_low"] = ci_low
             summary_obj["agg_ci_high"] = ci_high
+            summary_obj["standard_error"] = standard_error
 
         # Aggregate per-metric mean and 95% CI when available
         metrics_summary: Dict[str, Dict[str, float]] = {}
@@ -164,9 +167,9 @@ def postprocess(
         if metrics_summary:
             summary_obj["metrics_agg"] = metrics_summary
         if should_print:
-            if ci_low is not None and ci_high is not None:
+            if ci_low is not None and ci_high is not None and standard_error is not None:
                 print(
-                    f"EP Summary | suite={suite_name} model={model_used} agg={summary_obj['agg_score']:.3f} ci95=[{ci_low:.3f},{ci_high:.3f}] runs={num_runs} rows={total_rows}"
+                    f"EP Summary | suite={suite_name} model={model_used} agg={summary_obj['agg_score']:.3f} se={summary_obj['standard_error']:.3f} ci95=[{ci_low:.3f},{ci_high:.3f}] runs={num_runs} rows={total_rows}"
                 )
             else:
                 print(

Original file line number	Diff line number	Diff line change
`@@ -96,7 +96,7 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->`
`96`	`96`	`],`
`97`	`97`	`rollout_processor=GPQAStripGTRolloutProcessor(),`
`98`	`98`	`aggregation_method="mean",`
`99`		`- passed_threshold=None,`
	`99`	`+ passed_threshold=0.6,`
`100`	`100`	`num_runs=8,`
`101`	`101`	`mode="pointwise",`
`102`	`102`	`)`