Skip to content

Commit 2694ebd

Browse files
Commit 2694ebd (1 parent: e983aa6) — commit message: "updated model"

12 files changed: +34 additions, −31 deletions

eval_protocol/models.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,9 @@ class EvalMetadata(BaseModel):
231231
)
232232
num_runs: int = Field(..., description="Number of times the evaluation was repeated")
233233
aggregation_method: str = Field(..., description="Method used to aggregate scores across runs")
234-
threshold: Optional[EvaluationThreshold] = Field(None, description="Threshold configuration for test success")
234+
passed_threshold: Optional[EvaluationThreshold] = Field(
235+
None, description="Threshold configuration for test success"
236+
)
235237
passed: Optional[bool] = Field(None, description="Whether the evaluation passed based on the threshold")
236238

237239

eval_protocol/pytest/evaluation_test.py

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def evaluation_test( # noqa: C901
5353
rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
5454
evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None,
5555
aggregation_method: AggregationMethod = "mean",
56-
threshold: Optional[EvaluationThreshold] = None,
56+
passed_threshold: Optional[Union[EvaluationThreshold, float]] = None,
5757
num_runs: int = 1,
5858
max_dataset_rows: Optional[int] = None,
5959
mcp_config_path: Optional[str] = None,
@@ -113,7 +113,7 @@ def evaluation_test( # noqa: C901
113113
rollout_processor: Function used to perform the rollout.
114114
evaluation_test_kwargs: Kwargs for the evaluation function.
115115
aggregation_method: How to aggregate scores across rows.
116-
threshold: Threshold configuration for test success.
116+
passed_threshold: Threshold configuration for test success.
117117
Success rate must be above success, and if set, standard deviation must be below standard_deviation.
118118
num_runs: Number of times to repeat the rollout and evaluations.
119119
max_dataset_rows: Limit dataset to the first N rows.
@@ -129,11 +129,11 @@ def evaluation_test( # noqa: C901
129129
def decorator(
130130
test_func: TestFunction,
131131
):
132-
if threshold is not None:
133-
if isinstance(threshold, dict):
134-
evaluation_threshold = EvaluationThreshold(**threshold)
135-
elif isinstance(threshold, float):
136-
evaluation_threshold = EvaluationThreshold(success=threshold)
132+
if passed_threshold is not None:
133+
if isinstance(passed_threshold, float):
134+
threshold = EvaluationThreshold(success=passed_threshold)
135+
else:
136+
threshold = EvaluationThreshold(**passed_threshold)
137137

138138
sig = inspect.signature(test_func)
139139

@@ -361,7 +361,7 @@ def _log_eval_error(
361361
status="running",
362362
num_runs=num_runs,
363363
aggregation_method=aggregation_method,
364-
threshold=evaluation_threshold,
364+
passed_threshold=threshold,
365365
passed=None,
366366
)
367367

@@ -459,6 +459,7 @@ def _log_eval_error(
459459
sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
460460
for result in all_results
461461
]
462+
print(f"SCORES: {scores}")
462463
agg_score = aggregate(scores, aggregation_method)
463464
score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0
464465

@@ -495,13 +496,13 @@ def _log_eval_error(
495496
# Determine if the evaluation passed based on threshold
496497
passed = None
497498

498-
if evaluation_threshold is not None:
499+
if threshold is not None:
499500
success_passed, std_passed = True, True
500501

501-
success_passed = agg_score >= evaluation_threshold.success
502+
success_passed = agg_score >= threshold.success
502503

503-
if evaluation_threshold.standard_deviation is not None:
504-
std_passed = score_std <= evaluation_threshold.standard_deviation
504+
if threshold.standard_deviation is not None:
505+
std_passed = score_std <= threshold.standard_deviation
505506

506507
passed = success_passed and std_passed
507508

@@ -636,14 +637,14 @@ def _extract_effort_tag(params: dict) -> str | None:
636637
pass
637638

638639
# Check threshold after logging
639-
if evaluation_threshold is not None and not passed:
640+
if threshold is not None and not passed:
640641
assert (
641-
agg_score >= evaluation_threshold.success
642-
), f"Aggregated score {agg_score:.3f} below threshold {evaluation_threshold.success}"
643-
if evaluation_threshold.standard_deviation is not None:
642+
agg_score >= threshold.success
643+
), f"Aggregated score {agg_score:.3f} below threshold {threshold.success}"
644+
if threshold.standard_deviation is not None:
644645
assert (
645-
score_std <= evaluation_threshold.standard_deviation
646-
), f"Standard deviation {score_std:.3f} above threshold {evaluation_threshold.standard_deviation}"
646+
score_std <= threshold.standard_deviation
647+
), f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}"
647648

648649
except AssertionError:
649650
_log_eval_error("finished", data if "data" in locals() else None, passed=False)

tests/pytest/test_apps_coding.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
2828
dataset_adapter=apps_dataset_to_evaluation_row,
2929
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
3030
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
31-
threshold=0.33,
31+
passed_threshold=0.33,
3232
rollout_processor=default_single_turn_rollout_processor,
3333
num_runs=1,
3434
mode="pointwise",

tests/pytest/test_basic_coding.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
3030
dataset_adapter=coding_dataset_to_evaluation_row,
3131
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
3232
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
33-
threshold=0.8,
33+
passed_threshold=0.8,
3434
rollout_processor=default_single_turn_rollout_processor,
3535
num_runs=1,
3636
mode="pointwise",

tests/pytest/test_frozen_lake.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
4141
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
4242
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
4343
rollout_processor=default_mcp_gym_rollout_processor,
44-
threshold=0.66,
44+
passed_threshold=0.66,
4545
num_runs=1,
4646
max_concurrent_rollouts=3,
4747
mode="pointwise",

tests/pytest/test_hallucination.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
3535
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
3636
rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}],
3737
rollout_processor=default_single_turn_rollout_processor,
38-
threshold=0.33,
38+
passed_threshold=0.33,
3939
num_runs=1,
4040
mode="pointwise",
4141
)

tests/pytest/test_lunar_lander.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
4141
model=["gpt-4.1"],
4242
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
4343
rollout_processor=default_mcp_gym_rollout_processor,
44-
threshold=0.0,
44+
passed_threshold=0.0,
4545
num_runs=1,
4646
mode="pointwise",
4747
max_concurrent_rollouts=3,

tests/pytest/test_markdown_highlighting.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
3030
dataset_adapter=markdown_dataset_to_evaluation_row,
3131
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
3232
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
33-
threshold=0.5,
33+
passed_threshold=0.5,
3434
rollout_processor=default_single_turn_rollout_processor,
3535
num_runs=1,
3636
mode="pointwise",

tests/pytest/test_pytest_math_example.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
1212
rollout_input_params=[{"temperature": 0.0}],
1313
max_dataset_rows=5,
14-
threshold=0.0,
14+
passed_threshold=0.0,
1515
rollout_processor=default_single_turn_rollout_processor,
1616
mode="pointwise",
1717
evaluation_test_kwargs=[

tests/pytest/test_pytest_math_format_length.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
1515
rollout_input_params=[{"temperature": 0.0}],
1616
max_dataset_rows=5,
17-
threshold=0.0,
17+
passed_threshold=0.0,
1818
rollout_processor=default_single_turn_rollout_processor,
1919
mode="pointwise",
2020
evaluation_test_kwargs=[

Comments (0) — this commit has no comments.