From e52ea9b0bb47f89ebd8917d4b510d541b5071cf6 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Fri, 15 Aug 2025 14:22:18 -0700 Subject: [PATCH 1/4] Adding Standard Error --- .github/workflows/e2e-smoke-test.yml | 14 ++++++------- eval_protocol/models.py | 18 ++++++++++++++--- eval_protocol/pytest/evaluation_test.py | 22 ++++++++++++--------- eval_protocol/stats/confidence_intervals.py | 18 ++++++++--------- tests/pytest/test_tau_bench_airline.py | 2 +- vendor/tau2/__init__.py | 20 +++++++++++++++++++ vendor/tau2/config.py | 2 +- 7 files changed, 65 insertions(+), 31 deletions(-) diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml index ec91875c..2d160ac1 100644 --- a/.github/workflows/e2e-smoke-test.yml +++ b/.github/workflows/e2e-smoke-test.yml @@ -9,7 +9,7 @@ on: debug_mode: description: 'Enable debug output' required: false - default: 'false' + default: false type: boolean jobs: @@ -143,8 +143,8 @@ jobs: SUCCESS_RATE="${{ steps.run_test.outputs.success_rate }}" echo "Test exit code: $TEST_EXIT_CODE" - echo "Threshold met (40%-60%): $THRESHOLD_MET" - echo "Lower bound met (≥40%): $LOWER_BOUND_MET" + echo "Threshold met (36%-60%): $THRESHOLD_MET" + echo "Lower bound met (≥36%): $LOWER_BOUND_MET" echo "Upper bound met (≤60%): $UPPER_BOUND_MET" echo "Success rate: $SUCCESS_RATE" @@ -152,7 +152,7 @@ jobs: if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then echo "❌ E2E smoke test FAILED" echo " - Test execution failed (exit code: $TEST_EXIT_CODE)" - echo " - Success rate outside acceptable range (required: 40%-60%, actual: ${SUCCESS_RATE:-unknown})" + echo " - Success rate outside acceptable range (required: 36%-60%, actual: ${SUCCESS_RATE:-unknown})" exit 1 elif [ "$TEST_EXIT_CODE" != "0" ]; then echo "⚠️ E2E smoke test had test execution issues but may have met thresholds" @@ -169,7 +169,7 @@ jobs: if [ "$LOWER_BOUND_MET" != "1" ]; then echo "❌ E2E smoke test FAILED - success rate too low" echo " - Success rate: ${SUCCESS_RATE:-unknown}" - echo " - Required: ≥40%" + echo " - Required: ≥36%" elif [ "$UPPER_BOUND_MET" != "1" ]; then echo "❌ E2E smoke test FAILED - success rate suspiciously high" echo " - Success rate: ${SUCCESS_RATE:-unknown}" @@ -178,11 +178,11 @@ jobs: else echo "❌ E2E smoke test FAILED - success rate outside acceptable range" echo " - Success rate: ${SUCCESS_RATE:-unknown}" - echo " - Required range: 40%-60%" + echo " - Required range: 36%-60%" fi exit 1 else echo "✅ E2E smoke test PASSED" echo " - Success rate: ${SUCCESS_RATE:-unknown}" - echo " - Within acceptable range: 40%-60%" + echo " - Within acceptable range: 36%-60%" fi diff --git a/eval_protocol/models.py b/eval_protocol/models.py index 3f4391fa..9d528289 100644 --- a/eval_protocol/models.py +++ b/eval_protocol/models.py @@ -117,6 +117,8 @@ class EvaluateResult(BaseModel): error (Optional[str]): Optional error message if evaluation failed. trajectory_info (Optional[Dict[str, Any]]): Additional trajectory-level information. final_control_plane_info (Optional[Dict[str, Any]]): The final control plane state that led to termination. + agg_score (Optional[float]): The aggregated score of the evaluation across all runs. + standard_error (Optional[float]): The standard error of the evaluation across all runs. """ score: float = Field(..., description="The overall evaluation score, typically between 0.0 and 1.0.") @@ -148,6 +150,16 @@ class EvaluateResult(BaseModel): default=None, description="The final control plane state that led to termination." ) + agg_score: Optional[float] = Field( + default=None, + description="The aggregated score of the evaluation across all runs.", + ) + + standard_error: Optional[float] = Field( + default=None, + description="The standard error of the evaluation across all runs.", + ) + def __getitem__(self, key: str) -> Any: if key in self.__fields__: # Changed to __fields__ value = getattr(self, key) @@ -213,14 +225,14 @@ class EvaluationThreshold(BaseModel): """Threshold configuration for evaluation tests. The success field is required - tests must specify a minimum success rate. - The standard_deviation field is optional - if provided, tests must also meet the maximum standard deviation requirement. + The standard_error field is optional - if provided, tests must also meet the maximum standard error requirement. """ success: float = Field( ..., description="Minimum success rate threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0 ) - standard_deviation: Optional[float] = Field( - None, description="Maximum standard deviation threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0 + standard_error: Optional[float] = Field( + None, description="Maximum standard error threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0 ) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 38f66d54..f9a1d566 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -128,7 +128,8 @@ def evaluation_test( # noqa: C901 rollout_processor_kwargs: Kwargs for the rollout processor. aggregation_method: How to aggregate scores across rows. passed_threshold: Threshold configuration for test success. Must be a float or EvaluationThreshold object. - Success rate must be above success, and if set, standard deviation must be below standard_deviation. + Success rate must be above success, and if set, standard error must be below standard_error. + Success rate +/- one standard_error is equivalent to 68% confidence interval. num_runs: Number of times to repeat the rollout and evaluations. max_dataset_rows: Limit dataset to the first N rows. mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema @@ -436,12 +437,14 @@ async def _execute_with_semaphore(row): ) all_results[i] = results + for r in results: + r.eval_metadata.status = "finished" + scores = [ sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result) for result in all_results ] agg_score = aggregate(scores, aggregation_method) - score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0 # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats) ci_low: float | None = None @@ -449,7 +452,7 @@ async def _execute_with_semaphore(row): if aggregation_method == "mean": try: result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist]) - mu_ci_low, mu_ci_high = result_ci[1], result_ci[2] + _, mu_ci_low, mu_ci_high, standard_error = result_ci if mu_ci_low is not None and mu_ci_high is not None: ci_low = float(mu_ci_low) ci_high = float(mu_ci_high) @@ -466,8 +469,8 @@ async def _execute_with_semaphore(row): success_passed = agg_score >= threshold.success - if threshold.standard_deviation is not None: - std_passed = score_std <= threshold.standard_deviation + if threshold.standard_error is not None: + std_passed = standard_error <= threshold.standard_error passed = success_passed and std_passed @@ -475,8 +478,9 @@ async def _execute_with_semaphore(row): for result in all_results: for r in result: if r.eval_metadata is not None: - r.eval_metadata.status = "finished" r.eval_metadata.passed = passed + r.evaluation_result.agg_score = agg_score + r.evaluation_result.standard_error = standard_error active_logger.log(r) # Optional: print and/or persist a summary artifact for CI @@ -593,10 +597,10 @@ async def _execute_with_semaphore(row): assert ( agg_score >= threshold.success ), f"Aggregated score {agg_score:.3f} below threshold {threshold.success}" - if threshold.standard_deviation is not None: + if threshold.standard_error is not None: assert ( - score_std <= threshold.standard_deviation - ), f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}" + standard_error <= threshold.standard_error + ), f"Standard error {standard_error:.3f} above threshold {threshold.standard_error}" except AssertionError: _log_eval_error("finished", data if "data" in locals() else None, passed=False) diff --git a/eval_protocol/stats/confidence_intervals.py b/eval_protocol/stats/confidence_intervals.py index bf78934c..85943152 100644 --- a/eval_protocol/stats/confidence_intervals.py +++ b/eval_protocol/stats/confidence_intervals.py @@ -36,7 +36,7 @@ def compute_fixed_set_mu_ci( rows: List[EvaluationRow], *, z_value: float = 1.96, -) -> Tuple[Optional[float], Optional[float], Optional[float]]: +) -> Tuple[Optional[float], Optional[float], Optional[float], Optional[float]]: """Compute the benchmark-conditional 95% CI for the mean accuracy μ on a fixed item set. This treats questions/items as fixed and repeats as within-item Bernoulli draws. @@ -53,10 +53,10 @@ def compute_fixed_set_mu_ci( - Scores are taken from `row.evaluation_result.score` when available and numeric. Returns: - (mu_hat, ci_low, ci_high). Returns (None, None, None) if insufficient data. + (mu_hat, ci_low, ci_high, standard_error). Returns (None, None, None, None) if insufficient data. """ if not rows: - return None, None, None + return None, None, None, None # Group scores by question id question_to_scores: Dict[str, List[float]] = defaultdict(list) @@ -80,7 +80,7 @@ def compute_fixed_set_mu_ci( Q = len(question_to_scores) if Q == 0: - return None, None, None + return None, None, None, None # Compute per-question means and the plug-in variance contribution ybars: List[float] = [] @@ -99,18 +99,16 @@ def compute_fixed_set_mu_ci( var_terms.append(ybar_i * (1.0 - ybar_i) / m_i) if not ybars: - return None, None, None + return None, None, None, None mu_hat = sum(ybars) / len(ybars) # Standard error for CI of μ se_sq = sum(var_terms) / (Q * Q) - se = math.sqrt(se_sq) if se_sq > 0.0 else 0.0 + standard_error = math.sqrt(se_sq) if se_sq > 0.0 else 0.0 - margin = z_value * se + margin = z_value * standard_error ci_low = max(0.0, mu_hat - margin) ci_high = min(1.0, mu_hat + margin) - return float(mu_hat), float(ci_low), float(ci_high) - - + return float(mu_hat), float(ci_low), float(ci_high), float(standard_error) diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py index f3a7c65f..7097108f 100644 --- a/tests/pytest/test_tau_bench_airline.py +++ b/tests/pytest/test_tau_bench_airline.py @@ -73,7 +73,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval } ], rollout_processor=MCPGymRolloutProcessor(), - passed_threshold={"success": 0.4, "standard_deviation": 0.1}, + passed_threshold={"success": 0.4, "standard_error": 0.1}, num_runs=8, mode="pointwise", max_concurrent_rollouts=50, diff --git a/vendor/tau2/__init__.py b/vendor/tau2/__init__.py index 8b137891..f1e6ad1f 100644 --- a/vendor/tau2/__init__.py +++ b/vendor/tau2/__init__.py @@ -1 +1,21 @@ +import os +import sys +from loguru import logger + +from vendor.tau2.config import DEFAULT_LOG_LEVEL + +# Remove default handler to avoid duplicate logs +logger.remove() + +# Get log level from environment variable, then tau2 config, then default to WARNING +log_level = os.environ.get("TAU2_LOG_LEVEL") +if log_level is None: + log_level = DEFAULT_LOG_LEVEL + +# Add handler with appropriate log level +logger.add( + sys.stderr, + level=log_level, + format="{time:YYYY-MM-DD HH:mm:ss.SSS} | {level:<8} | {name}:{function}:{line} - {message}", +) diff --git a/vendor/tau2/config.py b/vendor/tau2/config.py index 7be4ebb0..fb2f85e0 100644 --- a/vendor/tau2/config.py +++ b/vendor/tau2/config.py @@ -5,7 +5,7 @@ DEFAULT_MAX_CONCURRENCY = 3 DEFAULT_NUM_TRIALS = 1 DEFAULT_SAVE_TO = None -DEFAULT_LOG_LEVEL = "ERROR" +DEFAULT_LOG_LEVEL = "WARNING" # LLM DEFAULT_AGENT_IMPLEMENTATION = "llm_agent" From d2f132d708bca551fb86fc999509ef506f344484 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Fri, 15 Aug 2025 14:25:11 -0700 Subject: [PATCH 2/4] var name --- eval_protocol/pytest/evaluation_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index f9a1d566..84cda09a 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -465,14 +465,14 @@ async def _execute_with_semaphore(row): passed = None if threshold is not None: - success_passed, std_passed = True, True + success_passed, standard_error_passed = True, True success_passed = agg_score >= threshold.success if threshold.standard_error is not None: - std_passed = standard_error <= threshold.standard_error + standard_error_passed = standard_error <= threshold.standard_error - passed = success_passed and std_passed + passed = success_passed and standard_error_passed # Update eval metadata passed field for all results for result in all_results: From b06c608936cc1011cd556250f5c2eeb931be6a02 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Fri, 15 Aug 2025 14:46:45 -0700 Subject: [PATCH 3/4] bug --- eval_protocol/pytest/evaluation_test.py | 8 ++++++-- tests/pytest/test_tau_bench_airline.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 84cda09a..9e4216de 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -403,7 +403,9 @@ async def _execute_with_semaphore(row): ): tasks.append(asyncio.create_task(_execute_with_semaphore(row))) - all_results[i] = await asyncio.gather(*tasks) + results = await asyncio.gather(*tasks) + + all_results[i] = results else: # Batch mode: collect all results first, then evaluate (no pipelining) @@ -438,7 +440,9 @@ async def _execute_with_semaphore(row): all_results[i] = results for r in results: - r.eval_metadata.status = "finished" + if r.eval_metadata is not None: + r.eval_metadata.status = "finished" + active_logger.log(r) scores = [ sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result) diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py index 7097108f..b9c59c7a 100644 --- a/tests/pytest/test_tau_bench_airline.py +++ b/tests/pytest/test_tau_bench_airline.py @@ -73,7 +73,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval } ], rollout_processor=MCPGymRolloutProcessor(), - passed_threshold={"success": 0.4, "standard_error": 0.1}, + passed_threshold={"success": 0.4, "standard_error": 0.02}, num_runs=8, mode="pointwise", max_concurrent_rollouts=50, From 5829873b2eaa480f97c3bba29374bea98b6793c6 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Fri, 15 Aug 2025 14:58:13 -0700 Subject: [PATCH 4/4] bug --- eval_protocol/pytest/evaluation_test.py | 1 + tests/test_models.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 9e4216de..76868185 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -483,6 +483,7 @@ async def _execute_with_semaphore(row): for r in result: if r.eval_metadata is not None: r.eval_metadata.passed = passed + if r.evaluation_result is not None: r.evaluation_result.agg_score = agg_score r.evaluation_result.standard_error = standard_error active_logger.log(r) diff --git a/tests/test_models.py b/tests/test_models.py index 1358344b..8220a746 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -210,6 +210,8 @@ def test_evaluate_result_dict_access(): "step_outputs", "trajectory_info", "final_control_plane_info", + "agg_score", + "standard_error", } # values() - check presence due to potential order variation of model_fields @@ -232,6 +234,8 @@ def test_evaluate_result_dict_access(): ("step_outputs", None), ("trajectory_info", None), ("final_control_plane_info", None), + ("agg_score", None), + ("standard_error", None), ] ) # result.items() returns a list of tuples, so convert to list then sort. @@ -250,6 +254,8 @@ def test_evaluate_result_dict_access(): "step_outputs", "trajectory_info", "final_control_plane_info", + "agg_score", + "standard_error", }