Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions .github/workflows/e2e-smoke-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ on:
debug_mode:
description: 'Enable debug output'
required: false
default: 'false'
default: false
type: boolean

jobs:
Expand Down Expand Up @@ -143,16 +143,16 @@ jobs:
SUCCESS_RATE="${{ steps.run_test.outputs.success_rate }}"

echo "Test exit code: $TEST_EXIT_CODE"
echo "Threshold met (40%-60%): $THRESHOLD_MET"
echo "Lower bound met (≥40%): $LOWER_BOUND_MET"
echo "Threshold met (36%-60%): $THRESHOLD_MET"
echo "Lower bound met (≥36%): $LOWER_BOUND_MET"
echo "Upper bound met (≤60%): $UPPER_BOUND_MET"
echo "Success rate: $SUCCESS_RATE"

# Fail the job if tests didn't run successfully or thresholds weren't met
if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then
echo "❌ E2E smoke test FAILED"
echo " - Test execution failed (exit code: $TEST_EXIT_CODE)"
echo " - Success rate outside acceptable range (required: 40%-60%, actual: ${SUCCESS_RATE:-unknown})"
echo " - Success rate outside acceptable range (required: 36%-60%, actual: ${SUCCESS_RATE:-unknown})"
exit 1
elif [ "$TEST_EXIT_CODE" != "0" ]; then
echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
Expand All @@ -169,7 +169,7 @@ jobs:
if [ "$LOWER_BOUND_MET" != "1" ]; then
echo "❌ E2E smoke test FAILED - success rate too low"
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
echo " - Required: ≥40%"
echo " - Required: ≥36%"
elif [ "$UPPER_BOUND_MET" != "1" ]; then
echo "❌ E2E smoke test FAILED - success rate suspiciously high"
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
Expand All @@ -178,11 +178,11 @@ jobs:
else
echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
echo " - Required range: 40%-60%"
echo " - Required range: 36%-60%"
fi
exit 1
else
echo "✅ E2E smoke test PASSED"
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
echo " - Within acceptable range: 40%-60%"
echo " - Within acceptable range: 36%-60%"
fi
18 changes: 15 additions & 3 deletions eval_protocol/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ class EvaluateResult(BaseModel):
error (Optional[str]): Optional error message if evaluation failed.
trajectory_info (Optional[Dict[str, Any]]): Additional trajectory-level information.
final_control_plane_info (Optional[Dict[str, Any]]): The final control plane state that led to termination.
agg_score (Optional[float]): The aggregated score of the evaluation across all runs.
standard_error (Optional[float]): The standard error of the evaluation across all runs.
"""

score: float = Field(..., description="The overall evaluation score, typically between 0.0 and 1.0.")
Expand Down Expand Up @@ -148,6 +150,16 @@ class EvaluateResult(BaseModel):
default=None, description="The final control plane state that led to termination."
)

agg_score: Optional[float] = Field(
default=None,
description="The aggregated score of the evaluation across all runs.",
)

standard_error: Optional[float] = Field(
default=None,
description="The standard error of the evaluation across all runs.",
)

def __getitem__(self, key: str) -> Any:
if key in self.__fields__: # Changed to __fields__
value = getattr(self, key)
Expand Down Expand Up @@ -213,14 +225,14 @@ class EvaluationThreshold(BaseModel):
"""Threshold configuration for evaluation tests.

The success field is required - tests must specify a minimum success rate.
The standard_deviation field is optional - if provided, tests must also meet the maximum standard deviation requirement.
The standard_error field is optional - if provided, tests must also meet the maximum standard error requirement.
"""

success: float = Field(
..., description="Minimum success rate threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
)
standard_deviation: Optional[float] = Field(
None, description="Maximum standard deviation threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
standard_error: Optional[float] = Field(
None, description="Maximum standard error threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
)


Expand Down
33 changes: 21 additions & 12 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,8 @@ def evaluation_test( # noqa: C901
rollout_processor_kwargs: Kwargs for the rollout processor.
aggregation_method: How to aggregate scores across rows.
passed_threshold: Threshold configuration for test success. Must be a float or EvaluationThreshold object.
Success rate must be above success, and if set, standard deviation must be below standard_deviation.
Success rate must be above success, and if set, standard error must be below standard_error.
Success rate +/- one standard_error approximates a 68% confidence interval.
num_runs: Number of times to repeat the rollout and evaluations.
max_dataset_rows: Limit dataset to the first N rows.
mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
Expand Down Expand Up @@ -402,7 +403,9 @@ async def _execute_with_semaphore(row):
):
tasks.append(asyncio.create_task(_execute_with_semaphore(row)))

all_results[i] = await asyncio.gather(*tasks)
results = await asyncio.gather(*tasks)

all_results[i] = results

else:
# Batch mode: collect all results first, then evaluate (no pipelining)
Expand Down Expand Up @@ -436,20 +439,24 @@ async def _execute_with_semaphore(row):
)
all_results[i] = results

for r in results:
if r.eval_metadata is not None:
r.eval_metadata.status = "finished"
active_logger.log(r)

scores = [
sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
for result in all_results
]
agg_score = aggregate(scores, aggregation_method)
score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0

# Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
ci_low: float | None = None
ci_high: float | None = None
if aggregation_method == "mean":
try:
result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist])
mu_ci_low, mu_ci_high = result_ci[1], result_ci[2]
_, mu_ci_low, mu_ci_high, standard_error = result_ci
if mu_ci_low is not None and mu_ci_high is not None:
ci_low = float(mu_ci_low)
ci_high = float(mu_ci_high)
Expand All @@ -462,21 +469,23 @@ async def _execute_with_semaphore(row):
passed = None

if threshold is not None:
success_passed, std_passed = True, True
success_passed, standard_error_passed = True, True

success_passed = agg_score >= threshold.success

if threshold.standard_deviation is not None:
std_passed = score_std <= threshold.standard_deviation
if threshold.standard_error is not None and standard_error is not None:
standard_error_passed = standard_error <= threshold.standard_error

passed = success_passed and std_passed
passed = success_passed and standard_error_passed

# Update eval metadata passed field for all results
for result in all_results:
for r in result:
if r.eval_metadata is not None:
r.eval_metadata.status = "finished"
r.eval_metadata.passed = passed
if r.evaluation_result is not None:
r.evaluation_result.agg_score = agg_score
r.evaluation_result.standard_error = standard_error
active_logger.log(r)

# Optional: print and/or persist a summary artifact for CI
Expand Down Expand Up @@ -593,9 +602,9 @@ async def _execute_with_semaphore(row):
assert agg_score >= threshold.success, (
f"Aggregated score {agg_score:.3f} below threshold {threshold.success}"
)
if threshold.standard_deviation is not None:
assert score_std <= threshold.standard_deviation, (
f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}"
if threshold.standard_error is not None and standard_error is not None:
assert standard_error <= threshold.standard_error, (
f"Standard error {standard_error:.3f} above threshold {threshold.standard_error}"
)

except AssertionError:
Expand Down
16 changes: 8 additions & 8 deletions eval_protocol/stats/confidence_intervals.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def compute_fixed_set_mu_ci(
rows: List[EvaluationRow],
*,
z_value: float = 1.96,
) -> Tuple[Optional[float], Optional[float], Optional[float]]:
) -> Tuple[Optional[float], Optional[float], Optional[float], Optional[float]]:
"""Compute the benchmark-conditional 95% CI for the mean accuracy μ on a fixed item set.

This treats questions/items as fixed and repeats as within-item Bernoulli draws.
Expand All @@ -53,10 +53,10 @@ def compute_fixed_set_mu_ci(
- Scores are taken from `row.evaluation_result.score` when available and numeric.

Returns:
(mu_hat, ci_low, ci_high). Returns (None, None, None) if insufficient data.
(mu_hat, ci_low, ci_high, standard_error). Returns (None, None, None, None) if insufficient data.
"""
if not rows:
return None, None, None
return None, None, None, None

# Group scores by question id
question_to_scores: Dict[str, List[float]] = defaultdict(list)
Expand All @@ -80,7 +80,7 @@ def compute_fixed_set_mu_ci(

Q = len(question_to_scores)
if Q == 0:
return None, None, None
return None, None, None, None

# Compute per-question means and the plug-in variance contribution
ybars: List[float] = []
Expand All @@ -99,16 +99,16 @@ def compute_fixed_set_mu_ci(
var_terms.append(ybar_i * (1.0 - ybar_i) / m_i)

if not ybars:
return None, None, None
return None, None, None, None

mu_hat = sum(ybars) / len(ybars)

# Standard error for CI of μ
se_sq = sum(var_terms) / (Q * Q)
se = math.sqrt(se_sq) if se_sq > 0.0 else 0.0
standard_error = math.sqrt(se_sq) if se_sq > 0.0 else 0.0

margin = z_value * se
margin = z_value * standard_error
ci_low = max(0.0, mu_hat - margin)
ci_high = min(1.0, mu_hat + margin)

return float(mu_hat), float(ci_low), float(ci_high)
return float(mu_hat), float(ci_low), float(ci_high), float(standard_error)
2 changes: 1 addition & 1 deletion tests/pytest/test_tau_bench_airline.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
}
],
rollout_processor=MCPGymRolloutProcessor(),
passed_threshold={"success": 0.4, "standard_deviation": 0.1},
passed_threshold={"success": 0.4, "standard_error": 0.02},
num_runs=8,
mode="pointwise",
max_concurrent_rollouts=50,
Expand Down
6 changes: 6 additions & 0 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,8 @@ def test_evaluate_result_dict_access():
"step_outputs",
"trajectory_info",
"final_control_plane_info",
"agg_score",
"standard_error",
}

# values() - check presence due to potential order variation of model_fields
Expand All @@ -232,6 +234,8 @@ def test_evaluate_result_dict_access():
("step_outputs", None),
("trajectory_info", None),
("final_control_plane_info", None),
("agg_score", None),
("standard_error", None),
]
)
# result.items() returns a list of tuples, so convert to list then sort.
Expand All @@ -250,6 +254,8 @@ def test_evaluate_result_dict_access():
"step_outputs",
"trajectory_info",
"final_control_plane_info",
"agg_score",
"standard_error",
}


Expand Down
21 changes: 21 additions & 0 deletions vendor/tau2/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""Package init for vendor.tau2: configures loguru logging at import time.

The log level is resolved from the ``TAU2_LOG_LEVEL`` environment variable
when present, otherwise from the tau2 config default.
"""

import os
import sys

from loguru import logger

from vendor.tau2.config import DEFAULT_LOG_LEVEL

# Drop loguru's pre-installed handler so messages are not emitted twice.
logger.remove()

# Environment variable wins; fall back to the tau2 config default otherwise.
_level = os.environ.get("TAU2_LOG_LEVEL")
if _level is None:
    _level = DEFAULT_LOG_LEVEL

# Install a single stderr sink at the resolved level.
logger.add(
    sys.stderr,
    level=_level,
    format="{time:YYYY-MM-DD HH:mm:ss.SSS} | {level:<8} | {name}:{function}:{line} - {message}",
)
2 changes: 1 addition & 1 deletion vendor/tau2/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
DEFAULT_MAX_CONCURRENCY = 3
DEFAULT_NUM_TRIALS = 1
DEFAULT_SAVE_TO = None
DEFAULT_LOG_LEVEL = "ERROR"
DEFAULT_LOG_LEVEL = "WARNING"

# LLM
DEFAULT_AGENT_IMPLEMENTATION = "llm_agent"
Expand Down
Loading