From e52ea9b0bb47f89ebd8917d4b510d541b5071cf6 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Fri, 15 Aug 2025 14:22:18 -0700
Subject: [PATCH 1/4] Adding Standard Error

---
 .github/workflows/e2e-smoke-test.yml        | 14 ++++++-------
 eval_protocol/models.py                     | 18 ++++++++++++++---
 eval_protocol/pytest/evaluation_test.py     | 22 ++++++++++++---------
 eval_protocol/stats/confidence_intervals.py | 18 ++++++++---------
 tests/pytest/test_tau_bench_airline.py      |  2 +-
 vendor/tau2/__init__.py                     | 20 +++++++++++++++++++
 vendor/tau2/config.py                       |  2 +-
 7 files changed, 65 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml
index ec91875c..2d160ac1 100644
--- a/.github/workflows/e2e-smoke-test.yml
+++ b/.github/workflows/e2e-smoke-test.yml
@@ -9,7 +9,7 @@ on:
       debug_mode:
         description: 'Enable debug output'
         required: false
-        default: 'false'
+        default: false
         type: boolean
 
 jobs:
@@ -143,8 +143,8 @@ jobs:
           SUCCESS_RATE="${{ steps.run_test.outputs.success_rate }}"
 
           echo "Test exit code: $TEST_EXIT_CODE"
-          echo "Threshold met (40%-60%): $THRESHOLD_MET"
-          echo "Lower bound met (≥40%): $LOWER_BOUND_MET"
+          echo "Threshold met (36%-60%): $THRESHOLD_MET"
+          echo "Lower bound met (≥36%): $LOWER_BOUND_MET"
           echo "Upper bound met (≤60%): $UPPER_BOUND_MET"
           echo "Success rate: $SUCCESS_RATE"
 
@@ -152,7 +152,7 @@ jobs:
           if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then
             echo "❌ E2E smoke test FAILED"
             echo "   - Test execution failed (exit code: $TEST_EXIT_CODE)"
-            echo "   - Success rate outside acceptable range (required: 40%-60%, actual: ${SUCCESS_RATE:-unknown})"
+            echo "   - Success rate outside acceptable range (required: 36%-60%, actual: ${SUCCESS_RATE:-unknown})"
             exit 1
           elif [ "$TEST_EXIT_CODE" != "0" ]; then
             echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
@@ -169,7 +169,7 @@ jobs:
             if [ "$LOWER_BOUND_MET" != "1" ]; then
               echo "❌ E2E smoke test FAILED - success rate too low"
               echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
-              echo "   - Required: ≥40%"
+              echo "   - Required: ≥36%"
             elif [ "$UPPER_BOUND_MET" != "1" ]; then
               echo "❌ E2E smoke test FAILED - success rate suspiciously high"
               echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
@@ -178,11 +178,11 @@ jobs:
             else
               echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
               echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
-              echo "   - Required range: 40%-60%"
+              echo "   - Required range: 36%-60%"
             fi
             exit 1
           else
             echo "✅ E2E smoke test PASSED"
             echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
-            echo "   - Within acceptable range: 40%-60%"
+            echo "   - Within acceptable range: 36%-60%"
           fi
diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index 3f4391fa..9d528289 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -117,6 +117,8 @@ class EvaluateResult(BaseModel):
         error (Optional[str]): Optional error message if evaluation failed.
         trajectory_info (Optional[Dict[str, Any]]): Additional trajectory-level information.
         final_control_plane_info (Optional[Dict[str, Any]]): The final control plane state that led to termination.
+        agg_score (Optional[float]): The aggregated score of the evaluation across all runs.
+        standard_error (Optional[float]): The standard error of the evaluation across all runs.
     """
 
     score: float = Field(..., description="The overall evaluation score, typically between 0.0 and 1.0.")
@@ -148,6 +150,16 @@ class EvaluateResult(BaseModel):
         default=None, description="The final control plane state that led to termination."
     )
 
+    agg_score: Optional[float] = Field(
+        default=None,
+        description="The aggregated score of the evaluation across all runs.",
+    )
+
+    standard_error: Optional[float] = Field(
+        default=None,
+        description="The standard error of the evaluation across all runs.",
+    )
+
     def __getitem__(self, key: str) -> Any:
         if key in self.__fields__:  # Changed to __fields__
             value = getattr(self, key)
@@ -213,14 +225,14 @@ class EvaluationThreshold(BaseModel):
     """Threshold configuration for evaluation tests.
 
     The success field is required - tests must specify a minimum success rate.
-    The standard_deviation field is optional - if provided, tests must also meet the maximum standard deviation requirement.
+    The standard_error field is optional - if provided, tests must also meet the maximum standard error requirement.
     """
 
     success: float = Field(
         ..., description="Minimum success rate threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
     )
-    standard_deviation: Optional[float] = Field(
-        None, description="Maximum standard deviation threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
+    standard_error: Optional[float] = Field(
+        None, description="Maximum standard error threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
     )
 
 
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 38f66d54..f9a1d566 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -128,7 +128,8 @@ def evaluation_test(  # noqa: C901
         rollout_processor_kwargs: Kwargs for the rollout processor.
         aggregation_method: How to aggregate scores across rows.
         passed_threshold: Threshold configuration for test success. Must be a float or EvaluationThreshold object.
-            Success rate must be above success, and if set, standard deviation must be below standard_deviation.
+            Success rate must be above success, and if set, standard error must be below standard_error.
+            Success rate +/- one standard_error corresponds to an approximately 68% confidence interval.
         num_runs: Number of times to repeat the rollout and evaluations.
         max_dataset_rows: Limit dataset to the first N rows.
         mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
@@ -436,12 +437,14 @@ async def _execute_with_semaphore(row):
                                 )
                             all_results[i] = results
 
+                        for r in results:
+                            r.eval_metadata.status = "finished"
+
                     scores = [
                         sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
                         for result in all_results
                     ]
                     agg_score = aggregate(scores, aggregation_method)
-                    score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0
 
                     # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
                     ci_low: float | None = None
@@ -449,7 +452,7 @@ async def _execute_with_semaphore(row):
                     if aggregation_method == "mean":
                         try:
                             result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist])
-                            mu_ci_low, mu_ci_high = result_ci[1], result_ci[2]
+                            _, mu_ci_low, mu_ci_high, standard_error = result_ci
                             if mu_ci_low is not None and mu_ci_high is not None:
                                 ci_low = float(mu_ci_low)
                                 ci_high = float(mu_ci_high)
@@ -466,8 +469,8 @@ async def _execute_with_semaphore(row):
 
                         success_passed = agg_score >= threshold.success
 
-                        if threshold.standard_deviation is not None:
-                            std_passed = score_std <= threshold.standard_deviation
+                        if threshold.standard_error is not None:
+                            std_passed = standard_error <= threshold.standard_error
 
                         passed = success_passed and std_passed
 
@@ -475,8 +478,9 @@ async def _execute_with_semaphore(row):
                     for result in all_results:
                         for r in result:
                             if r.eval_metadata is not None:
-                                r.eval_metadata.status = "finished"
                                 r.eval_metadata.passed = passed
+                                r.evaluation_result.agg_score = agg_score
+                                r.evaluation_result.standard_error = standard_error
                             active_logger.log(r)
 
                     # Optional: print and/or persist a summary artifact for CI
@@ -593,10 +597,10 @@ async def _execute_with_semaphore(row):
                         assert (
                             agg_score >= threshold.success
                         ), f"Aggregated score {agg_score:.3f} below threshold {threshold.success}"
-                        if threshold.standard_deviation is not None:
+                        if threshold.standard_error is not None:
                             assert (
-                                score_std <= threshold.standard_deviation
-                            ), f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}"
+                                standard_error <= threshold.standard_error
+                            ), f"Standard error {standard_error:.3f} above threshold {threshold.standard_error}"
 
                 except AssertionError:
                     _log_eval_error("finished", data if "data" in locals() else None, passed=False)
diff --git a/eval_protocol/stats/confidence_intervals.py b/eval_protocol/stats/confidence_intervals.py
index bf78934c..85943152 100644
--- a/eval_protocol/stats/confidence_intervals.py
+++ b/eval_protocol/stats/confidence_intervals.py
@@ -36,7 +36,7 @@ def compute_fixed_set_mu_ci(
     rows: List[EvaluationRow],
     *,
     z_value: float = 1.96,
-) -> Tuple[Optional[float], Optional[float], Optional[float]]:
+) -> Tuple[Optional[float], Optional[float], Optional[float], Optional[float]]:
     """Compute the benchmark-conditional 95% CI for the mean accuracy μ on a fixed item set.
 
     This treats questions/items as fixed and repeats as within-item Bernoulli draws.
@@ -53,10 +53,10 @@ def compute_fixed_set_mu_ci(
     - Scores are taken from `row.evaluation_result.score` when available and numeric.
 
     Returns:
-        (mu_hat, ci_low, ci_high). Returns (None, None, None) if insufficient data.
+        (mu_hat, ci_low, ci_high, standard_error). Returns (None, None, None, None) if insufficient data.
     """
     if not rows:
-        return None, None, None
+        return None, None, None, None
 
     # Group scores by question id
     question_to_scores: Dict[str, List[float]] = defaultdict(list)
@@ -80,7 +80,7 @@ def compute_fixed_set_mu_ci(
 
     Q = len(question_to_scores)
     if Q == 0:
-        return None, None, None
+        return None, None, None, None
 
     # Compute per-question means and the plug-in variance contribution
     ybars: List[float] = []
@@ -99,18 +99,16 @@ def compute_fixed_set_mu_ci(
             var_terms.append(ybar_i * (1.0 - ybar_i) / m_i)
 
     if not ybars:
-        return None, None, None
+        return None, None, None, None
 
     mu_hat = sum(ybars) / len(ybars)
 
     # Standard error for CI of μ
     se_sq = sum(var_terms) / (Q * Q)
-    se = math.sqrt(se_sq) if se_sq > 0.0 else 0.0
+    standard_error = math.sqrt(se_sq) if se_sq > 0.0 else 0.0
 
-    margin = z_value * se
+    margin = z_value * standard_error
     ci_low = max(0.0, mu_hat - margin)
     ci_high = min(1.0, mu_hat + margin)
 
-    return float(mu_hat), float(ci_low), float(ci_high)
-
-
+    return float(mu_hat), float(ci_low), float(ci_high), float(standard_error)
diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py
index f3a7c65f..7097108f 100644
--- a/tests/pytest/test_tau_bench_airline.py
+++ b/tests/pytest/test_tau_bench_airline.py
@@ -73,7 +73,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
         }
     ],
     rollout_processor=MCPGymRolloutProcessor(),
-    passed_threshold={"success": 0.4, "standard_deviation": 0.1},
+    passed_threshold={"success": 0.4, "standard_error": 0.1},
     num_runs=8,
     mode="pointwise",
     max_concurrent_rollouts=50,
diff --git a/vendor/tau2/__init__.py b/vendor/tau2/__init__.py
index 8b137891..f1e6ad1f 100644
--- a/vendor/tau2/__init__.py
+++ b/vendor/tau2/__init__.py
@@ -1 +1,21 @@
+import os
+import sys
 
+from loguru import logger
+
+from vendor.tau2.config import DEFAULT_LOG_LEVEL
+
+# Remove default handler to avoid duplicate logs
+logger.remove()
+
+# Get log level from the TAU2_LOG_LEVEL environment variable, falling back to the tau2 config default (DEFAULT_LOG_LEVEL)
+log_level = os.environ.get("TAU2_LOG_LEVEL")
+if log_level is None:
+    log_level = DEFAULT_LOG_LEVEL
+
+# Add handler with appropriate log level
+logger.add(
+    sys.stderr,
+    level=log_level,
+    format="{time:YYYY-MM-DD HH:mm:ss.SSS} | {level:<8} | {name}:{function}:{line} - {message}",
+)
diff --git a/vendor/tau2/config.py b/vendor/tau2/config.py
index 7be4ebb0..fb2f85e0 100644
--- a/vendor/tau2/config.py
+++ b/vendor/tau2/config.py
@@ -5,7 +5,7 @@
 DEFAULT_MAX_CONCURRENCY = 3
 DEFAULT_NUM_TRIALS = 1
 DEFAULT_SAVE_TO = None
-DEFAULT_LOG_LEVEL = "ERROR"
+DEFAULT_LOG_LEVEL = "WARNING"
 
 # LLM
 DEFAULT_AGENT_IMPLEMENTATION = "llm_agent"

From d2f132d708bca551fb86fc999509ef506f344484 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Fri, 15 Aug 2025 14:25:11 -0700
Subject: [PATCH 2/4] Rename std_passed to standard_error_passed

---
 eval_protocol/pytest/evaluation_test.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index f9a1d566..84cda09a 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -465,14 +465,14 @@ async def _execute_with_semaphore(row):
                     passed = None
 
                     if threshold is not None:
-                        success_passed, std_passed = True, True
+                        success_passed, standard_error_passed = True, True
 
                         success_passed = agg_score >= threshold.success
 
                         if threshold.standard_error is not None:
-                            std_passed = standard_error <= threshold.standard_error
+                            standard_error_passed = standard_error <= threshold.standard_error
 
-                        passed = success_passed and std_passed
+                        passed = success_passed and standard_error_passed
 
                     # Update eval metadata passed field for all results
                     for result in all_results:

From b06c608936cc1011cd556250f5c2eeb931be6a02 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Fri, 15 Aug 2025 14:46:45 -0700
Subject: [PATCH 3/4] Guard against missing eval_metadata; tighten standard_error threshold

---
 eval_protocol/pytest/evaluation_test.py | 8 ++++++--
 tests/pytest/test_tau_bench_airline.py  | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 84cda09a..9e4216de 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -403,7 +403,9 @@ async def _execute_with_semaphore(row):
                             ):
                                 tasks.append(asyncio.create_task(_execute_with_semaphore(row)))
 
-                            all_results[i] = await asyncio.gather(*tasks)
+                            results = await asyncio.gather(*tasks)
+
+                            all_results[i] = results
 
                         else:
                             # Batch mode: collect all results first, then evaluate (no pipelining)
@@ -438,7 +440,9 @@ async def _execute_with_semaphore(row):
                             all_results[i] = results
 
                         for r in results:
-                            r.eval_metadata.status = "finished"
+                            if r.eval_metadata is not None:
+                                r.eval_metadata.status = "finished"
+                            active_logger.log(r)
 
                     scores = [
                         sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py
index 7097108f..b9c59c7a 100644
--- a/tests/pytest/test_tau_bench_airline.py
+++ b/tests/pytest/test_tau_bench_airline.py
@@ -73,7 +73,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
         }
     ],
     rollout_processor=MCPGymRolloutProcessor(),
-    passed_threshold={"success": 0.4, "standard_error": 0.1},
+    passed_threshold={"success": 0.4, "standard_error": 0.02},
     num_runs=8,
     mode="pointwise",
     max_concurrent_rollouts=50,

From 5829873b2eaa480f97c3bba29374bea98b6793c6 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Fri, 15 Aug 2025 14:58:13 -0700
Subject: [PATCH 4/4] Guard against missing evaluation_result; update model tests

---
 eval_protocol/pytest/evaluation_test.py | 1 +
 tests/test_models.py                    | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 9e4216de..76868185 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -483,6 +483,7 @@ async def _execute_with_semaphore(row):
                         for r in result:
                             if r.eval_metadata is not None:
                                 r.eval_metadata.passed = passed
+                            if r.evaluation_result is not None:
                                 r.evaluation_result.agg_score = agg_score
                                 r.evaluation_result.standard_error = standard_error
                             active_logger.log(r)
diff --git a/tests/test_models.py b/tests/test_models.py
index 1358344b..8220a746 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -210,6 +210,8 @@ def test_evaluate_result_dict_access():
         "step_outputs",
         "trajectory_info",
         "final_control_plane_info",
+        "agg_score",
+        "standard_error",
     }
 
     # values() - check presence due to potential order variation of model_fields
@@ -232,6 +234,8 @@ def test_evaluate_result_dict_access():
             ("step_outputs", None),
             ("trajectory_info", None),
             ("final_control_plane_info", None),
+            ("agg_score", None),
+            ("standard_error", None),
         ]
     )
     # result.items() returns a list of tuples, so convert to list then sort.
@@ -250,6 +254,8 @@ def test_evaluate_result_dict_access():
         "step_outputs",
         "trajectory_info",
         "final_control_plane_info",
+        "agg_score",
+        "standard_error",
     }