Skip to content

Commit e52ea9b

Browse files
committed
Adding Standard Error
1 parent 58d8409 commit e52ea9b

File tree

7 files changed

+65
-31
lines changed

7 files changed

+65
-31
lines changed

.github/workflows/e2e-smoke-test.yml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ on:
99
debug_mode:
1010
description: 'Enable debug output'
1111
required: false
12-
default: 'false'
12+
default: false
1313
type: boolean
1414

1515
jobs:
@@ -143,16 +143,16 @@ jobs:
143143
SUCCESS_RATE="${{ steps.run_test.outputs.success_rate }}"
144144
145145
echo "Test exit code: $TEST_EXIT_CODE"
146-
echo "Threshold met (40%-60%): $THRESHOLD_MET"
147-
echo "Lower bound met (≥40%): $LOWER_BOUND_MET"
146+
echo "Threshold met (36%-60%): $THRESHOLD_MET"
147+
echo "Lower bound met (≥36%): $LOWER_BOUND_MET"
148148
echo "Upper bound met (≤60%): $UPPER_BOUND_MET"
149149
echo "Success rate: $SUCCESS_RATE"
150150
151151
# Fail the job if tests didn't run successfully or thresholds weren't met
152152
if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then
153153
echo "❌ E2E smoke test FAILED"
154154
echo " - Test execution failed (exit code: $TEST_EXIT_CODE)"
155-
echo " - Success rate outside acceptable range (required: 40%-60%, actual: ${SUCCESS_RATE:-unknown})"
155+
echo " - Success rate outside acceptable range (required: 36%-60%, actual: ${SUCCESS_RATE:-unknown})"
156156
exit 1
157157
elif [ "$TEST_EXIT_CODE" != "0" ]; then
158158
echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
@@ -169,7 +169,7 @@ jobs:
169169
if [ "$LOWER_BOUND_MET" != "1" ]; then
170170
echo "❌ E2E smoke test FAILED - success rate too low"
171171
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
172-
echo " - Required: ≥40%"
172+
echo " - Required: ≥36%"
173173
elif [ "$UPPER_BOUND_MET" != "1" ]; then
174174
echo "❌ E2E smoke test FAILED - success rate suspiciously high"
175175
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
@@ -178,11 +178,11 @@ jobs:
178178
else
179179
echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
180180
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
181-
echo " - Required range: 40%-60%"
181+
echo " - Required range: 36%-60%"
182182
fi
183183
exit 1
184184
else
185185
echo "✅ E2E smoke test PASSED"
186186
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
187-
echo " - Within acceptable range: 40%-60%"
187+
echo " - Within acceptable range: 36%-60%"
188188
fi

eval_protocol/models.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,8 @@ class EvaluateResult(BaseModel):
117117
error (Optional[str]): Optional error message if evaluation failed.
118118
trajectory_info (Optional[Dict[str, Any]]): Additional trajectory-level information.
119119
final_control_plane_info (Optional[Dict[str, Any]]): The final control plane state that led to termination.
120+
agg_score (Optional[float]): The aggregated score of the evaluation across all runs.
121+
standard_error (Optional[float]): The standard error of the evaluation across all runs.
120122
"""
121123

122124
score: float = Field(..., description="The overall evaluation score, typically between 0.0 and 1.0.")
@@ -148,6 +150,16 @@ class EvaluateResult(BaseModel):
148150
default=None, description="The final control plane state that led to termination."
149151
)
150152

153+
agg_score: Optional[float] = Field(
154+
default=None,
155+
description="The aggregated score of the evaluation across all runs.",
156+
)
157+
158+
standard_error: Optional[float] = Field(
159+
default=None,
160+
description="The standard error of the evaluation across all runs.",
161+
)
162+
151163
def __getitem__(self, key: str) -> Any:
152164
if key in self.__fields__: # Changed to __fields__
153165
value = getattr(self, key)
@@ -213,14 +225,14 @@ class EvaluationThreshold(BaseModel):
213225
"""Threshold configuration for evaluation tests.
214226
215227
The success field is required - tests must specify a minimum success rate.
216-
The standard_deviation field is optional - if provided, tests must also meet the maximum standard deviation requirement.
228+
The standard_error field is optional - if provided, tests must also meet the maximum standard error requirement.
217229
"""
218230

219231
success: float = Field(
220232
..., description="Minimum success rate threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
221233
)
222-
standard_deviation: Optional[float] = Field(
223-
None, description="Maximum standard deviation threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
234+
standard_error: Optional[float] = Field(
235+
None, description="Maximum standard error threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
224236
)
225237

226238

eval_protocol/pytest/evaluation_test.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,8 @@ def evaluation_test( # noqa: C901
128128
rollout_processor_kwargs: Kwargs for the rollout processor.
129129
aggregation_method: How to aggregate scores across rows.
130130
passed_threshold: Threshold configuration for test success. Must be a float or EvaluationThreshold object.
131-
Success rate must be above success, and if set, standard deviation must be below standard_deviation.
131+
Success rate must be at or above success, and if set, standard error must be at or below standard_error.
132+
Success rate +/- one standard_error corresponds to an approximately 68% confidence interval.
132133
num_runs: Number of times to repeat the rollout and evaluations.
133134
max_dataset_rows: Limit dataset to the first N rows.
134135
mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
@@ -436,20 +437,22 @@ async def _execute_with_semaphore(row):
436437
)
437438
all_results[i] = results
438439

440+
for r in results:
441+
r.eval_metadata.status = "finished"
442+
439443
scores = [
440444
sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
441445
for result in all_results
442446
]
443447
agg_score = aggregate(scores, aggregation_method)
444-
score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0
445448

446449
# Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
447450
ci_low: float | None = None
448451
ci_high: float | None = None
449452
if aggregation_method == "mean":
450453
try:
451454
result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist])
452-
mu_ci_low, mu_ci_high = result_ci[1], result_ci[2]
455+
_, mu_ci_low, mu_ci_high, standard_error = result_ci
453456
if mu_ci_low is not None and mu_ci_high is not None:
454457
ci_low = float(mu_ci_low)
455458
ci_high = float(mu_ci_high)
@@ -466,17 +469,18 @@ async def _execute_with_semaphore(row):
466469

467470
success_passed = agg_score >= threshold.success
468471

469-
if threshold.standard_deviation is not None:
470-
std_passed = score_std <= threshold.standard_deviation
472+
if threshold.standard_error is not None:
473+
std_passed = standard_error <= threshold.standard_error
471474

472475
passed = success_passed and std_passed
473476

474477
# Update eval metadata passed field for all results
475478
for result in all_results:
476479
for r in result:
477480
if r.eval_metadata is not None:
478-
r.eval_metadata.status = "finished"
479481
r.eval_metadata.passed = passed
482+
r.evaluation_result.agg_score = agg_score
483+
r.evaluation_result.standard_error = standard_error
480484
active_logger.log(r)
481485

482486
# Optional: print and/or persist a summary artifact for CI
@@ -593,10 +597,10 @@ async def _execute_with_semaphore(row):
593597
assert (
594598
agg_score >= threshold.success
595599
), f"Aggregated score {agg_score:.3f} below threshold {threshold.success}"
596-
if threshold.standard_deviation is not None:
600+
if threshold.standard_error is not None:
597601
assert (
598-
score_std <= threshold.standard_deviation
599-
), f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}"
602+
standard_error <= threshold.standard_error
603+
), f"Standard error {standard_error:.3f} above threshold {threshold.standard_error}"
600604

601605
except AssertionError:
602606
_log_eval_error("finished", data if "data" in locals() else None, passed=False)

eval_protocol/stats/confidence_intervals.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def compute_fixed_set_mu_ci(
3636
rows: List[EvaluationRow],
3737
*,
3838
z_value: float = 1.96,
39-
) -> Tuple[Optional[float], Optional[float], Optional[float]]:
39+
) -> Tuple[Optional[float], Optional[float], Optional[float], Optional[float]]:
4040
"""Compute the benchmark-conditional 95% CI for the mean accuracy μ on a fixed item set.
4141
4242
This treats questions/items as fixed and repeats as within-item Bernoulli draws.
@@ -53,10 +53,10 @@ def compute_fixed_set_mu_ci(
5353
- Scores are taken from `row.evaluation_result.score` when available and numeric.
5454
5555
Returns:
56-
(mu_hat, ci_low, ci_high). Returns (None, None, None) if insufficient data.
56+
(mu_hat, ci_low, ci_high, standard_error). Returns (None, None, None, None) if insufficient data.
5757
"""
5858
if not rows:
59-
return None, None, None
59+
return None, None, None, None
6060

6161
# Group scores by question id
6262
question_to_scores: Dict[str, List[float]] = defaultdict(list)
@@ -80,7 +80,7 @@ def compute_fixed_set_mu_ci(
8080

8181
Q = len(question_to_scores)
8282
if Q == 0:
83-
return None, None, None
83+
return None, None, None, None
8484

8585
# Compute per-question means and the plug-in variance contribution
8686
ybars: List[float] = []
@@ -99,18 +99,16 @@ def compute_fixed_set_mu_ci(
9999
var_terms.append(ybar_i * (1.0 - ybar_i) / m_i)
100100

101101
if not ybars:
102-
return None, None, None
102+
return None, None, None, None
103103

104104
mu_hat = sum(ybars) / len(ybars)
105105

106106
# Standard error for CI of μ
107107
se_sq = sum(var_terms) / (Q * Q)
108-
se = math.sqrt(se_sq) if se_sq > 0.0 else 0.0
108+
standard_error = math.sqrt(se_sq) if se_sq > 0.0 else 0.0
109109

110-
margin = z_value * se
110+
margin = z_value * standard_error
111111
ci_low = max(0.0, mu_hat - margin)
112112
ci_high = min(1.0, mu_hat + margin)
113113

114-
return float(mu_hat), float(ci_low), float(ci_high)
115-
116-
114+
return float(mu_hat), float(ci_low), float(ci_high), float(standard_error)

tests/pytest/test_tau_bench_airline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
7373
}
7474
],
7575
rollout_processor=MCPGymRolloutProcessor(),
76-
passed_threshold={"success": 0.4, "standard_deviation": 0.1},
76+
passed_threshold={"success": 0.4, "standard_error": 0.1},
7777
num_runs=8,
7878
mode="pointwise",
7979
max_concurrent_rollouts=50,

vendor/tau2/__init__.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,21 @@
1+
import os
2+
import sys
13

4+
from loguru import logger
5+
6+
from vendor.tau2.config import DEFAULT_LOG_LEVEL
7+
8+
# Remove default handler to avoid duplicate logs
9+
logger.remove()
10+
11+
# Get log level from the TAU2_LOG_LEVEL environment variable, falling back to the tau2 config DEFAULT_LOG_LEVEL
12+
log_level = os.environ.get("TAU2_LOG_LEVEL")
13+
if log_level is None:
14+
log_level = DEFAULT_LOG_LEVEL
15+
16+
# Add handler with appropriate log level
17+
logger.add(
18+
sys.stderr,
19+
level=log_level,
20+
format="{time:YYYY-MM-DD HH:mm:ss.SSS} | {level:<8} | {name}:{function}:{line} - {message}",
21+
)

vendor/tau2/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
DEFAULT_MAX_CONCURRENCY = 3
66
DEFAULT_NUM_TRIALS = 1
77
DEFAULT_SAVE_TO = None
8-
DEFAULT_LOG_LEVEL = "ERROR"
8+
DEFAULT_LOG_LEVEL = "WARNING"
99

1010
# LLM
1111
DEFAULT_AGENT_IMPLEMENTATION = "llm_agent"

0 commit comments

Comments
 (0)