Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions .github/workflows/e2e-smoke-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ on:
debug_mode:
description: 'Enable debug output'
required: false
default: 'false'
default: false
type: boolean

jobs:
Expand Down Expand Up @@ -143,16 +143,16 @@ jobs:
SUCCESS_RATE="${{ steps.run_test.outputs.success_rate }}"

echo "Test exit code: $TEST_EXIT_CODE"
echo "Threshold met (40%-60%): $THRESHOLD_MET"
echo "Lower bound met (≥40%): $LOWER_BOUND_MET"
echo "Threshold met (36%-60%): $THRESHOLD_MET"
echo "Lower bound met (≥36%): $LOWER_BOUND_MET"
echo "Upper bound met (≤60%): $UPPER_BOUND_MET"
echo "Success rate: $SUCCESS_RATE"

# Fail the job if tests didn't run successfully or thresholds weren't met
if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then
echo "❌ E2E smoke test FAILED"
echo " - Test execution failed (exit code: $TEST_EXIT_CODE)"
echo " - Success rate outside acceptable range (required: 40%-60%, actual: ${SUCCESS_RATE:-unknown})"
echo " - Success rate outside acceptable range (required: 36%-60%, actual: ${SUCCESS_RATE:-unknown})"
exit 1
elif [ "$TEST_EXIT_CODE" != "0" ]; then
echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
Expand All @@ -169,7 +169,7 @@ jobs:
if [ "$LOWER_BOUND_MET" != "1" ]; then
echo "❌ E2E smoke test FAILED - success rate too low"
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
echo " - Required: ≥40%"
echo " - Required: ≥36%"
elif [ "$UPPER_BOUND_MET" != "1" ]; then
echo "❌ E2E smoke test FAILED - success rate suspiciously high"
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
Expand All @@ -178,11 +178,11 @@ jobs:
else
echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
echo " - Required range: 40%-60%"
echo " - Required range: 36%-60%"
fi
exit 1
else
echo "✅ E2E smoke test PASSED"
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
echo " - Within acceptable range: 40%-60%"
echo " - Within acceptable range: 36%-60%"
fi
18 changes: 15 additions & 3 deletions eval_protocol/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ class EvaluateResult(BaseModel):
error (Optional[str]): Optional error message if evaluation failed.
trajectory_info (Optional[Dict[str, Any]]): Additional trajectory-level information.
final_control_plane_info (Optional[Dict[str, Any]]): The final control plane state that led to termination.
agg_score (Optional[float]): The aggregated score of the evaluation across all runs.
standard_error (Optional[float]): The standard error of the evaluation across all runs.
"""

score: float = Field(..., description="The overall evaluation score, typically between 0.0 and 1.0.")
Expand Down Expand Up @@ -148,6 +150,16 @@ class EvaluateResult(BaseModel):
default=None, description="The final control plane state that led to termination."
)

agg_score: Optional[float] = Field(
default=None,
description="The aggregated score of the evaluation across all runs.",
)

standard_error: Optional[float] = Field(
default=None,
description="The standard error of the evaluation across all runs.",
)

def __getitem__(self, key: str) -> Any:
if key in self.__fields__: # Changed to __fields__
value = getattr(self, key)
Expand Down Expand Up @@ -213,14 +225,14 @@ class EvaluationThreshold(BaseModel):
"""Threshold configuration for evaluation tests.

The success field is required - tests must specify a minimum success rate.
The standard_deviation field is optional - if provided, tests must also meet the maximum standard deviation requirement.
The standard_error field is optional - if provided, tests must also meet the maximum standard error requirement.
"""

success: float = Field(
..., description="Minimum success rate threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
)
standard_deviation: Optional[float] = Field(
None, description="Maximum standard deviation threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
standard_error: Optional[float] = Field(
None, description="Maximum standard error threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
)


Expand Down
33 changes: 21 additions & 12 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,8 @@ def evaluation_test( # noqa: C901
rollout_processor_kwargs: Kwargs for the rollout processor.
aggregation_method: How to aggregate scores across rows.
passed_threshold: Threshold configuration for test success. Must be a float or EvaluationThreshold object.
Success rate must be above success, and if set, standard deviation must be below standard_deviation.
Success rate must be above success, and if set, standard error must be below standard_error.
Success rate +/- one standard_error approximates a 68% confidence interval.
num_runs: Number of times to repeat the rollout and evaluations.
max_dataset_rows: Limit dataset to the first N rows.
mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
Expand Down Expand Up @@ -402,7 +403,9 @@ async def _execute_with_semaphore(row):
):
tasks.append(asyncio.create_task(_execute_with_semaphore(row)))

all_results[i] = await asyncio.gather(*tasks)
results = await asyncio.gather(*tasks)

all_results[i] = results

else:
# Batch mode: collect all results first, then evaluate (no pipelining)
Expand Down Expand Up @@ -436,20 +439,24 @@ async def _execute_with_semaphore(row):
)
all_results[i] = results

for r in results:
if r.eval_metadata is not None:
r.eval_metadata.status = "finished"
active_logger.log(r)

scores = [
sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
for result in all_results
]
agg_score = aggregate(scores, aggregation_method)
score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0

# Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
ci_low: float | None = None
ci_high: float | None = None
if aggregation_method == "mean":
try:
result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist])
mu_ci_low, mu_ci_high = result_ci[1], result_ci[2]
_, mu_ci_low, mu_ci_high, standard_error = result_ci
if mu_ci_low is not None and mu_ci_high is not None:
ci_low = float(mu_ci_low)
ci_high = float(mu_ci_high)
Expand All @@ -462,21 +469,23 @@ async def _execute_with_semaphore(row):
passed = None

if threshold is not None:
success_passed, std_passed = True, True
success_passed, standard_error_passed = True, True

success_passed = agg_score >= threshold.success

if threshold.standard_deviation is not None:
std_passed = score_std <= threshold.standard_deviation
if threshold.standard_error is not None and standard_error is not None:
standard_error_passed = standard_error <= threshold.standard_error

passed = success_passed and std_passed
passed = success_passed and standard_error_passed

# Update eval metadata passed field for all results
for result in all_results:
for r in result:
if r.eval_metadata is not None:
r.eval_metadata.status = "finished"
r.eval_metadata.passed = passed
if r.evaluation_result is not None:
r.evaluation_result.agg_score = agg_score
r.evaluation_result.standard_error = standard_error
active_logger.log(r)

# Optional: print and/or persist a summary artifact for CI
Expand Down Expand Up @@ -593,9 +602,9 @@ async def _execute_with_semaphore(row):
assert agg_score >= threshold.success, (
f"Aggregated score {agg_score:.3f} below threshold {threshold.success}"
)
if threshold.standard_deviation is not None:
assert score_std <= threshold.standard_deviation, (
f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}"
if threshold.standard_error is not None and standard_error is not None:
assert standard_error <= threshold.standard_error, (
f"Standard error {standard_error:.3f} above threshold {threshold.standard_error}"
)

except AssertionError:
Expand Down
16 changes: 8 additions & 8 deletions eval_protocol/stats/confidence_intervals.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def compute_fixed_set_mu_ci(
rows: List[EvaluationRow],
*,
z_value: float = 1.96,
) -> Tuple[Optional[float], Optional[float], Optional[float]]:
) -> Tuple[Optional[float], Optional[float], Optional[float], Optional[float]]:
"""Compute the benchmark-conditional 95% CI for the mean accuracy μ on a fixed item set.

This treats questions/items as fixed and repeats as within-item Bernoulli draws.
Expand All @@ -53,10 +53,10 @@ def compute_fixed_set_mu_ci(
- Scores are taken from `row.evaluation_result.score` when available and numeric.

Returns:
(mu_hat, ci_low, ci_high). Returns (None, None, None) if insufficient data.
(mu_hat, ci_low, ci_high, standard_error). Returns (None, None, None, None) if insufficient data.
"""
if not rows:
return None, None, None
return None, None, None, None

# Group scores by question id
question_to_scores: Dict[str, List[float]] = defaultdict(list)
Expand All @@ -80,7 +80,7 @@ def compute_fixed_set_mu_ci(

Q = len(question_to_scores)
if Q == 0:
return None, None, None
return None, None, None, None

# Compute per-question means and the plug-in variance contribution
ybars: List[float] = []
Expand All @@ -99,16 +99,16 @@ def compute_fixed_set_mu_ci(
var_terms.append(ybar_i * (1.0 - ybar_i) / m_i)

if not ybars:
return None, None, None
return None, None, None, None

mu_hat = sum(ybars) / len(ybars)

# Standard error for CI of μ
se_sq = sum(var_terms) / (Q * Q)
se = math.sqrt(se_sq) if se_sq > 0.0 else 0.0
standard_error = math.sqrt(se_sq) if se_sq > 0.0 else 0.0

margin = z_value * se
margin = z_value * standard_error
ci_low = max(0.0, mu_hat - margin)
ci_high = min(1.0, mu_hat + margin)

return float(mu_hat), float(ci_low), float(ci_high)
return float(mu_hat), float(ci_low), float(ci_high), float(standard_error)
2 changes: 1 addition & 1 deletion tests/pytest/test_tau_bench_airline.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
}
],
rollout_processor=MCPGymRolloutProcessor(),
passed_threshold={"success": 0.4, "standard_deviation": 0.1},
passed_threshold={"success": 0.4, "standard_error": 0.02},
num_runs=8,
mode="pointwise",
max_concurrent_rollouts=50,
Expand Down
6 changes: 6 additions & 0 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,8 @@ def test_evaluate_result_dict_access():
"step_outputs",
"trajectory_info",
"final_control_plane_info",
"agg_score",
"standard_error",
}

# values() - check presence due to potential order variation of model_fields
Expand All @@ -232,6 +234,8 @@ def test_evaluate_result_dict_access():
("step_outputs", None),
("trajectory_info", None),
("final_control_plane_info", None),
("agg_score", None),
("standard_error", None),
]
)
# result.items() returns a list of tuples, so convert to list then sort.
Expand All @@ -250,6 +254,8 @@ def test_evaluate_result_dict_access():
"step_outputs",
"trajectory_info",
"final_control_plane_info",
"agg_score",
"standard_error",
}


Expand Down
21 changes: 21 additions & 0 deletions vendor/tau2/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""Package init for vendor.tau2: configures loguru logging at import time.

The log level is resolved from the ``TAU2_LOG_LEVEL`` environment variable
when present, otherwise from the tau2 config default.
"""

import os
import sys

from loguru import logger

from vendor.tau2.config import DEFAULT_LOG_LEVEL

# Drop loguru's pre-installed handler so messages are not emitted twice.
logger.remove()

# Environment variable wins; fall back to the tau2 config default otherwise.
_level = os.environ.get("TAU2_LOG_LEVEL")
if _level is None:
    _level = DEFAULT_LOG_LEVEL

# Install a single stderr sink at the resolved level.
logger.add(
    sys.stderr,
    level=_level,
    format="{time:YYYY-MM-DD HH:mm:ss.SSS} | {level:<8} | {name}:{function}:{line} - {message}",
)
2 changes: 1 addition & 1 deletion vendor/tau2/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
DEFAULT_MAX_CONCURRENCY = 3
DEFAULT_NUM_TRIALS = 1
DEFAULT_SAVE_TO = None
DEFAULT_LOG_LEVEL = "ERROR"
DEFAULT_LOG_LEVEL = "WARNING"

# LLM
DEFAULT_AGENT_IMPLEMENTATION = "llm_agent"
Expand Down
Loading