diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml new file mode 100644 index 00000000..ec91875c --- /dev/null +++ b/.github/workflows/e2e-smoke-test.yml @@ -0,0 +1,188 @@ +name: E2E Smoke Test + +# Run every 6 hours: at 00:00, 06:00, 12:00, and 18:00 UTC +on: + schedule: + - cron: '0 */6 * * *' + workflow_dispatch: # Allow manual triggering + inputs: + debug_mode: + description: 'Enable debug output' + required: false + default: 'false' + type: boolean + +jobs: + e2e-smoke-test: + name: E2E Smoke Test + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + - name: Install the project + run: uv sync --locked --all-extras --dev + + - name: Install tau2 for testing + run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main + + - name: Run E2E Smoke Test + id: run_test + env: + FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }} + FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning" + run: | + echo "Running e2e smoke test..." + + # Run the test and capture both stdout and exit code + set +e # Don't exit on failure + + uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \ + -v --tb=short --durations=10 \ + --ep-print-summary \ + --ep-summary-json=ep_summary.json 2>&1 | tee test_output.log + + # Use PIPESTATUS[0] to capture pytest's exit code, not tee's ($? after a + # pipeline is the last command's status, so it would always be tee's 0) + TEST_EXIT_CODE=${PIPESTATUS[0]} 
+ + echo "test_exit_code=$TEST_EXIT_CODE" >> $GITHUB_OUTPUT + + # List generated files for debugging + echo "📁 Generated files:" + ls -la *.json 2>/dev/null || echo "No JSON files found" + ls -la ep_summary* 2>/dev/null || echo "No ep_summary files found" + + # Parse EP summary from terminal output (more reliable than JSON files) + if [ -f test_output.log ]; then + echo "📋 Parsing EP summary from terminal output..." + + # Show the terminal output for debugging + echo "Terminal output:" + cat test_output.log + echo "" + + # Extract the EP Summary line from the terminal output + EP_SUMMARY_LINE=$(grep "EP Summary |" test_output.log 2>/dev/null || echo "") + + if [ -n "$EP_SUMMARY_LINE" ]; then + echo "Found EP Summary line:" + echo "$EP_SUMMARY_LINE" + + # Parse the agg score from the line: "EP Summary | ... agg=0.420 ..." + SUCCESS_RATE=$(echo "$EP_SUMMARY_LINE" | grep -o "agg=[0-9.]*" | cut -d= -f2 2>/dev/null || echo "0") + + # Extract other info + NUM_RUNS=$(echo "$EP_SUMMARY_LINE" | grep -o "runs=[0-9]*" | cut -d= -f2 2>/dev/null || echo "0") + NUM_ROWS=$(echo "$EP_SUMMARY_LINE" | grep -o "rows=[0-9]*" | cut -d= -f2 2>/dev/null || echo "0") + + echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT + + # Check if success rate meets thresholds (36% - 60% acceptable range) + LOWER_BOUND=0.36 # 36% + UPPER_BOUND=0.6 # 60% + LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l 2>/dev/null || echo "0") + UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l 2>/dev/null || echo "0") + THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l 2>/dev/null || echo "0") + + echo "lower_bound_met=$LOWER_BOUND_MET" >> $GITHUB_OUTPUT + echo "upper_bound_met=$UPPER_BOUND_MET" >> $GITHUB_OUTPUT + echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT + + echo "📊 Evaluation Summary (from terminal output):" + echo " - Success rate: $(echo "$SUCCESS_RATE * 100" | bc -l 2>/dev/null || echo "unknown")%" + echo " - Dataset rows evaluated: $NUM_ROWS" + echo " 
- Number of runs: $NUM_RUNS" + echo " - Lower bound (≥36%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" + echo " - Upper bound (≤60%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" + echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" + else + echo "❌ No EP Summary line found in terminal output" + echo "threshold_met=0" >> $GITHUB_OUTPUT + echo "success_rate=0" >> $GITHUB_OUTPUT + fi + else + echo "❌ No terminal output file found" + echo "threshold_met=0" >> $GITHUB_OUTPUT + echo "success_rate=0" >> $GITHUB_OUTPUT + fi + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: e2e-smoke-test-results-${{ github.run_number }} + path: | + test_output.log + ep_summary*.json + *.log + retention-days: 7 + + - name: Validate test results + if: always() + run: | + echo "Validating test results against thresholds..." + + TEST_EXIT_CODE="${{ steps.run_test.outputs.test_exit_code }}" + THRESHOLD_MET="${{ steps.run_test.outputs.threshold_met }}" + LOWER_BOUND_MET="${{ steps.run_test.outputs.lower_bound_met }}" + UPPER_BOUND_MET="${{ steps.run_test.outputs.upper_bound_met }}" + SUCCESS_RATE="${{ steps.run_test.outputs.success_rate }}" + + echo "Test exit code: $TEST_EXIT_CODE" + echo "Threshold met (36%-60%): $THRESHOLD_MET" + echo "Lower bound met (≥36%): $LOWER_BOUND_MET" + echo "Upper bound met (≤60%): $UPPER_BOUND_MET" + echo "Success rate: $SUCCESS_RATE" + + # Fail the job if tests didn't run successfully or thresholds weren't met + if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then + echo "❌ E2E smoke test FAILED" + echo " - Test execution failed (exit code: $TEST_EXIT_CODE)" + echo " - Success rate outside acceptable range (required: 36%-60%, actual: ${SUCCESS_RATE:-unknown})" + exit 1 + elif [ "$TEST_EXIT_CODE" != "0" ]; then + echo "⚠️ E2E smoke test had test execution issues but may have met thresholds" + echo " - Test exit 
code: $TEST_EXIT_CODE" + echo " - Thresholds met: $THRESHOLD_MET" + # Don't exit with error if thresholds were actually met despite test issues + if [ "$THRESHOLD_MET" = "1" ]; then + echo "✅ Thresholds met despite execution issues - considering this a pass" + else + exit 1 + fi + elif [ "$THRESHOLD_MET" != "1" ]; then + # Determine which bound was violated + if [ "$LOWER_BOUND_MET" != "1" ]; then + echo "❌ E2E smoke test FAILED - success rate too low" + echo " - Success rate: ${SUCCESS_RATE:-unknown}" + echo " - Required: ≥36%" + elif [ "$UPPER_BOUND_MET" != "1" ]; then + echo "❌ E2E smoke test FAILED - success rate suspiciously high" + echo " - Success rate: ${SUCCESS_RATE:-unknown}" + echo " - Maximum expected: ≤60%" + echo " - This may indicate test issues or unrealistic performance" + else + echo "❌ E2E smoke test FAILED - success rate outside acceptable range" + echo " - Success rate: ${SUCCESS_RATE:-unknown}" + echo " - Required range: 36%-60%" + fi + exit 1 + else + echo "✅ E2E smoke test PASSED" + echo " - Success rate: ${SUCCESS_RATE:-unknown}" + echo " - Within acceptable range: 36%-60%" + fi diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py index 80aadf14..f5472092 --- a/tests/pytest/test_tau_bench_airline.py +++ b/tests/pytest/test_tau_bench_airline.py @@ -65,7 +65,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval input_dataset=["tests/pytest/data/airline_dataset.jsonl"], dataset_adapter=tau_bench_airline_to_evaluation_row, model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], - rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096, "reasoning_effort": "low"}], + rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}], rollout_processor=default_mcp_gym_rollout_processor, passed_threshold={"success": 0.4, "standard_deviation": 0.1}, num_runs=8, diff --git a/tests/test_tau_bench_airline_smoke.py 
b/tests/test_tau_bench_airline_smoke.py new file mode 100644 index 00000000..e96baabe --- /dev/null +++ b/tests/test_tau_bench_airline_smoke.py @@ -0,0 +1,236 @@ +""" +Smoke test for tau bench airline evaluation - runs with minimal configuration for CI/CD monitoring. + +This is a lightweight version of the full tau bench airline test, designed specifically +for automated smoke testing in CI/CD pipelines. It runs with only 1 iteration to provide +quick feedback on system health while minimizing resource usage. +""" + +import json +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List + +from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message +from eval_protocol.pytest import evaluation_test +from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor +from vendor.tau2.data_model.message import ( + AssistantMessage, + SystemMessage, + ToolCall, + ToolMessage, + UserMessage, +) +from vendor.tau2.data_model.tasks import Action, EvaluationCriteria, RewardType, Task, UserScenario +from vendor.tau2.evaluator.evaluator import EnvironmentEvaluator +from vendor.tau2.evaluator.evaluator_action import ActionEvaluator +from vendor.tau2.evaluator.evaluator_communicate import CommunicateEvaluator +from vendor.tau2.evaluator.evaluator_nl_assertions import NLAssertionsEvaluator +from vendor.tau2.registry import registry + + +def tau_bench_airline_smoke_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]: + """ + Convert entries from airline dataset to EvaluationRow objects for smoke testing. 
+ """ + rows = [] + test_dir = Path(__file__).parent.parent / "examples" / "tau2_mcp" / "tests" + + # Load system prompt from file so we can change it in one place + domain = data[0]["environment_context"]["domain"] + prompt_file = test_dir / f"system_prompts/{domain}_agent_system_prompt.md" + + with open(prompt_file, "r") as f: + system_prompt = f.read().strip() + + for row in data: + eval_row = EvaluationRow( + messages=[Message(role="system", content=system_prompt)], + input_metadata=InputMetadata( + row_id=row["id"], + dataset_info={ + "environment_context": row["environment_context"], + "user_simulation": row["user_simulation"], + "evaluation_criteria": row["evaluation_criteria"], + "user_prompt_template": row["user_prompt_template"], + }, + ), + ) + + rows.append(eval_row) + + return rows + + +@evaluation_test( + input_dataset=["tests/pytest/data/airline_dataset.jsonl"], + dataset_adapter=tau_bench_airline_smoke_to_evaluation_row, + model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}], + rollout_processor=default_mcp_gym_rollout_processor, + passed_threshold=0.36, + num_runs=1, # Smoke test: single run for quick feedback + mode="pointwise", + max_concurrent_rollouts=50, # Standard concurrency + server_script_path="examples/tau2_mcp/server.py", +) +def test_tau_bench_airline_smoke_evaluation(row: EvaluationRow) -> EvaluationRow: + """ + Smoke test for tau bench airline evaluation - single run version for CI/CD monitoring. + + This is a lightweight smoke test that runs the tau bench airline evaluation with + minimal configuration (1 run) to quickly validate system health and model performance. + It uses the same evaluation logic as the full test but with reduced resource usage. 
+ + Args: + row: EvaluationRow object from tau bench airline dataset after rollout + + Returns: + EvaluationRow with tau2 evaluation results + """ + messages = row.messages + + # Get evaluation criteria and user_simulation from input_metadata.dataset_info + dataset_info = row.input_metadata.dataset_info if row.input_metadata else {} + evaluation_criteria = dataset_info.get("evaluation_criteria", {}) + + nl_assertions = evaluation_criteria.get("nl_assertions", []) + communicate_info = evaluation_criteria.get("communicate_info", []) + actions = evaluation_criteria.get("actions", []) + + # Convert Message objects directly to tau2-bench message objects + trajectory_objects = [] + for msg in messages: + role = msg.role + content = msg.content + + if role == "system": + trajectory_objects.append(SystemMessage(role=role, content=content)) + elif role == "assistant": + tau2_tool_calls = [] + if msg.tool_calls: + for tool_call in msg.tool_calls: + arguments = json.loads(tool_call.function.arguments) + tau2_tool_call = ToolCall( + id=tool_call.id, + name=tool_call.function.name, + arguments=arguments, + ) + tau2_tool_calls.append(tau2_tool_call) + + trajectory_objects.append(AssistantMessage(role=role, content=content, tool_calls=tau2_tool_calls)) + elif role == "user": + trajectory_objects.append(UserMessage(role=role, content=content)) + elif role == "tool": + tool_id = msg.tool_call_id + trajectory_objects.append(ToolMessage(id=tool_id, role=role, content=content)) + + reward = 1.0 + + evaluation_criteria = EvaluationCriteria( + nl_assertions=nl_assertions, + communicate_info=communicate_info, + actions=actions, + reward_basis=[ + RewardType.DB, + RewardType.COMMUNICATE, + ], + ) + + task = Task( + id="SmokeTest", evaluation_criteria=evaluation_criteria, user_scenario=UserScenario(instructions="SmokeTest") + ) # id and user_scenario are required for the Task type but not used in calculating reward + + if RewardType.DB in task.evaluation_criteria.reward_basis: + 
env_reward_info = EnvironmentEvaluator.calculate_reward( + environment_constructor=registry.get_env_constructor("airline"), + task=task, + full_trajectory=trajectory_objects, + ) + if RewardType.ACTION in task.evaluation_criteria.reward_basis: + action_reward_info = ActionEvaluator.calculate_reward( + task=task, + full_trajectory=trajectory_objects, + ) + if RewardType.COMMUNICATE in task.evaluation_criteria.reward_basis: + communicate_reward_info = CommunicateEvaluator.calculate_reward( + task=task, + full_trajectory=trajectory_objects, + ) + if RewardType.NL_ASSERTION in task.evaluation_criteria.reward_basis: + nl_reward_info = NLAssertionsEvaluator.calculate_reward( + task=task, + full_trajectory=trajectory_objects, + ) + + reward = 1.0 + env_bases = {RewardType.DB, RewardType.ENV_ASSERTION} + action_bases = {RewardType.ACTION} + nl_bases = {RewardType.NL_ASSERTION} + comm_bases = {RewardType.COMMUNICATE} + task_reward_basis = set(task.evaluation_criteria.reward_basis) + + reward_breakdown = {} + if task_reward_basis & env_bases: + if env_reward_info.reward_breakdown is not None: + reward_breakdown.update(env_reward_info.reward_breakdown) + reward *= env_reward_info.reward + if task_reward_basis & action_bases: + if action_reward_info.reward_breakdown is not None: + reward_breakdown.update(action_reward_info.reward_breakdown) + reward *= action_reward_info.reward + if task_reward_basis & nl_bases: + if nl_reward_info.reward_breakdown is not None: + reward_breakdown.update(nl_reward_info.reward_breakdown) + reward *= nl_reward_info.reward + if task_reward_basis & comm_bases: + if communicate_reward_info.reward_breakdown is not None: + reward_breakdown.update(communicate_reward_info.reward_breakdown) + reward *= communicate_reward_info.reward + + # Generate reason showing only failed components + failed_reasons = [] + + if task_reward_basis & env_bases and env_reward_info.reward == 0: + failed_reasons.append("❌ Environment/DB check failed") + + if 
task_reward_basis & action_bases and action_reward_info.reward == 0: + failed_actions = [] + if hasattr(action_reward_info, "action_checks") and action_reward_info.action_checks: + failed_actions = [ + f"{ac.action.name}({ac.action.arguments})" + for ac in action_reward_info.action_checks + if not ac.action_match + ] + if failed_actions: + failed_reasons.append(f"❌ Failed actions: {failed_actions}") + else: + failed_reasons.append("❌ Actions failed") + + if task_reward_basis & nl_bases and nl_reward_info.reward == 0: + failed_nl = [] + if hasattr(nl_reward_info, "nl_assertions") and nl_reward_info.nl_assertions: + failed_nl = [nla.nl_assertion for nla in nl_reward_info.nl_assertions if not nla.met] + if failed_nl: + failed_reasons.append(f"❌ Failed NL assertions: {failed_nl}") + else: + failed_reasons.append("❌ NL Assertions failed") + + if task_reward_basis & comm_bases and communicate_reward_info.reward == 0: + failed_comm = [] + if hasattr(communicate_reward_info, "communicate_checks") and communicate_reward_info.communicate_checks: + failed_comm = [cc.info for cc in communicate_reward_info.communicate_checks if not cc.met] + if failed_comm: + failed_reasons.append(f"❌ Failed communication: {failed_comm}") + else: + failed_reasons.append("❌ Communication failed") + + # If everything passed, show success + reason = "\n".join(failed_reasons) if failed_reasons else "✅ All checks passed [SMOKE TEST]" + + row.evaluation_result = EvaluateResult( + score=reward, + reason=reason, + metrics={}, + ) + return row