From a8445a19640e2ee25a62dc83f2c8824d791eefb2 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 13 Aug 2025 18:33:45 +0000 Subject: [PATCH 1/7] e2e smoke test --- .github/workflows/e2e-smoke-test.yml | 232 ++++++++++++++++++++++++ tests/pytest/test_tau_bench_airline.py | 2 +- tests/test_tau_bench_airline_smoke.py | 236 +++++++++++++++++++++++++ 3 files changed, 469 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/e2e-smoke-test.yml create mode 100644 tests/test_tau_bench_airline_smoke.py diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml new file mode 100644 index 00000000..47061980 --- /dev/null +++ b/.github/workflows/e2e-smoke-test.yml @@ -0,0 +1,232 @@ +name: E2E Smoke Test + +# Run every 6 hours: at 00:00, 06:00, 12:00, and 18:00 UTC +on: + schedule: + - cron: '0 */6 * * *' + workflow_dispatch: # Allow manual triggering + inputs: + debug_mode: + description: 'Enable debug output' + required: false + default: 'false' + type: boolean + +jobs: + e2e-smoke-test: + name: E2E Smoke Test + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + - name: Install the project + run: uv sync --locked --all-extras --dev + + - name: Install tau2 for testing + run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main + + - name: Run E2E Smoke Test + id: run_test + env: + FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }} + FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning" + run: | + echo "Running e2e smoke test..." + + # Run the test and capture both stdout and exit code + set +e # Don't exit on failure + + uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \ + -v --tb=short --durations=10 \ + --ep-print-summary \ + --ep-summary-json=ep_summary.json + + TEST_EXIT_CODE=$? + + echo "test_exit_code=$TEST_EXIT_CODE" >> $GITHUB_OUTPUT + + # Parse evaluation protocol summary if it exists + if [ -f ep_summary.json ]; then + echo "EP Summary found, parsing..." + + # Log the full summary for debugging + echo "EP Summary contents:" + cat ep_summary.json | jq . 2>/dev/null || cat ep_summary.json + + # Extract success rate from EP summary (this contains the actual accuracy/success rate) + # The EP summary uses 'agg_score' for the aggregated success rate + SUCCESS_RATE=$(jq -r '.agg_score // 0' ep_summary.json 2>/dev/null || echo "0") + + echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT + + # Check if success rate meets thresholds (40% - 90% acceptable range) + LOWER_BOUND=0.4 # 40% + UPPER_BOUND=0.9 # 90% + LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l) + UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l) + THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l) + + echo "lower_bound_met=$LOWER_BOUND_MET" >> $GITHUB_OUTPUT + echo "upper_bound_met=$UPPER_BOUND_MET" >> $GITHUB_OUTPUT + echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT + + # Extract additional info for display + NUM_ROWS=$(jq -r '.rows // 0' ep_summary.json 2>/dev/null || echo "0") + NUM_RUNS=$(jq -r '.num_runs // 0' ep_summary.json 2>/dev/null || echo "0") + + echo "📊 Evaluation Summary:" + echo " - Success rate (agg_score): $(echo "$SUCCESS_RATE * 100" | bc -l)%" + echo " - Dataset rows evaluated: $NUM_ROWS" + echo " - Number of runs: $NUM_RUNS" + echo " - Lower bound (≥40%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" + echo " - Upper bound (≤90%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" + echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" + else + echo "❌ No EP summary file found" + echo "threshold_met=0" >> $GITHUB_OUTPUT + echo "success_rate=0" >> $GITHUB_OUTPUT + fi + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: e2e-smoke-test-results-${{ github.run_number }} + path: | + ep_summary.json + *.log + retention-days: 7 + + - name: Validate test results + if: always() + run: | + echo "Validating test results against thresholds..." + + TEST_EXIT_CODE="${{ steps.run_test.outputs.test_exit_code }}" + THRESHOLD_MET="${{ steps.run_test.outputs.threshold_met }}" + LOWER_BOUND_MET="${{ steps.run_test.outputs.lower_bound_met }}" + UPPER_BOUND_MET="${{ steps.run_test.outputs.upper_bound_met }}" + SUCCESS_RATE="${{ steps.run_test.outputs.success_rate }}" + + echo "Test exit code: $TEST_EXIT_CODE" + echo "Threshold met (40%-90%): $THRESHOLD_MET" + echo "Lower bound met (≥40%): $LOWER_BOUND_MET" + echo "Upper bound met (≤90%): $UPPER_BOUND_MET" + echo "Success rate: $SUCCESS_RATE" + + # Fail the job if tests didn't run successfully or thresholds weren't met + if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then + echo "❌ E2E smoke test FAILED" + echo " - Test execution failed (exit code: $TEST_EXIT_CODE)" + echo " - Success rate outside acceptable range (required: 40%-90%, actual: ${SUCCESS_RATE:-unknown})" + exit 1 + elif [ "$TEST_EXIT_CODE" != "0" ]; then + echo "⚠️ E2E smoke test had test execution issues but may have met thresholds" + echo " - Test exit code: $TEST_EXIT_CODE" + echo " - Thresholds met: $THRESHOLD_MET" + # Don't exit with error if thresholds were actually met despite test issues + if [ "$THRESHOLD_MET" = "1" ]; then + echo "✅ Thresholds met despite execution issues - considering this a pass" + else + exit 1 + fi + elif [ "$THRESHOLD_MET" != "1" ]; then + # Determine which bound was violated + if [ "$LOWER_BOUND_MET" != "1" ]; then + echo "❌ E2E smoke test FAILED - success rate too low" + echo " - Success rate: ${SUCCESS_RATE:-unknown}" + echo " - Required: ≥40%" + elif [ "$UPPER_BOUND_MET" != "1" ]; then + echo "❌ E2E smoke test FAILED - success rate suspiciously high" + echo " - Success rate: ${SUCCESS_RATE:-unknown}" + echo " - Maximum expected: ≤90%" + echo " - This may indicate test issues or unrealistic performance" + else + echo "❌ E2E smoke test FAILED - success rate outside acceptable range" + echo " - Success rate: ${SUCCESS_RATE:-unknown}" + echo " - Required range: 40%-90%" + fi + exit 1 + else + echo "✅ E2E smoke test PASSED" + echo " - Success rate: ${SUCCESS_RATE:-unknown}" + echo " - Within acceptable range: 40%-90%" + fi + + - name: Create GitHub issue on failure + if: failure() + uses: actions/github-script@v7 + with: + script: | + const testResults = { + exitCode: '${{ steps.run_test.outputs.test_exit_code }}', + successRate: '${{ steps.run_test.outputs.success_rate }}', + thresholdMet: '${{ steps.run_test.outputs.threshold_met }}', + lowerBoundMet: '${{ steps.run_test.outputs.lower_bound_met }}', + upperBoundMet: '${{ steps.run_test.outputs.upper_bound_met }}' + }; + + const title = `🚨 E2E Smoke Test Failed (${new Date().toISOString().split('T')[0]})`; + + const body = ` + ## E2E Smoke Test Failure Report + + **Test:** E2E Smoke Test + **Date:** ${new Date().toISOString()} + **Workflow Run:** [#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) + + ### Test Results + - **Success Rate:** ${testResults.successRate ? (parseFloat(testResults.successRate) * 100).toFixed(1) + '%' : 'Unknown'} + - **Lower Bound Met (≥40%):** ${testResults.lowerBoundMet === '1' ? '✅ Yes' : '❌ No'} + - **Upper Bound Met (≤90%):** ${testResults.upperBoundMet === '1' ? '✅ Yes' : '❌ No'} + - **Within Range (40%-90%):** ${testResults.thresholdMet === '1' ? '✅ Yes' : '❌ No'} + - **Test Exit Code:** ${testResults.exitCode || 'Unknown'} + + ### Required Actions + + ${ testResults.thresholdMet !== '1' ? + (testResults.lowerBoundMet !== '1' ? + '🔍 **Performance Issue:** The success rate is below the required 40% minimum threshold. This indicates potential issues with model performance or test environment.' : + testResults.upperBoundMet !== '1' ? + '⚠️ **Suspiciously High Performance:** The success rate exceeds 90%, which may indicate test issues, data leakage, or unrealistic performance.' : + '🔍 **Performance Issue:** The success rate is outside the acceptable 40%-90% range.' + ) : + '🔧 **Infrastructure Issue:** Tests failed to execute properly despite potentially meeting performance thresholds.' + } + + ### Next Steps + 1. Review the [workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for detailed error information + 2. Check if this is a temporary issue by re-running the workflow manually + 3. If persistent, investigate potential causes: + - Model performance degradation + - Test environment configuration + - API key or service availability issues + + ### Auto-generated + This issue was automatically created by the E2E smoke test workflow. + `; + + // Create the issue + await github.rest.issues.create({ + owner: context.repo.owner, + repo: context.repo.repo, + title: title, + body: body, + labels: ['bug', 'e2e-test', 'automated', 'smoke-test'] + }); diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py index 80aadf14..f5472092 100644 --- a/tests/pytest/test_tau_bench_airline.py +++ b/tests/pytest/test_tau_bench_airline.py @@ -65,7 +65,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval input_dataset=["tests/pytest/data/airline_dataset.jsonl"], dataset_adapter=tau_bench_airline_to_evaluation_row, model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], - rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096, "reasoning_effort": "low"}], + rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}], rollout_processor=default_mcp_gym_rollout_processor, passed_threshold={"success": 0.4, "standard_deviation": 0.1}, num_runs=8, diff --git a/tests/test_tau_bench_airline_smoke.py b/tests/test_tau_bench_airline_smoke.py new file mode 100644 index 00000000..e0eae2ef --- /dev/null +++ b/tests/test_tau_bench_airline_smoke.py @@ -0,0 +1,236 @@ +""" +Smoke test for tau bench airline evaluation - runs with minimal configuration for CI/CD monitoring. + +This is a lightweight version of the full tau bench airline test, designed specifically +for automated smoke testing in CI/CD pipelines. It runs with only 1 iteration to provide +quick feedback on system health while minimizing resource usage. +""" + +import json +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List + +from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message +from eval_protocol.pytest import evaluation_test +from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor +from vendor.tau2.data_model.message import ( + AssistantMessage, + SystemMessage, + ToolCall, + ToolMessage, + UserMessage, +) +from vendor.tau2.data_model.tasks import Action, EvaluationCriteria, RewardType, Task, UserScenario +from vendor.tau2.evaluator.evaluator import EnvironmentEvaluator +from vendor.tau2.evaluator.evaluator_action import ActionEvaluator +from vendor.tau2.evaluator.evaluator_communicate import CommunicateEvaluator +from vendor.tau2.evaluator.evaluator_nl_assertions import NLAssertionsEvaluator +from vendor.tau2.registry import registry + + +def tau_bench_airline_smoke_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]: + """ + Convert entries from airline dataset to EvaluationRow objects for smoke testing. + """ + rows = [] + test_dir = Path(__file__).parent.parent / "examples" / "tau2_mcp" / "tests" + + # Load system prompt from file so we can change it in one place + domain = data[0]["environment_context"]["domain"] + prompt_file = test_dir / f"system_prompts/{domain}_agent_system_prompt.md" + + with open(prompt_file, "r") as f: + system_prompt = f.read().strip() + + for row in data: + eval_row = EvaluationRow( + messages=[Message(role="system", content=system_prompt)], + input_metadata=InputMetadata( + row_id=row["id"], + dataset_info={ + "environment_context": row["environment_context"], + "user_simulation": row["user_simulation"], + "evaluation_criteria": row["evaluation_criteria"], + "user_prompt_template": row["user_prompt_template"], + }, + ), + ) + + rows.append(eval_row) + + return rows + + +@evaluation_test( + input_dataset=["tests/pytest/data/airline_dataset.jsonl"], + dataset_adapter=tau_bench_airline_smoke_to_evaluation_row, + model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}], + rollout_processor=default_mcp_gym_rollout_processor, + passed_threshold=0.4, + num_runs=1, # Smoke test: single run for quick feedback + mode="pointwise", + max_concurrent_rollouts=50, # Standard concurrency + server_script_path="examples/tau2_mcp/server.py", +) +def test_tau_bench_airline_smoke_evaluation(row: EvaluationRow) -> EvaluationRow: + """ + Smoke test for tau bench airline evaluation - single run version for CI/CD monitoring. + + This is a lightweight smoke test that runs the tau bench airline evaluation with + minimal configuration (1 run) to quickly validate system health and model performance. + It uses the same evaluation logic as the full test but with reduced resource usage. + + Args: + row: EvaluationRow object from tau bench airline dataset after rollout + + Returns: + EvaluationRow with tau2 evaluation results + """ + messages = row.messages + + # Get evaluation criteria and user_simulation from input_metadata.dataset_info + dataset_info = row.input_metadata.dataset_info if row.input_metadata else {} + evaluation_criteria = dataset_info.get("evaluation_criteria", {}) + + nl_assertions = evaluation_criteria.get("nl_assertions", []) + communicate_info = evaluation_criteria.get("communicate_info", []) + actions = evaluation_criteria.get("actions", []) + + # Convert Message objects directly to tau2-bench message objects + trajectory_objects = [] + for msg in messages: + role = msg.role + content = msg.content + + if role == "system": + trajectory_objects.append(SystemMessage(role=role, content=content)) + elif role == "assistant": + tau2_tool_calls = [] + if msg.tool_calls: + for tool_call in msg.tool_calls: + arguments = json.loads(tool_call.function.arguments) + tau2_tool_call = ToolCall( + id=tool_call.id, + name=tool_call.function.name, + arguments=arguments, + ) + tau2_tool_calls.append(tau2_tool_call) + + trajectory_objects.append(AssistantMessage(role=role, content=content, tool_calls=tau2_tool_calls)) + elif role == "user": + trajectory_objects.append(UserMessage(role=role, content=content)) + elif role == "tool": + tool_id = msg.tool_call_id + trajectory_objects.append(ToolMessage(id=tool_id, role=role, content=content)) + + reward = 1.0 + + evaluation_criteria = EvaluationCriteria( + nl_assertions=nl_assertions, + communicate_info=communicate_info, + actions=actions, + reward_basis=[ + RewardType.DB, + RewardType.COMMUNICATE, + ], + ) + + task = Task( + id="SmokeTest", evaluation_criteria=evaluation_criteria, user_scenario=UserScenario(instructions="SmokeTest") + ) # id and user_scenario are required for the Task type but not used in calculating reward + + if RewardType.DB in task.evaluation_criteria.reward_basis: + env_reward_info = EnvironmentEvaluator.calculate_reward( + environment_constructor=registry.get_env_constructor("airline"), + task=task, + full_trajectory=trajectory_objects, + ) + if RewardType.ACTION in task.evaluation_criteria.reward_basis: + action_reward_info = ActionEvaluator.calculate_reward( + task=task, + full_trajectory=trajectory_objects, + ) + if RewardType.COMMUNICATE in task.evaluation_criteria.reward_basis: + communicate_reward_info = CommunicateEvaluator.calculate_reward( + task=task, + full_trajectory=trajectory_objects, + ) + if RewardType.NL_ASSERTION in task.evaluation_criteria.reward_basis: + nl_reward_info = NLAssertionsEvaluator.calculate_reward( + task=task, + full_trajectory=trajectory_objects, + ) + + reward = 1.0 + env_bases = {RewardType.DB, RewardType.ENV_ASSERTION} + action_bases = {RewardType.ACTION} + nl_bases = {RewardType.NL_ASSERTION} + comm_bases = {RewardType.COMMUNICATE} + task_reward_basis = set(task.evaluation_criteria.reward_basis) + + reward_breakdown = {} + if task_reward_basis & env_bases: + if env_reward_info.reward_breakdown is not None: + reward_breakdown.update(env_reward_info.reward_breakdown) + reward *= env_reward_info.reward + if task_reward_basis & action_bases: + if action_reward_info.reward_breakdown is not None: + reward_breakdown.update(action_reward_info.reward_breakdown) + reward *= action_reward_info.reward + if task_reward_basis & nl_bases: + if nl_reward_info.reward_breakdown is not None: + reward_breakdown.update(nl_reward_info.reward_breakdown) + reward *= nl_reward_info.reward + if task_reward_basis & comm_bases: + if communicate_reward_info.reward_breakdown is not None: + reward_breakdown.update(communicate_reward_info.reward_breakdown) + reward *= communicate_reward_info.reward + + # Generate reason showing only failed components + failed_reasons = [] + + if task_reward_basis & env_bases and env_reward_info.reward == 0: + failed_reasons.append("❌ Environment/DB check failed") + + if task_reward_basis & action_bases and action_reward_info.reward == 0: + failed_actions = [] + if hasattr(action_reward_info, "action_checks") and action_reward_info.action_checks: + failed_actions = [ + f"{ac.action.name}({ac.action.arguments})" + for ac in action_reward_info.action_checks + if not ac.action_match + ] + if failed_actions: + failed_reasons.append(f"❌ Failed actions: {failed_actions}") + else: + failed_reasons.append("❌ Actions failed") + + if task_reward_basis & nl_bases and nl_reward_info.reward == 0: + failed_nl = [] + if hasattr(nl_reward_info, "nl_assertions") and nl_reward_info.nl_assertions: + failed_nl = [nla.nl_assertion for nla in nl_reward_info.nl_assertions if not nla.met] + if failed_nl: + failed_reasons.append(f"❌ Failed NL assertions: {failed_nl}") + else: + failed_reasons.append("❌ NL Assertions failed") + + if task_reward_basis & comm_bases and communicate_reward_info.reward == 0: + failed_comm = [] + if hasattr(communicate_reward_info, "communicate_checks") and communicate_reward_info.communicate_checks: + failed_comm = [cc.info for cc in communicate_reward_info.communicate_checks if not cc.met] + if failed_comm: + failed_reasons.append(f"❌ Failed communication: {failed_comm}") + else: + failed_reasons.append("❌ Communication failed") + + # If everything passed, show success + reason = "\n".join(failed_reasons) if failed_reasons else "✅ All checks passed [SMOKE TEST]" + + row.evaluation_result = EvaluateResult( + score=reward, + reason=reason, + metrics={}, + ) + return row From e001fd60fce878bfeff78bbfd88d7d3a0523c580 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 13 Aug 2025 18:44:29 +0000 Subject: [PATCH 2/7] temp adding --- .github/workflows/e2e-smoke-test.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml index 47061980..d934eaf4 100644 --- a/.github/workflows/e2e-smoke-test.yml +++ b/.github/workflows/e2e-smoke-test.yml @@ -2,6 +2,16 @@ name: E2E Smoke Test # Run every 6 hours: at 00:00, 06:00, 12:00, and 18:00 UTC on: + push: + branches: [main] + paths-ignore: + - "docs/**" + - "*.md" + pull_request: + branches: [main] + paths-ignore: + - "docs/**" + - "*.md" schedule: - cron: '0 */6 * * *' workflow_dispatch: # Allow manual triggering From b39cc40b1496a8200cb1b6eea672294d7076305a Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 13 Aug 2025 18:55:03 +0000 Subject: [PATCH 3/7] update --- .github/workflows/e2e-smoke-test.yml | 90 +++++++--------------------- 1 file changed, 21 insertions(+), 69 deletions(-) diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml index d934eaf4..bdd1de2c 100644 --- a/.github/workflows/e2e-smoke-test.yml +++ b/.github/workflows/e2e-smoke-test.yml @@ -71,17 +71,31 @@ jobs: echo "test_exit_code=$TEST_EXIT_CODE" >> $GITHUB_OUTPUT + # List generated files for debugging + echo "📁 Generated files:" + ls -la *.json 2>/dev/null || echo "No JSON files found" + ls -la ep_summary* 2>/dev/null || echo "No ep_summary files found" + # Parse evaluation protocol summary if it exists - if [ -f ep_summary.json ]; then - echo "EP Summary found, parsing..." + # EP might generate files with different names, check for common patterns + EP_SUMMARY_FILE="" + for file in ep_summary*.json; do + if [ -f "$file" ]; then + EP_SUMMARY_FILE="$file" + break + fi + done + + if [ -n "$EP_SUMMARY_FILE" ] && [ -f "$EP_SUMMARY_FILE" ]; then + echo "EP Summary found: $EP_SUMMARY_FILE, parsing..." # Log the full summary for debugging echo "EP Summary contents:" - cat ep_summary.json | jq . 2>/dev/null || cat ep_summary.json + cat "$EP_SUMMARY_FILE" | jq . 2>/dev/null || cat "$EP_SUMMARY_FILE" # Extract success rate from EP summary (this contains the actual accuracy/success rate) # The EP summary uses 'agg_score' for the aggregated success rate - SUCCESS_RATE=$(jq -r '.agg_score // 0' ep_summary.json 2>/dev/null || echo "0") + SUCCESS_RATE=$(jq -r '.agg_score // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0") echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT @@ -97,8 +111,8 @@ jobs: echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT # Extract additional info for display - NUM_ROWS=$(jq -r '.rows // 0' ep_summary.json 2>/dev/null || echo "0") - NUM_RUNS=$(jq -r '.num_runs // 0' ep_summary.json 2>/dev/null || echo "0") + NUM_ROWS=$(jq -r '.rows // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0") + NUM_RUNS=$(jq -r '.num_runs // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0") echo "📊 Evaluation Summary:" echo " - Success rate (agg_score): $(echo "$SUCCESS_RATE * 100" | bc -l)%" @@ -119,7 +133,7 @@ jobs: with: name: e2e-smoke-test-results-${{ github.run_number }} path: | - ep_summary.json + ep_summary*.json *.log retention-days: 7 @@ -178,65 +192,3 @@ jobs: echo " - Success rate: ${SUCCESS_RATE:-unknown}" echo " - Within acceptable range: 40%-90%" fi - - - name: Create GitHub issue on failure - if: failure() - uses: actions/github-script@v7 - with: - script: | - const testResults = { - exitCode: '${{ steps.run_test.outputs.test_exit_code }}', - successRate: '${{ steps.run_test.outputs.success_rate }}', - thresholdMet: '${{ steps.run_test.outputs.threshold_met }}', - lowerBoundMet: '${{ steps.run_test.outputs.lower_bound_met }}', - upperBoundMet: '${{ steps.run_test.outputs.upper_bound_met }}' - }; - - const title = `🚨 E2E Smoke Test Failed (${new Date().toISOString().split('T')[0]})`; - - const body = ` - ## E2E Smoke Test Failure Report - - **Test:** E2E Smoke Test - **Date:** ${new Date().toISOString()} - **Workflow Run:** [#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) - - ### Test Results - - **Success Rate:** ${testResults.successRate ? (parseFloat(testResults.successRate) * 100).toFixed(1) + '%' : 'Unknown'} - - **Lower Bound Met (≥40%):** ${testResults.lowerBoundMet === '1' ? '✅ Yes' : '❌ No'} - - **Upper Bound Met (≤90%):** ${testResults.upperBoundMet === '1' ? '✅ Yes' : '❌ No'} - - **Within Range (40%-90%):** ${testResults.thresholdMet === '1' ? '✅ Yes' : '❌ No'} - - **Test Exit Code:** ${testResults.exitCode || 'Unknown'} - - ### Required Actions - - ${ testResults.thresholdMet !== '1' ? - (testResults.lowerBoundMet !== '1' ? - '🔍 **Performance Issue:** The success rate is below the required 40% minimum threshold. This indicates potential issues with model performance or test environment.' : - testResults.upperBoundMet !== '1' ? - '⚠️ **Suspiciously High Performance:** The success rate exceeds 90%, which may indicate test issues, data leakage, or unrealistic performance.' : - '🔍 **Performance Issue:** The success rate is outside the acceptable 40%-90% range.' - ) : - '🔧 **Infrastructure Issue:** Tests failed to execute properly despite potentially meeting performance thresholds.' - } - - ### Next Steps - 1. Review the [workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for detailed error information - 2. Check if this is a temporary issue by re-running the workflow manually - 3. If persistent, investigate potential causes: - - Model performance degradation - - Test environment configuration - - API key or service availability issues - - ### Auto-generated - This issue was automatically created by the E2E smoke test workflow. - `; - - // Create the issue - await github.rest.issues.create({ - owner: context.repo.owner, - repo: context.repo.repo, - title: title, - body: body, - labels: ['bug', 'e2e-test', 'automated', 'smoke-test'] - }); From 02e9b932e87a92f9f5cccdcdd0af4b21b0be0fc0 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 13 Aug 2025 20:04:06 +0000 Subject: [PATCH 4/7] test --- .github/workflows/e2e-smoke-test.yml | 108 ++++++++++++++------------- 1 file changed, 56 insertions(+), 52 deletions(-) diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml index bdd1de2c..7521b3cb 100644 --- a/.github/workflows/e2e-smoke-test.yml +++ b/.github/workflows/e2e-smoke-test.yml @@ -65,7 +65,7 @@ jobs: uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \ -v --tb=short --durations=10 \ --ep-print-summary \ - --ep-summary-json=ep_summary.json + --ep-summary-json=ep_summary.json 2>&1 | tee test_output.log TEST_EXIT_CODE=$? @@ -76,53 +76,56 @@ jobs: ls -la *.json 2>/dev/null || echo "No JSON files found" ls -la ep_summary* 2>/dev/null || echo "No ep_summary files found" - # Parse evaluation protocol summary if it exists - # EP might generate files with different names, check for common patterns - EP_SUMMARY_FILE="" - for file in ep_summary*.json; do - if [ -f "$file" ]; then - EP_SUMMARY_FILE="$file" - break + # Parse EP summary from terminal output (more reliable than JSON files) + if [ -f test_output.log ]; then + echo "📋 Parsing EP summary from terminal output..." + + # Show the terminal output for debugging + echo "Terminal output:" + cat test_output.log + echo "" + + # Extract the EP Summary line from the terminal output + EP_SUMMARY_LINE=$(grep "EP Summary |" test_output.log 2>/dev/null || echo "") + + if [ -n "$EP_SUMMARY_LINE" ]; then + echo "Found EP Summary line:" + echo "$EP_SUMMARY_LINE" + + # Parse the agg score from the line: "EP Summary | ... agg=0.420 ..." + SUCCESS_RATE=$(echo "$EP_SUMMARY_LINE" | grep -o "agg=[0-9.]*" | cut -d= -f2 2>/dev/null || echo "0") + + # Extract other info + NUM_RUNS=$(echo "$EP_SUMMARY_LINE" | grep -o "runs=[0-9]*" | cut -d= -f2 2>/dev/null || echo "0") + NUM_ROWS=$(echo "$EP_SUMMARY_LINE" | grep -o "rows=[0-9]*" | cut -d= -f2 2>/dev/null || echo "0") + + echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT + + # Check if success rate meets thresholds (40% - 90% acceptable range) + LOWER_BOUND=0.4 # 40% + UPPER_BOUND=0.9 # 90% + LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l 2>/dev/null || echo "0") + UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l 2>/dev/null || echo "0") + THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l 2>/dev/null || echo "0") + + echo "lower_bound_met=$LOWER_BOUND_MET" >> $GITHUB_OUTPUT + echo "upper_bound_met=$UPPER_BOUND_MET" >> $GITHUB_OUTPUT + echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT + + echo "📊 Evaluation Summary (from terminal output):" + echo " - Success rate: $(echo "$SUCCESS_RATE * 100" | bc -l 2>/dev/null || echo "unknown")%" + echo " - Dataset rows evaluated: $NUM_ROWS" + echo " - Number of runs: $NUM_RUNS" + echo " - Lower bound (≥40%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" + echo " - Upper bound (≤90%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" + echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" + else + echo "❌ No EP Summary line found in terminal output" + echo "threshold_met=0" >> $GITHUB_OUTPUT + echo "success_rate=0" >> $GITHUB_OUTPUT fi - done - - if [ -n "$EP_SUMMARY_FILE" ] && [ -f "$EP_SUMMARY_FILE" ]; then - echo "EP Summary found: $EP_SUMMARY_FILE, parsing..." - - # Log the full summary for debugging - echo "EP Summary contents:" - cat "$EP_SUMMARY_FILE" | jq . 2>/dev/null || cat "$EP_SUMMARY_FILE" - - # Extract success rate from EP summary (this contains the actual accuracy/success rate) - # The EP summary uses 'agg_score' for the aggregated success rate - SUCCESS_RATE=$(jq -r '.agg_score // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0") - - echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT - - # Check if success rate meets thresholds (40% - 90% acceptable range) - LOWER_BOUND=0.4 # 40% - UPPER_BOUND=0.9 # 90% - LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l) - UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l) - THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l) - - echo "lower_bound_met=$LOWER_BOUND_MET" >> $GITHUB_OUTPUT - echo "upper_bound_met=$UPPER_BOUND_MET" >> $GITHUB_OUTPUT - echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT - - # Extract additional info for display - NUM_ROWS=$(jq -r '.rows // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0") - NUM_RUNS=$(jq -r '.num_runs // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0") - - echo "📊 Evaluation Summary:" - echo " - Success rate (agg_score): $(echo "$SUCCESS_RATE * 100" | bc -l)%" - echo " - Dataset rows evaluated: $NUM_ROWS" - echo " - Number of runs: $NUM_RUNS" - echo " - Lower bound (≥40%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" - echo " - Upper bound (≤90%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" - echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" else - echo "❌ No EP summary file found" + echo "❌ No terminal output file found" echo "threshold_met=0" >> $GITHUB_OUTPUT echo "success_rate=0" >> $GITHUB_OUTPUT fi @@ -133,6 +136,7 @@ jobs: with: name: e2e-smoke-test-results-${{ github.run_number }} path: | + test_output.log ep_summary*.json *.log retention-days: 7 @@ -149,16 +153,16 @@ jobs: SUCCESS_RATE="${{ steps.run_test.outputs.success_rate }}" echo "Test exit code: $TEST_EXIT_CODE" - echo "Threshold met (40%-90%): $THRESHOLD_MET" + echo "Threshold met (40%-60%): $THRESHOLD_MET" echo "Lower bound met (≥40%): $LOWER_BOUND_MET" - echo "Upper bound met (≤90%): $UPPER_BOUND_MET" + echo "Upper bound met (≤60%): $UPPER_BOUND_MET" echo "Success rate: $SUCCESS_RATE" # Fail the job if tests didn't run successfully or thresholds weren't met if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then echo "❌ E2E smoke test FAILED" echo " - Test execution failed (exit code: $TEST_EXIT_CODE)" - echo " - Success rate outside acceptable range (required: 40%-90%, actual: ${SUCCESS_RATE:-unknown})" + echo " - Success rate outside acceptable range (required: 40%-60%, actual: ${SUCCESS_RATE:-unknown})" exit 1 elif [ "$TEST_EXIT_CODE" != "0" ]; then echo "⚠️ E2E smoke test had test execution issues but may have met thresholds" @@ -179,16 +183,16 @@ jobs: elif [ "$UPPER_BOUND_MET" != "1" ]; then echo "❌ E2E smoke test FAILED - success rate suspiciously high" echo " - Success rate: ${SUCCESS_RATE:-unknown}" - echo " - Maximum expected: ≤90%" + echo " - Maximum expected: ≤60%" echo " - This may indicate test issues or unrealistic performance" else echo "❌ E2E smoke test FAILED - success rate outside acceptable range" echo " - Success rate: ${SUCCESS_RATE:-unknown}" - echo " - Required range: 40%-90%" + echo " - Required range: 40%-60%" fi exit 1 else echo "✅ E2E smoke test PASSED" echo " - Success rate: ${SUCCESS_RATE:-unknown}" - echo " - Within acceptable range: 40%-90%" + echo " - Within acceptable range: 40%-60%" fi From d7e51f27146fbd443c7e233047bf58b0bd862bb2 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 13 Aug 2025 20:05:27 +0000 Subject: [PATCH 5/7] adjust bounds --- .github/workflows/e2e-smoke-test.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml index 7521b3cb..a20bbc27 100644 --- a/.github/workflows/e2e-smoke-test.yml +++ b/.github/workflows/e2e-smoke-test.yml @@ -101,9 +101,9 @@ jobs: echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT - # Check if success rate meets thresholds (40% - 90% acceptable range) - LOWER_BOUND=0.4 # 40% - UPPER_BOUND=0.9 # 90% + # Check if success rate meets thresholds (36% - 60% acceptable range) + LOWER_BOUND=0.36 # 36% + UPPER_BOUND=0.6 # 60% LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l 2>/dev/null || echo "0") UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l 2>/dev/null || echo "0") THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l 2>/dev/null || echo "0") @@ -116,8 +116,8 @@ jobs: echo " - Success rate: $(echo "$SUCCESS_RATE * 100" | bc -l 2>/dev/null || echo "unknown")%" echo " - Dataset rows evaluated: $NUM_ROWS" echo " - Number of runs: $NUM_RUNS" - echo " - Lower bound (≥40%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" - echo " - Upper bound (≤90%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" + echo " - Lower bound (≥36%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" + echo " - Upper bound (≤60%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" else echo "❌ No EP Summary line found in terminal output" From df26606705a986d05a808f0224ec6f69c9806443 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 13 Aug 2025 20:09:07 +0000 Subject: [PATCH 6/7] change back to regular schedule --- .github/workflows/e2e-smoke-test.yml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml index a20bbc27..ec91875c 100644 --- a/.github/workflows/e2e-smoke-test.yml +++ b/.github/workflows/e2e-smoke-test.yml @@ -2,16 +2,6 @@ name: E2E Smoke Test # Run every 6 hours: at 00:00, 06:00, 12:00, and 18:00 UTC on: - push: - branches: [main] - paths-ignore: - - "docs/**" - - "*.md" - pull_request: - branches: [main] - paths-ignore: - - "docs/**" - - "*.md" schedule: - cron: '0 */6 * * *' workflow_dispatch: # Allow manual triggering From 1505018198cd39d854824ce29123e286ef54d1c3 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 13 Aug 2025 20:15:36 +0000 Subject: [PATCH 7/7] final --- tests/test_tau_bench_airline_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tau_bench_airline_smoke.py b/tests/test_tau_bench_airline_smoke.py index e0eae2ef..e96baabe 100644 --- a/tests/test_tau_bench_airline_smoke.py +++ b/tests/test_tau_bench_airline_smoke.py @@ -68,7 +68,7 @@ def tau_bench_airline_smoke_to_evaluation_row(data: List[Dict[str, Any]]) -> Lis model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}], rollout_processor=default_mcp_gym_rollout_processor, - passed_threshold=0.4, + passed_threshold=0.36, num_runs=1, # Smoke test: single run for quick feedback mode="pointwise", max_concurrent_rollouts=50, # Standard concurrency