---
# End-to-end smoke test workflow.
name: E2E Smoke Test

# Triggers
#   workflow_dispatch: manual run from the Actions UI, with one optional
#                      boolean input.
#   schedule:          cron '0 */6 * * *' fires at minute 0 of every sixth
#                      hour, i.e. 00:00, 06:00, 12:00 and 18:00 UTC.
on:
  workflow_dispatch:
    inputs:
      # NOTE(review): no step visible in this workflow reads
      # `inputs.debug_mode` - confirm whether it is still needed.
      debug_mode:
        type: boolean
        required: false
        default: 'false'
        description: 'Enable debug output'
  schedule:
    - cron: '0 */6 * * *'

jobs:
  # Single job: install the project, run one pytest smoke test, parse the
  # "EP Summary" line from its console output, and fail unless the aggregate
  # score lies inside the accepted band (0.36 <= agg <= 0.6).
  e2e-smoke-test:
    name: E2E Smoke Test
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true

      - name: Install the project
        run: uv sync --locked --all-extras --dev

      # NOTE(review): this installs from the moving `main` branch, so two runs
      # of the same commit can test against different tau2 code. Consider
      # pinning to a tag or commit SHA.
      - name: Install tau2 for testing
        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

      # Runs the smoke test and publishes these step outputs:
      #   test_exit_code  - pytest's exit status
      #   success_rate    - the `agg=` value from the EP Summary line (0 if absent)
      #   lower_bound_met - 1 when success_rate >= 0.36, else 0
      #   upper_bound_met - 1 when success_rate <= 0.6, else 0
      #   threshold_met   - 1 when both bounds are met, else 0
      - name: Run E2E Smoke Test
        id: run_test
        env:
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
          FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
        run: |
          echo "Running e2e smoke test..."

          # Keep going after a failing command so the summary can still be
          # parsed and reported; the verdict is made by the validate step.
          set +e

          uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \
            -v --tb=short --durations=10 \
            --ep-print-summary \
            --ep-summary-json=ep_summary.json 2>&1 | tee test_output.log
          # `$?` here would be tee's status (pipefail is not enabled), which
          # hides pytest failures. PIPESTATUS[0] is pytest's own exit status
          # and must be read immediately after the pipeline.
          TEST_EXIT_CODE=${PIPESTATUS[0]}

          echo "test_exit_code=$TEST_EXIT_CODE" >> "$GITHUB_OUTPUT"

          # List generated files for debugging
          echo "📁 Generated files:"
          ls -la *.json 2>/dev/null || echo "No JSON files found"
          ls -la ep_summary* 2>/dev/null || echo "No ep_summary files found"

          # Parse EP summary from terminal output (more reliable than JSON files)
          if [ -f test_output.log ]; then
            echo "📋 Parsing EP summary from terminal output..."

            # Show the terminal output for debugging
            echo "Terminal output:"
            cat test_output.log
            echo ""

            # Keep only the last matching line: a multi-line value would
            # corrupt the key=value records appended to GITHUB_OUTPUT.
            EP_SUMMARY_LINE=$(grep "EP Summary |" test_output.log 2>/dev/null | tail -n 1)

            if [ -n "$EP_SUMMARY_LINE" ]; then
              echo "Found EP Summary line:"
              echo "$EP_SUMMARY_LINE"

              # Pull fields out of: "EP Summary | ... agg=0.420 ... runs=N ... rows=M"
              SUCCESS_RATE=$(echo "$EP_SUMMARY_LINE" | grep -o "agg=[0-9.]*" | head -n 1 | cut -d= -f2)
              NUM_RUNS=$(echo "$EP_SUMMARY_LINE" | grep -o "runs=[0-9]*" | head -n 1 | cut -d= -f2)
              NUM_ROWS=$(echo "$EP_SUMMARY_LINE" | grep -o "rows=[0-9]*" | head -n 1 | cut -d= -f2)

              # Default missing fields to 0. (A trailing `|| echo 0` on the
              # pipelines above would never fire, because the pipeline's
              # status is cut's and cut succeeds on empty input.)
              SUCCESS_RATE=${SUCCESS_RATE:-0}
              NUM_RUNS=${NUM_RUNS:-0}
              NUM_ROWS=${NUM_ROWS:-0}

              echo "success_rate=$SUCCESS_RATE" >> "$GITHUB_OUTPUT"

              # Check if success rate meets thresholds (36% - 60% acceptable range)
              LOWER_BOUND=0.36  # 36%
              UPPER_BOUND=0.6   # 60%
              # bc prints 1 for a true comparison and 0 for a false one.
              LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l 2>/dev/null || echo "0")
              UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l 2>/dev/null || echo "0")

              # Combine the two flags in the shell rather than through bc, so
              # an empty or malformed flag yields 0 instead of a bc error.
              if [ "$LOWER_BOUND_MET" = "1" ] && [ "$UPPER_BOUND_MET" = "1" ]; then
                THRESHOLD_MET=1
              else
                THRESHOLD_MET=0
              fi

              echo "lower_bound_met=$LOWER_BOUND_MET" >> "$GITHUB_OUTPUT"
              echo "upper_bound_met=$UPPER_BOUND_MET" >> "$GITHUB_OUTPUT"
              echo "threshold_met=$THRESHOLD_MET" >> "$GITHUB_OUTPUT"

              echo "📊 Evaluation Summary (from terminal output):"
              echo " - Success rate: $(echo "$SUCCESS_RATE * 100" | bc -l 2>/dev/null || echo "unknown")%"
              echo " - Dataset rows evaluated: $NUM_ROWS"
              echo " - Number of runs: $NUM_RUNS"
              echo " - Lower bound (≥36%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
              echo " - Upper bound (≤60%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
              echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
            else
              echo "❌ No EP Summary line found in terminal output"
              echo "threshold_met=0" >> "$GITHUB_OUTPUT"
              echo "success_rate=0" >> "$GITHUB_OUTPUT"
            fi
          else
            echo "❌ No terminal output file found"
            echo "threshold_met=0" >> "$GITHUB_OUTPUT"
            echo "success_rate=0" >> "$GITHUB_OUTPUT"
          fi

      # Runs even when an earlier step failed, so logs are always retrievable.
      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: e2e-smoke-test-results-${{ github.run_number }}
          path: |
            test_output.log
            ep_summary*.json
            *.log
          retention-days: 7

      # Turns the outputs of `run_test` into the job verdict. The job passes
      # whenever threshold_met is 1; a non-zero pytest exit code alone only
      # produces a warning.
      - name: Validate test results
        if: always()
        # Step outputs are handed over as environment variables instead of
        # being spliced into the script text via ${{ }} expressions.
        env:
          TEST_EXIT_CODE: ${{ steps.run_test.outputs.test_exit_code }}
          THRESHOLD_MET: ${{ steps.run_test.outputs.threshold_met }}
          LOWER_BOUND_MET: ${{ steps.run_test.outputs.lower_bound_met }}
          UPPER_BOUND_MET: ${{ steps.run_test.outputs.upper_bound_met }}
          SUCCESS_RATE: ${{ steps.run_test.outputs.success_rate }}
        run: |
          echo "Validating test results against thresholds..."

          echo "Test exit code: $TEST_EXIT_CODE"
          echo "Threshold met (36%-60%): $THRESHOLD_MET"
          echo "Lower bound met (≥36%): $LOWER_BOUND_MET"
          echo "Upper bound met (≤60%): $UPPER_BOUND_MET"
          echo "Success rate: $SUCCESS_RATE"

          # Fail the job if thresholds weren't met; report execution problems too
          if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then
            echo "❌ E2E smoke test FAILED"
            echo " - Test execution failed (exit code: $TEST_EXIT_CODE)"
            echo " - Success rate outside acceptable range (required: 36%-60%, actual: ${SUCCESS_RATE:-unknown})"
            exit 1
          elif [ "$TEST_EXIT_CODE" != "0" ]; then
            # Reaching this branch implies THRESHOLD_MET is 1.
            echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
            echo " - Test exit code: $TEST_EXIT_CODE"
            echo " - Thresholds met: $THRESHOLD_MET"
            echo "✅ Thresholds met despite execution issues - considering this a pass"
          elif [ "$THRESHOLD_MET" != "1" ]; then
            # Determine which bound was violated
            if [ "$LOWER_BOUND_MET" != "1" ]; then
              echo "❌ E2E smoke test FAILED - success rate too low"
              echo " - Success rate: ${SUCCESS_RATE:-unknown}"
              echo " - Required: ≥36%"
            elif [ "$UPPER_BOUND_MET" != "1" ]; then
              echo "❌ E2E smoke test FAILED - success rate suspiciously high"
              echo " - Success rate: ${SUCCESS_RATE:-unknown}"
              echo " - Maximum expected: ≤60%"
              echo " - This may indicate test issues or unrealistic performance"
            else
              echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
              echo " - Success rate: ${SUCCESS_RATE:-unknown}"
              echo " - Required range: 36%-60%"
            fi
            exit 1
          else
            echo "✅ E2E smoke test PASSED"
            echo " - Success rate: ${SUCCESS_RATE:-unknown}"
            echo " - Within acceptable range: 36%-60%"
          fi