# Workflow: runs the E2E smoke test on a schedule and on manual dispatch.
name: E2E Smoke Test

# Run every 6 hours: at 00:00, 06:00, 12:00, and 18:00 UTC
on:
  schedule:
    - cron: '0 */6 * * *'
  workflow_dispatch:  # Allow manual triggering
    inputs:
      # NOTE(review): no step visible in this workflow reads this input yet.
      debug_mode:
        description: 'Enable debug output'
        required: false
        # A real boolean, matching `type: boolean` (was the string 'false').
        default: false
        type: boolean

jobs:
  # Runs the tau-bench airline smoke test, checks that the aggregated success
  # rate lies in the 40%-90% band, and opens a GitHub issue if the job fails.
  e2e-smoke-test:
    name: E2E Smoke Test
    runs-on: ubuntu-latest
    # Explicit token scopes: checkout reads the repository and the
    # failure-reporting step must be able to create an issue. Without this
    # block the issue step fails wherever the default token is read-only.
    permissions:
      contents: read
      issues: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true

      - name: Install the project
        run: uv sync --locked --all-extras --dev

      # NOTE(review): this tracks the moving `main` branch, so two runs may
      # test against different tau2 code; consider pinning a tag or commit.
      - name: Install tau2 for testing
        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

      # Outputs written by this step:
      #   test_exit_code  - pytest's exit status
      #   summary_found   - 1 if ep_summary.json was produced, else 0
      #   success_rate    - `agg_score` from the summary (0 if unavailable)
      #   lower_bound_met / upper_bound_met / threshold_met - 1 or 0
      #     (the two *_bound_met outputs are only set when a summary exists)
      - name: Run E2E Smoke Test
        id: run_test
        env:
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
          FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
        run: |
          echo "Running e2e smoke test..."

          # Keep going after a pytest failure so its exit code and the
          # summary can still be exported as step outputs below.
          set +e

          uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \
            -v --tb=short --durations=10 \
            --ep-print-summary \
            --ep-summary-json=ep_summary.json

          TEST_EXIT_CODE=$?

          echo "test_exit_code=$TEST_EXIT_CODE" >> "$GITHUB_OUTPUT"

          # Parse evaluation protocol summary if it exists
          if [ -f ep_summary.json ]; then
            echo "EP Summary found, parsing..."
            echo "summary_found=1" >> "$GITHUB_OUTPUT"

            # Log the full summary for debugging (raw dump if jq cannot parse it)
            echo "EP Summary contents:"
            jq . ep_summary.json 2>/dev/null || cat ep_summary.json

            # 'agg_score' is read as the aggregated success rate; a missing
            # or unparsable value falls back to 0.
            SUCCESS_RATE=$(jq -r '.agg_score // 0' ep_summary.json 2>/dev/null || echo "0")

            echo "success_rate=$SUCCESS_RATE" >> "$GITHUB_OUTPUT"

            # Check if success rate meets thresholds (40% - 90% acceptable range).
            # bc prints 1 for a true comparison and 0 for a false one.
            LOWER_BOUND=0.4  # 40%
            UPPER_BOUND=0.9  # 90%
            LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l)
            UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l)
            THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l)

            echo "lower_bound_met=$LOWER_BOUND_MET" >> "$GITHUB_OUTPUT"
            echo "upper_bound_met=$UPPER_BOUND_MET" >> "$GITHUB_OUTPUT"
            echo "threshold_met=$THRESHOLD_MET" >> "$GITHUB_OUTPUT"

            # Extract additional info for display
            NUM_ROWS=$(jq -r '.rows // 0' ep_summary.json 2>/dev/null || echo "0")
            NUM_RUNS=$(jq -r '.num_runs // 0' ep_summary.json 2>/dev/null || echo "0")

            echo "📊 Evaluation Summary:"
            echo " - Success rate (agg_score): $(echo "$SUCCESS_RATE * 100" | bc -l)%"
            echo " - Dataset rows evaluated: $NUM_ROWS"
            echo " - Number of runs: $NUM_RUNS"
            echo " - Lower bound (≥40%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
            echo " - Upper bound (≤90%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
            echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
          else
            echo "❌ No EP summary file found"
            echo "summary_found=0" >> "$GITHUB_OUTPUT"
            echo "threshold_met=0" >> "$GITHUB_OUTPUT"
            echo "success_rate=0" >> "$GITHUB_OUTPUT"
          fi

      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: e2e-smoke-test-results-${{ github.run_number }}
          path: |
            ep_summary.json
            *.log
          retention-days: 7

      # Decides the job verdict. The step outputs are handed over through
      # `env:` so their text is never spliced into the shell script itself.
      - name: Validate test results
        if: always()
        env:
          TEST_EXIT_CODE: ${{ steps.run_test.outputs.test_exit_code }}
          THRESHOLD_MET: ${{ steps.run_test.outputs.threshold_met }}
          LOWER_BOUND_MET: ${{ steps.run_test.outputs.lower_bound_met }}
          UPPER_BOUND_MET: ${{ steps.run_test.outputs.upper_bound_met }}
          SUCCESS_RATE: ${{ steps.run_test.outputs.success_rate }}
          SUMMARY_FOUND: ${{ steps.run_test.outputs.summary_found }}
        run: |
          echo "Validating test results against thresholds..."

          echo "Test exit code: $TEST_EXIT_CODE"
          echo "Threshold met (40%-90%): $THRESHOLD_MET"
          echo "Lower bound met (≥40%): $LOWER_BOUND_MET"
          echo "Upper bound met (≤90%): $UPPER_BOUND_MET"
          echo "Success rate: $SUCCESS_RATE"

          # The verdict is driven by the thresholds: the job fails whenever
          # they are not met, while a non-zero pytest exit code alone only
          # produces a warning.
          if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then
            echo "❌ E2E smoke test FAILED"
            echo " - Test execution failed (exit code: ${TEST_EXIT_CODE:-unknown})"
            if [ "$SUMMARY_FOUND" = "1" ]; then
              echo " - Success rate outside acceptable range (required: 40%-90%, actual: ${SUCCESS_RATE:-unknown})"
            else
              echo " - No EP summary was produced, so the success rate could not be evaluated"
            fi
            exit 1
          elif [ "$TEST_EXIT_CODE" != "0" ]; then
            # Thresholds are necessarily met here; the branch above handled
            # the case where they were not.
            echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
            echo " - Test exit code: $TEST_EXIT_CODE"
            echo " - Thresholds met: $THRESHOLD_MET"
            echo "✅ Thresholds met despite execution issues - considering this a pass"
          elif [ "$THRESHOLD_MET" != "1" ]; then
            # Determine why the thresholds were not met
            if [ "$SUMMARY_FOUND" != "1" ]; then
              echo "❌ E2E smoke test FAILED - no EP summary was produced"
              echo " - The success rate could not be evaluated"
            elif [ "$LOWER_BOUND_MET" != "1" ]; then
              echo "❌ E2E smoke test FAILED - success rate too low"
              echo " - Success rate: ${SUCCESS_RATE:-unknown}"
              echo " - Required: ≥40%"
            elif [ "$UPPER_BOUND_MET" != "1" ]; then
              echo "❌ E2E smoke test FAILED - success rate suspiciously high"
              echo " - Success rate: ${SUCCESS_RATE:-unknown}"
              echo " - Maximum expected: ≤90%"
              echo " - This may indicate test issues or unrealistic performance"
            else
              echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
              echo " - Success rate: ${SUCCESS_RATE:-unknown}"
              echo " - Required range: 40%-90%"
            fi
            exit 1
          else
            echo "✅ E2E smoke test PASSED"
            echo " - Success rate: ${SUCCESS_RATE:-unknown}"
            echo " - Within acceptable range: 40%-90%"
          fi

      # Opens an issue describing the failure. Requires `issues: write`
      # (granted in the job-level `permissions` block).
      # NOTE(review): a new issue is created on every failing run; consider
      # de-duplicating against already-open issues.
      - name: Create GitHub issue on failure
        if: failure()
        uses: actions/github-script@v7
        env:
          TEST_EXIT_CODE: ${{ steps.run_test.outputs.test_exit_code }}
          SUCCESS_RATE: ${{ steps.run_test.outputs.success_rate }}
          THRESHOLD_MET: ${{ steps.run_test.outputs.threshold_met }}
          LOWER_BOUND_MET: ${{ steps.run_test.outputs.lower_bound_met }}
          UPPER_BOUND_MET: ${{ steps.run_test.outputs.upper_bound_met }}
          SUMMARY_FOUND: ${{ steps.run_test.outputs.summary_found }}
        with:
          script: |
            // Step outputs are read from the environment instead of being
            // substituted into this script's source text.
            const env = process.env;
            const testResults = {
              exitCode: env.TEST_EXIT_CODE || '',
              successRate: env.SUCCESS_RATE || '',
              thresholdMet: env.THRESHOLD_MET || '',
              lowerBoundMet: env.LOWER_BOUND_MET || '',
              upperBoundMet: env.UPPER_BOUND_MET || '',
              summaryFound: env.SUMMARY_FOUND === '1'
            };

            const yesNo = (flag) => (flag === '1' ? '✅ Yes' : '❌ No');

            // Only report a percentage when a summary actually produced one.
            const parsedRate = Number.parseFloat(testResults.successRate);
            const successRateText =
              testResults.summaryFound && Number.isFinite(parsedRate)
                ? `${(parsedRate * 100).toFixed(1)}%`
                : 'Unknown';

            const runUrl = `${env.GITHUB_SERVER_URL}/${env.GITHUB_REPOSITORY}/actions/runs/${env.GITHUB_RUN_ID}`;

            // Pick the diagnosis that matches what actually went wrong.
            let requiredAction;
            if (testResults.thresholdMet === '1') {
              requiredAction = '🔧 **Infrastructure Issue:** Tests failed to execute properly despite potentially meeting performance thresholds.';
            } else if (!testResults.summaryFound) {
              requiredAction = '🔧 **Missing Results:** No EP summary file was produced, so the success rate could not be evaluated. Check the workflow logs for execution errors.';
            } else if (testResults.lowerBoundMet !== '1') {
              requiredAction = '🔍 **Performance Issue:** The success rate is below the required 40% minimum threshold. This indicates potential issues with model performance or test environment.';
            } else if (testResults.upperBoundMet !== '1') {
              requiredAction = '⚠️ **Suspiciously High Performance:** The success rate exceeds 90%, which may indicate test issues, data leakage, or unrealistic performance.';
            } else {
              requiredAction = '🔍 **Performance Issue:** The success rate is outside the acceptable 40%-90% range.';
            }

            const now = new Date();
            const title = `🚨 E2E Smoke Test Failed (${now.toISOString().split('T')[0]})`;

            // Joined line by line so the Markdown carries no stray indentation.
            const body = [
              '## E2E Smoke Test Failure Report',
              '',
              '**Test:** E2E Smoke Test',
              `**Date:** ${now.toISOString()}`,
              `**Workflow Run:** [#${env.GITHUB_RUN_NUMBER}](${runUrl})`,
              '',
              '### Test Results',
              `- **Success Rate:** ${successRateText}`,
              `- **Lower Bound Met (≥40%):** ${yesNo(testResults.lowerBoundMet)}`,
              `- **Upper Bound Met (≤90%):** ${yesNo(testResults.upperBoundMet)}`,
              `- **Within Range (40%-90%):** ${yesNo(testResults.thresholdMet)}`,
              `- **Test Exit Code:** ${testResults.exitCode || 'Unknown'}`,
              '',
              '### Required Actions',
              '',
              requiredAction,
              '',
              '### Next Steps',
              `1. Review the [workflow logs](${runUrl}) for detailed error information`,
              '2. Check if this is a temporary issue by re-running the workflow manually',
              '3. If persistent, investigate potential causes:',
              '   - Model performance degradation',
              '   - Test environment configuration',
              '   - API key or service availability issues',
              '',
              '### Auto-generated',
              'This issue was automatically created by the E2E smoke test workflow.'
            ].join('\n');

            // Create the issue
            await github.rest.issues.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              title: title,
              body: body,
              labels: ['bug', 'e2e-test', 'automated', 'smoke-test']
            });