Skip to content

Commit a8445a1

Browse files
committed
e2e smoke test
1 parent 7edd65e commit a8445a1

File tree

3 files changed

+469
-1
lines changed

3 files changed

+469
-1
lines changed
Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
name: E2E Smoke Test
2+
3+
# Run every 6 hours: at 00:00, 06:00, 12:00, and 18:00 UTC
on:
  schedule:
    - cron: '0 */6 * * *'
  workflow_dispatch: # Allow manual triggering
    inputs:
      # NOTE(review): no step in this workflow reads `inputs.debug_mode` yet;
      # wire it into the test step or drop it once callers no longer pass it.
      debug_mode:
        description: 'Enable debug output'
        required: false
        # Real boolean to match `type: boolean` (was the string 'false').
        default: false
        type: boolean
14+
15+
jobs:
  e2e-smoke-test:
    name: E2E Smoke Test
    runs-on: ubuntu-latest
    # Least-privilege token scopes. Checkout needs to read the repository, and
    # the "Create GitHub issue on failure" step calls issues.create, which is
    # rejected when the default GITHUB_TOKEN is read-only.
    permissions:
      contents: read
      issues: write
19+
20+
steps:
21+
# Check out the repository so the project and its tests are available.
- name: Checkout repository
22+
uses: actions/checkout@v4
23+
with:
24+
# 0 requests the full git history instead of a shallow clone.
# NOTE(review): nothing visible in this workflow needs history - confirm.
fetch-depth: 0
25+
26+
# Provide the interpreter; the version is quoted so YAML keeps it a string.
- name: Set up Python 3.12
27+
uses: actions/setup-python@v5
28+
with:
29+
python-version: "3.12"
30+
31+
# Install the uv package manager with its cache enabled.
- name: Install uv
32+
uses: astral-sh/setup-uv@v6
33+
with:
34+
enable-cache: true
35+
36+
# Install the project with all extras and dev dependencies via uv.
- name: Install the project
37+
run: uv sync --locked --all-extras --dev
38+
39+
# Install tau2-bench straight from GitHub for the smoke test.
# NOTE(review): `@main` is a moving target, so scheduled runs are not
# reproducible; consider pinning to a tag or commit SHA.
- name: Install tau2 for testing
40+
run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main
41+
42+
- name: Run E2E Smoke Test
  id: run_test
  env:
    FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
    FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
  # Runs the smoke test and publishes these step outputs for later steps:
  #   test_exit_code, success_rate, lower_bound_met, upper_bound_met,
  #   threshold_met
  # This step never fails on its own; "Validate test results" decides the
  # job outcome from the outputs.
  run: |
    echo "Running e2e smoke test..."

    # Run the test and capture both stdout and exit code
    set +e  # Don't exit on failure

    uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \
      -v --tb=short --durations=10 \
      --ep-print-summary \
      --ep-summary-json=ep_summary.json

    TEST_EXIT_CODE=$?

    echo "test_exit_code=$TEST_EXIT_CODE" >> "$GITHUB_OUTPUT"

    # Parse evaluation protocol summary if it exists
    if [ -f ep_summary.json ]; then
      echo "EP Summary found, parsing..."

      # Log the full summary for debugging (raw dump if it is not valid JSON)
      echo "EP Summary contents:"
      jq . ep_summary.json 2>/dev/null || cat ep_summary.json

      # Extract success rate from EP summary (this contains the actual accuracy/success rate)
      # The EP summary uses 'agg_score' for the aggregated success rate
      SUCCESS_RATE=$(jq -r '.agg_score // 0' ep_summary.json 2>/dev/null || echo "0")

      # bc only understands plain decimals. Anything else (a string, or
      # scientific notation such as 1e-05) would make the comparisons below
      # error out and leave the bound outputs empty, so normalise it to 0.
      if ! [[ "$SUCCESS_RATE" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        echo "⚠️ agg_score '$SUCCESS_RATE' is not a plain decimal number; treating it as 0"
        SUCCESS_RATE=0
      fi

      echo "success_rate=$SUCCESS_RATE" >> "$GITHUB_OUTPUT"

      # Check if success rate meets thresholds (40% - 90% acceptable range)
      LOWER_BOUND=0.4  # 40%
      UPPER_BOUND=0.9  # 90%
      LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l)
      UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l)

      # Combine in the shell rather than with bc's GNU-only `&&` operator
      if [ "$LOWER_BOUND_MET" = "1" ] && [ "$UPPER_BOUND_MET" = "1" ]; then
        THRESHOLD_MET=1
      else
        THRESHOLD_MET=0
      fi

      echo "lower_bound_met=$LOWER_BOUND_MET" >> "$GITHUB_OUTPUT"
      echo "upper_bound_met=$UPPER_BOUND_MET" >> "$GITHUB_OUTPUT"
      echo "threshold_met=$THRESHOLD_MET" >> "$GITHUB_OUTPUT"

      # Extract additional info for display
      NUM_ROWS=$(jq -r '.rows // 0' ep_summary.json 2>/dev/null || echo "0")
      NUM_RUNS=$(jq -r '.num_runs // 0' ep_summary.json 2>/dev/null || echo "0")

      echo "📊 Evaluation Summary:"
      echo " - Success rate (agg_score): $(echo "$SUCCESS_RATE * 100" | bc -l)%"
      echo " - Dataset rows evaluated: $NUM_ROWS"
      echo " - Number of runs: $NUM_RUNS"
      echo " - Lower bound (≥40%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
      echo " - Upper bound (≤90%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
      echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
    else
      # No summary means the thresholds cannot have been met
      echo "❌ No EP summary file found"
      echo "threshold_met=0" >> "$GITHUB_OUTPUT"
      echo "success_rate=0" >> "$GITHUB_OUTPUT"
    fi
105+
106+
# Upload the summary and any log files as an artifact, even after a failure.
- name: Upload test results
107+
if: always()
108+
uses: actions/upload-artifact@v4
109+
with:
110+
# The run number is appended so each workflow run gets its own artifact name.
name: e2e-smoke-test-results-${{ github.run_number }}
111+
path: |
112+
ep_summary.json
113+
*.log
114+
retention-days: 7
115+
116+
- name: Validate test results
  if: always()
  # Step outputs are handed over as environment variables instead of being
  # interpolated into the script text, so their content is never parsed as
  # shell code. They are empty when the test step did not run at all.
  env:
    TEST_EXIT_CODE: ${{ steps.run_test.outputs.test_exit_code }}
    THRESHOLD_MET: ${{ steps.run_test.outputs.threshold_met }}
    LOWER_BOUND_MET: ${{ steps.run_test.outputs.lower_bound_met }}
    UPPER_BOUND_MET: ${{ steps.run_test.outputs.upper_bound_met }}
    SUCCESS_RATE: ${{ steps.run_test.outputs.success_rate }}
  run: |
    echo "Validating test results against thresholds..."

    echo "Test exit code: $TEST_EXIT_CODE"
    echo "Threshold met (40%-90%): $THRESHOLD_MET"
    echo "Lower bound met (≥40%): $LOWER_BOUND_MET"
    echo "Upper bound met (≤90%): $UPPER_BOUND_MET"
    echo "Success rate: $SUCCESS_RATE"

    # The job outcome is decided by the thresholds alone: a non-zero pytest
    # exit code is tolerated (with a warning) as long as the success rate is
    # inside the acceptable range.
    if [ "$THRESHOLD_MET" = "1" ]; then
      if [ "$TEST_EXIT_CODE" != "0" ]; then
        echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
        echo " - Test exit code: $TEST_EXIT_CODE"
        echo " - Thresholds met: $THRESHOLD_MET"
        echo "✅ Thresholds met despite execution issues - considering this a pass"
      else
        echo "✅ E2E smoke test PASSED"
        echo " - Success rate: ${SUCCESS_RATE:-unknown}"
        echo " - Within acceptable range: 40%-90%"
      fi
      exit 0
    fi

    # From here on the thresholds were not met, so the job fails.
    if [ "$TEST_EXIT_CODE" != "0" ]; then
      echo "❌ E2E smoke test FAILED"
      echo " - Test execution failed (exit code: $TEST_EXIT_CODE)"
      echo " - Success rate outside acceptable range (required: 40%-90%, actual: ${SUCCESS_RATE:-unknown})"
    elif [ "$LOWER_BOUND_MET" != "1" ]; then
      echo "❌ E2E smoke test FAILED - success rate too low"
      echo " - Success rate: ${SUCCESS_RATE:-unknown}"
      echo " - Required: ≥40%"
    elif [ "$UPPER_BOUND_MET" != "1" ]; then
      echo "❌ E2E smoke test FAILED - success rate suspiciously high"
      echo " - Success rate: ${SUCCESS_RATE:-unknown}"
      echo " - Maximum expected: ≤90%"
      echo " - This may indicate test issues or unrealistic performance"
    else
      echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
      echo " - Success rate: ${SUCCESS_RATE:-unknown}"
      echo " - Required range: 40%-90%"
    fi
    exit 1
171+
172+
- name: Create GitHub issue on failure
  if: failure()
  uses: actions/github-script@v7
  # Step outputs reach the script through the environment rather than being
  # pasted into the JavaScript source, so their content cannot alter the code.
  env:
    TEST_EXIT_CODE: ${{ steps.run_test.outputs.test_exit_code }}
    SUCCESS_RATE: ${{ steps.run_test.outputs.success_rate }}
    THRESHOLD_MET: ${{ steps.run_test.outputs.threshold_met }}
    LOWER_BOUND_MET: ${{ steps.run_test.outputs.lower_bound_met }}
    UPPER_BOUND_MET: ${{ steps.run_test.outputs.upper_bound_met }}
  with:
    script: |
      // Outputs are empty strings when the test step never ran (for example
      // when checkout or dependency installation failed first).
      const testResults = {
        exitCode: process.env.TEST_EXIT_CODE || '',
        successRate: process.env.SUCCESS_RATE || '',
        thresholdMet: process.env.THRESHOLD_MET || '',
        lowerBoundMet: process.env.LOWER_BOUND_MET || '',
        upperBoundMet: process.env.UPPER_BOUND_MET || ''
      };

      const runNumber = '${{ github.run_number }}';
      const runUrl = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';

      // Render a 1/0 flag, distinguishing "not measured" from "not met".
      const flag = (value) =>
        value === '' ? '❔ Unknown' : (value === '1' ? '✅ Yes' : '❌ No');

      const rate = parseFloat(testResults.successRate);
      const successRateText = Number.isNaN(rate) ? 'Unknown' : (rate * 100).toFixed(1) + '%';

      const testRan = testResults.exitCode !== '';
      const boundsMeasured = testResults.lowerBoundMet !== '' || testResults.upperBoundMet !== '';

      // Pick the explanation that matches what actually happened, so that a
      // setup failure is not reported as a model performance problem.
      let requiredAction;
      if (!testRan) {
        requiredAction = '🔧 **Infrastructure Issue:** The smoke test step produced no results, so the workflow failed before the test could run (for example during checkout or dependency installation). Performance was not measured.';
      } else if (testResults.thresholdMet === '1') {
        requiredAction = '🔧 **Infrastructure Issue:** Tests failed to execute properly despite potentially meeting performance thresholds.';
      } else if (!boundsMeasured) {
        requiredAction = '🔧 **Infrastructure Issue:** The test ran but produced no evaluation summary, so the success rate could not be measured.';
      } else if (testResults.lowerBoundMet !== '1') {
        requiredAction = '🔍 **Performance Issue:** The success rate is below the required 40% minimum threshold. This indicates potential issues with model performance or test environment.';
      } else if (testResults.upperBoundMet !== '1') {
        requiredAction = '⚠️ **Suspiciously High Performance:** The success rate exceeds 90%, which may indicate test issues, data leakage, or unrealistic performance.';
      } else {
        requiredAction = '🔍 **Performance Issue:** The success rate is outside the acceptable 40%-90% range.';
      }

      const title = `🚨 E2E Smoke Test Failed (${new Date().toISOString().split('T')[0]})`;

      // Built line by line so that YAML/JS indentation never leaks into the
      // Markdown (indented lines would render as code blocks).
      const body = [
        '## E2E Smoke Test Failure Report',
        '',
        '**Test:** E2E Smoke Test',
        `**Date:** ${new Date().toISOString()}`,
        `**Workflow Run:** [#${runNumber}](${runUrl})`,
        '',
        '### Test Results',
        `- **Success Rate:** ${successRateText}`,
        `- **Lower Bound Met (≥40%):** ${flag(testResults.lowerBoundMet)}`,
        `- **Upper Bound Met (≤90%):** ${flag(testResults.upperBoundMet)}`,
        `- **Within Range (40%-90%):** ${flag(testResults.thresholdMet)}`,
        `- **Test Exit Code:** ${testResults.exitCode || 'Unknown'}`,
        '',
        '### Required Actions',
        '',
        requiredAction,
        '',
        '### Next Steps',
        `1. Review the [workflow logs](${runUrl}) for detailed error information`,
        '2. Check if this is a temporary issue by re-running the workflow manually',
        '3. If persistent, investigate potential causes:',
        '   - Model performance degradation',
        '   - Test environment configuration',
        '   - API key or service availability issues',
        '',
        '### Auto-generated',
        'This issue was automatically created by the E2E smoke test workflow.',
        ''
      ].join('\n');

      // Create the issue
      await github.rest.issues.create({
        owner: context.repo.owner,
        repo: context.repo.repo,
        title: title,
        body: body,
        labels: ['bug', 'e2e-test', 'automated', 'smoke-test']
      });

tests/pytest/test_tau_bench_airline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
6565
input_dataset=["tests/pytest/data/airline_dataset.jsonl"],
6666
dataset_adapter=tau_bench_airline_to_evaluation_row,
6767
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
68-
rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096, "reasoning_effort": "low"}],
68+
rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}],
6969
rollout_processor=default_mcp_gym_rollout_processor,
7070
passed_threshold={"success": 0.4, "standard_deviation": 0.1},
7171
num_runs=8,

0 commit comments

Comments
 (0)