Skip to content

Commit c3574f9

Browse files
authored
E2E Smoke Test (#75)
* e2e smoke test * temp adding * update * test * adjust bounds * change back to regular schedule * final
1 parent 7edd65e commit c3574f9

File tree

3 files changed

+425
-1
lines changed

3 files changed

+425
-1
lines changed
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
name: E2E Smoke Test
2+
3+
# Run every 6 hours: at 00:00, 06:00, 12:00, and 18:00 UTC
4+
on:
5+
schedule:
6+
- cron: '0 */6 * * *'
7+
workflow_dispatch: # Allow manual triggering
8+
inputs:
9+
debug_mode:
10+
description: 'Enable debug output'
11+
required: false
12+
default: 'false'
13+
type: boolean
14+
15+
jobs:
16+
e2e-smoke-test:
17+
name: E2E Smoke Test
18+
runs-on: ubuntu-latest
19+
20+
steps:
21+
- name: Checkout repository
22+
uses: actions/checkout@v4
23+
with:
24+
fetch-depth: 0
25+
26+
- name: Set up Python 3.12
27+
uses: actions/setup-python@v5
28+
with:
29+
python-version: "3.12"
30+
31+
- name: Install uv
32+
uses: astral-sh/setup-uv@v6
33+
with:
34+
enable-cache: true
35+
36+
- name: Install the project
37+
run: uv sync --locked --all-extras --dev
38+
39+
- name: Install tau2 for testing
40+
run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main
41+
42+
- name: Run E2E Smoke Test
43+
id: run_test
44+
env:
45+
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
46+
FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
47+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
48+
PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
49+
run: |
50+
echo "Running e2e smoke test..."
51+
52+
# Run the test and capture both stdout and exit code
53+
set +e # Don't exit on failure
54+
55+
uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \
56+
-v --tb=short --durations=10 \
57+
--ep-print-summary \
58+
--ep-summary-json=ep_summary.json 2>&1 | tee test_output.log
59+
60+
TEST_EXIT_CODE=${PIPESTATUS[0]} # exit status of pytest (first pipeline stage); plain $? would report tee's status
61+
62+
echo "test_exit_code=$TEST_EXIT_CODE" >> $GITHUB_OUTPUT
63+
64+
# List generated files for debugging
65+
echo "📁 Generated files:"
66+
ls -la *.json 2>/dev/null || echo "No JSON files found"
67+
ls -la ep_summary* 2>/dev/null || echo "No ep_summary files found"
68+
69+
# Parse EP summary from terminal output (more reliable than JSON files)
70+
if [ -f test_output.log ]; then
71+
echo "📋 Parsing EP summary from terminal output..."
72+
73+
# Show the terminal output for debugging
74+
echo "Terminal output:"
75+
cat test_output.log
76+
echo ""
77+
78+
# Extract the EP Summary line from the terminal output
79+
EP_SUMMARY_LINE=$(grep "EP Summary |" test_output.log 2>/dev/null || echo "")
80+
81+
if [ -n "$EP_SUMMARY_LINE" ]; then
82+
echo "Found EP Summary line:"
83+
echo "$EP_SUMMARY_LINE"
84+
85+
# Parse the agg score from the line: "EP Summary | ... agg=0.420 ..."
86+
SUCCESS_RATE=$(echo "$EP_SUMMARY_LINE" | grep -o "agg=[0-9.]*" | cut -d= -f2 2>/dev/null || echo "0")
87+
88+
# Extract other info
89+
NUM_RUNS=$(echo "$EP_SUMMARY_LINE" | grep -o "runs=[0-9]*" | cut -d= -f2 2>/dev/null || echo "0")
90+
NUM_ROWS=$(echo "$EP_SUMMARY_LINE" | grep -o "rows=[0-9]*" | cut -d= -f2 2>/dev/null || echo "0")
91+
92+
echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT
93+
94+
# Check if success rate meets thresholds (36% - 60% acceptable range)
95+
LOWER_BOUND=0.36 # 36%
96+
UPPER_BOUND=0.6 # 60%
97+
LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l 2>/dev/null || echo "0")
98+
UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l 2>/dev/null || echo "0")
99+
THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l 2>/dev/null || echo "0")
100+
101+
echo "lower_bound_met=$LOWER_BOUND_MET" >> $GITHUB_OUTPUT
102+
echo "upper_bound_met=$UPPER_BOUND_MET" >> $GITHUB_OUTPUT
103+
echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT
104+
105+
echo "📊 Evaluation Summary (from terminal output):"
106+
echo " - Success rate: $(echo "$SUCCESS_RATE * 100" | bc -l 2>/dev/null || echo "unknown")%"
107+
echo " - Dataset rows evaluated: $NUM_ROWS"
108+
echo " - Number of runs: $NUM_RUNS"
109+
echo " - Lower bound (≥36%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
110+
echo " - Upper bound (≤60%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
111+
echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
112+
else
113+
echo "❌ No EP Summary line found in terminal output"
114+
echo "threshold_met=0" >> $GITHUB_OUTPUT
115+
echo "success_rate=0" >> $GITHUB_OUTPUT
116+
fi
117+
else
118+
echo "❌ No terminal output file found"
119+
echo "threshold_met=0" >> $GITHUB_OUTPUT
120+
echo "success_rate=0" >> $GITHUB_OUTPUT
121+
fi
122+
123+
- name: Upload test results
124+
if: always()
125+
uses: actions/upload-artifact@v4
126+
with:
127+
name: e2e-smoke-test-results-${{ github.run_number }}
128+
path: |
129+
test_output.log
130+
ep_summary*.json
131+
*.log
132+
retention-days: 7
133+
134+
- name: Validate test results
135+
if: always()
136+
run: |
137+
echo "Validating test results against thresholds..."
138+
139+
TEST_EXIT_CODE="${{ steps.run_test.outputs.test_exit_code }}"
140+
THRESHOLD_MET="${{ steps.run_test.outputs.threshold_met }}"
141+
LOWER_BOUND_MET="${{ steps.run_test.outputs.lower_bound_met }}"
142+
UPPER_BOUND_MET="${{ steps.run_test.outputs.upper_bound_met }}"
143+
SUCCESS_RATE="${{ steps.run_test.outputs.success_rate }}"
144+
145+
echo "Test exit code: $TEST_EXIT_CODE"
146+
echo "Threshold met (36%-60%): $THRESHOLD_MET"
147+
echo "Lower bound met (≥36%): $LOWER_BOUND_MET"
148+
echo "Upper bound met (≤60%): $UPPER_BOUND_MET"
149+
echo "Success rate: $SUCCESS_RATE"
150+
151+
# Fail the job when the success rate is outside the acceptable range (36%-60%, as computed by the run_test step); a non-zero pytest exit code alone only produces a warning
152+
if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then
153+
echo "❌ E2E smoke test FAILED"
154+
echo " - Test execution failed (exit code: $TEST_EXIT_CODE)"
155+
echo " - Success rate outside acceptable range (required: 36%-60%, actual: ${SUCCESS_RATE:-unknown})"
156+
exit 1
157+
elif [ "$TEST_EXIT_CODE" != "0" ]; then
158+
echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
159+
echo " - Test exit code: $TEST_EXIT_CODE"
160+
echo " - Thresholds met: $THRESHOLD_MET"
161+
# Don't exit with error if thresholds were actually met despite test issues
162+
if [ "$THRESHOLD_MET" = "1" ]; then
163+
echo "✅ Thresholds met despite execution issues - considering this a pass"
164+
else
165+
exit 1
166+
fi
167+
elif [ "$THRESHOLD_MET" != "1" ]; then
168+
# Determine which bound was violated
169+
if [ "$LOWER_BOUND_MET" != "1" ]; then
170+
echo "❌ E2E smoke test FAILED - success rate too low"
171+
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
172+
echo " - Required: ≥36%"
173+
elif [ "$UPPER_BOUND_MET" != "1" ]; then
174+
echo "❌ E2E smoke test FAILED - success rate suspiciously high"
175+
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
176+
echo " - Maximum expected: ≤60%"
177+
echo " - This may indicate test issues or unrealistic performance"
178+
else
179+
echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
180+
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
181+
echo " - Required range: 36%-60%"
182+
fi
183+
exit 1
184+
else
185+
echo "✅ E2E smoke test PASSED"
186+
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
187+
echo " - Within acceptable range: 36%-60%"
188+
fi

tests/pytest/test_tau_bench_airline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
6565
input_dataset=["tests/pytest/data/airline_dataset.jsonl"],
6666
dataset_adapter=tau_bench_airline_to_evaluation_row,
6767
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
68-
rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096, "reasoning_effort": "low"}],
68+
rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}],
6969
rollout_processor=default_mcp_gym_rollout_processor,
7070
passed_threshold={"success": 0.4, "standard_deviation": 0.1},
7171
num_runs=8,

0 commit comments

Comments
 (0)