Skip to content

Commit 02e9b93

Browse files
committed
test
1 parent b39cc40 commit 02e9b93

File tree

1 file changed

+56
-52
lines changed

1 file changed

+56
-52
lines changed

.github/workflows/e2e-smoke-test.yml

Lines changed: 56 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ jobs:
6565
uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \
6666
-v --tb=short --durations=10 \
6767
--ep-print-summary \
68-
--ep-summary-json=ep_summary.json
68+
--ep-summary-json=ep_summary.json 2>&1 | tee test_output.log
6969
7070
TEST_EXIT_CODE=$?
7171
@@ -76,53 +76,56 @@ jobs:
7676
ls -la *.json 2>/dev/null || echo "No JSON files found"
7777
ls -la ep_summary* 2>/dev/null || echo "No ep_summary files found"
7878
79-
# Parse evaluation protocol summary if it exists
80-
# EP might generate files with different names, check for common patterns
81-
EP_SUMMARY_FILE=""
82-
for file in ep_summary*.json; do
83-
if [ -f "$file" ]; then
84-
EP_SUMMARY_FILE="$file"
85-
break
79+
# Parse EP summary from terminal output (more reliable than JSON files)
80+
if [ -f test_output.log ]; then
81+
echo "📋 Parsing EP summary from terminal output..."
82+
83+
# Show the terminal output for debugging
84+
echo "Terminal output:"
85+
cat test_output.log
86+
echo ""
87+
88+
# Extract the EP Summary line from the terminal output
89+
EP_SUMMARY_LINE=$(grep "EP Summary |" test_output.log 2>/dev/null || echo "")
90+
91+
if [ -n "$EP_SUMMARY_LINE" ]; then
92+
echo "Found EP Summary line:"
93+
echo "$EP_SUMMARY_LINE"
94+
95+
# Parse the agg score from the line: "EP Summary | ... agg=0.420 ..."
96+
# Pull the numeric value of the `agg=` field out of the summary line.
# A pipeline's status is that of its last stage (cut) unless `pipefail` is set,
# so a trailing `|| echo "0"` cannot be relied on when grep finds nothing;
# `|| true` keeps the assignment from aborting under `-e -o pipefail`, and the
# explicit default below guarantees a numeric value for bc and $GITHUB_OUTPUT.
SUCCESS_RATE=$(echo "$EP_SUMMARY_LINE" | grep -o "agg=[0-9.]*" | cut -d= -f2 || true)
SUCCESS_RATE=${SUCCESS_RATE:-0}
97+
98+
# Extract other info
99+
# Extract the `runs=` count from the summary line; default to 0 when the field
# is absent (the pipeline's status is cut's, so an `|| echo` fallback would not
# fire without `pipefail`).
NUM_RUNS=$(echo "$EP_SUMMARY_LINE" | grep -o "runs=[0-9]*" | cut -d= -f2 || true)
NUM_RUNS=${NUM_RUNS:-0}
100+
# Extract the `rows=` count from the summary line; default to 0 when the field
# is absent (the pipeline's status is cut's, so an `|| echo` fallback would not
# fire without `pipefail`).
NUM_ROWS=$(echo "$EP_SUMMARY_LINE" | grep -o "rows=[0-9]*" | cut -d= -f2 || true)
NUM_ROWS=${NUM_ROWS:-0}
101+
102+
echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT
103+
104+
# Check if success rate meets thresholds (40% - 90% acceptable range)
105+
LOWER_BOUND=0.4 # 40%
106+
UPPER_BOUND=0.9 # 90% — NOTE(review): the later result-check step's messages report a 40%-60% range (≤60% upper bound); confirm which upper bound is intended and align the two
107+
LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l 2>/dev/null || echo "0")
108+
UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l 2>/dev/null || echo "0")
109+
THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l 2>/dev/null || echo "0")
110+
111+
echo "lower_bound_met=$LOWER_BOUND_MET" >> $GITHUB_OUTPUT
112+
echo "upper_bound_met=$UPPER_BOUND_MET" >> $GITHUB_OUTPUT
113+
echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT
114+
115+
echo "📊 Evaluation Summary (from terminal output):"
116+
echo " - Success rate: $(echo "$SUCCESS_RATE * 100" | bc -l 2>/dev/null || echo "unknown")%"
117+
echo " - Dataset rows evaluated: $NUM_ROWS"
118+
echo " - Number of runs: $NUM_RUNS"
119+
echo " - Lower bound (≥40%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
120+
echo " - Upper bound (≤90%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
121+
echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
122+
else
123+
echo "❌ No EP Summary line found in terminal output"
124+
echo "threshold_met=0" >> $GITHUB_OUTPUT
125+
echo "success_rate=0" >> $GITHUB_OUTPUT
86126
fi
87-
done
88-
89-
if [ -n "$EP_SUMMARY_FILE" ] && [ -f "$EP_SUMMARY_FILE" ]; then
90-
echo "EP Summary found: $EP_SUMMARY_FILE, parsing..."
91-
92-
# Log the full summary for debugging
93-
echo "EP Summary contents:"
94-
cat "$EP_SUMMARY_FILE" | jq . 2>/dev/null || cat "$EP_SUMMARY_FILE"
95-
96-
# Extract success rate from EP summary (this contains the actual accuracy/success rate)
97-
# The EP summary uses 'agg_score' for the aggregated success rate
98-
SUCCESS_RATE=$(jq -r '.agg_score // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
99-
100-
echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT
101-
102-
# Check if success rate meets thresholds (40% - 90% acceptable range)
103-
LOWER_BOUND=0.4 # 40%
104-
UPPER_BOUND=0.9 # 90%
105-
LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l)
106-
UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l)
107-
THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l)
108-
109-
echo "lower_bound_met=$LOWER_BOUND_MET" >> $GITHUB_OUTPUT
110-
echo "upper_bound_met=$UPPER_BOUND_MET" >> $GITHUB_OUTPUT
111-
echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT
112-
113-
# Extract additional info for display
114-
NUM_ROWS=$(jq -r '.rows // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
115-
NUM_RUNS=$(jq -r '.num_runs // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
116-
117-
echo "📊 Evaluation Summary:"
118-
echo " - Success rate (agg_score): $(echo "$SUCCESS_RATE * 100" | bc -l)%"
119-
echo " - Dataset rows evaluated: $NUM_ROWS"
120-
echo " - Number of runs: $NUM_RUNS"
121-
echo " - Lower bound (≥40%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
122-
echo " - Upper bound (≤90%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
123-
echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
124127
else
125-
echo "❌ No EP summary file found"
128+
echo "❌ No terminal output file found"
126129
echo "threshold_met=0" >> $GITHUB_OUTPUT
127130
echo "success_rate=0" >> $GITHUB_OUTPUT
128131
fi
@@ -133,6 +136,7 @@ jobs:
133136
with:
134137
name: e2e-smoke-test-results-${{ github.run_number }}
135138
path: |
139+
test_output.log
136140
ep_summary*.json
137141
*.log
138142
retention-days: 7
@@ -149,16 +153,16 @@ jobs:
149153
SUCCESS_RATE="${{ steps.run_test.outputs.success_rate }}"
150154
151155
echo "Test exit code: $TEST_EXIT_CODE"
152-
echo "Threshold met (40%-90%): $THRESHOLD_MET"
156+
echo "Threshold met (40%-60%): $THRESHOLD_MET"
153157
echo "Lower bound met (≥40%): $LOWER_BOUND_MET"
154-
echo "Upper bound met (≤90%): $UPPER_BOUND_MET"
158+
echo "Upper bound met (≤60%): $UPPER_BOUND_MET"
155159
echo "Success rate: $SUCCESS_RATE"
156160
157161
# Fail the job if tests didn't run successfully or thresholds weren't met
158162
if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then
159163
echo "❌ E2E smoke test FAILED"
160164
echo " - Test execution failed (exit code: $TEST_EXIT_CODE)"
161-
echo " - Success rate outside acceptable range (required: 40%-90%, actual: ${SUCCESS_RATE:-unknown})"
165+
echo " - Success rate outside acceptable range (required: 40%-60%, actual: ${SUCCESS_RATE:-unknown})"
162166
exit 1
163167
elif [ "$TEST_EXIT_CODE" != "0" ]; then
164168
echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
@@ -179,16 +183,16 @@ jobs:
179183
elif [ "$UPPER_BOUND_MET" != "1" ]; then
180184
echo "❌ E2E smoke test FAILED - success rate suspiciously high"
181185
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
182-
echo " - Maximum expected: ≤90%"
186+
echo " - Maximum expected: ≤60%"
183187
echo " - This may indicate test issues or unrealistic performance"
184188
else
185189
echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
186190
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
187-
echo " - Required range: 40%-90%"
191+
echo " - Required range: 40%-60%"
188192
fi
189193
exit 1
190194
else
191195
echo "✅ E2E smoke test PASSED"
192196
echo " - Success rate: ${SUCCESS_RATE:-unknown}"
193-
echo " - Within acceptable range: 40%-90%"
197+
echo " - Within acceptable range: 40%-60%"
194198
fi

0 commit comments

Comments
 (0)