6565 uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \
6666 -v --tb=short --durations=10 \
6767 --ep-print-summary \
68- --ep-summary-json=ep_summary.json
68+ --ep-summary-json=ep_summary.json 2>&1 | tee test_output.log
6969
7070 TEST_EXIT_CODE=$?
7171
@@ -76,53 +76,56 @@ jobs:
7676 ls -la *.json 2>/dev/null || echo "No JSON files found"
7777 ls -la ep_summary* 2>/dev/null || echo "No ep_summary files found"
7878
79- # Parse evaluation protocol summary if it exists
80- # EP might generate files with different names, check for common patterns
81- EP_SUMMARY_FILE=""
82- for file in ep_summary*.json; do
83- if [ -f "$file" ]; then
84- EP_SUMMARY_FILE="$file"
85- break
79+ # Parse EP summary from terminal output (more reliable than JSON files)
80+ if [ -f test_output.log ]; then
81+ echo "📋 Parsing EP summary from terminal output..."
82+
83+ # Show the terminal output for debugging
84+ echo "Terminal output:"
85+ cat test_output.log
86+ echo ""
87+
88+ # Extract the EP Summary line from the terminal output
89+ EP_SUMMARY_LINE=$(grep "EP Summary |" test_output.log 2>/dev/null || echo "")
90+
91+ if [ -n "$EP_SUMMARY_LINE" ]; then
92+ echo "Found EP Summary line:"
93+ echo "$EP_SUMMARY_LINE"
94+
95+ # Parse the agg score from the line: "EP Summary | ... agg=0.420 ..."
96+ SUCCESS_RATE=$(echo "$EP_SUMMARY_LINE" | grep -o "agg=[0-9.]*" | cut -d= -f2 2>/dev/null || echo "0")
97+
98+ # Extract other info
99+ NUM_RUNS=$(echo "$EP_SUMMARY_LINE" | grep -o "runs=[0-9]*" | cut -d= -f2 2>/dev/null || echo "0")
100+ NUM_ROWS=$(echo "$EP_SUMMARY_LINE" | grep -o "rows=[0-9]*" | cut -d= -f2 2>/dev/null || echo "0")
101+
102+ echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT
103+
104+ # Check if success rate meets thresholds (40% - 90% acceptable range)
105+ LOWER_BOUND=0.4 # 40%
106+ UPPER_BOUND=0.9 # 90%
107+ LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l 2>/dev/null || echo "0")
108+ UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l 2>/dev/null || echo "0")
109+ THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l 2>/dev/null || echo "0")
110+
111+ echo "lower_bound_met=$LOWER_BOUND_MET" >> $GITHUB_OUTPUT
112+ echo "upper_bound_met=$UPPER_BOUND_MET" >> $GITHUB_OUTPUT
113+ echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT
114+
115+ echo "📊 Evaluation Summary (from terminal output):"
116+ echo " - Success rate: $(echo "$SUCCESS_RATE * 100" | bc -l 2>/dev/null || echo "unknown")%"
117+ echo " - Dataset rows evaluated: $NUM_ROWS"
118+ echo " - Number of runs: $NUM_RUNS"
119+ echo " - Lower bound (≥40%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
120+ echo " - Upper bound (≤90%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
121+ echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
122+ else
123+ echo "❌ No EP Summary line found in terminal output"
124+ echo "threshold_met=0" >> $GITHUB_OUTPUT
125+ echo "success_rate=0" >> $GITHUB_OUTPUT
86126 fi
87- done
88-
89- if [ -n "$EP_SUMMARY_FILE" ] && [ -f "$EP_SUMMARY_FILE" ]; then
90- echo "EP Summary found: $EP_SUMMARY_FILE, parsing..."
91-
92- # Log the full summary for debugging
93- echo "EP Summary contents:"
94- cat "$EP_SUMMARY_FILE" | jq . 2>/dev/null || cat "$EP_SUMMARY_FILE"
95-
96- # Extract success rate from EP summary (this contains the actual accuracy/success rate)
97- # The EP summary uses 'agg_score' for the aggregated success rate
98- SUCCESS_RATE=$(jq -r '.agg_score // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
99-
100- echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT
101-
102- # Check if success rate meets thresholds (40% - 90% acceptable range)
103- LOWER_BOUND=0.4 # 40%
104- UPPER_BOUND=0.9 # 90%
105- LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l)
106- UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l)
107- THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l)
108-
109- echo "lower_bound_met=$LOWER_BOUND_MET" >> $GITHUB_OUTPUT
110- echo "upper_bound_met=$UPPER_BOUND_MET" >> $GITHUB_OUTPUT
111- echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT
112-
113- # Extract additional info for display
114- NUM_ROWS=$(jq -r '.rows // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
115- NUM_RUNS=$(jq -r '.num_runs // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
116-
117- echo "📊 Evaluation Summary:"
118- echo " - Success rate (agg_score): $(echo "$SUCCESS_RATE * 100" | bc -l)%"
119- echo " - Dataset rows evaluated: $NUM_ROWS"
120- echo " - Number of runs: $NUM_RUNS"
121- echo " - Lower bound (≥40%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
122- echo " - Upper bound (≤90%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
123- echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
124127 else
125- echo "❌ No EP summary file found"
128+ echo "❌ No terminal output file found"
126129 echo "threshold_met=0" >> $GITHUB_OUTPUT
127130 echo "success_rate=0" >> $GITHUB_OUTPUT
128131 fi
@@ -133,6 +136,7 @@ jobs:
133136 with:
134137 name: e2e-smoke-test-results-${{ github.run_number }}
135138 path: |
139+ test_output.log
136140 ep_summary*.json
137141 *.log
138142 retention-days: 7
@@ -149,16 +153,16 @@ jobs:
149153 SUCCESS_RATE="${{ steps.run_test.outputs.success_rate }}"
150154
151155 echo "Test exit code: $TEST_EXIT_CODE"
152- echo "Threshold met (40%-90%): $THRESHOLD_MET"
156+ echo "Threshold met (40%-60%): $THRESHOLD_MET"
153157 echo "Lower bound met (≥40%): $LOWER_BOUND_MET"
154- echo "Upper bound met (≤90%): $UPPER_BOUND_MET"
158+ echo "Upper bound met (≤60%): $UPPER_BOUND_MET"
155159 echo "Success rate: $SUCCESS_RATE"
156160
157161 # Fail the job if tests didn't run successfully or thresholds weren't met
158162 if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then
159163 echo "❌ E2E smoke test FAILED"
160164 echo " - Test execution failed (exit code: $TEST_EXIT_CODE)"
161- echo " - Success rate outside acceptable range (required: 40%-90%, actual: ${SUCCESS_RATE:-unknown})"
165+ echo " - Success rate outside acceptable range (required: 40%-60%, actual: ${SUCCESS_RATE:-unknown})"
162166 exit 1
163167 elif [ "$TEST_EXIT_CODE" != "0" ]; then
164168 echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
@@ -179,16 +183,16 @@ jobs:
179183 elif [ "$UPPER_BOUND_MET" != "1" ]; then
180184 echo "❌ E2E smoke test FAILED - success rate suspiciously high"
181185 echo " - Success rate: ${SUCCESS_RATE:-unknown}"
182- echo " - Maximum expected: ≤90%"
186+ echo " - Maximum expected: ≤60%"
183187 echo " - This may indicate test issues or unrealistic performance"
184188 else
185189 echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
186190 echo " - Success rate: ${SUCCESS_RATE:-unknown}"
187- echo " - Required range: 40%-90%"
191+ echo " - Required range: 40%-60%"
188192 fi
189193 exit 1
190194 else
191195 echo "✅ E2E smoke test PASSED"
192196 echo " - Success rate: ${SUCCESS_RATE:-unknown}"
193- echo " - Within acceptable range: 40%-90%"
197+ echo " - Within acceptable range: 40%-60%"
194198 fi
0 commit comments