@@ -71,17 +71,31 @@ jobs:
7171
7272 echo "test_exit_code=$TEST_EXIT_CODE" >> $GITHUB_OUTPUT
7373
74+ # List generated files for debugging
75+ echo "📁 Generated files:"
76+ ls -la *.json 2>/dev/null || echo "No JSON files found"
77+ ls -la ep_summary* 2>/dev/null || echo "No ep_summary files found"
78+
7479 # Parse evaluation protocol summary if it exists
75- if [ -f ep_summary.json ]; then
76- echo "EP Summary found, parsing..."
80+ # EP might generate files with different names, check for common patterns
81+ EP_SUMMARY_FILE=""
82+ for file in ep_summary*.json; do
83+ if [ -f "$file" ]; then
84+ EP_SUMMARY_FILE="$file"
85+ break
86+ fi
87+ done
88+
89+ if [ -n "$EP_SUMMARY_FILE" ] && [ -f "$EP_SUMMARY_FILE" ]; then
90+ echo "EP Summary found: $EP_SUMMARY_FILE, parsing..."
7791
7892 # Log the full summary for debugging
7993 echo "EP Summary contents:"
80- cat ep_summary.json | jq . 2>/dev/null || cat ep_summary.json
94+ cat "$EP_SUMMARY_FILE" | jq . 2>/dev/null || cat "$EP_SUMMARY_FILE"
8195
8296 # Extract success rate from EP summary (this contains the actual accuracy/success rate)
8397 # The EP summary uses 'agg_score' for the aggregated success rate
84- SUCCESS_RATE=$(jq -r '.agg_score // 0' ep_summary.json 2>/dev/null || echo "0")
98+ SUCCESS_RATE=$(jq -r '.agg_score // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
8599
86100 echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT
87101
97111 echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT
98112
99113 # Extract additional info for display
100- NUM_ROWS=$(jq -r '.rows // 0' ep_summary.json 2>/dev/null || echo "0")
101- NUM_RUNS=$(jq -r '.num_runs // 0' ep_summary.json 2>/dev/null || echo "0")
114+ NUM_ROWS=$(jq -r '.rows // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
115+ NUM_RUNS=$(jq -r '.num_runs // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
102116
103117 echo "📊 Evaluation Summary:"
104118 echo " - Success rate (agg_score): $(echo "$SUCCESS_RATE * 100" | bc -l)%"
@@ -119,7 +133,7 @@ jobs:
119133 with:
120134 name: e2e-smoke-test-results-${{ github.run_number }}
121135 path: |
122- ep_summary.json
136+ ep_summary*.json
123137 *.log
124138 retention-days: 7
125139
@@ -178,65 +192,3 @@ jobs:
178192 echo " - Success rate: ${SUCCESS_RATE:-unknown}"
179193 echo " - Within acceptable range: 40%-90%"
180194 fi
181-
182- - name: Create GitHub issue on failure
183- if: failure()
184- uses: actions/github-script@v7
185- with:
186- script: |
187- const testResults = {
188- exitCode: '${{ steps.run_test.outputs.test_exit_code }}',
189- successRate: '${{ steps.run_test.outputs.success_rate }}',
190- thresholdMet: '${{ steps.run_test.outputs.threshold_met }}',
191- lowerBoundMet: '${{ steps.run_test.outputs.lower_bound_met }}',
192- upperBoundMet: '${{ steps.run_test.outputs.upper_bound_met }}'
193- };
194-
195- const title = `🚨 E2E Smoke Test Failed (${new Date().toISOString().split('T')[0]})`;
196-
197- const body = `
198- ## E2E Smoke Test Failure Report
199-
200- **Test:** E2E Smoke Test
201- **Date:** ${new Date().toISOString()}
202- **Workflow Run:** [#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
203-
204- ### Test Results
205- - **Success Rate:** ${testResults.successRate ? (parseFloat(testResults.successRate) * 100).toFixed(1) + '%' : 'Unknown'}
206- - **Lower Bound Met (≥40%):** ${testResults.lowerBoundMet === '1' ? '✅ Yes' : '❌ No'}
207- - **Upper Bound Met (≤90%):** ${testResults.upperBoundMet === '1' ? '✅ Yes' : '❌ No'}
208- - **Within Range (40%-90%):** ${testResults.thresholdMet === '1' ? '✅ Yes' : '❌ No'}
209- - **Test Exit Code:** ${testResults.exitCode || 'Unknown'}
210-
211- ### Required Actions
212-
213- ${ testResults.thresholdMet !== '1' ?
214- (testResults.lowerBoundMet !== '1' ?
215- '🔍 **Performance Issue:** The success rate is below the required 40% minimum threshold. This indicates potential issues with model performance or test environment.' :
216- testResults.upperBoundMet !== '1' ?
217- '⚠️ **Suspiciously High Performance:** The success rate exceeds 90%, which may indicate test issues, data leakage, or unrealistic performance.' :
218- '🔍 **Performance Issue:** The success rate is outside the acceptable 40%-90% range.'
219- ) :
220- '🔧 **Infrastructure Issue:** Tests failed to execute properly despite potentially meeting performance thresholds.'
221- }
222-
223- ### Next Steps
224- 1. Review the [workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for detailed error information
225- 2. Check if this is a temporary issue by re-running the workflow manually
226- 3. If persistent, investigate potential causes:
227- - Model performance degradation
228- - Test environment configuration
229- - API key or service availability issues
230-
231- ### Auto-generated
232- This issue was automatically created by the E2E smoke test workflow.
233- `;
234-
235- // Create the issue
236- await github.rest.issues.create({
237- owner: context.repo.owner,
238- repo: context.repo.repo,
239- title: title,
240- body: body,
241- labels: ['bug', 'e2e-test', 'automated', 'smoke-test']
242- });
0 commit comments