Skip to content

Commit 6afbfeb

Browse files
vgodsoe and Iswarya Alex authored
Add monitoring for LIRA self-hosted CI runners (#10)
* Add monitoring for LIRA self-hosted CI runners
* Change from bash to pwsh so it'll work on Windows and Linux
* powershell
* Tested and ready to deploy
* Fixing the workflows for the runners and ensuring the Teams plugin works
* trigger workflow

---------

Co-authored-by: Iswarya Alex <iswarya.alex@amd.com>
1 parent c4a8afd commit 6afbfeb

3 files changed

Lines changed: 177 additions & 5 deletions

File tree

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
# Weekly watchdog for the self-hosted CI runners.
#
# For every runner name it can discover (or the hard-coded fallback list) this
# workflow looks for the newest `heartbeat-<runner name>` artifact in this
# repository. If a heartbeat is absent or too old the job fails, and the final
# step posts an Adaptive Card to Teams through a Power Automate webhook.
name: Monitor Self-Hosted Runners

on:
  schedule:
    - cron: "0 1 * * 1"  # every Monday at 1AM UTC
  workflow_dispatch:

permissions:
  actions: read  # Required to read artifacts

jobs:
  check:
    runs-on: ubuntu-latest

    steps:
      - name: Get list of runners
        id: runners
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          OWNER: ${{ github.repository_owner }}
        run: |
          # NOTE(review): the runner-list endpoints normally need admin-level
          # access; confirm GITHUB_TOKEN with `actions: read` may call them.
          # If it may not, the fallback list below is what is always used.

          # Try to get repo-level runners first
          curl -s \
            -H "Authorization: token ${GH_TOKEN}" \
            "https://api.github.com/repos/${REPO}/actions/runners" \
            > repo_runners.json

          # Also get org-level runners
          curl -s \
            -H "Authorization: token ${GH_TOKEN}" \
            "https://api.github.com/orgs/${OWNER}/actions/runners" \
            > org_runners.json

          # Extract runner names. An API error payload (no `.runners` key) or
          # an unparsable file yields an empty string instead of a failure.
          repo_names=$(jq -r '.runners[]?.name // empty' repo_runners.json 2>/dev/null || echo "")
          org_names=$(jq -r '.runners[]?.name // empty' org_runners.json 2>/dev/null || echo "")

          # Fallback to known runner list if API fails
          known_runners="xsjevo04"

          if [ -n "$repo_names" ] || [ -n "$org_names" ]; then
            # Merge both lists, drop blank entries (one list may be empty),
            # deduplicate, and join with commas.
            all_names=$(printf '%s\n%s\n' "$repo_names" "$org_names" | sed '/^$/d' | sort -u | paste -sd, -)
            echo "names=$all_names" >> "$GITHUB_OUTPUT"
            echo "Using API discovered runners: $all_names"
          else
            echo "names=$known_runners" >> "$GITHUB_OUTPUT"
            echo "Using fallback runner list: $known_runners"
          fi

      - name: Get artifacts
        id: artifacts
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
        run: |
          # Ask for the largest page (the API default is 30) so heartbeat
          # artifacts are not pushed off the first page by other workflows.
          # NOTE(review): only one page is read; if more than 100 artifacts can
          # accumulate between heartbeat and check, filter by `name` or paginate.
          curl -s \
            -H "Authorization: token ${GH_TOKEN}" \
            "https://api.github.com/repos/${REPO}/actions/artifacts?per_page=100" \
            > artifacts.json

      - name: Check each runner heartbeat
        id: check
        env:
          # Passed through the environment so runner names are never spliced
          # into the script text.
          RUNNER_NAMES: ${{ steps.runners.outputs.names }}
        run: |
          missing=""
          now=$(date -u +%s)

          # The heartbeat workflow runs Sunday 01:00 UTC and this check runs
          # Monday 01:00 UTC. A healthy heartbeat is therefore ~1 day old here
          # (at most ~7 days on a manual dispatch), while a once-missed
          # heartbeat is ~8 days old. 7.5 days separates the two cases without
          # depending on scheduling jitter.
          max_age=648000

          for r in $(echo "$RUNNER_NAMES" | tr ',' ' '); do
            [ -z "$r" ] && continue
            echo "Checking $r"

            # Newest updated_at among artifacts named heartbeat-<runner>.
            # `.artifacts[]?` tolerates an error payload without `.artifacts`.
            ts=$(jq -r --arg r "$r" '[.artifacts[]? | select(.name=="heartbeat-"+$r)] | sort_by(.updated_at) | last | .updated_at // empty' artifacts.json)

            if [ -z "$ts" ] || [ "$ts" = "null" ]; then
              echo "No heartbeat found for $r"
              missing="$missing $r"
              continue
            fi

            hb=$(date -u -d "$ts" +%s)
            diff=$((now - hb))

            if [ "$diff" -gt "$max_age" ]; then
              echo "Heartbeat stale for $r (last seen: $ts)"
              missing="$missing $r"
            fi
          done

          # Strip the leading separator so the output is "" or "a b c".
          echo "missing=${missing# }" >> "$GITHUB_OUTPUT"

      - name: Fail if any runner missing
        if: steps.check.outputs.missing != ''
        env:
          MISSING: ${{ steps.check.outputs.missing }}
        run: |
          echo "Missing or stale runners: $MISSING"
          exit 1

      - name: Send Teams alert via Power Automate
        if: failure()
        env:
          MISSING: ${{ steps.check.outputs.missing }}
          TEAMS_WEBHOOK_URL: ${{ secrets.TEAMS_WEBHOOK_URL }}
        run: |
          echo "Sending Teams alert via Power Automate..."

          # This step runs on any failure, so the check step may not have
          # produced a runner list; say so instead of sending an empty card.
          runners_text="${MISSING:-(runner check did not complete; see the workflow run logs)}"

          # Build the Adaptive Card with jq so the runner list is JSON-escaped.
          payload=$(jq -nc --arg missing "$runners_text" '{
            "type": "AdaptiveCard",
            "version": "1.3",
            "body": [
              {
                "type": "TextBlock",
                "text": "⚠️ Self-hosted Runner Alert",
                "weight": "bolder",
                "size": "medium",
                "color": "attention"
              },
              {
                "type": "TextBlock",
                "text": "The following runners are offline or missing heartbeats:",
                "wrap": true
              },
              {
                "type": "TextBlock",
                "text": $missing,
                "wrap": true,
                "fontType": "monospace"
              }
            ]
          }')

          # `|| true`: on a transport error curl still prints status 000, and
          # the status report below should run rather than the step aborting.
          response=$(curl -s -w "%{http_code}" -X POST \
            -H "Content-Type: application/json" \
            -d "$payload" \
            "$TEAMS_WEBHOOK_URL") || true

          # curl appended the 3-digit status code after the response body.
          http_code="${response: -3}"
          response_body="${response%???}"

          echo "HTTP Status: $http_code"
          echo "Response: $response_body"

          if [ "$http_code" != "200" ] && [ "$http_code" != "202" ]; then
            echo "Power Automate webhook failed with status $http_code"
          else
            echo "Teams alert sent successfully via Power Automate"
          fi
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
# Publishes a small `heartbeat-<runner name>` artifact from every self-hosted
# runner listed in the matrix. The artifact's presence and timestamp are what
# a monitoring job can use to decide whether the runner is still alive.
name: Runner Heartbeat

# Change to trigger push

on:
  schedule:
    - cron: "0 1 * * 0"  # every Sunday at 1AM UTC
  workflow_dispatch:

permissions:
  actions: write  # Required to upload artifacts

jobs:
  heartbeat:
    strategy:
      matrix:
        runner:
          - xsjevo04
      fail-fast: false  # Continue even if one runner fails
    # Target one specific self-hosted runner per matrix entry via its label.
    runs-on: [self-hosted, "${{ matrix.runner }}"]
    steps:
      - name: Create heartbeat file (Windows)
        if: runner.os == 'Windows'
        shell: powershell
        env:
          # Passed through the environment instead of being spliced into the
          # script text.
          HEARTBEAT_RUNNER: ${{ runner.name }}
        run: |
          $timestamp = Get-Date -Format "yyyy-MM-ddTHH:mm:ssK"
          $runner = $env:HEARTBEAT_RUNNER
          "$timestamp" | Out-File -FilePath "heartbeat-$runner.txt" -Encoding utf8

      - name: Create heartbeat file (Linux)
        if: runner.os == 'Linux'
        shell: bash
        env:
          HEARTBEAT_RUNNER: ${{ runner.name }}
        run: |
          timestamp=$(date -Iseconds)
          echo "$timestamp" > "heartbeat-${HEARTBEAT_RUNNER}.txt"

      - name: Upload heartbeat artifact
        uses: actions/upload-artifact@v4
        with:
          name: heartbeat-${{ runner.name }}
          path: heartbeat-${{ runner.name }}.txt
          # Fail loudly if neither OS-specific step produced the file (e.g. a
          # runner that is not Windows or Linux) instead of finishing green
          # with no heartbeat uploaded.
          if-no-files-found: error
          retention-days: 14

.github/workflows/test_cpu.yml

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,7 @@ jobs:
4747
pip install onnxruntime
4848
python -m pip check
4949
pip install -e .
50-
- name: Lint with Black
51-
uses: psf/black@stable
52-
with:
53-
options: "--check --verbose"
54-
src: "lira/models"
50+
5551
- name: Run LIRA tests
5652
shell: bash -el {0}
5753
run: |

0 commit comments

Comments
 (0)