From 1f791994937407fd82deb6c20e5e33ff5a1745f7 Mon Sep 17 00:00:00 2001 From: GrammaTonic Date: Mon, 2 Mar 2026 02:30:59 +0100 Subject: [PATCH] feat(monitoring): split mega-dashboard into 4 standalone Grafana dashboards Replace the combined github-runner.json (24 panels, 4 rows) with 4 focused, standalone dashboards per the Phase 4 spec (Issue #1062): - runner-overview.json: Runner status, health, uptime, Quick Links - dora-metrics.json: Unchanged - DF, Lead Time, CFR, MTTR, trends - performance-trends.json: NEW - Cache, CPU/Memory, build percentiles - job-analysis.json: Unchanged - Summary, histogram, runner comparison Also adds Grafana dashboard provisioning config, inter-dashboard navigation links, and updated plan tracking and docs. Resolves #1062 --- docs/features/GRAFANA_DASHBOARD_METRICS.md | 22 +- .../grafana/dashboards/github-runner.json | 945 ------------------ .../dashboards/performance-trends.json | 681 +++++++++++++ .../grafana/dashboards/runner-overview.json | 555 ++++++++++ .../provisioning/dashboards/dashboards.yml | 20 + plan/feature-prometheus-monitoring-1.md | 16 +- 6 files changed, 1280 insertions(+), 959 deletions(-) delete mode 100644 monitoring/grafana/dashboards/github-runner.json create mode 100644 monitoring/grafana/dashboards/performance-trends.json create mode 100644 monitoring/grafana/dashboards/runner-overview.json create mode 100644 monitoring/grafana/provisioning/dashboards/dashboards.yml diff --git a/docs/features/GRAFANA_DASHBOARD_METRICS.md b/docs/features/GRAFANA_DASHBOARD_METRICS.md index b3fbc841..785ffec6 100644 --- a/docs/features/GRAFANA_DASHBOARD_METRICS.md +++ b/docs/features/GRAFANA_DASHBOARD_METRICS.md @@ -99,12 +99,22 @@ Implement a lightweight custom metrics endpoint on each GitHub Actions runner (p - **Location**: Bash scripts started by `entrypoint.sh` and `entrypoint-chrome.sh` - **Metrics**: Runner status, job counts, uptime, cache hit rates, job duration -#### 2. 
Grafana Dashboard JSON - **We Provide** - -- **File**: `monitoring/grafana/dashboards/github-runner-dashboard.json` -- **Panels**: 12 panels covering all key metrics -- **Variables**: Filter by runner_name, runner_type -- **Import**: Users import JSON into their Grafana instance +#### 2. Grafana Dashboard JSON Files - **We Provide** + +**4 standalone dashboards** in `monitoring/grafana/dashboards/`: + +| Dashboard | File | Panels | Focus | +|---|---|---|---| +| Runner Overview | `runner-overview.json` | 12 | Runner status, health, uptime, queue time, navigation | +| DORA Metrics | `dora-metrics.json` | 12 | Deployment Frequency, Lead Time, CFR, MTTR, trends, classification | +| Performance Trends | `performance-trends.json` | 11 (+4 rows) | Cache hit rates, CPU/memory, build duration percentiles, queue times | +| Job Analysis | `job-analysis.json` | 16 | Job summary, duration histograms, status breakdown, runner comparison | + +- **Variables**: All dashboards filter by `runner_name` and `runner_type` (multi-select) +- **Inter-dashboard links**: Navigation links and Quick Links panel for cross-dashboard navigation +- **Import**: Users import JSON into their Grafana instance, or use provisioning config +- **Provisioning**: `monitoring/grafana/provisioning/dashboards/dashboards.yml` for auto-loading +- **Datasource**: All dashboards use `${DS_PROMETHEUS}` input variable for portability #### 3. 
Example Prometheus Config - **We Provide Documentation** diff --git a/monitoring/grafana/dashboards/github-runner.json b/monitoring/grafana/dashboards/github-runner.json deleted file mode 100644 index 139bda01..00000000 --- a/monitoring/grafana/dashboards/github-runner.json +++ /dev/null @@ -1,945 +0,0 @@ -{ - "dashboard": { - "id": null, - "uid": "github-runner-overview", - "title": "GitHub Actions Runners - Overview & DORA", - "description": "Comprehensive overview of GitHub Actions self-hosted runners with DORA metrics, job tracking, and performance insights", - "tags": [ - "github-actions", - "runners", - "ci-cd", - "dora", - "monitoring" - ], - "timezone": "browser", - "schemaVersion": 39, - "version": 2, - "refresh": "15s", - "time": { - "from": "now-24h", - "to": "now" - }, - "templating": { - "list": [ - { - "name": "runner_name", - "type": "query", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "query": "label_values(github_runner_info, runner_name)", - "multi": true, - "includeAll": true, - "current": { - "text": "All", - "value": "$__all" - }, - "refresh": 2 - }, - { - "name": "runner_type", - "type": "query", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "query": "label_values(github_runner_info, runner_type)", - "multi": true, - "includeAll": true, - "current": { - "text": "All", - "value": "$__all" - }, - "refresh": 2 - } - ] - }, - "panels": [ - { - "id": 1, - "title": "Runner Overview", - "type": "row", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "collapsed": false - }, - { - "id": 2, - "title": "Runners Online", - "type": "stat", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "sum(github_runner_status{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"})", - "legendFormat": "Online" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - 
"color": "red", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "green", - "value": 2 - } - ] - }, - "unit": "none" - } - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 0, - "y": 1 - } - }, - { - "id": 3, - "title": "Total Jobs", - "type": "stat", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "sum(github_runner_jobs_total{status=\"total\",runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"})", - "legendFormat": "Total" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "blue", - "value": null - } - ] - }, - "unit": "none" - } - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 1 - } - }, - { - "id": 4, - "title": "Success Rate", - "type": "gauge", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "sum(github_runner_jobs_total{status=\"success\",runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}) / clamp_min(sum(github_runner_jobs_total{status=\"total\",runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}), 1) * 100", - "legendFormat": "Success %" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "orange", - "value": 50 - }, - { - "color": "yellow", - "value": 80 - }, - { - "color": "green", - "value": 95 - } - ] - }, - "min": 0, - "max": 100, - "unit": "percent" - } - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 8, - "y": 1 - } - }, - { - "id": 5, - "title": "Runner Uptime", - "type": "stat", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "max(github_runner_uptime_seconds{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"})", - "legendFormat": "Max Uptime" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": 
"thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "yellow", - "value": null - }, - { - "color": "green", - "value": 3600 - } - ] - }, - "unit": "s" - } - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 12, - "y": 1 - } - }, - { - "id": 6, - "title": "Avg Queue Time", - "type": "stat", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "avg(github_runner_queue_time_seconds{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"})", - "legendFormat": "Queue Time" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 30 - }, - { - "color": "orange", - "value": 120 - }, - { - "color": "red", - "value": 300 - } - ] - }, - "unit": "s" - } - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 16, - "y": 1 - } - }, - { - "id": 7, - "title": "Runner Info", - "type": "table", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "github_runner_info{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}", - "legendFormat": "{{ runner_name }}", - "format": "table", - "instant": true - } - ], - "fieldConfig": { - "defaults": {}, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Time" - }, - "properties": [ - { - "id": "custom.hidden", - "value": true - } - ] - } - ] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 20, - "y": 1 - } - }, - { - "id": 10, - "title": "DORA Metrics", - "type": "row", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 5 - }, - "collapsed": false - }, - { - "id": 11, - "title": "Deployment Frequency (24h)", - "description": "Number of successful deployments in the last 24 hours. 
Elite performers deploy multiple times per day.", - "type": "stat", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "sum(increase(github_runner_jobs_total{status=\"success\",runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}[24h]))", - "legendFormat": "Deployments/day" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "orange", - "value": 1 - }, - { - "color": "yellow", - "value": 5 - }, - { - "color": "green", - "value": 10 - } - ] - }, - "unit": "none" - } - }, - "gridPos": { - "h": 5, - "w": 6, - "x": 0, - "y": 6 - } - }, - { - "id": 12, - "title": "Lead Time (Avg Duration)", - "description": "Average job duration approximating lead time for changes. Elite performers have LTFC < 1 hour.", - "type": "stat", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "sum(github_runner_job_duration_seconds_sum{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}) / clamp_min(sum(github_runner_job_duration_seconds_count{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}), 1)", - "legendFormat": "Avg Duration" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 600 - }, - { - "color": "orange", - "value": 1800 - }, - { - "color": "red", - "value": 3600 - } - ] - }, - "unit": "s" - } - }, - "gridPos": { - "h": 5, - "w": 6, - "x": 6, - "y": 6 - } - }, - { - "id": 13, - "title": "Change Failure Rate", - "description": "Percentage of failed deployments. 
Elite performers have CFR of 0-15%.", - "type": "gauge", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "sum(github_runner_jobs_total{status=\"failed\",runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}) / clamp_min(sum(github_runner_jobs_total{status=\"total\",runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}), 1) * 100", - "legendFormat": "CFR %" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 15 - }, - { - "color": "orange", - "value": 30 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "min": 0, - "max": 100, - "unit": "percent" - } - }, - "gridPos": { - "h": 5, - "w": 6, - "x": 12, - "y": 6 - } - }, - { - "id": 14, - "title": "Mean Time to Recovery", - "description": "Average queue time as MTTR proxy. Elite performers have MTTR < 1 hour.", - "type": "stat", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "avg(github_runner_queue_time_seconds{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"})", - "legendFormat": "MTTR Proxy" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 60 - }, - { - "color": "orange", - "value": 300 - }, - { - "color": "red", - "value": 3600 - } - ] - }, - "unit": "s" - } - }, - "gridPos": { - "h": 5, - "w": 6, - "x": 18, - "y": 6 - } - }, - { - "id": 15, - "title": "Deployment Frequency Trend", - "type": "timeseries", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "sum(increase(github_runner_jobs_total{status=\"success\",runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}[1h]))", - "legendFormat": "Deployments/hour" - } - ], - 
"fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "bars", - "fillOpacity": 30, - "pointSize": 5 - }, - "unit": "none" - } - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 11 - } - }, - { - "id": 16, - "title": "Job Duration Trend (p50/p95/p99)", - "type": "timeseries", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "histogram_quantile(0.50, sum(rate(github_runner_job_duration_seconds_bucket{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}[5m])) by (le))", - "legendFormat": "p50" - }, - { - "expr": "histogram_quantile(0.95, sum(rate(github_runner_job_duration_seconds_bucket{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}[5m])) by (le))", - "legendFormat": "p95" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(github_runner_job_duration_seconds_bucket{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}[5m])) by (le))", - "legendFormat": "p99" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10 - }, - "unit": "s" - } - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 8, - "y": 11 - } - }, - { - "id": 17, - "title": "Failure Rate Trend", - "type": "timeseries", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "sum(increase(github_runner_jobs_total{status=\"failed\",runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}[1h])) / clamp_min(sum(increase(github_runner_jobs_total{status=\"total\",runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}[1h])), 1) * 100", - "legendFormat": "Failure Rate %" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "red", - "mode": "fixed" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 20 - }, - "unit": "percent" - } - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 16, - "y": 11 - } - }, - { 
- "id": 20, - "title": "Job Analysis", - "type": "row", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 18 - }, - "collapsed": false - }, - { - "id": 21, - "title": "Job Duration Distribution", - "type": "barchart", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "github_runner_job_duration_seconds_bucket{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}", - "legendFormat": "{{ le }}s", - "format": "table", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "unit": "none" - } - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 19 - } - }, - { - "id": 22, - "title": "Jobs by Status", - "type": "piechart", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "github_runner_jobs_total{status!=\"total\",runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}", - "legendFormat": "{{ status }}" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - } - } - }, - "gridPos": { - "h": 7, - "w": 6, - "x": 12, - "y": 19 - } - }, - { - "id": 23, - "title": "Queue Time Trend", - "type": "timeseries", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "github_runner_queue_time_seconds{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}", - "legendFormat": "{{ runner_name }}" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 15 - }, - "unit": "s" - } - }, - "gridPos": { - "h": 7, - "w": 6, - "x": 18, - "y": 19 - } - }, - { - "id": 30, - "title": "Performance", - "type": "row", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 26 - }, - "collapsed": false - }, - { - "id": 31, - "title": "Cache Hit Rate", - "description": "Cache hit rates by type (BuildKit, APT, npm). 
Currently stubbed \u2014 data source integration pending.", - "type": "timeseries", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "github_runner_cache_hit_rate{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}", - "legendFormat": "{{ cache_type }}" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10 - }, - "min": 0, - "max": 1, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 27 - } - }, - { - "id": 32, - "title": "CPU Usage (cAdvisor)", - "type": "timeseries", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "rate(container_cpu_usage_seconds_total{name=~\"github-runner.*\"}[5m]) * 100", - "legendFormat": "{{ name }}" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10 - }, - "unit": "percent" - } - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 8, - "y": 27 - } - }, - { - "id": 33, - "title": "Memory Usage (cAdvisor)", - "type": "timeseries", - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "targets": [ - { - "expr": "container_memory_usage_bytes{name=~\"github-runner.*\"}", - "legendFormat": "{{ name }}" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10 - }, - "unit": "bytes" - } - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 16, - "y": 27 - } - } - ], - "annotations": { - "list": [] - } - }, - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "Prometheus datasource for runner metrics", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - 
"version": "9.0.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus" - }, - { - "type": "panel", - "id": "stat", - "name": "Stat" - }, - { - "type": "panel", - "id": "gauge", - "name": "Gauge" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series" - }, - { - "type": "panel", - "id": "table", - "name": "Table" - }, - { - "type": "panel", - "id": "barchart", - "name": "Bar chart" - }, - { - "type": "panel", - "id": "piechart", - "name": "Pie chart" - } - ] -} diff --git a/monitoring/grafana/dashboards/performance-trends.json b/monitoring/grafana/dashboards/performance-trends.json new file mode 100644 index 00000000..6c5edd0d --- /dev/null +++ b/monitoring/grafana/dashboards/performance-trends.json @@ -0,0 +1,681 @@ +{ + "dashboard": { + "id": null, + "uid": "github-runner-performance", + "title": "GitHub Actions Runners - Performance Trends", + "description": "Performance monitoring for GitHub Actions self-hosted runners: cache hit rates, build times, CPU/memory usage, and queue times", + "tags": [ + "github-actions", + "performance", + "monitoring", + "cache" + ], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "15s", + "time": { + "from": "now-24h", + "to": "now" + }, + "templating": { + "list": [ + { + "name": "runner_name", + "type": "query", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "query": "label_values(github_runner_info, runner_name)", + "multi": true, + "includeAll": true, + "current": { + "text": "All", + "value": "$__all" + }, + "refresh": 2 + }, + { + "name": "runner_type", + "type": "query", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "query": "label_values(github_runner_info, runner_type)", + "multi": true, + "includeAll": true, + "current": { + "text": "All", + "value": "$__all" + }, + "refresh": 2 + } + ] + }, + "panels": [ + { + "id": 1, + "title": "Performance Summary", + "type": "row", + "gridPos": { + 
"h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "collapsed": false + }, + { + "id": 2, + "title": "Avg Cache Hit Rate", + "description": "Average cache hit rate across all cache types (0-100%)", + "type": "stat", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "avg(github_runner_cache_hit_rate{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}) * 100", + "legendFormat": "Cache Hit %" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 30 + }, + { + "color": "yellow", + "value": 60 + }, + { + "color": "green", + "value": 80 + } + ] + }, + "min": 0, + "max": 100, + "unit": "percent" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 1 + } + }, + { + "id": 3, + "title": "CPU Usage", + "description": "Current average CPU usage across runner containers (requires cAdvisor)", + "type": "stat", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "avg(rate(container_cpu_usage_seconds_total{name=~\"github-runner.*\"}[5m])) * 100", + "legendFormat": "CPU %" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 60 + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 1 + } + }, + { + "id": 4, + "title": "Memory Usage", + "description": "Current average memory usage across runner containers (requires cAdvisor)", + "type": "stat", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "avg(container_memory_usage_bytes{name=~\"github-runner.*\"})", + "legendFormat": "Memory" + } + ], + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 2147483648 + }, + { + "color": "orange", + "value": 4294967296 + }, + { + "color": "red", + "value": 6442450944 + } + ] + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 1 + } + }, + { + "id": 5, + "title": "Build Time p50", + "description": "Median job duration (50th percentile) over the last 5 minutes", + "type": "stat", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(github_runner_job_duration_seconds_bucket{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}[5m])) by (le))", + "legendFormat": "p50" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 300 + }, + { + "color": "orange", + "value": 600 + }, + { + "color": "red", + "value": 1800 + } + ] + }, + "unit": "s" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 1 + } + }, + { + "id": 10, + "title": "Cache Performance", + "type": "row", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 6 + }, + "collapsed": false + }, + { + "id": 11, + "title": "Cache Hit Rate by Type", + "description": "Cache hit rate over time broken down by cache type (BuildKit, APT, npm)", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "github_runner_cache_hit_rate{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}", + "legendFormat": "{{ cache_type }} ({{ runner_name }})" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 20, + "pointSize": 5, + "showPoints": "auto" + }, + "min": 0, + 
"max": 1, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 7 + } + }, + { + "id": 12, + "title": "Cache Hit Rate by Runner", + "description": "Average cache hit rate per runner over time", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "avg by (runner_name, runner_type) (github_runner_cache_hit_rate{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"})", + "legendFormat": "{{ runner_name }} ({{ runner_type }})" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "auto" + }, + "min": 0, + "max": 1, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 7 + } + }, + { + "id": 20, + "title": "Resource Usage", + "type": "row", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 15 + }, + "collapsed": false + }, + { + "id": 21, + "title": "CPU Usage Over Time", + "description": "CPU usage percentage per runner container over time (requires cAdvisor)", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "rate(container_cpu_usage_seconds_total{name=~\"github-runner.*\"}[5m]) * 100", + "legendFormat": "{{ name }}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 20, + "pointSize": 5, + "showPoints": "auto" + }, + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 60 }, + { "color": "red", "value": 90 } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + } + }, + { + "id": 22, + "title": "Memory Usage Over Time", + "description": "Memory usage per runner container over time 
(requires cAdvisor)", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "container_memory_usage_bytes{name=~\"github-runner.*\"}", + "legendFormat": "{{ name }}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 20, + "pointSize": 5, + "showPoints": "auto" + }, + "unit": "bytes", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 2147483648 }, + { "color": "red", "value": 6442450944 } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + } + }, + { + "id": 30, + "title": "Build Performance", + "type": "row", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "collapsed": false + }, + { + "id": 31, + "title": "Job Duration Percentiles", + "description": "Job duration percentiles (p50, p90, p95, p99) over time", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(github_runner_job_duration_seconds_bucket{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}[5m])) by (le))", + "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.90, sum(rate(github_runner_job_duration_seconds_bucket{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}[5m])) by (le))", + "legendFormat": "p90" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(github_runner_job_duration_seconds_bucket{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}[5m])) by (le))", + "legendFormat": "p95" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(github_runner_job_duration_seconds_bucket{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}[5m])) by (le))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "auto" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 25 + } + }, + { + "id": 32, + "title": "Queue Time Trend", + "description": "Average queue time per runner over time", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "github_runner_queue_time_seconds{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}", + "legendFormat": "{{ runner_name }} ({{ runner_type }})" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 20, + "pointSize": 5, + "showPoints": "auto" + }, + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 30 }, + { "color": "red", "value": 300 } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + } + }, + { + "id": 33, + "title": "Avg Job Duration by Runner Type", + "description": "Average job duration broken down by runner type", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "sum by (runner_type) (github_runner_job_duration_seconds_sum{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}) / clamp_min(sum by (runner_type) (github_runner_job_duration_seconds_count{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}), 1)", + "legendFormat": "{{ runner_type }}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "bars", + "lineWidth": 1, + "fillOpacity": 50, + "pointSize": 5, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 33 + } + } + ], + "annotations": { + "list": [ + 
{ + "name": "Annotations & Alerts", + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "type": "dashboard", + "builtIn": 1 + } + ] + }, + "links": [ + { + "title": "Runner Overview", + "url": "/d/github-runner-runner-overview", + "type": "link", + "icon": "dashboard", + "tooltip": "View Runner Overview dashboard" + }, + { + "title": "DORA Metrics", + "url": "/d/github-runner-dora", + "type": "link", + "icon": "dashboard", + "tooltip": "View DORA Metrics dashboard" + }, + { + "title": "Job Analysis", + "url": "/d/github-runner-job-analysis", + "type": "link", + "icon": "dashboard", + "tooltip": "View Job Analysis dashboard" + } + ] + }, + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "Prometheus datasource for GitHub Actions runner metrics", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series" + } + ] +} diff --git a/monitoring/grafana/dashboards/runner-overview.json b/monitoring/grafana/dashboards/runner-overview.json new file mode 100644 index 00000000..ee35be8a --- /dev/null +++ b/monitoring/grafana/dashboards/runner-overview.json @@ -0,0 +1,555 @@ +{ + "dashboard": { + "id": null, + "uid": "github-runner-runner-overview", + "title": "GitHub Actions Runners - Runner Overview", + "description": "Overview of GitHub Actions self-hosted runner health, status, and general metrics", + "tags": [ + "github-actions", + "runners", + "overview", + "monitoring" + ], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "15s", + "time": { + "from": "now-24h", + "to": "now" + 
}, + "templating": { + "list": [ + { + "name": "runner_name", + "type": "query", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "query": "label_values(github_runner_info, runner_name)", + "multi": true, + "includeAll": true, + "current": { + "text": "All", + "value": "$__all" + }, + "refresh": 2 + }, + { + "name": "runner_type", + "type": "query", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "query": "label_values(github_runner_info, runner_type)", + "multi": true, + "includeAll": true, + "current": { + "text": "All", + "value": "$__all" + }, + "refresh": 2 + } + ] + }, + "panels": [ + { + "id": 1, + "title": "Runner Status", + "type": "row", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "collapsed": false + }, + { + "id": 2, + "title": "Runners Online", + "type": "stat", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "sum(github_runner_status{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"})", + "legendFormat": "Online" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "green", + "value": 2 + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + } + }, + { + "id": 3, + "title": "Total Jobs", + "type": "stat", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "sum(github_runner_jobs_total{status=\"total\",runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"})", + "legendFormat": "Total" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 1 + } + }, + { + "id": 4, + 
"title": "Success Rate", + "type": "gauge", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "sum(github_runner_jobs_total{status=\"success\",runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}) / clamp_min(sum(github_runner_jobs_total{status=\"total\",runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}), 1) * 100", + "legendFormat": "Success %" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 50 + }, + { + "color": "yellow", + "value": 80 + }, + { + "color": "green", + "value": 95 + } + ] + }, + "min": 0, + "max": 100, + "unit": "percent" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 1 + } + }, + { + "id": 5, + "title": "Runner Uptime", + "type": "stat", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "max(github_runner_uptime_seconds{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"})", + "legendFormat": "Max Uptime" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "yellow", + "value": null + }, + { + "color": "green", + "value": 3600 + } + ] + }, + "unit": "s" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 1 + } + }, + { + "id": 6, + "title": "Avg Queue Time", + "type": "stat", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "avg(github_runner_queue_time_seconds{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"})", + "legendFormat": "Queue Time" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 30 + }, + { + "color": "orange", + "value": 120 + }, + { + "color": 
"red", + "value": 300 + } + ] + }, + "unit": "s" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 1 + } + }, + { + "id": 7, + "title": "Runner Info", + "type": "table", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "github_runner_info{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}", + "legendFormat": "{{ runner_name }}", + "format": "table", + "instant": true + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + } + ] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 1 + } + }, + { + "id": 10, + "title": "Runner Health", + "type": "row", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "collapsed": false + }, + { + "id": 11, + "title": "Runner Status Over Time", + "description": "Runner online/offline status over time by runner name", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "github_runner_status{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"}", + "legendFormat": "{{ runner_name }} ({{ runner_type }})" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 20, + "pointSize": 5, + "showPoints": "auto" + }, + "min": 0, + "max": 1, + "unit": "none", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 6 + } + }, + { + "id": 12, + "title": "Uptime by Runner", + "description": "Uptime in hours for each runner", + "type": "timeseries", + 
"datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "targets": [ + { + "expr": "github_runner_uptime_seconds{runner_name=~\"$runner_name\",runner_type=~\"$runner_type\"} / 3600", + "legendFormat": "{{ runner_name }}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10 + }, + "unit": "h" + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 6 + } + }, + { + "id": 20, + "title": "Quick Links", + "type": "row", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 13 + }, + "collapsed": false + }, + { + "id": 21, + "title": "Dashboard Navigation", + "description": "Quick links to other runner dashboards", + "type": "text", + "options": { + "mode": "markdown", + "content": "### Related Dashboards\n\n| Dashboard | Description |\n|---|---|\n| **[DORA Metrics](/d/github-runner-dora)** | Deployment Frequency, Lead Time, Change Failure Rate, MTTR |\n| **[Job Analysis](/d/github-runner-job-analysis)** | Job durations, status breakdown, runner comparison |\n| **[Performance Trends](/d/github-runner-performance)** | Cache hit rates, CPU/Memory usage, build times |" + }, + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 14 + } + } + ], + "annotations": { + "list": [ + { + "name": "Annotations & Alerts", + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "type": "dashboard", + "builtIn": 1 + } + ] + }, + "links": [ + { + "title": "DORA Metrics", + "url": "/d/github-runner-dora", + "type": "link", + "icon": "dashboard", + "tooltip": "View DORA Metrics dashboard" + }, + { + "title": "Job Analysis", + "url": "/d/github-runner-job-analysis", + "type": "link", + "icon": "dashboard", + "tooltip": "View Job Analysis dashboard" + }, + { + "title": "Performance Trends", + "url": "/d/github-runner-performance", + "type": "link", + "icon": "dashboard", + 
"tooltip": "View Performance Trends dashboard" + } + ] + }, + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "Prometheus datasource for GitHub Actions runner metrics", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series" + }, + { + "type": "panel", + "id": "table", + "name": "Table" + }, + { + "type": "panel", + "id": "text", + "name": "Text" + } + ] +} diff --git a/monitoring/grafana/provisioning/dashboards/dashboards.yml b/monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 00000000..09dac89c --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,20 @@ +# Grafana Dashboard Provisioning Configuration +# Auto-loads all dashboard JSON files when Grafana starts +# +# Usage: Mount this file to /etc/grafana/provisioning/dashboards/dashboards.yml +# and mount the dashboards directory to /var/lib/grafana/dashboards/ + +apiVersion: 1 + +providers: + - name: 'GitHub Actions Runners' + orgId: 1 + folder: 'GitHub Actions' + type: file + disableDeletion: false + editable: true + updateIntervalSeconds: 30 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/plan/feature-prometheus-monitoring-1.md b/plan/feature-prometheus-monitoring-1.md index 2de94774..3f5d923e 100644 --- a/plan/feature-prometheus-monitoring-1.md +++ b/plan/feature-prometheus-monitoring-1.md @@ -153,16 +153,16 @@ This implementation plan provides a fully executable roadmap for adding Promethe | Task | Description | Completed | Date 
| |------|-------------|-----------|------| -| TASK-037 | Replaced `monitoring/grafana/dashboards/github-runner.json` with comprehensive DORA overview dashboard (24 panels across 4 rows: Runner Overview, DORA Metrics, Job Analysis, Performance) | ✅ | 2025-07-25 | -| TASK-038 | Configure dashboard variables: `runner_name` (multi-select from `github_runner_info`), `runner_type` (multi-select: standard, chrome, chrome-go) | ✅ | 2025-07-25 | +| TASK-037 | Create `monitoring/grafana/dashboards/runner-overview.json` — standalone Runner Overview dashboard (3 rows: Runner Status stats, Runner Health timeseries, Quick Links navigation). Replaced combined `github-runner.json` mega-dashboard. | ✅ | 2026-03-02 | +| TASK-038 | Configure dashboard variables: `runner_name` (multi-select from `github_runner_info`), `runner_type` (multi-select: standard, chrome, chrome-go) — applied to all 4 dashboards | ✅ | 2025-07-25 | | TASK-039 | Create `monitoring/grafana/dashboards/dora-metrics.json` with panels: Deployment Frequency, Lead Time, Change Failure Rate, MTTR, trend charts, and DORA classification reference table | ✅ | 2025-07-25 | -| TASK-040 | Performance trends panels integrated into github-runner.json Performance row (cache hit rate, CPU, memory) | ✅ | 2025-07-25 | +| TASK-040 | Create standalone `monitoring/grafana/dashboards/performance-trends.json` — 4 rows: Performance Summary stats, Cache Performance timeseries, Resource Usage (CPU/Memory), Build Performance (duration percentiles, queue time, runner type comparison) | ✅ | 2026-03-02 | | TASK-041 | Create `monitoring/grafana/dashboards/job-analysis.json` with panels: Job Duration Histogram, Jobs by Status, Percentile Trends, Queue Time, Runner Comparison | ✅ | 2025-07-25 | -| TASK-042 | Add dashboard metadata: title, description, tags, version, refresh interval (15s), time range (last 24h) | ✅ | 2025-07-25 | -| TASK-043 | Dashboard JSON validated with python3 json.tool | ✅ | 2025-07-25 | -| TASK-044 | Capture screenshots 
of each dashboard for documentation | | | -| TASK-045 | Export final dashboard JSON files with templating variables configured | ✅ | 2025-07-25 | -| TASK-046 | PromQL queries validated in dashboard definitions | ✅ | 2025-07-25 | +| TASK-042 | Add dashboard metadata: title, description, tags, version, refresh interval (15s), time range (last 24h). All 4 dashboards have consistent metadata, `__inputs`, `__requires`, and inter-dashboard navigation links. | ✅ | 2026-03-02 | +| TASK-043 | Dashboard JSON validated with python3 json.tool — all 4 files pass | ✅ | 2026-03-02 | +| TASK-044 | Capture screenshots of each dashboard for documentation | ⏳ | | +| TASK-045 | Export final dashboard JSON files with templating variables configured. Added Grafana provisioning config at `monitoring/grafana/provisioning/dashboards/dashboards.yml` for auto-loading. | ✅ | 2026-03-02 | +| TASK-046 | PromQL queries validated in dashboard definitions — all queries reference metrics from `metrics-collector.sh` or cAdvisor | ✅ | 2026-03-02 | ### Implementation Phase 5: Documentation & User Guide