diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1a6667b..41cf177 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -24,7 +24,7 @@ jobs: name: Set up Go uses: actions/setup-go@v6 with: - go-version: '1.25.x' + go-version: '1.25.11' - name: Run tests run: go test ./... diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2392697..62085e1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,15 +16,21 @@ jobs: - name: setup Go uses: actions/setup-go@v6 with: - go-version: '1.25.x' + go-version: '1.25.11' - run: go test -v ./... + validate-dashboard: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: validate dashboard JSON + run: ./scripts/validate-dashboard.sh dashboards/sonic-exporter.json golangci-lint: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: actions/setup-go@v6 with: - go-version: '1.25.x' + go-version: '1.25.11' cache: false - name: golangci-lint run: go run github.com/golangci/golangci-lint/cmd/golangci-lint@v1.64.8 run @@ -34,7 +40,7 @@ jobs: - uses: actions/checkout@v6 - uses: actions/setup-go@v6 with: - go-version: '1.25.x' + go-version: '1.25.11' - name: install govulncheck run: go install golang.org/x/vuln/cmd/govulncheck@latest - name: run govulncheck @@ -45,7 +51,7 @@ jobs: - uses: actions/checkout@v6 - uses: actions/setup-go@v6 with: - go-version: '1.25.x' + go-version: '1.25.11' - name: install gitleaks run: go install github.com/zricethezav/gitleaks/v8@latest - name: gitleaks scan working tree diff --git a/README.md b/README.md index 8cf3b57..c76eb6e 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,29 @@ For a deeper breakdown, see `docs/architecture.md`. Collector implementations live in `internal/collector/*_collector.go`. +## Grafana dashboard + +The Grafana dashboard lives in `dashboards/sonic-exporter.json`. It is a single-switch drilldown dashboard for Grafana 10 and Grafana 11. 
+ +Rows are ordered this way: + +- `Overview / Exporter Health` +- `Interfaces / Queues` +- `Hardware / CRM / Host` +- `Topology / L2` +- `Optional / FDB` +- `Optional / System` +- `Optional / Docker` +- `Optional / FRR` + +The optional rows are collapsed by default and are safe when those metrics are absent. Validate dashboard changes with: + +```bash +./scripts/validate-dashboard.sh dashboards/sonic-exporter.json +``` + +For import, provisioning, variables, validation, smoke checks, and limits, see `docs/grafana-dashboard.md`. + ## Quick start ### Run locally diff --git a/dashboards/sonic-exporter.json b/dashboards/sonic-exporter.json new file mode 100644 index 0000000..aad6763 --- /dev/null +++ b/dashboards/sonic-exporter.json @@ -0,0 +1,6528 @@ +{ + "id": null, + "uid": null, + "title": "SONiC Exporter", + "description": "Portable Grafana 10/11 dashboard for SONiC Exporter single-switch drilldown.", + "tags": [ + "sonic", + "prometheus", + "network", + "exporter" + ], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "1m", + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "30s", + "1m", + "5m", + "15m", + "30m", + "1h" + ] + }, + "editable": true, + "graphTooltip": 1, + "fiscalYearStartMonth": 0, + "weekStart": "", + "annotations": { + "list": [] + }, + "templating": { + "list": [ + { + "name": "datasource", + "label": "Prometheus datasource", + "type": "datasource", + "query": "prometheus", + "regex": "", + "hide": 0, + "refresh": 1, + "skipUrlSync": false, + "current": { + "selected": false, + "text": "", + "value": "" + }, + "options": [] + }, + { + "name": "job", + "label": "Job", + "type": "query", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(up, job)", + "query": "label_values(up, job)", + "regex": "", + "hide": 0, + "refresh": 1, + "sort": 1, + "multi": false, + "includeAll": false, + "skipUrlSync": false, + "current": { + 
"selected": false, + "text": "", + "value": "" + }, + "options": [] + }, + { + "name": "instance", + "label": "Instance", + "type": "query", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(up{job=\"$job\"}, instance)", + "query": "label_values(up{job=\"$job\"}, instance)", + "regex": "", + "hide": 0, + "refresh": 1, + "sort": 1, + "multi": false, + "includeAll": false, + "skipUrlSync": false, + "current": { + "selected": false, + "text": "", + "value": "" + }, + "options": [] + } + ] + }, + "panels": [ + { + "type": "row", + "title": "Overview / Exporter Health", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "panels": [], + "id": 1 + }, + { + "type": "stat", + "title": "Target Up", + "description": "Prometheus scrape reachability for the selected SONiC exporter target.", + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "up{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Target up" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bool", + "decimals": 2 + }, + "overrides": [] + }, + "id": 2 + }, + { + "type": "gauge", + "title": "Core Collector Success", + "description": "Always-registered SONiC collectors. 
Value 1 means the latest refresh succeeded.", + "gridPos": { + "h": 4, + "w": 8, + "x": 4, + "y": 1 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_interface_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Interface" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_hw_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "HW" + }, + { + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_crm_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "CRM" + }, + { + "refId": "D", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_queue_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Queue" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bool", + "decimals": 2 + }, + "overrides": [] + }, + "id": 3 + }, + { + "type": "stat", + "title": "Default Optional Collector Success", + "description": "LLDP, VLAN, and LAG are env-gated 
but default enabled. Missing data can mean the collector was disabled.", + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lldp_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "LLDP" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_vlan_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "VLAN" + }, + { + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lag_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "LAG" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bool", + "decimals": 2 + }, + "overrides": [] + }, + "id": 4 + }, + { + "type": "stat", + "title": "System Uptime", + "description": "Optional System collector uptime. 
No data is expected when SYSTEM_ENABLED=false.", + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_system_uptime_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "System uptime" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "s", + "decimals": 2 + }, + "overrides": [] + }, + "id": 5 + }, + { + "type": "table", + "title": "Collector Success Detail", + "description": "Compact table for core, default-enabled, and disabled-by-default SONiC collector health. 
Optional rows may be absent.", + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 5 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_interface_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Interface" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_hw_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "HW" + }, + { + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_crm_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "CRM" + }, + { + "refId": "D", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_queue_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Queue" + }, + { + "refId": "E", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lldp_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "LLDP" + }, + { + "refId": "F", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_vlan_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "VLAN" + }, + { + "refId": "G", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" 
+ }, + "expr": "sonic_lag_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "LAG" + }, + { + "refId": "H", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_fdb_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "FDB optional" + }, + { + "refId": "I", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_system_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "System optional" + }, + { + "refId": "J", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_docker_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Docker optional" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [] + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "sortBy": [ + { + "displayName": "Time", + "desc": true + } + ] + }, + "id": 6 + }, + { + "type": "timeseries", + "title": "Collector Scrape Duration", + "description": "Latest collector refresh duration. 
Includes core, default-enabled, and optional collectors; absent optional series are safe.", + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 5 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_interface_scrape_duration_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Interface" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_hw_scrape_duration_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "HW" + }, + { + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_crm_scrape_duration_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "CRM" + }, + { + "refId": "D", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_queue_scrape_duration_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Queue" + }, + { + "refId": "E", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lldp_scrape_duration_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "LLDP" + }, + { + "refId": "F", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_vlan_scrape_duration_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + 
"legendFormat": "VLAN" + }, + { + "refId": "G", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lag_scrape_duration_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "LAG" + }, + { + "refId": "H", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_fdb_scrape_duration_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "FDB optional" + }, + { + "refId": "I", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_system_scrape_duration_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "System optional" + }, + { + "refId": "J", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_docker_scrape_duration_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Docker optional" + } + ], + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull", + "max" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "s", + "decimals": 2 + }, + "overrides": [] + }, + "id": 7 + }, + { + "type": "gauge", + "title": "Cache Age", + "description": "Cache freshness for cached collectors. 
Core interface/HW/CRM/queue collectors do not expose cache-age metrics.", + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 12 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lldp_cache_age_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "LLDP" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_vlan_cache_age_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "VLAN" + }, + { + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lag_cache_age_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "LAG" + }, + { + "refId": "D", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_fdb_cache_age_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "FDB optional" + }, + { + "refId": "E", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_system_cache_age_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "System optional" + }, + { + "refId": "F", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_docker_cache_age_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Docker optional" + } + ], + 
"options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "s", + "decimals": 2 + }, + "overrides": [] + }, + "id": 8 + }, + { + "type": "stat", + "title": "Skipped / Truncated / Stale", + "description": "Safeguard signals for skipped records, FDB truncation, and stale Docker source data. Zero is healthy; absent optional series are expected when disabled.", + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 12 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lldp_entries_skipped{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "LLDP skipped" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_vlan_entries_skipped{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "VLAN skipped" + }, + { + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lag_entries_skipped{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "LAG skipped" + }, + { + "refId": "D", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_fdb_entries_skipped{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + 
"interval": "", + "intervalFactor": 2, + "legendFormat": "FDB skipped optional" + }, + { + "refId": "E", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_fdb_entries_truncated{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "FDB truncated optional" + }, + { + "refId": "F", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_docker_entries_skipped{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Docker skipped optional" + }, + { + "refId": "G", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_docker_source_stale{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Docker stale optional" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short", + "decimals": 2 + }, + "overrides": [] + }, + "id": 9 + }, + { + "type": "table", + "title": "System Version", + "description": "Optional System collector software/version metadata. 
No data is expected when SYSTEM_ENABLED=false.", + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 12 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_system_software_info{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "System software" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [] + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "sortBy": [ + { + "displayName": "Time", + "desc": true + } + ] + }, + "id": 10 + }, + { + "type": "row", + "title": "Interfaces / Queues", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "panels": [], + "id": 11 + }, + { + "type": "table", + "title": "Interface Admin / Oper Status", + "description": "One row per interface status series. 
The `device` label identifies the SONiC port; values are 1 for up and 0 for down.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_interface_admin_status{job=\"$job\",instance=\"$instance\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "admin {{device}}" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_interface_operational_status{job=\"$job\",instance=\"$instance\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "oper {{device}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "" + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "cellHeight": "sm", + "footer": { + "show": false, + "reducer": [ + "sum" + ], + "countRows": false, + "fields": "" + } + }, + "id": 12 + }, + { + "type": "stat", + "title": "Admin Down Interfaces", + "description": "Counts interfaces with admin status down for the selected switch.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 18 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(sonic_interface_admin_status{job=\"$job\",instance=\"$instance\"} == bool 0)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "admin down" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + 
"color": "red", + "value": 1 + } + ] + }, + "unit": "short", + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto" + }, + "id": 13 + }, + { + "type": "stat", + "title": "Oper Down Interfaces", + "description": "Counts interfaces with operational status down for the selected switch.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 18 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(sonic_interface_operational_status{job=\"$job\",instance=\"$instance\"} == bool 0)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "oper down" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short", + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto" + }, + "id": 14 + }, + { + "type": "table", + "title": "Interface Inventory", + "description": "Metadata table only. 
This keeps `sonic_interface_info` out of dense time-series graphs.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 22 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_interface_info{job=\"$job\",instance=\"$instance\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "{{device}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "" + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "cellHeight": "sm", + "footer": { + "show": false, + "reducer": [ + "sum" + ], + "countRows": false, + "fields": "" + } + }, + "id": 15 + }, + { + "type": "timeseries", + "title": "Top Interface Traffic", + "description": "Top-k ingress and egress interface traffic. 
Byte counters are converted to bits per second.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(8, 8 * sum by (device) (rate(sonic_interface_receive_bytes_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval])))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "RX {{device}}" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(8, 8 * sum by (device) (rate(sonic_interface_transmit_bytes_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval])))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "TX {{device}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bps" + }, + "overrides": [] + }, + "id": 16 + }, + { + "type": "timeseries", + "title": "Top Interface Packet Rates", + "description": "Packet counters are grouped by interface and method, then bounded with top-k.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(8, sum by (device, method) (rate(sonic_interface_receive_packets_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval])))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "RX {{device}} {{method}}" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(8, sum by (device, method) 
(rate(sonic_interface_transmit_packets_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval])))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "TX {{device}} {{method}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "id": 17 + }, + { + "type": "timeseries", + "title": "Top Interface Error Rates", + "description": "Receive and transmit error counters are grouped by interface and error type, then bounded with top-k.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(8, sum by (device, type) (rate(sonic_interface_receive_errs_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval])))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "RX {{device}} {{type}}" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(8, sum by (device, type) (rate(sonic_interface_transmit_errs_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval])))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "TX {{device}} {{type}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "id": 18 + }, + { + "type": "timeseries", + "title": "Transceiver Temperature", + "description": "Latest transceiver DOM temperature per interface, bounded to the warmest optics.", + "datasource": { + "type": "prometheus", 
+ "uid": "${datasource}" + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 34 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(16, sonic_interface_transceiver_temperature_celsius{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{device}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "id": 19 + }, + { + "type": "timeseries", + "title": "Transceiver Voltage", + "description": "Latest transceiver DOM voltage per interface, bounded to top visible values.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 34 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(16, sonic_interface_transceiver_voltage{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{device}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "volt" + }, + "overrides": [] + }, + "id": 20 + }, + { + "type": "timeseries", + "title": "Optical Power TX / RX", + "description": "Transceiver optical receive and transmit power. 
The `unit` label is preserved in the legend.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(16, sonic_interface_optic_receive_power_dbm{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "RX {{device}} {{unit}}" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(16, sonic_interface_optic_transmit_power_dbm{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "TX {{device}} {{unit}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "dBm" + }, + "overrides": [] + }, + "id": 21 + }, + { + "type": "timeseries", + "title": "Top Queue Packet Rates", + "description": "Queue packet counters are grouped by interface and queue, then limited with top-k.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(10, rate(sonic_queue_packets_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{device}} queue {{queue}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "id": 22 + }, + { + "type": 
"timeseries", + "title": "Top Queue Byte Rates", + "description": "Queue byte counters are converted to bits per second and bounded with top-k.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 50 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(10, 8 * rate(sonic_queue_bytes_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{device}} queue {{queue}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bps" + }, + "overrides": [] + }, + "id": 23 + }, + { + "type": "timeseries", + "title": "Top Queue Drop Rates", + "description": "Queue drops are high cardinality, so packet and byte drop counters are top-k bounded by interface and queue.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 50 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(10, rate(sonic_queue_dropped_packets_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "packets {{device}} queue {{queue}}" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(10, 8 * rate(sonic_queue_dropped_bytes_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "bits {{device}} queue {{queue}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "id": 24 + }, + { + "type": "timeseries", + "title": "Queue Shared Watermark Rate", + "description": "Shared watermark `_total` metric uses the Grafana rate interval and is bounded by top-k.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 58 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(10, rate(sonic_queue_shared_watermark_bytes_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{device}} queue {{queue}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "id": 25 + }, + { + "type": "timeseries", + "title": "Queue Watermark Detail Rate", + "description": "Detailed queue watermark series preserve `device`, `queue`, `type`, and `watermark` labels while top-k limits default rendering.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 58 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(10, rate(sonic_queue_watermark_bytes_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{device}} q{{queue}} {{type}} {{watermark}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 
null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "id": 26 + }, + { + "type": "row", + "title": "Hardware / CRM / Host", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 66 + }, + "panels": [], + "id": 27 + }, + { + "type": "table", + "title": "PSU Inventory", + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 67 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_hw_psu_info{job=\"$job\",instance=\"$instance\"}", + "format": "table", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "PSU" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "cellHeight": "sm", + "footer": { + "show": false + } + }, + "id": 28 + }, + { + "type": "stat", + "title": "PSU Operational Status", + "gridPos": { + "h": 6, + "w": 4, + "x": 8, + "y": 67 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_hw_psu_operational_status{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{slot}}" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + 
"mappings": [ + { + "type": "value", + "options": { + "0": { + "text": "Down", + "color": "red" + }, + "1": { + "text": "Up", + "color": "green" + } + } + } + ], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "id": 29 + }, + { + "type": "stat", + "title": "PSU Available Status", + "gridPos": { + "h": 6, + "w": 4, + "x": 12, + "y": 67 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_hw_psu_available_status{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{slot}}" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { + "text": "Down", + "color": "red" + }, + "1": { + "text": "Up", + "color": "green" + } + } + } + ], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "id": 30 + }, + { + "type": "table", + "title": "Chassis Info", + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 67 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"sonic_hw_chassis_info{job=\"$job\",instance=\"$instance\"}", + "format": "table", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Chassis" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "cellHeight": "sm", + "footer": { + "show": false + } + }, + "id": 31 + }, + { + "type": "timeseries", + "title": "PSU Electrical Metrics", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 73 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_hw_psu_voltage_volts{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{slot}} volts" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_hw_psu_current_amperes{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{slot}} amps" + }, + { + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_hw_psu_power_watts{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{slot}} watts" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + 
"displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 32 + }, + { + "type": "timeseries", + "title": "PSU Temperature", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 73 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_hw_psu_temperature_celsius{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{slot}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "celsius", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 33 + }, + { + "type": "timeseries", + "title": "Fan RPM", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 81 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_hw_fan_rpm{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{slot}}/{{name}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "rpm", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + 
"placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 34 + }, + { + "type": "timeseries", + "title": "Fan Status", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 81 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_hw_fan_operational_status{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{slot}}/{{name}} operational" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_hw_fan_available_status{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{slot}}/{{name}} available" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { + "text": "Down", + "color": "red" + }, + "1": { + "text": "Up", + "color": "green" + } + } + } + ], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 35 + }, + { + "type": "timeseries", + "title": "CRM Resources Used / Available", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 89 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, 
sonic_crm_resource_used{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{resource}} used" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, sonic_crm_resource_available{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{resource}} available" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 36 + }, + { + "type": "timeseries", + "title": "CRM Resource Used Percent", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 89 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, 100 * sonic_crm_resource_used{job=\"$job\",instance=\"$instance\"} / (sonic_crm_resource_used{job=\"$job\",instance=\"$instance\"} + sonic_crm_resource_available{job=\"$job\",instance=\"$instance\"}))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{resource}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": 
"bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 37 + }, + { + "type": "timeseries", + "title": "CRM ACL Used / Available", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 97 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, sonic_crm_acl_resource_used{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{acl_target}}/{{resource}} used" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, sonic_crm_acl_resource_available{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{acl_target}}/{{resource}} available" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 38 + }, + { + "type": "timeseries", + "title": "CRM ACL Used Percent", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 97 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, 100 * sonic_crm_acl_resource_used{job=\"$job\",instance=\"$instance\"} / (sonic_crm_acl_resource_used{job=\"$job\",instance=\"$instance\"} + 
sonic_crm_acl_resource_available{job=\"$job\",instance=\"$instance\"}))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{acl_target}}/{{resource}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 39 + }, + { + "type": "timeseries", + "title": "Host CPU By Mode", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 105 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "avg by (mode) (rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval])) * 100", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{mode}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 40 + }, + { + "type": "timeseries", + "title": "Host Load Average", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 105 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "expr": "node_load1{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "load1" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_load5{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "load5" + }, + { + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_load15{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "load15" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 41 + }, + { + "type": "timeseries", + "title": "Host CPU Usage By CPU", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 113 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "100 * (1 - avg by (cpu) (rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"idle\"}[$__rate_interval])))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU {{cpu}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": 
[ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 94 + }, + { + "type": "timeseries", + "title": "Host Memory Available / Total", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 121 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_MemAvailable_bytes{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "available" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_memory_MemTotal_bytes{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "total" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "bytes", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 42 + }, + { + "type": "timeseries", + "title": "Filesystem Used Percent", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 121 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(8, 100 * (1 - 
node_filesystem_avail_bytes{job=\"$job\",instance=\"$instance\",fstype!~\"tmpfs|fuse.lxcfs|squashfs|overlay|cifs\",mountpoint!~\"/run.*|/var/lib/docker.*\"} / node_filesystem_size_bytes{job=\"$job\",instance=\"$instance\",fstype!~\"tmpfs|fuse.lxcfs|squashfs|overlay|cifs\",mountpoint!~\"/run.*|/var/lib/docker.*\"}))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{mountpoint}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 43 + }, + { + "type": "timeseries", + "title": "Disk Read / Write Throughput", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 129 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(8, rate(node_disk_read_bytes_total{job=\"$job\",instance=\"$instance\",device!~\"loop.*|ram.*|sr.*\"}[$__rate_interval]))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{device}} read" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(8, rate(node_disk_written_bytes_total{job=\"$job\",instance=\"$instance\",device!~\"loop.*|ram.*|sr.*\"}[$__rate_interval]))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{device}} write" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": 
"Bps", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 44 + }, + { + "type": "timeseries", + "title": "Disk I/O Operations And Busy Time", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 129 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(8, rate(node_disk_reads_completed_total{job=\"$job\",instance=\"$instance\",device!~\"loop.*|ram.*|sr.*\"}[$__rate_interval]))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{device}} reads" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(8, rate(node_disk_writes_completed_total{job=\"$job\",instance=\"$instance\",device!~\"loop.*|ram.*|sr.*\"}[$__rate_interval]))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{device}} writes" + }, + { + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(8, rate(node_disk_io_time_seconds_total{job=\"$job\",instance=\"$instance\",device!~\"loop.*|ram.*|sr.*\"}[$__rate_interval]))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{device}} busy" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "ops", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + 
"displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 45 + }, + { + "type": "stat", + "title": "Host Uptime From Boot Time", + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 137 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "node_time_seconds{job=\"$job\",instance=\"$instance\"} - node_boot_time_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "uptime" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "id": 46 + }, + { + "type": "stat", + "title": "Host Time Drift", + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 137 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "time() - node_time_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "drift" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + 
"justifyMode": "auto", + "displayMode": "basic" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "id": 47 + }, + { + "type": "timeseries", + "title": "Host Context Switches / Interrupts", + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 137 + }, + "description": "", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "rate(node_context_switches_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "context switches" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "rate(node_intr_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "interrupts" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "ops", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 48 + }, + { + "type": "row", + "title": "Topology / L2", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 142 + }, + "panels": [], + "id": 49 + }, + { + "type": "stat", + "title": "LLDP Neighbors", + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 143 + }, + "description": "Neighbor count from the LLDP 
collector. No data can mean LLDP is disabled or the collector has not emitted yet.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lldp_neighbors{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "neighbors" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "No data" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "id": 50 + }, + { + "type": "stat", + "title": "VLAN Member Counts", + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 143 + }, + "description": "One card per VLAN when VLAN metrics are present.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_vlan_members{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{vlan}}" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "No data" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "", + "thresholds": { + 
"mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "id": 51 + }, + { + "type": "stat", + "title": "LAG Member Counts", + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 143 + }, + "description": "One card per PortChannel when LAG metrics are present.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lag_members{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{lag}}" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "No data" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "id": 52 + }, + { + "type": "stat", + "title": "LAG Members Up", + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 143 + }, + "description": "Sum of enabled LAG member interfaces. 
The table below shows each member.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(sonic_lag_member_status{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "enabled members" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "No data" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "id": 53 + }, + { + "type": "stat", + "title": "Entries Skipped", + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 143 + }, + "description": "Skipped entries explain invalid input, caps, or incomplete Redis data during the latest refresh.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lldp_entries_skipped{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "LLDP" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_vlan_entries_skipped{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "VLAN" + }, + { + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"sonic_lag_entries_skipped{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "LAG" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "No data" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "id": 54 + }, + { + "type": "stat", + "title": "Cache Age", + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 143 + }, + "description": "Age of the cached LLDP, VLAN, and LAG collector refreshes. No data is expected when a collector is disabled.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lldp_cache_age_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "LLDP" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_vlan_cache_age_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "VLAN" + }, + { + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lag_cache_age_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "LAG" + } + ], + "options": { 
+ "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "No data" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 120 + }, + { + "color": "red", + "value": 300 + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "id": 55 + }, + { + "type": "stat", + "title": "VLAN Admin Status", + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 147 + }, + "description": "Administrative state per VLAN.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_vlan_admin_status{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{vlan}}" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "No data" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { + "text": "down", + "color": "red" + }, + "1": { + "text": "up", + "color": "green" + } + } + } + ], + "unit": "", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "id": 56 + }, + { + "type": "stat", + "title": "VLAN Oper Status", + "gridPos": { + 
"h": 4, + "w": 6, + "x": 6, + "y": 147 + }, + "description": "Operational state per VLAN.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_vlan_oper_status{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{vlan}}" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "No data" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { + "text": "down", + "color": "red" + }, + "1": { + "text": "up", + "color": "green" + } + } + } + ], + "unit": "", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "id": 57 + }, + { + "type": "stat", + "title": "LAG Admin Status", + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 147 + }, + "description": "Administrative state per PortChannel.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lag_admin_status{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{lag}}" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + 
"displayMode": "basic", + "noValue": "No data" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { + "text": "down", + "color": "red" + }, + "1": { + "text": "up", + "color": "green" + } + } + } + ], + "unit": "", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "id": 58 + }, + { + "type": "stat", + "title": "LAG Oper Status", + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 147 + }, + "description": "Operational state per PortChannel.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lag_oper_status{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{lag}}" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "No data" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { + "text": "down", + "color": "red" + }, + "1": { + "text": "up", + "color": "green" + } + } + } + ], + "unit": "", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "id": 59 + }, + { + "type": "table", + "title": "LLDP Neighbor Table", + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 151 + }, + "description": "High-label LLDP metadata stays in a table instead of 
dense time-series. Labels include local interface, local role, remote system, remote port, chassis, and management address when exported.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lldp_neighbor_info{job=\"$job\",instance=\"$instance\"}", + "format": "table", + "instant": true, + "interval": "", + "intervalFactor": 2 + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "cellHeight": "sm", + "footer": { + "show": false, + "reducer": [ + "sum" + ], + "countRows": false, + "fields": "" + }, + "sortBy": [] + }, + "id": 60 + }, + { + "type": "table", + "title": "VLAN Member Table", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 160 + }, + "description": "VLAN membership table keyed by VLAN, member interface, and tagging mode.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_vlan_member_info{job=\"$job\",instance=\"$instance\"}", + "format": "table", + "instant": true, + "interval": "", + "intervalFactor": 2 + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "cellHeight": "sm", + "footer": { + "show": false, + "reducer": [ + "sum" + ], + "countRows": false, + "fields": "" + }, + "sortBy": [] + }, + "id": 61 + }, + { + "type": "table", + "title": "LAG Member 
Status Table", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 160 + }, + "description": "Per-member PortChannel status. Value 1 means enabled and 0 means disabled.", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_lag_member_status{job=\"$job\",instance=\"$instance\"}", + "format": "table", + "instant": true, + "interval": "", + "intervalFactor": 2 + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { + "text": "disabled", + "color": "red" + }, + "1": { + "text": "enabled", + "color": "green" + } + } + } + ], + "unit": "", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "cellHeight": "sm", + "footer": { + "show": false, + "reducer": [ + "sum" + ], + "countRows": false, + "fields": "" + }, + "sortBy": [] + }, + "id": 62 + }, + { + "type": "row", + "title": "Optional / FDB", + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 168 + }, + "panels": [ + { + "type": "stat", + "title": "FDB Entries", + "description": "Total FDB entries from the disabled-by-default FDB collector.", + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_fdb_entries{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "entries" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + 
"colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "collector off" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "id": 64 + }, + { + "type": "stat", + "title": "FDB Unknown VLAN", + "description": "Entries where the collector could not map the bridge VLAN ID.", + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_fdb_entries_unknown_vlan{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "unknown VLAN" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "collector off" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "id": 65 + }, + { + "type": "stat", + "title": "FDB Skipped Entries", + "description": "Entries skipped during the latest FDB refresh.", + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"sonic_fdb_entries_skipped{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "skipped" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "collector off" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "id": 66 + }, + { + "type": "stat", + "title": "FDB Truncated", + "description": "1 means FDB collection reached its configured entry cap.", + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_fdb_entries_truncated{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "truncated" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "collector off" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "id": 67 + }, + { + "type": 
"timeseries", + "title": "Top FDB Ports", + "description": "Top per-port FDB counts. Limited with topk to keep the optional row readable.", + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 4 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, sonic_fdb_entries_by_port{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{port}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 68 + }, + { + "type": "timeseries", + "title": "Top FDB VLANs", + "description": "Top per-VLAN FDB counts. 
Limited with topk because VLAN count can be high.", + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 4 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, sonic_fdb_entries_by_vlan{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{vlan}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 69 + }, + { + "type": "timeseries", + "title": "FDB by Type", + "description": "FDB count split by exported entry type.", + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 11 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_fdb_entries_by_type{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{entry_type}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 70 + }, + { + 
"type": "timeseries", + "title": "FDB Cache and Scrape", + "description": "FDB collector cache age and refresh duration. No data is expected when FDB is disabled.", + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 11 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_fdb_cache_age_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "cache age" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_fdb_scrape_duration_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "scrape duration" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 71 + } + ], + "id": 63 + }, + { + "type": "row", + "title": "Optional / System", + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 169 + }, + "panels": [ + { + "type": "table", + "title": "System Identity", + "description": "Identity labels from Redis, read-only files, or allowlisted command fallback.", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"sonic_system_identity_info{job=\"$job\",instance=\"$instance\"}", + "format": "table", + "instant": true, + "interval": "", + "intervalFactor": 2 + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [] + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "sortBy": [] + }, + "id": 73 + }, + { + "type": "table", + "title": "System Software", + "description": "SONiC software metadata. Kept as a table so labels are not graphed as time series.", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_system_software_info{job=\"$job\",instance=\"$instance\"}", + "format": "table", + "instant": true, + "interval": "", + "intervalFactor": 2 + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [] + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "sortBy": [] + }, + "id": 74 + }, + { + "type": "stat", + "title": "System Uptime", + "description": "Switch uptime from the optional system collector.", + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_system_uptime_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "uptime" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "collector off" + }, + "fieldConfig": { + "defaults": { 
+ "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "id": 75 + }, + { + "type": "stat", + "title": "System Collector", + "description": "1 means the latest optional system collector refresh succeeded.", + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_system_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "collector success" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "collector off" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "id": 76 + }, + { + "type": "stat", + "title": "System Cache Age", + "description": "Age of the optional system collector cache.", + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_system_cache_age_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "cache 
age" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "collector off" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "id": 77 + }, + { + "type": "stat", + "title": "System Scrape Duration", + "description": "Duration of the latest optional system collector refresh.", + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_system_scrape_duration_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "scrape duration" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "collector off" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "id": 78 + } + ], + "id": 72 + }, + { + "type": "row", + "title": "Optional / Docker", + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 170 + }, + "panels": [ + { + "type": "stat", + "title": "Docker Containers", + "description": "Container 
count from SONiC STATE_DB DOCKER_STATS entries.", + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_docker_containers{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "containers" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "collector off" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "id": 80 + }, + { + "type": "stat", + "title": "Docker Collector", + "description": "1 means the latest optional Docker collector refresh succeeded.", + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_docker_collector_success{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "collector success" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "collector off" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode":
"thresholds" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 0.5 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "id": 81 + }, + { + "type": "stat", + "title": "Docker Source Stale", + "description": "1 means STATE_DB Docker source data is older than the configured threshold.", + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_docker_source_stale{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "source stale" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "collector off" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "id": 82 + }, + { + "type": "stat", + "title": "Docker Source Age", + "description": "Age of DOCKER_STATS source data.", + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_docker_source_age_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, +
"legendFormat": "source age" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "collector off" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "id": 83 + }, + { + "type": "table", + "title": "Docker Container Info", + "description": "Container inventory table. The collector exports only the bounded container label.", + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 4 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_docker_container_info{job=\"$job\",instance=\"$instance\"}", + "format": "table", + "instant": true, + "interval": "", + "intervalFactor": 2 + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [] + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "sortBy": [] + }, + "id": 84 + }, + { + "type": "timeseries", + "title": "Top Docker CPU", + "description": "Top containers by CPU percent.", + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 4 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, sonic_docker_container_cpu_percent{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{container}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "palette-classic" + }, + "mappings": [], + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 85 + }, + { + "type": "timeseries", + "title": "Top Docker Memory", + "description": "Top container memory usage and exported memory limit.", + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 11 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, sonic_docker_container_memory_usage_bytes{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "used {{container}}" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, sonic_docker_container_memory_limit_bytes{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "limit {{container}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "bytes", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 86 + }, + { + "type": "timeseries", + "title": "Top Docker Memory Percent", + "description": "Top containers by memory percent.", + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 11 + }, + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, sonic_docker_container_memory_percent{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{container}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 87 + }, + { + "type": "timeseries", + "title": "Top Docker Network I/O Rate", + "description": "Container network receive and transmit rates, limited with topk.", + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 18 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, rate(sonic_docker_container_network_receive_bytes_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "rx {{container}}" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, rate(sonic_docker_container_network_transmit_bytes_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "tx {{container}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": 
"Bps", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 88 + }, + { + "type": "timeseries", + "title": "Top Docker Block I/O Rate", + "description": "Container block read and write rates, limited with topk.", + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 18 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, rate(sonic_docker_container_block_read_bytes_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "read {{container}}" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(12, rate(sonic_docker_container_block_write_bytes_total{job=\"$job\",instance=\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "write {{container}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "Bps", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 89 + }, + { + "type": "timeseries", + "title": "Docker Cache and Scrape", + "description": "Docker collector cache, scrape, and skipped-entry safeguards.", + "gridPos": { + "h": 7, + "w": 
12, + "x": 0, + "y": 25 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_docker_cache_age_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "cache age" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_docker_scrape_duration_seconds{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "scrape duration" + }, + { + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sonic_docker_entries_skipped{job=\"$job\",instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "entries skipped" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 90 + } + ], + "id": 79 + }, + { + "type": "row", + "title": "Optional / FRR", + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 171 + }, + "panels": [ + { + "type": "timeseries", + "title": "FRR Collector Up", + "description": "Conservative FRR wrapper health by upstream collector. 
No peer or route detail metrics are included.", + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (collector) (frr_collector_up{job=\"$job\",instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{collector}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "id": 92 + }, + { + "type": "stat", + "title": "FRR Healthy Collectors", + "description": "Count of FRR upstream collectors reporting up. 
No data is expected when FRR is disabled.", + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "count(frr_collector_up{job=\"$job\",instance=\"$instance\"} == 1)", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "healthy collectors" + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "displayMode": "basic", + "noValue": "collector off" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "id": 93 + } + ], + "id": 91 + } + ] +} diff --git a/docs/grafana-dashboard.md b/docs/grafana-dashboard.md new file mode 100644 index 0000000..db5cd2d --- /dev/null +++ b/docs/grafana-dashboard.md @@ -0,0 +1,84 @@ +# SONiC Exporter Grafana Dashboard + +This dashboard gives operators a single-switch view of SONiC exporter health, traffic, and service state. +It is built for Grafana 10 and Grafana 11 and is meant to be portable across Prometheus setups. + +## What the dashboard does + +The dashboard helps you check whether `sonic-exporter` is collecting data, which collectors are active, and whether the main SONiC subsystems are healthy. +It is focused on drilldown for one switch at a time, not fleet overview. + +## Requirements + +- Grafana 10 or Grafana 11. +- A Prometheus data source that scrapes `sonic-exporter`. +- Metrics from the default collectors, plus any optional collectors you enable. 
+ +## Import in Grafana + +Import `dashboards/sonic-exporter.json` from the Grafana dashboard import screen. +After import, pick the Prometheus data source with the `datasource` variable at the top of the dashboard. + +If you prefer the API, import the same JSON file through the Grafana dashboard import endpoint. + +## Provisioning option + +You can also provision the dashboard from `dashboards/sonic-exporter.json`. +Keep the file path stable and point Grafana at the checked-in JSON so the dashboard stays in sync with the repo. + +## Variables + +The dashboard uses only these variables: + +- `datasource`, the Prometheus data source for `sonic-exporter`. +- `job`, the Prometheus job label for the exporter scrape target. +- `instance`, the Prometheus instance label for one switch. + +Do not add site, role, or hostname variables unless the dashboard design changes later. + +## Row order + +The checked-in dashboard uses this row order: + +- `Overview / Exporter Health` +- `Interfaces / Queues` +- `Hardware / CRM / Host` +- `Topology / L2` +- `Optional / FDB` +- `Optional / System` +- `Optional / Docker` +- `Optional / FRR` + +The first four rows are expanded by default. They cover exporter health, interface and queue traffic, hardware, CRM, host health, LLDP, VLAN, and LAG signals. + +The optional FDB, System, Docker, and FRR rows are collapsed by default. They do not need to return data for the default dashboard to be useful. + +## Validation commands + +Run these commands after changing the dashboard JSON: + +```bash +jq empty dashboards/sonic-exporter.json +./scripts/validate-dashboard.sh dashboards/sonic-exporter.json +go test ./... +``` + +## Grafana 10 and 11 smoke test + +The smoke test imports the dashboard into Grafana 10 and Grafana 11 with Docker, then confirms the `datasource`, `job`, and `instance` variables are present.
+ +Example Grafana images for a local smoke test: + +```bash +docker run -d --name sonic-exporter-grafana-10-smoke-task10 -p 127.0.0.1:13010:3000 grafana/grafana:10.4.15 +docker run -d --name sonic-exporter-grafana-11-smoke-task10 -p 127.0.0.1:13011:3000 grafana/grafana:11.6.0 +``` + +If Docker or Grafana cannot run in your environment, record the exact error message in your validation notes. Do not hide the failure. + +## Known limits and cardinality warnings + +- This dashboard is for one switch at a time, not a fleet summary. +- Keep labels bounded. Do not add high-cardinality labels to panels. +- Optional collectors may be absent on some switches, so panels for them should not be required for the dashboard to load. +- The dashboard should stay portable, so avoid hard-coded hostnames, site names, or private IPs in the JSON or docs. diff --git a/internal/collector/collector_test.go b/internal/collector/collector_test.go index 2ebc25e..907cf4b 100644 --- a/internal/collector/collector_test.go +++ b/internal/collector/collector_test.go @@ -6,6 +6,8 @@ import ( "io" "log/slog" "os" + "reflect" + "sort" "strings" "testing" "time" @@ -13,6 +15,7 @@ import ( "github.com/alicebob/miniredis/v2" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" + clientModel "github.com/prometheus/client_model/go" "github.com/prometheus/common/promslog" "github.com/vinted/sonic-exporter/pkg/redis" ) @@ -41,6 +44,89 @@ func assertMetricFamilyPresence(t *testing.T, c prometheus.Collector, metricName } } +func getMetricFamily(t *testing.T, c prometheus.Collector, metricName string) *clientModel.MetricFamily { + t.Helper() + + registry := prometheus.NewRegistry() + registry.MustRegister(c) + + metricFamilies, err := registry.Gather() + if err != nil { + t.Fatalf("failed to gather metrics: %v", err) + } + + for _, mf := range metricFamilies { + if mf.GetName() == metricName { + return mf + } + } + + return nil +} + +func 
hasPsuSlotInMetricFamily(metricFamily *clientModel.MetricFamily, slot string) bool { + if metricFamily == nil { + return false + } + + for _, metric := range metricFamily.Metric { + for _, label := range metric.Label { + if label.GetName() == "slot" && label.GetValue() == slot { + return true + } + } + } + + return false +} + +func collectPsuSlotsFromMetricFamily(metricFamily *clientModel.MetricFamily) []string { + if metricFamily == nil { + return nil + } + + seen := make(map[string]struct{}) + + for _, metric := range metricFamily.Metric { + for _, label := range metric.Label { + if label.GetName() == "slot" { + seen[label.GetValue()] = struct{}{} + } + } + } + + slots := make([]string, 0, len(seen)) + for slot := range seen { + slots = append(slots, slot) + } + + sort.Strings(slots) + return slots +} + +func collectMetricFamilyNamesWithPrefix(t *testing.T, c prometheus.Collector, prefix string) []string { + t.Helper() + + registry := prometheus.NewRegistry() + registry.MustRegister(c) + + metricFamilies, err := registry.Gather() + if err != nil { + t.Fatalf("failed to gather metrics: %v", err) + } + + var names []string + for _, mf := range metricFamilies { + name := mf.GetName() + if strings.HasPrefix(name, prefix) { + names = append(names, name) + } + } + + sort.Strings(names) + return names +} + type redisDatabase struct { DbId string `json:"id"` Data map[string]map[string]string `json:"data"` @@ -194,15 +280,34 @@ func TestHwCollector(t *testing.T) { metadata := ` # HELP sonic_hw_collector_success Whether hw collector succeeded # TYPE sonic_hw_collector_success gauge + # HELP sonic_hw_psu_voltage_volts PSU voltage + # TYPE sonic_hw_psu_voltage_volts gauge + # HELP sonic_hw_psu_current_amperes PSU current + # TYPE sonic_hw_psu_current_amperes gauge + # HELP sonic_hw_psu_power_watts PSU power + # TYPE sonic_hw_psu_power_watts gauge ` expected := ` sonic_hw_collector_success 1 + sonic_hw_psu_voltage_volts{slot="1"} 12.4 + sonic_hw_psu_voltage_volts{slot="2"} 
12.3 + sonic_hw_psu_current_amperes{slot="1"} 5 + sonic_hw_psu_current_amperes{slot="2"} 5 + sonic_hw_psu_power_watts{slot="1"} 60 + sonic_hw_psu_power_watts{slot="2"} 60 ` success_metric := "sonic_hw_collector_success" - if err := testutil.CollectAndCompare(hwCollector, strings.NewReader(metadata+expected), success_metric); err != nil { + if err := testutil.CollectAndCompare( + hwCollector, + strings.NewReader(metadata+expected), + success_metric, + "sonic_hw_psu_voltage_volts", + "sonic_hw_psu_current_amperes", + "sonic_hw_psu_power_watts", + ); err != nil { t.Errorf("unexpected collecting result:\n%s", err) } } @@ -214,7 +319,24 @@ func TestHwCollectorMetricFilter(t *testing.T) { t.Run("default emits hw metrics", func(t *testing.T) { hwCollector := NewHwCollector(logger, NewMetricFilter(logger)) assertMetricFamilyPresence(t, hwCollector, "sonic_hw_fan_rpm", true) + assertMetricFamilyPresence(t, hwCollector, "sonic_hw_psu_voltage_volts", true) + assertMetricFamilyPresence(t, hwCollector, "sonic_hw_psu_current_amperes", true) + assertMetricFamilyPresence(t, hwCollector, "sonic_hw_psu_power_watts", true) assertMetricFamilyPresence(t, hwCollector, "sonic_hw_psu_info", true) + + psuMetricNames := collectMetricFamilyNamesWithPrefix(t, hwCollector, "sonic_hw_psu_") + expectedPsuMetricNames := []string{ + "sonic_hw_psu_available_status", + "sonic_hw_psu_current_amperes", + "sonic_hw_psu_info", + "sonic_hw_psu_operational_status", + "sonic_hw_psu_power_watts", + "sonic_hw_psu_voltage_volts", + } + + if !reflect.DeepEqual(expectedPsuMetricNames, psuMetricNames) { + t.Fatalf("unexpected PSU metric families: got %v want %v", psuMetricNames, expectedPsuMetricNames) + } }) t.Run("wildcard disable removes fan metric families", func(t *testing.T) { @@ -233,6 +355,53 @@ func TestHwCollectorMetricFilter(t *testing.T) { }) } +func TestHwCollectorPsuNumericMetricParsing(t *testing.T) { + ctx := context.Background() + redisClient, err := redis.NewClient() + if err != nil { + 
t.Fatalf("failed to create redis client: %v", err) + } + + err = redisClient.HsetToDb(ctx, "STATE_DB", "PSU_INFO|PSU 3", map[string]string{ + "presence": "true", + "status": "true", + "model": "BAD-MODEL", + "serial": "BAD-SERIAL", + "voltage": "", + "current": "N/A", + "power": "not-a-number", + }) + if err != nil { + t.Fatalf("failed to write invalid PSU sample data: %v", err) + } + + promslogConfig := &promslog.Config{} + logger := promslog.New(promslogConfig) + + hwCollector := NewHwCollector(logger, NewMetricFilter(logger)) + hwCollector.lastScrapeTime = time.Time{} + hwCollector.cachedMetrics = nil + + voltageFamily := getMetricFamily(t, hwCollector, "sonic_hw_psu_voltage_volts") + if hasPsuSlotInMetricFamily(voltageFamily, "3") { + t.Fatalf("unexpected voltage metric for PSU 3 with empty value") + } + + currentFamily := getMetricFamily(t, hwCollector, "sonic_hw_psu_current_amperes") + if hasPsuSlotInMetricFamily(currentFamily, "3") { + t.Fatalf("unexpected current metric for PSU 3 with N/A value") + } + + powerFamily := getMetricFamily(t, hwCollector, "sonic_hw_psu_power_watts") + if hasPsuSlotInMetricFamily(powerFamily, "3") { + t.Fatalf("unexpected power metric for PSU 3 with malformed value") + } + + if !hasPsuSlotInMetricFamily(voltageFamily, "1") || !hasPsuSlotInMetricFamily(voltageFamily, "2") { + t.Fatalf("expected valid voltage values from fixture PSUs 1 and 2, got %v", collectPsuSlotsFromMetricFamily(voltageFamily)) + } +} + func TestCrmCollector(t *testing.T) { promslogConfig := &promslog.Config{} logger := promslog.New(promslogConfig) diff --git a/internal/collector/hw_collector.go b/internal/collector/hw_collector.go index d66aa27..f28dc99 100644 --- a/internal/collector/hw_collector.go +++ b/internal/collector/hw_collector.go @@ -5,6 +5,7 @@ import ( "fmt" "log/slog" "regexp" + "strconv" "strings" "sync" "time" @@ -14,42 +15,40 @@ import ( ) type hwCollector struct { - hwPsuInfo *prometheus.Desc - hwPsuInputVoltageVolts *prometheus.Desc - 
hwPsuInputCurrentAmperes *prometheus.Desc - hwPsuOutputVoltageVolts *prometheus.Desc - hwPsuOutputCurrentAmperes *prometheus.Desc - hwPsuOperationalStatus *prometheus.Desc - hwPsuAvailableStatus *prometheus.Desc - hwPsuTemperatureCelsius *prometheus.Desc - hwFanRpm *prometheus.Desc - hwFanOperationalStatus *prometheus.Desc - hwFanAvailableStatus *prometheus.Desc - hwChassisInfo *prometheus.Desc - scrapeDuration *prometheus.Desc - scrapeCollectorSuccess *prometheus.Desc - cachedMetrics []prometheus.Metric - lastScrapeTime time.Time - logger *slog.Logger - metricFilter MetricFilter - mu sync.Mutex + hwPsuInfo *prometheus.Desc + hwPsuVoltageVolts *prometheus.Desc + hwPsuCurrentAmperes *prometheus.Desc + hwPsuPowerWatts *prometheus.Desc + hwPsuOperationalStatus *prometheus.Desc + hwPsuAvailableStatus *prometheus.Desc + hwPsuTemperatureCelsius *prometheus.Desc + hwFanRpm *prometheus.Desc + hwFanOperationalStatus *prometheus.Desc + hwFanAvailableStatus *prometheus.Desc + hwChassisInfo *prometheus.Desc + scrapeDuration *prometheus.Desc + scrapeCollectorSuccess *prometheus.Desc + cachedMetrics []prometheus.Metric + lastScrapeTime time.Time + logger *slog.Logger + metricFilter MetricFilter + mu sync.Mutex } const ( - hwPsuInfoMetricName = "sonic_hw_psu_info" - hwPsuInputVoltageVoltsMetricName = "sonic_hw_psu_input_voltage_volts" - hwPsuInputCurrentAmperesMetricName = "sonic_hw_psu_input_current_amperes" - hwPsuOutputVoltageVoltsMetricName = "sonic_hw_psu_output_voltage_volts" - hwPsuOutputCurrentAmperesMetricName = "sonic_hw_psu_output_current_amperes" - hwPsuOperationalStatusMetricName = "sonic_hw_psu_operational_status" - hwPsuAvailableStatusMetricName = "sonic_hw_psu_available_status" - hwPsuTemperatureCelsiusMetricName = "sonic_hw_psu_temperature_celsius" - hwFanRpmMetricName = "sonic_hw_fan_rpm" - hwFanOperationalStatusMetricName = "sonic_hw_fan_operational_status" - hwFanAvailableStatusMetricName = "sonic_hw_fan_available_status" - hwChassisInfoMetricName = 
"sonic_hw_chassis_info" - hwScrapeDurationMetricName = "sonic_hw_scrape_duration_seconds" - hwCollectorSuccessMetricName = "sonic_hw_collector_success" + hwPsuInfoMetricName = "sonic_hw_psu_info" + hwPsuVoltageVoltsMetricName = "sonic_hw_psu_voltage_volts" + hwPsuCurrentAmperesMetricName = "sonic_hw_psu_current_amperes" + hwPsuPowerWattsMetricName = "sonic_hw_psu_power_watts" + hwPsuOperationalStatusMetricName = "sonic_hw_psu_operational_status" + hwPsuAvailableStatusMetricName = "sonic_hw_psu_available_status" + hwPsuTemperatureCelsiusMetricName = "sonic_hw_psu_temperature_celsius" + hwFanRpmMetricName = "sonic_hw_fan_rpm" + hwFanOperationalStatusMetricName = "sonic_hw_fan_operational_status" + hwFanAvailableStatusMetricName = "sonic_hw_fan_available_status" + hwChassisInfoMetricName = "sonic_hw_chassis_info" + hwScrapeDurationMetricName = "sonic_hw_scrape_duration_seconds" + hwCollectorSuccessMetricName = "sonic_hw_collector_success" ) func NewHwCollector(logger *slog.Logger, metricFilter MetricFilter) *hwCollector { @@ -61,14 +60,12 @@ func NewHwCollector(logger *slog.Logger, metricFilter MetricFilter) *hwCollector return &hwCollector{ hwPsuInfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "psu_info"), "Non-numeric data about PSU, value is always 1", []string{"slot", "serial", "model_name", "model"}, nil), - hwPsuInputVoltageVolts: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "psu_input_voltage_volts"), - "PSU input voltage", []string{"slot"}, nil), - hwPsuInputCurrentAmperes: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "psu_input_current_amperes"), - "PSU input current", []string{"slot"}, nil), - hwPsuOutputVoltageVolts: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "psu_output_voltage_volts"), - "PSU output voltage", []string{"slot"}, nil), - hwPsuOutputCurrentAmperes: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "psu_output_current_amperes"), - "PSU output 
current", []string{"slot"}, nil), + hwPsuVoltageVolts: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "psu_voltage_volts"), + "PSU voltage", []string{"slot"}, nil), + hwPsuCurrentAmperes: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "psu_current_amperes"), + "PSU current", []string{"slot"}, nil), + hwPsuPowerWatts: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "psu_power_watts"), + "PSU power", []string{"slot"}, nil), hwPsuOperationalStatus: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "psu_operational_status"), "PSU operational status: 0(DOWN), 1(UP)", []string{"slot"}, nil), hwPsuAvailableStatus: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "psu_available_status"), @@ -94,10 +91,9 @@ func NewHwCollector(logger *slog.Logger, metricFilter MetricFilter) *hwCollector func (collector *hwCollector) Describe(ch chan<- *prometheus.Desc) { ch <- collector.hwPsuInfo - ch <- collector.hwPsuInputVoltageVolts - ch <- collector.hwPsuInputCurrentAmperes - ch <- collector.hwPsuOutputVoltageVolts - ch <- collector.hwPsuOutputCurrentAmperes + ch <- collector.hwPsuVoltageVolts + ch <- collector.hwPsuCurrentAmperes + ch <- collector.hwPsuPowerWatts ch <- collector.hwPsuOperationalStatus ch <- collector.hwPsuAvailableStatus ch <- collector.hwPsuTemperatureCelsius @@ -232,38 +228,29 @@ func (collector *hwCollector) collectPsuInfo(ctx context.Context, redisClient re } // voltage, amperage and temperature metrics are appended only if values can be parsed - inVolts, err := parseFloat(data["input_voltage"]) + volts, err := parsePsuFloat(data["voltage"]) if err == nil { - if collector.metricFilter.Enabled(hwPsuInputVoltageVoltsMetricName) { + if collector.metricFilter.Enabled(hwPsuVoltageVoltsMetricName) { collector.cachedMetrics = append(collector.cachedMetrics, prometheus.MustNewConstMetric( - collector.hwPsuInputVoltageVolts, prometheus.GaugeValue, inVolts, psuId, + collector.hwPsuVoltageVolts, 
prometheus.GaugeValue, volts, psuId, )) } } - inAmperes, err := parseFloat(data["input_current"]) + amperes, err := parsePsuFloat(data["current"]) if err == nil { - if collector.metricFilter.Enabled(hwPsuInputCurrentAmperesMetricName) { + if collector.metricFilter.Enabled(hwPsuCurrentAmperesMetricName) { collector.cachedMetrics = append(collector.cachedMetrics, prometheus.MustNewConstMetric( - collector.hwPsuInputCurrentAmperes, prometheus.GaugeValue, inAmperes, psuId, + collector.hwPsuCurrentAmperes, prometheus.GaugeValue, amperes, psuId, )) } } - outVolts, err := parseFloat(data["output_voltage"]) + power, err := parsePsuFloat(data["power"]) if err == nil { - if collector.metricFilter.Enabled(hwPsuOutputVoltageVoltsMetricName) { + if collector.metricFilter.Enabled(hwPsuPowerWattsMetricName) { collector.cachedMetrics = append(collector.cachedMetrics, prometheus.MustNewConstMetric( - collector.hwPsuOutputVoltageVolts, prometheus.GaugeValue, outVolts, psuId, - )) - } - } - - outAmperes, err := parseFloat(data["output_current"]) - if err == nil { - if collector.metricFilter.Enabled(hwPsuOutputCurrentAmperesMetricName) { - collector.cachedMetrics = append(collector.cachedMetrics, prometheus.MustNewConstMetric( - collector.hwPsuOutputCurrentAmperes, prometheus.GaugeValue, outAmperes, psuId, + collector.hwPsuPowerWatts, prometheus.GaugeValue, power, psuId, )) } } @@ -281,6 +268,20 @@ func (collector *hwCollector) collectPsuInfo(ctx context.Context, redisClient re return nil } +func parsePsuFloat(value string) (float64, error) { + value = strings.TrimSpace(value) + if value == "" || strings.EqualFold(value, "N/A") { + return 0, fmt.Errorf("invalid PSU value: %q", value) + } + + parsedValue, err := strconv.ParseFloat(value, 64) + if err != nil { + return 0, fmt.Errorf("invalid PSU value: %w", err) + } + + return parsedValue, nil +} + func (collector *hwCollector) collectFanInfo(ctx context.Context, redisClient redis.Client) error { const fanKeyPattern string = 
"FAN_INFO|*" fanRegex := regexp.MustCompile(`(?i)FAN_INFO\|(PSU\d+|Fantray\d+)(\s|\-)(.+)`) diff --git a/scripts/validate-dashboard.sh b/scripts/validate-dashboard.sh new file mode 100755 index 0000000..57950b7 --- /dev/null +++ b/scripts/validate-dashboard.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/validate-dashboard.sh + +Validate a Grafana dashboard JSON file for lightweight CI checks. + +Checks performed: +- JSON parses with jq +- title is "SONiC Exporter" +- required template variables: datasource, job, instance +- datasource UIDs use variable references +- non-empty panels array +- no obvious private IPv4 or internal-domain target values +USAGE + exit 2 +} + +fail() { + printf 'error: %s\n' "$1" >&2 + exit 1 +} + +if [[ $# -ne 1 ]]; then + usage +fi + +dash_file=$1 + +if ! command -v jq >/dev/null 2>&1; then + fail "jq is required to validate dashboard JSON. Install jq and retry." +fi + +if [[ ! -f "$dash_file" ]]; then + fail "dashboard file not found: $dash_file" +fi + +if ! jq empty "$dash_file" >/dev/null 2>&1; then + fail "invalid JSON in dashboard file: $dash_file" +fi + +title=$(jq -r '.title // empty' "$dash_file") +if [[ "$title" != "SONiC Exporter" ]]; then + fail "unexpected dashboard title: '$title' (expected: 'SONiC Exporter')" +fi + +for required_var in datasource job instance; do + if ! jq -e --arg var "$required_var" '.templating.list // [] | any(.name == $var)' "$dash_file" >/dev/null; then + fail "missing required template variable: $required_var" + fi +done + +invalid_uids=$(jq -r '.. | objects | .datasource? | objects | .uid? // empty' "$dash_file") +if [[ -n "$invalid_uids" ]]; then + while IFS= read -r uid; do + [[ -z "$uid" ]] && continue + if [[ "$uid" != '$datasource' && "$uid" != '${datasource}' ]]; then + fail "non-portable datasource UID found: '$uid'" + fi + done <<< "$invalid_uids" +fi + +if ! 
jq -e '.panels | (type == "array" and length > 0)' "$dash_file" >/dev/null; then + fail "dashboard panels must be a non-empty array" +fi + +panel_strings=$(jq -r '.. | strings' "$dash_file") +private_ipv4_pattern='(^|[^0-9])(10\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|172\.(1[6-9]|2[0-9]|3[0-1])\.[0-9]{1,3}\.[0-9]{1,3}|192\.168\.[0-9]{1,3}\.[0-9]{1,3})([^0-9]|$)' +private_domain_pattern='(^|[^A-Za-z0-9_-])(localhost\b|[A-Za-z0-9_-]+\.(local|internal|corp|lan|example)\b)([^A-Za-z0-9_-]|$)' + +# Scan for both patterns in one grep pass so every hit stays on its own line; the newline in the report below is ANSI-C quoted outside the double quotes so it actually expands. +private_hits="$(printf '%s\n' "$panel_strings" | grep -E -e "$private_ipv4_pattern" -e "$private_domain_pattern" || true)" + +if [[ -n "$private_hits" ]]; then + fail "private target values detected:"$'\n'"$private_hits" +fi + +echo "Dashboard validation passed: $dash_file"