Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 63 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ flowchart LR

subgraph sonic-exporter
M[cmd/sonic-exporter/main.go]
COL[Collectors\ninterface, hw, crm, queue, lldp, vlan, lag, fdb\nsystem*, docker*, frr*]
COL[Collectors\ninterface, hw, crm, queue, lldp, vlan, lag, fdb\nswitch, thermal, transceiver\nrouting*, platform*, system*, docker*, frr*]
CACHE[(In-memory metric cache)]
NODE[node_exporter subset\nloadavg,cpu,diskstats,filesystem,meminfo,time,stat]
end
Expand Down Expand Up @@ -81,6 +81,11 @@ For a deeper breakdown, see `docs/architecture.md`.
| LLDP | LLDP neighbors from Redis | Enabled |
| VLAN | VLAN and VLAN member state | Enabled |
| LAG | PortChannel and member state | Enabled |
| Switch | Switch-level state from `SWITCH_TABLE` in `APPL_DB` | Enabled |
| Thermal | ASIC and SFP max temperatures from `STATE_DB` | Enabled |
| Transceiver | Transceiver status, flags, and thresholds from `STATE_DB` | Enabled |
| Routing | Route and neighbor summaries from `APPL_DB` | Disabled (`ROUTING_ENABLED=false`) |
| Platform Health | Process, storage, and system health metrics from `STATE_DB` | Disabled (`PLATFORM_HEALTH_ENABLED=false`) |
| FDB | FDB summary from ASIC DB | Disabled (`FDB_ENABLED=false`) |
| System (experimental) | Switch identity, software metadata, uptime | Disabled (`SYSTEM_ENABLED=false`) |
| Docker (experimental) | Container runtime metrics from `STATE_DB` | Disabled (`DOCKER_ENABLED=false`) |
Expand Down Expand Up @@ -140,7 +145,7 @@ curl localhost:9101/metrics

### Docker deployment for SONiC

Optional collectors stay opt in. Keep `SYSTEM_ENABLED=false`, `DOCKER_ENABLED=false`, and `FRR_ENABLED=false` unless you need them.
Optional collectors stay opt-in. Keep `ROUTING_ENABLED=false`, `PLATFORM_HEALTH_ENABLED=false`, `SYSTEM_ENABLED=false`, `DOCKER_ENABLED=false`, and `FRR_ENABLED=false` unless you need them.

#### Recommended online switch flow

Expand Down Expand Up @@ -191,6 +196,8 @@ REDIS_NETWORK=tcp
REDIS_PASSWORD=
SONIC_DISABLED_METRICS=
FDB_ENABLED=false
ROUTING_ENABLED=false
PLATFORM_HEALTH_ENABLED=false
SYSTEM_ENABLED=false
DOCKER_ENABLED=false
FRR_ENABLED=false
Expand All @@ -212,6 +219,8 @@ sudo docker run -d \
-e REDIS_NETWORK=tcp \
-e SONIC_DISABLED_METRICS= \
-e FDB_ENABLED=false \
-e ROUTING_ENABLED=false \
-e PLATFORM_HEALTH_ENABLED=false \
-e SYSTEM_ENABLED=false \
-e DOCKER_ENABLED=false \
-e FRR_ENABLED=false \
Expand Down Expand Up @@ -244,7 +253,7 @@ Restart=always
RestartSec=30
ExecStartPre=/bin/sh -c 'until /usr/bin/redis-cli -h 127.0.0.1 -p 6379 ping | /bin/grep -q PONG; do sleep 2; done'
ExecStartPre=-/usr/bin/docker rm -f sonic-exporter
ExecStart=/usr/bin/docker run --name sonic-exporter --label app=sonic-exporter --label managed-by=systemd --restart no --network host -e REDIS_ADDRESS=127.0.0.1:6379 -e REDIS_NETWORK=tcp -e REDIS_PASSWORD= -e SONIC_DISABLED_METRICS= -e FDB_ENABLED=false -e SYSTEM_ENABLED=false -e DOCKER_ENABLED=false -e FRR_ENABLED=false ghcr.io/rokernel/sonic-exporter:v0.1.1
ExecStart=/usr/bin/docker run --name sonic-exporter --label app=sonic-exporter --label managed-by=systemd --restart no --network host -e REDIS_ADDRESS=127.0.0.1:6379 -e REDIS_NETWORK=tcp -e REDIS_PASSWORD= -e SONIC_DISABLED_METRICS= -e FDB_ENABLED=false -e ROUTING_ENABLED=false -e PLATFORM_HEALTH_ENABLED=false -e SYSTEM_ENABLED=false -e DOCKER_ENABLED=false -e FRR_ENABLED=false ghcr.io/rokernel/sonic-exporter:v0.1.1
ExecStop=-/usr/bin/docker stop sonic-exporter
ExecStopPost=-/usr/bin/docker rm -f sonic-exporter

Expand Down Expand Up @@ -477,6 +486,52 @@ Be careful with broad patterns. A wide match can also hide health metrics such a
| `FDB_MAX_PORTS` | Max per-port FDB series exported | `1024` |
| `FDB_MAX_VLANS` | Max per-VLAN FDB series exported | `4096` |

### Switch collector

| Variable | Description | Default |
|---|---|---|
| `SWITCH_ENABLED` | Enable switch collector | `true` |
| `SWITCH_REFRESH_INTERVAL` | Cache refresh interval | `60s` |
| `SWITCH_TIMEOUT` | Timeout for one refresh cycle | `2s` |
| `SWITCH_MAX_ENTRIES` | Max switch table entries exported per refresh | `16` |

### Thermal collector

| Variable | Description | Default |
|---|---|---|
| `THERMAL_ENABLED` | Enable thermal collector | `true` |
| `THERMAL_REFRESH_INTERVAL` | Cache refresh interval | `60s` |
| `THERMAL_TIMEOUT` | Timeout for one refresh cycle | `2s` |

### Transceiver collector

| Variable | Description | Default |
|---|---|---|
| `TRANSCEIVER_ENABLED` | Enable transceiver collector | `true` |
| `TRANSCEIVER_REFRESH_INTERVAL` | Cache refresh interval | `60s` |
| `TRANSCEIVER_TIMEOUT` | Timeout for one refresh cycle | `2s` |
| `TRANSCEIVER_MAX_PORTS` | Max transceiver ports exported per refresh | `1024` |

### Routing collector

| Variable | Description | Default |
|---|---|---|
| `ROUTING_ENABLED` | Enable routing collector | `false` |
| `ROUTING_REFRESH_INTERVAL` | Cache refresh interval | `60s` |
| `ROUTING_TIMEOUT` | Timeout for one refresh cycle | `2s` |
| `ROUTING_MAX_NEIGHBORS` | Max neighbor entries exported per refresh | `50000` |
| `ROUTING_MAX_ROUTES` | Max route entries exported per refresh | `200000` |

### Platform health collector

| Variable | Description | Default |
|---|---|---|
| `PLATFORM_HEALTH_ENABLED` | Enable platform health collector | `false` |
| `PLATFORM_HEALTH_REFRESH_INTERVAL` | Cache refresh interval | `60s` |
| `PLATFORM_HEALTH_TIMEOUT` | Timeout for one refresh cycle | `2s` |
| `PLATFORM_HEALTH_MAX_PROCESSES` | Max process entries exported per refresh | `512` |
| `PLATFORM_HEALTH_MAX_STORAGE_DEVICES` | Max storage devices exported per refresh | `128` |

### System collector (experimental)

| Variable | Description | Default |
Expand Down Expand Up @@ -714,7 +769,12 @@ SONIC_DISABLED_METRICS=
LLDP_ENABLED=true
VLAN_ENABLED=true
LAG_ENABLED=true
SWITCH_ENABLED=true
THERMAL_ENABLED=true
TRANSCEIVER_ENABLED=true
FDB_ENABLED=false
ROUTING_ENABLED=false
PLATFORM_HEALTH_ENABLED=false
SYSTEM_ENABLED=false
DOCKER_ENABLED=false
FRR_ENABLED=false
Expand Down
20 changes: 20 additions & 0 deletions cmd/sonic-exporter/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ func main() {
vlanCollector := collector.NewVlanCollector(logger, metricFilter)
lagCollector := collector.NewLagCollector(logger, metricFilter)
fdbCollector := collector.NewFdbCollector(logger, metricFilter)
routingCollector := collector.NewRoutingCollector(logger, metricFilter)
switchCollector := collector.NewSwitchCollector(logger, metricFilter)
thermalCollector := collector.NewThermalCollector(logger, metricFilter)
transceiverCollector := collector.NewTransceiverCollector(logger, metricFilter)
platformHealthCollector := collector.NewPlatformHealthCollector(logger, metricFilter)
systemCollector := collector.NewSystemCollector(logger, metricFilter)
dockerCollector := collector.NewDockerCollector(logger, metricFilter)
frrCollector := collector.NewFrrCollector(logger)
Expand All @@ -80,6 +85,21 @@ func main() {
if fdbCollector.IsEnabled() {
prometheus.MustRegister(fdbCollector)
}
if routingCollector.IsEnabled() {
prometheus.MustRegister(routingCollector)
}
if switchCollector.IsEnabled() {
prometheus.MustRegister(switchCollector)
}
if thermalCollector.IsEnabled() {
prometheus.MustRegister(thermalCollector)
}
if transceiverCollector.IsEnabled() {
prometheus.MustRegister(transceiverCollector)
}
if platformHealthCollector.IsEnabled() {
prometheus.MustRegister(platformHealthCollector)
}
if systemCollector.IsEnabled() {
prometheus.MustRegister(systemCollector)
}
Expand Down
26 changes: 26 additions & 0 deletions fixtures/test/appl_db_data.json
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,32 @@
},
"LAG_MEMBER_TABLE:PortChannel2:Ethernet92": {
"status": "enabled"
},
"NEIGH_TABLE:eth0:192.0.2.1": {
"neigh": "00:11:22:33:44:55",
"family": "IPv4"
},
"NEIGH_TABLE:Ethernet0:2001:db8::1": {
"neigh": "00:11:22:33:44:66",
"family": "IPv6"
},
"ROUTE_TABLE:192.0.2.0/24": {
"protocol": "kernel",
"nexthop": "0.0.0.0",
"ifname": "eth0"
},
"ROUTE_TABLE:2001:db8::/64": {
"protocol": "static",
"nexthop": "::",
"ifname": "Ethernet0"
},
"SWITCH_TABLE:switch": {
"ecmp_hash_offset": "0",
"ecmp_hash_seed": "10",
"fdb_aging_time": "600",
"lag_hash_offset": "1",
"lag_hash_seed": "20",
"ordered_ecmp": "true"
}
}
}
119 changes: 119 additions & 0 deletions fixtures/test/state_db_data.json
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,125 @@
"model": "Model-X",
"revision": "A01"
},
"ASIC_TEMPERATURE_INFO": {
"temperature_0": "45",
"temperature_1": "47",
"maximum_temperature": "50",
"average_temperature": "46"
},
"TEMPERATURE_SFP_MAX": {
"maximum_temperature": "31",
"timestamp": "20260605 10:46:20"
},
"TRANSCEIVER_STATUS|Ethernet0": {
"module_state": "ModuleReady",
"module_fault_cause": "No Fault detected",
"tx1OutputStatus": "True",
"tx2OutputStatus": "False",
"rx1OutputStatusHostlane": "True",
"rx2OutputStatusHostlane": "True",
"tx1disable": "False",
"tx2disable": "True",
"last_update_time": "Fri Jun 05 10:44:56 2026"
},
"TRANSCEIVER_STATUS_FLAG|Ethernet0": {
"tx1fault": "False",
"tx2fault": "True",
"rx1los": "False",
"rx2los": "True",
"last_update_time": "Fri Jun 05 10:44:56 2026"
},
"TRANSCEIVER_STATUS_FLAG_CHANGE_COUNT|Ethernet0": {
"tx1fault": "0",
"tx2fault": "1",
"rx1los": "0",
"rx2los": "2"
},
"TRANSCEIVER_STATUS_FLAG_CLEAR_TIME|Ethernet0": {
"tx1fault": "never",
"tx2fault": "Fri Jun 05 10:44:00 2026",
"rx1los": "never",
"rx2los": "Fri Jun 05 10:45:00 2026"
},
"TRANSCEIVER_STATUS_FLAG_SET_TIME|Ethernet0": {
"tx1fault": "never",
"tx2fault": "Fri Jun 05 10:40:00 2026",
"rx1los": "never",
"rx2los": "Fri Jun 05 10:41:00 2026"
},
"TRANSCEIVER_DOM_FLAG|Ethernet0": {
"tempHAlarm": "False",
"tempLAlarm": "False",
"tx1powerHAlarm": "True",
"tx1powerLWarn": "False",
"last_update_time": "Fri Jun 05 10:44:56 2026"
},
"TRANSCEIVER_DOM_FLAG_CHANGE_COUNT|Ethernet0": {
"tempHAlarm": "0",
"tempLAlarm": "0",
"tx1powerHAlarm": "3",
"tx1powerLWarn": "1"
},
"TRANSCEIVER_DOM_FLAG_CLEAR_TIME|Ethernet0": {
"tempHAlarm": "never",
"tempLAlarm": "never",
"tx1powerHAlarm": "Fri Jun 05 10:43:00 2026",
"tx1powerLWarn": "Fri Jun 05 10:42:00 2026"
},
"TRANSCEIVER_DOM_FLAG_SET_TIME|Ethernet0": {
"tempHAlarm": "never",
"tempLAlarm": "never",
"tx1powerHAlarm": "Fri Jun 05 10:41:00 2026",
"tx1powerLWarn": "Fri Jun 05 10:40:00 2026"
},
"TRANSCEIVER_DOM_THRESHOLD|Ethernet0": {
"temphighalarm": "80.0",
"templowalarm": "-5.0",
"txpowerhighalarm": "6.5",
"txpowerlowwarning": "-4.3",
"last_update_time": "Fri Jun 05 10:44:56 2026"
},
"PROCESS_STATS|1": {
"CMD": "/sbin/init",
"CPU": "0.0",
"MEM": "0.1",
"PPID": "0",
"STIME": "May29",
"TIME": "0:07:49",
"TT": "None",
"UID": "0"
},
"PROCESS_STATS|42": {
"CMD": "/usr/bin/orchagent",
"CPU": "1.5",
"MEM": "2.5",
"PPID": "1",
"STIME": "May29",
"TIME": "0:00:30",
"TT": "None",
"UID": "1000"
},
"PROCESS_STATS|LastUpdateTime": {
"lastupdate": "2026-06-05 10:26:12"
},
"STORAGE_INFO|sda": {
"device_model": "M.2 (S80) 3ME4",
"serial": "TESTSERIAL001",
"firmware": "L20420",
"health": "100",
"temperature": "32",
"latest_fsio_reads": "18564",
"latest_fsio_writes": "894008",
"disk_io_reads": "9704",
"disk_io_writes": "17085",
"reserved_blocks": "133",
"last_sync_time": "2026-06-05 10:26:12",
"total_fsio_reads": "29562",
"total_fsio_writes": "1821638"
},
"SYSTEM_HEALTH_INFO": {
"summary": "OK"
},
"DOCKER_STATS|0001": {
"NAME": "swss",
"CPU%": "1.5",
Expand Down
Loading