Skip to content

Commit cf724cd

Browse files
committed
add Prometheus metrics and Grafana dashboards for application monitoring
1 parent 85f5ffe commit cf724cd

20 files changed

Lines changed: 1066 additions & 17 deletions

File tree

ansible/playbooks/deploy-monitoring.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
---
2-
- name: Deploy monitoring stack (Loki + Promtail + Grafana)
2+
- name: Deploy monitoring stack (Loki + Promtail + Prometheus + Grafana)
33
hosts: webservers
44
become: true
55
roles:

ansible/roles/monitoring/defaults/main.yml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,33 @@ monitoring_dir: /opt/monitoring
44
loki_version: "3.0.0"
55
promtail_version: "3.0.0"
66
grafana_version: "12.3.1"
7+
prometheus_version: "3.9.0"
78

89
loki_port: 3100
910
grafana_port: 3000
1011
promtail_port: 9080
12+
prometheus_port: 9090
1113

1214
loki_retention_period: "168h"
1315
loki_schema_version: "v13"
1416

17+
prometheus_retention_days: 15
18+
prometheus_retention_size: "10GB"
19+
prometheus_scrape_interval: "15s"
20+
21+
prometheus_targets:
22+
- job: "prometheus"
23+
targets: ["localhost:9090"]
24+
- job: "app"
25+
targets: ["app-python:5000"]
26+
path: "/metrics"
27+
- job: "loki"
28+
targets: ["loki:3100"]
29+
path: "/metrics"
30+
- job: "grafana"
31+
targets: ["grafana:3000"]
32+
path: "/metrics"
33+
1534
# NOTE(review): plaintext default Grafana admin credential committed in the
# role defaults; override it from an Ansible Vault-encrypted variable (or
# another secret store) before deploying, and rotate this value.
grafana_admin_password: "securepassword123"
1635

1736
resource_limits:
@@ -26,7 +45,17 @@ resource_limits:
2645
cpus_reservation: "0.25"
2746
memory_reservation: "256M"
2847
grafana:
48+
cpus: "0.5"
49+
memory: "512M"
50+
cpus_reservation: "0.25"
51+
memory_reservation: "256M"
52+
prometheus:
2953
cpus: "1.0"
3054
memory: "1G"
3155
cpus_reservation: "0.5"
3256
memory_reservation: "512M"
57+
app:
58+
cpus: "0.5"
59+
memory: "256M"
60+
cpus_reservation: "0.25"
61+
memory_reservation: "128M"
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
{
2+
"annotations": {
3+
"list": []
4+
},
5+
"editable": true,
6+
"fiscalYearStartMonth": 0,
7+
"graphTooltip": 0,
8+
"links": [],
9+
"panels": [
10+
{
11+
"title": "Logs Table",
12+
"type": "logs",
13+
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 0 },
14+
"id": 1,
15+
"datasource": { "type": "loki", "uid": "loki" },
16+
"targets": [
17+
{
18+
"datasource": { "type": "loki", "uid": "loki" },
19+
"expr": "{container=~\".+\"}",
20+
"refId": "A"
21+
}
22+
],
23+
"options": {
24+
"showTime": true,
25+
"showLabels": true,
26+
"showCommonLabels": false,
27+
"wrapLogMessage": true,
28+
"prettifyLogMessage": false,
29+
"enableLogDetails": true,
30+
"sortOrder": "Descending",
31+
"dedupStrategy": "none"
32+
}
33+
},
34+
{
35+
"title": "Request Rate",
36+
"type": "timeseries",
37+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 },
38+
"id": 2,
39+
"datasource": { "type": "loki", "uid": "loki" },
40+
"targets": [
41+
{
42+
"datasource": { "type": "loki", "uid": "loki" },
43+
"expr": "sum by (container) (rate({container=~\".+\"} [1m]))",
44+
"refId": "A"
45+
}
46+
],
47+
"fieldConfig": {
48+
"defaults": {
49+
"color": { "mode": "palette-classic" },
50+
"custom": {
51+
"drawStyle": "line",
52+
"lineInterpolation": "smooth",
53+
"fillOpacity": 20,
54+
"pointSize": 5,
55+
"showPoints": "auto",
56+
"lineWidth": 2
57+
},
58+
"unit": "reqps"
59+
},
60+
"overrides": []
61+
},
62+
"options": {
63+
"legend": { "displayMode": "list", "placement": "bottom" },
64+
"tooltip": { "mode": "multi" }
65+
}
66+
},
67+
{
68+
"title": "Error Logs",
69+
"type": "logs",
70+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 },
71+
"id": 3,
72+
"datasource": { "type": "loki", "uid": "loki" },
73+
"targets": [
74+
{
75+
"datasource": { "type": "loki", "uid": "loki" },
76+
"expr": "{container=~\".+\"} | json | level=\"ERROR\" or level=\"error\"",
77+
"refId": "A"
78+
}
79+
],
80+
"options": {
81+
"showTime": true,
82+
"showLabels": true,
83+
"showCommonLabels": false,
84+
"wrapLogMessage": true,
85+
"prettifyLogMessage": false,
86+
"enableLogDetails": true,
87+
"sortOrder": "Descending",
88+
"dedupStrategy": "none"
89+
}
90+
},
91+
{
92+
"title": "Log Level Distribution",
93+
"type": "piechart",
94+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
95+
"id": 4,
96+
"datasource": { "type": "loki", "uid": "loki" },
97+
"targets": [
98+
{
99+
"datasource": { "type": "loki", "uid": "loki" },
100+
"expr": "sum by (level) (count_over_time({container=~\".+\"} | json [5m]))",
101+
"refId": "A"
102+
}
103+
],
104+
"fieldConfig": {
105+
"defaults": {
106+
"color": { "mode": "palette-classic" }
107+
},
108+
"overrides": []
109+
},
110+
"options": {
111+
"legend": { "displayMode": "list", "placement": "right" },
112+
"tooltip": { "mode": "single" },
113+
"pieType": "pie",
114+
"reduceOptions": {
115+
"calcs": ["lastNotNull"],
116+
"fields": "",
117+
"values": false
118+
}
119+
}
120+
}
121+
],
122+
"schemaVersion": 39,
123+
"tags": ["loki", "logs"],
124+
"templating": { "list": [] },
125+
"time": { "from": "now-1h", "to": "now" },
126+
"timepicker": {},
127+
"timezone": "browser",
128+
"title": "App Logs",
129+
"uid": "app-logs",
130+
"version": 1
131+
}
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
{
2+
"annotations": {
3+
"list": []
4+
},
5+
"editable": true,
6+
"fiscalYearStartMonth": 0,
7+
"graphTooltip": 1,
8+
"links": [],
9+
"panels": [
10+
{
11+
"title": "Request Rate by Endpoint",
12+
"type": "timeseries",
13+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
14+
"datasource": { "type": "prometheus", "uid": "prometheus" },
15+
"targets": [
16+
{
17+
"expr": "sum(rate(http_requests_total[5m])) by (endpoint)",
18+
"legendFormat": "{{endpoint}}",
19+
"refId": "A"
20+
}
21+
],
22+
"fieldConfig": {
23+
"defaults": {
24+
"unit": "reqps",
25+
"custom": {
26+
"drawStyle": "line",
27+
"lineWidth": 2,
28+
"fillOpacity": 20,
29+
"pointSize": 5,
30+
"showPoints": "auto"
31+
}
32+
},
33+
"overrides": []
34+
}
35+
},
36+
{
37+
"title": "Error Rate (5xx)",
38+
"type": "timeseries",
39+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
40+
"datasource": { "type": "prometheus", "uid": "prometheus" },
41+
"targets": [
42+
{
43+
"expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m]))",
44+
"legendFormat": "5xx errors/s",
45+
"refId": "A"
46+
}
47+
],
48+
"fieldConfig": {
49+
"defaults": {
50+
"unit": "reqps",
51+
"custom": {
52+
"drawStyle": "line",
53+
"lineWidth": 2,
54+
"fillOpacity": 20
55+
},
56+
"color": { "mode": "fixed", "fixedColor": "red" }
57+
},
58+
"overrides": []
59+
}
60+
},
61+
{
62+
"title": "Request Duration p95",
63+
"type": "timeseries",
64+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
65+
"datasource": { "type": "prometheus", "uid": "prometheus" },
66+
"targets": [
67+
{
68+
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, endpoint))",
69+
"legendFormat": "p95 {{endpoint}}",
70+
"refId": "A"
71+
}
72+
],
73+
"fieldConfig": {
74+
"defaults": {
75+
"unit": "s",
76+
"custom": {
77+
"drawStyle": "line",
78+
"lineWidth": 2,
79+
"fillOpacity": 10
80+
}
81+
},
82+
"overrides": []
83+
}
84+
},
85+
{
86+
"title": "Request Duration Heatmap",
87+
"type": "heatmap",
88+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
89+
"datasource": { "type": "prometheus", "uid": "prometheus" },
90+
"targets": [
91+
{
92+
"expr": "sum(increase(http_request_duration_seconds_bucket[5m])) by (le)",
93+
"legendFormat": "{{le}}",
94+
"refId": "A",
95+
"format": "heatmap"
96+
}
97+
],
98+
"options": {
99+
"calculate": false,
100+
"yAxis": {
101+
"unit": "s"
102+
},
103+
"color": {
104+
"scheme": "Oranges"
105+
}
106+
}
107+
},
108+
{
109+
"title": "Active Requests",
110+
"type": "timeseries",
111+
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 16 },
112+
"datasource": { "type": "prometheus", "uid": "prometheus" },
113+
"targets": [
114+
{
115+
"expr": "http_requests_in_progress",
116+
"legendFormat": "in-progress",
117+
"refId": "A"
118+
}
119+
],
120+
"fieldConfig": {
121+
"defaults": {
122+
"unit": "short",
123+
"custom": {
124+
"drawStyle": "line",
125+
"lineWidth": 2,
126+
"fillOpacity": 30
127+
},
128+
"color": { "mode": "fixed", "fixedColor": "blue" }
129+
},
130+
"overrides": []
131+
}
132+
},
133+
{
134+
"title": "Status Code Distribution",
135+
"type": "piechart",
136+
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 16 },
137+
"datasource": { "type": "prometheus", "uid": "prometheus" },
138+
"targets": [
139+
{
140+
"expr": "sum by (status) (rate(http_requests_total[5m]))",
141+
"legendFormat": "{{status}}",
142+
"refId": "A"
143+
}
144+
],
145+
"options": {
146+
"legend": {
147+
"displayMode": "table",
148+
"placement": "right"
149+
},
150+
"pieType": "pie"
151+
}
152+
},
153+
{
154+
"title": "Service Uptime",
155+
"type": "stat",
156+
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 16 },
157+
"datasource": { "type": "prometheus", "uid": "prometheus" },
158+
"targets": [
159+
{
160+
"expr": "up{job=\"app\"}",
161+
"legendFormat": "app-python",
162+
"refId": "A"
163+
}
164+
],
165+
"fieldConfig": {
166+
"defaults": {
167+
"mappings": [
168+
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" }, "1": { "text": "UP", "color": "green" } } }
169+
],
170+
"thresholds": {
171+
"mode": "absolute",
172+
"steps": [
173+
{ "color": "red", "value": null },
174+
{ "color": "green", "value": 1 }
175+
]
176+
}
177+
},
178+
"overrides": []
179+
}
180+
}
181+
],
182+
"schemaVersion": 39,
183+
"tags": ["app", "metrics", "RED"],
184+
"templating": { "list": [] },
185+
"time": { "from": "now-30m", "to": "now" },
186+
"timepicker": {},
187+
"timezone": "browser",
188+
"title": "Application Metrics",
189+
"uid": "app-metrics",
190+
"version": 1
191+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
apiVersion: 1
2+
3+
providers:
4+
- name: default
5+
orgId: 1
6+
folder: ""
7+
type: file
8+
disableDeletion: false
9+
editable: true
10+
options:
11+
path: /var/lib/grafana/dashboards
12+
foldersFromFilesStructure: false

0 commit comments

Comments
 (0)