# Prometheus metrics exposed by this application.

http_requests_total = Counter(
    'http_requests_total',
    'Total HTTP requests',
    ['method', 'endpoint', 'status']
)

http_request_duration_seconds = Histogram(
    'http_request_duration_seconds',
    'HTTP request duration',
    ['method', 'endpoint']
)

http_requests_in_progress = Gauge(
    'http_requests_in_progress',
    'HTTP requests currently being processed'
)

# Application-specific metrics

uptime_seconds = Gauge(
    'app_uptime_seconds',
    'Application uptime in seconds'
)

endpoint_response_size_bytes = Histogram(
    'endpoint_response_size_bytes',
    'Response payload size in bytes',
    ['endpoint']
)


def _response_size(response):
    """Return the payload size of *response* in bytes, or ``None`` if unknown.

    A ``body`` attribute is preferred when the response object has one.
    Otherwise the ``Content-Length`` header is used as a fallback.
    """
    if response is None:
        return None

    body = getattr(response, 'body', None)
    if body is not None:
        return len(body)

    # NOTE(review): the object returned by ``call_next`` presumably is a
    # streaming wrapper without ``body`` -- confirm against the Starlette
    # version in use. The header fallback keeps the histogram populated
    # in that case.
    headers = getattr(response, 'headers', None)
    if headers is None:
        return None
    content_length = headers.get('content-length')
    if content_length is not None and content_length.isdigit():
        return int(content_length)
    return None


@app.middleware("http")
async def dispatch(request, call_next):
    """Record Prometheus metrics for every request except ``/metrics``.

    For each instrumented request this updates:

    * ``http_requests_in_progress`` -- incremented on entry and decremented
      exactly once on exit, whether the request succeeded or raised;
    * ``http_requests_total`` -- labelled with method, path and status;
    * ``http_request_duration_seconds`` -- handler duration in seconds;
    * ``endpoint_response_size_bytes`` -- only when a size can be determined.

    Args:
        request: The incoming request object.
        call_next: Awaitable that runs the rest of the application and
            returns the response.

    Returns:
        The response produced by ``call_next``, unmodified.

    Raises:
        Whatever ``call_next`` raises; the request is then counted with
        status ``500`` before the exception propagates.
    """
    # Scrapes of the metrics endpoint itself are passed through untouched.
    if request.url.path == "/metrics":
        return await call_next(request)

    method = request.method
    # NOTE(review): the raw URL path is used as a label value, so every
    # distinct path requested creates its own time series. If routes carry
    # IDs, consider labelling by the route template instead.
    endpoint = request.url.path

    http_requests_in_progress.inc()
    # Monotonic clock: durations are not distorted by wall-clock adjustments.
    started = time.perf_counter()
    status_code = 500  # reported when call_next raises before a response exists
    response = None
    try:
        response = await call_next(request)
        status_code = response.status_code
        return response
    finally:
        # Single exit path for success and failure. The gauge must be
        # decremented here only: an extra dec() in an except branch would
        # run in addition to this one and drive the gauge negative.
        duration = time.perf_counter() - started

        http_requests_total.labels(
            method=method,
            endpoint=endpoint,
            status=status_code
        ).inc()

        http_request_duration_seconds.labels(
            method=method,
            endpoint=endpoint
        ).observe(duration)

        response_size = _response_size(response)
        if response_size is not None:
            endpoint_response_size_bytes.labels(
                endpoint=endpoint).observe(response_size)

        http_requests_in_progress.dec()


@app.get('/metrics')
def metrics():
    """Return the current metric values in the Prometheus exposition format."""
    return Response(
        generate_latest(),
        media_type=CONTENT_TYPE_LATEST
    )
http://localhost:9090/-/healthy || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + grafana: image: grafana/grafana:12.3.1 container_name: grafana @@ -87,6 +113,7 @@ services: volumes: loki-data: grafana-data: + prometheus-data: networks: logging: \ No newline at end of file diff --git a/monitoring/docs/LAB08.md b/monitoring/docs/LAB08.md new file mode 100644 index 0000000000..9be2ae7c79 --- /dev/null +++ b/monitoring/docs/LAB08.md @@ -0,0 +1,239 @@ +# Prometheus & Grafana Monitoring Lab Report + +## Architecture + +The application writes its data to global variables (prometheus-client package) and exposes a '/metrics' endpoint which returns the current values. + +Prometheus "asks" each known application for its metrics at a fixed interval (just a GET request) and saves them to its database. + +Grafana just makes requests to Prometheus and visualizes the data. + +Application <--- Prometheus <--- Grafana + +## Application Instrumentation + +| Metric | Type | Labels | Purpose | +| -- | -- | -- | - | +| `http_requests_total` | Counter | method, endpoint, status | Track API usage and error rates | +| `http_request_duration_seconds` | Histogram | method, endpoint | Monitor latency and performance | +| `http_requests_in_progress` | Gauge | none | Detect concurrency issues | +| `app_uptime_seconds` | Gauge | none | Detect unexpected restarts | +| `endpoint_response_size_bytes` | Histogram | endpoint | Monitor payload sizes and bandwidth | + +## Prometheus Configuration + +### prometheus.yml + +```yaml +global: + scrape_interval: 15s + evaluation_interval: 15s + +storage: + tsdb: + retention_time: 15d + retention_size: 10GB + +scrape_configs: + - job_name: "app" + static_configs: + - targets: ["app-python:5000"] + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + - job_name: 'loki' + static_configs: + - targets: ['loki:3100'] + - job_name: 'grafana' + static_configs: + - targets: ['grafana:3000'] +``` + +## Dashboard Walkthrough + +# Grafana Dashboard Visualizations 
Explained + +![alt text](image-6.png) + +## 1. Request Rate (Graph) + +### Query + +```promql +sum(rate(http_requests_total[5m])) by (endpoint) +``` + +### What It Shows + +A line graph displaying requests per second for each endpoint over time. Each endpoint gets its own colored line. + +## 2. Error Rate (Graph) + +### Query + +```promql +sum(rate(http_requests_total{status=~"5.."}[5m])) +``` + +### What It Shows + +How many 5xx errors (server errors) are occurring per second. This is a critical health indicator. + +## 3. Request Duration p95 (Graph) + +### Query + +```promql +histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) +``` + +### What It Shows + +**The 95th percentile latency**: 95% of requests complete within this time; 5% are slower. + +## 4. Request Duration Heatmap (Heatmap) + +### Query + +```promql +rate(http_request_duration_seconds_bucket[5m]) +``` + +### What It Shows + +**A 2D visualization showing the distribution of request latencies over time**. +## 5. Active Requests (Gauge/Graph) + +### Query + +```promql +http_requests_in_progress +``` + +### What It Shows + +**Real-time count of requests currently being processed by the application**. + +## 6. Status Code Distribution (Pie Chart) + +### Query + +```promql +sum by (status) (rate(http_requests_total[5m])) +``` + +### What It Shows + +**Breakdown of request outcomes: successful (2xx), client errors (4xx), and server errors (5xx)**. + +## 7. Uptime (Stat) + +### Query + +```promql +up{job="app"} +``` + +### What It Shows + +**Whether the service is currently up (1) or down (0)**. + +## PromQL Examples + +### Query 1: Total Requests by Status Code + +```promql +sum by (status) (http_requests_total) +``` + +**Explanation**: Groups all requests by HTTP status code (200, 404, 500, etc.) and sums them. Shows distribution of successful vs. failed requests. + +**Use Case**: Quick overview of application health. 
+ +### Query 2: Request Rate by Endpoint + +```promql +sum by (endpoint) (rate(http_requests_total[5m])) +``` + +**Explanation**: Calculates the rate of requests per endpoint over the last 5 minutes. Shows which endpoints are most heavily used. + +**Use Case**: Identify traffic hotspots and optimize accordingly. + +### Query 3: Average Latency Over Time + +```promql +avg(rate(http_request_duration_seconds_sum[5m]) / rate(http_request_duration_seconds_count[5m])) +``` + +**Explanation**: Divides the sum of durations by the count of requests to get average latency. Smoothed over 5 minutes. + +**Use Case**: Track performance trends and detect degradation. + +### Query 4: High Error Rate Alert + +```promql +(sum(rate(http_requests_total{status=~"5.."}[1m])) / sum(rate(http_requests_total[1m]))) > 0.05 +``` + +**Explanation**: Fires when error rate exceeds 5% over 1 minute. Detects sudden application failures. + +**Use Case**: Trigger alerts to on-call engineers. + +### Query 5: Requests Taking Longer Than 2 Seconds + +```promql +histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2 +``` + +**Explanation**: Shows when the 99th percentile latency exceeds 2 seconds. Indicates significant performance issues. + +**Use Case**: SLA monitoring and performance budgets. + +### Query 6: Application Restart Detection + +```promql +resets(app_uptime_seconds[5m]) > 0 +``` + +**Explanation**: `resets()` counts every decrease of the uptime value inside the window; a restart drops uptime back towards zero, so a result above 0 means the application restarted. (`increase()` compensates for such drops and is never negative, so `increase(...) < 0` would never fire.) + +**Use Case**: Alert on unexpected restarts. + +## Production Setup + +### Health Checks + +## Prometheus health check + +Every 10 seconds with a timeout of 5 seconds. Maximum 5 retries. + +## App health check + +Every 10 seconds with a timeout of 5 seconds. Maximum 5 retries. 
+ +### Resource Requirements + +Prometheus: 1G memory, 1 CPU +Loki: 1G memory, 1 CPU +Grafana: 512M memory, 0.5 CPU +Apps: 256M memory, 0.5 CPU + +### Prometheus retention + +time: 15 days +size: 10Gb + +## Testing Results + +![alt text](image-3.png) + +![alt text](image-4.png) + +![alt text](image-5.png) + +### After restart + +![alt text](image-8.png) + +![alt text](image-7.png) \ No newline at end of file diff --git a/monitoring/docs/dashboard.json b/monitoring/docs/dashboard.json new file mode 100644 index 0000000000..28b315f3cc --- /dev/null +++ b/monitoring/docs/dashboard.json @@ -0,0 +1,588 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "dfgf10avx6ry8c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 
8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "dfgf10avx6ry8c" + }, + "editorMode": "code", + "expr": "sum(rate(http_requests_total[5m])) by (endpoint)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "dfgf10avx6ry8c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.95, 
rate(http_request_duration_seconds_bucket[5m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Request Duration p95 ", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "dfgf10avx6ry8c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Error rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "dfgf10avx6ry8c" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "calculate": true, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(http_request_duration_seconds_bucket[5m])", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Request Duration Heatmap", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "dfgf10avx6ry8c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "editorMode": "code", + "expr": "http_requests_in_progress", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Active Requests", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "dfgf10avx6ry8c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + 
"tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "sort": "desc", + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (status) (rate(http_requests_total[5m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Status Code Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "dfgf10avx6ry8c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 16 + }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "editorMode": "code", + "expr": "up{job=\"app\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat" + } + ], + "preload": false, + "schemaVersion": 42, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Request Rate", + "uid": "adlmrv7", + "version": 9 +} \ No newline at end of file diff --git 
a/monitoring/docs/image-3.png b/monitoring/docs/image-3.png new file mode 100644 index 0000000000..86567c82f1 Binary files /dev/null and b/monitoring/docs/image-3.png differ diff --git a/monitoring/docs/image-4.png b/monitoring/docs/image-4.png new file mode 100644 index 0000000000..7869cb6fb7 Binary files /dev/null and b/monitoring/docs/image-4.png differ diff --git a/monitoring/docs/image-5.png b/monitoring/docs/image-5.png new file mode 100644 index 0000000000..31f8934896 Binary files /dev/null and b/monitoring/docs/image-5.png differ diff --git a/monitoring/docs/image-6.png b/monitoring/docs/image-6.png new file mode 100644 index 0000000000..ad93245b6b Binary files /dev/null and b/monitoring/docs/image-6.png differ diff --git a/monitoring/docs/image-7.png b/monitoring/docs/image-7.png new file mode 100644 index 0000000000..1a5ac79a53 Binary files /dev/null and b/monitoring/docs/image-7.png differ diff --git a/monitoring/docs/image-8.png b/monitoring/docs/image-8.png new file mode 100644 index 0000000000..c22d162107 Binary files /dev/null and b/monitoring/docs/image-8.png differ diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000000..4968017e08 --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,22 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +storage: + tsdb: + retention_time: 15d + retention_size: 10GB + +scrape_configs: + - job_name: "app" + static_configs: + - targets: ["app-python:5000"] + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + - job_name: 'loki' + static_configs: + - targets: ['loki:3100'] + - job_name: 'grafana' + static_configs: + - targets: ['grafana:3000'] \ No newline at end of file