diff --git a/ansible/roles/monitoring/defaults/main.yml b/ansible/roles/monitoring/defaults/main.yml index ec9d369526..8a5929c76a 100644 --- a/ansible/roles/monitoring/defaults/main.yml +++ b/ansible/roles/monitoring/defaults/main.yml @@ -1,12 +1,47 @@ --- +# Project paths and naming monitoring_project_dir: /opt/monitoring -monitoring_loki_image: grafana/loki:3.0.0 -monitoring_promtail_image: grafana/promtail:3.0.0 -monitoring_grafana_image: grafana/grafana:12.3.1 +monitoring_compose_project_name: devops-monitoring + +# Images and versions +loki_version: "3.0.0" +promtail_version: "3.0.0" +grafana_version: "12.3.1" +prometheus_version: "3.9.0" +monitoring_loki_image: "grafana/loki:{{ loki_version }}" +monitoring_promtail_image: "grafana/promtail:{{ promtail_version }}" +monitoring_grafana_image: "grafana/grafana:{{ grafana_version }}" + +# App image (reuse web app naming conventions when available) +monitoring_app_image: "{{ web_app_docker_image | default((dockerhub_username | default('your_dockerhub_username')) ~ '/devops-info-service') }}" +monitoring_app_tag: "{{ web_app_docker_tag | default(docker_tag | default('latest')) }}" + +# Ports monitoring_loki_port: 3100 monitoring_promtail_port: 9080 monitoring_grafana_port: 3000 +prometheus_port: 9090 +monitoring_app_port: 8000 +monitoring_app_internal_port: 8000 + +# Retention and scrape behavior monitoring_retention_period: 168h +prometheus_retention_days: 15 +prometheus_retention_size: "10GB" +prometheus_scrape_interval: "15s" + +# Grafana auth monitoring_grafana_admin_user: admin monitoring_grafana_admin_password: change-me-in-vault -monitoring_compose_project_name: devops-monitoring + +# Scrape targets for prometheus.yml.j2 template +prometheus_targets: + - job: "prometheus" + targets: ["localhost:9090"] + - job: "loki" + targets: ["loki:3100"] + - job: "grafana" + targets: ["grafana:3000"] + - job: "app" + targets: ["app-python:8000"] + path: "/metrics" diff --git 
a/ansible/roles/monitoring/files/grafana-app-dashboard.json b/ansible/roles/monitoring/files/grafana-app-dashboard.json new file mode 100644 index 0000000000..2f4ceb99c8 --- /dev/null +++ b/ansible/roles/monitoring/files/grafana-app-dashboard.json @@ -0,0 +1,436 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (endpoint) (rate(http_requests_total[5m]))", + "legendFormat": "{{endpoint}}", + "range": true, + "refId": "A" + } + ], + "title": "Request Rate by Endpoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m]))", + "legendFormat": 
"5xx", + "range": true, + "refId": "A" + } + ], + "title": "Error Rate (5xx)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))", + "legendFormat": "p95", + "range": true, + "refId": "A" + } + ], + "title": "Request Duration p95", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "mode": "scheme" + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "short" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (le) (rate(http_request_duration_seconds_bucket[5m]))", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Request Duration Heatmap", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "min": 0, + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "red", + "value": 20 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(http_requests_in_progress)", + "refId": "A" + } + ], + "title": "Active Requests", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 6, + "options": { + "displayLabels": [ + "name", + "percent" + ], + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (status_code) (rate(http_requests_total[5m]))", + "legendFormat": "{{status_code}}", + "refId": "A" + } + ], + "title": "Status Code Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 0, + "text": "DOWN" + }, + "1": { + "color": "green", + "index": 1, + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, 
+ "x": 16, + "y": 16 + }, + "id": 7, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "up{job=\"app\"}", + "refId": "A" + } + ], + "title": "App Uptime", + "type": "stat" + } + ], + "refresh": "10s", + "schemaVersion": 40, + "style": "dark", + "tags": [ + "lab8", + "prometheus", + "red" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Lab 8 - Application Metrics", + "uid": "lab8-app-metrics", + "version": 1, + "weekStart": "" +} diff --git a/ansible/roles/monitoring/files/grafana-logs-dashboard.json b/ansible/roles/monitoring/files/grafana-logs-dashboard.json new file mode 100644 index 0000000000..1411712d22 --- /dev/null +++ b/ansible/roles/monitoring/files/grafana-logs-dashboard.json @@ -0,0 +1,78 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 16, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "{job=\"docker\"}", 
+ "queryType": "range", + "refId": "A" + } + ], + "title": "Container Logs", + "type": "logs" + } + ], + "refresh": "10s", + "schemaVersion": 40, + "tags": [ + "lab7", + "loki" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "title": "Lab 7 - Logs Overview", + "uid": "lab7-logs-overview", + "version": 1 +} diff --git a/ansible/roles/monitoring/tasks/main.yml b/ansible/roles/monitoring/tasks/main.yml index 8045115f61..bce6683c4b 100644 --- a/ansible/roles/monitoring/tasks/main.yml +++ b/ansible/roles/monitoring/tasks/main.yml @@ -1,5 +1,5 @@ --- -- name: Deploy Loki monitoring stack +- name: Deploy observability stack tags: - monitoring_deploy block: @@ -10,11 +10,14 @@ mode: "0755" loop: - "{{ monitoring_project_dir }}" + - "{{ monitoring_project_dir }}/prometheus" - "{{ monitoring_project_dir }}/loki" - "{{ monitoring_project_dir }}/promtail" - "{{ monitoring_project_dir }}/grafana" - "{{ monitoring_project_dir }}/grafana/provisioning" - "{{ monitoring_project_dir }}/grafana/provisioning/datasources" + - "{{ monitoring_project_dir }}/grafana/provisioning/dashboards" + - "{{ monitoring_project_dir }}/grafana/provisioning/dashboards/json" - name: Render Loki configuration ansible.builtin.template: @@ -28,11 +31,39 @@ dest: "{{ monitoring_project_dir }}/promtail/config.yml" mode: "0644" + - name: Render Prometheus configuration + ansible.builtin.template: + src: prometheus.yml.j2 + dest: "{{ monitoring_project_dir }}/prometheus/prometheus.yml" + mode: "0644" + - name: Render Grafana datasource provisioning ansible.builtin.template: src: grafana-datasource.yml.j2 - dest: "{{ monitoring_project_dir }}/grafana/provisioning/datasources/loki.yml" + dest: "{{ monitoring_project_dir }}/grafana/provisioning/datasources/datasources.yml" + mode: "0644" + + - name: Remove legacy Loki-only datasource file + ansible.builtin.file: + path: "{{ monitoring_project_dir }}/grafana/provisioning/datasources/loki.yml" + state: absent + + - 
name: Render Grafana dashboard provider + ansible.builtin.template: + src: grafana-dashboard-provider.yml.j2 + dest: "{{ monitoring_project_dir }}/grafana/provisioning/dashboards/dashboards.yml" + mode: "0644" + + - name: Provision Grafana dashboards + ansible.builtin.copy: + src: "{{ item.src }}" + dest: "{{ monitoring_project_dir }}/grafana/provisioning/dashboards/json/{{ item.dest }}" mode: "0644" + loop: + - src: grafana-app-dashboard.json + dest: app-metrics-dashboard.json + - src: grafana-logs-dashboard.json + dest: logs-dashboard.json - name: Render monitoring docker-compose file ansible.builtin.template: @@ -61,6 +92,16 @@ delay: 5 until: monitoring_loki_ready.status == 200 + - name: Wait for Prometheus health endpoint + ansible.builtin.uri: + url: "http://127.0.0.1:{{ prometheus_port }}/-/healthy" + status_code: 200 + timeout: 10 + register: monitoring_prometheus_health + retries: 12 + delay: 5 + until: monitoring_prometheus_health.status == 200 + - name: Wait for Grafana health endpoint ansible.builtin.uri: url: "http://127.0.0.1:{{ monitoring_grafana_port }}/api/health" @@ -71,6 +112,16 @@ delay: 5 until: monitoring_grafana_health.status == 200 + - name: Verify application health endpoint + ansible.builtin.uri: + url: "http://127.0.0.1:{{ monitoring_app_port }}/health" + status_code: 200 + timeout: 10 + register: monitoring_app_health + retries: 12 + delay: 5 + until: monitoring_app_health.status == 200 + rescue: - name: Show monitoring deployment diagnostics ansible.builtin.debug: diff --git a/ansible/roles/monitoring/templates/docker-compose.yml.j2 b/ansible/roles/monitoring/templates/docker-compose.yml.j2 index 1edc2d13fe..ab3861a199 100644 --- a/ansible/roles/monitoring/templates/docker-compose.yml.j2 +++ b/ansible/roles/monitoring/templates/docker-compose.yml.j2 @@ -1,28 +1,61 @@ name: {{ monitoring_compose_project_name }} services: + app-python: + image: "{{ monitoring_app_image }}:{{ monitoring_app_tag }}" + container_name: app-python + 
environment: + HOST: "0.0.0.0" + PORT: "{{ monitoring_app_internal_port }}" + DEBUG: "False" + ports: + - "{{ monitoring_app_port }}:{{ monitoring_app_internal_port }}" + networks: + - logging + labels: + logging: "promtail" + app: "devops-info-service" + healthcheck: + test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:{{ monitoring_app_internal_port }}/health')\""] + interval: 10s + timeout: 5s + retries: 5 + deploy: + resources: + limits: + memory: 256M + cpus: "0.5" + restart: unless-stopped + loki: image: {{ monitoring_loki_image }} - container_name: devops-loki + container_name: loki command: -config.file=/etc/loki/config.yml ports: - "{{ monitoring_loki_port }}:{{ monitoring_loki_port }}" volumes: - ./loki/config.yml:/etc/loki/config.yml:ro - loki-data:/loki + networks: + - logging labels: logging: "promtail" - app: "devops-loki" + app: "loki" healthcheck: test: ["CMD-SHELL", "wget -qO- http://localhost:{{ monitoring_loki_port }}/ready >/dev/null 2>&1 || curl -fsS http://localhost:{{ monitoring_loki_port }}/ready >/dev/null 2>&1"] - interval: 15s + interval: 10s timeout: 5s - retries: 10 - start_period: 20s + retries: 5 + deploy: + resources: + limits: + memory: 1G + cpus: "1.0" + restart: unless-stopped promtail: image: {{ monitoring_promtail_image }} - container_name: devops-promtail + container_name: promtail command: -config.file=/etc/promtail/config.yml ports: - "{{ monitoring_promtail_port }}:{{ monitoring_promtail_port }}" @@ -32,36 +65,92 @@ services: depends_on: loki: condition: service_healthy + networks: + - logging + labels: + logging: "promtail" + app: "promtail" + healthcheck: + test: ["CMD-SHELL", "grep -qi ':{{ '%x' % (monitoring_promtail_port | int) }}' /proc/net/tcp /proc/net/tcp6"] + interval: 10s + timeout: 5s + retries: 5 + deploy: + resources: + limits: + memory: 256M + cpus: "0.5" + restart: unless-stopped + + prometheus: + image: "prom/prometheus:v{{ prometheus_version }}" + 
container_name: prometheus + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.retention.time={{ prometheus_retention_days }}d" + - "--storage.tsdb.retention.size={{ prometheus_retention_size }}" + ports: + - "{{ prometheus_port }}:{{ prometheus_port }}" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + networks: + - logging labels: logging: "promtail" - app: "devops-promtail" + app: "prometheus" + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:{{ prometheus_port }}/-/healthy || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + deploy: + resources: + limits: + memory: 1G + cpus: "1.0" + restart: unless-stopped grafana: image: {{ monitoring_grafana_image }} - container_name: devops-grafana + container_name: grafana ports: - "{{ monitoring_grafana_port }}:{{ monitoring_grafana_port }}" environment: - GF_AUTH_ANONYMOUS_ENABLED: "false" GF_SECURITY_ADMIN_USER: "{{ monitoring_grafana_admin_user }}" GF_SECURITY_ADMIN_PASSWORD: "{{ monitoring_grafana_admin_password }}" GF_USERS_ALLOW_SIGN_UP: "false" + GF_METRICS_ENABLED: "true" volumes: - grafana-data:/var/lib/grafana - ./grafana/provisioning:/etc/grafana/provisioning:ro depends_on: loki: condition: service_healthy + prometheus: + condition: service_healthy + networks: + - logging labels: logging: "promtail" - app: "devops-grafana" + app: "grafana" healthcheck: test: ["CMD-SHELL", "wget -qO- http://localhost:{{ monitoring_grafana_port }}/api/health >/dev/null 2>&1 || curl -fsS http://localhost:{{ monitoring_grafana_port }}/api/health >/dev/null 2>&1"] - interval: 15s + interval: 10s timeout: 5s - retries: 10 - start_period: 30s + retries: 5 + deploy: + resources: + limits: + memory: 512M + cpus: "0.5" + restart: unless-stopped volumes: loki-data: grafana-data: + prometheus-data: + +networks: + logging: + driver: bridge diff --git 
a/ansible/roles/monitoring/templates/grafana-dashboard-provider.yml.j2 b/ansible/roles/monitoring/templates/grafana-dashboard-provider.yml.j2 new file mode 100644 index 0000000000..52ce418c92 --- /dev/null +++ b/ansible/roles/monitoring/templates/grafana-dashboard-provider.yml.j2 @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: + - name: 'default' + orgId: 1 + folder: 'Lab 8' + type: file + disableDeletion: false + editable: true + options: + path: /etc/grafana/provisioning/dashboards/json diff --git a/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 b/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 index f13a7c7c40..c924e3d9cf 100644 --- a/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 +++ b/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 @@ -1,10 +1,16 @@ apiVersion: 1 datasources: + - name: Prometheus + uid: prometheus + type: prometheus + access: proxy + url: http://prometheus:{{ prometheus_port }} + editable: true + - name: Loki uid: loki type: loki access: proxy url: http://loki:{{ monitoring_loki_port }} - isDefault: true editable: true diff --git a/ansible/roles/monitoring/templates/prometheus.yml.j2 b/ansible/roles/monitoring/templates/prometheus.yml.j2 new file mode 100644 index 0000000000..8a4e49654a --- /dev/null +++ b/ansible/roles/monitoring/templates/prometheus.yml.j2 @@ -0,0 +1,16 @@ +global: + scrape_interval: {{ prometheus_scrape_interval }} + evaluation_interval: {{ prometheus_scrape_interval }} + +scrape_configs: +{% for target in prometheus_targets %} + - job_name: '{{ target.job }}' + static_configs: + - targets: +{% for endpoint in target.targets %} + - '{{ endpoint }}' +{% endfor %} +{% if target.path is defined %} + metrics_path: '{{ target.path }}' +{% endif %} +{% endfor %} diff --git a/ansible/roles/monitoring/templates/promtail-config.yml.j2 b/ansible/roles/monitoring/templates/promtail-config.yml.j2 index f7c374f3ba..76edcd9103 100644 --- 
a/ansible/roles/monitoring/templates/promtail-config.yml.j2 +++ b/ansible/roles/monitoring/templates/promtail-config.yml.j2 @@ -16,6 +16,9 @@ scrape_configs: pipeline_stages: - docker: {} relabel_configs: + - source_labels: ["__meta_docker_container_label_logging"] + regex: promtail + action: keep - source_labels: ["__meta_docker_container_name"] regex: "/(.*)" target_label: container diff --git a/app_python/README.md b/app_python/README.md index 58894af2b1..5678c7a3a4 100644 --- a/app_python/README.md +++ b/app_python/README.md @@ -42,6 +42,15 @@ HOST=127.0.0.1 PORT=3000 DEBUG=true python app.py - `GET /health` – Health check - Returns basic health status, timestamp, and uptime in seconds +- `GET /metrics` – Prometheus metrics endpoint + - Exposes RED metrics: + - `http_requests_total{method,endpoint,status_code}` + - `http_request_duration_seconds{method,endpoint,status_code}` + - `http_requests_in_progress{method,endpoint}` + - Exposes app-specific metrics: + - `devops_info_endpoint_calls{endpoint}` + - `devops_info_system_collection_seconds` + ## Configuration Configuration is done via environment variables: diff --git a/app_python/app.py b/app_python/app.py index c3712632ec..57cb8ece4c 100644 --- a/app_python/app.py +++ b/app_python/app.py @@ -12,7 +12,8 @@ from datetime import datetime, timezone from typing import Any, Dict -from flask import Flask, g, jsonify, request +from flask import Flask, Response, g, jsonify, request +from prometheus_client import CONTENT_TYPE_LATEST, Counter, Gauge, Histogram, generate_latest app = Flask(__name__) @@ -76,6 +77,32 @@ def configure_logging() -> logging.Logger: }, ) +# Prometheus metrics (RED method + app-specific signals) +HTTP_REQUESTS_TOTAL = Counter( + "http_requests_total", + "Total HTTP requests", + ["method", "endpoint", "status_code"], +) +HTTP_REQUEST_DURATION_SECONDS = Histogram( + "http_request_duration_seconds", + "HTTP request duration in seconds", + ["method", "endpoint", "status_code"], +) 
+HTTP_REQUESTS_IN_PROGRESS = Gauge( + "http_requests_in_progress", + "HTTP requests currently being processed", + ["method", "endpoint"], +) +DEVOPS_INFO_ENDPOINT_CALLS = Counter( + "devops_info_endpoint_calls", + "Total endpoint calls in DevOps info service", + ["endpoint"], +) +DEVOPS_INFO_SYSTEM_COLLECTION_SECONDS = Histogram( + "devops_info_system_collection_seconds", + "System information collection duration in seconds", +) + def get_uptime() -> Dict[str, Any]: """Return uptime in seconds and human-readable form.""" @@ -91,14 +118,18 @@ def get_uptime() -> Dict[str, Any]: def get_system_info() -> Dict[str, Any]: """Collect system information.""" - return { - "hostname": socket.gethostname(), - "platform": platform.system(), - "platform_version": platform.platform(), - "architecture": platform.machine(), - "cpu_count": os.cpu_count() or 1, - "python_version": platform.python_version(), - } + started_at = time.perf_counter() + try: + return { + "hostname": socket.gethostname(), + "platform": platform.system(), + "platform_version": platform.platform(), + "architecture": platform.machine(), + "cpu_count": os.cpu_count() or 1, + "python_version": platform.python_version(), + } + finally: + DEVOPS_INFO_SYSTEM_COLLECTION_SECONDS.observe(time.perf_counter() - started_at) def get_request_info() -> Dict[str, Any]: @@ -114,21 +145,48 @@ def get_request_info() -> Dict[str, Any]: } +def get_normalized_endpoint() -> str: + """Map routes to low-cardinality labels for Prometheus metrics.""" + if request.url_rule is not None and request.url_rule.rule: + return request.url_rule.rule + if request.path in {"/", "/health", "/metrics"}: + return request.path + return "/unknown" + + @app.before_request def before_request_logging() -> None: - """Track request start time for latency logging.""" + """Track request start for logging and metrics.""" g.request_started_at = time.perf_counter() + g.request_endpoint = get_normalized_endpoint() + HTTP_REQUESTS_IN_PROGRESS.labels( + 
method=request.method, endpoint=g.request_endpoint + ).inc() @app.after_request def after_request_logging(response): - """Emit one structured log entry per HTTP request.""" + """Emit logs and collect request metrics.""" request_info = get_request_info() request_info["status_code"] = response.status_code + endpoint = getattr(g, "request_endpoint", get_normalized_endpoint()) + method = request.method + status_code = str(response.status_code) + started_at = getattr(g, "request_started_at", None) + duration_seconds = 0.0 if started_at is not None: - request_info["duration_ms"] = round((time.perf_counter() - started_at) * 1000, 2) + duration_seconds = time.perf_counter() - started_at + request_info["duration_ms"] = round(duration_seconds * 1000, 2) + + HTTP_REQUESTS_TOTAL.labels( + method=method, endpoint=endpoint, status_code=status_code + ).inc() + HTTP_REQUEST_DURATION_SECONDS.labels( + method=method, endpoint=endpoint, status_code=status_code + ).observe(duration_seconds) + HTTP_REQUESTS_IN_PROGRESS.labels(method=method, endpoint=endpoint).dec() logger.info("HTTP request handled", extra={"context": request_info}) return response @@ -137,6 +195,7 @@ def after_request_logging(response): @app.route("/", methods=["GET"]) def index(): """Main endpoint - service and system information.""" + DEVOPS_INFO_ENDPOINT_CALLS.labels(endpoint="/").inc() uptime = get_uptime() system_info = get_system_info() request_info = get_request_info() @@ -167,6 +226,11 @@ def index(): "method": "GET", "description": "Health check", }, + { + "path": "/metrics", + "method": "GET", + "description": "Prometheus metrics", + }, ], } @@ -176,6 +240,7 @@ def index(): @app.route("/health", methods=["GET"]) def health(): """Health check endpoint.""" + DEVOPS_INFO_ENDPOINT_CALLS.labels(endpoint="/health").inc() uptime = get_uptime() payload = { "status": "healthy", @@ -185,6 +250,13 @@ def health(): return jsonify(payload), 200 +@app.route("/metrics", methods=["GET"]) +def metrics(): + """Expose 
Prometheus scrape endpoint.""" + DEVOPS_INFO_ENDPOINT_CALLS.labels(endpoint="/metrics").inc() + return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST) + + @app.errorhandler(404) def not_found(error): """Return JSON for 404 errors.""" diff --git a/app_python/requirements.txt b/app_python/requirements.txt index 22ac75b399..46c776bf8d 100644 --- a/app_python/requirements.txt +++ b/app_python/requirements.txt @@ -1 +1,2 @@ Flask==3.1.0 +prometheus-client==0.23.1 diff --git a/app_python/tests/test_app.py b/app_python/tests/test_app.py index 222f4ab610..4179b4fda0 100644 --- a/app_python/tests/test_app.py +++ b/app_python/tests/test_app.py @@ -80,7 +80,22 @@ def test_not_found_returns_json_404(self): self.assertEqual(data["error"], "Not Found") self.assertIn("message", data) + def test_metrics_endpoint_exposes_prometheus_metrics(self): + # Generate a few requests so metrics have data points. + self.client.get("/") + self.client.get("/health") + + resp = self.client.get("/metrics") + self.assertEqual(resp.status_code, 200) + + body = resp.get_data(as_text=True) + self.assertIn("# HELP http_requests_total", body) + self.assertIn("# TYPE http_requests_total counter", body) + self.assertIn("http_request_duration_seconds", body) + self.assertIn("http_requests_in_progress", body) + self.assertIn("devops_info_endpoint_calls_total", body) + self.assertIn("devops_info_system_collection_seconds", body) + if __name__ == "__main__": unittest.main() - diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml index 6dd75f6f2e..48a621f0a9 100644 --- a/monitoring/docker-compose.yml +++ b/monitoring/docker-compose.yml @@ -1,9 +1,56 @@ name: devops-monitoring services: + app-python: + build: + context: ../app_python + container_name: app-python + environment: + HOST: 0.0.0.0 + PORT: "8000" + DEBUG: "False" + ports: + - "8000:8000" + networks: + - logging + labels: + logging: "promtail" + app: "devops-info-service" + healthcheck: + test: ["CMD-SHELL", "python -c 
\"import urllib.request; urllib.request.urlopen('http://localhost:8000/health')\""] + interval: 10s + timeout: 5s + retries: 5 + deploy: + resources: + limits: + memory: 256M + cpus: "0.5" + restart: unless-stopped + + devops-go: + build: + context: ../app_go + container_name: devops-go + environment: + PORT: "8001" + ports: + - "8001:8001" + networks: + - logging + labels: + logging: "promtail" + app: "devops-go" + deploy: + resources: + limits: + memory: 256M + cpus: "0.5" + restart: unless-stopped + loki: image: grafana/loki:3.0.0 - container_name: devops-loki + container_name: loki command: -config.file=/etc/loki/config.yml ports: - "3100:3100" @@ -14,148 +61,117 @@ services: - logging labels: logging: "promtail" - app: "devops-loki" + app: "loki" healthcheck: test: ["CMD-SHELL", "wget -qO- http://localhost:3100/ready >/dev/null 2>&1 || curl -fsS http://localhost:3100/ready >/dev/null 2>&1"] - interval: 15s + interval: 10s timeout: 5s - retries: 10 - start_period: 20s + retries: 5 deploy: resources: limits: - cpus: "1.0" memory: 1G - reservations: - cpus: "0.25" - memory: 256M + cpus: "1.0" + restart: unless-stopped promtail: image: grafana/promtail:3.0.0 - container_name: devops-promtail + container_name: promtail command: -config.file=/etc/promtail/config.yml ports: - "9080:9080" volumes: - ./promtail/config.yml:/etc/promtail/config.yml:ro - /var/run/docker.sock:/var/run/docker.sock:ro - networks: - - logging depends_on: loki: condition: service_healthy - labels: - logging: "promtail" - app: "devops-promtail" - deploy: - resources: - limits: - cpus: "0.5" - memory: 512M - reservations: - cpus: "0.10" - memory: 128M - - grafana: - image: grafana/grafana:12.3.1 - container_name: devops-grafana - ports: - - "3000:3000" - environment: - GF_AUTH_ANONYMOUS_ENABLED: "false" - GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}" - GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-change-me-in-dot-env}" - GF_USERS_ALLOW_SIGN_UP: "false" - volumes: - - 
grafana-data:/var/lib/grafana - - ./grafana/provisioning:/etc/grafana/provisioning:ro networks: - logging - depends_on: - loki: - condition: service_healthy labels: logging: "promtail" - app: "devops-grafana" + app: "promtail" healthcheck: - test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/health >/dev/null 2>&1 || curl -fsS http://localhost:3000/api/health >/dev/null 2>&1"] - interval: 15s + test: ["CMD-SHELL", "grep -qi ':2378' /proc/net/tcp /proc/net/tcp6"] + interval: 10s timeout: 5s - retries: 10 - start_period: 30s + retries: 5 deploy: resources: limits: - cpus: "1.0" - memory: 1G - reservations: - cpus: "0.25" memory: 256M + cpus: "0.5" + restart: unless-stopped - devops-python: - build: - context: ../app_python - image: devops-python:lab07 - container_name: devops-python + prometheus: + image: prom/prometheus:v3.9.0 + container_name: prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.retention.time=15d' + - '--storage.tsdb.retention.size=10GB' ports: - - "8000:8000" - environment: - HOST: "0.0.0.0" - PORT: "8000" - DEBUG: "false" + - "9090:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus networks: - logging labels: logging: "promtail" - app: "devops-python" + app: "prometheus" healthcheck: - test: ["CMD-SHELL", "wget -qO- http://localhost:8000/health >/dev/null 2>&1 || curl -fsS http://localhost:8000/health >/dev/null 2>&1"] - interval: 15s + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9090/-/healthy || exit 1"] + interval: 10s timeout: 5s - retries: 10 - start_period: 20s + retries: 5 deploy: resources: limits: - cpus: "0.5" - memory: 512M - reservations: - cpus: "0.10" - memory: 128M + memory: 1G + cpus: "1.0" + restart: unless-stopped - devops-go: - build: - context: ../app_go - image: devops-go:lab07 - container_name: devops-go - ports: - - "8001:8001" + grafana: + image: grafana/grafana:12.3.1 + 
container_name: grafana environment: - PORT: "8001" + GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin} + GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-password} + GF_USERS_ALLOW_SIGN_UP: "false" + GF_METRICS_ENABLED: "true" + ports: + - "3000:3000" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + depends_on: + loki: + condition: service_healthy + prometheus: + condition: service_healthy networks: - logging labels: logging: "promtail" - app: "devops-go" + app: "grafana" healthcheck: - test: ["CMD-SHELL", "wget -qO- http://localhost:8001/health >/dev/null 2>&1 || curl -fsS http://localhost:8001/health >/dev/null 2>&1"] - interval: 15s + test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/health >/dev/null 2>&1 || curl -fsS http://localhost:3000/api/health >/dev/null 2>&1"] + interval: 10s timeout: 5s - retries: 10 - start_period: 20s + retries: 5 deploy: resources: limits: - cpus: "0.5" memory: 512M - reservations: - cpus: "0.10" - memory: 128M - -networks: - logging: - driver: bridge + cpus: "0.5" + restart: unless-stopped volumes: loki-data: grafana-data: + prometheus-data: + +networks: + logging: + driver: bridge diff --git a/monitoring/docs/LAB08.md b/monitoring/docs/LAB08.md new file mode 100644 index 0000000000..bb11d73e12 --- /dev/null +++ b/monitoring/docs/LAB08.md @@ -0,0 +1,226 @@ +# LAB08 — Metrics & Monitoring with Prometheus + +## 1. Architecture + +### Monitoring flow + +```mermaid +flowchart LR + APP[Python App\n/health, /metrics] -->|scrape 15s| PROM[Prometheus] + LOKI[Loki /metrics] -->|scrape 15s| PROM + GRAFANA[Grafana /metrics] -->|scrape 15s| PROM + + APP -->|container logs| PROMTAIL[Promtail] + PROMTAIL -->|push logs| LOKI + + PROM --> GRAFANA_UI[Grafana Dashboards] + LOKI --> GRAFANA_UI +``` + +### Components + +- `app-python`: Flask app instrumented with Prometheus metrics (RED + app-specific). 
+- `devops-go`: Go app from Lab 1, removed from this compose stack in Lab 8 (superseded by the `prometheus` service). +- `prometheus`: pulls metrics from app, Loki, Grafana, and itself every `15s`. +- `grafana`: visualizes Prometheus and Loki data with provisioned dashboards. +- `loki` + `promtail`: log aggregation pipeline from Lab 7. + +## 2. Application Instrumentation + +File: `app_python/app.py` + +### Implemented metrics + +- `http_requests_total{method,endpoint,status_code}` (Counter) + - Tracks total request count (Rate + Errors via status filtering). +- `http_request_duration_seconds{method,endpoint,status_code}` (Histogram) + - Tracks response latency distribution (Duration). +- `http_requests_in_progress{method,endpoint}` (Gauge) + - Tracks concurrent requests. +- `devops_info_endpoint_calls{endpoint}` (Counter) + - App-specific endpoint usage metric. +- `devops_info_system_collection_seconds` (Histogram) + - App-specific internal operation timing (system info collection). + +### Instrumentation points + +- `@app.before_request` + - Stores start time and increments in-progress gauge. +- `@app.after_request` + - Records request counter + histogram and decrements in-progress gauge. +- `@app.route("/metrics")` + - Exposes Prometheus text format using `generate_latest()`. + +### Label strategy + +- Low cardinality endpoint normalization: + - Known routes keep their route labels (`/`, `/health`, `/metrics`). + - Unknown paths are grouped into `/unknown`. +- Avoid per-user or per-id labels. + +## 3. 
Prometheus Configuration + +File: `monitoring/prometheus/prometheus.yml` + +### Global settings + +- `scrape_interval: 15s` +- `evaluation_interval: 15s` + +### Scrape targets + +- `prometheus`: `localhost:9090` +- `app`: `app-python:8000` (`/metrics`) +- `loki`: `loki:3100` (`/metrics`) +- `grafana`: `grafana:3000` (`/metrics`) + +### Retention policy + +Configured in `monitoring/docker-compose.yml` Prometheus command flags: + +- `--storage.tsdb.retention.time=15d` +- `--storage.tsdb.retention.size=10GB` + +## 4. Dashboard Walkthrough + +Provisioned dashboard JSON: + +- `monitoring/grafana/provisioning/dashboards/json/app-metrics-dashboard.json` + +### Panels (6+) + +1. **Request Rate by Endpoint** + - `sum by (endpoint) (rate(http_requests_total[5m]))` +2. **Error Rate (5xx)** + - `sum(rate(http_requests_total{status_code=~"5.."}[5m]))` +3. **Request Duration p95** + - `histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))` +4. **Request Duration Heatmap** + - `sum by (le) (rate(http_request_duration_seconds_bucket[5m]))` +5. **Active Requests** + - `sum(http_requests_in_progress)` +6. **Status Code Distribution** + - `sum by (status_code) (rate(http_requests_total[5m]))` +7. **App Uptime (UP/DOWN)** + - `up{job="app"}` + +## 5. PromQL Examples + +1. Requests per second by endpoint: + - `sum by (endpoint) (rate(http_requests_total[5m]))` +2. Global request rate: + - `sum(rate(http_requests_total[5m]))` +3. 5xx error rate: + - `sum(rate(http_requests_total{status_code=~"5.."}[5m]))` +4. Error percentage: + - `100 * sum(rate(http_requests_total{status_code=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))` +5. p95 latency: + - `histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))` +6. Active requests now: + - `sum(http_requests_in_progress)` +7. Target availability: + - `up` + +## 6. 
Production Setup + +### Health checks + +Configured for all services in `monitoring/docker-compose.yml`: + +- `app-python`: `/health` +- `prometheus`: `/-/healthy` +- `grafana`: `/api/health` +- `loki`: `/ready` +- `promtail`: `/ready` + +### Resource limits + +- Prometheus: `1G`, `1.0 CPU` +- Loki: `1G`, `1.0 CPU` +- Grafana: `512M`, `0.5 CPU` +- App: `256M`, `0.5 CPU` +- Promtail: `256M`, `0.5 CPU` + +### Persistence + +Named volumes: + +- `prometheus-data` +- `loki-data` +- `grafana-data` + +Data and dashboards remain after `docker compose down` / `up -d`. + +## 7. Testing Results + +### Local validation commands + +```bash +# App tests +python -m unittest app_python/tests/test_app.py + +# Compose syntax +cd monitoring +docker compose config + +# Start stack +docker compose up -d + +docker compose ps +curl -s http://localhost:8000/metrics | head -40 +curl -s http://localhost:9090/api/v1/query?query=up +``` + +### Evidence to capture + +Store screenshots in `monitoring/docs/screenshots/` with names: + +- `metrics-endpoint.png` (browser/curl output of `/metrics`) +- `prometheus-targets-up.png` (`http://localhost:9090/targets` all UP) +- `promql-up-query.png` (`up` query result) +- `grafana-app-dashboard1.png` (custom dashboard with 6+ panels) +- `grafana-app-dashboard2.png` (custom dashboard with 6+ panels) +- `compose-healthy.png` (`docker compose ps` healthy services) +- `persistence-proof.png` (dashboard exists after restart) + +## 8. Challenges & Solutions + +- **High-cardinality risk for endpoint labels** + - Solution: normalized unknown paths to `/unknown`. +- **Prometheus retention requirements** + - Solution: set explicit TSDB retention by time and size in container command. +- **Repeatable dashboard setup** + - Solution: provisioning files + dashboard JSON committed to repo. +- **Automation requirement (bonus)** + - Solution: extended Ansible `monitoring` role with templated Prometheus and dashboard provisioning. 
+ +## Metrics vs Logs (Lab 7 vs Lab 8) + +- Use **metrics** for trends/SLOs/alerts (rate, error rate, p95, uptime). +- Use **logs** for event-level debugging and root-cause analysis. +- Together they cover both macro health (metrics) and request-level detail (logs). + +## Bonus — Ansible Automation + +Extended role: `ansible/roles/monitoring` + +### Implemented + +- Added Prometheus variables and scrape target list in `defaults/main.yml`. +- Added Jinja2 Prometheus template: `templates/prometheus.yml.j2`. +- Updated Compose template to include: + - app + Loki + Promtail + Prometheus + Grafana + - health checks, resource limits, retention, persistent volumes. +- Provisioned Grafana data sources (Loki + Prometheus) and dashboards. +- Added dashboard files: + - `files/grafana-app-dashboard.json` + - `files/grafana-logs-dashboard.json` + +### End-to-end deployment + +```bash +cd ansible +ansible-playbook playbooks/deploy-monitoring.yml -i inventory/hosts.ini --ask-vault-pass +``` + +Run twice to verify idempotency (`changed=0` expected on second run for stable state). 
diff --git a/monitoring/docs/screenshots/ promql-up-query.png b/monitoring/docs/screenshots/ promql-up-query.png new file mode 100644 index 0000000000..095a7bf573 Binary files /dev/null and b/monitoring/docs/screenshots/ promql-up-query.png differ diff --git a/monitoring/docs/screenshots/afteerrestart.png b/monitoring/docs/screenshots/afteerrestart.png new file mode 100644 index 0000000000..18a6cd0924 Binary files /dev/null and b/monitoring/docs/screenshots/afteerrestart.png differ diff --git a/monitoring/docs/screenshots/compose-healthyyyy.png b/monitoring/docs/screenshots/compose-healthyyyy.png new file mode 100644 index 0000000000..cfdd5c3434 Binary files /dev/null and b/monitoring/docs/screenshots/compose-healthyyyy.png differ diff --git a/monitoring/docs/screenshots/grafana-app-dashboard1.png b/monitoring/docs/screenshots/grafana-app-dashboard1.png new file mode 100644 index 0000000000..837b5fc57d Binary files /dev/null and b/monitoring/docs/screenshots/grafana-app-dashboard1.png differ diff --git a/monitoring/docs/screenshots/grafana-app-dashboard2.png b/monitoring/docs/screenshots/grafana-app-dashboard2.png new file mode 100644 index 0000000000..2ee706acad Binary files /dev/null and b/monitoring/docs/screenshots/grafana-app-dashboard2.png differ diff --git a/monitoring/docs/screenshots/metrics-endpoint.png b/monitoring/docs/screenshots/metrics-endpoint.png new file mode 100644 index 0000000000..9eb2fbaa82 Binary files /dev/null and b/monitoring/docs/screenshots/metrics-endpoint.png differ diff --git a/monitoring/docs/screenshots/persistence-proof.png b/monitoring/docs/screenshots/persistence-proof.png new file mode 100644 index 0000000000..da847273d8 Binary files /dev/null and b/monitoring/docs/screenshots/persistence-proof.png differ diff --git a/monitoring/docs/screenshots/prometheus-targets-up.png b/monitoring/docs/screenshots/prometheus-targets-up.png new file mode 100644 index 0000000000..3bc935c37d Binary files /dev/null and 
b/monitoring/docs/screenshots/prometheus-targets-up.png differ diff --git a/monitoring/grafana/provisioning/dashboards/dashboards.yml b/monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000000..52ce418c92 --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: + - name: 'default' + orgId: 1 + folder: 'Lab 8' + type: file + disableDeletion: false + editable: true + options: + path: /etc/grafana/provisioning/dashboards/json diff --git a/monitoring/grafana/provisioning/dashboards/json/app-metrics-dashboard.json b/monitoring/grafana/provisioning/dashboards/json/app-metrics-dashboard.json new file mode 100644 index 0000000000..2f4ceb99c8 --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/json/app-metrics-dashboard.json @@ -0,0 +1,436 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (endpoint) (rate(http_requests_total[5m]))", + "legendFormat": "{{endpoint}}", + "range": true, + "refId": "A" + } + ], + "title": "Request Rate by Endpoint", + "type": "timeseries" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m]))", + "legendFormat": "5xx", + "range": true, + "refId": "A" + } + ], + "title": "Error Rate (5xx)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))", + "legendFormat": "p95", + "range": true, + "refId": "A" + } + ], + "title": "Request Duration p95", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "mode": "scheme" + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + 
"yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "short" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (le) (rate(http_request_duration_seconds_bucket[5m]))", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Request Duration Heatmap", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "red", + "value": 20 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(http_requests_in_progress)", + "refId": "A" + } + ], + "title": "Active Requests", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 6, + "options": { + "displayLabels": [ + "name", + "percent" + ], + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (status_code) (rate(http_requests_total[5m]))", + "legendFormat": 
"{{status_code}}", + "refId": "A" + } + ], + "title": "Status Code Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 0, + "text": "DOWN" + }, + "1": { + "color": "green", + "index": 1, + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 16 + }, + "id": 7, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "up{job=\"app\"}", + "refId": "A" + } + ], + "title": "App Uptime", + "type": "stat" + } + ], + "refresh": "10s", + "schemaVersion": 40, + "style": "dark", + "tags": [ + "lab8", + "prometheus", + "red" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Lab 8 - Application Metrics", + "uid": "lab8-app-metrics", + "version": 1, + "weekStart": "" +} diff --git a/monitoring/grafana/provisioning/dashboards/json/logs-dashboard.json b/monitoring/grafana/provisioning/dashboards/json/logs-dashboard.json new file mode 100644 index 0000000000..1411712d22 --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/json/logs-dashboard.json @@ -0,0 +1,78 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & 
Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 16, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "{job=\"docker\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Container Logs", + "type": "logs" + } + ], + "refresh": "10s", + "schemaVersion": 40, + "tags": [ + "lab7", + "loki" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "title": "Lab 7 - Logs Overview", + "uid": "lab7-logs-overview", + "version": 1 +} diff --git a/monitoring/grafana/provisioning/datasources/loki.yml b/monitoring/grafana/provisioning/datasources/datasources.yml similarity index 50% rename from monitoring/grafana/provisioning/datasources/loki.yml rename to monitoring/grafana/provisioning/datasources/datasources.yml index fba0b1b8e0..259b7dac3a 100644 --- a/monitoring/grafana/provisioning/datasources/loki.yml +++ b/monitoring/grafana/provisioning/datasources/datasources.yml @@ -1,10 +1,16 @@ apiVersion: 1 datasources: + - name: Prometheus + uid: prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + editable: true + - name: Loki uid: loki type: loki access: proxy url: http://loki:3100 - isDefault: true editable: true diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000000..f155272a88 --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,23 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + 
+scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'app' + metrics_path: '/metrics' + static_configs: + - targets: ['app-python:8000'] + + - job_name: 'loki' + metrics_path: '/metrics' + static_configs: + - targets: ['loki:3100'] + + - job_name: 'grafana' + metrics_path: '/metrics' + static_configs: + - targets: ['grafana:3000'] diff --git a/monitoring/promtail/config.yml b/monitoring/promtail/config.yml index 9bcb08d656..949c79ff77 100644 --- a/monitoring/promtail/config.yml +++ b/monitoring/promtail/config.yml @@ -13,12 +13,12 @@ scrape_configs: docker_sd_configs: - host: unix:///var/run/docker.sock refresh_interval: 5s - filters: - - name: label - values: ["logging=promtail"] pipeline_stages: - docker: {} relabel_configs: + - source_labels: ["__meta_docker_container_label_logging"] + regex: promtail + action: keep - source_labels: ["__meta_docker_container_name"] regex: "/(.*)" target_label: container