diff --git a/ansible/playbooks/deploy_all.yml b/ansible/playbooks/deploy_all.yml index 941f57a575..3150e56193 100644 --- a/ansible/playbooks/deploy_all.yml +++ b/ansible/playbooks/deploy_all.yml @@ -3,8 +3,11 @@ hosts: webservers become: true gather_facts: true - tasks: + - name: Deploy Monitoring Stack + ansible.builtin.include_role: + name: monitoring + - name: Deploy Python App ansible.builtin.include_role: name: web_app @@ -29,4 +32,4 @@ app_internal_port: 8080 compose_project_dir: /opt/devops-go app_environment: - APP_LANG: go + APP_LANG: go \ No newline at end of file diff --git a/ansible/roles/monitoring/defaults/main.yml b/ansible/roles/monitoring/defaults/main.yml index a0d8971366..d24c370f58 100644 --- a/ansible/roles/monitoring/defaults/main.yml +++ b/ansible/roles/monitoring/defaults/main.yml @@ -29,3 +29,26 @@ promtail_memory_limit: "256m" promtail_cpu_limit: "0.5" grafana_memory_limit: "512m" grafana_cpu_limit: "1.0" + +# Prometheus +prometheus_version: "v3.9.0" +prometheus_port: 9090 +prometheus_retention_days: "15d" +prometheus_retention_size: "10GB" +prometheus_scrape_interval: "15s" +prometheus_memory_limit: "1g" +prometheus_cpu_limit: "1.0" + +prometheus_scrape_targets: + - job: "prometheus" + targets: ["localhost:9090"] + path: "/metrics" + - job: "loki" + targets: ["loki:3100"] + path: "/metrics" + - job: "grafana" + targets: ["grafana:3000"] + path: "/metrics" + - job: "app" + targets: ["devops-python:8000"] + path: "/metrics" \ No newline at end of file diff --git a/ansible/roles/monitoring/files/app-dashboard.json b/ansible/roles/monitoring/files/app-dashboard.json new file mode 100644 index 0000000000..f5a219c48d --- /dev/null +++ b/ansible/roles/monitoring/files/app-dashboard.json @@ -0,0 +1,534 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": 
"dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "up{job=\"app\"}", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 5, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "sort": "desc", + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, 
+ "editorMode": "code", + "expr": "sum by (status_code) (rate(http_requests_total[5m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Status Code Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Request Duration p95", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "http_requests_in_progress", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Active Requests", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + 
"placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(http_requests_total[5m])) by (endpoint)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m]))", + "range": true, + "refId": "A" + } + ], + 
"title": " Error Rate", + "type": "timeseries" + } + ], + "preload": false, + "schemaVersion": 42, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Prometheus Dashboard", + "uid": "admgtq4", + "version": 8 +} \ No newline at end of file diff --git a/ansible/roles/monitoring/files/dashboards-provisioner.yml b/ansible/roles/monitoring/files/dashboards-provisioner.yml new file mode 100644 index 0000000000..b18a0c3ba4 --- /dev/null +++ b/ansible/roles/monitoring/files/dashboards-provisioner.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: + - name: default + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 30 + options: + path: /etc/grafana/provisioning/dashboards \ No newline at end of file diff --git a/ansible/roles/monitoring/files/grafana-logs-dashboard.json b/ansible/roles/monitoring/files/grafana-logs-dashboard.json new file mode 100644 index 0000000000..5ebf689d8e --- /dev/null +++ b/ansible/roles/monitoring/files/grafana-logs-dashboard.json @@ -0,0 +1,288 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 2, + "links": [], + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "P8E80F9AEF21F6940" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 4, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + 
"reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "sort": "desc", + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P8E80F9AEF21F6940" + }, + "direction": "backward", + "editorMode": "code", + "expr": "sum by (level) (count_over_time({app=~\"devops-.*\" } | json [5m]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Log Level Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "loki", + "uid": "P8E80F9AEF21F6940" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 1, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "showControls": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P8E80F9AEF21F6940" + }, + "direction": "backward", + "editorMode": "code", + "expr": "{app=~\"devops-.*\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Logs Table", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "P8E80F9AEF21F6940" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "showControls": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P8E80F9AEF21F6940" + }, + "direction": "backward", + "editorMode": "code", + "expr": "{app=~\"devops-.*\"} |= \"ERROR\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "Error Logs", + "type": "logs" + }, + 
{ + "datasource": { + "type": "loki", + "uid": "P8E80F9AEF21F6940" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P8E80F9AEF21F6940" + }, + "direction": "backward", + "editorMode": "code", + "expr": "sum by (app) (rate({app=~\"devops-.*\"}[1m]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Request Rate", + "type": "timeseries" + } + ], + "preload": false, + "schemaVersion": 42, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "DevOps Apps - Log Overview", + "uid": "adwbdg5", + "version": 5 +} \ No newline at end of file diff --git a/ansible/roles/monitoring/tasks/deploy.yml b/ansible/roles/monitoring/tasks/deploy.yml index 
c975e4ccbe..79ad4dd5db 100644 --- a/ansible/roles/monitoring/tasks/deploy.yml +++ b/ansible/roles/monitoring/tasks/deploy.yml @@ -28,6 +28,15 @@ delay: 10 until: grafana_ready.status == 200 + - name: Wait for Prometheus to be ready + ansible.builtin.uri: + url: "http://localhost:{{ prometheus_port }}/-/healthy" + status_code: 200 + register: prometheus_ready + retries: 12 + delay: 10 + until: prometheus_ready.status == 200 + - name: Report deployment success ansible.builtin.debug: msg: "Monitoring stack deployed — Grafana at http://localhost:{{ grafana_port }}" @@ -53,4 +62,4 @@ - name: Print container status ansible.builtin.debug: - msg: "{{ compose_ps.stdout_lines }}" + msg: "{{ compose_ps.stdout_lines }}" \ No newline at end of file diff --git a/ansible/roles/monitoring/tasks/setup.yml b/ansible/roles/monitoring/tasks/setup.yml index ed15f385ed..9f4ecf461e 100644 --- a/ansible/roles/monitoring/tasks/setup.yml +++ b/ansible/roles/monitoring/tasks/setup.yml @@ -12,6 +12,9 @@ - "{{ monitoring_dir }}" - "{{ monitoring_dir }}/loki" - "{{ monitoring_dir }}/promtail" + - "{{ monitoring_dir }}/prometheus" + - "{{ monitoring_dir }}/grafana/provisioning/datasources" + - "{{ monitoring_dir }}/grafana/provisioning/dashboards" - name: Template Loki configuration ansible.builtin.template: @@ -34,6 +37,41 @@ mode: "0644" notify: Restart monitoring stack + - name: Template Prometheus configuration + ansible.builtin.template: + src: prometheus-config.yml.j2 + dest: "{{ monitoring_dir }}/prometheus/prometheus.yml" + mode: "0644" + notify: Restart monitoring stack + + - name: Template Grafana datasources provisioning + ansible.builtin.template: + src: grafana-datasources.yml.j2 + dest: "{{ monitoring_dir }}/grafana/provisioning/datasources/datasources.yml" + mode: "0644" + notify: Restart monitoring stack + + - name: Copy Grafana dashboard JSON + ansible.builtin.copy: + src: app-dashboard.json + dest: "{{ monitoring_dir }}/grafana/provisioning/dashboards/app-dashboard.json" + 
mode: "0644" + notify: Restart monitoring stack + + - name: Copy Grafana logs dashboard JSON + ansible.builtin.copy: + src: grafana-logs-dashboard.json + dest: "{{ monitoring_dir }}/grafana/provisioning/dashboards/grafana-logs-dashboard.json" + mode: "0644" + notify: Restart monitoring stack + + - name: Copy Grafana dashboard provisioner config + ansible.builtin.copy: + src: dashboards-provisioner.yml + dest: "{{ monitoring_dir }}/grafana/provisioning/dashboards/dashboards-provisioner.yml" + mode: "0644" + notify: Restart monitoring stack + rescue: - name: Report setup failure ansible.builtin.debug: @@ -48,4 +86,4 @@ - name: Show monitoring directory contents ansible.builtin.debug: - msg: "{{ monitoring_dir_contents.stdout_lines }}" + msg: "{{ monitoring_dir_contents.stdout_lines }}" \ No newline at end of file diff --git a/ansible/roles/monitoring/templates/docker-compose.yml.j2 b/ansible/roles/monitoring/templates/docker-compose.yml.j2 index 8985057fe7..69920eb21e 100644 --- a/ansible/roles/monitoring/templates/docker-compose.yml.j2 +++ b/ansible/roles/monitoring/templates/docker-compose.yml.j2 @@ -5,6 +5,7 @@ networks: volumes: loki-data: grafana-data: + prometheus-data: services: @@ -59,6 +60,8 @@ services: - "{{ grafana_port }}:3000" volumes: - grafana-data:/var/lib/grafana + - {{ monitoring_dir }}/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro + - {{ monitoring_dir }}/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro environment: - GF_AUTH_ANONYMOUS_ENABLED=false - GF_SECURITY_ADMIN_USER={{ grafana_admin_user }} @@ -80,3 +83,33 @@ services: cpus: '{{ grafana_cpu_limit }}' memory: {{ grafana_memory_limit }} restart: unless-stopped + + prometheus: + image: prom/prometheus:{{ prometheus_version }} + container_name: prometheus + ports: + - "{{ prometheus_port }}:9090" + volumes: + - {{ monitoring_dir }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + command: + - 
'--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.retention.time={{ prometheus_retention_days }}' + - '--storage.tsdb.retention.size={{ prometheus_retention_size }}' + networks: + - logging + depends_on: + - loki + - grafana + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9090/-/healthy || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 20s + deploy: + resources: + limits: + cpus: '{{ prometheus_cpu_limit }}' + memory: {{ prometheus_memory_limit }} + restart: unless-stopped \ No newline at end of file diff --git a/ansible/roles/monitoring/templates/grafana-datasources.yml.j2 b/ansible/roles/monitoring/templates/grafana-datasources.yml.j2 new file mode 100644 index 0000000000..55525a6dd7 --- /dev/null +++ b/ansible/roles/monitoring/templates/grafana-datasources.yml.j2 @@ -0,0 +1,16 @@ +apiVersion: 1 + +datasources: + - name: Loki + type: loki + access: proxy + url: http://loki:{{ loki_port }} + isDefault: false + editable: true + + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:{{ prometheus_port }} + isDefault: true + editable: true \ No newline at end of file diff --git a/ansible/roles/monitoring/templates/prometheus-config.yml.j2 b/ansible/roles/monitoring/templates/prometheus-config.yml.j2 new file mode 100644 index 0000000000..ac789be904 --- /dev/null +++ b/ansible/roles/monitoring/templates/prometheus-config.yml.j2 @@ -0,0 +1,11 @@ +global: + scrape_interval: {{ prometheus_scrape_interval }} + evaluation_interval: {{ prometheus_scrape_interval }} + +scrape_configs: +{% for target in prometheus_scrape_targets %} + - job_name: '{{ target.job }}' + static_configs: + - targets: {{ target.targets }} + metrics_path: '{{ target.path }}' +{% endfor %} \ No newline at end of file diff --git a/ansible/roles/web_app/templates/docker-compose.yml.j2 b/ansible/roles/web_app/templates/docker-compose.yml.j2 index 71be74032e..176d9795ff 100644 --- 
a/ansible/roles/web_app/templates/docker-compose.yml.j2 +++ b/ansible/roles/web_app/templates/docker-compose.yml.j2 @@ -6,6 +6,9 @@ services: container_name: {{ app_name }} ports: - "{{ app_port }}:{{ app_internal_port }}" + labels: + logging: "promtail" + app: "{{ app_name }}" environment: APP_ENV: production APP_PORT: "{{ app_internal_port }}" @@ -18,6 +21,7 @@ services: restart: unless-stopped networks: - app_network + - logging healthcheck: test: ["CMD", "curl", "-f", "http://localhost:{{ app_internal_port }}/health"] interval: 30s @@ -33,3 +37,6 @@ services: networks: app_network: driver: bridge + logging: + external: true + name: monitoring_logging \ No newline at end of file diff --git a/app_python/app.py b/app_python/app.py index 94ac357fc8..06dfc1151d 100644 --- a/app_python/app.py +++ b/app_python/app.py @@ -1,7 +1,14 @@ from fastapi import FastAPI, Request from datetime import datetime, timezone -from fastapi.responses import JSONResponse +from fastapi.responses import JSONResponse, Response from starlette.exceptions import HTTPException as StarletteHTTPException +from prometheus_client import ( + Counter, + Histogram, + Gauge, + generate_latest, + CONTENT_TYPE_LATEST, +) import platform import socket import os @@ -57,6 +64,28 @@ def format(self, record: logging.LogRecord) -> str: app = FastAPI() START_TIME = datetime.now(timezone.utc) +# ── Prometheus Metrics ─────────────────────────────────────────────────────── +http_requests_total = Counter( + "http_requests_total", + "Total HTTP requests", + ["method", "endpoint", "status_code"], +) + +http_request_duration_seconds = Histogram( + "http_request_duration_seconds", + "HTTP request duration in seconds", + ["method", "endpoint"], + buckets=[0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5], +) + +http_requests_in_progress = Gauge( + "http_requests_in_progress", "HTTP requests currently being processed" +) + +devops_info_endpoint_calls = Counter( + "devops_info_endpoint_calls_total", "Calls per 
endpoint", ["endpoint"] +) + HOST = os.getenv("HOST", "0.0.0.0") PORT = int(os.getenv("PORT", 8000)) @@ -90,8 +119,12 @@ async def log_requests(request: Request, call_next): start_time = datetime.now(timezone.utc) client_ip = request.client.host if request.client else "unknown" + # Normalize endpoint (avoid high cardinality) + endpoint = request.url.path + + http_requests_in_progress.inc() logger.info( - f"Request started: {request.method}{request.url.path} from {client_ip}" + f"Request started: {request.method} {endpoint} from {client_ip}" ) try: @@ -100,11 +133,24 @@ async def log_requests(request: Request, call_next): datetime.now(timezone.utc) - start_time ).total_seconds() + # Record metrics + http_requests_total.labels( + method=request.method, + endpoint=endpoint, + status_code=str(response.status_code), + ).inc() + + http_request_duration_seconds.labels( + method=request.method, endpoint=endpoint + ).observe(process_time) + + devops_info_endpoint_calls.labels(endpoint=endpoint).inc() + logger.info( "Request completed", extra={ "method": request.method, - "path": request.url.path, + "path": endpoint, "status_code": response.status_code, "client_ip": client_ip, "duration_seconds": round(process_time, 3), @@ -113,21 +159,32 @@ async def log_requests(request: Request, call_next): response.headers["X-Process-Time"] = str(process_time) return response + except Exception as e: process_time = ( datetime.now(timezone.utc) - start_time ).total_seconds() + http_requests_total.labels( + method=request.method, endpoint=endpoint, status_code="500" + ).inc() logger.error( "Request failed", extra={ "method": request.method, - "path": request.url.path, + "path": endpoint, "client_ip": client_ip, "duration_seconds": round(process_time, 3), "error": str(e), }, ) raise + finally: + http_requests_in_progress.dec() + + +@app.get("/metrics") +def metrics(): + return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST) @app.get("/") diff --git a/app_python/requirements.txt 
b/app_python/requirements.txt index 7a8f2f1806..8ed1b51a07 100644 Binary files a/app_python/requirements.txt and b/app_python/requirements.txt differ diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml index b4341bd176..2b582ada92 100644 --- a/monitoring/docker-compose.yml +++ b/monitoring/docker-compose.yml @@ -7,7 +7,7 @@ networks: volumes: loki-data: grafana-data: - + prometheus-data: services: loki: @@ -94,16 +94,29 @@ services: restart: unless-stopped app-python: - image: 3llimi/devops-info-service:latest - container_name: devops-python - ports: - - "8000:8000" - networks: - - logging - labels: - logging: "promtail" - app: "devops-python" - restart: unless-stopped + image: 3llimi/devops-info-service:latest + container_name: devops-python + ports: + - "8000:8000" + networks: + - logging + labels: + logging: "promtail" + app: "devops-python" + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:8000/health || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + deploy: + resources: + limits: + cpus: '0.5' + memory: 256M + reservations: + cpus: '0.1' + memory: 64M + restart: unless-stopped app-go: image: 3llimi/devops-go-service:latest @@ -115,4 +128,37 @@ services: labels: logging: "promtail" app: "devops-go" - restart: unless-stopped \ No newline at end of file + restart: unless-stopped + + prometheus: + image: prom/prometheus:v3.9.0 + container_name: prometheus + ports: + - "9090:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.retention.time=15d' + - '--storage.tsdb.retention.size=10GB' + networks: + - logging + depends_on: + - loki + - grafana + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9090/-/healthy || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 20s + deploy: + resources: + 
limits: + cpus: '1.0' + memory: 1G + reservations: + cpus: '0.25' + memory: 256M + restart: unless-stopped \ No newline at end of file diff --git a/monitoring/docs/LAB08.md b/monitoring/docs/LAB08.md new file mode 100644 index 0000000000..6b68586bdd --- /dev/null +++ b/monitoring/docs/LAB08.md @@ -0,0 +1,512 @@ +# Lab 8 — Metrics & Monitoring with Prometheus + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Docker Network: logging │ +│ │ +│ ┌──────────────────┐ /metrics ┌─────────────────────────┐ │ +│ │ devops-python │◄─────────────│ │ │ +│ │ :8000 │ │ Prometheus :9090 │ │ +│ └──────────────────┘ │ TSDB Storage │ │ +│ │ 15d retention │ │ +│ ┌──────────────────┐ /metrics │ │ │ +│ │ Loki :3100 │◄─────────────│ scrape interval: 15s │ │ +│ └──────────────────┘ └───────────┬─────────────┘ │ +│ │ │ +│ ┌──────────────────┐ /metrics │ query │ +│ │ Grafana :3000 │◄─────────────────────────┘ │ +│ │ Dashboards │◄── PromQL ───────────────────────────────┤ │ +│ └──────────────────┘ │ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +**How it works:** +- The Python app exposes a `/metrics` endpoint using `prometheus_client` +- Prometheus scrapes all four targets every 15 seconds (pull-based model) +- Metrics are stored in Prometheus TSDB with 15-day retention +- Grafana queries Prometheus using PromQL and displays dashboards + +--- + +## Application Instrumentation + +### Metrics Added + +Four metrics were added to the FastAPI app using `prometheus_client==0.23.1`: + +```python +from prometheus_client import Counter, Histogram, Gauge, generate_latest, CONTENT_TYPE_LATEST + +http_requests_total = Counter( + 'http_requests_total', + 'Total HTTP requests', + ['method', 'endpoint', 'status_code'] +) + +http_request_duration_seconds = Histogram( + 'http_request_duration_seconds', + 'HTTP request duration in seconds', + ['method', 'endpoint'], + buckets=[0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5] +) 
+ +http_requests_in_progress = Gauge( + 'http_requests_in_progress', + 'HTTP requests currently being processed' +) + +devops_info_endpoint_calls_total = Counter( + 'devops_info_endpoint_calls_total', + 'Calls per endpoint', + ['endpoint'] +) +``` + +### Why These Metrics + +| Metric | Type | Purpose | +|--------|------|---------| +| `http_requests_total` | Counter | Tracks total requests — used for rate and error rate (RED method) | +| `http_request_duration_seconds` | Histogram | Measures latency distribution — enables p95/p99 percentiles | +| `http_requests_in_progress` | Gauge | Live concurrency — detects traffic spikes in real time | +| `devops_info_endpoint_calls_total` | Counter | Business metric — per-endpoint usage breakdown | + +**Label design:** Labels use `method`, `endpoint`, and `status_code` with low cardinality. User IDs, IPs, and query strings are deliberately excluded to avoid cardinality explosion. + +### Middleware Implementation + +Metrics are recorded in the FastAPI middleware so every request is tracked automatically: + +```python +@app.middleware("http") +async def log_requests(request: Request, call_next): + start_time = datetime.now(timezone.utc) + endpoint = request.url.path + http_requests_in_progress.inc() + try: + response = await call_next(request) + process_time = (datetime.now(timezone.utc) - start_time).total_seconds() + http_requests_total.labels( + method=request.method, + endpoint=endpoint, + status_code=str(response.status_code) + ).inc() + http_request_duration_seconds.labels( + method=request.method, + endpoint=endpoint + ).observe(process_time) + devops_info_endpoint_calls_total.labels(endpoint=endpoint).inc() + return response + finally: + http_requests_in_progress.dec() + +@app.get("/metrics") +def metrics(): + return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST) +``` + +### Metrics Endpoint Output + +``` +# HELP http_requests_total Total HTTP requests +# TYPE http_requests_total counter 
+http_requests_total{endpoint="/",method="GET",status_code="200"} 20.0 +http_requests_total{endpoint="/health",method="GET",status_code="200"} 10.0 +http_requests_total{endpoint="/metrics",method="GET",status_code="200"} 2.0 + +# HELP http_request_duration_seconds HTTP request duration in seconds +# TYPE http_request_duration_seconds histogram +http_request_duration_seconds_bucket{endpoint="/",le="0.005",method="GET"} 19.0 +http_request_duration_seconds_bucket{endpoint="/",le="0.01",method="GET"} 20.0 +... +http_request_duration_seconds_count{endpoint="/",method="GET"} 20.0 +http_request_duration_seconds_sum{endpoint="/",method="GET"} 0.033492999999999995 + +# HELP http_requests_in_progress HTTP requests currently being processed +# TYPE http_requests_in_progress gauge +http_requests_in_progress 1.0 + +# HELP devops_info_endpoint_calls_total Calls per endpoint +# TYPE devops_info_endpoint_calls_total counter +devops_info_endpoint_calls_total{endpoint="/"} 20.0 +devops_info_endpoint_calls_total{endpoint="/health"} 10.0 +devops_info_endpoint_calls_total{endpoint="/metrics"} 2.0 +``` + +--- + +## Prometheus Configuration + +### `monitoring/prometheus/prometheus.yml` + +```yaml +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'app' + static_configs: + - targets: ['app-python:8000'] + metrics_path: '/metrics' + + - job_name: 'loki' + static_configs: + - targets: ['loki:3100'] + metrics_path: '/metrics' + + - job_name: 'grafana' + static_configs: + - targets: ['grafana:3000'] + metrics_path: '/metrics' +``` + +### Scrape Targets + +| Job | Target | Purpose | +|-----|--------|---------| +| `prometheus` | `localhost:9090` | Prometheus self-monitoring | +| `app` | `app-python:8000` | Python app custom metrics | +| `loki` | `loki:3100` | Loki internal metrics | +| `grafana` | `grafana:3000` | Grafana internal metrics | + +**Why 15s interval:** Balance 
between freshness and storage cost. For a dev/course environment, 15s provides enough resolution for dashboards without excessive TSDB writes. + +**Why pull-based:** Prometheus scrapes targets on schedule. This means failed scrapes are immediately visible as gaps in data, and apps don't need to know where Prometheus is. + +### Retention + +Configured via command flags: +``` +--storage.tsdb.retention.time=15d +--storage.tsdb.retention.size=10GB +``` + +15 days provides enough history for trend analysis while preventing unbounded disk growth. + +--- + +## Dashboard Walkthrough + +**Dashboard name:** Prometheus Dashboard +**Data source:** Prometheus (`http://prometheus:9090`) + +### Panel 1 — Uptime (Stat) +**Query:** `up{job="app"}` +**Purpose:** Shows whether the app is reachable by Prometheus. Value `1` = UP, `0` = DOWN. Instant health indicator at the top of the dashboard. + +### Panel 2 — Status Code Distribution (Pie Chart) +**Query:** `sum by (status_code) (rate(http_requests_total[5m]))` +**Purpose:** Visualises the proportion of 2xx vs 4xx vs 5xx responses over the last 5 minutes. All green (200) means the app is healthy. + +### Panel 3 — Request Duration p95 (Time Series) +**Query:** `histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))` +**Purpose:** Shows the 95th percentile latency — 95% of requests complete faster than this value. Observed values around 5–9ms, well within acceptable range. + +### Panel 4 — Active Requests (Gauge) +**Query:** `http_requests_in_progress` +**Purpose:** Live count of requests currently being processed. Useful for detecting traffic spikes and concurrency issues. + +### Panel 5 — Request Rate (Time Series) +**Query:** `sum(rate(http_requests_total[5m])) by (endpoint)` +**Purpose:** Shows requests per second broken down by endpoint. Covers the **Rate** dimension of the RED method. Three lines: `/`, `/health`, `/metrics`. 
+ +### Panel 6 — Error Rate (Time Series) +**Query:** `sum(rate(http_requests_total{status_code=~"5.."}[5m]))` +**Purpose:** Shows the rate of 5xx errors per second. Shows "No data" when the app is healthy — which is the expected result. + +--- + +## PromQL Examples + +```promql +# 1. All targets up/down status +up + +# 2. Request rate per endpoint (RED: Rate) +sum(rate(http_requests_total[5m])) by (endpoint) + +# 3. Error rate — 5xx responses per second (RED: Errors) +sum(rate(http_requests_total{status_code=~"5.."}[5m])) + +# 4. p95 latency across all endpoints (RED: Duration) +histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) + +# 5. p99 latency per endpoint +histogram_quantile(0.99, sum by (endpoint, le) (rate(http_request_duration_seconds_bucket[5m]))) + +# 6. Total request count over last hour +increase(http_requests_total[1h]) + +# 7. Average request duration per endpoint +rate(http_request_duration_seconds_sum[5m]) / rate(http_request_duration_seconds_count[5m]) + +# 8. 
Request rate by status code +sum by (status_code) (rate(http_requests_total[5m])) +``` + +--- + +## Production Setup + +### Health Checks + +All services have health checks to enable Docker dependency management and visibility: + +```yaml +# Prometheus +healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9090/-/healthy || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 20s + +# Python app +healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:8000/health || exit 1"] + interval: 10s + timeout: 5s + retries: 5 +``` + +### Resource Limits + +| Service | CPU Limit | Memory Limit | +|---------|-----------|--------------| +| Prometheus | 1.0 | 1G | +| Loki | 1.0 | 1G | +| Grafana | 1.0 | 512M | +| Promtail | 0.5 | 256M | +| app-python | 0.5 | 256M | + +### Retention Policies + +| Service | Retention | Config | +|---------|-----------|--------| +| Prometheus | 15 days / 10GB | `--storage.tsdb.retention.time=15d` | +| Loki | 7 days | `retention_period: 168h` in loki config | + +### Data Persistence + +Named Docker volumes ensure data survives container restarts: +```yaml +volumes: + prometheus-data: # Prometheus TSDB + loki-data: # Loki log storage + grafana-data: # Grafana dashboards and config +``` + +**Persistence verified:** Stack was stopped with `docker compose down` and restarted with `docker compose up -d`. The Prometheus Dashboard was still present in Grafana after restart. 
+ +--- + +## Testing Results + +### All Targets UP + +All four Prometheus scrape targets confirmed healthy: + +``` +app http://app-python:8000/metrics UP ✅ +grafana http://grafana:3000/metrics UP ✅ +loki http://loki:3100/metrics UP ✅ +prometheus http://localhost:9090/metrics UP ✅ +``` + +![Prometheus Targets — All UP](screenshots/targets-all-up.png) + +### PromQL `up` Query — All 4 Targets = 1 + +![PromQL up query](screenshots/promql-up-query.png) + +### Grafana Prometheus Data Source + +![Grafana datasource connected](screenshots/grafana-datasource.png) + +### Dashboard — All 6 Panels with Live Data + +![Prometheus Dashboard](screenshots/dashboard-panels.png) + +### Stack Health After Restart + +``` +NAME IMAGE STATUS +devops-go 3llimi/devops-go-service:latest Up (healthy) +devops-python 3llimi/devops-info-service:latest Up (healthy) +grafana grafana/grafana:12.3.1 Up (healthy) +loki grafana/loki:3.0.0 Up (healthy) +prometheus prom/prometheus:v3.9.0 Up (healthy) +promtail grafana/promtail:3.0.0 Up +``` + +### Dashboard Persisted After Restart + +![Dashboard after restart](screenshots/dashboard-persistence.png) + +--- + +## Bonus — Ansible Automation + +### Updated Role Structure + +``` +roles/monitoring/ +├── defaults/main.yml # All variables including Prometheus +├── meta/main.yml # Depends on: docker role +├── handlers/main.yml # Restart stack on config change +├── files/ +│ ├── app-dashboard.json # Metrics dashboard +│ ├── grafana-logs-dashboard.json # Logs dashboard +│ └── dashboards-provisioner.yml +├── tasks/ +│ ├── main.yml # Orchestrates setup + deploy +│ ├── setup.yml # Dirs, templates, files +│ └── deploy.yml # docker compose up + health waits +└── templates/ + ├── docker-compose.yml.j2 + ├── loki-config.yml.j2 + ├── promtail-config.yml.j2 + ├── prometheus-config.yml.j2 # NEW + └── grafana-datasources.yml.j2 # NEW +``` + +### New Variables (`defaults/main.yml`) + +```yaml +# Prometheus +prometheus_version: "v3.9.0" +prometheus_port: 9090 
+prometheus_retention_days: "15d" +prometheus_retention_size: "10GB" +prometheus_scrape_interval: "15s" +prometheus_memory_limit: "1g" +prometheus_cpu_limit: "1.0" + +prometheus_scrape_targets: + - job: "prometheus" + targets: ["localhost:9090"] + path: "/metrics" + - job: "loki" + targets: ["loki:3100"] + path: "/metrics" + - job: "grafana" + targets: ["grafana:3000"] + path: "/metrics" + - job: "app" + targets: ["app-python:8000"] + path: "/metrics" +``` + +### Templated Prometheus Config (`prometheus-config.yml.j2`) + +```yaml +global: + scrape_interval: {{ prometheus_scrape_interval }} + evaluation_interval: {{ prometheus_scrape_interval }} + +scrape_configs: +{% for target in prometheus_scrape_targets %} + - job_name: '{{ target.job }}' + static_configs: + - targets: {{ target.targets }} + metrics_path: '{{ target.path }}' +{% endfor %} +``` + +### Grafana Datasource Provisioning (`grafana-datasources.yml.j2`) + +```yaml +apiVersion: 1 + +datasources: + - name: Loki + type: loki + access: proxy + url: http://loki:{{ loki_port }} + isDefault: false + editable: true + + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:{{ prometheus_port }} + isDefault: true + editable: true +``` + +### First Run Evidence + +``` +TASK [monitoring : Template Docker Compose file] changed: [localhost] +TASK [monitoring : Template Prometheus configuration] changed: [localhost] +TASK [monitoring : Template Grafana datasources] changed: [localhost] +TASK [monitoring : Copy Grafana dashboard JSON] changed: [localhost] +TASK [monitoring : Copy Grafana logs dashboard JSON] changed: [localhost] +TASK [monitoring : Deploy monitoring stack] changed: [localhost] + +PLAY RECAP +localhost : ok=32 changed=3 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 +``` + +### Second Run — Idempotency Evidence + +``` +PLAY RECAP +localhost : ok=31 changed=0 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 +``` + +`changed=0` on second run confirms full idempotency ✅ + 
+--- + +## Metrics vs Logs — When to Use Each + +| Scenario | Use | +|----------|-----| +| "How many requests/sec is the app handling?" | Metrics (rate counter) | +| "Why did this specific request fail at 14:32?" | Logs (structured JSON) | +| "Is p95 latency within SLA?" | Metrics (histogram) | +| "What was the exact error message for user X?" | Logs (field filter) | +| "Is the service up right now?" | Metrics (`up` gauge) | +| "What sequence of events led to this crash?" | Logs (correlated trace) | + +**Together:** Metrics alert you that something is wrong; logs tell you exactly what happened and why. + +--- + +## Challenges & Solutions + +**Challenge 1: Docker image 404 on app `/metrics`** +The running container was using the old image without the metrics endpoint. Fixed by rebuilding with `docker build`, pushing to Docker Hub, and force-recreating the container with `docker compose up -d --force-recreate app-python`. + +![App target DOWN before fix](screenshots/targets-app-down.png) + +**Challenge 2: Prometheus image tag `3.9.0` not found** +Docker Hub uses the `v` prefix for Prometheus tags (`v3.9.0` not `3.9.0`). The Ansible variable was updated to `"v3.9.0"` to match the actual tag. 
+ + +--- + +## Summary + +| Component | Version | Purpose | +|-----------|---------|---------| +| Prometheus | v3.9.0 | Metrics scraping and TSDB storage | +| Grafana | 12.3.1 | Visualization and dashboards | +| Loki | 3.0.0 | Log storage (from Lab 7) | +| Promtail | 3.0.0 | Log collection (from Lab 7) | +| prometheus_client | 0.23.1 | Python app instrumentation | + +**Key results:** +- Scrape targets: 4 (all UP) +- Dashboard panels: 6 +- Metrics implemented: Counter × 2, Histogram × 1, Gauge × 1 +- Retention: 15 days / 10GB (Prometheus), 7 days (Loki) +- Ansible idempotency: ✅ confirmed (`changed=0` on second run) \ No newline at end of file diff --git a/monitoring/docs/screenshots/dashboard-panels.png b/monitoring/docs/screenshots/dashboard-panels.png new file mode 100644 index 0000000000..c44cb4663c Binary files /dev/null and b/monitoring/docs/screenshots/dashboard-panels.png differ diff --git a/monitoring/docs/screenshots/dashboard-persistence.png b/monitoring/docs/screenshots/dashboard-persistence.png new file mode 100644 index 0000000000..c508913226 Binary files /dev/null and b/monitoring/docs/screenshots/dashboard-persistence.png differ diff --git a/monitoring/docs/screenshots/grafana-datasource.png b/monitoring/docs/screenshots/grafana-datasource.png new file mode 100644 index 0000000000..9f2783980d Binary files /dev/null and b/monitoring/docs/screenshots/grafana-datasource.png differ diff --git a/monitoring/docs/screenshots/promql-up-query.png b/monitoring/docs/screenshots/promql-up-query.png new file mode 100644 index 0000000000..42e13461de Binary files /dev/null and b/monitoring/docs/screenshots/promql-up-query.png differ diff --git a/monitoring/docs/screenshots/targets-all-up.png b/monitoring/docs/screenshots/targets-all-up.png new file mode 100644 index 0000000000..cb18dd8be1 Binary files /dev/null and b/monitoring/docs/screenshots/targets-all-up.png differ diff --git a/monitoring/docs/screenshots/targets-app-down.png 
b/monitoring/docs/screenshots/targets-app-down.png new file mode 100644 index 0000000000..8f68bf8dee Binary files /dev/null and b/monitoring/docs/screenshots/targets-app-down.png differ diff --git a/monitoring/grafana/dashboards/app-dashboard.json b/monitoring/grafana/dashboards/app-dashboard.json new file mode 100644 index 0000000000..cc412211dd --- /dev/null +++ b/monitoring/grafana/dashboards/app-dashboard.json @@ -0,0 +1,534 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "cff932sxwmps0e" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cff932sxwmps0e" + }, + "editorMode": "code", + "expr": "up{job=\"app\"}", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cff932sxwmps0e" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 5, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "sort": "desc", + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cff932sxwmps0e" + }, + "editorMode": "code", + "expr": "sum by (status_code) (rate(http_requests_total[5m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Status Code Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cff932sxwmps0e" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + 
"calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cff932sxwmps0e" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Request Duration p95", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cff932sxwmps0e" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cff932sxwmps0e" + }, + "editorMode": "code", + "expr": "http_requests_in_progress", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Active Requests", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cff932sxwmps0e" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": 
false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cff932sxwmps0e" + }, + "editorMode": "code", + "expr": "sum(rate(http_requests_total[5m])) by (endpoint)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "cff932sxwmps0e" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + 
"color": "green",
+                "value": 0
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 16
+      },
+      "id": 2,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "12.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "cff932sxwmps0e"
+          },
+          "editorMode": "code",
+          "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m]))",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Error Rate",
+      "type": "timeseries"
+    }
+  ],
+  "preload": false,
+  "schemaVersion": 42,
+  "tags": [],
+  "templating": {
+    "list": []
+  },
+  "time": {
+    "from": "now-30m",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "browser",
+  "title": "Prometheus Dashboard",
+  "uid": "admgtq4",
+  "version": 8
+}
\ No newline at end of file
diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml
new file mode 100644
index 0000000000..a37795ae6a
--- /dev/null
+++ b/monitoring/prometheus/prometheus.yml
@@ -0,0 +1,23 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+scrape_configs:
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'app'
+    static_configs:
+      - targets: ['app-python:8000']
+    metrics_path: '/metrics'
+
+  - job_name: 'loki'
+    static_configs:
+      - targets: ['loki:3100']
+    metrics_path: '/metrics'
+
+  - job_name: 'grafana'
+    static_configs:
+      - targets: ['grafana:3000']
+    metrics_path: '/metrics'
\ No newline at end of file