diff --git a/ansible/playbooks/deploy-monitoring.yml b/ansible/playbooks/deploy-monitoring.yml new file mode 100644 index 0000000000..35ed05e5e0 --- /dev/null +++ b/ansible/playbooks/deploy-monitoring.yml @@ -0,0 +1,10 @@ +--- +- name: Deploy monitoring stack + hosts: webservers + become: true + + roles: + - role: monitoring + tags: + - monitoring + - monitoring_deploy diff --git a/ansible/roles/monitoring/defaults/main.yml b/ansible/roles/monitoring/defaults/main.yml new file mode 100644 index 0000000000..ec9d369526 --- /dev/null +++ b/ansible/roles/monitoring/defaults/main.yml @@ -0,0 +1,12 @@ +--- +monitoring_project_dir: /opt/monitoring +monitoring_loki_image: grafana/loki:3.0.0 +monitoring_promtail_image: grafana/promtail:3.0.0 +monitoring_grafana_image: grafana/grafana:12.3.1 +monitoring_loki_port: 3100 +monitoring_promtail_port: 9080 +monitoring_grafana_port: 3000 +monitoring_retention_period: 168h +monitoring_grafana_admin_user: admin +monitoring_grafana_admin_password: change-me-in-vault +monitoring_compose_project_name: devops-monitoring diff --git a/ansible/roles/monitoring/meta/main.yml b/ansible/roles/monitoring/meta/main.yml new file mode 100644 index 0000000000..2ccc844348 --- /dev/null +++ b/ansible/roles/monitoring/meta/main.yml @@ -0,0 +1,4 @@ +--- +# Docker Engine and Compose plugin are required for monitoring stack deployment. +dependencies: + - role: docker diff --git a/ansible/roles/monitoring/tasks/main.yml b/ansible/roles/monitoring/tasks/main.yml new file mode 100644 index 0000000000..8045115f61 --- /dev/null +++ b/ansible/roles/monitoring/tasks/main.yml @@ -0,0 +1,77 @@ +--- +- name: Deploy Loki monitoring stack + tags: + - monitoring_deploy + block: + - name: Create monitoring directory tree + ansible.builtin.file: + path: "{{ item }}" + state: directory + mode: "0755" + loop: + - "{{ monitoring_project_dir }}" + - "{{ monitoring_project_dir }}/loki" + - "{{ monitoring_project_dir }}/promtail" + - "{{ monitoring_project_dir }}/grafana" + - "{{ monitoring_project_dir }}/grafana/provisioning" + - "{{ monitoring_project_dir }}/grafana/provisioning/datasources" + + - name: Render Loki configuration + ansible.builtin.template: + src: loki-config.yml.j2 + dest: "{{ monitoring_project_dir }}/loki/config.yml" + mode: "0644" + + - name: Render Promtail configuration + ansible.builtin.template: + src: promtail-config.yml.j2 + dest: "{{ monitoring_project_dir }}/promtail/config.yml" + mode: "0644" + + - name: Render Grafana datasource provisioning + ansible.builtin.template: + src: grafana-datasource.yml.j2 + dest: "{{ monitoring_project_dir }}/grafana/provisioning/datasources/loki.yml" + mode: "0644" + + - name: Render monitoring docker-compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ monitoring_project_dir }}/docker-compose.yml" + mode: "0644" + + - name: Deploy monitoring stack with Docker Compose v2 + community.docker.docker_compose_v2: + project_src: "{{ monitoring_project_dir }}" + files: + - docker-compose.yml + pull: always + remove_orphans: true + recreate: auto + state: present + register: monitoring_compose_result + + - name: Wait for Loki readiness endpoint + ansible.builtin.uri: + url: "http://127.0.0.1:{{ monitoring_loki_port }}/ready" + status_code: 200 + timeout: 10 + register: monitoring_loki_ready + retries: 10 + delay: 5 + until: monitoring_loki_ready.status == 200 + + - name: Wait for Grafana health endpoint + ansible.builtin.uri: + url: "http://127.0.0.1:{{ monitoring_grafana_port }}/api/health" + status_code: 200 + timeout: 10 + register: monitoring_grafana_health + retries: 12 + delay: 5 + until: monitoring_grafana_health.status == 200 + + rescue: + - name: Show monitoring deployment diagnostics + ansible.builtin.debug: + var: monitoring_compose_result diff --git a/ansible/roles/monitoring/templates/docker-compose.yml.j2 b/ansible/roles/monitoring/templates/docker-compose.yml.j2 new file mode 100644 index 0000000000..1edc2d13fe --- /dev/null +++ b/ansible/roles/monitoring/templates/docker-compose.yml.j2 @@ -0,0 +1,67 @@ +name: {{ monitoring_compose_project_name }} + +services: + loki: + image: {{ monitoring_loki_image }} + container_name: devops-loki + command: -config.file=/etc/loki/config.yml + ports: + - "{{ monitoring_loki_port }}:{{ monitoring_loki_port }}" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + labels: + logging: "promtail" + app: "devops-loki" + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:{{ monitoring_loki_port }}/ready >/dev/null 2>&1 || curl -fsS http://localhost:{{ monitoring_loki_port }}/ready >/dev/null 2>&1"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 20s + + promtail: + image: {{ monitoring_promtail_image }} + container_name: devops-promtail + command: -config.file=/etc/promtail/config.yml + ports: + - "{{ monitoring_promtail_port }}:{{ monitoring_promtail_port }}" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + depends_on: + loki: + condition: service_healthy + labels: + logging: "promtail" + app: "devops-promtail" + + grafana: + image: {{ monitoring_grafana_image }} + container_name: devops-grafana + ports: + - "{{ monitoring_grafana_port }}:{{ monitoring_grafana_port }}" + environment: + GF_AUTH_ANONYMOUS_ENABLED: "false" + GF_SECURITY_ADMIN_USER: "{{ monitoring_grafana_admin_user }}" + GF_SECURITY_ADMIN_PASSWORD: "{{ monitoring_grafana_admin_password }}" + GF_USERS_ALLOW_SIGN_UP: "false" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + depends_on: + loki: + condition: service_healthy + labels: + logging: "promtail" + app: "devops-grafana" + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:{{ monitoring_grafana_port }}/api/health >/dev/null 2>&1 || curl -fsS http://localhost:{{ monitoring_grafana_port }}/api/health >/dev/null 2>&1"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 30s + +volumes: + loki-data: + grafana-data: diff --git a/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 b/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 new file mode 100644 index 0000000000..f13a7c7c40 --- /dev/null +++ b/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 @@ -0,0 +1,10 @@ +apiVersion: 1 + +datasources: + - name: Loki + uid: loki + type: loki + access: proxy + url: http://loki:{{ monitoring_loki_port }} + isDefault: true + editable: true diff --git a/ansible/roles/monitoring/templates/loki-config.yml.j2 b/ansible/roles/monitoring/templates/loki-config.yml.j2 new file mode 100644 index 0000000000..fe0408919c --- /dev/null +++ b/ansible/roles/monitoring/templates/loki-config.yml.j2 @@ -0,0 +1,43 @@ +auth_enabled: false + +server: + http_listen_port: {{ monitoring_loki_port }} + log_level: info + +common: + path_prefix: /loki + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: "2024-01-01" + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + tsdb_shipper: + active_index_directory: /loki/index + cache_location: /loki/index_cache + filesystem: + directory: /loki/chunks + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + delete_request_store: filesystem + +limits_config: + retention_period: {{ monitoring_retention_period }} + reject_old_samples: true + reject_old_samples_max_age: {{ monitoring_retention_period }} + +analytics: + reporting_enabled: false diff --git a/ansible/roles/monitoring/templates/promtail-config.yml.j2 b/ansible/roles/monitoring/templates/promtail-config.yml.j2 new file mode 100644 index 0000000000..f7c374f3ba --- /dev/null +++ b/ansible/roles/monitoring/templates/promtail-config.yml.j2 @@ -0,0 +1,29 @@ +server: + http_listen_port: {{ monitoring_promtail_port }} + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:{{ monitoring_loki_port }}/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + pipeline_stages: + - docker: {} + relabel_configs: + - source_labels: ["__meta_docker_container_name"] + regex: "/(.*)" + target_label: container + - source_labels: ["__meta_docker_container_label_app"] + target_label: app + - source_labels: ["__meta_docker_container_label_com_docker_compose_service"] + target_label: service + - source_labels: ["__meta_docker_container_log_stream"] + target_label: stream + - target_label: job + replacement: docker diff --git a/app_python/README.md b/app_python/README.md index 25a9880577..58894af2b1 100644 --- a/app_python/README.md +++ b/app_python/README.md @@ -54,6 +54,25 @@ Configuration is done via environment variables: All configuration is read in `app.py` at startup, so restart the application after changing environment variables. +## Structured Logging (Lab 7) + +The service writes JSON logs to `stdout` for centralized log collection (Loki/Promtail). + +Example: + +```json +{"timestamp":"2026-03-12T20:45:10.123456+00:00","level":"INFO","logger":"devops-info-service","message":"HTTP request handled","method":"GET","path":"/health","status_code":200,"client_ip":"127.0.0.1","duration_ms":1.11} +``` + +Each request log includes: + +- `method` +- `path` +- `status_code` +- `client_ip` +- `duration_ms` +- `user_agent` + ## Docker How to use the containerized application (patterns): @@ -66,4 +85,3 @@ How to use the containerized application (patterns): Notes: - The container exposes port `5002` by default (see `app.py`). - The image runs as a non-root user for improved security. - diff --git a/app_python/app.py b/app_python/app.py index 8fe89858ca..c3712632ec 100644 --- a/app_python/app.py +++ b/app_python/app.py @@ -3,14 +3,16 @@ Main application module for Lab 1. """ +import json import logging import os import platform import socket +import time from datetime import datetime, timezone from typing import Any, Dict -from flask import Flask, jsonify, request +from flask import Flask, g, jsonify, request app = Flask(__name__) @@ -26,13 +28,53 @@ START_TIME = datetime.now(timezone.utc) -# Logging configuration -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +class JsonFormatter(logging.Formatter): + """Render application logs as one-line JSON objects.""" + + def format(self, record: logging.LogRecord) -> str: + payload: Dict[str, Any] = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "level": record.levelname, + "logger": record.name, + "message": record.getMessage(), + } + + context = getattr(record, "context", None) + if isinstance(context, dict): + payload.update(context) + + if record.exc_info: + payload["exception"] = self.formatException(record.exc_info) + + return json.dumps(payload) + + +def configure_logging() -> logging.Logger: + """Configure JSON logging for structured log aggregation.""" + app_logger = logging.getLogger("devops-info-service") + app_logger.setLevel(logging.INFO) + app_logger.propagate = False + + if not app_logger.handlers: + handler = logging.StreamHandler() + handler.setFormatter(JsonFormatter()) + app_logger.addHandler(handler) + + logging.getLogger("werkzeug").setLevel(logging.WARNING) + return app_logger + + +logger = configure_logging() +logger.info( + "Application starting", + extra={ + "context": { + "event": "startup", + "host": HOST, + "port": PORT, + } + }, ) -logger = logging.getLogger(__name__) -logger.info("Application starting...") def get_uptime() -> Dict[str, Any]: @@ -72,6 +114,26 @@ def get_request_info() -> Dict[str, Any]: } +@app.before_request +def before_request_logging() -> None: + """Track request start time for latency logging.""" + g.request_started_at = time.perf_counter() + + +@app.after_request +def after_request_logging(response): + """Emit one structured log entry per HTTP request.""" + request_info = get_request_info() + request_info["status_code"] = response.status_code + + started_at = getattr(g, "request_started_at", None) + if started_at is not None: + request_info["duration_ms"] = round((time.perf_counter() - started_at) * 1000, 2) + + logger.info("HTTP request handled", extra={"context": request_info}) + return response + + @app.route("/", methods=["GET"]) def index(): """Main endpoint - service and system information.""" @@ -108,7 +170,6 @@ def index(): ], } - logger.info("Handled main / request") return jsonify(response) @@ -121,14 +182,15 @@ def health(): "timestamp": datetime.now(timezone.utc).isoformat(), "uptime_seconds": uptime["seconds"], } - logger.info("Health check OK") return jsonify(payload), 200 @app.errorhandler(404) def not_found(error): """Return JSON for 404 errors.""" - logger.warning("404 Not Found: %s %s", request.method, request.path) + request_info = get_request_info() + request_info["status_code"] = 404 + logger.warning("Not found", extra={"context": request_info}) return ( jsonify( { @@ -143,7 +205,9 @@ def not_found(error): @app.errorhandler(500) def internal_error(error): """Return JSON for 500 errors.""" - logger.exception("500 Internal Server Error") + request_info = get_request_info() + request_info["status_code"] = 500 + logger.error("Internal server error", extra={"context": request_info}) return ( jsonify( { @@ -156,5 +220,8 @@ def internal_error(error): if __name__ == "__main__": - logger.info("Starting Flask development server on %s:%s", HOST, PORT) + logger.info( + "Starting Flask development server", + extra={"context": {"event": "flask_start", "host": HOST, "port": PORT}}, + ) app.run(host=HOST, port=PORT, debug=DEBUG) diff --git a/monitoring/.env.example b/monitoring/.env.example new file mode 100644 index 0000000000..92fa066460 --- /dev/null +++ b/monitoring/.env.example @@ -0,0 +1,3 @@ +# Copy to monitoring/.env and change the password before deployment. +GRAFANA_ADMIN_USER=admin +GRAFANA_ADMIN_PASSWORD=change-me-please diff --git a/monitoring/.gitignore b/monitoring/.gitignore new file mode 100644 index 0000000000..4c49bd78f1 --- /dev/null +++ b/monitoring/.gitignore @@ -0,0 +1 @@ +.env diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml new file mode 100644 index 0000000000..6dd75f6f2e --- /dev/null +++ b/monitoring/docker-compose.yml @@ -0,0 +1,161 @@ +name: devops-monitoring + +services: + loki: + image: grafana/loki:3.0.0 + container_name: devops-loki + command: -config.file=/etc/loki/config.yml + ports: + - "3100:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + networks: + - logging + labels: + logging: "promtail" + app: "devops-loki" + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:3100/ready >/dev/null 2>&1 || curl -fsS http://localhost:3100/ready >/dev/null 2>&1"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 20s + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + + promtail: + image: grafana/promtail:3.0.0 + container_name: devops-promtail + command: -config.file=/etc/promtail/config.yml + ports: + - "9080:9080" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + networks: + - logging + depends_on: + loki: + condition: service_healthy + labels: + logging: "promtail" + app: "devops-promtail" + deploy: + resources: + limits: + cpus: "0.5" + memory: 512M + reservations: + cpus: "0.10" + memory: 128M + + grafana: + image: grafana/grafana:12.3.1 + container_name: devops-grafana + ports: + - "3000:3000" + environment: + GF_AUTH_ANONYMOUS_ENABLED: "false" + GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}" + GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-change-me-in-dot-env}" + GF_USERS_ALLOW_SIGN_UP: "false" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + networks: + - logging + depends_on: + loki: + condition: service_healthy + labels: + logging: "promtail" + app: "devops-grafana" + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/health >/dev/null 2>&1 || curl -fsS http://localhost:3000/api/health >/dev/null 2>&1"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 30s + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + + devops-python: + build: + context: ../app_python + image: devops-python:lab07 + container_name: devops-python + ports: + - "8000:8000" + environment: + HOST: "0.0.0.0" + PORT: "8000" + DEBUG: "false" + networks: + - logging + labels: + logging: "promtail" + app: "devops-python" + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:8000/health >/dev/null 2>&1 || curl -fsS http://localhost:8000/health >/dev/null 2>&1"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 20s + deploy: + resources: + limits: + cpus: "0.5" + memory: 512M + reservations: + cpus: "0.10" + memory: 128M + + devops-go: + build: + context: ../app_go + image: devops-go:lab07 + container_name: devops-go + ports: + - "8001:8001" + environment: + PORT: "8001" + networks: + - logging + labels: + logging: "promtail" + app: "devops-go" + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:8001/health >/dev/null 2>&1 || curl -fsS http://localhost:8001/health >/dev/null 2>&1"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 20s + deploy: + resources: + limits: + cpus: "0.5" + memory: 512M + reservations: + cpus: "0.10" + memory: 128M + +networks: + logging: + driver: bridge + +volumes: + loki-data: + grafana-data: diff --git a/monitoring/docs/LAB07.md b/monitoring/docs/LAB07.md new file mode 100644 index 0000000000..eabf44b4bf --- /dev/null +++ b/monitoring/docs/LAB07.md @@ -0,0 +1,326 @@ +## Overview + +In Lab 7, I deployed a centralized logging stack based on Loki, Promtail, and Grafana, then integrated course applications and structured JSON logs for observability. + +Stack versions: + +- Loki `3.0.0` +- Promtail `3.0.0` +- Grafana `12.3.1` + +--- + +## Architecture + +```text ++-------------------+ +---------------------+ +| devops-python app | | devops-go app | +| labels: | | labels: | +| logging=promtail | | logging=promtail | +| app=devops-python | | app=devops-go | ++---------+---------+ +----------+----------+ + | | + +------------- Docker logs ------+ + (docker.sock) + | + +-----v------+ + | Promtail | + | :9080 | + +-----+------+ + | + push logs | /loki/api/v1/push + v + +-----+------+ + | Loki 3.0 | + | TSDB + FS | + | :3100 | + +-----+------+ + | + query | LogQL + v + +-----+------+ + | Grafana | + | :3000 | + +------------+ +``` + +Key design choices: + +- Promtail uses Docker service discovery and keeps only containers with label `logging=promtail`. +- Logs are labeled with `app`, `container`, `service`, `stream`, and `job` for efficient LogQL filtering. +- Loki stores data with TSDB schema v13 and 7-day retention. + +--- + +## Setup Guide + +1. Prepare Grafana credentials: + +```bash +cd monitoring +cp .env.example .env +# edit .env and set a strong GRAFANA_ADMIN_PASSWORD +``` + +2. Start the full stack: + +```bash +docker compose up -d --build +docker compose ps +``` + +3. Verify components: + +```bash +curl http://localhost:3100/ready +curl http://localhost:9080/targets +curl http://localhost:3000/api/health +``` + +4. Open Grafana: + +- URL: `http://localhost:3000` +- Login with values from `.env` + +Data source is provisioned automatically from `monitoring/grafana/provisioning/datasources/loki.yml`. + +--- + +## Configuration + +### Docker Compose + +File: `monitoring/docker-compose.yml` + +What is configured: + +- Core stack: `loki`, `promtail`, `grafana` +- App services: `devops-python`, `devops-go` +- Shared network `logging` +- Persistent volumes: `loki-data`, `grafana-data` +- Resource constraints in `deploy.resources` +- Healthchecks for Loki and Grafana +- Labels for Promtail filtering (`logging=promtail`, `app=...`) + +### Loki (TSDB) + +File: `monitoring/loki/config.yml` + +Important settings: + +- `schema_config.store: tsdb` +- `schema: v13` +- `object_store: filesystem` +- retention: `limits_config.retention_period: 168h` +- compactor retention enabled + +Why this matters: + +- TSDB in Loki 3.x provides better query performance and scale behavior. +- v13 schema is the expected modern schema for TSDB deployments. +- 7-day retention controls disk usage. + +### Promtail + +File: `monitoring/promtail/config.yml` + +Important settings: + +- client endpoint: `http://loki:3100/loki/api/v1/push` +- Docker SD: `docker_sd_configs` with `unix:///var/run/docker.sock` +- filter by label `logging=promtail` +- relabeling extracts `container`, `app`, `service`, `stream` +- static `job="docker"` label for baseline queries + +--- + +## Application Logging + +Python app (`app_python/app.py`) now emits structured JSON logs. + +Implemented: + +- custom `JsonFormatter` +- startup event logs (`event=startup`) +- request logging via `before_request` + `after_request` +- request context fields: + - `method` + - `path` + - `status_code` + - `client_ip` + - `duration_ms` + - `user_agent` + +Example log line: + +```json +{"timestamp":"2026-03-12T20:45:10.123456+00:00","level":"INFO","logger":"devops-info-service","message":"HTTP request handled","client_ip":"172.19.0.1","user_agent":"curl/8.7.1","method":"GET","path":"/health","status_code":200,"duration_ms":1.11} +``` + +--- + +## Dashboard + +Create dashboard with 4 panels in Grafana: + +1. Logs Table +- Query: `{app=~"devops-.*"}` +- Visualization: Logs + +2. Request Rate +- Query: `sum by (app) (rate({app=~"devops-.*"}[1m]))` +- Visualization: Time series + +3. Error Logs +- Query: `{app=~"devops-.*"} | json | level="ERROR"` +- Visualization: Logs + +4. Log Level Distribution +- Query: `sum by (level) (count_over_time({app=~"devops-.*"} | json [5m]))` +- Visualization: Pie chart (or Stat) + +Useful Explore queries: + +- `{job="docker"}` +- `{app="devops-python"}` +- `{app="devops-go"}` +- `{app="devops-python"} | json | method="GET"` +- `{app="devops-python"} |= "ERROR"` + +--- + +## Production Config + +Implemented production-oriented hardening: + +- Resource limits/reservations for all services +- Grafana anonymous auth disabled (`GF_AUTH_ANONYMOUS_ENABLED=false`) +- Admin password sourced from `.env` (not committed) +- Healthchecks for Loki and Grafana +- Retention policy set to 7 days in Loki + +--- + +## Bonus Automation (Ansible) + +Implemented files: + +- `ansible/roles/monitoring/defaults/main.yml` +- `ansible/roles/monitoring/tasks/main.yml` +- `ansible/roles/monitoring/templates/*.j2` +- `ansible/playbooks/deploy-monitoring.yml` + +What the role does: + +- creates monitoring directory structure on target VM +- templates Loki/Promtail/Grafana datasource configs with Jinja2 +- templates monitoring Docker Compose file +- deploys stack with `community.docker.docker_compose_v2` +- waits for Loki and Grafana health endpoints + +Run commands: + +```bash +cd ansible +ansible-playbook playbooks/deploy-monitoring.yml --ask-vault-pass +ansible-playbook playbooks/deploy-monitoring.yml --ask-vault-pass +``` + +Expected idempotency behavior: + +- first run: `changed` on create/template/deploy tasks +- second run: mostly `ok` (no config drift) + +--- + +## Testing + +Generate logs: + +```bash +for i in {1..20}; do curl -s http://localhost:8000/ >/dev/null; done +for i in {1..20}; do curl -s http://localhost:8000/health >/dev/null; done +for i in {1..20}; do curl -s http://localhost:8001/ >/dev/null; done +for i in {1..20}; do curl -s http://localhost:8001/health >/dev/null; done +``` + +Verify stack: + +```bash +docker compose ps +docker compose logs --tail=20 devops-python +curl -f http://localhost:3100/ready +curl -f http://localhost:9080/targets +curl -f http://localhost:3000/api/health +``` + +![Verify Stack](./screenshots/verify_stack.png) + +Expected results: + +- all services are `Up` +- Loki returns `ready` +- Promtail has active Docker targets +- Grafana API health returns `ok` +- Logs visible in Grafana Explore for both applications + +--- + +## Research Notes + +1. How Loki differs from Elasticsearch: +- Loki indexes labels, not full log content, so storage is typically cheaper. +- Elasticsearch indexes full text and is more expensive for high log volume. + +2. What labels are and why they matter: +- Labels are metadata dimensions (for example `app`, `container`, `job`). +- They define streams and make LogQL filtering and aggregation fast. + +3. How Promtail discovers containers: +- Promtail reads Docker metadata via `docker_sd_configs` and `docker.sock`. +- Relabeling maps Docker metadata to Loki labels. + +--- + +## Challenges & Solutions + +1. External access checks can fail due host firewall/security group. +- Solution: verify Loki/Promtail/Grafana locally on the Docker host with `localhost` endpoints. + +2. Promtail should avoid scraping every container. +- Solution: explicit label gate `logging=promtail`. + +3. Secure Grafana credentials in Compose. +- Solution: use `.env` (ignored by git) plus committed `.env.example` template. + +--- + +## Evidence Checklist + +Add screenshots to `monitoring/docs/screenshots/`: + +- `json-log-output.png` (terminal output showing JSON logs from Python app) +- `explore-both-apps.png` (Grafana Explore with application logs) +- `dashboard-four-panels.png` (dashboard with all required panels) +- `compose-healthy.png` (`docker compose ps` showing healthy services) +- `grafana_login.png` (login page, no anonymous access) + +### compose-healthy.png + +![Compose Healthy](./screenshots/compose-healthy.png) + +### json-log-output.png + +![JSON Log Output](./screenshots/json-log-output.png) + +### explore-both-apps.png + +![Explore Both Apps](./screenshots/explore-both-apps.png) + +### dashboard-four-panels.png + +![Dashboard Four Panels](./screenshots/dashboard-four-panels.png) + +### grafana_login.png + +![Grafana Login](./screenshots/grafana_login.png) diff --git a/monitoring/docs/screenshots/.gitkeep b/monitoring/docs/screenshots/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/monitoring/docs/screenshots/compose-healthy.png b/monitoring/docs/screenshots/compose-healthy.png new file mode 100644 index 0000000000..3c1b716c79 Binary files /dev/null and b/monitoring/docs/screenshots/compose-healthy.png differ diff --git a/monitoring/docs/screenshots/dashboard-four-panels.png b/monitoring/docs/screenshots/dashboard-four-panels.png new file mode 100644 index 0000000000..39b0e9a0c3 Binary files /dev/null and b/monitoring/docs/screenshots/dashboard-four-panels.png differ diff --git a/monitoring/docs/screenshots/explore-both-apps.png b/monitoring/docs/screenshots/explore-both-apps.png new file mode 100644 index 0000000000..fdce0fed55 Binary files /dev/null and b/monitoring/docs/screenshots/explore-both-apps.png differ diff --git a/monitoring/docs/screenshots/grafana_login.png b/monitoring/docs/screenshots/grafana_login.png new file mode 100644 index 0000000000..b5f72b7372 Binary files /dev/null and b/monitoring/docs/screenshots/grafana_login.png differ diff --git a/monitoring/docs/screenshots/json-log-output.png b/monitoring/docs/screenshots/json-log-output.png new file mode 100644 index 0000000000..0f822ba2ce Binary files /dev/null and b/monitoring/docs/screenshots/json-log-output.png differ diff --git a/monitoring/docs/screenshots/verify_stack.png b/monitoring/docs/screenshots/verify_stack.png new file mode 100644 index 0000000000..e2853841e2 Binary files /dev/null and b/monitoring/docs/screenshots/verify_stack.png differ diff --git a/monitoring/grafana/provisioning/datasources/loki.yml b/monitoring/grafana/provisioning/datasources/loki.yml new file mode 100644 index 0000000000..fba0b1b8e0 --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/loki.yml @@ -0,0 +1,10 @@ +apiVersion: 1 + +datasources: + - name: Loki + uid: loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: true + editable: true diff --git a/monitoring/loki/config.yml b/monitoring/loki/config.yml new file mode 100644 index 0000000000..527ac3449b --- /dev/null +++ b/monitoring/loki/config.yml @@ -0,0 +1,43 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + log_level: info + +common: + path_prefix: /loki + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: "2024-01-01" + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + tsdb_shipper: + active_index_directory: /loki/index + cache_location: /loki/index_cache + filesystem: + directory: /loki/chunks + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + delete_request_store: filesystem + +limits_config: + retention_period: 168h + reject_old_samples: true + reject_old_samples_max_age: 168h + +analytics: + reporting_enabled: false diff --git a/monitoring/promtail/config.yml b/monitoring/promtail/config.yml new file mode 100644 index 0000000000..9bcb08d656 --- /dev/null +++ b/monitoring/promtail/config.yml @@ -0,0 +1,32 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: label + values: ["logging=promtail"] + pipeline_stages: + - docker: {} + relabel_configs: + - source_labels: ["__meta_docker_container_name"] + regex: "/(.*)" + target_label: container + - source_labels: ["__meta_docker_container_label_app"] + target_label: app + - source_labels: ["__meta_docker_container_label_com_docker_compose_service"] + target_label: service + - source_labels: ["__meta_docker_container_log_stream"] + target_label: stream + - target_label: job + replacement: docker