diff --git a/app_python/app.py b/app_python/app.py index e05047b8c4..c63be56286 100644 --- a/app_python/app.py +++ b/app_python/app.py @@ -1,118 +1,285 @@ -import os -import socket -import platform -import logging -from datetime import datetime, timezone -from flask import Flask, jsonify, request - -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -app = Flask(__name__) - -# Configuration -HOST = os.getenv('HOST', '0.0.0.0') -PORT = int(os.getenv('PORT', 5000)) -DEBUG = os.getenv('DEBUG', 'False').lower() == 'true' - -# Application start time -START_TIME = datetime.now(timezone.utc) - -def get_uptime(): - """Calculation of the application's operating time.""" - delta = datetime.now(timezone.utc) - START_TIME - seconds = int(delta.total_seconds()) - hours = seconds // 3600 - minutes = (seconds % 3600) // 60 - return { - 'seconds': seconds, - 'human': f"{hours} hours, {minutes} minutes" - } - -def get_system_info(): - """Collect system information.""" - return { - 'hostname': socket.gethostname(), - 'platform': platform.system(), - 'platform_version': get_platform_version(), - 'architecture': platform.machine(), - 'cpu_count': os.cpu_count(), - 'python_version': platform.python_version() - } - -@app.route('/') -def index(): - """Main endpoint - service and system information.""" - uptime = get_uptime() - - response = { - 'service': { - 'name': 'devops-info-service', - 'version': '1.0.0', - 'description': 'DevOps course info service', - 'framework': 'Flask' - }, - 'system': get_system_info(), - 'runtime': { - 'uptime_seconds': uptime['seconds'], - 'uptime_human': uptime['human'], - 'current_time': datetime.now(timezone.utc).isoformat(timespec="milliseconds").replace("+00:00", "Z"), - 'timezone': 'UTC' - }, - 'request': { - 'client_ip': request.remote_addr, - 'user_agent': request.headers.get('User-Agent'), - 'method': request.method, - 'path': request.path - }, - 
'endpoints': [ - {'path': '/', 'method': 'GET', 'description': 'Service information'}, - {'path': '/health', 'method': 'GET', 'description': 'Health check'} - ] - } - - logger.info("Request %s %s from %s", request.method, request.path, request.remote_addr) - return jsonify(response) - -@app.route('/health') -def health(): - """Health check endpoint for monitoring.""" - logger.info("Request %s %s from %s", request.method, request.path, request.remote_addr) - return jsonify({ - 'status': 'healthy', - 'timestamp': datetime.now(timezone.utc).isoformat(timespec="milliseconds").replace("+00:00", "Z"), - 'uptime_seconds': get_uptime()['seconds'] - }) - -@app.errorhandler(404) -def not_found(error): - return jsonify({ - 'error': 'Not Found', - 'message': 'Endpoint does not exist' - }), 404 - -@app.errorhandler(500) -def internal_error(error): - return jsonify({ - 'error': 'Internal Server Error', - 'message': 'An unexpected error occurred' - }), 500 - -def get_platform_version(): - """Return a platform version.""" - try: - if hasattr(platform, "freedesktop_os_release"): - info = platform.freedesktop_os_release() - if info.get("PRETTY_NAME"): - return info["PRETTY_NAME"] - except Exception: - pass - return platform.platform() - - -if __name__ == '__main__': - logger.info(f'Starting DevOps Info Service on {HOST}:{PORT}') - app.run(host=HOST, port=PORT, debug=DEBUG) \ No newline at end of file +import json +import logging +import os +import platform +import socket +import sys +import time +from datetime import datetime, timezone + +from flask import Flask, g, has_request_context, jsonify, request +from werkzeug.exceptions import HTTPException + +SERVICE_NAME = "devops-info-service" +SERVICE_VERSION = "1.0.0" +HOST = os.getenv("HOST", "0.0.0.0") +PORT = int(os.getenv("PORT", 5000)) +DEBUG = os.getenv("DEBUG", "False").lower() == "true" +START_TIME = datetime.now(timezone.utc) + + +class JSONFormatter(logging.Formatter): + """Format log records as structured JSON for 
Loki/Grafana.""" + + EXTRA_FIELDS = ( + "event", + "service", + "host", + "port", + "debug", + "method", + "path", + "endpoint", + "status_code", + "client_ip", + "user_agent", + "duration_ms", + ) + + def format(self, record: logging.LogRecord) -> str: + payload = { + "timestamp": datetime.fromtimestamp( + record.created, tz=timezone.utc + ).isoformat(timespec="milliseconds").replace("+00:00", "Z"), + "level": record.levelname, + "logger": record.name, + "message": record.getMessage(), + "service": SERVICE_NAME, + } + + for field in self.EXTRA_FIELDS: + value = getattr(record, field, None) + if value is not None: + payload[field] = value + + if record.exc_info: + payload["exception"] = self.formatException(record.exc_info) + + return json.dumps(payload, ensure_ascii=False) + + +def configure_logging() -> logging.Logger: + """Configure application logging to stdout in JSON format.""" + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter(JSONFormatter()) + + root_logger = logging.getLogger() + root_logger.handlers.clear() + root_logger.setLevel(logging.INFO) + root_logger.addHandler(handler) + + werkzeug_logger = logging.getLogger("werkzeug") + werkzeug_logger.handlers.clear() + werkzeug_logger.propagate = False + werkzeug_logger.disabled = True + + return logging.getLogger(SERVICE_NAME) + + +logger = configure_logging() +app = Flask(__name__) + + +def get_platform_version() -> str: + """Return a platform version.""" + try: + if hasattr(platform, "freedesktop_os_release"): + info = platform.freedesktop_os_release() + if info.get("PRETTY_NAME"): + return info["PRETTY_NAME"] + except Exception: + pass + return platform.platform() + + +def get_uptime() -> dict: + """Calculate the application's uptime.""" + delta = datetime.now(timezone.utc) - START_TIME + seconds = int(delta.total_seconds()) + hours = seconds // 3600 + minutes = (seconds % 3600) // 60 + return { + "seconds": seconds, + "human": f"{hours} hours, {minutes} minutes", + } + + +def 
get_system_info() -> dict: + """Collect system information.""" + return { + "hostname": socket.gethostname(), + "platform": platform.system(), + "platform_version": get_platform_version(), + "architecture": platform.machine(), + "cpu_count": os.cpu_count(), + "python_version": platform.python_version(), + } + + +def get_client_ip() -> str | None: + """Return the client IP, preferring X-Forwarded-For when present.""" + if not has_request_context(): + return None + + forwarded_for = request.headers.get("X-Forwarded-For", "") + if forwarded_for: + return forwarded_for.split(",")[0].strip() + return request.remote_addr + + +def build_request_log_context(status_code: int | None = None) -> dict: + """Build structured context for request-related logs.""" + context: dict[str, object] = {"event": "http.request", "service": SERVICE_NAME} + + if not has_request_context(): + return context + + context.update( + { + "method": request.method, + "path": request.path, + "endpoint": request.endpoint, + "client_ip": get_client_ip(), + "user_agent": request.headers.get("User-Agent"), + } + ) + + if status_code is not None: + context["status_code"] = status_code + + started_at = getattr(g, "request_started_at", None) + if started_at is not None: + context["duration_ms"] = round((time.perf_counter() - started_at) * 1000, 2) + + return context + + +@app.before_request +def track_request_start() -> None: + """Store request start time for structured logging.""" + g.request_started_at = time.perf_counter() + + +@app.after_request +def log_response(response): + """Log every completed HTTP request as JSON.""" + level = logging.INFO + if response.status_code >= 400: + level = logging.ERROR + + logger.log( + level, + "HTTP request completed", + extra=build_request_log_context(response.status_code), + ) + return response + + +@app.route("/") +def index(): + """Main endpoint - service and system information.""" + uptime = get_uptime() + + response = { + "service": { + "name": SERVICE_NAME, + 
"version": SERVICE_VERSION, + "description": "DevOps course info service", + "framework": "Flask", + }, + "system": get_system_info(), + "runtime": { + "uptime_seconds": uptime["seconds"], + "uptime_human": uptime["human"], + "current_time": datetime.now(timezone.utc) + .isoformat(timespec="milliseconds") + .replace("+00:00", "Z"), + "timezone": "UTC", + }, + "request": { + "client_ip": get_client_ip(), + "user_agent": request.headers.get("User-Agent"), + "method": request.method, + "path": request.path, + }, + "endpoints": [ + {"path": "/", "method": "GET", "description": "Service information"}, + {"path": "/health", "method": "GET", "description": "Health check"}, + ], + } + + return jsonify(response) + + +@app.route("/health") +def health(): + """Health check endpoint for monitoring.""" + return jsonify( + { + "status": "healthy", + "timestamp": datetime.now(timezone.utc) + .isoformat(timespec="milliseconds") + .replace("+00:00", "Z"), + "uptime_seconds": get_uptime()["seconds"], + } + ) + + +@app.errorhandler(404) +def not_found(error): + return ( + jsonify( + { + "error": "Not Found", + "message": "Endpoint does not exist", + } + ), + 404, + ) + + +@app.errorhandler(405) +def method_not_allowed(error): + return ( + jsonify( + { + "error": "Method Not Allowed", + "message": "Method is not allowed for this endpoint", + } + ), + 405, + ) + + +@app.errorhandler(Exception) +def handle_unexpected_error(error): + if isinstance(error, HTTPException): + return error + + logger.exception( + "Unhandled application error", + extra=build_request_log_context(status_code=500), + ) + return ( + jsonify( + { + "error": "Internal Server Error", + "message": "An unexpected error occurred", + } + ), + 500, + ) + + +if __name__ == "__main__": + logger.info( + "Application startup", + extra={ + "event": "app.startup", + "service": SERVICE_NAME, + "host": HOST, + "port": PORT, + "debug": DEBUG, + }, + ) + app.run(host=HOST, port=PORT, debug=DEBUG) diff --git 
a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml new file mode 100644 index 0000000000..b5b18eaa6a --- /dev/null +++ b/monitoring/docker-compose.yml @@ -0,0 +1,137 @@ +name: lab07-monitoring + +version: "3.8" + +x-default-resources: &default-resources + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + +services: + loki: + image: grafana/loki:3.0.0 + container_name: loki + command: + - -config.file=/etc/loki/config.yml + ports: + - "3100:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + networks: + - logging + labels: + logging: "promtail" + app: "devops-loki" + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 15s + <<: *default-resources + + promtail: + image: grafana/promtail:3.0.0 + container_name: promtail + command: + - -config.file=/etc/promtail/config.yml + ports: + - "9080:9080" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - promtail-positions:/tmp/positions + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + networks: + - logging + labels: + logging: "promtail" + app: "devops-promtail" + depends_on: + loki: + condition: service_healthy + restart: unless-stopped + <<: *default-resources + + grafana: + image: grafana/grafana:12.3.1 + container_name: grafana + ports: + - "3000:3000" + environment: + GF_AUTH_ANONYMOUS_ENABLED: "false" + GF_SECURITY_ALLOW_EMBEDDING: "false" + GF_USERS_ALLOW_SIGN_UP: "false" + GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}" + GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?set GRAFANA_ADMIN_PASSWORD in monitoring/.env}" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + 
networks: + - logging + labels: + logging: "promtail" + app: "devops-grafana" + depends_on: + loki: + condition: service_healthy + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 25s + <<: *default-resources + + app-python: + build: + context: ../app_python + dockerfile: Dockerfile + image: devops-info-service:lab07 + container_name: app-python + environment: + HOST: 0.0.0.0 + PORT: 8000 + PYTHONUNBUFFERED: "1" + ports: + - "8000:8000" + networks: + - logging + labels: + logging: "promtail" + app: "devops-python" + depends_on: + promtail: + condition: service_started + restart: unless-stopped + healthcheck: + test: + [ + "CMD", + "python", + "-c", + "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=3)", + ] + interval: 15s + timeout: 5s + retries: 5 + start_period: 10s + <<: *default-resources + +volumes: + loki-data: + grafana-data: + promtail-positions: + +networks: + logging: + driver: bridge diff --git a/monitoring/docs/LAB07.md b/monitoring/docs/LAB07.md new file mode 100644 index 0000000000..5c609ceba8 --- /dev/null +++ b/monitoring/docs/LAB07.md @@ -0,0 +1,343 @@ +# Lab 07 — Observability & Logging with Loki Stack + +## 1. Architecture + +This lab deploys a centralized logging stack based on the Grafana Loki ecosystem. + +### Components +- **Loki** — stores and indexes logs. +- **Promtail** — discovers Docker containers and ships logs to Loki. +- **Grafana** — visualizes and queries logs with LogQL. +- **Python application** — emits structured JSON logs to stdout. + +### Data flow +1. The Python container writes JSON logs to stdout. +2. Docker stores container logs. +3. Promtail discovers running containers through the Docker socket and reads their logs. +4. Promtail pushes log streams to Loki. +5. 
Grafana connects to Loki as a data source and visualizes logs in Explore and on dashboards. + +### Services in the stack +- `loki` +- `promtail` +- `grafana` +- `app-python` + +## 2. Setup Guide + +### Directory structure +```text +monitoring/ +├── docker-compose.yml +├── .env.example +├── loki/ +│ └── config.yml +├── promtail/ +│ └── config.yml +├── grafana/ +│ ├── dashboards/ +│ │ └── lab07-observability.json +│ └── provisioning/ +│ ├── dashboards/ +│ │ └── dashboard.yml +│ └── datasources/ +│ └── loki.yml +└── docs/ + ├── LAB07.md + └── screenshots/ +``` + +### Deployment steps +1. Open the `monitoring` directory. +2. Create a local `.env` file from the example: + ```bash + cp .env.example .env + ``` + On Windows PowerShell: + ```powershell + Copy-Item .env.example .env + ``` +3. Set a Grafana admin password in `.env`: + ```env + GRAFANA_ADMIN_PASSWORD=your_secure_password + ``` +4. Start the stack: + ```bash + docker compose up -d --build + ``` +5. Check service status: + ```bash + docker compose ps + ``` +6. Open Grafana: + - URL: `http://localhost:3000` + - Username: value of `GRAFANA_ADMIN_USER` from `.env` (passed to Grafana as `GF_SECURITY_ADMIN_USER`; defaults to `admin`) + - Password: value from `GRAFANA_ADMIN_PASSWORD` + +## 3. Configuration + +### Docker Compose +The stack is defined in `monitoring/docker-compose.yml`. + +Key implementation details: +- Loki uses image `grafana/loki:3.0.0` and exposes port `3100`. +- Promtail uses image `grafana/promtail:3.0.0` and mounts: + - `/var/run/docker.sock` + - Docker log storage +- Grafana uses image `grafana/grafana:12.3.1` and exposes port `3000`. +- All services share the `logging` network. +- Persistent named volumes are used for Loki data, Grafana data, and Promtail positions. +- Health checks are defined for Loki, Grafana, and the Python app. +- Resource limits and reservations are configured for production readiness. + +### Loki configuration +File: `monitoring/loki/config.yml` + +The Loki configuration includes: +- `auth_enabled: false` for local development.
+- HTTP server on port `3100`. +- TSDB storage backend with filesystem storage. +- Schema version `v13`, which is recommended for Loki 3.x. +- Retention period of **7 days** (`168h`). +- Compactor enabled to enforce retention. + +Why this configuration was chosen: +- TSDB is the recommended and more efficient index format for Loki 3.0. +- Filesystem storage is sufficient for a single-node lab setup. +- A 7-day retention policy is enough for lab validation while keeping storage bounded. + +### Promtail configuration +File: `monitoring/promtail/config.yml` + +Promtail is configured to: +- expose its own HTTP endpoint on port `9080`; +- store read offsets in a positions file; +- send logs to `http://loki:3100/loki/api/v1/push`; +- use Docker service discovery via `docker_sd_configs`; +- relabel metadata from Docker into queryable labels such as: + - `container` + - `compose_service` + - `app` + - `job` +- parse JSON/container log envelopes so logs appear correctly in Loki. + +Why this configuration matters: +- Docker service discovery allows Promtail to automatically find running containers. +- Labels make filtering in LogQL much easier. +- Structured labels and parsed JSON logs enable targeted observability use cases. + +### Grafana provisioning +Grafana is provisioned automatically with: +- a Loki data source via `grafana/provisioning/datasources/loki.yml`; +- a dashboard provider via `grafana/provisioning/dashboards/dashboard.yml`; +- a prebuilt dashboard file `grafana/dashboards/lab07-observability.json`. + +This avoids manual setup after each redeploy. + +## 4. Application Logging + +The Python application was updated to emit **JSON logs** to stdout. 
+ +### Logging requirements implemented +The logs include structured fields such as: +- `timestamp` +- `level` +- `logger` +- `message` +- `service` +- `event` +- `method` +- `path` +- `status_code` +- `client_ip` +- `user_agent` +- `duration_ms` + +### Events logged +- application startup; +- successful HTTP requests; +- failed requests, including `404` errors. + +### Why JSON logging was used +JSON logs are better for aggregation because they: +- can be parsed automatically; +- support filtering by fields instead of only plain text matching; +- integrate well with Loki and Grafana Explore. + +### Example JSON log +```json +{ + "timestamp": "2026-03-11T17:55:15.191Z", + "level": "ERROR", + "logger": "devops-info-service", + "message": "HTTP request completed", + "service": "devops-info-service", + "event": "http.request", + "method": "GET", + "path": "/not-found", + "status_code": 404, + "client_ip": "172.18.0.1", + "duration_ms": 0.09 +} +``` + +## 5. Dashboard + +A Grafana dashboard named **Lab 07 - Loki Observability** was created and provisioned automatically. + +### Panels implemented +1. **Recent Logs** + - shows recent logs from all services; + - based on Loki log streams. + +2. **Request Rate by App** + - time series showing log rate per application; + - based on `rate()` aggregation grouped by `app`. + +3. **Error Logs** + - displays only logs whose parsed `level` field is `ERROR`. + +4. **Log Level Distribution (Last 5m)** + - shows how log entries are distributed by level.
+ +### Example LogQL queries used +```logql +{job="docker"} +{app="devops-python"} +{app="devops-python"} | json +{app="devops-python"} | json | path="/health" +{app="devops-python"} | json | status_code=404 +{app=~"devops-.*"} +sum by (app) (rate({app=~"devops-.*"}[1m])) +sum by (level) (count_over_time({app=~"devops-.*"} | json [5m])) +``` + +### What was verified in Grafana +- the Loki data source was available and set as default; +- logs were visible from multiple containers; +- JSON fields were extracted and shown in Explore; +- filtering by `path` and `status_code` worked; +- the dashboard displayed real traffic data. + +## 6. Production Configuration + +Several production-readiness improvements were applied. + +### Security +- Anonymous Grafana access was disabled. +- Grafana credentials are supplied through environment variables. +- Secrets are stored in `.env`, which should **not** be committed. + +### Resource limits +Resource constraints were configured for the services to avoid uncontrolled resource usage. + +Example approach: +```yaml +resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M +``` + +### Health checks +Health checks were added for: +- Loki: `http://localhost:3100/ready` +- Grafana: `http://localhost:3000/api/health` +- Python app: `http://localhost:8000/health` + +### Retention +Loki retention was set to **7 days**. + +This provides a realistic baseline for centralized logging while keeping disk usage under control. + +## 7. 
Testing + +### Commands used to validate the stack +```bash +docker compose up -d --build +docker compose ps +docker compose logs app-python +``` + +### Traffic generation used for testing +PowerShell examples: +```powershell +1..20 | ForEach-Object { + Invoke-WebRequest http://localhost:8000/ -UseBasicParsing | Out-Null + Invoke-WebRequest http://localhost:8000/health -UseBasicParsing | Out-Null +} + +1..10 | ForEach-Object { + try { + Invoke-WebRequest http://localhost:8000/not-found -UseBasicParsing -ErrorAction Stop | Out-Null + } catch {} +} +``` + +### Validation results +- all containers started successfully; +- health checks passed; +- JSON logs were produced by `app-python`; +- Grafana showed logs from multiple services; +- Explore queries returned expected results for normal and error traffic. + +## 8. Challenges and Solutions + +### Challenge 1 — Grafana provisioning directories were missing +At first, Grafana started but did not load the Loki data source or dashboard automatically. + +**Cause:** +The provisioning subdirectories and files were missing or empty. + +**Solution:** +Created and mounted: +- `grafana/provisioning/datasources/loki.yml` +- `grafana/provisioning/dashboards/dashboard.yml` +- `grafana/dashboards/lab07-observability.json` + +### Challenge 2 — Application logs were plain text instead of JSON +Initially the Python app wrote regular text logs, which limited structured querying. + +**Solution:** +Updated the logging implementation so the application emits structured JSON logs to stdout. + +### Challenge 3 — PowerShell `curl` behavior on Windows +On Windows, `curl` is aliased to `Invoke-WebRequest`, which caused prompts and different behavior. + +**Solution:** +Used explicit PowerShell commands with `-UseBasicParsing` for reliable traffic generation. + +### Challenge 4 — Filtering logs in Explore +Some filters did not work at first until the JSON parser and labels were confirmed. 
+ +**Solution:** +Verified labels and used queries such as: +```logql +{app="devops-python"} | json | status_code=404 +``` + +## 9. Evidence + +The following evidence should be attached in `monitoring/docs/screenshots/`: +- `01-datasource-loki.png` — Loki data source page in Grafana. +- `02-dashboard-overview.png` — dashboard with all panels and real data. +- `03-explore-queries.png` — Explore page with LogQL queries. +- `04-explore-results.png` — Explore results with parsed JSON logs. +- `05-json-logs-app-python.png` — terminal output with JSON logs from the app. +- `06-docker-compose-ps.png` — all services running and healthy. + +## 10. Conclusion + +This lab successfully implemented a centralized logging stack using Loki, Promtail, and Grafana. + +The final solution provides: +- centralized collection of container logs; +- structured JSON application logging; +- filtering and analysis through LogQL; +- dashboard-based observability; +- basic production-readiness features such as health checks, retention, and secured Grafana access. + +The stack is suitable as a foundation for further observability practices in later DevOps work. 
diff --git a/monitoring/docs/screenshots/01-datasource-loki.png b/monitoring/docs/screenshots/01-datasource-loki.png new file mode 100644 index 0000000000..9c644fe75d Binary files /dev/null and b/monitoring/docs/screenshots/01-datasource-loki.png differ diff --git a/monitoring/docs/screenshots/02-dashboard-overview.png b/monitoring/docs/screenshots/02-dashboard-overview.png new file mode 100644 index 0000000000..3c794b05f1 Binary files /dev/null and b/monitoring/docs/screenshots/02-dashboard-overview.png differ diff --git a/monitoring/docs/screenshots/03-explore-queries.png b/monitoring/docs/screenshots/03-explore-queries.png new file mode 100644 index 0000000000..68f07b02dc Binary files /dev/null and b/monitoring/docs/screenshots/03-explore-queries.png differ diff --git a/monitoring/docs/screenshots/04-explore-results.png b/monitoring/docs/screenshots/04-explore-results.png new file mode 100644 index 0000000000..c8b00bc22b Binary files /dev/null and b/monitoring/docs/screenshots/04-explore-results.png differ diff --git a/monitoring/docs/screenshots/05-json-logs-app-python.png b/monitoring/docs/screenshots/05-json-logs-app-python.png new file mode 100644 index 0000000000..f867539b9e Binary files /dev/null and b/monitoring/docs/screenshots/05-json-logs-app-python.png differ diff --git a/monitoring/docs/screenshots/06-docker-compose-ps.png b/monitoring/docs/screenshots/06-docker-compose-ps.png new file mode 100644 index 0000000000..1bb76cc9a0 Binary files /dev/null and b/monitoring/docs/screenshots/06-docker-compose-ps.png differ diff --git a/monitoring/grafana/dashboards/lab07-observability.json b/monitoring/grafana/dashboards/lab07-observability.json new file mode 100644 index 0000000000..958bcaf55a --- /dev/null +++ b/monitoring/grafana/dashboards/lab07-observability.json @@ -0,0 +1,278 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": 
"rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "{app=~\"devops-.*\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Recent Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 12, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "unit": "logs/sec" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "sum by (app) 
(rate({app=~\"devops-.*\"}[1m]))", + "legendFormat": "{{app}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Request Rate by App", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 3, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "{app=~\"devops-.*\"} | json | level=\"ERROR\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "Error Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 4, + "options": { + "displayLabels": [ + "name", + "percent", + "value" + ], + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "sum by (level) (count_over_time({app=~\"devops-.*\"} | json [5m]))", + "legendFormat": "{{level}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Log Level Distribution (Last 5m)", + "type": "piechart" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": [ + 
"lab07", + "loki", + "observability" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Lab 07 - Loki Observability", + "uid": "lab07-loki-observability", + "version": 1, + "weekStart": "" +} diff --git a/monitoring/grafana/provisioning/dashboards/dashboard.yml b/monitoring/grafana/provisioning/dashboards/dashboard.yml new file mode 100644 index 0000000000..65a14e803e --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/dashboard.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: lab07-dashboards + orgId: 1 + folder: Lab 07 + type: file + disableDeletion: false + editable: true + updateIntervalSeconds: 30 + options: + path: /var/lib/grafana/dashboards diff --git a/monitoring/grafana/provisioning/datasources/loki.yml b/monitoring/grafana/provisioning/datasources/loki.yml new file mode 100644 index 0000000000..34782cbc5b --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/loki.yml @@ -0,0 +1,13 @@ +apiVersion: 1 + +datasources: + - name: Loki + uid: loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: true + editable: true + jsonData: + timeout: 60 + maxLines: 1000 diff --git a/monitoring/loki/config.yml b/monitoring/loki/config.yml new file mode 100644 index 0000000000..a97c3d13ee --- /dev/null +++ b/monitoring/loki/config.yml @@ -0,0 +1,44 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + filesystem: + directory: /loki/chunks + tsdb_shipper: + active_index_directory: /loki/tsdb-index + cache_location: /loki/tsdb-cache + +limits_config: + reject_old_samples: true + reject_old_samples_max_age: 168h + retention_period: 168h + 
+compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + delete_request_store: filesystem + +analytics: + reporting_enabled: false diff --git a/monitoring/promtail/config.yml b/monitoring/promtail/config.yml new file mode 100644 index 0000000000..6f414f0ccc --- /dev/null +++ b/monitoring/promtail/config.yml @@ -0,0 +1,30 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions/positions.yml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: label + values: ["logging=promtail"] + pipeline_stages: + - docker: {} + relabel_configs: + - source_labels: ["__meta_docker_container_name"] + regex: "/(.*)" + target_label: "container" + - source_labels: ["__meta_docker_container_label_app"] + target_label: "app" + - source_labels: ["__meta_docker_container_label_com_docker_compose_service"] + target_label: "compose_service" + - target_label: "job" + replacement: "docker"