diff --git a/ansible/docs/LAB05.md b/ansible/docs/LAB05.md index 5907778989..f6c19e1de5 100644 --- a/ansible/docs/LAB05.md +++ b/ansible/docs/LAB05.md @@ -2,7 +2,7 @@ ## 1. Architecture Overview -**Ansible Version:** 2.10.8 +**Ansible Version:** 2.17.14 **Target VM OS:** Ubuntu 22.04 LTS (jammy64) **Control Node:** Same VM (Ansible runs on the VM and targets itself via `ansible_connection=local`) @@ -260,7 +260,7 @@ Any secret stored in plain text in a Git repository is effectively public, even ## 7. Challenges - **WSL2 disk space:** The WSL2 Alpine distro had only 136MB disk space, not enough to install Ansible. Solved by installing Ansible directly on the Vagrant VM and running it against localhost. -- **Docker login module:** `community.general.docker_login` failed in Ansible 2.10. Solved by using a `shell` task with `docker login --password-stdin` instead. +- **Docker login module:** `community.general.docker_login` failed. Solved by using a `shell` task with `docker login --password-stdin` instead. - **group_vars not loading with become:** Vault-encrypted `group_vars/all.yml` variables were not accessible when `become: yes` was set at the play level. Solved by passing variables explicitly with `-e @group_vars/all.yml` and setting `become: no` in the deploy playbook. - **App port:** The application runs on port 8000 (FastAPI/Uvicorn), not 5000 as initially assumed. Discovered via `docker logs` and corrected in the vault variables and port mapping. 
---
# Defaults for the monitoring role (Loki + Promtail + Grafana).
# Every value can be overridden from inventory, group_vars or `-e`.

# Service versions (used as the image tags in docker-compose.yml.j2)
loki_version: "3.0.0"
promtail_version: "3.0.0"
grafana_version: "12.3.1"

# Ports
# loki_port is both the published host port and Loki's http_listen_port.
loki_port: 3100
# promtail_port is Promtail's http_listen_port; the compose template does not
# publish it on the host.
promtail_port: 9080
# grafana_port is the published host port; the container side stays 3000.
grafana_port: 3000

# Retention (168h = 7 days)
loki_retention_period: "168h"

# Grafana credentials
grafana_admin_user: "admin"
# SECURITY: define `vault_grafana_admin_password` in an Ansible Vault file for
# any real deployment. The literal fallback keeps existing lab runs working but
# is a publicly known password and must not reach a shared environment.
grafana_admin_password: "{{ vault_grafana_admin_password | default('admin123') }}"

# Deployment directory on the target host
monitoring_dir: "/opt/monitoring"

# Schema
loki_schema_version: "v13"
loki_schema_from: "2024-01-01"

# Resource limits (rendered into deploy.resources.limits per service)
loki_memory_limit: "1g"
loki_cpu_limit: "1.0"
promtail_memory_limit: "256m"
promtail_cpu_limit: "0.5"
grafana_memory_limit: "512m"
grafana_cpu_limit: "1.0"
---
# roles/monitoring/tasks/deploy.yml
# Bring the stack up with Docker Compose, then wait until Loki and Grafana
# answer their HTTP health endpoints. On failure the container logs are
# printed and the play is failed explicitly, so a broken deployment can never
# finish as "rescued".
- name: Deploy monitoring stack with Docker Compose
  become: true
  tags: [monitoring, monitoring_deploy]
  block:
    - name: Deploy monitoring stack
      community.docker.docker_compose_v2:
        project_src: "{{ monitoring_dir }}"
        state: present
        remove_orphans: true
      register: compose_result

    # Polled every 10 s, 12 retries (about two minutes in total).
    - name: Wait for Loki to be ready
      ansible.builtin.uri:
        url: "http://localhost:{{ loki_port }}/ready"
        status_code: 200
      register: loki_ready
      retries: 12
      delay: 10
      until: loki_ready.status == 200

    - name: Wait for Grafana to be ready
      ansible.builtin.uri:
        url: "http://localhost:{{ grafana_port }}/api/health"
        status_code: 200
      register: grafana_ready
      retries: 12
      delay: 10
      until: grafana_ready.status == 200

    - name: Report deployment success
      ansible.builtin.debug:
        msg: "Monitoring stack deployed — Grafana at http://localhost:{{ grafana_port }}"

  rescue:
    # argv avoids shell-style word splitting if monitoring_dir contains spaces.
    - name: Show container logs on failure
      ansible.builtin.command:
        argv:
          - docker
          - compose
          - "-f"
          - "{{ monitoring_dir }}/docker-compose.yml"
          - logs
          - "--tail=20"
      changed_when: false
      failed_when: false
      register: compose_logs

    - name: Print container logs
      ansible.builtin.debug:
        msg: "{{ compose_logs.stdout_lines | default([]) }}"

    # Without this task the rescue section would swallow the error and the
    # play recap would show failed=0 for a deployment that did not come up.
    - name: Fail the play after collecting diagnostics
      ansible.builtin.fail:
        msg: >-
          Monitoring stack deployment failed in task
          '{{ ansible_failed_task.name | default("unknown") }}'.
          See the container logs printed above.

  always:
    - name: Show running containers
      ansible.builtin.command:
        argv:
          - docker
          - compose
          - "-f"
          - "{{ monitoring_dir }}/docker-compose.yml"
          - ps
      changed_when: false
      failed_when: false
      register: compose_ps

    - name: Print container status
      ansible.builtin.debug:
        msg: "{{ compose_ps.stdout_lines | default([]) }}"

---
# roles/monitoring/tasks/main.yml
# Entry point of the role: render configuration first, then deploy.
- name: Setup monitoring directories and configs
  ansible.builtin.include_tasks: setup.yml
  tags: [monitoring, monitoring_setup]

- name: Deploy monitoring stack
  ansible.builtin.include_tasks: deploy.yml
  tags: [monitoring, monitoring_deploy]

---
# roles/monitoring/tasks/setup.yml
# Create the deployment directory tree and render the Loki, Promtail and
# Docker Compose files. Every template change notifies the restart handler.
- name: Setup monitoring directories and configuration files
  become: true
  tags: [monitoring, monitoring_setup]
  block:
    - name: Create monitoring directory structure
      ansible.builtin.file:
        path: "{{ item }}"
        state: directory
        mode: "0755"
      loop:
        - "{{ monitoring_dir }}"
        - "{{ monitoring_dir }}/loki"
        - "{{ monitoring_dir }}/promtail"

    - name: Template Loki configuration
      ansible.builtin.template:
        src: loki-config.yml.j2
        dest: "{{ monitoring_dir }}/loki/config.yml"
        mode: "0644"
      notify: Restart monitoring stack

    - name: Template Promtail configuration
      ansible.builtin.template:
        src: promtail-config.yml.j2
        dest: "{{ monitoring_dir }}/promtail/config.yml"
        mode: "0644"
      notify: Restart monitoring stack

    # The rendered compose file embeds GF_SECURITY_ADMIN_PASSWORD in plain
    # text, so it must not be world-readable. Every task that runs
    # `docker compose` against it already uses become: true.
    - name: Template Docker Compose file
      ansible.builtin.template:
        src: docker-compose.yml.j2
        dest: "{{ monitoring_dir }}/docker-compose.yml"
        mode: "0600"
      notify: Restart monitoring stack

  rescue:
    - name: Report setup failure
      ansible.builtin.debug:
        msg: "Failed to set up monitoring configuration. Check directory permissions."

    # Re-raise: deploying on top of missing or stale configuration would hide
    # the real problem behind a later, less obvious error.
    - name: Fail the play after reporting the setup error
      ansible.builtin.fail:
        msg: >-
          Monitoring setup failed in task
          '{{ ansible_failed_task.name | default("unknown") }}'.

  always:
    - name: List monitoring directory
      ansible.builtin.command:
        argv:
          - ls
          - "-la"
          - "{{ monitoring_dir }}"
      changed_when: false
      failed_when: false
      register: monitoring_dir_contents

    - name: Show monitoring directory contents
      ansible.builtin.debug:
        msg: "{{ monitoring_dir_contents.stdout_lines | default([]) }}"
unless-stopped + + grafana: + image: grafana/grafana:{{ grafana_version }} + container_name: grafana + ports: + - "{{ grafana_port }}:3000" + volumes: + - grafana-data:/var/lib/grafana + environment: + - GF_AUTH_ANONYMOUS_ENABLED=false + - GF_SECURITY_ADMIN_USER={{ grafana_admin_user }} + - GF_SECURITY_ADMIN_PASSWORD={{ grafana_admin_password }} + networks: + - logging + depends_on: + loki: + condition: service_healthy + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s + deploy: + resources: + limits: + cpus: '{{ grafana_cpu_limit }}' + memory: {{ grafana_memory_limit }} + restart: unless-stopped diff --git a/ansible/roles/monitoring/templates/loki-config.yml.j2 b/ansible/roles/monitoring/templates/loki-config.yml.j2 new file mode 100644 index 0000000000..01a603578e --- /dev/null +++ b/ansible/roles/monitoring/templates/loki-config.yml.j2 @@ -0,0 +1,44 @@ +auth_enabled: false + +server: + http_listen_port: {{ loki_port }} + grpc_listen_port: 9096 + log_level: info + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: {{ loki_schema_from }} + store: tsdb + object_store: filesystem + schema: {{ loki_schema_version }} + index: + prefix: index_ + period: 24h + +limits_config: + retention_period: {{ loki_retention_period }} + allow_structured_metadata: true + volume_enabled: true + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + retention_delete_worker_count: 150 + delete_request_store: filesystem + +analytics: + reporting_enabled: false diff --git a/ansible/roles/monitoring/templates/promtail-config.yml.j2 b/ansible/roles/monitoring/templates/promtail-config.yml.j2 
class JSONFormatter(logging.Formatter):
    """Render every log record as a single-line JSON object.

    The object always carries ``timestamp``, ``level``, ``logger`` and
    ``message``. Any attribute attached to the record through
    ``logger.xxx(..., extra={...})`` is added as a further top-level key, and
    a formatted traceback is added under ``exception`` when the record carries
    ``exc_info``.
    """

    # Attributes that the logging machinery itself puts on a LogRecord.
    # Everything NOT listed here is treated as a caller-supplied extra field.
    _RESERVED_ATTRS = frozenset(
        (
            "name",
            "msg",
            "args",
            "levelname",
            "levelno",
            "pathname",
            "filename",
            "module",
            "exc_info",
            "exc_text",
            "stack_info",
            "lineno",
            "funcName",
            "created",
            "msecs",
            "relativeCreated",
            "thread",
            "threadName",
            "processName",
            "process",
            "message",
            "taskName",
        )
    )

    def format(self, record: logging.LogRecord) -> str:
        """Return ``record`` serialised as a JSON string.

        Args:
            record: The log record handed over by the logging framework.

        Returns:
            A JSON document on one line. Values that ``json`` cannot encode
            natively are converted with ``str()`` instead of raising.
        """
        log_entry = {
            # Use the moment the event was logged, not the moment it happens
            # to be formatted, so timestamps stay correct if a handler
            # formats records later.
            "timestamp": datetime.fromtimestamp(
                record.created, tz=timezone.utc
            ).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }

        for key, value in record.__dict__.items():
            if key not in self._RESERVED_ATTRS:
                log_entry[key] = value

        if record.exc_info:
            log_entry["exception"] = self.formatException(record.exc_info)

        # default=str: an un-encodable extra value must never make the
        # formatter raise, because logging would then drop the whole line.
        return json.dumps(log_entry, default=str)
# docker-compose.yml reads GRAFANA_ADMIN_PASSWORD from the environment, and
# Compose loads it from the .env file in this directory. Keep that file out
# of Git.
# An ignore rule does not untrack a file that is already committed; remove it
# from the index once with:
#   git rm --cached monitoring/.env
.env
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Log the start and the outcome of every HTTP request.

    Args:
        request: The incoming request.
        call_next: Callable that passes the request on to the next handler
            and returns its response.

    Returns:
        The downstream response, with an ``X-Process-Time`` header holding
        the elapsed time in seconds.

    Raises:
        Exception: Any exception raised while handling the request is logged
            together with the elapsed time and then re-raised unchanged.
    """
    # Local import keeps this block self-contained. perf_counter() is
    # monotonic, so the measured duration cannot jump or turn negative when
    # the system clock is adjusted while a request is in flight.
    import time

    started = time.perf_counter()
    client_ip = request.client.host if request.client else "unknown"

    logger.info(
        f"Request started: {request.method} {request.url.path} "
        f"from {client_ip}"
    )

    try:
        response = await call_next(request)
        process_time = time.perf_counter() - started

        logger.info(
            f"Request completed: {request.method} {request.url.path} - "
            f"Status: {response.status_code} - Duration: {process_time:.3f}s"
        )

        response.headers["X-Process-Time"] = str(process_time)
        return response
    except Exception as e:
        process_time = time.perf_counter() - started
        logger.error(
            f"Request failed: {request.method} {request.url.path} - "
            f"Error: {str(e)} - Duration: {process_time:.3f}s"
        )
        raise
request.headers.get("user-agent", "unknown"), + "method": request.method, + "path": request.url.path, + }, + "endpoints": [ + { + "path": "/", + "method": "GET", + "description": "Service information", + }, + { + "path": "/health", + "method": "GET", + "description": "Health check", + }, + ], + } + + +@app.get("/health") +def health(): + logger.debug("Health check endpoint called") + uptime = get_uptime() + return { + "status": "healthy", + "timestamp": datetime.now(timezone.utc).isoformat(), + "uptime_seconds": uptime["seconds"], + } + + +@app.exception_handler(StarletteHTTPException) +async def http_exception_handler( + request: Request, exc: StarletteHTTPException +): + client = request.client.host if request.client else "unknown" + logger.warning( + f"HTTP exception: {exc.status_code} - {exc.detail} - " + f"Path: {request.url.path} - Client: {client}" + ) + return JSONResponse( + status_code=exc.status_code, + content={ + "error": exc.detail, + "status_code": exc.status_code, + "path": request.url.path, + }, + ) + + +@app.exception_handler(Exception) +async def general_exception_handler(request: Request, exc: Exception): + client = request.client.host if request.client else "unknown" + logger.error( + f"Unhandled exception: {type(exc).__name__} - {str(exc)} - " + f"Path: {request.url.path} - Client: {client}", + exc_info=True, + ) + return JSONResponse( + status_code=500, + content={ + "error": "Internal Server Error", + "message": "An unexpected error occurred", + "path": request.url.path, + }, + ) + + +if __name__ == "__main__": + import uvicorn + + logger.info(f"Starting Uvicorn server on {HOST}:{PORT}") + uvicorn.run(app, host=HOST, port=PORT) diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml new file mode 100644 index 0000000000..b4341bd176 --- /dev/null +++ b/monitoring/docker-compose.yml @@ -0,0 +1,118 @@ +version: '3.8' + +networks: + logging: + driver: bridge + +volumes: + loki-data: + grafana-data: + +services: + + loki: + 
image: grafana/loki:3.0.0 + container_name: loki + ports: + - "3100:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + command: -config.file=/etc/loki/config.yml + networks: + - logging + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 20s + deploy: + resources: + limits: + cpus: '1.0' + memory: 1G + reservations: + cpus: '0.25' + memory: 256M + restart: unless-stopped + + promtail: + image: grafana/promtail:3.0.0 + container_name: promtail + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + command: -config.file=/etc/promtail/config.yml + networks: + - logging + depends_on: + loki: + condition: service_healthy + deploy: + resources: + limits: + cpus: '0.5' + memory: 256M + reservations: + cpus: '0.1' + memory: 64M + restart: unless-stopped + + grafana: + image: grafana/grafana:12.3.1 + container_name: grafana + ports: + - "3000:3000" + volumes: + - grafana-data:/var/lib/grafana + environment: + - GF_AUTH_ANONYMOUS_ENABLED=false + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin123} + - GF_SECURITY_ALLOW_EMBEDDING=false + networks: + - logging + depends_on: + loki: + condition: service_healthy + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s + deploy: + resources: + limits: + cpus: '1.0' + memory: 512M + reservations: + cpus: '0.25' + memory: 128M + restart: unless-stopped + + app-python: + image: 3llimi/devops-info-service:latest + container_name: devops-python + ports: + - "8000:8000" + networks: + - logging + labels: + logging: "promtail" + app: "devops-python" + restart: unless-stopped + + app-go: + image: 
3llimi/devops-go-service:latest + container_name: devops-go + ports: + - "8001:8080" + networks: + - logging + labels: + logging: "promtail" + app: "devops-go" + restart: unless-stopped \ No newline at end of file diff --git a/monitoring/docs/LAB07.md b/monitoring/docs/LAB07.md new file mode 100644 index 0000000000..a674e0b619 --- /dev/null +++ b/monitoring/docs/LAB07.md @@ -0,0 +1,429 @@ +# Lab 7 — Observability & Logging with Loki Stack + +## Architecture + +``` +┌─────────────────────────────────────────────────────────┐ +│ Docker Network: logging │ +│ │ +│ ┌─────────────┐ logs ┌──────────────┐ │ +│ │ devops- │──────────► │ │ │ +│ │ python:8000 │ │ Promtail │ │ +│ └─────────────┘ │ :9080 │ │ +│ │ │ │ +│ ┌─────────────┐ logs │ Docker SD │ push │ +│ │ devops-go │──────────► │ (socket) │─────────► │ +│ │ :8001 │ └──────────────┘ │ +│ └─────────────┘ │ +│ │ +│ ┌──────────────────┐ ┌───────────────────┐ │ +│ │ Grafana :3000 │◄─────│ Loki :3100 │ │ +│ │ Dashboards │query │ TSDB Storage │ │ +│ │ LogQL Explore │ │ 7d retention │ │ +│ └──────────────────┘ └───────────────────┘ │ +└────────────────────────────────────────────────────────┘ +``` + +**How it works:** +- Promtail discovers containers via Docker socket using service discovery +- Only containers with label `logging=promtail` are scraped +- Logs are pushed to Loki's HTTP API and stored with TSDB +- Grafana queries Loki using LogQL and displays dashboards + +--- + +## Setup Guide + +### Prerequisites +- Docker and Docker Compose v2 installed +- Apps from Lab 1/2 available as Docker images + +### Project Structure + +``` +monitoring/ +├── docker-compose.yml +├── .env # Grafana password (not committed) +├── .gitignore +├── loki/ +│ └── config.yml +├── promtail/ +│ └── config.yml +└── docs/ + └── LAB07.md +``` + +### Deploy + +```bash +# Set Grafana password (UTF-8, no BOM) +echo "GRAFANA_ADMIN_PASSWORD=admin123" > .env + +# Start the stack +docker compose up -d + +# Verify +docker compose ps +``` +![Grafana 
Explore](screenshots/screenshot-explore.png) +**Evidence — stack running:** +``` +NAME IMAGE STATUS +devops-go 3llimi/devops-go-service:latest Up 15 hours +devops-python 3llimi/devops-info-service:latest Up 14 hours +grafana grafana/grafana:12.3.1 Up 15 hours (healthy) +loki grafana/loki:3.0.0 Up 15 hours (healthy) +promtail grafana/promtail:3.0.0 Up 15 hours +``` + +--- + +## Configuration + +### Loki — `loki/config.yml` + +Key decisions: + +```yaml +schema_config: + configs: + - from: 2024-01-01 + store: tsdb # Loki 3.0 recommended — 10x faster than boltdb + schema: v13 # Latest schema version + +limits_config: + retention_period: 168h # 7 days — balance between storage and history + +compactor: + retention_enabled: true # Required to actually enforce retention_period +``` + +**Why TSDB over boltdb-shipper:** TSDB is the default in Loki 3.0, offers faster queries and lower memory usage. boltdb-shipper is legacy. + +**Why `auth_enabled: false`:** Single-instance setup — no multi-tenancy needed. + +### Promtail — `promtail/config.yml` + +Key decisions: + +```yaml +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + filters: + - name: label + values: ["logging=promtail"] # Only scrape labelled containers + relabel_configs: + - source_labels: [__meta_docker_container_label_app] + target_label: app # app label from Docker → Loki label +``` + +**Why filter by label:** Without the filter, Promtail would scrape all containers including Loki and Grafana themselves, creating noise. The `logging=promtail` label is an explicit opt-in. + +**Why Docker socket:** Promtail uses the Docker API to discover running containers and their log paths automatically — no manual config needed when containers are added or removed. + +--- + +## Application Logging + +### JSON Logging — Python App + +The Python app was updated to output structured JSON logs instead of plain text. 
A custom `JSONFormatter` class was added: + +```python +class JSONFormatter(logging.Formatter): + def format(self, record: logging.LogRecord) -> str: + log_entry = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "level": record.levelname, + "logger": record.name, + "message": record.getMessage(), + } + # Extra fields passed via extra={} appear as top-level JSON keys + for key, value in record.__dict__.items(): + if key not in (...standard fields...): + log_entry[key] = value + return json.dumps(log_entry) +``` + +HTTP request logs use `extra={}` to add structured fields: + +```python +logger.info("Request completed", extra={ + "method": request.method, + "path": request.url.path, + "status_code": response.status_code, + "client_ip": client_ip, + "duration_seconds": round(process_time, 3), +}) +``` + +**Evidence — JSON log output:** +```json +{"timestamp": "2026-02-28T23:59:34.184447+00:00", "level": "INFO", "logger": "__main__", "message": "Request completed", "method": "GET", "path": "/health", "status_code": 200, "client_ip": "172.18.0.1", "duration_seconds": 0.002} +``` + +**Why JSON:** Enables field-level filtering in LogQL. Plain text only supports string matching (`|= "GET"`), while JSON allows `| json | status_code=200` and `| json | path="/health"`. + +**Why stdout only:** Containers should log to stdout, not files. The orchestrator (Docker/Kubernetes) handles log collection. Removed the `FileHandler` from the original app. + +--- + +## Dashboard + +**Dashboard name:** DevOps Apps - Log Overview +![Dashboard](screenshots/screenshot-dashboard.png) + +### Panel 1 — Logs Table +**Type:** Logs +**Query:** `{app=~"devops-.*"}` +**Purpose:** Shows all recent log lines from both apps in real time. The regex `devops-.*` matches both `devops-python` and `devops-go` with a single query. + +### Panel 2 — Request Rate +**Type:** Time series +**Query:** `sum by (app) (rate({app=~"devops-.*"}[1m]))` +**Purpose:** Shows logs per second for each app over time. 
`rate()` calculates the per-second rate over a 1-minute window. `sum by (app)` splits the line by app label so each app gets its own series. + +### Panel 3 — Error Logs +**Type:** Logs +**Query:** `{app=~"devops-.*"} |= "ERROR"` +**Purpose:** Filters log lines containing the word ERROR. Shows "No data" when the app is healthy — which is the expected result. + +### Panel 4 — Log Level Distribution +**Type:** Pie chart +**Query:** `sum by (level) (count_over_time({app=~"devops-.*"} | json [5m]))` +**Purpose:** Counts log lines grouped by level (INFO, WARNING, ERROR) over a 5-minute window. Uses `| json` to parse the structured logs from the Python app and extract the `level` field. + +--- + +## Production Config + +### Resource Limits + +All services have CPU and memory limits to prevent resource exhaustion: + +| Service | CPU Limit | Memory Limit | +|---------|-----------|--------------| +| Loki | 1.0 | 1G | +| Promtail | 0.5 | 256M | +| Grafana | 1.0 | 512M | + +### Security + +- `GF_AUTH_ANONYMOUS_ENABLED=false` — Grafana requires login +- Admin password stored in `.env` file, excluded from git via `.gitignore` +- Promtail mounts Docker socket read-only (`/var/run/docker.sock:ro`) +- Container logs directory mounted read-only (`/var/lib/docker/containers:ro`) + +### Health Checks + +```yaml +# Loki +healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"] + interval: 10s + retries: 5 + start_period: 20s + +# Grafana +healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"] + interval: 10s + retries: 5 + start_period: 30s +``` + +**Evidence — health checks passing:** +``` +$ docker inspect loki --format "{{.State.Health.Status}}" +healthy + +$ docker inspect grafana --format "{{.State.Health.Status}}" +healthy +``` + +### Retention + +Loki is configured with 7-day retention (`168h`). 
The compactor runs every 10 minutes and enforces deletion after a 2-hour delay. This prevents unbounded storage growth in production. + +--- + +## Testing + +```bash +# Verify Loki is ready +curl http://localhost:3100/ready +# Expected: ready + +# Check labels ingested +curl http://localhost:3100/loki/api/v1/labels +# Expected: {"status":"success","data":["app","container","job","level","service_name","stream"]} + +# Query logs for Python app +curl "http://localhost:3100/loki/api/v1/query_range?query=%7Bapp%3D%22devops-python%22%7D&limit=5" +# Expected: JSON with log streams + +# Generate traffic (the two loops below are PowerShell syntax; run them in PowerShell, not bash) +for ($i=1; $i -le 30; $i++) { curl -UseBasicParsing http://localhost:8000/ | Out-Null } +for ($i=1; $i -le 30; $i++) { curl -UseBasicParsing http://localhost:8001/ | Out-Null } + +# Check Promtail discovered containers +docker logs promtail +# Expected: "added Docker target" for each app container +``` + +### LogQL Queries Used + +```logql +# All logs from both apps +{app=~"devops-.*"} + +# Only errors +{app=~"devops-.*"} |= "ERROR" + +# Parse JSON and filter by path +{app="devops-python"} | json | path="/health" + +# Request rate per app +sum by (app) (rate({app=~"devops-.*"}[1m])) + +# Log count by level +sum by (level) (count_over_time({app=~"devops-.*"} | json [5m])) +``` + +--- + +## Bonus — Ansible Automation + +### Role Structure + +``` +roles/monitoring/ +├── defaults/main.yml # Versions, ports, retention, resource limits +├── meta/main.yml # Depends on: docker role +├── handlers/main.yml # Restart stack on config change +├── tasks/ +│ ├── main.yml # Orchestrates setup + deploy +│ ├── setup.yml # Creates dirs, templates configs +│ └── deploy.yml # docker compose up + health wait +└── templates/ + ├── docker-compose.yml.j2 + ├── loki-config.yml.j2 + └── promtail-config.yml.j2 +``` + +### Key Variables (defaults/main.yml) + +```yaml +loki_version: "3.0.0" +grafana_version: "12.3.1" +loki_port: 3100 +grafana_port: 3000 +loki_retention_period: "168h" +monitoring_dir: 
"/opt/monitoring" +loki_schema_version: "v13" +``` + +All versions, ports, retention period, and resource limits are parameterised — override any variable without touching role code. + +### Role Dependency + +`meta/main.yml` declares `docker` as a dependency. Ansible automatically runs the docker role before monitoring — Docker is guaranteed to be installed before `docker compose up` runs. + +### Playbook + +```yaml +# playbooks/deploy-monitoring.yml +- name: Deploy Monitoring Stack + hosts: all + gather_facts: true + roles: + - role: monitoring +``` + +### First Run Evidence + +``` +TASK [monitoring : Create monitoring directory structure] +changed: [localhost] => (item=/opt/monitoring) +changed: [localhost] => (item=/opt/monitoring/loki) +changed: [localhost] => (item=/opt/monitoring/promtail) + +TASK [monitoring : Template Loki configuration] changed: [localhost] +TASK [monitoring : Template Promtail configuration] changed: [localhost] +TASK [monitoring : Template Docker Compose file] changed: [localhost] +TASK [monitoring : Deploy monitoring stack] changed: [localhost] + +PLAY RECAP +localhost : ok=26 changed=5 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 +``` + +### Second Run — Idempotency Evidence + +``` +TASK [monitoring : Create monitoring directory structure] ok: [localhost] +TASK [monitoring : Template Loki configuration] ok: [localhost] +TASK [monitoring : Template Promtail configuration] ok: [localhost] +TASK [monitoring : Template Docker Compose file] ok: [localhost] +TASK [monitoring : Deploy monitoring stack] ok: [localhost] + +PLAY RECAP +localhost : ok=26 changed=0 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 +``` + +`changed=0` on second run confirms full idempotency ✅ + +### Rendered docker-compose.yml on VM + +```yaml +services: + loki: + image: grafana/loki:3.0.0 + ports: + - "3100:3100" + healthcheck: + test: ["CMD-SHELL", "wget ... 
http://localhost:3100/ready || exit 1"] + promtail: + image: grafana/promtail:3.0.0 + grafana: + image: grafana/grafana:12.3.1 + ports: + - "3000:3000" + environment: + - GF_AUTH_ANONYMOUS_ENABLED=false +``` + +--- + +## Challenges & Solutions + +**Challenge 1: .env file encoding on Windows** +The `.env` file was saved as UTF-16 with BOM by Notepad, causing Docker Compose to fail with `unexpected character "xFF"`. Fixed by using PowerShell's `[System.IO.File]::WriteAllText()` with `UTF8Encoding($false)` to write without BOM. + +**Challenge 2: Promtail port 9080 not accessible from host** +`curl http://localhost:9080/targets` failed because port 9080 was not exposed in docker-compose.yml (intentional — Promtail is internal only). Verified Promtail operation via `docker logs promtail` instead, which showed `added Docker target` for both app containers. + +**Challenge 3: Vagrant synced folder not mounting** +Default Vagrantfile had no `synced_folder` config. Adding `config.vm.synced_folder "..", "/devops"` and running `vagrant reload` mounted the entire repo at `/devops`, making Ansible files accessible inside the VM without SCP. + +**Challenge 4: Ansible world-writable directory warning** +Running `ansible-playbook` from `/devops/ansible` caused Ansible to ignore `ansible.cfg` because the directory is world-writable (shared folder permissions). Fixed by copying the ansible directory to `~/ansible` and running from there. 
+ +--- + +## Summary + +| Component | Version | Purpose | +|-----------|---------|---------| +| Loki | 3.0.0 | Log storage with TSDB | +| Promtail | 3.0.0 | Log collection via Docker SD | +| Grafana | 12.3.1 | Visualization and dashboards | +| Python app | latest | JSON-structured application logs | +| Go app | latest | Application logs | + +**Key metrics:** +- Log retention: 7 days +- Containers monitored: 2 (devops-python, devops-go) +- Dashboard panels: 4 +- Ansible idempotency: ✅ confirmed (changed=0 on second run) \ No newline at end of file diff --git a/monitoring/docs/screenshots/screenshot-dashboard.png b/monitoring/docs/screenshots/screenshot-dashboard.png new file mode 100644 index 0000000000..b359a43b8d Binary files /dev/null and b/monitoring/docs/screenshots/screenshot-dashboard.png differ diff --git a/monitoring/docs/screenshots/screenshot-explore.png b/monitoring/docs/screenshots/screenshot-explore.png new file mode 100644 index 0000000000..8330695eb9 Binary files /dev/null and b/monitoring/docs/screenshots/screenshot-explore.png differ diff --git a/monitoring/loki/config.yml b/monitoring/loki/config.yml new file mode 100644 index 0000000000..618648284d --- /dev/null +++ b/monitoring/loki/config.yml @@ -0,0 +1,47 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + log_level: info + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +limits_config: + retention_period: 168h # 7 days + allow_structured_metadata: true + volume_enabled: true + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + retention_delete_worker_count: 150 + 
delete_request_store: filesystem + +ruler: + alertmanager_url: http://localhost:9093 + +analytics: + reporting_enabled: false \ No newline at end of file diff --git a/monitoring/promtail/config.yml b/monitoring/promtail/config.yml new file mode 100644 index 0000000000..6865452528 --- /dev/null +++ b/monitoring/promtail/config.yml @@ -0,0 +1,35 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: label + values: ["logging=promtail"] + relabel_configs: + # Use container name as the "container" label (strip leading slash) + - source_labels: [__meta_docker_container_name] + regex: '/(.*)' + target_label: container + + # Use the "app" Docker label as the "app" label in Loki + - source_labels: [__meta_docker_container_label_app] + target_label: app + + # Keep the job label as "docker" + - target_label: job + replacement: docker + + # Use the container log stream (stdout/stderr) as the "stream" label + - source_labels: [__meta_docker_container_log_stream] + target_label: stream \ No newline at end of file diff --git a/vagrant/.vagrant/machines/default/virtualbox/synced_folders b/vagrant/.vagrant/machines/default/virtualbox/synced_folders index e8707d7907..acb5560719 100644 --- a/vagrant/.vagrant/machines/default/virtualbox/synced_folders +++ b/vagrant/.vagrant/machines/default/virtualbox/synced_folders @@ -1 +1 @@ -{"virtualbox":{"/vagrant":{"guestpath":"/vagrant","hostpath":"C:/Users/3llim/OneDrive/Documents/GitHub/DevOps-Core-Course/vagrant","disabled":false,"__vagrantfile":true}}} \ No newline at end of file 
+{"virtualbox":{"/devops":{"guestpath":"/devops","hostpath":"C:/Users/3llim/OneDrive/Documents/GitHub/DevOps-Core-Course","disabled":false,"__vagrantfile":true},"/vagrant":{"guestpath":"/vagrant","hostpath":"C:/Users/3llim/OneDrive/Documents/GitHub/DevOps-Core-Course/vagrant","disabled":false,"__vagrantfile":true}}} \ No newline at end of file diff --git a/vagrant/Vagrantfile b/vagrant/Vagrantfile index 197e9839d3..f9fad98585 100644 --- a/vagrant/Vagrantfile +++ b/vagrant/Vagrantfile @@ -1,6 +1,7 @@ Vagrant.configure("2") do |config| config.vm.box = "ubuntu/jammy64" config.vm.network "private_network", ip: "192.168.56.10" + config.vm.synced_folder "..", "/devops" config.vm.provider "virtualbox" do |vb| vb.memory = "2048" end