diff --git a/.github/workflows/ansible-deploy.yml b/.github/workflows/ansible-deploy.yml new file mode 100644 index 0000000000..27a092f978 --- /dev/null +++ b/.github/workflows/ansible-deploy.yml @@ -0,0 +1,119 @@ +name: Ansible Deployment + +on: + push: + branches: + - master + - main + paths: + - "ansible/**" + - "!ansible/docs/**" + - ".github/workflows/ansible-deploy.yml" + pull_request: + branches: + - master + - main + paths: + - "ansible/**" + - "!ansible/docs/**" + - ".github/workflows/ansible-deploy.yml" + +concurrency: + group: ansible-deploy-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + lint: + name: Ansible Lint + runs-on: ubuntu-latest + steps: + - name: Checkout source + uses: actions/checkout@v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + + - name: Install Ansible toolchain + run: | + pip install ansible ansible-lint + ansible-galaxy collection install -r ansible/requirements.yml + + - name: Run ansible-lint + working-directory: ansible + run: ansible-lint playbooks/*.yml + + deploy: + name: Deploy Application + needs: lint + if: github.event_name == 'push' && (github.ref == 'refs/heads/master' || github.ref == 'refs/heads/main') + runs-on: ubuntu-latest + steps: + - name: Checkout source + uses: actions/checkout@v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + + - name: Install Ansible toolchain + run: | + pip install ansible + ansible-galaxy collection install -r ansible/requirements.yml + + - name: Configure SSH access + run: | + mkdir -p ~/.ssh + echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + ssh-keyscan -H "${{ secrets.VM_HOST }}" >> ~/.ssh/known_hosts + + - name: Create runtime inventory + run: | + cat > /tmp/hosts.ini < /tmp/vault_pass + VAULT_ARGS="--vault-password-file /tmp/vault_pass" + fi + + ansible-playbook playbooks/deploy.yml \ + 
-i /tmp/hosts.ini \ + $VAULT_ARGS \ + -e "dockerhub_username=$DOCKERHUB_USERNAME" \ + -e "dockerhub_password=$DOCKERHUB_PASSWORD" + + rm -f /tmp/vault_pass + + - name: Verify deployment + run: | + sleep 10 + curl -f "http://${{ secrets.VM_HOST }}:5000/" >/dev/null + curl -f "http://${{ secrets.VM_HOST }}:5000/health" >/dev/null diff --git a/.gitignore b/.gitignore index fe2b520861..206f692e33 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,15 @@ pulumi/venv/ *.jks *.json credentials + +# Ansible +*.retry +.vault_pass +ansible/group_vars/all.yml +ansible/group_vars/*.bak +ansible/inventory/*.pyc +__pycache__/ + +# Monitoring +monitoring/.env +!monitoring/grafana/dashboards/lab07-logs-dashboard.json diff --git a/Lab-1/app_python/app.py b/Lab-1/app_python/app.py index 4a6bdfb9a2..38080375b4 100644 --- a/Lab-1/app_python/app.py +++ b/Lab-1/app_python/app.py @@ -1,17 +1,82 @@ from __future__ import annotations +import json import logging import os import platform import socket +import time from datetime import datetime, timezone from dotenv import load_dotenv -from flask import Flask, jsonify, request +from flask import Flask, g, jsonify, request from flask_swagger_ui import get_swaggerui_blueprint app = Flask(__name__) + +class JSONFormatter(logging.Formatter): + def format(self, record: logging.LogRecord) -> str: + payload: dict[str, object] = { + 'timestamp': datetime.fromtimestamp( + record.created, + timezone.utc + ).isoformat(timespec='milliseconds').replace('+00:00', 'Z'), + 'level': record.levelname, + 'logger': record.name, + 'message': record.getMessage(), + } + + extra_fields = ( + 'event', + 'method', + 'path', + 'status_code', + 'client_ip', + 'user_agent', + 'duration_ms', + 'host', + 'port', + 'debug', + ) + + for field in extra_fields: + if hasattr(record, field): + payload[field] = getattr(record, field) + + if record.exc_info: + payload['exception'] = self.formatException(record.exc_info) + + return json.dumps(payload, ensure_ascii=True) + + +def 
configure_logging() -> logging.Logger: + handler = logging.StreamHandler() + handler.setFormatter(JSONFormatter()) + + root_logger = logging.getLogger() + root_logger.handlers.clear() + root_logger.addHandler(handler) + root_logger.setLevel(logging.INFO) + + # Route Flask internals through the same root logger. + app.logger.handlers.clear() + app.logger.propagate = True + + return logging.getLogger('devops-info-service') + + +def _iso_utc_now() -> str: + return datetime.now(timezone.utc).isoformat(timespec='milliseconds').replace('+00:00', 'Z') + + +def get_client_ip() -> str: + client_ip = request.headers.get('X-Forwarded-For', request.remote_addr or '') + if ',' in client_ip: + return client_ip.split(',')[0].strip() + return client_ip + + # conf load_dotenv() HOST = os.getenv('HOST', '0.0.0.0') @@ -21,13 +86,17 @@ # start time START_TIME = datetime.now(timezone.utc) -# Logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +# logging +logger = configure_logging() +logger.info( + 'Application starting', + extra={ + 'event': 'startup', + 'host': HOST, + 'port': PORT, + 'debug': DEBUG, + } ) -logger = logging.getLogger(__name__) -logger.info('Application starting...') # swagger info SWAGGER_URL = '/docs' @@ -41,10 +110,6 @@ app.register_blueprint(swaggerui_blueprint, url_prefix=SWAGGER_URL) -def _iso_utc_now() -> str: - return datetime.now(timezone.utc).isoformat(timespec='milliseconds').replace('+00:00', 'Z') - - def get_uptime() -> dict: delta = datetime.now(timezone.utc) - START_TIME seconds = int(delta.total_seconds()) @@ -81,12 +146,8 @@ def get_system_info() -> dict: def get_request_info() -> dict: - client_ip = request.headers.get('X-Forwarded-For', request.remote_addr or '') - if ',' in client_ip: - client_ip = client_ip.split(',')[0].strip() - return { - 'client_ip': client_ip, + 'client_ip': get_client_ip(), 'user_agent': request.headers.get('User-Agent', ''), 'method': request.method, 'path': 
request.path @@ -110,7 +171,8 @@ def get_endpoints() -> list[dict]: {'path': '/health', 'method': 'GET', 'description': 'Health check'} ] -#API + +# API OPENAPI_SPEC = { 'openapi': '3.0.3', 'info': { @@ -145,7 +207,42 @@ def get_endpoints() -> list[dict]: @app.before_request def log_request() -> None: - logger.debug('Request: %s %s', request.method, request.path) + g.request_started_at = time.perf_counter() + logger.info( + 'Incoming request', + extra={ + 'event': 'request_start', + 'method': request.method, + 'path': request.path, + 'client_ip': get_client_ip(), + 'user_agent': request.headers.get('User-Agent', ''), + } + ) + + +@app.after_request +def log_response(response): + started_at = getattr(g, 'request_started_at', None) + duration_ms = None + if started_at is not None: + duration_ms = round((time.perf_counter() - started_at) * 1000, 2) + + log_extra: dict[str, object] = { + 'event': 'request_end', + 'method': request.method, + 'path': request.path, + 'status_code': response.status_code, + 'client_ip': get_client_ip(), + } + if duration_ms is not None: + log_extra['duration_ms'] = duration_ms + + logger.log( + logging.ERROR if response.status_code >= 500 else logging.INFO, + 'Request completed', + extra=log_extra + ) + return response @app.route('/') @@ -185,6 +282,16 @@ def swagger_json(): @app.errorhandler(404) def not_found(error): + logger.warning( + 'Endpoint not found', + extra={ + 'event': 'http_404', + 'method': request.method, + 'path': request.path, + 'client_ip': get_client_ip(), + 'status_code': 404, + } + ) return jsonify({ 'error': 'Not Found', 'message': 'Endpoint does not exist' @@ -193,6 +300,27 @@ def not_found(error): @app.errorhandler(500) def internal_error(error): + original_error = getattr(error, 'original_exception', None) + extra = { + 'event': 'http_500', + 'method': request.method, + 'path': request.path, + 'client_ip': get_client_ip(), + 'status_code': 500, + } + + if original_error is not None: + logger.exception( + 'Unhandled 
application exception', + exc_info=(type(original_error), original_error, original_error.__traceback__), + extra=extra + ) + else: + logger.error( + 'Internal server error', + extra=extra + ) + return jsonify({ 'error': 'Internal Server Error', 'message': 'An unexpected error occurred' diff --git a/Lab-1/app_python/docs/screenshots/lab_05_1.png b/Lab-1/app_python/docs/screenshots/lab_05_1.png new file mode 100644 index 0000000000..5248e51b23 Binary files /dev/null and b/Lab-1/app_python/docs/screenshots/lab_05_1.png differ diff --git a/Lab-1/app_python/docs/screenshots/lab_05_2.png b/Lab-1/app_python/docs/screenshots/lab_05_2.png new file mode 100644 index 0000000000..a7a45e218f Binary files /dev/null and b/Lab-1/app_python/docs/screenshots/lab_05_2.png differ diff --git a/README.md b/README.md index 371d51f456..cdc48abecf 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ [![Labs](https://img.shields.io/badge/Labs-18-blue)](#labs) [![Exam](https://img.shields.io/badge/Exam-Optional-green)](#exam-alternative) [![Duration](https://img.shields.io/badge/Duration-18%20Weeks-lightgrey)](#course-roadmap) +[![Ansible Deployment](https://github.com/Linktur/DevOps-Core-Course/actions/workflows/ansible-deploy.yml/badge.svg)](https://github.com/Linktur/DevOps-Core-Course/actions/workflows/ansible-deploy.yml) Master **production-grade DevOps practices** through hands-on labs. Build, containerize, deploy, monitor, and scale applications using industry-standard tools. 
diff --git a/ansible/.ansible-lint b/ansible/.ansible-lint new file mode 100644 index 0000000000..f452336661 --- /dev/null +++ b/ansible/.ansible-lint @@ -0,0 +1,4 @@ +--- +skip_list: + - key-order + - var-naming diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg new file mode 100644 index 0000000000..56ea457fc1 --- /dev/null +++ b/ansible/ansible.cfg @@ -0,0 +1,12 @@ +[defaults] +inventory = inventory/hosts.ini +roles_path = roles +host_key_checking = False +remote_user = ubuntu +retry_files_enabled = False +interpreter_python = auto_silent + +[privilege_escalation] +become = True +become_method = sudo +become_user = root diff --git a/ansible/docs/LAB05.md b/ansible/docs/LAB05.md new file mode 100644 index 0000000000..db3dc58d4f --- /dev/null +++ b/ansible/docs/LAB05.md @@ -0,0 +1,295 @@ +# LAB05 - Ansible Fundamentals + +## 1. Architecture Overview + +- **Ansible version:** 2.16+ (expected by lab, verify with `ansible --version`) +- **Target VM:** local VM from LAB04 (`10.241.1.215`) +- **Target OS:** Debian 13 (role logic also supports Ubuntu) +- **Project structure:** role-based layout (`common`, `docker`, `app_deploy`) with separate playbooks for provisioning and deploy. + +Why roles instead of one monolithic playbook: +- roles isolate responsibilities by domain (base system, Docker, app deployment); +- variables, handlers and tasks stay reusable between labs/environments; +- support and debugging are easier because each role is independent. + +## 2. Roles Documentation + +### Role: `common` +- **Purpose:** base server preparation (apt cache, common packages, timezone). +- **Variables:** + - `common_packages` - list of base packages. + - `common_timezone` - desired timezone (`UTC` by default). + - `common_apt_cache_valid_time` - apt cache TTL. +- **Handlers:** none. +- **Dependencies:** none. + +### Role: `docker` +- **Purpose:** install and configure Docker Engine via official Docker repository. 
+- **Variables:** + - `docker_user` - user added to `docker` group. + - `docker_packages` - Docker related packages. + - `docker_repo_distribution`, `docker_apt_release` - distro/release mapping for repo URL. + - `docker_architecture_map` - architecture mapping for apt repo. +- **Handlers:** + - `restart docker` - restarts Docker service when repository/packages/key change. +- **Dependencies:** none. + +### Role: `app_deploy` +- **Purpose:** login to Docker Hub, pull image, recreate container when needed, verify app health. +- **Variables:** + - `dockerhub_username`, `dockerhub_password` - credentials from Vault. + - `docker_image`, `docker_image_tag` - deployment image settings. + - `app_container_name`, `app_port`, `app_container_port`, `app_restart_policy`. + - `app_env`, `app_healthcheck_path`. +- **Handlers:** + - `restart application container` - restarts app container if image pull triggered handler and container existed. +- **Dependencies:** Docker must be installed first (`docker` role). + +## 3. 
Idempotency Demonstration + +Run from `ansible/`: + +```bash +ansible-playbook playbooks/provision.yml +ansible-playbook playbooks/provision.yml +``` + +Paste output snippets below: + +### First run (`provision.yml`) +```text +PLAY [Provision web servers] *************************************************** + +TASK [Gathering Facts] ********************************************************* +ok: [web-01] + +TASK [common : Update apt cache] *********************************************** +ok: [web-01] + +TASK [common : Install common packages] **************************************** +ok: [web-01] + +TASK [common : Read current timezone] ****************************************** +ok: [web-01] + +TASK [common : Set timezone] *************************************************** +skipping: [web-01] + +TASK [docker : Update apt cache] *********************************************** +ok: [web-01] + +TASK [docker : Install Docker prerequisites] *********************************** +ok: [web-01] + +TASK [docker : Ensure Docker keyring directory exists] ************************* +ok: [web-01] + +TASK [docker : Download Docker GPG key] **************************************** +ok: [web-01] + +TASK [docker : Add Docker apt repository] ************************************** +ok: [web-01] + +TASK [docker : Install Docker packages] **************************************** +ok: [web-01] + +TASK [docker : Ensure Docker service is enabled and running] ******************* +ok: [web-01] + +TASK [docker : Add user to docker group] *************************************** +ok: [web-01] + +PLAY RECAP ********************************************************************* +web-01 : ok=12 changed=0 unreachable=0 failed=0 skipped=1 rescued=0 ignored=0 +``` + +### Second run (`provision.yml`) +```text +PLAY [Provision web servers] *************************************************** + +TASK [Gathering Facts] ********************************************************* +ok: [web-01] + +TASK [common : 
Update apt cache] *********************************************** +ok: [web-01] + +TASK [common : Install common packages] **************************************** +ok: [web-01] + +TASK [common : Read current timezone] ****************************************** +ok: [web-01] + +TASK [common : Set timezone] *************************************************** +skipping: [web-01] + +TASK [docker : Update apt cache] *********************************************** +ok: [web-01] + +TASK [docker : Install Docker prerequisites] *********************************** +ok: [web-01] + +TASK [docker : Ensure Docker keyring directory exists] ************************* +ok: [web-01] + +TASK [docker : Download Docker GPG key] **************************************** +ok: [web-01] + +TASK [docker : Add Docker apt repository] ************************************** +ok: [web-01] + +TASK [docker : Install Docker packages] **************************************** +ok: [web-01] + +TASK [docker : Ensure Docker service is enabled and running] ******************* +ok: [web-01] + +TASK [docker : Add user to docker group] *************************************** +ok: [web-01] + +PLAY RECAP ********************************************************************* +web-01 : ok=12 changed=0 unreachable=0 failed=0 skipped=1 rescued=0 ignored=0 +``` + +Analysis: +- first run should show many `changed` tasks because packages/repos/services are applied first time; +- second run should be mostly `ok` because desired state already matches actual state; +- this is achieved by stateful modules (`apt`, `service`, `user`, `docker_container`) and conditional recreation logic. + +## 4. Ansible Vault Usage + +Sensitive variables are stored in encrypted `group_vars/all.yml`. 
+ +Create file: + +```bash +cd ansible +ansible-vault create group_vars/all.yml +``` + +Use this content inside Vault file: + +```yaml +--- +dockerhub_username: "your-dockerhub-username" +dockerhub_password: "your-dockerhub-access-token" + +app_name: "devops-lab2" +docker_image: "{{ dockerhub_username }}/{{ app_name }}" +docker_image_tag: "latest" +app_port: 5000 +app_container_port: 5000 +app_container_name: "{{ app_name }}" +``` + +Password strategy: +- use `--ask-vault-pass` for manual runs; +- optional: store password in `.vault_pass` locally and keep it out of git. + +Why Vault is important: +- secrets can be committed safely in encrypted form; +- prevents plaintext credential leakage in repository history. + +## 5. Deployment Verification + +Run deploy: + +```bash +cd ansible +ansible-galaxy collection install -r requirements.yml +ansible-playbook playbooks/deploy.yml --ask-vault-pass +ansible webservers -a "docker ps" +curl http://10.241.1.215:5000/health +curl http://10.241.1.215:5000/ +``` + +Paste output snippets: + +### `deploy.yml` output +```text +PLAY [Deploy application] ****************************************************** + +TASK [Gathering Facts] ********************************************************* +ok: [web-01] + +TASK [app_deploy : Validate required Docker Hub credentials] ******************* +ok: [web-01] => { + "changed": false, + "msg": "All assertions passed" +} + +TASK [app_deploy : Read current container information] ************************* +ok: [web-01] + +TASK [app_deploy : Log in to Docker Hub] *************************************** +ok: [web-01] + +TASK [app_deploy : Pull application image] ************************************* +ok: [web-01] + +TASK [app_deploy : Stop existing container when redeploy is required] ********** +skipping: [web-01] + +TASK [app_deploy : Remove old container when redeploy is required] ************* +skipping: [web-01] + +TASK [app_deploy : Run application container] 
********************************** +changed: [web-01] + +TASK [app_deploy : Wait for application port] ********************************** +ok: [web-01] + +TASK [app_deploy : Verify health endpoint] ************************************* +ok: [web-01] + +PLAY RECAP ********************************************************************* +web-01 : ok=8 changed=1 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0 +``` + +### `docker ps` output +```text +web-01 | CHANGED | rc=0 >> +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +c10a9b0e7565 linktur/devops-lab2:v1 "python app.py" 27 minutes ago Up 27 minutes 0.0.0.0:5000->5000/tcp, [::]:5000->5000/tcp devops-lab2 +``` + +### Health checks +```text +curl http://10.241.1.215:5000/health +{"status":"healthy","timestamp":"2026-03-13T20:51:31.374Z","uptime_seconds":4768} + +curl http://10.241.1.215:5000/ +{"endpoints":[{"description":"Service information","method":"GET","path":"/"},{"description":"Health check","method":"GET","path":"/health"}],"request":{"client_ip":"10.241.1.148","method":"GET","path":"/","user_agent":"curl/7.81.0"},"runtime":{"current_time":"2026-03-13T20:51:31.383Z","timezone":"UTC","uptime_human":"1 hours, 19 minutes","uptime_seconds":4768},"service":{"description":"DevOps course info service","framework":"Flask","name":"devops-info-service","version":"1.0.0"},"system":{"architecture":"x86_64","cpu_count":1,"hostname":"c10a9b0e7565","platform":"Linux","platform_version":"Debian GNU/Linux 13 (trixie)","python_version":"3.13.12"}} +``` + +### Handler execution +```text +# if handler ran, paste TASK [app_deploy : restart application container] lines +``` + +## 6. Key Decisions + +**Why roles instead of plain playbooks?** +Roles isolate logic and keep playbooks thin. This gives clearer boundaries between provisioning and deployment and makes future changes safer. + +**How do roles improve reusability?** +Each role can be reused in other environments or combined with other playbooks. 
Variable defaults make behavior configurable without editing task code. + +**What makes a task idempotent?** +A task is idempotent when repeated runs converge to same state without extra changes. Using declarative modules (`state: present/started`) and conditions avoids unnecessary mutations. + +**How do handlers improve efficiency?** +Handlers run only when notified by changed tasks, so services are not restarted on every run. This reduces downtime and keeps runs predictable. + +**Why is Ansible Vault necessary?** +Vault protects secrets in version control and CI logs. It allows collaboration while keeping Docker Hub credentials encrypted. + +## 7. Challenges (Optional) + +- `ansible` may not be preinstalled on control node: install in WSL or Linux before running. +- Docker repo can require distro-specific release names; override `docker_apt_release` if needed. +- Ensure VM firewall allows `22` and `5000` from your workstation. + +## 8. Screenshots + +![Lab 05 Screenshot 1](../../Lab-1/app_python/docs/screenshots/lab_05_1.png) +![Lab 05 Screenshot 2](../../Lab-1/app_python/docs/screenshots/lab_05_2.png) diff --git a/ansible/docs/LAB06.md b/ansible/docs/LAB06.md new file mode 100644 index 0000000000..e8edf1e733 --- /dev/null +++ b/ansible/docs/LAB06.md @@ -0,0 +1,119 @@ +## 1. Overview + +In this lab I improved Ansible project from Lab05. + +I did: +- blocks, rescue, always +- tags for selective run +- Docker Compose deploy +- wipe logic (safe delete) +- GitHub Actions workflow for Ansible + +Main files: +- `ansible/roles/common/tasks/main.yml` +- `ansible/roles/docker/tasks/main.yml` +- `ansible/roles/web_app/tasks/main.yml` +- `ansible/roles/web_app/tasks/wipe.yml` +- `ansible/roles/web_app/templates/docker-compose.yml.j2` +- `.github/workflows/ansible-deploy.yml` + +## 2. 
Blocks and Tags + +I added blocks and tags in roles: + +- `common` role: tags `packages`, `users` +- `docker` role: tags `docker_install`, `docker_config` +- role-level tags: `common`, `docker`, `web_app` + +List tags result: +- `common, docker, docker_config, docker_install, packages, users` + +Selective run tests: +- `--tags docker` works +- `--skip-tags common` works + +## 3. Docker Compose Migration + +I renamed role `app_deploy` to `web_app`. + +I changed deploy from `docker_container` to Docker Compose (`docker_compose_v2`). + +I added: +- compose template with variables +- role dependency `web_app -> docker` +- health check after deploy + +## 4. Wipe Logic + +Safety logic: +- variable: `web_app_wipe` (default `false`) +- tag: `web_app_wipe` + +Behavior: +- normal deploy: wipe tasks are skipped +- wipe-only command: removes app files and containers +- clean reinstall: wipe first, then deploy + +## 5. CI/CD + +I created workflow: +- file: `.github/workflows/ansible-deploy.yml` +- jobs: `lint` and `deploy` +- deploy uses SSH + secrets +- workflow has verification step with `curl` + +I also added workflow badge in `README.md`. + +## 6. 
Test Results (from terminal) + +### Provision with tags +- `ansible-playbook playbooks/provision.yml --tags docker` +- Result: `ok=9 changed=0 failed=0` + +### Deploy run 1 +- `ansible-playbook playbooks/deploy.yml --vault-id @prompt` +- Result: `ok=22 changed=2 failed=0` + +### Deploy run 2 (idempotency) +- `ansible-playbook playbooks/deploy.yml --vault-id @prompt` +- Result: `ok=21 changed=0 failed=0` + +### Wipe only +- `ansible-playbook playbooks/deploy.yml --vault-id @prompt -e "web_app_wipe=true" --tags web_app_wipe` +- Result: `ok=8 changed=3 failed=0` + +### Clean reinstall +- `ansible-playbook playbooks/deploy.yml --vault-id @prompt -e "web_app_wipe=true"` +- Result: `ok=25 changed=3 failed=0` + +### Safety check (tag only, variable false) +- `ansible-playbook playbooks/deploy.yml --vault-id @prompt --tags web_app_wipe` +- Result: `ok=3 changed=0 skipped=6 failed=0` + +### Service checks +- `curl http://10.241.1.215:5000/` -> app returns JSON +- `curl http://10.241.1.215:5000/health` -> `{"status":"healthy", ...}` + +### Screenshot +![Lab06 Ansible result](../../screenshots/lab06_ans.png) +![Lab06 Ansible lint](../../screenshots/lint_ans.png) + +## 7. Simple Research Answers + +1. **Why variable + tag for wipe?** + For double safety. It is harder to delete app by mistake. + +2. **Why wipe before deploy?** + So clean reinstall works in one command. + +3. **Can we use Vault vars in templates?** + Yes, Vault vars work like normal Ansible vars after decrypt. + +4. **`restart: always` vs `unless-stopped`?** + `always` always tries restart. `unless-stopped` does not restart container stopped by user. + +## 8. Notes + +- Warning about world-writable directory appears because project is in `/mnt/c/...`. +- It does not block lab execution. +- For cleaner setup, project can be moved to Linux FS (`~/...`). 
diff --git a/ansible/group_vars/all.yml.bak b/ansible/group_vars/all.yml.bak new file mode 100644 index 0000000000..0a12339d33 --- /dev/null +++ b/ansible/group_vars/all.yml.bak @@ -0,0 +1,19 @@ +$ANSIBLE_VAULT;1.1;AES256 +30616337323935636461353938633536396535653761663466316534616337313134626364393330 +3330356536366663656661633739643565343039363235340a653039626663643038653432613430 +61613166313238666165343734656439373935343131316566633131656331336263636236623766 +3334356230656133370a366630363037346365393533643566643266356131326336323061383131 +37646232343165373466376465643432663231386463393264323030623938306438323761626265 +38346235306230353965323530323330356266633132383662383436636338326466643363653537 +34313562663031373863333035623266643539386532366439356166306462323932363661303230 +31616232383461623334376538376535643966333837303839333462636661363130336433666366 +63663939623562616434383839386235313064666662626435633561653431343137313461663363 +31303264383430393030303533666163646462323234333966646336316161653665633930376635 +33636566656237373833643664343564663037376666323438613230643638343439313930303632 +38393834643462623834383762623138333532616566393334316262303765343464323232613934 +36336664316562633332613837616564343564353861613761666234383530356632393337643836 +39626530303966336235626231643739333366306161363962633033373362666565393337343232 +61323533346365333863333530356164373832313938656161663565383731326230636431323739 +65323861356434363630626231303161623239376561373937666231373962666533373439333438 +66373038613066663861353635333430393834633335356630653539386532343437623231306664 +6665363934336435303337306333373562306637383130643439 diff --git a/ansible/group_vars/all.yml.example b/ansible/group_vars/all.yml.example new file mode 100644 index 0000000000..ed2ec672cf --- /dev/null +++ b/ansible/group_vars/all.yml.example @@ -0,0 +1,24 @@ +--- +# Copy this file to group_vars/all.yml and create it with: +# ansible-vault create group_vars/all.yml +# +# Example 
content for encrypted file: +dockerhub_username: "your-dockerhub-username" +dockerhub_password: "your-dockerhub-access-token" + +app_name: "devops-lab2" +docker_image: "{{ dockerhub_username }}/{{ app_name }}" +docker_tag: "latest" + +app_port: 5000 # Host port +app_internal_port: 5000 # Container port +compose_project_dir: "/opt/{{ app_name }}" + +# Wipe safety switch (used with --tags web_app_wipe) +web_app_wipe: false + +# Monitoring role (Lab07 bonus) +monitoring_grafana_admin_user: "admin" +monitoring_grafana_admin_password: "replace-with-strong-password" +# monitoring_project_dir: "/opt/monitoring" +# monitoring_app_enabled: true diff --git a/ansible/inventory/hosts.ini b/ansible/inventory/hosts.ini new file mode 100644 index 0000000000..dfeebb20fa --- /dev/null +++ b/ansible/inventory/hosts.ini @@ -0,0 +1,6 @@ +[webservers] +# Replace values below with your VM details from Lab 4 +web-01 ansible_host=10.241.1.215 ansible_user=loshara ansible_ssh_private_key_file=~/.ssh/id_rsa + +[webservers:vars] +ansible_python_interpreter=/usr/bin/python3 diff --git a/ansible/playbooks/deploy-monitoring.yml b/ansible/playbooks/deploy-monitoring.yml new file mode 100644 index 0000000000..b7989840bc --- /dev/null +++ b/ansible/playbooks/deploy-monitoring.yml @@ -0,0 +1,22 @@ +--- +- name: Deploy monitoring stack + hosts: webservers + become: true + + pre_tasks: + - name: Check whether local group_vars/all.yml exists + ansible.builtin.stat: + path: "{{ playbook_dir }}/../group_vars/all.yml" + register: monitoring_local_group_vars + delegate_to: localhost + become: false + + - name: Load local vaulted variables when available + ansible.builtin.include_vars: + file: "{{ playbook_dir }}/../group_vars/all.yml" + when: monitoring_local_group_vars.stat.exists + + roles: + - role: monitoring + tags: + - monitoring diff --git a/ansible/playbooks/deploy.yml b/ansible/playbooks/deploy.yml new file mode 100644 index 0000000000..19eae420a5 --- /dev/null +++ b/ansible/playbooks/deploy.yml 
@@ -0,0 +1,22 @@ +--- +- name: Deploy application + hosts: webservers + become: true + + pre_tasks: + - name: Check whether local group_vars/all.yml exists + ansible.builtin.stat: + path: "{{ playbook_dir }}/../group_vars/all.yml" + register: deploy_local_group_vars + delegate_to: localhost + become: false + + - name: Load local vaulted variables when available + ansible.builtin.include_vars: + file: "{{ playbook_dir }}/../group_vars/all.yml" + when: deploy_local_group_vars.stat.exists + + roles: + - role: web_app + tags: + - web_app diff --git a/ansible/playbooks/provision.yml b/ansible/playbooks/provision.yml new file mode 100644 index 0000000000..6334c412cc --- /dev/null +++ b/ansible/playbooks/provision.yml @@ -0,0 +1,12 @@ +--- +- name: Provision web servers + hosts: webservers + become: true + + roles: + - role: common + tags: + - common + - role: docker + tags: + - docker diff --git a/ansible/playbooks/site.yml b/ansible/playbooks/site.yml new file mode 100644 index 0000000000..1138ac0748 --- /dev/null +++ b/ansible/playbooks/site.yml @@ -0,0 +1,6 @@ +--- +- name: Run Provision Playbook + import_playbook: provision.yml + +- name: Run Deploy Playbook + import_playbook: deploy.yml diff --git a/ansible/requirements.yml b/ansible/requirements.yml new file mode 100644 index 0000000000..660f775816 --- /dev/null +++ b/ansible/requirements.yml @@ -0,0 +1,3 @@ +--- +collections: + - name: community.docker diff --git a/ansible/roles/common/defaults/main.yml b/ansible/roles/common/defaults/main.yml new file mode 100644 index 0000000000..086636f92b --- /dev/null +++ b/ansible/roles/common/defaults/main.yml @@ -0,0 +1,17 @@ +--- +common_apt_cache_valid_time: 3600 + +common_packages: + - python3-pip + - curl + - git + - vim + - htop + - ca-certificates + - gnupg + - lsb-release + - unzip + +common_timezone: "UTC" +common_manage_user: true +common_user_name: "{{ ansible_user | default('ubuntu') }}" diff --git a/ansible/roles/common/tasks/main.yml 
b/ansible/roles/common/tasks/main.yml new file mode 100644 index 0000000000..23ba41a223 --- /dev/null +++ b/ansible/roles/common/tasks/main.yml @@ -0,0 +1,59 @@ +--- +- name: Install common packages and configure timezone + block: + - name: Update apt cache + ansible.builtin.apt: + update_cache: true + cache_valid_time: "{{ common_apt_cache_valid_time }}" + lock_timeout: 600 + + - name: Install common packages + ansible.builtin.apt: + name: "{{ common_packages }}" + state: present + lock_timeout: 600 + + - name: Read current timezone + ansible.builtin.command: timedatectl show --property=Timezone --value + register: common_current_timezone + changed_when: false + + - name: Set timezone + ansible.builtin.command: "timedatectl set-timezone {{ common_timezone }}" + when: common_current_timezone.stdout != common_timezone + changed_when: true + rescue: + - name: Recover apt metadata after failed package preparation + ansible.builtin.apt: + update_cache: true + force_apt_get: true + changed_when: false + + - name: Retry apt cache update after recovery + ansible.builtin.apt: + update_cache: true + cache_valid_time: "{{ common_apt_cache_valid_time }}" + lock_timeout: 600 + always: + - name: Record common role completion marker + ansible.builtin.lineinfile: + path: /tmp/ansible-common-role.log + line: "common role completed" + create: true + mode: "0644" + become: true + tags: + - packages + +- name: Ensure automation user exists + block: + - name: Create or update managed user + ansible.builtin.user: + name: "{{ common_user_name }}" + state: present + create_home: true + shell: /bin/bash + when: common_manage_user | bool + become: true + tags: + - users diff --git a/ansible/roles/docker/defaults/main.yml b/ansible/roles/docker/defaults/main.yml new file mode 100644 index 0000000000..10a3bb0332 --- /dev/null +++ b/ansible/roles/docker/defaults/main.yml @@ -0,0 +1,29 @@ +--- +docker_apt_cache_valid_time: 3600 +docker_user: "{{ ansible_user | default('ubuntu') }}" + 
+docker_prerequisite_packages: + - ca-certificates + - curl + - gnupg + +docker_apt_keyring_dir: /etc/apt/keyrings +docker_apt_keyring_file: /etc/apt/keyrings/docker.asc +docker_gpg_key_url: "https://download.docker.com/linux/{{ docker_repo_distribution }}/gpg" + +docker_repo_distribution: >- + {{ 'ubuntu' if ansible_distribution | lower == 'ubuntu' else 'debian' }} +docker_apt_release: "{{ ansible_distribution_release }}" + +docker_architecture_map: + x86_64: amd64 + aarch64: arm64 + armv7l: armhf + +docker_packages: + - docker-ce + - docker-ce-cli + - containerd.io + - docker-buildx-plugin + - docker-compose-plugin + - python3-docker diff --git a/ansible/roles/docker/handlers/main.yml b/ansible/roles/docker/handlers/main.yml new file mode 100644 index 0000000000..07aa0eb290 --- /dev/null +++ b/ansible/roles/docker/handlers/main.yml @@ -0,0 +1,5 @@ +--- +- name: Restart docker + ansible.builtin.service: + name: docker + state: restarted diff --git a/ansible/roles/docker/tasks/main.yml b/ansible/roles/docker/tasks/main.yml new file mode 100644 index 0000000000..7619894feb --- /dev/null +++ b/ansible/roles/docker/tasks/main.yml @@ -0,0 +1,98 @@ +--- +- name: Install Docker engine + block: + - name: Update apt cache + ansible.builtin.apt: + update_cache: true + cache_valid_time: "{{ docker_apt_cache_valid_time }}" + lock_timeout: 600 + + - name: Install Docker prerequisites + ansible.builtin.apt: + name: "{{ docker_prerequisite_packages }}" + state: present + lock_timeout: 600 + + - name: Ensure Docker keyring directory exists + ansible.builtin.file: + path: "{{ docker_apt_keyring_dir }}" + state: directory + mode: "0755" + + - name: Download Docker GPG key + ansible.builtin.get_url: + url: "{{ docker_gpg_key_url }}" + dest: "{{ docker_apt_keyring_file }}" + mode: "0644" + notify: Restart docker + + - name: Add Docker apt repository + ansible.builtin.apt_repository: + repo: >- + deb [arch={{ docker_architecture_map.get(ansible_architecture, 'amd64') }} + signed-by={{ 
docker_apt_keyring_file }}] + https://download.docker.com/linux/{{ docker_repo_distribution }} + {{ docker_apt_release }} stable + state: present + filename: docker + notify: Restart docker + + - name: Install Docker packages + ansible.builtin.apt: + name: "{{ docker_packages }}" + state: present + update_cache: true + lock_timeout: 600 + notify: Restart docker + rescue: + - name: Wait before retrying Docker apt metadata updates + ansible.builtin.pause: + seconds: 10 + + - name: Retry apt cache update after Docker key/repository failure + ansible.builtin.apt: + update_cache: true + lock_timeout: 600 + + - name: Retry Docker GPG key download + ansible.builtin.get_url: + url: "{{ docker_gpg_key_url }}" + dest: "{{ docker_apt_keyring_file }}" + mode: "0644" + + - name: Re-apply Docker apt repository after retry + ansible.builtin.apt_repository: + repo: >- + deb [arch={{ docker_architecture_map.get(ansible_architecture, 'amd64') }} + signed-by={{ docker_apt_keyring_file }}] + https://download.docker.com/linux/{{ docker_repo_distribution }} + {{ docker_apt_release }} stable + state: present + filename: docker + + - name: Retry Docker package installation + ansible.builtin.apt: + name: "{{ docker_packages }}" + state: present + update_cache: true + lock_timeout: 600 + always: + - name: Ensure Docker service is enabled and running + ansible.builtin.service: + name: docker + state: started + enabled: true + become: true + tags: + - docker_install + +- name: Configure Docker access + block: + - name: Add user to docker group + ansible.builtin.user: + name: "{{ docker_user }}" + groups: docker + append: true + become: true + tags: + - docker_config diff --git a/ansible/roles/monitoring/defaults/main.yml b/ansible/roles/monitoring/defaults/main.yml new file mode 100644 index 0000000000..246eb907ae --- /dev/null +++ b/ansible/roles/monitoring/defaults/main.yml @@ -0,0 +1,68 @@ +--- +# Project directories +monitoring_project_dir: /opt/monitoring + +# Service versions 
+monitoring_loki_version: "3.0.0" +monitoring_promtail_version: "3.0.0" +monitoring_grafana_version: "12.3.1" + +# Ports +monitoring_loki_port: 3100 +monitoring_promtail_port: 9080 +monitoring_grafana_port: 3000 +monitoring_app_host_port: 8000 +monitoring_app_container_port: 5000 + +# Loki settings +monitoring_loki_schema_from: "2024-01-01" +monitoring_loki_schema_version: "v13" +monitoring_loki_retention_period: "168h" + +# App integration +monitoring_app_enabled: true +monitoring_app_name: "devops-python" +monitoring_app_image: "{{ dockerhub_username | default('linktur') }}/devops-lab2" +monitoring_app_tag: "{{ docker_image_tag | default('latest') }}" + +# Grafana security +monitoring_grafana_admin_user: "admin" +monitoring_grafana_admin_password: "ChangeMe_Lab07_Replace" +monitoring_grafana_datasource_uid: "loki" +monitoring_grafana_dashboard_uid: "lab07-logs" + +# Compose behavior +monitoring_compose_pull_policy: "always" +monitoring_compose_recreate: "auto" +monitoring_wait_timeout: 120 + +# Service resources +monitoring_resources: + loki: + limits: + cpus: "1.0" + memory: "1G" + reservations: + cpus: "0.25" + memory: "256M" + promtail: + limits: + cpus: "0.5" + memory: "512M" + reservations: + cpus: "0.10" + memory: "128M" + grafana: + limits: + cpus: "1.0" + memory: "1G" + reservations: + cpus: "0.25" + memory: "256M" + app: + limits: + cpus: "0.5" + memory: "512M" + reservations: + cpus: "0.10" + memory: "128M" diff --git a/ansible/roles/monitoring/meta/main.yml b/ansible/roles/monitoring/meta/main.yml new file mode 100644 index 0000000000..cb7d8e0460 --- /dev/null +++ b/ansible/roles/monitoring/meta/main.yml @@ -0,0 +1,3 @@ +--- +dependencies: + - role: docker diff --git a/ansible/roles/monitoring/tasks/deploy.yml b/ansible/roles/monitoring/tasks/deploy.yml new file mode 100644 index 0000000000..704ef11f57 --- /dev/null +++ b/ansible/roles/monitoring/tasks/deploy.yml @@ -0,0 +1,76 @@ +--- +- name: Log in to Docker Hub when credentials are available + 
community.docker.docker_login: + username: "{{ dockerhub_username }}" + password: "{{ dockerhub_password }}" + no_log: true + when: + - dockerhub_username is defined + - dockerhub_password is defined + - dockerhub_username | length > 0 + - dockerhub_password | length > 0 + +- name: Start monitoring stack with Docker Compose + community.docker.docker_compose_v2: + project_src: "{{ monitoring_project_dir }}" + pull: "{{ monitoring_compose_pull_policy }}" + recreate: "{{ monitoring_compose_recreate }}" + state: present + register: monitoring_compose_result + +- name: Wait for Loki port + ansible.builtin.wait_for: + host: "127.0.0.1" + port: "{{ monitoring_loki_port }}" + delay: 2 + timeout: "{{ monitoring_wait_timeout }}" + +- name: Wait for Grafana port + ansible.builtin.wait_for: + host: "127.0.0.1" + port: "{{ monitoring_grafana_port }}" + delay: 2 + timeout: "{{ monitoring_wait_timeout }}" + +- name: Wait for application port when enabled + ansible.builtin.wait_for: + host: "127.0.0.1" + port: "{{ monitoring_app_host_port }}" + delay: 2 + timeout: "{{ monitoring_wait_timeout }}" + when: monitoring_app_enabled | bool + +- name: Verify Loki readiness endpoint + ansible.builtin.uri: + url: "http://127.0.0.1:{{ monitoring_loki_port }}/ready" + method: GET + status_code: 200 + register: monitoring_loki_ready + retries: 20 + delay: 3 + until: monitoring_loki_ready.status == 200 + +- name: Verify Grafana health endpoint + ansible.builtin.uri: + url: "http://127.0.0.1:{{ monitoring_grafana_port }}/api/health" + method: GET + status_code: 200 + register: monitoring_grafana_health + retries: 20 + delay: 3 + until: monitoring_grafana_health.status == 200 + +- name: Verify Loki datasource in Grafana + ansible.builtin.uri: + url: >- + http://127.0.0.1:{{ monitoring_grafana_port }}/api/datasources/uid/{{ + monitoring_grafana_datasource_uid }} + method: GET + user: "{{ monitoring_grafana_admin_user }}" + password: "{{ monitoring_grafana_admin_password }}" + force_basic_auth: 
true + status_code: 200 + register: monitoring_loki_datasource + retries: 20 + delay: 3 + until: monitoring_loki_datasource.status == 200 diff --git a/ansible/roles/monitoring/tasks/main.yml b/ansible/roles/monitoring/tasks/main.yml new file mode 100644 index 0000000000..2c6e5fce33 --- /dev/null +++ b/ansible/roles/monitoring/tasks/main.yml @@ -0,0 +1,12 @@ +--- +- name: Include setup tasks + ansible.builtin.include_tasks: setup.yml + tags: + - monitoring_setup + - monitoring + +- name: Include deploy tasks + ansible.builtin.include_tasks: deploy.yml + tags: + - monitoring_deploy + - monitoring diff --git a/ansible/roles/monitoring/tasks/setup.yml b/ansible/roles/monitoring/tasks/setup.yml new file mode 100644 index 0000000000..ec18583cee --- /dev/null +++ b/ansible/roles/monitoring/tasks/setup.yml @@ -0,0 +1,68 @@ +--- +- name: Validate Grafana admin password baseline + ansible.builtin.assert: + that: + - monitoring_grafana_admin_password | length >= 12 + - monitoring_grafana_admin_password != "ChangeMe_Lab07_Replace" + fail_msg: >- + Set monitoring_grafana_admin_password to a strong secret + (recommended via ansible-vault in group_vars/all.yml). 
+ +- name: Ensure monitoring directory structure exists + ansible.builtin.file: + path: "{{ item }}" + state: directory + mode: "0755" + loop: + - "{{ monitoring_project_dir }}" + - "{{ monitoring_project_dir }}/loki" + - "{{ monitoring_project_dir }}/promtail" + - "{{ monitoring_project_dir }}/grafana" + - "{{ monitoring_project_dir }}/grafana/provisioning" + - "{{ monitoring_project_dir }}/grafana/provisioning/datasources" + - "{{ monitoring_project_dir }}/grafana/provisioning/dashboards" + - "{{ monitoring_project_dir }}/grafana/dashboards" + +- name: Render monitoring docker-compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ monitoring_project_dir }}/docker-compose.yml" + mode: "0644" + +- name: Render Loki config + ansible.builtin.template: + src: loki-config.yml.j2 + dest: "{{ monitoring_project_dir }}/loki/config.yml" + mode: "0644" + +- name: Render Promtail config + ansible.builtin.template: + src: promtail-config.yml.j2 + dest: "{{ monitoring_project_dir }}/promtail/config.yml" + mode: "0644" + +- name: Render Grafana datasource provisioning + ansible.builtin.template: + src: grafana-datasource.yml.j2 + dest: "{{ monitoring_project_dir }}/grafana/provisioning/datasources/loki.yml" + mode: "0644" + +- name: Render Grafana dashboard provider + ansible.builtin.template: + src: dashboard-provider.yml.j2 + dest: "{{ monitoring_project_dir }}/grafana/provisioning/dashboards/dashboard-provider.yml" + mode: "0644" + +- name: Render Grafana dashboard + ansible.builtin.template: + src: lab07-logs-dashboard.json.j2 + dest: "{{ monitoring_project_dir }}/grafana/dashboards/lab07-logs-dashboard.json" + mode: "0644" + +- name: Render monitoring environment file + ansible.builtin.copy: + dest: "{{ monitoring_project_dir }}/.env" + mode: "0600" + content: | + GRAFANA_ADMIN_USER={{ monitoring_grafana_admin_user }} + GRAFANA_ADMIN_PASSWORD={{ monitoring_grafana_admin_password }} diff --git 
a/ansible/roles/monitoring/templates/dashboard-provider.yml.j2 b/ansible/roles/monitoring/templates/dashboard-provider.yml.j2 new file mode 100644 index 0000000000..d8154d1496 --- /dev/null +++ b/ansible/roles/monitoring/templates/dashboard-provider.yml.j2 @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: + - name: "lab07-dashboards" + orgId: 1 + folder: "Lab 07" + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards diff --git a/ansible/roles/monitoring/templates/docker-compose.yml.j2 b/ansible/roles/monitoring/templates/docker-compose.yml.j2 new file mode 100644 index 0000000000..6f5e89da83 --- /dev/null +++ b/ansible/roles/monitoring/templates/docker-compose.yml.j2 @@ -0,0 +1,150 @@ +version: "3.8" + +services: + loki: + image: grafana/loki:{{ monitoring_loki_version }} + container_name: loki + command: + - "-config.file=/etc/loki/config.yml" + ports: + - "{{ monitoring_loki_port }}:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + networks: + - logging + labels: + logging: "promtail" + app: "devops-loki" + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 20s + deploy: + resources: + limits: + cpus: "{{ monitoring_resources.loki.limits.cpus }}" + memory: "{{ monitoring_resources.loki.limits.memory }}" + reservations: + cpus: "{{ monitoring_resources.loki.reservations.cpus }}" + memory: "{{ monitoring_resources.loki.reservations.memory }}" + + promtail: + image: grafana/promtail:{{ monitoring_promtail_version }} + container_name: promtail + command: + - "-config.file=/etc/promtail/config.yml" + ports: + - "{{ monitoring_promtail_port }}:9080" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - 
promtail-positions:/tmp + networks: + - logging + labels: + logging: "promtail" + app: "devops-promtail" + restart: unless-stopped + depends_on: + - loki + healthcheck: + test: ["CMD-SHELL", "grep -qa promtail /proc/1/cmdline || exit 1"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 20s + deploy: + resources: + limits: + cpus: "{{ monitoring_resources.promtail.limits.cpus }}" + memory: "{{ monitoring_resources.promtail.limits.memory }}" + reservations: + cpus: "{{ monitoring_resources.promtail.reservations.cpus }}" + memory: "{{ monitoring_resources.promtail.reservations.memory }}" + + grafana: + image: grafana/grafana:{{ monitoring_grafana_version }} + container_name: grafana + ports: + - "{{ monitoring_grafana_port }}:3000" + env_file: + - ./.env + environment: + GF_AUTH_ANONYMOUS_ENABLED: "false" + GF_USERS_ALLOW_SIGN_UP: "false" + GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER}" + GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD}" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro + - ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + networks: + - logging + labels: + logging: "promtail" + app: "devops-grafana" + restart: unless-stopped + depends_on: + - loki + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 30s + deploy: + resources: + limits: + cpus: "{{ monitoring_resources.grafana.limits.cpus }}" + memory: "{{ monitoring_resources.grafana.limits.memory }}" + reservations: + cpus: "{{ monitoring_resources.grafana.reservations.cpus }}" + memory: "{{ monitoring_resources.grafana.reservations.memory }}" +{% if monitoring_app_enabled | bool %} + + {{ monitoring_app_name }}: + image: "{{ monitoring_app_image }}:{{ monitoring_app_tag }}" + container_name: "{{ 
monitoring_app_name }}" + ports: + - "{{ monitoring_app_host_port }}:{{ monitoring_app_container_port }}" + environment: + PORT: "{{ monitoring_app_container_port }}" + DEBUG: "false" + networks: + - logging + labels: + logging: "promtail" + app: "devops-python" + restart: unless-stopped + depends_on: + - promtail + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:{{ monitoring_app_container_port }}/health', timeout=3)"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 20s + deploy: + resources: + limits: + cpus: "{{ monitoring_resources.app.limits.cpus }}" + memory: "{{ monitoring_resources.app.limits.memory }}" + reservations: + cpus: "{{ monitoring_resources.app.reservations.cpus }}" + memory: "{{ monitoring_resources.app.reservations.memory }}" +{% endif %} + +volumes: + loki-data: + promtail-positions: + grafana-data: + +networks: + logging: + name: logging diff --git a/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 b/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 new file mode 100644 index 0000000000..4e1c141681 --- /dev/null +++ b/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 @@ -0,0 +1,10 @@ +apiVersion: 1 + +datasources: + - name: Loki + uid: {{ monitoring_grafana_datasource_uid }} + type: loki + access: proxy + url: http://loki:3100 + isDefault: true + editable: false diff --git a/ansible/roles/monitoring/templates/lab07-logs-dashboard.json.j2 b/ansible/roles/monitoring/templates/lab07-logs-dashboard.json.j2 new file mode 100644 index 0000000000..7926f6a97d --- /dev/null +++ b/ansible/roles/monitoring/templates/lab07-logs-dashboard.json.j2 @@ -0,0 +1,287 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + 
"fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "{{ monitoring_grafana_datasource_uid }}" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "{{ monitoring_grafana_datasource_uid }}" + }, + "editorMode": "code", + "expr": "{app=~\"devops-.*\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Logs Table", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "{{ monitoring_grafana_datasource_uid }}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { 
+ "type": "loki", + "uid": "{{ monitoring_grafana_datasource_uid }}" + }, + "editorMode": "code", + "expr": "sum by (app) (rate({app=~\"devops-.*\"}[1m]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "{{ monitoring_grafana_datasource_uid }}" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 3, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "{{ monitoring_grafana_datasource_uid }}" + }, + "editorMode": "code", + "expr": "{app=~\"devops-.*\"} | json | level=\"ERROR\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "Error Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "{{ monitoring_grafana_datasource_uid }}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 4, + "options": { + "displayLabels": [ + "name", + "value", + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "{{ monitoring_grafana_datasource_uid }}" + }, + "editorMode": "code", + "expr": "sum by (level) (count_over_time({app=~\"devops-.*\"} | json [5m]))", + "instant": true, + "queryType": "instant", + "refId": 
"A" + } + ], + "title": "Log Level Distribution", + "type": "piechart" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "lab07", + "loki", + "logging" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Lab07 - Observability & Logging", + "uid": "{{ monitoring_grafana_dashboard_uid }}", + "version": 1, + "weekStart": "" +} diff --git a/ansible/roles/monitoring/templates/loki-config.yml.j2 b/ansible/roles/monitoring/templates/loki-config.yml.j2 new file mode 100644 index 0000000000..ea87f5d8d9 --- /dev/null +++ b/ansible/roles/monitoring/templates/loki-config.yml.j2 @@ -0,0 +1,45 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + path_prefix: /loki + replication_factor: 1 + ring: + kvstore: + store: inmemory + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + +schema_config: + configs: + - from: "{{ monitoring_loki_schema_from }}" + store: tsdb + object_store: filesystem + schema: {{ monitoring_loki_schema_version }} + index: + prefix: index_ + period: 24h + +storage_config: + tsdb_shipper: + active_index_directory: /loki/index + cache_location: /loki/index_cache + filesystem: + directory: /loki/chunks + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + delete_request_store: filesystem + +limits_config: + retention_period: {{ monitoring_loki_retention_period }} + +analytics: + reporting_enabled: false diff --git a/ansible/roles/monitoring/templates/promtail-config.yml.j2 b/ansible/roles/monitoring/templates/promtail-config.yml.j2 new file mode 100644 index 0000000000..fa7b6055e8 --- /dev/null +++ b/ansible/roles/monitoring/templates/promtail-config.yml.j2 @@ -0,0 +1,31 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: 
http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: label + values: + - logging=promtail + relabel_configs: + - source_labels: ['__meta_docker_container_name'] + regex: '/(.*)' + target_label: container + - source_labels: ['__meta_docker_container_label_app'] + target_label: app + - source_labels: ['__meta_docker_container_label_com_docker_compose_service'] + target_label: service + - source_labels: ['__meta_docker_container_id'] + target_label: container_id + pipeline_stages: + - docker: {} diff --git a/ansible/roles/web_app/defaults/main.yml b/ansible/roles/web_app/defaults/main.yml new file mode 100644 index 0000000000..809f9fde68 --- /dev/null +++ b/ansible/roles/web_app/defaults/main.yml @@ -0,0 +1,26 @@ +--- +# Application configuration +app_name: "devops-lab2" +app_port: 5000 +app_internal_port: "{{ app_container_port | default(5000) }}" +app_healthcheck_path: "/health" +app_wait_timeout: 90 +app_restart_policy: "unless-stopped" + +# Docker image configuration +docker_image: "{{ dockerhub_username }}/{{ app_name }}" +docker_tag: "{{ docker_image_tag | default('latest') }}" + +# Compose configuration +docker_compose_version: "3.8" +compose_project_dir: "/opt/{{ app_name }}" +web_app_compose_pull_policy: "always" +web_app_compose_recreate: "auto" +web_app_remove_image: false + +# Deployment safety controls +web_app_wipe: false + +# Environment variables passed to the application container. 
+app_env: + PORT: "{{ app_internal_port | string }}" diff --git a/ansible/roles/web_app/handlers/main.yml b/ansible/roles/web_app/handlers/main.yml new file mode 100644 index 0000000000..2e8449c4b3 --- /dev/null +++ b/ansible/roles/web_app/handlers/main.yml @@ -0,0 +1,8 @@ +--- +- name: Restart web application + community.docker.docker_compose_v2: + project_src: "{{ compose_project_dir }}" + state: present + recreate: always + pull: never + become: true diff --git a/ansible/roles/web_app/meta/main.yml b/ansible/roles/web_app/meta/main.yml new file mode 100644 index 0000000000..cc004b056f --- /dev/null +++ b/ansible/roles/web_app/meta/main.yml @@ -0,0 +1,7 @@ +--- +# Docker must be available before docker_compose_v2 can start the application stack. +dependencies: + - role: docker + tags: + - docker + - web_app diff --git a/ansible/roles/web_app/tasks/main.yml b/ansible/roles/web_app/tasks/main.yml new file mode 100644 index 0000000000..b0711b4bdb --- /dev/null +++ b/ansible/roles/web_app/tasks/main.yml @@ -0,0 +1,101 @@ +--- +# Wipe logic executes first, but only performs cleanup when web_app_wipe=true. +- name: Include wipe tasks + ansible.builtin.include_tasks: wipe.yml + tags: + - web_app_wipe + +- name: Validate required Docker Hub credentials + ansible.builtin.assert: + that: + - dockerhub_username is defined + - dockerhub_username | length > 0 + - dockerhub_password is defined + - dockerhub_password | length > 0 + fail_msg: >- + dockerhub_username/dockerhub_password are not set. + Create encrypted group_vars/all.yml with ansible-vault. 
+ tags: + - app_deploy + - compose + +- name: Deploy application with Docker Compose + block: + - name: Ensure compose project directory exists + ansible.builtin.file: + path: "{{ compose_project_dir }}" + state: directory + mode: "0755" + become: true + + - name: Render docker-compose configuration + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ compose_project_dir }}/docker-compose.yml" + mode: "0644" + become: true + + - name: Inspect existing container with target name + community.docker.docker_container_info: + name: "{{ app_name }}" + register: web_app_existing_container + failed_when: false + changed_when: false + become: true + + - name: Remove conflicting non-compose container + community.docker.docker_container: + name: "{{ app_name }}" + state: absent + force_kill: true + when: + - web_app_existing_container.exists | default(false) + - >- + (web_app_existing_container.container.Config.Labels['com.docker.compose.project'] | default('')) + != (compose_project_dir | basename) + become: true + + - name: Log in to Docker Hub + community.docker.docker_login: + username: "{{ dockerhub_username }}" + password: "{{ dockerhub_password }}" + no_log: true + + - name: Start or update application via Docker Compose + community.docker.docker_compose_v2: + project_src: "{{ compose_project_dir }}" + pull: "{{ web_app_compose_pull_policy }}" + recreate: "{{ web_app_compose_recreate }}" + state: present + register: web_app_compose_result + become: true + + - name: Wait for application port to become available + ansible.builtin.wait_for: + host: "127.0.0.1" + port: "{{ app_port }}" + delay: 2 + timeout: "{{ app_wait_timeout }}" + + - name: Verify health endpoint + ansible.builtin.uri: + url: "http://127.0.0.1:{{ app_port }}{{ app_healthcheck_path }}" + method: GET + status_code: 200 + register: app_health_result + retries: 10 + delay: 3 + until: app_health_result.status == 200 + rescue: + - name: Report deployment failure context + 
ansible.builtin.debug: + msg: >- + Docker Compose deployment failed for {{ app_name }} in + {{ compose_project_dir }}. Check docker compose logs on the target host. + + - name: Stop play when compose deployment failed + ansible.builtin.fail: + msg: "Deployment failed for {{ docker_image }}:{{ docker_tag }}. Verify image/tag exists in Docker Hub." + tags: + - app_deploy + - compose diff --git a/ansible/roles/web_app/tasks/wipe.yml b/ansible/roles/web_app/tasks/wipe.yml new file mode 100644 index 0000000000..559daf53de --- /dev/null +++ b/ansible/roles/web_app/tasks/wipe.yml @@ -0,0 +1,50 @@ +--- +- name: Check whether docker-compose.yml exists + ansible.builtin.stat: + path: "{{ compose_project_dir }}/docker-compose.yml" + register: web_app_compose_file + tags: + - web_app_wipe + +- name: Wipe web application deployment + block: + - name: Stop and remove compose services + community.docker.docker_compose_v2: + project_src: "{{ compose_project_dir }}" + state: absent + when: web_app_compose_file.stat.exists + become: true + + - name: Optionally remove application image + community.docker.docker_image: + name: "{{ docker_image }}:{{ docker_tag }}" + state: absent + force_absent: true + when: web_app_remove_image | bool + become: true + + - name: Remove legacy container with the same name + community.docker.docker_container: + name: "{{ app_name }}" + state: absent + force_kill: true + become: true + + - name: Remove docker-compose file + ansible.builtin.file: + path: "{{ compose_project_dir }}/docker-compose.yml" + state: absent + become: true + + - name: Remove application directory + ansible.builtin.file: + path: "{{ compose_project_dir }}" + state: absent + become: true + + - name: Confirm wipe completion + ansible.builtin.debug: + msg: "Application {{ app_name }} wiped successfully" + when: web_app_wipe | bool + tags: + - web_app_wipe diff --git a/ansible/roles/web_app/templates/docker-compose.yml.j2 b/ansible/roles/web_app/templates/docker-compose.yml.j2 new file 
mode 100644 index 0000000000..29e89358dc --- /dev/null +++ b/ansible/roles/web_app/templates/docker-compose.yml.j2 @@ -0,0 +1,17 @@ +services: + {{ app_name }}: + image: "{{ docker_image }}:{{ docker_tag }}" + container_name: "{{ app_name }}" + restart: "{{ app_restart_policy }}" + ports: + - "{{ app_port }}:{{ app_internal_port }}" +{% if app_env | length > 0 %} + environment: +{% for key, value in app_env.items() %} + {{ key }}: "{{ value }}" +{% endfor %} +{% endif %} + +networks: + default: + name: "{{ app_name }}-network" diff --git a/monitoring/.env.example b/monitoring/.env.example new file mode 100644 index 0000000000..b08bd90e55 --- /dev/null +++ b/monitoring/.env.example @@ -0,0 +1,2 @@ +GRAFANA_ADMIN_USER=admin +GRAFANA_ADMIN_PASSWORD=ChangeMe_Lab07_Replace diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml new file mode 100644 index 0000000000..53ba2ad99e --- /dev/null +++ b/monitoring/docker-compose.yml @@ -0,0 +1,150 @@ +version: "3.8" + +services: + loki: + image: grafana/loki:3.0.0 + container_name: loki + command: + - "-config.file=/etc/loki/config.yml" + ports: + - "3100:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + networks: + - logging + labels: + logging: "promtail" + app: "devops-loki" + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 20s + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + + promtail: + image: grafana/promtail:3.0.0 + container_name: promtail + command: + - "-config.file=/etc/promtail/config.yml" + ports: + - "9080:9080" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - promtail-positions:/tmp + networks: + - logging + labels: + 
logging: "promtail" + app: "devops-promtail" + restart: unless-stopped + depends_on: + - loki + healthcheck: + test: ["CMD-SHELL", "grep -qa promtail /proc/1/cmdline || exit 1"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 20s + deploy: + resources: + limits: + cpus: "0.5" + memory: 512M + reservations: + cpus: "0.10" + memory: 128M + + grafana: + image: grafana/grafana:12.3.1 + container_name: grafana + ports: + - "3000:3000" + env_file: + - ./.env + environment: + GF_AUTH_ANONYMOUS_ENABLED: "false" + GF_USERS_ALLOW_SIGN_UP: "false" + GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}" + GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD}" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro + - ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + networks: + - logging + labels: + logging: "promtail" + app: "devops-grafana" + restart: unless-stopped + depends_on: + - loki + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 30s + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + + app-python: + build: + context: ../Lab-1/app_python + image: devops-lab2:lab07 + container_name: app-python + ports: + - "8000:5000" + environment: + PORT: "5000" + DEBUG: "false" + networks: + - logging + labels: + logging: "promtail" + app: "devops-python" + restart: unless-stopped + depends_on: + - promtail + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health', timeout=3)"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 20s + deploy: + resources: + limits: + cpus: "0.5" + memory: 512M + reservations: + cpus: "0.10" + memory: 128M + +volumes: + loki-data: 
+ promtail-positions: + grafana-data: + +networks: + logging: + name: logging diff --git a/monitoring/docs/LAB07.md b/monitoring/docs/LAB07.md new file mode 100644 index 0000000000..1d2bd27d6a --- /dev/null +++ b/monitoring/docs/LAB07.md @@ -0,0 +1,276 @@ +# LAB07 - Observability & Logging with Loki Stack + +## 1. Architecture + +```text + +-------------------------+ + | Grafana | + | :3000 (Dashboards) | + +------------+------------+ + | + | LogQL queries + v + +-------------------------+ + | Loki | + | :3100 (Log Storage) | + +------------+------------+ + ^ + | /loki/api/v1/push + | + +------------+------------+ + | Promtail | + | :9080 + Docker SD | + +-----+-----------+-------+ + | | + Docker socket | | Container log files + | | + +------------+-----------+------------+ + | | + +--------+---------+ +--------+---------+ + | app-python:8000 | | grafana/loki/... | + | JSON application | | infra containers | + +------------------+ +------------------+ +``` + +Architecture notes: +- `Promtail` discovers containers via Docker socket and reads container log files. +- Only containers with label `logging=promtail` are scraped. +- `Loki` stores logs with TSDB schema `v13` on local filesystem volume. +- `Grafana` queries Loki via LogQL and shows both Explore and dashboard panels. + +## 2. Setup Guide + +1. Open `monitoring` directory. +2. Create secrets file: + +```bash +cd monitoring +cp .env.example .env +# set a strong password in .env +``` + +3. Start stack: + +```bash +docker compose up -d +docker compose ps +``` + +4. Verify endpoints: + +```bash +curl http://localhost:3100/ready +curl http://localhost:9080/targets +curl http://localhost:3000/api/health +curl http://localhost:8000/health +``` + +5. Open Grafana: `http://localhost:3000` and login with `.env` credentials. + +## 3. Configuration + +### Loki (`monitoring/loki/config.yml`) +- Uses `tsdb` + `filesystem` storage. +- Uses schema `v13`. +- Retention is set to `168h` (7 days). 
+- Compactor is enabled for retention cleanup. +- `schema_config.configs[].from` is set to `"2024-01-01"` for stable v13 schema activation. + +### Promtail (`monitoring/promtail/config.yml`) +- Sends logs to `http://loki:3100/loki/api/v1/push`. +- Uses `docker_sd_configs` for container discovery. +- Uses label filter `logging=promtail` to collect only tagged containers. +- Relabeling maps: + - container name to `container` + - app label to `app` + - compose service to `service` + +### Grafana provisioning +- Data source provisioned from `monitoring/grafana/provisioning/datasources/loki.yml`. +- Dashboard provider points to `monitoring/grafana/dashboards/`. +- Dashboard is preloaded from `lab07-logs-dashboard.json`. +- Anonymous access is disabled; login is required. + +## 4. Application Logging + +`Lab-1/app_python/app.py` now logs in JSON format using a custom `JSONFormatter`. + +Logged events: +- `startup` (app boot info) +- `request_start` (method/path/client/user-agent) +- `request_end` (status + request duration) +- `http_404` and `http_500` errors + +Example log: + +```json +{"timestamp":"2026-03-19T01:20:00.123Z","level":"INFO","logger":"devops-info-service","message":"Request completed","event":"request_end","method":"GET","path":"/health","status_code":200,"client_ip":"127.0.0.1","duration_ms":2.53} +``` + +## 5. Dashboard + +Dashboard file: `monitoring/grafana/dashboards/lab07-logs-dashboard.json` + +Panels: +1. Logs Table +Query: `{app=~"devops-.*"}` +2. Request Rate (time series) +Query: `sum by (app) (rate({app=~"devops-.*"}[1m]))` +3. Error Logs +Query: `{app=~"devops-.*"} | json | level="ERROR"` +4. Log Level Distribution (pie chart) +Query: `sum by (level) (count_over_time({app=~"devops-.*"} | json [5m]))` + +Additional useful LogQL examples: + +```logql +{app="devops-python"} +{app="devops-python"} |= "ERROR" +{app="devops-python"} | json | method="GET" +``` + +## 6. 
Production Config + +Implemented in `monitoring/docker-compose.yml`: +- Resource limits and reservations for all services (`deploy.resources`). +- Per-service limits: + - Loki: `cpus: 1.0`, `memory: 1G` + - Promtail: `cpus: 0.5`, `memory: 512M` + - Grafana: `cpus: 1.0`, `memory: 1G` + - app-python: `cpus: 0.5`, `memory: 512M` +- Grafana anonymous access is disabled: + - `GF_AUTH_ANONYMOUS_ENABLED=false` +- Admin credentials loaded from `.env` (`GRAFANA_ADMIN_PASSWORD`). +- Healthchecks for Loki, Promtail, Grafana, and app-python. +- Retention policy in Loki: `168h` (7 days). + +## 7. Testing + +Validation date: **2026-03-19** + +Command: + +```powershell +docker compose ps +``` + +Result: +- `app-python` - `Up ... (healthy)` +- `grafana` - `Up ... (healthy)` +- `loki` - `Up ... (healthy)` +- `promtail` - `Up ... (healthy)` + +Command: + +```powershell +docker compose logs app-python --tail=20 +``` + +Result: +- JSON logs confirmed. +- Structured fields observed: `timestamp`, `level`, `logger`, `message`, `event`, `method`, `path`, `status_code`, `client_ip`, `duration_ms`. + +Command: + +```powershell +curl http://localhost:3100/ready +``` + +Result: +- HTTP 200 +- Response body: `ready` + +Command: + +```powershell +curl http://localhost:3000/api/health +``` + +Result: +- HTTP 200 +- Grafana health response includes `database: ok` + +Command: + +```powershell +curl http://localhost:3100/loki/api/v1/label/app/values +``` + +Result: +- HTTP 200 +- Loki labels received: `devops-grafana`, `devops-loki`, `devops-promtail`, `devops-python` + +Command: + +```powershell +curl http://localhost:9080/targets +``` + +Result: +- HTTP 200 (Promtail targets UI HTML page) +- Ready targets visible for Docker job + +## 8. Challenges + +1. Docker label-based filtering +Promtail was configured to scrape only `logging=promtail` containers to reduce noise. + +2. Structured logs from Flask +A custom JSON formatter was added to keep logs machine-readable and searchable in LogQL. + +3. 
Repeatable setup +Grafana data source and dashboard were provisioned from files to avoid manual UI setup. + +4. Docker Compose warning +`docker compose` prints warning that `version` is obsolete in v2 CLI. This warning does not affect stack functionality. + +## 9. Bonus - Ansible Automation + +Added role and playbook: +- `ansible/roles/monitoring` +- `ansible/playbooks/deploy-monitoring.yml` + +Role behavior: +- creates monitoring directories on target host +- renders templated Loki/Promtail/Compose/Grafana provisioning files +- writes `.env` with Grafana admin credentials +- deploys stack with `community.docker.docker_compose_v2` +- waits for ports and health endpoints +- verifies Loki datasource exists in Grafana API + +Run: + +```bash +cd ansible +ansible-playbook playbooks/deploy-monitoring.yml --vault-id @prompt +ansible-playbook playbooks/deploy-monitoring.yml --vault-id @prompt +``` + +Expected second run: idempotent (`changed=0` for already converged state). + +## Evidence Checklist + +- `monitoring/docker-compose.yml` includes Loki/Promtail/Grafana + app. +- Loki data source is provisioned automatically. +- Python app logs are JSON. +- Dashboard with 4 panels is provisioned. +- Resource limits, health checks, and Grafana auth hardening are present. + +## 10. 
Screenshots + +### Docker stack health +![Docker services healthy](../../screenshots/lab07/DockerHealth.png) + +### Grafana Loki data source +![Grafana Loki datasource](../../screenshots/lab07/GrafanaLoki.png) + +### JSON logs from app-python +![App JSON logs](../../screenshots/lab07/logs.png) + +### LogQL query examples +![LogQL query 1](../../screenshots/lab07/Query1.png) +![LogQL query 2](../../screenshots/lab07/Query2.png) +![LogQL query 3](../../screenshots/lab07/Query3.png) + +### Dashboard with 4 panels +![Lab07 dashboard](../../screenshots/lab07/DashBoard.png) diff --git a/monitoring/grafana/dashboards/lab07-logs-dashboard.json b/monitoring/grafana/dashboards/lab07-logs-dashboard.json new file mode 100644 index 0000000000..2a624d695c --- /dev/null +++ b/monitoring/grafana/dashboards/lab07-logs-dashboard.json @@ -0,0 +1,289 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "{app=~\"devops-.*\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Logs Table", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "sum by (app) (rate({app=~\"devops-.*\"}[1m]))", + "legendFormat": "{{app}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 3, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "{app=~\"devops-.*\"} | json | level=\"ERROR\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "Error Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + "defaults": 
{ + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 4, + "options": { + "displayLabels": [ + "name", + "value", + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "sum by (level) (count_over_time({app=~\"devops-.*\"} | json [5m]))", + "instant": true, + "legendFormat": "{{level}}", + "queryType": "instant", + "refId": "A" + } + ], + "title": "Log Level Distribution", + "type": "piechart" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "lab07", + "loki", + "logging" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Lab07 - Observability & Logging", + "uid": "lab07-logs", + "version": 1, + "weekStart": "" +} diff --git a/monitoring/grafana/provisioning/dashboards/dashboard-provider.yml b/monitoring/grafana/provisioning/dashboards/dashboard-provider.yml new file mode 100644 index 0000000000..d8154d1496 --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/dashboard-provider.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: + - name: "lab07-dashboards" + orgId: 1 + folder: "Lab 07" + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards diff --git a/monitoring/grafana/provisioning/datasources/loki.yml b/monitoring/grafana/provisioning/datasources/loki.yml new file mode 100644 index 
0000000000..71041923b2 --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/loki.yml @@ -0,0 +1,10 @@ +apiVersion: 1 + +datasources: + - name: Loki + uid: loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: true + editable: false diff --git a/monitoring/loki/config.yml b/monitoring/loki/config.yml new file mode 100644 index 0000000000..d28ab89806 --- /dev/null +++ b/monitoring/loki/config.yml @@ -0,0 +1,45 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + path_prefix: /loki + replication_factor: 1 + ring: + kvstore: + store: inmemory + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + +schema_config: + configs: + - from: "2024-01-01" + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + tsdb_shipper: + active_index_directory: /loki/index + cache_location: /loki/index_cache + filesystem: + directory: /loki/chunks + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + delete_request_store: filesystem + +limits_config: + retention_period: 168h + +analytics: + reporting_enabled: false diff --git a/monitoring/promtail/config.yml b/monitoring/promtail/config.yml new file mode 100644 index 0000000000..fa7b6055e8 --- /dev/null +++ b/monitoring/promtail/config.yml @@ -0,0 +1,31 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: label + values: + - logging=promtail + relabel_configs: + - source_labels: ['__meta_docker_container_name'] + regex: '/(.*)' + target_label: container + - source_labels: ['__meta_docker_container_label_app'] + target_label: app + - source_labels: 
['__meta_docker_container_label_com_docker_compose_service'] + target_label: service + - source_labels: ['__meta_docker_container_id'] + target_label: container_id + pipeline_stages: + - docker: {} diff --git a/screenshots/lab06_ans.png b/screenshots/lab06_ans.png new file mode 100644 index 0000000000..d7b0555d9a Binary files /dev/null and b/screenshots/lab06_ans.png differ diff --git a/screenshots/lab07/DashBoard.png b/screenshots/lab07/DashBoard.png new file mode 100644 index 0000000000..e0091b3557 Binary files /dev/null and b/screenshots/lab07/DashBoard.png differ diff --git a/screenshots/lab07/DockerHealth.png b/screenshots/lab07/DockerHealth.png new file mode 100644 index 0000000000..09ce7a00ef Binary files /dev/null and b/screenshots/lab07/DockerHealth.png differ diff --git a/screenshots/lab07/GrafanaLoki.png b/screenshots/lab07/GrafanaLoki.png new file mode 100644 index 0000000000..99aff6f906 Binary files /dev/null and b/screenshots/lab07/GrafanaLoki.png differ diff --git a/screenshots/lab07/Query1.png b/screenshots/lab07/Query1.png new file mode 100644 index 0000000000..5c9387c51c Binary files /dev/null and b/screenshots/lab07/Query1.png differ diff --git a/screenshots/lab07/Query2.png b/screenshots/lab07/Query2.png new file mode 100644 index 0000000000..1f44bc25d0 Binary files /dev/null and b/screenshots/lab07/Query2.png differ diff --git a/screenshots/lab07/Query3.png b/screenshots/lab07/Query3.png new file mode 100644 index 0000000000..8ec69c154f Binary files /dev/null and b/screenshots/lab07/Query3.png differ diff --git a/screenshots/lab07/logs.png b/screenshots/lab07/logs.png new file mode 100644 index 0000000000..bc05302f90 Binary files /dev/null and b/screenshots/lab07/logs.png differ diff --git a/screenshots/lint_ans.png b/screenshots/lint_ans.png new file mode 100644 index 0000000000..2c3ef9546b Binary files /dev/null and b/screenshots/lint_ans.png differ