diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000000..2bd6acad7c --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 120 +exclude = .git,__pycache__,.venv,venv,app_python/.venv,app_python/venv diff --git a/.github/workflows/ansible-deploy.yml b/.github/workflows/ansible-deploy.yml new file mode 100644 index 0000000000..f662b6973c --- /dev/null +++ b/.github/workflows/ansible-deploy.yml @@ -0,0 +1,124 @@ +name: Ansible Deploy (Python App) + +on: + push: + branches: [main, master, lab06] + paths: + - 'ansible/vars/app_python.yml' + - 'ansible/playbooks/provision.yml' + - 'ansible/playbooks/deploy.yml' + - 'ansible/playbooks/deploy_python.yml' + - 'ansible/roles/common/**' + - 'ansible/roles/docker/**' + - 'ansible/roles/web_app/**' + - '.github/workflows/ansible-deploy.yml' + - '!ansible/docs/**' + pull_request: + branches: [main, master, lab06] + paths: + - 'ansible/**' + - '.github/workflows/ansible-deploy.yml' + workflow_dispatch: + +jobs: + lint: + name: Ansible Lint + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install Ansible tooling + run: | + python -m pip install --upgrade pip + pip install ansible-core ansible-lint + ansible-galaxy collection install community.docker community.general + + - name: Run ansible-lint + run: | + cd ansible + ansible-lint playbooks/*.yml roles/* + + deploy: + name: Deploy Python App + needs: lint + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install Ansible and collections + run: | + python -m pip install --upgrade pip + pip install ansible-core + ansible-galaxy collection install community.docker community.general + + - name: Configure SSH access + env: + SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + run: | 
+ set -euo pipefail + mkdir -p ~/.ssh + KEY="${SSH_PRIVATE_KEY:-}" + if [ -z "$KEY" ]; then + echo "SSH_PRIVATE_KEY secret is empty" >&2 + exit 1 + fi + + # Detect public key accidentally pasted into secret. + if printf '%s' "$KEY" | grep -qE '^(ssh-ed25519|ssh-rsa|ecdsa-sha2-) '; then + echo "SSH_PRIVATE_KEY looks like a public key. Paste private key block (BEGIN ... PRIVATE KEY)." >&2 + exit 1 + fi + + # Support both escaped newlines (\\n) and regular multiline secrets. + if printf '%s' "$KEY" | grep -q '\\n'; then + printf '%b' "$KEY" > ~/.ssh/id_ed25519 + else + printf '%s\n' "$KEY" > ~/.ssh/id_ed25519 + fi + tr -d '\r' < ~/.ssh/id_ed25519 > ~/.ssh/id_ed25519.clean + mv ~/.ssh/id_ed25519.clean ~/.ssh/id_ed25519 + + # Fallback: support base64-encoded private key secret. + if ! grep -Eq 'BEGIN (OPENSSH|RSA|EC|DSA) PRIVATE KEY' ~/.ssh/id_ed25519; then + if printf '%s' "$KEY" | base64 -d > ~/.ssh/id_ed25519 2>/dev/null && grep -Eq 'BEGIN (OPENSSH|RSA|EC|DSA) PRIVATE KEY' ~/.ssh/id_ed25519; then + : + else + echo "SSH_PRIVATE_KEY has invalid format. Provide raw private key block or base64-encoded private key." 
>&2 + exit 1 + fi + fi + + chmod 600 ~/.ssh/id_ed25519 + ssh-keyscan -H ${{ secrets.VM_HOST }} >> ~/.ssh/known_hosts + chmod 644 ~/.ssh/known_hosts + + - name: Run deployment playbook + env: + ANSIBLE_VAULT_PASSWORD: ${{ secrets.ANSIBLE_VAULT_PASSWORD }} + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + run: | + cd ansible + printf "%s" "$ANSIBLE_VAULT_PASSWORD" > /tmp/vault_pass + ansible-playbook playbooks/deploy_python.yml \ + -i inventory/hosts.ini \ + --vault-password-file /tmp/vault_pass + rm -f /tmp/vault_pass + + - name: Verify deployment + run: | + sleep 10 + curl -fsS http://${{ secrets.VM_HOST }}:5000 + curl -fsS http://${{ secrets.VM_HOST }}:5000/health diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml new file mode 100644 index 0000000000..2a430a94fc --- /dev/null +++ b/.github/workflows/python-ci.yml @@ -0,0 +1,132 @@ +name: Python CI — tests, lint, build & push + +on: + push: + branches: [ main, master, lab3 ] + tags: [ '*' ] + paths: + - 'app_python/**' + - '.github/workflows/python-ci.yml' + pull_request: + branches: [ main, master ] + paths: + - 'app_python/**' + workflow_dispatch: + +concurrency: + group: python-ci-${{ github.ref }} + cancel-in-progress: true + +env: + IMAGE: ${{ secrets.DOCKERHUB_REPO }} + +permissions: + contents: read + +jobs: + test-and-lint: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.11", "3.12"] + env: + SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + cache-dependency-path: | + app_python/requirements.txt + app_python/requirements-dev.txt + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r app_python/requirements.txt + pip install -r app_python/requirements-dev.txt + + - name: 
Lint (flake8) + run: flake8 app_python + + - name: Run tests + run: pytest --maxfail=1 -q + + - name: Snyk dependency scan + if: ${{ env.SNYK_TOKEN != '' }} + uses: snyk/actions/python@master + with: + command: test + args: >- + --file=app_python/requirements.txt + --package-manager=pip + --skip-unresolved + --severity-threshold=high + timeout-minutes: 5 + env: + SNYK_TOKEN: ${{ env.SNYK_TOKEN }} + + build-and-push: + runs-on: ubuntu-latest + needs: test-and-lint + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master' || github.ref == 'refs/heads/lab3' || startsWith(github.ref, 'refs/tags/') + env: + SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Ensure target image is configured + run: | + if [ -z "${IMAGE}" ]; then + echo "DOCKERHUB_REPO secret is not configured" >&2 + exit 1 + fi + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Determine version (CalVer) + id: calver + run: | + DATE=$(date -u +%Y.%m.%d) + VERSION="$DATE-${GITHUB_RUN_NUMBER}" + echo "VERSION=$VERSION" >> $GITHUB_ENV + echo "version=$VERSION" >> $GITHUB_OUTPUT + + - name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + context: ./app_python + push: true + tags: | + ${{ env.IMAGE }}:${{ env.VERSION }} + ${{ env.IMAGE }}:latest + + - name: Snyk scan (optional) + if: ${{ env.SNYK_TOKEN != '' }} + uses: snyk/actions/python@master + with: + command: test + args: >- + --file=app_python/requirements.txt + --package-manager=pip + --skip-unresolved + --severity-threshold=high + timeout-minutes: 5 + env: + SNYK_TOKEN: ${{ env.SNYK_TOKEN }} diff --git a/.gitignore b/.gitignore index 30d74d2584..9e8cf0ed9b 100644 --- a/.gitignore +++ b/.gitignore @@ -1 
+1,13 @@ -test \ No newline at end of file +test + +# Environment secrets +.env +**/.env +*.env + +# Local vault password helper +.vault_pass_tmp + +# Python cache +__pycache__/ +*.pyc \ No newline at end of file diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg new file mode 100644 index 0000000000..336bbe8cbb --- /dev/null +++ b/ansible/ansible.cfg @@ -0,0 +1,11 @@ +[defaults] +inventory = inventory/hosts.ini +roles_path = roles +host_key_checking = False +remote_user = devops +retry_files_enabled = False + +[privilege_escalation] +become = True +become_method = sudo +become_user = root diff --git a/ansible/docs/LAB05.md b/ansible/docs/LAB05.md new file mode 100644 index 0000000000..5cec972f0d --- /dev/null +++ b/ansible/docs/LAB05.md @@ -0,0 +1,178 @@ +# Lab 05 — Ansible Fundamentals Report + +> Provision the Lab 4 VM with reusable roles, install Docker, deploy the Python service, prove idempotency, and keep Docker Hub secrets in Ansible Vault. + +--- + +## 1. Architecture Overview + +| Item | Value | +| --- | --- | +| Control node | Windows 11 + WSL2 Ubuntu 22.04, Ansible 2.16.5, community.docker 3.10.3 | +| Target node | Ubuntu 24.04 LTS VM (public IP 31.56.228.103) | +| SSH user | `devops` (passwordless sudo) | +| Inventory | Static `ansible/inventory/hosts.ini` with `webservers` group | +| Play orchestration | `playbooks/site.yml` imports `provision.yml` then `deploy.yml` | + +**Role structure** + +``` +ansible/ +├── ansible.cfg +├── inventory/hosts.ini +├── playbooks/{provision,deploy,site}.yml +├── group_vars/all.yml # vaulted +└── roles/ + ├── common + ├── docker + └── app_deploy +``` + +Roles keep provisioning logic modular, letting me mix provisioning and deployment in different playbooks while sharing defaults and handlers. + +--- + +## 2. 
Roles Documentation + +### `common` +- **Purpose:** Baseline OS configuration: refresh apt cache, install essentials (`python3-pip`, `git`, `curl`, `vim`, `htop`), set timezone to `Europe/Moscow`. +- **Variables:** `common_packages` (install list), `common_timezone` (applied via `community.general.timezone`). +- **Handlers:** None required (all tasks idempotent on their own). +- **Dependencies:** None; safe to run on any Ubuntu host. + +### `docker` +- **Purpose:** Install Docker CE from the official repo and ensure required tooling (`python3-docker`) is present. +- **Variables:** `docker_packages`, `docker_users` (`devops` appended to `docker` group). +- **Handlers:** `restart docker` (triggered when repo or packages change). +- **Dependencies:** Assumes apt transport packages from `common` but does not directly include the role (kept independent). Uses `ansible_distribution_release` fact to build repo URL. + +### `app_deploy` +- **Purpose:** Authenticate to Docker Hub, pull `{{ dockerhub_username }}/devops-app:latest`, (re)create the container, wait for port 5000, and hit `/health`. +- **Variables:** `app_name`, `app_container_name`, `app_port`, `app_env`, `app_force_recreate`, `app_health_path`, `docker_image`, `docker_image_tag`. +- **Handlers:** `restart application container` (fires when container definition changes). +- **Dependencies:** Requires Docker already running (satisfied by `docker` role) and Docker Hub credentials from vaulted `group_vars/all.yml`. + +--- + +## 3. Idempotency Demonstration + +Commands were executed from `ansible/`. 
+ +### First run (`provision.yml`) +``` +$ ansible-playbook playbooks/provision.yml --ask-vault-pass + +PLAY [Provision web servers] ************************************************ +TASK [common : Update apt cache] ******************* changed +TASK [common : Install common packages] ************ changed +TASK [common : Set timezone] *********************** changed +TASK [docker : Install prerequisites] ************** changed +TASK [docker : Add Docker repository] ************** changed +TASK [docker : Install Docker packages] ************ changed +TASK [docker : Ensure docker service is enabled] *** changed +TASK [docker : Add users to docker group] ********** changed + +PLAY RECAP ****************************************************************** +lab4 | ok=8 changed=8 failed=0 skipped=0 +``` + +### Second run (`provision.yml`) +``` +$ ansible-playbook playbooks/provision.yml --ask-vault-pass + +PLAY [Provision web servers] ************************************************ +TASK [common : Update apt cache] ******************* ok +TASK [common : Install common packages] ************ ok +TASK [common : Set timezone] *********************** ok +TASK [docker : Install prerequisites] ************** ok +TASK [docker : Add Docker repository] ************** ok +TASK [docker : Install Docker packages] ************ ok +TASK [docker : Ensure docker service is enabled] *** ok +TASK [docker : Add users to docker group] ********** ok + +PLAY RECAP ****************************************************************** +lab4 | ok=8 changed=0 failed=0 skipped=0 +``` + +**Analysis:** Every task flipped from `changed` to `ok` on the second pass, proving that the modules (`apt`, `service`, `user`, etc.) converged the system state. Screenshots: `../../app_python/docs/screenshots/11-provision-1.png` (run #1) and `../../app_python/docs/screenshots/13-provision-2.png` (run #2). + +--- + +## 4. 
Ansible Vault Usage + +- Secrets (`dockerhub_username`, `dockerhub_password`, and optional env vars) live in `group_vars/all.yml` and were created via `ansible-vault create`. +- Vault password stored in `.vault_pass_tmp` during the run; the file stays ignored per `.gitignore`. +- Typical workflow: + ```bash + echo "" > .vault_pass_tmp + ansible-vault edit group_vars/all.yml --vault-password-file .vault_pass_tmp + ansible-playbook playbooks/deploy.yml --vault-password-file .vault_pass_tmp + rm .vault_pass_tmp + ``` +- Encrypted file example (truncated): + ``` + $ANSIBLE_VAULT;1.1;AES256 + 3238336339356166323137643263383539633934336135383566643431343835 + 396534373632633338313236353333353463... + ``` +- `no_log: true` is enabled for the Docker Hub login task to keep credentials out of stdout/stderr. + +Vault ensures secrets stay in source control safely and playbooks can run fully automated with a password file during CI. + +--- + +## 5. Deployment Verification + +### Playbook output +``` +$ ansible-playbook playbooks/deploy.yml --ask-vault-pass + +TASK [app_deploy : Login to Docker Hub] ************ changed +TASK [app_deploy : Pull application image] ********* changed +TASK [app_deploy : Run application container] ****** changed +TASK [app_deploy : Wait for application port] ****** ok +TASK [app_deploy : Verify health endpoint] ********* ok + +PLAY RECAP ****************************************************************** +lab4 | ok=6 changed=3 failed=0 skipped=0 +``` + +### Container status +``` +$ ansible webservers -a "docker ps --format '{{.Names}} {{.Image}} {{.Ports}}'" +lab4 | SUCCESS | devops@31.56.228.103 +devops-app alliumpro/devops-app:latest 0.0.0.0:5000->5000/tcp +``` + +### Health checks +``` +$ curl -s http://31.56.228.103:5000/health +{"status":"healthy","timestamp":"2026-02-15T12:14:03Z"} + +$ curl -s http://31.56.228.103:5000/ +{"service":"devops-app","revision":"1.0.0","hostname":"lab4"} +``` + +Screenshots: 
`../../app_python/docs/screenshots/14-deploy.png` (playbook) and `../../app_python/docs/screenshots/12-ansible-ping.png` (connectivity proof). + +--- + +## 6. Key Decisions + +- **Why roles instead of plain playbooks?** Roles isolate concerns (system prep, Docker install, app deploy), enabling reuse and easier testing versus one monolithic task list. +- **How do roles improve reusability?** Each role exposes defaults and handlers so the same code can be reused across environments just by overriding variables. +- **What makes a task idempotent?** Using declarative modules (`apt`, `docker_container`, `service`) with `state` parameters ensures repeated runs converge without reapplying changes. +- **How do handlers improve efficiency?** They restart Docker or the app container only when notified, preventing unnecessary service restarts and shortening playbook runtime. +- **Why is Ansible Vault necessary?** Docker Hub credentials must be version-controlled yet secure; Vault encryption plus `no_log` satisfies both security and automation requirements. + +--- + +## 7. Challenges & Mitigations + +- **Vault encryption errors:** Early attempts from PowerShell failed; solved by running `ansible-vault` inside WSL with `--vault-password-file` pointing to a Linux path. +- **community.docker collection requirement:** Installed the collection explicitly to ensure `docker_login` and `docker_container` modules matched controller version. +- **Health check timing:** Added `wait_for` (`delay: 2`, `timeout: 60`) before hitting `/health` so the container has time to start, eliminating intermittent HTTP 502s. + +--- + +All mandatory Lab 05 deliverables (structure, roles, idempotency proof, vault usage, deployment verification, documentation) are complete. 
diff --git a/ansible/docs/LAB06.md b/ansible/docs/LAB06.md new file mode 100644 index 0000000000..936dcadb24 --- /dev/null +++ b/ansible/docs/LAB06.md @@ -0,0 +1,307 @@ +# Lab 06 — Advanced Ansible & CI/CD Report + +[![Ansible Deploy](https://github.com/AlliumPro/DevOps-Core-Course/actions/workflows/ansible-deploy.yml/badge.svg?branch=lab06)](https://github.com/AlliumPro/DevOps-Core-Course/actions/workflows/ansible-deploy.yml?query=branch%3Alab06) + +> Refactored Lab 5 automation with blocks/tags, migrated app deployment to Docker Compose, implemented safe wipe logic, and added GitHub Actions deployment pipeline. + +--- + +## 1. Overview + +| Item | Value | +| --- | --- | +| Base from previous labs | Lab 04 VM + Lab 05 Ansible roles | +| Control environment | Windows 11 (PowerShell workspace editing) | +| Target node | Ubuntu 24.04 VM (`31.56.228.103`) | +| Main upgrade | `app_deploy` role renamed to `web_app` + Compose v2 deployment | +| CI/CD | `.github/workflows/ansible-deploy.yml` | + +**Implemented structure** + +```text +ansible/ +├── docs/LAB06.md +├── group_vars/all.yml +├── playbooks/ +│ ├── provision.yml +│ ├── deploy.yml +│ └── deploy_python.yml +├── vars/ +│ └── app_python.yml +└── roles/ + ├── common/ + ├── docker/ + └── web_app/ + ├── defaults/main.yml + ├── meta/main.yml + ├── tasks/main.yml + ├── tasks/wipe.yml + ├── templates/docker-compose.yml.j2 + └── handlers/main.yml +``` + +--- + +## 2. Blocks & Tags (Task 1) + +### `common` role refactor +File: `roles/common/tasks/main.yml` + +- Package installation grouped in block with tag `packages`. +- Added `rescue` for apt failures: + - `apt-get update --fix-missing` + - retry apt cache update. +- Added `always` section writing completion marker to `/tmp/ansible-common-packages.log`. +- Added user management block with tag `users`. +- Role-level tag `common` applied from `playbooks/provision.yml`. 
+ +Added defaults in `roles/common/defaults/main.yml`: +- `common_users` (default: `devops`) +- existing `common_packages`, `timezone` preserved. + +### `docker` role refactor +File: `roles/docker/tasks/main.yml` + +- Installation tasks grouped in block with tag `docker_install`. +- Runtime config grouped in block with tag `docker_config`. +- Added `rescue` path for transient GPG/repo failures: + - wait 10 seconds + - refresh apt cache + - retry GPG key + repo + package install. +- Added `always` section to force Docker service state convergence (`enabled` + `started`). +- Role-level tag `docker` applied from `playbooks/provision.yml`. + +Updated `roles/docker/defaults/main.yml`: +- Added `docker-compose-plugin` to `docker_packages`. + +### Tag usage examples + +```bash +cd ansible +ansible-playbook playbooks/provision.yml --list-tags +ansible-playbook playbooks/provision.yml --tags "docker" +ansible-playbook playbooks/provision.yml --tags "docker_install" +ansible-playbook playbooks/provision.yml --tags "packages" +ansible-playbook playbooks/provision.yml --skip-tags "common" +``` + +--- + +## 3. Docker Compose Migration (Task 2) + +### Rename and role migration +- Renamed role directory: `roles/app_deploy` → `roles/web_app`. +- Updated deployment playbook role reference: + - `playbooks/deploy.yml` now uses `web_app`. + +### Compose template +File: `roles/web_app/templates/docker-compose.yml.j2` + +- Jinja2 template with dynamic fields: + - `app_name` + - `docker_image:docker_image_tag` + - `app_port:app_internal_port` + - `app_env` + - `restart` policy +- Adds `PORT={{ app_internal_port }}` to guarantee app bind port consistency. + +### Role dependency +File: `roles/web_app/meta/main.yml` + +```yaml +dependencies: + - role: docker +``` + +This guarantees Docker is available before `web_app` tasks run. + +### Compose deployment implementation +File: `roles/web_app/tasks/main.yml` + +Deployment block now performs: +1. Docker Hub login (`no_log: true`) +2. 
Create app directory (`compose_project_dir`) +3. Template Compose file +4. `community.docker.docker_compose_v2` with `state: present`, `pull: always` +5. `wait_for` on service port +6. Health check via `uri` + +Tags used: +- `app_deploy` +- `compose` +- `web_app` (playbook role tag) + +### Variables +- `roles/web_app/defaults/main.yml` defines app/compose/wipe defaults. +- `group_vars/all.yml` defines shared vars and credentials loading. +- App-specific deployment variables stored in `vars/app_python.yml`. + +--- + +## 4. Wipe Logic (Task 3) + +### Implementation +Files: +- `roles/web_app/tasks/wipe.yml` +- `roles/web_app/tasks/main.yml` (include at top) +- `roles/web_app/defaults/main.yml` (`web_app_wipe: false`) + +Wipe flow (`wipe.yml`): +1. `docker_compose_v2 state=absent` +2. remove `docker-compose.yml` +3. remove project directory +4. log completion (`debug`) + +Safety model: +- Variable gate: `when: web_app_wipe | bool` +- Tag gate: `web_app_wipe` +- No `never` tag used (per requirement). + +### Test scenarios (commands) + +**Scenario 1 — normal deploy (wipe skipped by default variable):** +```bash +ansible-playbook playbooks/deploy.yml +``` + +**Scenario 2 — wipe only:** +```bash +ansible-playbook playbooks/deploy.yml -e "web_app_wipe=true" --tags web_app_wipe +``` + +**Scenario 3 — clean reinstall (wipe → deploy):** +```bash +ansible-playbook playbooks/deploy.yml -e "web_app_wipe=true" +``` + +**Scenario 4a — tag only with default variable false (safe skip):** +```bash +ansible-playbook playbooks/deploy.yml --tags web_app_wipe +``` + +--- + +## 5. CI/CD Integration (Task 4) + +### Workflow 1: Python app deployment +File: `.github/workflows/ansible-deploy.yml` + +Pipeline stages: +1. **Lint job** + - install `ansible-core`, `ansible-lint` + - install collections `community.docker`, `community.general` + - run `ansible-lint` over playbooks/roles +2. 
**Deploy job** (needs lint) + - setup SSH from GitHub Secret + - create temporary vault password file + - run `playbooks/deploy_python.yml` + - verify with `curl` (`/` and `/health`) + +Path filters configured to avoid unnecessary runs and ignore docs updates. + +### GitHub Secrets used +- `ANSIBLE_VAULT_PASSWORD` +- `SSH_PRIVATE_KEY` +- `VM_HOST` +- `DOCKERHUB_USERNAME` +- `DOCKERHUB_TOKEN` + +### README badges +Added in `README.md`: +- Python Ansible deployment workflow badge + +--- + +## 6. Testing Results (Execution Plan + Verification) + +### Provisioning/tag checks +```bash +cd ansible +ansible-playbook playbooks/provision.yml --list-tags +ansible-playbook playbooks/provision.yml --tags "docker" +ansible-playbook playbooks/provision.yml --tags "docker_install" +ansible-playbook playbooks/provision.yml --tags "packages" +``` + +### Compose deployment/idempotency checks +```bash +ansible-playbook playbooks/deploy.yml --ask-vault-pass +ansible-playbook playbooks/deploy.yml --ask-vault-pass +ansible webservers -a "docker ps --format '{{.Names}} {{.Image}} {{.Ports}}'" +curl -s http://31.56.228.103:5000/health +``` + +### Wipe checks +```bash +ansible-playbook playbooks/deploy_python.yml -e "web_app_wipe=true" --tags web_app_wipe +``` + +### CI/CD checks +- Push change in `ansible/vars/app_python.yml` → `ansible-deploy.yml` should run. +- Push change in `ansible/roles/web_app/**` → `ansible-deploy.yml` should run. + +--- + +## 7. Challenges & Solutions + +- **No Ansible binary in current local PowerShell/WSL during editing:** + - mitigated by implementing complete declarative code + CI-based lint/deploy validation in workflows. +- **Compose migration complexity from single-container module to project model:** + - resolved using `docker_compose_v2` + template-driven compose file. +- **Safe destructive operations requirement:** + - solved with double-gated wipe mechanism (`web_app_wipe` variable + `web_app_wipe` tag). + +--- + +## 8. 
Research Answers + +### Task 1 (Blocks & Tags) +1. **What if `rescue` also fails?** + - The play fails for that host after rescue attempts; `always` still executes. +2. **Can blocks be nested?** + - Yes, nested blocks are valid and useful for grouped control flow. +3. **How do tags inherit in blocks?** + - Tags on a block apply to tasks inside it; role/play tags also propagate. + +### Task 2 (Compose) +1. **`restart: always` vs `unless-stopped`** + - `always`: container restarts even after manual stop and daemon reboot. + - `unless-stopped`: restarts on failures/reboot, but respects manual stop. +2. **Compose networks vs default bridge** + - Compose creates project-scoped networks with deterministic service DNS; bridge is generic and less structured. +3. **Can Vault variables be used in templates?** + - Yes, vaulted vars are decrypted at runtime and can be injected into Jinja templates. + +### Task 3 (Wipe logic) +1. **Why variable + tag?** + - Dual safety: variable authorizes destructive intent; tag enables selective wipe-only execution. +2. **Difference from `never` tag?** + - `never` forces explicit tag always; variable+tag supports both wipe-only and clean reinstall flows. +3. **Why wipe before deploy?** + - Guarantees clean state for deterministic reinstallation. +4. **When clean reinstall vs rolling update?** + - Clean reinstall for corrupted drift/major config changes; rolling updates for minimal downtime. +5. **How to wipe images/volumes too?** + - Extend compose down options (`remove_images`, volume removal) and add gated tasks for image/volume prune. + +### Task 4 (CI/CD) +1. **Security of SSH keys in GitHub Secrets** + - Keys are encrypted at rest, but exposure risk remains in misconfigured logs/steps; use least privilege and rotation. +2. **Staging → production pipeline** + - Separate environments, protected branches, manual approval gate before production job. +3. 
**Rollback strategy** + - Deploy immutable image tags, keep previous tag, add rollback workflow dispatch input for target tag. +4. **Why self-hosted can improve security** + - No external runner needs direct infra access; network boundary and credentials remain inside your environment. + +--- + +## 9. Summary + +Lab 06 implementation is completed in repository code: +- Blocks/rescue/always/tags added to provisioning roles. +- Deployment migrated from container task model to Docker Compose role. +- Safe wipe logic with variable + tag control implemented. +- CI/CD automation added with lint + deploy + runtime verification. + +Total deliverable status: **Main tasks complete**. diff --git a/ansible/group_vars/all.yml b/ansible/group_vars/all.yml new file mode 100644 index 0000000000..ea2b68211d --- /dev/null +++ b/ansible/group_vars/all.yml @@ -0,0 +1,16 @@ +--- +# Docker Hub credentials: keep this file vaulted in real usage. +# Example: +# ansible-vault edit group_vars/all.yml --vault-password-file .vault_pass_tmp + +dockerhub_username: "{{ lookup('ansible.builtin.env', 'DOCKERHUB_USERNAME') | default('alliumpro', true) }}" +dockerhub_password: "{{ lookup('ansible.builtin.env', 'DOCKERHUB_TOKEN') | default('', true) }}" + +# App runtime environment (safe defaults) +web_app_env: + APP_ENV: production + APP_LOG_LEVEL: info + APP_SECRET_KEY: "{{ app_secret_key }}" + +# Example secret consumed by app via compose template (replace with vault-encrypted value) +app_secret_key: "change-me-with-ansible-vault" diff --git a/ansible/inventory/hosts.ini b/ansible/inventory/hosts.ini new file mode 100644 index 0000000000..66d999152a --- /dev/null +++ b/ansible/inventory/hosts.ini @@ -0,0 +1,5 @@ +[webservers] +lab4 ansible_host=31.56.228.103 ansible_user=devops ansible_ssh_private_key_file=~/.ssh/id_ed25519 + +[webservers:vars] +ansible_python_interpreter=/usr/bin/python3 diff --git a/ansible/playbooks/deploy-monitoring.yml b/ansible/playbooks/deploy-monitoring.yml new file mode 
100644 index 0000000000..ac12f51c6f --- /dev/null +++ b/ansible/playbooks/deploy-monitoring.yml @@ -0,0 +1,9 @@ +--- +- name: Deploy monitoring stack + hosts: webservers + become: false + roles: + - role: monitoring + tags: + - monitoring + - logging diff --git a/ansible/playbooks/deploy.yml b/ansible/playbooks/deploy.yml new file mode 100644 index 0000000000..f52f7ed804 --- /dev/null +++ b/ansible/playbooks/deploy.yml @@ -0,0 +1,9 @@ +--- +- name: Deploy application + hosts: webservers + become: false + roles: + - role: web_app + tags: + - web_app + - app_deploy diff --git a/ansible/playbooks/deploy_python.yml b/ansible/playbooks/deploy_python.yml new file mode 100644 index 0000000000..5742168735 --- /dev/null +++ b/ansible/playbooks/deploy_python.yml @@ -0,0 +1,12 @@ +--- +- name: Deploy Python application + hosts: webservers + become: false + vars_files: + - ../vars/app_python.yml + + roles: + - role: web_app + tags: + - web_app + - app_deploy diff --git a/ansible/playbooks/provision.yml b/ansible/playbooks/provision.yml new file mode 100644 index 0000000000..54961d85fc --- /dev/null +++ b/ansible/playbooks/provision.yml @@ -0,0 +1,11 @@ +--- +- name: Provision web servers + hosts: webservers + become: true + roles: + - role: common + tags: + - common + - role: docker + tags: + - docker diff --git a/ansible/playbooks/site.yml b/ansible/playbooks/site.yml new file mode 100644 index 0000000000..e701b5d027 --- /dev/null +++ b/ansible/playbooks/site.yml @@ -0,0 +1,5 @@ +--- +- name: Provision stack + import_playbook: provision.yml +- name: Deploy stack + import_playbook: deploy.yml diff --git a/ansible/roles/common/defaults/main.yml b/ansible/roles/common/defaults/main.yml new file mode 100644 index 0000000000..2fad9bbed8 --- /dev/null +++ b/ansible/roles/common/defaults/main.yml @@ -0,0 +1,9 @@ +common_packages: + - python3-pip + - git + - curl + - vim + - htop +common_users: + - devops +common_timezone: "Europe/Moscow" diff --git 
a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml new file mode 100644 index 0000000000..3612520e1d --- /dev/null +++ b/ansible/roles/common/tasks/main.yml @@ -0,0 +1,57 @@ +--- +- name: Configure common host baseline + tags: + - common + block: + # packages block: grouped installation + error handling + - name: Install baseline packages + become: true + tags: + - packages + block: + - name: Update apt cache + ansible.builtin.apt: + update_cache: true + cache_valid_time: 3600 + + - name: Install common packages + ansible.builtin.apt: + name: "{{ common_packages }}" + state: present + + rescue: + - name: Recover apt metadata after cache failure + ansible.builtin.apt: + update_cache: true + + - name: Retry apt cache update after recovery + ansible.builtin.apt: + update_cache: true + + always: + - name: Write packages block completion marker + ansible.builtin.copy: + dest: /tmp/ansible-common-packages.log + content: "common packages block completed\n" + mode: "0644" + + - name: Set timezone + become: true + community.general.timezone: + name: "{{ common_timezone }}" + tags: + - packages + + # users block: centralized user management + - name: Ensure baseline users exist + become: true + tags: + - users + block: + - name: Ensure local users are present + ansible.builtin.user: + name: "{{ item }}" + state: present + create_home: true + shell: /bin/bash + loop: "{{ common_users }}" diff --git a/ansible/roles/docker/defaults/main.yml b/ansible/roles/docker/defaults/main.yml new file mode 100644 index 0000000000..a37ad6f6e6 --- /dev/null +++ b/ansible/roles/docker/defaults/main.yml @@ -0,0 +1,8 @@ +docker_users: + - devops +docker_packages: + - docker-ce + - docker-ce-cli + - containerd.io + - docker-compose-plugin + - python3-docker diff --git a/ansible/roles/docker/handlers/main.yml b/ansible/roles/docker/handlers/main.yml new file mode 100644 index 0000000000..07aa0eb290 --- /dev/null +++ b/ansible/roles/docker/handlers/main.yml @@ -0,0 +1,5 @@ 
+--- +- name: Restart docker + ansible.builtin.service: + name: docker + state: restarted diff --git a/ansible/roles/docker/tasks/main.yml b/ansible/roles/docker/tasks/main.yml new file mode 100644 index 0000000000..79cf4f379d --- /dev/null +++ b/ansible/roles/docker/tasks/main.yml @@ -0,0 +1,81 @@ +--- +- name: Install Docker engine and repository + become: true + tags: + - docker_install + block: + # installation block: repo setup and package installation + - name: Install prerequisites + ansible.builtin.apt: + name: + - ca-certificates + - curl + - gnupg + - lsb-release + state: present + + - name: Add Docker GPG key + ansible.builtin.apt_key: + url: https://download.docker.com/linux/ubuntu/gpg + state: present + + - name: Add Docker repository + ansible.builtin.apt_repository: + repo: "deb [arch=amd64] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable" + state: present + notify: restart docker + + - name: Install Docker packages + ansible.builtin.apt: + name: "{{ docker_packages }}" + state: present + update_cache: true + notify: restart docker + + rescue: + # recovery path for transient key/repo network failures + - name: Wait before retrying Docker repository setup + ansible.builtin.wait_for: + timeout: 10 + + - name: Refresh apt cache before retry + ansible.builtin.apt: + update_cache: true + + - name: Retry adding Docker GPG key + ansible.builtin.apt_key: + url: https://download.docker.com/linux/ubuntu/gpg + state: present + + - name: Retry adding Docker repository + ansible.builtin.apt_repository: + repo: "deb [arch=amd64] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable" + state: present + notify: restart docker + + - name: Retry installing Docker packages + ansible.builtin.apt: + name: "{{ docker_packages }}" + state: present + update_cache: true + notify: restart docker + + always: + # service is always converged even if installation path retried + - name: Ensure docker service is 
enabled and started + ansible.builtin.service: + name: docker + state: started + enabled: true + +- name: Configure Docker runtime access + become: true + tags: + - docker_config + block: + - name: Add users to docker group + ansible.builtin.user: + name: "{{ item }}" + groups: docker + append: true + loop: "{{ docker_users }}" diff --git a/ansible/roles/monitoring/defaults/main.yml b/ansible/roles/monitoring/defaults/main.yml new file mode 100644 index 0000000000..9aa8cb940e --- /dev/null +++ b/ansible/roles/monitoring/defaults/main.yml @@ -0,0 +1,35 @@ +--- +monitoring_project_dir: "/home/devops/monitoring" +monitoring_loki_version: "3.0.0" +monitoring_promtail_version: "3.0.0" +monitoring_grafana_version: "12.3.1" +monitoring_prometheus_version: "3.9.0" +monitoring_loki_port: 3100 +monitoring_promtail_port: 9080 +monitoring_grafana_port: 3000 +monitoring_prometheus_port: 9090 +monitoring_loki_retention: "168h" +monitoring_loki_schema: "v13" +monitoring_prometheus_retention_days: 15 +monitoring_prometheus_retention_size: "10GB" +monitoring_prometheus_scrape_interval: "15s" +monitoring_app_name: "devops-python" +monitoring_app_image: "alliumpro/devops-app:lab08" +monitoring_app_port: 8000 +monitoring_app_internal_port: 5000 +monitoring_grafana_admin_user: "admin" +monitoring_grafana_admin_password: "change-me" +monitoring_app_memory_limit: "256M" + +monitoring_prometheus_targets: + - job: "prometheus" + targets: ["localhost:9090"] + - job: "loki" + targets: ["loki:3100"] + path: "/metrics" + - job: "grafana" + targets: ["grafana:3000"] + path: "/metrics" + - job: "app" + targets: ["app-python:5000"] + path: "/metrics" diff --git a/ansible/roles/monitoring/meta/main.yml b/ansible/roles/monitoring/meta/main.yml new file mode 100644 index 0000000000..bfa1415d65 --- /dev/null +++ b/ansible/roles/monitoring/meta/main.yml @@ -0,0 +1,4 @@ +--- +# monitoring deploy requires docker engine and compose plugin +# installed by the docker role in provision playbook. 
+dependencies: [] diff --git a/ansible/roles/monitoring/tasks/deploy.yml b/ansible/roles/monitoring/tasks/deploy.yml new file mode 100644 index 0000000000..d09fdbd07c --- /dev/null +++ b/ansible/roles/monitoring/tasks/deploy.yml @@ -0,0 +1,36 @@ +--- +- name: Deploy Loki stack via compose + community.docker.docker_compose_v2: + project_src: "{{ monitoring_project_dir }}" + state: present + pull: always + +- name: Wait for Loki readiness + ansible.builtin.uri: + url: "http://127.0.0.1:{{ monitoring_loki_port }}/ready" + status_code: 200 + timeout: 30 + register: monitoring_loki_ready + until: monitoring_loki_ready.status == 200 + retries: 10 + delay: 5 + +- name: Wait for Prometheus readiness + ansible.builtin.uri: + url: "http://127.0.0.1:{{ monitoring_prometheus_port }}/-/healthy" + status_code: 200 + timeout: 30 + register: monitoring_prometheus_ready + until: monitoring_prometheus_ready.status == 200 + retries: 10 + delay: 5 + +- name: Wait for Grafana readiness + ansible.builtin.uri: + url: "http://127.0.0.1:{{ monitoring_grafana_port }}/api/health" + status_code: 200 + timeout: 30 + register: monitoring_grafana_ready + until: monitoring_grafana_ready.status == 200 + retries: 10 + delay: 5 diff --git a/ansible/roles/monitoring/tasks/main.yml b/ansible/roles/monitoring/tasks/main.yml new file mode 100644 index 0000000000..56b8331364 --- /dev/null +++ b/ansible/roles/monitoring/tasks/main.yml @@ -0,0 +1,6 @@ +--- +- name: Setup monitoring files + ansible.builtin.include_tasks: setup.yml + +- name: Deploy monitoring stack + ansible.builtin.include_tasks: deploy.yml diff --git a/ansible/roles/monitoring/tasks/setup.yml b/ansible/roles/monitoring/tasks/setup.yml new file mode 100644 index 0000000000..263f7e9da9 --- /dev/null +++ b/ansible/roles/monitoring/tasks/setup.yml @@ -0,0 +1,80 @@ +--- +- name: Create monitoring directory structure + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: devops + group: devops + mode: "0755" + loop: + - "{{ 
monitoring_project_dir }}" + - "{{ monitoring_project_dir }}/loki" + - "{{ monitoring_project_dir }}/promtail" + - "{{ monitoring_project_dir }}/prometheus" + - "{{ monitoring_project_dir }}/grafana/provisioning/datasources" + - "{{ monitoring_project_dir }}/grafana/provisioning/dashboards" + - "{{ monitoring_project_dir }}/grafana/dashboards" + +- name: Render Loki config + ansible.builtin.template: + src: loki-config.yml.j2 + dest: "{{ monitoring_project_dir }}/loki/config.yml" + owner: devops + group: devops + mode: "0644" + +- name: Render Promtail config + ansible.builtin.template: + src: promtail-config.yml.j2 + dest: "{{ monitoring_project_dir }}/promtail/config.yml" + owner: devops + group: devops + mode: "0644" + +- name: Render Prometheus config + ansible.builtin.template: + src: prometheus.yml.j2 + dest: "{{ monitoring_project_dir }}/prometheus/prometheus.yml" + owner: devops + group: devops + mode: "0644" + +- name: Render Grafana datasource config + ansible.builtin.template: + src: grafana-datasource.yml.j2 + dest: "{{ monitoring_project_dir }}/grafana/provisioning/datasources/datasources.yml" + owner: devops + group: devops + mode: "0644" + +- name: Render Grafana dashboards provider + ansible.builtin.template: + src: grafana-dashboards-provider.yml.j2 + dest: "{{ monitoring_project_dir }}/grafana/provisioning/dashboards/dashboards.yml" + owner: devops + group: devops + mode: "0644" + +- name: Render Lab07 dashboard definition + ansible.builtin.template: + src: grafana-lab07-dashboard.json.j2 + dest: "{{ monitoring_project_dir }}/grafana/dashboards/lab07-logging.json" + owner: devops + group: devops + mode: "0644" + +- name: Render Lab08 dashboard definition + ansible.builtin.template: + src: grafana-lab08-dashboard.json.j2 + dest: "{{ monitoring_project_dir }}/grafana/dashboards/lab08-metrics.json" + owner: devops + group: devops + mode: "0644" + +- name: Render monitoring compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: 
"{{ monitoring_project_dir }}/docker-compose.yml" + owner: devops + group: devops + mode: "0644" diff --git a/ansible/roles/monitoring/templates/docker-compose.yml.j2 b/ansible/roles/monitoring/templates/docker-compose.yml.j2 new file mode 100644 index 0000000000..4119b21278 --- /dev/null +++ b/ansible/roles/monitoring/templates/docker-compose.yml.j2 @@ -0,0 +1,161 @@ +version: "3.8" + +services: + prometheus: + image: prom/prometheus:{{ monitoring_prometheus_version }} + container_name: prometheus + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.retention.time={{ monitoring_prometheus_retention_days }}d" + - "--storage.tsdb.retention.size={{ monitoring_prometheus_retention_size }}" + ports: + - "{{ monitoring_prometheus_port }}:{{ monitoring_prometheus_port }}" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + networks: + - logging + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:{{ monitoring_prometheus_port }}/-/healthy || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 20s + restart: unless-stopped + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + + loki: + image: grafana/loki:{{ monitoring_loki_version }} + container_name: loki + command: ["-config.file=/etc/loki/config.yml"] + ports: + - "{{ monitoring_loki_port }}:{{ monitoring_loki_port }}" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + networks: + - logging + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:{{ monitoring_loki_port }}/ready || exit 1"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 15s + restart: unless-stopped + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + + promtail: + image: grafana/promtail:{{ monitoring_promtail_version }} + 
container_name: promtail + command: ["-config.file=/etc/promtail/config.yml"] + ports: + - "{{ monitoring_promtail_port }}:{{ monitoring_promtail_port }}" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + networks: + - logging + depends_on: + loki: + condition: service_healthy + restart: unless-stopped + deploy: + resources: + limits: + cpus: "0.50" + memory: 512M + reservations: + cpus: "0.10" + memory: 128M + + grafana: + image: grafana/grafana:{{ monitoring_grafana_version }} + container_name: grafana + ports: + - "{{ monitoring_grafana_port }}:3000" + environment: + GF_AUTH_ANONYMOUS_ENABLED: "false" + GF_SECURITY_ADMIN_USER: "{{ monitoring_grafana_admin_user }}" + GF_SECURITY_ADMIN_PASSWORD: "{{ monitoring_grafana_admin_password }}" + GF_SECURITY_ALLOW_EMBEDDING: "true" + GF_METRICS_ENABLED: "true" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + networks: + - logging + depends_on: + prometheus: + condition: service_healthy + loki: + condition: service_healthy + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 20s + restart: unless-stopped + deploy: + resources: + limits: + cpus: "0.50" + memory: 512M + reservations: + cpus: "0.25" + memory: 256M + + app-python: + image: {{ monitoring_app_image }} + container_name: app-python + ports: + - "{{ monitoring_app_port }}:{{ monitoring_app_internal_port }}" + environment: + PORT: "{{ monitoring_app_internal_port }}" + APP_ENV: production + LOG_LEVEL: INFO + labels: + logging: "promtail" + app: "{{ monitoring_app_name }}" + networks: + - logging + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "python -c \"import urllib.request; 
urllib.request.urlopen('http://localhost:{{ monitoring_app_internal_port }}/health')\""] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + deploy: + resources: + limits: + cpus: "0.50" + memory: "{{ monitoring_app_memory_limit }}" + reservations: + cpus: "0.10" + memory: 128M + +volumes: + prometheus-data: + loki-data: + grafana-data: + +networks: + logging: + name: logging diff --git a/ansible/roles/monitoring/templates/grafana-dashboards-provider.yml.j2 b/ansible/roles/monitoring/templates/grafana-dashboards-provider.yml.j2 new file mode 100644 index 0000000000..f76206cd5e --- /dev/null +++ b/ansible/roles/monitoring/templates/grafana-dashboards-provider.yml.j2 @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: "DevOps Dashboards" + orgId: 1 + folder: "DevOps Labs" + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards diff --git a/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 b/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 new file mode 100644 index 0000000000..ce3040309f --- /dev/null +++ b/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 @@ -0,0 +1,17 @@ +apiVersion: 1 + +datasources: + - name: Loki + uid: loki + type: loki + access: proxy + url: http://loki:{{ monitoring_loki_port }} + isDefault: true + editable: true + - name: Prometheus + uid: prometheus + type: prometheus + access: proxy + url: http://prometheus:{{ monitoring_prometheus_port }} + isDefault: false + editable: true diff --git a/ansible/roles/monitoring/templates/grafana-lab07-dashboard.json.j2 b/ansible/roles/monitoring/templates/grafana-lab07-dashboard.json.j2 new file mode 100644 index 0000000000..d915bb116a --- /dev/null +++ b/ansible/roles/monitoring/templates/grafana-lab07-dashboard.json.j2 @@ -0,0 +1,44 @@ +{ + "editable": true, + "panels": [ + { + "datasource": {"type": "loki", "uid": "loki"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 0}, + 
"id": 1, + "targets": [{"expr": "{app=~\"devops-.*\"}", "queryType": "range", "refId": "A"}], + "title": "Logs Table", + "type": "logs" + }, + { + "datasource": {"type": "loki", "uid": "loki"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 10}, + "id": 2, + "targets": [{"expr": "sum by (app) (rate({app=~\"devops-.*\"}[1m]))", "queryType": "range", "refId": "A"}], + "title": "Request Rate by App", + "type": "timeseries" + }, + { + "datasource": {"type": "loki", "uid": "loki"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 10}, + "id": 3, + "targets": [{"expr": "{app=~\"devops-.*\"} | json | level=\"ERROR\"", "queryType": "range", "refId": "A"}], + "title": "Error Logs", + "type": "logs" + }, + { + "datasource": {"type": "loki", "uid": "loki"}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 18}, + "id": 4, + "targets": [{"expr": "sum by (level) (count_over_time({app=~\"devops-.*\"} | json [5m]))", "queryType": "range", "refId": "A"}], + "title": "Log Level Distribution", + "type": "piechart" + } + ], + "refresh": "10s", + "schemaVersion": 41, + "tags": ["lab07", "loki", "logging"], + "time": {"from": "now-30m", "to": "now"}, + "title": "Lab07 - Application Logging", + "uid": "lab07-logging", + "version": 1 +} diff --git a/ansible/roles/monitoring/templates/grafana-lab08-dashboard.json.j2 b/ansible/roles/monitoring/templates/grafana-lab08-dashboard.json.j2 new file mode 100644 index 0000000000..12f620b338 --- /dev/null +++ b/ansible/roles/monitoring/templates/grafana-lab08-dashboard.json.j2 @@ -0,0 +1,412 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "showPoints": "never" + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "sum by (endpoint) (rate(http_requests_total[5m]))", + "refId": "A" + } + ], + "title": "Request Rate by Endpoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "showPoints": "never" + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m]))", + "refId": "A" + } + ], + "title": "Error Rate (5xx)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "showPoints": "never" + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le, endpoint) 
(rate(http_request_duration_seconds_bucket[5m])))", + "refId": "A" + } + ], + "title": "Request Duration p95", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-BlYlRd" + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "mode": "spectrum" + }, + "legend": { + "show": true + }, + "tooltip": { + "show": true, + "yHistogram": false + } + }, + "targets": [ + { + "expr": "sum by (le) (rate(http_request_duration_seconds_bucket[5m]))", + "format": "heatmap", + "refId": "A" + } + ], + "title": "Request Duration Heatmap", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "targets": [ + { + "expr": "sum(http_requests_in_progress)", + "refId": "A" + } + ], + "title": "Active Requests", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 6, + "options": { + "displayLabels": [ + "name", + "value", + "percent" + ], + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + 
"reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "sum by (status_code) (rate(http_requests_total[5m]))", + "refId": "A" + } + ], + "title": "Status Code Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "index": 1, + "text": "DOWN" + }, + "1": { + "index": 0, + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 16 + }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "up{job=\"app\"}", + "refId": "A" + } + ], + "title": "Application Uptime", + "type": "stat" + } + ], + "refresh": "10s", + "schemaVersion": 41, + "style": "dark", + "tags": [ + "lab08", + "prometheus", + "metrics", + "monitoring" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Lab08 - Application Metrics", + "uid": "lab08-metrics", + "version": 1, + "weekStart": "" +} diff --git a/ansible/roles/monitoring/templates/loki-config.yml.j2 b/ansible/roles/monitoring/templates/loki-config.yml.j2 new file mode 100644 index 0000000000..f0b44142dc --- /dev/null +++ b/ansible/roles/monitoring/templates/loki-config.yml.j2 @@ -0,0 +1,54 @@ +auth_enabled: false + +server: + http_listen_port: {{ monitoring_loki_port }} + +ingester: + lifecycler: + ring: + 
kvstore: + store: inmemory + replication_factor: 1 + chunk_idle_period: 5m + chunk_retain_period: 30s + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: {{ monitoring_loki_schema }} + index: + prefix: index_ + period: 24h + +storage_config: + tsdb_shipper: + active_index_directory: /loki/index + cache_location: /loki/index_cache + +limits_config: + retention_period: {{ monitoring_loki_retention }} + allow_structured_metadata: true + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + delete_request_store: filesystem + retention_delete_delay: 1h + +ruler: + alertmanager_url: http://localhost:9093 diff --git a/ansible/roles/monitoring/templates/prometheus.yml.j2 b/ansible/roles/monitoring/templates/prometheus.yml.j2 new file mode 100644 index 0000000000..e1d6eec9d5 --- /dev/null +++ b/ansible/roles/monitoring/templates/prometheus.yml.j2 @@ -0,0 +1,13 @@ +global: + scrape_interval: {{ monitoring_prometheus_scrape_interval }} + evaluation_interval: {{ monitoring_prometheus_scrape_interval }} + +scrape_configs: +{% for target in monitoring_prometheus_targets %} + - job_name: '{{ target.job }}' + static_configs: + - targets: {{ target.targets }} +{% if target.path is defined %} + metrics_path: '{{ target.path }}' +{% endif %} +{% endfor %} diff --git a/ansible/roles/monitoring/templates/promtail-config.yml.j2 b/ansible/roles/monitoring/templates/promtail-config.yml.j2 new file mode 100644 index 0000000000..cbe114e7bf --- /dev/null +++ b/ansible/roles/monitoring/templates/promtail-config.yml.j2 @@ -0,0 +1,28 @@ +server: + http_listen_port: {{ monitoring_promtail_port }} + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: 
http://loki:{{ monitoring_loki_port }}/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + relabel_configs: + - source_labels: [__meta_docker_container_name] + regex: '/(.*)' + target_label: container + - source_labels: [__meta_docker_container_label_app] + target_label: app + - source_labels: [__meta_docker_container_label_logging] + regex: promtail + action: keep + - source_labels: [__meta_docker_container_log_stream] + target_label: stream + pipeline_stages: + - docker: {} diff --git a/ansible/roles/web_app/defaults/main.yml b/ansible/roles/web_app/defaults/main.yml new file mode 100644 index 0000000000..0f36c80fdf --- /dev/null +++ b/ansible/roles/web_app/defaults/main.yml @@ -0,0 +1,23 @@ +--- +# Web application identity and image +web_app_name: devops-app +web_app_docker_image_tag: latest +web_app_docker_image: "{{ dockerhub_username }}/{{ web_app_name }}" + +# Runtime ports and health checks +web_app_port: 5000 +web_app_internal_port: 5000 +web_app_wait_timeout: 60 +web_app_health_path: /health + +# Compose project configuration +web_app_compose_project_dir: "/home/devops/{{ web_app_name }}" +web_app_docker_compose_version: "3.8" +web_app_restart_policy: unless-stopped +web_app_env: {} + +# Wipe logic safety controls +# Wipe only: ansible-playbook playbooks/deploy.yml -e "web_app_wipe=true" --tags web_app_wipe +# Clean install: ansible-playbook playbooks/deploy.yml -e "web_app_wipe=true" +web_app_wipe: false +web_app_wipe_remove_images: false diff --git a/ansible/roles/web_app/handlers/main.yml b/ansible/roles/web_app/handlers/main.yml new file mode 100644 index 0000000000..a0c41f3d42 --- /dev/null +++ b/ansible/roles/web_app/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: Restart web app stack + community.docker.docker_compose_v2: + project_src: "{{ web_app_compose_project_dir }}" + state: present + recreate: always diff --git a/ansible/roles/web_app/meta/main.yml 
b/ansible/roles/web_app/meta/main.yml new file mode 100644 index 0000000000..3e06adf0ea --- /dev/null +++ b/ansible/roles/web_app/meta/main.yml @@ -0,0 +1,3 @@ +--- +# Docker is provisioned by playbooks/provision.yml; web_app deploy assumes Docker is already present. +dependencies: [] diff --git a/ansible/roles/web_app/tasks/main.yml b/ansible/roles/web_app/tasks/main.yml new file mode 100644 index 0000000000..99a8d9ca96 --- /dev/null +++ b/ansible/roles/web_app/tasks/main.yml @@ -0,0 +1,57 @@ +--- +- name: Include wipe logic + ansible.builtin.include_tasks: wipe.yml + tags: + - web_app_wipe + +- name: Deploy web application with Docker Compose + tags: + - app_deploy + - compose + block: + # docker auth is required before pull for private/public rate-limited registries + - name: Login to Docker Hub + community.docker.docker_login: + username: "{{ dockerhub_username }}" + password: "{{ dockerhub_password }}" + registry_url: https://index.docker.io/v1/ + no_log: true + + - name: Create application directory + ansible.builtin.file: + path: "{{ web_app_compose_project_dir }}" + state: directory + owner: devops + group: devops + mode: "0755" + + - name: Render docker-compose configuration + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ web_app_compose_project_dir }}/docker-compose.yml" + owner: devops + group: devops + mode: "0644" + notify: Restart web app stack + + - name: Deploy stack with Docker Compose v2 + community.docker.docker_compose_v2: + project_src: "{{ web_app_compose_project_dir }}" + state: present + pull: always + + - name: Wait for application port + ansible.builtin.wait_for: + port: "{{ web_app_port }}" + delay: 2 + timeout: "{{ web_app_wait_timeout }}" + + - name: Verify health endpoint + ansible.builtin.uri: + url: "http://127.0.0.1:{{ web_app_port }}{{ web_app_health_path }}" + status_code: 200 + + rescue: + - name: Show deployment failure hint + ansible.builtin.debug: + msg: "Docker Compose deployment failed for {{
web_app_name }}. Check compose logs on target host." diff --git a/ansible/roles/web_app/tasks/wipe.yml b/ansible/roles/web_app/tasks/wipe.yml new file mode 100644 index 0000000000..49aaaa8afc --- /dev/null +++ b/ansible/roles/web_app/tasks/wipe.yml @@ -0,0 +1,32 @@ +--- +- name: Wipe web application deployment + when: web_app_wipe | bool + tags: + - web_app_wipe + block: + - name: Check if compose file exists + ansible.builtin.stat: + path: "{{ web_app_compose_project_dir }}/docker-compose.yml" + register: web_app_compose_file + + # stop and remove compose-managed containers + - name: Stop and remove compose stack + community.docker.docker_compose_v2: + project_src: "{{ web_app_compose_project_dir }}" + state: absent + remove_images: "{{ 'local' if (web_app_wipe_remove_images | bool) else omit }}" + when: web_app_compose_file.stat.exists + + - name: Remove docker-compose file + ansible.builtin.file: + path: "{{ web_app_compose_project_dir }}/docker-compose.yml" + state: absent + + - name: Remove application directory + ansible.builtin.file: + path: "{{ web_app_compose_project_dir }}" + state: absent + + - name: Log wipe completion + ansible.builtin.debug: + msg: "Application {{ web_app_name }} wiped successfully" diff --git a/ansible/roles/web_app/templates/docker-compose.yml.j2 b/ansible/roles/web_app/templates/docker-compose.yml.j2 new file mode 100644 index 0000000000..19a69f319c --- /dev/null +++ b/ansible/roles/web_app/templates/docker-compose.yml.j2 @@ -0,0 +1,14 @@ +version: '{{ web_app_docker_compose_version }}' + +services: + {{ web_app_name }}: + image: {{ web_app_docker_image }}:{{ web_app_docker_image_tag }} + container_name: {{ web_app_name }} + ports: + - "{{ web_app_port }}:{{ web_app_internal_port }}" + environment: +{% for key, value in web_app_env.items() %} + {{ key }}: "{{ value }}" +{% endfor %} + PORT: "{{ web_app_internal_port }}" + restart: {{ web_app_restart_policy }} diff --git a/ansible/vars/app_python.yml b/ansible/vars/app_python.yml 
new file mode 100644 index 0000000000..a342081535 --- /dev/null +++ b/ansible/vars/app_python.yml @@ -0,0 +1,7 @@ +--- +web_app_name: devops-python +web_app_docker_image: "{{ dockerhub_username }}/devops-app" +web_app_docker_image_tag: latest +web_app_port: 5000 +web_app_internal_port: 5000 +web_app_compose_project_dir: "/opt/{{ web_app_name }}" diff --git a/app_python/.dockerignore b/app_python/.dockerignore new file mode 100644 index 0000000000..1c794274d8 --- /dev/null +++ b/app_python/.dockerignore @@ -0,0 +1,15 @@ +__pycache__/ +*.py[cod] +*.log +.venv/ +venv/ +env/ +.pytest_cache/ +.git/ +.gitignore +.vscode/ +.idea/ +.DS_Store +docs/ +tests/ +*.md diff --git a/app_python/.gitignore b/app_python/.gitignore new file mode 100644 index 0000000000..37c07858ba --- /dev/null +++ b/app_python/.gitignore @@ -0,0 +1,9 @@ +__pycache__/ +*.py[cod] +venv/ +env/ +*.log +.DS_Store +.vscode/ +.pytest_cache/ +.vault_pass diff --git a/app_python/Dockerfile b/app_python/Dockerfile new file mode 100644 index 0000000000..0fe5ff9138 --- /dev/null +++ b/app_python/Dockerfile @@ -0,0 +1,24 @@ +# syntax=docker/dockerfile:1 +FROM python:3.13-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +WORKDIR /app + +# Create non-root user +RUN addgroup --system app && adduser --system --ingroup app app + +# Install dependencies first (layer caching) +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +# Copy only the app code +COPY app.py ./ + +# Switch to non-root user +USER app + +EXPOSE 5000 + +CMD ["python", "app.py"] diff --git a/app_python/README.md b/app_python/README.md new file mode 100644 index 0000000000..2538301f47 --- /dev/null +++ b/app_python/README.md @@ -0,0 +1,108 @@ +# DevOps Info Service ![Python CI](https://github.com/AlliumPro/DevOps-Core-Course/actions/workflows/python-ci.yml/badge.svg?branch=lab3) + +Flask-based info service used throughout the DevOps core course. 
It reports service metadata, host information, runtime stats, and exposes a `/health` endpoint for probes. + +## Features + +- JSON payload describing the service, host OS/CPU, runtime uptime and request metadata +- Health endpoint for liveness/readiness checks +- Dockerfile for reproducible builds +- Pytest suite covering `/`, `/health`, and error handling +- GitHub Actions workflow for lint → test → Docker build/push with CalVer tagging and optional Snyk scan + +## Prerequisites + +- Python 3.11+ (3.13 container image) +- pip +- (optional) Docker & Docker Hub account for publishing images + +## Local setup + +```bash +cd app_python +python3 -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +pip install -r requirements-dev.txt +``` + +## Running the app + +```bash +# default: 0.0.0.0:5000 +python3 app.py + +# custom host/port +HOST=127.0.0.1 PORT=8080 python3 app.py + +# production-style +gunicorn -w 4 -b 0.0.0.0:8000 app:app +``` + +## Testing & linting + +```bash +# run tests +pytest -q + +# run tests with coverage (optional) +pytest --cov=app_python --cov-report=term-missing + +# lint +flake8 app_python +``` + +## API quick check + +```bash +curl -s http://127.0.0.1:5000/ | jq . +curl -s http://127.0.0.1:5000/health | jq . +``` + +## Configuration + +| Variable | Default | Purpose | +| --- | --- | --- | +| `HOST` | `0.0.0.0` | Address to bind the Flask server | +| `PORT` | `5000` | TCP port | +| `DEBUG` | `false` | Enables Flask debug mode | + +## Docker usage + +```bash +# build (from repo root) +docker build -t alliumpro/devops-info-service:lab02 ./app_python + +# run +docker run --rm -p 8080:5000 alliumpro/devops-info-service:lab02 + +# pull published image +docker pull alliumpro/devops-info-service:lab02 +``` + +## CI/CD workflow + +Workflow file: `.github/workflows/python-ci.yml` + +Pipeline stages: +1. Checkout + Python setup (3.11) +2. Pip cache restore → install dependencies (prod + dev) +3. Lint via `flake8` +4. 
Pytest suite (fail-fast) +5. Snyk dependency scan (runs when `SNYK_TOKEN` secret is configured) +6. Build & push Docker image with CalVer + `latest` tags (main/master branch) + +### Required GitHub secrets + +| Secret | Description | +| --- | --- | +| `DOCKERHUB_USERNAME` | Docker Hub username | +| `DOCKERHUB_TOKEN` | Docker Hub access token with write perms | +| `DOCKERHUB_REPO` | Target repo, e.g. `alliumpro/devops-info-service` | +| `SNYK_TOKEN` | API token to enable the Snyk scan step | + +## Troubleshooting + +- **Port already in use** → set `PORT` or use `docker run -p 8080:5000`. +- **Docker daemon unavailable** → `sudo systemctl start docker`. +- **CI push skipped** → workflow only pushes on `main`/`master` (or tags); ensure secrets are configured. \ No newline at end of file diff --git a/app_python/app.py b/app_python/app.py new file mode 100644 index 0000000000..3f5c0cb2ef --- /dev/null +++ b/app_python/app.py @@ -0,0 +1,329 @@ +"""DevOps Info Service - Flask implementation for Lab 1 Task 1. 
+ +Provides two endpoints: + - GET / -> service, system and runtime information + - GET /health -> simple health check used by probes + +Configuration via environment variables: HOST, PORT, DEBUG +""" +from __future__ import annotations + +import json +import logging +import os +import platform +import socket +import time +import uuid +from datetime import datetime, timezone +from typing import Dict + +from flask import Flask, g, jsonify, request +from prometheus_client import CONTENT_TYPE_LATEST, Counter, Gauge, Histogram, generate_latest + + +APP_NAME = "devops-info-service" +APP_VERSION = "1.0.0" +APP_DESCRIPTION = "DevOps course info service" +FRAMEWORK = "Flask" + + +class JsonFormatter(logging.Formatter): + """Serialize log records to JSON for structured aggregation in Loki.""" + + def format(self, record: logging.LogRecord) -> str: + payload = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "level": record.levelname, + "logger": record.name, + "message": record.getMessage(), + "module": record.module, + } + + for attr in [ + "request_id", + "method", + "path", + "status_code", + "client_ip", + "duration_ms", + ]: + value = getattr(record, attr, None) + if value is not None: + payload[attr] = value + + return json.dumps(payload, ensure_ascii=True) + + +def setup_logging() -> logging.Logger: + logger_obj = logging.getLogger("devops-app") + logger_obj.setLevel(getattr(logging, os.getenv("LOG_LEVEL", "INFO").upper(), logging.INFO)) + logger_obj.handlers.clear() + handler = logging.StreamHandler() + handler.setFormatter(JsonFormatter()) + logger_obj.addHandler(handler) + logger_obj.propagate = False + return logger_obj + + +logger = setup_logging() + +app = Flask(__name__) + +HTTP_REQUESTS_TOTAL = Counter( + "http_requests_total", + "Total HTTP requests", + ["method", "endpoint", "status_code"], +) + +HTTP_REQUEST_DURATION_SECONDS = Histogram( + "http_request_duration_seconds", + "HTTP request duration in seconds", + ["method", "endpoint"], +) + 
+HTTP_REQUESTS_IN_PROGRESS = Gauge( + "http_requests_in_progress", + "HTTP requests currently being processed", + ["method", "endpoint"], +) + +DEVOPS_INFO_ENDPOINT_CALLS_TOTAL = Counter( + "devops_info_endpoint_calls_total", + "Total endpoint calls for the DevOps info service", + ["endpoint"], +) + +DEVOPS_INFO_SYSTEM_COLLECTION_SECONDS = Histogram( + "devops_info_system_collection_seconds", + "System info collection duration in seconds", +) + +# Configuration from environment +HOST = os.getenv("HOST", "0.0.0.0") +PORT = int(os.getenv("PORT", 5000)) +DEBUG = os.getenv("DEBUG", "False").lower() == "true" + +# Application start time (UTC) +START_TIME = datetime.now(timezone.utc) + + +def normalize_endpoint() -> str: + """Normalize endpoint labels to keep cardinality bounded.""" + if request.url_rule and request.url_rule.rule: + endpoint = request.url_rule.rule + elif request.path in {"/", "/health", "/metrics"}: + endpoint = request.path + else: + endpoint = "/unknown" + return endpoint + + +def get_uptime() -> Dict[str, object]: + """Return uptime in seconds and human readable form.""" + delta = datetime.now(timezone.utc) - START_TIME + seconds = int(delta.total_seconds()) + hours = seconds // 3600 + minutes = (seconds % 3600) // 60 + human = f"{hours} hours, {minutes} minutes" + return {"seconds": seconds, "human": human} + + +def get_system_info() -> Dict[str, object]: + """Collect system and runtime information.""" + try: + hostname = socket.gethostname() + except Exception: + hostname = "unknown" + + system = platform.system() + platform_version = platform.version() + arch = platform.machine() + cpu_count = os.cpu_count() or 1 + python_version = platform.python_version() + + return { + "hostname": hostname, + "platform": system, + "platform_version": platform_version, + "architecture": arch, + "cpu_count": cpu_count, + "python_version": python_version, + } + + +def get_request_info() -> Dict[str, object]: + """Extract useful request information (works in 
Flask).""" + # Prefer X-Forwarded-For if behind a proxy + xff = request.headers.get("X-Forwarded-For", "") + if xff: + client_ip = xff.split(",")[0].strip() + else: + client_ip = request.remote_addr or "" + + return { + "client_ip": client_ip, + "user_agent": request.headers.get("User-Agent", ""), + "method": request.method, + "path": request.path, + } + + +@app.before_request +def before_request_log() -> None: + """Capture request start timing and emit ingress event.""" + g.request_start = time.perf_counter() + g.request_id = request.headers.get("X-Request-ID", str(uuid.uuid4())) + endpoint = normalize_endpoint() + g.metrics_endpoint = endpoint + g.metrics_method = request.method + HTTP_REQUESTS_IN_PROGRESS.labels(method=request.method, endpoint=endpoint).inc() + info = get_request_info() + logger.info( + "request_started", + extra={ + "request_id": g.request_id, + "method": info["method"], + "path": info["path"], + "client_ip": info["client_ip"], + }, + ) + + +@app.after_request +def after_request_log(response): + """Emit request completion event with latency and status code.""" + started = getattr(g, "request_start", time.perf_counter()) + endpoint = getattr(g, "metrics_endpoint", normalize_endpoint()) + method = getattr(g, "metrics_method", request.method) + duration_seconds = max(time.perf_counter() - started, 0.0) + duration_ms = round(duration_seconds * 1000, 2) + + HTTP_REQUESTS_TOTAL.labels( + method=method, + endpoint=endpoint, + status_code=str(response.status_code), + ).inc() + HTTP_REQUEST_DURATION_SECONDS.labels(method=method, endpoint=endpoint).observe(duration_seconds) + HTTP_REQUESTS_IN_PROGRESS.labels(method=method, endpoint=endpoint).dec() + + logger.info( + "request_completed", + extra={ + "request_id": getattr(g, "request_id", None), + "method": request.method, + "path": request.path, + "status_code": response.status_code, + "client_ip": request.headers.get("X-Forwarded-For", request.remote_addr or ""), + "duration_ms": duration_ms, + }, + ) + 
return response + + +@app.route("/") +def index(): + """Main endpoint returning service, system, runtime and request info.""" + logger.info("handle_index") + DEVOPS_INFO_ENDPOINT_CALLS_TOTAL.labels(endpoint="/").inc() + + uptime = get_uptime() + now = datetime.now(timezone.utc).isoformat() + with DEVOPS_INFO_SYSTEM_COLLECTION_SECONDS.time(): + system_info = get_system_info() + + payload = { + "service": { + "name": APP_NAME, + "version": APP_VERSION, + "description": APP_DESCRIPTION, + "framework": FRAMEWORK, + }, + "system": system_info, + "runtime": { + "uptime_seconds": uptime["seconds"], + "uptime_human": uptime["human"], + "current_time": now, + "timezone": "UTC", + }, + "request": get_request_info(), + "endpoints": [ + {"path": "/", "method": "GET", "description": "Service information"}, + {"path": "/health", "method": "GET", "description": "Health check"}, + ], + } + + return jsonify(payload) + + +@app.route("/health") +def health(): + """Simple health endpoint suitable for liveness/readiness probes.""" + DEVOPS_INFO_ENDPOINT_CALLS_TOTAL.labels(endpoint="/health").inc() + uptime = get_uptime() + timestamp = datetime.now(timezone.utc).isoformat() + logger.info("health_check") + return jsonify( + { + "status": "healthy", + "timestamp": timestamp, + "uptime_seconds": uptime["seconds"], + } + ), 200 + + +@app.route("/metrics") +def metrics(): + """Expose Prometheus metrics endpoint.""" + DEVOPS_INFO_ENDPOINT_CALLS_TOTAL.labels(endpoint="/metrics").inc() + return generate_latest(), 200, {"Content-Type": CONTENT_TYPE_LATEST} + + +@app.errorhandler(404) +def not_found(e): + logger.warning( + "not_found", + extra={ + "request_id": getattr(g, "request_id", None), + "method": request.method, + "path": request.path, + "status_code": 404, + "client_ip": request.headers.get("X-Forwarded-For", request.remote_addr or ""), + }, + ) + return ( + jsonify({"error": "Not Found", "message": "Endpoint does not exist"}), + 404, + ) + + +@app.errorhandler(500) +def 
internal_error(e): + logger.exception( + "internal_error", + extra={ + "request_id": getattr(g, "request_id", None), + "method": request.method if request else None, + "path": request.path if request else None, + "status_code": 500, + }, + ) + return ( + jsonify({"error": "Internal Server Error", "message": "An unexpected error occurred"}), + 500, + ) + + +if __name__ == "__main__": + logger.info( + "app_start", + extra={ + "app": APP_NAME, + "version": APP_VERSION, + "host": HOST, + "port": PORT, + "debug": DEBUG, + }, + ) + # Flask 3.1 uses app.run as usual for development. In production, use a WSGI server. + app.run(host=HOST, port=PORT, debug=DEBUG) diff --git a/app_python/docs/LAB01.md b/app_python/docs/LAB01.md new file mode 100644 index 0000000000..32ba48305a --- /dev/null +++ b/app_python/docs/LAB01.md @@ -0,0 +1,198 @@ +# LAB01 - DevOps Info Service (Task 1) + +This document describes the implementation of Task 1 (Python web application) +for the DevOps course. The service is implemented using Flask and provides a +main `/` endpoint with detailed service/system/runtime information and a +`/health` endpoint for monitoring. + +## Framework selection + +- Chosen framework: **Flask 3.1** + +Reasons for Flask: +- Lightweight and well-known in educational contexts. +- Simple to extend with logging, health checks and configuration. +- Minimal surface area - ideal for iteratively adding DevOps tooling. 
+ +Comparison with alternatives: + +| Framework | Advantages | Trade-offs | Verdict for Lab 1 | +|-----------|------------|------------|-------------------| +| Flask | Minimal setup, synchronous by default, rich ecosystem | Needs manual docs generation | Chosen - balances simplicity and control | +| FastAPI | Async support, auto-generated docs (OpenAPI), type hints | Slight learning curve for async; more dependencies | Overkill for two simple endpoints | +| Django | Batteries included (ORM, admin, auth) | Heavyweight, requires project scaffolding | Too much ceremony for a lightweight info service | + +## What I implemented (requirements coverage) + +- `GET /` - returns JSON with `service`, `system`, `runtime`, `request`, and + `endpoints` sections. (Done) +- `GET /health` - returns `status`, `timestamp`, and `uptime_seconds`. + (Done) +- Environment-configurable `HOST`, `PORT`, `DEBUG`. (Done) +- Logging and basic error handlers for 404 and 500. (Done) + +Files changed/added: +- `app.py` - main Flask application and endpoints +- `requirements.txt` - pinned dependencies (`Flask==3.1.0`, `gunicorn`) +- `README.md` - usage and run instructions +- `.gitignore` - common Python ignores + +## Task 2 — Documentation & Best Practices + +1. Application README (`app_python/README.md`) — Required sections: + - Overview — present + - Prerequisites — present (Python 3.11+) + - Installation — present (venv + pip install) + - Running the Application — present with examples (including custom PORT/HOST) + - API Endpoints — present + - Configuration — present (table with `HOST`, `PORT`, `DEBUG`) + + Status: Done. See `app_python/README.md` for the full user-facing instructions. + +2. Best Practices implemented in code: + - Clean code organization with helper functions (`get_system_info`, `get_uptime`, `get_request_info`) — Done (`app.py`). + - Error handling with JSON responses for 404 and 500 — Done (`app.py`). + - Logging configuration and usage (INFO level) — Done (`app.py`). 
+ - Dependencies pinned in `requirements.txt` — Done. + +3. Lab Submission (`app_python/docs/LAB01.md`) — This report includes: + - Framework selection and comparison — present above. + - Best practices applied with code snippets — present below. + - API documentation with examples — present below. + - Testing evidence instructions and screenshot checklist — present below. + - Challenges & Solutions — present below. + +## Best practices applied + +1. **Configuration via environment variables (12-factor app principle).** + + ```python + HOST = os.getenv("HOST", "0.0.0.0") + PORT = int(os.getenv("PORT", 5000)) + DEBUG = os.getenv("DEBUG", "False").lower() == "true" + ``` + +2. **Clear function separation (`get_system_info`, `get_uptime`, `get_request_info`).** + + ```python + def get_system_info() -> Dict[str, object]: + return { + "hostname": socket.gethostname(), + "platform": platform.system(), + "architecture": platform.machine(), + "python_version": platform.python_version(), + } + ``` + +3. **Timezone-aware timestamps and uptime calculations.** + + ```python + START_TIME = datetime.now(timezone.utc) + delta = datetime.now(timezone.utc) - START_TIME + ``` + +4. **Structured logging.** + + ```python + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + logger.info("Handling main endpoint request: %s %s", request.method, request.path) + ``` + +5. 
**JSON error handlers for better API UX.** + + ```python + @app.errorhandler(404) + def not_found(e): + return jsonify({"error": "Not Found", "message": "Endpoint does not exist"}), 404 + ``` + +## API Documentation and examples + +1) GET / + +Request: + +```bash +curl -s http://127.0.0.1:5000/ +``` + +Response (example): + +```json +{ + "service": {"name": "devops-info-service", "version": "1.0.0", "description": "DevOps course info service", "framework": "Flask"}, + "system": {"hostname": "my-host", "platform": "Linux", "platform_version": "#1 SMP ...", "architecture": "x86_64", "cpu_count": 4, "python_version": "3.11.4"}, + "runtime": {"uptime_seconds": 12, "uptime_human": "0 hours, 0 minutes", "current_time": "2026-01-25T12:00:00+00:00", "timezone": "UTC"}, + "request": {"client_ip": "127.0.0.1", "user_agent": "curl/7.81.0", "method": "GET", "path": "/"}, + "endpoints": [{"path": "/", "method": "GET", "description": "Service information"}, {"path": "/health", "method": "GET", "description": "Health check"}] +} +``` + +2) GET /health + +Request: + +```bash +curl -s http://127.0.0.1:5000/health +``` + +Response (example): + +```json +{ + "status": "healthy", + "timestamp": "2026-01-25T12:00:05+00:00", + "uptime_seconds": 15 +} +``` + +## How to run locally + +1. Create and activate a virtual environment. +2. Install dependencies: `pip install -r requirements.txt`. +3. Run the app: `python app.py` (default binds to `0.0.0.0:5000`). + +Or using gunicorn (4 workers): + +```bash +gunicorn -w 4 -b 0.0.0.0:8000 app:app +``` + +## Testing evidence + +Place screenshots taken while manually testing the endpoints in +`app_python/docs/screenshots/` as required by the lab. Capture: + +1. `01-main-endpoint.png` - browser showing the full JSON from `GET /`. +2. `02-health-check.png` - response from `GET /health` (status + uptime). +3. 
`03-formatted-output.png` - pretty-printed output + +Quick local checks (after `PORT=8080 python3 app.py`): + +```bash +curl -s http://127.0.0.1:8080/ | jq . +curl -s http://127.0.0.1:8080/health | jq . +python3 -m py_compile app.py +``` + +Outcome: commands completed without errors (syntax check passes, endpoints return JSON that matches the schema above). + +## Challenges & Solutions + +- Challenge: Ensuring timestamps and uptime are timezone-aware and stable. + Solution: Use `datetime.now(timezone.utc)` and store a UTC `START_TIME`. +- Challenge: Getting the correct client IP behind proxies. + Solution: Prefer `X-Forwarded-For` header when present, with a safe + fallback to Flask's `request.remote_addr`. + +## GitHub Community + +- Starring the course repository and `simple-container-com/api` surfaces them + in your network, signaling support and making it easier to discover future + updates or issues to contribute to. +- Following the professor, TAs and classmates keeps their activity in your + feed, which helps coordination on team projects and exposes you to career + opportunities or best practices they share. \ No newline at end of file diff --git a/app_python/docs/LAB02.md b/app_python/docs/LAB02.md new file mode 100644 index 0000000000..0a4694f014 --- /dev/null +++ b/app_python/docs/LAB02.md @@ -0,0 +1,243 @@ +# LAB02 — Docker Containerization + +This report documents the Docker containerization of the Lab 1 Python app. It follows the Lab 2 checklist and includes build/run evidence placeholders and analysis. + +## 1. Docker Best Practices Applied + +**Non-root user** +- Implemented with `adduser`/`addgroup` and `USER app`. +- Why it matters: reduces the blast radius in case of compromise and is a standard container security practice. + +**Specific base image version** +- Using `python:3.13-slim`. +- Why it matters: fixed versions make builds reproducible and reduce unintended breaking changes. 
+ +**Layer caching (dependencies before source code)** +- `requirements.txt` is copied and installed before `app.py` is copied. +- Why it matters: changes in app code do not invalidate dependency layers, making rebuilds faster. + +**Minimal build context via `.dockerignore`** +- Excludes venvs, tests, docs, git files and caches. +- Why it matters: smaller context → faster builds, smaller images, lower risk of leaking dev files. + +**Only necessary files copied** +- Only `requirements.txt` and `app.py` are copied into the image. +- Why it matters: smaller image surface and fewer attack vectors. + +**Dockerfile snippets** + +```dockerfile +FROM python:3.13-slim +WORKDIR /app +RUN addgroup --system app && adduser --system --ingroup app app +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt +COPY app.py ./ +USER app +CMD ["python", "app.py"] +``` + +## 2. Image Information & Decisions + +- **Base image:** `python:3.13-slim` — chosen for small size while keeping Debian compatibility. +- **Final image size:** 184MB (image ID: `5cae74f76afd`) — measured after local build. +- **Layer structure:** + 1. Base OS + Python runtime + 2. Non-root user creation + 3. Dependencies (pip install) + 4. Application source +- **Optimization choices:** + - `--no-cache-dir` for pip to avoid cache bloat. + - `.dockerignore` to reduce build context. + +## 3. Build & Run Process (evidence) + +### Build output + +```text +$ docker build -t devops-info-service:lab02 /home/ian/Desktop/DevOps-Core-Course/app_python +...build output excerpt... +[+] Building 75.0s (15/15) FINISHED + => [internal] load build definition from Dockerfile 0.1s + => [1/6] FROM docker.io/library/python:3.13-slim@sha256:... 
44.2s + => [2/6] WORKDIR /app 0.3s + => [3/6] RUN addgroup --system app && adduser --system --ingroup app app 0.7s + => [4/6] COPY requirements.txt ./ 0.2s + => [5/6] RUN pip install --no-cache-dir -r requirements.txt 11.6s + => [6/6] COPY app.py ./ 0.2s + => exporting to image 1.8s + => => naming to docker.io/library/devops-info-service:lab02 0.0s + +Image built: devops-info-service:lab02 +Image ID: 5cae74f76afd +Image size: 184MB +``` + +### Run output + +```text +$ docker run --rm -p 8080:5000 devops-info-service:lab02 +2026-02-02 14:08:21,288 - __main__ - INFO - Starting devops-info-service on 0.0.0.0:5000 (debug=False) + * Serving Flask app 'app' + * Debug mode: off +2026-02-02 14:08:21,305 - werkzeug - INFO - WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead. + * Running on all addresses (0.0.0.0) + * Running on http://127.0.0.1:5000 + * Running on http://172.17.0.2:5000 +2026-02-02 14:08:21,305 - werkzeug - INFO - Press CTRL+C to quit +2026-02-02 14:54:42,426 - __main__ - INFO - Handling main endpoint request: GET / +2026-02-02 14:54:42,427 - werkzeug - INFO - 172.17.0.1 - - [02/Feb/2026 14:54:42] "GET / HTTP/1.1" 200 - +2026-02-02 14:54:42,441 - werkzeug - INFO - 172.17.0.1 - - [02/Feb/2026 14:54:42] "GET /health HTTP/1.1" 200 - +2026-02-02 14:54:59,350 - __main__ - INFO - Handling main endpoint request: GET / +2026-02-02 14:54:59,350 - werkzeug - INFO - 172.17.0.1 - - [02/Feb/2026 14:54:59] "GET / HTTP/1.1" 200 - +``` + +### Endpoint tests + +```text +$ curl -s http://127.0.0.1:8080/ | jq . 
+{ + "endpoints": [ + { + "description": "Service information", + "method": "GET", + "path": "/" + }, + { + "description": "Health check", + "method": "GET", + "path": "/health" + } + ], + "request": { + "client_ip": "172.17.0.1", + "method": "GET", + "path": "/", + "user_agent": "curl/8.5.0" + }, + "runtime": { + "current_time": "2026-02-02T14:54:42.426679+00:00", + "timezone": "UTC", + "uptime_human": "0 hours, 46 minutes", + "uptime_seconds": 2781 + }, + "service": { + "description": "DevOps course info service", + "framework": "Flask", + "name": "devops-info-service", + "version": "1.0.0" + }, + "system": { + "architecture": "x86_64", + "cpu_count": 14, + "hostname": "02584e0e525b", + "platform": "Linux", + "platform_version": "#1 SMP PREEMPT_DYNAMIC Thu Mar 20 16:36:58 UTC 2025", + "python_version": "3.13.11" + } +} + +$ curl -s http://127.0.0.1:8080/health | jq . +{ + "status": "healthy", + "timestamp": "2026-02-02T14:54:42.440819+00:00", + "uptime_seconds": 2781 +} + +Additional curl run (later): +{ + "runtime": { + "current_time": "2026-02-02T14:54:59.350316+00:00", + "timezone": "UTC", + "uptime_human": "0 hours, 46 minutes", + "uptime_seconds": 2798 + } +} +``` + +### Docker Hub + +I pushed the image to Docker Hub under the username `alliumpro` and verified a public pull. 
+ +Commands executed: + +```bash +docker tag devops-info-service:lab02 alliumpro/devops-info-service:lab02 +docker push alliumpro/devops-info-service:lab02 +docker rmi alliumpro/devops-info-service:lab02 devops-info-service:lab02 +docker pull alliumpro/devops-info-service:lab02 +``` + +Push output (excerpt): + +```text +The push refers to repository [docker.io/alliumpro/devops-info-service] +36b6de65fd8d: Pushed +7c7ec8605b81: Pushed +8d21c49cbaec: Pushed +703084cd5f7b: Pushed +6f400a2a56a1: Pushed +4c021db47d93: Pushed +0bee50492702: Pushed +8843ea38a07e: Pushed +75ee186ea42c: Pushed +119d43eec815: Pushed +lab02: digest: sha256:5cae74f76afd9d00def8dc3981d08d7e18dba46ae39906a1c2e1f1ff22e6a1c4 size: 856 +``` + +Pull output (excerpt): + +```text +lab02: Pulling from alliumpro/devops-info-service +7c7ec8605b81: Pull complete +6f400a2a56a1: Pull complete +8d21c49cbaec: Pull complete +4c021db47d93: Pull complete +703084cd5f7b: Pull complete +Digest: sha256:5cae74f76afd9d00def8dc3981d08d7e18dba46ae39906a1c2e1f1ff22e6a1c4 +Status: Downloaded newer image for alliumpro/devops-info-service:lab02 +docker.io/alliumpro/devops-info-service:lab02 +``` + +Docker Hub repository URL: + +``` +https://hub.docker.com/r/alliumpro/devops-info-service +``` + +## 4. Technical Analysis + +- **Why the Dockerfile works:** it installs dependencies first (cached), then copies source, then runs as non-root for security. +- **If layer order changes:** copying `app.py` before installing requirements invalidates the cache on every code change, slowing rebuilds. +- **Security considerations:** non-root user, minimal files copied, smaller base image, no build tools left in the final image. +- **How `.dockerignore` improves builds:** reduces context size, avoids sending venvs/tests/docs to the daemon, and reduces image bloat. + +## 5. Challenges & Solutions + +- **Challenge:** Port conflicts on 5000. + - **Solution:** Run container with `-p 8080:5000` or another free port. 
+- **Challenge:** Keeping build context small. + - **Solution:** Added `.dockerignore` and copied only required files. + +## 6. Checklist + +- [x] Dockerfile exists in `app_python/` +- [x] Specific base image version used +- [x] Non-root user configured +- [x] Proper layer ordering (deps before code) +- [x] Only necessary files copied +- [x] `.dockerignore` present +- [x] Image built successfully (build output included above) +- [x] Container runs and app works (run output and endpoint tests included above) +- [x] Image pushed to Docker Hub (`alliumpro/devops-info-service:lab02`) — see Docker Hub section above +- [x] Public pull verified (pull output included above) +--- + +## Final Report (Checklist Summary) + +1. **Best Practices Applied:** Non-root user, slim base image, dependency caching, minimal build context, no unnecessary files. +2. **Image Decisions:** `python:3.13-slim`; pip cache disabled; `.dockerignore` reduces context. +3. **Build/Run Evidence:** Included above — build output, image ID/size, container logs and endpoint tests are present in Section 3. +4. **Technical Analysis:** Layer order affects caching; non-root improves security; `.dockerignore` speeds build. +5. **Challenges & Solutions:** Port conflicts solved with custom port mapping. +6. **Docker Hub:** Image pushed to Docker Hub and public pull verified. Repository: https://hub.docker.com/r/alliumpro/devops-info-service diff --git a/app_python/docs/LAB03.md b/app_python/docs/LAB03.md new file mode 100644 index 0000000000..2aedc8597f --- /dev/null +++ b/app_python/docs/LAB03.md @@ -0,0 +1,112 @@ +# Lab 03 — Continuous Integration (CI/CD) + +All work is organized by the tasks from the lab statement. Commands were executed from the repository root unless noted otherwise. + +## Task 1 — Unit Testing (3 pts) + +- **Framework choice:** `pytest` was selected for its concise syntax, fixture system, rich plugin ecosystem (pytest-cov, pytest-mock), and seamless integration with Flask test clients. 
It is the de-facto standard for modern Python services. +- **Test structure:** `app_python/tests/test_app.py` contains five focused tests: + 1. `test_index_structure` — verifies `/` returns the expected top-level sections. + 2. `test_index_service_fields` — checks service metadata (name, version, framework). + 3. `test_index_request_fields_with_forwarded_for` — asserts we honor the `X-Forwarded-For` header when building request info. + 4. `test_health_endpoint` — ensures `/health` returns `status=healthy` and includes uptime seconds. + 5. `test_404_returns_json` — covers the error handler and JSON body for nonexistent routes. +- **Error coverage:** Besides the happy-path assertions, the 404 test exercises error handling, and the request info test simulates proxy headers to cover branchy logic. +- **Local execution evidence:** + +```text +$ /bin/python3.14 -m pytest -q +..... [100%] +``` + +The suite currently holds five tests and runs in <1 s locally. Instructions for running tests are documented in `app_python/README.md`. + +## Task 2 — GitHub Actions CI Workflow (4 pts) + +- **Workflow file:** `.github/workflows/python-ci.yml` +- **Triggers:** Runs on push to `main`, `master`, and `lab3`, **every Git tag push**, manual `workflow_dispatch`, plus pull requests targeting `main/master`. Path filters ensure it only fires when files under `app_python/**` (or the workflow itself) change. 
+- **Job topology:** + - `test-and-lint` (matrix over Python 3.11 & 3.12) + - Restores pip cache + - Installs prod + dev dependencies + - Runs `flake8` (fails on lint errors) + - Runs pytest (`--maxfail=1`) + - Executes Snyk dependency scan when `SNYK_TOKEN` secret is provided + - `build-and-push` (depends on previous job, runs on `main`, `master`, `lab3`, and tags) + - Verifies `DOCKERHUB_REPO` secret is set (`IMAGE` env) + - Uses Buildx/QEMU to build the Docker image + - Tags images with CalVer (`YYYY.MM.DD-RUN_NUMBER`) + `latest` + - Pushes to Docker Hub using `docker/login-action` +- **Versioning strategy:** Calendar Versioning (CalVer). Example tags: `2026.02.07-42` and `latest`. CalVer was chosen because this service is deployed continuously after each lab, and the date communicates freshness better than semantic bumping. +- **Secrets required:** + - `DOCKERHUB_USERNAME`, `DOCKERHUB_TOKEN`, `DOCKERHUB_REPO` (e.g., `alliumpro/devops-info-service`) + - `SNYK_TOKEN` (optional but recommended to get security feedback) +- **Evidence captured after pushing:** + - GitHub Actions run (matrix + build job) — see Screenshot 1 below. + - Docker Hub repository showing the CalVer tag (`2026.02.07-XX`) plus `latest` — Screenshot 2. + - Snyk log excerpt proving the security step passed — Screenshot 3. + - Local pytest run output — Screenshot 4. + +## Task 3 — CI Best Practices & Security (3 pts) + +### Status badge +- Added to `app_python/README.md` directly in the title line so the repo always shows the latest workflow status for `lab3`. + +### Dependency caching +- Implemented via `actions/setup-python@v5` built-in pip cache with dependency-path hashing. 
The measured improvement (local experiment replicating cold vs warm install) is below: + +| Scenario | Command | Duration | +| --- | --- | --- | +| Cold install (no cache, force reinstall) | `/bin/python3.14 -m pip install --no-cache-dir --force-reinstall -r requirements.txt -r requirements-dev.txt` | **14.95 s** | +| Warm install (cache hit) | `/bin/python3.14 -m pip install -r requirements.txt -r requirements-dev.txt` | **0.59 s** | + +This yields ~25× faster installs on repeated CI runs. + +### Snyk integration +- Workflow uses `snyk/actions/python@master` with `--severity-threshold=high`. Supply `SNYK_TOKEN` in repo secrets to enable. +- Manual verification: consulted [Snyk Advisor](https://security.snyk.io/package/pip/flask/3.1.0) and related advisories on 2026-02-07 — Flask 3.1.0, gunicorn 21.2.0, pytest 8.3.3, and pytest-cov 4.1.0 have **no high/critical open CVEs**. Once the token is present, the CI run will emit the exact Snyk report; capture that log for submission. + +### Additional best practices (≥3) + +1. **Matrix builds (3.11 & 3.12):** Ensures future Python upgrades are vetted automatically. +2. **Path filters:** Prevent needless CI runs when unrelated folders change, saving minutes per push. +3. **Job dependencies + conditional deploy:** Docker images only build/push after lint/tests pass and only on protected branches/tags. +4. **Concurrency control:** `concurrency` cancels outdated runs on the same branch to free runners quickly. +5. **Fail-fast pytest config:** `--maxfail=1` provides quicker feedback. + +### README / documentation updates +- README now documents CI badge, how to run tests/linting, and which secrets to configure. +- This report (LAB03) captures workflow design, evidence, and measurements. 
+ +## Evidence & commands to rerun + +| Item | Command / Link | +| --- | --- | +| Local tests | ``/bin/python3.14 -m pytest -q`` | +| Lint | ``/bin/python3.14 -m flake8 app_python`` | +| Cold vs warm pip timings | see table above (commands already captured) | +| Workflow runs | Push branch to GitHub → Actions tab → “Python CI — tests, lint, build & push” | +| Docker Hub image | https://hub.docker.com/r/alliumpro/devops-info-service (replace with your namespace if different) | + +## Screenshots + +All screenshots live in `app_python/docs/screenshots/` and are embedded below for quick reference: + +1. **CI pipeline success** — ![CI run](screenshots/04-ci-green-run.png) +2. **Docker Hub tags (CalVer + latest)** — ![Docker Hub](screenshots/05-dockerhub-calver.png) +3. **Snyk scan log** — ![Snyk scan](screenshots/06-snyk-scan.png) +4. **Local pytest run** — ![Pytest output](screenshots/07-pytest-local.png) +5. *(Bonus)* README badge proof — ![README badge](screenshots/08-readme-badge.png) + +## Submission checklist + +- [x] Testing framework chosen & justified (pytest) — see Task 1 section. +- [x] Tests for `/`, `/health`, and error cases (`app_python/tests/test_app.py`). +- [x] Local tests pass; instructions + output included. +- [x] Workflow `.github/workflows/python-ci.yml` added with lint/test + Docker build/push. +- [x] CalVer tagging implemented (date + run number) plus `latest` tag. +- [x] Workflow triggers + secrets documented. +- [x] Status badge added to README. +- [x] Dependency caching implemented and measured (table above). +- [x] Snyk scan integrated (requires `SNYK_TOKEN`). +- [x] ≥3 CI best practices documented (matrix, path filters, concurrency, conditional deploy, fail-fast). 
\ No newline at end of file diff --git a/app_python/docs/screenshots/01-main-endpoint.png b/app_python/docs/screenshots/01-main-endpoint.png new file mode 100644 index 0000000000..d44659a778 Binary files /dev/null and b/app_python/docs/screenshots/01-main-endpoint.png differ diff --git a/app_python/docs/screenshots/02-health-check.png b/app_python/docs/screenshots/02-health-check.png new file mode 100644 index 0000000000..6e3d5d075e Binary files /dev/null and b/app_python/docs/screenshots/02-health-check.png differ diff --git a/app_python/docs/screenshots/03-formatted-output.png b/app_python/docs/screenshots/03-formatted-output.png new file mode 100644 index 0000000000..7675d0a6dc Binary files /dev/null and b/app_python/docs/screenshots/03-formatted-output.png differ diff --git a/app_python/docs/screenshots/04-ci-green-run.png b/app_python/docs/screenshots/04-ci-green-run.png new file mode 100644 index 0000000000..c99b2c3647 Binary files /dev/null and b/app_python/docs/screenshots/04-ci-green-run.png differ diff --git a/app_python/docs/screenshots/05-dockerhub-calver.png b/app_python/docs/screenshots/05-dockerhub-calver.png new file mode 100644 index 0000000000..a607568e86 Binary files /dev/null and b/app_python/docs/screenshots/05-dockerhub-calver.png differ diff --git a/app_python/docs/screenshots/06-snyk-scan.png b/app_python/docs/screenshots/06-snyk-scan.png new file mode 100644 index 0000000000..b301abae8b Binary files /dev/null and b/app_python/docs/screenshots/06-snyk-scan.png differ diff --git a/app_python/docs/screenshots/07-pytest-local.png b/app_python/docs/screenshots/07-pytest-local.png new file mode 100644 index 0000000000..a8dcfcc61b Binary files /dev/null and b/app_python/docs/screenshots/07-pytest-local.png differ diff --git a/app_python/docs/screenshots/08-readme-badge.png b/app_python/docs/screenshots/08-readme-badge.png new file mode 100644 index 0000000000..8d19fbd004 Binary files /dev/null and b/app_python/docs/screenshots/08-readme-badge.png 
differ diff --git a/app_python/docs/screenshots/09-ssh-connection.png b/app_python/docs/screenshots/09-ssh-connection.png new file mode 100644 index 0000000000..50da2ac44d Binary files /dev/null and b/app_python/docs/screenshots/09-ssh-connection.png differ diff --git a/app_python/docs/screenshots/10-server-configuration.png b/app_python/docs/screenshots/10-server-configuration.png new file mode 100644 index 0000000000..cb93f87111 Binary files /dev/null and b/app_python/docs/screenshots/10-server-configuration.png differ diff --git a/app_python/docs/screenshots/11-provision-1.png b/app_python/docs/screenshots/11-provision-1.png new file mode 100644 index 0000000000..9d27a3a869 Binary files /dev/null and b/app_python/docs/screenshots/11-provision-1.png differ diff --git a/app_python/docs/screenshots/12-ansible-ping.png b/app_python/docs/screenshots/12-ansible-ping.png new file mode 100644 index 0000000000..706c81642d Binary files /dev/null and b/app_python/docs/screenshots/12-ansible-ping.png differ diff --git a/app_python/docs/screenshots/13-provision-2.png b/app_python/docs/screenshots/13-provision-2.png new file mode 100644 index 0000000000..610fcbee5b Binary files /dev/null and b/app_python/docs/screenshots/13-provision-2.png differ diff --git a/app_python/docs/screenshots/14-deploy.png b/app_python/docs/screenshots/14-deploy.png new file mode 100644 index 0000000000..80cf96cc91 Binary files /dev/null and b/app_python/docs/screenshots/14-deploy.png differ diff --git a/app_python/docs/screenshots/15-grafana-loki.png b/app_python/docs/screenshots/15-grafana-loki.png new file mode 100644 index 0000000000..f500eccf63 Binary files /dev/null and b/app_python/docs/screenshots/15-grafana-loki.png differ diff --git a/app_python/docs/screenshots/16-grafana-dashboards.png b/app_python/docs/screenshots/16-grafana-dashboards.png new file mode 100644 index 0000000000..c5bd1b4a49 Binary files /dev/null and b/app_python/docs/screenshots/16-grafana-dashboards.png differ diff 
--git a/app_python/docs/screenshots/17-docker-containers.png b/app_python/docs/screenshots/17-docker-containers.png new file mode 100644 index 0000000000..2580947e03 Binary files /dev/null and b/app_python/docs/screenshots/17-docker-containers.png differ diff --git a/app_python/docs/screenshots/18-grafana.png b/app_python/docs/screenshots/18-grafana.png new file mode 100644 index 0000000000..28c7129939 Binary files /dev/null and b/app_python/docs/screenshots/18-grafana.png differ diff --git a/app_python/docs/screenshots/19-prometheus.png b/app_python/docs/screenshots/19-prometheus.png new file mode 100644 index 0000000000..f99c93eb11 Binary files /dev/null and b/app_python/docs/screenshots/19-prometheus.png differ diff --git a/app_python/docs/screenshots/20-query.png b/app_python/docs/screenshots/20-query.png new file mode 100644 index 0000000000..72e4f876b5 Binary files /dev/null and b/app_python/docs/screenshots/20-query.png differ diff --git a/app_python/docs/screenshots/21-persistence.png b/app_python/docs/screenshots/21-persistence.png new file mode 100644 index 0000000000..fd6b8c2274 Binary files /dev/null and b/app_python/docs/screenshots/21-persistence.png differ diff --git a/app_python/requirements-dev.txt b/app_python/requirements-dev.txt new file mode 100644 index 0000000000..2a2434395f --- /dev/null +++ b/app_python/requirements-dev.txt @@ -0,0 +1,4 @@ +pytest==8.3.3 +pytest-cov==4.1.0 +flake8==6.1.0 +requests==2.31.0 diff --git a/app_python/requirements.txt b/app_python/requirements.txt new file mode 100644 index 0000000000..8ae40f3f43 --- /dev/null +++ b/app_python/requirements.txt @@ -0,0 +1,4 @@ +Flask==3.1.0 +gunicorn==21.2.0 +python-json-logger==2.0.7 +prometheus-client==0.23.1 diff --git a/app_python/tests/__init__.py b/app_python/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/app_python/tests/test_app.py b/app_python/tests/test_app.py new file mode 100644 index 0000000000..9ebbe03e89 --- /dev/null +++ 
b/app_python/tests/test_app.py @@ -0,0 +1,74 @@ +import pytest + +from app import app as flask_app + + +@pytest.fixture +def client(): + flask_app.config.update(TESTING=True) + with flask_app.test_client() as client: + yield client + + +def test_index_structure(client): + resp = client.get("/") + assert resp.status_code == 200 + data = resp.get_json() + assert isinstance(data, dict) + # Required top-level keys + for key in ("service", "system", "runtime", "request", "endpoints"): + assert key in data + + +def test_index_service_fields(client): + resp = client.get("/") + data = resp.get_json() + svc = data["service"] + assert svc["name"] == "devops-info-service" + assert "version" in svc + assert "framework" in svc + + +def test_index_request_fields_with_forwarded_for(client): + resp = client.get("/", headers={"X-Forwarded-For": "203.0.113.5, 10.0.0.1"}) + data = resp.get_json() + req = data["request"] + assert req["client_ip"] == "203.0.113.5" + assert req["method"] == "GET" + assert req["path"] == "/" + + +def test_health_endpoint(client): + resp = client.get("/health") + assert resp.status_code == 200 + data = resp.get_json() + assert data.get("status") == "healthy" + assert "uptime_seconds" in data + + +def test_404_returns_json(client): + resp = client.get("/no-such-path") + assert resp.status_code == 404 + data = resp.get_json() + assert data.get("error") == "Not Found" + + +def test_metrics_endpoint_exposes_prometheus_text(client): + # Generate traffic so counters/histograms have samples. 
+ client.get("/") + client.get("/health") + client.get("/no-such-path") + + resp = client.get("/metrics") + assert resp.status_code == 200 + assert "text/plain" in resp.headers.get("Content-Type", "") + + payload = resp.get_data(as_text=True) + assert "# HELP http_requests_total" in payload + assert "# TYPE http_requests_total counter" in payload + assert "# HELP http_request_duration_seconds" in payload + assert "# TYPE http_request_duration_seconds histogram" in payload + assert "# HELP http_requests_in_progress" in payload + assert "# TYPE http_requests_in_progress gauge" in payload + assert "devops_info_endpoint_calls_total" in payload + assert "devops_info_system_collection_seconds" in payload diff --git a/docs/LAB04.md b/docs/LAB04.md new file mode 100644 index 0000000000..6350160056 --- /dev/null +++ b/docs/LAB04.md @@ -0,0 +1,170 @@ +# Lab 04 — Infrastructure as Code (Local VM Path) + +All work was completed on 19 Feb 2026 following the "local VM" allowance from the lab brief. Instead of cloud IaC tooling, I provisioned and secured a dedicated HostVDS instance that will be reused in Lab 5. + +## 1. Cloud Provider & Infrastructure + +| Item | Details | +| --- | --- | +| Provider | HostVDS (KVM) | +| Region | France (eu-west2) | +| Tariff | Burstable-1 — 1 vCPU / 1 GB RAM / 10 GB SSD | +| OS | Ubuntu Server 24.04 LTS | +| Public IP | 31.56.228.103 | +| Purpose | Persistent VM for Labs 4–5 | + +### Provisioning & hardening steps +1. Uploaded my `ssh-ed25519` public key into the HostVDS control panel and created the VM on the Burstable-1 plan. +2. First login: `ssh root@31.56.228.103` (key-based). +3. Base updates: `apt update && apt upgrade -y`. +4. Created an unprivileged sudo user for Ansible work: `adduser devops` (password set to `-`) and `usermod -aG sudo devops`. +5. 
Installed the key for the new user: + ```bash + mkdir -p /home/devops/.ssh + echo "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE0K5bp2Pc8b8v8VToLmDagTwDh6iXHWPAXkI6FuPKCf" > /home/devops/.ssh/authorized_keys + chown -R devops:devops /home/devops/.ssh + chmod 700 /home/devops/.ssh + chmod 600 /home/devops/.ssh/authorized_keys + ``` +6. SSH hardening under `/etc/ssh/sshd_config`: + - `PasswordAuthentication no` + - `PermitRootLogin prohibit-password` + - Restarted via `sudo systemctl restart ssh`. +7. Firewall (`ufw`) configuration for upcoming labs: + ```bash + sudo apt install -y ufw + sudo ufw allow 22/tcp + sudo ufw allow 80/tcp + sudo ufw allow 5000/tcp + sudo ufw --force enable + sudo ufw status + ``` +8. Verified non-root access: `ssh devops@31.56.228.103` + `sudo whoami`. + +### Evidence +- HostVDS console state — see Figure 1. +- SSH session under `devops` with firewall proof — see Figure 2. + +## 2. Terraform Implementation (Local Alternative) +Because HostVDS does not expose an official Terraform provider, I followed the "local VM" substitution described in the lab brief. 
Nevertheless, I reviewed Terraform workflows to ensure I understand how the same infrastructure would be codified in a cloud that *does* have Terraform support (Yandex Cloud in my case): + +```hcl +terraform { + required_version = ">= 1.9.0" + required_providers { + yandex = { + source = "yandex-cloud/yandex" + version = "~> 0.113" + } + } +} + +provider "yandex" { + cloud_id = var.cloud_id + folder_id = var.folder_id + zone = var.zone +} + +resource "yandex_compute_instance" "vm" { + name = "lab4-terraform" + platform_id = "standard-v2" + resources { cores = 2 memory = 1 core_fraction = 20 } + boot_disk { initialize_params { image_id = data.yandex_compute_image.ubuntu.id size = 10 } } + network_interface { + subnet_id = yandex_vpc_subnet.default.id + nat = true + security_group_ids = [yandex_vpc_security_group.ssh_http.id] + } + metadata = { + "ssh-keys" = "ubuntu:${file(var.public_key_path)}" + } +} +``` + +Key takeaways (even without applying the code): +- Variables + outputs keep credentials and public IPs organised. +- Security groups (ingress 22/80/5000) mirror the manual HostVDS firewall rules. +- Terraform state must stay out of Git (`.gitignore` covers `*.tfstate`, `.terraform/`, `terraform.tfvars`). + +## 3. Pulumi Implementation (Conceptual) +Pulumi would reach the same target using Python, but again HostVDS lacks an API. 
I drafted the equivalent Pulumi sketch to cement the workflow: + +```python +import pulumi +import pulumi_yandex as yandex + +config = pulumi.Config() +cloud_id = config.require("cloudId") +folder_id = config.require("folderId") +zone = config.get("zone") or "ru-central1-a" + +net = yandex.VpcNetwork("lab4-net") +subnet = yandex.VpcSubnet( + "lab4-subnet", + zone=zone, + network_id=net.id, + v4_cidr_blocks=["10.10.0.0/24"], +) + +vm = yandex.ComputeInstance( + "lab4-pulumi", + zone=zone, + folder_id=folder_id, + platform_id="standard-v2", + resources=yandex.ComputeInstanceResourcesArgs(cores=2, memory=1, core_fraction=20), + boot_disk=yandex.ComputeInstanceBootDiskArgs( + initialize_params=yandex.ComputeInstanceBootDiskInitializeParamsArgs( + image_id="fd8od9rqj4p2g38qlu2c", # Ubuntu 24.04 family + size=10, + ) + ), + network_interface=[yandex.ComputeInstanceNetworkInterfaceArgs( + subnet_id=subnet.id, + nat=True, + )], + metadata={"ssh-keys": "ubuntu " + open("~/.ssh/id_ed25519.pub").read().strip()}, +) + +pulumi.export("public_ip", vm.network_interfaces[0].nat_ip_address) +``` + +Observations: +- Pulumi real code would live in `pulumi/__main__.py` with configs stored per stack. +- Secrets (cloud keys) are encrypted by default, unlike plain Terraform state. +- Logic-heavy scenarios (loops, conditionals) feel more natural in Pulumi, but for this lab the manual HostVDS VM already fulfils the requirement for Lab 5 preparation. + +## 4. Terraform vs Pulumi Comparison +| Aspect | Terraform (concept) | Pulumi (concept) | +| --- | --- | --- | +| Ease of learning | Declarative HCL is concise and matches the official lab examples. | Requires Python/TypeScript knowledge plus Pulumi-specific SDKs. | +| Code reuse | Modules and `for_each` provide reuse but stay constrained to HCL constructs. | Full programming language features, IDE linting, package reuse. | +| Debugging | `terraform plan` → single diff output; easy to read even without applying. 
| `pulumi preview` plus Python stack traces; more context when code fails. | +| State | Local/remote `.tfstate`, manual backend configuration. | Managed by Pulumi Service (encrypted) or self-hosted S3; automatic history. | +| When I would use it | Baseline infra in providers with first-class Terraform support (Yandex, AWS). | Complex infra with conditionals, or when teams want to reuse existing Python tooling. + +## 5. Lab 5 Preparation & Cleanup +- **VM kept for Lab 5:** HostVDS Burstable-1 at 31.56.228.103 with user `devops` (sudo, key-only SSH). +- **Open ports:** 22/tcp for SSH, 80/tcp for HTTP, 5000/tcp for the Flask app from previous labs. +- **Next steps before Lab 5:** install Docker + Python 3.11 toolchain on this VM, then point Ansible inventories to it. +- **Cleanup status:** No cloud IaC resources were created; the only running asset is the HostVDS VM documented above. + +## Appendix A — Command Reference +``` +ssh root@31.56.228.103 +apt update && apt upgrade -y +adduser devops +usermod -aG sudo devops +mkdir -p /home/devops/.ssh && echo "ssh-ed25519 AAAAC3..." > /home/devops/.ssh/authorized_keys +chown -R devops:devops /home/devops/.ssh && chmod 700 /home/devops/.ssh && chmod 600 /home/devops/.ssh/authorized_keys +sudo sed -i 's/^#\?PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config +sudo sed -i 's/^#\?PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config +sudo systemctl restart ssh +sudo apt install -y ufw +sudo ufw allow 22/tcp && sudo ufw allow 80/tcp && sudo ufw allow 5000/tcp +sudo ufw --force enable && sudo ufw status +ssh devops@31.56.228.103 +``` + +## Appendix B — Screenshots +- **Figure 1:** HostVDS control panel after provisioning — `app_python/docs/screenshots/10-server-configuration.png`. +- **Figure 2:** SSH session from the workstation showing key-based login and firewall status — `app_python/docs/screenshots/09-ssh-connection.png`. 
diff --git a/k8s/README.md b/k8s/README.md new file mode 100644 index 0000000000..d821349cb5 --- /dev/null +++ b/k8s/README.md @@ -0,0 +1,406 @@ +# LAB09 - Kubernetes Fundamentals + +## 1. Architecture Overview + +Deployment architecture for this lab: + +- Namespace: `devops-lab9` +- Main app: + - Deployment `devops-app` + - 5 replicas after scaling step (started from 3) + - Service `devops-app-service` (NodePort, `80 -> 5000`, nodePort `30080`) +- Bonus app: + - Deployment `devops-app2` (2 replicas) + - Service `devops-app2-service` (ClusterIP) +- Bonus ingress: + - Ingress `devops-lab9-ingress` + - Host `lab9.local` + - `/app1` -> `devops-app-service` + - `/app2` -> `devops-app2-service` + - TLS termination using secret `lab9-tls` + +Traffic flow: + +1. Client -> Service (`NodePort`) for basic app access. +2. Client -> Ingress Controller (HTTPS) -> path-based routing to app1/app2 services. + +Resource strategy: + +- Main app requests/limits: + - CPU request `100m`, limit `500m` + - Memory request `128Mi`, limit `256Mi` +- App2 requests/limits: + - CPU request `50m`, limit `200m` + - Memory request `64Mi`, limit `128Mi` + +## 2. Local Kubernetes Setup (Task 1) + +Chosen local cluster: `kind`. + +Why `kind`: + +- Lightweight and Docker-based. +- Good fit for local laptop and CI-like workflows. +- Fast cluster recreation. 
+ +Installation and setup: + +```powershell +winget install -e --id Kubernetes.kind --accept-package-agreements --accept-source-agreements +& "C:\Users\alliumpro\AppData\Local\Microsoft\WinGet\Packages\Kubernetes.kind_Microsoft.Winget.Source_8wekyb3d8bbwe\kind.exe" create cluster --name devops-lab9 --image kindest/node:v1.33.1 --wait 240s +kubectl config use-context kind-devops-lab9 +``` + +Evidence: + +```text +Kubernetes control plane is running at https://127.0.0.1:60304 +CoreDNS is running at https://127.0.0.1:60304/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy + +NAME STATUS ROLES AGE VERSION +devops-lab9-control-plane Ready control-plane 28s v1.33.1 +``` + +## 3. Manifest Files (Task 2, Task 3, Bonus) + +1. `namespace.yml` +- Creates dedicated namespace `devops-lab9`. + +2. `deployment.yml` +- Main app deployment. +- Initial replicas: `3`. +- Rolling strategy: + - `maxSurge: 1` + - `maxUnavailable: 0` +- Health checks: + - `livenessProbe` on `/health` + - `readinessProbe` on `/health` +- Resource requests/limits configured. +- Labeling used for selectors and organization. + +3. `service.yml` +- `NodePort` service for main app. +- Selector `app: devops-app`. +- Exposes `port 80` to container named port `http` (`5000`). +- Uses fixed nodePort `30080`. + +4. `app2-deployment.yml` (bonus) +- Second app deployment (`hashicorp/http-echo:1.0.0`), 2 replicas. + +5. `app2-service.yml` (bonus) +- ClusterIP service for second app. + +6. `ingress.yml` (bonus) +- Ingress class `nginx`. +- Path routing: + - `/app1(/|$)(.*)` -> `devops-app-service` + - `/app2(/|$)(.*)` -> `devops-app2-service` +- TLS for host `lab9.local` with secret `lab9-tls`. + +7. `tls-secret.example.yml` (bonus) +- Template for manually managed TLS secret. + +## 4. 
Deployment Evidence (Task 2 + Task 3) + +Apply core manifests: + +```bash +kubectl apply -f k8s/namespace.yml +kubectl apply -f k8s/deployment.yml +kubectl apply -f k8s/service.yml +``` + +Runtime evidence: + +```text +namespace/devops-lab9 created +deployment.apps/devops-app created +service/devops-app-service created +``` + +Image pull issue observed for remote tag `alliumpro/devops-app:lab08` (`ImagePullBackOff`). +For local kind execution, app image was built and loaded into cluster: + +```bash +docker build -t devops-app:lab09-local app_python +kind load docker-image devops-app:lab09-local --name devops-lab9 +kubectl set image deployment/devops-app app=devops-app:lab09-local -n devops-lab9 +``` + +After fix: + +```text +deployment "devops-app" successfully rolled out +``` + +Service state: + +```text +service/devops-app-service NodePort 10.96.100.245 80:30080/TCP +``` + +Endpoint verification (via port-forward): + +```text +URL: http://127.0.0.1:8080/ +{"service":{"name":"devops-info-service" ... }} + +URL: http://127.0.0.1:8080/health +{"status":"healthy", ...} + +URL: http://127.0.0.1:8080/metrics +python_info{implementation="CPython",major="3",minor="13",patchlevel="12",version="3.13.12"} 1.0 +http_requests_total{endpoint="/health",method="GET",status_code="200"} 15.0 +``` + +## 5. Operations Performed (Task 4) + +### 5.1 Scaling to 5 replicas + +Commands: + +```bash +kubectl scale deployment/devops-app --replicas=5 -n devops-lab9 +kubectl rollout status deployment/devops-app -n devops-lab9 +kubectl get deployment devops-app -n devops-lab9 +``` + +Evidence: + +```text +deployment.apps/devops-app scaled +deployment "devops-app" successfully rolled out +NAME READY UP-TO-DATE AVAILABLE AGE +devops-app 5/5 5 5 ... 
+``` + +### 5.2 Rolling update + +Update was demonstrated by changing configuration (`APP_ENV`), which triggers a new ReplicaSet: + +```bash +kubectl set env deployment/devops-app APP_ENV=kubernetes-v2 -n devops-lab9 +kubectl rollout status deployment/devops-app -n devops-lab9 +kubectl rollout history deployment/devops-app -n devops-lab9 +``` + +Evidence: + +```text +deployment.apps/devops-app env updated +deployment "devops-app" successfully rolled out +REVISION CHANGE-CAUSE +1 +2 +3 +``` + +### 5.3 Rollback + +Commands: + +```bash +kubectl rollout undo deployment/devops-app -n devops-lab9 +kubectl rollout status deployment/devops-app -n devops-lab9 +kubectl rollout history deployment/devops-app -n devops-lab9 +``` + +Evidence: + +```text +deployment.apps/devops-app rolled back +deployment "devops-app" successfully rolled out +REVISION CHANGE-CAUSE +1 +3 +4 +``` + +## 6. Additional Evidence (Task 5 Requirement) + +`kubectl get all -n devops-lab9` (excerpt): + +```text +NAME READY STATUS RESTARTS +pod/devops-app-79ffdd8489-6949q 1/1 Running 0 +pod/devops-app-79ffdd8489-98wc8 1/1 Running 0 +pod/devops-app-79ffdd8489-bk6pc 1/1 Running 0 +pod/devops-app-79ffdd8489-fk57z 1/1 Running 0 +pod/devops-app-79ffdd8489-rc7hx 1/1 Running 0 + +service/devops-app-service NodePort 10.96.100.245 80:30080/TCP + +deployment.apps/devops-app 5/5 +``` + +`kubectl describe deployment devops-app -n devops-lab9` (key fields): + +```text +Replicas: 5 desired | 5 updated | 5 total | 5 available +StrategyType: RollingUpdate +RollingUpdateStrategy: 0 max unavailable, 1 max surge +Image: devops-app:lab09-local +Liveness: http-get /health +Readiness: http-get /health +Requests: cpu=100m, memory=128Mi +Limits: cpu=500m, memory=256Mi +``` + +## Screenshots and Visual Proof + +1. Cluster setup (`kubectl cluster-info`, `kubectl get nodes`) + +![Cluster setup](screenshots/1-cluster-info.png) + +This screenshot confirms that the control plane is up and the kind node is in `Ready` state. + +2. 
Resource overview (`kubectl get all -n devops-lab9`) + +![Kubernetes resources](screenshots/2-k8s-get-all.png) + +Here we can see the running pods, service, deployment, and replica sets in the lab namespace. + +3. Deployment state (`kubectl get deployment devops-app -n devops-lab9`) + +![Deployment status](screenshots/3-deployment.png) + +This verifies the target replica count and that all replicas are available. + +4. Application route `/app1` + +![App1 response](screenshots/4-app-1.png) + +Ingress route `/app1` returns the Flask application response. + +5. Application route `/app2` + +![App2 response](screenshots/5-app-2.png) + +Ingress route `/app2` returns the second service response (`Hello from app2`). + +## 7. Production Considerations + +Health checks: + +- Both liveness and readiness probes target `/health`. +- Liveness restarts unhealthy containers. +- Readiness avoids sending traffic to pods not ready yet. + +Resource policy rationale: + +- Requests guarantee schedulable baseline resources. +- Limits prevent single pod from exhausting node resources. +- Current values are conservative and suitable for local lab load. + +Recommended production improvements: + +1. Replace static NodePort with Ingress + LoadBalancer in real environment. +2. Add HPA (CPU/custom metrics-based autoscaling). +3. Use pinned immutable images (`sha256`) and signed artifacts. +4. Add PodDisruptionBudget and topology spread constraints. +5. Add centralized logging and alerting (Prometheus, Alertmanager, Loki, Grafana). +6. Move secret generation to cert-manager + trusted issuer. + +Monitoring and observability: + +- App already exposes Prometheus metrics (`/metrics`). +- Can be scraped by Prometheus and visualized in Grafana. + +## 8. Challenges and Solutions + +1. No active local cluster initially +- Problem: `kubectl cluster-info` failed (`localhost:8080 refused`). +- Solution: installed `kind` and created cluster with Kubernetes `v1.33.1`. + +2. 
Docker daemon not available at first kind start +- Problem: kind failed to access Docker pipe. +- Solution: started Docker Desktop and retried cluster creation. + +3. ImagePullBackOff for remote app image +- Problem: tag from manifest could not be pulled in cluster. +- Solution: built local image and loaded it into kind (`kind load docker-image`), then updated deployment image. + +4. OpenSSL config path on Windows +- Problem: cert generation failed due missing default `openssl.cnf` path. +- Solution: set `OPENSSL_CONF` explicitly to Git OpenSSL config path and regenerated certificate. + +What was learned: + +- Declarative manifests are stable, but runtime validation always uncovers environment-specific issues. +- Rollout/rollback tooling in Kubernetes is straightforward and reliable when probes/resources are configured correctly. +- Ingress + TLS adds significant operational value versus direct NodePort usage. + +## 9. Bonus - Ingress with TLS + +Second app deployment: + +```bash +kubectl apply -f k8s/app2-deployment.yml +kubectl apply -f k8s/app2-service.yml +kubectl rollout status deployment/devops-app2 -n devops-lab9 +``` + +Ingress controller installation (kind profile): + +```bash +kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml +kubectl wait --namespace ingress-nginx --for=condition=ready pod --selector=app.kubernetes.io/component=controller --timeout=300s +``` + +TLS setup: + +```bash +$env:OPENSSL_CONF = "C:\Program Files\Git\usr\ssl\openssl.cnf" +openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout k8s/tls.key -out k8s/tls.crt -subj "/CN=lab9.local/O=lab9.local" +kubectl create secret tls lab9-tls --cert=k8s/tls.crt --key=k8s/tls.key -n devops-lab9 +kubectl apply -f k8s/ingress.yml +``` + +Ingress evidence: + +```text +NAME CLASS HOSTS ADDRESS PORTS +devops-lab9-ingress nginx lab9.local localhost 80, 443 + +TLS: + lab9-tls terminates lab9.local +Rules: + /app1 -> 
devops-app-service:80 + /app2 -> devops-app2-service:80 +``` + +HTTPS routing verification (through ingress-controller port-forward): + +```text +https://127.0.0.1:8443/app1/ -> 200; body={"endpoints":[...] +https://127.0.0.1:8443/app2/ -> 200; body=Hello from app2 +``` + +Why Ingress over multiple NodePorts: + +- Single entry point for many services. +- Path/host-based L7 routing. +- Native TLS termination at edge. +- Better fit for production traffic management. + +## 10. Cleanup + +```bash +kubectl delete ns devops-lab9 +kubectl delete ns ingress-nginx +kind delete cluster --name devops-lab9 +``` + +## Result + +Lab 09 is completed and reproducible from this repository. + +What was done in practice: + +- local kind cluster was configured and validated; +- application was deployed via declarative manifests with probes/resources; +- service access and endpoints were checked; +- scaling, rolling update, and rollback were demonstrated; +- bonus Ingress + TLS routing for two apps was configured and tested. + +The report now includes both terminal evidence and visual screenshots, so it is ready for submission. 
diff --git a/k8s/app2-deployment.yml b/k8s/app2-deployment.yml new file mode 100644 index 0000000000..637aea7904 --- /dev/null +++ b/k8s/app2-deployment.yml @@ -0,0 +1,45 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: devops-app2 + namespace: devops-lab9 + labels: + app: devops-app2 + tier: backend +spec: + replicas: 2 + revisionHistoryLimit: 3 + selector: + matchLabels: + app: devops-app2 + template: + metadata: + labels: + app: devops-app2 + tier: backend + spec: + containers: + - name: app2 + image: hashicorp/http-echo:1.0.0 + args: + - "-text=Hello from app2" + ports: + - containerPort: 5678 + name: http + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "200m" + memory: "128Mi" + livenessProbe: + tcpSocket: + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + readinessProbe: + tcpSocket: + port: http + initialDelaySeconds: 3 + periodSeconds: 5 diff --git a/k8s/app2-service.yml b/k8s/app2-service.yml new file mode 100644 index 0000000000..ee6069580d --- /dev/null +++ b/k8s/app2-service.yml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: devops-app2-service + namespace: devops-lab9 + labels: + app: devops-app2 +spec: + type: ClusterIP + selector: + app: devops-app2 + ports: + - name: http + protocol: TCP + port: 80 + targetPort: http diff --git a/k8s/deployment.yml b/k8s/deployment.yml new file mode 100644 index 0000000000..1e7868c6f1 --- /dev/null +++ b/k8s/deployment.yml @@ -0,0 +1,62 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: devops-app + namespace: devops-lab9 + labels: + app: devops-app + tier: backend +spec: + replicas: 3 + revisionHistoryLimit: 5 + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + selector: + matchLabels: + app: devops-app + template: + metadata: + labels: + app: devops-app + tier: backend + spec: + containers: + - name: app + image: alliumpro/devops-app:lab08 + imagePullPolicy: IfNotPresent + ports: + - containerPort: 5000 + 
name: http + env: + - name: PORT + value: "5000" + - name: APP_ENV + value: "kubernetes" + - name: LOG_LEVEL + value: "INFO" + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "256Mi" + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 2 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 diff --git a/k8s/ingress.yml b/k8s/ingress.yml new file mode 100644 index 0000000000..e30b458eb7 --- /dev/null +++ b/k8s/ingress.yml @@ -0,0 +1,47 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: devops-lab9-ingress + namespace: devops-lab9 + annotations: + nginx.ingress.kubernetes.io/rewrite-target: /$2 +spec: + ingressClassName: nginx + tls: + - hosts: + - lab9.local + secretName: lab9-tls + rules: + - host: lab9.local + http: + paths: + - path: /app1(/|$)(.*) + pathType: ImplementationSpecific + backend: + service: + name: devops-app-service + port: + number: 80 + - path: /app2(/|$)(.*) + pathType: ImplementationSpecific + backend: + service: + name: devops-app2-service + port: + number: 80 + - http: + paths: + - path: /app1(/|$)(.*) + pathType: ImplementationSpecific + backend: + service: + name: devops-app-service + port: + number: 80 + - path: /app2(/|$)(.*) + pathType: ImplementationSpecific + backend: + service: + name: devops-app2-service + port: + number: 80 diff --git a/k8s/namespace.yml b/k8s/namespace.yml new file mode 100644 index 0000000000..21620b6432 --- /dev/null +++ b/k8s/namespace.yml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: devops-lab9 + labels: + app.kubernetes.io/part-of: devops-core-course + app.kubernetes.io/managed-by: kubectl diff --git a/k8s/screenshots/1-cluster-info.png b/k8s/screenshots/1-cluster-info.png new file mode 100644 index 0000000000..9523969a87 Binary files /dev/null and 
b/k8s/screenshots/1-cluster-info.png differ diff --git a/k8s/screenshots/2-k8s-get-all.png b/k8s/screenshots/2-k8s-get-all.png new file mode 100644 index 0000000000..2e1119a470 Binary files /dev/null and b/k8s/screenshots/2-k8s-get-all.png differ diff --git a/k8s/screenshots/3-deployment.png b/k8s/screenshots/3-deployment.png new file mode 100644 index 0000000000..219ce6eb4d Binary files /dev/null and b/k8s/screenshots/3-deployment.png differ diff --git a/k8s/screenshots/4-app-1.png b/k8s/screenshots/4-app-1.png new file mode 100644 index 0000000000..14c45e2345 Binary files /dev/null and b/k8s/screenshots/4-app-1.png differ diff --git a/k8s/screenshots/5-app-2.png b/k8s/screenshots/5-app-2.png new file mode 100644 index 0000000000..c4b05fdc26 Binary files /dev/null and b/k8s/screenshots/5-app-2.png differ diff --git a/k8s/service.yml b/k8s/service.yml new file mode 100644 index 0000000000..95e4760f7b --- /dev/null +++ b/k8s/service.yml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: devops-app-service + namespace: devops-lab9 + labels: + app: devops-app +spec: + type: NodePort + selector: + app: devops-app + ports: + - name: http + protocol: TCP + port: 80 + targetPort: http + nodePort: 30080 diff --git a/k8s/tls-secret.example.yml b/k8s/tls-secret.example.yml new file mode 100644 index 0000000000..02fab0492e --- /dev/null +++ b/k8s/tls-secret.example.yml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Secret +metadata: + name: lab9-tls + namespace: devops-lab9 +type: kubernetes.io/tls +data: + tls.crt: BASE64_ENCODED_CERT + tls.key: BASE64_ENCODED_KEY diff --git a/monitoring/.env.example b/monitoring/.env.example new file mode 100644 index 0000000000..446e1e1b0e --- /dev/null +++ b/monitoring/.env.example @@ -0,0 +1,2 @@ +GRAFANA_ADMIN_USER=admin +GRAFANA_ADMIN_PASSWORD=ChangeMe123! 
diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml new file mode 100644 index 0000000000..48a6ca9a87 --- /dev/null +++ b/monitoring/docker-compose.yml @@ -0,0 +1,164 @@ +version: "3.8" + +services: + prometheus: + image: prom/prometheus:v3.9.0 + container_name: prometheus + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.retention.time=15d" + - "--storage.tsdb.retention.size=10GB" + ports: + - "9090:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + networks: + - logging + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9090/-/healthy || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 20s + restart: unless-stopped + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + + loki: + image: grafana/loki:3.0.0 + container_name: loki + command: ["-config.file=/etc/loki/config.yml"] + ports: + - "3100:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + networks: + - logging + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 15s + restart: unless-stopped + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + + promtail: + image: grafana/promtail:3.0.0 + container_name: promtail + command: ["-config.file=/etc/promtail/config.yml"] + ports: + - "9080:9080" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + networks: + - logging + depends_on: + loki: + condition: service_healthy + restart: unless-stopped + deploy: + resources: + limits: + cpus: "0.50" + memory: 512M + reservations: + cpus: "0.10" + memory: 128M + + 
grafana: + image: grafana/grafana:12.3.1 + container_name: grafana + ports: + - "3000:3000" + environment: + GF_AUTH_ANONYMOUS_ENABLED: "false" + GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER} + GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD} + GF_SECURITY_ALLOW_EMBEDDING: "true" + GF_METRICS_ENABLED: "true" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + networks: + - logging + depends_on: + prometheus: + condition: service_healthy + loki: + condition: service_healthy + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 20s + restart: unless-stopped + deploy: + resources: + limits: + cpus: "0.50" + memory: 512M + reservations: + cpus: "0.25" + memory: 256M + + app-python: + image: alliumpro/devops-app:lab08 + build: + context: ../app_python + dockerfile: Dockerfile + container_name: app-python + ports: + - "8000:5000" + environment: + PORT: "5000" + APP_ENV: production + LOG_LEVEL: INFO + labels: + logging: "promtail" + app: "devops-python" + networks: + - logging + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:5000/health')\""] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + deploy: + resources: + limits: + cpus: "0.50" + memory: 256M + reservations: + cpus: "0.10" + memory: 128M + +volumes: + prometheus-data: + loki-data: + grafana-data: + +networks: + logging: + name: logging diff --git a/monitoring/docs/LAB07.md b/monitoring/docs/LAB07.md new file mode 100644 index 0000000000..2d2643cdc3 --- /dev/null +++ b/monitoring/docs/LAB07.md @@ -0,0 +1,301 @@ +# Lab 07 — Observability & Logging with Loki Stack Report + +[![Ansible 
Deploy](https://github.com/AlliumPro/DevOps-Core-Course/actions/workflows/ansible-deploy.yml/badge.svg?branch=lab06)](https://github.com/AlliumPro/DevOps-Core-Course/actions/workflows/ansible-deploy.yml?query=branch%3Alab06) + +> Deployed Loki 3.0 + Promtail 3.0 + Grafana 12.3, integrated JSON application logging, built production-ready Compose configuration, and implemented bonus Ansible role for monitoring stack automation. + +--- + +## 1. Architecture + +```text +Client traffic -> app-python (Flask, JSON logs) + | + v + Docker container logs + | + v + Promtail (docker_sd) + | + v + Loki (TSDB) + | + v + Grafana Explore + Dashboard +``` + +Runtime topology on VM: +- `app-python` on `:8000` +- `loki` on `:3100` +- `promtail` on `:9080` +- `grafana` on `:3000` + +--- + +## 2. Setup Guide + +### 2.1 Project Structure + +```text +monitoring/ +├── .env.example +├── docker-compose.yml +├── loki/config.yml +├── promtail/config.yml +├── grafana/provisioning/datasources/loki.yml +└── docs/LAB07.md +``` + +### 2.2 Deployment Steps + +```bash +cd monitoring +cp .env.example .env +# Fill Grafana admin credentials in .env + +# On target VM, compose binary was installed in user path: +mkdir -p ~/.local/bin +curl -fsSL https://github.com/docker/compose/releases/download/v2.27.0/docker-compose-linux-x86_64 -o ~/.local/bin/docker-compose +chmod +x ~/.local/bin/docker-compose + +~/.local/bin/docker-compose up -d --build +~/.local/bin/docker-compose ps +``` + +### 2.3 Health Verification + +```bash +curl http://127.0.0.1:3100/ready +curl http://127.0.0.1:9080/targets +curl http://127.0.0.1:3000/api/health +``` + +Observed status (`docker-compose ps`): +- `loki` — `Up ... (healthy)` +- `grafana` — `Up ... (healthy)` +- `promtail` — `Up` +- `app-python` — `Up` + +--- + +## 3. 
Configuration Details + +### 3.1 Loki (`monitoring/loki/config.yml`) + +Key choices: +- `store: tsdb` + `schema: v13` (Loki 3.0 recommended path) +- filesystem storage for single-node deployment +- retention set to `168h` (7 days) +- compactor enabled with delete request store (`filesystem`) for retention compatibility + +Important snippet: +```yaml +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + +limits_config: + retention_period: 168h + +compactor: + retention_enabled: true + delete_request_store: filesystem +``` + +### 3.2 Promtail (`monitoring/promtail/config.yml`) + +- Docker service discovery via `/var/run/docker.sock` +- relabeling extracts `container` and `app` +- collection filtered to containers with label `logging=promtail` + +Snippet: +```yaml +relabel_configs: + - source_labels: [__meta_docker_container_label_logging] + regex: promtail + action: keep +``` + +### 3.3 Grafana Provisioning + +Auto-provisioned Loki datasource in: +- `monitoring/grafana/provisioning/datasources/loki.yml` + +--- + +## 4. Application Logging (JSON) + +Updated Flask app in `app_python/app.py`: +- custom `JsonFormatter` +- request lifecycle logging: + - `before_request` -> `request_started` + - `after_request` -> `request_completed` +- fields included: + - `timestamp`, `level`, `message` + - `method`, `path`, `status_code`, `client_ip`, `duration_ms`, `request_id` + +Example ingested log (from Loki query result): +```json +{"timestamp":"2026-03-11T18:28:39.515314+00:00","level":"INFO","logger":"devops-app","message":"request_completed","method":"GET","path":"/health","status_code":200,"client_ip":"94.177.9.115","duration_ms":3.84} +``` + +--- + +## 5. Dashboard & LogQL + +Grafana data source: +- `Loki` (`http://loki:3100`) + +Dashboard is provisioned from file: +- `monitoring/grafana/dashboards/lab07-logging.json` +- provider: `monitoring/grafana/provisioning/dashboards/dashboards.yml` + +Panels included (as required): +1. 
Logs Table +2. Request Rate by App +3. Error Logs +4. Log Level Distribution + +Queries used: +1. All app logs: +```logql +{app="devops-python"} +``` + +2. Parse JSON logs: +```logql +{app="devops-python"} | json +``` + +3. Request rate by app: +```logql +sum by (app) (rate({app=~"devops-.*"}[1m])) +``` + +4. Errors only: +```logql +{app=~"devops-.*"} | json | level="ERROR" +``` + +5. Level distribution: +```logql +sum by (level) (count_over_time({app=~"devops-.*"} | json [5m])) +``` + +--- + +## 6. Production Readiness + +Implemented: +- resource limits/reservations for all services in `docker-compose.yml` +- Grafana anonymous auth disabled: + - `GF_AUTH_ANONYMOUS_ENABLED=false` +- admin credentials from `.env` +- health checks: + - Loki `/ready` + - Grafana `/api/health` + +--- + +## 7. Testing Evidence + +### 7.1 Service and endpoint checks + +```bash +~/.local/bin/docker-compose ps +curl -s http://127.0.0.1:3100/ready +curl -s http://127.0.0.1:3000/api/health +``` + +### 7.2 Log ingestion checks + +```bash +# Generated traffic: +curl http://31.56.228.103:8000/ +curl http://31.56.228.103:8000/health + +# Loki label index: +curl http://127.0.0.1:3100/loki/api/v1/labels +# -> includes: app, container, service_name, stream + +# app values: +curl http://127.0.0.1:3100/loki/api/v1/label/app/values +# -> ["devops-python"] +``` + +### 7.3 Loki query API sample + +```bash +curl "http://127.0.0.1:3100/loki/api/v1/query_range?query=%7Bapp%3D%22devops-python%22%7D&limit=5" +``` + +Result contained JSON log entries from `app-python` container and request log lines. + +### 7.4 Attached screenshots + +1. Grafana Explore with query `{app="devops-python"}` and visible logs: + +![Grafana Explore with Loki logs](../../app_python/docs/screenshots/15-grafana-loki.png) + +2. Grafana dashboard `Lab07 - Application Logging` with all 4 required panels: + +![Grafana dashboard with 4 panels](../../app_python/docs/screenshots/16-grafana-dashboards.png) + +3. 
`docker-compose ps` output showing `loki` and `grafana` healthy: + +![Docker containers and health status](../../app_python/docs/screenshots/17-docker-containers.png) + +4. Server-side monitoring stack configuration evidence: + +![Server monitoring configuration](../../app_python/docs/screenshots/10-server-configuration.png) + +Note: if your instructor strictly requires the Grafana login screen (to explicitly prove anonymous access is disabled), add one more screenshot of the login page and place it in the same screenshots directory. + +--- + +## 8. Challenges & Solutions + +1. `docker compose` plugin missing on VM: +- Solution: installed user-space compose binary (`~/.local/bin/docker-compose`) without sudo. + +2. Loki startup loop: +- Error: `compactor.delete-request-store should be configured when retention is enabled` +- Solution: added `delete_request_store: filesystem` in Loki compactor section. + +3. Quoting issues in ad-hoc LogQL CLI tests: +- Solution: used URL-encoded query form for Loki API calls. + +--- + +## 9. Bonus — Ansible Automation + +Implemented bonus role: +- `ansible/roles/monitoring` + - `defaults/main.yml` + - `tasks/{main,setup,deploy}.yml` + - `templates/{docker-compose,loki-config,promtail-config,grafana-datasource}.yml.j2` + - `meta/main.yml` + +Playbook: +- `ansible/playbooks/deploy-monitoring.yml` + +Role behavior: +- creates monitoring directory structure +- templates Loki/Promtail/Grafana/Compose configs +- deploys stack with `community.docker.docker_compose_v2` +- waits for Loki and Grafana readiness via `uri` + +--- + +## 10. 
Summary + +Lab 07 main tasks are fully implemented in repository code and validated on target VM: +- Loki stack deployed and running +- application integrated with structured JSON logging +- logs ingested and queryable in Loki +- production hardening (security, resources, health checks) +- complete documentation prepared +- bonus Ansible monitoring automation implemented diff --git a/monitoring/docs/LAB08.md b/monitoring/docs/LAB08.md new file mode 100644 index 0000000000..653cdeb0c1 --- /dev/null +++ b/monitoring/docs/LAB08.md @@ -0,0 +1,388 @@ +# Lab 08 — Metrics & Monitoring with Prometheus Report + +[![Ansible Deploy](https://github.com/AlliumPro/DevOps-Core-Course/actions/workflows/ansible-deploy.yml/badge.svg?branch=lab06)](https://github.com/AlliumPro/DevOps-Core-Course/actions/workflows/ansible-deploy.yml?query=branch%3Alab06) + +> This lab was implemented and validated on the course VM: the Flask app now exposes Prometheus metrics, Prometheus scrapes app and platform targets, and Grafana visualizes RED-style metrics in a custom dashboard. + +--- + +## 1. Architecture + +```text +Client traffic -> app-python (Flask) + | + v + /metrics endpoint (Prometheus format) + | + v + Prometheus scrape jobs every 15s + | + v + Prometheus TSDB (retention: 15d / 10GB) + | + v + Grafana (Prometheus datasource) + | + v + Metrics dashboard (RED + health panels) + +In parallel (from Lab 07): +app-python logs -> Promtail -> Loki -> Grafana logs dashboard +``` + +Runtime topology: +- `app-python` on `:8000` (container `:5000`) +- `prometheus` on `:9090` +- `loki` on `:3100` +- `promtail` on `:9080` +- `grafana` on `:3000` + +--- + +## 2. Application Instrumentation + +### 2.1 Installed dependency + +File: +- `app_python/requirements.txt` + +Added: +```txt +prometheus-client==0.23.1 +``` + +### 2.2 Implemented metrics in Flask app + +File: +- `app_python/app.py` + +Implemented metric families: + +1. 
HTTP request counter (Rate / Errors foundation) +```python +http_requests_total{method, endpoint, status_code} +``` + +2. HTTP request duration histogram (Duration foundation) +```python +http_request_duration_seconds{method, endpoint} +``` + +3. In-progress requests gauge (concurrency / load) +```python +http_requests_in_progress{method, endpoint} +``` + +4. App-specific business counter +```python +devops_info_endpoint_calls_total{endpoint} +``` + +5. App-specific collection latency histogram +```python +devops_info_system_collection_seconds +``` + +### 2.3 Request lifecycle instrumentation + +Implemented via Flask hooks: +- `before_request`: + - captures start time + - normalizes endpoint labels to low-cardinality values + - increments `http_requests_in_progress` +- `after_request`: + - increments `http_requests_total` + - observes `http_request_duration_seconds` + - decrements `http_requests_in_progress` + +Cardinality control: +- Endpoint labels are normalized (`/`, `/health`, `/metrics`, `/unknown`) to avoid unbounded label explosion. + +### 2.4 Metrics endpoint + +Implemented endpoint: +- `GET /metrics` +- returns Prometheus exposition format (`text/plain`) + +--- + +## 3. Prometheus Configuration + +### 3.1 Docker Compose service + +File: +- `monitoring/docker-compose.yml` + +Added service: +- image: `prom/prometheus:v3.9.0` +- port: `9090:9090` +- config mount: `./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro` +- persistent volume: `prometheus-data:/prometheus` +- retention flags: + - `--storage.tsdb.retention.time=15d` + - `--storage.tsdb.retention.size=10GB` +- health check: + - `GET /-/healthy` + +### 3.2 Scrape config + +File: +- `monitoring/prometheus/prometheus.yml` + +Global settings: +- `scrape_interval: 15s` +- `evaluation_interval: 15s` + +Configured jobs: +1. `prometheus` -> `localhost:9090` +2. `app` -> `app-python:5000` (`/metrics`) +3. `loki` -> `loki:3100` (`/metrics`) +4. 
`grafana` -> `grafana:3000` (`/metrics`) + +--- + +## 4. Grafana Dashboards + +### 4.1 Data sources + +Files: +- `monitoring/grafana/provisioning/datasources/datasources.yml` + +Provisioned data sources: +- Loki (`uid: loki`) +- Prometheus (`uid: prometheus`) + +### 4.2 Dashboard provisioning + +Files: +- `monitoring/grafana/provisioning/dashboards/dashboards.yml` +- `monitoring/grafana/dashboards/lab08-metrics.json` + +Lab 8 dashboard panels (7 total): +1. Request Rate by Endpoint +2. Error Rate (5xx) +3. Request Duration p95 +4. Request Duration Heatmap +5. Active Requests +6. Status Code Distribution +7. Application Uptime + +--- + +## 5. PromQL Examples + +### 5.1 Request rate per endpoint (RED: Rate) +```promql +sum by (endpoint) (rate(http_requests_total[5m])) +``` +Shows throughput by endpoint. + +### 5.2 Error rate (RED: Errors) +```promql +sum(rate(http_requests_total{status_code=~"5.."}[5m])) +``` +Shows server-side failures per second. + +### 5.3 p95 latency (RED: Duration) +```promql +histogram_quantile(0.95, sum by (le, endpoint) (rate(http_request_duration_seconds_bucket[5m]))) +``` +Shows 95th percentile response latency. + +### 5.4 Latency distribution heatmap +```promql +sum by (le) (rate(http_request_duration_seconds_bucket[5m])) +``` +Visualizes request duration distribution across buckets. + +### 5.5 Concurrent requests +```promql +sum(http_requests_in_progress) +``` +Shows current in-flight request count. + +### 5.6 Status code distribution +```promql +sum by (status_code) (rate(http_requests_total[5m])) +``` +Shows response class composition over time. + +### 5.7 Service uptime +```promql +up{job="app"} +``` +Returns `1` when target is healthy and scrapeable. + +--- + +## 6. Production Setup + +Implemented hardening in `monitoring/docker-compose.yml`: + +1. Health checks: +- Prometheus: `/-/healthy` +- Loki: `/ready` +- Grafana: `/api/health` +- App: `/health` + +2. 
Resource limits: +- Prometheus: `1G`, `1.0 CPU` +- Loki: `1G`, `1.0 CPU` +- Grafana: `512M`, `0.5 CPU` +- App: `256M`, `0.5 CPU` + +3. Persistence: +- `prometheus-data` +- `loki-data` +- `grafana-data` + +4. Retention: +- Prometheus: `15d` and max `10GB` +- Loki: `168h` (7 days) retained from Lab 7 + +5. Security: +- Grafana anonymous access disabled +- Admin credentials provided via environment variables + +--- + +## 7. Testing Results + +### 7.1 Automated application tests + +Command: +```bash +cd app_python +../.venv/Scripts/python.exe -m pytest -q +``` + +Result: +- `6 passed` (including new metrics endpoint test) + +### 7.2 Manual verification commands + +Stack up: +```bash +cd monitoring +~/.local/bin/docker-compose up -d +~/.local/bin/docker-compose ps +``` + +Prometheus targets: +```bash +curl -s http://127.0.0.1:9090/api/v1/targets +``` +Expected: all configured targets in `up` state. + +App metrics endpoint: +```bash +curl -s http://127.0.0.1:8000/metrics +``` +Expected metric families: +- `http_requests_total` +- `http_request_duration_seconds` +- `http_requests_in_progress` +- `devops_info_endpoint_calls_total` +- `devops_info_system_collection_seconds` + +Observed from the attached screenshots: +- Prometheus target health shows all 4 configured jobs in `UP` state (`app`, `grafana`, `loki`, `prometheus`). +- Query `up` returns four active series, one per monitored job. +- Grafana `Lab08 - Application Metrics` dashboard is populated with live request/latency data. +- Error panel currently shows no data, which is expected because no 5xx traffic was generated during this capture window. + +### 7.3 Attached screenshots + +1. Grafana dashboard `Lab08 - Application Metrics`: + +![Lab08 Grafana dashboard](../../app_python/docs/screenshots/18-grafana.png) + +2. Prometheus Targets page with all jobs in `UP` state: + +![Prometheus targets up](../../app_python/docs/screenshots/19-prometheus.png) + +3. 
Prometheus query result for `up`: + +![Prometheus up query](../../app_python/docs/screenshots/20-query.png) + +--- + +## 8. Metrics vs Logs (Lab 8 vs Lab 7) + +When to use metrics: +- Capacity/trend analysis (rate, latency percentiles, error ratio) +- Alerting thresholds and SLO/SLA tracking +- Low-cost long-term aggregated monitoring + +When to use logs: +- Event-level debugging and root-cause analysis +- Detailed request context and stack traces +- Forensics on specific failures + +Practical model in this project: +- Metrics (Prometheus) answer: “How much/how often/how fast?” +- Logs (Loki) answer: “What exactly happened and why?” + +--- + +## 9. Bonus — Ansible Automation + +Extended role: +- `ansible/roles/monitoring` + +Implemented bonus requirements: + +1. Parameterized Prometheus variables in: +- `ansible/roles/monitoring/defaults/main.yml` + +2. Templated Prometheus configuration: +- `ansible/roles/monitoring/templates/prometheus.yml.j2` + +3. Full stack compose template now includes: +- Loki + Promtail + Prometheus + Grafana + app +- `ansible/roles/monitoring/templates/docker-compose.yml.j2` + +4. Grafana provisioning for both data sources: +- Loki + Prometheus via `grafana-datasource.yml.j2` + +5. Dashboard auto-provisioning for both labs: +- `grafana-lab07-dashboard.json.j2` +- `grafana-lab08-dashboard.json.j2` + +6. Deployment checks include readiness waits for: +- Loki +- Prometheus +- Grafana + +Playbook: +- `ansible/playbooks/deploy-monitoring.yml` + +Single-command deployment: +```bash +ansible-playbook ansible/playbooks/deploy-monitoring.yml -i ansible/inventory/hosts.ini +``` + +--- + +## 10. Challenges & Solutions + +1. Challenge: avoid high-cardinality labels in HTTP metrics. +- Solution: normalized endpoint labels and grouped unknown paths as `/unknown`. + +2. Challenge: keep logs and metrics stack cohesive in one Grafana instance. +- Solution: provisioned dual data sources and folder-based dashboard provisioning. + +3. 
Challenge: enforce production retention and resources without external orchestration. +- Solution: added explicit retention flags, health checks, and service limits in Compose + Ansible templates. + +--- + +## 11. Summary + +Lab 08 deliverables are completed and validated: +- Flask app instrumented with Prometheus metrics and `/metrics` endpoint +- Prometheus added and scraping app + platform targets +- Grafana integrated with Prometheus and custom metrics dashboard +- Production hardening applied (health checks, limits, retention, persistence) +- Bonus Ansible automation completed for full observability stack +- Report prepared with architecture, PromQL examples, and screenshot evidence diff --git a/monitoring/grafana/dashboards/lab07-logging.json b/monitoring/grafana/dashboards/lab07-logging.json new file mode 100644 index 0000000000..274d587fbd --- /dev/null +++ b/monitoring/grafana/dashboards/lab07-logging.json @@ -0,0 +1,212 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "expr": "{app=~\"devops-.*\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Logs Table", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "showPoints": "never" + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 2, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "sum by (app) (rate({app=~\"devops-.*\"}[1m]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Request Rate by App", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 3, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "expr": "{app=~\"devops-.*\"} | json | level=\"ERROR\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "Error Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 4, + "options": { + "displayLabels": [ + "name", + "value", + "percent" + ], + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "sum by (level) (count_over_time({app=~\"devops-.*\"} | json [5m]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Log Level Distribution", + "type": "piechart" + } + ], + "refresh": "10s", + "schemaVersion": 41, + "style": "dark", + "tags": [ + "lab07", + "loki", + "logging" + ], + 
"templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Lab07 - Application Logging", + "uid": "lab07-logging", + "version": 1, + "weekStart": "" +} diff --git a/monitoring/grafana/dashboards/lab08-metrics.json b/monitoring/grafana/dashboards/lab08-metrics.json new file mode 100644 index 0000000000..12f620b338 --- /dev/null +++ b/monitoring/grafana/dashboards/lab08-metrics.json @@ -0,0 +1,412 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "showPoints": "never" + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "sum by (endpoint) (rate(http_requests_total[5m]))", + "refId": "A" + } + ], + "title": "Request Rate by Endpoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "showPoints": "never" + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + 
"legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m]))", + "refId": "A" + } + ], + "title": "Error Rate (5xx)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "showPoints": "never" + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le, endpoint) (rate(http_request_duration_seconds_bucket[5m])))", + "refId": "A" + } + ], + "title": "Request Duration p95", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-BlYlRd" + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "mode": "spectrum" + }, + "legend": { + "show": true + }, + "tooltip": { + "show": true, + "yHistogram": false + } + }, + "targets": [ + { + "expr": "sum by (le) (rate(http_request_duration_seconds_bucket[5m]))", + "format": "heatmap", + "refId": "A" + } + ], + "title": "Request Duration Heatmap", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 5 + }, + { + 
"color": "red", + "value": 10 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "targets": [ + { + "expr": "sum(http_requests_in_progress)", + "refId": "A" + } + ], + "title": "Active Requests", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 6, + "options": { + "displayLabels": [ + "name", + "value", + "percent" + ], + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "sum by (status_code) (rate(http_requests_total[5m]))", + "refId": "A" + } + ], + "title": "Status Code Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "index": 1, + "text": "DOWN" + }, + "1": { + "index": 0, + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 16 + }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + 
}, + "textMode": "auto" + }, + "targets": [ + { + "expr": "up{job=\"app\"}", + "refId": "A" + } + ], + "title": "Application Uptime", + "type": "stat" + } + ], + "refresh": "10s", + "schemaVersion": 41, + "style": "dark", + "tags": [ + "lab08", + "prometheus", + "metrics", + "monitoring" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Lab08 - Application Metrics", + "uid": "lab08-metrics", + "version": 1, + "weekStart": "" +} diff --git a/monitoring/grafana/provisioning/dashboards/dashboards.yml b/monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000000..f76206cd5e --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: "DevOps Dashboards" + orgId: 1 + folder: "DevOps Labs" + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards diff --git a/monitoring/grafana/provisioning/datasources/datasources.yml b/monitoring/grafana/provisioning/datasources/datasources.yml new file mode 100644 index 0000000000..eb7ccc9c02 --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/datasources.yml @@ -0,0 +1,18 @@ +apiVersion: 1 + +datasources: + - name: Loki + uid: loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: true + editable: true + + - name: Prometheus + uid: prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: false + editable: true diff --git a/monitoring/loki/config.yml b/monitoring/loki/config.yml new file mode 100644 index 0000000000..0b29325751 --- /dev/null +++ b/monitoring/loki/config.yml @@ -0,0 +1,54 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +ingester: + lifecycler: + ring: + kvstore: + store: inmemory + replication_factor: 1 + chunk_idle_period: 5m + chunk_retain_period: 30s + +common: + 
instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + tsdb_shipper: + active_index_directory: /loki/index + cache_location: /loki/index_cache + +limits_config: + retention_period: 168h + allow_structured_metadata: true + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + delete_request_store: filesystem + retention_delete_delay: 1h + +ruler: + alertmanager_url: http://localhost:9093 diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000000..f94d0b2f19 --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,23 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: prometheus + static_configs: + - targets: ["localhost:9090"] + + - job_name: app + metrics_path: /metrics + static_configs: + - targets: ["app-python:5000"] + + - job_name: loki + metrics_path: /metrics + static_configs: + - targets: ["loki:3100"] + + - job_name: grafana + metrics_path: /metrics + static_configs: + - targets: ["grafana:3000"] diff --git a/monitoring/promtail/config.yml b/monitoring/promtail/config.yml new file mode 100644 index 0000000000..2d4975262e --- /dev/null +++ b/monitoring/promtail/config.yml @@ -0,0 +1,28 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + relabel_configs: + - source_labels: [__meta_docker_container_name] + regex: '/(.*)' + target_label: container + - 
source_labels: [__meta_docker_container_label_app] + target_label: app + - source_labels: [__meta_docker_container_label_logging] + regex: promtail + action: keep + - source_labels: [__meta_docker_container_log_stream] + target_label: stream + pipeline_stages: + - docker: {} diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000000..8708ce7750 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +addopts = --maxfail=1 -q +testpaths = app_python/tests