diff --git a/.github/workflows/ansible-deploy.yml b/.github/workflows/ansible-deploy.yml new file mode 100644 index 0000000000..29957b9d17 --- /dev/null +++ b/.github/workflows/ansible-deploy.yml @@ -0,0 +1,101 @@ +name: Ansible Deployment + +on: + workflow_dispatch: + push: + branches: + - main + - master + - lab06 + + pull_request: + branches: + - main + - master + +concurrency: + group: ansible-deploy-${{ github.ref }} + cancel-in-progress: true + +jobs: + lint: + name: Ansible Lint + runs-on: ubuntu-latest + permissions: + contents: read + defaults: + run: + working-directory: ansible + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: "pip" + + - name: Install lint dependencies + run: | + python -m pip install --upgrade pip + pip install ansible ansible-lint + ansible-galaxy collection install community.docker community.general + + - name: Run ansible-lint + run: | + ansible-lint \ + -x var-naming,key-order,name,yaml,command-instead-of-module \ + playbooks/provision.yml playbooks/deploy.yml + + deploy: + name: Deploy Application + needs: lint + runs-on: ubuntu-latest + permissions: + contents: read + defaults: + run: + working-directory: ansible + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install deploy dependencies + run: | + python -m pip install --upgrade pip + pip install ansible + ansible-galaxy collection install community.docker community.general + + - name: Configure SSH access + run: | + mkdir -p ~/.ssh + echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + ssh-keyscan -H "${{ secrets.VM_HOST }}" >> ~/.ssh/known_hosts + cat > inventory/ci_hosts.ini <<EOF + [webservers] + boba ansible_host=${{ secrets.VM_HOST }} ansible_user=root + + [webservers:vars] + ansible_python_interpreter=/usr/bin/python3 + EOF + + - name: Run deployment playbook + run: | + echo "${{ secrets.VAULT_PASSWORD }}" > /tmp/vault_pass + ansible-playbook playbooks/deploy.yml -i inventory/ci_hosts.ini --vault-password-file /tmp/vault_pass + rm -f /tmp/vault_pass + + - name: 
Verify deployment + run: | + sleep 10 + curl -f "http://${{ secrets.VM_HOST }}:8000/" + curl -f "http://${{ secrets.VM_HOST }}:8000/health" diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml new file mode 100644 index 0000000000..a4f838f7c4 --- /dev/null +++ b/.github/workflows/python-ci.yml @@ -0,0 +1,98 @@ +name: Python CI and Docker + +on: + push: + branches: + - main + - master + - lab03 + tags: + - "v*.*.*" + pull_request: + branches: + - main + - master + +concurrency: + group: python-ci-${{ github.ref }} + cancel-in-progress: true + +jobs: + test: + name: Lint and Test + runs-on: ubuntu-latest + timeout-minutes: 10 + permissions: + contents: read + defaults: + run: + working-directory: app_python + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: "pip" + cache-dependency-path: | + app_python/requirements.txt + app_python/requirements-dev.txt + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt -r requirements-dev.txt + + - name: Lint (ruff) + run: ruff check . 
+ + - name: Run tests + run: pytest + + - name: Snyk scan + uses: snyk/actions/python-3.11@master + env: + SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} + with: + args: --severity-threshold=high + continue-on-error: true + + docker: + name: Build and Push Docker Image + runs-on: ubuntu-latest + needs: test + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + permissions: + contents: read + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Extract metadata (tags, labels) + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ secrets.DOCKERHUB_USERNAME }}/devops-info-service + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=raw,value=latest + + - name: Build and push + uses: docker/build-push-action@v6 + with: + context: ./app_python + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/.gitignore b/.gitignore index 30d74d2584..712dee8a5d 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,16 @@ -test \ No newline at end of file +# Python +__pycache__/ +*.py[cod] +venv/ +venv.bak/ +*.log + +# IDE +.vscode/ +.idea/ + +# OS +.DS_Store + +# Monitoring secrets +monitoring/.env diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg new file mode 100644 index 0000000000..0ddcbf1672 --- /dev/null +++ b/ansible/ansible.cfg @@ -0,0 +1,11 @@ +[defaults] +inventory = inventory/hosts.ini +roles_path = roles +host_key_checking = False +remote_user = ubuntu +retry_files_enabled = False + +[privilege_escalation] +become = True +become_method = sudo +become_user = root diff --git a/ansible/docs/LAB05.md b/ansible/docs/LAB05.md new file mode 100644 index 0000000000..b0fafc6662 --- /dev/null +++ b/ansible/docs/LAB05.md 
@@ -0,0 +1,533 @@ +# Lab 5 — Ansible Fundamentals + +Date: 2026-02-26 + +## 1. Architecture Overview + +- Ansible version: `ansible [core 2.20.3]` (Ansible package `13.4.0`) +- Control node: macOS (Homebrew Ansible) +- Target VM from Lab 4: Ubuntu 24.04 LTS, IP `31.58.76.235` +- Inventory host: `boba` + +Project structure: + +```text +ansible/ +├── ansible.cfg +├── inventory/hosts.ini +├── group_vars/all.yml +├── playbooks/ +│ ├── site.yml +│ ├── provision.yml +│ └── deploy.yml +├── roles/ +│ ├── common/ +│ │ ├── tasks/main.yml +│ │ └── defaults/main.yml +│ ├── docker/ +│ │ ├── tasks/main.yml +│ │ ├── defaults/main.yml +│ │ └── handlers/main.yml +│ └── app_deploy/ +│ ├── tasks/main.yml +│ ├── defaults/main.yml +│ └── handlers/main.yml +└── docs/LAB05.md +``` + +Why roles instead of a single monolithic playbook: +- Roles split responsibility by domain (`common`, `docker`, `app_deploy`) and keep playbooks clean. +- Variables/defaults/handlers are scoped and reusable. +- Testing and troubleshooting are faster because each part is isolated. + +--- + +## 2. Task 1 — Setup and Inventory + +### Ansible configuration + +`ansible.cfg`: + +```ini +[defaults] +inventory = inventory/hosts.ini +roles_path = roles +host_key_checking = False +remote_user = ubuntu +retry_files_enabled = False + +[privilege_escalation] +become = True +become_method = sudo +become_user = root +``` + +`inventory/hosts.ini`: + +```ini +[webservers] +boba ansible_host=31.58.76.235 ansible_user=root + +[webservers:vars] +ansible_python_interpreter=/usr/bin/python3 +``` + +Note: `ansible_user=root` in inventory overrides `remote_user=ubuntu` from `ansible.cfg`. + +--- + +## 3. Task 2 — Provisioning Roles + +### 3.1 Role: `common` + +Purpose: +- Base system preparation for every server. + +What it does: +- Checks interrupted `dpkg` state and recovers with `dpkg --configure -a` if needed. +- Waits for SSH recovery after package reconfiguration. +- Updates apt cache. +- Installs common packages. 
+- Sets timezone. + +Key variables (`roles/common/defaults/main.yml`): +- `common_packages`: `python3-pip`, `curl`, `git`, `vim`, `htop` +- `common_timezone`: `UTC` + +Handlers: +- None. + +Dependencies: +- Uses `community.general.timezone` collection. + +### 3.2 Role: `docker` + +Purpose: +- Install and configure Docker Engine from the official Docker Ubuntu repository. + +What it does: +- Installs prerequisites (`ca-certificates`, `curl`). +- Creates `/etc/apt/keyrings`. +- Downloads Docker GPG key to `/etc/apt/keyrings/docker.asc`. +- Removes conflicting legacy Docker repo entries using `docker.gpg` signed-by path. +- Adds official Docker apt repo for current Ubuntu release. +- Installs Docker packages (`docker-ce`, `docker-ce-cli`, `containerd.io`, buildx, compose plugin). +- Installs `python3-docker` for Ansible Docker modules. +- Ensures Docker service is enabled and running. +- Adds selected users to `docker` group. + +Key variables (`roles/docker/defaults/main.yml`): +- `docker_gpg_key_url`, `docker_gpg_key_path` +- `docker_packages`, `docker_package_state`, `docker_version_pin` +- `docker_service_name` +- `docker_users` (current value resolves to inventory user) + +Handlers: +- `Restart docker` — restarts Docker service when package changes require it. + +Dependencies: +- No role dependencies. + +### 3.3 Provisioning playbook + +`playbooks/provision.yml`: +- Includes pre-tasks to remove conflicting Docker apt source records before provisioning. +- Applies roles in order: `common`, then `docker`. + +--- + +## 4. Idempotency Demonstration + +### First `provision.yml` run recap + +```text +PLAY RECAP +boba : ok=17 changed=3 unreachable=0 failed=0 skipped=4 rescued=0 ignored=0 +``` + +Changed tasks on first run: +- `docker : Add Docker apt repository` +- `docker : Install Docker SDK for Python` +- `docker : Add users to docker group` + +Why these changed: +- Repository was added to reach desired package source state. 
+- Missing package `python3-docker` was installed. +- User group membership was updated. + +### Second `provision.yml` run recap + +```text +PLAY RECAP +boba : ok=17 changed=0 unreachable=0 failed=0 skipped=4 rescued=0 ignored=0 +``` + +Why no changes on second run: +- Desired state already matched actual state. +- Stateful modules (`apt`, `apt_repository`, `service`, `user`, `lineinfile`, `file`) are idempotent. + +--- + +## 5. Task 3 — Application Deployment Role + +### 5.1 Vault usage + +Secrets are stored in `group_vars/all.yml` encrypted with Ansible Vault. + +Encrypted file proof: + +```text +$ANSIBLE_VAULT;1.1;AES256 +``` + +Vault operations used: +- `ansible-vault edit group_vars/all.yml` +- `ansible-playbook playbooks/deploy.yml --ask-vault-pass` + +Password strategy: +- Password is entered interactively (`--ask-vault-pass`). +- Vault password file is not committed. + +### 5.2 Role: `app_deploy` + +Purpose: +- Securely deploy application container from Docker Hub. + +What it does: +- Validates required variables (`dockerhub_username`, `dockerhub_password`, image, tag). +- Logs in to Docker Hub (`no_log: true`). +- Pulls image. +- Stops/removes existing container if present. +- Starts new container with configured ports/env/restart policy. +- Waits for app port readiness. +- Verifies health endpoint with retries. + +Key variables (`roles/app_deploy/defaults/main.yml`): +- Registry + credentials variables. +- App identity and image/tag values. +- Runtime options (`app_container_name`, `app_port`, `app_container_port`, `app_restart_policy`, `app_environment`). +- Health-check controls (`app_healthcheck_path`, retries, delay). + +Handlers: +- `Restart application container`. + +Dependencies: +- Uses `community.docker` modules. + +### 5.3 Deployment playbook + +`playbooks/deploy.yml`: +- Hosts: `webservers` +- `become: true` +- Loads `../group_vars/all.yml` explicitly (playbook is inside `playbooks/`) +- Applies role `app_deploy` + +--- + +## 6. 
Deployment Verification + +### Deploy run recap + +```text +PLAY RECAP +boba : ok=9 changed=3 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0 +``` + +### Container status + +```text +ansible webservers -a "docker ps" --ask-vault-pass + +CONTAINER ID IMAGE COMMAND STATUS PORTS NAMES +d4f44d838400 cilc/devops_lab02:cilc "python app.py" Up ... 0.0.0.0:5000->8080/tcp devops_lab02 +``` + +### Health and main endpoint checks + +```bash +curl http://31.58.76.235:5000/health +curl http://31.58.76.235:5000/ +``` + +Observed result: +- `/health` returns HTTP 200 and JSON status `healthy`. +- `/` returns service/system/runtime payload from Flask app. + +--- + +## 7. Key Decisions + +Why roles instead of plain playbooks: +- Roles enforce a predictable structure and separate concerns by function. +- The top-level playbooks become orchestration-only and easier to review. + +How roles improve reusability: +- Each role is parameterized with defaults/vars and can be reused in another environment with different inventory/values. +- Example: `docker` role can be reused independently from `app_deploy`. + +What makes a task idempotent: +- A task is idempotent when repeated runs converge to the same state with no extra changes. +- Using desired-state modules (`state: present/started/absent`) is the core pattern. + +How handlers improve efficiency: +- Handlers run only when notified by changed tasks. +- This avoids unnecessary service restarts on every run. + +Why Ansible Vault is necessary: +- It keeps credentials in Git-safe encrypted form. +- Access to secrets is controlled by vault password, reducing accidental leaks. + +--- + +## 8. Challenges and Fixes + +1. Interrupted `dpkg` blocked apt installs. +- Fix: Added recovery block (`dpkg --configure -a`) with SSH reconnection wait in `common` role. + +2. Docker apt source conflict (`docker.gpg` vs `docker.asc`). +- Fix: Added cleanup of conflicting repo entries in `provision.yml` pre-tasks and `docker` role. + +3. 
Vault variables not loaded for `deploy.yml`. +- Root cause: playbook location (`playbooks/`) didn’t auto-resolve `group_vars` as expected. +- Fix: added explicit `vars_files: ../group_vars/all.yml`. + +4. Docker image pull failures. +- Root causes: + - Missing `latest` tag. + - Image manifest initially not matching target VM architecture (`linux/amd64`). +- Fix: use existing valid tag and ensure pushed image supports VM architecture. + +5. Runtime port mismatch. +- App listens on container port `8080`. +- Fix: deployment uses mapping `5000:8080`. + +--- + +## 9. Final Status + +- Task 1 (setup, structure, inventory): completed. +- Task 2 (provisioning roles + idempotency): completed. +- Task 3 (deployment role + Vault + verification): completed. +- Task 4 documentation: completed in this file. + +--- + +## 10. Terminal Output + +### 10.1 Provisioning — First Run + +```text +igor@cilc ansible % ansible-playbook playbooks/provision.yml + +PLAY [Provision web servers] ****************************************************************** + +TASK [Gathering Facts] ************************************************************************ +ok: [boba] + +TASK [Find conflicting Docker apt source files (docker.gpg)] ********************************** +ok: [boba] + +TASK [Remove conflicting Docker apt source files (docker.gpg)] ******************************** +skipping: [boba] + +TASK [Remove conflicting Docker apt source lines from sources.list (docker.gpg)] ************** +ok: [boba] + +TASK [common : Check for interrupted dpkg transactions] *************************************** +ok: [boba] + +TASK [common : Recover interrupted dpkg state] ************************************************ +skipping: [boba] + +TASK [common : Clear host errors after dpkg recovery] ***************************************** +skipping: [boba] + +TASK [common : Wait for SSH to come back after package reconfiguration] *********************** +skipping: [boba] + +TASK [common : Update apt cache] 
************************************************************** +ok: [boba] + +TASK [common : Install common packages] ******************************************************* +ok: [boba] + +TASK [common : Set timezone] ****************************************************************** +ok: [boba] + +TASK [docker : Install Docker prerequisite packages] ****************************************** +ok: [boba] + +TASK [docker : Ensure apt keyrings directory exists] ****************************************** +ok: [boba] + +TASK [docker : Add Docker official GPG key] *************************************************** +ok: [boba] + +TASK [docker : Find conflicting Docker repo files in sources.list.d (docker.gpg)] ************* +ok: [boba] + +TASK [docker : Remove conflicting Docker repo files in sources.list.d (docker.gpg)] *********** +skipping: [boba] + +TASK [docker : Remove conflicting Docker repo lines from main sources.list (docker.gpg)] ****** +ok: [boba] + +TASK [docker : Add Docker apt repository] ***************************************************** +changed: [boba] + +TASK [docker : Install Docker packages] ******************************************************* +ok: [boba] + +TASK [docker : Install Docker SDK for Python] ************************************************* +changed: [boba] + +TASK [docker : Ensure Docker service is enabled and running] ********************************** +ok: [boba] + +TASK [docker : Add users to docker group] ***************************************************** +changed: [boba] => (item=root) + +PLAY RECAP ************************************************************************************ +boba : ok=17 changed=3 unreachable=0 failed=0 skipped=4 rescued=0 ignored=0 +``` + +### 10.2 Provisioning — Second Run + +```text +igor@cilc ansible % ansible-playbook playbooks/provision.yml + +PLAY [Provision web servers] ****************************************************************** + +TASK [Gathering Facts] 
************************************************************************ +ok: [boba] + +TASK [Find conflicting Docker apt source files (docker.gpg)] ********************************** +ok: [boba] + +TASK [Remove conflicting Docker apt source files (docker.gpg)] ******************************** +skipping: [boba] + +TASK [Remove conflicting Docker apt source lines from sources.list (docker.gpg)] ************** +ok: [boba] + +TASK [common : Check for interrupted dpkg transactions] *************************************** +ok: [boba] + +TASK [common : Recover interrupted dpkg state] ************************************************ +skipping: [boba] + +TASK [common : Clear host errors after dpkg recovery] ***************************************** +skipping: [boba] + +TASK [common : Wait for SSH to come back after package reconfiguration] *********************** +skipping: [boba] + +TASK [common : Update apt cache] ************************************************************** +ok: [boba] + +TASK [common : Install common packages] ******************************************************* +ok: [boba] + +TASK [common : Set timezone] ****************************************************************** +ok: [boba] + +TASK [docker : Install Docker prerequisite packages] ****************************************** +ok: [boba] + +TASK [docker : Ensure apt keyrings directory exists] ****************************************** +ok: [boba] + +TASK [docker : Add Docker official GPG key] *************************************************** +ok: [boba] + +TASK [docker : Find conflicting Docker repo files in sources.list.d (docker.gpg)] ************* +ok: [boba] + +TASK [docker : Remove conflicting Docker repo files in sources.list.d (docker.gpg)] *********** +skipping: [boba] + +TASK [docker : Remove conflicting Docker repo lines from main sources.list (docker.gpg)] ****** +ok: [boba] + +TASK [docker : Add Docker apt repository] ***************************************************** +ok: 
[boba] + +TASK [docker : Install Docker packages] ******************************************************* +ok: [boba] + +TASK [docker : Install Docker SDK for Python] ************************************************* +ok: [boba] + +TASK [docker : Ensure Docker service is enabled and running] ********************************** +ok: [boba] + +TASK [docker : Add users to docker group] ***************************************************** +ok: [boba] => (item=root) + +PLAY RECAP ************************************************************************************ +boba : ok=17 changed=0 unreachable=0 failed=0 skipped=4 rescued=0 ignored=0 +``` + +### 10.3 Deploy Run + Verification + +```text +igor@cilc ansible % ansible-playbook playbooks/deploy.yml --ask-vault-pass + +Vault password: + +PLAY [Deploy application] ************************************************************************************* + +TASK [Gathering Facts] **************************************************************************************** +ok: [boba] + +TASK [app_deploy : Validate required deployment variables] **************************************************** +ok: [boba] => { + "changed": false, + "msg": "All assertions passed" +} + +TASK [app_deploy : Log in to Docker Hub] ********************************************************************** +ok: [boba] + +TASK [app_deploy : Pull application image] ******************************************************************** +changed: [boba] + +TASK [app_deploy : Get existing container info] *************************************************************** +ok: [boba] + +TASK [app_deploy : Stop existing container if running] ******************************************************** +skipping: [boba] + +TASK [app_deploy : Remove old container if exists] ************************************************************ +skipping: [boba] + +TASK [app_deploy : Run application container] ***************************************************************** +changed: 
[boba] + +TASK [app_deploy : Wait for application port to be ready] ***************************************************** +ok: [boba] + +TASK [app_deploy : Verify health endpoint] ******************************************************************** +ok: [boba] + +RUNNING HANDLER [app_deploy : Restart application container] ************************************************** +changed: [boba] + +PLAY RECAP **************************************************************************************************** +boba : ok=9 changed=3 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0 +``` + +```text +igor@cilc ansible % ansible webservers -a "docker ps" --ask-vault-pass +Vault password: +boba | CHANGED | rc=0 >> +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +d4f44d838400 cilc/devops_lab02:cilc "python app.py" 54 minutes ago Up 54 minutes 0.0.0.0:5000->8080/tcp devops_lab02 +``` + +```text +igor@cilc ansible % curl http://31.58.76.235:5000/health +{"status":"healthy","timestamp":"2026-02-26T10:13:56.193654+00:00","uptime_seconds":3357} +igor@cilc ansible % curl http://31.58.76.235:5000 +{"endpoints":[{"description":"Service information","method":"GET","path":"/"},{"description":"Health check","method":"GET","path":"/health"}],"request":{"client_ip":"212.118.42.178","method":"GET","path":"/","user_agent":"curl/8.4.0"},"runtime":{"current_time":"2026-02-26T10:14:21.655288+00:00","timezone":"UTC","uptime_human":"0 hours, 56 minutes","uptime_seconds":3382},"service":{"description":"DevOps course info service","framework":"Flask","name":"devops-info-service","version":"1.0.0"},"system":{"architecture":"x86_64","cpu_count":2,"hostname":"d4f44d838400","platform":"Linux","platform_version":"#35-Ubuntu SMP PREEMPT_DYNAMIC Mon May 20 15:51:52 UTC 2024","python_version":"3.13.12"}} +``` diff --git a/ansible/docs/LAB06.md b/ansible/docs/LAB06.md new file mode 100644 index 0000000000..244a395b3f --- /dev/null +++ b/ansible/docs/LAB06.md @@ -0,0 +1,209 @@ +# Lab 6: Advanced 
Ansible & CI/CD - Submission + +**Date:** 2026-03-05 + +## Task 1: Blocks & Tags (required outputs) + +### 1) `--list-tags` +```text +$ ansible-playbook playbooks/provision.yml --list-tags + +playbook: playbooks/provision.yml + + play #1 (webservers): Provision web servers TAGS: [] + TASK TAGS: [common, docker, docker_config, docker_install, packages, users] + +[exit_code] 0 +``` + +### 2) Selective execution with tags +```text +$ ansible-playbook playbooks/provision.yml --tags docker +... +PLAY RECAP ********************************************************************* +boba : ok=11 changed=0 unreachable=0 failed=0 skipped=1 rescued=0 ignored=0 +``` + +### 3) Rescue block triggered +```text +$ ansible-playbook playbooks/provision.yml --tags docker_install -e docker_gpg_key_url=https://invalid.example.com/docker.gpg +... +TASK [docker : Add Docker official GPG key] ... FAILED +TASK [docker : Wait before retrying Docker repository setup] ... ok +TASK [docker : Retry apt cache update after Docker key/repo failure] ... ok +... +PLAY RECAP ********************************************************************* +boba : ok=6 changed=0 unreachable=0 failed=0 skipped=0 rescued=1 ignored=0 +``` + +## Task 2: Docker Compose Migration (required outputs) + +### 1) First deploy run +```text +$ ansible-playbook playbooks/deploy.yml +... +PLAY RECAP ********************************************************************* +boba : ok=19 changed=4 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1 +``` + +### 2) Second deploy run (idempotency) +```text +$ ansible-playbook playbooks/deploy.yml +... 
+PLAY RECAP ********************************************************************* +boba : ok=18 changed=0 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1 +``` + +### 3) Container status +```text +$ ssh root@31.58.76.235 "docker ps --format \"table {{.Names}}\t{{.Image}}\t{{.Status}}\t{{.Ports}}\"" +NAMES IMAGE STATUS PORTS +devops-app cilc/devops_lab02:cilc Up About a minute 0.0.0.0:8000->8080/tcp, [::]:8000->8080/tcp +... +[exit_code] 0 +``` + +### 4) Rendered Docker Compose file +```text +$ ssh root@31.58.76.235 "cat /opt/devops-app/docker-compose.yml" +version: '3.8' + +services: + devops-app: + image: cilc/devops_lab02:cilc + container_name: devops-app + ports: + - '8000:8080' + restart: unless-stopped + +networks: + default: + name: devops-app-network + +[exit_code] 0 +``` + +### 5) Health check +```text +$ curl -sS -i http://31.58.76.235:8000/health +HTTP/1.1 200 OK +... +{"status":"healthy",...} + +[exit_code] 0 +``` + +## Task 3: Wipe Logic (required outputs) + +### Scenario 1: normal deploy (wipe skipped) +```text +$ ansible-playbook playbooks/deploy.yml +... +TASK [web_app : Include wipe tasks (runs only when web_app_wipe=true)] ... +TASK [web_app : Check if compose file exists] ... skipping +TASK [web_app : Stop and remove compose project] ... skipping +... +PLAY RECAP ********************************************************************* +boba : ok=18 changed=0 unreachable=0 failed=0 skipped=7 rescued=0 ignored=0 +``` + +### Scenario 2: wipe only +```text +$ ansible-playbook playbooks/deploy.yml -e web_app_wipe=true --tags web_app_wipe +... +TASK [web_app : Stop and remove compose project] ... changed +TASK [web_app : Remove compose file] ... changed +TASK [web_app : Remove compose project directory] ... changed +TASK [web_app : Log wipe completion] ... "Application devops-app wiped successfully" +... 
+PLAY RECAP ********************************************************************* +boba : ok=7 changed=3 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 +``` + +```text +$ ssh root@31.58.76.235 "test -d /opt/devops-app && echo PRESENT || echo ABSENT" +ABSENT + +[exit_code] 0 +``` + +### Scenario 3: clean reinstall (wipe -> deploy) +```text +$ ansible-playbook playbooks/deploy.yml -e web_app_wipe=true +... +TASK [web_app : Stop and remove compose project] ... +TASK [web_app : Create compose project directory] ... changed +TASK [web_app : Deploy compose project] ... changed +... +PLAY RECAP ********************************************************************* +boba : ok=23 changed=4 unreachable=0 failed=0 skipped=3 rescued=0 ignored=0 +``` + +### Scenario 4a: tag set, variable false (wipe blocked) +```text +$ ansible-playbook playbooks/deploy.yml --tags web_app_wipe,app_deploy,compose +... +TASK [web_app : Check if compose file exists] ... skipping +... +TASK [web_app : Deploy compose project] ... ok +... +PLAY RECAP ********************************************************************* +boba : ok=8 changed=0 unreachable=0 failed=0 skipped=6 rescued=0 ignored=0 +``` + +### Scenario 4b: variable true + wipe tag (wipe only) +```text +$ ansible-playbook playbooks/deploy.yml -e web_app_wipe=true --tags web_app_wipe +... +PLAY RECAP ********************************************************************* +boba : ok=7 changed=3 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 +``` + +```text +$ curl -sS -i --max-time 10 http://31.58.76.235:8000/health + +[exit_code] 28 +``` + +## Task 4: CI/CD (required outputs) + +### 1) `ansible-lint` result +```text +$ /opt/homebrew/Cellar/ansible/13.4.0/libexec/bin/ansible-lint -x var-naming,key-order,name,yaml,command-instead-of-module playbooks/provision.yml playbooks/deploy.yml + +[exit_code] 0 +``` + +### 2) Deploy step result (local equivalent of CI deploy) +```text +$ ansible-playbook playbooks/deploy.yml +... 
+PLAY RECAP ********************************************************************* +boba : ok=20 changed=4 unreachable=0 failed=0 skipped=6 rescued=0 ignored=1 +``` + +### 3) Verify step result +```text +$ curl -sS -i http://31.58.76.235:8000/health +HTTP/1.1 200 OK +... +[exit_code] 0 +``` + +```text +$ curl -sS -i http://31.58.76.235:8000/ +HTTP/1.1 200 OK +... +[exit_code] 0 +``` + +### 4) Status badge in README +```text +$ rg -n "Ansible Deployment|ansible-deploy.yml" README.md +6:[![Ansible Deployment](https://github.com/your-username/DevOps-Core-Course/actions/workflows/ansible-deploy.yml/badge.svg)](https://github.com/your-username/DevOps-Core-Course/actions/workflows/ansible-deploy.yml) + +[exit_code] 0 +``` + +### 5) GitHub Actions run evidence +- Add screenshot from GitHub Actions after push: successful `lint` and `deploy` jobs. diff --git a/ansible/group_vars/all.yml b/ansible/group_vars/all.yml new file mode 100644 index 0000000000..f39082375e --- /dev/null +++ b/ansible/group_vars/all.yml @@ -0,0 +1,18 @@ +$ANSIBLE_VAULT;1.1;AES256 +35653830663866336466663133613239313835373365323135383837316439356235383630633466 +6131633230373466313365393566303037333838626264640a623132633831643938303737303933 +38356135646236626633303034646234383365346339613139633764336232306330613739656533 +3438643761376237630a623933386530313137373933653531646438313438333464613633613063 +39616634303664356236663837313532326537633638306363333132383561323036333536393366 +32623234363936373531356631373031316366633631393837396236633730613136353065653231 +37626135656637383464393738666231366438636664343135393266613062376236396233303738 +33643234353165663038316435643836643562326332663033363637336565396136353032376666 +36353332343434343830656432396265306338303033386538373831656134376533316633616538 +31326234376561303435343030343331336634383735663366353335666438306663653333383238 +65393635636236323137636434653337353961643433356233643835636266626165323030333038 
+66363139653036373135616164633861666162396564653130356330623431366633333562306436 +62613563323836646363343630656237633034376132616236616561663831613465653230363734 +65313633636562653139383933366562656266383061663964333666393332383264313737373561 +33323363356533323166353934396265363665383832343438663037636338343065363766653331 +32376637316530376566663732376563326435353033613235306136623538646664313661323337 +37353132663638316164396238646665636432636433323031646332643930356539 diff --git a/ansible/inventory/hosts.ini b/ansible/inventory/hosts.ini new file mode 100644 index 0000000000..93e92b75e4 --- /dev/null +++ b/ansible/inventory/hosts.ini @@ -0,0 +1,5 @@ +[webservers] +boba ansible_host=31.58.76.235 ansible_user=root + +[webservers:vars] +ansible_python_interpreter=/usr/bin/python3 diff --git a/ansible/playbooks/deploy.yml b/ansible/playbooks/deploy.yml new file mode 100644 index 0000000000..db765a455e --- /dev/null +++ b/ansible/playbooks/deploy.yml @@ -0,0 +1,8 @@ +--- +- name: Deploy web application + hosts: webservers + become: true + roles: + - role: web_app + tags: + - web_app diff --git a/ansible/playbooks/provision.yml b/ansible/playbooks/provision.yml new file mode 100644 index 0000000000..22643bad25 --- /dev/null +++ b/ansible/playbooks/provision.yml @@ -0,0 +1,31 @@ +--- +- name: Provision web servers + hosts: webservers + become: true + pre_tasks: + - name: Find conflicting Docker apt source files (docker.gpg) + ansible.builtin.find: + paths: /etc/apt/sources.list.d + patterns: "*.list" + contains: "download\\.docker\\.com/linux/ubuntu.*signed-by=/etc/apt/keyrings/docker\\.gpg" + register: docker_conflicting_source_files + + - name: Remove conflicting Docker apt source files (docker.gpg) + ansible.builtin.file: + path: "{{ item.path }}" + state: absent + loop: "{{ docker_conflicting_source_files.files }}" + when: docker_conflicting_source_files.matched | int > 0 + + - name: Remove conflicting Docker apt source lines from sources.list (docker.gpg) + 
ansible.builtin.lineinfile: + path: /etc/apt/sources.list + regexp: ".*download\\.docker\\.com/linux/ubuntu.*signed-by=/etc/apt/keyrings/docker\\.gpg.*" + state: absent + roles: + - role: common + tags: + - common + - role: docker + tags: + - docker diff --git a/ansible/playbooks/site.yml b/ansible/playbooks/site.yml new file mode 100644 index 0000000000..139c08f693 --- /dev/null +++ b/ansible/playbooks/site.yml @@ -0,0 +1,3 @@ +--- +- import_playbook: provision.yml +- import_playbook: deploy.yml diff --git a/ansible/roles/common/defaults/main.yml b/ansible/roles/common/defaults/main.yml new file mode 100644 index 0000000000..36c8819568 --- /dev/null +++ b/ansible/roles/common/defaults/main.yml @@ -0,0 +1,11 @@ +--- +common_packages: + - python3-pip + - curl + - git + - vim + - htop + +# Optional local users to manage in the common role. +common_managed_users: [] +common_timezone: UTC diff --git a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml new file mode 100644 index 0000000000..352e9af4f2 --- /dev/null +++ b/ansible/roles/common/tasks/main.yml @@ -0,0 +1,72 @@ +--- +# Common baseline tasks for all servers. 
+- name: Check for interrupted dpkg transactions + ansible.builtin.find: + paths: /var/lib/dpkg/updates + file_type: file + register: dpkg_updates + +- name: Recover interrupted dpkg state + ansible.builtin.command: dpkg --configure -a + register: dpkg_recover_result + changed_when: dpkg_recover_result.rc == 0 + when: dpkg_updates.matched | int > 0 + ignore_unreachable: true + +- name: Clear host errors after dpkg recovery + ansible.builtin.meta: clear_host_errors + when: dpkg_updates.matched | int > 0 + +- name: Wait for SSH to come back after package reconfiguration + ansible.builtin.wait_for_connection: + timeout: 180 + delay: 3 + when: dpkg_updates.matched | int > 0 + +- name: Manage common package baseline + block: + - name: Update apt cache + ansible.builtin.apt: + update_cache: true + cache_valid_time: 3600 + + - name: Install common packages + ansible.builtin.apt: + name: "{{ common_packages }}" + state: present + rescue: + - name: Recover apt cache with fix-missing + ansible.builtin.command: apt-get update --fix-missing + changed_when: false + + - name: Retry apt cache update after recovery + ansible.builtin.apt: + update_cache: true + always: + - name: Record common packages block completion + ansible.builtin.copy: + dest: /tmp/ansible-common-packages.log + mode: "0644" + content: "common packages block completed\n" + become: true + tags: + - packages + +- name: Manage common users + block: + - name: Ensure managed users exist + ansible.builtin.user: + name: "{{ item.name | default(item) }}" + state: present + shell: "{{ item.shell | default('/bin/bash') }}" + groups: "{{ item.groups | default(omit) }}" + loop: "{{ common_managed_users }}" + when: common_managed_users | length > 0 + become: true + tags: + - users + +- name: Set timezone + community.general.timezone: + name: "{{ common_timezone }}" + when: common_timezone | length > 0 diff --git a/ansible/roles/docker/defaults/main.yml b/ansible/roles/docker/defaults/main.yml new file mode 100644 index 
0000000000..4aab604442 --- /dev/null +++ b/ansible/roles/docker/defaults/main.yml @@ -0,0 +1,23 @@ +--- +docker_gpg_key_url: https://download.docker.com/linux/ubuntu/gpg +docker_gpg_key_path: /etc/apt/keyrings/docker.asc +docker_repo_filename: docker +docker_arch_map: + x86_64: amd64 + aarch64: arm64 + armv7l: armhf +docker_prerequisite_packages: + - ca-certificates + - curl +docker_packages: + - docker-ce + - docker-ce-cli + - containerd.io + - docker-buildx-plugin + - docker-compose-plugin +docker_version_pin: "" +docker_package_state: present +docker_python_package: python3-docker +docker_service_name: docker +docker_users: + - "{{ ansible_user }}" diff --git a/ansible/roles/docker/handlers/main.yml b/ansible/roles/docker/handlers/main.yml new file mode 100644 index 0000000000..1d8202f761 --- /dev/null +++ b/ansible/roles/docker/handlers/main.yml @@ -0,0 +1,5 @@ +--- +- name: Restart docker + ansible.builtin.service: + name: "{{ docker_service_name }}" + state: restarted diff --git a/ansible/roles/docker/tasks/main.yml b/ansible/roles/docker/tasks/main.yml new file mode 100644 index 0000000000..90c1f108e4 --- /dev/null +++ b/ansible/roles/docker/tasks/main.yml @@ -0,0 +1,92 @@ +--- +# Docker installation tasks based on official Ubuntu instructions. 
+- name: Install Docker engine and tooling + block: + - name: Install Docker prerequisite packages + ansible.builtin.apt: + name: "{{ docker_prerequisite_packages }}" + state: present + update_cache: true + + - name: Ensure apt keyrings directory exists + ansible.builtin.file: + path: /etc/apt/keyrings + state: directory + mode: "0755" + + - name: Add Docker official GPG key + ansible.builtin.get_url: + url: "{{ docker_gpg_key_url }}" + dest: "{{ docker_gpg_key_path }}" + mode: "0644" + + - name: Find conflicting Docker repo files in sources.list.d (docker.gpg) + ansible.builtin.find: + paths: /etc/apt/sources.list.d + patterns: "*.list" + contains: "download\\.docker\\.com/linux/ubuntu.*signed-by=/etc/apt/keyrings/docker\\.gpg" + register: docker_conflicting_repo_files + + - name: Remove conflicting Docker repo files in sources.list.d (docker.gpg) + ansible.builtin.file: + path: "{{ item.path }}" + state: absent + loop: "{{ docker_conflicting_repo_files.files }}" + when: docker_conflicting_repo_files.matched | int > 0 + + - name: Remove conflicting Docker repo lines from main sources.list (docker.gpg) + ansible.builtin.lineinfile: + path: /etc/apt/sources.list + regexp: ".*download\\.docker\\.com/linux/ubuntu.*signed-by=/etc/apt/keyrings/docker\\.gpg.*" + state: absent + + - name: Add Docker apt repository + ansible.builtin.apt_repository: + repo: "deb [arch={{ docker_arch_map.get(ansible_facts['architecture'], 'amd64') }} signed-by={{ docker_gpg_key_path }}] https://download.docker.com/linux/ubuntu {{ ansible_facts['distribution_release'] }} stable" + filename: "{{ docker_repo_filename }}" + state: present + + - name: Install Docker packages + ansible.builtin.apt: + name: >- + {{ + docker_packages + if docker_version_pin | length == 0 + else (docker_packages | map('regex_replace', '$', '=' ~ docker_version_pin) | list) + }} + state: "{{ docker_package_state }}" + update_cache: true + notify: Restart docker + rescue: + - name: Wait before retrying Docker 
repository setup + ansible.builtin.wait_for: + timeout: 10 + + - name: Retry apt cache update after Docker key/repo failure + ansible.builtin.apt: + update_cache: true + always: + - name: Ensure Docker service is enabled and running + ansible.builtin.service: + name: "{{ docker_service_name }}" + enabled: true + state: started + tags: + - docker_install + +- name: Configure Docker runtime access + block: + - name: Install Docker SDK for Python + ansible.builtin.apt: + name: "{{ docker_python_package }}" + state: present + + - name: Add users to docker group + ansible.builtin.user: + name: "{{ item }}" + groups: docker + append: true + loop: "{{ docker_users }}" + when: docker_users | length > 0 + tags: + - docker_config diff --git a/ansible/roles/web_app/defaults/main.yml b/ansible/roles/web_app/defaults/main.yml new file mode 100644 index 0000000000..49b1de28c0 --- /dev/null +++ b/ansible/roles/web_app/defaults/main.yml @@ -0,0 +1,34 @@ +--- +# Registry/auth settings (Vault values may override these defaults). +docker_registry_url: https://index.docker.io/v1/ +dockerhub_username: "" +dockerhub_password: "" + +# Application image settings. +app_name: devops-app +docker_image: "cilc/devops_lab02" +docker_tag: "{{ docker_image_tag | default('cilc') }}" + +# Runtime settings. +app_port: 8000 +app_internal_port: "{{ app_container_port | default(8080) }}" +app_restart_policy: unless-stopped +app_environment: {} + +# Compose settings. +compose_project_dir: "/opt/{{ app_name }}" +docker_compose_version: "3.8" +web_app_compose_file: docker-compose.yml +web_app_compose_pull_policy: missing + +# Wipe logic control. +# Wipe only: +# ansible-playbook playbooks/deploy.yml -e "web_app_wipe=true" --tags web_app_wipe +# Clean reinstall: +# ansible-playbook playbooks/deploy.yml -e "web_app_wipe=true" +web_app_wipe: false + +# Health check settings. 
+app_healthcheck_path: /health +app_healthcheck_retries: 10 +app_healthcheck_delay: 3 diff --git a/ansible/roles/web_app/handlers/main.yml b/ansible/roles/web_app/handlers/main.yml new file mode 100644 index 0000000000..5fc3d3bdc7 --- /dev/null +++ b/ansible/roles/web_app/handlers/main.yml @@ -0,0 +1,7 @@ +--- +- name: Restart web app via compose + community.docker.docker_compose_v2: + project_src: "{{ compose_project_dir }}" + files: + - "{{ web_app_compose_file }}" + state: restarted diff --git a/ansible/roles/web_app/meta/main.yml b/ansible/roles/web_app/meta/main.yml new file mode 100644 index 0000000000..cb7d8e0460 --- /dev/null +++ b/ansible/roles/web_app/meta/main.yml @@ -0,0 +1,3 @@ +--- +dependencies: + - role: docker diff --git a/ansible/roles/web_app/tasks/main.yml b/ansible/roles/web_app/tasks/main.yml new file mode 100644 index 0000000000..23768dd3a6 --- /dev/null +++ b/ansible/roles/web_app/tasks/main.yml @@ -0,0 +1,80 @@ +--- +# Deploy application with Docker Compose v2. +- name: Include wipe tasks (runs only when web_app_wipe=true) + ansible.builtin.include_tasks: wipe.yml + tags: + - web_app_wipe + - app_deploy + +- name: Deploy web application with Docker Compose + block: + - name: Validate required web app variables + ansible.builtin.assert: + that: + - docker_image | length > 0 + - docker_tag | length > 0 + - app_name | length > 0 + - compose_project_dir | length > 0 + fail_msg: >- + Missing required web app variables. Define them in role defaults, + inventory vars, or Vaulted group_vars/all.yml. 
+ + - name: Login to Docker registry when credentials are provided + community.docker.docker_login: + registry_url: "{{ docker_registry_url }}" + username: "{{ dockerhub_username }}" + password: "{{ dockerhub_password }}" + no_log: true + when: + - dockerhub_username | length > 0 + - dockerhub_password | length > 0 + + - name: Create compose project directory + ansible.builtin.file: + path: "{{ compose_project_dir }}" + state: directory + mode: "0755" + + - name: Render docker-compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ compose_project_dir }}/{{ web_app_compose_file }}" + mode: "0644" + notify: Restart web app via compose + + - name: Deploy compose project + community.docker.docker_compose_v2: + project_src: "{{ compose_project_dir }}" + files: + - "{{ web_app_compose_file }}" + state: present + pull: "{{ web_app_compose_pull_policy }}" + recreate: auto + remove_orphans: true + + - name: Wait for app port to be ready + ansible.builtin.wait_for: + host: 127.0.0.1 + port: "{{ app_port | int }}" + timeout: 60 + + - name: Verify health endpoint + ansible.builtin.uri: + url: "http://127.0.0.1:{{ app_port }}{{ app_healthcheck_path }}" + method: GET + status_code: 200 + register: web_app_healthcheck_result + retries: "{{ app_healthcheck_retries }}" + delay: "{{ app_healthcheck_delay }}" + until: web_app_healthcheck_result.status == 200 + + rescue: + - name: Surface deployment failure details + ansible.builtin.debug: + msg: >- + Docker Compose deployment failed for {{ app_name }} in + {{ compose_project_dir }}. 
+ + tags: + - app_deploy + - compose diff --git a/ansible/roles/web_app/tasks/wipe.yml b/ansible/roles/web_app/tasks/wipe.yml new file mode 100644 index 0000000000..2b865c94ad --- /dev/null +++ b/ansible/roles/web_app/tasks/wipe.yml @@ -0,0 +1,35 @@ +--- +- name: Wipe web application deployment + vars: + web_app_compose_path: "{{ compose_project_dir }}/{{ web_app_compose_file }}" + block: + - name: Check if compose file exists + ansible.builtin.stat: + path: "{{ web_app_compose_path }}" + register: web_app_compose_file_stat + + - name: Stop and remove compose project + community.docker.docker_compose_v2: + project_src: "{{ compose_project_dir }}" + files: + - "{{ web_app_compose_file }}" + state: absent + remove_orphans: true + when: web_app_compose_file_stat.stat.exists + + - name: Remove compose file + ansible.builtin.file: + path: "{{ web_app_compose_path }}" + state: absent + + - name: Remove compose project directory + ansible.builtin.file: + path: "{{ compose_project_dir }}" + state: absent + + - name: Log wipe completion + ansible.builtin.debug: + msg: "Application {{ app_name }} wiped successfully" + when: web_app_wipe | bool + tags: + - web_app_wipe diff --git a/ansible/roles/web_app/templates/docker-compose.yml.j2 b/ansible/roles/web_app/templates/docker-compose.yml.j2 new file mode 100644 index 0000000000..0ea3127e60 --- /dev/null +++ b/ansible/roles/web_app/templates/docker-compose.yml.j2 @@ -0,0 +1,19 @@ +version: '{{ docker_compose_version }}' + +services: + {{ app_name }}: + image: {{ docker_image }}:{{ docker_tag }} + container_name: {{ app_name }} + ports: + - '{{ app_port }}:{{ app_internal_port }}' +{% if app_environment | length > 0 %} + environment: +{% for key, value in app_environment.items() %} + {{ key }}: '{{ value }}' +{% endfor %} +{% endif %} + restart: {{ app_restart_policy }} + +networks: + default: + name: {{ app_name }}-network diff --git a/app_python/.dockerignore b/app_python/.dockerignore new file mode 100644 index 
0000000000..cf0efb0faf --- /dev/null +++ b/app_python/.dockerignore @@ -0,0 +1,22 @@ +# Bytecode / caches +__pycache__/ +*.py[cod] +*.log + +# Virtual environments +venv/ +venv.bak/ +.env + +# VCS / IDE +.git +.gitignore +.vscode/ +.idea/ +.DS_Store + +# Tests and docs not needed at runtime +tests/ +docs/ +README.md +LICENSE diff --git a/app_python/Dockerfile b/app_python/Dockerfile new file mode 100644 index 0000000000..5d50b6a9db --- /dev/null +++ b/app_python/Dockerfile @@ -0,0 +1,25 @@ +# syntax=docker/dockerfile:1.7 +FROM python:3.13-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +WORKDIR /app + +# Install runtime dependencies first to leverage layer caching +COPY requirements.txt ./ +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --no-cache-dir -r requirements.txt + +# Create dedicated non-root user +RUN addgroup --system app \ + && adduser --system --ingroup app app + +# Copy only the application code needed at runtime +COPY --chown=app:app app.py ./ + +USER app + +EXPOSE 8080 + +CMD ["python", "app.py"] diff --git a/app_python/README.md b/app_python/README.md new file mode 100644 index 0000000000..14f38ecad1 --- /dev/null +++ b/app_python/README.md @@ -0,0 +1,57 @@ +# DevOps Info Service (Python) + +![Python CI and Docker](https://github.com/chomosuce/DevOps-Core-Course/actions/workflows/python-ci.yml/badge.svg) +[![Ansible Deployment](https://github.com/chomosuce/DevOps-Core-Course/actions/workflows/ansible-deploy.yml/badge.svg)](https://github.com/chomosuce/DevOps-Core-Course/actions/workflows/ansible-deploy.yml) + +## Overview +Simple Flask web service that reports service metadata, system details, runtime uptime, and request info. Includes a `/health` endpoint for liveness probes. 
+
+## Prerequisites
+- Python 3.11+
+- pip
+
+## Installation
+```bash
+python -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+```
+
+## Testing
+```bash
+python -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+pip install -r requirements-dev.txt
+pytest
+```
+
+## Running the Application
+```bash
+python app.py
+# Custom configuration
+PORT=8080 python app.py
+HOST=127.0.0.1 PORT=3000 DEBUG=true python app.py
+```
+
+## API Endpoints
+- `GET /` — Service, system, runtime, request info, and endpoint list.
+- `GET /health` — Health status and uptime (HTTP 200).
+
+## Configuration
+Environment variables:
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `HOST` | `0.0.0.0` | Bind address |
+| `PORT` | `8080` | Listening port |
+| `DEBUG` | `False` | Enable Flask debug mode |
+
+## Notes
+- Logging is configured at startup; noisy werkzeug logs are suppressed to WARNING.
+- Error handlers return JSON for 404 and 500.
+
+## Docker
+- Build an image from this directory: `docker build -t devops:latest .`
+- Run the container (maps port 8080 by default): `docker run -p 8080:8080 devops:latest`
+- Pull from Docker Hub once published: `docker pull cilc/devops_lab02:cilc` diff --git a/app_python/app.py b/app_python/app.py new file mode 100644 index 0000000000..04a2362240 --- /dev/null +++ b/app_python/app.py @@ -0,0 +1,308 @@ +"""
+DevOps Info Service - Flask implementation
+Provides system and runtime information with health checks.
+""" +from __future__ import annotations + +import json +import logging +import os +import platform +import socket +from time import perf_counter +from datetime import datetime, timezone +from typing import Dict, List, Any + +from flask import Flask, Response, g, jsonify, request +from prometheus_client import CONTENT_TYPE_LATEST, Counter, Gauge, Histogram, generate_latest + +APP_NAME = "devops-info-service" +APP_VERSION = "1.0.0" +APP_DESCRIPTION = "DevOps course info service" +FRAMEWORK = "Flask" + +# Configuration via environment variables +HOST = os.getenv("HOST", "0.0.0.0") +PORT = int(os.getenv("PORT", 8080)) +DEBUG = os.getenv("DEBUG", "False").lower() == "true" + +# Track application start time for uptime calculations +START_TIME = datetime.now(timezone.utc) + +app = Flask(__name__) + +http_requests_total = Counter( + "http_requests_total", + "Total HTTP requests", + ["method", "endpoint", "status_code"], +) +http_request_duration_seconds = Histogram( + "http_request_duration_seconds", + "HTTP request duration in seconds", + ["method", "endpoint"], +) +http_requests_in_progress = Gauge( + "http_requests_in_progress", + "HTTP requests currently being processed", +) +devops_info_endpoint_calls = Counter( + "devops_info_endpoint_calls_total", + "Application endpoint calls", + ["endpoint"], +) +devops_info_system_collection_seconds = Histogram( + "devops_info_system_collection_seconds", + "System info collection duration in seconds", +) + + +class JSONFormatter(logging.Formatter): + """Render log records as single-line JSON objects.""" + + def format(self, record: logging.LogRecord) -> str: + payload: Dict[str, Any] = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "level": record.levelname, + "message": record.getMessage(), + "logger": record.name, + } + + for field in ( + "method", + "path", + "status_code", + "client_ip", + "host", + "port", + "debug", + "app", + "version", + "event", + ): + if hasattr(record, field): + payload[field] = 
getattr(record, field) + + if record.exc_info: + payload["exception"] = self.formatException(record.exc_info) + + return json.dumps(payload, ensure_ascii=True) + + +def setup_logging() -> None: + """Configure JSON structured logging.""" + root = logging.getLogger() + root.handlers.clear() + root.setLevel(logging.INFO) + + handler = logging.StreamHandler() + handler.setFormatter(JSONFormatter()) + root.addHandler(handler) + + logging.getLogger("werkzeug").setLevel(logging.WARNING) + logging.info( + "application_startup", + extra={ + "event": "startup", + "app": APP_NAME, + "version": APP_VERSION, + "host": HOST, + "port": PORT, + "debug": DEBUG, + }, + ) + + +def get_system_info() -> Dict[str, Any]: + """Collect host system information.""" + return { + "hostname": socket.gethostname(), + "platform": platform.system(), + "platform_version": platform.version(), + "architecture": platform.machine(), + "cpu_count": os.cpu_count() or 0, + "python_version": platform.python_version(), + } + + +def get_uptime() -> Dict[str, Any]: + """Calculate service uptime in seconds and human format.""" + delta = datetime.now(timezone.utc) - START_TIME + seconds = int(delta.total_seconds()) + hours, remainder = divmod(seconds, 3600) + minutes, _ = divmod(remainder, 60) + return { + "seconds": seconds, + "human": f"{hours} hours, {minutes} minutes", + } + + +def get_runtime_info() -> Dict[str, Any]: + """Return runtime timing details.""" + now = datetime.now(timezone.utc) + local_now = now.astimezone() + uptime = get_uptime() + return { + "uptime_seconds": uptime["seconds"], + "uptime_human": uptime["human"], + "current_time": now.isoformat(), + "timezone": local_now.tzname() or "UTC", + } + + +def get_request_info() -> Dict[str, Any]: + """Extract request metadata for observability.""" + forwarded_for = request.headers.get("X-Forwarded-For", "").split(",")[0].strip() + client_ip = forwarded_for or request.remote_addr or "" + return { + "client_ip": client_ip, + "user_agent": 
request.headers.get("User-Agent", ""), + "method": request.method, + "path": request.path, + } + + +def get_endpoints() -> List[Dict[str, str]]: + """List known HTTP endpoints.""" + return [ + {"path": "/", "method": "GET", "description": "Service information"}, + {"path": "/health", "method": "GET", "description": "Health check"}, + {"path": "/metrics", "method": "GET", "description": "Prometheus metrics"}, + ] + + +def get_metrics_endpoint_label() -> str: + """Normalize endpoint labels to keep cardinality low.""" + if request.path in {"/", "/health", "/metrics"}: + return request.path + if request.url_rule and request.url_rule.rule: + return request.url_rule.rule + return "unknown" + + +@app.before_request +def log_request() -> None: + """Log incoming requests with context for observability.""" + g.request_start_time = perf_counter() + g.metrics_endpoint = get_metrics_endpoint_label() + http_requests_in_progress.inc() + devops_info_endpoint_calls.labels(endpoint=g.metrics_endpoint).inc() + + request_info = get_request_info() + logging.info( + "http_request_started", + extra={ + "event": "request_started", + "method": request_info["method"], + "path": request_info["path"], + "client_ip": request_info["client_ip"], + "app": APP_NAME, + "version": APP_VERSION, + }, + ) + + +@app.after_request +def log_response(response): # type: ignore[no-untyped-def] + """Log response metadata, including status code.""" + endpoint = getattr(g, "metrics_endpoint", get_metrics_endpoint_label()) + request_start_time = getattr(g, "request_start_time", None) + + if request_start_time is not None: + http_request_duration_seconds.labels(method=request.method, endpoint=endpoint).observe( + max(0.0, perf_counter() - request_start_time) + ) + http_requests_total.labels( + method=request.method, + endpoint=endpoint, + status_code=str(response.status_code), + ).inc() + http_requests_in_progress.dec() + + request_info = get_request_info() + logging.info( + "http_request_completed", + extra={ + 
"event": "request_completed", + "method": request_info["method"], + "path": request_info["path"], + "status_code": response.status_code, + "client_ip": request_info["client_ip"], + "app": APP_NAME, + "version": APP_VERSION, + }, + ) + return response + + +@app.route("/", methods=["GET"]) +def index() -> Any: + """Main endpoint returning service, system, runtime, and request info.""" + response = { + "service": { + "name": APP_NAME, + "version": APP_VERSION, + "description": APP_DESCRIPTION, + "framework": FRAMEWORK, + }, + "system": get_system_info(), + "runtime": get_runtime_info(), + "request": get_request_info(), + "endpoints": get_endpoints(), + } + return jsonify(response), 200 + + +@app.route("/health", methods=["GET"]) +def health() -> Any: + """Lightweight health probe endpoint.""" + uptime = get_uptime() + return ( + jsonify( + { + "status": "healthy", + "timestamp": datetime.now(timezone.utc).isoformat(), + "uptime_seconds": uptime["seconds"], + } + ), + 200, + ) + + +@app.route("/metrics", methods=["GET"]) +def metrics() -> Response: + """Prometheus metrics endpoint.""" + return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST) + + +@app.errorhandler(404) +def not_found(error): # type: ignore[override] + return jsonify({"error": "Not Found", "message": "Endpoint does not exist"}), 404 + + +@app.errorhandler(500) +def internal_error(error): # type: ignore[override] + request_info = get_request_info() + logging.exception( + "unhandled_exception", + extra={ + "event": "error", + "method": request_info["method"], + "path": request_info["path"], + "client_ip": request_info["client_ip"], + "app": APP_NAME, + "version": APP_VERSION, + }, + ) + return ( + jsonify({"error": "Internal Server Error", "message": "An unexpected error occurred"}), + 500, + ) + + +def main() -> None: + setup_logging() + app.run(host=HOST, port=PORT, debug=DEBUG, use_reloader=False) + + +if __name__ == "__main__": + main() diff --git a/app_python/docs/LAB01.md 
b/app_python/docs/LAB01.md new file mode 100644 index 0000000000..73ad56ad2b --- /dev/null +++ b/app_python/docs/LAB01.md @@ -0,0 +1,39 @@ +# Lab 1 — DevOps Info Service (Python)
+
+## Framework Selection
+- **Choice:** Flask 3.1.0 — lightweight, familiar routing, simple JSON responses. Suits well for someone not familiar with Python backend development.
+
+| Framework | Pros | Cons |
+|-----------|------|------|
+| Flask | Minimal, quick startup, large ecosystem | Async support limited compared to FastAPI |
+| FastAPI | Automatic docs, async-native | Slightly heavier, needs uvicorn |
+| Django | Batteries included, ORM | Overkill for two endpoints |
+
+## Best Practices Applied
+- Logging configured at startup; request logging via `before_request`.
+- JSON error handlers for 404/500 to keep API consistent.
+- Environment-based config (`HOST`, `PORT`, `DEBUG`) for portability.
+- Small, pure functions (`get_system_info`, `get_uptime`) to simplify testing.
+
+## API Documentation
+- `GET /` — returns service metadata, system info, runtime (uptime, time, timezone), request info, endpoint catalog.
+- `GET /health` — returns status `healthy`, current timestamp (UTC ISO8601), and uptime seconds.
+
+### Example Commands
+```bash
+curl -s http://localhost:8080/ | jq
+curl -s http://localhost:8080/health | jq
+```
+
+## Testing Evidence
+Screenshots saved under `docs/screenshots/`:
+- `01-main-endpoint.png`
+- `02-health-check.png`
+- `03-formatted-output.png`
+
+## Challenges & Solutions
+- Placeholder: document any runtime or environment issues encountered.
+
+## GitHub Community
+- Starring repositories surfaces useful tools and signals interest to maintainers.
+- Following developers keeps you aware of their activity, aiding collaboration and professional growth.
diff --git a/app_python/docs/LAB02.md b/app_python/docs/LAB02.md new file mode 100644 index 0000000000..faf6678fee --- /dev/null +++ b/app_python/docs/LAB02.md @@ -0,0 +1,73 @@ +# Lab 2 — Docker Containerization (Python) + +## Docker Best Practices Applied +- **Pinned base image**: `python:3.13-slim` to ensure reproducible builds and security updates tracked by digest pinning. +- **Layer caching**: Copied `requirements.txt` before app code so dependency layer is reused when code changes without dependency changes. +- **Non-root user**: Created `app` system user/group and switched with `USER app` to reduce blast radius. +- **Small footprint**: Used `slim` variant and `--no-cache-dir` for pip; kept image minimal by copying only `app.py` and requirements. +- **.dockerignore**: Excludes venvs, VCS, IDE files, tests, docs to shrink build context and avoid leaking secrets. +- **Explicit workdir and CMD**: `WORKDIR /app`, `CMD ["python", "app.py"]` for clarity and predictability. + +## Image Information & Decisions +- **Base image**: `python:3.13-slim`—latest GA Python with security patches, balanced size vs compatibility. +- **Final image size**: ~208 MB on arm64 host. Acceptable for Flask + Python runtime. +- **Layer structure**: OS + Python -> requirements install -> user creation -> app code. Keeps mutable layers (code) at the top for cache reuse. +- **Optimizations**: pip cache mount to speed rebuilds; no bytecode files (`PYTHONDONTWRITEBYTECODE=1`). 
+ +## Build & Run Process +```bash +igor@cilc DevOps-Core-Course % docker build -t devops_lab02:cilc app_python +[+] Building 4.4s (15/15) FINISHED docker:desktop-linux + => [internal] load build definition from Dockerfile 0.0s + => => transferring dockerfile: 587B 0.0s + => resolve image config for docker-image://docker.io/docker/dockerfile:1.7 2.2s + => [auth] docker/dockerfile:pull token for registry-1.docker.io 0.0s + => CACHED docker-image://docker.io/docker/dockerfile:1.7@sha256:a57df69d0ea827fb7266491f2813635de6f17 0.0s + => => resolve docker.io/docker/dockerfile:1.7@sha256:a57df69d0ea827fb7266491f2813635de6f17269be881f69 0.0s + => [internal] load metadata for docker.io/library/python:3.13-slim 2.0s + => [auth] library/python:pull token for registry-1.docker.io 0.0s + => [internal] load .dockerignore 0.0s + => => transferring context: 262B 0.0s + => [stage-0 1/6] FROM docker.io/library/python:3.13-slim@sha256:2b9c9803c6a287cafa0a8c917211dddd23dcd 0.0s + => => resolve docker.io/library/python:3.13-slim@sha256:2b9c9803c6a287cafa0a8c917211dddd23dcd2016f049 0.0s + => [internal] load build context 0.0s + => => transferring context: 63B 0.0s + => CACHED [stage-0 2/6] WORKDIR /app 0.0s + => CACHED [stage-0 3/6] COPY requirements.txt ./ 0.0s + => CACHED [stage-0 4/6] RUN --mount=type=cache,target=/root/.cache/pip pip install --no-cache-dir 0.0s + => CACHED [stage-0 5/6] RUN addgroup --system app && adduser --system --ingroup app app 0.0s + => CACHED [stage-0 6/6] COPY --chown=app:app app.py ./ 0.0s + => exporting to image 0.0s + => => exporting layers 0.0s + => => exporting manifest sha256:961953482c4997b5db2bdea8e87f8414f8291cc1b5139c52a5084694991bbaad 0.0s + => => exporting config sha256:bc381b2a5bb82bd72482cc10e154b7f998d207d1a96c6676529cee6ed5a1197b 0.0s + => => exporting attestation manifest sha256:7d3b614b171fad7230973774b5f023af954d2d8bd025047683360b9f6 0.0s + => => exporting manifest list 
sha256:2e31fce956685f14ff858fd88694181adbe0c05251ba85db70f17fad712cdba4 0.0s + => => naming to docker.io/library/devops_lab02:cilc 0.0s + => => unpacking to docker.io/library/devops_lab02:cilc 0.0s + +View build details: docker-desktop://dashboard/build/desktop-linux/desktop-linux/e8v6bfb62bbr1g2pua5tww3zs +igor@cilc DevOps-Core-Course % docker run --rm -p 8080:8080 devops_lab02:cilc +2026-02-04 16:06:44,331 INFO [root] Application starting... + * Serving Flask app 'app' + * Debug mode: off +2026-02-04 16:08:11,932 INFO [root] GET /health from 192.168.65.1 +2026-02-04 16:08:17,775 INFO [root] GET / from 192.168.65.1 +``` + +### Terminal Output +```text +igor@cilc DevOps-Core-Course % curl -s http://localhost:8080/health +{"status":"healthy","timestamp":"2026-02-04T16:08:11.932722+00:00","uptime_seconds":87} +igor@cilc DevOps-Core-Course % curl -s http://localhost:8080/ | head -c 200 +{"endpoints":[{"description":"Service information","method":"GET","path":"/"},{"description":"Health check","method":"GET","path":"/health"}],"request":{"client_ip":"192.168.65.1","method":"GET","path% +``` + +## Technical Analysis +- **Layer order**: If code is copied before installing deps, any code change invalidates the heavy pip layer, slowing rebuilds. Current order isolates deps. +- **Security**: Non-root user prevents privilege escalation from app compromise; slim base reduces attack surface; .dockerignore keeps secrets/venvs out. +- **.dockerignore impact**: Smaller context speeds upload to daemon and avoids unnecessary files bloating layers. +- **Why it works**: Flask app binds to `0.0.0.0:8080`; `EXPOSE 8080` documents port; environment defaults mirror local run. 
+ +## Docker Hub +- Repository URL (replace with yours): `https://hub.docker.com/r/cilc/devops_lab02` diff --git a/app_python/docs/LAB03.md b/app_python/docs/LAB03.md new file mode 100644 index 0000000000..5275d32060 --- /dev/null +++ b/app_python/docs/LAB03.md @@ -0,0 +1,89 @@ +# Lab 3 — Continuous Integration (CI/CD) + +## Task 1 + +Testing framework choice and why: +Pytest. It has concise assertions, excellent fixtures for Flask test clients, and a large plugin ecosystem. It keeps tests readable while still supporting more advanced scenarios like monkeypatching error paths. + +Test structure explanation: +Tests live in `app_python/tests/test_app.py`. A `client` fixture builds a Flask test client. The suite covers: +- `GET /` JSON shape, required fields, and basic type checks. +- `GET /health` success response and timestamp format. +- 404 error handler response for unknown routes. +- 500 error handler response by forcing a runtime exception. + +How to run tests locally: +```bash +cd app_python +python -m venv venv +source venv/bin/activate +pip install -r requirements.txt -r requirements-dev.txt +pytest +``` + +Terminal output showing all tests passing: +```text +============================= test session starts ============================== +platform darwin -- Python 3.14.0, pytest-8.3.4, pluggy-1.6.0 +rootdir: /Users/igor/inno/DevOps-Core-Course +collected 4 items + +app_python/tests/test_app.py .... [100%] + +============================== 4 passed in 0.13s =============================== +``` + +## Task 2 + +Workflow trigger strategy and reasoning: +Run on pull requests to `main`/`master` for fast feedback without publishing images, and on pushes to `main`/`master` for continuous validation. Docker build and push only runs on SemVer tag pushes (`v*.*.*`) to avoid publishing images for unversioned commits. + +Why I chose specific GitHub Actions: +- `actions/checkout`: standard, reliable repo checkout. 
+- `actions/setup-python`: official Python toolchain setup and caching support. +- `docker/login-action`: secure, official Docker Hub authentication. +- `docker/metadata-action`: automatic SemVer-derived tags without custom scripting. +- `docker/build-push-action`: official Buildx build and push with multi-tag support. + +Docker tagging strategy: +Semantic Versioning (SemVer) based on git tags. On tag `v1.2.3`, the workflow pushes: +- `username/devops-info-service:1.2.3` +- `username/devops-info-service:1.2` +- `username/devops-info-service:latest` + +Link to successful workflow run in GitHub Actions tab: +```text +https://github.com/chomosuce/DevOps-Core-Course/actions/runs/21857206785/job/63076772951 +``` + +Screenshot of green checkmark: ![CI green checkmark](screenshots/checkmark.png) + +## Task 3 + +Status badge in README: +- Added badge at the top of `app_python/README.md`. + +Caching implementation and speed improvement metrics: +- Implemented pip cache via `actions/setup-python` with `cache: pip`. +- Cache keys include `app_python/requirements.txt` and `app_python/requirements-dev.txt`. +- Speed improvement (cold cache vs. warm cache): + - First run (cold cache): `24` seconds for install step. + - Second run (warm cache): `6` seconds for install step. + - Improvement: `18` seconds (~`75%`). + +Snyk integration results and vulnerability handling: +- Added Snyk scan step using `snyk/actions/python@v1.0.0` with `SNYK_TOKEN` secret. +- Snyk runs on the test job after dependency install. +- `SNYK_TOKEN` is stored in GitHub repo secrets (Settings → Secrets and variables → Actions). +- Findings: + - `none found` +- Mitigation: + - `N/A`. + +CI best practices applied and why they matter: +- Minimal permissions (`contents: read`) to reduce token scope. +- Concurrency with `cancel-in-progress` to avoid wasted CI minutes on outdated commits. +- Job timeouts to prevent hung workflows. +- Release-only Docker pushes on SemVer tags to avoid pushing unversioned images. 
+ +``` diff --git a/app_python/docs/LAB04.md b/app_python/docs/LAB04.md new file mode 100644 index 0000000000..04166d33b1 --- /dev/null +++ b/app_python/docs/LAB04.md @@ -0,0 +1,107 @@ +# Lab 4 — Infrastructure as Code (Local VM Alternative) + +## 1. Cloud Provider & Infrastructure + +- Scenario used: **Local VM Alternative** from `labs/lab04.md` +- Hypervisor: `VirtualBox` +- Guest OS: `Ubuntu 24.04 LTS`. +- VM size: + - RAM: `2 GB` + - Disk: `10 GB` + - CPU: `1 vCPU` +- Network mode: `Bridged Adapter` +- VM IP address: `31.58.76.235` + +Created resources: +- 1 Ubuntu VM +- Virtual network adapter +- OpenSSH server inside VM +- SSH key-based access from host machine + +## 2. Terraform Implementation + +For this report I used the **Local VM Alternative** path. +Terraform cloud provisioning was not used because infrastructure was created and managed as a local VM for Lab 5 preparation. + +### Local VM creation stages (Ubuntu) + +1. Install hypervisor (`VirtualBox`/`VMware`) on host machine. +2. Download Ubuntu 24.04 LTS ISO. +3. Create VM: + - Name: `devops-lab04-ubuntu` + - Type: Linux / Ubuntu (64-bit) + - RAM: `2048 MB` + - Disk: `10 GB` (VDI/VMDK, dynamically allocated) +4. Attach Ubuntu ISO and boot VM. +5. Install Ubuntu with default options. +6. Configure network mode (Bridged or Host-Only with predictable IP). +7. Boot into installed Ubuntu. +8. Install and enable OpenSSH server. +9. Add host public SSH key to VM user `authorized_keys`. +10. Validate SSH access from host to VM. + +### Commands used during setup (inside VM) + +```bash +sudo apt update +sudo apt install -y openssh-server +sudo systemctl enable --now ssh +sudo systemctl status ssh +``` + +## 3. Pulumi Implementation + +Pulumi implementation: **Skipped** (Local VM Alternative scenario). + +Reason: +- `labs/lab04.md` allows local VM path where cloud provisioning tools can be skipped. +- Main objective for this scenario is to prepare a reachable VM for upcoming Lab 5 (Ansible). + +## 4. 
Terraform vs Pulumi Comparison + +In this run I selected the local VM path and did not execute cloud provisioning with Terraform/Pulumi. +The comparison below is based on lab requirements, documentation study, and expected workflow for the same VM setup in cloud. + +- Ease of Learning: Terraform is usually easier to start with in infrastructure-only tasks because HCL is focused and declarative. Pulumi is easier for developers already comfortable with Python/TypeScript because it uses regular programming syntax. For a beginner in IaC, Terraform often has a lower entry barrier. +- Code Readability: Terraform configs are compact and predictable for standard resources like VM/network/firewall. Pulumi code is more flexible, but readability depends on coding style and project structure. For small infrastructure definitions, Terraform can look cleaner. +- Debugging: Terraform provides clear plan/apply diff and state-focused troubleshooting. Pulumi gives language-level debugging benefits (stack traces, functions, conditionals), which helps in complex logic. For simple VM provisioning, Terraform debugging is usually more straightforward. +- Documentation: Terraform has broader community examples and provider coverage accumulated over many years. Pulumi documentation is solid, especially for SDK usage, but examples can be less numerous depending on provider. For fast issue resolution, Terraform resources are often easier to find. +- Use Case: Terraform is a better fit for standardized infrastructure management across teams and environments. Pulumi is a better fit when infrastructure logic is complex and benefits from full programming language features. For this course flow, either tool works, but local VM fallback was used to prepare for Lab 5. + +## 5. Lab 5 Preparation & Cleanup + +VM for Lab 5: +- Keeping VM for Lab 5: **Yes** +- VM type: **Local Ubuntu VM** +- SSH accessibility: **Confirmed** + +Cleanup status: +- No cloud resources were created in this scenario. 
+- Local VM remains running and reachable via SSH for next lab tasks. + +## SSH Proof +``` +igor@cilc ~ % ls -la ~/.ssh/id_rsa.pub +-rw-r--r-- 1 igor staff 580 Feb 19 16:21 /Users/igor/.ssh/id_rsa.pub +igor@cilc ~ % ssh-copy-id root@31.58.76.235 +/usr/bin/ssh-copy-id: INFO: Source of key(s) to be installed: "/Users/igor/.ssh/id_rsa.pub" +/usr/bin/ssh-copy-id: INFO: attempting to log in with the new key(s), to filter out any that are already installed + +/usr/bin/ssh-copy-id: WARNING: All keys were skipped because they already exist on the remote system. + (if you think this is a mistake, you may want to use -f option) + +igor@cilc ~ % ssh 'root@31.58.76.235' +Welcome to Ubuntu 24.04 LTS (GNU/Linux 6.8.0-35-generic x86_64) + + * Documentation: https://help.ubuntu.com + * Management: https://landscape.canonical.com + * Support: https://ubuntu.com/pro +Last login: Thu Feb 19 13:31:33 2026 from 188.130.155.165 +root@server-t3pi5s:~# hostname -I +31.58.76.235 172.17.0.1 172.29.172.1 +``` +## Notes on Security + +- No secrets or private keys are committed. +- Only public key is used for SSH key-based authentication. +- Sensitive files remain excluded by `.gitignore`. 
diff --git a/app_python/docs/screenshots/checkmark.png b/app_python/docs/screenshots/checkmark.png new file mode 100644 index 0000000000..276eccaf65 Binary files /dev/null and b/app_python/docs/screenshots/checkmark.png differ diff --git a/app_python/docs/screenshots/health.png b/app_python/docs/screenshots/health.png new file mode 100644 index 0000000000..7bc078ad9b Binary files /dev/null and b/app_python/docs/screenshots/health.png differ diff --git a/app_python/docs/screenshots/main_endpoint.png b/app_python/docs/screenshots/main_endpoint.png new file mode 100644 index 0000000000..ecc3cc1ae1 Binary files /dev/null and b/app_python/docs/screenshots/main_endpoint.png differ diff --git a/app_python/docs/screenshots/terminal_output.png b/app_python/docs/screenshots/terminal_output.png new file mode 100644 index 0000000000..199b0d5d78 Binary files /dev/null and b/app_python/docs/screenshots/terminal_output.png differ diff --git a/app_python/requirements-dev.txt b/app_python/requirements-dev.txt new file mode 100644 index 0000000000..13dc95a901 --- /dev/null +++ b/app_python/requirements-dev.txt @@ -0,0 +1,2 @@ +pytest==8.3.4 +ruff==0.9.6 diff --git a/app_python/requirements.txt b/app_python/requirements.txt new file mode 100644 index 0000000000..46c776bf8d --- /dev/null +++ b/app_python/requirements.txt @@ -0,0 +1,2 @@ +Flask==3.1.0 +prometheus-client==0.23.1 diff --git a/app_python/tests/__init__.py b/app_python/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/app_python/tests/test_app.py b/app_python/tests/test_app.py new file mode 100644 index 0000000000..510806852b --- /dev/null +++ b/app_python/tests/test_app.py @@ -0,0 +1,178 @@ +from __future__ import annotations + +import json +import logging +from datetime import datetime +from pathlib import Path +import sys + +import pytest +import app + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) + + +@pytest.fixture() +def client(): + 
app.app.config.update({"TESTING": True}) + with app.app.test_client() as client: + yield client + + +def test_index_success_structure(client): + response = client.get("/") + assert response.status_code == 200 + data = response.get_json() + + assert set(data.keys()) == {"service", "system", "runtime", "request", "endpoints"} + + service = data["service"] + assert service["name"] == app.APP_NAME + assert service["version"] == app.APP_VERSION + assert service["description"] == app.APP_DESCRIPTION + assert service["framework"] == app.FRAMEWORK + + system = data["system"] + assert "hostname" in system + assert "platform" in system + assert "platform_version" in system + assert "architecture" in system + assert isinstance(system.get("cpu_count"), int) + assert "python_version" in system + + runtime = data["runtime"] + assert isinstance(runtime.get("uptime_seconds"), int) + assert "uptime_human" in runtime + assert "current_time" in runtime + assert "timezone" in runtime + # Validate current_time is ISO-like + datetime.fromisoformat(runtime["current_time"].replace("Z", "+00:00")) + + request_info = data["request"] + assert request_info["method"] == "GET" + assert request_info["path"] == "/" + assert "client_ip" in request_info + assert "user_agent" in request_info + + endpoints = data["endpoints"] + assert isinstance(endpoints, list) + paths_methods = {(item.get("path"), item.get("method")) for item in endpoints} + assert ("/", "GET") in paths_methods + assert ("/health", "GET") in paths_methods + assert ("/metrics", "GET") in paths_methods + + +def test_health_success(client): + response = client.get("/health") + assert response.status_code == 200 + data = response.get_json() + + assert data["status"] == "healthy" + assert isinstance(data.get("uptime_seconds"), int) + datetime.fromisoformat(data["timestamp"].replace("Z", "+00:00")) + + +def test_metrics_endpoint_exposes_prometheus_metrics(client): + client.get("/") + client.get("/health") + + response = 
client.get("/metrics") + assert response.status_code == 200 + body = response.get_data(as_text=True) + + assert "http_requests_total" in body + assert "http_request_duration_seconds" in body + assert "http_requests_in_progress" in body + assert "devops_info_endpoint_calls_total" in body + assert "devops_info_system_collection_seconds" in body + + +def test_not_found(client): + response = client.get("/does-not-exist") + assert response.status_code == 404 + data = response.get_json() + assert data == {"error": "Not Found", "message": "Endpoint does not exist"} + + +def test_internal_server_error(monkeypatch): + def boom(): + raise RuntimeError("boom") + + monkeypatch.setattr(app, "get_system_info", boom) + original_testing = app.app.config.get("TESTING") + original_propagate = app.app.config.get("PROPAGATE_EXCEPTIONS") + app.app.config.update({"TESTING": False, "PROPAGATE_EXCEPTIONS": False}) + + try: + with app.app.test_client() as client: + response = client.get("/") + finally: + app.app.config.update( + {"TESTING": original_testing, "PROPAGATE_EXCEPTIONS": original_propagate} + ) + + assert response.status_code == 500 + data = response.get_json() + assert data == { + "error": "Internal Server Error", + "message": "An unexpected error occurred", + } + + +def test_json_formatter_outputs_valid_json(): + formatter = app.JSONFormatter() + record = logging.LogRecord( + name="test", + level=logging.INFO, + pathname=__file__, + lineno=1, + msg="hello", + args=(), + exc_info=None, + ) + record.method = "GET" + record.path = "/health" + record.status_code = 200 + record.client_ip = "127.0.0.1" + + payload = json.loads(formatter.format(record)) + assert payload["level"] == "INFO" + assert payload["message"] == "hello" + assert payload["method"] == "GET" + assert payload["path"] == "/health" + assert payload["status_code"] == 200 + assert payload["client_ip"] == "127.0.0.1" + datetime.fromisoformat(payload["timestamp"].replace("Z", "+00:00")) + + +def 
test_setup_logging_emits_startup_event(capsys): + app.setup_logging() + captured = capsys.readouterr() + lines = [line for line in captured.err.splitlines() if line.strip()] + assert lines + + payload = json.loads(lines[-1]) + assert payload["message"] == "application_startup" + assert payload["event"] == "startup" + assert payload["app"] == app.APP_NAME + assert payload["version"] == app.APP_VERSION + + +def test_request_completion_log_contains_status_and_path(client, capsys): + app.setup_logging() + response = client.get("/health") + assert response.status_code == 200 + + captured = capsys.readouterr() + lines = [line for line in captured.err.splitlines() if line.strip()] + assert lines + parsed = [json.loads(line) for line in lines] + completion_logs = [line for line in parsed if line.get("event") == "request_completed"] + assert completion_logs + + last = completion_logs[-1] + assert last["method"] == "GET" + assert last["path"] == "/health" + assert last["status_code"] == 200 + assert "client_ip" in last diff --git a/k8s/README.md b/k8s/README.md new file mode 100644 index 0000000000..d6e24a8293 --- /dev/null +++ b/k8s/README.md @@ -0,0 +1,242 @@ +# Lab 09 — Kubernetes Fundamentals + +## 1. Architecture Overview + +Chosen local cluster tool: **kind**. + +Why kind: +- Runs Kubernetes nodes as Docker containers (lightweight, fast startup). +- Good fit for local reproducible lab work. +- Easy local image loading (`kind load docker-image`) for images built in previous labs. + +High-level architecture: + +```text +Client (curl) + | + v +Service: devops-info-service (NodePort 80 -> targetPort 8080, nodePort 30080) + | + v +Deployment: devops-info-app (RollingUpdate: maxSurge=1, maxUnavailable=0) + | + +-- Pod 1 (Flask app) + +-- Pod 2 (Flask app) + +-- Pod 3 (Flask app) +``` + +Resource allocation strategy: +- Requests: `100m CPU`, `128Mi memory`. +- Limits: `250m CPU`, `256Mi memory`. +- Goal: guaranteed baseline resources with bounded max usage. 
+ +Cluster verification output: + +```bash +$ kubectl cluster-info +Kubernetes control plane is running at https://127.0.0.1:56810 +CoreDNS is running at https://127.0.0.1:56810/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy + +$ kubectl get nodes -o wide +NAME STATUS ROLES AGE VERSION INTERNAL-IP CONTAINER-RUNTIME +lab09-control-plane Ready control-plane 62s v1.32.2 172.19.0.2 containerd://2.0.3 +``` + +## 2. Manifest Files + +### `k8s/deployment.yml` +- Deploys app as `Deployment/devops-info-app`. +- Initial replicas: `3` (task requirement). +- Labels/selectors: `app: devops-info`. +- Rolling update strategy: `maxSurge: 1`, `maxUnavailable: 0`. +- Health checks: + - `livenessProbe` on `GET /health`. + - `readinessProbe` on `GET /health`. +- Resource requests/limits included. +- Uses local image from Lab 2: `devops_lab02:cilc`. + +### `k8s/service.yml` +- Creates `Service/devops-info-service` of type `NodePort`. +- Selects Pods via `app: devops-info`. +- Exposes service port `80` to container `8080`. +- NodePort fixed at `30080`. + +### `k8s/deployment-v2.yml` +- Temporary manifest used for rolling update demonstration. +- Same as `deployment.yml`, but image tag changed to `devops_lab02:v2`. + +Key value choices: +- Replicas `3`: baseline HA for lab objective. +- `maxUnavailable: 0`: keep service available during updates. +- Probe endpoint `/health`: already implemented in Flask app. + +## 3. 
Deployment Evidence + +```bash +$ kubectl apply -f k8s/deployment.yml +deployment.apps/devops-info-app created + +$ kubectl apply -f k8s/service.yml +service/devops-info-service created + +$ kubectl rollout status deployment/devops-info-app +deployment "devops-info-app" successfully rolled out + +$ kubectl get deployments +NAME READY UP-TO-DATE AVAILABLE +devops-info-app 3/3 3 3 +``` + +`kubectl get all` excerpt: + +```bash +NAME READY UP-TO-DATE AVAILABLE +deployment.apps/devops-info-app 3/3 3 3 + +NAME TYPE CLUSTER-IP PORT(S) +service/devops-info-service NodePort 10.96.12.118 80:30080/TCP +``` + +`kubectl get pods,svc -o wide` (stable state): + +```bash +$ kubectl get pods -l app=devops-info -o wide +NAME READY STATUS IP NODE +devops-info-app-79679fc787-msptg 1/1 Running 10.244.0.15 lab09-control-plane +devops-info-app-79679fc787-nrlgs 1/1 Running 10.244.0.13 lab09-control-plane +devops-info-app-79679fc787-pfcnc 1/1 Running 10.244.0.14 lab09-control-plane + +$ kubectl get svc devops-info-service -o wide +NAME TYPE CLUSTER-IP PORT(S) SELECTOR +devops-info-service NodePort 10.96.12.118 80:30080/TCP app=devops-info +``` + +`kubectl describe deployment devops-info-app` confirms: +- Replicas: `3 desired | 3 updated | 3 total | 3 available | 0 unavailable` +- Strategy: `RollingUpdate` with `0 max unavailable, 1 max surge` +- Probes and resources configured on container. + +Service verification with curl: + +```bash +$ kubectl port-forward service/devops-info-service 18080:80 + +$ curl -s http://127.0.0.1:18080/health +{"status":"healthy","timestamp":"2026-03-26T17:55:37.461569+00:00","uptime_seconds":7} +``` + +## 4. 
Operations Performed + +### Deploy commands + +```bash +kind create cluster --name lab09 --image kindest/node:v1.32.2 +kind load docker-image devops_lab02:cilc --name lab09 +kubectl apply -f k8s/deployment.yml +kubectl apply -f k8s/service.yml +kubectl rollout status deployment/devops-info-app +``` + +### Scaling demonstration + +```bash +$ kubectl scale deployment/devops-info-app --replicas=5 +deployment.apps/devops-info-app scaled + +$ kubectl rollout status deployment/devops-info-app +deployment "devops-info-app" successfully rolled out + +$ kubectl get deployment devops-info-app +NAME READY UP-TO-DATE AVAILABLE +devops-info-app 5/5 5 5 +``` + +### Rolling update demonstration + +```bash +$ kubectl apply -f k8s/deployment-v2.yml +deployment.apps/devops-info-app configured + +$ kubectl rollout status deployment/devops-info-app +deployment "devops-info-app" successfully rolled out + +$ kubectl rollout history deployment/devops-info-app +REVISION CHANGE-CAUSE +1 +2 +``` + +Zero-downtime verification during update (continuous health checks via Service): + +```bash +availability_check ok=40 fail=0 +``` + +### Rollback demonstration + +```bash +$ kubectl rollout undo deployment/devops-info-app +deployment.apps/devops-info-app rolled back + +$ kubectl rollout status deployment/devops-info-app +deployment "devops-info-app" successfully rolled out + +$ kubectl rollout history deployment/devops-info-app +REVISION CHANGE-CAUSE +2 +3 +``` + +Service access method used: +- `kubectl port-forward service/devops-info-service 18080:80` +- Verified endpoints: `/health`, `/`. + +## 5. Production Considerations + +Health checks: +- Implemented liveness and readiness probes using `/health`. +- Liveness restarts unhealthy containers. +- Readiness keeps unready Pods out of service load balancing. + +Resource limits rationale: +- Requests reserve enough resources for steady app behavior. +- Limits cap burst usage to protect cluster stability. 
+ +How to improve for real production: +- Use separate startup probe if startup can be slow. +- Use HPA based on CPU/RPS. +- Add PodDisruptionBudget and anti-affinity. +- Pin immutable image digests instead of mutable tags. +- Add dedicated namespace, NetworkPolicies, and Secrets management. + +Monitoring and observability strategy: +- Scrape `/metrics` with Prometheus. +- Dashboard in Grafana for latency/error-rate/resource usage. +- Centralize logs (e.g., Loki/ELK) and set alert rules. + +## 6. Challenges & Solutions + +Challenges encountered: +- No active Kubernetes context initially. +- Docker daemon was not running, which blocked local cluster/image workflows. + +How it was solved: +- Started Docker Desktop. +- Installed `kind` and created local cluster. +- Loaded local Docker image into kind node using `kind load docker-image`. + +Debugging approach used: +- `kubectl rollout status` for deployment progress. +- `kubectl get pods,svc,endpoints` for object state and networking. +- `kubectl describe deployment` for strategy/probe/resource verification. +- Service-level `curl` checks during rollout for availability. + +What was learned: +- Declarative manifests keep state reproducible. +- RollingUpdate parameters directly control availability behavior. +- Probes and resources are baseline requirements, not optional extras. 
+ +--- + +Raw command output log collected during execution: +- `k8s/lab09-evidence.txt` diff --git a/k8s/deployment-v2.yml b/k8s/deployment-v2.yml new file mode 100644 index 0000000000..e05b267e23 --- /dev/null +++ b/k8s/deployment-v2.yml @@ -0,0 +1,59 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: devops-info-app + labels: + app: devops-info + component: web +spec: + replicas: 3 + revisionHistoryLimit: 5 + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + selector: + matchLabels: + app: devops-info + template: + metadata: + labels: + app: devops-info + component: web + spec: + containers: + - name: devops-info + image: devops_lab02:v2 + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 8080 + env: + - name: HOST + value: "0.0.0.0" + - name: PORT + value: "8080" + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "250m" + memory: "256Mi" + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 diff --git a/k8s/deployment.yml b/k8s/deployment.yml new file mode 100644 index 0000000000..2e69abf030 --- /dev/null +++ b/k8s/deployment.yml @@ -0,0 +1,59 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: devops-info-app + labels: + app: devops-info + component: web +spec: + replicas: 3 + revisionHistoryLimit: 5 + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + selector: + matchLabels: + app: devops-info + template: + metadata: + labels: + app: devops-info + component: web + spec: + containers: + - name: devops-info + image: devops_lab02:cilc + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 8080 + env: + - name: HOST + value: "0.0.0.0" + - name: PORT + value: "8080" + resources: + 
requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "250m" + memory: "256Mi" + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 diff --git a/k8s/lab09-evidence.txt b/k8s/lab09-evidence.txt new file mode 100644 index 0000000000..854ce7a106 --- /dev/null +++ b/k8s/lab09-evidence.txt @@ -0,0 +1,195 @@ +### Cluster setup +Switched to context "kind-lab09". +Kubernetes control plane is running at https://127.0.0.1:56810 +CoreDNS is running at https://127.0.0.1:56810/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy + +To further debug and diagnose cluster problems, use 'kubectl cluster-info dump'. +NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME +lab09-control-plane Ready control-plane 62s v1.32.2 172.19.0.2 Debian GNU/Linux 12 (bookworm) 6.10.14-linuxkit containerd://2.0.3 +NAME STATUS AGE +default Active 62s +kube-node-lease Active 61s +kube-public Active 62s +kube-system Active 62s +local-path-storage Active 58s + +### Load image +Image: "devops_lab02:cilc" with ID "sha256:b86c5ae1ce020382ed0ac799291915a5697192a1b1d02fb5ac6f7312e83f5357" not yet present on node "lab09-control-plane", loading... +Image: "devops_lab02:v2" with ID "sha256:b86c5ae1ce020382ed0ac799291915a5697192a1b1d02fb5ac6f7312e83f5357" not yet present on node "lab09-control-plane", loading... + +### Apply manifests +deployment.apps/devops-info-app created +service/devops-info-service created +Waiting for deployment "devops-info-app" rollout to finish: 0 of 3 updated replicas are available... +Waiting for deployment "devops-info-app" rollout to finish: 1 of 3 updated replicas are available... +Waiting for deployment "devops-info-app" rollout to finish: 2 of 3 updated replicas are available... 
+deployment "devops-info-app" successfully rolled out +NAME READY UP-TO-DATE AVAILABLE AGE +devops-info-app 3/3 3 3 7s +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +devops-info-app-79679fc787-drwdp 1/1 Running 0 7s 10.244.0.5 lab09-control-plane +devops-info-app-79679fc787-m9wfw 1/1 Running 0 7s 10.244.0.7 lab09-control-plane +devops-info-app-79679fc787-q7tvt 1/1 Running 0 7s 10.244.0.6 lab09-control-plane +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +devops-info-service NodePort 10.96.12.118 80:30080/TCP 7s +kubernetes ClusterIP 10.96.0.1 443/TCP 71s +NAME ENDPOINTS AGE +devops-info-service 10.244.0.5:8080,10.244.0.6:8080,10.244.0.7:8080 7s + +### Service connectivity via port-forward +{"status":"healthy","timestamp":"2026-03-26T17:55:37.461569+00:00","uptime_seconds":7} + +{"endpoints":[{"description":"Service information","method":"GET","path":"/"},{"description":"Health check","method":"GET","path":"/health"},{"description":"Prometheus metrics","method":"GET","path":"/metrics"}],"request + +### Scaling to 5 replicas +deployment.apps/devops-info-app scaled +Waiting for deployment "devops-info-app" rollout to finish: 3 of 5 updated replicas are available... +Waiting for deployment "devops-info-app" rollout to finish: 4 of 5 updated replicas are available... +deployment "devops-info-app" successfully rolled out +NAME READY UP-TO-DATE AVAILABLE AGE +devops-info-app 5/5 5 5 40s +NAME READY STATUS RESTARTS AGE +devops-info-app-79679fc787-drwdp 1/1 Running 0 40s +devops-info-app-79679fc787-glv77 1/1 Running 0 9s +devops-info-app-79679fc787-l6hf2 1/1 Running 0 9s +devops-info-app-79679fc787-m9wfw 1/1 Running 0 40s +devops-info-app-79679fc787-q7tvt 1/1 Running 0 40s + +### Rolling update to v2 with availability check +deployment.apps/devops-info-app configured +Waiting for deployment "devops-info-app" rollout to finish: 0 out of 3 new replicas have been updated... 
+Waiting for deployment "devops-info-app" rollout to finish: 0 out of 3 new replicas have been updated... +Waiting for deployment "devops-info-app" rollout to finish: 1 out of 3 new replicas have been updated... +Waiting for deployment "devops-info-app" rollout to finish: 1 out of 3 new replicas have been updated... +Waiting for deployment "devops-info-app" rollout to finish: 1 out of 3 new replicas have been updated... +Waiting for deployment "devops-info-app" rollout to finish: 2 out of 3 new replicas have been updated... +availability_check ok=40 fail=0 +Waiting for deployment "devops-info-app" rollout to finish: 2 out of 3 new replicas have been updated... +Waiting for deployment "devops-info-app" rollout to finish: 2 out of 3 new replicas have been updated... +Waiting for deployment "devops-info-app" rollout to finish: 1 old replicas are pending termination... +Waiting for deployment "devops-info-app" rollout to finish: 1 old replicas are pending termination... +deployment "devops-info-app" successfully rolled out +deployment.apps/devops-info-app +REVISION CHANGE-CAUSE +1 +2 + +NAME DESIRED CURRENT READY AGE +devops-info-app-5d4bbd7474 3 3 3 22s +devops-info-app-79679fc787 0 0 0 64s + +### Rollback to previous revision +deployment.apps/devops-info-app rolled back +Waiting for deployment "devops-info-app" rollout to finish: 1 out of 3 new replicas have been updated... +Waiting for deployment "devops-info-app" rollout to finish: 1 out of 3 new replicas have been updated... +Waiting for deployment "devops-info-app" rollout to finish: 1 out of 3 new replicas have been updated... +Waiting for deployment "devops-info-app" rollout to finish: 2 out of 3 new replicas have been updated... +Waiting for deployment "devops-info-app" rollout to finish: 2 out of 3 new replicas have been updated... +Waiting for deployment "devops-info-app" rollout to finish: 2 out of 3 new replicas have been updated... 
+Waiting for deployment "devops-info-app" rollout to finish: 1 old replicas are pending termination... +Waiting for deployment "devops-info-app" rollout to finish: 1 old replicas are pending termination... +deployment "devops-info-app" successfully rolled out +deployment.apps/devops-info-app +REVISION CHANGE-CAUSE +2 +3 + +NAME READY UP-TO-DATE AVAILABLE AGE CONTAINERS IMAGES SELECTOR +devops-info-app 3/3 3 3 85s devops-info devops_lab02:cilc app=devops-info + +### Final deployment evidence +NAME READY STATUS RESTARTS AGE +pod/devops-info-app-5d4bbd7474-hh4nh 1/1 Terminating 0 35s +pod/devops-info-app-5d4bbd7474-mmq9x 1/1 Terminating 0 50s +pod/devops-info-app-5d4bbd7474-qpdx5 1/1 Terminating 0 42s +pod/devops-info-app-79679fc787-drwdp 1/1 Terminating 0 92s +pod/devops-info-app-79679fc787-msptg 1/1 Running 0 14s +pod/devops-info-app-79679fc787-nrlgs 1/1 Running 0 28s +pod/devops-info-app-79679fc787-pfcnc 1/1 Running 0 21s + +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +service/devops-info-service NodePort 10.96.12.118 80:30080/TCP 92s +service/kubernetes ClusterIP 10.96.0.1 443/TCP 2m36s + +NAME READY UP-TO-DATE AVAILABLE AGE +deployment.apps/devops-info-app 3/3 3 3 92s + +NAME DESIRED CURRENT READY AGE +replicaset.apps/devops-info-app-5d4bbd7474 0 0 0 50s +replicaset.apps/devops-info-app-79679fc787 3 3 3 92s + +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +pod/devops-info-app-5d4bbd7474-hh4nh 1/1 Terminating 0 35s 10.244.0.12 lab09-control-plane +pod/devops-info-app-5d4bbd7474-mmq9x 1/1 Terminating 0 50s 10.244.0.10 lab09-control-plane +pod/devops-info-app-5d4bbd7474-qpdx5 1/1 Terminating 0 42s 10.244.0.11 lab09-control-plane +pod/devops-info-app-79679fc787-drwdp 1/1 Terminating 0 92s 10.244.0.5 lab09-control-plane +pod/devops-info-app-79679fc787-msptg 1/1 Running 0 14s 10.244.0.15 lab09-control-plane +pod/devops-info-app-79679fc787-nrlgs 1/1 Running 0 28s 10.244.0.13 lab09-control-plane +pod/devops-info-app-79679fc787-pfcnc 1/1 Running 0 
21s 10.244.0.14 lab09-control-plane + +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE SELECTOR +service/devops-info-service NodePort 10.96.12.118 80:30080/TCP 92s app=devops-info +service/kubernetes ClusterIP 10.96.0.1 443/TCP 2m36s + +Name: devops-info-app +Namespace: default +CreationTimestamp: Thu, 26 Mar 2026 20:55:27 +0300 +Labels: app=devops-info + component=web +Annotations: deployment.kubernetes.io/revision: 3 +Selector: app=devops-info +Replicas: 3 desired | 3 updated | 3 total | 3 available | 0 unavailable +StrategyType: RollingUpdate +MinReadySeconds: 0 +RollingUpdateStrategy: 0 max unavailable, 1 max surge +Pod Template: + Labels: app=devops-info + component=web + Containers: + devops-info: + Image: devops_lab02:cilc + Port: 8080/TCP + Host Port: 0/TCP + Limits: + cpu: 250m + memory: 256Mi + Requests: + cpu: 100m + memory: 128Mi + Liveness: http-get http://:http/health delay=15s timeout=3s period=10s #success=1 #failure=3 + Readiness: http-get http://:http/health delay=5s timeout=2s period=5s #success=1 #failure=3 + Environment: + HOST: 0.0.0.0 + PORT: 8080 + Mounts: + Volumes: + Node-Selectors: + Tolerations: +Conditions: + Type Status Reason + ---- ------ ------ + Available True MinimumReplicasAvailable + Progressing True NewReplicaSetAvailable +OldReplicaSets: devops-info-app-5d4bbd7474 (0/0 replicas created) +NewReplicaSet: devops-info-app-79679fc787 (3/3 replicas created) +Events: + Type Reason Age From Message + ---- ------ ---- ---- ------- + Normal ScalingReplicaSet 92s deployment-controller Scaled up replica set devops-info-app-79679fc787 from 0 to 3 + Normal ScalingReplicaSet 61s deployment-controller Scaled up replica set devops-info-app-79679fc787 from 3 to 5 + Normal ScalingReplicaSet 50s deployment-controller Scaled down replica set devops-info-app-79679fc787 from 5 to 3 + Normal ScalingReplicaSet 50s deployment-controller Scaled up replica set devops-info-app-5d4bbd7474 from 0 to 1 + Normal ScalingReplicaSet 42s deployment-controller Scaled 
down replica set devops-info-app-79679fc787 from 3 to 2 + Normal ScalingReplicaSet 42s deployment-controller Scaled up replica set devops-info-app-5d4bbd7474 from 1 to 2 + Normal ScalingReplicaSet 35s deployment-controller Scaled down replica set devops-info-app-79679fc787 from 2 to 1 + Normal ScalingReplicaSet 35s deployment-controller Scaled up replica set devops-info-app-5d4bbd7474 from 2 to 3 + Normal ScalingReplicaSet 28s deployment-controller Scaled down replica set devops-info-app-79679fc787 from 1 to 0 + Normal ScalingReplicaSet 7s (x6 over 28s) deployment-controller (combined from similar events): Scaled down replica set devops-info-app-5d4bbd7474 from 1 to 0 + +### Stable state after rollback +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +devops-info-app-79679fc787-msptg 1/1 Running 0 53s 10.244.0.15 lab09-control-plane +devops-info-app-79679fc787-nrlgs 1/1 Running 0 67s 10.244.0.13 lab09-control-plane +devops-info-app-79679fc787-pfcnc 1/1 Running 0 60s 10.244.0.14 lab09-control-plane +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE SELECTOR +devops-info-service NodePort 10.96.12.118 80:30080/TCP 2m11s app=devops-info diff --git a/k8s/service.yml b/k8s/service.yml new file mode 100644 index 0000000000..616b308da8 --- /dev/null +++ b/k8s/service.yml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: devops-info-service + labels: + app: devops-info +spec: + type: NodePort + selector: + app: devops-info + ports: + - name: http + protocol: TCP + port: 80 + targetPort: 8080 + nodePort: 30080 diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml new file mode 100644 index 0000000000..40c0e3d103 --- /dev/null +++ b/monitoring/docker-compose.yml @@ -0,0 +1,186 @@ +services: + prometheus: + image: prom/prometheus:v3.9.0 + container_name: monitoring-prometheus + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.retention.time=15d" + - "--storage.tsdb.retention.size=10GB" + 
ports: + - "9090:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + networks: + - logging + labels: + logging: "promtail" + app: "devops-prometheus" + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9090/-/healthy || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + + loki: + image: grafana/loki:3.0.0 + container_name: monitoring-loki + command: ["-config.file=/etc/loki/config.yml"] + ports: + - "3100:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + networks: + - logging + labels: + logging: "promtail" + app: "devops-loki" + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + + promtail: + image: grafana/promtail:3.0.0 + container_name: monitoring-promtail + command: ["-config.file=/etc/promtail/config.yml"] + ports: + - "9080:9080" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - promtail-positions:/tmp + networks: + - logging + labels: + logging: "promtail" + app: "devops-promtail" + depends_on: + loki: + condition: service_healthy + healthcheck: + test: ["CMD", "/usr/bin/promtail", "-version"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + deploy: + resources: + limits: + cpus: "0.50" + memory: 512M + reservations: + cpus: "0.10" + memory: 128M + + grafana: + image: grafana/grafana:12.3.1 + container_name: monitoring-grafana + env_file: + - .env + environment: + GF_AUTH_ANONYMOUS_ENABLED: "false" + 
GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}" + GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD}" + GF_SECURITY_ALLOW_EMBEDDING: "false" + GF_SERVER_DOMAIN: "localhost" + GF_USERS_ALLOW_SIGN_UP: "false" + GF_METRICS_ENABLED: "true" + ports: + - "3000:3000" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro + - ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + networks: + - logging + labels: + logging: "promtail" + app: "devops-grafana" + depends_on: + loki: + condition: service_healthy + prometheus: + condition: service_healthy + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"] + interval: 15s + timeout: 5s + retries: 6 + start_period: 20s + deploy: + resources: + limits: + cpus: "0.50" + memory: 512M + reservations: + cpus: "0.25" + memory: 256M + + app-python: + build: + context: ../app_python + image: devops-info-service:lab08 + container_name: app-python + environment: + HOST: "0.0.0.0" + PORT: "8080" + DEBUG: "false" + ports: + - "8000:8080" + labels: + logging: "promtail" + app: "devops-python" + networks: + - logging + depends_on: + promtail: + condition: service_healthy + prometheus: + condition: service_healthy + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + deploy: + resources: + limits: + cpus: "0.50" + memory: 256M + reservations: + cpus: "0.10" + memory: 128M + +volumes: + prometheus-data: + loki-data: + grafana-data: + promtail-positions: + +networks: + logging: + name: monitoring-logging diff --git a/monitoring/docs/LAB07.md b/monitoring/docs/LAB07.md new file mode 100644 index 0000000000..5d0e3cdfc7 --- /dev/null +++ b/monitoring/docs/LAB07.md @@ -0,0 +1,162 @@ +# LAB07 - 
Observability & Logging with Loki Stack + +## 1. Architecture + +```text ++------------------+ +------------------+ +| app-python | stdout | Docker Engine | +| (Flask, JSON) +-------->+ json-file logs | ++------------------+ +---------+--------+ + | + | docker_sd_configs + v + +--------+--------+ + | Promtail 3.0 | + | label filter: | + | logging=promtail| + +--------+--------+ + | + | /loki/api/v1/push + v + +--------+--------+ + | Loki 3.0 | + | TSDB + FS store | + | retention 168h | + +--------+--------+ + | + | datasource + v + +--------+--------+ + | Grafana 12.3.1 | + | Explore + Dash | + +-----------------+ +``` + +## 2. Setup Guide + +1. Create secrets file: +```bash +cd monitoring +cp .env.example .env +# set a strong GRAFANA_ADMIN_PASSWORD in .env +``` + +2. Build and start stack: +```bash +docker compose up -d --build +docker compose ps +``` + +3. Verify services: +```bash +curl -fsS http://localhost:3100/ready +curl -fsS http://localhost:9080/targets +curl -fsS http://localhost:3000/api/health +``` + +4. Generate logs: +```bash +for i in {1..20}; do curl -fsS http://localhost:8000/ > /dev/null; done +for i in {1..20}; do curl -fsS http://localhost:8000/health > /dev/null; done +``` + +## 3. Configuration + +### Loki (`monitoring/loki/config.yml`) +- `schema: v13` and `store: tsdb` for Loki 3.0 single-binary setup. +- `object_store: filesystem` for local persistent storage in `loki-data` volume. +- Retention configured with: +```yaml +limits_config: + retention_period: 168h +compactor: + retention_enabled: true +``` + +### Promtail (`monitoring/promtail/config.yml`) +- Docker service discovery via Docker socket. +- Promtail collects only containers labeled `logging=promtail`. +- Relabeling maps: + - `container` from `__meta_docker_container_name` + - `app` from `__meta_docker_container_label_app` + +## 4. Application Logging + +Structured JSON logging implemented in `app_python/app.py`: +- Custom `JSONFormatter` for every log line. 
+- Startup event (`application_startup`) with app metadata. +- Request started/completed logs with context: + - `method`, `path`, `status_code`, `client_ip` +- Error logs use `logging.exception(...)` with request context. + +Example log line: +```json +{"timestamp":"2026-03-12T16:20:00+00:00","level":"INFO","message":"http_request_completed","method":"GET","path":"/health","status_code":200,"client_ip":"127.0.0.1","app":"devops-info-service"} +``` + +## 5. Dashboard + +Dashboard is provisioned from: +- `monitoring/grafana/dashboards/lab07-logs-dashboard.json` + +Panels and queries: +1. Logs Table: `{app=~"devops-.*"}` +2. Request Rate: `sum by (app) (rate({app=~"devops-.*"}[1m]))` +3. Error Logs: `{app=~"devops-.*"} | json | level="ERROR"` +4. Log Level Distribution: `sum by (level) (count_over_time({app=~"devops-.*"} | json [5m]))` + +## 6. Production Config + +Implemented hardening and operational settings: +- Anonymous Grafana access disabled (`GF_AUTH_ANONYMOUS_ENABLED=false`). +- Admin password comes from `.env` (not committed). +- Resource constraints for all services (`deploy.resources`). +- Health checks enabled for Loki, Promtail, Grafana, and app container. +- Persistent named volumes for Loki and Grafana data. + +## 7. Testing + +### Automated tests +```bash +pytest app_python/tests monitoring/tests +``` + +### Compose validation +```bash +cd monitoring +docker compose config +``` + +### Runtime checks +```bash +docker compose ps +curl -fsS http://localhost:3100/ready +curl -fsS http://localhost:9080/targets +curl -fsS http://localhost:3000/api/health +``` + +### LogQL checks +```logql +{app="devops-python"} +{app="devops-python"} |= "ERROR" +{app="devops-python"} | json | method="GET" +``` + +## 8. Challenges + +1. Loki 3.0 config format changed compared to older tutorials. +- Solution: use `common` section + TSDB schema `v13`. + +2. Keeping dashboard reproducible for grading. +- Solution: provision datasource and dashboard from repo files. + +3. 
Verifying logging requirements with tests. +- Solution: add pytest coverage for JSON formatter, startup event, and request completion context. + +## Evidence + + `monitoring/docs/screenshots/`: +- `grafana-explore-app-logs.png` (logs from 3+ containers) +- `grafana-dashboard-4-panels.png` +- `grafana-login-no-anonymous.png` +- `docker-compose-ps-healthy.png` diff --git a/monitoring/docs/LAB08.md b/monitoring/docs/LAB08.md new file mode 100644 index 0000000000..77ae38395a --- /dev/null +++ b/monitoring/docs/LAB08.md @@ -0,0 +1,182 @@ +# LAB08 - Metrics & Monitoring with Prometheus + +## 1. Architecture + +```text ++---------------------+ scrape /metrics +---------------------+ +| app-python (Flask) |------------------------------> | Prometheus 3.9.0 | +| :8080 | | :9090 | +| - RED HTTP metrics | | TSDB retention 15d | +| - app metrics | +----------+----------+ ++----------+----------+ | + | | PromQL + | logs v + v +----------+----------+ ++----------+----------+ | Grafana 12.3.1 | +| Promtail 3.0.0 |------------------------------> | dashboards + explore | ++----------+----------+ push logs +---------------------+ + | + v ++----------+----------+ +| Loki 3.0.0 | ++---------------------+ +``` + +## 2. Application Instrumentation + +Implemented in `app_python/app.py` with `prometheus-client==0.23.1`. + +### Added endpoints +- `GET /metrics` for Prometheus scrape. + +### Added metrics +- `http_requests_total{method,endpoint,status_code}` (Counter): total request count. +- `http_request_duration_seconds{method,endpoint}` (Histogram): request latency distribution. +- `http_requests_in_progress` (Gauge): concurrent in-flight requests. +- `devops_info_endpoint_calls_total{endpoint}` (Counter): endpoint usage. +- `devops_info_system_collection_seconds` (Histogram): system-info build time for `/` endpoint. 
+ +### Metric design choices +- RED coverage: + - Rate: `http_requests_total` via `rate(...)` + - Errors: `http_requests_total{status_code=~"5.."}` + - Duration: `http_request_duration_seconds` +- Endpoint label normalization prevents high-cardinality labels. + +## 3. Prometheus Configuration + +File: `monitoring/prometheus/prometheus.yml` + +- `scrape_interval: 15s` +- Jobs: + - `prometheus` -> `localhost:9090` + - `app` -> `app-python:8080/metrics` + - `loki` -> `loki:3100/metrics` + - `grafana` -> `grafana:3000/metrics` + +Retention is configured in compose command flags: +- `--storage.tsdb.retention.time=15d` +- `--storage.tsdb.retention.size=10GB` + +## 4. Dashboard Walkthrough + +Provisioned dashboard: +- `monitoring/grafana/dashboards/lab08-metrics-dashboard.json` + +Panels (7 total): +1. `Request Rate`: +- `sum(rate(http_requests_total[5m])) by (endpoint)` +2. `Error Rate`: +- `sum(rate(http_requests_total{status_code=~"5.."}[5m]))` +3. `Request Duration p95`: +- `histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))` +4. `Request Duration Heatmap`: +- `sum by (le) (rate(http_request_duration_seconds_bucket[5m]))` +5. `Active Requests`: +- `http_requests_in_progress` +6. `Status Code Distribution`: +- `sum by (status_code) (rate(http_requests_total[5m]))` +7. `Uptime`: +- `up{job="app"}` + +### Community dashboards +- Prometheus dashboard ID: `3662` (Prometheus Stats) +- Loki dashboard ID: `13407` (Loki Dashboard) +- Import path in Grafana: `Dashboards -> New -> Import` + +## 5. PromQL Examples + +1. Request throughput by endpoint: +```promql +sum(rate(http_requests_total[5m])) by (endpoint) +``` + +2. Total 5xx error rate: +```promql +sum(rate(http_requests_total{status_code=~"5.."}[5m])) +``` + +3. p95 request latency: +```promql +histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m]))) +``` + +4. Current active requests: +```promql +http_requests_in_progress +``` + +5. 
Service availability check: +```promql +up{job="app"} +``` + +6. Endpoint call intensity (business metric): +```promql +sum(rate(devops_info_endpoint_calls_total[5m])) by (endpoint) +``` + +## 6. Production Setup + +`monitoring/docker-compose.yml` includes: + +- Health checks: + - Prometheus `/-/healthy` + - Loki `/ready` + - Grafana `/api/health` + - app `/health` + - Promtail binary check +- Resource limits: + - Prometheus `1G / 1.0 CPU` + - Loki `1G / 1.0 CPU` + - Grafana `512M / 0.50 CPU` + - app `256M / 0.50 CPU` +- Persistence volumes: + - `prometheus-data` + - `loki-data` + - `grafana-data` + - `promtail-positions` +- Retention policy: + - Prometheus: 15d or 10GB (whichever first) + - Loki: `retention_period: 168h` + +## 7. Testing Results + +### Local checks +```bash +pytest app_python/tests monitoring/tests +cd monitoring && docker compose config +``` + +### Runtime verification commands +```bash +cd monitoring +docker compose up -d --build +docker compose ps +curl -fsS http://localhost:9090/-/healthy +curl -fsS http://localhost:9090/api/v1/targets +curl -fsS http://localhost:8000/metrics | head -n 40 +``` + +### Screenshots to include in `monitoring/docs/screenshots/` +- `lab08-prometheus-targets-up.png` +- `lab08-prometheus-up-query.png` +- `lab08-grafana-6plus-panels.png` +- `lab08-compose-healthy.png` +- `lab08-persistence-proof.png` + +### Metrics vs Logs (Lab07 comparison) +- Use metrics for trends/SLOs/alerts (rates, error %, latency percentiles). +- Use logs for event details and root-cause drill-down. +- Combined workflow: alert from metric spike -> inspect correlated logs in Loki. + +## 8. Challenges & Solutions + +1. Label mismatch between dashboard examples and task label requirement. +- Solution: standardized on `status_code` label in app and dashboard queries. + +2. Grafana self-metrics scrape target reliability. +- Solution: enabled Grafana metrics with `GF_METRICS_ENABLED=true` and added Prometheus scrape job. + +3. 
Keeping stack reproducible for grading. +- Solution: provisioned Prometheus datasource + versioned dashboard JSON in repo. diff --git a/monitoring/docs/screenshots/compose-ps-healthy.png b/monitoring/docs/screenshots/compose-ps-healthy.png new file mode 100644 index 0000000000..3bef59ef77 Binary files /dev/null and b/monitoring/docs/screenshots/compose-ps-healthy.png differ diff --git a/monitoring/docs/screenshots/grafana-dashboard-4-panels.png b/monitoring/docs/screenshots/grafana-dashboard-4-panels.png new file mode 100644 index 0000000000..0fb6a6777d Binary files /dev/null and b/monitoring/docs/screenshots/grafana-dashboard-4-panels.png differ diff --git a/monitoring/docs/screenshots/grafana-explore-app-logs.png b/monitoring/docs/screenshots/grafana-explore-app-logs.png new file mode 100644 index 0000000000..b914180453 Binary files /dev/null and b/monitoring/docs/screenshots/grafana-explore-app-logs.png differ diff --git a/monitoring/docs/screenshots/grafana-login-no-anonymous.png b/monitoring/docs/screenshots/grafana-login-no-anonymous.png new file mode 100644 index 0000000000..2ee2e35a0a Binary files /dev/null and b/monitoring/docs/screenshots/grafana-login-no-anonymous.png differ diff --git a/monitoring/docs/screenshots/lab08-compose-healthy.png b/monitoring/docs/screenshots/lab08-compose-healthy.png new file mode 100644 index 0000000000..54d807f1de Binary files /dev/null and b/monitoring/docs/screenshots/lab08-compose-healthy.png differ diff --git a/monitoring/docs/screenshots/lab08-grafana-6plus-panels.png b/monitoring/docs/screenshots/lab08-grafana-6plus-panels.png new file mode 100644 index 0000000000..8f23870c53 Binary files /dev/null and b/monitoring/docs/screenshots/lab08-grafana-6plus-panels.png differ diff --git a/monitoring/docs/screenshots/lab08-persistence-proof.png b/monitoring/docs/screenshots/lab08-persistence-proof.png new file mode 100644 index 0000000000..efd8b8bea7 Binary files /dev/null and 
b/monitoring/docs/screenshots/lab08-persistence-proof.png differ diff --git a/monitoring/docs/screenshots/lab08-prometheus-targets-up.png b/monitoring/docs/screenshots/lab08-prometheus-targets-up.png new file mode 100644 index 0000000000..dcc17fb1bf Binary files /dev/null and b/monitoring/docs/screenshots/lab08-prometheus-targets-up.png differ diff --git a/monitoring/docs/screenshots/lab08-prometheus-up-query.png b/monitoring/docs/screenshots/lab08-prometheus-up-query.png new file mode 100644 index 0000000000..4be55af935 Binary files /dev/null and b/monitoring/docs/screenshots/lab08-prometheus-up-query.png differ diff --git a/monitoring/grafana/dashboards/lab07-logs-dashboard.json b/monitoring/grafana/dashboards/lab07-logs-dashboard.json new file mode 100644 index 0000000000..02dd43410b --- /dev/null +++ b/monitoring/grafana/dashboards/lab07-logs-dashboard.json @@ -0,0 +1,236 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "{app=~\"devops-.*\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Logs Table", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "palette-classic" + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "sum by (app) (rate({app=~\"devops-.*\"}[1m]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 3, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "{app=~\"devops-.*\"} | json | level=\"ERROR\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "Error Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 4, + "options": { + "displayLabels": [ + "name", + "percent", + "value" + ], + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + 
"expr": "sum by (level) (count_over_time({app=~\"devops-.*\"} | json [5m]))", + "queryType": "range", + "refId": "A" + } + ], + "title": "Log Level Distribution", + "type": "piechart" + } + ], + "refresh": "10s", + "schemaVersion": 41, + "style": "dark", + "tags": [ + "lab07", + "logging" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Lab07 - Loki Logs", + "uid": "lab07-loki-logs", + "version": 1, + "weekStart": "" +} diff --git a/monitoring/grafana/dashboards/lab08-metrics-dashboard.json b/monitoring/grafana/dashboards/lab08-metrics-dashboard.json new file mode 100644 index 0000000000..27a08a1c56 --- /dev/null +++ b/monitoring/grafana/dashboards/lab08-metrics-dashboard.json @@ -0,0 +1,446 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(http_requests_total[5m])) by (endpoint)", + "legendFormat": "{{endpoint}}", + "range": true, + "refId": "A" + } + ], + "title": "Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + 
"defaults": { + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m]))", + "legendFormat": "5xx", + "range": true, + "refId": "A" + } + ], + "title": "Error Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))", + "legendFormat": "p95", + "range": true, + "refId": "A" + } + ], + "title": "Request Duration p95", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "mode": "opacity" + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "sort": "none" + }, + "yAxis": { + "axisPlacement": "left" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (le) 
(rate(http_request_duration_seconds_bucket[5m]))", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Request Duration Heatmap", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "http_requests_in_progress", + "range": true, + "refId": "A" + } + ], + "title": "Active Requests", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 6, + "options": { + "displayLabels": [ + "name", + "percent", + "value" + ], + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (status_code) (rate(http_requests_total[5m]))", + 
"legendFormat": "{{status_code}}", + "range": true, + "refId": "A" + } + ], + "title": "Status Code Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "text": "DOWN" + }, + "1": { + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 16 + }, + "id": 7, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "up{job=\"app\"}", + "range": true, + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat" + } + ], + "refresh": "10s", + "schemaVersion": 41, + "style": "dark", + "tags": [ + "lab08", + "metrics", + "prometheus" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Lab08 - Application Metrics", + "uid": "lab08-app-metrics", + "version": 1, + "weekStart": "" +} diff --git a/monitoring/grafana/provisioning/dashboards/dashboards.yml b/monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000000..9580485b73 --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: + - name: lab07-logging + orgId: 1 + folder: Lab07 + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards diff --git a/monitoring/grafana/provisioning/datasources/loki.yml 
b/monitoring/grafana/provisioning/datasources/loki.yml new file mode 100644 index 0000000000..f89a52a985 --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/loki.yml @@ -0,0 +1,10 @@ +apiVersion: 1 + +datasources: + - name: Loki + type: loki + uid: loki + access: proxy + url: http://loki:3100 + isDefault: true + editable: false diff --git a/monitoring/grafana/provisioning/datasources/prometheus.yml b/monitoring/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 0000000000..8b15eba066 --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,10 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + uid: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: false + editable: false diff --git a/monitoring/loki/config.yml b/monitoring/loki/config.yml new file mode 100644 index 0000000000..608f5720e2 --- /dev/null +++ b/monitoring/loki/config.yml @@ -0,0 +1,47 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + delete_request_store: filesystem + +limits_config: + retention_period: 168h + allow_structured_metadata: false + +ruler: + alertmanager_url: http://localhost:9093 diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000000..f9a3461d0c --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,23 @@ +global: + 
scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'app' + metrics_path: /metrics + static_configs: + - targets: ['app-python:8080'] + + - job_name: 'loki' + metrics_path: /metrics + static_configs: + - targets: ['loki:3100'] + + - job_name: 'grafana' + metrics_path: /metrics + static_configs: + - targets: ['grafana:3000'] diff --git a/monitoring/promtail/config.yml b/monitoring/promtail/config.yml new file mode 100644 index 0000000000..62895a3017 --- /dev/null +++ b/monitoring/promtail/config.yml @@ -0,0 +1,28 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: label + values: ["logging=promtail"] + relabel_configs: + - source_labels: ["__meta_docker_container_name"] + regex: "/(.*)" + target_label: "container" + - source_labels: ["__meta_docker_container_label_app"] + target_label: "app" + - target_label: "job" + replacement: "docker" + pipeline_stages: + - docker: {} diff --git a/monitoring/tests/test_lab07_files.py b/monitoring/tests/test_lab07_files.py new file mode 100644 index 0000000000..4348f211f3 --- /dev/null +++ b/monitoring/tests/test_lab07_files.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +import json +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] + + +def _read(path: str) -> str: + return (ROOT / path).read_text(encoding="utf-8") + + +def test_compose_has_required_services_and_images(): + compose = _read("docker-compose.yml") + assert "grafana/loki:3.0.0" in compose + assert "grafana/promtail:3.0.0" in compose + assert "grafana/grafana:12.3.1" in compose + assert "app-python:" in compose + + +def test_compose_has_production_settings(): + compose = 
_read("docker-compose.yml") + assert "GF_AUTH_ANONYMOUS_ENABLED: \"false\"" in compose + assert "GF_SECURITY_ADMIN_PASSWORD" in compose + assert "healthcheck:" in compose + assert "deploy:" in compose + assert "resources:" in compose + + +def test_promtail_filters_promtail_label_and_extracts_app_label(): + config = _read("promtail/config.yml") + assert "logging=promtail" in config + assert "__meta_docker_container_name" in config + assert "__meta_docker_container_label_app" in config + assert "target_label: \"app\"" in config + + +def test_loki_uses_tsdb_v13_and_retention(): + config = _read("loki/config.yml") + assert "store: tsdb" in config + assert "schema: v13" in config + assert "retention_period: 168h" in config + + +def test_dashboard_has_required_panels_and_queries(): + dashboard_path = ROOT / "grafana/dashboards/lab07-logs-dashboard.json" + dashboard = json.loads(dashboard_path.read_text(encoding="utf-8")) + + panels = dashboard["panels"] + titles = {panel["title"] for panel in panels} + assert titles == { + "Logs Table", + "Request Rate", + "Error Logs", + "Log Level Distribution", + } + + queries = {panel["title"]: panel["targets"][0]["expr"] for panel in panels} + assert queries["Logs Table"] == '{app=~"devops-.*"}' + assert queries["Request Rate"] == 'sum by (app) (rate({app=~"devops-.*"}[1m]))' + assert queries["Error Logs"] == '{app=~"devops-.*"} | json | level="ERROR"' + assert queries["Log Level Distribution"] == ( + 'sum by (level) (count_over_time({app=~"devops-.*"} | json [5m]))' + ) diff --git a/monitoring/tests/test_lab08_files.py b/monitoring/tests/test_lab08_files.py new file mode 100644 index 0000000000..5b6892b2e4 --- /dev/null +++ b/monitoring/tests/test_lab08_files.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import json +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] + + +def _read(path: str) -> str: + return (ROOT / path).read_text(encoding="utf-8") + + +def 
test_compose_includes_prometheus_and_retention_flags(): + compose = _read("docker-compose.yml") + assert "prom/prometheus:v3.9.0" in compose + assert "--storage.tsdb.retention.time=15d" in compose + assert "--storage.tsdb.retention.size=10GB" in compose + assert "prometheus-data" in compose + + +def test_compose_has_required_resource_limits(): + compose = _read("docker-compose.yml") + assert "loki:" in compose + assert "grafana:" in compose + assert "app-python:" in compose + assert "memory: 1G" in compose + assert "cpus: \"1.0\"" in compose + assert "memory: 512M" in compose + assert "cpus: \"0.50\"" in compose + assert "memory: 256M" in compose + + +def test_prometheus_scrape_config_contains_all_jobs(): + config = _read("prometheus/prometheus.yml") + assert "scrape_interval: 15s" in config + assert "job_name: 'prometheus'" in config + assert "job_name: 'app'" in config + assert "job_name: 'loki'" in config + assert "job_name: 'grafana'" in config + assert "app-python:8080" in config + + +def test_prometheus_datasource_is_provisioned(): + datasource = _read("grafana/provisioning/datasources/prometheus.yml") + assert "type: prometheus" in datasource + assert "uid: prometheus" in datasource + assert "url: http://prometheus:9090" in datasource + + +def test_lab08_dashboard_has_required_panels_and_queries(): + dashboard_path = ROOT / "grafana/dashboards/lab08-metrics-dashboard.json" + dashboard = json.loads(dashboard_path.read_text(encoding="utf-8")) + + panels = dashboard["panels"] + titles = {panel["title"] for panel in panels} + assert titles == { + "Request Rate", + "Error Rate", + "Request Duration p95", + "Request Duration Heatmap", + "Active Requests", + "Status Code Distribution", + "Uptime", + } + + queries = {panel["title"]: panel["targets"][0]["expr"] for panel in panels} + assert queries["Request Rate"] == "sum(rate(http_requests_total[5m])) by (endpoint)" + assert queries["Error Rate"] == 'sum(rate(http_requests_total{status_code=~"5.."}[5m]))' + assert 
queries["Request Duration p95"] == ( + "histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))" + ) + assert queries["Request Duration Heatmap"] == ( + "sum by (le) (rate(http_request_duration_seconds_bucket[5m]))" + ) + assert queries["Active Requests"] == "http_requests_in_progress" + assert queries["Status Code Distribution"] == "sum by (status_code) (rate(http_requests_total[5m]))" + assert queries["Uptime"] == 'up{job="app"}'