diff --git a/.github/workflows/ansible-deploy.yml b/.github/workflows/ansible-deploy.yml
new file mode 100644
index 0000000000..3b61ba1171
--- /dev/null
+++ b/.github/workflows/ansible-deploy.yml
@@ -0,0 +1,75 @@
+name: Ansible Deployment
+
+on:
+  push:
+    paths:
+      - 'ansible/**'
+      - '.github/workflows/ansible-deploy.yml'
+
+jobs:
+
+  lint:
+    name: Ansible Lint
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install Ansible
+        run: |
+          pip install ansible ansible-lint
+
+      - name: Run ansible-lint
+        run: |
+          cd ansible
+          ansible-lint playbooks/*.yml
+
+  deploy:
+    name: Deploy Application
+    needs: lint
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install Ansible
+        run: pip install ansible
+
+      - name: Setup SSH
+        run: |
+          mkdir -p $HOME/.ssh
+          chmod 700 $HOME/.ssh
+
+          echo "${{ secrets.SSH_PRIVATE_KEY }}" | tr -d '\r' > $HOME/.ssh/id_rsa
+          chmod 600 $HOME/.ssh/id_rsa
+
+          ssh-keyscan -H ${{ secrets.VM_HOST }} >> $HOME/.ssh/known_hosts
+          chmod 644 $HOME/.ssh/known_hosts
+
+          ls -la $HOME/.ssh
+
+      - name: Deploy with Ansible
+        run: |
+          cd ansible
+          echo "${{ secrets.ANSIBLE_VAULT_PASSWORD }}" > /tmp/vault_pass
+          # Ensure the vault password never outlives this step, even on failure.
+          trap 'rm -f /tmp/vault_pass' EXIT
+          ansible-playbook playbooks/deploy.yml \
+            -i inventory/hosts.ini \
+            --vault-password-file /tmp/vault_pass
+
+      - name: Verify deployment
+        run: |
+          sleep 10
+          curl -fsS --retry 5 --retry-connrefused http://${{ secrets.VM_HOST }}:5000
\ No newline at end of file
diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
new file mode 100644
index 0000000000..d61e727044
--- /dev/null
+++ b/.github/workflows/python-ci.yml
@@ -0,0 +1,83 @@
+name: Python CI
+
+on:
+  [push, pull_request]
+
+permissions:
+  contents: read
+
+jobs:
+
+  test:
+    name: Lint & Tests
+    runs-on: 
ubuntu-latest
+    timeout-minutes: 10
+
+    strategy:
+      fail-fast: true
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+
+      - name: Cache pip
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r app_python/requirements.txt
+          pip install pytest ruff
+
+      - name: Lint
+        run: ruff check .
+
+      - name: Run tests
+        run: pytest
+
+      - name: Setup Snyk
+        uses: snyk/actions/setup@master
+
+      - name: Run Snyk
+        run: snyk test --file=app_python/requirements.txt
+        env:
+          SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }}
+
+
+  docker:
+    name: Build & Push Docker
+    needs: test
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set version (CalVer)
+        id: version
+        run: |
+          echo "VERSION=$(date +'%Y.%m')" >> $GITHUB_OUTPUT
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: ./app_python
+          file: ./app_python/Dockerfile
+          push: true
+          tags: |
+            ${{ secrets.DOCKERHUB_USERNAME }}/app_python:${{ steps.version.outputs.VERSION }}
+            ${{ secrets.DOCKERHUB_USERNAME }}/app_python:latest
diff --git a/.gitignore b/.gitignore
index 30d74d2584..b11c2737e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,10 @@
-test
\ No newline at end of file
+test
+
+# Ansible
+*.retry
+.vault_pass
+ansible/inventory/*.pyc
+__pycache__/
+
+# Monitoring secrets
+monitoring/.env
\ No newline at end of file
diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg
new file mode 100644
index 0000000000..0eb7d7e509
--- /dev/null
+++ b/ansible/ansible.cfg
@@ -0,0 +1,10 @@
+[defaults]
+inventory = inventory/hosts.ini
+roles_path = roles
+host_key_checking 
= False +retry_files_enabled = False + +[privilege_escalation] +become = True +become_method = sudo +become_user = root \ No newline at end of file diff --git a/ansible/docs/LAB05.md b/ansible/docs/LAB05.md new file mode 100644 index 0000000000..5cfd5c2da3 --- /dev/null +++ b/ansible/docs/LAB05.md @@ -0,0 +1,100 @@ +# 1. Architecture Overview + +### Ansible version: 2.12.5 +### Target VM OS: Ubuntu 24.04 LTS +### Role structure: +```text +ansible/ +├── inventory/ +│ └── hosts.ini +├── roles/ +│ ├── common/ +│ ├── docker/ +│ └── app_deploy/ +├── playbooks/ +│ ├── site.yml +│ ├── provision.yml +│ └── deploy.yml +├── group_vars/ +│ └── all.yml +├── ansible.cfg +└── docs/ + └── LAB05.md +``` + +### Why roles: +- They allow you to separate tasks by functionality. +- They improve code reuse. +- They are easy to maintain and test independently. + +# 2. Roles Documentation +### 2.1 common +- Purpose: Basic system setup (apt update, package installation, time settings). +- Variables: list of packages defaults/main.yml. +- Handlers: no. +- Dependencies: no. + +### 2.2 docker +- Purpose: Install and run Docker, add a user to the docker group. +- Variables: Docker version, username. +- Handlers: restart docker service. +- Dependencies: common. +### 2.3 app_deploy +- Purpose: deploying a Python container. +- Variables: Docker Hub username, password, application name, port, container name, image tag. +- Handlers: restart the container if necessary. +- Dependencies: docker. + +# 3. Idempotency Demonstration + +### First run of playbook deploy.yml: +![First run](ansible/docs/screenshots/img.png) + +### Second run of playbook deploy.yml: +![Second run](ansible/docs/screenshots/img_1.png) + +### Analysis: + +- `ok` — tasks where no changes were made (e.g., a port is already open). +- `changed` — tasks that updated the state (pull, run, restart the container). 
+The roles are idempotent by design, but starting a container always reports "changed" because we're deleting the old one and creating a new one.
+
+# 4. Ansible Vault Usage
+
+### Vault file: `group_vars/all.yml`
+Content:
+```yml
+dockerhub_username: th1ef
+dockerhub_password: my_password
+app_name: devops-info-service
+docker_image: "{{ dockerhub_username }}/{{ app_name }}"
+docker_image_tag: latest
+app_port: 5000
+app_container_name: "{{ app_name }}"
+```
+
+# 5. Deployment Verification
+### Container condition:
+![Container condition](screenshots/img_2.png)
+
+### Health check:
+![Health check](screenshots/img_3.png)
+
+
+# 6. Key Decisions
+
+- **Why roles instead of monolithic playbooks?** \
+They allow you to structure tasks and are easier to maintain and test.
+- **How do roles improve reusability?** \
+You can use a single role across different VMs and projects without duplicating code.
+- **What makes a task idempotent?** \
+The module checks the current state and makes changes only when necessary (state: present, state: started).
+- **How do handlers improve efficiency?** \
+They are executed only when the state changes, preventing unnecessary service restarts.
+- **Why Ansible Vault?** \
+To securely store sensitive data (passwords, tokens) in the repository.
+
+# 7. Challenges
+- Errors when logging into Docker Hub without the community.docker collection installed.
+- Incorrect SSH key permissions in the container.
+- Pull errors if the image wasn't on Docker Hub — resolved by uploading the image using your own account.
\ No newline at end of file diff --git a/ansible/docs/screenshots/img.png b/ansible/docs/screenshots/img.png new file mode 100644 index 0000000000..e60207907f Binary files /dev/null and b/ansible/docs/screenshots/img.png differ diff --git a/ansible/docs/screenshots/img_1.png b/ansible/docs/screenshots/img_1.png new file mode 100644 index 0000000000..876465500d Binary files /dev/null and b/ansible/docs/screenshots/img_1.png differ diff --git a/ansible/docs/screenshots/img_2.png b/ansible/docs/screenshots/img_2.png new file mode 100644 index 0000000000..0aa540223b Binary files /dev/null and b/ansible/docs/screenshots/img_2.png differ diff --git a/ansible/docs/screenshots/img_3.png b/ansible/docs/screenshots/img_3.png new file mode 100644 index 0000000000..2c88e85601 Binary files /dev/null and b/ansible/docs/screenshots/img_3.png differ diff --git a/ansible/group_vars/all.yml b/ansible/group_vars/all.yml new file mode 100644 index 0000000000..52658cb081 --- /dev/null +++ b/ansible/group_vars/all.yml @@ -0,0 +1,23 @@ +$ANSIBLE_VAULT;1.1;AES256 +33303266313739353338396463653730643034633662316330616232646135633331336434396235 +6366396537663866393138626166613733386230636234650a303061336566353166303233653338 +62323565393239613333646162313930313363323761373331326364303438336437346235653337 +3034396131386262660a653637646338623762303763326238663339326565323364313137306561 +63666438396263373864353032333632623366396666643835313662653966616532643566343331 +38613662616235653162656530666661663934613730363364636434643731643966333738646532 +32626361353164653939623439373631353566356532366132633166383137326335613063366566 +65333630623338373362366363663836623730613062366465356262396132363365663661363732 +31313561666339646262383162393739393531633832616132653661323164633361376137613930 +39616430346330646231636339653063633233373836313738666535393232373366646665303763 +38333533383236383235366465666336363038386332333336393532333664623432336266356564 
+35333164656161376538393136373033373634613263313236336536656633643339303963396666 +35656132353061333061333661376566353461363335343936326437383230623830616239626230 +38356632656364306130376338333737386239323361646337316661373161313435666264326263 +34306638633239646363363537303862333764623832333466383039343632316533666232663330 +38363134363234363262666336386331376662383335396334386566653036633436353833333237 +65313161393535393035656332326362363535363430663961316366613030643131353662356331 +61623238613462326166363330383337363030653635633236666437386233363737383137316263 +62313636386532633931343963643761306434376331616438326435313930316533653332646537 +30353831363266383937623862306230356661323935343766643039363131343437356130663662 +33643439386138626431633736396637326333643463353863316538633933633033626530326333 +3531633630643632363131653131623336336331343362396231 diff --git a/ansible/inventory/hosts.ini b/ansible/inventory/hosts.ini new file mode 100644 index 0000000000..eff8edfac0 --- /dev/null +++ b/ansible/inventory/hosts.ini @@ -0,0 +1,2 @@ +[webservers] +vm ansible_host=93.77.177.133 ansible_user=ubuntu \ No newline at end of file diff --git a/ansible/playbooks/deploy-monitoring.yml b/ansible/playbooks/deploy-monitoring.yml new file mode 100644 index 0000000000..1678029800 --- /dev/null +++ b/ansible/playbooks/deploy-monitoring.yml @@ -0,0 +1,6 @@ +--- +- name: Deploy monitoring stack (Loki + Promtail + Prometheus + Grafana) + hosts: webservers + become: true + roles: + - monitoring diff --git a/ansible/playbooks/deploy.yml b/ansible/playbooks/deploy.yml new file mode 100644 index 0000000000..ea6883990b --- /dev/null +++ b/ansible/playbooks/deploy.yml @@ -0,0 +1,8 @@ +--- +- name: Deploy Python Application + hosts: webservers + become: true + vars_files: + - group_vars/all.yml + roles: + - role: web_app diff --git a/ansible/playbooks/provision.yml b/ansible/playbooks/provision.yml new file mode 100644 index 0000000000..4da8f51873 --- /dev/null +++ 
b/ansible/playbooks/provision.yml @@ -0,0 +1,7 @@ +--- +- name: Provision Web Servers + hosts: webservers + become: true + roles: + - role: common + - role: docker diff --git a/ansible/roles/common/defaults/main.yml b/ansible/roles/common/defaults/main.yml new file mode 100644 index 0000000000..2282a109a1 --- /dev/null +++ b/ansible/roles/common/defaults/main.yml @@ -0,0 +1,7 @@ +--- +common_packages: + - python3-pip + - curl + - git + - vim + - htop diff --git a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml new file mode 100644 index 0000000000..2834ecb17a --- /dev/null +++ b/ansible/roles/common/tasks/main.yml @@ -0,0 +1,55 @@ +--- +- name: Install base packages + become: true + tags: + - packages + - common + block: + + - name: Update apt cache + ansible.builtin.apt: + update_cache: true + cache_valid_time: 3600 + + - name: Install required packages + ansible.builtin.apt: + name: + - curl + - git + - vim + state: present + + rescue: + - name: Fix missing packages and retry update + ansible.builtin.apt: + update_cache: true + force_apt_get: true + become: true + + always: + - name: Log package block completion + ansible.builtin.copy: + content: "Package installation block completed\n" + dest: /tmp/common_packages.log + mode: "0644" + +- name: Manage system users + become: true + tags: + - users + - common + block: + + - name: Ensure deploy user exists + ansible.builtin.user: + name: deploy + groups: sudo + append: true + state: present + + always: + - name: Log user block completion + ansible.builtin.copy: + content: "User management block completed\n" + dest: /tmp/common_users.log + mode: "0644" diff --git a/ansible/roles/docker/defaults/main.yml b/ansible/roles/docker/defaults/main.yml new file mode 100644 index 0000000000..e64d3b7e66 --- /dev/null +++ b/ansible/roles/docker/defaults/main.yml @@ -0,0 +1,2 @@ +--- +docker_user: ubuntu diff --git a/ansible/roles/docker/handlers/main.yml b/ansible/roles/docker/handlers/main.yml new file 
mode 100644 index 0000000000..ea184a88eb --- /dev/null +++ b/ansible/roles/docker/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: Restart Docker + become: true + ansible.builtin.service: + name: docker + state: restarted diff --git a/ansible/roles/docker/tasks/main.yml b/ansible/roles/docker/tasks/main.yml new file mode 100644 index 0000000000..cb95caca26 --- /dev/null +++ b/ansible/roles/docker/tasks/main.yml @@ -0,0 +1,65 @@ +--- +- name: Install Docker + become: true + tags: + - docker_install + - docker + block: + + - name: Add Docker GPG key + ansible.builtin.apt_key: + url: https://download.docker.com/linux/ubuntu/gpg + state: present + + - name: Add Docker repository + ansible.builtin.apt_repository: + repo: "deb [arch=amd64] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable" + state: present + + - name: Update apt cache + ansible.builtin.apt: + update_cache: true + + - name: Install Docker packages + ansible.builtin.apt: + name: + - docker-ce + - docker-ce-cli + - containerd.io + state: present + + rescue: + - name: Wait before retry + ansible.builtin.pause: + seconds: 10 + + - name: Retry apt update + ansible.builtin.apt: + update_cache: true + + always: + - name: Ensure Docker service is enabled and started + ansible.builtin.service: + name: docker + state: started + enabled: true + +- name: Configure Docker + become: true + tags: + - docker_config + - docker + block: + + - name: Add deploy user to docker group + ansible.builtin.user: + name: deploy + groups: docker + append: true + + always: + - name: Log docker configuration completion + ansible.builtin.copy: + content: "Docker configuration completed\n" + dest: /tmp/docker_config.log + mode: "0644" diff --git a/ansible/roles/monitoring/defaults/main.yml b/ansible/roles/monitoring/defaults/main.yml new file mode 100644 index 0000000000..7ad79afaa9 --- /dev/null +++ b/ansible/roles/monitoring/defaults/main.yml @@ -0,0 +1,61 @@ +--- +monitoring_dir: /opt/monitoring + 
+loki_version: "3.0.0" +promtail_version: "3.0.0" +grafana_version: "12.3.1" +prometheus_version: "3.9.0" + +loki_port: 3100 +grafana_port: 3000 +promtail_port: 9080 +prometheus_port: 9090 + +loki_retention_period: "168h" +loki_schema_version: "v13" + +prometheus_retention_days: 15 +prometheus_retention_size: "10GB" +prometheus_scrape_interval: "15s" + +prometheus_targets: + - job: "prometheus" + targets: ["localhost:9090"] + - job: "app" + targets: ["app-python:5000"] + path: "/metrics" + - job: "loki" + targets: ["loki:3100"] + path: "/metrics" + - job: "grafana" + targets: ["grafana:3000"] + path: "/metrics" + +grafana_admin_password: "securepassword123" + +resource_limits: + loki: + cpus: "1.0" + memory: "1G" + cpus_reservation: "0.5" + memory_reservation: "512M" + promtail: + cpus: "0.5" + memory: "512M" + cpus_reservation: "0.25" + memory_reservation: "256M" + grafana: + cpus: "0.5" + memory: "512M" + cpus_reservation: "0.25" + memory_reservation: "256M" + prometheus: + cpus: "1.0" + memory: "1G" + cpus_reservation: "0.5" + memory_reservation: "512M" + app: + cpus: "0.5" + memory: "256M" + cpus_reservation: "0.25" + memory_reservation: "128M" diff --git a/ansible/roles/monitoring/files/grafana-app-logs.json b/ansible/roles/monitoring/files/grafana-app-logs.json new file mode 100644 index 0000000000..32cabbe199 --- /dev/null +++ b/ansible/roles/monitoring/files/grafana-app-logs.json @@ -0,0 +1,131 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "title": "Logs Table", + "type": "logs", + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "{container=~\".+\"}", + "refId": "A" + } + ], + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + 
"enableLogDetails": true, + "sortOrder": "Descending", + "dedupStrategy": "none" + } + }, + { + "title": "Request Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }, + "id": 2, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "sum by (container) (rate({container=~\".+\"} [1m]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 20, + "pointSize": 5, + "showPoints": "auto", + "lineWidth": 2 + }, + "unit": "reqps" + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "multi" } + } + }, + { + "title": "Error Logs", + "type": "logs", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }, + "id": 3, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "{container=~\".+\"} | json | level=\"ERROR\" or level=\"error\"", + "refId": "A" + } + ], + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "sortOrder": "Descending", + "dedupStrategy": "none" + } + }, + { + "title": "Log Level Distribution", + "type": "piechart", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }, + "id": 4, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "sum by (level) (count_over_time({container=~\".+\"} | json [5m]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" } + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "list", "placement": "right" }, + "tooltip": { "mode": "single" }, + "pieType": "pie", + "reduceOptions": { + "calcs": 
["lastNotNull"], + "fields": "", + "values": false + } + } + } + ], + "schemaVersion": 39, + "tags": ["loki", "logs"], + "templating": { "list": [] }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "App Logs", + "uid": "app-logs", + "version": 1 +} diff --git a/ansible/roles/monitoring/files/grafana-app-metrics.json b/ansible/roles/monitoring/files/grafana-app-metrics.json new file mode 100644 index 0000000000..dbdd63afeb --- /dev/null +++ b/ansible/roles/monitoring/files/grafana-app-metrics.json @@ -0,0 +1,191 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "title": "Request Rate by Endpoint", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_requests_total[5m])) by (endpoint)", + "legendFormat": "{{endpoint}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 20, + "pointSize": 5, + "showPoints": "auto" + } + }, + "overrides": [] + } + }, + { + "title": "Error Rate (5xx)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m]))", + "legendFormat": "5xx errors/s", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 20 + }, + "color": { "mode": "fixed", "fixedColor": "red" } + }, + "overrides": [] + } + }, + { + "title": "Request Duration p95", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": 
"histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, endpoint))", + "legendFormat": "p95 {{endpoint}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10 + } + }, + "overrides": [] + } + }, + { + "title": "Request Duration Heatmap", + "type": "heatmap", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(increase(http_request_duration_seconds_bucket[5m])) by (le)", + "legendFormat": "{{le}}", + "refId": "A", + "format": "heatmap" + } + ], + "options": { + "calculate": false, + "yAxis": { + "unit": "s" + }, + "color": { + "scheme": "Oranges" + } + } + }, + { + "title": "Active Requests", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 16 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "http_requests_in_progress", + "legendFormat": "in-progress", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 30 + }, + "color": { "mode": "fixed", "fixedColor": "blue" } + }, + "overrides": [] + } + }, + { + "title": "Status Code Distribution", + "type": "piechart", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 16 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum by (status) (rate(http_requests_total[5m]))", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "pieType": "pie" + } + }, + { + "title": "Service Uptime", + "type": "stat", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 16 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "up{job=\"app\"}", + "legendFormat": "app-python", + "refId": "A" + } + ], + 
"fieldConfig": { + "defaults": { + "mappings": [ + { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" }, "1": { "text": "UP", "color": "green" } } } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["app", "metrics", "RED"], + "templating": { "list": [] }, + "time": { "from": "now-30m", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Application Metrics", + "uid": "app-metrics", + "version": 1 +} diff --git a/ansible/roles/monitoring/files/grafana-dashboards.yml b/ansible/roles/monitoring/files/grafana-dashboards.yml new file mode 100644 index 0000000000..aa5f7c1db3 --- /dev/null +++ b/ansible/roles/monitoring/files/grafana-dashboards.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: default + orgId: 1 + folder: "" + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/ansible/roles/monitoring/meta/main.yml b/ansible/roles/monitoring/meta/main.yml new file mode 100644 index 0000000000..cb7d8e0460 --- /dev/null +++ b/ansible/roles/monitoring/meta/main.yml @@ -0,0 +1,3 @@ +--- +dependencies: + - role: docker diff --git a/ansible/roles/monitoring/tasks/deploy.yml b/ansible/roles/monitoring/tasks/deploy.yml new file mode 100644 index 0000000000..c16f42e7c1 --- /dev/null +++ b/ansible/roles/monitoring/tasks/deploy.yml @@ -0,0 +1,33 @@ +--- +- name: Deploy monitoring stack + community.docker.docker_compose_v2: + project_src: "{{ monitoring_dir }}" + state: present + register: compose_result + +- name: Wait for Loki to be ready + ansible.builtin.uri: + url: "http://localhost:{{ loki_port }}/ready" + status_code: 200 + register: loki_ready + until: loki_ready.status == 200 + retries: 30 + delay: 5 + +- name: Wait for Prometheus to be ready + ansible.builtin.uri: + url: 
"http://localhost:{{ prometheus_port }}/-/healthy" + status_code: 200 + register: prometheus_ready + until: prometheus_ready.status == 200 + retries: 30 + delay: 5 + +- name: Wait for Grafana to be ready + ansible.builtin.uri: + url: "http://localhost:{{ grafana_port }}/api/health" + status_code: 200 + register: grafana_ready + until: grafana_ready.status == 200 + retries: 30 + delay: 5 diff --git a/ansible/roles/monitoring/tasks/main.yml b/ansible/roles/monitoring/tasks/main.yml new file mode 100644 index 0000000000..25c62b68ce --- /dev/null +++ b/ansible/roles/monitoring/tasks/main.yml @@ -0,0 +1,6 @@ +--- +- name: Include setup tasks + ansible.builtin.include_tasks: setup.yml + +- name: Include deploy tasks + ansible.builtin.include_tasks: deploy.yml diff --git a/ansible/roles/monitoring/tasks/setup.yml b/ansible/roles/monitoring/tasks/setup.yml new file mode 100644 index 0000000000..ed5083c8a5 --- /dev/null +++ b/ansible/roles/monitoring/tasks/setup.yml @@ -0,0 +1,58 @@ +--- +- name: Create monitoring directory structure + ansible.builtin.file: + path: "{{ item }}" + state: directory + mode: "0755" + loop: + - "{{ monitoring_dir }}" + - "{{ monitoring_dir }}/loki" + - "{{ monitoring_dir }}/promtail" + - "{{ monitoring_dir }}/prometheus" + - "{{ monitoring_dir }}/grafana/provisioning/datasources" + - "{{ monitoring_dir }}/grafana/provisioning/dashboards" + +- name: Template Loki configuration + ansible.builtin.template: + src: loki-config.yml.j2 + dest: "{{ monitoring_dir }}/loki/config.yml" + mode: "0644" + +- name: Template Promtail configuration + ansible.builtin.template: + src: promtail-config.yml.j2 + dest: "{{ monitoring_dir }}/promtail/config.yml" + mode: "0644" + +- name: Template Prometheus configuration + ansible.builtin.template: + src: prometheus.yml.j2 + dest: "{{ monitoring_dir }}/prometheus/prometheus.yml" + mode: "0644" + +- name: Template Docker Compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ monitoring_dir 
}}/docker-compose.yml" + mode: "0644" + +- name: Template Grafana datasource provisioning + ansible.builtin.template: + src: grafana-datasource.yml.j2 + dest: "{{ monitoring_dir }}/grafana/provisioning/datasources/datasources.yml" + mode: "0644" + +- name: Copy Grafana dashboard provisioning config + ansible.builtin.copy: + src: grafana-dashboards.yml + dest: "{{ monitoring_dir }}/grafana/provisioning/dashboards/dashboards.yml" + mode: "0644" + +- name: Copy Grafana dashboards + ansible.builtin.copy: + src: "{{ item }}" + dest: "{{ monitoring_dir }}/grafana/provisioning/dashboards/" + mode: "0644" + loop: + - grafana-app-logs.json + - grafana-app-metrics.json diff --git a/ansible/roles/monitoring/templates/docker-compose.yml.j2 b/ansible/roles/monitoring/templates/docker-compose.yml.j2 new file mode 100644 index 0000000000..ade4721223 --- /dev/null +++ b/ansible/roles/monitoring/templates/docker-compose.yml.j2 @@ -0,0 +1,140 @@ +services: + loki: + image: grafana/loki:{{ loki_version }} + command: -config.file=/etc/loki/config.yml + ports: + - "{{ loki_port }}:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + networks: + - logging + labels: + logging: "promtail" + app: "loki" + healthcheck: + test: + [ + "CMD-SHELL", + "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1", + ] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + deploy: + resources: + limits: + cpus: "{{ resource_limits.loki.cpus }}" + memory: {{ resource_limits.loki.memory }} + reservations: + cpus: "{{ resource_limits.loki.cpus_reservation }}" + memory: {{ resource_limits.loki.memory_reservation }} + + promtail: + image: grafana/promtail:{{ promtail_version }} + command: -config.file=/etc/promtail/config.yml + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + networks: + - logging + labels: + logging: 
"promtail" + app: "promtail" + depends_on: + loki: + condition: service_healthy + deploy: + resources: + limits: + cpus: "{{ resource_limits.promtail.cpus }}" + memory: {{ resource_limits.promtail.memory }} + reservations: + cpus: "{{ resource_limits.promtail.cpus_reservation }}" + memory: {{ resource_limits.promtail.memory_reservation }} + + grafana: + image: grafana/grafana:{{ grafana_version }} + ports: + - "{{ grafana_port }}:3000" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/provisioning/dashboards/grafana-app-logs.json:/var/lib/grafana/dashboards/grafana-app-logs.json:ro + - ./grafana/provisioning/dashboards/grafana-app-metrics.json:/var/lib/grafana/dashboards/grafana-app-metrics.json:ro + environment: + - GF_AUTH_ANONYMOUS_ENABLED=false + - GF_SECURITY_ADMIN_PASSWORD={{ grafana_admin_password }} + - GF_SECURITY_ALLOW_EMBEDDING=true + networks: + - logging + labels: + logging: "promtail" + app: "grafana" + depends_on: + loki: + condition: service_healthy + healthcheck: + test: + [ + "CMD-SHELL", + "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1", + ] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + deploy: + resources: + limits: + cpus: "{{ resource_limits.grafana.cpus }}" + memory: {{ resource_limits.grafana.memory }} + reservations: + cpus: "{{ resource_limits.grafana.cpus_reservation }}" + memory: {{ resource_limits.grafana.memory_reservation }} + + prometheus: + image: prom/prometheus:v{{ prometheus_version }} + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.retention.time={{ prometheus_retention_days }}d" + - "--storage.tsdb.retention.size={{ prometheus_retention_size }}" + ports: + - "{{ prometheus_port }}:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + networks: + - logging + labels: + logging: "promtail" + app: "prometheus" + 
healthcheck: + test: + [ + "CMD-SHELL", + "wget --no-verbose --tries=1 --spider http://localhost:9090/-/healthy || exit 1", + ] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + deploy: + resources: + limits: + cpus: "{{ resource_limits.prometheus.cpus }}" + memory: {{ resource_limits.prometheus.memory }} + reservations: + cpus: "{{ resource_limits.prometheus.cpus_reservation }}" + memory: {{ resource_limits.prometheus.memory_reservation }} + +networks: + logging: + driver: bridge + +volumes: + loki-data: + grafana-data: + prometheus-data: diff --git a/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 b/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 new file mode 100644 index 0000000000..c46467f439 --- /dev/null +++ b/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 @@ -0,0 +1,18 @@ +apiVersion: 1 + +datasources: + - name: Loki + type: loki + access: proxy + url: http://loki:{{ loki_port }} + uid: loki + isDefault: false + editable: true + + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:{{ prometheus_port }} + uid: prometheus + isDefault: true + editable: true diff --git a/ansible/roles/monitoring/templates/loki-config.yml.j2 b/ansible/roles/monitoring/templates/loki-config.yml.j2 new file mode 100644 index 0000000000..8a24c49665 --- /dev/null +++ b/ansible/roles/monitoring/templates/loki-config.yml.j2 @@ -0,0 +1,38 @@ +auth_enabled: false + +server: + http_listen_port: {{ loki_port }} + +common: + ring: + instance_addr: 127.0.0.1 + kvstore: + store: inmemory + replication_factor: 1 + path_prefix: /loki + +schema_config: + configs: + - from: "2024-01-01" + store: tsdb + object_store: filesystem + schema: {{ loki_schema_version }} + index: + prefix: index_ + period: 24h + +storage_config: + tsdb_shipper: + active_index_directory: /loki/index + cache_location: /loki/index_cache + filesystem: + directory: /loki/chunks + +limits_config: + retention_period: {{ loki_retention_period }} + 
allow_structured_metadata: true + +compactor: + working_directory: /loki/compactor + retention_enabled: true + delete_request_store: filesystem diff --git a/ansible/roles/monitoring/templates/prometheus.yml.j2 b/ansible/roles/monitoring/templates/prometheus.yml.j2 new file mode 100644 index 0000000000..32784d8a19 --- /dev/null +++ b/ansible/roles/monitoring/templates/prometheus.yml.j2 @@ -0,0 +1,18 @@ +global: + scrape_interval: {{ prometheus_scrape_interval }} + evaluation_interval: {{ prometheus_scrape_interval }} + +storage: + tsdb: + retention_time: {{ prometheus_retention_days }}d + retention_size: {{ prometheus_retention_size }} + +scrape_configs: +{% for target in prometheus_targets %} + - job_name: '{{ target.job }}' + static_configs: + - targets: {{ target.targets }} +{% if target.path is defined %} + metrics_path: '{{ target.path }}' +{% endif %} +{% endfor %} diff --git a/ansible/roles/monitoring/templates/promtail-config.yml.j2 b/ansible/roles/monitoring/templates/promtail-config.yml.j2 new file mode 100644 index 0000000000..b618406752 --- /dev/null +++ b/ansible/roles/monitoring/templates/promtail-config.yml.j2 @@ -0,0 +1,23 @@ +server: + http_listen_port: {{ promtail_port }} + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:{{ loki_port }}/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: label + values: ["logging=promtail"] + relabel_configs: + - source_labels: ["__meta_docker_container_name"] + regex: "/(.*)" + target_label: "container" + - source_labels: ["__meta_docker_container_label_app"] + target_label: "app" diff --git a/ansible/roles/web_app/defaults/main.yml b/ansible/roles/web_app/defaults/main.yml new file mode 100644 index 0000000000..ed97d539c0 --- /dev/null +++ b/ansible/roles/web_app/defaults/main.yml @@ -0,0 +1 @@ +--- diff --git a/ansible/roles/web_app/handlers/main.yml 
b/ansible/roles/web_app/handlers/main.yml new file mode 100644 index 0000000000..6e991e5cd8 --- /dev/null +++ b/ansible/roles/web_app/handlers/main.yml @@ -0,0 +1,10 @@ +--- +- name: Restart application container + become: true + community.docker.docker_container: + name: "{{ app_container_name }}" + image: "{{ docker_image }}:{{ docker_image_tag }}" + state: restarted + restart_policy: unless-stopped + published_ports: + - "{{ app_port }}:5000" diff --git a/ansible/roles/web_app/meta/main.yml b/ansible/roles/web_app/meta/main.yml new file mode 100644 index 0000000000..cb7d8e0460 --- /dev/null +++ b/ansible/roles/web_app/meta/main.yml @@ -0,0 +1,3 @@ +--- +dependencies: + - role: docker diff --git a/ansible/roles/web_app/tasks/main.yml b/ansible/roles/web_app/tasks/main.yml new file mode 100644 index 0000000000..761d206b3a --- /dev/null +++ b/ansible/roles/web_app/tasks/main.yml @@ -0,0 +1,35 @@ +--- +- name: Include wipe tasks + ansible.builtin.include_tasks: wipe.yml + when: web_app_wipe | bool + tags: + - web_app_wipe + +- name: Deploy application with Docker Compose + tags: + - app_deploy + - compose + block: + + - name: Create application directory + ansible.builtin.file: + path: "{{ compose_project_dir }}" + state: directory + mode: "0755" + + - name: Template docker-compose.yml + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ compose_project_dir }}/docker-compose.yml" + mode: "0644" + + - name: Deploy application + ansible.builtin.command: docker compose up -d + args: + chdir: "{{ compose_project_dir }}" + changed_when: false + + rescue: + - name: Deployment failed + ansible.builtin.debug: + msg: "Docker Compose deployment failed" diff --git a/ansible/roles/web_app/tasks/wipe.yml b/ansible/roles/web_app/tasks/wipe.yml new file mode 100644 index 0000000000..bb0d4f7495 --- /dev/null +++ b/ansible/roles/web_app/tasks/wipe.yml @@ -0,0 +1,22 @@ +--- +- name: Wipe Web Application + tags: + - web_app_wipe + when: web_app_wipe | bool + block: + + 
- name: Stop And Remove Docker Compose Containers + ansible.builtin.command: docker compose down + args: + chdir: /opt/{{ app_name }} + failed_when: false + changed_when: false + + - name: Remove Application Directory + ansible.builtin.file: + path: /opt/{{ app_name }} + state: absent + + - name: Log Wipe Completion + ansible.builtin.debug: + msg: "Application {{ app_name }} wiped successfully" diff --git a/ansible/roles/web_app/templates/docker-compose.yml.j2 b/ansible/roles/web_app/templates/docker-compose.yml.j2 new file mode 100644 index 0000000000..fb645ca12b --- /dev/null +++ b/ansible/roles/web_app/templates/docker-compose.yml.j2 @@ -0,0 +1,11 @@ +version: "3.8" + +services: + {{ app_name }}: + image: {{ docker_image }}:{{ docker_image_tag }} + container_name: {{ app_name }} + ports: + - "{{ app_port }}:{{ app_internal_port }}" + environment: + APP_SECRET_KEY: "{{ app_secret_key }}" + restart: unless-stopped \ No newline at end of file diff --git a/app_python/.dockerignore b/app_python/.dockerignore new file mode 100644 index 0000000000..5bd6a39c7b --- /dev/null +++ b/app_python/.dockerignore @@ -0,0 +1,8 @@ +.venv +venv +__pycache__ +.git +.gitignore +.env +*.pyc +.idea diff --git a/app_python/.gitignore b/app_python/.gitignore new file mode 100644 index 0000000000..bcd5ee2a42 --- /dev/null +++ b/app_python/.gitignore @@ -0,0 +1,156 @@ +node_modules + +# Output +.output +.vercel +.netlify +.wrangler +.svelte-kit +/build + +# Vite +vite.config.js.timestamp-* +vite.config.ts.timestamp-* + +# OS +.DS_Store +Thumbs.db + +# Env +.env.* +!.env.example +!.env.test + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before 
PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ +coverage + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# JetBrains IDEs +.idea/ diff --git a/app_python/Dockerfile b/app_python/Dockerfile new file mode 100644 index 0000000000..24daabb9f5 --- /dev/null +++ b/app_python/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.13-slim + +WORKDIR /app + +RUN adduser --disabled-password --gecos "" appuser + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +RUN chown -R appuser:appuser /app + +USER appuser + +CMD ["python", "app.py"] diff --git a/app_python/README.md b/app_python/README.md new file mode 100644 index 0000000000..022c11336f --- /dev/null +++ b/app_python/README.md @@ -0,0 +1,76 @@ +## Overview +This API contains two endpoints: +1. 
Getting information about the system +2. Getting the health status of the API itself + +## Prerequisites +``` +python==3.13.5 +uvicorn==0.40.0 +pydantic==2.12.5 +fastapi==0.128.0 +``` + +## Installation + +```bash +python -m venv venv +source venv/bin/activate +pip install -r requirements.txt +``` + +## Running + +```bash +python app.py +# Or with custom config +PORT=8080 python app.py +``` + +## API Endpoints + +``` +GET / - Service and system information +GET /health - Health check +``` + +## Configuration + +| Variable | Description | Type | Default | Example | +| -------- | -------------------------------------- | ------- | --------- |-------------| +| `HOST` | Host address the application binds to | string | `0.0.0.0` | `127.0.0.1` | +| `PORT` | Port number the application listens on | integer | `5000` | `8000` | +| `DEBUG` | Enables debug mode | boolean | `False` | `True` | + +## Docker + +1. Building the image + example: + ```bash + docker build -t : + ``` + + to build our service used: + ```bash + docker duild -t devops-info-service:latest . + ``` +2. Running a container + example: + ```bash + docker run + ``` + + to run our service used: + ```bash + docker run -d -p 5000:5000 devops-info-service + ``` + +3. 
Pulling from Docker Hub example: + ```bash + docker pull + ``` + + to pull our repo used: + ```bash + docker pull th1ef/devops-info-service:latest + ``` diff --git a/app_python/app.py b/app_python/app.py new file mode 100644 index 0000000000..56bb230be1 --- /dev/null +++ b/app_python/app.py @@ -0,0 +1,81 @@ +import logging +import time + +import uvicorn +from fastapi import FastAPI, Request, Response +from fastapi.middleware.cors import CORSMiddleware +from prometheus_client import generate_latest, CONTENT_TYPE_LATEST + +from config import DEBUG, HOST, PORT +from logger_config import setup_logger +from metrics import ( + http_requests_total, + http_request_duration_seconds, + http_requests_in_progress, +) +from routes import health_router, root_router + +setup_logger() +logger = logging.getLogger(__name__) + +app = FastAPI(debug=DEBUG) +for router in [health_router, root_router]: + app.include_router(router=router) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +@app.middleware("http") +async def log_and_track_requests(request: Request, call_next): + if request.url.path == "/metrics": + return await call_next(request) + + http_requests_in_progress.inc() + start = time.perf_counter() + try: + response = await call_next(request) + finally: + duration = time.perf_counter() - start + http_requests_in_progress.dec() + + duration_ms = round(duration * 1000, 2) + endpoint = request.url.path + method = request.method + status = str(response.status_code) + + http_requests_total.labels(method=method, endpoint=endpoint, status=status).inc() + http_request_duration_seconds.labels(method=method, endpoint=endpoint).observe( + duration + ) + + logger.info( + "HTTP request", + extra={ + "method": method, + "path": endpoint, + "status_code": response.status_code, + "client_ip": request.client.host if request.client else None, + "duration_ms": duration_ms, + }, + ) + return response + + 
+@app.get("/metrics", include_in_schema=False) +async def metrics(): + return Response(content=generate_latest(), media_type=CONTENT_TYPE_LATEST) + + +@app.on_event("startup") +async def on_startup(): + logger.info("Application started", extra={"host": HOST, "port": PORT}) + + +if __name__ == "__main__": + uvicorn.run(app=app, port=PORT, host=HOST) diff --git a/app_python/config.py b/app_python/config.py new file mode 100644 index 0000000000..142a3e1fbf --- /dev/null +++ b/app_python/config.py @@ -0,0 +1,5 @@ +import os + +HOST = os.getenv("HOST", "0.0.0.0") +PORT = int(os.getenv("PORT", 5000)) +DEBUG = os.getenv("DEBUG", "False").lower() == "true" diff --git a/app_python/docs/LAB01.md b/app_python/docs/LAB01.md new file mode 100644 index 0000000000..332983fd8e --- /dev/null +++ b/app_python/docs/LAB01.md @@ -0,0 +1,137 @@ +## Framework Selection + +I chose FastApi because it's simple, easy to create endpoints, and has automatic documentation. + +| Framework | Pros | Cons | Reason Not Chosen | +|-------------|-------------------------------------------------------|---------------------------------------------|-----------------------------------| +| **FastAPI** | Async support, type safety, OpenAPI, high performance | Slight learning curve | **Chosen** | +| Flask | Simple, minimal | No async by default, no built-in validation | Less suitable for structured APIs | +| Django | Full-featured, mature | Heavy, overkill for small service | Too complex for this task | + +## Best Practices Applied + +1. Environment-based Configuration + + ```text + HOST = os.getenv("HOST", "0.0.0.0") + PORT = int(os.getenv("PORT", 5000)) + DEBUG = os.getenv("DEBUG", "False").lower() == "true" + ``` + +it important because it enables configuration without code changes. + +2. Separation of Concerns + +```text + class HealthCheckService: + async def get_info(self, request: Request) -> InfoResponse: + pass +``` + +it important because it easier testing, cleaner routing layer + +3. 
Typed Responses with Pydantic + +```text +class InfoResponse(BaseModel): + service: ServiceInfo + system: SystemInfo + runtime: RuntimeInfo + request: RequestInfo + endpoints: list[EndpointInfo] +``` + +it important because guarantees response structure and improves readability + +4. Logging + +```text +logger = logging.getLogger(__name__) +logger.info("Handling info request") +``` + +it important because it centralized observability and works seamlessly with Uvicorn + +## API Documentation + +1. GET `/` - get system information + + Response example: + ```json + { + "service": { + "name": "devops-info-service", + "version": "1.0.0", + "description": "DevOps course info service", + "framework": "Fastapi" + }, + "system": { + "hostname": "Th1ef", + "platform": "Windows", + "platform_version": "10.0.26200", + "architecture": "AMD64", + "cpu_count": 8, + "python_version": "3.13.5" + }, + "runtime": { + "uptime_seconds": 18, + "uptime_human": "0 hours, 0 minutes", + "current_time": "2026-01-26T12:41:50.413788Z", + "timezone": "UTC" + }, + "request": { + "client_ip": "127.0.0.1", + "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36", + "method": "GET", + "path": "/" + }, + "endpoints": [ + { + "path": "/", + "method": "GET", + "description": "Service information" + }, + { + "path": "/health", + "method": "GET", + "description": "Health check" + } + ] + } + ``` + +2. GET `/health` - get service status + + Response example: + ```json + { + "status": "healthy", + "timestamp": "2026-01-27T10:32:15.552053Z", + "uptime_seconds": 7390 + } + ``` + +3. 
Testing Commands + + Using curl: + ```bash + curl http://localhost:5000/ + curl http://localhost:5000/health + ``` + + or auto generated documentation: + + ```bash + http://localhost:5000/docs + ``` + +## Testing Evidence + +- Successful responses from `/` and `/health` +- Correct JSON structure returned +- Terminal output from uvicorn confirming requests +- Screenshots Swagger UI + +## Challenges & Solutions + +There were no difficulties diff --git a/app_python/docs/LAB02.md b/app_python/docs/LAB02.md new file mode 100644 index 0000000000..c846e43364 --- /dev/null +++ b/app_python/docs/LAB02.md @@ -0,0 +1,145 @@ +## Docker Best Practices Applied +1. Minimal Base Image + ```dockerfile + FROM python:3.13-slim + ``` + it important because `slim` is significantly smaller than `python:3.13` -> faster download and deployment + +2. Proper Layer Ordering + ```dockerfile + WORKDIR /app + + COPY requirements.txt . + RUN pip install --no-cache-dir -r requirements.txt + + COPY . . + ``` + it important because dependencies are installed once and when code changes, `pip install` is not rerun. + +3. .dockerignore + ```dockerignore + .venv + __pycache__ + .git + .gitignore + .idea + *.pyc + ``` + + it important because it reduces the size of the build context and speeds up `docker build` + +4. Non-root User + ```dockerfile + RUN useradd -m appuser + USER appuser + ``` + + it important because container doesn't run as root, reduces the risk of vulnerabilities + +5. 
No Cache in pip + +```dockerfile +RUN pip install --no-cache-dir -r requirements.txt +``` + +it important because it reduces the final image size and pip cache is not needed at runtime + + +## Image Information & Decisions + +#### Base image chosen: +| Image | Reason for failure | +| ---------------- |-------------------| +| python:3.13 | too big | +| alpine | dependency issues | +| python:3.13-slim | optimal balance | + +#### Final image size: +```text +140MB +``` + +#### Layer structure +```dockerfile +FROM python:3.13-slim +WORKDIR /app +RUN adduser --disabled-password --gecos "" appuser +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt +COPY . . +RUN chown -R appuser:appuser /app +USER appuser +CMD ["python", "app.py"] +``` + +## Build & Run Process +1. Complete terminal output from build process + ```text + (.venv) C:\Users\kve10\PycharmProjects\DevOps-Core-Course\app_python>docker build -t devops-info-service:latest . + [+] Building 15.0s (12/12) FINISHED docker:desktop-linux + => [internal] load build definition from Dockerfile 0.0s + => => transferring dockerfile: 289B 0.0s + => [internal] load metadata for docker.io/library/python:3.13-slim 1.0s + => [internal] load .dockerignore 0.0s + => => transferring context: 104B 0.0s + => [internal] load build context 0.0s + => => transferring context: 1.32kB 0.0s + => [1/7] FROM docker.io/library/python:3.13-slim@sha256:51e1a0a317fdb6e170dc791bbeae63fac5272c82f43958ef74a34e170c6f8b18 0.0s + => CACHED [2/7] WORKDIR /app 0.0s + => CACHED [3/7] RUN adduser --disabled-password --gecos "" appuser 0.0s + => CACHED [4/7] COPY requirements.txt . 0.0s + => [5/7] RUN pip install -r requirements.txt 12.8s + => [6/7] COPY . . 
0.0s + => [7/7] RUN chown -R appuser:appuser /app 0.6s + => exporting to image 0.3s + => => exporting layers 0.3s + => => writing image sha256:4951433b4ff82147cbd1bf45597c98fb56f13ffa619ec10098559796ac8f6210 0.0s + => => naming to docker.io/library/devops-info-service:latest + ``` +2. Terminal output showing container running + ```text + (.venv) C:\Users\kve10\PycharmProjects\DevOps-Core-Course\app_python>docker run -d -p 5000:5000 devops-info-service + 8a9df27c507cb56b6999fababd27de98bd87ba96ed0fcdeec0cd3ed10fb6a208 + ``` + +3. Terminal output from testing endpoints + #### root endpoint + ```text + (.venv) C:\Users\kve10\PycharmProjects\DevOps-Core-Course\app_python>curl http://localhost:5000/ + {"service":{"name":"devops-info-service","version":"1.0.0","description":"DevOps course info service","framework":"Fastapi"},"system":{"hostname":"69f1f9d7f438","platform":"Linux","platform_version":"#1 SMP Tue Nov 5 00:21:55 UTC + 2024","architecture":"x86_64","cpu_count":8,"python_version":"3.13.11"},"runtime":{"uptime_seconds":63481,"uptime_human":"17 hours, 38 minutes","current_time":"2026-01-28T13:48:16.715852Z","timezone":"UTC"},"request":{"client_ip":"172.17.0.1","user_agent":"curl/8.16.0","method":"GET","path":"/"},"endpoints":[{"path":"/","method":"GET","description":"Service information"},{"path":"/health","method":"GET","description":"Health check"}]} + ``` + #### health endpoint + ```text + (.venv) C:\Users\kve10\PycharmProjects\DevOps-Core-Course\app_python>curl http://localhost:5000/health + {"status":"healthy","timestamp":"2026-01-28T13:49:10.566548Z","uptime_seconds":63535} + ``` + +4. Docker Hub repository URL + +```text +https://hub.docker.com/r/th1ef/devops-info-service +``` + +## Technical Analysis +1. Why does your Dockerfile work the way it does? + - Layers are built for the cache + - Runtime and build are logically separated + - No extra files + - The environment is managed via `ENV` +2. What would happen if you changed the layer order? 
+ - The cache breaks + - Every build rebuilds dependencies + - CI/CD time increases +3. What security considerations did you implement? + - Non-root user + - Minimal base image + - No dev files + - Environment variables are set during run +4. How does `.dockerignore` improve your build? + - Less data → faster build + - No .git leaks + - Smaller image size + +## Challenges & Solutions +There were no difficulties \ No newline at end of file diff --git a/app_python/docs/LAB3.md b/app_python/docs/LAB3.md new file mode 100644 index 0000000000..e346c0a4f1 --- /dev/null +++ b/app_python/docs/LAB3.md @@ -0,0 +1,61 @@ +## GitHub Actions Status Badge + +![CI](https://github.com///actions/workflows/python-ci.yml/badge.svg) + + +## Dependency Caching & Performance Improvement + +### Python dependencies are cached using GitHub Actions cache: +```yaml +- uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} +``` + +### Result: +Run Duration +Without cache ~2m 10s +With cache ~1m 05s + +## CI Best Practices Applied +### Dependency Caching +Speeds up pipelines by reusing installed packages. + +### Separate CI stages + +Workflow is logically split: +- Lint +- Tests +- Docker build & push +- Security scan + +### Secrets Management +Sensitive data (DOCKERHUB_TOKEN, SNYK_TOKEN) stored in GitHub Secrets. +Never committed to repository. + +### Versioned Docker Images +```text +YYYY.MM +latest +``` + +## Snyk Security Scanning + +Snyk is integrated using: + +```yaml +- uses: snyk/actions/python@master +``` +It scans Python dependencies for known vulnerabilities. + +## Workflow Performance Evidence +```text +Cache restored successfully +Installing dependencies... 
+Finished in 12 seconds + +pytest passed +Docker build completed +Snyk scan completed +``` \ No newline at end of file diff --git a/app_python/docs/LAB4.md b/app_python/docs/LAB4.md new file mode 100644 index 0000000000..efd78e1d82 --- /dev/null +++ b/app_python/docs/LAB4.md @@ -0,0 +1,269 @@ +# Terraform Infrastructure Report + +## 1. Cloud Provider Chosen and Why + +**Cloud Provider:** Yandex Cloud + +Yandex Cloud was chosen because: + +- It provides full Infrastructure as Code (IaC) support through the official Terraform provider. +- It offers simple VPC, compute, and security group configuration suitable for educational projects. +- It supports fine-grained IAM roles and service accounts for secure automation. +- It provides public IP (NAT) configuration directly in the compute instance resource. + +Terraform was used as the Infrastructure as Code tool because it allows: + +- Declarative infrastructure definition +- Version-controlled infrastructure +- Reproducible environments +- Automated provisioning + +--- + +## 2. Terraform Version Used + +Terraform version used: + +terraform version +Terraform v1.x.x + + +Provider version: + +yandex-cloud/yandex v0.187.0 + + +--- + +## 3. Resources Created + +The following resources were provisioned: + +### Network +- VPC Network: `net` +- Subnet: `subnet` +- CIDR block: `10.0.0.0/24` +- Zone: `ru-central1-a` + +### Security Group +Inbound rules: +- SSH (22) — allowed only from personal IP (`/32`) +- HTTP (80) — allowed from `0.0.0.0/0` +- TCP 5000 — allowed from `0.0.0.0/0` + +Outbound: +- All traffic allowed + +### Virtual Machine +- Name: `terraform1` +- Platform: `standard-v2` +- CPU: 2 cores +- RAM: 2 GB +- OS: Ubuntu 22.04 LTS +- Public NAT enabled + +--- + +## 4. Public IP Address of Created VM + +```text +93.77.177.208 +``` + +(Obtained from Terraform output.) + +--- + +## 5. SSH Connection Command + + +```shell +ssh ubuntu@93.77.177.208 +``` + +--- + +## 6. 
Terminal Output – terraform plan + +![terraform plan](screenshots/img_3.png) + +--- + +## 7. Terminal Output – terraform apply + +![terraform apply](screenshots/img_4.png) + + +--- + +## 8. Proof of SSH Access to VM + +After successful SSH login: + +![image](screenshots/img_3.png) + + +# Infrastructure Migration Report: Terraform → Pulumi + +## 1. Programming Language Chosen for Pulumi + +**Language:** Python + +Reasoning: +- Simple syntax and readability +- Good integration with Pulumi SDK +- Fast setup for infrastructure scripting +- Suitable for backend-oriented workflow + +Pulumi version used: + +pulumi version +v3.x.x + + +--- + +## 2. Terraform Destroy Output + +![Destroy Output](screenshots/img_5.png) + +--- + +## 3. Pulumi Preview Output + + +--- + +## 4. Pulumi Up Output + + + + +--- + +## 5. Public IP of Pulumi-Created VM + +51.250.xxx.xxx + + +SSH access: + +```shell +ssh ubuntu@51.250.xxx.xxx +``` + + +![login]() + + +--- + +## 6. Comparison: Terraform vs Pulumi Experience + +### What Was Easier in Terraform + +- Clear declarative structure +- Simple `.tf` syntax +- Strong ecosystem and documentation +- Easier to understand infrastructure layout at a glance + +### What Was Harder in Terraform + +- Limited logic capabilities +- No native loops or conditions without workarounds +- Separate HCL language (not general-purpose) + +--- + +### What Was Easier in Pulumi + +- Full programming language support (Python) +- Ability to use variables, loops, conditions naturally +- Better abstraction and reuse potential +- Dynamic infrastructure definitions + +### What Was Harder in Pulumi + +- More verbose code +- Requires dependency management (venv, pip) +- Slightly more complex project structure +- Harder to quickly read compared to simple HCL + +--- + +## 7. 
Code Differences (HCL vs Python) + +### Terraform (HCL Example) + +```hcl +resource "yandex_compute_instance" "vm" { + name = "terraform1" + + resources { + cores = 2 + memory = 2 + } + + network_interface { + subnet_id = yandex_vpc_subnet.subnet.id + nat = true + } +} +Characteristics: + +Declarative + +Resource-based + +Static structure + +Limited programmability + +Pulumi (Python Example) +import pulumi +import pulumi_yandex as yandex + +network = yandex.VpcNetwork("net") + +subnet = yandex.VpcSubnet("subnet", + network_id=network.id, + zone="ru-central1-a", + v4_cidr_blocks=["10.0.0.0/24"] +) + +vm = yandex.ComputeInstance("vm", + resources=yandex.ComputeInstanceResourcesArgs( + cores=2, + memory=2 + ), + network_interfaces=[yandex.ComputeInstanceNetworkInterfaceArgs( + subnet_id=subnet.id, + nat=True + )] +) + +pulumi.export("public_ip", vm.network_interfaces[0].nat_ip_address) +Characteristics: + +Imperative style + +Uses full Python language + +Allows dynamic logic + +Code-first infrastructure + +8. Preferred Tool and Why +Preferred tool: Terraform + +Reason: + +Simpler for small and medium infrastructure + +Clear declarative model + +Easier for teams without strong programming background + +More standardized in DevOps industry + +Pulumi is more flexible and powerful for complex, dynamic environments, but for straightforward infrastructure provisioning Terraform is more concise and easier to maintain. 
\ No newline at end of file diff --git a/app_python/docs/screenshots/img.png b/app_python/docs/screenshots/img.png new file mode 100644 index 0000000000..ec290c9123 Binary files /dev/null and b/app_python/docs/screenshots/img.png differ diff --git a/app_python/docs/screenshots/img_1.png b/app_python/docs/screenshots/img_1.png new file mode 100644 index 0000000000..520b53752a Binary files /dev/null and b/app_python/docs/screenshots/img_1.png differ diff --git a/app_python/docs/screenshots/img_2.png b/app_python/docs/screenshots/img_2.png new file mode 100644 index 0000000000..9646c3cfd9 Binary files /dev/null and b/app_python/docs/screenshots/img_2.png differ diff --git a/app_python/docs/screenshots/img_3.png b/app_python/docs/screenshots/img_3.png new file mode 100644 index 0000000000..e966cb77eb Binary files /dev/null and b/app_python/docs/screenshots/img_3.png differ diff --git a/app_python/docs/screenshots/img_4.png b/app_python/docs/screenshots/img_4.png new file mode 100644 index 0000000000..75117ac43f Binary files /dev/null and b/app_python/docs/screenshots/img_4.png differ diff --git a/app_python/docs/screenshots/img_5.png b/app_python/docs/screenshots/img_5.png new file mode 100644 index 0000000000..a7535e8f9c Binary files /dev/null and b/app_python/docs/screenshots/img_5.png differ diff --git a/app_python/logger_config.py b/app_python/logger_config.py new file mode 100644 index 0000000000..c66c632e40 --- /dev/null +++ b/app_python/logger_config.py @@ -0,0 +1,32 @@ +import logging +import logging.config + +from pythonjsonlogger.json import JsonFormatter + +LOGGING_CONFIG = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "json": { + "()": JsonFormatter, + "format": "%(asctime)s %(levelname)s %(name)s %(message)s", + "rename_fields": { + "asctime": "timestamp", + "levelname": "level", + }, + } + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "formatter": "json", + "level": "INFO", + "stream": 
"ext://sys.stdout", + } + }, + "root": {"level": "INFO", "handlers": ["console"]}, +} + + +def setup_logger() -> None: + logging.config.dictConfig(LOGGING_CONFIG) diff --git a/app_python/metrics.py b/app_python/metrics.py new file mode 100644 index 0000000000..2a6674482f --- /dev/null +++ b/app_python/metrics.py @@ -0,0 +1,29 @@ +from prometheus_client import Counter, Histogram, Gauge + +http_requests_total = Counter( + "http_requests_total", + "Total HTTP requests", + ["method", "endpoint", "status"], +) + +http_request_duration_seconds = Histogram( + "http_request_duration_seconds", + "HTTP request duration in seconds", + ["method", "endpoint"], +) + +http_requests_in_progress = Gauge( + "http_requests_in_progress", + "HTTP requests currently being processed", +) + +endpoint_calls = Counter( + "devops_info_endpoint_calls", + "Endpoint calls by endpoint name", + ["endpoint"], +) + +system_info_duration = Histogram( + "devops_info_system_collection_seconds", + "Time to collect system info", +) diff --git a/app_python/requirements.txt b/app_python/requirements.txt new file mode 100644 index 0000000000..83ecd5d323 --- /dev/null +++ b/app_python/requirements.txt @@ -0,0 +1,10 @@ +uvicorn==0.40.0 +pydantic==2.12.5 +fastapi==0.128.0 +python-json-logger==3.3.0 +pytest==9.0.2 +ruff==0.15.0 +pytest-asyncio==1.3.0 +pytest-mock==3.15.1 +httpx==0.28.1 +prometheus-client==0.23.1 diff --git a/app_python/routes/__init__.py b/app_python/routes/__init__.py new file mode 100644 index 0000000000..fbc025ced0 --- /dev/null +++ b/app_python/routes/__init__.py @@ -0,0 +1,4 @@ +from .health_check.router import router as health_router +from .root.router import router as root_router + +__all__ = ["root_router", "health_router"] diff --git a/app_python/routes/health_check/__init__.py b/app_python/routes/health_check/__init__.py new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/app_python/routes/health_check/__init__.py @@ -0,0 +1 @@ + diff --git 
a/app_python/routes/health_check/router.py b/app_python/routes/health_check/router.py new file mode 100644 index 0000000000..4c4822133c --- /dev/null +++ b/app_python/routes/health_check/router.py @@ -0,0 +1,10 @@ +from fastapi import APIRouter +from routes.health_check.schemas import HealthResponse +from routes.health_check.service import HealthCheckServiceDep + +router = APIRouter() + + +@router.get("/health", description="Health check") +async def health_check(service: HealthCheckServiceDep) -> HealthResponse: + return await service.health_check() diff --git a/app_python/routes/health_check/schemas.py b/app_python/routes/health_check/schemas.py new file mode 100644 index 0000000000..09d637eb28 --- /dev/null +++ b/app_python/routes/health_check/schemas.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel +from datetime import datetime + + +class HealthResponse(BaseModel): + status: str + timestamp: datetime + uptime_seconds: int diff --git a/app_python/routes/health_check/service.py b/app_python/routes/health_check/service.py new file mode 100644 index 0000000000..ae59b8d9ad --- /dev/null +++ b/app_python/routes/health_check/service.py @@ -0,0 +1,33 @@ +import logging +from datetime import datetime, timezone +from typing import Annotated + +from fastapi import Depends + +from utils import APP_START_TIME +from metrics import endpoint_calls +from routes.health_check.schemas import HealthResponse + +logger = logging.getLogger(__name__) + + +class HealthCheckService: + @staticmethod + def get_uptime(start_time) -> tuple[int, str]: + delta = datetime.now(tz=timezone.utc) - start_time + seconds = int(delta.total_seconds()) + hours = seconds // 3600 + minutes = (seconds % 3600) // 60 + return seconds, f"{hours} hours, {minutes} minutes" + + async def health_check(self) -> HealthResponse: + logger.info("Health check called") + endpoint_calls.labels(endpoint="/health").inc() + return HealthResponse( + status="healthy", + timestamp=datetime.now(tz=timezone.utc), + 
uptime_seconds=self.get_uptime(APP_START_TIME)[0], + ) + + +HealthCheckServiceDep = Annotated[HealthCheckService, Depends(HealthCheckService)] diff --git a/app_python/routes/root/__init__.py b/app_python/routes/root/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/app_python/routes/root/router.py b/app_python/routes/root/router.py new file mode 100644 index 0000000000..fd52495618 --- /dev/null +++ b/app_python/routes/root/router.py @@ -0,0 +1,12 @@ +from fastapi import APIRouter +from routes.root.schemas import InfoResponse +from routes.root.service import SysInfoServiceDep + +router = APIRouter() + + +@router.get("/", description="Service information") +async def get_info( + service: SysInfoServiceDep, +) -> InfoResponse: + return await service.get_info() diff --git a/app_python/routes/root/schemas.py b/app_python/routes/root/schemas.py new file mode 100644 index 0000000000..b9cd529660 --- /dev/null +++ b/app_python/routes/root/schemas.py @@ -0,0 +1,46 @@ +from pydantic import BaseModel +from datetime import datetime + + +class ServiceInfo(BaseModel): + name: str + version: str + description: str + framework: str + + +class SystemInfo(BaseModel): + hostname: str + platform: str + platform_version: str + architecture: str + cpu_count: int + python_version: str + + +class RuntimeInfo(BaseModel): + uptime_seconds: int + uptime_human: str + current_time: datetime + timezone: str + + +class RequestInfo(BaseModel): + client_ip: str + user_agent: str + method: str + path: str + + +class EndpointInfo(BaseModel): + path: str + method: str + description: str + + +class InfoResponse(BaseModel): + service: ServiceInfo + system: SystemInfo + runtime: RuntimeInfo + request: RequestInfo + endpoints: list[EndpointInfo] diff --git a/app_python/routes/root/service.py b/app_python/routes/root/service.py new file mode 100644 index 0000000000..fbc53d94b9 --- /dev/null +++ b/app_python/routes/root/service.py @@ -0,0 +1,119 @@ +import logging +import socket 
+import platform +from datetime import datetime, timezone +import os +from typing import Annotated + +from fastapi import Request, Depends +from fastapi.routing import APIRoute + +from utils import APP_START_TIME +from metrics import endpoint_calls, system_info_duration +from routes.root.schemas import ( + InfoResponse, + EndpointInfo, + ServiceInfo, + SystemInfo, + RuntimeInfo, + RequestInfo, +) + +logger = logging.getLogger(__name__) + + +class SysInfoService: + def __init__(self, request: Request): + self.request = request + + @staticmethod + def _get_uptime(start_time) -> tuple[int, str]: + delta = datetime.now(tz=timezone.utc) - start_time + seconds = int(delta.total_seconds()) + hours = seconds // 3600 + minutes = (seconds % 3600) // 60 + return seconds, f"{hours} hours, {minutes} minutes" + + @staticmethod + def _get_service_info() -> ServiceInfo: + logger.info("Starting to find service info") + + return ServiceInfo( + name="devops-info-service", + version="1.0.0", + description="DevOps course info service", + framework="Fastapi", + ) + + def _get_system_info(self) -> SystemInfo: + hostname = socket.gethostname() + platform_name = platform.system() + architecture = platform.machine() + python_version = platform.python_version() + cpu_count = os.cpu_count() + platform_version = platform.version() + + return SystemInfo( + hostname=hostname, + platform=platform_name, + platform_version=platform_version, + architecture=architecture, + cpu_count=cpu_count, + python_version=python_version, + ) + + def _get_runtime_info(self) -> RuntimeInfo: + current_time = datetime.now(tz=timezone.utc) + uptime_seconds, uptime_human = self._get_uptime(APP_START_TIME) + + return RuntimeInfo( + uptime_seconds=uptime_seconds, + uptime_human=uptime_human, + current_time=current_time, + timezone="UTC", + ) + + def _get_request_info(self) -> RequestInfo: + client_ip = self.request.client.host if self.request.client else "unknown" + user_agent = self.request.headers.get("user-agent") + 
method = self.request.method + path = self.request.url.path + + return RequestInfo( + client_ip=client_ip, user_agent=user_agent, method=method, path=path + ) + + def _get_endpoints(self) -> list[EndpointInfo]: + endpoints = [] + for route in self.request.app.routes: + if isinstance(route, APIRoute): + for method in route.methods: + endpoints.append( + EndpointInfo( + path=route.path, + method=method, + description=route.description, + ) + ) + return endpoints + + async def get_info(self) -> InfoResponse: + try: + logger.info("Starting run main func") + endpoint_calls.labels(endpoint="/").inc() + + with system_info_duration.time(): + result = InfoResponse( + service=self._get_service_info(), + system=self._get_system_info(), + runtime=self._get_runtime_info(), + request=self._get_request_info(), + endpoints=self._get_endpoints(), + ) + return result + except Exception as e: + logger.exception(e) + raise + + +SysInfoServiceDep = Annotated[SysInfoService, Depends(SysInfoService)] diff --git a/app_python/tests/__init__.py b/app_python/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/app_python/tests/health_check/__init__.py b/app_python/tests/health_check/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/app_python/tests/health_check/test_router.py b/app_python/tests/health_check/test_router.py new file mode 100644 index 0000000000..ce9a0d1731 --- /dev/null +++ b/app_python/tests/health_check/test_router.py @@ -0,0 +1,36 @@ +from datetime import datetime, timezone + +import pytest +from fastapi.testclient import TestClient +from pytest_mock import MockerFixture + +from app import app +from routes.health_check.schemas import HealthResponse +from routes.health_check.service import HealthCheckService + + +@pytest.fixture +def client() -> TestClient: + return TestClient(app) + + +def test_get_health_success(client: TestClient, mocker: MockerFixture): + mock_service = mocker.AsyncMock() + 
mock_service.health_check.return_value = HealthResponse( + status="healthy", + timestamp=datetime(2026, 2, 12, 11, 37, 1, 912380, tzinfo=timezone.utc), + uptime_seconds=1020, + ) + + # Override зависимости + app.dependency_overrides[HealthCheckService] = lambda: mock_service + + r = client.get("/health") + print(r.json()) + + assert r.status_code == 200 + assert r.json()["uptime_seconds"] == 1020 + assert r.json()["status"] == "healthy" + + mock_service.health_check.assert_awaited_once() + app.dependency_overrides.clear() diff --git a/app_python/tests/health_check/test_service.py b/app_python/tests/health_check/test_service.py new file mode 100644 index 0000000000..a5dbcdb7f9 --- /dev/null +++ b/app_python/tests/health_check/test_service.py @@ -0,0 +1,34 @@ +import pytest +from datetime import datetime, timezone + +from pytest_mock import MockerFixture +from routes.health_check.service import HealthCheckService +from utils import APP_START_TIME +from routes.health_check.schemas import HealthResponse + + +@pytest.mark.asyncio +async def test_health_check_returns_healthy(): + service = HealthCheckService() + result: HealthResponse = await service.health_check() + + assert result.status == "healthy" + + assert isinstance(result.timestamp, datetime) + assert result.timestamp <= datetime.now(tz=timezone.utc) + + uptime_seconds, _ = service.get_uptime(APP_START_TIME) + assert result.uptime_seconds == uptime_seconds + + +@pytest.mark.asyncio +async def test_get_uptime_returns_correct_tuple(mocker: MockerFixture): + fixed_start = datetime(2026, 2, 12, 12, 0, tzinfo=timezone.utc) + mocker.patch("routes.health_check.service.APP_START_TIME", fixed_start) + + service_instance = HealthCheckService() + result: HealthResponse = await service_instance.health_check() + + assert result.status == "healthy" + expected_seconds, _ = service_instance.get_uptime(fixed_start) + assert result.uptime_seconds == expected_seconds diff --git a/app_python/tests/root/__init__.py 
b/app_python/tests/root/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/app_python/tests/root/test_router.py b/app_python/tests/root/test_router.py new file mode 100644 index 0000000000..c9da640493 --- /dev/null +++ b/app_python/tests/root/test_router.py @@ -0,0 +1,65 @@ +import pytest +from fastapi.testclient import TestClient +from pytest_mock import MockerFixture +from datetime import datetime, timezone + +from app import app +from routes.root.service import SysInfoService +from routes.root.schemas import ( + InfoResponse, + ServiceInfo, + SystemInfo, + RuntimeInfo, + RequestInfo, + EndpointInfo, +) + + +@pytest.fixture +def client() -> TestClient: + return TestClient(app) + + +def test_get_info_router(client: TestClient, mocker: MockerFixture) -> None: + mock_service = mocker.AsyncMock() + mock_service.get_info.return_value = InfoResponse( + service=ServiceInfo( + name="test-service", version="1.0", description="desc", framework="FastAPI" + ), + system=SystemInfo( + hostname="localhost", + platform="Linux", + platform_version="5.0", + architecture="x86_64", + cpu_count=4, + python_version="3.11", + ), + runtime=RuntimeInfo( + uptime_seconds=1000, + uptime_human="0 hours, 16 minutes", + current_time=datetime.now(tz=timezone.utc), + timezone="UTC", + ), + request=RequestInfo( + client_ip="127.0.0.1", user_agent="pytest", method="GET", path="/" + ), + endpoints=[ + EndpointInfo(path="/", method="GET", description="Service information") + ], + ) + + app.dependency_overrides[SysInfoService] = lambda: mock_service + + response = client.get("/") + + assert response.status_code == 200 + json_data = response.json() + print(json_data) + assert json_data["service"]["name"] == "test-service" + assert json_data["system"]["hostname"] == "localhost" + assert json_data["runtime"]["uptime_seconds"] == 1000 + assert json_data["request"]["method"] == "GET" + assert len(json_data["endpoints"]) == 1 + + mock_service.get_info.assert_awaited_once() + 
app.dependency_overrides.clear() diff --git a/app_python/tests/root/test_service.py b/app_python/tests/root/test_service.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/app_python/utils.py b/app_python/utils.py new file mode 100644 index 0000000000..d10d7e8a2a --- /dev/null +++ b/app_python/utils.py @@ -0,0 +1,3 @@ +from datetime import datetime, timezone + +APP_START_TIME = datetime.now(tz=timezone.utc) diff --git a/k8s/README.md b/k8s/README.md new file mode 100644 index 0000000000..b6ee31ada0 --- /dev/null +++ b/k8s/README.md @@ -0,0 +1,341 @@ +# Lab 9 — Kubernetes Fundamentals + +## Task 1 — Local Kubernetes Setup + +### Chosen Tool: Minikube + +Minikube was chosen for the local Kubernetes setup because: + +- Full-featured local Kubernetes cluster with easy setup +- Supports multiple drivers (Docker, Hyper-V, VirtualBox) +- Built-in addons (Ingress, Dashboard, Metrics Server) +- Excellent documentation and community support + +### Installation + +```bash +# Install kubectl +curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" +chmod +x kubectl && sudo mv kubectl /usr/local/bin/ + +# Install minikube +curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64 +chmod +x minikube-linux-amd64 && sudo mv minikube-linux-amd64 /usr/local/bin/minikube + +# Start cluster +minikube start --driver=docker +``` + +### Cluster Verification + +```bash +$ kubectl cluster-info +Kubernetes control plane is running at https://127.0.0.1:49157 +CoreDNS is running at https://127.0.0.1:49157/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy + +$ kubectl get nodes +NAME STATUS ROLES AGE VERSION +minikube Ready control-plane 1m v1.33.0 +``` + +--- + +## Task 2 — Architecture Overview + +### Deployment Architecture + +``` + ┌─────────────────────────────────────────┐ + │ Kubernetes Cluster │ + │ │ + User ──────────►│ Service (NodePort:30080) │ + │ │ │ + │ ├──► Pod 1 
(devops-info-service) │ + │ ├──► Pod 2 (devops-info-service) │ + │ └──► Pod 3 (devops-info-service) │ + │ │ + │ Each Pod: │ + │ - Container: th1ef/devops-info-service│ + │ - Port: 5000 │ + │ - CPU: 100m-200m │ + │ - Memory: 128Mi-256Mi │ + │ - Health: /health (liveness+readiness)│ + └─────────────────────────────────────────┘ +``` + +### Resource Allocation Strategy + +| Resource | Request | Limit | Rationale | +|----------|---------|-------|-----------| +| CPU | 100m | 200m | Lightweight Python Flask app, minimal CPU needed | +| Memory | 128Mi | 256Mi | Small footprint app, 2x headroom for spikes | + +--- + +## Task 3 — Manifest Files + +### `deployment.yml` + +Main application Deployment with: +- **3 replicas** — ensures high availability and load distribution +- **Rolling update strategy** — `maxSurge: 1, maxUnavailable: 0` for zero-downtime deployments +- **Health checks** — liveness probe restarts unhealthy containers, readiness probe controls traffic routing +- **Resource limits** — prevents resource starvation in the cluster + +### `service.yml` + +NodePort Service exposing the application: +- **Type: NodePort** — allows access from outside the cluster in local development +- **Port mapping**: 80 (service) → 5000 (container) +- **NodePort: 30080** — fixed port for consistent access + +--- + +## Task 4 — Deployment Evidence + +### Deploy Commands + +```bash +# Apply manifests +kubectl apply -f k8s/deployment.yml +kubectl apply -f k8s/service.yml + +# Verify deployment +kubectl get all + +# Expected output: +$ kubectl get pods,svc +NAME READY STATUS RESTARTS AGE +pod/devops-info-service-xxxxx-abc12 1/1 Running 0 30s +pod/devops-info-service-xxxxx-def34 1/1 Running 0 30s +pod/devops-info-service-xxxxx-ghi56 1/1 Running 0 30s + +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +service/devops-info-service NodePort 10.96.xxx.xxx 80:30080/TCP 30s +service/kubernetes ClusterIP 10.96.0.1 443/TCP 5m + +$ kubectl describe deployment devops-info-service +Name: 
devops-info-service +Namespace: default +Selector: app=devops-info-service +Replicas: 3 desired | 3 updated | 3 total | 3 available | 0 unavailable +StrategyType: RollingUpdate +RollingUpdateStrategy: 0 max unavailable, 1 max surge +``` + +### Access the App + +```bash +# Using minikube +minikube service devops-info-service --url +# http://192.168.49.2:30080 + +# Test endpoint +curl http://$(minikube ip):30080/ +curl http://$(minikube ip):30080/health +``` + +--- + +## Task 5 — Operations Performed + +### Scaling Demonstration + +```bash +# Scale to 5 replicas (declarative — edit deployment.yml replicas: 5) +kubectl apply -f k8s/deployment.yml + +# Or imperative scaling +kubectl scale deployment/devops-info-service --replicas=5 + +# Watch scaling +$ kubectl get pods -w +NAME READY STATUS RESTARTS AGE +devops-info-service-xxxxx-abc12 1/1 Running 0 2m +devops-info-service-xxxxx-def34 1/1 Running 0 2m +devops-info-service-xxxxx-ghi56 1/1 Running 0 2m +devops-info-service-xxxxx-jkl78 0/1 Pending 0 1s +devops-info-service-xxxxx-mno90 0/1 Pending 0 1s +devops-info-service-xxxxx-jkl78 1/1 Running 0 5s +devops-info-service-xxxxx-mno90 1/1 Running 0 5s + +$ kubectl rollout status deployment/devops-info-service +deployment "devops-info-service" successfully rolled out +``` + +### Rolling Update + +```bash +# Update image tag (e.g., change to a new version) +kubectl set image deployment/devops-info-service devops-info-service=th1ef/devops-info-service:v2 + +# Watch rollout +$ kubectl rollout status deployment/devops-info-service +Waiting for deployment "devops-info-service" rollout to finish: 1 out of 3 new replicas have been updated... +Waiting for deployment "devops-info-service" rollout to finish: 2 out of 3 new replicas have been updated... 
+deployment "devops-info-service" successfully rolled out + +# Verify zero downtime (in separate terminal) +while true; do curl -s http://$(minikube ip):30080/health && echo; sleep 1; done +``` + +### Rollback + +```bash +# View rollout history +$ kubectl rollout history deployment/devops-info-service +REVISION CHANGE-CAUSE +1 +2 + +# Rollback to previous version +kubectl rollout undo deployment/devops-info-service + +# Verify rollback +$ kubectl rollout status deployment/devops-info-service +deployment "devops-info-service" successfully rolled out +``` + +--- + +## Task 6 — Production Considerations + +### Health Checks + +| Probe | Path | Purpose | +|-------|------|---------| +| Liveness | `/health` | Restarts container if app becomes unresponsive (deadlock, crash) | +| Readiness | `/health` | Removes pod from Service endpoints during startup or temporary issues | + +- **Liveness** has higher `initialDelaySeconds` (10s) to allow full startup +- **Readiness** starts checking earlier (5s) with shorter intervals (3s) for faster traffic routing + +### Resource Limits Rationale + +- **Requests** guarantee minimum resources for scheduling +- **Limits** prevent runaway containers from affecting other workloads +- 2:1 limit-to-request ratio provides burst headroom without over-provisioning + +### Production Improvements + +1. **Horizontal Pod Autoscaler (HPA)** — auto-scale based on CPU/memory metrics +2. **Pod Disruption Budgets (PDB)** — ensure minimum availability during maintenance +3. **Network Policies** — restrict pod-to-pod communication +4. **Secrets management** — use Kubernetes Secrets or Vault for sensitive data +5. **Pod Anti-Affinity** — spread replicas across nodes for fault tolerance +6. 
**Image pinning** — use specific image digests instead of `latest` tag + +### Monitoring & Observability + +- Application exposes `/metrics` endpoint for Prometheus scraping +- Kubernetes events and pod logs via `kubectl logs` +- Integration with existing PLG stack (Prometheus, Loki, Grafana) from Lab 7 + +--- + +## Bonus — Ingress with TLS + +### Multi-App Deployment + +Second application (`devops-info-service-v2`) deployed with different environment variable `APP_VERSION=2.0` to simulate a separate service. + +### Setup Ingress Controller + +```bash +# Enable Ingress addon in minikube +minikube addons enable ingress + +# Verify Ingress controller is running +$ kubectl get pods -n ingress-nginx +NAME READY STATUS RESTARTS AGE +ingress-nginx-controller-xxxxx-yyyyy 1/1 Running 0 30s +``` + +### Generate TLS Certificate + +```bash +# Generate self-signed certificate +openssl req -x509 -nodes -days 365 -newkey rsa:2048 \ + -keyout tls.key -out tls.crt \ + -subj "/CN=local.example.com/O=local.example.com" + +# Create TLS Secret in Kubernetes +kubectl create secret tls tls-secret \ + --key tls.key \ + --cert tls.crt +``` + +### Deploy Resources + +```bash +# Deploy both apps +kubectl apply -f k8s/deployment.yml +kubectl apply -f k8s/deployment-app2.yml +kubectl apply -f k8s/service.yml +kubectl apply -f k8s/service-app2.yml +kubectl apply -f k8s/ingress.yml + +# Add to /etc/hosts (Linux/Mac) or C:\Windows\System32\drivers\etc\hosts (Windows) +echo "$(minikube ip) local.example.com" | sudo tee -a /etc/hosts +``` + +### Verify Routing + +```bash +# Test HTTP routing +$ curl http://local.example.com/app1 +# → Routes to devops-info-service + +$ curl http://local.example.com/app2 +# → Routes to devops-info-service-v2 + +# Test HTTPS +$ curl -k https://local.example.com/app1 +# → Routes to devops-info-service over TLS + +$ curl -k https://local.example.com/app2 +# → Routes to devops-info-service-v2 over TLS + +# View all resources +$ kubectl get all,ingress +``` + +### 
Ingress Benefits over NodePort + +| Feature | NodePort | Ingress | +|---------|----------|---------| +| Routing | One service per port | Path/host-based routing | +| TLS | Per-service configuration | Centralized TLS termination | +| Port range | Limited (30000-32767) | Standard ports (80/443) | +| Load balancing | L4 (TCP) | L7 (HTTP/HTTPS) | +| Production use | Not recommended | Industry standard | + +--- + +## Challenges & Solutions + +### Challenge 1: Health Check Configuration + +**Issue:** Choosing appropriate `initialDelaySeconds` and `periodSeconds` values. + +**Solution:** Set liveness probe delay to 10s (allowing Python app to fully start) and readiness to 5s (earlier traffic routing). Used `/health` endpoint already implemented in the application. + +### Challenge 2: Resource Sizing + +**Issue:** Determining appropriate CPU and memory values for a lightweight Flask app. + +**Solution:** Profiled the container locally with `docker stats`, observed ~50Mi memory and minimal CPU. Set requests at 128Mi/100m with 2x limits for burst capacity. + +### Challenge 3: Rolling Update Zero Downtime + +**Issue:** Ensuring no requests are dropped during deployment updates. + +**Solution:** Configured `maxUnavailable: 0` to always maintain full capacity, and readiness probes to prevent traffic to unready pods. Combined with the Service's label selector, this ensures seamless transitions. + +### Key Learnings + +1. Kubernetes declarative model — define desired state, controllers reconcile +2. Labels and selectors are the fundamental linking mechanism between resources +3. Health probes are essential for self-healing and traffic management +4. Resource requests affect scheduling, limits affect runtime enforcement +5. 
Rolling updates with proper strategy enable zero-downtime deployments diff --git a/k8s/deployment-app2.yml b/k8s/deployment-app2.yml new file mode 100644 index 0000000000..914f3182e0 --- /dev/null +++ b/k8s/deployment-app2.yml @@ -0,0 +1,59 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: devops-info-service-v2 + labels: + app: devops-info-service-v2 + environment: production +spec: + replicas: 2 + selector: + matchLabels: + app: devops-info-service-v2 + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + app: devops-info-service-v2 + spec: + containers: + - name: devops-info-service-v2 + image: th1ef/devops-info-service:latest + ports: + - containerPort: 5000 + protocol: TCP + env: + - name: HOST + value: "0.0.0.0" + - name: PORT + value: "5000" + - name: APP_VERSION + value: "2.0" + resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "256Mi" + cpu: "200m" + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 5 + periodSeconds: 3 + timeoutSeconds: 2 + failureThreshold: 3 diff --git a/k8s/deployment.yml b/k8s/deployment.yml new file mode 100644 index 0000000000..09d9cea82a --- /dev/null +++ b/k8s/deployment.yml @@ -0,0 +1,57 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: devops-info-service + labels: + app: devops-info-service + environment: production +spec: + replicas: 3 + selector: + matchLabels: + app: devops-info-service + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + app: devops-info-service + spec: + containers: + - name: devops-info-service + image: th1ef/devops-info-service:latest + ports: + - containerPort: 5000 + protocol: TCP + env: + - name: HOST + value: "0.0.0.0" + - name: PORT + value: 
"5000" + resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "256Mi" + cpu: "200m" + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 5 + periodSeconds: 3 + timeoutSeconds: 2 + failureThreshold: 3 diff --git a/k8s/ingress.yml b/k8s/ingress.yml new file mode 100644 index 0000000000..044b098c0b --- /dev/null +++ b/k8s/ingress.yml @@ -0,0 +1,30 @@ +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: apps-ingress + annotations: + nginx.ingress.kubernetes.io/rewrite-target: / +spec: + tls: + - hosts: + - local.example.com + secretName: tls-secret + rules: + - host: local.example.com + http: + paths: + - path: /app1 + pathType: Prefix + backend: + service: + name: devops-info-service + port: + number: 80 + - path: /app2 + pathType: Prefix + backend: + service: + name: devops-info-service-v2 + port: + number: 80 diff --git a/k8s/service-app2.yml b/k8s/service-app2.yml new file mode 100644 index 0000000000..3a43f35a07 --- /dev/null +++ b/k8s/service-app2.yml @@ -0,0 +1,15 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: devops-info-service-v2 + labels: + app: devops-info-service-v2 +spec: + type: ClusterIP + selector: + app: devops-info-service-v2 + ports: + - protocol: TCP + port: 80 + targetPort: 5000 diff --git a/k8s/service.yml b/k8s/service.yml new file mode 100644 index 0000000000..99c13b559a --- /dev/null +++ b/k8s/service.yml @@ -0,0 +1,16 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: devops-info-service + labels: + app: devops-info-service +spec: + type: NodePort + selector: + app: devops-info-service + ports: + - protocol: TCP + port: 80 + targetPort: 5000 + nodePort: 30080 diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml new file mode 100644 index 0000000000..20e79d56d2 --- /dev/null +++ 
b/monitoring/docker-compose.yml @@ -0,0 +1,175 @@ +services: + loki: + image: grafana/loki:3.0.0 + command: -config.file=/etc/loki/config.yml + ports: + - "3100:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + networks: + - logging + labels: + logging: "promtail" + app: "loki" + healthcheck: + test: + [ + "CMD-SHELL", + "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1", + ] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.5" + memory: 512M + + promtail: + image: grafana/promtail:3.0.0 + command: -config.file=/etc/promtail/config.yml + ports: + - "9080:9080" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + networks: + - logging + labels: + logging: "promtail" + app: "promtail" + depends_on: + loki: + condition: service_healthy + deploy: + resources: + limits: + cpus: "0.5" + memory: 512M + reservations: + cpus: "0.25" + memory: 256M + + grafana: + image: grafana/grafana:12.3.1 + ports: + - "3000:3000" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/provisioning/dashboards/app-logs.json:/var/lib/grafana/dashboards/app-logs.json:ro + - ./grafana/provisioning/dashboards/app-metrics.json:/var/lib/grafana/dashboards/app-metrics.json:ro + environment: + - GF_AUTH_ANONYMOUS_ENABLED=false + - GF_SECURITY_ADMIN_PASSWORD=${GF_ADMIN_PASSWORD} + - GF_SECURITY_ALLOW_EMBEDDING=true + networks: + - logging + labels: + logging: "promtail" + app: "grafana" + depends_on: + loki: + condition: service_healthy + healthcheck: + test: + [ + "CMD-SHELL", + "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1", + ] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + deploy: + resources: + limits: + 
cpus: "0.5" + memory: 512M + reservations: + cpus: "0.25" + memory: 256M + + prometheus: + image: prom/prometheus:v3.9.0 + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.retention.time=15d" + - "--storage.tsdb.retention.size=10GB" + ports: + - "9090:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + networks: + - logging + labels: + logging: "promtail" + app: "prometheus" + depends_on: + app-python: + condition: service_healthy + healthcheck: + test: + [ + "CMD-SHELL", + "wget --no-verbose --tries=1 --spider http://localhost:9090/-/healthy || exit 1", + ] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.5" + memory: 512M + + app-python: + build: + context: ../app_python + dockerfile: Dockerfile + ports: + - "8000:5000" + networks: + - logging + labels: + logging: "promtail" + app: "devops-python" + healthcheck: + test: + [ + "CMD-SHELL", + "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:5000/health')\" || exit 1", + ] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + deploy: + resources: + limits: + cpus: "0.5" + memory: 256M + reservations: + cpus: "0.25" + memory: 128M + +networks: + logging: + driver: bridge + +volumes: + loki-data: + grafana-data: + prometheus-data: diff --git a/monitoring/docs/LAB07.md b/monitoring/docs/LAB07.md new file mode 100644 index 0000000000..4ae8bdd32d --- /dev/null +++ b/monitoring/docs/LAB07.md @@ -0,0 +1,216 @@ +# Lab 7 — Observability & Logging with Loki Stack + +## Setup Guide + +### Prerequisites +- Docker Engine 24+ +- Docker Compose v2 + +### Deployment + +```bash +cd monitoring +docker compose up -d +docker compose ps +``` + +### Verify services + +```bash +# Loki readiness +curl http://localhost:3100/ready + +# Grafana health +curl http://localhost:3000/api/health +``` + +Access 
Grafana at http://localhost:3000 (login: `admin` / password from `.env` file). + +Loki datasource is **auto-provisioned** — no manual setup required. + +## Configuration + +### Loki (`loki/config.yml`) + +Key design choices: +- **TSDB store** (not boltdb-shipper) — up to 10x faster queries in Loki 3.0 +- **Schema v13** — latest schema version with structured metadata support +- **Filesystem storage** — suitable for single-instance deployment +- **7-day retention** (`168h`) with compactor for automatic cleanup +- **In-memory ring** — no external KV store needed for single instance + +```yaml +schema_config: + configs: + - from: "2024-01-01" + store: tsdb + object_store: filesystem + schema: v13 +``` + +### Promtail (`promtail/config.yml`) + +- **Docker service discovery** via `/var/run/docker.sock` +- **Label filtering** — only scrapes containers with `logging=promtail` label +- **Relabeling** — extracts container name (strips leading `/`) and `app` label for easy querying + +```yaml +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + filters: + - name: label + values: ["logging=promtail"] +``` + +### Grafana + +- Datasource provisioned automatically via `grafana/provisioning/datasources/loki.yml` +- Anonymous access disabled — requires admin login +- Admin password stored in `.env` (not committed to git) + +![img](./screenshots/img_2.png) +![img](./screenshots/img_3.png) +![img](./screenshots/img_4.png) + +## Application Logging + +The Python app uses `python-json-logger` for structured JSON output: + +```python +from pythonjsonlogger.json import JsonFormatter + +LOGGING_CONFIG = { + "formatters": { + "json": { + "()": JsonFormatter, + "format": "%(asctime)s %(levelname)s %(name)s %(message)s", + } + } +} +``` + +HTTP middleware logs every request with context: + +```json +{ + "timestamp": "2026-03-12 00:15:00,123", + "level": "INFO", + "name": "app", + "message": "HTTP request", + "method": "GET", + "path": "/", + 
"status_code": 200, + "client_ip": "172.18.0.1", + "duration_ms": 1.23 +} +``` + +## Dashboard + +Four panels built in Grafana: + +![panels](./screenshots/img_9.png) + +### 1. Logs Table +Shows recent logs from all apps. +```logql +{app=~"devops-.*"} +``` +![Logs](./screenshots/img.png) + + + +### 2. Request Rate +Logs per second by application (time series). +```logql +sum by (app) (rate({app=~"devops-.*"} [1m])) +``` + +![Logs](./screenshots/img_1.png) + +### 3. Error Logs +Only ERROR level entries. +```logql +{app=~"devops-.*"} | json | level="ERROR" +``` + +![Logs](./screenshots/img_5.png) + +### 4. Log Level Distribution +Count of logs by level (stat/pie chart). +```logql +sum by (level) (count_over_time({app=~"devops-.*"} | json [5m])) +``` + +![Logs](./screenshots/img_8.png) + +## Production Config + +### Security +- Grafana anonymous access disabled (`GF_AUTH_ANONYMOUS_ENABLED=false`) +- Admin password via `.env` file (excluded from git via `.gitignore`) +- Promtail Docker socket access is read-only (`:ro`) + +![Security](./screenshots/img_7.png) + +### Resource Limits +All services have CPU and memory limits: + +| Service | CPU Limit | Memory Limit | CPU Reserve | Memory Reserve | +|-----------|-----------|-------------|-------------|----------------| +| Loki | 1.0 | 1G | 0.5 | 512M | +| Promtail | 0.5 | 512M | 0.25 | 256M | +| Grafana | 1.0 | 1G | 0.5 | 512M | +| app-python| 0.5 | 256M | 0.25 | 128M | + +### Health Checks +- **Loki:** `wget http://localhost:3100/ready` every 10s +- **Grafana:** `wget http://localhost:3000/api/health` every 10s +- Promtail depends on Loki health via `service_healthy` condition + +### Retention +- Loki retention: 7 days (168h) +- Compactor runs automatically to delete expired data + +## Testing + +```bash +# Deploy the stack +cd monitoring +docker compose up -d + +# Check all services +docker compose ps + +# Verify Loki +curl http://localhost:3100/ready + +# Generate traffic +for i in $(seq 1 20); do curl 
http://localhost:8000/; done +for i in $(seq 1 20); do curl http://localhost:8000/health; done + +# Query logs via API +curl -G http://localhost:3100/loki/api/v1/query \ + --data-urlencode 'query={app="devops-python"}' +``` + +## Ansible Automation (Bonus) + +Monitoring stack can be deployed via Ansible: + +```bash +ansible-playbook ansible/playbooks/deploy-monitoring.yml +``` + +Role `monitoring` creates directories, templates configs (Jinja2), deploys with `community.docker.docker_compose_v2`, and waits for service readiness. + +All versions, ports, retention, and resource limits are parameterized in `roles/monitoring/defaults/main.yml`. + +## Challenges + +1. **Loki 3.0 config changes** — TSDB is the recommended store replacing boltdb-shipper. Schema v13 required for structured metadata support +2. **Promtail Docker SD filtering** — using Docker label filters to avoid scraping unrelated containers +3. **JSON logging in FastAPI** — `python-json-logger` v3 uses `pythonjsonlogger.json.JsonFormatter` import path (changed from v2) +4. **Grafana provisioning** — auto-provisioning datasource avoids manual setup steps diff --git a/monitoring/docs/LAB08.md b/monitoring/docs/LAB08.md new file mode 100644 index 0000000000..46519a32b4 --- /dev/null +++ b/monitoring/docs/LAB08.md @@ -0,0 +1,247 @@ +# Lab 8 — Metrics & Monitoring with Prometheus + +## 1. 
Architecture + +``` +┌──────────────┐ scrape /metrics ┌──────────────┐ query ┌──────────────┐ +│ app-python │◄────────────────────────│ Prometheus │◄──────────────│ Grafana │ +│ (FastAPI) │ :5000 │ (TSDB) │ PromQL │ (Dashboards) │ +│ :5000/8000 │ │ :9090 │ │ :3000 │ +└──────────────┘ └──────────────┘ └──────────────┘ + │ │ │ + │ /metrics endpoint │ self-scrape │ + │ - http_requests_total │ + loki:3100/metrics │ + │ - http_request_duration_seconds │ + grafana:3000/metrics │ + │ - http_requests_in_progress │ │ + │ - devops_info_endpoint_calls │ │ + │ - devops_info_system_collection_s │ │ + │ │ │ + └──── logs ────► Loki ◄──── Promtail │ │ + :3100 (docker) │ │ + │ │ │ + └──────────────────────┴──────────────────────────────┘ + Grafana datasources: Loki + Prometheus +``` + +**Data flow:** +1. Application exposes metrics at `/metrics` in Prometheus exposition format +2. Prometheus scrapes targets every 15s and stores time-series data in TSDB +3. Grafana queries Prometheus via PromQL and renders dashboards + +## 2. 
Application Instrumentation + +### Metrics Added + +| Metric | Type | Labels | Purpose | +|--------|------|--------|---------| +| `http_requests_total` | Counter | method, endpoint, status | Total HTTP request count (RED: Rate) | +| `http_request_duration_seconds` | Histogram | method, endpoint | Request latency distribution (RED: Duration) | +| `http_requests_in_progress` | Gauge | — | Concurrent requests being processed | +| `devops_info_endpoint_calls` | Counter | endpoint | Business-level endpoint usage tracking | +| `devops_info_system_collection_seconds` | Histogram | — | Time to collect system info | + +### Why These Metrics + +- **RED Method** covered: Rate (`http_requests_total`), Errors (status=~"5.."), Duration (`http_request_duration_seconds`) +- **Counter** for cumulative events (requests, endpoint calls) — monotonically increasing +- **Histogram** for latency distributions — enables percentile calculations (p50, p95, p99) +- **Gauge** for current state (in-progress requests) — can go up and down + +### Implementation + +Metrics are defined in `app_python/metrics.py` and collected via FastAPI middleware in `app_python/app.py`: +- `@app.middleware("http")` intercepts all requests (except `/metrics` itself) +- Before request: increment `http_requests_in_progress` +- After request: record counter with labels, observe histogram duration, decrement gauge +- Business metrics tracked in individual services (`routes/root/service.py`, `routes/health_check/service.py`) + +## 3. 
Prometheus Configuration + +**File:** `monitoring/prometheus/prometheus.yml` + +### Scrape Targets + +| Job | Target | Metrics Path | Purpose | +|-----|--------|-------------|---------| +| `prometheus` | `localhost:9090` | `/metrics` | Self-monitoring | +| `app` | `app-python:5000` | `/metrics` | Application metrics | +| `loki` | `loki:3100` | `/metrics` | Log aggregator metrics | +| `grafana` | `grafana:3000` | `/metrics` | Dashboard tool metrics | + +### Configuration + +- **Scrape interval:** 15s (balance between granularity and load) +- **Evaluation interval:** 15s +- **Retention time:** 15 days +- **Retention size:** 10GB +- Docker service names used as hostnames (internal Docker DNS) + +## 4. Dashboard Walkthrough + +Dashboard: **Application Metrics** (`app-metrics`) + +### Panels + +1. **Request Rate by Endpoint** (Time Series) + - Query: `sum(rate(http_requests_total[5m])) by (endpoint)` + - Shows: requests/sec per endpoint over time + - Unit: req/s + +2. **Error Rate (5xx)** (Time Series) + - Query: `sum(rate(http_requests_total{status=~"5.."}[5m]))` + - Shows: server error rate + - Color: red for visibility + +3. **Request Duration p95** (Time Series) + - Query: `histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, endpoint))` + - Shows: 95th percentile latency per endpoint + +4. **Request Duration Heatmap** (Heatmap) + - Query: `sum(increase(http_request_duration_seconds_bucket[5m])) by (le)` + - Shows: latency distribution over time + +5. **Active Requests** (Time Series) + - Query: `http_requests_in_progress` + - Shows: concurrent in-flight requests + +6. **Status Code Distribution** (Pie Chart) + - Query: `sum by (status) (rate(http_requests_total[5m]))` + - Shows: proportion of 2xx vs 4xx vs 5xx + +7. **Service Uptime** (Stat) + - Query: `up{job="app"}` + - Shows: UP/DOWN status with color mapping + +## 5. PromQL Examples + +### 1. 
Request rate per endpoint (RED: Rate) +```promql +sum(rate(http_requests_total[5m])) by (endpoint) +``` +Per-second request rate averaged over 5 minutes, grouped by endpoint. + +### 2. Error rate percentage (RED: Errors) +```promql +sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 +``` +Percentage of requests resulting in 5xx server errors. + +### 3. 95th percentile latency (RED: Duration) +```promql +histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) +``` +95% of requests complete within this duration. + +### 4. Total requests by status code +```promql +sum by (status) (increase(http_requests_total[1h])) +``` +Total request count per status code over the last hour. + +### 5. Services that are down +```promql +up == 0 +``` +Lists all scrape targets that failed their last health check. + +### 6. Prometheus CPU usage +```promql +rate(process_cpu_seconds_total{job="prometheus"}[5m]) * 100 +``` +CPU utilization percentage of the Prometheus process. + +### 7. Endpoint call distribution +```promql +sum by (endpoint) (increase(devops_info_endpoint_calls_total[1h])) +``` +Business metric showing which endpoints are most popular. + +## 6. 
Production Setup + +### Health Checks + +| Service | Check | Interval | Retries | +|---------|-------|----------|---------| +| Loki | `wget http://localhost:3100/ready` | 10s | 5 | +| Grafana | `wget http://localhost:3000/api/health` | 10s | 5 | +| Prometheus | `wget http://localhost:9090/-/healthy` | 10s | 5 | +| app-python | `python urllib http://localhost:5000/health` | 10s | 5 | + +### Resource Limits + +| Service | CPU Limit | Memory Limit | CPU Reservation | Memory Reservation | +|---------|-----------|-------------|-----------------|-------------------| +| Prometheus | 1.0 | 1G | 0.5 | 512M | +| Loki | 1.0 | 1G | 0.5 | 512M | +| Grafana | 0.5 | 512M | 0.25 | 256M | +| app-python | 0.5 | 256M | 0.25 | 128M | + +### Retention Policies + +- **Prometheus TSDB:** 15 days or 10GB (whichever comes first) + - Configured via CLI flags: `--storage.tsdb.retention.time=15d --storage.tsdb.retention.size=10GB` +- **Loki:** 168h (7 days) with compaction + +### Persistent Volumes + +| Volume | Service | Mount Point | Purpose | +|--------|---------|-------------|---------| +| `prometheus-data` | Prometheus | `/prometheus` | TSDB time-series data | +| `loki-data` | Loki | `/loki` | Log chunks and index | +| `grafana-data` | Grafana | `/var/lib/grafana` | Dashboards, users, settings | + +## 7. Testing Results + +### Verification Steps + +1. **Deploy stack:** `cd monitoring && docker compose up -d` +2. **Check services:** `docker compose ps` — all services should be "healthy" +3. **Prometheus targets:** http://localhost:9090/targets — all targets UP +4. **Test metrics endpoint:** `curl http://localhost:8000/metrics` +5. **PromQL query:** Run `up` in Prometheus UI — shows all targets +6. 
**Grafana dashboard:** http://localhost:3000 → Application Metrics dashboard + +### Expected `/metrics` Output +``` +# HELP http_requests_total Total HTTP requests +# TYPE http_requests_total counter +http_requests_total{method="GET",endpoint="/",status="200"} 5.0 +http_requests_total{method="GET",endpoint="/health",status="200"} 10.0 + +# HELP http_request_duration_seconds HTTP request duration in seconds +# TYPE http_request_duration_seconds histogram +http_request_duration_seconds_bucket{le="0.005",method="GET",endpoint="/"} 3.0 +... + +# HELP http_requests_in_progress HTTP requests currently being processed +# TYPE http_requests_in_progress gauge +http_requests_in_progress 0.0 +``` + +## 8. Challenges & Solutions + +### Challenge 1: Metrics endpoint being tracked +**Problem:** The `/metrics` endpoint was being counted in `http_requests_total`, inflating request counts since Prometheus scrapes every 15s. +**Solution:** Added early return in middleware to skip `/metrics` path from instrumentation. + +### Challenge 2: FastAPI async middleware with gauge +**Problem:** Gauge for in-progress requests could become inconsistent if response processing fails. +**Solution:** Used try/finally block to ensure `http_requests_in_progress.dec()` always runs. + +### Challenge 3: Docker internal ports vs exposed ports +**Problem:** Prometheus needs to reach app on internal Docker network port (5000), not the host-mapped port (8000). +**Solution:** Used container-internal port `app-python:5000` in prometheus.yml scrape config. + +## 9. 
Metrics vs Logs — When to Use Each + +| Aspect | Metrics (Prometheus) | Logs (Loki) | +|--------|---------------------|-------------| +| **What** | Numeric aggregates (counts, durations) | Event details (text, structured data) | +| **When** | Dashboards, alerting, trends | Debugging, audit, root cause analysis | +| **Cardinality** | Low (labels) | High (individual events) | +| **Storage** | Efficient (numeric time-series) | Heavy (full text) | +| **Query** | PromQL (aggregation-oriented) | LogQL (search-oriented) | +| **Example** | "500 errors spiked to 10/s" | "Error in /api/users: DB connection refused" | + +**Use metrics** to detect problems (alerting on error rate spike). +**Use logs** to diagnose problems (finding the specific error message). diff --git a/monitoring/docs/screenshots/img.png b/monitoring/docs/screenshots/img.png new file mode 100644 index 0000000000..fdcc0004ff Binary files /dev/null and b/monitoring/docs/screenshots/img.png differ diff --git a/monitoring/docs/screenshots/img_1.png b/monitoring/docs/screenshots/img_1.png new file mode 100644 index 0000000000..176f0b440d Binary files /dev/null and b/monitoring/docs/screenshots/img_1.png differ diff --git a/monitoring/docs/screenshots/img_2.png b/monitoring/docs/screenshots/img_2.png new file mode 100644 index 0000000000..f9a3e0704a Binary files /dev/null and b/monitoring/docs/screenshots/img_2.png differ diff --git a/monitoring/docs/screenshots/img_3.png b/monitoring/docs/screenshots/img_3.png new file mode 100644 index 0000000000..d2ab4c9237 Binary files /dev/null and b/monitoring/docs/screenshots/img_3.png differ diff --git a/monitoring/docs/screenshots/img_4.png b/monitoring/docs/screenshots/img_4.png new file mode 100644 index 0000000000..9789e8c830 Binary files /dev/null and b/monitoring/docs/screenshots/img_4.png differ diff --git a/monitoring/docs/screenshots/img_5.png b/monitoring/docs/screenshots/img_5.png new file mode 100644 index 0000000000..090a19731d Binary files /dev/null and 
b/monitoring/docs/screenshots/img_5.png differ diff --git a/monitoring/docs/screenshots/img_6.png b/monitoring/docs/screenshots/img_6.png new file mode 100644 index 0000000000..301be91187 Binary files /dev/null and b/monitoring/docs/screenshots/img_6.png differ diff --git a/monitoring/docs/screenshots/img_7.png b/monitoring/docs/screenshots/img_7.png new file mode 100644 index 0000000000..48c189d4ef Binary files /dev/null and b/monitoring/docs/screenshots/img_7.png differ diff --git a/monitoring/docs/screenshots/img_8.png b/monitoring/docs/screenshots/img_8.png new file mode 100644 index 0000000000..db86e262d8 Binary files /dev/null and b/monitoring/docs/screenshots/img_8.png differ diff --git a/monitoring/docs/screenshots/img_9.png b/monitoring/docs/screenshots/img_9.png new file mode 100644 index 0000000000..3abeedd5e7 Binary files /dev/null and b/monitoring/docs/screenshots/img_9.png differ diff --git a/monitoring/grafana/provisioning/dashboards/app-logs.json b/monitoring/grafana/provisioning/dashboards/app-logs.json new file mode 100644 index 0000000000..32cabbe199 --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/app-logs.json @@ -0,0 +1,131 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "title": "Logs Table", + "type": "logs", + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "{container=~\".+\"}", + "refId": "A" + } + ], + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "sortOrder": "Descending", + "dedupStrategy": "none" + } + }, + { + "title": "Request Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }, + "id": 2, + "datasource": { "type": "loki", "uid": 
"loki" }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "sum by (container) (rate({container=~\".+\"} [1m]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 20, + "pointSize": 5, + "showPoints": "auto", + "lineWidth": 2 + }, + "unit": "reqps" + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "multi" } + } + }, + { + "title": "Error Logs", + "type": "logs", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }, + "id": 3, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "{container=~\".+\"} | json | level=\"ERROR\" or level=\"error\"", + "refId": "A" + } + ], + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "sortOrder": "Descending", + "dedupStrategy": "none" + } + }, + { + "title": "Log Level Distribution", + "type": "piechart", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }, + "id": 4, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "sum by (level) (count_over_time({container=~\".+\"} | json [5m]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" } + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "list", "placement": "right" }, + "tooltip": { "mode": "single" }, + "pieType": "pie", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + } + } + } + ], + "schemaVersion": 39, + "tags": ["loki", "logs"], + "templating": { "list": [] }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + 
"title": "App Logs", + "uid": "app-logs", + "version": 1 +} diff --git a/monitoring/grafana/provisioning/dashboards/app-metrics.json b/monitoring/grafana/provisioning/dashboards/app-metrics.json new file mode 100644 index 0000000000..dbdd63afeb --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/app-metrics.json @@ -0,0 +1,191 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "title": "Request Rate by Endpoint", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_requests_total[5m])) by (endpoint)", + "legendFormat": "{{endpoint}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 20, + "pointSize": 5, + "showPoints": "auto" + } + }, + "overrides": [] + } + }, + { + "title": "Error Rate (5xx)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m]))", + "legendFormat": "5xx errors/s", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 20 + }, + "color": { "mode": "fixed", "fixedColor": "red" } + }, + "overrides": [] + } + }, + { + "title": "Request Duration p95", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, endpoint))", + "legendFormat": "p95 {{endpoint}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "drawStyle": "line", + 
"lineWidth": 2, + "fillOpacity": 10 + } + }, + "overrides": [] + } + }, + { + "title": "Request Duration Heatmap", + "type": "heatmap", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(increase(http_request_duration_seconds_bucket[5m])) by (le)", + "legendFormat": "{{le}}", + "refId": "A", + "format": "heatmap" + } + ], + "options": { + "calculate": false, + "yAxis": { + "unit": "s" + }, + "color": { + "scheme": "Oranges" + } + } + }, + { + "title": "Active Requests", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 16 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "http_requests_in_progress", + "legendFormat": "in-progress", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 30 + }, + "color": { "mode": "fixed", "fixedColor": "blue" } + }, + "overrides": [] + } + }, + { + "title": "Status Code Distribution", + "type": "piechart", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 16 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum by (status) (rate(http_requests_total[5m]))", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "pieType": "pie" + } + }, + { + "title": "Service Uptime", + "type": "stat", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 16 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "up{job=\"app\"}", + "legendFormat": "app-python", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "mappings": [ + { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" }, "1": { "text": "UP", "color": "green" } } } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null 
}, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["app", "metrics", "RED"], + "templating": { "list": [] }, + "time": { "from": "now-30m", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Application Metrics", + "uid": "app-metrics", + "version": 1 +} diff --git a/monitoring/grafana/provisioning/dashboards/dashboards.yml b/monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000000..aa5f7c1db3 --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: default + orgId: 1 + folder: "" + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/monitoring/grafana/provisioning/datasources/loki.yml b/monitoring/grafana/provisioning/datasources/loki.yml new file mode 100644 index 0000000000..70b7e4247c --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/loki.yml @@ -0,0 +1,18 @@ +apiVersion: 1 + +datasources: + - name: Loki + type: loki + access: proxy + url: http://loki:3100 + uid: loki + isDefault: false + editable: true + + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + uid: prometheus + isDefault: true + editable: true diff --git a/monitoring/loki/config.yml b/monitoring/loki/config.yml new file mode 100644 index 0000000000..f71cc6a60f --- /dev/null +++ b/monitoring/loki/config.yml @@ -0,0 +1,38 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +common: + ring: + instance_addr: 127.0.0.1 + kvstore: + store: inmemory + replication_factor: 1 + path_prefix: /loki + +schema_config: + configs: + - from: "2024-01-01" + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + tsdb_shipper: + active_index_directory: /loki/index + cache_location: /loki/index_cache + 
filesystem: + directory: /loki/chunks + +limits_config: + retention_period: 168h + allow_structured_metadata: true + +compactor: + working_directory: /loki/compactor + retention_enabled: true + delete_request_store: filesystem diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000000..7e143caacf --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,28 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +# NOTE: TSDB retention is configured via CLI flags, not in this file: +# --storage.tsdb.retention.time=15d +# --storage.tsdb.retention.size=10GB +# (retention_* keys under `storage:` are rejected by Prometheus' strict config parser) + +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + + - job_name: "app" + static_configs: + - targets: ["app-python:5000"] + metrics_path: "/metrics" + + - job_name: "loki" + static_configs: + - targets: ["loki:3100"] + metrics_path: "/metrics" + + - job_name: "grafana" + static_configs: + - targets: ["grafana:3000"] + metrics_path: "/metrics" diff --git a/monitoring/promtail/config.yml b/monitoring/promtail/config.yml new file mode 100644 index 0000000000..d45d55080a --- /dev/null +++ b/monitoring/promtail/config.yml @@ -0,0 +1,23 @@ +server: + http_listen_port: 9080 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: label + values: ["logging=promtail"] + relabel_configs: + - source_labels: ["__meta_docker_container_name"] + regex: "/(.*)" + target_label: "container" + - source_labels: ["__meta_docker_container_label_app"] + target_label: "app" diff --git a/pulumi/Pulumi.dev.yaml b/pulumi/Pulumi.dev.yaml new file mode 100644 index 0000000000..197dd9c9f6 --- /dev/null +++ b/pulumi/Pulumi.dev.yaml @@ -0,0 +1,5 @@ +config: + yc-infra:cloudId: b1glhaar472redp2m3to + yc-infra:folderId: b1g1uj21m7e1md6p212p + yc-infra:zone: ru-central1-a + yc-infra:myIp: 188.130.155.177/32 diff
--git a/pulumi/Pulumi.yaml b/pulumi/Pulumi.yaml new file mode 100644 index 0000000000..69d3b70134 --- /dev/null +++ b/pulumi/Pulumi.yaml @@ -0,0 +1,4 @@ +name: yc-infra +runtime: + name: python +description: Yandex Cloud infrastructure via Pulumi diff --git a/pulumi/main.py b/pulumi/main.py new file mode 100644 index 0000000000..2136c63ce1 --- /dev/null +++ b/pulumi/main.py @@ -0,0 +1,84 @@ +import pulumi +import pulumi_yandex as yc +from pulumi import Config + +config = Config() + +cloud_id = config.require("cloudId") +folder_id = config.require("folderId") +zone = config.require("zone") +my_ip = config.require("myIp") + +provider = yc.Provider( + "yc-provider", + cloud_id=cloud_id, + folder_id=folder_id, + zone=zone, + service_account_key_file="authorized_key.json", +) + +network = yc.VpcNetwork( + "net", name="net", opts=pulumi.ResourceOptions(provider=provider) +) + +subnet = yc.VpcSubnet( + "subnet", + name="subnet", + zone=zone, + network_id=network.id, + v4_cidr_blocks=["10.0.0.0/24"], + opts=pulumi.ResourceOptions(provider=provider), +) + +# Security Group +security_group = yc.VpcSecurityGroup( + "sg", + network_id=network.id, + ingress=[ + yc.VpcSecurityGroupIngressArgs( + protocol="TCP", description="SSH", v4_cidr_blocks=[my_ip], port=22 + ), + yc.VpcSecurityGroupIngressArgs( + protocol="TCP", description="HTTP", v4_cidr_blocks=["0.0.0.0/0"], port=80 + ), + yc.VpcSecurityGroupIngressArgs( + protocol="TCP", + description="App 5000", + v4_cidr_blocks=["0.0.0.0/0"], + port=5000, + ), + ], + egress=[ + yc.VpcSecurityGroupEgressArgs(protocol="ANY", v4_cidr_blocks=["0.0.0.0/0"]) + ], + opts=pulumi.ResourceOptions(provider=provider), +) + +# Image +image = yc.get_compute_image(family="ubuntu-2204-lts") + +# VM +vm = yc.ComputeInstance( + "vm", + name="pulumi-vm", + platform_id="standard-v2", + zone=zone, + resources=yc.ComputeInstanceResourcesArgs(cores=2, memory=2), + boot_disk=yc.ComputeInstanceBootDiskArgs( + 
initialize_params=yc.ComputeInstanceBootDiskInitializeParamsArgs( + image_id=image.id + ) + ), + network_interfaces=[ + yc.ComputeInstanceNetworkInterfaceArgs( + subnet_id=subnet.id, nat=True, security_group_ids=[security_group.id] + ) + ], + metadata={ + "ssh-keys": f"ubuntu:{open(config.get('sshPublicKeyPath') or 'C:/Users/kve10/.ssh/id_ed25519.pub').read()}" + }, + opts=pulumi.ResourceOptions(provider=provider), +) + +pulumi.export("public_ip", vm.network_interfaces.apply(lambda ni: ni[0].nat_ip_address)) +pulumi.export("internal_ip", vm.network_interfaces.apply(lambda ni: ni[0].ip_address)) diff --git a/pulumi/requirements.txt b/pulumi/requirements.txt new file mode 100644 index 0000000000..9dff44f6bc --- /dev/null +++ b/pulumi/requirements.txt @@ -0,0 +1,2 @@ +pulumi>=3.0.0 +pulumi-yandex>=0.187.0 \ No newline at end of file diff --git a/terraform/.gitignore b/terraform/.gitignore new file mode 100644 index 0000000000..122a91f7f4 --- /dev/null +++ b/terraform/.gitignore @@ -0,0 +1,5 @@ +.terraform/ +*.tfstate +*.tfstate.backup +authorized_key.json +terraform.tfvars \ No newline at end of file diff --git a/terraform/default.tf b/terraform/default.tf new file mode 100644 index 0000000000..c4c8a372ae --- /dev/null +++ b/terraform/default.tf @@ -0,0 +1,15 @@ +terraform { + required_providers { + yandex = { + source = "yandex-cloud/yandex" + version = "0.187.0" + } + } +} + +provider "yandex" { + service_account_key_file = "authorized_key.json" + cloud_id = var.cloud_id + folder_id = var.folder_id + zone = var.zone +} \ No newline at end of file diff --git a/terraform/main.tf b/terraform/main.tf new file mode 100644 index 0000000000..20f9e53a44 --- /dev/null +++ b/terraform/main.tf @@ -0,0 +1,90 @@ +data "yandex_compute_image" "ubuntu" { + family = "ubuntu-2204-lts" +} + +resource "yandex_vpc_network" "net-1" { + name = "net" +} + +resource "yandex_vpc_subnet" "subnet-1" { + name = "subnet" + zone = "ru-central1-a" + network_id = yandex_vpc_network.net-1.id + v4_cidr_blocks = ["10.0.0.0/24"] +} + + 
+resource "yandex_vpc_security_group" "sg1" { + network_id = yandex_vpc_network.net-1.id + + labels = { + my-label = "my-label-value" + } + + ingress { + protocol = "TCP" + description = "rule1 description" + v4_cidr_blocks = ["10.0.1.0/24", "10.0.2.0/24"] + port = 8080 + } + + ingress { + protocol = "TCP" + description = "SSH" + v4_cidr_blocks = ["${var.my_ip}/32"] + port = 22 + } + + ingress { + protocol = "TCP" + description = "HTTP" + v4_cidr_blocks = ["0.0.0.0/0"] + port = 80 + } + + ingress { + protocol = "TCP" + description = "App 5000" + v4_cidr_blocks = ["0.0.0.0/0"] + port = 5000 + } + + egress { + protocol = "ANY" + v4_cidr_blocks = ["0.0.0.0/0"] + } + +} + + +resource "yandex_compute_instance" "vm-1" { + name = "terraform1" + platform_id = var.instance_platform + zone = var.zone + + resources { + cores = var.instance_cores + memory = var.instance_memory + } + + boot_disk { + initialize_params { + image_id = data.yandex_compute_image.ubuntu.id + } + } + + network_interface { + nat = true + subnet_id = yandex_vpc_subnet.subnet-1.id + security_group_ids = [yandex_vpc_security_group.sg1.id] + } + + metadata = { + ssh-keys = "ubuntu:${file(var.ssh_public_key_path)}" + } + + labels = { + environment = "dev" + project = "terraform-lab" + } +} diff --git a/terraform/outputs.tf b/terraform/outputs.tf new file mode 100644 index 0000000000..7deea72e3a --- /dev/null +++ b/terraform/outputs.tf @@ -0,0 +1,7 @@ +output "public_ip" { + value = yandex_compute_instance.vm-1.network_interface[0].nat_ip_address +} + +output "internal_ip" { + value = yandex_compute_instance.vm-1.network_interface[0].ip_address +} diff --git a/terraform/variables.tf b/terraform/variables.tf new file mode 100644 index 0000000000..7506f29096 --- /dev/null +++ b/terraform/variables.tf @@ -0,0 +1,36 @@ +variable "cloud_id" { + type = string +} + +variable "folder_id" { + type = string +} + +variable "zone" { + type = string + default = "ru-central1-a" +} + +variable "instance_platform" { + type
= string + default = "standard-v2" +} + +variable "instance_cores" { + type = number + default = 2 +} + +variable "instance_memory" { + type = number + default = 2 +} + +variable "ssh_public_key_path" { + type = string + default = "~/.ssh/id_ed25519.pub" +} + +variable "my_ip" { + type = string +} \ No newline at end of file