From 5d1f509fa945a9bd1313c067c471bd092b205269 Mon Sep 17 00:00:00 2001 From: Cdeth567 <11kvvkvv11@mail.ru> Date: Tue, 24 Mar 2026 21:36:27 +0300 Subject: [PATCH] lab09 completed --- ansible/playbooks/deploy-monitoring.yml | 8 + ansible/roles/monitoring/defaults/main.yml | 39 ++ .../files/grafana-app-dashboard.json | 408 ++++++++++++++ .../files/grafana-dashboard-provider.yml | 12 + .../monitoring/files/grafana-datasources.yml | 20 + .../files/grafana-logs-dashboard.json | 278 ++++++++++ .../roles/monitoring/files/loki-config.yml | 59 ++ .../monitoring/files/promtail-config.yml | 28 + ansible/roles/monitoring/tasks/main.yml | 100 ++++ .../templates/docker-compose.yml.j2 | 184 +++++++ .../monitoring/templates/prometheus.yml.j2 | 16 + app_python/Dockerfile | 16 +- app_python/app.py | 153 +++++- app_python/requirements.txt | 3 +- app_python/tests/test_app.py | 143 ++--- k8s/README.md | 515 ++++++++++++++++++ k8s/deployment-app2.yml | 91 ++++ k8s/deployment-update.yml | 91 ++++ k8s/deployment.yml | 91 ++++ k8s/ingress.yml | 33 ++ k8s/namespace.yml | 7 + k8s/service-app2.yml | 20 + k8s/service.yml | 21 + 23 files changed, 2249 insertions(+), 87 deletions(-) create mode 100644 ansible/playbooks/deploy-monitoring.yml create mode 100644 ansible/roles/monitoring/defaults/main.yml create mode 100644 ansible/roles/monitoring/files/grafana-app-dashboard.json create mode 100644 ansible/roles/monitoring/files/grafana-dashboard-provider.yml create mode 100644 ansible/roles/monitoring/files/grafana-datasources.yml create mode 100644 ansible/roles/monitoring/files/grafana-logs-dashboard.json create mode 100644 ansible/roles/monitoring/files/loki-config.yml create mode 100644 ansible/roles/monitoring/files/promtail-config.yml create mode 100644 ansible/roles/monitoring/tasks/main.yml create mode 100644 ansible/roles/monitoring/templates/docker-compose.yml.j2 create mode 100644 ansible/roles/monitoring/templates/prometheus.yml.j2 create mode 100644 k8s/README.md create mode 100644 
k8s/deployment-app2.yml create mode 100644 k8s/deployment-update.yml create mode 100644 k8s/deployment.yml create mode 100644 k8s/ingress.yml create mode 100644 k8s/namespace.yml create mode 100644 k8s/service-app2.yml create mode 100644 k8s/service.yml diff --git a/ansible/playbooks/deploy-monitoring.yml b/ansible/playbooks/deploy-monitoring.yml new file mode 100644 index 0000000000..3dcad07cbf --- /dev/null +++ b/ansible/playbooks/deploy-monitoring.yml @@ -0,0 +1,8 @@ +--- +- name: Deploy monitoring stack with Prometheus, Loki and Grafana + hosts: webservers + become: true + + roles: + - docker + - monitoring diff --git a/ansible/roles/monitoring/defaults/main.yml b/ansible/roles/monitoring/defaults/main.yml new file mode 100644 index 0000000000..df29c2bc07 --- /dev/null +++ b/ansible/roles/monitoring/defaults/main.yml @@ -0,0 +1,39 @@ +--- +monitoring_dir: /opt/monitoring +monitoring_project_name: lab08-monitoring +monitoring_network_name: logging + +grafana_version: "12.3.1" +loki_version: "3.0.0" +promtail_version: "3.0.0" +prometheus_version: "3.9.0" + +monitoring_app_image: "{{ docker_image | default('devops-info-service') }}" +monitoring_app_tag: "{{ docker_image_tag | default('latest') }}" +monitoring_app_container_name: app-python +monitoring_app_port: 8000 +monitoring_app_health_endpoint: /health + +grafana_admin_user: admin +grafana_admin_password: admin +prometheus_port: 9090 +prometheus_retention_days: 15 +prometheus_retention_size: "10GB" +prometheus_scrape_interval: "15s" + +prometheus_targets: + - job: prometheus + targets: + - localhost:9090 + - job: loki + targets: + - loki:3100 + path: /metrics + - job: grafana + targets: + - grafana:3000 + path: /metrics + - job: app + targets: + - app-python:8000 + path: /metrics diff --git a/ansible/roles/monitoring/files/grafana-app-dashboard.json b/ansible/roles/monitoring/files/grafana-app-dashboard.json new file mode 100644 index 0000000000..10353c15ef --- /dev/null +++ 
b/ansible/roles/monitoring/files/grafana-app-dashboard.json @@ -0,0 +1,408 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (endpoint) (rate(http_requests_total[5m]))", + "legendFormat": "{{endpoint}}", + "range": true, + "refId": "A" + } + ], + "title": "Request Rate by Endpoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "reqps", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m]))", + "legendFormat": "5xx error rate", + "range": true, + "refId": "A" + } + ], + "title": "Error Rate (5xx)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": 
"s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le, endpoint) (rate(http_request_duration_seconds_bucket[5m])))", + "legendFormat": "{{endpoint}} p95", + "range": true, + "refId": "A" + } + ], + "title": "Request Duration p95", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "calculate": false, + "cellGap": 1, + "cellValues": {}, + "color": { + "mode": "scheme" + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "ops" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (le) (rate(http_request_duration_seconds_bucket[5m]))", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Request Duration Heatmap", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "min": 0 + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "targets": [ + { + "editorMode": "code", + "expr": 
"sum(http_requests_in_progress)", + "legendFormat": "in progress", + "range": true, + "refId": "A" + } + ], + "title": "Active Requests", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 6, + "options": { + "displayLabels": [ + "name", + "percent" + ], + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (status_code) (rate(http_requests_total[5m]))", + "legendFormat": "{{status_code}}", + "range": true, + "refId": "A" + } + ], + "title": "Status Code Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "index": 0, + "text": "DOWN" + }, + "1": { + "index": 1, + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 16 + }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "up{job=\"app\"}", + "legendFormat": "app", + "range": true, + "refId": "A" + } + ], + "title": "Application Uptime", + "type": "stat" + } + ], + "refresh": "10s", + "schemaVersion": 40, + "style": "dark", + "tags": [ + "lab08", + "prometheus", + "grafana", + "metrics" + ], + 
"templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Lab 08 - Application Metrics", + "uid": "lab08-app-metrics", + "version": 1, + "weekStart": "" +} diff --git a/ansible/roles/monitoring/files/grafana-dashboard-provider.yml b/ansible/roles/monitoring/files/grafana-dashboard-provider.yml new file mode 100644 index 0000000000..f181b0fb6f --- /dev/null +++ b/ansible/roles/monitoring/files/grafana-dashboard-provider.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: observability-dashboards + orgId: 1 + folder: Observability + type: file + disableDeletion: false + editable: true + updateIntervalSeconds: 30 + options: + path: /var/lib/grafana/dashboards diff --git a/ansible/roles/monitoring/files/grafana-datasources.yml b/ansible/roles/monitoring/files/grafana-datasources.yml new file mode 100644 index 0000000000..8eb54b2c6f --- /dev/null +++ b/ansible/roles/monitoring/files/grafana-datasources.yml @@ -0,0 +1,20 @@ +apiVersion: 1 + +datasources: + - name: Loki + uid: loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: false + editable: true + - name: Prometheus + uid: prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true + jsonData: + httpMethod: POST + manageAlerts: true diff --git a/ansible/roles/monitoring/files/grafana-logs-dashboard.json b/ansible/roles/monitoring/files/grafana-logs-dashboard.json new file mode 100644 index 0000000000..958bcaf55a --- /dev/null +++ b/ansible/roles/monitoring/files/grafana-logs-dashboard.json @@ -0,0 +1,278 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + 
"links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "{app=~\"devops-.*\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Recent Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 12, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "unit": "logs/sec" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "sum by (app) (rate({app=~\"devops-.*\"}[1m]))", + "legendFormat": "{{app}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Request Rate by App", + "type": "timeseries" + }, + { + 
"datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 3, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "{app=~\"devops-.*\"} | json | level=\"ERROR\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "Error Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 4, + "options": { + "displayLabels": [ + "name", + "percent", + "value" + ], + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "editorMode": "code", + "expr": "sum by (level) (count_over_time({app=~\"devops-.*\"} | json [5m]))", + "legendFormat": "{{level}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Log Level Distribution (Last 5m)", + "type": "piechart" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": [ + "lab07", + "loki", + "observability" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + 
"title": "Lab 07 - Loki Observability", + "uid": "lab07-loki-observability", + "version": 1, + "weekStart": "" +} diff --git a/ansible/roles/monitoring/files/loki-config.yml b/ansible/roles/monitoring/files/loki-config.yml new file mode 100644 index 0000000000..851528e43b --- /dev/null +++ b/ansible/roles/monitoring/files/loki-config.yml @@ -0,0 +1,59 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +ingester: + lifecycler: + address: 127.0.0.1 + ring: + kvstore: + store: inmemory + replication_factor: 1 + final_sleep: 0s + chunk_idle_period: 5m + chunk_retain_period: 30s + wal: + enabled: true + dir: /loki/wal + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + filesystem: + directory: /loki/chunks + +limits_config: + reject_old_samples: true + reject_old_samples_max_age: 168h + allow_structured_metadata: true + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 + +ruler: + alertmanager_url: http://localhost:9093 diff --git a/ansible/roles/monitoring/files/promtail-config.yml b/ansible/roles/monitoring/files/promtail-config.yml new file mode 100644 index 0000000000..db86da2270 --- /dev/null +++ b/ansible/roles/monitoring/files/promtail-config.yml @@ -0,0 +1,28 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: containers + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 10s + relabel_configs: + - source_labels: [__meta_docker_container_name] + regex: '/(.*)' + target_label: container + - source_labels: 
[__meta_docker_container_label_logging] + regex: promtail + action: keep + - source_labels: [__meta_docker_container_label_app] + target_label: app + - source_labels: [__meta_docker_container_log_stream] + target_label: stream + - source_labels: [__meta_docker_container_id] + target_label: container_id diff --git a/ansible/roles/monitoring/tasks/main.yml b/ansible/roles/monitoring/tasks/main.yml new file mode 100644 index 0000000000..ceeef8d60b --- /dev/null +++ b/ansible/roles/monitoring/tasks/main.yml @@ -0,0 +1,100 @@ +--- +- name: Create monitoring directory structure + ansible.builtin.file: + path: "{{ item }}" + state: directory + mode: "0755" + loop: + - "{{ monitoring_dir }}" + - "{{ monitoring_dir }}/loki" + - "{{ monitoring_dir }}/promtail" + - "{{ monitoring_dir }}/prometheus" + - "{{ monitoring_dir }}/grafana" + - "{{ monitoring_dir }}/grafana/dashboards" + - "{{ monitoring_dir }}/grafana/provisioning" + - "{{ monitoring_dir }}/grafana/provisioning/datasources" + - "{{ monitoring_dir }}/grafana/provisioning/dashboards" + +- name: Template docker-compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ monitoring_dir }}/docker-compose.yml" + mode: "0644" + +- name: Template Prometheus config + ansible.builtin.template: + src: prometheus.yml.j2 + dest: "{{ monitoring_dir }}/prometheus/prometheus.yml" + mode: "0644" + +- name: Copy Loki config + ansible.builtin.copy: + src: loki-config.yml + dest: "{{ monitoring_dir }}/loki/config.yml" + mode: "0644" + +- name: Copy Promtail config + ansible.builtin.copy: + src: promtail-config.yml + dest: "{{ monitoring_dir }}/promtail/config.yml" + mode: "0644" + +- name: Copy Grafana datasources provisioning + ansible.builtin.copy: + src: grafana-datasources.yml + dest: "{{ monitoring_dir }}/grafana/provisioning/datasources/datasources.yml" + mode: "0644" + +- name: Copy Grafana dashboard provider + ansible.builtin.copy: + src: grafana-dashboard-provider.yml + dest: "{{ monitoring_dir 
}}/grafana/provisioning/dashboards/dashboard.yml" + mode: "0644" + +- name: Copy Grafana dashboards + ansible.builtin.copy: + src: "{{ item.src }}" + dest: "{{ monitoring_dir }}/grafana/dashboards/{{ item.dest }}" + mode: "0644" + loop: + - src: grafana-app-dashboard.json + dest: lab08-metrics-dashboard.json + - src: grafana-logs-dashboard.json + dest: lab07-observability.json + +- name: Deploy monitoring stack + community.docker.docker_compose_v2: + project_src: "{{ monitoring_dir }}" + state: present + pull: policy + recreate: auto + +- name: Wait for Grafana health + ansible.builtin.uri: + url: http://localhost:3000/api/health + method: GET + status_code: 200 + register: grafana_health + retries: 20 + delay: 5 + until: grafana_health.status == 200 + +- name: Wait for Prometheus health + ansible.builtin.uri: + url: http://localhost:9090/-/healthy + method: GET + status_code: 200 + register: prometheus_health + retries: 20 + delay: 5 + until: prometheus_health.status == 200 + +- name: Wait for application health + ansible.builtin.uri: + url: "http://localhost:{{ monitoring_app_port }}{{ monitoring_app_health_endpoint }}" + method: GET + status_code: 200 + register: app_health + retries: 20 + delay: 5 + until: app_health.status == 200 diff --git a/ansible/roles/monitoring/templates/docker-compose.yml.j2 b/ansible/roles/monitoring/templates/docker-compose.yml.j2 new file mode 100644 index 0000000000..413fd84362 --- /dev/null +++ b/ansible/roles/monitoring/templates/docker-compose.yml.j2 @@ -0,0 +1,184 @@ +name: {{ monitoring_project_name }} + +version: "3.8" + +x-loki-resources: &loki-resources + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + +x-prometheus-resources: &prometheus-resources + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + +x-grafana-resources: &grafana-resources + deploy: + resources: + limits: + cpus: "0.5" + memory: 512M + 
reservations: + cpus: "0.25" + memory: 256M + +x-app-resources: &app-resources + deploy: + resources: + limits: + cpus: "0.5" + memory: 256M + reservations: + cpus: "0.25" + memory: 128M + +services: + loki: + image: grafana/loki:{{ loki_version }} + container_name: loki + command: + - -config.file=/etc/loki/config.yml + ports: + - "3100:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + networks: + - {{ monitoring_network_name }} + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 15s + <<: *loki-resources + + promtail: + image: grafana/promtail:{{ promtail_version }} + container_name: promtail + command: + - -config.file=/etc/promtail/config.yml + ports: + - "9080:9080" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - promtail-positions:/tmp/positions + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + networks: + - {{ monitoring_network_name }} + depends_on: + loki: + condition: service_healthy + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9080/ready || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + <<: *app-resources + + prometheus: + image: prom/prometheus:v{{ prometheus_version }} + container_name: prometheus + command: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.retention.time={{ prometheus_retention_days }}d + - --storage.tsdb.retention.size={{ prometheus_retention_size }} + ports: + - "{{ prometheus_port }}:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + networks: + - {{ monitoring_network_name }} + depends_on: + loki: + condition: service_healthy + grafana: + condition: service_started + app-python: + 
condition: service_started + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9090/-/healthy || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + <<: *prometheus-resources + + grafana: + image: grafana/grafana:{{ grafana_version }} + container_name: grafana + ports: + - "3000:3000" + environment: + GF_AUTH_ANONYMOUS_ENABLED: "false" + GF_SECURITY_ALLOW_EMBEDDING: "false" + GF_USERS_ALLOW_SIGN_UP: "false" + GF_METRICS_ENABLED: "true" + GF_SECURITY_ADMIN_USER: "{{ grafana_admin_user }}" + GF_SECURITY_ADMIN_PASSWORD: "{{ grafana_admin_password }}" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + networks: + - {{ monitoring_network_name }} + depends_on: + loki: + condition: service_healthy + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 25s + <<: *grafana-resources + + app-python: + image: {{ monitoring_app_image }}:{{ monitoring_app_tag }} + container_name: {{ monitoring_app_container_name }} + environment: + HOST: 0.0.0.0 + PORT: "{{ monitoring_app_port }}" + PYTHONUNBUFFERED: "1" + ports: + - "{{ monitoring_app_port }}:{{ monitoring_app_port }}" + networks: + - {{ monitoring_network_name }} + depends_on: + promtail: + condition: service_healthy + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:{{ monitoring_app_port }}{{ monitoring_app_health_endpoint }} || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + <<: *app-resources + +volumes: + loki-data: + grafana-data: + promtail-positions: + prometheus-data: + +networks: + {{ monitoring_network_name }}: + driver: bridge diff --git a/ansible/roles/monitoring/templates/prometheus.yml.j2 
b/ansible/roles/monitoring/templates/prometheus.yml.j2 new file mode 100644 index 0000000000..69e01b87b1 --- /dev/null +++ b/ansible/roles/monitoring/templates/prometheus.yml.j2 @@ -0,0 +1,16 @@ +global: + scrape_interval: {{ prometheus_scrape_interval }} + evaluation_interval: {{ prometheus_scrape_interval }} + +scrape_configs: +{% for target in prometheus_targets %} + - job_name: {{ target.job | to_json }} + {% if target.path is defined %} + metrics_path: {{ target.path | to_json }} + {% endif %} + static_configs: + - targets: +{% for item in target.targets %} + - {{ item | to_json }} +{% endfor %} +{% endfor %} diff --git a/app_python/Dockerfile b/app_python/Dockerfile index ab5526ede7..4b53e0678d 100644 --- a/app_python/Dockerfile +++ b/app_python/Dockerfile @@ -2,28 +2,24 @@ FROM python:3.13-slim -# 1) Basic env for predictable Python behavior ENV PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 -# 2) Create non-root user -RUN addgroup --system app && adduser --system --ingroup app app +RUN apt-get update \ + && apt-get install -y --no-install-recommends curl \ + && rm -rf /var/lib/apt/lists/* \ + && addgroup --system --gid 1000 app \ + && adduser --system --uid 1000 --ingroup app app -# 3) Workdir WORKDIR /app -# 4) Install dependencies (layer caching) COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt -# 5) Copy only necessary source files COPY app.py . 
-# 6) Security: drop privileges -USER app +USER 1000:1000 -# 7) Document port (your app uses PORT env, default 5000) EXPOSE 5000 -# 8) Start the app CMD ["python", "app.py"] diff --git a/app_python/app.py b/app_python/app.py index c63be56286..790930b4b2 100644 --- a/app_python/app.py +++ b/app_python/app.py @@ -7,16 +7,59 @@ import time from datetime import datetime, timezone -from flask import Flask, g, has_request_context, jsonify, request +from flask import Flask, Response, g, has_request_context, jsonify, request +from prometheus_client import CONTENT_TYPE_LATEST, Counter, Gauge, Histogram, generate_latest from werkzeug.exceptions import HTTPException -SERVICE_NAME = "devops-info-service" -SERVICE_VERSION = "1.0.0" +SERVICE_NAME = os.getenv("SERVICE_NAME", "devops-info-service") +SERVICE_VERSION = os.getenv("SERVICE_VERSION", "1.0.0") +SERVICE_DESCRIPTION = os.getenv("SERVICE_DESCRIPTION", "DevOps course info service") HOST = os.getenv("HOST", "0.0.0.0") PORT = int(os.getenv("PORT", 5000)) DEBUG = os.getenv("DEBUG", "False").lower() == "true" START_TIME = datetime.now(timezone.utc) +REQUEST_LATENCY_BUCKETS = ( + 0.005, + 0.01, + 0.025, + 0.05, + 0.1, + 0.25, + 0.5, + 1.0, + 2.5, + 5.0, + 10.0, +) + +http_requests_total = Counter( + "http_requests_total", + "Total HTTP requests processed by the Flask application", + ["method", "endpoint", "status_code"], +) +http_request_duration_seconds = Histogram( + "http_request_duration_seconds", + "HTTP request duration in seconds", + ["method", "endpoint"], + buckets=REQUEST_LATENCY_BUCKETS, +) +http_requests_in_progress = Gauge( + "http_requests_in_progress", + "HTTP requests currently being processed", + ["method", "endpoint"], +) +endpoint_calls_total = Counter( + "devops_info_endpoint_calls_total", + "Application-specific counter for endpoint usage", + ["endpoint"], +) +system_info_collection_seconds = Histogram( + "devops_info_system_collection_seconds", + "Time spent collecting system information", + 
buckets=(0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1), +) + class JSONFormatter(logging.Formatter): """Format log records as structured JSON for Loki/Grafana.""" @@ -106,14 +149,15 @@ def get_uptime() -> dict: def get_system_info() -> dict: """Collect system information.""" - return { - "hostname": socket.gethostname(), - "platform": platform.system(), - "platform_version": get_platform_version(), - "architecture": platform.machine(), - "cpu_count": os.cpu_count(), - "python_version": platform.python_version(), - } + with system_info_collection_seconds.time(): + return { + "hostname": socket.gethostname(), + "platform": platform.system(), + "platform_version": get_platform_version(), + "architecture": platform.machine(), + "cpu_count": os.cpu_count(), + "python_version": platform.python_version(), + } def get_client_ip() -> str | None: @@ -127,6 +171,17 @@ def get_client_ip() -> str | None: return request.remote_addr +def normalize_endpoint() -> str: + """Return a low-cardinality endpoint label for Prometheus metrics.""" + if not has_request_context(): + return "unknown" + if request.url_rule and request.url_rule.rule: + return request.url_rule.rule + if request.path in {"/", "/health", "/metrics"}: + return request.path + return "unmatched" + + def build_request_log_context(status_code: int | None = None) -> dict: """Build structured context for request-related logs.""" context: dict[str, object] = {"event": "http.request", "service": SERVICE_NAME} @@ -154,15 +209,46 @@ def build_request_log_context(status_code: int | None = None) -> dict: return context +def finalize_request_metrics(status_code: int) -> None: + """Record Prometheus metrics for the current request exactly once.""" + if not has_request_context() or not getattr(g, "metrics_tracked", False): + return + if getattr(g, "metrics_finalized", False): + return + + duration = time.perf_counter() - g.request_started_at + endpoint = g.request_metrics_endpoint + method = request.method + + 
http_requests_total.labels( + method=method, + endpoint=endpoint, + status_code=str(status_code), + ).inc() + http_request_duration_seconds.labels(method=method, endpoint=endpoint).observe(duration) + endpoint_calls_total.labels(endpoint=endpoint).inc() + http_requests_in_progress.labels(method=method, endpoint=endpoint).dec() + g.metrics_finalized = True + + @app.before_request def track_request_start() -> None: - """Store request start time for structured logging.""" + """Store request start time for structured logging and metrics.""" g.request_started_at = time.perf_counter() + g.request_metrics_endpoint = normalize_endpoint() + g.metrics_tracked = True + g.metrics_finalized = False + http_requests_in_progress.labels( + method=request.method, + endpoint=g.request_metrics_endpoint, + ).inc() @app.after_request def log_response(response): - """Log every completed HTTP request as JSON.""" + """Log every completed HTTP request as JSON and write metrics.""" + finalize_request_metrics(response.status_code) + level = logging.INFO if response.status_code >= 400: level = logging.ERROR @@ -175,6 +261,22 @@ def log_response(response): return response +@app.teardown_request +def cleanup_request_metrics(exc) -> None: + """Ensure the in-progress gauge is decremented even on failures.""" + if not has_request_context() or not getattr(g, "metrics_tracked", False): + return + + if getattr(g, "metrics_finalized", False): + return + + http_requests_in_progress.labels( + method=request.method, + endpoint=g.request_metrics_endpoint, + ).dec() + g.metrics_finalized = True + + @app.route("/") def index(): """Main endpoint - service and system information.""" @@ -184,7 +286,7 @@ def index(): "service": { "name": SERVICE_NAME, "version": SERVICE_VERSION, - "description": "DevOps course info service", + "description": SERVICE_DESCRIPTION, "framework": "Flask", }, "system": get_system_info(), @@ -205,6 +307,8 @@ def index(): "endpoints": [ {"path": "/", "method": "GET", "description": 
"Service information"}, {"path": "/health", "method": "GET", "description": "Health check"}, + {"path": "/ready", "method": "GET", "description": "Readiness check"}, + {"path": "/metrics", "method": "GET", "description": "Prometheus metrics"}, ], } @@ -225,6 +329,27 @@ def health(): ) +@app.route("/ready") +def ready(): + """Readiness check endpoint for Kubernetes probes.""" + return jsonify( + { + "status": "ready", + "service": SERVICE_NAME, + "version": SERVICE_VERSION, + "timestamp": datetime.now(timezone.utc) + .isoformat(timespec="milliseconds") + .replace("+00:00", "Z"), + } + ) + + +@app.route("/metrics") +def metrics(): + """Expose Prometheus metrics for scraping.""" + return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST) + + @app.errorhandler(404) def not_found(error): return ( diff --git a/app_python/requirements.txt b/app_python/requirements.txt index 78180a1ad1..f6309a6723 100644 --- a/app_python/requirements.txt +++ b/app_python/requirements.txt @@ -1 +1,2 @@ -Flask==3.1.0 \ No newline at end of file +Flask==3.1.0 +prometheus-client==0.23.1 \ No newline at end of file diff --git a/app_python/tests/test_app.py b/app_python/tests/test_app.py index b032c06e50..e53ecaf809 100644 --- a/app_python/tests/test_app.py +++ b/app_python/tests/test_app.py @@ -1,62 +1,81 @@ -import pytest -from app import app as flask_app - - -@pytest.fixture() -def client(): - flask_app.config["TESTING"] = True - with flask_app.test_client() as client: - yield client - - -def test_root_endpoint_returns_200_and_json(client): - resp = client.get("/", headers={"User-Agent": "pytest"}) - assert resp.status_code == 200 - data = resp.get_json() - assert isinstance(data, dict) - - # top-level keys - for key in ["service", "system", "runtime", "request", "endpoints"]: - assert key in data - - # service structure - assert data["service"]["name"] == "devops-info-service" - assert data["service"]["framework"] == "Flask" - - # system structure - for key in ["hostname", "platform", 
"architecture", "cpu_count", "python_version"]: - assert key in data["system"] - - # runtime - assert "uptime_seconds" in data["runtime"] - assert isinstance(data["runtime"]["uptime_seconds"], int) - - # endpoints list - assert isinstance(data["endpoints"], list) - assert any(e["path"] == "/" for e in data["endpoints"]) - assert any(e["path"] == "/health" for e in data["endpoints"]) - - -def test_health_endpoint_returns_200_and_expected_fields(client): - resp = client.get("/health") - assert resp.status_code == 200 - data = resp.get_json() - - assert data["status"] == "healthy" - assert "timestamp" in data - assert "uptime_seconds" in data - assert isinstance(data["uptime_seconds"], int) - - -def test_unknown_endpoint_returns_404_json(client): - resp = client.get("/no-such-endpoint") - assert resp.status_code == 404 - data = resp.get_json() - - assert data["error"] == "Not Found" - assert "message" in data - - -def test_method_not_allowed_returns_405(client): - resp = client.post("/health") - assert resp.status_code == 405 +import pytest +from app import app as flask_app + + +@pytest.fixture() +def client(): + flask_app.config["TESTING"] = True + with flask_app.test_client() as client: + yield client + + +def test_root_endpoint_returns_200_and_json(client): + resp = client.get("/", headers={"User-Agent": "pytest"}) + assert resp.status_code == 200 + data = resp.get_json() + assert isinstance(data, dict) + + for key in ["service", "system", "runtime", "request", "endpoints"]: + assert key in data + + assert data["service"]["name"] == "devops-info-service" + assert data["service"]["framework"] == "Flask" + + for key in ["hostname", "platform", "architecture", "cpu_count", "python_version"]: + assert key in data["system"] + + assert "uptime_seconds" in data["runtime"] + assert isinstance(data["runtime"]["uptime_seconds"], int) + + assert isinstance(data["endpoints"], list) + assert any(e["path"] == "/" for e in data["endpoints"]) + assert any(e["path"] == "/health" 
for e in data["endpoints"]) + assert any(e["path"] == "/metrics" for e in data["endpoints"]) + + +def test_health_endpoint_returns_200_and_expected_fields(client): + resp = client.get("/health") + assert resp.status_code == 200 + data = resp.get_json() + + assert data["status"] == "healthy" + assert "timestamp" in data + assert "uptime_seconds" in data + assert isinstance(data["uptime_seconds"], int) + + +def test_metrics_endpoint_exposes_prometheus_metrics(client): + client.get("/") + client.get("/health") + client.get("/does-not-exist") + + resp = client.get("/metrics") + assert resp.status_code == 200 + metrics_text = resp.get_data(as_text=True) + + assert "# HELP http_requests_total" in metrics_text + assert 'http_requests_total{endpoint="/",method="GET",status_code="200"}' in metrics_text + assert 'http_requests_total{endpoint="/health",method="GET",status_code="200"}' in metrics_text + assert 'http_requests_total{endpoint="unmatched",method="GET",status_code="404"}' in metrics_text + assert "# TYPE http_request_duration_seconds histogram" in metrics_text + assert "http_requests_in_progress" in metrics_text + assert "devops_info_endpoint_calls_total" in metrics_text + assert "devops_info_system_collection_seconds" in metrics_text + + +def test_unknown_endpoint_returns_404_json(client): + resp = client.get("/no-such-endpoint") + assert resp.status_code == 404 + data = resp.get_json() + + assert data["error"] == "Not Found" + assert "message" in data + + +def test_method_not_allowed_returns_405_json(client): + resp = client.post("/health") + assert resp.status_code == 405 + data = resp.get_json() + + assert data["error"] == "Method Not Allowed" + assert "message" in data diff --git a/k8s/README.md b/k8s/README.md new file mode 100644 index 0000000000..2d4771a7cd --- /dev/null +++ b/k8s/README.md @@ -0,0 +1,515 @@ +# Kubernetes Implementation + +## Architecture Overview + +The application is deployed into a dedicated Kubernetes namespace called `devops-lab09`. 
+ +Architecture of the deployment: + +- **1 Namespace**: `devops-lab09` +- **1 Deployment**: `devops-info-service` +- **3 Pods** in the base configuration +- **1 Service** of type `NodePort` +- **1 container per Pod** +- **HTTP probes** for health and readiness control + +Networking flow: + +1. External request reaches the Service exposed through Minikube. +2. The `NodePort` Service forwards traffic to healthy Pods selected by labels. +3. The Deployment ensures the desired number of replicas is always running. +4. Readiness probe prevents traffic from reaching Pods that are not ready. +5. Liveness and startup probes help recover from unhealthy or slow-starting containers. + +Resource allocation strategy: + +- Requests: + - `cpu: 100m` + - `memory: 128Mi` +- Limits: + - `cpu: 250m` + - `memory: 256Mi` + +This configuration is small enough for local Minikube usage, but still demonstrates production-oriented resource governance. + +--- + +## Manifest Files + +### `namespace.yml` +Creates the namespace `devops-lab09` for logical isolation of Kubernetes resources. + +### `deployment.yml` +Main Deployment manifest for the Python application. + +Key choices: +- `replicas: 3` to satisfy the task requirement and demonstrate high availability +- `RollingUpdate` strategy with: + - `maxSurge: 1` + - `maxUnavailable: 0` +- resource requests and limits +- environment variables for app metadata +- container exposed on port `5000` +- `livenessProbe` on `/health` +- `readinessProbe` on `/ready` +- `startupProbe` on `/ready` + +Why these values were chosen: +- 3 replicas are the minimum required by the assignment and provide redundancy +- `maxUnavailable: 0` helps ensure no downtime during updates +- `maxSurge: 1` allows gradual replacement of Pods +- modest CPU/memory settings are appropriate for local development while still demonstrating best practice + +### `service.yml` +Creates a `NodePort` Service for exposing the Deployment outside the cluster. 
+ +Key choices: +- `type: NodePort` +- service port `80` +- target container port `5000` +- fixed nodePort `30080` + +Why: +- NodePort is explicitly recommended in the assignment for local cluster access +- fixed nodePort makes local testing predictable + +### `deployment-update.yml` +Used to demonstrate rolling updates. +This manifest changes application configuration so that a new rollout occurs and the new version can be verified via `/ready`. + +### Unused bonus manifests +The repository may also contain: +- `deployment-app2.yml` +- `service-app2.yml` +- `ingress.yml` + +These were not used because the bonus task was intentionally not completed. + +--- + +## Deployment Evidence + +### Cluster setup verification + +Commands used: + +```powershell +kubectl cluster-info +kubectl get nodes -o wide +``` + +Observed output: + +```text +Kubernetes control plane is running at https://127.0.0.1:56880 +CoreDNS is running at https://127.0.0.1:56880/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy +``` + +```text +NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME +minikube Ready control-plane 40m v1.35.1 192.168.49.2 <none> Debian GNU/Linux 12 (bookworm) 6.6.87.2-microsoft-standard-WSL2 docker://29.2.1 +``` + +### Deployment state + +Commands used: + +```powershell +kubectl get all -n devops-lab09 +kubectl get pods,svc -o wide -n devops-lab09 +kubectl describe deployment devops-info-service -n devops-lab09 +``` + +Observed output: + +```text +NAME READY STATUS RESTARTS AGE +pod/devops-info-service-6dc8c746f4-7bp9n 1/1 Running 0 2m15s +pod/devops-info-service-6dc8c746f4-kxx8k 1/1 Running 0 2m15s +pod/devops-info-service-6dc8c746f4-njvzw 1/1 Running 0 2m15s + +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +service/devops-info-service NodePort 10.110.237.170 <none> 80:30080/TCP 34m + +NAME READY UP-TO-DATE AVAILABLE AGE +deployment.apps/devops-info-service 3/3 3 3 2m16s +``` + +```text +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED
NODE READINESS GATES +pod/devops-info-service-6dc8c746f4-7bp9n 1/1 Running 0 2m15s 10.244.0.8 minikube <none> <none> +pod/devops-info-service-6dc8c746f4-kxx8k 1/1 Running 0 2m15s 10.244.0.9 minikube <none> <none> +pod/devops-info-service-6dc8c746f4-njvzw 1/1 Running 0 2m15s 10.244.0.10 minikube <none> <none> + +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE SELECTOR +service/devops-info-service NodePort 10.110.237.170 <none> 80:30080/TCP 34m app.kubernetes.io/instance=devops-info-service,app.kubernetes.io/name=devops-info-service +``` + +Deployment description confirmed: +- 3 desired replicas +- RollingUpdate strategy +- probes configured +- resource requests and limits configured + +### Application working verification + +Service URL: + +```powershell +minikube service devops-info-service -n devops-lab09 --url +``` + +Observed URL: + +```text +http://127.0.0.1:49708 +``` + +Endpoint checks: + +```powershell +curl http://127.0.0.1:49708/ +curl http://127.0.0.1:49708/health +curl http://127.0.0.1:49708/ready +``` + +Observed results: + +- `/` returned `200 OK` +- `/health` returned `200 OK` +- `/ready` returned `200 OK` + +Example `/health` response: + +```json +{"status":"healthy","timestamp":"2026-03-24T18:18:05.669Z","uptime_seconds":111} +``` + +Example `/ready` response: + +```json +{"service":"devops-info-service","status":"ready","timestamp":"2026-03-24T18:18:13.566Z","version":"1.0.0-k8s"} +``` + +--- + +## Operations Performed + +### 1. Image build + +```powershell +minikube image build -t cdeth567/devops-info-service:lab09 .\app_python +``` + +### 2. Initial deployment + +```powershell +kubectl apply -f .\k8s\namespace.yml +kubectl apply -f .\k8s\deployment.yml +kubectl apply -f .\k8s\service.yml +``` + +### 3.
Scaling demonstration + +Scale command: + +```powershell +kubectl scale deployment/devops-info-service --replicas=5 -n devops-lab09 +kubectl rollout status deployment/devops-info-service -n devops-lab09 +kubectl get pods -n devops-lab09 +``` + +Observed output: + +```text +deployment "devops-info-service" successfully rolled out +``` + +```text +NAME READY STATUS RESTARTS AGE +devops-info-service-6dc8c746f4-7bp9n 1/1 Running 0 2m48s +devops-info-service-6dc8c746f4-fq87x 1/1 Running 0 12s +devops-info-service-6dc8c746f4-kxx8k 1/1 Running 0 2m48s +devops-info-service-6dc8c746f4-njvzw 1/1 Running 0 2m48s +devops-info-service-6dc8c746f4-skgb2 1/1 Running 0 12s +``` + +Then the Deployment was returned to the base configuration: + +```powershell +kubectl apply -f .\k8s\deployment.yml +kubectl rollout status deployment/devops-info-service -n devops-lab09 +``` + +### 4. Rolling update demonstration + +Update command: + +```powershell +kubectl apply -f .\k8s\deployment-update.yml +kubectl rollout status deployment/devops-info-service -n devops-lab09 +kubectl rollout history deployment/devops-info-service -n devops-lab09 +kubectl get pods -n devops-lab09 +``` + +Observed output: + +```text +deployment "devops-info-service" successfully rolled out +``` + +```text +deployment.apps/devops-info-service +REVISION CHANGE-CAUSE +1 <none> +2 <none> +``` + +Pods during update: + +```text +NAME READY STATUS RESTARTS AGE +devops-info-service-5675c54f79-27x4l 1/1 Running 0 21s +devops-info-service-5675c54f79-5dd2p 1/1 Running 0 8s +devops-info-service-5675c54f79-6qwqs 1/1 Running 0 14s +devops-info-service-6dc8c746f4-7bp9n 1/1 Terminating 0 3m44s +devops-info-service-6dc8c746f4-kxx8k 1/1 Terminating 0 3m44s +devops-info-service-6dc8c746f4-njvzw 1/1 Terminating 0 3m44s +``` + +Version check after update: + +```powershell +curl http://127.0.0.1:49708/ready +``` + +Response: + +```json +{"service":"devops-info-service","status":"ready","timestamp":"2026-03-24T18:21:00.104Z","version":"1.1.0-k8s"} +``` +
+### 5. Rollback demonstration + +Rollback commands: + +```powershell +kubectl rollout undo deployment/devops-info-service -n devops-lab09 +kubectl rollout status deployment/devops-info-service -n devops-lab09 +kubectl rollout history deployment/devops-info-service -n devops-lab09 +kubectl get pods -n devops-lab09 +``` + +Observed output: + +```text +deployment.apps/devops-info-service rolled back +deployment "devops-info-service" successfully rolled out +``` + +Pods after rollback: + +```text +NAME READY STATUS RESTARTS AGE +devops-info-service-5675c54f79-27x4l 1/1 Terminating 0 116s +devops-info-service-5675c54f79-5dd2p 1/1 Terminating 0 103s +devops-info-service-5675c54f79-6qwqs 1/1 Terminating 0 109s +devops-info-service-6dc8c746f4-jncd8 1/1 Running 0 15s +devops-info-service-6dc8c746f4-pbxjd 1/1 Running 0 9s +devops-info-service-6dc8c746f4-qwr2t 1/1 Running 0 21s +``` + +Version check after rollback: + +```powershell +curl http://127.0.0.1:49708/ready +``` + +Response: + +```json +{"service":"devops-info-service","status":"ready","timestamp":"2026-03-24T18:21:30.812Z","version":"1.0.0-k8s"} +``` + +### 6. Zero downtime verification + +Zero downtime was verified during the rolling update by checking service availability before and after the rollout and confirming that: +- new Pods became Ready before old Pods were terminated +- Service endpoint continued returning `200 OK` +- update and rollback both completed successfully without total service loss + +This was supported by: +- `maxUnavailable: 0` +- readiness probes on `/ready` +- successful responses from the application during rollout validation + +### 7. Service access method + +The application was accessed using: + +```powershell +minikube service devops-info-service -n devops-lab09 --url +``` + +On Windows with Docker driver, the terminal used for this command must stay open because Minikube keeps a local tunnel active. 
+ +--- + +## Production Considerations + +### Health checks implemented + +The Deployment uses: +- `livenessProbe` on `/health` +- `readinessProbe` on `/ready` +- `startupProbe` on `/ready` + +Why: +- **liveness probe** detects broken containers and allows automatic restart +- **readiness probe** ensures traffic is sent only to ready Pods +- **startup probe** protects slow starts from being killed too early + +This closely reflects production deployment best practices for HTTP services. + +### Resource limits rationale + +Chosen values: + +- requests: + - `cpu: 100m` + - `memory: 128Mi` +- limits: + - `cpu: 250m` + - `memory: 256Mi` + +Rationale: +- sufficient for a lightweight Flask service in a local cluster +- demonstrates proper scheduling hints and protection against uncontrolled resource usage +- helps avoid resource starvation and noisy neighbor effects + +### How this could be improved for production + +For a real production environment, I would improve the setup by: + +- using a real container registry and immutable image tags +- adding Horizontal Pod Autoscaler +- using Ingress or Gateway API instead of direct NodePort exposure +- separating config into ConfigMaps and Secrets +- adding PodDisruptionBudget +- adding affinity / anti-affinity rules +- using multiple nodes instead of single-node Minikube +- storing logs centrally + +### Monitoring and observability strategy + +For production observability I would add: + +- Prometheus for metrics collection +- Grafana dashboards +- centralized logging (for example Loki or ELK) +- alerting on Pod restarts, probe failures, CPU/memory saturation, and rollout failures +- Kubernetes events monitoring +- tracing if the application becomes distributed + +--- + +## Challenges & Solutions + +### Challenge 1 — `CreateContainerConfigError` + +Initial Pods failed with: + +```text +CreateContainerConfigError +``` + +Detailed debugging with `kubectl describe pod` showed: + +```text +Error: container has runAsNonRoot and 
image has non-numeric user (app), cannot verify user is non-root +``` + +#### Cause +The Deployment required non-root execution, but the image used a named user instead of a numeric UID. + +#### Solution +The Dockerfile was updated to create a numeric user and run the app as: + +```dockerfile +RUN addgroup --system --gid 1000 app \ + && adduser --system --uid 1000 --ingroup app app + +USER 1000:1000 +``` + +#### What I learned +Kubernetes security settings may reject a container even before startup if the runtime cannot verify non-root execution. + +### Challenge 2 — `CrashLoopBackOff` + +After fixing the non-root issue, Pods still failed and entered: + +```text +CrashLoopBackOff +``` + +Debugging using: +- `kubectl describe pod` +- `kubectl logs` +- `kubectl get events` + +showed: + +```text +Startup probe failed: HTTP probe failed with statuscode: 404 +``` + +and application logs confirmed: + +```text +"path": "/ready", "status_code": 404 +``` + +#### Cause +`startupProbe` and `readinessProbe` were configured to call `/ready`, but the application image did not yet contain that endpoint. + +#### Solution +A `/ready` endpoint was added to `app_python/app.py`, the image was rebuilt, and the Deployment was recreated. + +#### What I learned +Kubernetes probes are strict and extremely useful for debugging application readiness problems. +If probes and application endpoints do not match, Pods may restart even when the container process itself starts successfully. + +### Debugging methods used + +The main Kubernetes debugging commands used during this lab were: + +```powershell +kubectl describe pod <pod-name> -n devops-lab09 +kubectl logs <pod-name> -n devops-lab09 +kubectl get events -n devops-lab09 --sort-by=.metadata.creationTimestamp +kubectl describe deployment devops-info-service -n devops-lab09 +``` + +These commands were enough to identify both configuration-level and application-level issues.
+ +--- + +## Conclusion + +The required Kubernetes tasks for Lab 09 were completed successfully without the bonus task. + +Completed items: + +- local Minikube cluster setup +- Deployment manifest with 3 replicas +- NodePort Service +- liveness, readiness and startup probes +- resource requests and limits +- service accessibility from outside the cluster +- scaling to 5 replicas +- rolling update +- rollback +- documentation with evidence, production considerations, and troubleshooting + +Bonus task with Ingress and TLS was intentionally not completed. diff --git a/k8s/deployment-app2.yml b/k8s/deployment-app2.yml new file mode 100644 index 0000000000..9f8c714d6e --- /dev/null +++ b/k8s/deployment-app2.yml @@ -0,0 +1,91 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: devops-info-service-app2 + namespace: devops-lab09 + labels: + app.kubernetes.io/name: devops-info-service-app2 + app.kubernetes.io/instance: devops-info-service-app2 + app.kubernetes.io/component: web + app.kubernetes.io/part-of: devops-core-course + app.kubernetes.io/version: 2.0.0-k8s +spec: + replicas: 2 + revisionHistoryLimit: 10 + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + selector: + matchLabels: + app.kubernetes.io/name: devops-info-service-app2 + app.kubernetes.io/instance: devops-info-service-app2 + template: + metadata: + labels: + app.kubernetes.io/name: devops-info-service-app2 + app.kubernetes.io/instance: devops-info-service-app2 + app.kubernetes.io/component: web + app.kubernetes.io/part-of: devops-core-course + app.kubernetes.io/version: 2.0.0-k8s + spec: + automountServiceAccountToken: false + terminationGracePeriodSeconds: 30 + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + containers: + - name: devops-info-service-app2 + image: cdeth567/devops-info-service:lab09 + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 5000 + protocol: TCP + env: + - name: HOST + value: "0.0.0.0" 
+ - name: PORT + value: "5000" + - name: SERVICE_NAME + value: "devops-info-service-app2" + - name: SERVICE_VERSION + value: "2.0.0-k8s" + - name: SERVICE_DESCRIPTION + value: "Second application for path based Ingress routing" + resources: + requests: + cpu: 50m + memory: 96Mi + limits: + cpu: 150m + memory: 192Mi + startupProbe: + httpGet: + path: /ready + port: http + periodSeconds: 5 + failureThreshold: 12 + readinessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 3 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 2 + failureThreshold: 3 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL diff --git a/k8s/deployment-update.yml b/k8s/deployment-update.yml new file mode 100644 index 0000000000..0ba78ef7ab --- /dev/null +++ b/k8s/deployment-update.yml @@ -0,0 +1,91 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: devops-info-service + namespace: devops-lab09 + labels: + app.kubernetes.io/name: devops-info-service + app.kubernetes.io/instance: devops-info-service + app.kubernetes.io/component: web + app.kubernetes.io/part-of: devops-core-course + app.kubernetes.io/version: 1.1.0-k8s +spec: + replicas: 3 + revisionHistoryLimit: 10 + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + selector: + matchLabels: + app.kubernetes.io/name: devops-info-service + app.kubernetes.io/instance: devops-info-service + template: + metadata: + labels: + app.kubernetes.io/name: devops-info-service + app.kubernetes.io/instance: devops-info-service + app.kubernetes.io/component: web + app.kubernetes.io/part-of: devops-core-course + app.kubernetes.io/version: 1.1.0-k8s + spec: + automountServiceAccountToken: false + terminationGracePeriodSeconds: 30 + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + containers: + - name: 
devops-info-service + image: cdeth567/devops-info-service:lab09 + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 5000 + protocol: TCP + env: + - name: HOST + value: "0.0.0.0" + - name: PORT + value: "5000" + - name: SERVICE_NAME + value: "devops-info-service" + - name: SERVICE_VERSION + value: "1.1.0-k8s" + - name: SERVICE_DESCRIPTION + value: "DevOps course info service on Kubernetes (updated rollout)" + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 250m + memory: 256Mi + startupProbe: + httpGet: + path: /ready + port: http + periodSeconds: 5 + failureThreshold: 12 + readinessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 3 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 2 + failureThreshold: 3 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL diff --git a/k8s/deployment.yml b/k8s/deployment.yml new file mode 100644 index 0000000000..d58639cafe --- /dev/null +++ b/k8s/deployment.yml @@ -0,0 +1,91 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: devops-info-service + namespace: devops-lab09 + labels: + app.kubernetes.io/name: devops-info-service + app.kubernetes.io/instance: devops-info-service + app.kubernetes.io/component: web + app.kubernetes.io/part-of: devops-core-course + app.kubernetes.io/version: 1.0.0-k8s +spec: + replicas: 3 + revisionHistoryLimit: 10 + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + selector: + matchLabels: + app.kubernetes.io/name: devops-info-service + app.kubernetes.io/instance: devops-info-service + template: + metadata: + labels: + app.kubernetes.io/name: devops-info-service + app.kubernetes.io/instance: devops-info-service + app.kubernetes.io/component: web + app.kubernetes.io/part-of: devops-core-course + app.kubernetes.io/version: 1.0.0-k8s + spec: 
+ automountServiceAccountToken: false + terminationGracePeriodSeconds: 30 + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + containers: + - name: devops-info-service + image: cdeth567/devops-info-service:lab09 + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 5000 + protocol: TCP + env: + - name: HOST + value: "0.0.0.0" + - name: PORT + value: "5000" + - name: SERVICE_NAME + value: "devops-info-service" + - name: SERVICE_VERSION + value: "1.0.0-k8s" + - name: SERVICE_DESCRIPTION + value: "DevOps course info service on Kubernetes" + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 250m + memory: 256Mi + startupProbe: + httpGet: + path: /ready + port: http + periodSeconds: 5 + failureThreshold: 12 + readinessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 3 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 2 + failureThreshold: 3 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL diff --git a/k8s/ingress.yml b/k8s/ingress.yml new file mode 100644 index 0000000000..9764ce86d6 --- /dev/null +++ b/k8s/ingress.yml @@ -0,0 +1,33 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: devops-lab09-ingress + namespace: devops-lab09 + annotations: + nginx.ingress.kubernetes.io/use-regex: "true" + nginx.ingress.kubernetes.io/rewrite-target: /$2 + nginx.ingress.kubernetes.io/ssl-redirect: "true" +spec: + ingressClassName: nginx + tls: + - hosts: + - devops-lab09.local + secretName: devops-lab09-tls + rules: + - host: devops-lab09.local + http: + paths: + - path: /app1(/|$)(.*) + pathType: ImplementationSpecific + backend: + service: + name: devops-info-service + port: + number: 80 + - path: /app2(/|$)(.*) + pathType: ImplementationSpecific + backend: + service: + name: devops-info-service-app2 + port: + 
number: 80 diff --git a/k8s/namespace.yml b/k8s/namespace.yml new file mode 100644 index 0000000000..3bc418d3e7 --- /dev/null +++ b/k8s/namespace.yml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: devops-lab09 + labels: + app.kubernetes.io/part-of: devops-core-course + app.kubernetes.io/managed-by: kubectl diff --git a/k8s/service-app2.yml b/k8s/service-app2.yml new file mode 100644 index 0000000000..9632b4ef4a --- /dev/null +++ b/k8s/service-app2.yml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: Service +metadata: + name: devops-info-service-app2 + namespace: devops-lab09 + labels: + app.kubernetes.io/name: devops-info-service-app2 + app.kubernetes.io/instance: devops-info-service-app2 + app.kubernetes.io/component: web + app.kubernetes.io/part-of: devops-core-course +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: devops-info-service-app2 + app.kubernetes.io/instance: devops-info-service-app2 + ports: + - name: http + protocol: TCP + port: 80 + targetPort: http diff --git a/k8s/service.yml b/k8s/service.yml new file mode 100644 index 0000000000..475f889ef9 --- /dev/null +++ b/k8s/service.yml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: Service +metadata: + name: devops-info-service + namespace: devops-lab09 + labels: + app.kubernetes.io/name: devops-info-service + app.kubernetes.io/instance: devops-info-service + app.kubernetes.io/component: web + app.kubernetes.io/part-of: devops-core-course +spec: + type: NodePort + selector: + app.kubernetes.io/name: devops-info-service + app.kubernetes.io/instance: devops-info-service + ports: + - name: http + protocol: TCP + port: 80 + targetPort: http + nodePort: 30080