diff --git a/labs/app_python/app.py b/labs/app_python/app.py index a5eeb3c199..37baf6d2f0 100644 --- a/labs/app_python/app.py +++ b/labs/app_python/app.py @@ -1,132 +1,193 @@ """ -DevOps Info Service with JSON Logging +DevOps Info Service +Веб-сервис для предоставления информации о системе и состоянии сервиса """ import os import socket -import logging +import platform from datetime import datetime, timezone from flask import Flask, jsonify, request -from pythonjsonlogger import jsonlogger -# Flask app +# Prometheus +from prometheus_client import Counter, Histogram, Gauge, generate_latest, CONTENT_TYPE_LATEST + +# Создаем приложение Flask app = Flask(__name__) -# ENV config +# Настройки из переменных окружения HOST = os.getenv('HOST', '0.0.0.0') PORT = int(os.getenv('PORT', 5000)) DEBUG = os.getenv('DEBUG', 'False').lower() == 'true' +# Время запуска сервиса START_TIME = datetime.now(timezone.utc) -# Docker info +# Docker информация IS_DOCKER = os.path.exists('/.dockerenv') CONTAINER_ID = socket.gethostname() if IS_DOCKER else None # ---------------------------- -# JSON LOGGING CONFIG +# PROMETHEUS METRICS # ---------------------------- -logger = logging.getLogger() -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -formatter = jsonlogger.JsonFormatter( - '%(asctime)s %(levelname)s %(message)s %(method)s %(path)s %(status)s %(client_ip)s %(duration)s' +http_requests_total = Counter( + 'http_requests_total', + 'Total HTTP requests', + ['method', 'endpoint', 'status'] +) + +http_request_duration_seconds = Histogram( + 'http_request_duration_seconds', + 'HTTP request duration in seconds', + ['method', 'endpoint'] +) + +http_requests_in_progress = Gauge( + 'http_requests_in_progress', + 'Number of HTTP requests in progress' ) -handler.setFormatter(formatter) -logger.addHandler(handler) # ---------------------------- # HELPERS # ---------------------------- + def get_uptime(): delta = datetime.now(timezone.utc) - START_TIME seconds = 
int(delta.total_seconds()) - return {'seconds': seconds, 'human': f"{seconds} seconds"} + + hours = seconds // 3600 + minutes = (seconds % 3600) // 60 + secs = seconds % 60 + + parts = [] + if hours > 0: + parts.append(f"{hours} hour{'s' if hours != 1 else ''}") + if minutes > 0: + parts.append(f"{minutes} minute{'s' if minutes != 1 else ''}") + if secs > 0 or not parts: + parts.append(f"{secs} second{'s' if secs != 1 else ''}") + + return { + 'seconds': seconds, + 'human': ', '.join(parts) + } # ---------------------------- -# LOGGING MIDDLEWARE +# METRICS MIDDLEWARE # ---------------------------- + @app.before_request -def log_request(): +def before_request(): request.start_time = datetime.now(timezone.utc) - logging.info( - "request_received", - extra={ - "method": request.method, - "path": request.path, - "client_ip": request.remote_addr - } - ) + http_requests_in_progress.inc() @app.after_request -def log_response(response): +def after_request(response): duration = (datetime.now(timezone.utc) - request.start_time).total_seconds() - logging.info( - "request_completed", - extra={ - "method": request.method, - "path": request.path, - "status": response.status_code, - "client_ip": request.remote_addr, - "duration": duration - } - ) + + http_requests_total.labels( + method=request.method, + endpoint=request.path, + status=str(response.status_code) + ).inc() + + http_request_duration_seconds.labels( + method=request.method, + endpoint=request.path + ).observe(duration) + + http_requests_in_progress.dec() + return response # ---------------------------- # ROUTES # ---------------------------- + @app.route('/') def main_endpoint(): + system_info = { + 'hostname': socket.gethostname(), + 'platform': platform.system(), + 'platform_version': platform.version(), + 'architecture': platform.machine(), + 'cpu_count': os.cpu_count(), + 'python_version': platform.python_version(), + 'is_docker_container': IS_DOCKER, + 'container_id': CONTAINER_ID + } + + runtime_info = { + 
'uptime_seconds': get_uptime()['seconds'], + 'uptime_human': get_uptime()['human'], + 'current_time': datetime.now(timezone.utc).isoformat(), + 'timezone': 'UTC', + 'start_time': START_TIME.isoformat() + } + + request_info = { + 'client_ip': request.remote_addr, + 'user_agent': request.headers.get('User-Agent', 'Unknown'), + 'method': request.method, + 'path': request.path + } + return jsonify({ - 'service': 'devops-info-service', - 'status': 'running', - 'time': datetime.now(timezone.utc).isoformat() + 'service': { + 'name': 'devops-info-service', + 'version': '2.0.0', + 'description': 'DevOps course info service (Dockerized)', + 'framework': 'Flask', + 'environment': 'docker' if IS_DOCKER else 'local' + }, + 'system': system_info, + 'runtime': runtime_info, + 'request': request_info }) @app.route('/health') def health_check(): return jsonify({ 'status': 'healthy', - 'uptime': get_uptime()['seconds'] + 'timestamp': datetime.now(timezone.utc).isoformat(), + 'uptime_seconds': get_uptime()['seconds'], + 'environment': 'docker' if IS_DOCKER else 'local', + 'container_id': CONTAINER_ID }) @app.route('/docker') def docker_info(): return jsonify({ 'is_docker': IS_DOCKER, - 'container_id': CONTAINER_ID + 'container_id': CONTAINER_ID, + 'docker_env': dict(os.environ) if IS_DOCKER else None, + 'message': 'Running in Docker container' if IS_DOCKER else 'Running locally' }) +@app.route('/metrics') +def metrics(): + return generate_latest(), 200, {'Content-Type': CONTENT_TYPE_LATEST} + # ---------------------------- # ERRORS # ---------------------------- + @app.errorhandler(404) def not_found(error): - logging.error( - "not_found", - extra={ - "method": request.method, - "path": request.path, - "status": 404, - "client_ip": request.remote_addr - } - ) - return jsonify({'error': 'Not Found'}), 404 + return jsonify({ + 'error': 'Not Found', + 'message': 'Endpoint does not exist' + }), 404 # ---------------------------- # START # ---------------------------- + if __name__ == 
'__main__': - logging.info( - "service_started", - extra={ - "host": HOST, - "port": PORT, - "debug": DEBUG, - "docker": IS_DOCKER, - "container_id": CONTAINER_ID - } - ) + print(f"Starting DevOps Info Service on {HOST}:{PORT}") + print(f"Debug mode: {DEBUG}") + print(f"Docker environment: {IS_DOCKER}") + print(f"Container ID: {CONTAINER_ID}") + app.run(host=HOST, port=PORT, debug=DEBUG) \ No newline at end of file diff --git a/labs/app_python/requirements.txt b/labs/app_python/requirements.txt index 6a7d4a4546..fa23297994 100644 --- a/labs/app_python/requirements.txt +++ b/labs/app_python/requirements.txt @@ -6,4 +6,6 @@ python-dotenv==1.0.1 # Runtime dependencies Werkzeug==3.1.3 -Jinja2==3.1.4 \ No newline at end of file +Jinja2==3.1.4 + +prometheus-client==0.23.1 \ No newline at end of file diff --git a/labs/monitoring/docker-compose.yml b/labs/monitoring/docker-compose.yml index d5efe8db44..554acf1c32 100644 --- a/labs/monitoring/docker-compose.yml +++ b/labs/monitoring/docker-compose.yml @@ -87,10 +87,36 @@ services: interval: 10s timeout: 5s retries: 5 + + prometheus: + image: prom/prometheus:v3.9.0 + container_name: prometheus + ports: + - "9090:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus-data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.retention.time=15d' + - '--storage.tsdb.retention.size=10GB' + networks: + - logging + deploy: + resources: + limits: + cpus: '1.0' + memory: 1G + healthcheck: + test: ["CMD-SHELL", "wget --spider -q http://localhost:9090/-/healthy || exit 1"] + interval: 10s + timeout: 5s + retries: 5 volumes: loki-data: grafana-data: + prometheus-data: networks: logging: \ No newline at end of file diff --git a/labs/monitoring/docs/LAB08.md b/labs/monitoring/docs/LAB08.md new file mode 100644 index 0000000000..81960d3d0a --- /dev/null +++ b/labs/monitoring/docs/LAB08.md @@ -0,0 +1,202 @@ +# Lab 8 — Metrics & Monitoring with Prometheus + +## 
1. Architecture + +This lab extends the observability stack from Lab 7 by adding metrics collection. + +Flow: +App → Prometheus → Grafana + +* Application exposes `/metrics` +* Prometheus scrapes metrics every 15s +* Grafana visualizes metrics + +Components: + +* Prometheus — metrics collection and storage +* Grafana — visualization +* Python app — metrics exporter +* Loki — logs + +--- + +## 2. Application Instrumentation + +The application was instrumented using `prometheus_client`. + +### Implemented Metrics + +Counter: + +* http_requests_total +* Labels: method, endpoint, status + +Histogram: + +* http_request_duration_seconds (labels: method, endpoint) + +Gauge: + +* http_requests_in_progress + +These metrics follow the RED method: + +* Rate → requests per second, derived from `http_requests_total` +* Errors → failed requests (5xx values of the `status` label) +* Duration → request latency from `http_request_duration_seconds` + +--- + +## 3. Prometheus Configuration + +Scrape interval: 15s + +Targets: + +* localhost:9090 (Prometheus itself) +* app-python:5000 +* loki:3100 +* grafana:3000 + +Config snippet: + +```yaml +scrape_configs: + - job_name: 'app' + static_configs: + - targets: ['app-python:5000'] + metrics_path: /metrics +``` + +Retention: + +* 15 days +* 10GB + +--- + +## 4. Dashboard Walkthrough + +Request Rate: + +``` +sum(rate(http_requests_total[5m])) by (endpoint) +``` + +Error Rate: + +``` +sum(rate(http_requests_total{status=~"5.."}[5m])) +``` + +Request Duration (p95): + +``` +histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) +``` + +Active Requests: + +``` +http_requests_in_progress +``` + +Status Code Distribution: + +``` +sum by (status) (rate(http_requests_total[5m])) +``` + +Service Uptime: + +``` +up{job="app"} +``` + +--- + +## 5. PromQL Examples + +``` +rate(http_requests_total[5m]) +sum(rate(http_requests_total[5m])) +sum by (endpoint) (rate(http_requests_total[5m])) +histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) +up == 0 +``` + +--- + +## 6. 
Production Configuration + +Resource Limits: + +* Prometheus: 1 CPU / 1GB RAM +* Grafana: 1 CPU / 1GB RAM +* Loki: 1 CPU / 1GB RAM +* App: 0.5 CPU / 512MB RAM + +Health Checks: + +* Prometheus: /-/healthy +* App: /health +* Grafana: /api/health +* Loki: /ready + +Persistence: + +* prometheus-data +* grafana-data +* loki-data + +--- + +## 7. Testing Results + +* Prometheus targets are UP +* Metrics endpoint is accessible +* Dashboard shows live data +* Logs and metrics work together + +--- + +## 8. Challenges & Solutions + +Prometheus config error: + +* retention settings in the Prometheus YAML config caused an error +* fixed by moving retention to command-line flags (`--storage.tsdb.retention.time`, `--storage.tsdb.retention.size`) + +No metrics in Grafana: + +* the Grafana data source URL was wrong +* fixed by pointing it to http://prometheus:9090 + +--- + +## 9. Metrics vs Logs + +* Logs show individual events +* Metrics show aggregated data + +Both are required for observability. + +--- + +## Evidence + +### Prometheus Targets + +![Prometheus Targets](../screenshots/targets.png) + +### Metrics Endpoint + +![Metrics](../screenshots/metrics.png) + +### Dashboard + +![Dashboard](../screenshots/grafana-metrics.png) + +### PromQL Query + +![PromQL](../screenshots/promql.png) diff --git a/labs/monitoring/prometheus/prometheus.yml b/labs/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000000..3281518602 --- /dev/null +++ b/labs/monitoring/prometheus/prometheus.yml @@ -0,0 +1,21 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'app' + static_configs: + - targets: ['app-python:5000'] + metrics_path: /metrics + + - job_name: 'loki' + static_configs: + - targets: ['loki:3100'] + + - job_name: 'grafana' + static_configs: + - targets: ['grafana:3000'] \ No newline at end of file diff --git a/labs/monitoring/screenshots/grafana-metrics.png b/labs/monitoring/screenshots/grafana-metrics.png new file mode 100644 index 0000000000..c0abf8ea4f Binary files /dev/null and 
b/labs/monitoring/screenshots/grafana-metrics.png differ diff --git a/labs/monitoring/screenshots/metrics.png b/labs/monitoring/screenshots/metrics.png new file mode 100644 index 0000000000..44c992b151 Binary files /dev/null and b/labs/monitoring/screenshots/metrics.png differ diff --git a/labs/monitoring/screenshots/promql.png b/labs/monitoring/screenshots/promql.png new file mode 100644 index 0000000000..370fc09adb Binary files /dev/null and b/labs/monitoring/screenshots/promql.png differ diff --git a/labs/monitoring/screenshots/targets.png b/labs/monitoring/screenshots/targets.png new file mode 100644 index 0000000000..c1cfab33e5 Binary files /dev/null and b/labs/monitoring/screenshots/targets.png differ