diff --git a/app_python/app.py b/app_python/app.py
index 29cb4e95d9..192b259897 100644
--- a/app_python/app.py
+++ b/app_python/app.py
@@ -7,27 +7,75 @@
 import socket
 import platform
 import logging
+import sys
 from datetime import datetime, timezone
+import time
+from fastapi.responses import JSONResponse, Response
+from prometheus_client import Counter, Histogram, Gauge, generate_latest, CONTENT_TYPE_LATEST
 
 from fastapi import FastAPI, Request
-from fastapi.responses import JSONResponse
+from pythonjsonlogger import jsonlogger
 import uvicorn
 
 HOST = os.getenv("HOST", "0.0.0.0")
 PORT = int(os.getenv("PORT", 5000))
 
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - %(levelname)s - %(message)s",
+
+# Configure JSON logging
+logger = logging.getLogger("devops-info-service")
+logger.setLevel(logging.INFO)
+logger.handlers.clear()
+
+log_handler = logging.StreamHandler(sys.stdout)
+formatter = jsonlogger.JsonFormatter(
+    "%(asctime)s %(levelname)s %(message)s %(method)s %(path)s %(client_ip)s %(status_code)s"
+)
+log_handler.setFormatter(formatter)
+logger.addHandler(log_handler)
+logger.propagate = False
+
+# Prometheus metrics
+HTTP_REQUESTS_TOTAL = Counter(
+    "http_requests_total",
+    "Total HTTP requests",
+    ["method", "endpoint", "status"],
 )
-logger = logging.getLogger(__name__)
 
+HTTP_REQUEST_DURATION_SECONDS = Histogram(
+    "http_request_duration_seconds",
+    "HTTP request duration in seconds",
+    ["method", "endpoint"],
+)
+
+HTTP_REQUESTS_IN_PROGRESS = Gauge(
+    "http_requests_in_progress",
+    "HTTP requests currently being processed",
+)
+
+DEVOPS_INFO_ENDPOINT_CALLS_TOTAL = Counter(
+    "devops_info_endpoint_calls_total",
+    "Total endpoint calls in DevOps Info Service",
+    ["endpoint"],
+)
+
+DEVOPS_INFO_SYSTEM_COLLECTION_SECONDS = Histogram(
+    "devops_info_system_collection_seconds",
+    "System information collection duration in seconds",
+)
 
 START_TIME = datetime.now(timezone.utc)
 
 app = FastAPI(title="DevOps Info Service")
 
-logger.info("Application initialized")
+logger.info(
+    "Application initialized",
+    extra={
+        "method": "",
+        "path": "",
+        "client_ip": "",
+        "status_code": "",
+    },
+)
 
 
 def get_uptime():
@@ -38,10 +86,19 @@ def get_uptime():
     minutes = (seconds % 3600) // 60
     return seconds, f"{hours} hours, {minutes} minutes"
 
+
+def normalize_endpoint(path: str) -> str:
+    """Return the path for known routes and "other" for everything else."""
+    if path in ["/", "/health", "/metrics"]:
+        return path
+    return "other"
+
 
 def get_system_info():
     """Collect system information."""
-    return {
+    start = time.perf_counter()
+
+    info = {
         "hostname": socket.gethostname(),
         "platform": platform.system(),
         "platform_version": platform.release(),
@@ -50,12 +107,63 @@ def get_system_info():
         "python_version": platform.python_version(),
     }
 
+    DEVOPS_INFO_SYSTEM_COLLECTION_SECONDS.observe(time.perf_counter() - start)
+    return info
+
+
+@app.middleware("http")
+async def log_requests(request: Request, call_next):
+    """Log every HTTP request in JSON format and collect Prometheus metrics.
+
+    Metrics and the access log are written in a ``finally`` block so that a
+    request whose handler raises is still counted (as status 500) and the
+    in-progress gauge is always decremented.
+    """
+    endpoint = normalize_endpoint(request.url.path)
+    method = request.method
+    # Assume failure until a response exists: if call_next raises, the error
+    # propagates past this middleware and is reported as status 500.
+    status_code = 500
+
+    HTTP_REQUESTS_IN_PROGRESS.inc()
+    DEVOPS_INFO_ENDPOINT_CALLS_TOTAL.labels(endpoint=endpoint).inc()
+
+    # perf_counter is monotonic, so wall-clock adjustments cannot skew durations.
+    start_time = time.perf_counter()
+    try:
+        response = await call_next(request)
+        status_code = response.status_code
+        return response
+    finally:
+        duration = time.perf_counter() - start_time
+
+        HTTP_REQUESTS_TOTAL.labels(
+            method=method,
+            endpoint=endpoint,
+            status=str(status_code),
+        ).inc()
+
+        HTTP_REQUEST_DURATION_SECONDS.labels(
+            method=method,
+            endpoint=endpoint,
+        ).observe(duration)
+
+        HTTP_REQUESTS_IN_PROGRESS.dec()
+
+        logger.info(
+            "HTTP request processed",
+            extra={
+                "method": method,
+                "path": request.url.path,
+                "client_ip": request.client.host if request.client else "",
+                "status_code": status_code,
+            },
+        )
+
 
 @app.get("/")
 async def index(request: Request):
     """Main endpoint returning service and system information."""
-    logger.info("Handling request to '/'")
-
     uptime_seconds, uptime_human = get_uptime()
 
     return {
@@ -81,6 +189,7 @@ async def index(request: Request):
         "endpoints": [
             {"path": "/", "method": "GET", "description": "Service information"},
             {"path": "/health", "method": "GET", "description": "Health check"},
+            {"path": "/metrics", "method": "GET", "description": "Prometheus metrics"},
         ],
     }
 
@@ -88,8 +197,6 @@ async def index(request: Request):
 @app.get("/health")
 async def health():
     """Health check endpoint for monitoring."""
-    logger.info("Health check requested")
-
     uptime_seconds, _ = get_uptime()
     return {
         "status": "healthy",
@@ -97,10 +204,25 @@ async def health():
         "uptime_seconds": uptime_seconds,
     }
 
+
+@app.get("/metrics")
+async def metrics():
+    """Prometheus metrics endpoint."""
+    return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
+
 
 @app.exception_handler(404)
 async def not_found(request: Request, exc):
     """Handle 404 errors."""
+    logger.warning(
+        "Endpoint not found",
+        extra={
+            "method": request.method,
+            "path": request.url.path,
+            "client_ip": request.client.host if request.client else "",
+            "status_code": 404,
+        },
+    )
     return JSONResponse(
         status_code=404,
         content={"error": "Not Found", "message": "Endpoint does not exist"},
@@ -110,13 +232,33 @@ async def not_found(request: Request, exc):
 @app.exception_handler(500)
 async def internal_error(request: Request, exc):
     """Handle unexpected server errors."""
-    logger.error(f"Internal server error: {exc}")
+    logger.error(
+        "Internal server error",
+        # Attach the exception so its type, message and traceback stay in the log.
+        exc_info=exc,
+        extra={
+            "method": request.method,
+            "path": request.url.path,
+            "client_ip": request.client.host if request.client else "",
+            "status_code": 500,
+        },
+    )
     return JSONResponse(
         status_code=500,
         content={"error": "Internal Server Error", "message": "An unexpected error occurred"},
     )
 
 
-if __name__ == "__main__":
-    logger.info(f"Starting server on {HOST}:{PORT}")
-    uvicorn.run("app:app", host=HOST, port=PORT)
+if __name__ == "__main__":
+    logger.info(
+        "Starting server on %s:%s",
+        HOST,
+        PORT,
+        extra={
+            "method": "",
+            "path": "",
+            "client_ip": "",
+            "status_code": "",
+        },
+    )
+    uvicorn.run(app, host=HOST, port=PORT)
diff 
--git a/app_python/requirements.txt b/app_python/requirements.txt
index ebc98913e8..ef9f879780 100644
--- a/app_python/requirements.txt
+++ b/app_python/requirements.txt
@@ -1,2 +1,4 @@
 fastapi==0.115.8
 uvicorn[standard]==0.32.0
+python-json-logger==2.0.7
+prometheus-client==0.23.1
diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml
new file mode 100644
index 0000000000..e5773d69f8
--- /dev/null
+++ b/monitoring/docker-compose.yml
@@ -0,0 +1,130 @@
+version: "3.8"
+
+services:
+  loki:
+    image: grafana/loki:3.0.0
+    container_name: loki
+    ports:
+      - "3100:3100"
+    command: -config.file=/etc/loki/config.yml
+    volumes:
+      - ./loki/config.yml:/etc/loki/config.yml:ro
+      - loki-data:/loki
+    networks:
+      - logging
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 10s
+    deploy:
+      resources:
+        limits:
+          memory: 1G
+          cpus: "1.0"
+
+  promtail:
+    image: grafana/promtail:3.0.0
+    container_name: promtail
+    command: -config.file=/etc/promtail/config.yml
+    ports:
+      - "9080:9080"
+    volumes:
+      - ./promtail/config.yml:/etc/promtail/config.yml:ro
+      - /var/lib/docker/containers:/var/lib/docker/containers:ro
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+    networks:
+      - logging
+    restart: unless-stopped
+    deploy:
+      resources:
+        limits:
+          memory: 256M
+          cpus: "0.5"
+
+  grafana:
+    image: grafana/grafana:12.3.1
+    container_name: grafana
+    ports:
+      - "3000:3000"
+    environment:
+      - GF_AUTH_ANONYMOUS_ENABLED=false
+      - GF_SECURITY_ADMIN_USER=admin
+      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?set GRAFANA_ADMIN_PASSWORD in the environment or .env}
+    volumes:
+      - grafana-data:/var/lib/grafana
+    networks:
+      - logging
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 20s
+    deploy:
+      resources:
+        limits:
+          memory: 512M
+          cpus: "0.5"
+
+  
prometheus:
+    image: prom/prometheus:v3.9.0
+    container_name: prometheus
+    ports:
+      - "9090:9090"
+    command:
+      - "--config.file=/etc/prometheus/prometheus.yml"
+      - "--storage.tsdb.retention.time=15d"
+      - "--storage.tsdb.retention.size=10GB"
+    volumes:
+      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - prometheus-data:/prometheus
+    networks:
+      - logging
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9090/-/healthy || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 10s
+    deploy:
+      resources:
+        limits:
+          memory: 1G
+          cpus: "1.0"
+
+  app-python:
+    image: fayzullin/devops-info-service:latest
+    container_name: app-python
+    ports:
+      - "8000:5000"
+    networks:
+      - logging
+    labels:
+      logging: "promtail"
+      app: "devops-python"
+      service_name: "devops-python"
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health', timeout=3)"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    deploy:
+      resources:
+        limits:
+          memory: 256M
+          cpus: "0.5"
+
+volumes:
+  loki-data:
+  grafana-data:
+  prometheus-data:
+
+networks:
+  logging:
+    driver: bridge
diff --git a/monitoring/docs/LAB08.md b/monitoring/docs/LAB08.md
new file mode 100644
index 0000000000..63ee3fc179
--- /dev/null
+++ b/monitoring/docs/LAB08.md
@@ -0,0 +1,443 @@
+# Lab 8 — Metrics & Monitoring with Prometheus
+
+## Overview
+
+In this lab, I instrumented my Python application with Prometheus metrics and deployed a complete monitoring stack using Prometheus and Grafana. The application now exposes a `/metrics` endpoint, Prometheus scrapes metrics from all configured targets, and Grafana visualizes the collected data through custom dashboards.
+
+This lab extends the observability stack from Lab 7 by adding metrics-based monitoring on top of logs.
+
+Technologies used:
+
+- FastAPI
+- prometheus_client
+- Prometheus v3.9.0
+- Grafana v12.3.1
+- Docker Compose
+
+---
+
+## 1. 
Architecture + +The monitoring architecture is based on a pull model. + +**Metric flow:** + +Application → `/metrics` endpoint → Prometheus scrapes metrics → Grafana queries Prometheus → dashboards visualize metrics + +### Components + +- **Python application** exposes Prometheus metrics on `/metrics` +- **Prometheus** scrapes metrics every 15 seconds +- **Grafana** uses Prometheus as a data source +- **Loki + Promtail** from Lab 7 remain available for logs + +### Monitoring targets + +The following targets were configured in Prometheus: + +- `prometheus` → `localhost:9090` +- `app` → `app-python:5000/metrics` +- `loki` → `loki:3100/metrics` +- `grafana` → `grafana:3000/metrics` + +--- + +## 2. Application Instrumentation + +I added Prometheus instrumentation to the FastAPI application using the `prometheus_client` Python library. + +### Installed dependency + +```txt +prometheus-client==0.23.1 +``` + +Metrics added +1. http_requests_total + +Type: Counter +Purpose: Counts total HTTP requests +Labels: method, endpoint, status + +This metric is used to calculate request rate, request distribution, and error rate. + +2. http_request_duration_seconds + +Type: Histogram +Purpose: Measures request latency in seconds +Labels: method, endpoint + +This metric is used for latency analysis and percentile calculations such as p95. + +3. http_requests_in_progress + +Type: Gauge +Purpose: Tracks active requests currently being processed + +This metric shows concurrency and current request load. + +4. devops_info_endpoint_calls_total + +Type: Counter +Purpose: Tracks how many times each application endpoint was called +Labels: endpoint + +This is an application-specific metric. + +5. devops_info_system_collection_seconds + +Type: Histogram +Purpose: Measures how long system information collection takes + +This is an internal business/application metric. 
+ +Why these metrics were chosen + +The selected metrics follow the RED method: + +Rate → request counters + +Errors → request counters filtered by error status codes + +Duration → latency histogram + +This provides a strong baseline for monitoring a request-driven API service. + +## 3. Metrics Endpoint + +The application exposes a Prometheus-compatible endpoint: + +/metrics + +The endpoint returns metrics in Prometheus text exposition format. + +Example output +# HELP http_requests_total Total HTTP requests +# TYPE http_requests_total counter +http_requests_total{endpoint="/",method="GET",status="200"} 1.0 + +# HELP http_request_duration_seconds HTTP request duration in seconds +# TYPE http_request_duration_seconds histogram + +# HELP http_requests_in_progress HTTP requests currently being processed +# TYPE http_requests_in_progress gauge + +The endpoint was tested locally and also through Docker Compose after deployment. + +## 4. Prometheus Configuration + +Prometheus was added to the Docker Compose monitoring stack. + +Service image +prom/prometheus:v3.9.0 +Exposed port +9090 +Main configuration file + +monitoring/prometheus/prometheus.yml + +Configuration used +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + + - job_name: "app" + static_configs: + - targets: ["app-python:5000"] + metrics_path: /metrics + + - job_name: "loki" + static_configs: + - targets: ["loki:3100"] + metrics_path: /metrics + + - job_name: "grafana" + static_configs: + - targets: ["grafana:3000"] + metrics_path: /metrics +Scrape interval + +scrape_interval: 15s + +evaluation_interval: 15s + +This is frequent enough for lab-scale monitoring and dashboard responsiveness. + +## 5. Prometheus Verification + +After deployment, I verified Prometheus through the web UI. 
+ +Prometheus endpoints tested + +http://localhost:9090 + +http://localhost:9090/targets + +http://localhost:9090/-/healthy + +Result + +All configured targets were successfully scraped and reported as UP: + +app + +grafana + +loki + +prometheus + +This confirmed that the monitoring stack was working correctly and all services were reachable through the Docker network. + +## 6. Grafana Prometheus Data Source + +I added Prometheus as a Grafana data source. + +Configuration + +Type: Prometheus + +URL: http://prometheus:9090 + +The data source test was successful and Grafana was able to query Prometheus metrics. + +## 7. Dashboard Walkthrough + +I created a custom metrics dashboard in Grafana. + +The dashboard includes at least 6 panels and focuses on the RED method. + +Panel 1 — Request Rate + +Type: Time series + +sum(rate(http_requests_total[5m])) by (endpoint) + +Purpose: Shows requests per second for each endpoint. + +Panel 2 — Error Rate + +Type: Time series + +sum(rate(http_requests_total{status=~"5.."}[5m])) + +Purpose: Shows the rate of server-side errors (5xx responses). + +Panel 3 — Requests per Status + +Type: Pie chart / Bar chart + +sum(rate(http_requests_total[5m])) by (status) + +Purpose: Visualizes the distribution of status codes. + +Panel 4 — p95 Latency + +Type: Time series + +histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) + +Purpose: Shows the 95th percentile request duration. + +Panel 5 — Active Requests + +Type: Gauge + +http_requests_in_progress + +Purpose: Shows how many requests are currently in progress. + +Panel 6 — Service Uptime + +Type: Stat + +up{job="app"} + +Purpose: Indicates whether the application target is up (1) or down (0). + +Optional extra panel — Endpoint Calls + +Type: Time series + +sum(rate(devops_info_endpoint_calls_total[5m])) by (endpoint) + +Purpose: Shows per-endpoint business metric usage. + +## 8. 
PromQL Examples + +Below are several PromQL queries I used during testing and dashboard creation. + +1. Check all targets +up +2. Request rate per endpoint +sum(rate(http_requests_total[5m])) by (endpoint) +3. Error rate +sum(rate(http_requests_total{status=~"5.."}[5m])) +4. p95 latency +histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) +5. Active requests +http_requests_in_progress +6. Status code distribution +sum(rate(http_requests_total[5m])) by (status) +7. Application-specific endpoint usage +sum(rate(devops_info_endpoint_calls_total[5m])) by (endpoint) +## 9. Production Configuration + +I also applied production-oriented settings to the monitoring stack. + +Health checks + +Health checks were configured for key services: + +Prometheus: http://localhost:9090/-/healthy + +Grafana: http://localhost:3000/api/health + +Loki: http://localhost:3100/ready + +App: http://localhost:5000/health + +This allows Docker to monitor service health and improves reliability. + +Resource limits + +Resource limits were added to prevent uncontrolled resource usage. + +Configured examples: + +Prometheus: 1G memory, 1 CPU + +Loki: 1G memory, 1 CPU + +Grafana: 512M memory, 0.5 CPU + +App: 256M memory, 0.5 CPU + +Data retention + +Prometheus retention was configured with: + +15d time retention + +10GB storage limit + +This helps control disk usage and improve query performance. + +Persistent volumes + +Persistent volumes were configured for: + +Prometheus data + +Loki data + +Grafana data + +This ensures dashboards and collected metrics survive restarts. + +## 10. Testing Results +Metrics endpoint + +The /metrics endpoint successfully returned: + +default Python/process metrics + +custom HTTP request metrics + +custom application-specific metrics + +Prometheus targets + +All targets were visible and UP in the Prometheus /targets page. + +Grafana dashboard + +The Grafana dashboard displayed live data after generating traffic with curl requests. 
+
+Traffic generation used
+for i in {1..50}; do curl http://localhost:8000/; done
+for i in {1..50}; do curl http://localhost:8000/health; done
+
+This generated request counters and histogram observations, allowing the dashboard to display non-empty charts.
+
+## 11. Metrics vs Logs
+
+This lab builds on Lab 7.
+
+Logs are useful for:
+
+investigating specific events
+
+debugging failures
+
+analyzing detailed request context
+
+Metrics are useful for:
+
+trend analysis
+
+dashboards
+
+alerting
+
+performance and availability monitoring
+
+Combined value
+
+Logs explain what happened, while metrics show how much, how often, and how fast. Together they provide stronger observability.
+
+## 12. Challenges and Solutions
+Challenge 1 — Duplicate metrics registration
+
+Initially, the app crashed due to duplicate Prometheus metric registration. Running the file as a script loads it as the `__main__` module, and uvicorn.run("app:app", ...) then imported the same file a second time as the module `app`, so every metric definition executed twice against the same default registry.
+
+Solution:
+Changed startup to:
+
+uvicorn.run(app, host=HOST, port=PORT)
+
+Passing the already-created application object instead of an import string prevented the second import and the duplicate metric registration.
+
+Challenge 2 — Loki config mount issue
+
+At one point Loki failed because config.yml was accidentally created as a directory instead of a file.
+
+Solution:
+Removed the directory, recreated config.yml as a proper file, and restarted the stack.
+
+Challenge 3 — Promtail config mount issue
+
+A similar mount issue occurred with Promtail.
+
+Solution:
+Recreated monitoring/promtail/config.yml as a file and restarted the stack.
+
+Challenge 4 — Empty graphs in Grafana
+
+Initially some panels showed no data.
+
+Solution:
+Generated traffic with repeated curl requests to populate the metrics.
+
+## 13. Conclusion
+
+In this lab, I successfully instrumented the FastAPI application with Prometheus metrics and deployed a complete monitoring stack with Prometheus and Grafana.
+ +The final solution includes: + +a working /metrics endpoint + +request counters, latency histograms, and active request gauges + +Prometheus scraping multiple services + +Grafana dashboards for live metric visualization + +production-related health checks, limits, retention, and persistence + +This lab completed the metrics side of observability and complemented the logging setup from Lab 7. diff --git a/monitoring/docs/screenshots/Dashboard.jpg b/monitoring/docs/screenshots/Dashboard.jpg new file mode 100644 index 0000000000..0f54e61ab9 Binary files /dev/null and b/monitoring/docs/screenshots/Dashboard.jpg differ diff --git a/monitoring/docs/screenshots/Prometheus.jpg b/monitoring/docs/screenshots/Prometheus.jpg new file mode 100644 index 0000000000..adb94515d2 Binary files /dev/null and b/monitoring/docs/screenshots/Prometheus.jpg differ diff --git a/monitoring/loki/config.yml b/monitoring/loki/config.yml new file mode 100644 index 0000000000..31fdabb621 --- /dev/null +++ b/monitoring/loki/config.yml @@ -0,0 +1,35 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +common: + path_prefix: /loki + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + filesystem: + directory: /loki/chunks + +limits_config: + retention_period: 168h + allow_structured_metadata: false + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + delete_request_store: filesystem diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000000..6e1ae9ff10 --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,23 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + + - job_name: "app" 
+ static_configs: + - targets: ["app-python:5000"] + metrics_path: /metrics + + - job_name: "loki" + static_configs: + - targets: ["loki:3100"] + metrics_path: /metrics + + - job_name: "grafana" + static_configs: + - targets: ["grafana:3000"] + metrics_path: /metrics diff --git a/monitoring/promtail/config.yml b/monitoring/promtail/config.yml new file mode 100644 index 0000000000..e54ed1473b --- /dev/null +++ b/monitoring/promtail/config.yml @@ -0,0 +1,33 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + + relabel_configs: + - source_labels: ['__meta_docker_container_name'] + regex: '/(.*)' + target_label: 'container' + + - source_labels: ['__meta_docker_container_label_logging'] + regex: 'promtail' + action: keep + + - source_labels: ['__meta_docker_container_label_app'] + target_label: 'app' + + - source_labels: ['__meta_docker_container_label_service_name'] + target_label: 'service_name' + + - target_label: 'job' + replacement: 'docker'