From 9694d4787145dd1c9e96248fbb56e0c25f3a9dba Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 30 Apr 2025 21:51:15 +0300 Subject: [PATCH 1/4] refactor(servers): keep only really neccessary metrics --- .../dashboards/log-monitoring-dashboard.json | 189 +-------- .../dashboards/performance-dashboard.json | 383 +----------------- 2 files changed, 3 insertions(+), 569 deletions(-) diff --git a/test-servers/grafana/provisioning/dashboards/log-monitoring-dashboard.json b/test-servers/grafana/provisioning/dashboards/log-monitoring-dashboard.json index 3e8d819..7d68173 100644 --- a/test-servers/grafana/provisioning/dashboards/log-monitoring-dashboard.json +++ b/test-servers/grafana/provisioning/dashboards/log-monitoring-dashboard.json @@ -457,99 +457,6 @@ } ] }, - { - "datasource": { - "type": "prometheus", - "uid": "Prometheus" - }, - "description": "Estimated time required to process the current queue based on recent processing rates. This helps predict how long it will take to clear backlogs in the system.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 16 - }, - "id": 5, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": 
true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "title": "Estimated Queue Processing Time", - "type": "timeseries", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "Prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "estimated_processing_time_seconds", - "instant": false, - "interval": "", - "legendFormat": "Time (seconds)", - "range": true, - "refId": "A" - } - ] - }, { "datasource": { "type": "prometheus", @@ -607,8 +514,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 24 + "x": 12, + "y": 16 }, "id": 7, "options": { @@ -641,98 +548,6 @@ "refId": "A" } ] - }, - { - "datasource": { - "type": "prometheus", - "uid": "Prometheus" - }, - "description": "Tracks the number of consistency errors detected over time. These errors occur when the consistency ratio falls below or rises above the acceptable thresholds, indicating potential data loss or duplication.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 24 - }, - "id": 9, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "title": "Consistency 
Errors", - "type": "timeseries", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "Prometheus" - }, - "editorMode": "code", - "exemplar": false, - "expr": "consistency_errors_total", - "instant": false, - "interval": "", - "legendFormat": "Errors", - "range": true, - "refId": "A" - } - ] } ], "refresh": "5s", diff --git a/test-servers/grafana/provisioning/dashboards/performance-dashboard.json b/test-servers/grafana/provisioning/dashboards/performance-dashboard.json index be9659e..130dca1 100644 --- a/test-servers/grafana/provisioning/dashboards/performance-dashboard.json +++ b/test-servers/grafana/provisioning/dashboards/performance-dashboard.json @@ -214,196 +214,6 @@ "title": "Processing Rate (logs/sec)", "type": "timeseries" }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "area" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 500 - }, - { - "color": "red", - "value": 1000 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 8 - }, - "id": 3, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { 
- "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "builder", - "expr": "rabbitmq_queue_size", - "legendFormat": "Queue Size", - "range": true, - "refId": "A" - } - ], - "title": "RabbitMQ Queue Size", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "area" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 0 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 8 - }, - "id": 4, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "builder", - "expr": "rabbitmq_queue_rate", - "legendFormat": "Queue Growth Rate", - "range": true, - "refId": "A" - } - ], - "title": "Queue Growth Rate (logs/sec)", - "type": "timeseries" - }, { "datasource": { "type": "prometheus", @@ -429,7 +239,7 @@ "h": 8, "w": 12, "x": 0, - "y": 16 + "y": 8 }, "id": 5, "options": { @@ -470,197 +280,6 @@ ], "title": "Logs Processed Distribution", "type": "piechart" - }, - { - "datasource": { - "type": 
"prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 1 - }, - { - "color": "red", - "value": 5 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 16 - }, - "id": 6, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.3.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "builder", - "expr": "performance_errors_total", - "legendFormat": "Errors", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "builder", - "expr": "performance_warnings_total", - "hide": false, - "legendFormat": "Warnings", - "range": true, - "refId": "B" - } - ], - "title": "Performance Alerts", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - 
"steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 24 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "builder", - "exemplar": false, - "expr": "histogram_quantile(0.95, sum(rate(log_processing_latency_ms_bucket[5m])) by (le, component))", - "format": "time_series", - "hide": false, - "instant": false, - "legendFormat": "{{component}} P95", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "builder", - "expr": "histogram_quantile(0.50, sum(rate(log_processing_latency_ms_bucket[5m])) by (le, component))", - "hide": false, - "legendFormat": "{{component}} P50", - "range": true, - "refId": "B" - } - ], - "title": "Log Processing Latency Percentiles", - "type": "timeseries" } ], "refresh": "10s", From 7954134c207f317882fa62d938c635b9a913c667 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 30 Apr 2025 22:11:35 +0300 Subject: [PATCH 2/4] docs(servers): adjust README with performance analyzer --- test-servers/README.md | 85 ++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 36 deletions(-) diff --git a/test-servers/README.md b/test-servers/README.md index 0f54321..a25fe20 100644 --- a/test-servers/README.md +++ b/test-servers/README.md @@ -57,12 +57,29 @@ A service that monitors the entire pipeline to ensure logs are processed correct - `CONSISTENCY_THRESHOLD_LOW`, `CONSISTENCY_THRESHOLD_HIGH` - Thresholds for consistency alerts - `METRICS_PORT` - Port for Prometheus metrics +### Performance Analyzer (`performance_analyzer/`) + +A service that analyzes and 
exports performance metrics for the log processing pipeline. + +- **Key Features**: + - Monitors log processing times across different components + - Calculates processing rates and throughput + - Identifies performance bottlenecks + - Provides real-time performance metrics for the pipeline + - Exports metrics to Prometheus for visualization in Grafana + +- **Environment Variables**: + - `PYTHON_SERVER_METRICS_URL` - URL to fetch generator metrics + - `CHECK_INTERVAL` - Time between performance checks (seconds) + - `METRICS_PORT` - Port for Prometheus metrics + - `RABBITMQ_*` - RabbitMQ connection settings for queue monitoring + ### Prometheus (`prometheus/`) Time-series database for storing and querying metrics from all components. - **Configuration**: - - Scrapes metrics from Python Server and Consistency Validator + - Scrapes metrics from Python Server, Consistency Validator, and Performance Analyzer - Default settings: 15s scrape interval, 15s evaluation interval - Configured via `prometheus.yml` @@ -74,9 +91,9 @@ Visualization platform for metrics stored in Prometheus. - Real-time log generation vs. 
processing rates - Consistency ratio gauge (ideal: 95-105%) - RabbitMQ queue depth monitoring - - Estimated processing time for queued messages - Consistency trends over time - - Error detection and visualization + - Performance metrics visualization + - Multiple dashboards for different monitoring focuses ## Metrics Overview @@ -89,26 +106,37 @@ The system exposes the following key metrics: | `logs_processed_total` | Gauge | Total number of logs processed and stored in MongoDB | | `rabbitmq_queue_depth` | Gauge | Current number of messages in the RabbitMQ queue | | `consistency_ratio` | Gauge | Ratio between processed and generated logs (percentage) | -| `estimated_processing_time_seconds` | Gauge | Estimated time to process all queued logs | | `consistency_checks_total` | Counter | Total number of consistency checks performed | -| `consistency_errors_total` | Counter | Total number of consistency errors detected | | `connection_errors_total` | Counter | Total connection errors during log generation/processing | | `active_workers` | Gauge | Number of active worker threads in Python Server | +| `log_processing_time_ms` | Gauge | Processing time for logs in milliseconds | +| `log_processing_rate` | Gauge | Rate of log processing (logs/sec) | +| `logs_processed_total_by_component` | Gauge | Logs processed by each component | + +## Grafana Dashboards + +The system includes two specialized dashboards: -## Grafana Dashboard +### 1. Log Monitoring Dashboard -The provided dashboard (`grafana/dashboard.json`) visualizes system performance with: +This dashboard (`log-monitoring-dashboard.json`) focuses on data consistency and pipeline health: 1. **Logs (Generated vs Processed)**: Time-series graph showing the gap between logs generated and processed 2. **Data Consistency Ratio**: Gauge showing processing consistency percentage 3. **Log Counters**: Current totals for generated and processed logs 4. **RabbitMQ Queue Depth**: Current number of messages in queue -5. 
**Queue Depth Trend**: Time-series graph of queue depth over time -6. **Estimated Queue Processing Time**: Time required to process the current queue -7. **Consistency Ratio Trend**: Time-series graph showing consistency patterns -8. **Consistency Errors**: Count of detected consistency problems over time +5. **RabbitMQ Queue Depth Trend**: Time-series graph of queue depth over time +6. **Consistency Ratio Trend**: Time-series graph showing consistency patterns -The dashboard uses color-coded thresholds: +### 2. Performance Dashboard + +This dashboard (`performance-dashboard.json`) focuses on system performance metrics: + +1. **Log Processing Time (ms)**: Time-series graph showing processing times +2. **Processing Rate (logs/sec)**: Time-series graph of throughput +3. **Logs Processed Distribution**: Pie chart showing log distribution by component + +The dashboards use color-coded thresholds: - Green: System functioning normally (consistency 95-105%) - Yellow: Warning range (consistency 80-95% or 105-120%) - Red: Critical issues (consistency <80% or >120%) @@ -148,39 +176,24 @@ The Data Consistency Ratio gauge uses specific thresholds that have been careful - Above 120%: Serious duplication issues or fundamental measurement errors - Immediate investigation is necessary -#### Log Counter Difference Interpretation - -The "Log Counters" panel often shows a slight difference between the "Generated" and "Processed" counts, with Generated typically higher than Processed. This is both expected and correct for several reasons: - -1. **Pipeline Latency**: Logs take time to flow through the system from generation to processing completion. At any given moment, some logs are in transit (in RabbitMQ or being processed). +#### Log Counters Interpretation -2. **Measurement Timing**: The metrics are collected at slightly different points in time due to the Prometheus scrape interval, creating small natural discrepancies. - -3. 
**Asynchronous Processing**: The log generation and processing systems operate asynchronously and independently, so perfect count alignment would actually be suspicious. - -A small, stable difference (typically proportional to the generation rate and system scale) indicates the system is functioning correctly. Only a growing difference over time should be cause for concern, as it indicates the processing cannot keep up with generation. +The "Logs (Generated vs Processed)" panel shows a stepped lines of "Generated" and "Processed" counts. This is done due to a 15 seconds interval between collecting metrics to deal with the message queue storing some part of the logs before they are consumed be the analyzer. ## Running the Environment -To start the entire test environment: - -```bash -cd test-servers -docker-compose up -d -``` - Service endpoints: -- Grafana: http://localhost:3000 (admin/admin) +- Grafana: http://localhost:3000 - Prometheus: http://localhost:9090 (http://prometheus:9090 for Grafana Datasource) -- RabbitMQ Management: http://localhost:15672 (guest/guest) -- MongoDB Express: http://localhost:8082 (admin/pass) +- RabbitMQ Management: http://localhost:15672 +- MongoDB Express: http://localhost:8081 ## Adding New Test Servers -To add a new test server instance (e.g., test-servers-3) to increase log generation load: +To add a new test server instance (e.g., test-servers-4) to increase log generation load: -1. In `docker/docker-compose.yml`, duplicate an existing test-servers block, increment the container name to test-servers-3, and assign a new port (e.g., 8003:8000). -2. Update the `PYTHON_SERVER_METRICS_URL` environment variable in both consistency-validator and performance-analyzer services to include the new server's metrics endpoint (e.g., add `,http://test-servers-3:8000/metrics`). -3. Add the new server to the prometheus.yml targets: `- targets: ['test-servers-1:8000', 'test-servers-2:8000', 'test-servers-3:8000']`. +1. 
In `docker/docker-compose.yml`, duplicate an existing test-servers block, increment the container name to test-servers-4, and assign a new port (e.g., 8004:8000). 2. Update the `PYTHON_SERVER_METRICS_URL` environment variable in both consistency-validator and performance-analyzer services to include the new server's metrics endpoint (e.g., add `,http://test-servers-4:8000/metrics`). 3. Add the new server to the prometheus.yml targets: `- targets: ['test-servers-1:8000', 'test-servers-2:8000', 'test-servers-3:8000', 'test-servers-4:8000']`. 4. Update any service dependencies to include the new server. 5. Restart the environment using `docker-compose down -v && docker-compose up -d` from the docker directory. From bcb158f780c992f3b751445ee47cef8454edce2a Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 30 Apr 2025 23:30:11 +0300 Subject: [PATCH 3/4] refactor(servers): remove debug parts --- .../consistency_validator.py | 3 -- .../performance_analyzer.py | 29 +++---------------- 2 files changed, 4 insertions(+), 28 deletions(-) diff --git a/test-servers/consistency_validator/consistency_validator.py b/test-servers/consistency_validator/consistency_validator.py index 05cac3d..0076819 100644 --- a/test-servers/consistency_validator/consistency_validator.py +++ b/test-servers/consistency_validator/consistency_validator.py @@ -29,7 +29,6 @@ PROCESSING_DELAY_ALLOWANCE = int(os.getenv('PROCESSING_DELAY_ALLOWANCE', 120)) METRICS_PORT = int(os.getenv('METRICS_PORT', 8080)) -# Update to store metrics per server GENERATED_LOGS = Gauge('logs_generated_total', 'Total number of generated logs', ['server']) GENERATED_LOGS_TOTAL = Gauge('logs_generated_total_combined', 'Total combined logs generated from all servers') PROCESSED_LOGS = Gauge('logs_processed_total', 'Total number of processed logs') @@ -93,7 +92,6 @@ def get_generated_logs_counts(self): server_count = int(float(value)) break - # Extract server name from URL server_name = server_url.split('//')[1].split(':')[0] 
server_counts[server_name] = server_count total_count += server_count @@ -104,7 +102,6 @@ def get_generated_logs_counts(self): except Exception as e: logger.error(f"Error getting generated logs count from {server_url}: {e}") - # Установка значения общего количества сгенерированных логов GENERATED_LOGS_TOTAL.set(total_count) logger.info(f"Total logs generated across all servers: {total_count}") diff --git a/test-servers/performance_analyzer/performance_analyzer.py b/test-servers/performance_analyzer/performance_analyzer.py index b9cc909..3880965 100644 --- a/test-servers/performance_analyzer/performance_analyzer.py +++ b/test-servers/performance_analyzer/performance_analyzer.py @@ -25,10 +25,9 @@ RABBITMQ_PASSWORD = os.getenv('RABBITMQ_PASSWORD', 'guest') RABBITMQ_QUEUE = os.getenv('RABBITMQ_QUEUE', 'logs') METRICS_PORT = int(os.getenv('METRICS_PORT', 8091)) -PERFORMANCE_THRESHOLD_WARNING = float(os.getenv('PERFORMANCE_THRESHOLD_WARNING', 500)) # ms -PERFORMANCE_THRESHOLD_CRITICAL = float(os.getenv('PERFORMANCE_THRESHOLD_CRITICAL', 1000)) # ms +PERFORMANCE_THRESHOLD_WARNING = float(os.getenv('PERFORMANCE_THRESHOLD_WARNING', 500)) +PERFORMANCE_THRESHOLD_CRITICAL = float(os.getenv('PERFORMANCE_THRESHOLD_CRITICAL', 1000)) -# Prometheus metrics PROCESSING_TIME_GAUGE = Gauge('log_processing_time_ms', 'Average log processing time in milliseconds', ['component']) PROCESSING_RATE = Gauge('log_processing_rate', 'Number of logs processed per second', ['component']) LOGS_TOTAL = Gauge('logs_processed_total_by_component', 'Total number of logs processed', ['component']) @@ -81,7 +80,6 @@ def get_logs_count_from_mongodb(self): return 0 def get_server_metrics(self): - """Get metrics from all test servers""" server_metrics = {} for server_url in PYTHON_SERVER_METRICS_URLS: @@ -98,7 +96,6 @@ def get_server_metrics(self): except ValueError: continue - # Extract server name from URL server_name = server_url.split('//')[1].split(':')[0] server_metrics[server_name] = metrics 
logger.info(f"Collected metrics from {server_name}") @@ -139,28 +136,23 @@ def analyze_performance(self): PERFORMANCE_CHECKS.inc() - # Обновляем метрики очереди if queue_depth is not None: QUEUE_SIZE.set(queue_depth) - # Расчет скорости обработки (если доступны исторические данные) if self.last_processed_count is not None and self.last_check_time is not None: elapsed_seconds = (current_time - self.last_check_time).total_seconds() if elapsed_seconds > 0: - # Скорость обработки logs_delta = processed_count - self.last_processed_count processing_rate = logs_delta / elapsed_seconds PROCESSING_RATE.labels(component="analyzer").set(processing_rate) logger.info(f"Processing rate: {processing_rate:.2f} logs/sec") - # Оценка среднего времени обработки if logs_delta > 0: avg_processing_time_ms = (elapsed_seconds * 1000) / logs_delta PROCESSING_TIME_GAUGE.labels(component="analyzer").set(avg_processing_time_ms) LATENCY_HISTOGRAM.labels(component="analyzer").observe(avg_processing_time_ms) logger.info(f"Average processing time: {avg_processing_time_ms:.2f} ms per log") - # Проверка порогов производительности if avg_processing_time_ms > PERFORMANCE_THRESHOLD_CRITICAL: logger.error(f"CRITICAL: Processing time ({avg_processing_time_ms:.2f} ms) exceeds critical threshold ({PERFORMANCE_THRESHOLD_CRITICAL} ms)") PERFORMANCE_ERRORS.inc() @@ -168,7 +160,6 @@ def analyze_performance(self): logger.warning(f"WARNING: Processing time ({avg_processing_time_ms:.2f} ms) exceeds warning threshold ({PERFORMANCE_THRESHOLD_WARNING} ms)") PERFORMANCE_WARNINGS.inc() - # Расчет скорости изменения очереди if queue_depth is not None and self.last_queue_depth is not None: queue_change_rate = (queue_depth - self.last_queue_depth) / elapsed_seconds QUEUE_RATE.set(queue_change_rate) @@ -177,23 +168,18 @@ def analyze_performance(self): else: logger.info(f"Queue shrinking at rate of {abs(queue_change_rate):.2f} logs/second") - # Обновляем серверные метрики for server_name, metrics in 
server_metrics.items(): - # Устанавливаем метрики для каждого сервера for metric_key, metric_value in metrics.items(): - # Пример кастомных метрик серверов, которые нас интересуют if metric_key == 'logs_generated_total': LOGS_TOTAL.labels(component=f"server_{server_name}").set(metric_value) elif 'processing_time' in metric_key and '_count' not in metric_key and '_sum' not in metric_key: PROCESSING_TIME_GAUGE.labels(component=f"server_{server_name}").set(metric_value) LATENCY_HISTOGRAM.labels(component=f"server_{server_name}").observe(metric_value) - # Сохраняем текущее состояние для следующего анализа self.last_processed_count = processed_count self.last_check_time = current_time self.last_queue_depth = queue_depth - # Добавляем данные для исторического анализа performance_point = { "timestamp": current_time.isoformat(), "processed_count": processed_count, @@ -202,20 +188,17 @@ def analyze_performance(self): } self.performance_history.append(performance_point) - if len(self.performance_history) > 60: # Храним час данных с интервалом 1 минута + if len(self.performance_history) > 60: self.performance_history.pop(0) return performance_point def analyze_trends(self): - """Анализ трендов производительности за последнюю историю метрик""" if len(self.performance_history) < 5: return {"status": "insufficient_data", "message": "Недостаточно данных для анализа трендов"} - # Берем последние 5 точек для анализа краткосрочных трендов recent_points = self.performance_history[-5:] - # Анализ тренда скорости обработки processing_trend = "stable" if all(recent_points[i]["processed_count"] < recent_points[i+1]["processed_count"] for i in range(len(recent_points)-1)): @@ -224,7 +207,6 @@ def analyze_trends(self): for i in range(len(recent_points)-1)): processing_trend = "degrading" - # Анализ тренда очереди queue_trend = "stable" if all(recent_points[i].get("queue_depth", 0) > recent_points[i+1].get("queue_depth", 0) for i in range(len(recent_points)-1)): @@ -233,7 +215,6 @@ def 
analyze_trends(self): for i in range(len(recent_points)-1)): queue_trend = "degrading" - # Анализ общей производительности overall_status = "healthy" if queue_trend == "degrading" and processing_trend != "improving": overall_status = "at_risk" @@ -253,7 +234,6 @@ def export_performance_metrics(self): performance_data = self.analyze_performance() trends_analysis = self.analyze_trends() - # Добавляем прогнозы и рекомендации metrics = { "timestamp": datetime.now().isoformat(), "current": performance_data, @@ -261,7 +241,6 @@ def export_performance_metrics(self): "historical_data": self.performance_history[-10:] if len(self.performance_history) > 0 else [] } - # Добавляем рекомендации на основе анализа recommendations = [] if trends_analysis["status"] == "critical": recommendations.append("Критическая ситуация: рассмотрите возможность масштабирования компонента analyzer") @@ -300,6 +279,6 @@ def run(self): time.sleep(CHECK_INTERVAL) if __name__ == "__main__": - time.sleep(10) # Даем время другим компонентам загрузиться + time.sleep(10) analyzer = PerformanceAnalyzer() analyzer.run() \ No newline at end of file From f112cf0750f031eb6ff82ddc94fc6f39e0b7a363 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 30 Apr 2025 23:31:39 +0300 Subject: [PATCH 4/4] refactor(servers): code format --- test-servers/consistency_validator/consistency_validator.py | 1 - test-servers/performance_analyzer/performance_analyzer.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/test-servers/consistency_validator/consistency_validator.py b/test-servers/consistency_validator/consistency_validator.py index 0076819..7ed0c51 100644 --- a/test-servers/consistency_validator/consistency_validator.py +++ b/test-servers/consistency_validator/consistency_validator.py @@ -28,7 +28,6 @@ CONSISTENCY_THRESHOLD_HIGH = float(os.getenv('CONSISTENCY_THRESHOLD_HIGH', 120)) PROCESSING_DELAY_ALLOWANCE = int(os.getenv('PROCESSING_DELAY_ALLOWANCE', 120)) METRICS_PORT = 
int(os.getenv('METRICS_PORT', 8080)) - GENERATED_LOGS = Gauge('logs_generated_total', 'Total number of generated logs', ['server']) GENERATED_LOGS_TOTAL = Gauge('logs_generated_total_combined', 'Total combined logs generated from all servers') PROCESSED_LOGS = Gauge('logs_processed_total', 'Total number of processed logs') diff --git a/test-servers/performance_analyzer/performance_analyzer.py b/test-servers/performance_analyzer/performance_analyzer.py index 3880965..2c3b97b 100644 --- a/test-servers/performance_analyzer/performance_analyzer.py +++ b/test-servers/performance_analyzer/performance_analyzer.py @@ -4,7 +4,7 @@ import json from pymongo import MongoClient import requests -from datetime import datetime, timedelta +from datetime import datetime import pika from prometheus_client import start_http_server, Gauge, Counter, Histogram @@ -27,7 +27,6 @@ METRICS_PORT = int(os.getenv('METRICS_PORT', 8091)) PERFORMANCE_THRESHOLD_WARNING = float(os.getenv('PERFORMANCE_THRESHOLD_WARNING', 500)) PERFORMANCE_THRESHOLD_CRITICAL = float(os.getenv('PERFORMANCE_THRESHOLD_CRITICAL', 1000)) - PROCESSING_TIME_GAUGE = Gauge('log_processing_time_ms', 'Average log processing time in milliseconds', ['component']) PROCESSING_RATE = Gauge('log_processing_rate', 'Number of logs processed per second', ['component']) LOGS_TOTAL = Gauge('logs_processed_total_by_component', 'Total number of logs processed', ['component'])