diff --git a/dev/clickhouse-config.xml b/dev/clickhouse-config.xml
deleted file mode 100644
index 3bee44cf2b..0000000000
--- a/dev/clickhouse-config.xml
+++ /dev/null
@@ -1,1426 +0,0 @@
-
-
-
-
-
- information
- 1
-
-
-
-
-
-
-
- none
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 8123
-
-
- 9000
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 4096
-
-
- 3
-
-
-
-
- false
-
-
- /path/to/ssl_cert_file
- /path/to/ssl_key_file
-
-
- false
-
-
- /path/to/ssl_ca_cert_file
-
-
- deflate
-
-
- 2
-
-
- -1
- -1
-
-
- false
-
-
-
-
-
-
-
-
- 0
- 0
-
-
- 1000
-
-
- 0
-
-
-
- 10000
-
-
-
-
-
- 0.9
-
-
- 4194304
-
-
- 0
-
-
-
-
-
- 8589934592
-
-
- 5368709120
-
-
-
- 1000
-
-
- 134217728
-
-
- 10000
-
- false
-
-
- /srv/clickhouse/
-
-
-
-
- local
- /root/go/src/github.com/percona/pmm/dev/clickhouse-backups/
-
-
-
-
- backup
-
-
-
-
- /srv/clickhouse/tmp/
-
-
- 0
- 0
- 0
-
-
- sha256_password
-
-
- 12
-
-
-
-
-
-
-
-
- /srv/clickhouse/user_files/
-
-
-
-
-
-
-
-
-
-
-
-
- users.xml
-
-
-
- /srv/clickhouse/access/
-
-
-
-
-
-
-
- false
-
-
- false
-
-
- false
-
-
- false
-
-
- false
-
-
- 600
-
-
-
- default
-
-
-
-
-
-
-
-
-
-
-
- default
-
-
-
-
-
-
-
-
- true
-
-
- false
-
- ' | sed -e 's|.*>\(.*\)<.*|\1|')
- wget https://github.com/ClickHouse/clickhouse-jdbc-bridge/releases/download/v$PKG_VER/clickhouse-jdbc-bridge_$PKG_VER-1_all.deb
- apt install --no-install-recommends -f ./clickhouse-jdbc-bridge_$PKG_VER-1_all.deb
- clickhouse-jdbc-bridge &
-
- * [CentOS/RHEL]
- export MVN_URL=https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc-bridge/
- export PKG_VER=$(curl -sL $MVN_URL/maven-metadata.xml | grep '' | sed -e 's|.*>\(.*\)<.*|\1|')
- wget https://github.com/ClickHouse/clickhouse-jdbc-bridge/releases/download/v$PKG_VER/clickhouse-jdbc-bridge-$PKG_VER-1.noarch.rpm
- dnf install -y ./clickhouse-jdbc-bridge-$PKG_VER-1.noarch.rpm
- clickhouse-jdbc-bridge &
-
- Please refer to https://github.com/ClickHouse/clickhouse-jdbc-bridge#usage for more information.
- ]]>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- localhost
- 9000
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 3600
-
-
-
- 3600
-
-
- 60
-
-
-
-
-
-
-
-
- /metrics
- 9363
-
- true
-
- true
- true
-
-
-
-
-
- system
-
-
- toYYYYMM(event_date)
-
-
-
-
-
-
-
- 7500
-
- 1048576
-
- 8192
-
- 524288
-
- false
-
-
-
- event_date + INTERVAL 30 DAY DELETE
-
-
-
-
- system
-
-
- toYYYYMM(event_date)
- 7500
- 1048576
- 8192
- 524288
-
- false
- event_date + INTERVAL 30 DAY DELETE
-
-
-
-
- system
-
- toYYYYMM(event_date)
- 7500
- 1048576
- 8192
- 524288
- false
- event_date + INTERVAL 30 DAY DELETE
-
-
-
-
- system
-
- toYYYYMM(event_date)
- 7500
- event_date + INTERVAL 30 DAY DELETE
-
-
-
-
- system
-
- toYYYYMM(event_date)
- 7500
- 1048576
- 8192
- 524288
- false
- event_date + INTERVAL 30 DAY DELETE
-
-
-
-
-
-
- system
-
- 7500
- 1048576
- 8192
- 524288
- 1000
- false
- event_date + INTERVAL 30 DAY DELETE
-
-
-
-
- system
-
- 7000
- 1048576
- 8192
- 524288
- false
- event_date + INTERVAL 30 DAY DELETE
-
-
-
-
-
-
- engine MergeTree
- partition by toYYYYMM(finish_date)
- order by (finish_date, finish_time_us, trace_id)
-
- system
-
- 7500
- 1048576
- 8192
- 524288
- false
-
-
-
-
-
- system
-
-
-
- 1000
- 1024
- 1024
- 512
- true
- event_date + INTERVAL 30 DAY DELETE
-
-
-
-
-
-
-
- system
-
-
- toYYYYMM(event_date)
- 7500
- 1048576
- 8192
- 524288
- false
-
-
-
-
- system
-
-
- 7500
- 1048576
- 8192
- 524288
- false
- event_date
- event_date + INTERVAL 3 DAY
-
-
-
-
-
-
-
-
-
- *_dictionary.*ml
-
-
- *_function.*ml
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- /clickhouse/task_queue/ddl
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- click_cost
- any
-
- 0
- 3600
-
-
- 86400
- 60
-
-
-
- max
-
- 0
- 60
-
-
- 3600
- 300
-
-
- 86400
- 3600
-
-
-
-
-
- /srv/clickhouse/format_schemas/
-
-
-
-
- hide encrypt/decrypt arguments
- ((?:aes_)?(?:encrypt|decrypt)(?:_mysql)?)\s*\(\s*(?:'(?:\\'|.)+'|.*?)\s*\)
-
- \1(???)
-
-
-
-
-
-
-
-
-
- false
-
- false
-
-
- https://endpoint-disable
-
-
-
-
-
-
-
-
-
-
- 1073741824
- 1024
- 1048576
- 30000000
-
-
-
-
-
-
diff --git a/dev/otel/.env.example b/dev/otel/.env.example
new file mode 100644
index 0000000000..fc8c19fd5d
--- /dev/null
+++ b/dev/otel/.env.example
@@ -0,0 +1,6 @@
+# PMM OpenTelemetry Environment Configuration
+# Copy this file to .env and update the values below
+
+# Email configuration for Grafana SMTP notifications (required)
+GF_SMTP_FROM_ADDRESS=admin@yourcompany.com
+GF_SECURITY_ADMIN_EMAIL=security@yourcompany.com
diff --git a/dev/otel/README.md b/dev/otel/README.md
new file mode 100644
index 0000000000..440fc74e52
--- /dev/null
+++ b/dev/otel/README.md
@@ -0,0 +1,234 @@
+# Logging Functionality in Percona Monitoring and Management (PMM)
+
+## Goals
+While the overarching goal is to provide a robust logging system that allows users to monitor and troubleshoot their database environments effectively, the specific requirements for the logging functionality in PMM are as follows:
+- ability to collect logs from various sources and in various formats
+- integrate with PMM and provide an interface for viewing and querying the logs
+- support log retention policies to manage disk space
+- support alerting based on log events and notify users via available channels (e.g., email, slack, etc.)
+- support a developer-friendly architecture that allows for easy extension and modification
+- ability to scale with the growing needs of users and handle large volumes of log data efficiently
+- stay true to open source principles
+
+## Overview
+Percona Monitoring and Management (PMM) provides a robust logging system that allows users to monitor and troubleshoot their database environments effectively. This document outlines the logging functionality available in PMM, including how to configure, view, and manage logs.
+
+## Architecture
+PMM's logging architecture is designed to extract logs produced by various components, be they internal or external to PMM. The logs are collected, processed and then persisted to facilitate easy searching and filtering, making it easier for users to identify issues and monitor system health.
+
+### Architecture Diagrams
+These architecture diagrams illustrate the flow of logs from various sources through the OpenTelemetry Collector to the PMM server, where they are stored in ClickHouse and visualized in Grafana.
+
+
+
+
+
+
+### Logging Components
+PMM's logging functionality consists of several key components:
+- **PMM Server**: The central component that collects and stores logs from various sources.
+- **ClickHouse**: The underlying storage system where logs are stored, which can be local or remote.
+- **Open Telemetry (Otel) Collector**: Collector agents installed on systems that gather, process and send logs to PMM server.
+- **Grafana**: A user interface component that allows users to view and search logs persisted to PMM.
+- **ClickHouse Datasource**: A Grafana-authored ClickHouse datasource used to visualize and query logs.
+
+## Features
+- **Centralized Logging**: Collects logs from multiple sources into a single system for easier management and analysis.
+- **Log Levels**: Supports multiple log levels (debug, info, warn, error, fatal) to allow the user to filter through log severity.
+- **Log Retention**: Defines the log lifetime duration and automatically drops stale records to save on disk space.
+- **Scalability**: Built on ClickHouse, which can handle large volumes of log data efficiently.
+- **Integration with Grafana**: Provides a powerful visualization layer for logs, enabling users to create custom dashboards and alerts.
+- **Flexible Configuration**: Allows users to customize log collection, processing, and storage according to their needs.
+- **Alerting Capabilities**: Integrates with Grafana's alerting system to notify users of critical log events.
+- **Search and Filter**: Provides a user-friendly interface for searching and filtering logs, making it accessible even for non-technical users.
+- **OpenTelemetry Compliance**: Follows OpenTelemetry standards, ensuring compatibility with a wide range of logging tools and services.
+- **Dev friendly**: The architecture is designed to be easily extendable, allowing developers to add new log sources or modify existing configurations without significant overhead. The change-deploy-test cycle is streamlined, enabling rapid iteration and testing of new log sources or configurations in a local dev environment.
+
+
+## Database Schema
+PMM uses a structured schema to store logs in ClickHouse. The schema follows the OpenTelemetry recommendations for log data, ensuring compatibility and ease of use. You can find the schema definition in the PMM documentation or read more about it in the [OpenTelemetry documentation](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/exporter/clickhouseexporter/example/default_ddl/logs.sql).
+
+The database creation is managed by ClickHouse's `startup_scripts` functionality, which automatically initializes the database and tables when PMM boots up. The schema includes fields for log messages, timestamps, severity levels, and other relevant metadata.
+
+The table to store logs is named `logs`, and it is created in the `otel` database. The schema is automatically created by the collector when PMM server starts, ensuring that the necessary structure is in place for log storage.
+
+The schema is optimized for efficient querying and indexing, allowing users to quickly retrieve logs based on various criteria. The table is partitioned by date to improve performance and manageability. It heavily utilizes ClickHouse's compression codecs to reduce storage requirements while maintaining fast access to log data.
+
+The following SQL statement creates the `logs` table in the `otel` database:
+
+```plaintext
+CREATE TABLE otel.logs
+(
+ `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)),
+ `TimestampTime` DateTime DEFAULT toDateTime(Timestamp),
+ `TraceId` String CODEC(ZSTD(1)),
+ `SpanId` String CODEC(ZSTD(1)),
+ `TraceFlags` UInt8,
+ `SeverityText` LowCardinality(String) CODEC(ZSTD(1)),
+ `SeverityNumber` UInt8,
+ `ServiceName` LowCardinality(String) CODEC(ZSTD(1)),
+ `Body` String CODEC(ZSTD(1)),
+ `ResourceSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)),
+ `ResourceAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)),
+ `ScopeSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)),
+ `ScopeName` String CODEC(ZSTD(1)),
+ `ScopeVersion` LowCardinality(String) CODEC(ZSTD(1)),
+ `ScopeAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)),
+ `LogAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)),
+ INDEX idx_trace_id TraceId TYPE bloom_filter(0.001) GRANULARITY 1,
+ INDEX idx_res_attr_key mapKeys(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
+ INDEX idx_res_attr_value mapValues(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
+ INDEX idx_scope_attr_key mapKeys(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
+ INDEX idx_scope_attr_value mapValues(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
+ INDEX idx_log_attr_key mapKeys(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
+ INDEX idx_log_attr_value mapValues(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
+ INDEX idx_body Body TYPE tokenbf_v1(32768, 3, 0) GRANULARITY 8
+)
+ENGINE = MergeTree
+PARTITION BY toDate(TimestampTime)
+PRIMARY KEY (ServiceName, TimestampTime)
+ORDER BY (ServiceName, TimestampTime, Timestamp)
+TTL TimestampTime + toIntervalDay(3)
+SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1
+```
+
+## Logging Configuration
+
+PMM's logging configuration is managed through a [YAML file](/dev/otel/config.yml) mounted as `/etc/otel/config.yaml` into the `otel-collector` container. This file allows users to customize various aspects of logging, including log file locations, severity levels, output formats, in-flight transformations and more.
+
+To read about the configuration options, refer to the [OpenTelemetry Collector Configuration](https://opentelemetry.io/docs/collector/configuration/) documentation. The configuration file is structured in a way that allows users to define receivers, processors, exporters, and other components that control how logs are collected, processed, and stored.
+
+The list of available receivers, processors, exporters, etc. can be found in the [builder-config.yaml](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/cmd/otelcontribcol/builder-config.yaml).
+
+### Log Levels
+PMM supports several log levels, which can be set in the configuration file:
+- `debug`: Detailed information, typically of interest only when diagnosing problems.
+- `info`: General information about the system's operation.
+- `warn`: Events that may signal a potential issue.
+- `error`: Error events that might still allow the application to continue running.
+- `fatal`: Severe errors that cause premature termination of the application.
+
+It is a good practice to set the log level to `info` or `warn` in production environments to avoid excessive logging, while `debug` can be used during development or troubleshooting.
+
+For detailed information about how these levels map to OpenTelemetry severity numbers, see the [OpenTelemetry Severity Numbers](#opentelemetry-severity-numbers) section below.
+
+
+### Example Configuration
+```yaml
+logging:
+ level: info
+ file: /var/log/pmm-server.log
+```
+
+## Viewing Logs
+
+### Accessing Logs
+Logs can be accessed directly from the log file specified in the configuration. You can use standard command-line tools like `cat`, `less`, or `tail` to view the logs.
+```bash
+tail -f /var/log/pmm-server.log
+```
+
+## Log Management
+
+### Log Retention
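+To manage log retention, PMM relies on the ClickHouse table TTL: the `otel.logs` table is created with `TTL TimestampTime + toIntervalDay(3)` and `ttl_only_drop_parts = 1`, so parts containing only records older than three days are dropped automatically.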
+
+### Exporting Logs
+PMM allows exporting logs to external systems for long-term storage or further analysis. You can configure the Otel exporter to send logs to a remote ClickHouse instance or other supported backends.
+
+### Integration with Grafana
+PMM integrates with Grafana to provide a user-friendly interface for viewing and analyzing logs. You can create custom dashboards to visualize log data, set up alerts based on log events, and use Grafana's powerful search capabilities to filter logs.
+
+## Troubleshooting
+
+### Raw database queries
+You can run raw SQL queries against the ClickHouse database to retrieve logs. This is useful for advanced users who want to perform custom queries or analyze logs in detail. For example, you can run the following query to retrieve logs from the last 24 hours:
+
+```sql
+SELECT *
+FROM otel.logs
+WHERE TimestampTime >= now() - INTERVAL 1 DAY
+ORDER BY TimestampTime DESC
+```
+
+To connect to ClickHouse, you can run `docker exec -it pmm-server clickhouse-client --user=default --password=clickhouse -d otel`. This will leverage the ClickHouse client available from the PMM server container.
+
+### Common Issues
+- **Logs Not Appearing**: Ensure that the logging configuration is correctly set up and that the PMM server has permission to write to the specified log file.
+- **Log Levels Not Working**: Verify that the log level is correctly set in the configuration file and that otel-collector has been restarted after making changes.
+- **Log Rotation Issues**: Log rotation is the responsibility of the underlying system where the logs get sourced from. OpenTelemetry Collector does not handle log rotation itself. Ensure that your system's log rotation settings are correctly configured to manage log file sizes and retention.
+
+## Conclusion
+PMM's logging functionality provides a comprehensive solution for monitoring and managing database environments. By leveraging the structured logging architecture, users can effectively troubleshoot issues, monitor system health, and gain insights into their database operations. Proper configuration and management of logs are essential for maintaining an efficient and reliable monitoring system.
+
+## References
+- [PMM Documentation](https://www.percona.com/doc/percona-monitoring-and-management/index.html)
+- [OpenTelemetry Documentation](https://opentelemetry.io/docs/)
+- [ClickHouse Documentation](https://clickhouse.com/docs/en/)
+- [Grafana Documentation](https://grafana.com/docs/)
+
+## Additional Notes
+
+### OpenTelemetry Severity Numbers
+
+PMM follows the **OpenTelemetry specification** for log severity levels, which uses the syslog RFC 5424 standard. The severity numbers may appear sparse, but this is intentional and provides several benefits:
+
+#### Severity Level Mapping
+
+| Level Name | Severity Number Range | Actual Number Used | Syslog Level | Description |
+|------------|----------------------|-------------------|--------------|-------------|
+| `TRACE` | 1-4 | - | - | Finest-grained debug info |
+| `DEBUG` | 5-8 | 5 | Debug (7) | Debug information |
+| `INFO` | 9-12 | 9 | Informational (6) | General information |
+| `WARN` | 13-16 | 13 | Warning (4) | Warning conditions |
+| `ERROR` | 17-20 | 17 | Error (3) | Error conditions |
+| `FATAL` | 21-24 | 21 | Critical/Alert/Emergency (2/1/0) | System unusable |
+
+#### Why Sparse Numbering?
+
+1. **Granularity**: Each severity level gets a range of 4 numbers, allowing for sub-levels (e.g., INFO1=9, INFO2=10, INFO3=11, INFO4=12)
+2. **Backward Compatibility**: Matches the well-established syslog standard used for decades
+3. **Future Extensions**: Leaves room for new severity levels between existing ones
+4. **Industry Standard**: Compatible with most logging systems (rsyslog, journald, etc.)
+5. **Interoperability**: Works seamlessly with monitoring tools like Grafana and Prometheus
+
+#### HTTP Status Code to Severity Mapping
+
+In PMM's web server log processing, HTTP status codes are automatically mapped to appropriate severity levels:
+
+- **2xx and 3xx Status Codes** → `INFO` (SeverityNumber: 9) - Successful and redirected requests (e.g., 200, 201, 202, 301, 303, etc.)
+- **4xx Status Codes** → `WARN` (SeverityNumber: 13) - Client errors (e.g., 404, 403)
+- **5xx Status Codes** → `ERROR` (SeverityNumber: 17) - Server errors (e.g., 500, 502, 503)
+
+This mapping ensures that log severity accurately reflects the nature of each request and helps with monitoring and alerting.
+
+### Comparison with Other Systems
+Previously, we have explored other logging systems like Grafana Loki and VictoriaLogs. While these systems offer similar functionalities, the current solution using OpenTelemetry Collector and ClickHouse provides several advantages:
+
+- **Disk Usage**: ClickHouse's columnar storage is very efficient in storing large volumes of log data and consumes minimal disk space compared to other systems.
+- **Performance**: ClickHouse is optimized for high-performance queries, making it suitable for real-time log search and analysis.
+- **Scalability**: The architecture is designed to handle high log volumes efficiently, making it suitable for large-scale deployments.
+- **Flexibility**: OpenTelemetry's modular architecture allows for easy integration with various log sources and sinks, enabling users to customize their logging setup according to their needs.
+- **Rich Ecosystem**: The combination of OpenTelemetry, ClickHouse, and Grafana provides a powerful ecosystem for log management, visualization, and alerting, leveraging the strengths of each component.
+
+#### Comparison Table
+It's important to note that all solutions have their own strengths and weaknesses, and the choice of logging system should be based on specific use cases and requirements. Below is a comparison table summarizing the key features of the OpenTelemetry-based logging solution against Grafana Loki and VictoriaLogs:
+
+| Feature | PMM (OpenTelemetry + ClickHouse) | Grafana Loki | VictoriaLogs |
+|------------------------|----------------------------------|--------------|--------------|
+| Disk Usage | Low | High | Moderate |
+| Performance | High | Low | Moderate |
+| Scalability | High (designed for large volumes)| Moderate | Moderate |
+| HA Support | Yes (ClickHouse replication) | Yes | Yes |
+| Extensibility | High | Moderate | Moderate |
+| Ecosystem | Rich | Moderate | Low |
+| Log Retention | Yes (via TTL in ClickHouse) | Yes | Yes |
+| Log Visualization | Yes | Yes | Yes |
+| Log Search & Filtering | Yes | Yes | Yes |
+| Alerting Integration | Yes | Yes | Yes |
+| Query language | SQL (ClickHouse) | LogQL | LogsQL |
+| Licensing | Apache 2.0 | AGPLv3 | Apache 2.0 |
+| External Dependencies | OpenTelemetry Collector | Loki, Log collector | VictoriaLogs, Log collector |
+| Community Support | Strong | Strong | Moderate |
+| Documentation | Comprehensive | Moderate | Good |
+| Requires Build | No | Yes | Yes |
diff --git a/dev/otel/SETUP.md b/dev/otel/SETUP.md
new file mode 100644
index 0000000000..866927fc22
--- /dev/null
+++ b/dev/otel/SETUP.md
@@ -0,0 +1,252 @@
+# OpenTelemetry Logging Setup Instructions (PoC)
+
+## Project Structure
+This project has the following directory structure:
+
+```
+├── clickhouse
+│ ├── config.d
+│ │ └── config-override.xml
+│ ├── client-config.xml
+│ ├── config-override.xml
+│ └── test.sql
+├── doc
+│ ├── otel-collector.png
+│ ├── password-change-failure-alert.png
+│ └── password-change-success-alert.png
+├── grafana
+│ ├── alert-rules.yml
+│ ├── change-admin-password
+│ ├── clickhouse-datasource.yml
+│ ├── contact-points.yml
+│ ├── datasources.yml
+│ └── notification-policies.yml
+├── nginx
+│ └── nginx.conf
+├── test
+│ ├── clickhouse-test.sh
+│ └── setup-test.sh
+├── .env.example
+├── config.yml
+├── docker-compose.yml
+├── README.md
+└── SETUP.md
+```
+
+## Setup Steps
+
+### 1. Clone the Project
+```bash
+git clone https://github.com/percona/pmm.git
+cd pmm/dev/otel
+```
+
+### 2. Configure Environment
+```bash
+# Copy the example environment file
+cp .env.example .env
+
+# Edit the .env file with your settings
+vim .env # or use your preferred editor
+```
+
+**Required Environment Variables:**
+- `GF_SMTP_FROM_ADDRESS`: Email address for sending alert notifications
+- `GF_SECURITY_ADMIN_EMAIL`: Admin email address for Grafana (for sending user invites, etc.)
+
+**Example .env configuration:**
+```bash
+# Email configuration for Grafana SMTP notifications (required)
+GF_SMTP_FROM_ADDRESS=admin@yourcompany.com
+GF_SECURITY_ADMIN_EMAIL=security@yourcompany.com
+```
+
+### 3. Update Email Addresses for Alerts
+Edit the contact points configuration to use your email addresses:
+```bash
+# Edit the contact points file
+vim grafana/contact-points.yml
+
+# Update the addresses with your emails:
+# addresses: "admin@yourcompany.com;security@yourcompany.com"
+```
+
+### 4. Start the Environment
+```bash
+# Start all services
+docker compose up -d
+
+# Check service status
+docker compose ps
+
+# View logs
+docker compose logs -f cert-generator
+docker compose logs -f otel-collector
+docker compose logs -f pmm-server
+```
+
+### 5. Generate Logs
+PMM generates quite a few logs during user interaction, so after a few moments of interaction, you can start exploring the logs. However, you may choose to generate a few log lines manually, for example Nginx logs, for testing purposes:
+
+```bash
+# Generate various HTTP responses
+curl -k -u admin:admin https://localhost/ # 200 OK
+curl -k -u admin:admin https://localhost/graph/api/users # 200 OK
+curl -k https://localhost/graph/api/users/1 # 401 Unauthorized
+curl -k -u admin:admin https://localhost/graph/nonexistent # 404 Not Found
+```
+
+### 6. Access ClickHouse
+```bash
+# Connect to ClickHouse CLI
+docker exec -it pmm-server clickhouse-client --user=default --password=clickhouse --database=otel
+```
+
+### 7. Run Test Queries
+Execute the test queries from the `clickhouse/test.sql` file in the ClickHouse client.
+
+### 8. Test Security Alerts
+To test the admin password change alert, you can change the admin password in Grafana. This will trigger an alert if configured correctly:
+
+```bash
+# Use the command line tool:
+docker exec -it pmm-server change-admin-password "<new-password>"
+```
+
+**Expected behavior:**
+- The alert should trigger within 1 minute of password change
+- You should receive an email notification delivered to the configured addresses
+- Check MailHog UI at http://localhost:8025 to see emails sent by triggered alerts
+
+Likewise, to test the admin password change failure alert, you can pass an empty password or one that does not meet the password requirements:
+
+```bash
+# Use the command line tool with an empty password
+docker exec -it pmm-server change-admin-password ""
+```
+
+### 9. Monitor Alert System
+```bash
+# Check Grafana alerting logs
+docker exec -it pmm-server bash
+grep "ngalert" /srv/logs/grafana.log
+
+# View alert rules in Grafana UI
+# Go to https://localhost:443
+# Navigate to Alerting > Alert Rules
+
+# Check contact points and notification policies
+# Navigate to Alerting > Contact Points
+# Navigate to Alerting > Notification Policies
+```
+
+### 10. Adding more Logs
+You can add more log sources to PMM server by modifying the `config.yml` file. If you want to add an external log source, you can configure the OpenTelemetry Collector to scrape logs from that source. To read more, refer to the [OpenTelemetry Collector documentation](https://opentelemetry.io/docs/collector/configuration).
+
+### 11. Changing Log Retention
+To change the log retention period, modify the ClickHouse table TTL settings in the `clickhouse/config.d/config-override.xml` file:
+```xml
+
+
+
+ 1
+
+
+
+
+
+
+
+
+
+
+```
+
+Alternatively, you can also change the TTL by running the following query using the ClickHouse client:
+```sql
+ALTER TABLE otel.logs MODIFY TTL TimestampTime + INTERVAL 7 DAY;
+```
+
+### 12. Creating Dashboards
+You can create custom dashboards in PMM to visualize the logs. Use the `ClickHouse-Logs` data source to query the `otel.logs` table and create panels for different log types, such as Nginx access logs, Grafana logs, pmm-managed logs, pmm-agent logs, and more.
+
+#### Example Query for Log Linecount by Service
+Panel Description: Log Linecount by Service
+```sql
+SELECT ServiceName AS service, COUNT(*) as count FROM "otel"."logs" WHERE ( Timestamp >= $__fromTime AND Timestamp <= $__toTime ) GROUP BY service ORDER BY count DESC;
+```
+
+#### Example Query for Nginx Status by Severity
+Panel Description: Nginx Status by Severity
+```sql
+SELECT
+CASE WHEN LogAttributes['status'] = '' THEN 'N/A' ELSE LogAttributes['status'] END AS mapping, COUNT(*) AS count
+FROM otel.logs
+WHERE ( Timestamp >= $__fromTime AND Timestamp <= $__toTime ) AND ServiceName = 'nginx'
+GROUP BY LogAttributes['status'], SeverityNumber
+ORDER BY LogAttributes['status']
+```
+
+#### Example Query for General Logs
+Panel Description: General Logs
+```sql
+SELECT Timestamp as "timestamp", Body as "body", SeverityText as "level", LogAttributes as "labels", TraceId as "traceID" FROM "otel"."logs" WHERE ( timestamp >= $__fromTime AND timestamp <= $__toTime ) ORDER BY timestamp DESC LIMIT 1000
+```
+
+## Troubleshooting
+
+### Check Project Setup
+```bash
+cd test
+bash setup-test.sh
+```
+
+### ClickHouse Data Verification
+```bash
+# Check table exists and has data
+docker exec -it pmm-server clickhouse-client --user=default --password=clickhouse --database=otel -q "SELECT count() FROM otel.logs"
+
+# View most recent logs
+docker exec -it pmm-server clickhouse-client --user=default --password=clickhouse --database=otel -q "SELECT * FROM otel.logs ORDER BY Timestamp DESC LIMIT 10"
+```
+
+## Services and Ports
+
+- **PMM**: https://localhost:443
+- **ClickHouse Native**: localhost:9000
+- **OpenTelemetry OTLP gRPC**: localhost:4317
+- **OpenTelemetry OTLP HTTP**: localhost:4318
+- **MailHog Web UI**: http://localhost:8025 (for testing email notifications)
+
+## Alert System Configuration
+
+This PoC includes a complete security alerting system that monitors:
+
+### Security Alerts:
+- **Admin Password Changes**: Detects when admin password is successfully reset
+- **Failed Password Attempts**: Detects failed admin password change attempts
+
+### Alert Configuration Files:
+- `grafana/alert-rules.yml`: Defines the alert rules and queries
+- `grafana/contact-points.yml`: Email notification configuration
+- `grafana/notification-policies.yml`: Alert routing and grouping policies
+- `grafana/datasources.yml`: ClickHouse data source for log queries
+
+### Notification Flow:
+1. OpenTelemetry Collector ingests Grafana logs
+2. Logs are stored in ClickHouse `otel.logs` table
+3. Grafana alert rules query ClickHouse for security events
+4. Alerts are routed via notification policies
+5. Email notifications are sent via configured SMTP (MailHog for testing)
+
+## Cleanup
+```bash
+# Stop and remove all containers (keep the data)
+docker compose down
+
+# Stop and remove all containers and volumes (this will delete all data)
+docker compose down -v
+
+# Remove images
+docker compose down --rmi all
+```
\ No newline at end of file
diff --git a/dev/otel/clickhouse/client-config.xml b/dev/otel/clickhouse/client-config.xml
new file mode 100644
index 0000000000..61ba6f3f8b
--- /dev/null
+++ b/dev/otel/clickhouse/client-config.xml
@@ -0,0 +1,25 @@
+
+
+
+
+ true
+ true
+ sslv2,sslv3
+ true
+
+
+
+ RejectCertificateHandler
+
+
+
+
+
+ {display_name} :)
+ {display_name} \e[1;32m:)\e[0m
+ {display_name} \e[1;31m:)\e[0m
+
+
+ clickhouse
+ otel
+
diff --git a/dev/otel/clickhouse/config.d/config-override.xml b/dev/otel/clickhouse/config.d/config-override.xml
new file mode 100644
index 0000000000..5d767c8ef4
--- /dev/null
+++ b/dev/otel/clickhouse/config.d/config-override.xml
@@ -0,0 +1,13 @@
+
+
+
+ 0.0.0.0
+
+
+
+ false
+
+ CREATE DATABASE IF NOT EXISTS otel
+
+
+
diff --git a/dev/otel/clickhouse/test.sql b/dev/otel/clickhouse/test.sql
new file mode 100644
index 0000000000..f1fb2902b2
--- /dev/null
+++ b/dev/otel/clickhouse/test.sql
@@ -0,0 +1,162 @@
+-- Test queries for the OpenTelemetry ClickHouse exporter auto-generated schema
+
+-- Sample queries for the OpenTelemetry ClickHouse exporter auto-generated schema
+
+-- 1. Basic log query with service filtering
+SELECT
+ Timestamp,
+ ServiceName,
+ SeverityText,
+ SeverityNumber,
+ LogAttributes['status'] as status,
+ LogAttributes['request_method'] as method,
+ LogAttributes['request'] as uri,
+ Body
+FROM otel.logs
+WHERE ServiceName = 'nginx'
+ORDER BY Timestamp DESC
+LIMIT 10;
+
+-- 2. Error analysis by severity level
+SELECT
+ Timestamp,
+ ServiceName,
+ SeverityText,
+ SeverityNumber,
+ LogAttributes['status'] as status,
+ LogAttributes['remote_addr'] as client_ip,
+ LogAttributes['request'] as request_uri
+FROM otel.logs
+WHERE ServiceName = 'nginx' AND SeverityText IN ('error', 'fatal', 'warn')
+ORDER BY Timestamp DESC
+LIMIT 100;
+
+-- 3. Service overview by severity
+SELECT
+ ServiceName,
+ ResourceAttributes['service.version'] as service_version,
+ ResourceAttributes['environment'] as environment,
+ SeverityText,
+ count() as log_count,
+ min(Timestamp) as first_log,
+ max(Timestamp) as last_log
+FROM otel.logs
+GROUP BY ServiceName, service_version, environment, SeverityText
+ORDER BY log_count DESC;
+
+-- 4. Log volume over time (last 24 hours, grouped by hour)
+SELECT
+ toStartOfHour(Timestamp) as hour,
+ SeverityText,
+ count() as log_count
+FROM otel.logs
+WHERE Timestamp >= now() - INTERVAL 24 HOUR
+GROUP BY hour, SeverityText
+ORDER BY hour DESC, SeverityText;
+
+-- 5. Real-time monitoring query using severity
+SELECT
+ toStartOfHour(Timestamp) as hour,
+ ServiceName,
+ countIf(SeverityText = 'info') as info_logs,
+ countIf(SeverityText = 'warn') as warn_logs,
+ countIf(SeverityText = 'error') as error_logs,
+ countIf(SeverityText = 'fatal') as fatal_logs,
+ countIf(toUInt16OrZero(LogAttributes['status']) >= 400) as http_errors,
+ avg(toFloat64OrZero(LogAttributes['request_time'])) as avg_response_time
+FROM otel.logs
+WHERE Timestamp >= now() - INTERVAL 1 HOUR
+GROUP BY hour, ServiceName
+ORDER BY hour DESC;
+
+-- 6. Error rates (4xx and 5xx responses); the percentage is relative to all nginx requests with a valid status in the last 24 hours
+SELECT
+    toStartOfHour(Timestamp) as hour,
+    CASE
+        WHEN toUInt16OrZero(LogAttributes['status']) BETWEEN 400 AND 499 THEN '4xx_errors'
+        WHEN toUInt16OrZero(LogAttributes['status']) BETWEEN 500 AND 599 THEN '5xx_errors'
+        ELSE 'other'
+    END as error_category,
+    count() as error_count,
+    round((count() * 100.0) / (
+        SELECT count() FROM otel.logs
+        WHERE LogAttributes['status'] != ''
+            AND toUInt16OrZero(LogAttributes['status']) > 0
+            AND Timestamp >= now() - INTERVAL 24 HOUR
+            AND ServiceName = 'nginx'
+    ), 2) as error_percentage
+FROM otel.logs
+WHERE LogAttributes['status'] != ''
+    AND toUInt16OrZero(LogAttributes['status']) >= 400
+    AND Timestamp >= now() - INTERVAL 24 HOUR
+    AND ServiceName = 'nginx'
+GROUP BY hour, error_category
+ORDER BY hour DESC, error_category;
+
+-- 7. Successful request rates (2xx and 3xx responses)
+SELECT
+ toStartOfHour(Timestamp) as hour,
+ CASE
+ WHEN toUInt16OrZero(LogAttributes['status']) BETWEEN 200 AND 299 THEN '2xx_success'
+ WHEN toUInt16OrZero(LogAttributes['status']) BETWEEN 300 AND 399 THEN '3xx_redirect'
+ END as success_category,
+ count() as success_count,
+ round((count() * 100.0) / (
+ SELECT count()
+ FROM otel.logs
+ WHERE LogAttributes['status'] != ''
+ AND toUInt16OrZero(LogAttributes['status']) > 0
+ AND Timestamp >= now() - INTERVAL 24 HOUR
+ AND ServiceName = 'nginx'
+ ), 2) as success_percentage
+FROM otel.logs
+WHERE LogAttributes['status'] != ''
+ AND toUInt16OrZero(LogAttributes['status']) BETWEEN 200 AND 399
+ AND Timestamp >= now() - INTERVAL 24 HOUR
+ AND ServiceName = 'nginx'
+GROUP BY hour, success_category
+ORDER BY hour DESC, success_category;
+
+-- 11. Overall request statistics
+SELECT
+ count() as total_requests,
+ countIf(toUInt16OrZero(LogAttributes['status']) BETWEEN 200 AND 299) as success_2xx,
+ countIf(toUInt16OrZero(LogAttributes['status']) BETWEEN 300 AND 399) as redirect_3xx,
+ countIf(toUInt16OrZero(LogAttributes['status']) BETWEEN 400 AND 499) as client_error_4xx,
+ countIf(toUInt16OrZero(LogAttributes['status']) BETWEEN 500 AND 599) as server_error_5xx,
+ round(avg(toFloat64OrZero(LogAttributes['request_time'])), 3) as avg_response_time,
+ round(sum(toUInt64OrZero(LogAttributes['body_bytes_sent'])) / 1024 / 1024, 2) as total_mb_sent
+FROM otel.logs
+WHERE LogAttributes['status'] != ''
+ AND toUInt16OrZero(LogAttributes['status']) > 0
+ AND ServiceName = 'nginx'
+ AND Timestamp >= now() - INTERVAL 24 HOUR;
+
+-- 12. Top requested URIs
+SELECT
+ LogAttributes['request'] as request_uri,
+ count() as request_count,
+ countIf(toUInt16OrZero(LogAttributes['status']) BETWEEN 200 AND 299) as success_count,
+ countIf(toUInt16OrZero(LogAttributes['status']) >= 400) as error_count,
+ round(avg(toFloat64OrZero(LogAttributes['request_time'])), 3) as avg_response_time
+FROM otel.logs
+WHERE LogAttributes['request'] != ''
+ AND Timestamp >= now() - INTERVAL 24 HOUR
+ AND ServiceName = 'nginx'
+GROUP BY request_uri
+ORDER BY request_count DESC
+LIMIT 10;
+
+-- 14. Error log analysis (severity text is matched in lowercase, as in query 2 and the Grafana alert rules)
+SELECT
+    toStartOfHour(Timestamp) as hour,
+    SeverityText,
+    LogAttributes['log_level'] as log_level,
+    count() as error_count,
+    groupArray(LogAttributes['message']) as sample_messages
+FROM otel.logs
+WHERE SeverityText IN ('error', 'fatal', 'warn')
+    AND Timestamp >= now() - INTERVAL 24 HOUR
+    AND ServiceName = 'nginx'
+GROUP BY hour, SeverityText, log_level
+ORDER BY hour DESC, SeverityText;
diff --git a/dev/otel/config.yml b/dev/otel/config.yml
new file mode 100644
index 0000000000..261aa42c10
--- /dev/null
+++ b/dev/otel/config.yml
@@ -0,0 +1,238 @@
+receivers:
+ # File log receiver for Nginx access logs
+ filelog/nginx_access:
+ include: [/srv/logs/nginx.log]
+ operators:
+ - type: key_value_parser
+ parse_from: body
+ parse_to: attributes
+ pair_delimiter: " "
+ key_value_delimiter: "="
+ - type: time_parser
+ parse_from: attributes.time
+ layout: '2006-01-02T15:04:05-07:00'
+ layout_type: gotime
+ # Add a severity level based on HTTP status code
+ - type: add
+ field: attributes.level
+ value: 'EXPR(int(attributes.status) >= 500 ? "error" : (int(attributes.status) >= 400 ? "warn" : "info"))'
+ # Parse the severity from the log level we just set
+ - type: severity_parser
+ parse_from: attributes.level
+ preset: none
+ mapping:
+ info: info
+ warn: warn
+ error: error
+
+  # File log receiver for Nginx error logs (regex is anchored with $ so the lazy message group cannot match empty and starve the optional fields)
+  filelog/nginx_error:
+    include: [/srv/logs/nginx-error.log]
+    operators:
+      - type: regex_parser
+        regex: '^(?P<timestamp>\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}) \[(?P<level>\w+)\] (?P<pid>\d+)#(?P<tid>\d+): (?P<message>.*?)(?:, client: (?P<client>[^,]+))?(?:, server: (?P<server>[^,]+))?(?:, request: "(?P<request>[^"]*)")?(?:, upstream: "(?P<upstream>[^"]*)")?(?:, host: "(?P<host>[^"]*)")?(?:, referrer: "(?P<referrer>[^"]*)")?$'
+        parse_from: body
+        parse_to: attributes
+      - type: time_parser
+        parse_from: attributes.timestamp
+        layout: '2006/01/02 15:04:05'
+        layout_type: gotime
+      - type: severity_parser
+        parse_from: attributes.level
+        preset: none
+        mapping:
+          debug: debug
+          info: info
+          notice: info
+          warn: warn
+          error: error
+          crit: fatal
+          alert: fatal
+          emerg: fatal
+
+ # File log receiver for Grafana logs
+ filelog/grafana:
+ include: [/srv/logs/grafana.log]
+ operators:
+ - type: key_value_parser
+ parse_from: body
+ parse_to: attributes
+ pair_delimiter: " "
+ key_value_delimiter: "="
+ - type: time_parser
+ parse_from: attributes.t
+ layout: '2006-01-02T15:04:05.000000000Z07:00'
+ layout_type: gotime
+ - type: severity_parser
+ parse_from: attributes.level
+ preset: none
+ mapping:
+ debug: debug
+ info: info
+ warn: warn
+ error: error
+ # Move the parsed message to body for consistency
+ - type: move
+ from: attributes.msg
+ to: body
+
+ # File log receiver for PMM managed logs
+ filelog/pmm_managed:
+ include: [/srv/logs/pmm-managed.log]
+ operators:
+ - type: key_value_parser
+ parse_from: body
+ parse_to: attributes
+ pair_delimiter: " "
+ key_value_delimiter: "="
+ - type: time_parser
+ parse_from: attributes.time
+ layout: '2006-01-02T15:04:05.000Z07:00'
+ layout_type: gotime
+ - type: severity_parser
+ parse_from: attributes.level
+ preset: none
+ mapping:
+ debug: debug
+ info: info
+ warning: warn
+ warn: warn
+ error: error
+ fatal: fatal
+ panic: fatal
+ # Move the parsed message to body for consistency
+ - type: move
+ from: attributes.msg
+ to: body
+
+ # File log receiver for PMM agent logs
+ filelog/pmm_agent:
+ include: [/srv/logs/pmm-agent.log]
+ operators:
+ - type: key_value_parser
+ parse_from: body
+ parse_to: attributes
+ pair_delimiter: " "
+ key_value_delimiter: "="
+ - type: time_parser
+ parse_from: attributes.time
+ layout: '2006-01-02T15:04:05.000Z07:00'
+ layout_type: gotime
+ - type: severity_parser
+ parse_from: attributes.level
+ preset: none
+ mapping:
+ debug: debug
+ info: info
+ warning: warn
+ warn: warn
+ error: error
+ fatal: fatal
+ panic: fatal
+ # Move the parsed message to body for consistency
+ - type: move
+ from: attributes.msg
+ to: body
+
+ # File log receiver for PostgreSQL logs
+ filelog/postgres:
+ include: [/logs/postgresql/postgresql.log]
+ start_at: beginning
+ operators:
+ - type: regex_parser
+        regex: '^time="(?P<timestamp>[^\"]+)" process=(?P<process>\d+) lineno=(?P<lineno>\d+) transaction=(?P<transaction>\w+) db=(?P<db>[^ ]*) user=(?P<user>[^ ]*) app=(?P<app>[^ ]*) client=(?P<client>[^ ]*) (?P<level>\w+): (?P<msg>.*)$'
+ parse_from: body
+ parse_to: attributes
+ - type: time_parser
+ parse_from: attributes.timestamp
+ layout: '2006-01-02 15:04:05.000 MST'
+ layout_type: gotime
+ - type: severity_parser
+ parse_from: attributes.level
+ preset: none
+ mapping:
+ debug: debug
+ info: info
+ notice: info
+ warning: warn
+ warn: warn
+ error: error
+ fatal: fatal
+ panic: fatal
+ LOG: info
+ - type: move
+ from: attributes.msg
+ to: body
+
+processors:
+ # Batch processor to optimize performance
+ batch:
+ timeout: 1s
+ send_batch_size: 1024
+ send_batch_max_size: 2048
+
+  # Transform processor to handle log.file.name attribute and service names
+  transform:
+    error_mode: ignore
+    log_statements:
+      # Copy log.file.name to log_file and remove original
+      - set(log.attributes["log_file"], log.attributes["log.file.name"]) where log.attributes["log.file.name"] != nil
+      - delete_key(log.attributes, "log.file.name") where log.attributes["log.file.name"] != nil
+      - delete_key(log.attributes, "level") where log.attributes["level"] != nil
+
+      # Set service attributes based on log file (nginx-error.log is the file tailed by filelog/nginx_error, so it maps to nginx as well)
+      - set(resource.attributes["service.name"], "nginx") where log.attributes["log_file"] == "nginx.log" or log.attributes["log_file"] == "nginx-access.log" or log.attributes["log_file"] == "nginx-error.log"
+      - set(resource.attributes["service.version"], "1.20.1") where log.attributes["log_file"] == "nginx.log" or log.attributes["log_file"] == "nginx-access.log" or log.attributes["log_file"] == "nginx-error.log"
+
+      - set(resource.attributes["service.name"], "grafana") where log.attributes["log_file"] == "grafana.log"
+      - set(resource.attributes["service.version"], "11.6.1") where log.attributes["log_file"] == "grafana.log"
+
+      - set(resource.attributes["service.name"], "pmm-managed") where log.attributes["log_file"] == "pmm-managed.log"
+      - set(resource.attributes["service.version"], "3.3.1") where log.attributes["log_file"] == "pmm-managed.log"
+
+      - set(resource.attributes["service.name"], "pmm-agent") where log.attributes["log_file"] == "pmm-agent.log"
+      - set(resource.attributes["service.version"], "2.42.0") where log.attributes["log_file"] == "pmm-agent.log"
+
+      - set(resource.attributes["service.name"], "postgres") where log.attributes["log_file"] == "postgresql.log"
+      - set(resource.attributes["service.version"], "17.2-1") where log.attributes["log_file"] == "postgresql.log"
+
+ # Memory limiter to prevent OOM
+ memory_limiter:
+ limit_mib: 512
+ check_interval: 1s
+
+exporters:
+ # ClickHouse exporter for logs
+ clickhouse:
+ endpoint: tcp://pmm-server:9000
+ database: otel
+ username: default
+ password: clickhouse
+ logs_table_name: logs
+ ttl: 72h
+ create_schema: true
+ timeout: 5s
+ retry_on_failure:
+ enabled: true
+ initial_interval: 5s
+ max_interval: 30s
+ max_elapsed_time: 300s
+
+ # Debug exporter for troubleshooting
+ debug:
+ verbosity: basic
+
+service:
+ pipelines:
+ logs:
+ receivers: [filelog/nginx_access, filelog/nginx_error, filelog/grafana, filelog/pmm_managed, filelog/pmm_agent, filelog/postgres]
+ processors: [memory_limiter, transform, batch]
+ exporters: [clickhouse, debug]
+
+ extensions: []
+
+ telemetry:
+ logs:
+ level: "info"
+ metrics:
+ level: basic
diff --git a/dev/otel/doc/Logging functionality of PMM - Loki and VictoriaLogs.jpg b/dev/otel/doc/Logging functionality of PMM - Loki and VictoriaLogs.jpg
new file mode 100644
index 0000000000..f34e04ec18
Binary files /dev/null and b/dev/otel/doc/Logging functionality of PMM - Loki and VictoriaLogs.jpg differ
diff --git a/dev/otel/doc/PMM Log Collection Diagram.jpg b/dev/otel/doc/PMM Log Collection Diagram.jpg
new file mode 100644
index 0000000000..9737ad307e
Binary files /dev/null and b/dev/otel/doc/PMM Log Collection Diagram.jpg differ
diff --git a/dev/otel/doc/PMM Logging Diagram - External Storage.jpg b/dev/otel/doc/PMM Logging Diagram - External Storage.jpg
new file mode 100644
index 0000000000..bd3cd04945
Binary files /dev/null and b/dev/otel/doc/PMM Logging Diagram - External Storage.jpg differ
diff --git a/dev/otel/doc/PMM Logging Diagram - Internal Storage.jpg b/dev/otel/doc/PMM Logging Diagram - Internal Storage.jpg
new file mode 100644
index 0000000000..989fe22870
Binary files /dev/null and b/dev/otel/doc/PMM Logging Diagram - Internal Storage.jpg differ
diff --git a/dev/otel/doc/otel-collector.png b/dev/otel/doc/otel-collector.png
new file mode 100644
index 0000000000..02d3cdd65d
Binary files /dev/null and b/dev/otel/doc/otel-collector.png differ
diff --git a/dev/otel/doc/password-change-faliure-alert.png b/dev/otel/doc/password-change-faliure-alert.png
new file mode 100644
index 0000000000..7ef1decc78
Binary files /dev/null and b/dev/otel/doc/password-change-faliure-alert.png differ
diff --git a/dev/otel/doc/password-change-success-alert.png b/dev/otel/doc/password-change-success-alert.png
new file mode 100644
index 0000000000..53467e1d7a
Binary files /dev/null and b/dev/otel/doc/password-change-success-alert.png differ
diff --git a/dev/otel/docker-compose.yml b/dev/otel/docker-compose.yml
new file mode 100644
index 0000000000..64228d508d
--- /dev/null
+++ b/dev/otel/docker-compose.yml
@@ -0,0 +1,103 @@
+services:
+ # PMM Server with built-in ClickHouse
+ pmm-server:
+ # image: percona/pmm-server:3
+ image: perconalab/pmm-server:3.8.0-rc
+ platform: linux/amd64
+ container_name: pmm-server
+ hostname: pmm-server
+ restart: always
+ environment:
+ - GF_ANALYTICS_ENABLED=0
+ - GF_ANALYTICS_REPORTING_ENABLED=0
+ - GF_REPORTING_ENABLED=0
+ - GF_NEWS_NEWS_FEED_ENABLED=0
+ - GF_SECURITY_DISABLE_GRAVATAR=1
+ - GF_SECURITY_ADMIN_EMAIL=${GF_SECURITY_ADMIN_EMAIL:?GF_SECURITY_ADMIN_EMAIL must be set}
+ - GF_SMTP_ENABLED=1
+ - GF_SMTP_HOST=mailpit:1025
+ - GF_SMTP_FROM_NAME=Percona
+ - GF_SMTP_FROM_ADDRESS=${GF_SMTP_FROM_ADDRESS:?GF_SMTP_FROM_ADDRESS must be set}
+ - GF_LOG_FILTERS=ngalert.api:error
+ ports:
+ - "443:8443"
+ - "9000:9000"
+ volumes:
+ - pmm-data:/srv
+ - ./grafana/change-admin-password:/usr/local/sbin/change-admin-password:ro
+ - ./grafana/datasources.yml:/usr/share/grafana/conf/provisioning/datasources/otel-ds.yml:ro
+ - ./grafana/alert-rules.yml:/usr/share/grafana/conf/provisioning/alerting/otel-alert-rules.yml:ro
+ - ./grafana/contact-points.yml:/usr/share/grafana/conf/provisioning/alerting/otel-contactpoints.yml:ro
+ - ./grafana/notification-policies.yml:/usr/share/grafana/conf/provisioning/alerting/otel-policies.yml:ro
+ - ./clickhouse/config.d/config-override.xml:/etc/clickhouse-server/config.d/config-override.xml:ro
+ - ./clickhouse/client-config.xml:/etc/clickhouse-client/config.xml:ro
+ networks:
+ - otel
+
+ # OpenTelemetry Collector
+ otel-collector:
+ image: otel/opentelemetry-collector-contrib:latest
+ container_name: otel-collector
+ restart: unless-stopped
+ command: ["--config=/etc/otel/config.yml"]
+ volumes:
+ - ./config.yml:/etc/otel/config.yml
+ - pmm-data:/srv
+ - pg-logs:/logs/postgresql
+ depends_on:
+ pmm-server:
+ condition: service_healthy
+ mailpit:
+ condition: service_started
+ networks:
+ - otel
+
+ # for SMTP testing
+ mailpit:
+ image: axllent/mailpit
+ container_name: mail-server
+ hostname: mail-server
+ restart: always
+ ports:
+ - "8025:8025" # Web UI
+ networks:
+ - otel
+
+ postgres:
+ image: percona/percona-distribution-postgresql:17.9
+ platform: ${PLATFORM:-linux/amd64}
+ container_name: postgres
+ hostname: postgres
+ restart: unless-stopped
+ command: |
+ postgres
+ -c config_file=/etc/postgresql/postgresql.conf
+ -c shared_preload_libraries=pg_stat_monitor
+ -c pg_stat_monitor.pgsm_query_max_len=4096
+ -c pg_stat_monitor.pgsm_enable_query_plan=1
+ -c pg_stat_monitor.pgsm_enable_pgsm_query_id=1
+ -c pg_stat_monitor.pgsm_normalized_query=1
+ environment:
+ - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-pmm-password}
+ volumes:
+ - $HOME/config/postgres/postgres-init.sh:/docker-entrypoint-initdb.d/postgres-init.sh
+ - $HOME/config/postgres/postgresql-otel.conf:/etc/postgresql/postgresql.conf:ro
+ - pg-logs:/var/log/postgresql
+ ports:
+ - 127.0.0.1:5432:5432
+ networks:
+ - otel
+
+networks:
+ otel:
+ name: otel
+ driver: bridge
+
+volumes:
+ pmm-data:
+ name: pmm-data
+ pg-logs:
+ name: pg-logs
+ clickhouse-certs:
+ name: clickhouse-certs
+ external: true
diff --git a/dev/otel/grafana/alert-rules.yml b/dev/otel/grafana/alert-rules.yml
new file mode 100644
index 0000000000..0012676cdb
--- /dev/null
+++ b/dev/otel/grafana/alert-rules.yml
@@ -0,0 +1,253 @@
+# Grafana alert rules configuration
+# This should be added to your Grafana provisioning/alerting directory
+# Ref: https://github.com/grafana/grafana/blob/main/devenv/alert_rules.yaml
+
+apiVersion: 1
+
+groups:
+ - name: security-alerts
+ orgId: 1
+ folder: Security
+ interval: 10s
+ rules:
+ - uid: admin-password-change
+ title: Admin Password Change Alert
+ condition: C
+ data:
+ - refId: A
+ queryType: ""
+ relativeTimeRange:
+ from: 30
+ to: 0
+ datasourceUid: clickhouse-logs
+ model:
+ # Query to detect admin password changes in logs
+ rawSql: |
+ SELECT
+ Timestamp as time,
+ toInt32(COUNT(*)) as value,
+ ServiceName as service,
+ Body as message
+ FROM logs
+ WHERE
+ Timestamp >= now() - INTERVAL 30 SECOND
+ AND ServiceName = 'grafana'
+ AND SeverityText = 'info'
+ AND Body = 'Admin password has been reset.'
+ GROUP BY time, service, message
+ ORDER BY Timestamp DESC
+ intervalMs: 1000
+ maxDataPoints: 43200
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ queryType: ""
+ relativeTimeRange:
+ from: 30
+ to: 0
+ datasource:
+ type: __expr__
+ uid: __expr__
+ model:
+ expression: A
+ reducer: last
+ type: reduce
+ refId: B
+ - refId: C
+ datasourceUid: __expr__
+ queryType: ""
+ relativeTimeRange:
+ from: 30
+ to: 0
+ datasource:
+ type: __expr__
+ uid: __expr__
+ model:
+ conditions:
+ - evaluator:
+ params: [0]
+ type: gt
+ operator:
+ type: and
+ query:
+ params: [B]
+ reducer:
+ type: last
+ type: query
+ expression: B
+ intervalMs: 1000
+ maxDataPoints: 43200
+ refId: C
+ type: threshold
+ noDataState: OK
+ execErrState: Alerting
+ for: 0s
+ annotations:
+ description: "Admin password change detected in Grafana logs."
+ summary: "Admin Password Change"
+ labels:
+ severity: critical
+ team: security
+ notification_settings:
+ receiver: email-log-alerts
+
+ - uid: admin-password-change-failed
+ title: Admin Password Change Failed
+ condition: C
+ data:
+ - refId: A
+ queryType: ""
+ relativeTimeRange:
+ from: 30
+ to: 0
+ datasourceUid: clickhouse-logs
+ model:
+ # Query to detect failed admin password change attempts in logs
+ rawSql: |
+ SELECT
+ Timestamp as time,
+ toInt32(COUNT(*)) as value,
+ ServiceName as service,
+ Body as message
+ FROM logs
+ WHERE
+ Timestamp >= now() - INTERVAL 30 SECOND
+ AND ServiceName = 'grafana'
+ AND SeverityText = 'error'
+ AND Body LIKE '%Failed to reset admin password%'
+ GROUP BY time, service, message
+ ORDER BY Timestamp DESC
+ intervalMs: 1000
+ maxDataPoints: 43200
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ queryType: ""
+ relativeTimeRange:
+ from: 30
+ to: 0
+ datasource:
+ type: __expr__
+ uid: __expr__
+ model:
+ expression: A
+ reducer: last
+ type: reduce
+ refId: B
+ - refId: C
+ datasourceUid: __expr__
+ queryType: ""
+ relativeTimeRange:
+ from: 30
+ to: 0
+ datasource:
+ type: __expr__
+ uid: __expr__
+ model:
+ conditions:
+ - evaluator:
+ params: [0]
+ type: gt
+ operator:
+ type: and
+ query:
+ params: [B]
+ reducer:
+ type: last
+ type: query
+ expression: B
+ intervalMs: 1000
+ maxDataPoints: 43200
+ refId: C
+ type: threshold
+ noDataState: OK
+ execErrState: Alerting
+ for: 0s
+ annotations:
+ description: "Failed attempt to change admin password detected in Grafana logs."
+ summary: "Admin Password Change Failed"
+ labels:
+ severity: critical
+ team: security
+ notification_settings:
+ receiver: email-log-alerts
+
+ - uid: postgres-shutdown
+ title: PostgreSQL Database System Shutdown
+ condition: C
+ data:
+ - refId: A
+ queryType: ""
+ relativeTimeRange:
+ from: 30
+ to: 0
+ datasourceUid: clickhouse-logs
+ model:
+ # Query to detect PostgreSQL shutdown in logs
+ rawSql: |
+ SELECT
+ Timestamp as time,
+ toInt32(COUNT(*)) as value,
+ ServiceName as service,
+ Body as message
+ FROM logs
+ WHERE
+ Timestamp >= now() - INTERVAL 30 SECOND
+ AND ServiceName = 'postgres'
+ AND Body = 'database system is shut down'
+ GROUP BY time, service, message
+ ORDER BY Timestamp DESC
+ intervalMs: 1000
+ maxDataPoints: 43200
+ refId: A
+ - refId: B
+ datasourceUid: __expr__
+ queryType: ""
+ relativeTimeRange:
+ from: 30
+ to: 0
+ datasource:
+ type: __expr__
+ uid: __expr__
+ model:
+ expression: A
+ reducer: last
+ type: reduce
+ refId: B
+ - refId: C
+ datasourceUid: __expr__
+ queryType: ""
+ relativeTimeRange:
+ from: 30
+ to: 0
+ datasource:
+ type: __expr__
+ uid: __expr__
+ model:
+ conditions:
+ - evaluator:
+ params: [0]
+ type: gt
+ operator:
+ type: and
+ query:
+ params: [B]
+ reducer:
+ type: last
+ type: query
+ expression: B
+ intervalMs: 1000
+ maxDataPoints: 43200
+ refId: C
+ type: threshold
+ noDataState: OK
+ execErrState: Alerting
+ for: 0s
+ annotations:
+ description: "PostgreSQL database system shutdown detected in logs."
+ summary: "PostgreSQL Shutdown"
+ labels:
+ severity: warning
+ team: dba
+ notification_settings:
+ receiver: email-log-alerts
diff --git a/dev/otel/grafana/change-admin-password b/dev/otel/grafana/change-admin-password
new file mode 100755
index 0000000000..0d48ca3375
--- /dev/null
+++ b/dev/otel/grafana/change-admin-password
@@ -0,0 +1,11 @@
+#!/bin/bash
+#
+# Change password for default admin user in PMM
+
+if grafana cli --config=/etc/grafana/grafana.ini --homepath /usr/share/grafana admin reset-admin-password "$1"; then
+ echo "Admin password successfully reset."
+ echo "logger=administration t=$(date -u +"%Y-%m-%dT%H:%M:%S.%N"Z) level=info msg=\"Admin password has been reset.\"" >> /srv/logs/grafana.log
+else
+ echo "logger=administration t=$(date -u +"%Y-%m-%dT%H:%M:%S.%N"Z) level=error msg=\"Failed to reset admin password.\"" >> /srv/logs/grafana.log
+ exit 1
+fi
diff --git a/dev/otel/grafana/clickhouse-datasource.yml b/dev/otel/grafana/clickhouse-datasource.yml
new file mode 100644
index 0000000000..7ec02ca756
--- /dev/null
+++ b/dev/otel/grafana/clickhouse-datasource.yml
@@ -0,0 +1,25 @@
+# Grafana data source configuration for ClickHouse
+# This should be added to your Grafana provisioning/datasources directory
+
+apiVersion: 1
+
+datasources:
+ # https://github.com/grafana/clickhouse-datasource?tab=readme-ov-file#with-a-configuration-file
+ - name: ClickHouse-Logs
+ uid: clickhouse-logs
+ orgId: 1
+ version: 2
+ type: grafana-clickhouse-datasource
+ jsonData:
+ defaultDatabase: pmm
+ defaultTable: metrics
+ username: ${PMM_CLICKHOUSE_USER}
+ port: ${PMM_CLICKHOUSE_PORT}
+ host: ${PMM_CLICKHOUSE_HOST}
+ tlsSkipVerify: false
+ logs:
+ defaultDatabase: otel
+ defaultTable: logs
+ otelEnabled: true
+ secureJsonData:
+ password: ${PMM_CLICKHOUSE_PASSWORD}
diff --git a/dev/otel/grafana/clickhouse-otel-dashboard.json b/dev/otel/grafana/clickhouse-otel-dashboard.json
new file mode 100644
index 0000000000..81a7fdb5a9
--- /dev/null
+++ b/dev/otel/grafana/clickhouse-otel-dashboard.json
@@ -0,0 +1,508 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": {
+ "type": "datasource",
+ "uid": "grafana"
+ },
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "target": {
+ "limit": 100,
+ "matchAny": false,
+ "tags": [],
+ "type": "dashboard"
+ },
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 0,
+ "id": 84,
+ "links": [],
+ "panels": [
+ {
+ "datasource": {
+ "type": "grafana-clickhouse-datasource",
+ "uid": "clickhouse-logs"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "fillOpacity": 80,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineWidth": 1,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 0
+ },
+ "id": 12,
+ "options": {
+ "barRadius": 0,
+ "barWidth": 0.97,
+ "fullHighlight": false,
+ "groupWidth": 0.7,
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "orientation": "auto",
+ "showValue": "auto",
+ "stacking": "none",
+ "tooltip": {
+ "hideZeros": false,
+ "mode": "single",
+ "sort": "none"
+ },
+ "xTickLabelRotation": 0,
+ "xTickLabelSpacing": 0
+ },
+ "pluginVersion": "11.6.1",
+ "targets": [
+ {
+ "builderOptions": {
+ "aggregates": [
+ {
+ "aggregateType": "count",
+ "alias": "count",
+ "column": "*"
+ }
+ ],
+ "columns": [
+ {
+ "alias": "service",
+ "name": "servicename"
+ }
+ ],
+ "database": "otel",
+ "groupBy": ["service"],
+ "mode": "aggregate",
+ "orderBy": [
+ {
+ "dir": "DESC",
+ "name": "count"
+ }
+ ],
+ "queryType": "table",
+ "table": "logs"
+ },
+ "datasource": {
+ "type": "grafana-clickhouse-datasource",
+ "uid": "clickhouse-logs"
+ },
+ "editorType": "sql",
+ "format": 1,
+ "meta": {
+ "builderOptions": {
+ "aggregates": [
+ {
+ "aggregateType": "count",
+ "alias": "count",
+ "column": "*"
+ }
+ ],
+ "columns": [
+ {
+ "alias": "service",
+ "name": "servicename"
+ }
+ ],
+ "database": "otel",
+ "groupBy": ["service"],
+ "mode": "aggregate",
+ "orderBy": [
+ {
+ "dir": "DESC",
+ "name": "count"
+ }
+ ],
+ "queryType": "table",
+ "table": "logs"
+ }
+ },
+ "pluginVersion": "4.4.0",
+ "queryType": "table",
+ "rawSql": "SELECT ServiceName AS service, COUNT(*) as count FROM \"otel\".\"logs\" WHERE ( Timestamp >= $__fromTime AND Timestamp <= $__toTime ) GROUP BY service ORDER BY count DESC;",
+ "refId": "A"
+ }
+ ],
+ "title": "Log Line Count by Service",
+ "type": "barchart"
+ },
+ {
+ "datasource": {
+ "type": "grafana-clickhouse-datasource",
+ "uid": "clickhouse-logs"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "semi-dark-blue"
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 0
+ },
+ "id": 13,
+ "options": {
+ "displayMode": "basic",
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": false
+ },
+ "maxVizHeight": 75,
+ "minVizHeight": 16,
+ "minVizWidth": 8,
+ "namePlacement": "auto",
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": true
+ },
+ "showUnfilled": true,
+ "sizing": "manual",
+ "valueMode": "color"
+ },
+ "pluginVersion": "11.6.1",
+ "targets": [
+ {
+ "datasource": {
+ "type": "grafana-clickhouse-datasource",
+ "uid": "clickhouse-logs"
+ },
+ "editorType": "sql",
+ "format": 2,
+ "meta": {
+ "builderOptions": {
+ "columns": [],
+ "database": "",
+ "limit": 1000,
+ "mode": "list",
+ "queryType": "table",
+ "table": ""
+ }
+ },
+ "pluginVersion": "4.4.0",
+ "queryType": "logs",
+ "rawSql": "SELECT\nCASE WHEN LogAttributes['status'] = '' THEN 'N/A' ELSE LogAttributes['status'] END AS mapping, COUNT(*) as count\nFROM otel.logs\nWHERE ( Timestamp >= $__fromTime AND Timestamp <= $__toTime ) AND ServiceName = 'nginx'\nGROUP BY LogAttributes['status'], SeverityNumber\nORDER BY LogAttributes['status']",
+ "refId": "A"
+ }
+ ],
+ "title": "Nginx Status by Severity",
+ "type": "bargauge"
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 8
+ },
+ "id": 10,
+ "panels": [],
+ "title": "Logs",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "grafana-clickhouse-datasource",
+ "uid": "clickhouse-logs"
+ },
+ "fieldConfig": {
+ "defaults": {},
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 21,
+ "w": 24,
+ "x": 0,
+ "y": 9
+ },
+ "id": 11,
+ "options": {
+ "dedupStrategy": "none",
+ "enableInfiniteScrolling": false,
+ "enableLogDetails": true,
+ "prettifyLogMessage": false,
+ "showCommonLabels": false,
+ "showLabels": false,
+ "showTime": true,
+ "sortOrder": "Descending",
+ "wrapLogMessage": false
+ },
+ "pluginVersion": "11.6.1",
+ "targets": [
+ {
+ "builderOptions": {
+ "columns": [
+ {
+ "hint": "trace_id",
+ "name": "TraceId"
+ },
+ {
+ "hint": "time",
+ "name": "Timestamp"
+ },
+ {
+ "hint": "log_level",
+ "name": "SeverityText"
+ },
+ {
+ "hint": "log_message",
+ "name": "Body"
+ },
+ {
+ "hint": "log_labels",
+ "name": "LogAttributes"
+ }
+ ],
+ "database": "otel",
+ "filters": [
+ {
+ "condition": "AND",
+ "filterType": "custom",
+ "hint": "time",
+ "key": "",
+ "operator": "WITH IN DASHBOARD TIME RANGE",
+ "type": "datetime"
+ },
+ {
+ "condition": "AND",
+ "filterType": "custom",
+ "hint": "log_level",
+ "key": "",
+ "operator": "IS ANYTHING",
+ "type": "string"
+ }
+ ],
+ "limit": 1000,
+ "meta": {
+ "logMessageLike": "",
+ "otelEnabled": true,
+ "otelVersion": "latest"
+ },
+ "mode": "list",
+ "orderBy": [
+ {
+ "default": true,
+ "dir": "DESC",
+ "hint": "time",
+ "name": ""
+ }
+ ],
+ "queryType": "logs",
+ "table": "logs"
+ },
+ "datasource": {
+ "type": "grafana-clickhouse-datasource",
+ "uid": "clickhouse-logs"
+ },
+ "editorType": "sql",
+ "format": 2,
+ "meta": {
+ "builderOptions": {
+ "columns": [
+ {
+ "hint": "trace_id",
+ "name": "TraceId"
+ },
+ {
+ "hint": "time",
+ "name": "Timestamp"
+ },
+ {
+ "hint": "log_level",
+ "name": "SeverityText"
+ },
+ {
+ "hint": "log_message",
+ "name": "Body"
+ },
+ {
+ "hint": "log_labels",
+ "name": "LogAttributes"
+ }
+ ],
+ "database": "otel",
+ "filters": [
+ {
+ "condition": "AND",
+ "filterType": "custom",
+ "hint": "time",
+ "key": "",
+ "operator": "WITH IN DASHBOARD TIME RANGE",
+ "type": "datetime"
+ },
+ {
+ "condition": "AND",
+ "filterType": "custom",
+ "hint": "log_level",
+ "key": "",
+ "operator": "IS ANYTHING",
+ "type": "string"
+ }
+ ],
+ "limit": 1000,
+ "meta": {
+ "logMessageLike": "",
+ "otelEnabled": true,
+ "otelVersion": "latest"
+ },
+ "mode": "list",
+ "orderBy": [
+ {
+ "default": true,
+ "dir": "DESC",
+ "hint": "time",
+ "name": ""
+ }
+ ],
+ "queryType": "logs",
+ "table": "logs"
+ }
+ },
+ "pluginVersion": "4.4.0",
+ "queryType": "logs",
+ "rawSql": "SELECT Timestamp as \"timestamp\", Body as \"body\", SeverityText as \"level\", LogAttributes as \"labels\", TraceId as \"traceID\" FROM \"otel\".\"logs\" WHERE ( timestamp >= $__fromTime AND timestamp <= $__toTime ) ORDER BY timestamp DESC LIMIT 1000",
+ "refId": "A"
+ }
+ ],
+ "title": "General Logs",
+ "type": "logs"
+ }
+ ],
+ "preload": false,
+ "refresh": "",
+ "schemaVersion": 41,
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "ClickHouse-Logs",
+ "value": "clickhouse-logs"
+ },
+ "includeAll": false,
+ "label": "ClickHouse instance",
+ "name": "datasource",
+ "options": [],
+ "query": "grafana-clickhouse-datasource",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "current": {
+ "text": "All",
+ "value": ["$__all"]
+ },
+ "datasource": {
+ "type": "grafana-clickhouse-datasource",
+ "uid": "${datasource}"
+ },
+ "definition": "SELECT DISTINCT ServiceName FROM logs",
+ "includeAll": true,
+ "label": "Service Name",
+ "multi": true,
+ "name": "serviceName",
+ "options": [],
+ "query": "SELECT DISTINCT ServiceName FROM logs",
+ "refresh": 1,
+ "regex": "",
+ "type": "query"
+ },
+ {
+ "allValue": "ALL",
+ "current": {
+ "text": "All",
+ "value": "$__all"
+ },
+ "datasource": {
+ "type": "grafana-clickhouse-datasource",
+ "uid": "${datasource}"
+ },
+ "definition": "SELECT DISTINCT TraceId FROM otel WHERE ParentSpanId = '' LIMIT 100",
+ "includeAll": true,
+ "label": "Trace Id",
+ "name": "trace_id",
+ "options": [],
+ "query": "SELECT DISTINCT TraceId FROM otel WHERE ParentSpanId = '' LIMIT 100",
+ "refresh": 1,
+ "regex": "",
+ "type": "query"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {},
+ "timezone": "",
+ "title": "ClickHouse OTel Dashboard",
+ "version": 1
+}
diff --git a/dev/otel/grafana/contact-points.yml b/dev/otel/grafana/contact-points.yml
new file mode 100644
index 0000000000..ed58618bef
--- /dev/null
+++ b/dev/otel/grafana/contact-points.yml
@@ -0,0 +1,38 @@
+# Grafana contact points configuration (modern alerting)
+# This should be added to your Grafana provisioning/alerting directory
+
+apiVersion: 1
+
+contactPoints:
+ - orgId: 1
+ name: email-log-alerts
+ receivers:
+ - uid: email-log-alerts-receiver
+ type: email
+ settings:
+ # Multiple addresses can be specified, separated by semicolons
+ addresses: "security@yourcompany.com"
+ subject: "PMM Log Alert: {{ .GroupLabels.alertname }}"
+ message: |
+ PMM Security Alert
+
+ Alert: {{ .GroupLabels.alertname }}
+ Status: {{ .Status }}
+
+ Alert Details:
+ {{ range .Alerts }}
+ - Summary: {{ .Annotations.summary }}
+ - Description: {{ .Annotations.description }}
+ - Severity: {{ .Labels.severity }}
+ - Team: {{ .Labels.team }}
+ - Service: {{ .Labels.service }}
+ - Alert Time: {{ .StartsAt }}
+ {{ if .Labels.message }} - Log message: {{ .Labels.message }}{{ end }}
+ {{ if .Labels.user }} - User: {{ .Labels.user }}{{ end }}
+
+ {{ end }}
+ Please investigate this security event immediately.
+
+ Generated by PMM OpenTelemetry Logging System
+ # Do not send a follow-up "resolved" notification when the alert clears; it is not relevant for this type of alert
+ disableResolveMessage: true
diff --git a/dev/otel/grafana/datasources.yml b/dev/otel/grafana/datasources.yml
new file mode 100644
index 0000000000..9042b6c6ca
--- /dev/null
+++ b/dev/otel/grafana/datasources.yml
@@ -0,0 +1,35 @@
+# Grafana data sources configuration
+# This file provisions the ClickHouse data source used by the OpenTelemetry logging setup
+
+apiVersion: 1
+
+deleteDatasources:
+ - name: ClickHouse-Logs
+ orgId: 1
+
+datasources:
+ # ClickHouse data source for OpenTelemetry logs
+ # https://github.com/grafana/clickhouse-datasource?tab=readme-ov-file#with-a-configuration-file
+ - name: ClickHouse-Logs
+ uid: clickhouse-logs
+ orgId: 1
+ version: 2
+ type: grafana-clickhouse-datasource
+ isDefault: false
+ jsonData:
+ defaultDatabase: otel
+ defaultTable: logs
+ host: ${PMM_CLICKHOUSE_HOST}
+ port: ${PMM_CLICKHOUSE_PORT}
+ username: ${PMM_CLICKHOUSE_USER}
+ tlsSkipVerify: false
+ logs:
+ defaultDatabase: otel
+ defaultTable: logs
+ otelEnabled: true
+ # otelVersion: v1
+ timeColumn: Timestamp
+ levelColumn: SeverityText
+ messageColumn: Body
+ secureJsonData:
+ password: ${PMM_CLICKHOUSE_PASSWORD}
diff --git a/dev/otel/grafana/notification-policies.yml b/dev/otel/grafana/notification-policies.yml
new file mode 100644
index 0000000000..205ef696a5
--- /dev/null
+++ b/dev/otel/grafana/notification-policies.yml
@@ -0,0 +1,26 @@
+# Grafana notification policies configuration (modern alerting)
+# This should be added to your Grafana provisioning/alerting directory
+
+apiVersion: 1
+
+policies:
+ - orgId: 1
+ receiver: email-log-alerts
+ group_by:
+ - grafana_folder
+ - alertname
+ group_wait: 10s
+ group_interval: 5m
+ repeat_interval: 24h
+ routes:
+ # Single route for all alerts labelled team=security
+ - receiver: email-log-alerts
+ object_matchers:
+ - ["team", "=", "security"]
+ group_by:
+ - alertname
+ - severity
+ group_wait: 10s
+ group_interval: 1m
+ repeat_interval: 24h
+ continue: false
diff --git a/dev/otel/test/clickhouse-test.sh b/dev/otel/test/clickhouse-test.sh
new file mode 100644
index 0000000000..ee7d9cbc63
--- /dev/null
+++ b/dev/otel/test/clickhouse-test.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+echo "Testing HTTP status code to OpenTelemetry severity mapping..."
+
+# Wait for logs to be processed
+echo "Waiting for logs to be processed..."
+sleep 5
+
+# Check the mapping results
+echo "Checking severity mapping results from ClickHouse..."
+docker exec otel-clickhouse clickhouse-client --user=default --password=clickhouse --query "
+SELECT
+ CONCAT(
+ 'HTTP ', CASE WHEN LogAttributes['status'] = '' THEN 'N/A' ELSE LogAttributes['status'] END,
+ ' -> ', SeverityText,
+ ' (', toString(SeverityNumber), ')'
+ ) AS mapping
+FROM otel.logs
+WHERE Timestamp > now() - INTERVAL 5 MINUTE
+GROUP BY LogAttributes['status'], SeverityText, SeverityNumber
+ORDER BY LogAttributes['status']
+"
+
+echo ""
+echo "Expected mappings:"
+echo "- HTTP 2xx -> INFO (9)"
+echo "- HTTP 4xx -> WARN (13)"
+echo "- HTTP 5xx -> ERROR (17)"
+
+# Check the log count within the last 5 minutes
+echo ""
+echo "Checking log count in ClickHouse for the last 5 minutes..."
+docker exec otel-clickhouse clickhouse-client --user=default --password=clickhouse --query "
+SELECT count() FROM otel.logs WHERE Timestamp >= now() - INTERVAL 5 MINUTE
+"
diff --git a/dev/otel/test/setup-test.sh b/dev/otel/test/setup-test.sh
new file mode 100644
index 0000000000..bb81e282a3
--- /dev/null
+++ b/dev/otel/test/setup-test.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+# This script verifies that the PMM Server and OpenTelemetry Collector containers are up, then prints access and usage information
+
+set -e
+
+# Check if services are running
+echo "🔍 Checking service health..."
+if ! docker compose ps | grep -q "pmm-server.*Up"; then
+ echo "❌ PMM Server is not running"
+ docker compose logs pmm-server
+ exit 1
+fi
+
+if ! docker compose ps | grep -q "otel-collector.*Up"; then
+ echo "❌ OpenTelemetry Collector is not running"
+ docker compose logs otel-collector
+ exit 1
+fi
+
+echo "✅ Services are running successfully"
+
+# Display access information
+echo ""
+echo "🎉 Setup completed successfully!"
+echo ""
+echo "📊 Access Information:"
+echo " - PMM Server (Grafana): https://localhost:443"
+echo " - ClickHouse: localhost:9000"
+echo " - MailHog (Email Testing): http://localhost:8025"
+echo " - OpenTelemetry Collector Metrics: http://localhost:8888/metrics"
+echo ""
+echo "🔐 Default Credentials:"
+echo " - Username: admin"
+echo " - Password: admin"
+echo ""
+echo "📧 Email Configuration:"
+echo " - SMTP is configured to use MailHog for testing"
+echo " - Check http://localhost:8025 for sent emails"
+echo " - Update .env file with real SMTP settings for production"
+echo ""
+echo "🚨 Alert Rules:"
+echo " - Admin password change"
+echo " - Admin password change failure"
+echo ""
+echo "📝 Next Steps:"
+echo " 1. Login to Grafana at https://localhost with the default credentials"
+echo " 2. Check the 'Security' folder for alert rules"
+echo " 3. Test alerts by changing the admin password or failing to do so (use change-admin-password script)"
+echo " 4. Configure real email/Slack/etc notifications in production"
+echo ""
+echo "🔧 Troubleshooting:"
+echo " - View logs: docker compose logs [service-name]"
+echo " - Check collector config: config.yml"
+echo " - Test ClickHouse: docker compose exec pmm-server clickhouse-client --password=clickhouse --query 'SELECT 1'"
+echo ""