From 4fcf79189d88ce4bbe3dee7b249010be9bcc307d Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Fri, 10 Apr 2026 21:56:05 -0500 Subject: [PATCH 01/11] Add query labels via sqlcommenter comment parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract user-defined key='value' labels from SQL comments in the sqlcommenter format (/* controller='users',action='show' */) and export them to ClickHouse as a JSON column and to OTel as db.query.label.* attributes. Parsing happens entirely in the background worker (cold path) with zero changes to the hot path — hooks, events, ring buffer, and DSA are untouched. The bgworker scans the query text backward for the last /* */ comment block, parses key='value' pairs with URL decoding, and passes the structured ParseResult to each exporter directly. - New GUC: pg_stat_ch.track_labels (bool, default true, PGC_SIGHUP) - New ClickHouse column: labels JSON(max_dynamic_paths=64) - Parser: ExtractLastComment + ParseSqlcommenter + SerializeLabelsJson - 45 unit tests covering parsing, URL decoding, truncation, and edge cases - TAP integration test for end-to-end ClickHouse label export --- CMakeLists.txt | 10 + docker/init/00-schema.sql | 2 + include/config/guc.h | 1 + src/config/guc.cc | 12 + src/export/clickhouse_exporter.cc | 6 + src/export/exporter_interface.h | 6 + src/export/otel_exporter.cc | 9 + src/export/sqlcommenter_parse.cc | 195 +++++++++++++++ src/export/sqlcommenter_parse.h | 38 +++ src/export/stats_exporter.cc | 13 + t/030_query_labels.pl | 172 +++++++++++++ test/regression/expected/guc.out | 10 +- test/regression/sql/guc.sql | 2 + test/unit/sqlcommenter_parse_test.cc | 345 +++++++++++++++++++++++++++ 14 files changed, 820 insertions(+), 1 deletion(-) create mode 100644 src/export/sqlcommenter_parse.cc create mode 100644 src/export/sqlcommenter_parse.h create mode 100644 t/030_query_labels.pl create mode 100644 test/unit/sqlcommenter_parse_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 259bd92..878835d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -149,6 +149,16 @@ if(PSCH_BUILD_UNIT_TESTS) ) pg_stat_ch_set_warnings(hostname_test) + add_executable(sqlcommenter_parse_test + test/unit/sqlcommenter_parse_test.cc + src/export/sqlcommenter_parse.cc + ) + target_include_directories(sqlcommenter_parse_test PRIVATE src include) + target_compile_features(sqlcommenter_parse_test PRIVATE cxx_std_17) + target_link_libraries(sqlcommenter_parse_test PRIVATE GTest::gtest_main) + pg_stat_ch_set_warnings(sqlcommenter_parse_test) + include(GoogleTest) gtest_discover_tests(hostname_test) + gtest_discover_tests(sqlcommenter_parse_test) endif() diff --git a/docker/init/00-schema.sql b/docker/init/00-schema.sql index 4b15ef2..9482c42 100644 --- a/docker/init/00-schema.sql +++ b/docker/init/00-schema.sql @@ -59,6 +59,8 @@ CREATE TABLE pg_stat_ch.events_raw query String COMMENT 'Full SQL query text (may be truncated). Used for debugging and query analysis.', + labels JSON(max_dynamic_paths=64) COMMENT 'Query labels from sqlcommenter comments (key=value pairs in /* */ blocks). Access subpaths directly: labels.controller, labels.action. Empty {} when no labels present. See: https://google.github.io/sqlcommenter/', + -- ======================================================================== -- Shared buffer metrics (main buffer cache) -- ======================================================================== diff --git a/include/config/guc.h b/include/config/guc.h index 70d9273..2d6b27c 100644 --- a/include/config/guc.h +++ b/include/config/guc.h @@ -28,6 +28,7 @@ extern int psch_otel_log_batch_size; extern int psch_otel_log_max_bytes; extern int psch_otel_log_delay_ms; extern int psch_otel_metric_interval_ms; +extern bool psch_track_labels; extern bool psch_debug_force_locked_overflow; // Initialize GUC variables diff --git a/src/config/guc.cc b/src/config/guc.cc index 9535025..7c1b088 100644 --- a/src/config/guc.cc +++ b/src/config/guc.cc @@ -32,6 +32,7 @@ int psch_otel_log_batch_size = 8192; int psch_otel_log_max_bytes = 3 * 1024 * 1024; // 3 MiB: gRPC default max is 4 MiB int psch_otel_log_delay_ms = 100; int psch_otel_metric_interval_ms = 5000; +bool psch_track_labels = true; bool psch_debug_force_locked_overflow = false; // Log level options (matches PostgreSQL's server_message_level_options pattern) @@ -307,6 +308,17 @@ void PschInitGuc(void) { PGC_SUSET, 0, nullptr, nullptr, nullptr); + DefineCustomBoolVariable( + "pg_stat_ch.track_labels", + "Extract sqlcommenter labels from query comments.", + "When enabled, the background worker parses /* key='value' */ comments " + "and exports structured labels to ClickHouse (JSON) or OTel (attributes).", + &psch_track_labels, + true, + PGC_SIGHUP, + 0, + nullptr, nullptr, nullptr); + DefineCustomBoolVariable( "pg_stat_ch.debug_force_locked_overflow", "Force HandleOverflow in locked path (debug/test only).", diff --git a/src/export/clickhouse_exporter.cc b/src/export/clickhouse_exporter.cc index 743c537..4455a46 100644 --- a/src/export/clickhouse_exporter.cc +++ b/src/export/clickhouse_exporter.cc @@ -75,10 +75,15 @@ class ClickHouseExporter : public StatsExporter { shared_ptr> DbOperationColumn() final { return TagString("cmd_type"); } shared_ptr> DbQueryTextColumn() final { return RecordString("query"); } + void AppendLabels(const ParseResult& labels) final { + labels_col_->Append(SerializeLabelsJson(labels)); + } + void BeginBatch() final { block = std::make_unique(); columns.clear(); exported_count = 0; + labels_col_ = Wrap("labels"); } void BeginRow() final { ++exported_count; } bool CommitBatch() final; @@ -116,6 +121,7 @@ class ClickHouseExporter : public StatsExporter { std::unique_ptr client; std::unique_ptr block; std::vector> columns; + shared_ptr> labels_col_; int consecutive_failures = 0; int exported_count = 0; }; diff --git a/src/export/exporter_interface.h b/src/export/exporter_interface.h index 183ecb7..d87ea9e 100644 --- a/src/export/exporter_interface.h +++ b/src/export/exporter_interface.h @@ -6,6 +6,8 @@ #include #include +#include "export/sqlcommenter_parse.h" + class StatsExporter { protected: using string = std::string; @@ -65,6 +67,10 @@ class StatsExporter { virtual shared_ptr> DbOperationColumn() = 0; // Query text. CH: RecordString "query"; OTel semconv: "db.query.text". virtual shared_ptr> DbQueryTextColumn() = 0; + // Query labels from sqlcommenter comments. Called inside the event loop. + // CH: serializes to JSON, appends to a String "labels" column; + // OTel: sets per-label attributes prefixed with "db.query.label.". + virtual void AppendLabels(const ParseResult& labels) = 0; virtual void BeginBatch() = 0; virtual void BeginRow() = 0; diff --git a/src/export/otel_exporter.cc b/src/export/otel_exporter.cc index cef7290..3ddfef7 100644 --- a/src/export/otel_exporter.cc +++ b/src/export/otel_exporter.cc @@ -129,6 +129,15 @@ class OTelExporter : public StatsExporter { shared_ptr> DbQueryTextColumn() final { return Wrap>("db.query.text"); } + void AppendLabels(const ParseResult& labels) final { + for (int i = 0; i < labels.count; ++i) { + string attr_name = "db.query.label."; + attr_name.append(labels.labels[i].key.data(), labels.labels[i].key.size()); + string val(labels.labels[i].value); + current_log_record->SetAttribute(attr_name, val); + current_row_tags[attr_name] = std::move(val); + } + } bool EstablishNewConnection() final; bool IsConnected() const final { return metrics_provider && log_provider; } diff --git a/src/export/sqlcommenter_parse.cc b/src/export/sqlcommenter_parse.cc new file mode 100644 index 0000000..167c0d7 --- /dev/null +++ b/src/export/sqlcommenter_parse.cc @@ -0,0 +1,195 @@ +#include "export/sqlcommenter_parse.h" + +#include + +namespace { + +int DecodeHexByte(char hi, char lo) { + auto hex = [](char c) -> int { + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + return -1; + }; + int h = hex(hi); + int l = hex(lo); + if (h < 0 || l < 0) + return -1; + return (h << 4) | l; +} + +// URL-decode src into dst buffer. Returns the number of bytes written. +size_t UrlDecode(std::string_view src, char* dst, size_t max_len) { + size_t written = 0; + size_t i = 0; + while (i < src.size() && written < max_len) { + if (src[i] == '%' && i + 2 < src.size()) { + int byte = DecodeHexByte(src[i + 1], src[i + 2]); + if (byte >= 0) { + dst[written++] = static_cast(byte); + i += 3; + continue; + } + } + dst[written++] = src[i++]; + } + return written; +} + +bool NeedsUrlDecode(std::string_view sv) { + return sv.find('%') != std::string_view::npos; +} + +void AppendJsonEscaped(std::string& out, std::string_view sv) { + for (char c : sv) { + switch (c) { + case '"': + out += "\\\""; + break; + case '\\': + out += "\\\\"; + break; + case '\b': + out += "\\b"; + break; + case '\f': + out += "\\f"; + break; + case '\n': + out += "\\n"; + break; + case '\r': + out += "\\r"; + break; + case '\t': + out += "\\t"; + break; + default: + if (static_cast(c) < 0x20) { + char buf[8]; + std::snprintf(buf, sizeof(buf), "\\u%04x", static_cast(c)); + out += buf; + } else { + out += c; + } + } + } +} + +} // namespace + +std::string_view ExtractLastComment(std::string_view query) { + auto end_pos = query.rfind("*/"); + if (end_pos == std::string_view::npos) + return {}; + + auto start_pos = query.rfind("/*", end_pos); + if (start_pos == std::string_view::npos) + return {}; + + size_t content_start = start_pos + 2; + if (content_start >= end_pos) + return {}; + return query.substr(content_start, end_pos - content_start); +} + +ParseResult ParseSqlcommenter(std::string_view comment) { + ParseResult result; + size_t pos = 0; + + auto skip_ws = [&]() { + while (pos < comment.size() && (comment[pos] == ' ' || comment[pos] == '\t' || + comment[pos] == '\n' || comment[pos] == '\r')) { + ++pos; + } + }; + + while (pos < comment.size() && result.count < kMaxLabels) { + skip_ws(); + if (pos >= comment.size()) + break; + + // Parse key: read until '=', whitespace, or ',' + size_t key_start = pos; + while (pos < comment.size() && comment[pos] != '=' && comment[pos] != ' ' && + comment[pos] != '\t' && comment[pos] != '\n' && comment[pos] != '\r' && + comment[pos] != ',') { + ++pos; + } + if (pos == key_start) { + ++pos; + continue; + } + std::string_view raw_key = comment.substr(key_start, pos - key_start); + + skip_ws(); + if (pos >= comment.size() || comment[pos] != '=') + continue; + ++pos; + + skip_ws(); + if (pos >= comment.size() || comment[pos] != '\'') + continue; + ++pos; + + // Parse value: read until closing single quote + size_t val_start = pos; + while (pos < comment.size() && comment[pos] != '\'') { + ++pos; + } + if (pos >= comment.size()) + break; + std::string_view raw_value = comment.substr(val_start, pos - val_start); + ++pos; + + Label& label = result.labels[result.count]; + if (NeedsUrlDecode(raw_key)) { + size_t len = UrlDecode(raw_key, label.decoded_key, kMaxKeyLen); + label.key = std::string_view(label.decoded_key, len); + } else { + label.key = raw_key.substr(0, kMaxKeyLen); + } + + if (NeedsUrlDecode(raw_value)) { + size_t len = UrlDecode(raw_value, label.decoded_value, kMaxValueLen); + label.value = std::string_view(label.decoded_value, len); + } else { + label.value = raw_value.substr(0, kMaxValueLen); + } + + ++result.count; + + skip_ws(); + if (pos < comment.size() && comment[pos] == ',') + ++pos; + } + + if (result.count == kMaxLabels) { + skip_ws(); + if (pos < comment.size()) + result.truncated = true; + } + + return result; +} + +std::string SerializeLabelsJson(const ParseResult& result) { + if (result.count == 0) + return "{}"; + + std::string json = "{"; + for (int i = 0; i < result.count; ++i) { + if (i > 0) + json += ','; + json += '"'; + AppendJsonEscaped(json, result.labels[i].key); + json += "\":\""; + AppendJsonEscaped(json, result.labels[i].value); + json += '"'; + } + json += '}'; + return json; +} diff --git a/src/export/sqlcommenter_parse.h b/src/export/sqlcommenter_parse.h new file mode 100644 index 0000000..f120a4b --- /dev/null +++ b/src/export/sqlcommenter_parse.h @@ -0,0 +1,38 @@ +#ifndef PG_STAT_CH_SRC_EXPORT_SQLCOMMENTER_PARSE_H_ +#define PG_STAT_CH_SRC_EXPORT_SQLCOMMENTER_PARSE_H_ + +#include +#include +#include + +constexpr int kMaxLabels = 16; +constexpr int kMaxKeyLen = 32; +constexpr int kMaxValueLen = 128; + +struct Label { + std::string_view key; // Points into query buffer or decoded_key + std::string_view value; // Points into query buffer or decoded_value + char decoded_key[kMaxKeyLen]; + char decoded_value[kMaxValueLen]; +}; + +struct ParseResult { + std::array labels; + int count = 0; + bool truncated = false; +}; + +// Find the last /* ... */ comment in query text. +// Returns the content between delimiters (exclusive), or empty if not found. +std::string_view ExtractLastComment(std::string_view query); + +// Parse sqlcommenter format: key1='value1',key2='value2' +// URL-decodes percent-encoded values per sqlcommenter spec. +// Keys truncated to 32 chars, values to 128 chars. +ParseResult ParseSqlcommenter(std::string_view comment); + +// Serialize parsed labels to JSON: {"key1":"val1","key2":"val2"} +// Returns "{}" if count == 0. +std::string SerializeLabelsJson(const ParseResult& result); + +#endif // PG_STAT_CH_SRC_EXPORT_SQLCOMMENTER_PARSE_H_ diff --git a/src/export/stats_exporter.cc b/src/export/stats_exporter.cc index e11af99..51cc60b 100644 --- a/src/export/stats_exporter.cc +++ b/src/export/stats_exporter.cc @@ -15,6 +15,7 @@ extern "C" { #include "export/clickhouse_exporter.h" #include "export/exporter_interface.h" #include "export/otel_exporter.h" +#include "export/sqlcommenter_parse.h" #include "export/stats_exporter.h" #include "queue/event.h" #include "queue/shmem.h" @@ -201,6 +202,18 @@ void ExportEventStats(const std::vector& events, StatsExporter* expor "client_addr_len"); col_app->Append(std::string(ev.application_name, alen)); col_client_addr->Append(std::string(ev.client_addr, clen)); + + if (psch_track_labels) { + std::string_view query_text(ev.query, qlen); + std::string_view comment = ExtractLastComment(query_text); + if (!comment.empty()) { + exporter->AppendLabels(ParseSqlcommenter(comment)); + } else { + exporter->AppendLabels(ParseResult{}); + } + } else { + exporter->AppendLabels(ParseResult{}); + } } elog(DEBUG1, "pg_stat_ch: finished processing %zu events", events.size()); } diff --git a/t/030_query_labels.pl b/t/030_query_labels.pl new file mode 100644 index 0000000..110a7db --- /dev/null +++ b/t/030_query_labels.pl @@ -0,0 +1,172 @@ +#!/usr/bin/env perl +# Test: Query labels via sqlcommenter comments → ClickHouse JSON column +# Prerequisites: ClickHouse container must be running +# docker compose -f docker/docker-compose.test.yml up -d + +use strict; +use warnings; +use lib 't'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use psch; + +# Skip if Docker/ClickHouse not available +if (!psch_clickhouse_available()) { + plan skip_all => 'Docker not available, skipping ClickHouse labels tests'; +} + +my $ch_check = `curl -s 'http://localhost:18123/' --data 'SELECT 1' 2>/dev/null`; +if ($ch_check !~ /^1/) { + plan skip_all => + 'ClickHouse container not running. Start with: docker compose -f docker/docker-compose.test.yml up -d'; +} + +psch_query_clickhouse("TRUNCATE TABLE IF EXISTS pg_stat_ch.events_raw"); + +my $node = psch_init_node_with_clickhouse('ch_labels', + flush_interval_ms => 100, + batch_max => 100, +); + +# Test 1: Query with sqlcommenter labels appears with JSON labels in CH +subtest 'basic labels export' => sub { + psch_query_clickhouse("TRUNCATE TABLE pg_stat_ch.events_raw"); + psch_reset_stats($node); + + $node->safe_psql('postgres', + "SELECT 1 /* controller='users',action='show' */"); + + $node->safe_psql('postgres', 'SELECT pg_stat_ch_flush()'); + + my $labels = psch_wait_for_clickhouse_query( + "SELECT labels FROM pg_stat_ch.events_raw " + . "WHERE query LIKE '%controller%' LIMIT 1", + sub { $_[0] ne '' && $_[0] ne '{}' }, + 10 + ); + + like($labels, qr/controller/, 'labels JSON contains controller key'); + like($labels, qr/users/, 'labels JSON contains users value'); + like($labels, qr/action/, 'labels JSON contains action key'); + like($labels, qr/show/, 'labels JSON contains show value'); +}; + +# Test 2: Query without comment gets empty labels +subtest 'no comment produces empty labels' => sub { + psch_query_clickhouse("TRUNCATE TABLE pg_stat_ch.events_raw"); + psch_reset_stats($node); + + $node->safe_psql('postgres', 'SELECT 42'); + $node->safe_psql('postgres', 'SELECT pg_stat_ch_flush()'); + + my $labels = psch_wait_for_clickhouse_query( + "SELECT labels FROM pg_stat_ch.events_raw " + . "WHERE query LIKE '%42%' LIMIT 1", + sub { $_[0] ne '' }, + 10 + ); + + like($labels, qr/^\{\}$/, 'no-comment query has empty {} labels'); +}; + +# Test 3: Multiple label keys +subtest 'multiple labels' => sub { + psch_query_clickhouse("TRUNCATE TABLE pg_stat_ch.events_raw"); + psch_reset_stats($node); + + $node->safe_psql('postgres', + "SELECT 1 /* controller='orders',action='index',framework='rails',db_driver='pg' */"); + + $node->safe_psql('postgres', 'SELECT pg_stat_ch_flush()'); + + # Query individual label subpaths via ClickHouse JSON syntax + my $controller = psch_wait_for_clickhouse_query( + "SELECT labels.controller FROM pg_stat_ch.events_raw " + . "WHERE query LIKE '%orders%' LIMIT 1", + sub { $_[0] =~ /orders/ }, + 10 + ); + like($controller, qr/orders/, 'labels.controller = orders'); + + my $framework = psch_query_clickhouse( + "SELECT labels.framework FROM pg_stat_ch.events_raw " + . "WHERE query LIKE '%orders%' LIMIT 1"); + like($framework, qr/rails/, 'labels.framework = rails'); +}; + +# Test 4: track_labels = off produces empty labels +subtest 'track_labels off' => sub { + psch_query_clickhouse("TRUNCATE TABLE pg_stat_ch.events_raw"); + psch_reset_stats($node); + + $node->safe_psql('postgres', + "ALTER SYSTEM SET pg_stat_ch.track_labels = off"); + $node->reload(); + sleep(1); + + $node->safe_psql('postgres', + "SELECT 1 /* controller='ignored' */"); + + $node->safe_psql('postgres', 'SELECT pg_stat_ch_flush()'); + + my $labels = psch_wait_for_clickhouse_query( + "SELECT labels FROM pg_stat_ch.events_raw " + . "WHERE query LIKE '%ignored%' LIMIT 1", + sub { $_[0] ne '' }, + 10 + ); + like($labels, qr/^\{\}$/, 'track_labels=off produces empty labels'); + + # Re-enable + $node->safe_psql('postgres', + "ALTER SYSTEM SET pg_stat_ch.track_labels = on"); + $node->reload(); + sleep(1); +}; + +# Test 5: URL-encoded values are decoded +subtest 'url encoded values' => sub { + psch_query_clickhouse("TRUNCATE TABLE pg_stat_ch.events_raw"); + psch_reset_stats($node); + + $node->safe_psql('postgres', + "SELECT 1 /* route='/api/users%3Fid%3D1' */"); + + $node->safe_psql('postgres', 'SELECT pg_stat_ch_flush()'); + + my $labels = psch_wait_for_clickhouse_query( + "SELECT labels FROM pg_stat_ch.events_raw " + . "WHERE query LIKE '%route%' LIMIT 1", + sub { $_[0] ne '' && $_[0] ne '{}' }, + 10 + ); + + like($labels, qr{/api/users\?id=1}, 'URL-encoded value decoded correctly'); +}; + +# Test 6: Only last comment is extracted +subtest 'multiple comments takes last' => sub { + psch_query_clickhouse("TRUNCATE TABLE pg_stat_ch.events_raw"); + psch_reset_stats($node); + + $node->safe_psql('postgres', + "SELECT /* first='ignored' */ 1 /* second='captured' */"); + + $node->safe_psql('postgres', 'SELECT pg_stat_ch_flush()'); + + my $labels = psch_wait_for_clickhouse_query( + "SELECT labels FROM pg_stat_ch.events_raw " + . "WHERE query LIKE '%second%' LIMIT 1", + sub { $_[0] ne '' && $_[0] ne '{}' }, + 10 + ); + + like($labels, qr/captured/, 'last comment labels exported'); + unlike($labels, qr/ignored/, 'first comment labels not exported'); +}; + +$node->stop(); +done_testing(); diff --git a/test/regression/expected/guc.out b/test/regression/expected/guc.out index 833de5d..3d31689 100644 --- a/test/regression/expected/guc.out +++ b/test/regression/expected/guc.out @@ -24,8 +24,9 @@ SELECT name FROM pg_settings WHERE name LIKE 'pg_stat_ch.%' ORDER BY name COLLAT pg_stat_ch.otel_metric_interval_ms pg_stat_ch.queue_capacity pg_stat_ch.string_area_size + pg_stat_ch.track_labels pg_stat_ch.use_otel -(22 rows) +(23 rows) SHOW pg_stat_ch.enabled; pg_stat_ch.enabled @@ -67,6 +68,13 @@ SHOW pg_stat_ch.log_min_elevel; warning (1 row) +-- Test track_labels GUC (PGC_SIGHUP: can't SET, only ALTER SYSTEM + reload) +SHOW pg_stat_ch.track_labels; + pg_stat_ch.track_labels +------------------------- + on +(1 row) + -- Test that clickhouse_password is hidden from non-superusers (issue #3) CREATE ROLE psch_test_user LOGIN; SET ROLE psch_test_user; diff --git a/test/regression/sql/guc.sql b/test/regression/sql/guc.sql index 39c95ee..a543ca0 100644 --- a/test/regression/sql/guc.sql +++ b/test/regression/sql/guc.sql @@ -11,6 +11,8 @@ SET pg_stat_ch.log_min_elevel = 'notice'; SHOW pg_stat_ch.log_min_elevel; RESET pg_stat_ch.log_min_elevel; SHOW pg_stat_ch.log_min_elevel; +-- Test track_labels GUC (PGC_SIGHUP: can't SET, only ALTER SYSTEM + reload) +SHOW pg_stat_ch.track_labels; -- Test that clickhouse_password is hidden from non-superusers (issue #3) CREATE ROLE psch_test_user LOGIN; SET ROLE psch_test_user; diff --git a/test/unit/sqlcommenter_parse_test.cc b/test/unit/sqlcommenter_parse_test.cc new file mode 100644 index 0000000..bfe9103 --- /dev/null +++ b/test/unit/sqlcommenter_parse_test.cc @@ -0,0 +1,345 @@ +#include + +#include +#include + +#include "export/sqlcommenter_parse.h" + +// ============================================================================ +// ExtractLastComment tests +// ============================================================================ + +TEST(ExtractLastComment, BasicComment) { + auto result = ExtractLastComment("SELECT 1 /* hello */"); + EXPECT_EQ(result, " hello "); +} + +TEST(ExtractLastComment, NoComment) { + auto result = ExtractLastComment("SELECT 1"); + EXPECT_TRUE(result.empty()); +} + +TEST(ExtractLastComment, EmptyComment) { + auto result = ExtractLastComment("SELECT 1 /**/"); + EXPECT_TRUE(result.empty()); +} + +TEST(ExtractLastComment, MultipleComments) { + auto result = ExtractLastComment("SELECT /* first */ 1 /* second */"); + EXPECT_EQ(result, " second "); +} + +TEST(ExtractLastComment, CommentWithSqlcommenter) { + auto result = ExtractLastComment( + "SELECT * FROM users WHERE id = $1 /* controller='users',action='show' */"); + EXPECT_EQ(result, " controller='users',action='show' "); +} + +TEST(ExtractLastComment, NoClosingDelimiter) { + auto result = ExtractLastComment("SELECT 1 /* unclosed"); + EXPECT_TRUE(result.empty()); +} + +TEST(ExtractLastComment, NoOpeningDelimiter) { + auto result = ExtractLastComment("SELECT 1 */"); + EXPECT_TRUE(result.empty()); +} + +TEST(ExtractLastComment, CommentAtStart) { + auto result = ExtractLastComment("/* comment */ SELECT 1"); + EXPECT_EQ(result, " comment "); +} + +TEST(ExtractLastComment, WhitespaceOnlyComment) { + auto result = ExtractLastComment("SELECT 1 /* */"); + EXPECT_EQ(result, " "); +} + +TEST(ExtractLastComment, NormalizedQueryWithPlaceholders) { + auto result = ExtractLastComment( + "SELECT * FROM users WHERE id = $1 AND name = $2 /* controller='users' */"); + EXPECT_EQ(result, " controller='users' "); +} + +TEST(ExtractLastComment, EmptyQuery) { + auto result = ExtractLastComment(""); + EXPECT_TRUE(result.empty()); +} + +TEST(ExtractLastComment, AdjacentComments) { + auto result = ExtractLastComment("SELECT 1 /* a *//* b */"); + EXPECT_EQ(result, " b "); +} + +// ============================================================================ +// ParseSqlcommenter tests +// ============================================================================ + +TEST(ParseSqlcommenter, BasicPairs) { + auto result = ParseSqlcommenter("controller='users',action='show'"); + EXPECT_EQ(result.count, 2); + EXPECT_EQ(result.labels[0].key, "controller"); + EXPECT_EQ(result.labels[0].value, "users"); + EXPECT_EQ(result.labels[1].key, "action"); + EXPECT_EQ(result.labels[1].value, "show"); + EXPECT_FALSE(result.truncated); +} + +TEST(ParseSqlcommenter, SinglePair) { + auto result = ParseSqlcommenter("key='value'"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].key, "key"); + EXPECT_EQ(result.labels[0].value, "value"); +} + +TEST(ParseSqlcommenter, WithWhitespace) { + auto result = ParseSqlcommenter(" controller = 'users' , action = 'show' "); + EXPECT_EQ(result.count, 2); + EXPECT_EQ(result.labels[0].key, "controller"); + EXPECT_EQ(result.labels[0].value, "users"); + EXPECT_EQ(result.labels[1].key, "action"); + EXPECT_EQ(result.labels[1].value, "show"); +} + +TEST(ParseSqlcommenter, EmptyComment) { + auto result = ParseSqlcommenter(""); + EXPECT_EQ(result.count, 0); +} + +TEST(ParseSqlcommenter, WhitespaceOnly) { + auto result = ParseSqlcommenter(" \t "); + EXPECT_EQ(result.count, 0); +} + +TEST(ParseSqlcommenter, EmptyValue) { + auto result = ParseSqlcommenter("key=''"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].key, "key"); + EXPECT_EQ(result.labels[0].value, ""); +} + +TEST(ParseSqlcommenter, UrlEncodedValue) { + auto result = ParseSqlcommenter("key='hello%20world'"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].value, "hello world"); +} + +TEST(ParseSqlcommenter, UrlEncodedComma) { + auto result = ParseSqlcommenter("key='a%2Cb'"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].value, "a,b"); +} + +TEST(ParseSqlcommenter, UrlEncodedSingleQuote) { + auto result = ParseSqlcommenter("key='it%27s'"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].value, "it's"); +} + +TEST(ParseSqlcommenter, UrlEncodedKey) { + auto result = ParseSqlcommenter("my%20key='value'"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].key, "my key"); +} + +TEST(ParseSqlcommenter, MalformedPercentEncoding) { + // %ZZ is not valid hex; should be passed through literally + auto result = ParseSqlcommenter("key='%ZZfoo'"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].value, "%ZZfoo"); +} + +TEST(ParseSqlcommenter, PercentAtEnd) { + // Trailing % without two hex digits + auto result = ParseSqlcommenter("key='foo%'"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].value, "foo%"); +} + +TEST(ParseSqlcommenter, PercentWithOneChar) { + auto result = ParseSqlcommenter("key='foo%2'"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].value, "foo%2"); +} + +TEST(ParseSqlcommenter, KeyTruncation) { + // Key longer than 32 chars should be truncated + std::string long_key(50, 'a'); + std::string comment = long_key + "='value'"; + auto result = ParseSqlcommenter(comment); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].key.size(), static_cast(kMaxKeyLen)); +} + +TEST(ParseSqlcommenter, ValueTruncation) { + // Value longer than 128 chars should be truncated + std::string long_value(200, 'b'); + std::string comment = "key='" + long_value + "'"; + auto result = ParseSqlcommenter(comment); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].value.size(), static_cast(kMaxValueLen)); +} + +TEST(ParseSqlcommenter, MaxLabels) { + std::string comment; + for (int i = 0; i < kMaxLabels; ++i) { + if (i > 0) comment += ','; + comment += "k" + std::to_string(i) + "='v" + std::to_string(i) + "'"; + } + auto result = ParseSqlcommenter(comment); + EXPECT_EQ(result.count, kMaxLabels); + EXPECT_FALSE(result.truncated); +} + +TEST(ParseSqlcommenter, MoreThanMaxLabels) { + std::string comment; + for (int i = 0; i < kMaxLabels + 3; ++i) { + if (i > 0) comment += ','; + comment += "k" + std::to_string(i) + "='v" + std::to_string(i) + "'"; + } + auto result = ParseSqlcommenter(comment); + EXPECT_EQ(result.count, kMaxLabels); + EXPECT_TRUE(result.truncated); +} + +TEST(ParseSqlcommenter, MissingEquals) { + auto result = ParseSqlcommenter("key'value'"); + EXPECT_EQ(result.count, 0); +} + +TEST(ParseSqlcommenter, MissingOpenQuote) { + auto result = ParseSqlcommenter("key=value'"); + EXPECT_EQ(result.count, 0); +} + +TEST(ParseSqlcommenter, MissingCloseQuote) { + auto result = ParseSqlcommenter("key='value"); + EXPECT_EQ(result.count, 0); +} + +TEST(ParseSqlcommenter, MixedValidAndInvalid) { + auto result = ParseSqlcommenter("good='yes',bad,also_good='yep'"); + EXPECT_EQ(result.count, 2); + EXPECT_EQ(result.labels[0].key, "good"); + EXPECT_EQ(result.labels[0].value, "yes"); + EXPECT_EQ(result.labels[1].key, "also_good"); + EXPECT_EQ(result.labels[1].value, "yep"); +} + +TEST(ParseSqlcommenter, TrailingComma) { + auto result = ParseSqlcommenter("key='value',"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].key, "key"); +} + +TEST(ParseSqlcommenter, LeadingWhitespace) { + auto result = ParseSqlcommenter(" key='value'"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].key, "key"); +} + +TEST(ParseSqlcommenter, RealWorldSqlcommenter) { + auto result = ParseSqlcommenter( + "controller='users',action='show',framework='rails',db_driver='pg'"); + EXPECT_EQ(result.count, 4); + EXPECT_EQ(result.labels[0].key, "controller"); + EXPECT_EQ(result.labels[0].value, "users"); + EXPECT_EQ(result.labels[1].key, "action"); + EXPECT_EQ(result.labels[1].value, "show"); + EXPECT_EQ(result.labels[2].key, "framework"); + EXPECT_EQ(result.labels[2].value, "rails"); + EXPECT_EQ(result.labels[3].key, "db_driver"); + EXPECT_EQ(result.labels[3].value, "pg"); +} + +// ============================================================================ +// SerializeLabelsJson tests +// ============================================================================ + +TEST(SerializeLabelsJson, EmptyResult) { + ParseResult result; + EXPECT_EQ(SerializeLabelsJson(result), "{}"); +} + +TEST(SerializeLabelsJson, SingleLabel) { + ParseResult result; + result.count = 1; + result.labels[0].key = "controller"; + result.labels[0].value = "users"; + EXPECT_EQ(SerializeLabelsJson(result), R"({"controller":"users"})"); +} + +TEST(SerializeLabelsJson, MultipleLabels) { + ParseResult result; + result.count = 2; + result.labels[0].key = "controller"; + result.labels[0].value = "users"; + result.labels[1].key = "action"; + result.labels[1].value = "show"; + EXPECT_EQ(SerializeLabelsJson(result), + R"({"controller":"users","action":"show"})"); +} + +TEST(SerializeLabelsJson, EscapesDoubleQuotes) { + ParseResult result; + result.count = 1; + result.labels[0].key = "key"; + result.labels[0].value = "val\"ue"; + EXPECT_EQ(SerializeLabelsJson(result), R"({"key":"val\"ue"})"); +} + +TEST(SerializeLabelsJson, EscapesBackslash) { + ParseResult result; + result.count = 1; + result.labels[0].key = "key"; + result.labels[0].value = "path\\to"; + EXPECT_EQ(SerializeLabelsJson(result), R"({"key":"path\\to"})"); +} + +TEST(SerializeLabelsJson, EscapesControlChars) { + ParseResult result; + result.count = 1; + result.labels[0].key = "key"; + result.labels[0].value = "line1\nline2"; + EXPECT_EQ(SerializeLabelsJson(result), R"({"key":"line1\nline2"})"); +} + +// ============================================================================ +// End-to-end: ExtractLastComment -> ParseSqlcommenter -> SerializeLabelsJson +// ============================================================================ + +TEST(EndToEnd, FullPipeline) { + std::string_view query = + "SELECT * FROM users WHERE id = $1 " + "/* controller='users',action='show',job='UserSync' */"; + auto comment = ExtractLastComment(query); + EXPECT_FALSE(comment.empty()); + + auto parsed = ParseSqlcommenter(comment); + EXPECT_EQ(parsed.count, 3); + + std::string json = SerializeLabelsJson(parsed); + EXPECT_EQ(json, + R"({"controller":"users","action":"show","job":"UserSync"})"); +} + +TEST(EndToEnd, QueryWithNoComment) { + std::string_view query = "SELECT * FROM users WHERE id = $1"; + auto comment = ExtractLastComment(query); + EXPECT_TRUE(comment.empty()); + // No comment → empty ParseResult → "{}" + ParseResult result; + EXPECT_EQ(SerializeLabelsJson(result), "{}"); +} + +TEST(EndToEnd, UrlEncodedLabels) { + std::string_view query = + "SELECT 1 /* app='my%20app',route='/api/users%3Fid%3D1' */"; + auto comment = ExtractLastComment(query); + auto parsed = ParseSqlcommenter(comment); + EXPECT_EQ(parsed.count, 2); + EXPECT_EQ(parsed.labels[0].key, "app"); + EXPECT_EQ(parsed.labels[0].value, "my app"); + EXPECT_EQ(parsed.labels[1].key, "route"); + EXPECT_EQ(parsed.labels[1].value, "/api/users?id=1"); +} From 21c3e78e399adce4c34499a1c46a3404b37b1cc9 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 13 Apr 2026 09:45:11 -0500 Subject: [PATCH 02/11] Handle escaped quotes per sqlcommenter spec, add migration and docs - Parse \' as escaped single quote in values (spec meta char escaping) - Combined MetaUnescapeAndUrlDecode replaces UrlDecode for spec compliance - Add ClickHouse migration for existing installs (001_add_labels_column.sql) - Document labels column in events-schema reference and migration in ClickHouse guide - Fix CI to build all unit test targets (sqlcommenter_parse_test was missing) - Add 9 spec compliance tests (escaped quotes, spec exhibit round-trip) --- .github/workflows/ci.yml | 2 +- docker/migrations/001_add_labels_column.sql | 15 +++ docs/guides/clickhouse.md | 26 +++++ docs/reference/events-schema.mdx | 10 ++ src/export/sqlcommenter_parse.cc | 44 ++++++--- test/unit/sqlcommenter_parse_test.cc | 102 ++++++++++++++++++++ 6 files changed, 183 insertions(+), 16 deletions(-) create mode 100644 docker/migrations/001_add_labels_column.sql diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d590f0b..13d908b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -131,7 +131,7 @@ jobs: -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - name: Build - run: cmake --build build_unit --target hostname_test --parallel + run: cmake --build build_unit --parallel - name: Run unit tests run: ctest --test-dir build_unit --output-on-failure diff --git a/docker/migrations/001_add_labels_column.sql b/docker/migrations/001_add_labels_column.sql new file mode 100644 index 0000000..16f73a4 --- /dev/null +++ b/docker/migrations/001_add_labels_column.sql @@ -0,0 +1,15 @@ +-- Migration 001: Add labels column for sqlcommenter support +-- +-- This migration adds the `labels` column to `events_raw` for existing +-- installations. New installations already include this column via +-- docker/init/00-schema.sql. +-- +-- Run with: +-- clickhouse-client < docker/migrations/001_add_labels_column.sql +-- +-- Safe to re-run: ALTER TABLE ADD COLUMN IF NOT EXISTS is idempotent. + +ALTER TABLE pg_stat_ch.events_raw + ADD COLUMN IF NOT EXISTS labels JSON(max_dynamic_paths=64) + COMMENT 'Query labels from sqlcommenter comments (key=value pairs in /* */ blocks). Access subpaths directly: labels.controller, labels.action. Empty {} when no labels present. See: https://google.github.io/sqlcommenter/' + AFTER query; diff --git a/docs/guides/clickhouse.md b/docs/guides/clickhouse.md index 99f2c60..341fc66 100644 --- a/docs/guides/clickhouse.md +++ b/docs/guides/clickhouse.md @@ -80,6 +80,32 @@ Four materialized views provide pre-aggregated analytics: For view schemas, query patterns, and the `-State`/`-Merge` aggregation pattern, see [materialized views](/reference/materialized-views). +## Schema migrations + +When upgrading pg_stat_ch, new columns or schema changes may be required. Migration scripts are provided in [`docker/migrations/`](https://github.com/ClickHouse/pg_stat_ch/tree/main/docker/migrations) and are safe to re-run (idempotent). + +Apply all pending migrations: + +```bash +for f in docker/migrations/*.sql; do + clickhouse-client < "$f" +done +``` + +Or apply a specific migration: + +```bash +clickhouse-client < docker/migrations/001_add_labels_column.sql +``` + +| Migration | Version | Description | +|---|---|---| +| `001_add_labels_column.sql` | 0.2+ | Adds `labels JSON` column for [sqlcommenter](https://google.github.io/sqlcommenter/) query label support | + + +New installations using `docker/init/00-schema.sql` already include all schema changes. Migrations are only needed for existing ClickHouse instances. + + ## Data retention The `events_raw` table has no TTL by default. To limit storage, add a TTL: diff --git a/docs/reference/events-schema.mdx b/docs/reference/events-schema.mdx index c163154..bf2fcfd 100644 --- a/docs/reference/events-schema.mdx +++ b/docs/reference/events-schema.mdx @@ -30,6 +30,16 @@ The table is partitioned by date (`toDate(ts_start)`) and ordered by `ts_start` Query normalization replaces literals with placeholders (`$N`). This means `SELECT * FROM users WHERE id = 42` becomes `SELECT * FROM users WHERE id = $1`. No passwords, tokens, or PII are exported in query text. +## Query labels + +| Column | Type | Description | +|---|---|---| +| `labels` | `JSON(max_dynamic_paths=64)` | Key-value labels extracted from [sqlcommenter](https://google.github.io/sqlcommenter/) comments appended to the query. For example, `/* controller='users',action='show' */` produces `{"controller":"users","action":"show"}`. Access subpaths directly in ClickHouse: `labels.controller`, `labels.action`. Empty `{}` when no labels are present. | + + +Labels are parsed from the **last** `/* */` comment block in the query text. The parser supports URL-encoded values and escaped single quotes per the sqlcommenter specification. Controlled by the [`pg_stat_ch.track_labels`](/reference/configuration) GUC (default: `true`). + + ## Shared buffer usage These columns show how queries interact with PostgreSQL's `shared_buffers` cache. The cache hit ratio for a query is `shared_blks_hit / (shared_blks_hit + shared_blks_read)`. Target above 99% for OLTP workloads. diff --git a/src/export/sqlcommenter_parse.cc b/src/export/sqlcommenter_parse.cc index 167c0d7..9050961 100644 --- a/src/export/sqlcommenter_parse.cc +++ b/src/export/sqlcommenter_parse.cc @@ -21,28 +21,35 @@ int DecodeHexByte(char hi, char lo) { return (h << 4) | l; } -// URL-decode src into dst buffer. Returns the number of bytes written. -size_t UrlDecode(std::string_view src, char* dst, size_t max_len) { +bool NeedsDecode(std::string_view sv) { + return sv.find('%') != std::string_view::npos || sv.find('\\') != std::string_view::npos; +} + +// Meta-unescape (\' → ') and URL-decode in a single pass. +// Per sqlcommenter spec, meta unescaping happens before URL decoding, +// but the two transforms don't overlap so a combined pass is equivalent. +size_t MetaUnescapeAndUrlDecode(std::string_view src, char* dst, size_t max_len) { size_t written = 0; size_t i = 0; while (i < src.size() && written < max_len) { - if (src[i] == '%' && i + 2 < src.size()) { + if (src[i] == '\\' && i + 1 < src.size() && src[i + 1] == '\'') { + dst[written++] = '\''; + i += 2; + } else if (src[i] == '%' && i + 2 < src.size()) { int byte = DecodeHexByte(src[i + 1], src[i + 2]); if (byte >= 0) { dst[written++] = static_cast(byte); i += 3; continue; } + dst[written++] = src[i++]; + } else { + dst[written++] = src[i++]; } - dst[written++] = src[i++]; } return written; } -bool NeedsUrlDecode(std::string_view sv) { - return sv.find('%') != std::string_view::npos; -} - void AppendJsonEscaped(std::string& out, std::string_view sv) { for (char c : sv) { switch (c) { @@ -135,10 +142,17 @@ ParseResult ParseSqlcommenter(std::string_view comment) { continue; ++pos; - // Parse value: read until closing single quote + // Parse value: read until unescaped closing single quote. + // Per sqlcommenter spec, \' is an escaped quote inside the value. size_t val_start = pos; - while (pos < comment.size() && comment[pos] != '\'') { - ++pos; + while (pos < comment.size()) { + if (comment[pos] == '\\' && pos + 1 < comment.size() && comment[pos + 1] == '\'') { + pos += 2; // skip escaped quote + } else if (comment[pos] == '\'') { + break; + } else { + ++pos; + } } if (pos >= comment.size()) break; @@ -146,15 +160,15 @@ ParseResult ParseSqlcommenter(std::string_view comment) { ++pos; Label& label = result.labels[result.count]; - if (NeedsUrlDecode(raw_key)) { - size_t len = UrlDecode(raw_key, label.decoded_key, kMaxKeyLen); + if (NeedsDecode(raw_key)) { + size_t len = MetaUnescapeAndUrlDecode(raw_key, label.decoded_key, kMaxKeyLen); label.key = std::string_view(label.decoded_key, len); } else { label.key = raw_key.substr(0, kMaxKeyLen); } - if (NeedsUrlDecode(raw_value)) { - size_t len = UrlDecode(raw_value, label.decoded_value, kMaxValueLen); + if (NeedsDecode(raw_value)) { + size_t len = MetaUnescapeAndUrlDecode(raw_value, label.decoded_value, kMaxValueLen); label.value = std::string_view(label.decoded_value, len); } else { label.value = raw_value.substr(0, kMaxValueLen); diff --git a/test/unit/sqlcommenter_parse_test.cc b/test/unit/sqlcommenter_parse_test.cc index bfe9103..8859a0c 100644 --- a/test/unit/sqlcommenter_parse_test.cc +++ b/test/unit/sqlcommenter_parse_test.cc @@ -252,6 +252,108 @@ TEST(ParseSqlcommenter, RealWorldSqlcommenter) { EXPECT_EQ(result.labels[3].value, "pg"); } +// ============================================================================ +// Spec compliance: meta character escaping (\' → ') +// Per sqlcommenter spec, single quotes in values are escaped as \' +// ============================================================================ + +TEST(ParseSqlcommenter, EscapedSingleQuoteInValue) { + // key='it\'s' → value is "it's" + auto result = ParseSqlcommenter("key='it\\'s'"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].key, "key"); + EXPECT_EQ(result.labels[0].value, "it's"); +} + +TEST(ParseSqlcommenter, MultipleEscapedQuotesInValue) { + // key='can\'t won\'t' → value is "can't won't" + auto result = ParseSqlcommenter("key='can\\'t won\\'t'"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].value, "can't won't"); +} + +TEST(ParseSqlcommenter, EscapedQuoteAtValueStart) { + auto result = ParseSqlcommenter("key='\\'hello'"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].value, "'hello"); +} + +TEST(ParseSqlcommenter, EscapedQuoteAtValueEnd) { + auto result = ParseSqlcommenter("key='hello\\''"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].value, "hello'"); +} + +TEST(ParseSqlcommenter, EscapedQuoteWithUrlEncoding) { + // Meta unescaping and URL decoding combined + auto result = ParseSqlcommenter("key='it\\'s%20great'"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].value, "it's great"); +} + +TEST(ParseSqlcommenter, BackslashNotFollowedByQuote) { + // Backslash followed by non-quote char is kept literally + // Input string: key='path\to' (one backslash before 't') + auto result = ParseSqlcommenter("key='path\\to'"); + EXPECT_EQ(result.count, 1); + EXPECT_EQ(result.labels[0].value, "path\\to"); +} + +// ============================================================================ +// Spec compliance: spec exhibit round-trip +// ============================================================================ + +TEST(ParseSqlcommenter, SpecExhibitParsing) { + // From the sqlcommenter spec exhibit: + // action='%2Fparam*d',controller='index',framework='spring', + // traceparent='00-5bd66ef5095369c7b0d1f8f4bd33716a-c532cb4098ac3dd2-01', + // tracestate='congo%3Dt61rcWkgMzE%2Crojo%3D00f067aa0ba902b7' + auto result = ParseSqlcommenter( + "action='%2Fparam*d'," + "controller='index'," + "framework='spring'," + "traceparent='00-5bd66ef5095369c7b0d1f8f4bd33716a-c532cb4098ac3dd2-01'," + "tracestate='congo%3Dt61rcWkgMzE%2Crojo%3D00f067aa0ba902b7'"); + EXPECT_EQ(result.count, 5); + EXPECT_EQ(result.labels[0].key, "action"); + EXPECT_EQ(result.labels[0].value, "/param*d"); + EXPECT_EQ(result.labels[1].key, "controller"); + EXPECT_EQ(result.labels[1].value, "index"); + EXPECT_EQ(result.labels[2].key, "framework"); + EXPECT_EQ(result.labels[2].value, "spring"); + EXPECT_EQ(result.labels[3].key, "traceparent"); + EXPECT_EQ(result.labels[3].value, + "00-5bd66ef5095369c7b0d1f8f4bd33716a-c532cb4098ac3dd2-01"); + EXPECT_EQ(result.labels[4].key, "tracestate"); + EXPECT_EQ(result.labels[4].value, "congo=t61rcWkgMzE,rojo=00f067aa0ba902b7"); +} + +TEST(ParseSqlcommenter, SpecExhibitFullPipeline) { + // Full pipeline: extract comment from the spec exhibit SQL, parse, serialize + std::string_view query = + "SELECT * FROM FOO " + "/*action='%2Fparam*d',controller='index',framework='spring'," + "traceparent='00-5bd66ef5095369c7b0d1f8f4bd33716a-c532cb4098ac3dd2-01'," + "tracestate='congo%3Dt61rcWkgMzE%2Crojo%3D00f067aa0ba902b7'*/"; + auto comment = ExtractLastComment(query); + EXPECT_FALSE(comment.empty()); + auto parsed = ParseSqlcommenter(comment); + EXPECT_EQ(parsed.count, 5); + EXPECT_EQ(parsed.labels[0].value, "/param*d"); + EXPECT_EQ(parsed.labels[4].value, "congo=t61rcWkgMzE,rojo=00f067aa0ba902b7"); +} + +TEST(ParseSqlcommenter, UrlEncodedSpecialChars) { + // Slash, equals, ampersand — common in route and tracestate values + auto result = ParseSqlcommenter( + "route='%2Fpolls%201000',state='k1%3Dv1%26k2%3Dv2'"); + EXPECT_EQ(result.count, 2); + EXPECT_EQ(result.labels[0].key, "route"); + EXPECT_EQ(result.labels[0].value, "/polls 1000"); + EXPECT_EQ(result.labels[1].key, "state"); + EXPECT_EQ(result.labels[1].value, "k1=v1&k2=v2"); +} + // ============================================================================ // SerializeLabelsJson tests // ============================================================================ From 1feb6cf2a8316879a6795faaa2fbb75482f01ad4 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 13 Apr 2026 09:52:52 -0500 Subject: [PATCH 03/11] Add comprehensive sqlcommenter spec compliance tests - Real-world Django ORM sample (6 labels with URL-encoded framework version, route, traceparent, tracestate) - Multi-line comment extraction and parsing - Multiple comment blocks (last block wins) - Non-sqlcommenter last comment returns no labels - -- style comments don't interfere with /* */ extraction - Full end-to-end Django pipeline with JSON verification 61 tests total (up from 54). --- test/unit/sqlcommenter_parse_test.cc | 100 +++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/test/unit/sqlcommenter_parse_test.cc b/test/unit/sqlcommenter_parse_test.cc index 8859a0c..9b706d3 100644 --- a/test/unit/sqlcommenter_parse_test.cc +++ b/test/unit/sqlcommenter_parse_test.cc @@ -354,6 +354,106 @@ TEST(ParseSqlcommenter, UrlEncodedSpecialChars) { EXPECT_EQ(result.labels[1].value, "k1=v1&k2=v2"); } +// ============================================================================ +// Spec compliance: real-world samples and edge cases +// ============================================================================ + +TEST(ParseSqlcommenter, RealWorldDjangoSample) { + // Exact sample from the sqlcommenter homepage (Django ORM output) + auto result = ParseSqlcommenter( + "controller='index',db_driver='django.db.backends.postgresql'," + "framework='django%3A2.2.1',route='%5Epolls/%24'," + "traceparent='00-5bd66ef5095369c7b0d1f8f4bd33716a-c532cb4098ac3dd2-01'," + "tracestate='congo%3Dt61rcWkgMzE%2Crojo%3D00f067aa0ba902b7'"); + EXPECT_EQ(result.count, 6); + EXPECT_EQ(result.labels[0].key, "controller"); + EXPECT_EQ(result.labels[0].value, "index"); + EXPECT_EQ(result.labels[1].key, "db_driver"); + EXPECT_EQ(result.labels[1].value, "django.db.backends.postgresql"); + EXPECT_EQ(result.labels[2].key, "framework"); + EXPECT_EQ(result.labels[2].value, "django:2.2.1"); + EXPECT_EQ(result.labels[3].key, "route"); + EXPECT_EQ(result.labels[3].value, "^polls/$"); + EXPECT_EQ(result.labels[4].key, "traceparent"); + EXPECT_EQ(result.labels[4].value, + "00-5bd66ef5095369c7b0d1f8f4bd33716a-c532cb4098ac3dd2-01"); + EXPECT_EQ(result.labels[5].key, "tracestate"); + EXPECT_EQ(result.labels[5].value, "congo=t61rcWkgMzE,rojo=00f067aa0ba902b7"); +} + +TEST(ExtractLastComment, MultiLineComment) { + // sqlcommenter comments can span multiple lines in logs + auto result = ExtractLastComment( + "INSERT INTO t1 VALUES ($1, $2)\n" + "/*controller='index',\n" + "framework='django%3A2.2.1'*/"); + EXPECT_EQ(result, + "controller='index',\n" + "framework='django%3A2.2.1'"); +} + +TEST(ExtractLastComment, MultipleBlocksReturnsLast) { + // Non-sqlcommenter comment followed by sqlcommenter — only last is returned + auto result = ExtractLastComment( + "SELECT /*+ IndexScan(t1) */ * FROM t1 " + "/* controller='index',action='show' */"); + EXPECT_EQ(result, " controller='index',action='show' "); +} + +TEST(ExtractLastComment, DashDashCommentIgnored) { + // -- style comments don't affect /* */ extraction + auto result = ExtractLastComment( + "SELECT * FROM t1 -- this is a line comment\n" + "/* controller='index' */"); + EXPECT_EQ(result, " controller='index' "); +} + +TEST(EndToEnd, RealWorldDjangoFullPipeline) { + // Full pipeline with the real-world Django sample from the sqlcommenter docs + std::string_view query = + "INSERT INTO \"polls_question\" (\"question_text\", \"pub_date\") VALUES " + "($1, $2) RETURNING \"polls_question\".\"id\" " + "/*controller='index',db_driver='django.db.backends.postgresql'," + "framework='django%3A2.2.1',route='%5Epolls/%24'," + "traceparent='00-5bd66ef5095369c7b0d1f8f4bd33716a-c532cb4098ac3dd2-01'," + "tracestate='congo%3Dt61rcWkgMzE%2Crojo%3D00f067aa0ba902b7'*/"; + auto comment = ExtractLastComment(query); + EXPECT_FALSE(comment.empty()); + auto parsed = ParseSqlcommenter(comment); + EXPECT_EQ(parsed.count, 6); + EXPECT_EQ(parsed.labels[2].value, "django:2.2.1"); + EXPECT_EQ(parsed.labels[3].value, "^polls/$"); + std::string json = SerializeLabelsJson(parsed); + // Verify JSON contains all keys + EXPECT_NE(json.find("\"controller\""), std::string::npos); + EXPECT_NE(json.find("\"traceparent\""), std::string::npos); + EXPECT_NE(json.find("\"tracestate\""), std::string::npos); +} + +TEST(EndToEnd, NonSqlcommenterLastComment) { + // If the last comment is not sqlcommenter format, no labels are extracted + std::string_view query = + "SELECT * FROM t1 /* controller='index' */ WHERE id = $1 /* regular comment */"; + auto comment = ExtractLastComment(query); + EXPECT_EQ(comment, " regular comment "); + auto parsed = ParseSqlcommenter(comment); + EXPECT_EQ(parsed.count, 0); +} + +TEST(EndToEnd, MultiLineCommentParsing) { + std::string_view query = + "SELECT $1\n" + "/*controller='index',\n" + "framework='django%3A2.2.1',\n" + "route='%5Epolls/%24'*/"; + auto comment = ExtractLastComment(query); + auto parsed = ParseSqlcommenter(comment); + EXPECT_EQ(parsed.count, 3); + EXPECT_EQ(parsed.labels[0].value, "index"); + EXPECT_EQ(parsed.labels[1].value, "django:2.2.1"); + EXPECT_EQ(parsed.labels[2].value, "^polls/$"); +} + // ============================================================================ // SerializeLabelsJson tests // ============================================================================ From f1cd581009aeec8133d9824976ed5c3420b0e4ed Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 13 Apr 2026 09:59:25 -0500 Subject: [PATCH 04/11] Refactor sqlcommenter parser for readability Extract Scanner struct with SkipWhitespace/Consume/ScanKey/ScanQuotedValue methods. Add DecodeField helper to deduplicate key/value decode logic. Replace inline character checks with IsWhitespace/IsKeyTerminator. Use std::from_chars for hex decoding. --- src/export/sqlcommenter_parse.cc | 174 +++++++++++++++++-------------- 1 file changed, 95 insertions(+), 79 deletions(-) diff --git a/src/export/sqlcommenter_parse.cc b/src/export/sqlcommenter_parse.cc index 9050961..a4409d2 100644 --- a/src/export/sqlcommenter_parse.cc +++ b/src/export/sqlcommenter_parse.cc @@ -1,31 +1,32 @@ #include "export/sqlcommenter_parse.h" +#include #include namespace { int DecodeHexByte(char hi, char lo) { - auto hex = [](char c) -> int { - if (c >= '0' && c <= '9') - return c - '0'; - if (c >= 'a' && c <= 'f') - return c - 'a' + 10; - if (c >= 'A' && c <= 'F') - return c - 'A' + 10; + char buf[2] = {hi, lo}; + unsigned value = 0; + auto [ptr, ec] = std::from_chars(buf, buf + 2, value, 16); + if (ec != std::errc{} || ptr != buf + 2) return -1; - }; - int h = hex(hi); - int l = hex(lo); - if (h < 0 || l < 0) - return -1; - return (h << 4) | l; + return static_cast(value); +} + +bool IsWhitespace(char c) { + return c == ' ' || c == '\t' || c == '\n' || c == '\r'; +} + +bool IsKeyTerminator(char c) { + return c == '=' || c == ',' || IsWhitespace(c); } bool NeedsDecode(std::string_view sv) { return sv.find('%') != std::string_view::npos || sv.find('\\') != std::string_view::npos; } -// Meta-unescape (\' → ') and URL-decode in a single pass. +// Meta-unescape (\' -> ') and URL-decode (%XX) in a single pass. // Per sqlcommenter spec, meta unescaping happens before URL decoding, // but the two transforms don't overlap so a combined pass is equivalent. size_t MetaUnescapeAndUrlDecode(std::string_view src, char* dst, size_t max_len) { @@ -50,6 +51,15 @@ size_t MetaUnescapeAndUrlDecode(std::string_view src, char* dst, size_t max_len) return written; } +// Decode a raw field (meta-unescape + URL-decode) into buf, or truncate if no decoding needed. +std::string_view DecodeField(std::string_view raw, char* buf, size_t max_len) { + if (NeedsDecode(raw)) { + size_t len = MetaUnescapeAndUrlDecode(raw, buf, max_len); + return {buf, len}; + } + return raw.substr(0, max_len); +} + void AppendJsonEscaped(std::string& out, std::string_view sv) { for (char c : sv) { switch (c) { @@ -86,6 +96,56 @@ void AppendJsonEscaped(std::string& out, std::string_view sv) { } } +// Lightweight scanner for walking through a sqlcommenter comment. +struct Scanner { + std::string_view text; + size_t pos = 0; + + bool AtEnd() const { return pos >= text.size(); } + + void SkipWhitespace() { + while (!AtEnd() && IsWhitespace(text[pos])) + ++pos; + } + + // Consume a specific character; returns false without advancing if not matched. + bool Consume(char expected) { + if (AtEnd() || text[pos] != expected) + return false; + ++pos; + return true; + } + + // Scan a key: sequence of non-terminator characters. + std::string_view ScanKey() { + size_t start = pos; + while (!AtEnd() && !IsKeyTerminator(text[pos])) + ++pos; + return text.substr(start, pos - start); + } + + // Scan a single-quoted value, handling \' escapes per sqlcommenter spec. + // Assumes the opening quote was already consumed. + // Returns the raw content (before decoding). Sets *ok = false on unterminated quote. + std::string_view ScanQuotedValue(bool* ok) { + size_t start = pos; + while (!AtEnd()) { + if (text[pos] == '\\' && pos + 1 < text.size() && text[pos + 1] == '\'') { + pos += 2; + } else if (text[pos] == '\'') { + auto val = text.substr(start, pos - start); + ++pos; + *ok = true; + return val; + } else { + ++pos; + } + } + *ok = false; + return {}; + } +}; + } // namespace std::string_view ExtractLastComment(std::string_view query) { @@ -105,85 +165,41 @@ std::string_view ExtractLastComment(std::string_view query) { ParseResult ParseSqlcommenter(std::string_view comment) { ParseResult result; - size_t pos = 0; + Scanner scan{comment}; - auto skip_ws = [&]() { - while (pos < comment.size() && (comment[pos] == ' ' || comment[pos] == '\t' || - comment[pos] == '\n' || comment[pos] == '\r')) { - ++pos; - } - }; - - while (pos < comment.size() && result.count < kMaxLabels) { - skip_ws(); - if (pos >= comment.size()) - break; + while (!scan.AtEnd() && result.count < kMaxLabels) { + scan.SkipWhitespace(); - // Parse key: read until '=', whitespace, or ',' - size_t key_start = pos; - while (pos < comment.size() && comment[pos] != '=' && comment[pos] != ' ' && - comment[pos] != '\t' && comment[pos] != '\n' && comment[pos] != '\r' && - comment[pos] != ',') { - ++pos; - } - if (pos == key_start) { - ++pos; + auto raw_key = scan.ScanKey(); + if (raw_key.empty()) { + if (!scan.AtEnd()) + ++scan.pos; continue; } - std::string_view raw_key = comment.substr(key_start, pos - key_start); - skip_ws(); - if (pos >= comment.size() || comment[pos] != '=') + scan.SkipWhitespace(); + if (!scan.Consume('=')) continue; - ++pos; - - skip_ws(); - if (pos >= comment.size() || comment[pos] != '\'') + scan.SkipWhitespace(); + if (!scan.Consume('\'')) continue; - ++pos; - // Parse value: read until unescaped closing single quote. - // Per sqlcommenter spec, \' is an escaped quote inside the value. - size_t val_start = pos; - while (pos < comment.size()) { - if (comment[pos] == '\\' && pos + 1 < comment.size() && comment[pos + 1] == '\'') { - pos += 2; // skip escaped quote - } else if (comment[pos] == '\'') { - break; - } else { - ++pos; - } - } - if (pos >= comment.size()) + bool ok = false; + auto raw_value = scan.ScanQuotedValue(&ok); + if (!ok) break; - std::string_view raw_value = comment.substr(val_start, pos - val_start); - ++pos; - Label& label = result.labels[result.count]; - if (NeedsDecode(raw_key)) { - size_t len = MetaUnescapeAndUrlDecode(raw_key, label.decoded_key, kMaxKeyLen); - label.key = std::string_view(label.decoded_key, len); - } else { - label.key = raw_key.substr(0, kMaxKeyLen); - } - - if (NeedsDecode(raw_value)) { - size_t len = MetaUnescapeAndUrlDecode(raw_value, label.decoded_value, kMaxValueLen); - label.value = std::string_view(label.decoded_value, len); - } else { - label.value = raw_value.substr(0, kMaxValueLen); - } - - ++result.count; + Label& label = result.labels[result.count++]; + label.key = DecodeField(raw_key, label.decoded_key, kMaxKeyLen); + label.value = DecodeField(raw_value, label.decoded_value, kMaxValueLen); - skip_ws(); - if (pos < comment.size() && comment[pos] == ',') - ++pos; + scan.SkipWhitespace(); + scan.Consume(','); } if (result.count == kMaxLabels) { - skip_ws(); - if (pos < comment.size()) + scan.SkipWhitespace(); + if (!scan.AtEnd()) result.truncated = true; } From 0b9cf76c2d6b75d7e52f387c666b9d878748f047 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 13 Apr 2026 10:11:03 -0500 Subject: [PATCH 05/11] Use nlohmann-json (already vendored by otel-cpp) for JSON serialization Replaces hand-rolled AppendJsonEscaped + string-concat JSON builder with nlohmann::ordered_json. Escaping is now handled by a well-tested library instead of custom code. --- CMakeLists.txt | 4 +++ src/export/sqlcommenter_parse.cc | 55 +++----------------------------- 2 files changed, 9 insertions(+), 50 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 878835d..6372ece 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,6 +79,7 @@ target_include_directories(pg_stat_ch SYSTEM PRIVATE ${CMAKE_SOURCE_DIR}/third_party/opentelemetry-cpp/api/include ${CMAKE_SOURCE_DIR}/third_party/opentelemetry-cpp/sdk/include ${CMAKE_SOURCE_DIR}/third_party/opentelemetry-cpp/exporters/otlp/include + ${CMAKE_SOURCE_DIR}/third_party/opentelemetry-cpp/third_party/nlohmann-json/single_include ) target_link_libraries(pg_stat_ch PRIVATE PostgreSQLServer::PostgreSQLServer @@ -154,6 +155,9 @@ if(PSCH_BUILD_UNIT_TESTS) src/export/sqlcommenter_parse.cc ) target_include_directories(sqlcommenter_parse_test PRIVATE src include) + target_include_directories(sqlcommenter_parse_test SYSTEM PRIVATE + ${CMAKE_SOURCE_DIR}/third_party/opentelemetry-cpp/third_party/nlohmann-json/single_include + ) target_compile_features(sqlcommenter_parse_test PRIVATE cxx_std_17) target_link_libraries(sqlcommenter_parse_test PRIVATE GTest::gtest_main) pg_stat_ch_set_warnings(sqlcommenter_parse_test) diff --git a/src/export/sqlcommenter_parse.cc b/src/export/sqlcommenter_parse.cc index a4409d2..e5de3c9 100644 --- a/src/export/sqlcommenter_parse.cc +++ b/src/export/sqlcommenter_parse.cc @@ -1,7 +1,8 @@ #include "export/sqlcommenter_parse.h" #include -#include + +#include namespace { @@ -60,42 +61,6 @@ std::string_view DecodeField(std::string_view raw, char* buf, size_t max_len) { return raw.substr(0, max_len); } -void AppendJsonEscaped(std::string& out, std::string_view sv) { - for (char c : sv) { - switch (c) { - case '"': - out += "\\\""; - break; - case '\\': - out += "\\\\"; - break; - case '\b': - out += "\\b"; - break; - case '\f': - out += "\\f"; - break; - case '\n': - out += "\\n"; - break; - case '\r': - out += "\\r"; - break; - case '\t': - out += "\\t"; - break; - default: - if (static_cast(c) < 0x20) { - char buf[8]; - std::snprintf(buf, sizeof(buf), "\\u%04x", static_cast(c)); - out += buf; - } else { - out += c; - } - } - } -} - // Lightweight scanner for walking through a sqlcommenter comment. struct Scanner { std::string_view text; @@ -207,19 +172,9 @@ ParseResult ParseSqlcommenter(std::string_view comment) { } std::string SerializeLabelsJson(const ParseResult& result) { - if (result.count == 0) - return "{}"; - - std::string json = "{"; + nlohmann::ordered_json obj = nlohmann::ordered_json::object(); for (int i = 0; i < result.count; ++i) { - if (i > 0) - json += ','; - json += '"'; - AppendJsonEscaped(json, result.labels[i].key); - json += "\":\""; - AppendJsonEscaped(json, result.labels[i].value); - json += '"'; + obj[std::string(result.labels[i].key)] = std::string(result.labels[i].value); } - json += '}'; - return json; + return obj.dump(-1, ' ', false, nlohmann::ordered_json::error_handler_t::replace); } From 84c436b9f00d69124d95e2bb082d62a7d9acbf21 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 13 Apr 2026 10:23:16 -0500 Subject: [PATCH 06/11] Fix TAP tests: labels column type, OTel metric names, test numbering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Change labels column from JSON(max_dynamic_paths=64) to String DEFAULT '{}' — clickhouse-cpp doesn't support the JSON column type - Fix OTel export test: duration metric renamed to pg_stat_ch_db_client_operation_duration_seconds, label db → db_name - Fix OTel precedence warning: !system(...) == 0 → system(...) != 0 - Fix query labels test: use JSONExtractString instead of subpath syntax, use DDL instead of normalized literals for no-comment test - Rename 030_query_labels.pl → 032_query_labels.pl (avoid collisions with 030_nested_query_normalization and 031_standby_export) --- docker/init/00-schema.sql | 2 +- docker/migrations/001_add_labels_column.sql | 2 +- docs/reference/events-schema.mdx | 2 +- t/024_otel_export.pl | 8 ++++---- t/025_otel_reconnect.pl | 2 +- t/{030_query_labels.pl => 032_query_labels.pl} | 13 +++++++------ 6 files changed, 15 insertions(+), 14 deletions(-) rename t/{030_query_labels.pl => 032_query_labels.pl} (91%) diff --git a/docker/init/00-schema.sql b/docker/init/00-schema.sql index 9482c42..9aca1f6 100644 --- a/docker/init/00-schema.sql +++ b/docker/init/00-schema.sql @@ -59,7 +59,7 @@ CREATE TABLE pg_stat_ch.events_raw query String COMMENT 'Full SQL query text (may be truncated). Used for debugging and query analysis.', - labels JSON(max_dynamic_paths=64) COMMENT 'Query labels from sqlcommenter comments (key=value pairs in /* */ blocks). Access subpaths directly: labels.controller, labels.action. Empty {} when no labels present. See: https://google.github.io/sqlcommenter/', + labels String DEFAULT '{}' COMMENT 'Query labels from sqlcommenter comments (key=value pairs in /* */ blocks). Access subpaths directly: labels.controller, labels.action. Empty {} when no labels present. See: https://google.github.io/sqlcommenter/', -- ======================================================================== -- Shared buffer metrics (main buffer cache) diff --git a/docker/migrations/001_add_labels_column.sql b/docker/migrations/001_add_labels_column.sql index 16f73a4..ef4c626 100644 --- a/docker/migrations/001_add_labels_column.sql +++ b/docker/migrations/001_add_labels_column.sql @@ -10,6 +10,6 @@ -- Safe to re-run: ALTER TABLE ADD COLUMN IF NOT EXISTS is idempotent. ALTER TABLE pg_stat_ch.events_raw - ADD COLUMN IF NOT EXISTS labels JSON(max_dynamic_paths=64) + ADD COLUMN IF NOT EXISTS labels String DEFAULT '{}' COMMENT 'Query labels from sqlcommenter comments (key=value pairs in /* */ blocks). Access subpaths directly: labels.controller, labels.action. Empty {} when no labels present. See: https://google.github.io/sqlcommenter/' AFTER query; diff --git a/docs/reference/events-schema.mdx b/docs/reference/events-schema.mdx index bf2fcfd..41149ad 100644 --- a/docs/reference/events-schema.mdx +++ b/docs/reference/events-schema.mdx @@ -34,7 +34,7 @@ Query normalization replaces literals with placeholders (`$N`). This means `SELE | Column | Type | Description | |---|---|---| -| `labels` | `JSON(max_dynamic_paths=64)` | Key-value labels extracted from [sqlcommenter](https://google.github.io/sqlcommenter/) comments appended to the query. For example, `/* controller='users',action='show' */` produces `{"controller":"users","action":"show"}`. Access subpaths directly in ClickHouse: `labels.controller`, `labels.action`. Empty `{}` when no labels are present. | +| `labels` | `String DEFAULT '{}'` | Key-value labels extracted from [sqlcommenter](https://google.github.io/sqlcommenter/) comments appended to the query. For example, `/* controller='users',action='show' */` produces `{"controller":"users","action":"show"}`. Access subpaths directly in ClickHouse: `labels.controller`, `labels.action`. Empty `{}` when no labels are present. | Labels are parsed from the **last** `/* */` comment block in the query text. The parser supports URL-encoded values and escaped single quotes per the sqlcommenter specification. Controlled by the [`pg_stat_ch.track_labels`](/reference/configuration) GUC (default: `true`). diff --git a/t/024_otel_export.pl b/t/024_otel_export.pl index e5e7e60..a923c5b 100755 --- a/t/024_otel_export.pl +++ b/t/024_otel_export.pl @@ -14,7 +14,7 @@ use psch; # Skip if Docker not available -if (!system("docker ps >/dev/null 2>&1") == 0) { +if (system("docker ps >/dev/null 2>&1") != 0) { plan skip_all => 'Docker not available, skipping OTel tests'; } @@ -49,7 +49,7 @@ is($stats->{send_failures}, 0, 'No send failures'); # Verify metrics arrived at the collector (Prometheus endpoint) - my $count = psch_get_otel_histogram_total('pg_stat_ch_duration_us_unit'); + my $count = psch_get_otel_histogram_total('pg_stat_ch_db_client_operation_duration_seconds'); cmp_ok($count, '>=', 3, "duration_us metric has >= 3 observations (got $count)"); }; @@ -103,7 +103,7 @@ 'metrics carry job="pg_stat_ch" label (from service.name resource attribute)'); # db tag should appear as a label on duration_us observations - ok(psch_otel_metric_has_label('pg_stat_ch_duration_us_unit', 'db', 'postgres'), + ok(psch_otel_metric_has_label('pg_stat_ch_db_client_operation_duration_seconds', 'db_name', 'postgres'), 'duration_us metric has db="postgres" label'); # rows metric captures how many rows were returned/affected @@ -114,7 +114,7 @@ my $sum = 0; for my $line (split /\n/, $prometheus_output) { next if $line =~ /^#/; - if ($line =~ /^pg_stat_ch_duration_us_unit_sum(?:\{[^}]*\})?\s+(\d+(?:\.\d+)?)/) { + if ($line =~ /^pg_stat_ch_db_client_operation_duration_seconds_sum(?:\{[^}]*\})?\s+(\S+)/) { $sum += $1; } } diff --git a/t/025_otel_reconnect.pl b/t/025_otel_reconnect.pl index 2ac042c..f0cad4b 100755 --- a/t/025_otel_reconnect.pl +++ b/t/025_otel_reconnect.pl @@ -14,7 +14,7 @@ use psch; # Skip if Docker not available -if (!system("docker ps >/dev/null 2>&1") == 0) { +if (system("docker ps >/dev/null 2>&1") != 0) { plan skip_all => 'Docker not available, skipping OTel tests'; } diff --git a/t/030_query_labels.pl b/t/032_query_labels.pl similarity index 91% rename from t/030_query_labels.pl rename to t/032_query_labels.pl index 110a7db..8c25a94 100644 --- a/t/030_query_labels.pl +++ b/t/032_query_labels.pl @@ -59,13 +59,14 @@ psch_query_clickhouse("TRUNCATE TABLE pg_stat_ch.events_raw"); psch_reset_stats($node); - $node->safe_psql('postgres', 'SELECT 42'); + $node->safe_psql('postgres', 'CREATE TABLE nocomment_test(id int)'); + $node->safe_psql('postgres', 'DROP TABLE nocomment_test'); $node->safe_psql('postgres', 'SELECT pg_stat_ch_flush()'); my $labels = psch_wait_for_clickhouse_query( "SELECT labels FROM pg_stat_ch.events_raw " - . "WHERE query LIKE '%42%' LIMIT 1", - sub { $_[0] ne '' }, + . "WHERE query LIKE '%nocomment_test%' LIMIT 1", + sub { defined $_[0] && $_[0] ne '' }, 10 ); @@ -82,9 +83,9 @@ $node->safe_psql('postgres', 'SELECT pg_stat_ch_flush()'); - # Query individual label subpaths via ClickHouse JSON syntax + # Query individual label values via JSONExtractString my $controller = psch_wait_for_clickhouse_query( - "SELECT labels.controller FROM pg_stat_ch.events_raw " + "SELECT JSONExtractString(labels, 'controller') FROM pg_stat_ch.events_raw " . "WHERE query LIKE '%orders%' LIMIT 1", sub { $_[0] =~ /orders/ }, 10 @@ -92,7 +93,7 @@ like($controller, qr/orders/, 'labels.controller = orders'); my $framework = psch_query_clickhouse( - "SELECT labels.framework FROM pg_stat_ch.events_raw " + "SELECT JSONExtractString(labels, 'framework') FROM pg_stat_ch.events_raw " . "WHERE query LIKE '%orders%' LIMIT 1"); like($framework, qr/rails/, 'labels.framework = rails'); }; From 19f3018771d6503267475c42aae03011a18d1fd3 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 13 Apr 2026 10:23:48 -0500 Subject: [PATCH 07/11] ci: add ClickHouse and OTel TAP test jobs New CI jobs that run the integration TAP tests against PG 18: - clickhouse-tap: starts ClickHouse via docker-compose, runs 010_clickhouse_export and 011_clickhouse_reconnect - otel-tap: starts OTel collector via docker-compose, runs 024_otel_export and 025_otel_reconnect --- .github/workflows/ci.yml | 96 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 13d908b..041445b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -93,6 +93,102 @@ jobs: specs/results/ specs/tmp_check/log/ + clickhouse-tap: + name: ClickHouse TAP Tests + runs-on: depot-ubuntu-24.04-16 + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + + - name: Add PostgreSQL APT repository + run: | + sudo apt-get update + sudo apt-get install -y curl ca-certificates gnupg + curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/apt.postgresql.org.gpg >/dev/null + echo "deb [arch=amd64] http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list + sudo apt-get update + + - name: Install PostgreSQL 18 + run: sudo apt-get install -y postgresql-18 postgresql-server-dev-18 + + - name: Install build dependencies + run: sudo apt-get install -y cmake ninja-build g++ libssl-dev + + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: clickhouse-tap + max-size: 500M + + - name: Build and install extension + run: | + cmake --preset default \ + -DPG_CONFIG=/usr/lib/postgresql/18/bin/pg_config \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache + cmake --build build --parallel + sudo cmake --install build + + - name: Start ClickHouse + run: docker compose -f docker/docker-compose.test.yml up -d --wait + + - name: Run ClickHouse TAP tests + run: ./scripts/run-tests.sh 18 clickhouse + + - name: Cleanup + if: always() + run: docker compose -f docker/docker-compose.test.yml down --volumes 2>/dev/null || true + + otel-tap: + name: OTel TAP Tests + runs-on: depot-ubuntu-24.04-16 + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + + - name: Add PostgreSQL APT repository + run: | + sudo apt-get update + sudo apt-get install -y curl ca-certificates gnupg + curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/apt.postgresql.org.gpg >/dev/null + echo "deb [arch=amd64] http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list + sudo apt-get update + + - name: Install PostgreSQL 18 + run: sudo apt-get install -y postgresql-18 postgresql-server-dev-18 + + - name: Install build dependencies + run: sudo apt-get install -y cmake ninja-build g++ libssl-dev + + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: otel-tap + max-size: 500M + + - name: Build and install extension + run: | + cmake --preset default \ + -DPG_CONFIG=/usr/lib/postgresql/18/bin/pg_config \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache + cmake --build build --parallel + sudo cmake --install build + + - name: Start OTel collector + run: docker compose -f docker/docker-compose.otel.yml up -d --wait + + - name: Run OTel TAP tests + run: ./scripts/run-tests.sh 18 otel + + - name: Cleanup + if: always() + run: docker compose -f docker/docker-compose.otel.yml down --volumes 2>/dev/null || true + unit-tests: name: C++ Unit Tests runs-on: depot-ubuntu-24.04-16 From d42738d8584380e61f3aef466c04fa07248cfac2 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 13 Apr 2026 10:24:56 -0500 Subject: [PATCH 08/11] ci: use docker-compose for TAP tests, add OTel collector - Revert redundant clickhouse-tap/otel-tap jobs from ci.yml (ci-tap.yml already runs all TAP tests) - Switch ci-tap.yml from raw docker run to docker-compose so port mappings match what the tests expect (18123/19000) - Start OTel collector alongside ClickHouse so OTel TAP tests run instead of being skipped --- .github/workflows/ci-tap.yml | 30 +++-------- .github/workflows/ci.yml | 96 ------------------------------------ 2 files changed, 7 insertions(+), 119 deletions(-) diff --git a/.github/workflows/ci-tap.yml b/.github/workflows/ci-tap.yml index 0d7727a..259f316 100644 --- a/.github/workflows/ci-tap.yml +++ b/.github/workflows/ci-tap.yml @@ -73,28 +73,10 @@ jobs: cmake --build build --parallel cmake --install build - - name: Start ClickHouse + - name: Start ClickHouse and OTel collector run: | - docker rm -f clickhouse-test 2>/dev/null || true - docker run -d --name clickhouse-test \ - --network host \ - clickhouse/clickhouse-server:26.1 - # Wait for ClickHouse to be ready - for i in {1..30}; do - if curl -sf 'http://localhost:8123/' --data 'SELECT 1' >/dev/null 2>&1; then - echo "ClickHouse ready" - exit 0 - fi - echo "Waiting for ClickHouse... ($i/30)" - sleep 1 - done - echo "ClickHouse not ready after 30s" - docker logs clickhouse-test - exit 1 - - - name: Initialize ClickHouse schema - run: | - docker exec clickhouse-test clickhouse-client --multiquery < docker/init/00-schema.sql + docker compose -f docker/docker-compose.test.yml up -d --wait + docker compose -f docker/docker-compose.otel.yml up -d --wait - name: Run TAP tests run: | @@ -110,6 +92,8 @@ jobs: name: tap-test-logs path: t/tmp_check/ - - name: Cleanup ClickHouse + - name: Cleanup if: always() - run: docker rm -f clickhouse-test 2>/dev/null || true + run: | + docker compose -f docker/docker-compose.test.yml down --volumes 2>/dev/null || true + docker compose -f docker/docker-compose.otel.yml down --volumes 2>/dev/null || true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 041445b..13d908b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -93,102 +93,6 @@ jobs: specs/results/ specs/tmp_check/log/ - clickhouse-tap: - name: ClickHouse TAP Tests - runs-on: depot-ubuntu-24.04-16 - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - fetch-depth: 0 - - - name: Add PostgreSQL APT repository - run: | - sudo apt-get update - sudo apt-get install -y curl ca-certificates gnupg - curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/apt.postgresql.org.gpg >/dev/null - echo "deb [arch=amd64] http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list - sudo apt-get update - - - name: Install PostgreSQL 18 - run: sudo apt-get install -y postgresql-18 postgresql-server-dev-18 - - - name: Install build dependencies - run: sudo apt-get install -y cmake ninja-build g++ libssl-dev - - - name: Setup ccache - uses: hendrikmuhs/ccache-action@v1.2 - with: - key: clickhouse-tap - max-size: 500M - - - name: Build and install extension - run: | - cmake --preset default \ - -DPG_CONFIG=/usr/lib/postgresql/18/bin/pg_config \ - -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --parallel - sudo cmake --install build - - - name: Start ClickHouse - run: docker compose -f docker/docker-compose.test.yml up -d --wait - - - name: Run ClickHouse TAP tests - run: ./scripts/run-tests.sh 18 clickhouse - - - name: Cleanup - if: always() - run: docker compose -f docker/docker-compose.test.yml down --volumes 2>/dev/null || true - - otel-tap: - name: OTel TAP Tests - runs-on: depot-ubuntu-24.04-16 - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - fetch-depth: 0 - - - name: Add PostgreSQL APT repository - run: | - sudo apt-get update - sudo apt-get install -y curl ca-certificates gnupg - curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/apt.postgresql.org.gpg >/dev/null - echo "deb [arch=amd64] http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list - sudo apt-get update - - - name: Install PostgreSQL 18 - run: sudo apt-get install -y postgresql-18 postgresql-server-dev-18 - - - name: Install build dependencies - run: sudo apt-get install -y cmake ninja-build g++ libssl-dev - - - name: Setup ccache - uses: hendrikmuhs/ccache-action@v1.2 - with: - key: otel-tap - max-size: 500M - - - name: Build and install extension - run: | - cmake --preset default \ - -DPG_CONFIG=/usr/lib/postgresql/18/bin/pg_config \ - -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake --build build --parallel - sudo cmake --install build - - - name: Start OTel collector - run: docker compose -f docker/docker-compose.otel.yml up -d --wait - - - name: Run OTel TAP tests - run: ./scripts/run-tests.sh 18 otel - - - name: Cleanup - if: always() - run: docker compose -f docker/docker-compose.otel.yml down --volumes 2>/dev/null || true - unit-tests: name: C++ Unit Tests runs-on: depot-ubuntu-24.04-16 From a6af354d199d73f0b639ff32f87d8d94544c5c35 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 13 Apr 2026 10:34:20 -0500 Subject: [PATCH 09/11] ci: fix OTel collector healthcheck for distroless image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The otel-collector-contrib image is distroless — no shell, wget, or curl available for Docker healthchecks. Remove the in-container healthcheck and poll from the host via curl in CI instead. --- .github/workflows/ci-tap.yml | 11 ++++++++++- docker/docker-compose.otel.yml | 7 ++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci-tap.yml b/.github/workflows/ci-tap.yml index 259f316..5a6ac03 100644 --- a/.github/workflows/ci-tap.yml +++ b/.github/workflows/ci-tap.yml @@ -76,7 +76,16 @@ jobs: - name: Start ClickHouse and OTel collector run: | docker compose -f docker/docker-compose.test.yml up -d --wait - docker compose -f docker/docker-compose.otel.yml up -d --wait + docker compose -f docker/docker-compose.otel.yml up -d + # Poll OTel health endpoint (distroless image has no shell for healthcheck) + for i in $(seq 1 30); do + if curl -sf http://localhost:13133/ >/dev/null 2>&1; then + echo "OTel collector ready" + break + fi + echo "Waiting for OTel collector... ($i/30)" + sleep 1 + done - name: Run TAP tests run: | diff --git a/docker/docker-compose.otel.yml b/docker/docker-compose.otel.yml index 4cf52c3..7c5dcd4 100644 --- a/docker/docker-compose.otel.yml +++ b/docker/docker-compose.otel.yml @@ -9,8 +9,5 @@ services: - "4317:4317" # gRPC OTLP receiver (matches psch_otel_endpoint default) - "9091:9090" # Prometheus metrics exporter (host:9091 → container:9090) - "13133:13133" # Health check HTTP endpoint - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:13133/"] - interval: 5s - timeout: 5s - retries: 10 + # Note: no healthcheck — the contrib image is distroless (no shell/curl/wget). + # Use host-side polling (curl localhost:13133) to verify readiness. From 1017e0245bbb80ae5abe0e59399480f6776ab28b Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 13 Apr 2026 10:53:12 -0500 Subject: [PATCH 10/11] Fix OTel TAP test timing: set metric_interval_ms=1s, add sleep The OTel periodic metric reader defaults to 5s export interval. Tests that scrape the Prometheus endpoint need to wait for at least one export cycle. Set otel_metric_interval_ms=1000 in the test helper and add sleep(3) before Prometheus scrapes. --- t/024_otel_export.pl | 11 ++++++++--- t/psch.pm | 14 ++++++++------ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/t/024_otel_export.pl b/t/024_otel_export.pl index a923c5b..4c9cbf2 100755 --- a/t/024_otel_export.pl +++ b/t/024_otel_export.pl @@ -26,8 +26,9 @@ # Initialize node with OTel export enabled my $node = psch_init_node_with_otel('otel_export', - flush_interval_ms => 100, - batch_max => 100, + flush_interval_ms => 100, + batch_max => 100, + otel_metric_interval_ms => 1000, ); # Test 1: Basic export - run queries and verify events are exported @@ -48,6 +49,9 @@ my $stats = psch_get_stats($node); is($stats->{send_failures}, 0, 'No send failures'); + # Wait for OTel periodic metric reader to export (1s interval + margin) + sleep(3); + # Verify metrics arrived at the collector (Prometheus endpoint) my $count = psch_get_otel_histogram_total('pg_stat_ch_db_client_operation_duration_seconds'); cmp_ok($count, '>=', 3, "duration_us metric has >= 3 observations (got $count)"); @@ -95,7 +99,8 @@ $node->safe_psql('postgres', 'DROP TABLE test_otel_labels'); $node->safe_psql('postgres', 'SELECT pg_stat_ch_flush()'); - sleep(2); + # Wait for OTel periodic metric reader to export (1s interval + margin) + sleep(3); # service.name resource attribute is mapped to the job label by the Prometheus exporter my $prometheus_output = `curl -s 'http://localhost:9091/metrics' 2>/dev/null`; diff --git a/t/psch.pm b/t/psch.pm index 51dcb2b..26126dd 100644 --- a/t/psch.pm +++ b/t/psch.pm @@ -240,12 +240,13 @@ sub psch_stop_otelcol { sub psch_init_node_with_otel { my ($name, %opts) = @_; - my $queue_capacity = $opts{queue_capacity} // 65536; - my $flush_interval_ms = $opts{flush_interval_ms} // 100; # Fast flush for tests - my $batch_max = $opts{batch_max} // 1000; - my $enabled = $opts{enabled} // 'on'; - my $otel_endpoint = $opts{otel_endpoint} // 'localhost:4317'; - my $hostname = $opts{hostname} // 'test-host'; + my $queue_capacity = $opts{queue_capacity} // 65536; + my $flush_interval_ms = $opts{flush_interval_ms} // 100; # Fast flush for tests + my $batch_max = $opts{batch_max} // 1000; + my $enabled = $opts{enabled} // 'on'; + my $otel_endpoint = $opts{otel_endpoint} // 'localhost:4317'; + my $hostname = $opts{hostname} // 'test-host'; + my $otel_metric_interval_ms = $opts{otel_metric_interval_ms} // 1000; my $node = PostgreSQL::Test::Cluster->new($name); $node->init(); @@ -258,6 +259,7 @@ pg_stat_ch.batch_max = $batch_max pg_stat_ch.use_otel = on pg_stat_ch.otel_endpoint = '$otel_endpoint' pg_stat_ch.hostname = '$hostname' +pg_stat_ch.otel_metric_interval_ms = $otel_metric_interval_ms }); $node->start(); $node->safe_psql('postgres', 'CREATE EXTENSION pg_stat_ch'); From 2ca802a61a0205e0d39f49553f352b49b49a5f04 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Mon, 13 Apr 2026 10:59:46 -0500 Subject: [PATCH 11/11] Fix track_labels off test: use DDL, wait for CH row before asserting The test was using a SELECT with a comment (normalized away) and a racy wait predicate. Use DDL (CREATE/DROP TABLE) whose text survives normalization, wait for the row to arrive in ClickHouse, then assert. --- t/032_query_labels.pl | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/t/032_query_labels.pl b/t/032_query_labels.pl index 8c25a94..79ca39f 100644 --- a/t/032_query_labels.pl +++ b/t/032_query_labels.pl @@ -106,19 +106,25 @@ $node->safe_psql('postgres', "ALTER SYSTEM SET pg_stat_ch.track_labels = off"); $node->reload(); - sleep(1); + sleep(2); # Give bgworker time to pick up SIGHUP $node->safe_psql('postgres', - "SELECT 1 /* controller='ignored' */"); + "CREATE TABLE labels_off_test(id int)"); + $node->safe_psql('postgres', + "DROP TABLE labels_off_test"); $node->safe_psql('postgres', 'SELECT pg_stat_ch_flush()'); - my $labels = psch_wait_for_clickhouse_query( - "SELECT labels FROM pg_stat_ch.events_raw " - . "WHERE query LIKE '%ignored%' LIMIT 1", - sub { $_[0] ne '' }, + # Wait for rows to arrive in ClickHouse first + psch_wait_for_clickhouse_query( + "SELECT count() FROM pg_stat_ch.events_raw WHERE query LIKE '%labels_off_test%'", + sub { $_[0] >= 1 }, 10 ); + + my $labels = psch_query_clickhouse( + "SELECT labels FROM pg_stat_ch.events_raw " + . "WHERE query LIKE '%labels_off_test%' LIMIT 1"); like($labels, qr/^\{\}$/, 'track_labels=off produces empty labels'); # Re-enable