From 65fc7c283472f321312ee20eb6021f5f69bc6f20 Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Tue, 5 May 2026 16:40:35 +0300
Subject: [PATCH 01/21] tests/slo_workloads: align with ydb-slo-action v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The v2 SLO action (ydb-platform/ydb-slo-action) starts a workload
container once per run, passes config via env vars (WORKLOAD_REF,
WORKLOAD_DURATION, YDB_CONNECTION_STRING, ...), and queries
sdk_operation_latency_p{50,95,99}_seconds as pre-computed gauges
labeled by (operation_type, operation_status, ref).

Changes:

- Dockerfile: drop SLO_BRANCH_REF compile-time ref baking; a single
  ENTRYPOINT binary without CMD is now sufficient.
- utils/utils.cpp: subcommand is optional — with no free-arg the
  binary runs create -> run -> cleanup sequentially (cleanup always
  runs, even if run errored). Option resolution follows the standard
  CLI > env > default priority via .DefaultValue():
  - -c from YDB_CONNECTION_STRING (or built from YDB_ENDPOINT +
    YDB_DATABASE);
  - --metrics-push-url from OTEL_EXPORTER_OTLP_METRICS_ENDPOINT;
  - --time from WORKLOAD_DURATION.
- utils/metrics.cpp: replace the OTel Histogram with HDR-backed
  gauges. Only successful operations are recorded; a background
  thread snapshots p50/p95/p99 every second, publishes the gauges
  with operation_status="success", then resets the HDR window.
  sdk_retry_attempts_total becomes a counter of (retry_attempts + 1)
  per op. ref is read from WORKLOAD_REF at startup, not compile time.
- utils/CMakeLists.txt: pull HdrHistogram_c 0.11.8 via FetchContent
  (opt-in via the outer YDB_SDK_TESTS flag) and link slo-utils
  against hdr_histogram_static.
---
 tests/slo_workloads/Dockerfile           |   3 +-
 tests/slo_workloads/utils/CMakeLists.txt |  19 +-
 tests/slo_workloads/utils/metrics.cpp    | 239 ++++++++++++++---------
 tests/slo_workloads/utils/utils.cpp      |  70 ++++++-
 tests/slo_workloads/utils/utils.h        |   3 +-
 5 files changed, 237 insertions(+), 97 deletions(-)

diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile
index 091ce23066..65d9c7693b 100644
--- a/tests/slo_workloads/Dockerfile
+++ b/tests/slo_workloads/Dockerfile
@@ -1,7 +1,6 @@
 FROM ubuntu:22.04
 
 ARG PRESET=release-test-clang
-ARG REF=unknown
 
 # Install software-properties-common for add-apt-repository
 RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test
@@ -120,7 +119,7 @@ COPY . /ydb-cpp-sdk
 WORKDIR /ydb-cpp-sdk
 RUN rm -rf build
 
-RUN cmake -DSLO_BRANCH_REF=${REF} --preset ${PRESET}
+RUN cmake --preset ${PRESET}
 RUN cmake --build --preset default --target slo-key-value
 
 ENTRYPOINT ["./build/tests/slo_workloads/key_value/slo-key-value"]
diff --git a/tests/slo_workloads/utils/CMakeLists.txt b/tests/slo_workloads/utils/CMakeLists.txt
index e8589a568f..5434af9bb4 100644
--- a/tests/slo_workloads/utils/CMakeLists.txt
+++ b/tests/slo_workloads/utils/CMakeLists.txt
@@ -1,3 +1,16 @@
+include(FetchContent)
+
+FetchContent_Declare(
+  hdr_histogram
+  GIT_REPOSITORY https://github.com/HdrHistogram/HdrHistogram_c.git
+  GIT_TAG        0.11.8
+  EXCLUDE_FROM_ALL
+)
+set(HDR_HISTOGRAM_BUILD_PROGRAMS OFF CACHE BOOL "" FORCE)
+set(HDR_HISTOGRAM_BUILD_SHARED   OFF CACHE BOOL "" FORCE)
+set(HDR_LOG_REQUIRED             OFF CACHE BOOL "" FORCE)
+FetchContent_MakeAvailable(hdr_histogram)
+
 add_library(slo-utils)
 
 target_link_libraries(slo-utils PUBLIC
@@ -9,9 +22,9 @@ target_link_libraries(slo-utils PUBLIC
   opentelemetry-cpp::otlp_http_metric_exporter
 )
 
-if (SLO_BRANCH_REF)
-  target_compile_definitions(slo-utils PRIVATE REF=${SLO_BRANCH_REF})
-endif()
+target_link_libraries(slo-utils PRIVATE
+  hdr_histogram_static
+)
 
 target_sources(slo-utils PRIVATE
   executor.cpp
diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp
index 50e1f859c0..996a48780f 100644
--- a/tests/slo_workloads/utils/metrics.cpp
+++ b/tests/slo_workloads/utils/metrics.cpp
@@ -11,21 +11,93 @@
 
 #include <ydb-cpp-sdk/client/resources/ydb_resources.h>
 
+#include <util/system/env.h>
+
+#include <hdr/hdr_histogram.h>
+
+#include <atomic>
+#include <chrono>
+#include <memory>
+#include <mutex>
+#include <thread>
+
 
 using namespace std::chrono_literals;
 
-#ifdef REF
-static constexpr const std::string_view REF_LABEL = Y_STRINGIZE(REF);
-#else
-static constexpr const std::string_view REF_LABEL = "unknown";
-#endif
+namespace {
+
+constexpr std::int64_t kHdrMinLatencyNs = 1'000;          // 1 us
+constexpr std::int64_t kHdrMaxLatencyNs = 60'000'000'000; // 60 s
+constexpr int kHdrSignificantFigures = 3;
+
+std::string ResolveWorkloadRef() {
+    std::string ref = GetEnv("WORKLOAD_REF");
+    return ref.empty() ? "unknown" : ref;
+}
+
+// Minimal thread-safe wrapper around hdr_histogram for a single
+// (operation_type, operation_status="success") series. Only successful
+// latencies are recorded; errors are excluded from the percentile stream
+// per deploy/metrics.yaml.
+class TLatencyRecorder {
+public:
+    TLatencyRecorder() {
+        hdr_histogram* raw = nullptr;
+        int rc = hdr_init(kHdrMinLatencyNs, kHdrMaxLatencyNs, kHdrSignificantFigures, &raw);
+        Y_ABORT_UNLESS(rc == 0, "hdr_init failed: %d", rc);
+        Histogram_.reset(raw);
+    }
+
+    void Record(TDuration d) {
+        std::int64_t ns = static_cast<std::int64_t>(d.NanoSeconds());
+        if (ns < kHdrMinLatencyNs) {
+            ns = kHdrMinLatencyNs;
+        } else if (ns > kHdrMaxLatencyNs) {
+            ns = kHdrMaxLatencyNs;
+        }
+        std::lock_guard lock(Mutex_);
+        hdr_record_value(Histogram_.get(), ns);
+    }
+
+    // Returns p50/p95/p99 as seconds and resets the recorder window so
+    // gauges reflect only the most recent interval.
+    struct TPercentiles {
+        double P50 = 0.0;
+        double P95 = 0.0;
+        double P99 = 0.0;
+        bool HasData = false;
+    };
+
+    TPercentiles SnapshotAndReset() {
+        TPercentiles out;
+        std::lock_guard lock(Mutex_);
+        if (Histogram_->total_count == 0) {
+            return out;
+        }
+        out.HasData = true;
+        out.P50 = hdr_value_at_percentile(Histogram_.get(), 50.0) / 1e9;
+        out.P95 = hdr_value_at_percentile(Histogram_.get(), 95.0) / 1e9;
+        out.P99 = hdr_value_at_percentile(Histogram_.get(), 99.0) / 1e9;
+        hdr_reset(Histogram_.get());
+        return out;
+    }
+
+private:
+    struct THdrDeleter {
+        void operator()(hdr_histogram* h) const noexcept { if (h) hdr_close(h); }
+    };
+
+    std::mutex Mutex_;
+    std::unique_ptr<hdr_histogram, THdrDeleter> Histogram_;
+};
 
 class TOtelMetricsPusher : public IMetricsPusher {
 public:
     TOtelMetricsPusher(const std::string& metricsPushUrl, const std::string& operationType)
         : OperationType_(operationType)
+        , Ref_(ResolveWorkloadRef())
         , CommonAttributes_{
-            {"ref", std::string(REF_LABEL)},
+            {"ref", Ref_},
             {"sdk", "cpp"},
             {"sdk_version", NYdb::GetSdkSemver()}
         }
@@ -36,12 +108,11 @@ class TOtelMetricsPusher : public IMetricsPusher {
         auto exporter = opentelemetry::exporter::otlp::OtlpHttpMetricExporterFactory::Create(exporterOptions);
 
         opentelemetry::sdk::metrics::PeriodicExportingMetricReaderOptions readerOptions;
-        readerOptions.export_interval_millis = 250ms;
-        readerOptions.export_timeout_millis  = 200ms;
+        readerOptions.export_interval_millis = 1000ms;
+        readerOptions.export_timeout_millis  = 900ms;
 
         auto metricReader = opentelemetry::sdk::metrics::PeriodicExportingMetricReaderFactory::Create(std::move(exporter), readerOptions);
 
-        // Create MeterContext with resource
         auto context = std::make_unique<opentelemetry::sdk::metrics::MeterContext>(
             std::unique_ptr<opentelemetry::sdk::metrics::ViewRegistry>(new opentelemetry::sdk::metrics::ViewRegistry()),
             opentelemetry::sdk::resource::Resource::Create(opentelemetry::common::MakeKeyValueIterableView(CommonAttributes_))
@@ -53,97 +124,85 @@ class TOtelMetricsPusher : public IMetricsPusher {
         Meter_ = MeterProvider_->GetMeter("slo_workloads", NYdb::GetSdkSemver());
 
         InitMetrics();
+        StartPercentilePublisher();
+    }
+
+    ~TOtelMetricsPusher() override {
+        PublisherShouldStop_.store(true);
+        if (PublisherThread_.joinable()) {
+            PublisherThread_.join();
+        }
     }
 
     void PushRequestData(const TRequestData& requestData) override {
-        if (requestData.Status == NYdb::EStatus::SUCCESS) {
-            OperationsSuccessTotal_->Add(1, MergeAttributes({{"operation_type", OperationType_}}));
-        } else {
-            ErrorsTotal_->Add(1, MergeAttributes({{"status", YdbStatusToString(requestData.Status)}}));
-            OperationsFailureTotal_->Add(1, MergeAttributes({{"operation_type", OperationType_}}));
+        const bool success = requestData.Status == NYdb::EStatus::SUCCESS;
+        const std::string status = success ? "success" : "error";
+
+        OperationsTotal_->Add(1, MergeAttributes({
+            {"operation_type", OperationType_},
+            {"operation_status", status},
+        }));
+
+        // sdk_retry_attempts_total = total number of technical attempts
+        // including the first one. TStatUnit counts only post-first attempts,
+        // so add 1 to include the initial attempt.
+        RetryAttemptsTotal_->Add(static_cast<double>(requestData.RetryAttempts + 1),
+            MergeAttributes({
+                {"operation_type", OperationType_},
+            })
+        );
+
+        if (success) {
+            Latency_.Record(requestData.Delay);
         }
-        OperationsTotal_->Add(1, MergeAttributes({{"operation_type", OperationType_}}));
-        OperationLatencySeconds_->Record(requestData.Delay.SecondsFloat(), MergeAttributes({{"operation_type", OperationType_}, {"status", YdbStatusToString(requestData.Status)}}));
-        RetryAttempts_->Record(requestData.RetryAttempts, MergeAttributes({{"operation_type", OperationType_}}));
     }
 
 private:
     void InitMetrics() {
-        ErrorsTotal_ = Meter_->CreateUInt64Counter("sdk_errors_total",
-            "Total number of errors encountered, categorized by error type."
+        OperationsTotal_ = Meter_->CreateDoubleCounter("sdk_operations_total",
+            "Total number of operations, categorized by operation type and status."
         );
 
-        OperationsTotal_ = Meter_->CreateUInt64Counter("sdk_operations_total",
-            "Total number of operations, categorized by type attempted by the SDK."
-        );
-    
-        OperationsSuccessTotal_ = Meter_->CreateUInt64Counter("sdk_operations_success_total",
-            "Total number of successful operations, categorized by type."
+        RetryAttemptsTotal_ = Meter_->CreateDoubleCounter("sdk_retry_attempts_total",
+            "Total number of retry attempts (including the first attempt), categorized by operation type."
         );
 
-        OperationsFailureTotal_ = Meter_->CreateUInt64Counter("sdk_operations_failure_total",
-            "Total number of failed operations, categorized by type."
+        LatencyP50_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p50_seconds",
+            "P50 latency of successful operations in seconds.", "s"
         );
-
-        OperationLatencySeconds_ = CreateDoubleHistogram("sdk_operation_latency_seconds",
-            "Latency of operations performed by the SDK in seconds, categorized by type and status.",
-            {
-				0.001,  // 1 ms
-				0.002,  // 2 ms
-				0.003,  // 3 ms
-				0.004,  // 4 ms
-				0.005,  // 5 ms
-				0.0075, // 7.5 ms
-				0.010,  // 10 ms
-				0.020,  // 20 ms
-				0.050,  // 50 ms
-				0.100,  // 100 ms
-				0.200,  // 200 ms
-				0.500,  // 500 ms
-				1.000,  // 1 s
-			},
-            "s"
+        LatencyP95_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p95_seconds",
+            "P95 latency of successful operations in seconds.", "s"
         );
-
-        RetryAttempts_ = Meter_->CreateInt64Gauge("sdk_retry_attempts",
-            "Current retry attempts, categorized by operation type."
+        LatencyP99_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p99_seconds",
+            "P99 latency of successful operations in seconds.", "s"
         );
     }
 
-    std::unique_ptr<opentelemetry::metrics::Histogram<double>> CreateDoubleHistogram(
-        const std::string& name,
-        const std::string& description,
-        const std::vector<double>& buckets,
-        const std::string& unit = {})
-    {
-        auto selector = std::make_unique<opentelemetry::sdk::metrics::InstrumentSelector>(
-            opentelemetry::sdk::metrics::InstrumentType::kHistogram,
-            name,
-            unit
-        );
-
-        auto meterSelector = std::make_unique<opentelemetry::sdk::metrics::MeterSelector>(
-            "slo_workloads",
-            NYdb::GetSdkSemver(),
-            ""
-        );
-
-        auto histogramConfig = std::make_shared<opentelemetry::sdk::metrics::HistogramAggregationConfig>();
-        histogramConfig->boundaries_ = buckets;
-
-        auto view = std::make_unique<opentelemetry::sdk::metrics::View>(
-            "",
-            "",
-            opentelemetry::sdk::metrics::AggregationType::kHistogram,
-            histogramConfig
-        );
-
-        MeterProvider_->AddView(std::move(selector), std::move(meterSelector), std::move(view));
+    void StartPercentilePublisher() {
+        PublisherThread_ = std::thread([this]() {
+            while (!PublisherShouldStop_.load(std::memory_order_relaxed)) {
+                std::this_thread::sleep_for(1s);
+                PublishPercentiles();
+            }
+            // Final flush before exit.
+            PublishPercentiles();
+        });
+    }
 
-        return Meter_->CreateDoubleHistogram(name, description, unit);
+    void PublishPercentiles() {
+        auto snapshot = Latency_.SnapshotAndReset();
+        if (!snapshot.HasData) {
+            return;
+        }
+        auto attrs = MergeAttributes({
+            {"operation_type", OperationType_},
+            {"operation_status", "success"},
+        });
+        LatencyP50_->Record(snapshot.P50, attrs);
+        LatencyP95_->Record(snapshot.P95, attrs);
+        LatencyP99_->Record(snapshot.P99, attrs);
     }
 
-    // Helper to merge common attributes with metric-specific ones
     std::map<std::string, std::string> MergeAttributes(const std::map<std::string, std::string>& metricAttrs) const {
         std::map<std::string, std::string> result = CommonAttributes_;
         result.insert(metricAttrs.begin(), metricAttrs.end());
@@ -151,17 +210,21 @@ class TOtelMetricsPusher : public IMetricsPusher {
     }
 
     std::string OperationType_;
-    std::map<std::string, std::string> CommonAttributes_;  // ref, sdk, sdk_version
+    std::string Ref_;
+    std::map<std::string, std::string> CommonAttributes_;
 
     std::unique_ptr<opentelemetry::sdk::metrics::MeterProvider> MeterProvider_;
     std::shared_ptr<opentelemetry::metrics::Meter> Meter_;
 
-    std::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> ErrorsTotal_;
-    std::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> OperationsTotal_;
-    std::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> OperationsSuccessTotal_;
-    std::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> OperationsFailureTotal_;
-    std::unique_ptr<opentelemetry::metrics::Histogram<double>> OperationLatencySeconds_;
-    std::unique_ptr<opentelemetry::metrics::Gauge<int64_t>> RetryAttempts_;
+    std::unique_ptr<opentelemetry::metrics::Counter<double>> OperationsTotal_;
+    std::unique_ptr<opentelemetry::metrics::Counter<double>> RetryAttemptsTotal_;
+    std::unique_ptr<opentelemetry::metrics::Gauge<double>> LatencyP50_;
+    std::unique_ptr<opentelemetry::metrics::Gauge<double>> LatencyP95_;
+    std::unique_ptr<opentelemetry::metrics::Gauge<double>> LatencyP99_;
+
+    TLatencyRecorder Latency_;
+    std::thread PublisherThread_;
+    std::atomic<bool> PublisherShouldStop_{false};
 };
 
 class TNoopMetricsPusher : public IMetricsPusher {
@@ -169,6 +232,8 @@ class TNoopMetricsPusher : public IMetricsPusher {
     void PushRequestData([[maybe_unused]] const TRequestData& requestData) override {}
 };
 
+} // namespace
+
 std::unique_ptr<IMetricsPusher> CreateOtelMetricsPusher(const std::string& metricsPushUrl, const std::string& operationType) {
     return std::make_unique<TOtelMetricsPusher>(metricsPushUrl, operationType);
 }
diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp
index b7572c2981..78231c8e1c 100644
--- a/tests/slo_workloads/utils/utils.cpp
+++ b/tests/slo_workloads/utils/utils.cpp
@@ -7,6 +7,8 @@
 #include <util/folder/path.h>
 #include <util/folder/dirut.h>
 #include <util/stream/file.h>
+#include <util/string/builder.h>
+#include <util/string/cast.h>
 #include <util/string/strip.h>
 #include <util/system/env.h>
 #include <util/random/random.h>
@@ -110,6 +112,19 @@ std::string GetDatabase(const std::string& connectionString) {
     return {};
 }
 
+static std::string DefaultConnectionStringFromEnv() {
+    std::string cs = GetEnv("YDB_CONNECTION_STRING");
+    if (!cs.empty()) {
+        return cs;
+    }
+    std::string endpoint = GetEnv("YDB_ENDPOINT");
+    std::string database = GetEnv("YDB_DATABASE");
+    if (!endpoint.empty() && !database.empty()) {
+        return TStringBuilder() << endpoint << "/?database=" << database;
+    }
+    return {};
+}
+
 int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TCleanupCommand cleanup) {
     TOpts opts = TOpts::Default();
 
@@ -121,8 +136,15 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
     std::string statConfigFile;
     std::string balancingPolicy;
 
-    opts.AddLongOption('c', "connection-string", "YDB connection string").Required().RequiredArgument("SCHEMA://HOST:PORT/?DATABASE=DATABASE")
+    std::string defaultConnectionString = DefaultConnectionStringFromEnv();
+
+    auto& connOpt = opts.AddLongOption('c', "connection-string", "YDB connection string").RequiredArgument("SCHEMA://HOST:PORT/?DATABASE=DATABASE")
         .StoreResult(&connectionString);
+    if (!defaultConnectionString.empty()) {
+        connOpt.DefaultValue(defaultConnectionString);
+    } else {
+        connOpt.Required();
+    }
     opts.AddLongOption('p', "prefix", "Base prefix for tables").RequiredArgument("PATH")
         .StoreResult(&prefix);
     opts.AddLongOption('k', "token", "security token").RequiredArgument("TOKEN")
@@ -136,7 +158,7 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
     opts.AddLongOption('b', "balancing-policy", "Balancing policy").Optional().DefaultValue("use-all-nodes").RequiredArgument("(use-all-nodes|prefer-local-dc|prefer-primary-pile)")
         .StoreResult(&balancingPolicy);
     opts.AddHelpOption('h');
-    opts.SetFreeArgsMin(1);
+    opts.SetFreeArgsMin(0);
     opts.SetFreeArgTitle(0, "<COMMAND>", GetCmdList());
     opts.ArgPermutation_ = NLastGetopt::REQUIRE_ORDER;
 
@@ -144,7 +166,8 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
     size_t freeArgsPos = res.GetFreeArgsPos();
     argc -= freeArgsPos;
     argv += freeArgsPos;
-    ECommandType command = ParseCommand(*argv);
+
+    ECommandType command = (argc > 0) ? ParseCommand(*argv) : ECommandType::All;
     if (command == ECommandType::Unknown) {
         Cerr << "Unknown command '" << *argv << "'" << Endl;
         return EXIT_FAILURE;
@@ -202,6 +225,28 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
             Cout << "Launching cleanup command..." << Endl;
             result = cleanup(dbOptions, argc);
             break;
+        case ECommandType::All: {
+            Cout << "Launching full lifecycle: create -> run -> cleanup" << Endl;
+            // Synthesize argv with a fake program name so the inner NLastGetopt
+            // parsers (ParseOptionsCreate / ParseOptionsRun) treat argv[0]
+            // as the program name and parse zero real args.
+            char programName[] = "slo";
+            char* fakeArgv[] = { programName, nullptr };
+            int fakeArgc = 1;
+
+            Cout << "[all] Launching create command..." << Endl;
+            result = create(dbOptions, fakeArgc, fakeArgv);
+            if (!result) {
+                Cout << "[all] Launching run command..." << Endl;
+                result = run(dbOptions, fakeArgc, fakeArgv);
+            }
+            Cout << "[all] Launching cleanup command..." << Endl;
+            int cleanupRc = cleanup(dbOptions, fakeArgc);
+            if (!result) {
+                result = cleanupRc;
+            }
+            break;
+        }
         default:
             Cerr << "Unknown command" << Endl;
             return EXIT_FAILURE;
@@ -216,7 +261,7 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
 }
 
 std::string GetCmdList() {
-    return "create, run, cleanup";
+    return "create, run, cleanup (omit to run create -> run -> cleanup in one process)";
 }
 
 ECommandType ParseCommand(const char* cmd) {
@@ -425,6 +470,11 @@ TTableStats GetTableStats(TDatabaseOptions& dbOptions, const std::string& tableN
 }
 
 void ParseOptionsCommon(TOpts& opts, TCommonOptions& options) {
+    std::string metricsPushUrlFromEnv = GetEnv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT");
+    if (!metricsPushUrlFromEnv.empty()) {
+        options.MetricsPushUrl = metricsPushUrlFromEnv;
+    }
+
     opts.AddLongOption("threads", "Number of threads to use").RequiredArgument("NUM")
         .DefaultValue(options.MaxInputThreads).StoreResult(&options.MaxInputThreads);
     opts.AddLongOption("stop-on-error", "Stop thread if an error occured").NoArgument()
@@ -485,6 +535,18 @@ bool ParseOptionsCreate(int argc, char** argv, TCreateOptions& createOptions) {
 bool ParseOptionsRun(int argc, char** argv, TRunOptions& runOptions) {
     TOpts opts = TOpts::Default();
     ParseOptionsCommon(opts, runOptions.CommonOptions);
+
+    if (std::string workloadDuration = GetEnv("WORKLOAD_DURATION"); !workloadDuration.empty()) {
+        try {
+            std::uint32_t parsed = FromString<std::uint32_t>(workloadDuration);
+            if (parsed > 0) {
+                runOptions.CommonOptions.SecondsToRun = parsed;
+            }
+        } catch (const std::exception& e) {
+            Cerr << "Invalid WORKLOAD_DURATION env value '" << workloadDuration << "': " << e.what() << Endl;
+        }
+    }
+
     opts.AddLongOption("time", "Time to run (Seconds)").RequiredArgument("Seconds")
         .DefaultValue(runOptions.CommonOptions.SecondsToRun).StoreResult(&runOptions.CommonOptions.SecondsToRun);
     opts.AddLongOption("read-rps", "Request generation rate for read requests (Thread A)").RequiredArgument("NUM")
diff --git a/tests/slo_workloads/utils/utils.h b/tests/slo_workloads/utils/utils.h
index 65be9f4891..3eb3c48978 100644
--- a/tests/slo_workloads/utils/utils.h
+++ b/tests/slo_workloads/utils/utils.h
@@ -98,7 +98,8 @@ enum class ECommandType {
     Unknown,
     Create,
     Run,
-    Cleanup
+    Cleanup,
+    All,  // No free-arg passed: execute Create -> Run -> Cleanup in one process
 };
 
 struct TTableStats {

From 4d8f5385b6954e23111c1bea143367c852114a17 Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Tue, 5 May 2026 17:07:41 +0300
Subject: [PATCH 02/21] ci(slo): delegate to ydb-slo-action/init@v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rewrite the SLO workflows following the pattern used by ydb-java-sdk
(ydb-platform/ydb-java-sdk#644).

slo.yml:
- Drop the hand-rolled docker run orchestration that spun up YDB and
  invoked the workload with --dont-push / explicit create/run phases.
  The v2 action owns that lifecycle via deploy/compose.yml — we just
  hand it two prebuilt images.
- Gate on the `SLO` PR label.
- Build both images with a single `docker build` per ref; if the
  baseline commit can't be built (missing Dockerfile or compile error
  on a historical SHA), fall back to the current image so the run is
  comparable against itself rather than silently failing.
- Drop `--build-arg REF=…` — ref is now read from WORKLOAD_REF env at
  runtime.
- Rename matrix entry to `cpp-key-value` to match the built binary and
  collapse the per-compiler matrix to a single clang entry (gcc variant
  can be added back when needed).

slo_report.yml:
- Pin to @v2.
- Add a second job that removes the `SLO` label from the PR after the
  report is published, matching js-sdk/java-sdk/go-sdk.
---
 .github/workflows/slo.yml        | 278 ++++++++-----------------------
 .github/workflows/slo_report.yml |  25 ++-
 2 files changed, 95 insertions(+), 208 deletions(-)

diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml
index 0618526617..4cdf4bfec8 100644
--- a/.github/workflows/slo.yml
+++ b/.github/workflows/slo.yml
@@ -2,275 +2,143 @@ name: SLO
 
 on:
   pull_request:
-    types: [opened, reopened, synchronize]
-    branches:
-      - main
-  workflow_dispatch:
-    inputs:
-      github_issue:
-        description: "GitHub issue number where the SLO results will be reported"
-        required: true
-      baseline_ref:
-        description: "Baseline commit/branch/tag to compare against (leave empty to auto-detect merge-base with main)"
-        required: false
-      slo_workload_duration_seconds:
-        description: "Duration of the SLO workload in seconds"
-        required: false
-        default: "600"
-      slo_workload_read_max_rps:
-        description: "Maximum read RPS for the SLO workload"
-        required: false
-        default: "1000"
-      slo_workload_write_max_rps:
-        description: "Maximum write RPS for the SLO workload"
-        required: false
-        default: "100"
+    types: [opened, reopened, synchronize, labeled]
 
 jobs:
   ydb-slo-action:
+    if: contains(github.event.pull_request.labels.*.name, 'SLO')
+
     name: Run YDB SLO Tests
     runs-on: ubuntu-latest
 
+    permissions:
+      contents: read
+
     strategy:
+      fail-fast: false
       matrix:
-        compiler: [clang, gcc]
-        include:
-          - workload: table
+        sdk:
+          - name: cpp-key-value
+            preset: release-test-clang
+            command: ""
 
     concurrency:
-      group: slo-${{ github.ref }}-${{ matrix.os }}-${{ matrix.workload }}-${{ matrix.compiler }}
+      group: slo-${{ github.ref }}-${{ matrix.sdk.name }}
       cancel-in-progress: true
 
     steps:
       - name: Install dependencies
         run: |
+          set -euxo pipefail
           YQ_VERSION=v4.48.2
           BUILDX_VERSION=0.30.1
           COMPOSE_VERSION=2.40.3
 
-          sudo curl -L https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64 -o /usr/local/bin/yq && \
-            sudo chmod +x /usr/local/bin/yq
+          sudo curl -fLo /usr/local/bin/yq \
+            "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64"
+          sudo chmod +x /usr/local/bin/yq
 
-          echo "Updating Docker plugins..."
           sudo mkdir -p /usr/local/lib/docker/cli-plugins
 
-          echo "Installing Docker Buildx ${BUILDX_VERSION}..."
           sudo curl -fLo /usr/local/lib/docker/cli-plugins/docker-buildx \
             "https://github.com/docker/buildx/releases/download/v${BUILDX_VERSION}/buildx-v${BUILDX_VERSION}.linux-amd64"
           sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-buildx
 
-          echo "Installing Docker Compose ${COMPOSE_VERSION}..."
           sudo curl -fLo /usr/local/lib/docker/cli-plugins/docker-compose \
             "https://github.com/docker/compose/releases/download/v${COMPOSE_VERSION}/docker-compose-linux-x86_64"
           sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-compose
 
-          echo "Installed versions:"
           yq --version
           docker --version
           docker buildx version
           docker compose version
 
-      - name: Checkout current version
+      - name: Checkout current SDK version
         uses: actions/checkout@v5
         with:
-          path: current
+          path: sdk-current
           fetch-depth: 0
           submodules: true
 
       - name: Determine baseline commit
         id: baseline
+        working-directory: sdk-current
         run: |
-          cd current
-          if [[ -n "${{ inputs.baseline_ref }}" ]]; then
-            BASELINE="${{ inputs.baseline_ref }}"
-          else
-            BASELINE=$(git merge-base HEAD origin/main)
-          fi
-          echo "sha=$BASELINE" >> $GITHUB_OUTPUT
+          set -euo pipefail
+          BASELINE=$(git merge-base HEAD origin/main)
+          echo "sha=${BASELINE}" >> "$GITHUB_OUTPUT"
 
-          # Try to determine a human-readable ref name for baseline
-          # Check if baseline is on main
-          if git merge-base --is-ancestor $BASELINE origin/main && \
-             [ "$(git rev-parse origin/main)" = "$BASELINE" ]; then
+          if git merge-base --is-ancestor "${BASELINE}" origin/main && \
+             [ "$(git rev-parse origin/main)" = "${BASELINE}" ]; then
             BASELINE_REF="main"
           else
-            # Try to find a branch containing this commit
-            BRANCH=$(git branch -r --contains $BASELINE | grep -v HEAD | head -1 | sed 's/.*\///' || echo "")
-            if [ -n "$BRANCH" ]; then
+            BRANCH=$(git branch -r --contains "${BASELINE}" | grep -v HEAD | head -1 | sed 's|.*/||' || echo "")
+            if [ -n "${BRANCH}" ]; then
               BASELINE_REF="${BRANCH}@${BASELINE:0:7}"
             else
               BASELINE_REF="${BASELINE:0:7}"
             fi
           fi
-          echo "ref=$BASELINE_REF" >> $GITHUB_OUTPUT
+          echo "ref=${BASELINE_REF}" >> "$GITHUB_OUTPUT"
 
-      - name: Checkout baseline version
+      - name: Checkout baseline SDK version
         uses: actions/checkout@v5
         with:
           ref: ${{ steps.baseline.outputs.sha }}
-          path: baseline
+          path: sdk-baseline
           fetch-depth: 1
           submodules: true
 
-      - name: Build Workload Image
+      - name: Build current workload image
+        working-directory: sdk-current
         run: |
-          echo "Cleaning up Docker system before builds..."
-          docker system prune -af --volumes
-          docker builder prune -af
-          df -h
-
-          # Build current version
-          if [ -f "$GITHUB_WORKSPACE/current/tests/slo_workloads/Dockerfile" ]; then
-            echo "Building current app image..."
-            cd "$GITHUB_WORKSPACE/current"
-
-            # Use SLO-specific .dockerignore
-            cp tests/slo_workloads/.dockerignore .dockerignore
-
-            docker build -t ydb-app-current \
-              --build-arg REF="${{ github.head_ref || github.ref_name }}" \
-              --build-arg PRESET=release-test-${{ matrix.compiler }} \
-              -f tests/slo_workloads/Dockerfile .
-
-            # Clean up .dockerignore
-            rm -f .dockerignore
-          else
-            echo "No current app Dockerfile found"
-            exit 1
+          set -euxo pipefail
+          cp tests/slo_workloads/.dockerignore .dockerignore
+          docker build \
+            --platform linux/amd64 \
+            --build-arg PRESET=${{ matrix.sdk.preset }} \
+            -t ydb-app-current \
+            -f tests/slo_workloads/Dockerfile \
+            .
+          rm -f .dockerignore
+
+      - name: Build baseline workload image
+        working-directory: sdk-baseline
+        run: |
+          set -euxo pipefail
+          # If the historical commit lacks the SLO workload files or can't
+          # compile, fall back to the current image so the SLO run is still
+          # comparable against itself rather than silently failing.
+          if [ ! -f tests/slo_workloads/Dockerfile ]; then
+            echo "Baseline commit has no SLO Dockerfile; reusing current image"
+            docker tag ydb-app-current ydb-app-baseline
+            exit 0
           fi
 
-          docker system prune -f --volumes
-          docker builder prune -af
-
-          # Build baseline version
-          if [ -f "$GITHUB_WORKSPACE/baseline/tests/slo_workloads/Dockerfile" ]; then
-            echo "Building baseline app image..."
-            cd "$GITHUB_WORKSPACE/baseline"
-
-            # Use SLO-specific .dockerignore
-            cp tests/slo_workloads/.dockerignore .dockerignore
-
-            docker build -t ydb-app-baseline \
-              --build-arg REF="${{ steps.baseline.outputs.ref }}" \
-              --build-arg PRESET=release-test-${{ matrix.compiler }} \
-              -f tests/slo_workloads/Dockerfile .
-
-            # Clean up .dockerignore
-            rm -f .dockerignore
-          else
-            echo "No baseline app Dockerfile found"
-            exit 1
+          cp tests/slo_workloads/.dockerignore .dockerignore
+          if ! docker build \
+                --platform linux/amd64 \
+                --build-arg PRESET=${{ matrix.sdk.preset }} \
+                -t ydb-app-baseline \
+                -f tests/slo_workloads/Dockerfile \
+                .
+          then
+            echo "Baseline build failed; reusing current image"
+            docker tag ydb-app-current ydb-app-baseline
           fi
+          rm -f .dockerignore
 
-          docker system prune -f --volumes
-          docker builder prune -af
-
-          echo "Final disk space after builds:"
-          df -h
-
-      - name: Initialize YDB SLO
-        uses: ydb-platform/ydb-slo-action/init@main
+      - name: Run SLO Tests
+        uses: ydb-platform/ydb-slo-action/init@v2
+        timeout-minutes: 30
         with:
-          github_issue: ${{ github.event.inputs.github_issue }}
+          github_issue: ${{ github.event.pull_request.number }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
-          workload_name: ${{ matrix.workload }}-${{ matrix.compiler }}
+          workload_name: ${{ matrix.sdk.name }}
+          workload_duration: "600"
           workload_current_ref: ${{ github.head_ref || github.ref_name }}
+          workload_current_image: ydb-app-current
+          workload_current_command: ${{ matrix.sdk.command }} --read-rps 1000 --write-rps 100
           workload_baseline_ref: ${{ steps.baseline.outputs.ref }}
-
-      - name: Prepare SLO Database
-        run: |
-          echo "Preparing SLO database..."
-          docker run --rm --network ydb_ydb-net \
-            --add-host "ydb:172.28.0.11" \
-            --add-host "ydb:172.28.0.12" \
-            --add-host "ydb:172.28.0.13" \
-            --add-host "ydb:172.28.0.99" \
-            ydb-app-current --connection-string grpc://ydb:2136/?database=/Root/testdb create --dont-push
-
-      - name: Run SLO Tests (parallel)
-        timeout-minutes: 15
-        run: |
-          DURATION=${{ inputs.slo_workload_duration_seconds || 600 }}
-          READ_RPS=${{ inputs.slo_workload_read_max_rps || 1000 }}
-          WRITE_RPS=${{ inputs.slo_workload_write_max_rps || 100 }}
-
-          ARGS="--connection-string grpc://ydb:2136/?database=/Root/testdb run \
-            --metrics-push-url http://prometheus:9090/api/v1/otlp/v1/metrics \
-            --time $DURATION \
-            --read-rps $READ_RPS \
-            --write-rps $WRITE_RPS \
-            --read-timeout 100 \
-            --write-timeout 100"
-
-          echo "Starting ydb-app-current..."
-          docker run -d \
-            --name ydb-app-current \
-            --network ydb_ydb-net \
-            --add-host "ydb:172.28.0.11" \
-            --add-host "ydb:172.28.0.12" \
-            --add-host "ydb:172.28.0.13" \
-            --add-host "ydb:172.28.0.99" \
-            ydb-app-current $ARGS
-
-          echo "Starting ydb-app-baseline..."
-          docker run -d \
-            --name ydb-app-baseline \
-            --network ydb_ydb-net \
-            --add-host "ydb:172.28.0.11" \
-            --add-host "ydb:172.28.0.12" \
-            --add-host "ydb:172.28.0.13" \
-            --add-host "ydb:172.28.0.99" \
-            ydb-app-baseline $ARGS
-
-          # Show initial logs
-          echo ""
-          echo "==================== INITIAL CURRENT LOGS ===================="
-          docker logs -n 15 ydb-app-current 2>&1 || echo "No current container"
-          echo ""
-          echo "==================== INITIAL BASELINE LOGS ===================="
-          docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container"
-          echo ""
-
-          # Wait for workloads to complete
-          echo "Waiting for workloads to complete (${DURATION}s)..."
-          sleep ${DURATION}
-
-          # Stop containers after workload duration and wait for graceful shutdown
-          echo "Stopping containers after ${DURATION}s..."
-          docker stop --timeout=30 ydb-app-current ydb-app-baseline 2>&1 || true
-
-          # Force kill if still running
-          docker kill ydb-app-current ydb-app-baseline 2>&1 || true
-
-          # Check exit codes
-          CURRENT_EXIT=$(docker inspect ydb-app-current --format='{{.State.ExitCode}}' 2>/dev/null || echo "1")
-          BASELINE_EXIT=$(docker inspect ydb-app-baseline --format='{{.State.ExitCode}}' 2>/dev/null || echo "0")
-
-          echo "Current container exit code: $CURRENT_EXIT"
-          echo "Baseline container exit code: $BASELINE_EXIT"
-
-          # Show final logs
-          echo ""
-          echo "==================== FINAL CURRENT LOGS ===================="
-          docker logs -n 15 ydb-app-current 2>&1 || echo "No current container"
-          echo ""
-          echo "==================== FINAL BASELINE LOGS ===================="
-          docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container"
-          echo ""
-
-          echo "SUCCESS: Workloads completed successfully"
-
-      - if: always()
-        name: Store logs
-        run: |
-          docker logs ydb-app-current > current.log 2>&1 || echo "No current container"
-          docker logs ydb-app-baseline > baseline.log 2>&1 || echo "No baseline container"
-
-      - if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ matrix.workload }}-${{ matrix.compiler }}-slo-cpp-sdk-logs
-          path: |
-            ./current.log
-            ./baseline.log
-          retention-days: 1
+          workload_baseline_image: ydb-app-baseline
+          workload_baseline_command: ${{ matrix.sdk.command }} --read-rps 1000 --write-rps 100
diff --git a/.github/workflows/slo_report.yml b/.github/workflows/slo_report.yml
index 0a7c2e3483..b2bbee5172 100644
--- a/.github/workflows/slo_report.yml
+++ b/.github/workflows/slo_report.yml
@@ -7,17 +7,36 @@ on:
       - completed
 
 jobs:
-  ydb-slo-action-report:
+  publish-slo-report:
+    if: github.event.workflow_run.conclusion == 'success'
     runs-on: ubuntu-latest
     name: Publish YDB SLO Report
     permissions:
       checks: write
       contents: read
       pull-requests: write
-    if: github.event.workflow_run.conclusion == 'success'
     steps:
       - name: Publish YDB SLO Report
-        uses: ydb-platform/ydb-slo-action/report@main
+        uses: ydb-platform/ydb-slo-action/report@v2
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_run_id: ${{ github.event.workflow_run.id }}
+
+  remove-slo-label:
+    if: github.event.workflow_run.event == 'pull_request'
+    runs-on: ubuntu-latest
+    name: Remove SLO Label
+    permissions:
+      pull-requests: write
+    steps:
+      - name: Remove SLO label from PR
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }}
+          REPO: ${{ github.event.workflow_run.repository.full_name }}
+        run: |
+          set -euo pipefail
+          PR=$(jq -r '.[0].number' <<<"$PRS")
+          if [ "$PR" != "null" ] && [ -n "$PR" ]; then
+            gh pr edit "$PR" --repo "$REPO" --remove-label SLO
+          fi

From 2d9a928d10ab416c7c2b92c1bbe1e7759367db31 Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Tue, 5 May 2026 17:11:59 +0300
Subject: [PATCH 03/21] tests/slo_workloads: address Copilot review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Counters: switch OperationsTotal/RetryAttemptsTotal from DoubleCounter
  to UInt64Counter. Operation counts and retry attempts are inherently
  integer; float representation could drift for very high cumulative
  values and doesn't match the Prometheus counter convention.
- Percentile gauges: publish 0.0 on empty-window intervals instead of
  returning early. OTel's sync Gauge holds the last Record() value and
  the periodic exporter re-emits it every collection cycle — without an
  explicit reset, gauges would look "stuck" at the last non-empty value
  after load stops, contradicting the per-second HDR reset semantics.
---
 tests/slo_workloads/utils/metrics.cpp | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp
index 996a48780f..4506c1ca39 100644
--- a/tests/slo_workloads/utils/metrics.cpp
+++ b/tests/slo_workloads/utils/metrics.cpp
@@ -138,7 +138,7 @@ class TOtelMetricsPusher : public IMetricsPusher {
         const bool success = requestData.Status == NYdb::EStatus::SUCCESS;
         const std::string status = success ? "success" : "error";
 
-        OperationsTotal_->Add(1, MergeAttributes({
+        OperationsTotal_->Add(uint64_t{1}, MergeAttributes({
             {"operation_type", OperationType_},
             {"operation_status", status},
         }));
@@ -146,7 +146,7 @@ class TOtelMetricsPusher : public IMetricsPusher {
         // sdk_retry_attempts_total = total number of technical attempts
         // including the first one. TStatUnit counts only post-first attempts,
         // so add 1 to include the initial attempt.
-        RetryAttemptsTotal_->Add(static_cast<double>(requestData.RetryAttempts + 1),
+        RetryAttemptsTotal_->Add(requestData.RetryAttempts + 1,
             MergeAttributes({
                 {"operation_type", OperationType_},
             })
@@ -159,11 +159,11 @@ class TOtelMetricsPusher : public IMetricsPusher {
 
 private:
     void InitMetrics() {
-        OperationsTotal_ = Meter_->CreateDoubleCounter("sdk_operations_total",
+        OperationsTotal_ = Meter_->CreateUInt64Counter("sdk_operations_total",
             "Total number of operations, categorized by operation type and status."
         );
 
-        RetryAttemptsTotal_ = Meter_->CreateDoubleCounter("sdk_retry_attempts_total",
+        RetryAttemptsTotal_ = Meter_->CreateUInt64Counter("sdk_retry_attempts_total",
             "Total number of retry attempts (including the first attempt), categorized by operation type."
         );
 
@@ -191,16 +191,18 @@ class TOtelMetricsPusher : public IMetricsPusher {
 
     void PublishPercentiles() {
         auto snapshot = Latency_.SnapshotAndReset();
-        if (!snapshot.HasData) {
-            return;
-        }
         auto attrs = MergeAttributes({
             {"operation_type", OperationType_},
             {"operation_status", "success"},
         });
-        LatencyP50_->Record(snapshot.P50, attrs);
-        LatencyP95_->Record(snapshot.P95, attrs);
-        LatencyP99_->Record(snapshot.P99, attrs);
+        // When no successful ops landed in the last second, publish 0.0
+        // for all percentiles so the gauges reset with the HDR window
+        // rather than appearing "stuck" at the last non-empty value (the
+        // OTel periodic exporter would otherwise re-emit the previous
+        // Record() value on every collection cycle).
+        LatencyP50_->Record(snapshot.HasData ? snapshot.P50 : 0.0, attrs);
+        LatencyP95_->Record(snapshot.HasData ? snapshot.P95 : 0.0, attrs);
+        LatencyP99_->Record(snapshot.HasData ? snapshot.P99 : 0.0, attrs);
     }
 
     std::map<std::string, std::string> MergeAttributes(const std::map<std::string, std::string>& metricAttrs) const {
@@ -216,8 +218,8 @@ class TOtelMetricsPusher : public IMetricsPusher {
     std::unique_ptr<opentelemetry::sdk::metrics::MeterProvider> MeterProvider_;
     std::shared_ptr<opentelemetry::metrics::Meter> Meter_;
 
-    std::unique_ptr<opentelemetry::metrics::Counter<double>> OperationsTotal_;
-    std::unique_ptr<opentelemetry::metrics::Counter<double>> RetryAttemptsTotal_;
+    std::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> OperationsTotal_;
+    std::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> RetryAttemptsTotal_;
     std::unique_ptr<opentelemetry::metrics::Gauge<double>> LatencyP50_;
     std::unique_ptr<opentelemetry::metrics::Gauge<double>> LatencyP95_;
     std::unique_ptr<opentelemetry::metrics::Gauge<double>> LatencyP99_;

From cbe3597d10ed9ac7f2b7af9229dd4ed7d1602f3f Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Tue, 5 May 2026 17:27:27 +0300
Subject: [PATCH 04/21] tests/slo_workloads: make apt tolerant of PPA timeouts

Shared CI runners (and local docker builds) periodically hit
connection timeouts on ppa.launchpadcontent.net. Telling apt itself
to retry via Acquire::Retries=5 plus a 60 s connect timeout handles
the blip in-place without a shell retry loop.
---
 tests/slo_workloads/Dockerfile | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile
index 65d9c7693b..ca9d77d5fe 100644
--- a/tests/slo_workloads/Dockerfile
+++ b/tests/slo_workloads/Dockerfile
@@ -2,6 +2,12 @@ FROM ubuntu:22.04
 
 ARG PRESET=release-test-clang
 
+# Make apt tolerant of transient PPA/mirror timeouts (shared runners see
+# ppa.launchpadcontent.net connection timeouts every few builds).
+RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
+    echo 'Acquire::http::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries && \
+    echo 'Acquire::https::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries
+
 # Install software-properties-common for add-apt-repository
 RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test
 

From 1799ac35f1c15a1ff84368c04beee6022a328a1c Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Tue, 5 May 2026 18:06:21 +0300
Subject: [PATCH 05/21] ci(slo): make 30-min build resilient to network flakes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two separate issues were making every SLO PR a coin flip:

1. Any transient network error (GitHub release download, PPA timeout,
   abseil/protobuf/grpc tarball) kills the entire 30-min build. Each
   of the 8 wget calls and both apt-get steps is now retried:
   - wget gets --tries=5 --waitretry=15 --timeout=60
     --retry-connrefused --retry-on-http-error=500,502,503,504
     via a shared WGET_OPTS env var.
   - apt already has Acquire::Retries=5.

2. Every CI run does a full cold build. The Dockerfile's toolchain and
   dep layers (~25 min) never change between SDK commits, so caching
   them is trivially safe.
   - Switch slo.yml to docker/build-push-action@v6 with GHA type=gha
     cache export/import, scoped per preset.
   - First build on a runner still takes ~30 min; subsequent builds
     where only the SDK source changed should take ~3 min.
   - Baseline build uses continue-on-error + an explicit fallback
     step that retags ydb-app-current as ydb-app-baseline when the
     historical commit won't compile — replaces the inline shell
     if-else that docker/build-push-action can't express.
---
 .github/workflows/slo.yml      | 78 ++++++++++++++++++++--------------
 tests/slo_workloads/Dockerfile | 25 ++++++-----
 2 files changed, 59 insertions(+), 44 deletions(-)

diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml
index 4cdf4bfec8..0982b7125c 100644
--- a/.github/workflows/slo.yml
+++ b/.github/workflows/slo.yml
@@ -89,44 +89,56 @@ jobs:
           fetch-depth: 1
           submodules: true
 
-      - name: Build current workload image
-        working-directory: sdk-current
-        run: |
-          set -euxo pipefail
-          cp tests/slo_workloads/.dockerignore .dockerignore
-          docker build \
-            --platform linux/amd64 \
-            --build-arg PRESET=${{ matrix.sdk.preset }} \
-            -t ydb-app-current \
-            -f tests/slo_workloads/Dockerfile \
-            .
-          rm -f .dockerignore
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
 
-      - name: Build baseline workload image
-        working-directory: sdk-baseline
+      # .dockerignore lives under tests/slo_workloads/; buildx expects it at
+      # the context root, so place a copy in each SDK checkout.
+      - name: Stage .dockerignore
         run: |
           set -euxo pipefail
-          # If the historical commit lacks the SLO workload files or can't
-          # compile, fall back to the current image so the SLO run is still
-          # comparable against itself rather than silently failing.
-          if [ ! -f tests/slo_workloads/Dockerfile ]; then
-            echo "Baseline commit has no SLO Dockerfile; reusing current image"
-            docker tag ydb-app-current ydb-app-baseline
-            exit 0
+          cp sdk-current/tests/slo_workloads/.dockerignore sdk-current/.dockerignore
+          if [ -f sdk-baseline/tests/slo_workloads/.dockerignore ]; then
+            cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore
           fi
 
-          cp tests/slo_workloads/.dockerignore .dockerignore
-          if ! docker build \
-                --platform linux/amd64 \
-                --build-arg PRESET=${{ matrix.sdk.preset }} \
-                -t ydb-app-baseline \
-                -f tests/slo_workloads/Dockerfile \
-                .
-          then
-            echo "Baseline build failed; reusing current image"
-            docker tag ydb-app-current ydb-app-baseline
-          fi
-          rm -f .dockerignore
+      # A clean build of the SLO image takes ~30 min because the Dockerfile
+      # rebuilds the full C++ toolchain + abseil/protobuf/grpc from source.
+      # The GHA cache lets subsequent runs reuse every layer up to the SDK
+      # source COPY, so only the actual workload link step reruns (~3 min).
+      - name: Build current workload image
+        uses: docker/build-push-action@v6
+        with:
+          context: sdk-current
+          file: sdk-current/tests/slo_workloads/Dockerfile
+          platforms: linux/amd64
+          tags: ydb-app-current
+          load: true
+          build-args: PRESET=${{ matrix.sdk.preset }}
+          cache-from: type=gha,scope=slo-${{ matrix.sdk.preset }}
+          cache-to: type=gha,mode=max,scope=slo-${{ matrix.sdk.preset }}
+
+      - name: Build baseline workload image
+        id: baseline-build
+        continue-on-error: true
+        uses: docker/build-push-action@v6
+        with:
+          context: sdk-baseline
+          file: sdk-baseline/tests/slo_workloads/Dockerfile
+          platforms: linux/amd64
+          tags: ydb-app-baseline
+          load: true
+          build-args: PRESET=${{ matrix.sdk.preset }}
+          cache-from: type=gha,scope=slo-${{ matrix.sdk.preset }}
+
+      # If the historical commit lacks the SLO Dockerfile or can't compile,
+      # reuse the current image so the SLO run is still comparable against
+      # itself rather than failing outright.
+      - name: Fall back to current image for baseline
+        if: steps.baseline-build.outcome == 'failure'
+        run: |
+          echo "Baseline build failed; reusing current image as baseline."
+          docker tag ydb-app-current ydb-app-baseline
 
       - name: Run SLO Tests
         uses: ydb-platform/ydb-slo-action/init@v2
diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile
index ca9d77d5fe..7f8ea5e78e 100644
--- a/tests/slo_workloads/Dockerfile
+++ b/tests/slo_workloads/Dockerfile
@@ -2,12 +2,15 @@ FROM ubuntu:22.04
 
 ARG PRESET=release-test-clang
 
-# Make apt tolerant of transient PPA/mirror timeouts (shared runners see
-# ppa.launchpadcontent.net connection timeouts every few builds).
+# Every RUN that hits the network retries on transient failures so one
+# flake doesn't throw away 30 min of previous build work. apt gets five
+# Acquire retries + 60 s timeouts; wget gets the equivalent via WGET_OPTS.
 RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
     echo 'Acquire::http::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries && \
     echo 'Acquire::https::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries
 
+ENV WGET_OPTS="--tries=5 --waitretry=15 --timeout=60 --retry-connrefused --retry-on-http-error=500,502,503,504"
+
 # Install software-properties-common for add-apt-repository
 RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test
 
@@ -20,7 +23,7 @@ RUN apt-get -y update && apt-get -y install \
 
 # Install CMake
 ENV CMAKE_VERSION=3.27.7
-RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh \
+RUN wget $WGET_OPTS https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh \
     -q -O cmake-install.sh \
     && chmod u+x cmake-install.sh \
     && ./cmake-install.sh --skip-license --prefix=/usr/local \
@@ -28,7 +31,7 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cm
 
 # Install LLVM
 ENV LLVM_VERSION=16
-RUN wget https://apt.llvm.org/llvm.sh && \
+RUN wget $WGET_OPTS https://apt.llvm.org/llvm.sh && \
     chmod u+x llvm.sh && \
     ./llvm.sh ${LLVM_VERSION} && \
     rm llvm.sh
@@ -45,7 +48,7 @@ RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 10000 && \
 # Install abseil-cpp
 ENV ABSEIL_CPP_VERSION=20230802.0
 ENV ABSEIL_CPP_INSTALL_DIR=/root/ydb_deps/absl
-RUN wget -O abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz https://github.com/abseil/abseil-cpp/archive/refs/tags/${ABSEIL_CPP_VERSION}.tar.gz && \
+RUN wget $WGET_OPTS -O abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz https://github.com/abseil/abseil-cpp/archive/refs/tags/${ABSEIL_CPP_VERSION}.tar.gz && \
     tar -xvzf abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz && cd abseil-cpp-${ABSEIL_CPP_VERSION} && \
     mkdir build && cd build && \
     cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DABSL_PROPAGATE_CXX_STD=ON .. && \
@@ -56,7 +59,7 @@ RUN wget -O abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz https://github.com/abseil/ab
 # Install protobuf
 ENV PROTOBUF_VERSION=3.21.12
 ENV PROTOBUF_INSTALL_DIR=/root/ydb_deps/protobuf
-RUN wget -O protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/protocolbuffers/protobuf/archive/refs/tags/v${PROTOBUF_VERSION}.tar.gz && \
+RUN wget $WGET_OPTS -O protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/protocolbuffers/protobuf/archive/refs/tags/v${PROTOBUF_VERSION}.tar.gz && \
     tar -xvzf protobuf-${PROTOBUF_VERSION}.tar.gz && cd protobuf-${PROTOBUF_VERSION} && \
     mkdir build && cd build && \
     cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_INSTALL=ON -Dprotobuf_ABSL_PROVIDER=package .. && \
@@ -67,7 +70,7 @@ RUN wget -O protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/protocolbuffe
 # Install grpc
 ENV GRPC_VERSION=1.54.3
 ENV GRPC_INSTALL_DIR=/root/ydb_deps/grpc
-RUN wget -O grpc-${GRPC_VERSION}.tar.gz https://github.com/grpc/grpc/archive/refs/tags/v${GRPC_VERSION}.tar.gz && \
+RUN wget $WGET_OPTS -O grpc-${GRPC_VERSION}.tar.gz https://github.com/grpc/grpc/archive/refs/tags/v${GRPC_VERSION}.tar.gz && \
     tar -xvzf grpc-${GRPC_VERSION}.tar.gz && cd grpc-${GRPC_VERSION} && \
     mkdir build && cd build && \
     cmake -G Ninja -DCMAKE_PREFIX_PATH="${ABSEIL_CPP_INSTALL_DIR};${PROTOBUF_INSTALL_DIR}" \
@@ -84,7 +87,7 @@ RUN wget -O grpc-${GRPC_VERSION}.tar.gz https://github.com/grpc/grpc/archive/ref
 # Install base64
 ENV BASE64_VERSION=0.5.2
 ENV BASE64_INSTALL_DIR=/root/ydb_deps/base64
-RUN wget -O base64-${BASE64_VERSION}.tar.gz https://github.com/aklomp/base64/archive/refs/tags/v${BASE64_VERSION}.tar.gz && \
+RUN wget $WGET_OPTS -O base64-${BASE64_VERSION}.tar.gz https://github.com/aklomp/base64/archive/refs/tags/v${BASE64_VERSION}.tar.gz && \
     tar -xvzf base64-${BASE64_VERSION}.tar.gz && cd base64-${BASE64_VERSION} && \
     mkdir build && cd build && \
     cmake -G Ninja -DCMAKE_BUILD_TYPE=Release .. && \
@@ -95,7 +98,7 @@ RUN wget -O base64-${BASE64_VERSION}.tar.gz https://github.com/aklomp/base64/arc
 # Install brotli
 ENV BROTLI_VERSION=1.1.0
 ENV BROTLI_INSTALL_DIR=/root/ydb_deps/brotli
-RUN wget -O brotli-${BROTLI_VERSION}.tar.gz https://github.com/google/brotli/archive/refs/tags/v${BROTLI_VERSION}.tar.gz && \
+RUN wget $WGET_OPTS -O brotli-${BROTLI_VERSION}.tar.gz https://github.com/google/brotli/archive/refs/tags/v${BROTLI_VERSION}.tar.gz && \
     tar -xvzf brotli-${BROTLI_VERSION}.tar.gz && cd brotli-${BROTLI_VERSION} && \
     mkdir build && cd build && \
     cmake -G Ninja -DCMAKE_BUILD_TYPE=Release .. && \
@@ -106,7 +109,7 @@ RUN wget -O brotli-${BROTLI_VERSION}.tar.gz https://github.com/google/brotli/arc
 # Install jwt-cpp
 ENV JWT_CPP_VERSION=0.7.0
 ENV JWT_CPP_INSTALL_DIR=/root/ydb_deps/jwt-cpp
-RUN wget -O jwt-cpp-${JWT_CPP_VERSION}.tar.gz https://github.com/Thalhammer/jwt-cpp/archive/refs/tags/v${JWT_CPP_VERSION}.tar.gz && \
+RUN wget $WGET_OPTS -O jwt-cpp-${JWT_CPP_VERSION}.tar.gz https://github.com/Thalhammer/jwt-cpp/archive/refs/tags/v${JWT_CPP_VERSION}.tar.gz && \
     tar -xvzf jwt-cpp-${JWT_CPP_VERSION}.tar.gz && cd jwt-cpp-${JWT_CPP_VERSION} && \
     mkdir build && cd build && \
     cmake -G Ninja -DCMAKE_BUILD_TYPE=Release .. && \
@@ -116,7 +119,7 @@ RUN wget -O jwt-cpp-${JWT_CPP_VERSION}.tar.gz https://github.com/Thalhammer/jwt-
 
 # Install ccache 4.8.1 or above
 ENV CCACHE_VERSION=4.8.1
-RUN wget https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz \
+RUN wget $WGET_OPTS https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz \
     && tar -xf ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz \
     && cp ccache-${CCACHE_VERSION}-linux-x86_64/ccache /usr/local/bin/ \
     && rm -rf ccache-${CCACHE_VERSION}-linux-x86_64 ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz

From da782d4c559470ede50c9857bbdb045d9bf4bb4f Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Tue, 5 May 2026 18:07:02 +0300
Subject: [PATCH 06/21] tests/slo_workloads: retry PPA/apt steps at the shell
 level

The earlier Acquire::Retries=5 tweak covered HTTP-level errors but
didn't handle TCP connect timeouts to ppa.launchpadcontent.net, which
is the actual failure mode observed on shared CI runners. Wrap both
the add-apt-repository step and the main apt-get install in shell
retry loops (5 attempts, 15/30/45/60/75 s backoff) so a CDN blip no
longer kills the 30-min build.
---
 tests/slo_workloads/Dockerfile | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile
index 7f8ea5e78e..7680560cac 100644
--- a/tests/slo_workloads/Dockerfile
+++ b/tests/slo_workloads/Dockerfile
@@ -11,15 +11,34 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
 
 ENV WGET_OPTS="--tries=5 --waitretry=15 --timeout=60 --retry-connrefused --retry-on-http-error=500,502,503,504"
 
-# Install software-properties-common for add-apt-repository
-RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test
+# Install software-properties-common and add the gcc-13 PPA.
+# Acquire::Retries only retries HTTP errors; TCP connect timeouts to
+# ppa.launchpadcontent.net still drop through and kill the step. Wrap the
+# whole command in a shell retry loop with exponential backoff so a CDN
+# blip doesn't throw away 30 minutes of downstream build work.
+RUN for i in 1 2 3 4 5; do \
+        apt-get -y update && \
+        apt-get -y install software-properties-common && \
+        add-apt-repository -y ppa:ubuntu-toolchain-r/test && \
+        apt-get -y update && \
+        break; \
+        echo "add-apt-repository attempt $i failed; sleeping $((i * 15))s"; \
+        sleep $((i * 15)); \
+    done && \
+    apt-cache show gcc-13 > /dev/null  # fail fast if PPA never came up
 
 # Install C++ tools and libraries
-RUN apt-get -y update && apt-get -y install \
-    git gdb wget ninja-build libidn11-dev ragel yasm libc-ares-dev libre2-dev \
-    rapidjson-dev zlib1g-dev libxxhash-dev libzstd-dev libsnappy-dev libgtest-dev libgmock-dev \
-    libbz2-dev liblz4-dev libdouble-conversion-dev libssl-dev libstdc++-13-dev gcc-13 g++-13 \
-    && apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN for i in 1 2 3 4 5; do \
+        apt-get -y install \
+            git gdb wget ninja-build libidn11-dev ragel yasm libc-ares-dev libre2-dev \
+            rapidjson-dev zlib1g-dev libxxhash-dev libzstd-dev libsnappy-dev libgtest-dev libgmock-dev \
+            libbz2-dev liblz4-dev libdouble-conversion-dev libssl-dev libstdc++-13-dev gcc-13 g++-13 && \
+        break; \
+        echo "apt-get install attempt $i failed; sleeping $((i * 15))s"; \
+        sleep $((i * 15)); \
+        apt-get -y update || true; \
+    done && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 
 # Install CMake
 ENV CMAKE_VERSION=3.27.7

From a8da8bb6de1fcec95e257bd7fc54f8e790d7deeb Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Tue, 5 May 2026 20:07:39 +0300
Subject: [PATCH 07/21] tests/slo_workloads: forward run-phase args in implicit
 All mode

The v2 SLO action invokes the workload as `slo-key-value <run-args>`
without a subcommand keyword. The global parser used to error on
unknown long options like --read-rps; now it tolerates them and the
implicit All branch passes the leftover argv to the run phase.
---
 tests/slo_workloads/utils/utils.cpp | 33 +++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp
index 78231c8e1c..89a642b61d 100644
--- a/tests/slo_workloads/utils/utils.cpp
+++ b/tests/slo_workloads/utils/utils.cpp
@@ -161,6 +161,12 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
     opts.SetFreeArgsMin(0);
     opts.SetFreeArgTitle(0, "<COMMAND>", GetCmdList());
     opts.ArgPermutation_ = NLastGetopt::REQUIRE_ORDER;
+    // Run-phase options (--read-rps, --write-rps, …) reach DoMain when the
+    // caller invokes the workload without an explicit subcommand (the v2 SLO
+    // action contract). Tolerate them here so the global parser stops at the
+    // first unknown option instead of erroring; they are forwarded to the
+    // run phase below.
+    opts.AllowUnknownLongOptions_ = true;
 
     TOptsParseResult res(&opts, argc, argv);
     size_t freeArgsPos = res.GetFreeArgsPos();
@@ -169,8 +175,14 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
 
     ECommandType command = (argc > 0) ? ParseCommand(*argv) : ECommandType::All;
     if (command == ECommandType::Unknown) {
-        Cerr << "Unknown command '" << *argv << "'" << Endl;
-        return EXIT_FAILURE;
+        if (argv[0][0] == '-') {
+            // First leftover token is an option, not a subcommand keyword:
+            // treat as implicit All mode and let the run phase parse it.
+            command = ECommandType::All;
+        } else {
+            Cerr << "Unknown command '" << *argv << "'" << Endl;
+            return EXIT_FAILURE;
+        }
     }
 
     if (prefix.empty()) {
@@ -227,18 +239,25 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
             break;
         case ECommandType::All: {
             Cout << "Launching full lifecycle: create -> run -> cleanup" << Endl;
-            // Synthesize argv with a fake program name so the inner NLastGetopt
-            // parsers (ParseOptionsCreate / ParseOptionsRun) treat argv[0]
-            // as the program name and parse zero real args.
+            // Forward leftover argv to the run phase so options like
+            // --read-rps / --write-rps take effect. argv[0] here is the first
+            // run-phase option (no subcommand keyword was supplied), so
+            // prepend a synthetic program name for ParseOptionsRun.
             char programName[] = "slo";
-            char* fakeArgv[] = { programName, nullptr };
+            std::vector<char*> runArgv;
+            runArgv.reserve(argc + 1);
+            runArgv.push_back(programName);
+            for (int i = 0; i < argc; ++i) {
+                runArgv.push_back(argv[i]);
+            }
             int fakeArgc = 1;
+            char* fakeArgv[] = { programName, nullptr };
 
             Cout << "[all] Launching create command..." << Endl;
             result = create(dbOptions, fakeArgc, fakeArgv);
             if (!result) {
                 Cout << "[all] Launching run command..." << Endl;
-                result = run(dbOptions, fakeArgc, fakeArgv);
+                result = run(dbOptions, static_cast<int>(runArgv.size()), runArgv.data());
             }
             Cout << "[all] Launching cleanup command..." << Endl;
             int cleanupRc = cleanup(dbOptions, fakeArgc);

From 4d27c9f612284ccd7a5e611c70b069ea2980b29f Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Wed, 6 May 2026 00:20:53 +0300
Subject: [PATCH 08/21] ci(slo): overlay current workload harness onto baseline
 checkout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The baseline image used to be built from the merge-base checkout's
tests/slo_workloads/, so any harness change on the PR (Dockerfile,
CLI parser, …) was absent from the baseline. SDK comparison should
only vary the library, not the harness — so reuse current's harness
on both sides.
---
 .github/workflows/slo.yml | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml
index 0982b7125c..63145bba8f 100644
--- a/.github/workflows/slo.yml
+++ b/.github/workflows/slo.yml
@@ -92,15 +92,19 @@ jobs:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
 
-      # .dockerignore lives under tests/slo_workloads/; buildx expects it at
-      # the context root, so place a copy in each SDK checkout.
-      - name: Stage .dockerignore
+      # Use current's workload harness (Dockerfile, sources, .dockerignore) for
+      # both builds so only the SDK library differs between current and
+      # baseline. Without this the baseline image picks up the harness from
+      # the merge-base commit, which can lag behind the action's contract.
+      # buildx also expects .dockerignore at the context root, not under
+      # tests/, so copy it up in each checkout.
+      - name: Stage workload harness
         run: |
           set -euxo pipefail
+          rm -rf sdk-baseline/tests/slo_workloads
+          cp -a sdk-current/tests/slo_workloads sdk-baseline/tests/slo_workloads
           cp sdk-current/tests/slo_workloads/.dockerignore sdk-current/.dockerignore
-          if [ -f sdk-baseline/tests/slo_workloads/.dockerignore ]; then
-            cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore
-          fi
+          cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore
 
       # A clean build of the SLO image takes ~30 min because the Dockerfile
       # rebuilds the full C++ toolchain + abseil/protobuf/grpc from source.

From a6caf4ef2a74dc198b9c36051445037f89dcefbe Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Wed, 6 May 2026 00:27:27 +0300
Subject: [PATCH 09/21] ci(slo): wire ccache into the cmake build via BuildKit
 cache mount
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cmake configure/build now run under ccache as the C/C++ compiler
launcher, with /root/.ccache exposed as a BuildKit cache mount so
state persists across runs through cache-to=type=gha,mode=max
(BuildKit ≥0.13 exports cache mounts under mode=max). Cold runs
incur the usual full compile; warm runs reuse object hashes and
should drop cmake --build from ~14 min to a few minutes.
---
 tests/slo_workloads/Dockerfile | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile
index 7680560cac..f87be8f8df 100644
--- a/tests/slo_workloads/Dockerfile
+++ b/tests/slo_workloads/Dockerfile
@@ -1,7 +1,16 @@
+# syntax=docker/dockerfile:1.7
 FROM ubuntu:22.04
 
 ARG PRESET=release-test-clang
 
+# ccache settings consumed by the configure/build steps below. The cache dir
+# is materialised by the BuildKit cache mount on those RUN steps; values
+# elsewhere in the image are inert.
+ENV CCACHE_DIR=/root/.ccache
+ENV CCACHE_MAXSIZE=2G
+ENV CCACHE_COMPRESS=true
+ENV CCACHE_COMPILERCHECK=content
+
 # Every RUN that hits the network retries on transient failures so one
 # flake doesn't throw away 30 min of previous build work. apt gets five
 # Acquire retries + 60 s timeouts; wget gets the equivalent via WGET_OPTS.
@@ -147,7 +156,13 @@ COPY . /ydb-cpp-sdk
 WORKDIR /ydb-cpp-sdk
 RUN rm -rf build
 
-RUN cmake --preset ${PRESET}
-RUN cmake --build --preset default --target slo-key-value
+RUN --mount=type=cache,target=/root/.ccache,sharing=locked \
+    cmake --preset ${PRESET} \
+        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+RUN --mount=type=cache,target=/root/.ccache,sharing=locked \
+    ccache --zero-stats >/dev/null \
+    && cmake --build --preset default --target slo-key-value \
+    && ccache --show-stats
 
 ENTRYPOINT ["./build/tests/slo_workloads/key_value/slo-key-value"]

From 09e662f97d5c2dd8c8a0b5b9e4c78fe745c2141e Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Wed, 6 May 2026 03:17:03 +0300
Subject: [PATCH 10/21] tests/slo_workloads: read prefix from YDB_DATABASE when
 connection-string lacks ?database=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The v2 SLO action provides YDB_CONNECTION_STRING in path form
(grpc://host:port/Root/testdb), which GetDatabase can't parse, so
prefix stayed empty and create issued CreateTable("key_value")
without the database root → BAD_REQUEST. Falling back to the
YDB_DATABASE env var (which the action also sets) restores the
correct table path.
---
 tests/slo_workloads/utils/utils.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp
index 89a642b61d..f52461283b 100644
--- a/tests/slo_workloads/utils/utils.cpp
+++ b/tests/slo_workloads/utils/utils.cpp
@@ -188,6 +188,12 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
     if (prefix.empty()) {
         prefix = GetDatabase(connectionString);
     }
+    if (prefix.empty()) {
+        // YDB SLO action sets YDB_CONNECTION_STRING in path form
+        // (grpc://host:port/Root/testdb), which GetDatabase can't parse.
+        // Fall back to YDB_DATABASE which the action sets alongside it.
+        prefix = GetEnv("YDB_DATABASE");
+    }
 
     if (!ParseToken(token, tokenFile)) {
         return EXIT_FAILURE;

From a581162623f4982bcf4e81a7321328d5146431f6 Mon Sep 17 00:00:00 2001
From: Vladislav Polyakov <polRk@ydb.tech>
Date: Wed, 6 May 2026 03:23:57 +0300
Subject: [PATCH 11/21] ci(slo): persist ccache between runs via actions/cache
 + buildkit-cache-dance

cache-to: type=gha,mode=max exports layer cache but not the contents
of --mount=type=cache, so the ccache mount started empty on every
run (0% hit rate observed). Restore /root/.ccache from a host dir
backed by actions/cache, inject it into BuildKit before each build
via the cache-dance action, and extract the updated state for the
next run. Same scope is shared by current and baseline since the
SDK code overlap dominates ccache hit potential.
---
 .github/workflows/slo.yml | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml
index 63145bba8f..db9e4d9841 100644
--- a/.github/workflows/slo.yml
+++ b/.github/workflows/slo.yml
@@ -106,6 +106,33 @@ jobs:
           cp sdk-current/tests/slo_workloads/.dockerignore sdk-current/.dockerignore
           cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore
 
+      # `cache-to: type=gha` does NOT export `--mount=type=cache` content, so
+      # ccache state is lost between runs. Persist /root/.ccache via host
+      # directory + cache-dance: actions/cache restores the host dir, the
+      # dance injects it into the BuildKit cache mount before the build and
+      # extracts the updated state afterwards for the next save.
+      - name: Restore ccache
+        id: ccache
+        uses: actions/cache@v4
+        with:
+          path: ccache
+          key: slo-ccache-${{ matrix.sdk.preset }}-${{ github.run_id }}
+          restore-keys: |
+            slo-ccache-${{ matrix.sdk.preset }}-
+
+      - name: Inject ccache into BuildKit
+        uses: reproducible-containers/buildkit-cache-dance@v3.1.2
+        with:
+          cache-map: |
+            {
+              "ccache": "/root/.ccache"
+            }
+          # Always extract so newly-compiled TUs from this run are saved by
+          # actions/cache (key uses ${{ github.run_id }}, so each run gets
+          # its own snapshot). Without extraction the cache stays frozen at
+          # whatever was first persisted.
+          skip-extraction: false
+
       # A clean build of the SLO image takes ~30 min because the Dockerfile
       # rebuilds the full C++ toolchain + abseil/protobuf/grpc from source.
       # The GHA cache lets subsequent runs reuse every layer up to the SDK

From 4436c0751491ed2e93bf359219e31b5e185ff57d Mon Sep 17 00:00:00 2001
From: Artem Ermoshkin <shfdis@yandex-team.ru>
Date: Fri, 29 May 2026 21:39:17 +0300
Subject: [PATCH 12/21] fix slo workload

---
 .github/workflows/slo_report.yml       | 39 ++++++++++++++++++--------
 tests/slo_workloads/key_value/drop.cpp | 26 +++++++++++++----
 tests/slo_workloads/utils/utils.cpp    |  3 ++
 3 files changed, 52 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/slo_report.yml b/.github/workflows/slo_report.yml
index b2bbee5172..0ccd36abe9 100644
--- a/.github/workflows/slo_report.yml
+++ b/.github/workflows/slo_report.yml
@@ -12,6 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     name: Publish YDB SLO Report
     permissions:
+      actions: read
       checks: write
       contents: read
       pull-requests: write
@@ -23,20 +24,36 @@ jobs:
           github_run_id: ${{ github.event.workflow_run.id }}
 
   remove-slo-label:
-    if: github.event.workflow_run.event == 'pull_request'
+    needs: publish-slo-report
+    if: always() && github.event.workflow_run.event == 'pull_request'
     runs-on: ubuntu-latest
     name: Remove SLO Label
     permissions:
       pull-requests: write
     steps:
       - name: Remove SLO label from PR
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }}
-          REPO: ${{ github.event.workflow_run.repository.full_name }}
-        run: |
-          set -euo pipefail
-          PR=$(jq -r '.[0].number' <<<"$PRS")
-          if [ "$PR" != "null" ] && [ -n "$PR" ]; then
-            gh pr edit "$PR" --repo "$REPO" --remove-label SLO
-          fi
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const pullRequests = context.payload.workflow_run.pull_requests;
+            if (pullRequests && pullRequests.length > 0) {
+              for (const pr of pullRequests) {
+                try {
+                  await github.rest.issues.removeLabel({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    issue_number: pr.number,
+                    name: 'SLO'
+                  });
+                  console.log(`Removed SLO label from PR #${pr.number}`);
+                } catch (error) {
+                  if (error.status === 404) {
+                    console.log(`SLO label not found on PR #${pr.number}, skipping`);
+                  } else {
+                    throw error;
+                  }
+                }
+              }
+            } else {
+              console.log('No pull requests associated with this workflow run');
+            }
diff --git a/tests/slo_workloads/key_value/drop.cpp b/tests/slo_workloads/key_value/drop.cpp
index ae749bc903..ef3be2f8b1 100644
--- a/tests/slo_workloads/key_value/drop.cpp
+++ b/tests/slo_workloads/key_value/drop.cpp
@@ -4,15 +4,31 @@ using namespace NLastGetopt;
 using namespace NYdb;
 using namespace NYdb::NTable;
 
-static void DropTable(TTableClient& client, const std::string& path) {
-    NYdb::NStatusHelpers::ThrowOnError(client.RetryOperationSync([path](TSession session) {
-        return session.DropTable(path).ExtractValueSync();
-    }));
+namespace {
+
+static bool DropTableWithRetry(TTableClient& client, const std::string& path) {
+    try {
+        RetryBackoff(client, 5, [path](TSession session) {
+            TStatus status = session.DropTable(path).ExtractValueSync();
+            if (status.GetStatus() == EStatus::NOT_FOUND) {
+                return TStatus(EStatus::SUCCESS, NYdb::NIssue::TIssues());
+            }
+            return status;
+        });
+        return true;
+    } catch (const NYdb::NStatusHelpers::TYdbErrorException& e) {
+        Cerr << "DropTable failed after retries: " << e << Endl;
+        return false;
+    }
 }
 
+} // namespace
+
 int DropTable(TDatabaseOptions& dbOptions) {
     TTableClient client(dbOptions.Driver);
-    DropTable(client, JoinPath(dbOptions.Prefix, TableName));
+    if (!DropTableWithRetry(client, JoinPath(dbOptions.Prefix, TableName))) {
+        return EXIT_FAILURE;
+    }
     Cout << "Table dropped." << Endl;
     return EXIT_SUCCESS;
 }
diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp
index f52461283b..5257ef0dbd 100644
--- a/tests/slo_workloads/utils/utils.cpp
+++ b/tests/slo_workloads/utils/utils.cpp
@@ -269,6 +269,9 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
             int cleanupRc = cleanup(dbOptions, fakeArgc);
             if (!result) {
                 result = cleanupRc;
+            } else if (cleanupRc) {
+                Cerr << "[all] Warning: cleanup failed (exit " << cleanupRc
+                    << ") but run succeeded; ignoring cleanup exit code." << Endl;
             }
             break;
         }

From f9e9839d937f6429b6dd70d7ff78b5b06d34224e Mon Sep 17 00:00:00 2001
From: Artem Ermoshkin <shfdis@yandex-team.ru>
Date: Fri, 29 May 2026 22:50:40 +0300
Subject: [PATCH 13/21] fix too long timeouts

---
 tests/slo_workloads/key_value/drop.cpp      | 28 ++++++++++++++-------
 tests/slo_workloads/key_value/key_value.cpp |  2 ++
 tests/slo_workloads/utils/metrics.cpp       |  5 +++-
 tests/slo_workloads/utils/statistics.cpp    | 10 +++++++-
 tests/slo_workloads/utils/statistics.h      |  2 ++
 5 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/tests/slo_workloads/key_value/drop.cpp b/tests/slo_workloads/key_value/drop.cpp
index ef3be2f8b1..9f465728c2 100644
--- a/tests/slo_workloads/key_value/drop.cpp
+++ b/tests/slo_workloads/key_value/drop.cpp
@@ -7,19 +7,28 @@ using namespace NYdb::NTable;
 namespace {
 
 static bool DropTableWithRetry(TTableClient& client, const std::string& path) {
-    try {
-        RetryBackoff(client, 5, [path](TSession session) {
-            TStatus status = session.DropTable(path).ExtractValueSync();
-            if (status.GetStatus() == EStatus::NOT_FOUND) {
+    TDuration delay = TDuration::Seconds(1);
+    constexpr std::uint32_t kMaxAttempts = 3;
+
+    for (std::uint32_t attempt = 1; attempt <= kMaxAttempts; ++attempt) {
+        TStatus status = client.RetryOperationSync([path](TSession session) {
+            TStatus dropStatus = session.DropTable(path).ExtractValueSync();
+            if (dropStatus.GetStatus() == EStatus::NOT_FOUND) {
                 return TStatus(EStatus::SUCCESS, NYdb::NIssue::TIssues());
             }
-            return status;
+            return dropStatus;
         });
-        return true;
-    } catch (const NYdb::NStatusHelpers::TYdbErrorException& e) {
-        Cerr << "DropTable failed after retries: " << e << Endl;
-        return false;
+        if (status.IsSuccess()) {
+            return true;
+        }
+        Cerr << "DropTable attempt " << attempt << " failed: " << status << Endl;
+        if (attempt == kMaxAttempts) {
+            break;
+        }
+        Sleep(delay);
+        delay = Min(delay * 2, TDuration::Seconds(8));
     }
+    return false;
 }
 
 } // namespace
@@ -27,6 +36,7 @@ static bool DropTableWithRetry(TTableClient& client, const std::string& path) {
 int DropTable(TDatabaseOptions& dbOptions) {
     TTableClient client(dbOptions.Driver);
     if (!DropTableWithRetry(client, JoinPath(dbOptions.Prefix, TableName))) {
+        Cerr << "DropTable failed after all retries." << Endl;
         return EXIT_FAILURE;
     }
     Cout << "Table dropped." << Endl;
diff --git a/tests/slo_workloads/key_value/key_value.cpp b/tests/slo_workloads/key_value/key_value.cpp
index 1cb40f3620..23bd3ff184 100644
--- a/tests/slo_workloads/key_value/key_value.cpp
+++ b/tests/slo_workloads/key_value/key_value.cpp
@@ -50,6 +50,7 @@ int DoCreate(TDatabaseOptions& dbOptions, int argc, char** argv) {
     jobs->Start();
     jobs->Wait();
     jobs->ShowProgress();
+    jobs.reset();
 
     return EXIT_SUCCESS;
 }
@@ -95,6 +96,7 @@ int DoRun(TDatabaseOptions& dbOptions, int argc, char** argv) {
     Cout << "All jobs finished: " << TInstant::Now().ToRfc822StringLocal() << Endl;
 
     jobs->ShowProgress();
+    jobs.reset();
 
     return EXIT_SUCCESS;
 }
diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp
index 4506c1ca39..30a2ee19ca 100644
--- a/tests/slo_workloads/utils/metrics.cpp
+++ b/tests/slo_workloads/utils/metrics.cpp
@@ -109,7 +109,7 @@ class TOtelMetricsPusher : public IMetricsPusher {
 
         opentelemetry::sdk::metrics::PeriodicExportingMetricReaderOptions readerOptions;
         readerOptions.export_interval_millis = 1000ms;
-        readerOptions.export_timeout_millis  = 900ms;
+        readerOptions.export_timeout_millis  = 500ms;
 
         auto metricReader = opentelemetry::sdk::metrics::PeriodicExportingMetricReaderFactory::Create(std::move(exporter), readerOptions);
 
@@ -132,6 +132,9 @@ class TOtelMetricsPusher : public IMetricsPusher {
         if (PublisherThread_.joinable()) {
             PublisherThread_.join();
         }
+        if (MeterProvider_) {
+            MeterProvider_->Shutdown(std::chrono::seconds(3));
+        }
     }
 
     void PushRequestData(const TRequestData& requestData) override {
diff --git a/tests/slo_workloads/utils/statistics.cpp b/tests/slo_workloads/utils/statistics.cpp
index 15789e7620..dddc35c56a 100644
--- a/tests/slo_workloads/utils/statistics.cpp
+++ b/tests/slo_workloads/utils/statistics.cpp
@@ -36,6 +36,11 @@ TStat::TStat(const std::optional<std::string>& metricsPushUrl, const std::string
     MetricsPushQueue.Start(20);
 }
 
+TStat::~TStat() {
+    MetricsPushQueue.Stop();
+    MetricsPusher.reset();
+}
+
 void TStat::Start() {
     StartTime = TInstant::Now();
 }
@@ -71,9 +76,12 @@ void TStat::FinishRequest(const std::shared_ptr<TStatUnit>& unit, const TFinalSt
     }
 
     ScheduleMetricsPush([this, delay, status, unit]() {
+        NYdb::EStatus requestStatus = status
+            ? status->GetStatus()
+            : NYdb::EStatus::CLIENT_DEADLINE_EXCEEDED;
         MetricsPusher->PushRequestData({
             .Delay = delay,
-            .Status = status->GetStatus(),
+            .Status = requestStatus,
             .RetryAttempts = unit->RetryAttempts
         });
     });
diff --git a/tests/slo_workloads/utils/statistics.h b/tests/slo_workloads/utils/statistics.h
index 9e7a092430..c61a267db4 100644
--- a/tests/slo_workloads/utils/statistics.h
+++ b/tests/slo_workloads/utils/statistics.h
@@ -62,6 +62,8 @@ class TStat {
 
     TInstant GetStartTime() const;
 
+    ~TStat();
+
 private:
     void ScheduleMetricsPush(std::function<void()> func);
 

From cce12fbebda26450cf493d0dcd220f019d3d58c0 Mon Sep 17 00:00:00 2001
From: Artem Ermoshkin <shfdis@yandex-team.ru>
Date: Sat, 30 May 2026 17:48:25 +0300
Subject: [PATCH 14/21] remove slop manyal retry, some cli flags and fix double
 shutdown

---
 tests/slo_workloads/key_value/create.cpp |   4 +-
 tests/slo_workloads/key_value/drop.cpp   |  41 ++----
 tests/slo_workloads/utils/executor.cpp   | 160 +++--------------------
 tests/slo_workloads/utils/executor.h     |  46 +------
 tests/slo_workloads/utils/metrics.cpp    |   5 +-
 tests/slo_workloads/utils/utils.cpp      |   5 -
 tests/slo_workloads/utils/utils.h        |  25 ----
 7 files changed, 32 insertions(+), 254 deletions(-)

diff --git a/tests/slo_workloads/key_value/create.cpp b/tests/slo_workloads/key_value/create.cpp
index a79db76a12..a105caba7f 100644
--- a/tests/slo_workloads/key_value/create.cpp
+++ b/tests/slo_workloads/key_value/create.cpp
@@ -8,7 +8,7 @@ using namespace NYdb::NTable;
 
 namespace {
     void CreateTable(TTableClient& client, const std::string& prefix) {
-        RetryBackoff(client, 5, [prefix](TSession session) {
+        NYdb::NStatusHelpers::ThrowOnError(client.RetryOperationSync([prefix](TSession session) {
             auto desc = TTableBuilder()
                 .AddNullableColumn("object_id_key", EPrimitiveType::Uint32)
                 .AddNullableColumn("object_id", EPrimitiveType::Uint32)
@@ -27,7 +27,7 @@ namespace {
                 , std::move(desc)
                 , std::move(tableSettings)
             ).ExtractValueSync();
-        });
+        }));
     }
 } //namespace
 
diff --git a/tests/slo_workloads/key_value/drop.cpp b/tests/slo_workloads/key_value/drop.cpp
index 9f465728c2..e22c92c894 100644
--- a/tests/slo_workloads/key_value/drop.cpp
+++ b/tests/slo_workloads/key_value/drop.cpp
@@ -4,39 +4,18 @@ using namespace NLastGetopt;
 using namespace NYdb;
 using namespace NYdb::NTable;
 
-namespace {
-
-static bool DropTableWithRetry(TTableClient& client, const std::string& path) {
-    TDuration delay = TDuration::Seconds(1);
-    constexpr std::uint32_t kMaxAttempts = 3;
-
-    for (std::uint32_t attempt = 1; attempt <= kMaxAttempts; ++attempt) {
-        TStatus status = client.RetryOperationSync([path](TSession session) {
-            TStatus dropStatus = session.DropTable(path).ExtractValueSync();
-            if (dropStatus.GetStatus() == EStatus::NOT_FOUND) {
-                return TStatus(EStatus::SUCCESS, NYdb::NIssue::TIssues());
-            }
-            return dropStatus;
-        });
-        if (status.IsSuccess()) {
-            return true;
-        }
-        Cerr << "DropTable attempt " << attempt << " failed: " << status << Endl;
-        if (attempt == kMaxAttempts) {
-            break;
-        }
-        Sleep(delay);
-        delay = Min(delay * 2, TDuration::Seconds(8));
-    }
-    return false;
-}
-
-} // namespace
-
 int DropTable(TDatabaseOptions& dbOptions) {
     TTableClient client(dbOptions.Driver);
-    if (!DropTableWithRetry(client, JoinPath(dbOptions.Prefix, TableName))) {
-        Cerr << "DropTable failed after all retries." << Endl;
+    const std::string path = JoinPath(dbOptions.Prefix, TableName);
+    TStatus status = client.RetryOperationSync([path](TSession session) {
+        TStatus dropStatus = session.DropTable(path).ExtractValueSync();
+        if (dropStatus.GetStatus() == EStatus::NOT_FOUND) {
+            return TStatus(EStatus::SUCCESS, NYdb::NIssue::TIssues());
+        }
+        return dropStatus;
+    });
+    if (!status.IsSuccess()) {
+        Cerr << "DropTable failed: " << status << Endl;
         return EXIT_FAILURE;
     }
     Cout << "Table dropped." << Endl;
diff --git a/tests/slo_workloads/utils/executor.cpp b/tests/slo_workloads/utils/executor.cpp
index 28add35c98..aa0a637905 100644
--- a/tests/slo_workloads/utils/executor.cpp
+++ b/tests/slo_workloads/utils/executor.cpp
@@ -16,62 +16,17 @@ TInsistentClient::TInsistentClient(const TCommonOptions& opts)
             .AllowRequestMigration(true)
     )
     , ClientMaxRetries(opts.MaxRetries)
-    , Timeout(opts.ReactionTime)
-    , RetryTimeout(Timeout / 2)
-    , SessionTimeout(Timeout + ReactionTimeDelay)
-    , UseApplicationTimeout(opts.UseApplicationTimeout)
-    , SendPreventiveRequest(opts.SendPreventiveRequest)
+    , SessionTimeout(opts.ReactionTime + ReactionTimeDelay)
 {
-    if (UseApplicationTimeout || SendPreventiveRequest) {
-        CallbackQueue.Start(opts.MaxCallbackThreads);
-        // Thread that executes timeout callbacks
-        auto threadFunc = [this]() {
-            TDuration timeToSleep;
-            while (!ShouldStop.WaitT(timeToSleep)) {
-                TInstant wakeupTime;
-                TInstant now;
-                with_lock(CallbacksLock) {
-                    now = TInstant::Now();
-                    while (!TimeoutCallbacks.empty() && now >= TimeoutCallbacks.front().ExecucionTime) {
-                        Y_UNUSED(CallbackQueue.AddFunc(TimeoutCallbacks.front().Callback));
-                        RemoveTimeoutIter(TimeoutCallbacks.front().context);
-                    }
-                    while (!RetryCallbacks.empty() && now >= RetryCallbacks.front().ExecucionTime) {
-                        Y_UNUSED(CallbackQueue.AddFunc(RetryCallbacks.front().Callback));
-                        RemoveRetryIter(RetryCallbacks.front().context);
-                    }
-                    if (RetryCallbacks.empty()) {
-                        wakeupTime = now + RetryTimeout;
-                    } else {
-                        wakeupTime = RetryCallbacks.front().ExecucionTime;
-                    }
-                    if (!TimeoutCallbacks.empty()) {
-                        wakeupTime = Min(wakeupTime, TimeoutCallbacks.front().ExecucionTime);
-                    }
-                }
-                timeToSleep = wakeupTime - now;
-            }
-        };
-        WorkThread.reset(SystemThreadFactory()->Run(threadFunc).Release());
-    }
 }
 
 TInsistentClient::~TInsistentClient() {
-    ShouldStop.Signal();
-    if (UseApplicationTimeout || SendPreventiveRequest) {
-        if (WorkThread) {
-            WorkThread->Join();
-        } else {
-            Cerr << (TStringBuilder() << "TInsistentClient::~TINsistentClient Error: WorkThread is not running." << Endl);
-        }
-        CallbackQueue.Stop();
-    }
     Client.Stop().Wait(WaitTimeout);
 }
 
 void TInsistentClient::Report(TStringBuilder& out) const {
-    out << "Client retries sent: total " << CounterSStart.load()
-        << ", successful " << CounterSOk.load() << Endl;
+    out << "Operations dispatched: " << CounterStart.load()
+        << ", succeeded: " << CounterOk.load() << Endl;
 }
 
 std::uint64_t TInsistentClient::GetActiveSessions() const {
@@ -79,112 +34,27 @@ std::uint64_t TInsistentClient::GetActiveSessions() const {
     return static_cast<std::uint64_t>(sessions);
 }
 
-void TInsistentClient::ClearContext(std::shared_ptr<TOperationContext>& context) {
-    if (SendPreventiveRequest) {
-        RemoveRetryIter(context);
-    }
-    if (UseApplicationTimeout) {
-        RemoveTimeoutIter(context);
-    }
-}
-
-void TInsistentClient::RemoveRetryIter(std::shared_ptr<TOperationContext>& context) {
-    if (context->RetryIter.Valid) {
-        context->RetryIter.Valid = false;
-        RetryCallbacks.erase(context->RetryIter.RealIter);
-    }
-}
-
-void TInsistentClient::RemoveTimeoutIter(std::shared_ptr<TOperationContext>& context) {
-    if (context->TimeoutIter.Valid) {
-        context->TimeoutIter.Valid = false;
-        TimeoutCallbacks.erase(context->TimeoutIter.RealIter);
-    }
-}
-
 TAsyncFinalStatus TInsistentClient::ExecuteWithRetry(const NYdb::NTable::TTableClient::TOperationFunc& operation) {
     TTracedPromise<TFinalStatus> promise = TTracedPromise<TFinalStatus>(
         NThreading::NewPromise<TFinalStatus>(),
         &ExecutorPromises
     );
-    std::shared_ptr<TOperationContext> context = std::make_shared<TOperationContext>();
 
-    auto launchOperation = [this, operation, promise, context](bool firstTime) mutable {
-        with_lock(context->Lock) {
-            if (context->Finished) {
-                return;
-            }
-        }
-        auto callback = [promise, context, firstTime, this](const NYdb::TAsyncStatus& future) mutable {
-            Y_ABORT_UNLESS(future.HasValue());
-            // Not setting promise under lock to avoid deadlock
-            bool firstCallback = false;
-            with_lock(context->Lock) {
-                if (!context->Finished) {
-                    context->Finished = true;
-                    firstCallback = true;
-                }
-            }
-            if (firstCallback) {
-                promise.SetValue(future.GetValue());
-                with_lock(CallbacksLock) {
-                    if (firstTime) {
-                        CounterFOk.fetch_add(1);
-                    } else {
-                        CounterSOk.fetch_add(1);
-                    }
-                    ClearContext(context);
-                }
-            }
-        };
-        if (firstTime) {
-            CounterFStart.fetch_add(1);
-        } else {
-            CounterSStart.fetch_add(1);
-        }
-        NYdb::NTable::TRetryOperationSettings settings;
-        settings.MaxRetries(ClientMaxRetries);
-        settings.GetSessionClientTimeout(SessionTimeout);
-        auto future = Client.RetryOperation(operation, settings);
-        future.Subscribe(std::move(callback));
-    };
+    CounterStart.fetch_add(1);
 
-    with_lock(CallbacksLock) {
-        TInstant now = TInstant::Now();
+    NYdb::NTable::TRetryOperationSettings settings;
+    settings.MaxRetries(ClientMaxRetries);
+    settings.GetSessionClientTimeout(SessionTimeout);
 
-        if (SendPreventiveRequest) {
-            auto onRetryTimeout = [launchOperation]() mutable {
-                launchOperation(false);
-            };
-
-            RetryCallbacks.push_back({ now + RetryTimeout, onRetryTimeout, context });
-            context->RetryIter = { --RetryCallbacks.end() };
-        }
-
-        if (UseApplicationTimeout) {
-            auto onTimeout = [this, promise, context]() mutable {
-                // Not setting promise under lock to avoid deadlock
-                bool firstCallback = false;
-                with_lock(context->Lock) {
-                    if (!context->Finished) {
-                        context->Finished = true;
-                        firstCallback = true;
-                    }
-                }
-                if (firstCallback) {
-                    promise.SetValue(TFinalStatus());
-                    with_lock(CallbacksLock) {
-                        ClearContext(context);
-                    }
-                }
-            };
-
-            TimeoutCallbacks.push_back({ now + Timeout, onTimeout, context });
-            context->TimeoutIter = { --TimeoutCallbacks.end() };
+    auto future = Client.RetryOperation(operation, settings);
+    future.Subscribe([promise, this](const NYdb::TAsyncStatus& f) mutable {
+        Y_ABORT_UNLESS(f.HasValue());
+        const auto& status = f.GetValue();
+        if (status.IsSuccess()) {
+            CounterOk.fetch_add(1);
         }
-    }
-
-    launchOperation(true);
+        promise.SetValue(status);
+    });
 
     return promise.GetFuture();
 }
diff --git a/tests/slo_workloads/utils/executor.h b/tests/slo_workloads/utils/executor.h
index 69ed261a35..c20df89d6d 100644
--- a/tests/slo_workloads/utils/executor.h
+++ b/tests/slo_workloads/utils/executor.h
@@ -50,26 +50,6 @@ class TTracedPromise : public NThreading::TPromise<T> {
 
 class TInsistentClient {
 public:
-    struct TDelayedCallback;
-
-    struct TCheckedIterator {
-        std::list<TDelayedCallback>::iterator RealIter;
-        bool Valid = true;
-    };
-
-    struct TOperationContext {
-        bool Finished = false;
-        TAdaptiveLock Lock;
-        TCheckedIterator RetryIter;
-        TCheckedIterator TimeoutIter;
-    };
-
-    struct TDelayedCallback {
-        TInstant ExecucionTime;
-        std::function<void()> Callback;
-        std::shared_ptr<TOperationContext> context;
-    };
-
     TInsistentClient(const TCommonOptions& opts);
     ~TInsistentClient();
     void Report(TStringBuilder& out) const;
@@ -77,32 +57,12 @@ class TInsistentClient {
     std::uint64_t GetActiveSessions() const;
 
 private:
-    void ClearContext(std::shared_ptr<TOperationContext>& context);
-    void RemoveRetryIter(std::shared_ptr<TOperationContext>& context);
-    void RemoveTimeoutIter(std::shared_ptr<TOperationContext>& context);
-
-    TThreadPool CallbackQueue;
     NYdb::NTable::TTableClient Client;
     std::uint32_t ClientMaxRetries;
-    TDuration Timeout;
-    TDuration RetryTimeout;
     TDuration SessionTimeout;
-    TAdaptiveLock CallbacksLock;
-    std::unique_ptr<IThreadFactory::IThread> WorkThread;
-    TManualEvent ShouldStop;
-    std::list<TDelayedCallback> RetryCallbacks;
-    std::list<TDelayedCallback> TimeoutCallbacks;
-    bool UseApplicationTimeout;
-    bool SendPreventiveRequest;
-
-    // Ok received on the First try
-    std::atomic<std::uint64_t> CounterFOk = 0;
-    // Ok received on the Second try
-    std::atomic<std::uint64_t> CounterSOk = 0;
-    // First try launches (= total)
-    std::atomic<std::uint64_t> CounterFStart = 0;
-    // Second try launches
-    std::atomic<std::uint64_t> CounterSStart = 0;
+
+    std::atomic<std::uint64_t> CounterStart = 0;
+    std::atomic<std::uint64_t> CounterOk = 0;
 };
 
 class TExecutor {
diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp
index 30a2ee19ca..27a257c104 100644
--- a/tests/slo_workloads/utils/metrics.cpp
+++ b/tests/slo_workloads/utils/metrics.cpp
@@ -132,9 +132,8 @@ class TOtelMetricsPusher : public IMetricsPusher {
         if (PublisherThread_.joinable()) {
             PublisherThread_.join();
         }
-        if (MeterProvider_) {
-            MeterProvider_->Shutdown(std::chrono::seconds(3));
-        }
+        // MeterProvider destructor calls Shutdown(); do not call it explicitly
+        // here — the OTel SDK rejects a second Shutdown with a warning.
     }
 
     void PushRequestData(const TRequestData& requestData) override {
diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp
index 5257ef0dbd..320e53b700 100644
--- a/tests/slo_workloads/utils/utils.cpp
+++ b/tests/slo_workloads/utils/utils.cpp
@@ -515,11 +515,6 @@ void ParseOptionsCommon(TOpts& opts, TCommonOptions& options) {
         .SetFlag(&options.DontPushMetrics).DefaultValue(options.DontPushMetrics);
     opts.AddLongOption("metrics-push-url", "URL to push metrics").RequiredArgument("URL")
         .DefaultValue(options.MetricsPushUrl).StoreResult(&options.MetricsPushUrl);
-    opts.AddLongOption("app-timeout", "Use application timeout (over SDK)").NoArgument()
-        .SetFlag(&options.UseApplicationTimeout).DefaultValue(options.UseApplicationTimeout);
-    opts.AddLongOption("prevention-request", "Send prevention request at 1/2 of timeout").NoArgument()
-        .SetFlag(&options.SendPreventiveRequest).DefaultValue(options.SendPreventiveRequest);
-
     opts.MutuallyExclusive("dont-push", "metrics-push-url");
 }
 
diff --git a/tests/slo_workloads/utils/utils.h b/tests/slo_workloads/utils/utils.h
index 3eb3c48978..28576adaad 100644
--- a/tests/slo_workloads/utils/utils.h
+++ b/tests/slo_workloads/utils/utils.h
@@ -49,8 +49,6 @@ struct TCommonOptions {
     std::uint32_t MaxRetries = 50;
     TDuration ReactionTime = DefaultReactionTime;
     bool StopOnError = false;
-    bool UseApplicationTimeout = false;
-    bool SendPreventiveRequest = false;
 
     // Generator options:
     std::uint32_t MinLength = 20;
@@ -118,29 +116,6 @@ ECommandType ParseCommand(const char* cmd);
 
 std::string JoinPath(const std::string& prefix, const std::string& path);
 
-inline void RetryBackoff(
-    NYdb::NTable::TTableClient& client,
-    std::uint32_t retries,
-    const NYdb::NTable::TTableClient::TOperationSyncFunc& func
-) {
-    TDuration delay = TDuration::Seconds(5);
-    while (retries) {
-        NYdb::TStatus status = client.RetryOperationSync(func);
-        if (status.IsSuccess()) {
-            return;
-        }
-        --retries;
-        if (!retries) {
-            Cerr << "Create request failed after all retries." << Endl;
-            Cerr << status << Endl;
-            NYdb::NStatusHelpers::ThrowOnError(status);
-        }
-        Cerr << "Create request failed. Sleeping for " << delay << Endl;
-        Sleep(delay);
-        delay *= 2;
-    }
-}
-
 std::string GenerateRandomString(std::uint32_t minLength, std::uint32_t maxLength);
 
 NYdb::TParams PackValuesToParamsAsList(const std::vector<NYdb::TValue>& items, const std::string name = "$items");

From 8c4e579a651aea827b5d99293ebc9e5226cb09b0 Mon Sep 17 00:00:00 2001
From: Artem Ermoshkin <shfdis@yandex-team.ru>
Date: Sat, 30 May 2026 18:26:15 +0300
Subject: [PATCH 15/21] fix double MeterProvider

---
 tests/slo_workloads/utils/metrics.cpp | 257 ++++++++++++++++++--------
 1 file changed, 177 insertions(+), 80 deletions(-)

diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp
index 27a257c104..b89f8f3b48 100644
--- a/tests/slo_workloads/utils/metrics.cpp
+++ b/tests/slo_workloads/utils/metrics.cpp
@@ -19,8 +19,8 @@
 #include <chrono>
 #include <memory>
 #include <mutex>
-#include <thread>
-
+#include <shared_mutex>
+#include <unordered_map>
 
 using namespace std::chrono_literals;
 
@@ -35,10 +35,8 @@ std::string ResolveWorkloadRef() {
     return ref.empty() ? "unknown" : ref;
 }
 
-// Minimal thread-safe wrapper around hdr_histogram for a single
-// (operation_type, operation_status="success") series. Only successful
-// latencies are recorded; errors are excluded from the percentile stream
-// per deploy/metrics.yaml.
+// Thread-safe HDR histogram. Only successful latencies are recorded; errors
+// are excluded from the percentile stream (operation_status="success").
 class TLatencyRecorder {
 public:
     TLatencyRecorder() {
@@ -59,8 +57,6 @@ class TLatencyRecorder {
         hdr_record_value(Histogram_.get(), ns);
     }
 
-    // Returns p50/p95/p99 as seconds and resets the recorder window so
-    // gauges reflect only the most recent interval.
     struct TPercentiles {
         double P50 = 0.0;
         double P95 = 0.0;
@@ -68,6 +64,11 @@ class TLatencyRecorder {
         bool HasData = false;
     };
 
+    // Snapshot all three percentiles from one consistent HDR state and reset
+    // the window — so each export cycle's gauge reflects only the last
+    // interval's latencies. Reading p50/p95/p99 in one critical section
+    // matches the Java workload's batch-callback pattern (avoids the race
+    // where p99 would observe a histogram already reset by p50).
     TPercentiles SnapshotAndReset() {
         TPercentiles out;
         std::lock_guard lock(Mutex_);
@@ -91,11 +92,14 @@ class TLatencyRecorder {
     std::unique_ptr<hdr_histogram, THdrDeleter> Histogram_;
 };
 
-class TOtelMetricsPusher : public IMetricsPusher {
+// Process-wide pusher: ONE MeterProvider with one OTLP exporter shared by
+// all operation types. Publishing duplicate MeterProviders against the same
+// Prometheus endpoint produces racing `target_info` writes for the same
+// resource label set, which Prometheus rejects as `out of order sample`.
+class TOtelSharedPusher {
 public:
-    TOtelMetricsPusher(const std::string& metricsPushUrl, const std::string& operationType)
-        : OperationType_(operationType)
-        , Ref_(ResolveWorkloadRef())
+    explicit TOtelSharedPusher(const std::string& metricsPushUrl)
+        : Ref_(ResolveWorkloadRef())
         , CommonAttributes_{
             {"ref", Ref_},
             {"sdk", "cpp"},
@@ -111,11 +115,13 @@ class TOtelMetricsPusher : public IMetricsPusher {
         readerOptions.export_interval_millis = 1000ms;
         readerOptions.export_timeout_millis  = 500ms;
 
-        auto metricReader = opentelemetry::sdk::metrics::PeriodicExportingMetricReaderFactory::Create(std::move(exporter), readerOptions);
+        auto metricReader = opentelemetry::sdk::metrics::PeriodicExportingMetricReaderFactory::Create(
+            std::move(exporter), readerOptions);
 
         auto context = std::make_unique<opentelemetry::sdk::metrics::MeterContext>(
             std::unique_ptr<opentelemetry::sdk::metrics::ViewRegistry>(new opentelemetry::sdk::metrics::ViewRegistry()),
-            opentelemetry::sdk::resource::Resource::Create(opentelemetry::common::MakeKeyValueIterableView(CommonAttributes_))
+            opentelemetry::sdk::resource::Resource::Create(
+                opentelemetry::common::MakeKeyValueIterableView(CommonAttributes_))
         );
 
         MeterProvider_ = opentelemetry::sdk::metrics::MeterProviderFactory::Create(std::move(context));
@@ -124,96 +130,159 @@ class TOtelMetricsPusher : public IMetricsPusher {
         Meter_ = MeterProvider_->GetMeter("slo_workloads", NYdb::GetSdkSemver());
 
         InitMetrics();
-        StartPercentilePublisher();
     }
 
-    ~TOtelMetricsPusher() override {
-        PublisherShouldStop_.store(true);
-        if (PublisherThread_.joinable()) {
-            PublisherThread_.join();
-        }
+    ~TOtelSharedPusher() {
+        // Remove observable-gauge callbacks before MeterProvider tears down
+        // the readers, so a final collection in flight cannot see this object
+        // half-destroyed.
+        if (LatencyP50_) LatencyP50_->RemoveCallback(&TOtelSharedPusher::ObserveP50, this);
+        if (LatencyP95_) LatencyP95_->RemoveCallback(&TOtelSharedPusher::ObserveP95, this);
+        if (LatencyP99_) LatencyP99_->RemoveCallback(&TOtelSharedPusher::ObserveP99, this);
         // MeterProvider destructor calls Shutdown(); do not call it explicitly
         // here — the OTel SDK rejects a second Shutdown with a warning.
     }
 
-    void PushRequestData(const TRequestData& requestData) override {
-        const bool success = requestData.Status == NYdb::EStatus::SUCCESS;
+    void Record(const std::string& operationType, const TRequestData& data) {
+        const bool success = data.Status == NYdb::EStatus::SUCCESS;
         const std::string status = success ? "success" : "error";
 
         OperationsTotal_->Add(uint64_t{1}, MergeAttributes({
-            {"operation_type", OperationType_},
+            {"operation_type", operationType},
             {"operation_status", status},
         }));
 
         // sdk_retry_attempts_total = total number of technical attempts
-        // including the first one. TStatUnit counts only post-first attempts,
-        // so add 1 to include the initial attempt.
-        RetryAttemptsTotal_->Add(requestData.RetryAttempts + 1,
-            MergeAttributes({
-                {"operation_type", OperationType_},
-            })
-        );
+        // including the first one. RetryAttempts counts only post-first
+        // attempts, so add 1 to include the initial attempt.
+        RetryAttemptsTotal_->Add(data.RetryAttempts + 1, MergeAttributes({
+            {"operation_type", operationType},
+        }));
 
         if (success) {
-            Latency_.Record(requestData.Delay);
+            GetOrCreateSeries(operationType).Recorder.Record(data.Delay);
         }
     }
 
 private:
+    struct TSeries {
+        TLatencyRecorder Recorder;
+        // Cached snapshot, refreshed by the callback that runs first per
+        // export cycle (ObserveP50). All three callbacks read from these
+        // atomics so p50/p95/p99 land in the same export with consistent
+        // values from one HDR snapshot.
+        std::atomic<double> P50{0.0};
+        std::atomic<double> P95{0.0};
+        std::atomic<double> P99{0.0};
+        std::atomic<bool> HasData{false};
+    };
+
+    TSeries& GetOrCreateSeries(const std::string& op) {
+        {
+            std::shared_lock lock(SeriesMutex_);
+            auto it = Series_.find(op);
+            if (it != Series_.end()) {
+                return *it->second;
+            }
+        }
+        std::unique_lock lock(SeriesMutex_);
+        auto& slot = Series_[op];
+        if (!slot) {
+            slot = std::make_unique<TSeries>();
+        }
+        return *slot;
+    }
+
+    std::map<std::string, std::string> MergeAttributes(
+        const std::map<std::string, std::string>& metricAttrs) const
+    {
+        std::map<std::string, std::string> result = CommonAttributes_;
+        result.insert(metricAttrs.begin(), metricAttrs.end());
+        return result;
+    }
+
     void InitMetrics() {
         OperationsTotal_ = Meter_->CreateUInt64Counter("sdk_operations_total",
-            "Total number of operations, categorized by operation type and status."
-        );
-
+            "Total number of operations, categorized by operation type and status.");
         RetryAttemptsTotal_ = Meter_->CreateUInt64Counter("sdk_retry_attempts_total",
-            "Total number of retry attempts (including the first attempt), categorized by operation type."
-        );
-
-        LatencyP50_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p50_seconds",
-            "P50 latency of successful operations in seconds.", "s"
-        );
-        LatencyP95_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p95_seconds",
-            "P95 latency of successful operations in seconds.", "s"
-        );
-        LatencyP99_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p99_seconds",
-            "P99 latency of successful operations in seconds.", "s"
-        );
+            "Total number of retry attempts (including the first attempt), categorized by operation type.");
+
+        LatencyP50_ = Meter_->CreateDoubleObservableGauge(
+            "sdk_operation_latency_p50_seconds",
+            "P50 latency of successful operations in seconds.", "s");
+        LatencyP95_ = Meter_->CreateDoubleObservableGauge(
+            "sdk_operation_latency_p95_seconds",
+            "P95 latency of successful operations in seconds.", "s");
+        LatencyP99_ = Meter_->CreateDoubleObservableGauge(
+            "sdk_operation_latency_p99_seconds",
+            "P99 latency of successful operations in seconds.", "s");
+
+        LatencyP50_->AddCallback(&TOtelSharedPusher::ObserveP50, this);
+        LatencyP95_->AddCallback(&TOtelSharedPusher::ObserveP95, this);
+        LatencyP99_->AddCallback(&TOtelSharedPusher::ObserveP99, this);
     }
 
-    void StartPercentilePublisher() {
-        PublisherThread_ = std::thread([this]() {
-            while (!PublisherShouldStop_.load(std::memory_order_relaxed)) {
-                std::this_thread::sleep_for(1s);
-                PublishPercentiles();
+    // ObserveP50 runs first per export cycle, snapshots+resets each series'
+    // HDR once, and caches p50/p95/p99 in the series atomics for the
+    // subsequent two callbacks to read. This mirrors the Java workload's
+    // single-snapshot strategy without requiring a batch-callback API.
+    static void ObserveP50(opentelemetry::metrics::ObserverResult result, void* state) {
+        auto* self = static_cast<TOtelSharedPusher*>(state);
+        auto obs = opentelemetry::nostd::get<
+            opentelemetry::nostd::shared_ptr<opentelemetry::metrics::ObserverResultT<double>>>(result);
+
+        std::shared_lock lock(self->SeriesMutex_);
+        for (const auto& [op, series] : self->Series_) {
+            auto snap = series->Recorder.SnapshotAndReset();
+            if (snap.HasData) {
+                series->P50.store(snap.P50);
+                series->P95.store(snap.P95);
+                series->P99.store(snap.P99);
+                series->HasData.store(true);
+            } else {
+                series->HasData.store(false);
             }
-            // Final flush before exit.
-            PublishPercentiles();
-        });
+            if (!series->HasData.load()) {
+                continue;
+            }
+            auto attrs = self->MergeAttributes({
+                {"operation_type", op},
+                {"operation_status", "success"},
+            });
+            obs->Observe(series->P50.load(),
+                opentelemetry::common::MakeKeyValueIterableView(attrs));
+        }
     }
 
-    void PublishPercentiles() {
-        auto snapshot = Latency_.SnapshotAndReset();
-        auto attrs = MergeAttributes({
-            {"operation_type", OperationType_},
-            {"operation_status", "success"},
-        });
-        // When no successful ops landed in the last second, publish 0.0
-        // for all percentiles so the gauges reset with the HDR window
-        // rather than appearing "stuck" at the last non-empty value (the
-        // OTel periodic exporter would otherwise re-emit the previous
-        // Record() value on every collection cycle).
-        LatencyP50_->Record(snapshot.HasData ? snapshot.P50 : 0.0, attrs);
-        LatencyP95_->Record(snapshot.HasData ? snapshot.P95 : 0.0, attrs);
-        LatencyP99_->Record(snapshot.HasData ? snapshot.P99 : 0.0, attrs);
+    static void ObserveP95(opentelemetry::metrics::ObserverResult result, void* state) {
+        ObserveCached(result, state, &TSeries::P95);
     }
 
-    std::map<std::string, std::string> MergeAttributes(const std::map<std::string, std::string>& metricAttrs) const {
-        std::map<std::string, std::string> result = CommonAttributes_;
-        result.insert(metricAttrs.begin(), metricAttrs.end());
-        return result;
+    static void ObserveP99(opentelemetry::metrics::ObserverResult result, void* state) {
+        ObserveCached(result, state, &TSeries::P99);
+    }
+
+    static void ObserveCached(opentelemetry::metrics::ObserverResult result, void* state,
+                              std::atomic<double> TSeries::*field)
+    {
+        auto* self = static_cast<TOtelSharedPusher*>(state);
+        auto obs = opentelemetry::nostd::get<
+            opentelemetry::nostd::shared_ptr<opentelemetry::metrics::ObserverResultT<double>>>(result);
+
+        std::shared_lock lock(self->SeriesMutex_);
+        for (const auto& [op, series] : self->Series_) {
+            if (!series->HasData.load()) {
+                continue;
+            }
+            auto attrs = self->MergeAttributes({
+                {"operation_type", op},
+                {"operation_status", "success"},
+            });
+            obs->Observe((series.get()->*field).load(),
+                opentelemetry::common::MakeKeyValueIterableView(attrs));
+        }
     }
 
-    std::string OperationType_;
     std::string Ref_;
     std::map<std::string, std::string> CommonAttributes_;
 
@@ -222,13 +291,41 @@ class TOtelMetricsPusher : public IMetricsPusher {
 
     std::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> OperationsTotal_;
     std::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> RetryAttemptsTotal_;
-    std::unique_ptr<opentelemetry::metrics::Gauge<double>> LatencyP50_;
-    std::unique_ptr<opentelemetry::metrics::Gauge<double>> LatencyP95_;
-    std::unique_ptr<opentelemetry::metrics::Gauge<double>> LatencyP99_;
+    std::shared_ptr<opentelemetry::metrics::ObservableInstrument> LatencyP50_;
+    std::shared_ptr<opentelemetry::metrics::ObservableInstrument> LatencyP95_;
+    std::shared_ptr<opentelemetry::metrics::ObservableInstrument> LatencyP99_;
+
+    std::shared_mutex SeriesMutex_;
+    std::unordered_map<std::string, std::unique_ptr<TSeries>> Series_;
+};
+
+std::mutex g_sharedMu;
+std::weak_ptr<TOtelSharedPusher> g_shared;
+
+std::shared_ptr<TOtelSharedPusher> GetOrCreateSharedPusher(const std::string& url) {
+    std::lock_guard lock(g_sharedMu);
+    auto sp = g_shared.lock();
+    if (!sp) {
+        sp = std::make_shared<TOtelSharedPusher>(url);
+        g_shared = sp;
+    }
+    return sp;
+}
+
+class TOtelMetricsPusherWrapper : public IMetricsPusher {
+public:
+    TOtelMetricsPusherWrapper(std::shared_ptr<TOtelSharedPusher> shared, std::string operationType)
+        : Shared_(std::move(shared))
+        , OperationType_(std::move(operationType))
+    {}
+
+    void PushRequestData(const TRequestData& requestData) override {
+        Shared_->Record(OperationType_, requestData);
+    }
 
-    TLatencyRecorder Latency_;
-    std::thread PublisherThread_;
-    std::atomic<bool> PublisherShouldStop_{false};
+private:
+    std::shared_ptr<TOtelSharedPusher> Shared_;
+    std::string OperationType_;
 };
 
 class TNoopMetricsPusher : public IMetricsPusher {
@@ -239,7 +336,7 @@ class TNoopMetricsPusher : public IMetricsPusher {
 } // namespace
 
 std::unique_ptr<IMetricsPusher> CreateOtelMetricsPusher(const std::string& metricsPushUrl, const std::string& operationType) {
-    return std::make_unique<TOtelMetricsPusher>(metricsPushUrl, operationType);
+    return std::make_unique<TOtelMetricsPusherWrapper>(GetOrCreateSharedPusher(metricsPushUrl), operationType);
 }
 
 std::unique_ptr<IMetricsPusher> CreateNoopMetricsPusher() {

From 2c193bcc180220a386eaf239fd755b84b988ad61 Mon Sep 17 00:00:00 2001
From: Artem Ermoshkin <shfdis@yandex-team.ru>
Date: Sat, 30 May 2026 18:49:38 +0300
Subject: [PATCH 16/21] do not fail while cleanup

---
 tests/slo_workloads/utils/utils.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp
index 320e53b700..c258e95c81 100644
--- a/tests/slo_workloads/utils/utils.cpp
+++ b/tests/slo_workloads/utils/utils.cpp
@@ -267,11 +267,17 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
             }
             Cout << "[all] Launching cleanup command..." << Endl;
             int cleanupRc = cleanup(dbOptions, fakeArgc);
-            if (!result) {
-                result = cleanupRc;
-            } else if (cleanupRc) {
+            // Cleanup runs while chaos-monkey is still killing nodes, so a
+            // DropTable failure here is expected noise and must not mask a
+            // successful run. Surface the run's status; only fall back to
+            // the cleanup status when run itself failed and we have nothing
+            // else to report.
+            if (cleanupRc && !result) {
                 Cerr << "[all] Warning: cleanup failed (exit " << cleanupRc
                     << ") but run succeeded; ignoring cleanup exit code." << Endl;
+            } else if (cleanupRc) {
+                Cerr << "[all] Warning: cleanup failed (exit " << cleanupRc
+                    << "); preserving earlier run failure." << Endl;
             }
             break;
         }

From 71d1123deebf26202b298cd41ef7dd3e69d2d946 Mon Sep 17 00:00:00 2001
From: Artem Ermoshkin <shfdis@yandex-team.ru>
Date: Sat, 30 May 2026 19:25:20 +0300
Subject: [PATCH 17/21] fix the report comment workload

---
 .github/workflows/slo.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml
index db9e4d9841..fe31086f89 100644
--- a/.github/workflows/slo.yml
+++ b/.github/workflows/slo.yml
@@ -139,6 +139,9 @@ jobs:
       # source COPY, so only the actual workload link step reruns (~3 min).
       - name: Build current workload image
         uses: docker/build-push-action@v6
+        env:
+          DOCKER_BUILD_SUMMARY: "false"
+          DOCKER_BUILD_RECORD_UPLOAD: "false"
         with:
           context: sdk-current
           file: sdk-current/tests/slo_workloads/Dockerfile
@@ -153,6 +156,9 @@ jobs:
         id: baseline-build
         continue-on-error: true
         uses: docker/build-push-action@v6
+        env:
+          DOCKER_BUILD_SUMMARY: "false"
+          DOCKER_BUILD_RECORD_UPLOAD: "false"
         with:
           context: sdk-baseline
           file: sdk-baseline/tests/slo_workloads/Dockerfile

From d832f6a658175c3a6dec0a53f80f268ada4d5a14 Mon Sep 17 00:00:00 2001
From: Artem Ermoshkin <shfdis@yandex-team.ru>
Date: Sat, 30 May 2026 20:50:29 +0300
Subject: [PATCH 18/21] fix manual report

---
 .github/workflows/slo_report.yml | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/slo_report.yml b/.github/workflows/slo_report.yml
index 0ccd36abe9..3ae91d6712 100644
--- a/.github/workflows/slo_report.yml
+++ b/.github/workflows/slo_report.yml
@@ -5,10 +5,16 @@ on:
     workflows: ["SLO"]
     types:
       - completed
+  workflow_dispatch:
+    inputs:
+      slo_run_id:
+        description: "Run ID of the SLO workflow whose artifacts to report on"
+        required: true
+        type: string
 
 jobs:
   publish-slo-report:
-    if: github.event.workflow_run.conclusion == 'success'
+    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
     runs-on: ubuntu-latest
     name: Publish YDB SLO Report
     permissions:
@@ -21,11 +27,11 @@ jobs:
         uses: ydb-platform/ydb-slo-action/report@v2
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
-          github_run_id: ${{ github.event.workflow_run.id }}
+          github_run_id: ${{ inputs.slo_run_id || github.event.workflow_run.id }}
 
   remove-slo-label:
     needs: publish-slo-report
-    if: always() && github.event.workflow_run.event == 'pull_request'
+    if: always() && github.event_name == 'workflow_run' && github.event.workflow_run.event == 'pull_request'
     runs-on: ubuntu-latest
     name: Remove SLO Label
     permissions:

From f290f84ff18fb253acccfcf03d6a9bb0c550fcf9 Mon Sep 17 00:00:00 2001
From: Artem Ermoshkin <shfdis@yandex-team.ru>
Date: Sun, 31 May 2026 00:15:16 +0300
Subject: [PATCH 19/21] remove testing action

---
 .github/workflows/slo_report.yml | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/slo_report.yml b/.github/workflows/slo_report.yml
index 3ae91d6712..0ccd36abe9 100644
--- a/.github/workflows/slo_report.yml
+++ b/.github/workflows/slo_report.yml
@@ -5,16 +5,10 @@ on:
     workflows: ["SLO"]
     types:
       - completed
-  workflow_dispatch:
-    inputs:
-      slo_run_id:
-        description: "Run ID of the SLO workflow whose artifacts to report on"
-        required: true
-        type: string
 
 jobs:
   publish-slo-report:
-    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
+    if: github.event.workflow_run.conclusion == 'success'
     runs-on: ubuntu-latest
     name: Publish YDB SLO Report
     permissions:
@@ -27,11 +21,11 @@ jobs:
         uses: ydb-platform/ydb-slo-action/report@v2
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
-          github_run_id: ${{ inputs.slo_run_id || github.event.workflow_run.id }}
+          github_run_id: ${{ github.event.workflow_run.id }}
 
   remove-slo-label:
     needs: publish-slo-report
-    if: always() && github.event_name == 'workflow_run' && github.event.workflow_run.event == 'pull_request'
+    if: always() && github.event.workflow_run.event == 'pull_request'
     runs-on: ubuntu-latest
     name: Remove SLO Label
     permissions:

From b18a8edb0edfc364d8b24578706965489020c672 Mon Sep 17 00:00:00 2001
From: Artem Ermoshkin <shfdis@yandex-team.ru>
Date: Sun, 31 May 2026 00:43:07 +0300
Subject: [PATCH 20/21] fix review issues

---
 tests/slo_workloads/utils/metrics.cpp | 129 +++++++++++++-------------
 tests/slo_workloads/utils/utils.cpp   |   5 +-
 2 files changed, 68 insertions(+), 66 deletions(-)

diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp
index b89f8f3b48..708c83a563 100644
--- a/tests/slo_workloads/utils/metrics.cpp
+++ b/tests/slo_workloads/utils/metrics.cpp
@@ -145,38 +145,67 @@ class TOtelSharedPusher {
 
     void Record(const std::string& operationType, const TRequestData& data) {
         const bool success = data.Status == NYdb::EStatus::SUCCESS;
-        const std::string status = success ? "success" : "error";
+        auto& series = GetOrCreateSeries(operationType);
 
-        OperationsTotal_->Add(uint64_t{1}, MergeAttributes({
-            {"operation_type", operationType},
-            {"operation_status", status},
-        }));
+        OperationsTotal_->Add(uint64_t{1},
+            opentelemetry::common::MakeKeyValueIterableView(
+                success ? series.SuccessAttrs : series.ErrorAttrs));
 
         // sdk_retry_attempts_total = total number of technical attempts
         // including the first one. RetryAttempts counts only post-first
         // attempts, so add 1 to include the initial attempt.
-        RetryAttemptsTotal_->Add(data.RetryAttempts + 1, MergeAttributes({
-            {"operation_type", operationType},
-        }));
+        RetryAttemptsTotal_->Add(data.RetryAttempts + 1,
+            opentelemetry::common::MakeKeyValueIterableView(series.RetryAttrs));
 
         if (success) {
-            GetOrCreateSeries(operationType).Recorder.Record(data.Delay);
+            series.Recorder.Record(data.Delay);
         }
     }
 
 private:
     struct TSeries {
         TLatencyRecorder Recorder;
-        // Cached snapshot, refreshed by the callback that runs first per
-        // export cycle (ObserveP50). All three callbacks read from these
-        // atomics so p50/p95/p99 land in the same export with consistent
-        // values from one HDR snapshot.
+        // Pre-merged attribute maps used on the hot path so Record() does not
+        // allocate / copy CommonAttributes_ per call.
+        std::map<std::string, std::string> SuccessAttrs;
+        std::map<std::string, std::string> ErrorAttrs;
+        std::map<std::string, std::string> RetryAttrs;
+
+        // Cached snapshot, refreshed by EnsureSnapshot() in whichever
+        // observable-gauge callback fires first per export cycle. All three
+        // callbacks read from these atomics so p50/p95/p99 land in the same
+        // export with consistent values from one HDR snapshot — independent
+        // of the order the SDK iterates instruments.
         std::atomic<double> P50{0.0};
         std::atomic<double> P95{0.0};
         std::atomic<double> P99{0.0};
         std::atomic<bool> HasData{false};
+        std::mutex SnapshotMutex;
+        std::chrono::steady_clock::time_point LastSnapshot{};
     };
 
+    // Half the export interval — guarantees one snapshot per cycle while
+    // tolerating arbitrary callback ordering.
+    static constexpr std::chrono::milliseconds kSnapshotFreshness{500};
+
+    void EnsureSnapshot(TSeries& series) {
+        auto now = std::chrono::steady_clock::now();
+        std::lock_guard lock(series.SnapshotMutex);
+        if (now - series.LastSnapshot < kSnapshotFreshness) {
+            return;
+        }
+        auto snap = series.Recorder.SnapshotAndReset();
+        if (snap.HasData) {
+            series.P50.store(snap.P50);
+            series.P95.store(snap.P95);
+            series.P99.store(snap.P99);
+            series.HasData.store(true);
+        } else {
+            series.HasData.store(false);
+        }
+        series.LastSnapshot = now;
+    }
+
     TSeries& GetOrCreateSeries(const std::string& op) {
         {
             std::shared_lock lock(SeriesMutex_);
@@ -189,18 +218,18 @@ class TOtelSharedPusher {
         auto& slot = Series_[op];
         if (!slot) {
             slot = std::make_unique<TSeries>();
+            slot->SuccessAttrs = CommonAttributes_;
+            slot->SuccessAttrs["operation_type"] = op;
+            slot->SuccessAttrs["operation_status"] = "success";
+            slot->ErrorAttrs = CommonAttributes_;
+            slot->ErrorAttrs["operation_type"] = op;
+            slot->ErrorAttrs["operation_status"] = "error";
+            slot->RetryAttrs = CommonAttributes_;
+            slot->RetryAttrs["operation_type"] = op;
         }
         return *slot;
     }
 
-    std::map<std::string, std::string> MergeAttributes(
-        const std::map<std::string, std::string>& metricAttrs) const
-    {
-        std::map<std::string, std::string> result = CommonAttributes_;
-        result.insert(metricAttrs.begin(), metricAttrs.end());
-        return result;
-    }
-
     void InitMetrics() {
         OperationsTotal_ = Meter_->CreateUInt64Counter("sdk_operations_total",
             "Total number of operations, categorized by operation type and status.");
@@ -222,48 +251,23 @@ class TOtelSharedPusher {
         LatencyP99_->AddCallback(&TOtelSharedPusher::ObserveP99, this);
     }
 
-    // ObserveP50 runs first per export cycle, snapshots+resets each series'
-    // HDR once, and caches p50/p95/p99 in the series atomics for the
-    // subsequent two callbacks to read. This mirrors the Java workload's
-    // single-snapshot strategy without requiring a batch-callback API.
-    static void ObserveP50(opentelemetry::metrics::ObserverResult result, void* state) {
-        auto* self = static_cast<TOtelSharedPusher*>(state);
-        auto obs = opentelemetry::nostd::get<
-            opentelemetry::nostd::shared_ptr<opentelemetry::metrics::ObserverResultT<double>>>(result);
-
-        std::shared_lock lock(self->SeriesMutex_);
-        for (const auto& [op, series] : self->Series_) {
-            auto snap = series->Recorder.SnapshotAndReset();
-            if (snap.HasData) {
-                series->P50.store(snap.P50);
-                series->P95.store(snap.P95);
-                series->P99.store(snap.P99);
-                series->HasData.store(true);
-            } else {
-                series->HasData.store(false);
-            }
-            if (!series->HasData.load()) {
-                continue;
-            }
-            auto attrs = self->MergeAttributes({
-                {"operation_type", op},
-                {"operation_status", "success"},
-            });
-            obs->Observe(series->P50.load(),
-                opentelemetry::common::MakeKeyValueIterableView(attrs));
-        }
+    static void ObserveP50(opentelemetry::metrics::ObserverResult r, void* s) {
+        ObservePercentile(r, s, &TSeries::P50);
     }
-
-    static void ObserveP95(opentelemetry::metrics::ObserverResult result, void* state) {
-        ObserveCached(result, state, &TSeries::P95);
+    static void ObserveP95(opentelemetry::metrics::ObserverResult r, void* s) {
+        ObservePercentile(r, s, &TSeries::P95);
     }
-
-    static void ObserveP99(opentelemetry::metrics::ObserverResult result, void* state) {
-        ObserveCached(result, state, &TSeries::P99);
+    static void ObserveP99(opentelemetry::metrics::ObserverResult r, void* s) {
+        ObservePercentile(r, s, &TSeries::P99);
     }
 
-    static void ObserveCached(opentelemetry::metrics::ObserverResult result, void* state,
-                              std::atomic<double> TSeries::*field)
+    // Each callback ensures a fresh snapshot exists for the current export
+    // cycle (EnsureSnapshot is a no-op if one was taken less than
+    // kSnapshotFreshness ago). Whichever of P50/P95/P99 the SDK invokes first
+    // performs the snapshot+reset; the others read cached atomics. Order
+    // between the three callbacks is irrelevant.
+    static void ObservePercentile(opentelemetry::metrics::ObserverResult result, void* state,
+                                  std::atomic<double> TSeries::*field)
     {
         auto* self = static_cast<TOtelSharedPusher*>(state);
         auto obs = opentelemetry::nostd::get<
@@ -271,15 +275,12 @@ class TOtelSharedPusher {
 
         std::shared_lock lock(self->SeriesMutex_);
         for (const auto& [op, series] : self->Series_) {
+            self->EnsureSnapshot(*series);
             if (!series->HasData.load()) {
                 continue;
             }
-            auto attrs = self->MergeAttributes({
-                {"operation_type", op},
-                {"operation_status", "success"},
-            });
             obs->Observe((series.get()->*field).load(),
-                opentelemetry::common::MakeKeyValueIterableView(attrs));
+                opentelemetry::common::MakeKeyValueIterableView(series->SuccessAttrs));
         }
     }
 
diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp
index c258e95c81..d686d4958c 100644
--- a/tests/slo_workloads/utils/utils.cpp
+++ b/tests/slo_workloads/utils/utils.cpp
@@ -251,11 +251,12 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
             // prepend a synthetic program name for ParseOptionsRun.
             char programName[] = "slo";
             std::vector<char*> runArgv;
-            runArgv.reserve(argc + 1);
+            runArgv.reserve(argc + 2);
             runArgv.push_back(programName);
             for (int i = 0; i < argc; ++i) {
                 runArgv.push_back(argv[i]);
             }
+            runArgv.push_back(nullptr);
             int fakeArgc = 1;
             char* fakeArgv[] = { programName, nullptr };
 
@@ -263,7 +264,7 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean
             result = create(dbOptions, fakeArgc, fakeArgv);
             if (!result) {
                 Cout << "[all] Launching run command..." << Endl;
-                result = run(dbOptions, static_cast<int>(runArgv.size()), runArgv.data());
+                result = run(dbOptions, static_cast<int>(runArgv.size() - 1), runArgv.data());
             }
             Cout << "[all] Launching cleanup command..." << Endl;
             int cleanupRc = cleanup(dbOptions, fakeArgc);

From 4ebfc5a231463f372d4a55a865614440269b6d4a Mon Sep 17 00:00:00 2001
From: Artem Ermoshkin <shfdis@yandex-team.ru>
Date: Mon, 1 Jun 2026 11:49:22 +0300
Subject: [PATCH 21/21] fix copilot issue

---
 tests/slo_workloads/utils/executor.cpp | 15 +++++++++------
 tests/slo_workloads/utils/executor.h   |  3 ++-
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/tests/slo_workloads/utils/executor.cpp b/tests/slo_workloads/utils/executor.cpp
index aa0a637905..270f89a8bc 100644
--- a/tests/slo_workloads/utils/executor.cpp
+++ b/tests/slo_workloads/utils/executor.cpp
@@ -34,7 +34,9 @@ std::uint64_t TInsistentClient::GetActiveSessions() const {
     return static_cast<std::uint64_t>(sessions);
 }
 
-TAsyncFinalStatus TInsistentClient::ExecuteWithRetry(const NYdb::NTable::TTableClient::TOperationFunc& operation) {
+TAsyncFinalStatus TInsistentClient::ExecuteWithRetry(const NYdb::NTable::TTableClient::TOperationFunc& operation,
+    const std::shared_ptr<TStatUnit>& stat)
+{
     TTracedPromise<TFinalStatus> promise = TTracedPromise<TFinalStatus>(
         NThreading::NewPromise<TFinalStatus>(),
         &ExecutorPromises
@@ -46,7 +48,11 @@ TAsyncFinalStatus TInsistentClient::ExecuteWithRetry(const NYdb::NTable::TTableC
     settings.MaxRetries(ClientMaxRetries);
     settings.GetSessionClientTimeout(SessionTimeout);
 
-    auto future = Client.RetryOperation(operation, settings);
+    auto wrappedOperation = [operation, stat](NYdb::NTable::TSession session) {
+        stat->IncRetryAttempts();
+        return operation(session);
+    };
+    auto future = Client.RetryOperation(wrappedOperation, settings);
     future.Subscribe([promise, this](const NYdb::TAsyncStatus& f) mutable {
         Y_ABORT_UNLESS(f.HasValue());
         const auto& status = f.GetValue();
@@ -146,10 +152,7 @@ bool TExecutor::Execute(const NYdb::NTable::TTableClient::TOperationFunc& func)
 
         auto stat = Stats.StartRequest();
 
-        auto future = InsistentClient.ExecuteWithRetry([func, stat](NYdb::NTable::TSession session) {
-            auto result = func(session);
-            return result;
-        });
+        auto future = InsistentClient.ExecuteWithRetry(func, stat);
 
         future.Subscribe([this, stat, SemaphoreWrapper](const TAsyncFinalStatus& future) mutable {
             Y_ABORT_UNLESS(future.HasValue());
diff --git a/tests/slo_workloads/utils/executor.h b/tests/slo_workloads/utils/executor.h
index c20df89d6d..637d791b9a 100644
--- a/tests/slo_workloads/utils/executor.h
+++ b/tests/slo_workloads/utils/executor.h
@@ -53,7 +53,8 @@ class TInsistentClient {
     TInsistentClient(const TCommonOptions& opts);
     ~TInsistentClient();
     void Report(TStringBuilder& out) const;
-    TAsyncFinalStatus ExecuteWithRetry(const NYdb::NTable::TTableClient::TOperationFunc& operation);
+    TAsyncFinalStatus ExecuteWithRetry(const NYdb::NTable::TTableClient::TOperationFunc& operation,
+        const std::shared_ptr<TStatUnit>& stat);
     std::uint64_t GetActiveSessions() const;
 
 private: