From 65fc7c283472f321312ee20eb6021f5f69bc6f20 Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Tue, 5 May 2026 16:40:35 +0300 Subject: [PATCH 01/21] tests/slo_workloads: align with ydb-slo-action v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The v2 SLO action (ydb-platform/ydb-slo-action) starts a workload container once per run, passes config via env vars (WORKLOAD_REF, WORKLOAD_DURATION, YDB_CONNECTION_STRING, ...), and queries sdk_operation_latency_p{50,95,99}_seconds as pre-computed gauges labeled by (operation_type, operation_status, ref). Changes: - Dockerfile: drop SLO_BRANCH_REF compile-time ref baking; a single ENTRYPOINT binary without CMD is now sufficient. - utils/utils.cpp: subcommand is optional — with no free-arg the binary runs create -> run -> cleanup sequentially (cleanup always runs, even if run errored). Option resolution follows the standard CLI > env > default priority via .DefaultValue(): - -c from YDB_CONNECTION_STRING (or built from YDB_ENDPOINT + YDB_DATABASE); - --metrics-push-url from OTEL_EXPORTER_OTLP_METRICS_ENDPOINT; - --time from WORKLOAD_DURATION. - utils/metrics.cpp: replace the OTel Histogram with HDR-backed gauges. Only successful operations are recorded; a background thread snapshots p50/p95/p99 every second, publishes the gauges with operation_status="success", then resets the HDR window. sdk_retry_attempts_total becomes a counter of (retry_attempts + 1) per op. ref is read from WORKLOAD_REF at startup, not compile time. - utils/CMakeLists.txt: pull HdrHistogram_c 0.11.8 via FetchContent (opt-in via the outer YDB_SDK_TESTS flag) and link slo-utils against hdr_histogram_static. --- tests/slo_workloads/Dockerfile | 3 +- tests/slo_workloads/utils/CMakeLists.txt | 19 +- tests/slo_workloads/utils/metrics.cpp | 239 ++++++++++++++--------- tests/slo_workloads/utils/utils.cpp | 70 ++++++- tests/slo_workloads/utils/utils.h | 3 +- 5 files changed, 237 insertions(+), 97 deletions(-) diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile index 091ce23066..65d9c7693b 100644 --- a/tests/slo_workloads/Dockerfile +++ b/tests/slo_workloads/Dockerfile @@ -1,7 +1,6 @@ FROM ubuntu:22.04 ARG PRESET=release-test-clang -ARG REF=unknown # Install software-properties-common for add-apt-repository RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test @@ -120,7 +119,7 @@ COPY . /ydb-cpp-sdk WORKDIR /ydb-cpp-sdk RUN rm -rf build -RUN cmake -DSLO_BRANCH_REF=${REF} --preset ${PRESET} +RUN cmake --preset ${PRESET} RUN cmake --build --preset default --target slo-key-value ENTRYPOINT ["./build/tests/slo_workloads/key_value/slo-key-value"] diff --git a/tests/slo_workloads/utils/CMakeLists.txt b/tests/slo_workloads/utils/CMakeLists.txt index e8589a568f..5434af9bb4 100644 --- a/tests/slo_workloads/utils/CMakeLists.txt +++ b/tests/slo_workloads/utils/CMakeLists.txt @@ -1,3 +1,16 @@ +include(FetchContent) + +FetchContent_Declare( + hdr_histogram + GIT_REPOSITORY https://github.com/HdrHistogram/HdrHistogram_c.git + GIT_TAG 0.11.8 + EXCLUDE_FROM_ALL +) +set(HDR_HISTOGRAM_BUILD_PROGRAMS OFF CACHE BOOL "" FORCE) +set(HDR_HISTOGRAM_BUILD_SHARED OFF CACHE BOOL "" FORCE) +set(HDR_LOG_REQUIRED OFF CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(hdr_histogram) + add_library(slo-utils) target_link_libraries(slo-utils PUBLIC @@ -9,9 +22,9 @@ target_link_libraries(slo-utils PUBLIC opentelemetry-cpp::otlp_http_metric_exporter ) -if (SLO_BRANCH_REF) - target_compile_definitions(slo-utils PRIVATE REF=${SLO_BRANCH_REF}) -endif() +target_link_libraries(slo-utils PRIVATE + hdr_histogram_static +) target_sources(slo-utils PRIVATE executor.cpp diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp index 50e1f859c0..996a48780f 100644 --- a/tests/slo_workloads/utils/metrics.cpp +++ b/tests/slo_workloads/utils/metrics.cpp @@ -11,21 +11,93 @@ #include +#include + +#include + +#include +#include +#include +#include +#include + using namespace std::chrono_literals; -#ifdef REF -static constexpr const std::string_view REF_LABEL = Y_STRINGIZE(REF); -#else -static constexpr const std::string_view REF_LABEL = "unknown"; -#endif +namespace { + +constexpr std::int64_t kHdrMinLatencyNs = 1'000; // 1 us +constexpr std::int64_t kHdrMaxLatencyNs = 60'000'000'000; // 60 s +constexpr int kHdrSignificantFigures = 3; + +std::string ResolveWorkloadRef() { + std::string ref = GetEnv("WORKLOAD_REF"); + return ref.empty() ? "unknown" : ref; +} + +// Minimal thread-safe wrapper around hdr_histogram for a single +// (operation_type, operation_status="success") series. Only successful +// latencies are recorded; errors are excluded from the percentile stream +// per deploy/metrics.yaml. +class TLatencyRecorder { +public: + TLatencyRecorder() { + hdr_histogram* raw = nullptr; + int rc = hdr_init(kHdrMinLatencyNs, kHdrMaxLatencyNs, kHdrSignificantFigures, &raw); + Y_ABORT_UNLESS(rc == 0, "hdr_init failed: %d", rc); + Histogram_.reset(raw); + } + + void Record(TDuration d) { + std::int64_t ns = static_cast(d.NanoSeconds()); + if (ns < kHdrMinLatencyNs) { + ns = kHdrMinLatencyNs; + } else if (ns > kHdrMaxLatencyNs) { + ns = kHdrMaxLatencyNs; + } + std::lock_guard lock(Mutex_); + hdr_record_value(Histogram_.get(), ns); + } + + // Returns p50/p95/p99 as seconds and resets the recorder window so + // gauges reflect only the most recent interval. + struct TPercentiles { + double P50 = 0.0; + double P95 = 0.0; + double P99 = 0.0; + bool HasData = false; + }; + + TPercentiles SnapshotAndReset() { + TPercentiles out; + std::lock_guard lock(Mutex_); + if (Histogram_->total_count == 0) { + return out; + } + out.HasData = true; + out.P50 = hdr_value_at_percentile(Histogram_.get(), 50.0) / 1e9; + out.P95 = hdr_value_at_percentile(Histogram_.get(), 95.0) / 1e9; + out.P99 = hdr_value_at_percentile(Histogram_.get(), 99.0) / 1e9; + hdr_reset(Histogram_.get()); + return out; + } + +private: + struct THdrDeleter { + void operator()(hdr_histogram* h) const noexcept { if (h) hdr_close(h); } + }; + + std::mutex Mutex_; + std::unique_ptr Histogram_; +}; class TOtelMetricsPusher : public IMetricsPusher { public: TOtelMetricsPusher(const std::string& metricsPushUrl, const std::string& operationType) : OperationType_(operationType) + , Ref_(ResolveWorkloadRef()) , CommonAttributes_{ - {"ref", std::string(REF_LABEL)}, + {"ref", Ref_}, {"sdk", "cpp"}, {"sdk_version", NYdb::GetSdkSemver()} } @@ -36,12 +108,11 @@ class TOtelMetricsPusher : public IMetricsPusher { auto exporter = opentelemetry::exporter::otlp::OtlpHttpMetricExporterFactory::Create(exporterOptions); opentelemetry::sdk::metrics::PeriodicExportingMetricReaderOptions readerOptions; - readerOptions.export_interval_millis = 250ms; - readerOptions.export_timeout_millis = 200ms; + readerOptions.export_interval_millis = 1000ms; + readerOptions.export_timeout_millis = 900ms; auto metricReader = opentelemetry::sdk::metrics::PeriodicExportingMetricReaderFactory::Create(std::move(exporter), readerOptions); - // Create MeterContext with resource auto context = std::make_unique( std::unique_ptr(new opentelemetry::sdk::metrics::ViewRegistry()), opentelemetry::sdk::resource::Resource::Create(opentelemetry::common::MakeKeyValueIterableView(CommonAttributes_)) @@ -53,97 +124,85 @@ class TOtelMetricsPusher : public IMetricsPusher { Meter_ = MeterProvider_->GetMeter("slo_workloads", NYdb::GetSdkSemver()); InitMetrics(); + StartPercentilePublisher(); + } + + ~TOtelMetricsPusher() override { + PublisherShouldStop_.store(true); + if (PublisherThread_.joinable()) { + PublisherThread_.join(); + } } void PushRequestData(const TRequestData& requestData) override { - if (requestData.Status == NYdb::EStatus::SUCCESS) { - OperationsSuccessTotal_->Add(1, MergeAttributes({{"operation_type", OperationType_}})); - } else { - ErrorsTotal_->Add(1, MergeAttributes({{"status", YdbStatusToString(requestData.Status)}})); - OperationsFailureTotal_->Add(1, MergeAttributes({{"operation_type", OperationType_}})); + const bool success = requestData.Status == NYdb::EStatus::SUCCESS; + const std::string status = success ? "success" : "error"; + + OperationsTotal_->Add(1, MergeAttributes({ + {"operation_type", OperationType_}, + {"operation_status", status}, + })); + + // sdk_retry_attempts_total = total number of technical attempts + // including the first one. TStatUnit counts only post-first attempts, + // so add 1 to include the initial attempt. + RetryAttemptsTotal_->Add(static_cast(requestData.RetryAttempts + 1), + MergeAttributes({ + {"operation_type", OperationType_}, + }) + ); + + if (success) { + Latency_.Record(requestData.Delay); } - OperationsTotal_->Add(1, MergeAttributes({{"operation_type", OperationType_}})); - OperationLatencySeconds_->Record(requestData.Delay.SecondsFloat(), MergeAttributes({{"operation_type", OperationType_}, {"status", YdbStatusToString(requestData.Status)}})); - RetryAttempts_->Record(requestData.RetryAttempts, MergeAttributes({{"operation_type", OperationType_}})); } private: void InitMetrics() { - ErrorsTotal_ = Meter_->CreateUInt64Counter("sdk_errors_total", - "Total number of errors encountered, categorized by error type." + OperationsTotal_ = Meter_->CreateDoubleCounter("sdk_operations_total", + "Total number of operations, categorized by operation type and status." ); - OperationsTotal_ = Meter_->CreateUInt64Counter("sdk_operations_total", - "Total number of operations, categorized by type attempted by the SDK." - ); - - OperationsSuccessTotal_ = Meter_->CreateUInt64Counter("sdk_operations_success_total", - "Total number of successful operations, categorized by type." + RetryAttemptsTotal_ = Meter_->CreateDoubleCounter("sdk_retry_attempts_total", + "Total number of retry attempts (including the first attempt), categorized by operation type." ); - OperationsFailureTotal_ = Meter_->CreateUInt64Counter("sdk_operations_failure_total", - "Total number of failed operations, categorized by type." + LatencyP50_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p50_seconds", + "P50 latency of successful operations in seconds.", "s" ); - - OperationLatencySeconds_ = CreateDoubleHistogram("sdk_operation_latency_seconds", - "Latency of operations performed by the SDK in seconds, categorized by type and status.", - { - 0.001, // 1 ms - 0.002, // 2 ms - 0.003, // 3 ms - 0.004, // 4 ms - 0.005, // 5 ms - 0.0075, // 7.5 ms - 0.010, // 10 ms - 0.020, // 20 ms - 0.050, // 50 ms - 0.100, // 100 ms - 0.200, // 200 ms - 0.500, // 500 ms - 1.000, // 1 s - }, - "s" + LatencyP95_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p95_seconds", + "P95 latency of successful operations in seconds.", "s" ); - - RetryAttempts_ = Meter_->CreateInt64Gauge("sdk_retry_attempts", - "Current retry attempts, categorized by operation type." + LatencyP99_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p99_seconds", + "P99 latency of successful operations in seconds.", "s" ); } - std::unique_ptr> CreateDoubleHistogram( - const std::string& name, - const std::string& description, - const std::vector& buckets, - const std::string& unit = {}) - { - auto selector = std::make_unique( - opentelemetry::sdk::metrics::InstrumentType::kHistogram, - name, - unit - ); - - auto meterSelector = std::make_unique( - "slo_workloads", - NYdb::GetSdkSemver(), - "" - ); - - auto histogramConfig = std::make_shared(); - histogramConfig->boundaries_ = buckets; - - auto view = std::make_unique( - "", - "", - opentelemetry::sdk::metrics::AggregationType::kHistogram, - histogramConfig - ); - - MeterProvider_->AddView(std::move(selector), std::move(meterSelector), std::move(view)); + void StartPercentilePublisher() { + PublisherThread_ = std::thread([this]() { + while (!PublisherShouldStop_.load(std::memory_order_relaxed)) { + std::this_thread::sleep_for(1s); + PublishPercentiles(); + } + // Final flush before exit. + PublishPercentiles(); + }); + } - return Meter_->CreateDoubleHistogram(name, description, unit); + void PublishPercentiles() { + auto snapshot = Latency_.SnapshotAndReset(); + if (!snapshot.HasData) { + return; + } + auto attrs = MergeAttributes({ + {"operation_type", OperationType_}, + {"operation_status", "success"}, + }); + LatencyP50_->Record(snapshot.P50, attrs); + LatencyP95_->Record(snapshot.P95, attrs); + LatencyP99_->Record(snapshot.P99, attrs); } - // Helper to merge common attributes with metric-specific ones std::map MergeAttributes(const std::map& metricAttrs) const { std::map result = CommonAttributes_; result.insert(metricAttrs.begin(), metricAttrs.end()); @@ -151,17 +210,21 @@ class TOtelMetricsPusher : public IMetricsPusher { } std::string OperationType_; - std::map CommonAttributes_; // ref, sdk, sdk_version + std::string Ref_; + std::map CommonAttributes_; std::unique_ptr MeterProvider_; std::shared_ptr Meter_; - std::unique_ptr> ErrorsTotal_; - std::unique_ptr> OperationsTotal_; - std::unique_ptr> OperationsSuccessTotal_; - std::unique_ptr> OperationsFailureTotal_; - std::unique_ptr> OperationLatencySeconds_; - std::unique_ptr> RetryAttempts_; + std::unique_ptr> OperationsTotal_; + std::unique_ptr> RetryAttemptsTotal_; + std::unique_ptr> LatencyP50_; + std::unique_ptr> LatencyP95_; + std::unique_ptr> LatencyP99_; + + TLatencyRecorder Latency_; + std::thread PublisherThread_; + std::atomic PublisherShouldStop_{false}; }; class TNoopMetricsPusher : public IMetricsPusher { @@ -169,6 +232,8 @@ class TNoopMetricsPusher : public IMetricsPusher { void PushRequestData([[maybe_unused]] const TRequestData& requestData) override {} }; +} // namespace + std::unique_ptr CreateOtelMetricsPusher(const std::string& metricsPushUrl, const std::string& operationType) { return std::make_unique(metricsPushUrl, operationType); } diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp index b7572c2981..78231c8e1c 100644 --- a/tests/slo_workloads/utils/utils.cpp +++ b/tests/slo_workloads/utils/utils.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include #include @@ -110,6 +112,19 @@ std::string GetDatabase(const std::string& connectionString) { return {}; } +static std::string DefaultConnectionStringFromEnv() { + std::string cs = GetEnv("YDB_CONNECTION_STRING"); + if (!cs.empty()) { + return cs; + } + std::string endpoint = GetEnv("YDB_ENDPOINT"); + std::string database = GetEnv("YDB_DATABASE"); + if (!endpoint.empty() && !database.empty()) { + return TStringBuilder() << endpoint << "/?database=" << database; + } + return {}; +} + int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TCleanupCommand cleanup) { TOpts opts = TOpts::Default(); @@ -121,8 +136,15 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean std::string statConfigFile; std::string balancingPolicy; - opts.AddLongOption('c', "connection-string", "YDB connection string").Required().RequiredArgument("SCHEMA://HOST:PORT/?DATABASE=DATABASE") + std::string defaultConnectionString = DefaultConnectionStringFromEnv(); + + auto& connOpt = opts.AddLongOption('c', "connection-string", "YDB connection string").RequiredArgument("SCHEMA://HOST:PORT/?DATABASE=DATABASE") .StoreResult(&connectionString); + if (!defaultConnectionString.empty()) { + connOpt.DefaultValue(defaultConnectionString); + } else { + connOpt.Required(); + } opts.AddLongOption('p', "prefix", "Base prefix for tables").RequiredArgument("PATH") .StoreResult(&prefix); opts.AddLongOption('k', "token", "security token").RequiredArgument("TOKEN") @@ -136,7 +158,7 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean opts.AddLongOption('b', "balancing-policy", "Balancing policy").Optional().DefaultValue("use-all-nodes").RequiredArgument("(use-all-nodes|prefer-local-dc|prefer-primary-pile)") .StoreResult(&balancingPolicy); opts.AddHelpOption('h'); - opts.SetFreeArgsMin(1); + opts.SetFreeArgsMin(0); opts.SetFreeArgTitle(0, "", GetCmdList()); opts.ArgPermutation_ = NLastGetopt::REQUIRE_ORDER; @@ -144,7 +166,8 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean size_t freeArgsPos = res.GetFreeArgsPos(); argc -= freeArgsPos; argv += freeArgsPos; - ECommandType command = ParseCommand(*argv); + + ECommandType command = (argc > 0) ? ParseCommand(*argv) : ECommandType::All; if (command == ECommandType::Unknown) { Cerr << "Unknown command '" << *argv << "'" << Endl; return EXIT_FAILURE; @@ -202,6 +225,28 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean Cout << "Launching cleanup command..." << Endl; result = cleanup(dbOptions, argc); break; + case ECommandType::All: { + Cout << "Launching full lifecycle: create -> run -> cleanup" << Endl; + // Synthesize argv with a fake program name so the inner NLastGetopt + // parsers (ParseOptionsCreate / ParseOptionsRun) treat argv[0] + // as the program name and parse zero real args. + char programName[] = "slo"; + char* fakeArgv[] = { programName, nullptr }; + int fakeArgc = 1; + + Cout << "[all] Launching create command..." << Endl; + result = create(dbOptions, fakeArgc, fakeArgv); + if (!result) { + Cout << "[all] Launching run command..." << Endl; + result = run(dbOptions, fakeArgc, fakeArgv); + } + Cout << "[all] Launching cleanup command..." << Endl; + int cleanupRc = cleanup(dbOptions, fakeArgc); + if (!result) { + result = cleanupRc; + } + break; + } default: Cerr << "Unknown command" << Endl; return EXIT_FAILURE; @@ -216,7 +261,7 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean } std::string GetCmdList() { - return "create, run, cleanup"; + return "create, run, cleanup (omit to run create -> run -> cleanup in one process)"; } ECommandType ParseCommand(const char* cmd) { @@ -425,6 +470,11 @@ TTableStats GetTableStats(TDatabaseOptions& dbOptions, const std::string& tableN } void ParseOptionsCommon(TOpts& opts, TCommonOptions& options) { + std::string metricsPushUrlFromEnv = GetEnv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT"); + if (!metricsPushUrlFromEnv.empty()) { + options.MetricsPushUrl = metricsPushUrlFromEnv; + } + opts.AddLongOption("threads", "Number of threads to use").RequiredArgument("NUM") .DefaultValue(options.MaxInputThreads).StoreResult(&options.MaxInputThreads); opts.AddLongOption("stop-on-error", "Stop thread if an error occured").NoArgument() @@ -485,6 +535,18 @@ bool ParseOptionsCreate(int argc, char** argv, TCreateOptions& createOptions) { bool ParseOptionsRun(int argc, char** argv, TRunOptions& runOptions) { TOpts opts = TOpts::Default(); ParseOptionsCommon(opts, runOptions.CommonOptions); + + if (std::string workloadDuration = GetEnv("WORKLOAD_DURATION"); !workloadDuration.empty()) { + try { + std::uint32_t parsed = FromString(workloadDuration); + if (parsed > 0) { + runOptions.CommonOptions.SecondsToRun = parsed; + } + } catch (const std::exception& e) { + Cerr << "Invalid WORKLOAD_DURATION env value '" << workloadDuration << "': " << e.what() << Endl; + } + } + opts.AddLongOption("time", "Time to run (Seconds)").RequiredArgument("Seconds") .DefaultValue(runOptions.CommonOptions.SecondsToRun).StoreResult(&runOptions.CommonOptions.SecondsToRun); opts.AddLongOption("read-rps", "Request generation rate for read requests (Thread A)").RequiredArgument("NUM") diff --git a/tests/slo_workloads/utils/utils.h b/tests/slo_workloads/utils/utils.h index 65be9f4891..3eb3c48978 100644 --- a/tests/slo_workloads/utils/utils.h +++ b/tests/slo_workloads/utils/utils.h @@ -98,7 +98,8 @@ enum class ECommandType { Unknown, Create, Run, - Cleanup + Cleanup, + All, // No free-arg passed: execute Create -> Run -> Cleanup in one process }; struct TTableStats { From 4d8f5385b6954e23111c1bea143367c852114a17 Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Tue, 5 May 2026 17:07:41 +0300 Subject: [PATCH 02/21] ci(slo): delegate to ydb-slo-action/init@v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite the SLO workflows following the pattern used by ydb-java-sdk (ydb-platform/ydb-java-sdk#644). slo.yml: - Drop the hand-rolled docker run orchestration that spun up YDB and invoked the workload with --dont-push / explicit create/run phases. The v2 action owns that lifecycle via deploy/compose.yml — we just hand it two prebuilt images. - Gate on the `SLO` PR label. - Build both images with a single `docker build` per ref; if the baseline commit can't be built (missing Dockerfile or compile error on a historical SHA), fall back to the current image so the run is comparable against itself rather than silently failing. - Drop `--build-arg REF=…` — ref is now read from WORKLOAD_REF env at runtime. - Rename matrix entry to `cpp-key-value` to match the built binary and collapse the per-compiler matrix to a single clang entry (gcc variant can be added back when needed). slo_report.yml: - Pin to @v2. - Add a second job that removes the `SLO` label from the PR after the report is published, matching js-sdk/java-sdk/go-sdk. --- .github/workflows/slo.yml | 278 ++++++++----------------------- .github/workflows/slo_report.yml | 25 ++- 2 files changed, 95 insertions(+), 208 deletions(-) diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml index 0618526617..4cdf4bfec8 100644 --- a/.github/workflows/slo.yml +++ b/.github/workflows/slo.yml @@ -2,275 +2,143 @@ name: SLO on: pull_request: - types: [opened, reopened, synchronize] - branches: - - main - workflow_dispatch: - inputs: - github_issue: - description: "GitHub issue number where the SLO results will be reported" - required: true - baseline_ref: - description: "Baseline commit/branch/tag to compare against (leave empty to auto-detect merge-base with main)" - required: false - slo_workload_duration_seconds: - description: "Duration of the SLO workload in seconds" - required: false - default: "600" - slo_workload_read_max_rps: - description: "Maximum read RPS for the SLO workload" - required: false - default: "1000" - slo_workload_write_max_rps: - description: "Maximum write RPS for the SLO workload" - required: false - default: "100" + types: [opened, reopened, synchronize, labeled] jobs: ydb-slo-action: + if: contains(github.event.pull_request.labels.*.name, 'SLO') + name: Run YDB SLO Tests runs-on: ubuntu-latest + permissions: + contents: read + strategy: + fail-fast: false matrix: - compiler: [clang, gcc] - include: - - workload: table + sdk: + - name: cpp-key-value + preset: release-test-clang + command: "" concurrency: - group: slo-${{ github.ref }}-${{ matrix.os }}-${{ matrix.workload }}-${{ matrix.compiler }} + group: slo-${{ github.ref }}-${{ matrix.sdk.name }} cancel-in-progress: true steps: - name: Install dependencies run: | + set -euxo pipefail YQ_VERSION=v4.48.2 BUILDX_VERSION=0.30.1 COMPOSE_VERSION=2.40.3 - sudo curl -L https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64 -o /usr/local/bin/yq && \ - sudo chmod +x /usr/local/bin/yq + sudo curl -fLo /usr/local/bin/yq \ + "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" + sudo chmod +x /usr/local/bin/yq - echo "Updating Docker plugins..." sudo mkdir -p /usr/local/lib/docker/cli-plugins - echo "Installing Docker Buildx ${BUILDX_VERSION}..." sudo curl -fLo /usr/local/lib/docker/cli-plugins/docker-buildx \ "https://github.com/docker/buildx/releases/download/v${BUILDX_VERSION}/buildx-v${BUILDX_VERSION}.linux-amd64" sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-buildx - echo "Installing Docker Compose ${COMPOSE_VERSION}..." sudo curl -fLo /usr/local/lib/docker/cli-plugins/docker-compose \ "https://github.com/docker/compose/releases/download/v${COMPOSE_VERSION}/docker-compose-linux-x86_64" sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-compose - echo "Installed versions:" yq --version docker --version docker buildx version docker compose version - - name: Checkout current version + - name: Checkout current SDK version uses: actions/checkout@v5 with: - path: current + path: sdk-current fetch-depth: 0 submodules: true - name: Determine baseline commit id: baseline + working-directory: sdk-current run: | - cd current - if [[ -n "${{ inputs.baseline_ref }}" ]]; then - BASELINE="${{ inputs.baseline_ref }}" - else - BASELINE=$(git merge-base HEAD origin/main) - fi - echo "sha=$BASELINE" >> $GITHUB_OUTPUT + set -euo pipefail + BASELINE=$(git merge-base HEAD origin/main) + echo "sha=${BASELINE}" >> "$GITHUB_OUTPUT" - # Try to determine a human-readable ref name for baseline - # Check if baseline is on main - if git merge-base --is-ancestor $BASELINE origin/main && \ - [ "$(git rev-parse origin/main)" = "$BASELINE" ]; then + if git merge-base --is-ancestor "${BASELINE}" origin/main && \ + [ "$(git rev-parse origin/main)" = "${BASELINE}" ]; then BASELINE_REF="main" else - # Try to find a branch containing this commit - BRANCH=$(git branch -r --contains $BASELINE | grep -v HEAD | head -1 | sed 's/.*\///' || echo "") - if [ -n "$BRANCH" ]; then + BRANCH=$(git branch -r --contains "${BASELINE}" | grep -v HEAD | head -1 | sed 's|.*/||' || echo "") + if [ -n "${BRANCH}" ]; then BASELINE_REF="${BRANCH}@${BASELINE:0:7}" else BASELINE_REF="${BASELINE:0:7}" fi fi - echo "ref=$BASELINE_REF" >> $GITHUB_OUTPUT + echo "ref=${BASELINE_REF}" >> "$GITHUB_OUTPUT" - - name: Checkout baseline version + - name: Checkout baseline SDK version uses: actions/checkout@v5 with: ref: ${{ steps.baseline.outputs.sha }} - path: baseline + path: sdk-baseline fetch-depth: 1 submodules: true - - name: Build Workload Image + - name: Build current workload image + working-directory: sdk-current run: | - echo "Cleaning up Docker system before builds..." - docker system prune -af --volumes - docker builder prune -af - df -h - - # Build current version - if [ -f "$GITHUB_WORKSPACE/current/tests/slo_workloads/Dockerfile" ]; then - echo "Building current app image..." - cd "$GITHUB_WORKSPACE/current" - - # Use SLO-specific .dockerignore - cp tests/slo_workloads/.dockerignore .dockerignore - - docker build -t ydb-app-current \ - --build-arg REF="${{ github.head_ref || github.ref_name }}" \ - --build-arg PRESET=release-test-${{ matrix.compiler }} \ - -f tests/slo_workloads/Dockerfile . - - # Clean up .dockerignore - rm -f .dockerignore - else - echo "No current app Dockerfile found" - exit 1 + set -euxo pipefail + cp tests/slo_workloads/.dockerignore .dockerignore + docker build \ + --platform linux/amd64 \ + --build-arg PRESET=${{ matrix.sdk.preset }} \ + -t ydb-app-current \ + -f tests/slo_workloads/Dockerfile \ + . + rm -f .dockerignore + + - name: Build baseline workload image + working-directory: sdk-baseline + run: | + set -euxo pipefail + # If the historical commit lacks the SLO workload files or can't + # compile, fall back to the current image so the SLO run is still + # comparable against itself rather than silently failing. + if [ ! -f tests/slo_workloads/Dockerfile ]; then + echo "Baseline commit has no SLO Dockerfile; reusing current image" + docker tag ydb-app-current ydb-app-baseline + exit 0 fi - docker system prune -f --volumes - docker builder prune -af - - # Build baseline version - if [ -f "$GITHUB_WORKSPACE/baseline/tests/slo_workloads/Dockerfile" ]; then - echo "Building baseline app image..." - cd "$GITHUB_WORKSPACE/baseline" - - # Use SLO-specific .dockerignore - cp tests/slo_workloads/.dockerignore .dockerignore - - docker build -t ydb-app-baseline \ - --build-arg REF="${{ steps.baseline.outputs.ref }}" \ - --build-arg PRESET=release-test-${{ matrix.compiler }} \ - -f tests/slo_workloads/Dockerfile . - - # Clean up .dockerignore - rm -f .dockerignore - else - echo "No baseline app Dockerfile found" - exit 1 + cp tests/slo_workloads/.dockerignore .dockerignore + if ! docker build \ + --platform linux/amd64 \ + --build-arg PRESET=${{ matrix.sdk.preset }} \ + -t ydb-app-baseline \ + -f tests/slo_workloads/Dockerfile \ + . + then + echo "Baseline build failed; reusing current image" + docker tag ydb-app-current ydb-app-baseline fi + rm -f .dockerignore - docker system prune -f --volumes - docker builder prune -af - - echo "Final disk space after builds:" - df -h - - - name: Initialize YDB SLO - uses: ydb-platform/ydb-slo-action/init@main + - name: Run SLO Tests + uses: ydb-platform/ydb-slo-action/init@v2 + timeout-minutes: 30 with: - github_issue: ${{ github.event.inputs.github_issue }} + github_issue: ${{ github.event.pull_request.number }} github_token: ${{ secrets.GITHUB_TOKEN }} - workload_name: ${{ matrix.workload }}-${{ matrix.compiler }} + workload_name: ${{ matrix.sdk.name }} + workload_duration: "600" workload_current_ref: ${{ github.head_ref || github.ref_name }} + workload_current_image: ydb-app-current + workload_current_command: ${{ matrix.sdk.command }} --read-rps 1000 --write-rps 100 workload_baseline_ref: ${{ steps.baseline.outputs.ref }} - - - name: Prepare SLO Database - run: | - echo "Preparing SLO database..." - docker run --rm --network ydb_ydb-net \ - --add-host "ydb:172.28.0.11" \ - --add-host "ydb:172.28.0.12" \ - --add-host "ydb:172.28.0.13" \ - --add-host "ydb:172.28.0.99" \ - ydb-app-current --connection-string grpc://ydb:2136/?database=/Root/testdb create --dont-push - - - name: Run SLO Tests (parallel) - timeout-minutes: 15 - run: | - DURATION=${{ inputs.slo_workload_duration_seconds || 600 }} - READ_RPS=${{ inputs.slo_workload_read_max_rps || 1000 }} - WRITE_RPS=${{ inputs.slo_workload_write_max_rps || 100 }} - - ARGS="--connection-string grpc://ydb:2136/?database=/Root/testdb run \ - --metrics-push-url http://prometheus:9090/api/v1/otlp/v1/metrics \ - --time $DURATION \ - --read-rps $READ_RPS \ - --write-rps $WRITE_RPS \ - --read-timeout 100 \ - --write-timeout 100" - - echo "Starting ydb-app-current..." - docker run -d \ - --name ydb-app-current \ - --network ydb_ydb-net \ - --add-host "ydb:172.28.0.11" \ - --add-host "ydb:172.28.0.12" \ - --add-host "ydb:172.28.0.13" \ - --add-host "ydb:172.28.0.99" \ - ydb-app-current $ARGS - - echo "Starting ydb-app-baseline..." - docker run -d \ - --name ydb-app-baseline \ - --network ydb_ydb-net \ - --add-host "ydb:172.28.0.11" \ - --add-host "ydb:172.28.0.12" \ - --add-host "ydb:172.28.0.13" \ - --add-host "ydb:172.28.0.99" \ - ydb-app-baseline $ARGS - - # Show initial logs - echo "" - echo "==================== INITIAL CURRENT LOGS ====================" - docker logs -n 15 ydb-app-current 2>&1 || echo "No current container" - echo "" - echo "==================== INITIAL BASELINE LOGS ====================" - docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container" - echo "" - - # Wait for workloads to complete - echo "Waiting for workloads to complete (${DURATION}s)..." - sleep ${DURATION} - - # Stop containers after workload duration and wait for graceful shutdown - echo "Stopping containers after ${DURATION}s..." - docker stop --timeout=30 ydb-app-current ydb-app-baseline 2>&1 || true - - # Force kill if still running - docker kill ydb-app-current ydb-app-baseline 2>&1 || true - - # Check exit codes - CURRENT_EXIT=$(docker inspect ydb-app-current --format='{{.State.ExitCode}}' 2>/dev/null || echo "1") - BASELINE_EXIT=$(docker inspect ydb-app-baseline --format='{{.State.ExitCode}}' 2>/dev/null || echo "0") - - echo "Current container exit code: $CURRENT_EXIT" - echo "Baseline container exit code: $BASELINE_EXIT" - - # Show final logs - echo "" - echo "==================== FINAL CURRENT LOGS ====================" - docker logs -n 15 ydb-app-current 2>&1 || echo "No current container" - echo "" - echo "==================== FINAL BASELINE LOGS ====================" - docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container" - echo "" - - echo "SUCCESS: Workloads completed successfully" - - - if: always() - name: Store logs - run: | - docker logs ydb-app-current > current.log 2>&1 || echo "No current container" - docker logs ydb-app-baseline > baseline.log 2>&1 || echo "No baseline container" - - - if: always() - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.workload }}-${{ matrix.compiler }}-slo-cpp-sdk-logs - path: | - ./current.log - ./baseline.log - retention-days: 1 + workload_baseline_image: ydb-app-baseline + workload_baseline_command: ${{ matrix.sdk.command }} --read-rps 1000 --write-rps 100 diff --git a/.github/workflows/slo_report.yml b/.github/workflows/slo_report.yml index 0a7c2e3483..b2bbee5172 100644 --- a/.github/workflows/slo_report.yml +++ b/.github/workflows/slo_report.yml @@ -7,17 +7,36 @@ on: - completed jobs: - ydb-slo-action-report: + publish-slo-report: + if: github.event.workflow_run.conclusion == 'success' runs-on: ubuntu-latest name: Publish YDB SLO Report permissions: checks: write contents: read pull-requests: write - if: github.event.workflow_run.conclusion == 'success' steps: - name: Publish YDB SLO Report - uses: ydb-platform/ydb-slo-action/report@main + uses: ydb-platform/ydb-slo-action/report@v2 with: github_token: ${{ secrets.GITHUB_TOKEN }} github_run_id: ${{ github.event.workflow_run.id }} + + remove-slo-label: + if: github.event.workflow_run.event == 'pull_request' + runs-on: ubuntu-latest + name: Remove SLO Label + permissions: + pull-requests: write + steps: + - name: Remove SLO label from PR + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }} + REPO: ${{ github.event.workflow_run.repository.full_name }} + run: | + set -euo pipefail + PR=$(jq -r '.[0].number' <<<"$PRS") + if [ "$PR" != "null" ] && [ -n "$PR" ]; then + gh pr edit "$PR" --repo "$REPO" --remove-label SLO + fi From 2d9a928d10ab416c7c2b92c1bbe1e7759367db31 Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Tue, 5 May 2026 17:11:59 +0300 Subject: [PATCH 03/21] tests/slo_workloads: address Copilot review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Counters: switch OperationsTotal/RetryAttemptsTotal from DoubleCounter to UInt64Counter. Operation counts and retry attempts are inherently integer; float representation could drift for very high cumulative values and doesn't match the Prometheus counter convention. - Percentile gauges: publish 0.0 on empty-window intervals instead of returning early. OTel's sync Gauge holds the last Record() value and the periodic exporter re-emits it every collection cycle — without an explicit reset, gauges would look "stuck" at the last non-empty value after load stops, contradicting the per-second HDR reset semantics. --- tests/slo_workloads/utils/metrics.cpp | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp index 996a48780f..4506c1ca39 100644 --- a/tests/slo_workloads/utils/metrics.cpp +++ b/tests/slo_workloads/utils/metrics.cpp @@ -138,7 +138,7 @@ class TOtelMetricsPusher : public IMetricsPusher { const bool success = requestData.Status == NYdb::EStatus::SUCCESS; const std::string status = success ? "success" : "error"; - OperationsTotal_->Add(1, MergeAttributes({ + OperationsTotal_->Add(uint64_t{1}, MergeAttributes({ {"operation_type", OperationType_}, {"operation_status", status}, })); @@ -146,7 +146,7 @@ class TOtelMetricsPusher : public IMetricsPusher { // sdk_retry_attempts_total = total number of technical attempts // including the first one. TStatUnit counts only post-first attempts, // so add 1 to include the initial attempt. - RetryAttemptsTotal_->Add(static_cast(requestData.RetryAttempts + 1), + RetryAttemptsTotal_->Add(requestData.RetryAttempts + 1, MergeAttributes({ {"operation_type", OperationType_}, }) @@ -159,11 +159,11 @@ class TOtelMetricsPusher : public IMetricsPusher { private: void InitMetrics() { - OperationsTotal_ = Meter_->CreateDoubleCounter("sdk_operations_total", + OperationsTotal_ = Meter_->CreateUInt64Counter("sdk_operations_total", "Total number of operations, categorized by operation type and status." ); - RetryAttemptsTotal_ = Meter_->CreateDoubleCounter("sdk_retry_attempts_total", + RetryAttemptsTotal_ = Meter_->CreateUInt64Counter("sdk_retry_attempts_total", "Total number of retry attempts (including the first attempt), categorized by operation type." ); @@ -191,16 +191,18 @@ class TOtelMetricsPusher : public IMetricsPusher { void PublishPercentiles() { auto snapshot = Latency_.SnapshotAndReset(); - if (!snapshot.HasData) { - return; - } auto attrs = MergeAttributes({ {"operation_type", OperationType_}, {"operation_status", "success"}, }); - LatencyP50_->Record(snapshot.P50, attrs); - LatencyP95_->Record(snapshot.P95, attrs); - LatencyP99_->Record(snapshot.P99, attrs); + // When no successful ops landed in the last second, publish 0.0 + // for all percentiles so the gauges reset with the HDR window + // rather than appearing "stuck" at the last non-empty value (the + // OTel periodic exporter would otherwise re-emit the previous + // Record() value on every collection cycle). + LatencyP50_->Record(snapshot.HasData ? snapshot.P50 : 0.0, attrs); + LatencyP95_->Record(snapshot.HasData ? snapshot.P95 : 0.0, attrs); + LatencyP99_->Record(snapshot.HasData ? snapshot.P99 : 0.0, attrs); } std::map MergeAttributes(const std::map& metricAttrs) const { @@ -216,8 +218,8 @@ class TOtelMetricsPusher : public IMetricsPusher { std::unique_ptr MeterProvider_; std::shared_ptr Meter_; - std::unique_ptr> OperationsTotal_; - std::unique_ptr> RetryAttemptsTotal_; + std::unique_ptr> OperationsTotal_; + std::unique_ptr> RetryAttemptsTotal_; std::unique_ptr> LatencyP50_; std::unique_ptr> LatencyP95_; std::unique_ptr> LatencyP99_; From cbe3597d10ed9ac7f2b7af9229dd4ed7d1602f3f Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Tue, 5 May 2026 17:27:27 +0300 Subject: [PATCH 04/21] tests/slo_workloads: make apt tolerant of PPA timeouts Shared CI runners (and local docker builds) periodically hit connection timeouts on ppa.launchpadcontent.net. Telling apt itself to retry via Acquire::Retries=5 plus a 60 s connect timeout handles the blip in-place without a shell retry loop. --- tests/slo_workloads/Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile index 65d9c7693b..ca9d77d5fe 100644 --- a/tests/slo_workloads/Dockerfile +++ b/tests/slo_workloads/Dockerfile @@ -2,6 +2,12 @@ FROM ubuntu:22.04 ARG PRESET=release-test-clang +# Make apt tolerant of transient PPA/mirror timeouts (shared runners see +# ppa.launchpadcontent.net connection timeouts every few builds). +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo 'Acquire::http::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries && \ + echo 'Acquire::https::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries + # Install software-properties-common for add-apt-repository RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test From 1799ac35f1c15a1ff84368c04beee6022a328a1c Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Tue, 5 May 2026 18:06:21 +0300 Subject: [PATCH 05/21] ci(slo): make 30-min build resilient to network flakes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two separate issues were making every SLO PR a coin flip: 1. Any transient network error (GitHub release download, PPA timeout, abseil/protobuf/grpc tarball) kills the entire 30-min build. Each of the 8 wget calls and both apt-get steps is now retried: - wget gets --tries=5 --waitretry=15 --timeout=60 --retry-connrefused --retry-on-http-error=500,502,503,504 via a shared WGET_OPTS env var. - apt already has Acquire::Retries=5. 2. Every CI run does a full cold build. The Dockerfile's toolchain and dep layers (~25 min) never change between SDK commits, so caching them is trivially safe. - Switch slo.yml to docker/build-push-action@v6 with GHA type=gha cache export/import, scoped per preset. - First build on a runner still takes ~30 min; subsequent builds where only the SDK source changed should take ~3 min. - Baseline build uses continue-on-error + an explicit fallback step that retags ydb-app-current as ydb-app-baseline when the historical commit won't compile — replaces the inline shell if-else that docker/build-push-action can't express. --- .github/workflows/slo.yml | 78 ++++++++++++++++++++-------------- tests/slo_workloads/Dockerfile | 25 ++++++----- 2 files changed, 59 insertions(+), 44 deletions(-) diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml index 4cdf4bfec8..0982b7125c 100644 --- a/.github/workflows/slo.yml +++ b/.github/workflows/slo.yml @@ -89,44 +89,56 @@ jobs: fetch-depth: 1 submodules: true - - name: Build current workload image - working-directory: sdk-current - run: | - set -euxo pipefail - cp tests/slo_workloads/.dockerignore .dockerignore - docker build \ - --platform linux/amd64 \ - --build-arg PRESET=${{ matrix.sdk.preset }} \ - -t ydb-app-current \ - -f tests/slo_workloads/Dockerfile \ - . - rm -f .dockerignore + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 - - name: Build baseline workload image - working-directory: sdk-baseline + # .dockerignore lives under tests/slo_workloads/; buildx expects it at + # the context root, so place a copy in each SDK checkout. + - name: Stage .dockerignore run: | set -euxo pipefail - # If the historical commit lacks the SLO workload files or can't - # compile, fall back to the current image so the SLO run is still - # comparable against itself rather than silently failing. - if [ ! -f tests/slo_workloads/Dockerfile ]; then - echo "Baseline commit has no SLO Dockerfile; reusing current image" - docker tag ydb-app-current ydb-app-baseline - exit 0 + cp sdk-current/tests/slo_workloads/.dockerignore sdk-current/.dockerignore + if [ -f sdk-baseline/tests/slo_workloads/.dockerignore ]; then + cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore fi - cp tests/slo_workloads/.dockerignore .dockerignore - if ! docker build \ - --platform linux/amd64 \ - --build-arg PRESET=${{ matrix.sdk.preset }} \ - -t ydb-app-baseline \ - -f tests/slo_workloads/Dockerfile \ - . - then - echo "Baseline build failed; reusing current image" - docker tag ydb-app-current ydb-app-baseline - fi - rm -f .dockerignore + # A clean build of the SLO image takes ~30 min because the Dockerfile + # rebuilds the full C++ toolchain + abseil/protobuf/grpc from source. + # The GHA cache lets subsequent runs reuse every layer up to the SDK + # source COPY, so only the actual workload link step reruns (~3 min). + - name: Build current workload image + uses: docker/build-push-action@v6 + with: + context: sdk-current + file: sdk-current/tests/slo_workloads/Dockerfile + platforms: linux/amd64 + tags: ydb-app-current + load: true + build-args: PRESET=${{ matrix.sdk.preset }} + cache-from: type=gha,scope=slo-${{ matrix.sdk.preset }} + cache-to: type=gha,mode=max,scope=slo-${{ matrix.sdk.preset }} + + - name: Build baseline workload image + id: baseline-build + continue-on-error: true + uses: docker/build-push-action@v6 + with: + context: sdk-baseline + file: sdk-baseline/tests/slo_workloads/Dockerfile + platforms: linux/amd64 + tags: ydb-app-baseline + load: true + build-args: PRESET=${{ matrix.sdk.preset }} + cache-from: type=gha,scope=slo-${{ matrix.sdk.preset }} + + # If the historical commit lacks the SLO Dockerfile or can't compile, + # reuse the current image so the SLO run is still comparable against + # itself rather than failing outright. + - name: Fall back to current image for baseline + if: steps.baseline-build.outcome == 'failure' + run: | + echo "Baseline build failed; reusing current image as baseline." + docker tag ydb-app-current ydb-app-baseline - name: Run SLO Tests uses: ydb-platform/ydb-slo-action/init@v2 diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile index ca9d77d5fe..7f8ea5e78e 100644 --- a/tests/slo_workloads/Dockerfile +++ b/tests/slo_workloads/Dockerfile @@ -2,12 +2,15 @@ FROM ubuntu:22.04 ARG PRESET=release-test-clang -# Make apt tolerant of transient PPA/mirror timeouts (shared runners see -# ppa.launchpadcontent.net connection timeouts every few builds). +# Every RUN that hits the network retries on transient failures so one +# flake doesn't throw away 30 min of previous build work. apt gets five +# Acquire retries + 60 s timeouts; wget gets the equivalent via WGET_OPTS. RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ echo 'Acquire::http::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries && \ echo 'Acquire::https::Timeout "60";' >> /etc/apt/apt.conf.d/80-retries +ENV WGET_OPTS="--tries=5 --waitretry=15 --timeout=60 --retry-connrefused --retry-on-http-error=500,502,503,504" + # Install software-properties-common for add-apt-repository RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test @@ -20,7 +23,7 @@ RUN apt-get -y update && apt-get -y install \ # Install CMake ENV CMAKE_VERSION=3.27.7 -RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh \ +RUN wget $WGET_OPTS https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh \ -q -O cmake-install.sh \ && chmod u+x cmake-install.sh \ && ./cmake-install.sh --skip-license --prefix=/usr/local \ @@ -28,7 +31,7 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cm # Install LLVM ENV LLVM_VERSION=16 -RUN wget https://apt.llvm.org/llvm.sh && \ +RUN wget $WGET_OPTS https://apt.llvm.org/llvm.sh && \ chmod u+x llvm.sh && \ ./llvm.sh ${LLVM_VERSION} && \ rm llvm.sh @@ -45,7 +48,7 @@ RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 10000 && \ # Install abseil-cpp ENV ABSEIL_CPP_VERSION=20230802.0 ENV ABSEIL_CPP_INSTALL_DIR=/root/ydb_deps/absl -RUN wget -O abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz https://github.com/abseil/abseil-cpp/archive/refs/tags/${ABSEIL_CPP_VERSION}.tar.gz && \ +RUN wget $WGET_OPTS -O abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz https://github.com/abseil/abseil-cpp/archive/refs/tags/${ABSEIL_CPP_VERSION}.tar.gz && \ tar -xvzf abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz && cd abseil-cpp-${ABSEIL_CPP_VERSION} && \ mkdir build && cd build && \ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DABSL_PROPAGATE_CXX_STD=ON .. && \ @@ -56,7 +59,7 @@ RUN wget -O abseil-cpp-${ABSEIL_CPP_VERSION}.tar.gz https://github.com/abseil/ab # Install protobuf ENV PROTOBUF_VERSION=3.21.12 ENV PROTOBUF_INSTALL_DIR=/root/ydb_deps/protobuf -RUN wget -O protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/protocolbuffers/protobuf/archive/refs/tags/v${PROTOBUF_VERSION}.tar.gz && \ +RUN wget $WGET_OPTS -O protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/protocolbuffers/protobuf/archive/refs/tags/v${PROTOBUF_VERSION}.tar.gz && \ tar -xvzf protobuf-${PROTOBUF_VERSION}.tar.gz && cd protobuf-${PROTOBUF_VERSION} && \ mkdir build && cd build && \ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_INSTALL=ON -Dprotobuf_ABSL_PROVIDER=package .. && \ @@ -67,7 +70,7 @@ RUN wget -O protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/protocolbuffe # Install grpc ENV GRPC_VERSION=1.54.3 ENV GRPC_INSTALL_DIR=/root/ydb_deps/grpc -RUN wget -O grpc-${GRPC_VERSION}.tar.gz https://github.com/grpc/grpc/archive/refs/tags/v${GRPC_VERSION}.tar.gz && \ +RUN wget $WGET_OPTS -O grpc-${GRPC_VERSION}.tar.gz https://github.com/grpc/grpc/archive/refs/tags/v${GRPC_VERSION}.tar.gz && \ tar -xvzf grpc-${GRPC_VERSION}.tar.gz && cd grpc-${GRPC_VERSION} && \ mkdir build && cd build && \ cmake -G Ninja -DCMAKE_PREFIX_PATH="${ABSEIL_CPP_INSTALL_DIR};${PROTOBUF_INSTALL_DIR}" \ @@ -84,7 +87,7 @@ RUN wget -O grpc-${GRPC_VERSION}.tar.gz https://github.com/grpc/grpc/archive/ref # Install base64 ENV BASE64_VERSION=0.5.2 ENV BASE64_INSTALL_DIR=/root/ydb_deps/base64 -RUN wget -O base64-${BASE64_VERSION}.tar.gz https://github.com/aklomp/base64/archive/refs/tags/v${BASE64_VERSION}.tar.gz && \ +RUN wget $WGET_OPTS -O base64-${BASE64_VERSION}.tar.gz https://github.com/aklomp/base64/archive/refs/tags/v${BASE64_VERSION}.tar.gz && \ tar -xvzf base64-${BASE64_VERSION}.tar.gz && cd base64-${BASE64_VERSION} && \ mkdir build && cd build && \ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release .. && \ @@ -95,7 +98,7 @@ RUN wget -O base64-${BASE64_VERSION}.tar.gz https://github.com/aklomp/base64/arc # Install brotli ENV BROTLI_VERSION=1.1.0 ENV BROTLI_INSTALL_DIR=/root/ydb_deps/brotli -RUN wget -O brotli-${BROTLI_VERSION}.tar.gz https://github.com/google/brotli/archive/refs/tags/v${BROTLI_VERSION}.tar.gz && \ +RUN wget $WGET_OPTS -O brotli-${BROTLI_VERSION}.tar.gz https://github.com/google/brotli/archive/refs/tags/v${BROTLI_VERSION}.tar.gz && \ tar -xvzf brotli-${BROTLI_VERSION}.tar.gz && cd brotli-${BROTLI_VERSION} && \ mkdir build && cd build && \ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release .. && \ @@ -106,7 +109,7 @@ RUN wget -O brotli-${BROTLI_VERSION}.tar.gz https://github.com/google/brotli/arc # Install jwt-cpp ENV JWT_CPP_VERSION=0.7.0 ENV JWT_CPP_INSTALL_DIR=/root/ydb_deps/jwt-cpp -RUN wget -O jwt-cpp-${JWT_CPP_VERSION}.tar.gz https://github.com/Thalhammer/jwt-cpp/archive/refs/tags/v${JWT_CPP_VERSION}.tar.gz && \ +RUN wget $WGET_OPTS -O jwt-cpp-${JWT_CPP_VERSION}.tar.gz https://github.com/Thalhammer/jwt-cpp/archive/refs/tags/v${JWT_CPP_VERSION}.tar.gz && \ tar -xvzf jwt-cpp-${JWT_CPP_VERSION}.tar.gz && cd jwt-cpp-${JWT_CPP_VERSION} && \ mkdir build && cd build && \ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release .. && \ @@ -116,7 +119,7 @@ RUN wget -O jwt-cpp-${JWT_CPP_VERSION}.tar.gz https://github.com/Thalhammer/jwt- # Install ccache 4.8.1 or above ENV CCACHE_VERSION=4.8.1 -RUN wget https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz \ +RUN wget $WGET_OPTS https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz \ && tar -xf ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz \ && cp ccache-${CCACHE_VERSION}-linux-x86_64/ccache /usr/local/bin/ \ && rm -rf ccache-${CCACHE_VERSION}-linux-x86_64 ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz From da782d4c559470ede50c9857bbdb045d9bf4bb4f Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Tue, 5 May 2026 18:07:02 +0300 Subject: [PATCH 06/21] tests/slo_workloads: retry PPA/apt steps at the shell level The earlier Acquire::Retries=5 tweak covered HTTP-level errors but didn't handle TCP connect timeouts to ppa.launchpadcontent.net, which is the actual failure mode observed on shared CI runners. Wrap both the add-apt-repository step and the main apt-get install in shell retry loops (5 attempts, 15/30/45/60/75 s backoff) so a CDN blip no longer kills the 30-min build. --- tests/slo_workloads/Dockerfile | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile index 7f8ea5e78e..7680560cac 100644 --- a/tests/slo_workloads/Dockerfile +++ b/tests/slo_workloads/Dockerfile @@ -11,15 +11,34 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ ENV WGET_OPTS="--tries=5 --waitretry=15 --timeout=60 --retry-connrefused --retry-on-http-error=500,502,503,504" -# Install software-properties-common for add-apt-repository -RUN apt-get -y update && apt-get -y install software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test +# Install software-properties-common and add the gcc-13 PPA. +# Acquire::Retries only retries HTTP errors; TCP connect timeouts to +# ppa.launchpadcontent.net still drop through and kill the step. Wrap the +# whole command in a shell retry loop with exponential backoff so a CDN +# blip doesn't throw away 30 minutes of downstream build work. +RUN for i in 1 2 3 4 5; do \ + apt-get -y update && \ + apt-get -y install software-properties-common && \ + add-apt-repository -y ppa:ubuntu-toolchain-r/test && \ + apt-get -y update && \ + break; \ + echo "add-apt-repository attempt $i failed; sleeping $((i * 15))s"; \ + sleep $((i * 15)); \ + done && \ + apt-cache show gcc-13 > /dev/null # fail fast if PPA never came up # Install C++ tools and libraries -RUN apt-get -y update && apt-get -y install \ - git gdb wget ninja-build libidn11-dev ragel yasm libc-ares-dev libre2-dev \ - rapidjson-dev zlib1g-dev libxxhash-dev libzstd-dev libsnappy-dev libgtest-dev libgmock-dev \ - libbz2-dev liblz4-dev libdouble-conversion-dev libssl-dev libstdc++-13-dev gcc-13 g++-13 \ - && apt-get clean && rm -rf /var/lib/apt/lists/* +RUN for i in 1 2 3 4 5; do \ + apt-get -y install \ + git gdb wget ninja-build libidn11-dev ragel yasm libc-ares-dev libre2-dev \ + rapidjson-dev zlib1g-dev libxxhash-dev libzstd-dev libsnappy-dev libgtest-dev libgmock-dev \ + libbz2-dev liblz4-dev libdouble-conversion-dev libssl-dev libstdc++-13-dev gcc-13 g++-13 && \ + break; \ + echo "apt-get install attempt $i failed; sleeping $((i * 15))s"; \ + sleep $((i * 15)); \ + apt-get -y update || true; \ + done && \ + apt-get clean && rm -rf /var/lib/apt/lists/* # Install CMake ENV CMAKE_VERSION=3.27.7 From a8da8bb6de1fcec95e257bd7fc54f8e790d7deeb Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Tue, 5 May 2026 20:07:39 +0300 Subject: [PATCH 07/21] tests/slo_workloads: forward run-phase args in implicit All mode The v2 SLO action invokes the workload as `slo-key-value ` without a subcommand keyword. The global parser used to error on unknown long options like --read-rps; now it tolerates them and the implicit All branch passes the leftover argv to the run phase. --- tests/slo_workloads/utils/utils.cpp | 33 +++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp index 78231c8e1c..89a642b61d 100644 --- a/tests/slo_workloads/utils/utils.cpp +++ b/tests/slo_workloads/utils/utils.cpp @@ -161,6 +161,12 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean opts.SetFreeArgsMin(0); opts.SetFreeArgTitle(0, "", GetCmdList()); opts.ArgPermutation_ = NLastGetopt::REQUIRE_ORDER; + // Run-phase options (--read-rps, --write-rps, …) reach DoMain when the + // caller invokes the workload without an explicit subcommand (the v2 SLO + // action contract). Tolerate them here so the global parser stops at the + // first unknown option instead of erroring; they are forwarded to the + // run phase below. + opts.AllowUnknownLongOptions_ = true; TOptsParseResult res(&opts, argc, argv); size_t freeArgsPos = res.GetFreeArgsPos(); @@ -169,8 +175,14 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean ECommandType command = (argc > 0) ? ParseCommand(*argv) : ECommandType::All; if (command == ECommandType::Unknown) { - Cerr << "Unknown command '" << *argv << "'" << Endl; - return EXIT_FAILURE; + if (argv[0][0] == '-') { + // First leftover token is an option, not a subcommand keyword: + // treat as implicit All mode and let the run phase parse it. + command = ECommandType::All; + } else { + Cerr << "Unknown command '" << *argv << "'" << Endl; + return EXIT_FAILURE; + } } if (prefix.empty()) { @@ -227,18 +239,25 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean break; case ECommandType::All: { Cout << "Launching full lifecycle: create -> run -> cleanup" << Endl; - // Synthesize argv with a fake program name so the inner NLastGetopt - // parsers (ParseOptionsCreate / ParseOptionsRun) treat argv[0] - // as the program name and parse zero real args. + // Forward leftover argv to the run phase so options like + // --read-rps / --write-rps take effect. argv[0] here is the first + // run-phase option (no subcommand keyword was supplied), so + // prepend a synthetic program name for ParseOptionsRun. char programName[] = "slo"; - char* fakeArgv[] = { programName, nullptr }; + std::vector runArgv; + runArgv.reserve(argc + 1); + runArgv.push_back(programName); + for (int i = 0; i < argc; ++i) { + runArgv.push_back(argv[i]); + } int fakeArgc = 1; + char* fakeArgv[] = { programName, nullptr }; Cout << "[all] Launching create command..." << Endl; result = create(dbOptions, fakeArgc, fakeArgv); if (!result) { Cout << "[all] Launching run command..." << Endl; - result = run(dbOptions, fakeArgc, fakeArgv); + result = run(dbOptions, static_cast(runArgv.size()), runArgv.data()); } Cout << "[all] Launching cleanup command..." << Endl; int cleanupRc = cleanup(dbOptions, fakeArgc); From 4d27c9f612284ccd7a5e611c70b069ea2980b29f Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Wed, 6 May 2026 00:20:53 +0300 Subject: [PATCH 08/21] ci(slo): overlay current workload harness onto baseline checkout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The baseline image used to be built from the merge-base checkout's tests/slo_workloads/, so any harness change on the PR (Dockerfile, CLI parser, …) was absent from the baseline. SDK comparison should only vary the library, not the harness — so reuse current's harness on both sides. --- .github/workflows/slo.yml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml index 0982b7125c..63145bba8f 100644 --- a/.github/workflows/slo.yml +++ b/.github/workflows/slo.yml @@ -92,15 +92,19 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - # .dockerignore lives under tests/slo_workloads/; buildx expects it at - # the context root, so place a copy in each SDK checkout. - - name: Stage .dockerignore + # Use current's workload harness (Dockerfile, sources, .dockerignore) for + # both builds so only the SDK library differs between current and + # baseline. Without this the baseline image picks up the harness from + # the merge-base commit, which can lag behind the action's contract. + # buildx also expects .dockerignore at the context root, not under + # tests/, so copy it up in each checkout. + - name: Stage workload harness run: | set -euxo pipefail + rm -rf sdk-baseline/tests/slo_workloads + cp -a sdk-current/tests/slo_workloads sdk-baseline/tests/slo_workloads cp sdk-current/tests/slo_workloads/.dockerignore sdk-current/.dockerignore - if [ -f sdk-baseline/tests/slo_workloads/.dockerignore ]; then - cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore - fi + cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore # A clean build of the SLO image takes ~30 min because the Dockerfile # rebuilds the full C++ toolchain + abseil/protobuf/grpc from source. From a6caf4ef2a74dc198b9c36051445037f89dcefbe Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Wed, 6 May 2026 00:27:27 +0300 Subject: [PATCH 09/21] ci(slo): wire ccache into the cmake build via BuildKit cache mount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cmake configure/build now run under ccache as the C/C++ compiler launcher, with /root/.ccache exposed as a BuildKit cache mount so state persists across runs through cache-to=type=gha,mode=max (BuildKit ≥0.13 exports cache mounts under mode=max). Cold runs incur the usual full compile; warm runs reuse object hashes and should drop cmake --build from ~14 min to a few minutes. --- tests/slo_workloads/Dockerfile | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/slo_workloads/Dockerfile b/tests/slo_workloads/Dockerfile index 7680560cac..f87be8f8df 100644 --- a/tests/slo_workloads/Dockerfile +++ b/tests/slo_workloads/Dockerfile @@ -1,7 +1,16 @@ +# syntax=docker/dockerfile:1.7 FROM ubuntu:22.04 ARG PRESET=release-test-clang +# ccache settings consumed by the configure/build steps below. The cache dir +# is materialised by the BuildKit cache mount on those RUN steps; values +# elsewhere in the image are inert. +ENV CCACHE_DIR=/root/.ccache +ENV CCACHE_MAXSIZE=2G +ENV CCACHE_COMPRESS=true +ENV CCACHE_COMPILERCHECK=content + # Every RUN that hits the network retries on transient failures so one # flake doesn't throw away 30 min of previous build work. apt gets five # Acquire retries + 60 s timeouts; wget gets the equivalent via WGET_OPTS. @@ -147,7 +156,13 @@ COPY . /ydb-cpp-sdk WORKDIR /ydb-cpp-sdk RUN rm -rf build -RUN cmake --preset ${PRESET} -RUN cmake --build --preset default --target slo-key-value +RUN --mount=type=cache,target=/root/.ccache,sharing=locked \ + cmake --preset ${PRESET} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache +RUN --mount=type=cache,target=/root/.ccache,sharing=locked \ + ccache --zero-stats >/dev/null \ + && cmake --build --preset default --target slo-key-value \ + && ccache --show-stats ENTRYPOINT ["./build/tests/slo_workloads/key_value/slo-key-value"] From 09e662f97d5c2dd8c8a0b5b9e4c78fe745c2141e Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Wed, 6 May 2026 03:17:03 +0300 Subject: [PATCH 10/21] tests/slo_workloads: read prefix from YDB_DATABASE when connection-string lacks ?database= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The v2 SLO action provides YDB_CONNECTION_STRING in path form (grpc://host:port/Root/testdb), which GetDatabase can't parse, so prefix stayed empty and create issued CreateTable("key_value") without the database root → BAD_REQUEST. Falling back to the YDB_DATABASE env var (which the action also sets) restores the correct table path. --- tests/slo_workloads/utils/utils.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp index 89a642b61d..f52461283b 100644 --- a/tests/slo_workloads/utils/utils.cpp +++ b/tests/slo_workloads/utils/utils.cpp @@ -188,6 +188,12 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean if (prefix.empty()) { prefix = GetDatabase(connectionString); } + if (prefix.empty()) { + // YDB SLO action sets YDB_CONNECTION_STRING in path form + // (grpc://host:port/Root/testdb), which GetDatabase can't parse. + // Fall back to YDB_DATABASE which the action sets alongside it. + prefix = GetEnv("YDB_DATABASE"); + } if (!ParseToken(token, tokenFile)) { return EXIT_FAILURE; From a581162623f4982bcf4e81a7321328d5146431f6 Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Wed, 6 May 2026 03:23:57 +0300 Subject: [PATCH 11/21] ci(slo): persist ccache between runs via actions/cache + buildkit-cache-dance cache-to: type=gha,mode=max exports layer cache but not the contents of --mount=type=cache, so the ccache mount started empty on every run (0% hit rate observed). Restore /root/.ccache from a host dir backed by actions/cache, inject it into BuildKit before each build via the cache-dance action, and extract the updated state for the next run. Same scope is shared by current and baseline since the SDK code overlap dominates ccache hit potential. --- .github/workflows/slo.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml index 63145bba8f..db9e4d9841 100644 --- a/.github/workflows/slo.yml +++ b/.github/workflows/slo.yml @@ -106,6 +106,33 @@ jobs: cp sdk-current/tests/slo_workloads/.dockerignore sdk-current/.dockerignore cp sdk-baseline/tests/slo_workloads/.dockerignore sdk-baseline/.dockerignore + # `cache-to: type=gha` does NOT export `--mount=type=cache` content, so + # ccache state is lost between runs. Persist /root/.ccache via host + # directory + cache-dance: actions/cache restores the host dir, the + # dance injects it into the BuildKit cache mount before the build and + # extracts the updated state afterwards for the next save. + - name: Restore ccache + id: ccache + uses: actions/cache@v4 + with: + path: ccache + key: slo-ccache-${{ matrix.sdk.preset }}-${{ github.run_id }} + restore-keys: | + slo-ccache-${{ matrix.sdk.preset }}- + + - name: Inject ccache into BuildKit + uses: reproducible-containers/buildkit-cache-dance@v3.1.2 + with: + cache-map: | + { + "ccache": "/root/.ccache" + } + # Always extract so newly-compiled TUs from this run are saved by + # actions/cache (key uses ${{ github.run_id }}, so each run gets + # its own snapshot). Without extraction the cache stays frozen at + # whatever was first persisted. + skip-extraction: false + # A clean build of the SLO image takes ~30 min because the Dockerfile # rebuilds the full C++ toolchain + abseil/protobuf/grpc from source. # The GHA cache lets subsequent runs reuse every layer up to the SDK From 4436c0751491ed2e93bf359219e31b5e185ff57d Mon Sep 17 00:00:00 2001 From: Artem Ermoshkin Date: Fri, 29 May 2026 21:39:17 +0300 Subject: [PATCH 12/21] fix slo workload --- .github/workflows/slo_report.yml | 39 ++++++++++++++++++-------- tests/slo_workloads/key_value/drop.cpp | 26 +++++++++++++---- tests/slo_workloads/utils/utils.cpp | 3 ++ 3 files changed, 52 insertions(+), 16 deletions(-) diff --git a/.github/workflows/slo_report.yml b/.github/workflows/slo_report.yml index b2bbee5172..0ccd36abe9 100644 --- a/.github/workflows/slo_report.yml +++ b/.github/workflows/slo_report.yml @@ -12,6 +12,7 @@ jobs: runs-on: ubuntu-latest name: Publish YDB SLO Report permissions: + actions: read checks: write contents: read pull-requests: write @@ -23,20 +24,36 @@ jobs: github_run_id: ${{ github.event.workflow_run.id }} remove-slo-label: - if: github.event.workflow_run.event == 'pull_request' + needs: publish-slo-report + if: always() && github.event.workflow_run.event == 'pull_request' runs-on: ubuntu-latest name: Remove SLO Label permissions: pull-requests: write steps: - name: Remove SLO label from PR - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PRS: ${{ toJSON(github.event.workflow_run.pull_requests) }} - REPO: ${{ github.event.workflow_run.repository.full_name }} - run: | - set -euo pipefail - PR=$(jq -r '.[0].number' <<<"$PRS") - if [ "$PR" != "null" ] && [ -n "$PR" ]; then - gh pr edit "$PR" --repo "$REPO" --remove-label SLO - fi + uses: actions/github-script@v7 + with: + script: | + const pullRequests = context.payload.workflow_run.pull_requests; + if (pullRequests && pullRequests.length > 0) { + for (const pr of pullRequests) { + try { + await github.rest.issues.removeLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: pr.number, + name: 'SLO' + }); + console.log(`Removed SLO label from PR #${pr.number}`); + } catch (error) { + if (error.status === 404) { + console.log(`SLO label not found on PR #${pr.number}, skipping`); + } else { + throw error; + } + } + } + } else { + console.log('No pull requests associated with this workflow run'); + } diff --git a/tests/slo_workloads/key_value/drop.cpp b/tests/slo_workloads/key_value/drop.cpp index ae749bc903..ef3be2f8b1 100644 --- a/tests/slo_workloads/key_value/drop.cpp +++ b/tests/slo_workloads/key_value/drop.cpp @@ -4,15 +4,31 @@ using namespace NLastGetopt; using namespace NYdb; using namespace NYdb::NTable; -static void DropTable(TTableClient& client, const std::string& path) { - NYdb::NStatusHelpers::ThrowOnError(client.RetryOperationSync([path](TSession session) { - return session.DropTable(path).ExtractValueSync(); - })); +namespace { + +static bool DropTableWithRetry(TTableClient& client, const std::string& path) { + try { + RetryBackoff(client, 5, [path](TSession session) { + TStatus status = session.DropTable(path).ExtractValueSync(); + if (status.GetStatus() == EStatus::NOT_FOUND) { + return TStatus(EStatus::SUCCESS, NYdb::NIssue::TIssues()); + } + return status; + }); + return true; + } catch (const NYdb::NStatusHelpers::TYdbErrorException& e) { + Cerr << "DropTable failed after retries: " << e << Endl; + return false; + } } +} // namespace + int DropTable(TDatabaseOptions& dbOptions) { TTableClient client(dbOptions.Driver); - DropTable(client, JoinPath(dbOptions.Prefix, TableName)); + if (!DropTableWithRetry(client, JoinPath(dbOptions.Prefix, TableName))) { + return EXIT_FAILURE; + } Cout << "Table dropped." << Endl; return EXIT_SUCCESS; } diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp index f52461283b..5257ef0dbd 100644 --- a/tests/slo_workloads/utils/utils.cpp +++ b/tests/slo_workloads/utils/utils.cpp @@ -269,6 +269,9 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean int cleanupRc = cleanup(dbOptions, fakeArgc); if (!result) { result = cleanupRc; + } else if (cleanupRc) { + Cerr << "[all] Warning: cleanup failed (exit " << cleanupRc + << ") but run succeeded; ignoring cleanup exit code." << Endl; } break; } From f9e9839d937f6429b6dd70d7ff78b5b06d34224e Mon Sep 17 00:00:00 2001 From: Artem Ermoshkin Date: Fri, 29 May 2026 22:50:40 +0300 Subject: [PATCH 13/21] fix too long timeouts --- tests/slo_workloads/key_value/drop.cpp | 28 ++++++++++++++------- tests/slo_workloads/key_value/key_value.cpp | 2 ++ tests/slo_workloads/utils/metrics.cpp | 5 +++- tests/slo_workloads/utils/statistics.cpp | 10 +++++++- tests/slo_workloads/utils/statistics.h | 2 ++ 5 files changed, 36 insertions(+), 11 deletions(-) diff --git a/tests/slo_workloads/key_value/drop.cpp b/tests/slo_workloads/key_value/drop.cpp index ef3be2f8b1..9f465728c2 100644 --- a/tests/slo_workloads/key_value/drop.cpp +++ b/tests/slo_workloads/key_value/drop.cpp @@ -7,19 +7,28 @@ using namespace NYdb::NTable; namespace { static bool DropTableWithRetry(TTableClient& client, const std::string& path) { - try { - RetryBackoff(client, 5, [path](TSession session) { - TStatus status = session.DropTable(path).ExtractValueSync(); - if (status.GetStatus() == EStatus::NOT_FOUND) { + TDuration delay = TDuration::Seconds(1); + constexpr std::uint32_t kMaxAttempts = 3; + + for (std::uint32_t attempt = 1; attempt <= kMaxAttempts; ++attempt) { + TStatus status = client.RetryOperationSync([path](TSession session) { + TStatus dropStatus = session.DropTable(path).ExtractValueSync(); + if (dropStatus.GetStatus() == EStatus::NOT_FOUND) { return TStatus(EStatus::SUCCESS, NYdb::NIssue::TIssues()); } - return status; + return dropStatus; }); - return true; - } catch (const NYdb::NStatusHelpers::TYdbErrorException& e) { - Cerr << "DropTable failed after retries: " << e << Endl; - return false; + if (status.IsSuccess()) { + return true; + } + Cerr << "DropTable attempt " << attempt << " failed: " << status << Endl; + if (attempt == kMaxAttempts) { + break; + } + Sleep(delay); + delay = Min(delay * 2, TDuration::Seconds(8)); } + return false; } } // namespace @@ -27,6 +36,7 @@ static bool DropTableWithRetry(TTableClient& client, const std::string& path) { int DropTable(TDatabaseOptions& dbOptions) { TTableClient client(dbOptions.Driver); if (!DropTableWithRetry(client, JoinPath(dbOptions.Prefix, TableName))) { + Cerr << "DropTable failed after all retries." << Endl; return EXIT_FAILURE; } Cout << "Table dropped." << Endl; diff --git a/tests/slo_workloads/key_value/key_value.cpp b/tests/slo_workloads/key_value/key_value.cpp index 1cb40f3620..23bd3ff184 100644 --- a/tests/slo_workloads/key_value/key_value.cpp +++ b/tests/slo_workloads/key_value/key_value.cpp @@ -50,6 +50,7 @@ int DoCreate(TDatabaseOptions& dbOptions, int argc, char** argv) { jobs->Start(); jobs->Wait(); jobs->ShowProgress(); + jobs.reset(); return EXIT_SUCCESS; } @@ -95,6 +96,7 @@ int DoRun(TDatabaseOptions& dbOptions, int argc, char** argv) { Cout << "All jobs finished: " << TInstant::Now().ToRfc822StringLocal() << Endl; jobs->ShowProgress(); + jobs.reset(); return EXIT_SUCCESS; } diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp index 4506c1ca39..30a2ee19ca 100644 --- a/tests/slo_workloads/utils/metrics.cpp +++ b/tests/slo_workloads/utils/metrics.cpp @@ -109,7 +109,7 @@ class TOtelMetricsPusher : public IMetricsPusher { opentelemetry::sdk::metrics::PeriodicExportingMetricReaderOptions readerOptions; readerOptions.export_interval_millis = 1000ms; - readerOptions.export_timeout_millis = 900ms; + readerOptions.export_timeout_millis = 500ms; auto metricReader = opentelemetry::sdk::metrics::PeriodicExportingMetricReaderFactory::Create(std::move(exporter), readerOptions); @@ -132,6 +132,9 @@ class TOtelMetricsPusher : public IMetricsPusher { if (PublisherThread_.joinable()) { PublisherThread_.join(); } + if (MeterProvider_) { + MeterProvider_->Shutdown(std::chrono::seconds(3)); + } } void PushRequestData(const TRequestData& requestData) override { diff --git a/tests/slo_workloads/utils/statistics.cpp b/tests/slo_workloads/utils/statistics.cpp index 15789e7620..dddc35c56a 100644 --- a/tests/slo_workloads/utils/statistics.cpp +++ b/tests/slo_workloads/utils/statistics.cpp @@ -36,6 +36,11 @@ TStat::TStat(const std::optional& metricsPushUrl, const std::string MetricsPushQueue.Start(20); } +TStat::~TStat() { + MetricsPushQueue.Stop(); + MetricsPusher.reset(); +} + void TStat::Start() { StartTime = TInstant::Now(); } @@ -71,9 +76,12 @@ void TStat::FinishRequest(const std::shared_ptr& unit, const TFinalSt } ScheduleMetricsPush([this, delay, status, unit]() { + NYdb::EStatus requestStatus = status + ? status->GetStatus() + : NYdb::EStatus::CLIENT_DEADLINE_EXCEEDED; MetricsPusher->PushRequestData({ .Delay = delay, - .Status = status->GetStatus(), + .Status = requestStatus, .RetryAttempts = unit->RetryAttempts }); }); diff --git a/tests/slo_workloads/utils/statistics.h b/tests/slo_workloads/utils/statistics.h index 9e7a092430..c61a267db4 100644 --- a/tests/slo_workloads/utils/statistics.h +++ b/tests/slo_workloads/utils/statistics.h @@ -62,6 +62,8 @@ class TStat { TInstant GetStartTime() const; + ~TStat(); + private: void ScheduleMetricsPush(std::function func); From cce12fbebda26450cf493d0dcd220f019d3d58c0 Mon Sep 17 00:00:00 2001 From: Artem Ermoshkin Date: Sat, 30 May 2026 17:48:25 +0300 Subject: [PATCH 14/21] remove slop manyal retry, some cli flags and fix double shutdown --- tests/slo_workloads/key_value/create.cpp | 4 +- tests/slo_workloads/key_value/drop.cpp | 41 ++---- tests/slo_workloads/utils/executor.cpp | 160 +++-------------------- tests/slo_workloads/utils/executor.h | 46 +------ tests/slo_workloads/utils/metrics.cpp | 5 +- tests/slo_workloads/utils/utils.cpp | 5 - tests/slo_workloads/utils/utils.h | 25 ---- 7 files changed, 32 insertions(+), 254 deletions(-) diff --git a/tests/slo_workloads/key_value/create.cpp b/tests/slo_workloads/key_value/create.cpp index a79db76a12..a105caba7f 100644 --- a/tests/slo_workloads/key_value/create.cpp +++ b/tests/slo_workloads/key_value/create.cpp @@ -8,7 +8,7 @@ using namespace NYdb::NTable; namespace { void CreateTable(TTableClient& client, const std::string& prefix) { - RetryBackoff(client, 5, [prefix](TSession session) { + NYdb::NStatusHelpers::ThrowOnError(client.RetryOperationSync([prefix](TSession session) { auto desc = TTableBuilder() .AddNullableColumn("object_id_key", EPrimitiveType::Uint32) .AddNullableColumn("object_id", EPrimitiveType::Uint32) @@ -27,7 +27,7 @@ namespace { , std::move(desc) , std::move(tableSettings) ).ExtractValueSync(); - }); + })); } } //namespace diff --git a/tests/slo_workloads/key_value/drop.cpp b/tests/slo_workloads/key_value/drop.cpp index 9f465728c2..e22c92c894 100644 --- a/tests/slo_workloads/key_value/drop.cpp +++ b/tests/slo_workloads/key_value/drop.cpp @@ -4,39 +4,18 @@ using namespace NLastGetopt; using namespace NYdb; using namespace NYdb::NTable; -namespace { - -static bool DropTableWithRetry(TTableClient& client, const std::string& path) { - TDuration delay = TDuration::Seconds(1); - constexpr std::uint32_t kMaxAttempts = 3; - - for (std::uint32_t attempt = 1; attempt <= kMaxAttempts; ++attempt) { - TStatus status = client.RetryOperationSync([path](TSession session) { - TStatus dropStatus = session.DropTable(path).ExtractValueSync(); - if (dropStatus.GetStatus() == EStatus::NOT_FOUND) { - return TStatus(EStatus::SUCCESS, NYdb::NIssue::TIssues()); - } - return dropStatus; - }); - if (status.IsSuccess()) { - return true; - } - Cerr << "DropTable attempt " << attempt << " failed: " << status << Endl; - if (attempt == kMaxAttempts) { - break; - } - Sleep(delay); - delay = Min(delay * 2, TDuration::Seconds(8)); - } - return false; -} - -} // namespace - int DropTable(TDatabaseOptions& dbOptions) { TTableClient client(dbOptions.Driver); - if (!DropTableWithRetry(client, JoinPath(dbOptions.Prefix, TableName))) { - Cerr << "DropTable failed after all retries." << Endl; + const std::string path = JoinPath(dbOptions.Prefix, TableName); + TStatus status = client.RetryOperationSync([path](TSession session) { + TStatus dropStatus = session.DropTable(path).ExtractValueSync(); + if (dropStatus.GetStatus() == EStatus::NOT_FOUND) { + return TStatus(EStatus::SUCCESS, NYdb::NIssue::TIssues()); + } + return dropStatus; + }); + if (!status.IsSuccess()) { + Cerr << "DropTable failed: " << status << Endl; return EXIT_FAILURE; } Cout << "Table dropped." << Endl; diff --git a/tests/slo_workloads/utils/executor.cpp b/tests/slo_workloads/utils/executor.cpp index 28add35c98..aa0a637905 100644 --- a/tests/slo_workloads/utils/executor.cpp +++ b/tests/slo_workloads/utils/executor.cpp @@ -16,62 +16,17 @@ TInsistentClient::TInsistentClient(const TCommonOptions& opts) .AllowRequestMigration(true) ) , ClientMaxRetries(opts.MaxRetries) - , Timeout(opts.ReactionTime) - , RetryTimeout(Timeout / 2) - , SessionTimeout(Timeout + ReactionTimeDelay) - , UseApplicationTimeout(opts.UseApplicationTimeout) - , SendPreventiveRequest(opts.SendPreventiveRequest) + , SessionTimeout(opts.ReactionTime + ReactionTimeDelay) { - if (UseApplicationTimeout || SendPreventiveRequest) { - CallbackQueue.Start(opts.MaxCallbackThreads); - // Thread that executes timeout callbacks - auto threadFunc = [this]() { - TDuration timeToSleep; - while (!ShouldStop.WaitT(timeToSleep)) { - TInstant wakeupTime; - TInstant now; - with_lock(CallbacksLock) { - now = TInstant::Now(); - while (!TimeoutCallbacks.empty() && now >= TimeoutCallbacks.front().ExecucionTime) { - Y_UNUSED(CallbackQueue.AddFunc(TimeoutCallbacks.front().Callback)); - RemoveTimeoutIter(TimeoutCallbacks.front().context); - } - while (!RetryCallbacks.empty() && now >= RetryCallbacks.front().ExecucionTime) { - Y_UNUSED(CallbackQueue.AddFunc(RetryCallbacks.front().Callback)); - RemoveRetryIter(RetryCallbacks.front().context); - } - if (RetryCallbacks.empty()) { - wakeupTime = now + RetryTimeout; - } else { - wakeupTime = RetryCallbacks.front().ExecucionTime; - } - if (!TimeoutCallbacks.empty()) { - wakeupTime = Min(wakeupTime, TimeoutCallbacks.front().ExecucionTime); - } - } - timeToSleep = wakeupTime - now; - } - }; - WorkThread.reset(SystemThreadFactory()->Run(threadFunc).Release()); - } } TInsistentClient::~TInsistentClient() { - ShouldStop.Signal(); - if (UseApplicationTimeout || SendPreventiveRequest) { - if (WorkThread) { - WorkThread->Join(); - } else { - Cerr << (TStringBuilder() << "TInsistentClient::~TINsistentClient Error: WorkThread is not running." << Endl); - } - CallbackQueue.Stop(); - } Client.Stop().Wait(WaitTimeout); } void TInsistentClient::Report(TStringBuilder& out) const { - out << "Client retries sent: total " << CounterSStart.load() - << ", successful " << CounterSOk.load() << Endl; + out << "Operations dispatched: " << CounterStart.load() + << ", succeeded: " << CounterOk.load() << Endl; } std::uint64_t TInsistentClient::GetActiveSessions() const { @@ -79,112 +34,27 @@ std::uint64_t TInsistentClient::GetActiveSessions() const { return static_cast(sessions); } -void TInsistentClient::ClearContext(std::shared_ptr& context) { - if (SendPreventiveRequest) { - RemoveRetryIter(context); - } - if (UseApplicationTimeout) { - RemoveTimeoutIter(context); - } -} - -void TInsistentClient::RemoveRetryIter(std::shared_ptr& context) { - if (context->RetryIter.Valid) { - context->RetryIter.Valid = false; - RetryCallbacks.erase(context->RetryIter.RealIter); - } -} - -void TInsistentClient::RemoveTimeoutIter(std::shared_ptr& context) { - if (context->TimeoutIter.Valid) { - context->TimeoutIter.Valid = false; - TimeoutCallbacks.erase(context->TimeoutIter.RealIter); - } -} - TAsyncFinalStatus TInsistentClient::ExecuteWithRetry(const NYdb::NTable::TTableClient::TOperationFunc& operation) { TTracedPromise promise = TTracedPromise( NThreading::NewPromise(), &ExecutorPromises ); - std::shared_ptr context = std::make_shared(); - auto launchOperation = [this, operation, promise, context](bool firstTime) mutable { - with_lock(context->Lock) { - if (context->Finished) { - return; - } - } - auto callback = [promise, context, firstTime, this](const NYdb::TAsyncStatus& future) mutable { - Y_ABORT_UNLESS(future.HasValue()); - // Not setting promise under lock to avoid deadlock - bool firstCallback = false; - with_lock(context->Lock) { - if (!context->Finished) { - context->Finished = true; - firstCallback = true; - } - } - if (firstCallback) { - promise.SetValue(future.GetValue()); - with_lock(CallbacksLock) { - if (firstTime) { - CounterFOk.fetch_add(1); - } else { - CounterSOk.fetch_add(1); - } - ClearContext(context); - } - } - }; - if (firstTime) { - CounterFStart.fetch_add(1); - } else { - CounterSStart.fetch_add(1); - } - NYdb::NTable::TRetryOperationSettings settings; - settings.MaxRetries(ClientMaxRetries); - settings.GetSessionClientTimeout(SessionTimeout); - auto future = Client.RetryOperation(operation, settings); - future.Subscribe(std::move(callback)); - }; + CounterStart.fetch_add(1); - with_lock(CallbacksLock) { - TInstant now = TInstant::Now(); + NYdb::NTable::TRetryOperationSettings settings; + settings.MaxRetries(ClientMaxRetries); + settings.GetSessionClientTimeout(SessionTimeout); - if (SendPreventiveRequest) { - auto onRetryTimeout = [launchOperation]() mutable { - launchOperation(false); - }; - - RetryCallbacks.push_back({ now + RetryTimeout, onRetryTimeout, context }); - context->RetryIter = { --RetryCallbacks.end() }; - } - - if (UseApplicationTimeout) { - auto onTimeout = [this, promise, context]() mutable { - // Not setting promise under lock to avoid deadlock - bool firstCallback = false; - with_lock(context->Lock) { - if (!context->Finished) { - context->Finished = true; - firstCallback = true; - } - } - if (firstCallback) { - promise.SetValue(TFinalStatus()); - with_lock(CallbacksLock) { - ClearContext(context); - } - } - }; - - TimeoutCallbacks.push_back({ now + Timeout, onTimeout, context }); - context->TimeoutIter = { --TimeoutCallbacks.end() }; + auto future = Client.RetryOperation(operation, settings); + future.Subscribe([promise, this](const NYdb::TAsyncStatus& f) mutable { + Y_ABORT_UNLESS(f.HasValue()); + const auto& status = f.GetValue(); + if (status.IsSuccess()) { + CounterOk.fetch_add(1); } - } - - launchOperation(true); + promise.SetValue(status); + }); return promise.GetFuture(); } diff --git a/tests/slo_workloads/utils/executor.h b/tests/slo_workloads/utils/executor.h index 69ed261a35..c20df89d6d 100644 --- a/tests/slo_workloads/utils/executor.h +++ b/tests/slo_workloads/utils/executor.h @@ -50,26 +50,6 @@ class TTracedPromise : public NThreading::TPromise { class TInsistentClient { public: - struct TDelayedCallback; - - struct TCheckedIterator { - std::list::iterator RealIter; - bool Valid = true; - }; - - struct TOperationContext { - bool Finished = false; - TAdaptiveLock Lock; - TCheckedIterator RetryIter; - TCheckedIterator TimeoutIter; - }; - - struct TDelayedCallback { - TInstant ExecucionTime; - std::function Callback; - std::shared_ptr context; - }; - TInsistentClient(const TCommonOptions& opts); ~TInsistentClient(); void Report(TStringBuilder& out) const; @@ -77,32 +57,12 @@ class TInsistentClient { std::uint64_t GetActiveSessions() const; private: - void ClearContext(std::shared_ptr& context); - void RemoveRetryIter(std::shared_ptr& context); - void RemoveTimeoutIter(std::shared_ptr& context); - - TThreadPool CallbackQueue; NYdb::NTable::TTableClient Client; std::uint32_t ClientMaxRetries; - TDuration Timeout; - TDuration RetryTimeout; TDuration SessionTimeout; - TAdaptiveLock CallbacksLock; - std::unique_ptr WorkThread; - TManualEvent ShouldStop; - std::list RetryCallbacks; - std::list TimeoutCallbacks; - bool UseApplicationTimeout; - bool SendPreventiveRequest; - - // Ok received on the First try - std::atomic CounterFOk = 0; - // Ok received on the Second try - std::atomic CounterSOk = 0; - // First try launches (= total) - std::atomic CounterFStart = 0; - // Second try launches - std::atomic CounterSStart = 0; + + std::atomic CounterStart = 0; + std::atomic CounterOk = 0; }; class TExecutor { diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp index 30a2ee19ca..27a257c104 100644 --- a/tests/slo_workloads/utils/metrics.cpp +++ b/tests/slo_workloads/utils/metrics.cpp @@ -132,9 +132,8 @@ class TOtelMetricsPusher : public IMetricsPusher { if (PublisherThread_.joinable()) { PublisherThread_.join(); } - if (MeterProvider_) { - MeterProvider_->Shutdown(std::chrono::seconds(3)); - } + // MeterProvider destructor calls Shutdown(); do not call it explicitly + // here — the OTel SDK rejects a second Shutdown with a warning. } void PushRequestData(const TRequestData& requestData) override { diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp index 5257ef0dbd..320e53b700 100644 --- a/tests/slo_workloads/utils/utils.cpp +++ b/tests/slo_workloads/utils/utils.cpp @@ -515,11 +515,6 @@ void ParseOptionsCommon(TOpts& opts, TCommonOptions& options) { .SetFlag(&options.DontPushMetrics).DefaultValue(options.DontPushMetrics); opts.AddLongOption("metrics-push-url", "URL to push metrics").RequiredArgument("URL") .DefaultValue(options.MetricsPushUrl).StoreResult(&options.MetricsPushUrl); - opts.AddLongOption("app-timeout", "Use application timeout (over SDK)").NoArgument() - .SetFlag(&options.UseApplicationTimeout).DefaultValue(options.UseApplicationTimeout); - opts.AddLongOption("prevention-request", "Send prevention request at 1/2 of timeout").NoArgument() - .SetFlag(&options.SendPreventiveRequest).DefaultValue(options.SendPreventiveRequest); - opts.MutuallyExclusive("dont-push", "metrics-push-url"); } diff --git a/tests/slo_workloads/utils/utils.h b/tests/slo_workloads/utils/utils.h index 3eb3c48978..28576adaad 100644 --- a/tests/slo_workloads/utils/utils.h +++ b/tests/slo_workloads/utils/utils.h @@ -49,8 +49,6 @@ struct TCommonOptions { std::uint32_t MaxRetries = 50; TDuration ReactionTime = DefaultReactionTime; bool StopOnError = false; - bool UseApplicationTimeout = false; - bool SendPreventiveRequest = false; // Generator options: std::uint32_t MinLength = 20; @@ -118,29 +116,6 @@ ECommandType ParseCommand(const char* cmd); std::string JoinPath(const std::string& prefix, const std::string& path); -inline void RetryBackoff( - NYdb::NTable::TTableClient& client, - std::uint32_t retries, - const NYdb::NTable::TTableClient::TOperationSyncFunc& func -) { - TDuration delay = TDuration::Seconds(5); - while (retries) { - NYdb::TStatus status = client.RetryOperationSync(func); - if (status.IsSuccess()) { - return; - } - --retries; - if (!retries) { - Cerr << "Create request failed after all retries." << Endl; - Cerr << status << Endl; - NYdb::NStatusHelpers::ThrowOnError(status); - } - Cerr << "Create request failed. Sleeping for " << delay << Endl; - Sleep(delay); - delay *= 2; - } -} - std::string GenerateRandomString(std::uint32_t minLength, std::uint32_t maxLength); NYdb::TParams PackValuesToParamsAsList(const std::vector& items, const std::string name = "$items"); From 8c4e579a651aea827b5d99293ebc9e5226cb09b0 Mon Sep 17 00:00:00 2001 From: Artem Ermoshkin Date: Sat, 30 May 2026 18:26:15 +0300 Subject: [PATCH 15/21] fix double MeterProvider --- tests/slo_workloads/utils/metrics.cpp | 257 ++++++++++++++++++-------- 1 file changed, 177 insertions(+), 80 deletions(-) diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp index 27a257c104..b89f8f3b48 100644 --- a/tests/slo_workloads/utils/metrics.cpp +++ b/tests/slo_workloads/utils/metrics.cpp @@ -19,8 +19,8 @@ #include #include #include -#include - +#include +#include using namespace std::chrono_literals; @@ -35,10 +35,8 @@ std::string ResolveWorkloadRef() { return ref.empty() ? "unknown" : ref; } -// Minimal thread-safe wrapper around hdr_histogram for a single -// (operation_type, operation_status="success") series. Only successful -// latencies are recorded; errors are excluded from the percentile stream -// per deploy/metrics.yaml. +// Thread-safe HDR histogram. Only successful latencies are recorded; errors +// are excluded from the percentile stream (operation_status="success"). class TLatencyRecorder { public: TLatencyRecorder() { @@ -59,8 +57,6 @@ class TLatencyRecorder { hdr_record_value(Histogram_.get(), ns); } - // Returns p50/p95/p99 as seconds and resets the recorder window so - // gauges reflect only the most recent interval. struct TPercentiles { double P50 = 0.0; double P95 = 0.0; @@ -68,6 +64,11 @@ class TLatencyRecorder { bool HasData = false; }; + // Snapshot all three percentiles from one consistent HDR state and reset + // the window — so each export cycle's gauge reflects only the last + // interval's latencies. Reading p50/p95/p99 in one critical section + // matches the Java workload's batch-callback pattern (avoids the race + // where p99 would observe a histogram already reset by p50). TPercentiles SnapshotAndReset() { TPercentiles out; std::lock_guard lock(Mutex_); @@ -91,11 +92,14 @@ class TLatencyRecorder { std::unique_ptr Histogram_; }; -class TOtelMetricsPusher : public IMetricsPusher { +// Process-wide pusher: ONE MeterProvider with one OTLP exporter shared by +// all operation types. Publishing duplicate MeterProviders against the same +// Prometheus endpoint produces racing `target_info` writes for the same +// resource label set, which Prometheus rejects as `out of order sample`. +class TOtelSharedPusher { public: - TOtelMetricsPusher(const std::string& metricsPushUrl, const std::string& operationType) - : OperationType_(operationType) - , Ref_(ResolveWorkloadRef()) + explicit TOtelSharedPusher(const std::string& metricsPushUrl) + : Ref_(ResolveWorkloadRef()) , CommonAttributes_{ {"ref", Ref_}, {"sdk", "cpp"}, @@ -111,11 +115,13 @@ class TOtelMetricsPusher : public IMetricsPusher { readerOptions.export_interval_millis = 1000ms; readerOptions.export_timeout_millis = 500ms; - auto metricReader = opentelemetry::sdk::metrics::PeriodicExportingMetricReaderFactory::Create(std::move(exporter), readerOptions); + auto metricReader = opentelemetry::sdk::metrics::PeriodicExportingMetricReaderFactory::Create( + std::move(exporter), readerOptions); auto context = std::make_unique( std::unique_ptr(new opentelemetry::sdk::metrics::ViewRegistry()), - opentelemetry::sdk::resource::Resource::Create(opentelemetry::common::MakeKeyValueIterableView(CommonAttributes_)) + opentelemetry::sdk::resource::Resource::Create( + opentelemetry::common::MakeKeyValueIterableView(CommonAttributes_)) ); MeterProvider_ = opentelemetry::sdk::metrics::MeterProviderFactory::Create(std::move(context)); @@ -124,96 +130,159 @@ class TOtelMetricsPusher : public IMetricsPusher { Meter_ = MeterProvider_->GetMeter("slo_workloads", NYdb::GetSdkSemver()); InitMetrics(); - StartPercentilePublisher(); } - ~TOtelMetricsPusher() override { - PublisherShouldStop_.store(true); - if (PublisherThread_.joinable()) { - PublisherThread_.join(); - } + ~TOtelSharedPusher() { + // Remove observable-gauge callbacks before MeterProvider tears down + // the readers, so a final collection in flight cannot see this object + // half-destroyed. + if (LatencyP50_) LatencyP50_->RemoveCallback(&TOtelSharedPusher::ObserveP50, this); + if (LatencyP95_) LatencyP95_->RemoveCallback(&TOtelSharedPusher::ObserveP95, this); + if (LatencyP99_) LatencyP99_->RemoveCallback(&TOtelSharedPusher::ObserveP99, this); // MeterProvider destructor calls Shutdown(); do not call it explicitly // here — the OTel SDK rejects a second Shutdown with a warning. } - void PushRequestData(const TRequestData& requestData) override { - const bool success = requestData.Status == NYdb::EStatus::SUCCESS; + void Record(const std::string& operationType, const TRequestData& data) { + const bool success = data.Status == NYdb::EStatus::SUCCESS; const std::string status = success ? "success" : "error"; OperationsTotal_->Add(uint64_t{1}, MergeAttributes({ - {"operation_type", OperationType_}, + {"operation_type", operationType}, {"operation_status", status}, })); // sdk_retry_attempts_total = total number of technical attempts - // including the first one. TStatUnit counts only post-first attempts, - // so add 1 to include the initial attempt. - RetryAttemptsTotal_->Add(requestData.RetryAttempts + 1, - MergeAttributes({ - {"operation_type", OperationType_}, - }) - ); + // including the first one. RetryAttempts counts only post-first + // attempts, so add 1 to include the initial attempt. + RetryAttemptsTotal_->Add(data.RetryAttempts + 1, MergeAttributes({ + {"operation_type", operationType}, + })); if (success) { - Latency_.Record(requestData.Delay); + GetOrCreateSeries(operationType).Recorder.Record(data.Delay); } } private: + struct TSeries { + TLatencyRecorder Recorder; + // Cached snapshot, refreshed by the callback that runs first per + // export cycle (ObserveP50). All three callbacks read from these + // atomics so p50/p95/p99 land in the same export with consistent + // values from one HDR snapshot. + std::atomic P50{0.0}; + std::atomic P95{0.0}; + std::atomic P99{0.0}; + std::atomic HasData{false}; + }; + + TSeries& GetOrCreateSeries(const std::string& op) { + { + std::shared_lock lock(SeriesMutex_); + auto it = Series_.find(op); + if (it != Series_.end()) { + return *it->second; + } + } + std::unique_lock lock(SeriesMutex_); + auto& slot = Series_[op]; + if (!slot) { + slot = std::make_unique(); + } + return *slot; + } + + std::map MergeAttributes( + const std::map& metricAttrs) const + { + std::map result = CommonAttributes_; + result.insert(metricAttrs.begin(), metricAttrs.end()); + return result; + } + void InitMetrics() { OperationsTotal_ = Meter_->CreateUInt64Counter("sdk_operations_total", - "Total number of operations, categorized by operation type and status." - ); - + "Total number of operations, categorized by operation type and status."); RetryAttemptsTotal_ = Meter_->CreateUInt64Counter("sdk_retry_attempts_total", - "Total number of retry attempts (including the first attempt), categorized by operation type." - ); - - LatencyP50_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p50_seconds", - "P50 latency of successful operations in seconds.", "s" - ); - LatencyP95_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p95_seconds", - "P95 latency of successful operations in seconds.", "s" - ); - LatencyP99_ = Meter_->CreateDoubleGauge("sdk_operation_latency_p99_seconds", - "P99 latency of successful operations in seconds.", "s" - ); + "Total number of retry attempts (including the first attempt), categorized by operation type."); + + LatencyP50_ = Meter_->CreateDoubleObservableGauge( + "sdk_operation_latency_p50_seconds", + "P50 latency of successful operations in seconds.", "s"); + LatencyP95_ = Meter_->CreateDoubleObservableGauge( + "sdk_operation_latency_p95_seconds", + "P95 latency of successful operations in seconds.", "s"); + LatencyP99_ = Meter_->CreateDoubleObservableGauge( + "sdk_operation_latency_p99_seconds", + "P99 latency of successful operations in seconds.", "s"); + + LatencyP50_->AddCallback(&TOtelSharedPusher::ObserveP50, this); + LatencyP95_->AddCallback(&TOtelSharedPusher::ObserveP95, this); + LatencyP99_->AddCallback(&TOtelSharedPusher::ObserveP99, this); } - void StartPercentilePublisher() { - PublisherThread_ = std::thread([this]() { - while (!PublisherShouldStop_.load(std::memory_order_relaxed)) { - std::this_thread::sleep_for(1s); - PublishPercentiles(); + // ObserveP50 runs first per export cycle, snapshots+resets each series' + // HDR once, and caches p50/p95/p99 in the series atomics for the + // subsequent two callbacks to read. This mirrors the Java workload's + // single-snapshot strategy without requiring a batch-callback API. + static void ObserveP50(opentelemetry::metrics::ObserverResult result, void* state) { + auto* self = static_cast(state); + auto obs = opentelemetry::nostd::get< + opentelemetry::nostd::shared_ptr>>(result); + + std::shared_lock lock(self->SeriesMutex_); + for (const auto& [op, series] : self->Series_) { + auto snap = series->Recorder.SnapshotAndReset(); + if (snap.HasData) { + series->P50.store(snap.P50); + series->P95.store(snap.P95); + series->P99.store(snap.P99); + series->HasData.store(true); + } else { + series->HasData.store(false); } - // Final flush before exit. - PublishPercentiles(); - }); + if (!series->HasData.load()) { + continue; + } + auto attrs = self->MergeAttributes({ + {"operation_type", op}, + {"operation_status", "success"}, + }); + obs->Observe(series->P50.load(), + opentelemetry::common::MakeKeyValueIterableView(attrs)); + } } - void PublishPercentiles() { - auto snapshot = Latency_.SnapshotAndReset(); - auto attrs = MergeAttributes({ - {"operation_type", OperationType_}, - {"operation_status", "success"}, - }); - // When no successful ops landed in the last second, publish 0.0 - // for all percentiles so the gauges reset with the HDR window - // rather than appearing "stuck" at the last non-empty value (the - // OTel periodic exporter would otherwise re-emit the previous - // Record() value on every collection cycle). - LatencyP50_->Record(snapshot.HasData ? snapshot.P50 : 0.0, attrs); - LatencyP95_->Record(snapshot.HasData ? snapshot.P95 : 0.0, attrs); - LatencyP99_->Record(snapshot.HasData ? snapshot.P99 : 0.0, attrs); + static void ObserveP95(opentelemetry::metrics::ObserverResult result, void* state) { + ObserveCached(result, state, &TSeries::P95); } - std::map MergeAttributes(const std::map& metricAttrs) const { - std::map result = CommonAttributes_; - result.insert(metricAttrs.begin(), metricAttrs.end()); - return result; + static void ObserveP99(opentelemetry::metrics::ObserverResult result, void* state) { + ObserveCached(result, state, &TSeries::P99); + } + + static void ObserveCached(opentelemetry::metrics::ObserverResult result, void* state, + std::atomic TSeries::*field) + { + auto* self = static_cast(state); + auto obs = opentelemetry::nostd::get< + opentelemetry::nostd::shared_ptr>>(result); + + std::shared_lock lock(self->SeriesMutex_); + for (const auto& [op, series] : self->Series_) { + if (!series->HasData.load()) { + continue; + } + auto attrs = self->MergeAttributes({ + {"operation_type", op}, + {"operation_status", "success"}, + }); + obs->Observe((series.get()->*field).load(), + opentelemetry::common::MakeKeyValueIterableView(attrs)); + } } - std::string OperationType_; std::string Ref_; std::map CommonAttributes_; @@ -222,13 +291,41 @@ class TOtelMetricsPusher : public IMetricsPusher { std::unique_ptr> OperationsTotal_; std::unique_ptr> RetryAttemptsTotal_; - std::unique_ptr> LatencyP50_; - std::unique_ptr> LatencyP95_; - std::unique_ptr> LatencyP99_; + std::shared_ptr LatencyP50_; + std::shared_ptr LatencyP95_; + std::shared_ptr LatencyP99_; + + std::shared_mutex SeriesMutex_; + std::unordered_map> Series_; +}; + +std::mutex g_sharedMu; +std::weak_ptr g_shared; + +std::shared_ptr GetOrCreateSharedPusher(const std::string& url) { + std::lock_guard lock(g_sharedMu); + auto sp = g_shared.lock(); + if (!sp) { + sp = std::make_shared(url); + g_shared = sp; + } + return sp; +} + +class TOtelMetricsPusherWrapper : public IMetricsPusher { +public: + TOtelMetricsPusherWrapper(std::shared_ptr shared, std::string operationType) + : Shared_(std::move(shared)) + , OperationType_(std::move(operationType)) + {} + + void PushRequestData(const TRequestData& requestData) override { + Shared_->Record(OperationType_, requestData); + } - TLatencyRecorder Latency_; - std::thread PublisherThread_; - std::atomic PublisherShouldStop_{false}; +private: + std::shared_ptr Shared_; + std::string OperationType_; }; class TNoopMetricsPusher : public IMetricsPusher { @@ -239,7 +336,7 @@ class TNoopMetricsPusher : public IMetricsPusher { } // namespace std::unique_ptr CreateOtelMetricsPusher(const std::string& metricsPushUrl, const std::string& operationType) { - return std::make_unique(metricsPushUrl, operationType); + return std::make_unique(GetOrCreateSharedPusher(metricsPushUrl), operationType); } std::unique_ptr CreateNoopMetricsPusher() { From 2c193bcc180220a386eaf239fd755b84b988ad61 Mon Sep 17 00:00:00 2001 From: Artem Ermoshkin Date: Sat, 30 May 2026 18:49:38 +0300 Subject: [PATCH 16/21] do not fail while cleanup --- tests/slo_workloads/utils/utils.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp index 320e53b700..c258e95c81 100644 --- a/tests/slo_workloads/utils/utils.cpp +++ b/tests/slo_workloads/utils/utils.cpp @@ -267,11 +267,17 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean } Cout << "[all] Launching cleanup command..." << Endl; int cleanupRc = cleanup(dbOptions, fakeArgc); - if (!result) { - result = cleanupRc; - } else if (cleanupRc) { + // Cleanup runs while chaos-monkey is still killing nodes, so a + // DropTable failure here is expected noise and must not mask a + // successful run. Surface the run's status; only fall back to + // the cleanup status when run itself failed and we have nothing + // else to report. + if (cleanupRc && !result) { Cerr << "[all] Warning: cleanup failed (exit " << cleanupRc << ") but run succeeded; ignoring cleanup exit code." << Endl; + } else if (cleanupRc) { + Cerr << "[all] Warning: cleanup failed (exit " << cleanupRc + << "); preserving earlier run failure." << Endl; } break; } From 71d1123deebf26202b298cd41ef7dd3e69d2d946 Mon Sep 17 00:00:00 2001 From: Artem Ermoshkin Date: Sat, 30 May 2026 19:25:20 +0300 Subject: [PATCH 17/21] fix the report comment workload --- .github/workflows/slo.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml index db9e4d9841..fe31086f89 100644 --- a/.github/workflows/slo.yml +++ b/.github/workflows/slo.yml @@ -139,6 +139,9 @@ jobs: # source COPY, so only the actual workload link step reruns (~3 min). - name: Build current workload image uses: docker/build-push-action@v6 + env: + DOCKER_BUILD_SUMMARY: "false" + DOCKER_BUILD_RECORD_UPLOAD: "false" with: context: sdk-current file: sdk-current/tests/slo_workloads/Dockerfile @@ -153,6 +156,9 @@ jobs: id: baseline-build continue-on-error: true uses: docker/build-push-action@v6 + env: + DOCKER_BUILD_SUMMARY: "false" + DOCKER_BUILD_RECORD_UPLOAD: "false" with: context: sdk-baseline file: sdk-baseline/tests/slo_workloads/Dockerfile From d832f6a658175c3a6dec0a53f80f268ada4d5a14 Mon Sep 17 00:00:00 2001 From: Artem Ermoshkin Date: Sat, 30 May 2026 20:50:29 +0300 Subject: [PATCH 18/21] fix manual report --- .github/workflows/slo_report.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/slo_report.yml b/.github/workflows/slo_report.yml index 0ccd36abe9..3ae91d6712 100644 --- a/.github/workflows/slo_report.yml +++ b/.github/workflows/slo_report.yml @@ -5,10 +5,16 @@ on: workflows: ["SLO"] types: - completed + workflow_dispatch: + inputs: + slo_run_id: + description: "Run ID of the SLO workflow whose artifacts to report on" + required: true + type: string jobs: publish-slo-report: - if: github.event.workflow_run.conclusion == 'success' + if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' runs-on: ubuntu-latest name: Publish YDB SLO Report permissions: @@ -21,11 +27,11 @@ jobs: uses: ydb-platform/ydb-slo-action/report@v2 with: github_token: ${{ secrets.GITHUB_TOKEN }} - github_run_id: ${{ github.event.workflow_run.id }} + github_run_id: ${{ inputs.slo_run_id || github.event.workflow_run.id }} remove-slo-label: needs: publish-slo-report - if: always() && github.event.workflow_run.event == 'pull_request' + if: always() && github.event_name == 'workflow_run' && github.event.workflow_run.event == 'pull_request' runs-on: ubuntu-latest name: Remove SLO Label permissions: From f290f84ff18fb253acccfcf03d6a9bb0c550fcf9 Mon Sep 17 00:00:00 2001 From: Artem Ermoshkin Date: Sun, 31 May 2026 00:15:16 +0300 Subject: [PATCH 19/21] remove testing action --- .github/workflows/slo_report.yml | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/.github/workflows/slo_report.yml b/.github/workflows/slo_report.yml index 3ae91d6712..0ccd36abe9 100644 --- a/.github/workflows/slo_report.yml +++ b/.github/workflows/slo_report.yml @@ -5,16 +5,10 @@ on: workflows: ["SLO"] types: - completed - workflow_dispatch: - inputs: - slo_run_id: - description: "Run ID of the SLO workflow whose artifacts to report on" - required: true - type: string jobs: publish-slo-report: - if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' + if: github.event.workflow_run.conclusion == 'success' runs-on: ubuntu-latest name: Publish YDB SLO Report permissions: @@ -27,11 +21,11 @@ jobs: uses: ydb-platform/ydb-slo-action/report@v2 with: github_token: ${{ secrets.GITHUB_TOKEN }} - github_run_id: ${{ inputs.slo_run_id || github.event.workflow_run.id }} + github_run_id: ${{ github.event.workflow_run.id }} remove-slo-label: needs: publish-slo-report - if: always() && github.event_name == 'workflow_run' && github.event.workflow_run.event == 'pull_request' + if: always() && github.event.workflow_run.event == 'pull_request' runs-on: ubuntu-latest name: Remove SLO Label permissions: From b18a8edb0edfc364d8b24578706965489020c672 Mon Sep 17 00:00:00 2001 From: Artem Ermoshkin Date: Sun, 31 May 2026 00:43:07 +0300 Subject: [PATCH 20/21] fix review issues --- tests/slo_workloads/utils/metrics.cpp | 129 +++++++++++++------------- tests/slo_workloads/utils/utils.cpp | 5 +- 2 files changed, 68 insertions(+), 66 deletions(-) diff --git a/tests/slo_workloads/utils/metrics.cpp b/tests/slo_workloads/utils/metrics.cpp index b89f8f3b48..708c83a563 100644 --- a/tests/slo_workloads/utils/metrics.cpp +++ b/tests/slo_workloads/utils/metrics.cpp @@ -145,38 +145,67 @@ class TOtelSharedPusher { void Record(const std::string& operationType, const TRequestData& data) { const bool success = data.Status == NYdb::EStatus::SUCCESS; - const std::string status = success ? "success" : "error"; + auto& series = GetOrCreateSeries(operationType); - OperationsTotal_->Add(uint64_t{1}, MergeAttributes({ - {"operation_type", operationType}, - {"operation_status", status}, - })); + OperationsTotal_->Add(uint64_t{1}, + opentelemetry::common::MakeKeyValueIterableView( + success ? series.SuccessAttrs : series.ErrorAttrs)); // sdk_retry_attempts_total = total number of technical attempts // including the first one. RetryAttempts counts only post-first // attempts, so add 1 to include the initial attempt. - RetryAttemptsTotal_->Add(data.RetryAttempts + 1, MergeAttributes({ - {"operation_type", operationType}, - })); + RetryAttemptsTotal_->Add(data.RetryAttempts + 1, + opentelemetry::common::MakeKeyValueIterableView(series.RetryAttrs)); if (success) { - GetOrCreateSeries(operationType).Recorder.Record(data.Delay); + series.Recorder.Record(data.Delay); } } private: struct TSeries { TLatencyRecorder Recorder; - // Cached snapshot, refreshed by the callback that runs first per - // export cycle (ObserveP50). All three callbacks read from these - // atomics so p50/p95/p99 land in the same export with consistent - // values from one HDR snapshot. + // Pre-merged attribute maps used on the hot path so Record() does not + // allocate / copy CommonAttributes_ per call. + std::map SuccessAttrs; + std::map ErrorAttrs; + std::map RetryAttrs; + + // Cached snapshot, refreshed by EnsureSnapshot() in whichever + // observable-gauge callback fires first per export cycle. All three + // callbacks read from these atomics so p50/p95/p99 land in the same + // export with consistent values from one HDR snapshot — independent + // of the order the SDK iterates instruments. std::atomic P50{0.0}; std::atomic P95{0.0}; std::atomic P99{0.0}; std::atomic HasData{false}; + std::mutex SnapshotMutex; + std::chrono::steady_clock::time_point LastSnapshot{}; }; + // Half the export interval — guarantees one snapshot per cycle while + // tolerating arbitrary callback ordering. + static constexpr std::chrono::milliseconds kSnapshotFreshness{500}; + + void EnsureSnapshot(TSeries& series) { + auto now = std::chrono::steady_clock::now(); + std::lock_guard lock(series.SnapshotMutex); + if (now - series.LastSnapshot < kSnapshotFreshness) { + return; + } + auto snap = series.Recorder.SnapshotAndReset(); + if (snap.HasData) { + series.P50.store(snap.P50); + series.P95.store(snap.P95); + series.P99.store(snap.P99); + series.HasData.store(true); + } else { + series.HasData.store(false); + } + series.LastSnapshot = now; + } + TSeries& GetOrCreateSeries(const std::string& op) { { std::shared_lock lock(SeriesMutex_); @@ -189,18 +218,18 @@ class TOtelSharedPusher { auto& slot = Series_[op]; if (!slot) { slot = std::make_unique(); + slot->SuccessAttrs = CommonAttributes_; + slot->SuccessAttrs["operation_type"] = op; + slot->SuccessAttrs["operation_status"] = "success"; + slot->ErrorAttrs = CommonAttributes_; + slot->ErrorAttrs["operation_type"] = op; + slot->ErrorAttrs["operation_status"] = "error"; + slot->RetryAttrs = CommonAttributes_; + slot->RetryAttrs["operation_type"] = op; } return *slot; } - std::map MergeAttributes( - const std::map& metricAttrs) const - { - std::map result = CommonAttributes_; - result.insert(metricAttrs.begin(), metricAttrs.end()); - return result; - } - void InitMetrics() { OperationsTotal_ = Meter_->CreateUInt64Counter("sdk_operations_total", "Total number of operations, categorized by operation type and status."); @@ -222,48 +251,23 @@ class TOtelSharedPusher { LatencyP99_->AddCallback(&TOtelSharedPusher::ObserveP99, this); } - // ObserveP50 runs first per export cycle, snapshots+resets each series' - // HDR once, and caches p50/p95/p99 in the series atomics for the - // subsequent two callbacks to read. This mirrors the Java workload's - // single-snapshot strategy without requiring a batch-callback API. - static void ObserveP50(opentelemetry::metrics::ObserverResult result, void* state) { - auto* self = static_cast(state); - auto obs = opentelemetry::nostd::get< - opentelemetry::nostd::shared_ptr>>(result); - - std::shared_lock lock(self->SeriesMutex_); - for (const auto& [op, series] : self->Series_) { - auto snap = series->Recorder.SnapshotAndReset(); - if (snap.HasData) { - series->P50.store(snap.P50); - series->P95.store(snap.P95); - series->P99.store(snap.P99); - series->HasData.store(true); - } else { - series->HasData.store(false); - } - if (!series->HasData.load()) { - continue; - } - auto attrs = self->MergeAttributes({ - {"operation_type", op}, - {"operation_status", "success"}, - }); - obs->Observe(series->P50.load(), - opentelemetry::common::MakeKeyValueIterableView(attrs)); - } + static void ObserveP50(opentelemetry::metrics::ObserverResult r, void* s) { + ObservePercentile(r, s, &TSeries::P50); } - - static void ObserveP95(opentelemetry::metrics::ObserverResult result, void* state) { - ObserveCached(result, state, &TSeries::P95); + static void ObserveP95(opentelemetry::metrics::ObserverResult r, void* s) { + ObservePercentile(r, s, &TSeries::P95); } - - static void ObserveP99(opentelemetry::metrics::ObserverResult result, void* state) { - ObserveCached(result, state, &TSeries::P99); + static void ObserveP99(opentelemetry::metrics::ObserverResult r, void* s) { + ObservePercentile(r, s, &TSeries::P99); } - static void ObserveCached(opentelemetry::metrics::ObserverResult result, void* state, - std::atomic TSeries::*field) + // Each callback ensures a fresh snapshot exists for the current export + // cycle (EnsureSnapshot is a no-op if one was taken less than + // kSnapshotFreshness ago). Whichever of P50/P95/P99 the SDK invokes first + // performs the snapshot+reset; the others read cached atomics. Order + // between the three callbacks is irrelevant. + static void ObservePercentile(opentelemetry::metrics::ObserverResult result, void* state, + std::atomic TSeries::*field) { auto* self = static_cast(state); auto obs = opentelemetry::nostd::get< @@ -271,15 +275,12 @@ class TOtelSharedPusher { std::shared_lock lock(self->SeriesMutex_); for (const auto& [op, series] : self->Series_) { + self->EnsureSnapshot(*series); if (!series->HasData.load()) { continue; } - auto attrs = self->MergeAttributes({ - {"operation_type", op}, - {"operation_status", "success"}, - }); obs->Observe((series.get()->*field).load(), - opentelemetry::common::MakeKeyValueIterableView(attrs)); + opentelemetry::common::MakeKeyValueIterableView(series->SuccessAttrs)); } } diff --git a/tests/slo_workloads/utils/utils.cpp b/tests/slo_workloads/utils/utils.cpp index c258e95c81..d686d4958c 100644 --- a/tests/slo_workloads/utils/utils.cpp +++ b/tests/slo_workloads/utils/utils.cpp @@ -251,11 +251,12 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean // prepend a synthetic program name for ParseOptionsRun. char programName[] = "slo"; std::vector runArgv; - runArgv.reserve(argc + 1); + runArgv.reserve(argc + 2); runArgv.push_back(programName); for (int i = 0; i < argc; ++i) { runArgv.push_back(argv[i]); } + runArgv.push_back(nullptr); int fakeArgc = 1; char* fakeArgv[] = { programName, nullptr }; @@ -263,7 +264,7 @@ int DoMain(int argc, char** argv, TCreateCommand create, TRunCommand run, TClean result = create(dbOptions, fakeArgc, fakeArgv); if (!result) { Cout << "[all] Launching run command..." << Endl; - result = run(dbOptions, static_cast(runArgv.size()), runArgv.data()); + result = run(dbOptions, static_cast(runArgv.size() - 1), runArgv.data()); } Cout << "[all] Launching cleanup command..." << Endl; int cleanupRc = cleanup(dbOptions, fakeArgc); From 4ebfc5a231463f372d4a55a865614440269b6d4a Mon Sep 17 00:00:00 2001 From: Artem Ermoshkin Date: Mon, 1 Jun 2026 11:49:22 +0300 Subject: [PATCH 21/21] fix copilot issue --- tests/slo_workloads/utils/executor.cpp | 15 +++++++++------ tests/slo_workloads/utils/executor.h | 3 ++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/slo_workloads/utils/executor.cpp b/tests/slo_workloads/utils/executor.cpp index aa0a637905..270f89a8bc 100644 --- a/tests/slo_workloads/utils/executor.cpp +++ b/tests/slo_workloads/utils/executor.cpp @@ -34,7 +34,9 @@ std::uint64_t TInsistentClient::GetActiveSessions() const { return static_cast(sessions); } -TAsyncFinalStatus TInsistentClient::ExecuteWithRetry(const NYdb::NTable::TTableClient::TOperationFunc& operation) { +TAsyncFinalStatus TInsistentClient::ExecuteWithRetry(const NYdb::NTable::TTableClient::TOperationFunc& operation, + const std::shared_ptr& stat) +{ TTracedPromise promise = TTracedPromise( NThreading::NewPromise(), &ExecutorPromises @@ -46,7 +48,11 @@ TAsyncFinalStatus TInsistentClient::ExecuteWithRetry(const NYdb::NTable::TTableC settings.MaxRetries(ClientMaxRetries); settings.GetSessionClientTimeout(SessionTimeout); - auto future = Client.RetryOperation(operation, settings); + auto wrappedOperation = [operation, stat](NYdb::NTable::TSession session) { + stat->IncRetryAttempts(); + return operation(session); + }; + auto future = Client.RetryOperation(wrappedOperation, settings); future.Subscribe([promise, this](const NYdb::TAsyncStatus& f) mutable { Y_ABORT_UNLESS(f.HasValue()); const auto& status = f.GetValue(); @@ -146,10 +152,7 @@ bool TExecutor::Execute(const NYdb::NTable::TTableClient::TOperationFunc& func) auto stat = Stats.StartRequest(); - auto future = InsistentClient.ExecuteWithRetry([func, stat](NYdb::NTable::TSession session) { - auto result = func(session); - return result; - }); + auto future = InsistentClient.ExecuteWithRetry(func, stat); future.Subscribe([this, stat, SemaphoreWrapper](const TAsyncFinalStatus& future) mutable { Y_ABORT_UNLESS(future.HasValue()); diff --git a/tests/slo_workloads/utils/executor.h b/tests/slo_workloads/utils/executor.h index c20df89d6d..637d791b9a 100644 --- a/tests/slo_workloads/utils/executor.h +++ b/tests/slo_workloads/utils/executor.h @@ -53,7 +53,8 @@ class TInsistentClient { TInsistentClient(const TCommonOptions& opts); ~TInsistentClient(); void Report(TStringBuilder& out) const; - TAsyncFinalStatus ExecuteWithRetry(const NYdb::NTable::TTableClient::TOperationFunc& operation); + TAsyncFinalStatus ExecuteWithRetry(const NYdb::NTable::TTableClient::TOperationFunc& operation, + const std::shared_ptr& stat); std::uint64_t GetActiveSessions() const; private: