From 581840d2f723507fff547cc36fcfb909609c70ab Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Fri, 12 Jun 2026 17:52:39 +0100 Subject: [PATCH] feat(webapp): add task metadata cache resolution metrics Emit a `task_meta_cache.resolve` counter on the trigger path, labeled by lookup path (locked / current) and the source that satisfied it (cache / replica / writer / miss). cache over total is the cache hit rate (its inverse is coldness); writer over total is how often the read replica returned empty for a row the primary had. Labels are bounded, with no env / worker / slug ids. --- .../task-meta-cache-resolution-metrics.md | 6 +++ .../app/runEngine/concerns/queues.server.ts | 35 ++++++++++++++--- .../taskMetadataCacheTelemetry.server.ts | 38 +++++++++++++++++++ 3 files changed, 74 insertions(+), 5 deletions(-) create mode 100644 .server-changes/task-meta-cache-resolution-metrics.md create mode 100644 apps/webapp/app/services/taskMetadataCacheTelemetry.server.ts diff --git a/.server-changes/task-meta-cache-resolution-metrics.md b/.server-changes/task-meta-cache-resolution-metrics.md new file mode 100644 index 0000000000..07b47adcb6 --- /dev/null +++ b/.server-changes/task-meta-cache-resolution-metrics.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Emit metrics for task metadata cache resolution on the trigger path, surfacing the cache hit rate and how often a read replica returned empty for a row the primary had. diff --git a/apps/webapp/app/runEngine/concerns/queues.server.ts b/apps/webapp/app/runEngine/concerns/queues.server.ts index 1c3f9a06aa..5fea4eedd8 100644 --- a/apps/webapp/app/runEngine/concerns/queues.server.ts +++ b/apps/webapp/app/runEngine/concerns/queues.server.ts @@ -20,6 +20,10 @@ import { createCache, createLRUMemoryStore, DefaultStatefulContext, Namespace } import { singleton } from "~/utils/singleton"; import type { TaskMetadataCache, TaskMetadataEntry } from "~/services/taskMetadataCache.server"; import { taskMetadataCacheInstance } from "~/services/taskMetadataCacheInstance.server"; +import { + recordTaskMetaResolve, + type TaskMetaResolveSource, +} from "~/services/taskMetadataCacheTelemetry.server"; // LRU cache for environment queue sizes to reduce Redis calls const queueSizeCache = singleton("queueSizeCache", () => { @@ -266,7 +270,10 @@ export class DefaultQueueManager implements QueueManager { slug: string ): Promise { const cached = await this.taskMetaCache.getByWorker(workerId, slug); - if (cached) return cached; + if (cached) { + recordTaskMetaResolve("locked", "cache"); + return cached; + } // Cache miss. Read the row from the replica first; if the replica comes // back empty, re-check the writer before concluding the task is missing. @@ -277,11 +284,13 @@ export class DefaultQueueManager implements QueueManager { // registered. The writer read only runs on this rare miss-then-empty path, // never on the hot path. let row = await this.findLockedTaskRow(this.replicaPrisma, workerId, environmentId, slug); + let source: TaskMetaResolveSource = "replica"; if (!row && this.replicaPrisma !== this.prisma) { row = await this.findLockedTaskRow(this.prisma, workerId, environmentId, slug); if (row) { + source = "writer"; logger.warn("Locked task metadata missing on replica but found on writer", { workerId, environmentId, @@ -290,7 +299,12 @@ export class DefaultQueueManager implements QueueManager { } } - if (!row) return null; + if (!row) { + recordTaskMetaResolve("locked", "miss"); + return null; + } + + recordTaskMetaResolve("locked", source); const entry: TaskMetadataEntry = { slug, @@ -336,14 +350,20 @@ export class DefaultQueueManager implements QueueManager { slug: string ): Promise { const cached = await this.taskMetaCache.getCurrent(environment.id, slug); - if (cached) return cached; + if (cached) { + recordTaskMetaResolve("current", "cache"); + return cached; + } // Cold cache: discover the current worker for the env. Replica is fine — // the adjacent BackgroundWorkerTask lookup below uses `replicaPrisma` too // (replica lag for "just deployed" is bounded the same way for both // queries; reading from the writer here would only widen the window). const worker = await findCurrentWorkerFromEnvironment(environment, this.replicaPrisma); - if (!worker) return null; + if (!worker) { + recordTaskMetaResolve("current", "miss"); + return null; + } const row = await this.replicaPrisma.backgroundWorkerTask.findFirst({ where: { workerId: worker.id, runtimeEnvironmentId: environment.id, slug }, @@ -354,7 +374,12 @@ export class DefaultQueueManager implements QueueManager { }, }); - if (!row) return null; + if (!row) { + recordTaskMetaResolve("current", "miss"); + return null; + } + + recordTaskMetaResolve("current", "replica"); const entry: TaskMetadataEntry = { slug, diff --git a/apps/webapp/app/services/taskMetadataCacheTelemetry.server.ts b/apps/webapp/app/services/taskMetadataCacheTelemetry.server.ts new file mode 100644 index 0000000000..4e7a117621 --- /dev/null +++ b/apps/webapp/app/services/taskMetadataCacheTelemetry.server.ts @@ -0,0 +1,38 @@ +import { getMeter } from "@internal/tracing"; + +const meter = getMeter("task-meta-cache"); + +/** + * One counter for every task-metadata resolution on the trigger path, with two + * bounded labels: + * + * path: "locked" - lockToVersion / triggerAndWait (reads the by-worker hash) + * "current" - default trigger (reads the env hash) + * source: where the metadata was resolved from: + * "cache" - Redis hit (warm) + * "replica" - cache miss, the read replica had the row + * "writer" - cache miss + replica empty, the primary had the row + * (i.e. the replica was stale for an existing row) + * "miss" - not found anywhere (genuinely not registered) + * + * Derived signals: + * cache / total -> cache hit rate (the inverse is coldness) + * writer / total -> how often the replica returned empty for + * a row the primary had + * + * No env / worker / slug labels: those are unbounded in production. + */ +const resolveCounter = meter.createCounter("task_meta_cache.resolve", { + description: + "Task metadata resolutions on the trigger path, by lookup path and the source that satisfied them", +}); + +export type TaskMetaResolvePath = "locked" | "current"; +export type TaskMetaResolveSource = "cache" | "replica" | "writer" | "miss"; + +export function recordTaskMetaResolve( + path: TaskMetaResolvePath, + source: TaskMetaResolveSource +): void { + resolveCounter.add(1, { path, source }); +}