Skip to content

Commit e5e89ec

Browse files
committed
feat(webapp): add an emergency span cap for trace summary queries
A new optional TRACE_VIEW_EMERGENCY_SPAN_CAP env var clamps the trace summary and detailed trace summary span limits on both event store paths (ClickHouse and Postgres), covering the dashboard trace view and the public run trace endpoint. Unset by default, so nothing changes unless an operator sets it.
1 parent cf3ecdd commit e5e89ec

4 files changed

Lines changed: 29 additions & 10 deletions

File tree

apps/webapp/app/env.server.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -726,6 +726,9 @@ const EnvironmentSchema = z
726726
MAXIMUM_LIVE_RELOADING_EVENTS: z.coerce.number().int().default(1000),
727727
MAXIMUM_TRACE_SUMMARY_VIEW_COUNT: z.coerce.number().int().default(25_000),
728728
MAXIMUM_TRACE_DETAILED_SUMMARY_VIEW_COUNT: z.coerce.number().int().default(10_000),
729+
// Emergency circuit breaker: when set, the trace summary and detailed summary
730+
// span limits on both event store paths are lowered to this value if they exceed it (never raised). Unset = disabled.
731+
TRACE_VIEW_EMERGENCY_SPAN_CAP: z.coerce.number().int().positive().optional(),
729732
TASK_PAYLOAD_OFFLOAD_THRESHOLD: z.coerce.number().int().default(524_288), // 512KB
730733
BATCH_PAYLOAD_OFFLOAD_THRESHOLD: z.coerce.number().int().optional(), // Defaults to TASK_PAYLOAD_OFFLOAD_THRESHOLD if not set
731734
TASK_PAYLOAD_MAXIMUM_SIZE: z.coerce.number().int().default(3_145_728), // 3MB

apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { ClickHouse } from "@internal/clickhouse";
22
import { createHash } from "crypto";
33
import { ClickhouseEventRepository } from "~/v3/eventRepository/clickhouseEventRepository.server";
44
import { env } from "~/env.server";
5+
import { clampToEmergencySpanCap } from "~/v3/eventRepository/emergencySpanCap.server";
56
import { singleton } from "~/utils/singleton";
67
import type { OrganizationDataStoresRegistry } from "~/services/dataStores/organizationDataStoresRegistry.server";
78
import { type IEventRepository } from "~/v3/eventRepository/eventRepository.types";
@@ -533,9 +534,12 @@ function buildEventRepository(store: string, clickhouse: ClickHouse): Clickhouse
533534
clickhouse,
534535
batchSize: env.EVENTS_CLICKHOUSE_BATCH_SIZE,
535536
flushInterval: env.EVENTS_CLICKHOUSE_FLUSH_INTERVAL_MS,
536-
maximumTraceSummaryViewCount: env.EVENTS_CLICKHOUSE_MAX_TRACE_SUMMARY_VIEW_COUNT,
537-
maximumTraceDetailedSummaryViewCount:
538-
env.EVENTS_CLICKHOUSE_MAX_TRACE_DETAILED_SUMMARY_VIEW_COUNT,
537+
maximumTraceSummaryViewCount: clampToEmergencySpanCap(
538+
env.EVENTS_CLICKHOUSE_MAX_TRACE_SUMMARY_VIEW_COUNT
539+
),
540+
maximumTraceDetailedSummaryViewCount: clampToEmergencySpanCap(
541+
env.EVENTS_CLICKHOUSE_MAX_TRACE_DETAILED_SUMMARY_VIEW_COUNT
542+
),
539543
maximumLiveReloadingSetting: env.EVENTS_CLICKHOUSE_MAX_LIVE_RELOADING_SETTING,
540544
insertStrategy: env.EVENTS_CLICKHOUSE_INSERT_STRATEGY,
541545
waitForAsyncInsert: env.EVENTS_CLICKHOUSE_WAIT_FOR_ASYNC_INSERT === "1",
@@ -557,9 +561,12 @@ function buildEventRepository(store: string, clickhouse: ClickHouse): Clickhouse
557561
clickhouse: clickhouse,
558562
batchSize: env.EVENTS_CLICKHOUSE_BATCH_SIZE,
559563
flushInterval: env.EVENTS_CLICKHOUSE_FLUSH_INTERVAL_MS,
560-
maximumTraceSummaryViewCount: env.EVENTS_CLICKHOUSE_MAX_TRACE_SUMMARY_VIEW_COUNT,
561-
maximumTraceDetailedSummaryViewCount:
562-
env.EVENTS_CLICKHOUSE_MAX_TRACE_DETAILED_SUMMARY_VIEW_COUNT,
564+
maximumTraceSummaryViewCount: clampToEmergencySpanCap(
565+
env.EVENTS_CLICKHOUSE_MAX_TRACE_SUMMARY_VIEW_COUNT
566+
),
567+
maximumTraceDetailedSummaryViewCount: clampToEmergencySpanCap(
568+
env.EVENTS_CLICKHOUSE_MAX_TRACE_DETAILED_SUMMARY_VIEW_COUNT
569+
),
563570
maximumLiveReloadingSetting: env.EVENTS_CLICKHOUSE_MAX_LIVE_RELOADING_SETTING,
564571
insertStrategy: env.EVENTS_CLICKHOUSE_INSERT_STRATEGY,
565572
waitForAsyncInsert: env.EVENTS_CLICKHOUSE_WAIT_FOR_ASYNC_INSERT === "1",
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import { env } from "~/env.server";
2+
3+
// Emergency circuit breaker for trace views: when TRACE_VIEW_EMERGENCY_SPAN_CAP
4+
// is set, return the smaller of `limit` and the cap, so the cap can only lower a limit. Unset = `limit` is returned unchanged.
5+
export function clampToEmergencySpanCap(limit: number): number {
6+
const cap = env.TRACE_VIEW_EMERGENCY_SPAN_CAP; // undefined means no cap is applied
7+
return cap === undefined ? limit : Math.min(limit, cap); // a cap above `limit` is a no-op
8+
}

apps/webapp/app/v3/taskEventStore.server.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import { Prisma, TaskEvent } from "@trigger.dev/database";
33
import type { PrismaClient, PrismaReplicaClient } from "~/db.server";
44
import { env } from "~/env.server";
5+
import { clampToEmergencySpanCap } from "~/v3/eventRepository/emergencySpanCap.server";
56

67
export type CommonTaskEvent = Omit<TaskEvent, "id">;
78
export type TraceEvent = Pick<
@@ -192,7 +193,7 @@ export class TaskEventStore {
192193
: Prisma.empty
193194
}
194195
ORDER BY "startTime" ASC
195-
LIMIT ${env.MAXIMUM_TRACE_SUMMARY_VIEW_COUNT}
196+
LIMIT ${clampToEmergencySpanCap(env.MAXIMUM_TRACE_SUMMARY_VIEW_COUNT)}
196197
`;
197198
} else {
198199
return await this.readReplica.$queryRaw<TraceEvent[]>`
@@ -220,7 +221,7 @@ export class TaskEventStore {
220221
: Prisma.empty
221222
}
222223
ORDER BY "startTime" ASC
223-
LIMIT ${env.MAXIMUM_TRACE_SUMMARY_VIEW_COUNT}
224+
LIMIT ${clampToEmergencySpanCap(env.MAXIMUM_TRACE_SUMMARY_VIEW_COUNT)}
224225
`;
225226
}
226227
}
@@ -270,7 +271,7 @@ export class TaskEventStore {
270271
: Prisma.empty
271272
}
272273
ORDER BY "startTime" ASC
273-
LIMIT ${env.MAXIMUM_TRACE_DETAILED_SUMMARY_VIEW_COUNT}
274+
LIMIT ${clampToEmergencySpanCap(env.MAXIMUM_TRACE_DETAILED_SUMMARY_VIEW_COUNT)}
274275
`;
275276
} else {
276277
return await this.readReplica.$queryRaw<DetailedTraceEvent[]>`
@@ -299,7 +300,7 @@ export class TaskEventStore {
299300
: Prisma.empty
300301
}
301302
ORDER BY "startTime" ASC
302-
LIMIT ${env.MAXIMUM_TRACE_DETAILED_SUMMARY_VIEW_COUNT}
303+
LIMIT ${clampToEmergencySpanCap(env.MAXIMUM_TRACE_DETAILED_SUMMARY_VIEW_COUNT)}
303304
`;
304305
}
305306
}

0 commit comments

Comments
 (0)