From c327f71007a2318c2f10210914016999be5cd1bd Mon Sep 17 00:00:00 2001 From: Dan Sutton Date: Thu, 11 Jun 2026 13:07:51 +0100 Subject: [PATCH 1/2] test(webapp): stop streamBatchItems container tests timing out on cold start The streamBatchItems suite was intermittently failing with a 30s test timeout. The 30s budget covers container-fixture setup, and each of the 16 cases boots its own per-test Redis container and spins up a full RunEngine; a cold container boot counts against the test's own timeout, so under CI Docker contention whichever test booted while Docker was busiest could exceed 30s. It is not a product-logic hang. Two changes: - Add a containerTestWithIsolatedRedisNoClickhouse fixture (Postgres template-clone + per-test Redis, no ClickHouse) and use it here. These tests never touch ClickHouse, yet the previous fixture's auto resetClickhouse forced a ClickHouse boot + full migration onto the cold-start test. - Raise the suite testTimeout from 30s to 120s, matching the run-engine package convention for tests of this footprint (RunEngine + per-test container). --- .../test/engine/streamBatchItems.test.ts | 12 +++++++--- internal-packages/testcontainers/src/index.ts | 22 +++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/apps/webapp/test/engine/streamBatchItems.test.ts b/apps/webapp/test/engine/streamBatchItems.test.ts index f4c2f21f8de..691842b4288 100644 --- a/apps/webapp/test/engine/streamBatchItems.test.ts +++ b/apps/webapp/test/engine/streamBatchItems.test.ts @@ -19,8 +19,9 @@ import { setupAuthenticatedEnvironment } from "@internal/run-engine/tests"; // Per-test redis (isolated): each test spins up its own RunEngine and runs batch work, which leaves // background activity on redis that outlives the test - sharing a worker redis across the 16 cases // here caused cross-test interference and 30s seal-timeout flakes. Same carve-out as the run-engine -// batch tests. -import { containerTestWithIsolatedRedis as containerTest } from "@internal/testcontainers"; +// batch tests. The NoClickhouse variant skips the worker-scoped ClickHouse boot+migrate (these tests +// never touch ClickHouse), which the cold-start test would otherwise pay inside its test timeout. +import { containerTestWithIsolatedRedisNoClickhouse as containerTest } from "@internal/testcontainers"; import { trace } from "@opentelemetry/api"; import { PrismaClient } from "@trigger.dev/database"; import { BatchId } from "@trigger.dev/core/v3/isomorphic"; @@ -33,7 +34,12 @@ import { } from "../../app/runEngine/services/streamBatchItems.server"; import { ServiceValidationError } from "../../app/v3/services/baseService.server"; -vi.setConfig({ testTimeout: 30_000 }); // 30 seconds timeout +// 120s (not 30s): each of the 16 cases here boots its own per-test Redis container and spins up a +// full RunEngine, and a cold container boot counts against the test's own timeout. Under CI Docker +// contention that boot can take tens of seconds, so 30s was too tight and the flake landed on +// whichever test happened to boot while Docker was busiest. 120s matches the run-engine package +// convention for tests of this footprint (RunEngine + per-test container). +vi.setConfig({ testTimeout: 120_000 }); describe("StreamBatchItemsService", () => { /** diff --git a/internal-packages/testcontainers/src/index.ts b/internal-packages/testcontainers/src/index.ts index 0047f996df9..c1e890bcadf 100644 --- a/internal-packages/testcontainers/src/index.ts +++ b/internal-packages/testcontainers/src/index.ts @@ -536,6 +536,28 @@ export const containerTestWithIsolatedRedis = test.extend({ + network, + postgresContainer: clonedPostgresContainer, + prisma: prismaFromContainer, + redisContainer, + redisOptions, + }); + // For tests that exercise the Postgres -> ClickHouse logical-replication pipeline (WAL slots, // publications, REPLICA IDENTITY). These need a dedicated Postgres per test - the worker-scoped + // template-clone model used by containerTest doesn't carry logical replication across cloned dbs. From 9c4acb81a3a3774f3df2815f56511cfd53c0e55e Mon Sep 17 00:00:00 2001 From: Dan Sutton Date: Thu, 11 Jun 2026 13:15:35 +0100 Subject: [PATCH 2/2] test(webapp): trim verbose comments in streamBatchItems fix --- apps/webapp/test/engine/streamBatchItems.test.ts | 14 ++++---------- internal-packages/testcontainers/src/index.ts | 7 ++----- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/apps/webapp/test/engine/streamBatchItems.test.ts b/apps/webapp/test/engine/streamBatchItems.test.ts index 691842b4288..072f24986b1 100644 --- a/apps/webapp/test/engine/streamBatchItems.test.ts +++ b/apps/webapp/test/engine/streamBatchItems.test.ts @@ -16,11 +16,8 @@ vi.mock("~/services/platform.v3.server", async (importOriginal) => { import { RunEngine } from "@internal/run-engine"; import { setupAuthenticatedEnvironment } from "@internal/run-engine/tests"; -// Per-test redis (isolated): each test spins up its own RunEngine and runs batch work, which leaves -// background activity on redis that outlives the test - sharing a worker redis across the 16 cases -// here caused cross-test interference and 30s seal-timeout flakes. Same carve-out as the run-engine -// batch tests. The NoClickhouse variant skips the worker-scoped ClickHouse boot+migrate (these tests -// never touch ClickHouse), which the cold-start test would otherwise pay inside its test timeout. +// Per-test redis isolation: each test runs its own RunEngine whose background work outlives the test +// body. NoClickhouse because this suite never touches ClickHouse - skips the worker-scoped boot+migrate. import { containerTestWithIsolatedRedisNoClickhouse as containerTest } from "@internal/testcontainers"; import { trace } from "@opentelemetry/api"; import { PrismaClient } from "@trigger.dev/database"; @@ -34,11 +31,8 @@ import { } from "../../app/runEngine/services/streamBatchItems.server"; import { ServiceValidationError } from "../../app/v3/services/baseService.server"; -// 120s (not 30s): each of the 16 cases here boots its own per-test Redis container and spins up a -// full RunEngine, and a cold container boot counts against the test's own timeout. Under CI Docker -// contention that boot can take tens of seconds, so 30s was too tight and the flake landed on -// whichever test happened to boot while Docker was busiest. 120s matches the run-engine package -// convention for tests of this footprint (RunEngine + per-test container). +// 120s: a cold per-test container boot counts against the test's own timeout, and under CI Docker +// contention 30s was too tight. Matches the run-engine convention for this footprint. vi.setConfig({ testTimeout: 120_000 }); describe("StreamBatchItemsService", () => { diff --git a/internal-packages/testcontainers/src/index.ts b/internal-packages/testcontainers/src/index.ts index c1e890bcadf..4927c162fe0 100644 --- a/internal-packages/testcontainers/src/index.ts +++ b/internal-packages/testcontainers/src/index.ts @@ -544,11 +544,8 @@ type ContainerWithIsolatedRedisNoClickhouseContext = { redisOptions: RedisOptions; }; -// Postgres (template-clone) + per-test Redis, and NOTHING else. Same Redis isolation as -// containerTestWithIsolatedRedis (for background work that outlives the test body) but without the -// worker-scoped ClickHouse boot+migrate that the `resetClickhouse` auto fixture would otherwise force -// on the first test in the file. Use this for tests that touch Postgres + Redis but never ClickHouse - -// it removes the heaviest item from the cold-start container budget. +// Like containerTestWithIsolatedRedis (template-clone Postgres + per-test Redis) but with no +// ClickHouse - for suites that touch Postgres + Redis but never ClickHouse, avoiding its boot+migrate. export const containerTestWithIsolatedRedisNoClickhouse = test.extend({ network,