From 2a2a2c84df396b5a5496c3c51358a73dc2b0f855 Mon Sep 17 00:00:00 2001 From: Miguel Angel Simon Sierra Date: Thu, 25 Jun 2026 16:47:05 -0400 Subject: [PATCH 1/2] fix(producer): stop retrying capture attempts that made zero progress MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A structurally broken composition (never-ready page, zero duration, or unparseable HTML) captures no frames, so the adaptive retry loop kept re-running it at halved parallelism — 16->8->4->2->1 workers — each attempt burning a full readiness/protocol timeout per worker. That multiplied wall-clock to ~46min on broken renders and was the driver of the render P95 blowup (~370k -> 2.79M ms) seen Jun 20-22. Add captureAttemptMadeProgress(): when an attempt leaves at least as many frames missing as it set out to capture, it made no forward progress, so the composition is broken rather than the workers being flaky. Bail immediately instead of retrying. A partially-captured attempt still retries, so genuine flaky-worker gaps are unaffected. --- .../src/services/renderOrchestrator.test.ts | 16 ++++++++ .../src/services/renderOrchestrator.ts | 37 ++++++++++++++++--- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/packages/producer/src/services/renderOrchestrator.test.ts b/packages/producer/src/services/renderOrchestrator.test.ts index 03ee4e9817..02862cdeeb 100644 --- a/packages/producer/src/services/renderOrchestrator.test.ts +++ b/packages/producer/src/services/renderOrchestrator.test.ts @@ -7,6 +7,7 @@ import type { CompiledComposition } from "./htmlCompiler.js"; import { buildMissingFrameRetryBatches, + captureAttemptMadeProgress, collectVideoMetadataHints, collectVideoReadinessSkipIds, extractStandaloneEntryFromIndex, @@ -95,6 +96,21 @@ describe("extractStandaloneEntryFromIndex", () => { }); }); +describe("captureAttemptMadeProgress", () => { + it("retries when the attempt captured at least one frame toward its target", () => { + // targeted 100 frames, 40 still missing -> 60 captured -> worth retrying the rest + expect(captureAttemptMadeProgress(100, 40)).toBe(true); + expect(captureAttemptMadeProgress(100, 99)).toBe(true); + }); + + it("bails when the attempt captured nothing (structurally broken composition)", () => { + // targeted 100 frames, 100 still missing -> zero progress -> don't burn another timeout cycle + expect(captureAttemptMadeProgress(100, 100)).toBe(false); + // defensive: never-greater-than guard, treat >= target as no progress + expect(captureAttemptMadeProgress(100, 120)).toBe(false); + }); +}); + describe("shouldUseStreamingEncode", () => { const streamingEnabledConfig = { enableStreamingEncode: true, diff --git a/packages/producer/src/services/renderOrchestrator.ts b/packages/producer/src/services/renderOrchestrator.ts index dc1867063c..a7cc3773a4 100644 --- a/packages/producer/src/services/renderOrchestrator.ts +++ b/packages/producer/src/services/renderOrchestrator.ts @@ -548,6 +548,24 @@ export function getNextRetryWorkerCount(currentWorkers: number): number { return Math.max(1, Math.floor(currentWorkers / 2)); } +/** + * A retry only pays off if the attempt that just finished captured at least one + * frame toward its target. When it captured nothing (frames still missing >= + * frames it set out to capture), the composition is structurally broken — a + * never-ready page, zero duration, or unparseable HTML — not a flaky worker. + * Re-running it at lower parallelism just burns another full readiness/protocol + * timeout per worker, which is what blew up render P95 in Jun 2026 (broken + * renders walked 16->8->4->2->1 workers, ~46min each before finally failing). + * ponytail: total zero-progress == broken; a partially-captured attempt still + * retries, so genuine flaky-worker gaps are unaffected. + */ +export function captureAttemptMadeProgress( + attemptTargetFrameCount: number, + remainingFrameCount: number, +): boolean { + return remainingFrameCount < attemptTargetFrameCount; +} + export function isRecoverableParallelCaptureError(error: unknown): boolean { const message = normalizeErrorMessage(error); return ( @@ -673,10 +691,13 @@ export async function executeDiskCaptureWithAdaptiveRetry(options: { if (remaining.length === 0) { return attempts; } - if (!options.allowRetry || currentWorkers <= 1) { - throw new Error( - `[Render] Capture completed but ${countFrameRanges(remaining)} frame(s) are missing`, - ); + const remainingCount = countFrameRanges(remaining); + if ( + !options.allowRetry || + currentWorkers <= 1 || + !captureAttemptMadeProgress(frameCount, remainingCount) + ) { + throw new Error(`[Render] Capture completed but ${remainingCount} frame(s) are missing`); } const nextWorkers = getNextRetryWorkerCount(currentWorkers); @@ -697,7 +718,13 @@ export async function executeDiskCaptureWithAdaptiveRetry(options: { if (remaining.length === 0) { return attempts; } - if (!options.allowRetry || currentWorkers <= 1 || !isRecoverableParallelCaptureError(error)) { + const remainingCount = countFrameRanges(remaining); + if ( + !options.allowRetry || + currentWorkers <= 1 || + !isRecoverableParallelCaptureError(error) || + !captureAttemptMadeProgress(frameCount, remainingCount) + ) { throw error; } From 80ac896a835c80e6c8487dc58a7355d564604430 Mon Sep 17 00:00:00 2001 From: Miguel Angel Simon Sierra Date: Thu, 25 Jun 2026 17:05:15 -0400 Subject: [PATCH 2/2] fix(producer): log the zero-progress bail + cover it with an integration test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback on the no-progress capture guard: - Warn before bailing so an oncall can tell a structurally-broken render that bailed fast apart from one that exhausted worker-halving retries (both previously threw the same "frame(s) are missing" message). - Add an integration test that drives executeDiskCaptureWithAdaptiveRetry through the bail (capture functions mocked to write nothing) and asserts a single attempt runs — the gate would otherwise walk 4->2->1 workers. Guards the placement of the gate, not just the predicate. - Reword the helper docstring (drop stray prefix and internal incident detail). --- .../src/services/renderOrchestrator.test.ts | 60 ++++++++++++++++++- .../src/services/renderOrchestrator.ts | 29 +++++---- 2 files changed, 78 insertions(+), 11 deletions(-) diff --git a/packages/producer/src/services/renderOrchestrator.test.ts b/packages/producer/src/services/renderOrchestrator.test.ts index 02862cdeeb..d7a23d6877 100644 --- a/packages/producer/src/services/renderOrchestrator.test.ts +++ b/packages/producer/src/services/renderOrchestrator.test.ts @@ -2,12 +2,22 @@ import { afterEach, describe, expect, it, vi } from "vitest"; import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs"; import { join, win32 } from "node:path"; import { tmpdir } from "node:os"; -import type { EngineConfig, ExtractedFrames } from "@hyperframes/engine"; +import type { CaptureOptions, EngineConfig, ExtractedFrames } from "@hyperframes/engine"; +import { executeParallelCapture, mergeWorkerFrames } from "@hyperframes/engine"; import type { CompiledComposition } from "./htmlCompiler.js"; +// Replace only the two engine functions the adaptive-retry loop uses to touch +// disk; everything else (distributeFrames, types, etc.) stays real so the loop +// runs for real against a temp framesDir. +vi.mock("@hyperframes/engine", async (importOriginal) => { + const actual = await importOriginal(); + return { ...actual, executeParallelCapture: vi.fn(), mergeWorkerFrames: vi.fn() }; +}); + import { buildMissingFrameRetryBatches, captureAttemptMadeProgress, + executeDiskCaptureWithAdaptiveRetry, collectVideoMetadataHints, collectVideoReadinessSkipIds, extractStandaloneEntryFromIndex, @@ -111,6 +121,54 @@ describe("captureAttemptMadeProgress", () => { }); }); +describe("executeDiskCaptureWithAdaptiveRetry — zero-progress bail (integration)", () => { + const makeLog = () => ({ error: vi.fn(), warn: vi.fn(), info: vi.fn(), debug: vi.fn() }); + + afterEach(() => { + vi.mocked(executeParallelCapture).mockReset(); + vi.mocked(mergeWorkerFrames).mockReset(); + }); + + it("runs exactly one attempt (no worker-halving retries) when an attempt captures zero frames", async () => { + // Capture writes nothing -> framesDir stays empty -> every frame still missing. + vi.mocked(executeParallelCapture).mockResolvedValue([]); + vi.mocked(mergeWorkerFrames).mockResolvedValue(undefined); + + const workDir = mkdtempSync(join(tmpdir(), "hf-retry-work-")); + const framesDir = mkdtempSync(join(tmpdir(), "hf-retry-frames-")); + const log = makeLog(); + try { + await expect( + executeDiskCaptureWithAdaptiveRetry({ + serverUrl: "http://localhost:0", + workDir, + framesDir, + totalFrames: 4, + initialWorkerCount: 4, + allowRetry: true, + frameExt: "jpg", + captureOptions: {} as CaptureOptions, + createBeforeCaptureHook: () => null, + cfg: {} as EngineConfig, + log, + dedupPerfs: [], + }), + ).rejects.toThrow(/4 frame\(s\) are missing/); + + // The gate under test: without it the loop would walk 4 -> 2 -> 1 workers + // (3 capture calls) before giving up. One call proves it bailed immediately. + expect(vi.mocked(executeParallelCapture)).toHaveBeenCalledTimes(1); + expect(log.warn).toHaveBeenCalledWith( + expect.stringContaining("no forward progress"), + expect.objectContaining({ attempt: 0, frameCount: 4, remainingCount: 4 }), + ); + } finally { + rmSync(workDir, { recursive: true, force: true }); + rmSync(framesDir, { recursive: true, force: true }); + } + }); +}); + describe("shouldUseStreamingEncode", () => { const streamingEnabledConfig = { enableStreamingEncode: true, diff --git a/packages/producer/src/services/renderOrchestrator.ts b/packages/producer/src/services/renderOrchestrator.ts index a7cc3773a4..77e5d0c03b 100644 --- a/packages/producer/src/services/renderOrchestrator.ts +++ b/packages/producer/src/services/renderOrchestrator.ts @@ -554,10 +554,9 @@ export function getNextRetryWorkerCount(currentWorkers: number): number { * frames it set out to capture), the composition is structurally broken — a * never-ready page, zero duration, or unparseable HTML — not a flaky worker. * Re-running it at lower parallelism just burns another full readiness/protocol - * timeout per worker, which is what blew up render P95 in Jun 2026 (broken - * renders walked 16->8->4->2->1 workers, ~46min each before finally failing). - * ponytail: total zero-progress == broken; a partially-captured attempt still - * retries, so genuine flaky-worker gaps are unaffected. + * timeout per worker, turning a render that can never succeed into a long hang. + * A partially-captured attempt still retries, so genuine flaky-worker gaps are + * unaffected. */ export function captureAttemptMadeProgress( attemptTargetFrameCount: number, @@ -692,11 +691,14 @@ export async function executeDiskCaptureWithAdaptiveRetry(options: { return attempts; } const remainingCount = countFrameRanges(remaining); - if ( - !options.allowRetry || - currentWorkers <= 1 || - !captureAttemptMadeProgress(frameCount, remainingCount) - ) { + const madeProgress = captureAttemptMadeProgress(frameCount, remainingCount); + if (!madeProgress) { + options.log.warn( + "[Render] Capture attempt made no forward progress; composition is likely structurally broken — not retrying.", + { attempt, frameCount, remainingCount, workers: currentWorkers }, + ); + } + if (!options.allowRetry || currentWorkers <= 1 || !madeProgress) { throw new Error(`[Render] Capture completed but ${remainingCount} frame(s) are missing`); } @@ -719,11 +721,18 @@ export async function executeDiskCaptureWithAdaptiveRetry(options: { return attempts; } const remainingCount = countFrameRanges(remaining); + const madeProgress = captureAttemptMadeProgress(frameCount, remainingCount); + if (!madeProgress) { + options.log.warn( + "[Render] Capture attempt made no forward progress; composition is likely structurally broken — not retrying.", + { attempt, frameCount, remainingCount, workers: currentWorkers }, + ); + } if ( !options.allowRetry || currentWorkers <= 1 || !isRecoverableParallelCaptureError(error) || - !captureAttemptMadeProgress(frameCount, remainingCount) + !madeProgress ) { throw error; }