From 4fee52704fa6b276102302e832cdb451eeb4d930 Mon Sep 17 00:00:00 2001 From: d-robotics Date: Sat, 13 Jun 2026 20:04:43 +0800 Subject: [PATCH] fix two tool-retry bugs in the agent execute loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - structured retry leaked stale content: `structuredBlocks` was declared once outside the retry loop and only set on a successful executeStructured, never reset per attempt. If attempt 0 returned structured blocks + a transient error and a later attempt threw (timeout/reset), the call returned the new attempt's error text paired with the PREVIOUS attempt's structured blocks. Reset structuredBlocks at the top of each attempt. - retry backoff leaked an abort listener: the abortable backoff registered an `abort` listener on the long-lived run signal but only removed it on the abort path, so every normally-completing backoff leaked one listener (accumulating across retries/tool-calls in a run → MaxListenersExceededWarning). Remove the listener on the normal-completion path too. Found by a parallel bug-hunt over the tool-execution subsystem; both ship with red-before-green specs. Verified: agent suite 168 files green, boundaries + hygiene + lint clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/core/tools/execute-tool-call.ts | 21 +++++--- ...-tool-retry-backoff-listener-leak.spec.mjs | 48 +++++++++++++++++ .../execute-tool-retry-structured.spec.mjs | 53 +++++++++++++++++++ 3 files changed, 114 insertions(+), 8 deletions(-) create mode 100644 packages/dmoss-agent/test/execute-tool-retry-backoff-listener-leak.spec.mjs create mode 100644 packages/dmoss-agent/test/execute-tool-retry-structured.spec.mjs diff --git a/packages/dmoss-agent/src/core/tools/execute-tool-call.ts b/packages/dmoss-agent/src/core/tools/execute-tool-call.ts index 929fb3f..94994f3 100644 --- a/packages/dmoss-agent/src/core/tools/execute-tool-call.ts +++ b/packages/dmoss-agent/src/core/tools/execute-tool-call.ts @@ -278,6 +278,9 @@ export async function executeOneToolCall( let attemptErrFlag = false; let attemptText = ''; let attemptTimeout = false; + // Reset per-attempt structured output so a retry that rejects can't surface + // a previous attempt's blocks alongside this attempt's error text. + structuredBlocks = undefined; const timeoutAbortCtrl = new AbortController(); try { @@ -378,16 +381,18 @@ export async function executeOneToolCall( logger.debug( `[execute-tool-call] retry #${retriesUsed}/${MAX_RETRY_ATTEMPTS} for ${call.name}(${call.id}) after ${delayMs}ms: ${rawMsg.slice(0, 120)}`, ); - // Abortable backoff — abort immediately cancels the wait + // Abortable backoff — abort immediately cancels the wait. The abort + // listener must be removed on the normal-completion path too, otherwise + // it leaks on the long-lived run signal across every retry. let backoffTimer: ReturnType | undefined; - await Promise.race([ - new Promise((resolve) => { backoffTimer = setTimeout(resolve, delayMs); }), - abortable( - new Promise(() => {}), - deps.abortSignal, - ).catch(() => {}), - ]); + let onBackoffAbort: (() => void) | undefined; + await new Promise((resolve) => { + backoffTimer = setTimeout(resolve, delayMs); + onBackoffAbort = () => resolve(); + deps.abortSignal.addEventListener('abort', onBackoffAbort, { once: true }); + }); if (backoffTimer) clearTimeout(backoffTimer); + if (onBackoffAbort) deps.abortSignal.removeEventListener('abort', onBackoffAbort); // Re-check abort after backoff; if aborted, break with cancelled path if (deps.abortSignal.aborted) { aborted = { by: 'user' }; diff --git a/packages/dmoss-agent/test/execute-tool-retry-backoff-listener-leak.spec.mjs b/packages/dmoss-agent/test/execute-tool-retry-backoff-listener-leak.spec.mjs new file mode 100644 index 0000000..6dee163 --- /dev/null +++ b/packages/dmoss-agent/test/execute-tool-retry-backoff-listener-leak.spec.mjs @@ -0,0 +1,48 @@ +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; +import { getEventListeners } from 'node:events'; +import { executeOneToolCall } from '../dist/core/tools/execute-tool-call.js'; + +describe('execute-tool retry backoff listener hygiene', () => { + it('does not leak abort listeners on the run signal across retries', async () => { + const runAbort = new AbortController(); + let attempt = 0; + const tool = { + name: 'always_transient_probe', + description: 'always fails transiently to exercise both retry backoffs', + metadata: { transientRetry: true }, + inputSchema: { type: 'object', properties: {} }, + async execute() { + attempt++; + throw new Error('connection reset by peer'); + }, + }; + + const outcome = await executeOneToolCall( + { id: 'call-leak', name: 'always_transient_probe', input: {} }, + { + toolsForRun: [tool], + toolCtx: { workspaceDir: process.cwd(), sessionKey: 's' }, + sessionKey: 's', + abortSignal: runAbort.signal, + toolTimeoutMs: 2_000, + enableHeartbeat: false, + heartbeatIntervalMs: 1_000, + skipHeartbeatToolNames: new Set(), + push: () => {}, + }, + ); + + assert.equal(outcome.kind, 'completed'); + assert.equal(outcome.isError, true); + // Two backoff waits (500ms + 1500ms) ran and completed normally; neither + // should leave an abort listener registered on the long-lived run signal. + assert.ok(attempt >= 3, 'all retry attempts must have run'); + const listeners = getEventListeners(runAbort.signal, 'abort').length; + assert.equal( + listeners, + 0, + `run signal must have no leaked abort listeners, found ${listeners}`, + ); + }); +}); diff --git a/packages/dmoss-agent/test/execute-tool-retry-structured.spec.mjs b/packages/dmoss-agent/test/execute-tool-retry-structured.spec.mjs new file mode 100644 index 0000000..e81a2ae --- /dev/null +++ b/packages/dmoss-agent/test/execute-tool-retry-structured.spec.mjs @@ -0,0 +1,53 @@ +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; +import { executeOneToolCall } from '../dist/core/tools/execute-tool-call.js'; + +describe('execute-tool transient retry + structured content', () => { + it('does not surface stale structuredContent when a retry attempt rejects', async () => { + let attempt = 0; + const tool = { + name: 'flaky_structured_probe', + description: 'structured tool that errors transiently then rejects', + metadata: { transientRetry: true, sideEffectClass: 'readonly' }, + inputSchema: { type: 'object', properties: {} }, + async executeStructured() { + attempt++; + if (attempt === 1) { + // First attempt: structured isError with a transient message -> eligible for retry + return { + content: [{ type: 'text', text: 'connection reset by peer' }], + isError: true, + }; + } + // Retry attempt: reject (no structured content produced) + throw new Error('connection reset by peer'); + }, + }; + + const outcome = await executeOneToolCall( + { id: 'call-flaky', name: 'flaky_structured_probe', input: {} }, + { + toolsForRun: [tool], + toolCtx: { workspaceDir: process.cwd(), sessionKey: 's' }, + sessionKey: 's', + abortSignal: new AbortController().signal, + toolTimeoutMs: 2_000, + enableHeartbeat: false, + heartbeatIntervalMs: 1_000, + skipHeartbeatToolNames: new Set(), + push: () => {}, + }, + ); + + assert.equal(outcome.kind, 'completed'); + assert.equal(outcome.isError, true); + // The retry rejected and produced no structured content, so the result must + // not carry the first attempt's blocks alongside the retry's error text. + assert.equal( + outcome.structuredContent, + undefined, + 'stale structuredContent from a prior attempt must not be returned', + ); + assert.ok(attempt >= 2, 'test must exercise the retry path'); + }); +});