From 4fee52704fa6b276102302e832cdb451eeb4d930 Mon Sep 17 00:00:00 2001
From: d-robotics <d-robotics@d-roboticsdeMacBook-Pro.local>
Date: Sat, 13 Jun 2026 20:04:43 +0800
Subject: [PATCH] fix two tool-retry bugs in the agent execute loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- structured retry leaked stale content: `structuredBlocks` was declared once
  outside the retry loop and only set on a successful executeStructured, never
  reset per attempt. If attempt 0 returned structured blocks + a transient error
  and a later attempt threw (timeout/reset), the call returned the new attempt's
  error text paired with the PREVIOUS attempt's structured blocks. Reset
  structuredBlocks at the top of each attempt.
- retry backoff leaked an abort listener: the abortable backoff registered an
  `abort` listener on the long-lived run signal but only removed it on the abort
  path, so every normally-completing backoff leaked one listener (accumulating
  across retries/tool-calls in a run → MaxListenersExceededWarning). Remove the
  listener on the normal-completion path too.

Found by a parallel bug-hunt over the tool-execution subsystem; both ship with
red-before-green specs. Verified: agent suite 168 files green, boundaries +
hygiene + lint clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../src/core/tools/execute-tool-call.ts       | 21 +++++---
 ...-tool-retry-backoff-listener-leak.spec.mjs | 48 +++++++++++++++++
 .../execute-tool-retry-structured.spec.mjs    | 53 +++++++++++++++++++
 3 files changed, 114 insertions(+), 8 deletions(-)
 create mode 100644 packages/dmoss-agent/test/execute-tool-retry-backoff-listener-leak.spec.mjs
 create mode 100644 packages/dmoss-agent/test/execute-tool-retry-structured.spec.mjs
diff --git a/packages/dmoss-agent/src/core/tools/execute-tool-call.ts b/packages/dmoss-agent/src/core/tools/execute-tool-call.ts
index 929fb3f..94994f3 100644
--- a/packages/dmoss-agent/src/core/tools/execute-tool-call.ts
+++ b/packages/dmoss-agent/src/core/tools/execute-tool-call.ts
@@ -278,6 +278,9 @@ export async function executeOneToolCall(
     let attemptErrFlag = false;
     let attemptText = '';
     let attemptTimeout = false;
+    // Reset per-attempt structured output so a retry that rejects can't surface
+    // a previous attempt's blocks alongside this attempt's error text.
+    structuredBlocks = undefined;
     const timeoutAbortCtrl = new AbortController();
 
     try {
@@ -378,16 +381,18 @@ export async function executeOneToolCall(
         logger.debug(
           `[execute-tool-call] retry #${retriesUsed}/${MAX_RETRY_ATTEMPTS} for ${call.name}(${call.id}) after ${delayMs}ms: ${rawMsg.slice(0, 120)}`,
         );
-        // Abortable backoff — abort immediately cancels the wait
+        // Abortable backoff — abort immediately cancels the wait. The abort
+        // listener must be removed on the normal-completion path too, otherwise
+        // it leaks on the long-lived run signal across every retry.
         let backoffTimer: ReturnType<typeof setTimeout> | undefined;
-        await Promise.race([
-          new Promise<void>((resolve) => { backoffTimer = setTimeout(resolve, delayMs); }),
-          abortable(
-            new Promise<never>(() => {}),
-            deps.abortSignal,
-          ).catch(() => {}),
-        ]);
+        let onBackoffAbort: (() => void) | undefined;
+        await new Promise<void>((resolve) => {
+          backoffTimer = setTimeout(resolve, delayMs);
+          onBackoffAbort = () => resolve();
+          deps.abortSignal.addEventListener('abort', onBackoffAbort, { once: true });
+        });
         if (backoffTimer) clearTimeout(backoffTimer);
+        if (onBackoffAbort) deps.abortSignal.removeEventListener('abort', onBackoffAbort);
         // Re-check abort after backoff; if aborted, break with cancelled path
         if (deps.abortSignal.aborted) {
           aborted = { by: 'user' };
diff --git a/packages/dmoss-agent/test/execute-tool-retry-backoff-listener-leak.spec.mjs b/packages/dmoss-agent/test/execute-tool-retry-backoff-listener-leak.spec.mjs
new file mode 100644
index 0000000..6dee163
--- /dev/null
+++ b/packages/dmoss-agent/test/execute-tool-retry-backoff-listener-leak.spec.mjs
@@ -0,0 +1,48 @@
+import { describe, it } from 'node:test';
+import assert from 'node:assert/strict';
+import { getEventListeners } from 'node:events';
+import { executeOneToolCall } from '../dist/core/tools/execute-tool-call.js';
+
+describe('execute-tool retry backoff listener hygiene', () => {
+  it('does not leak abort listeners on the run signal across retries', async () => {
+    const runAbort = new AbortController();
+    let attempt = 0;
+    const tool = {
+      name: 'always_transient_probe',
+      description: 'always fails transiently to exercise both retry backoffs',
+      metadata: { transientRetry: true },
+      inputSchema: { type: 'object', properties: {} },
+      async execute() {
+        attempt++;
+        throw new Error('connection reset by peer');
+      },
+    };
+
+    const outcome = await executeOneToolCall(
+      { id: 'call-leak', name: 'always_transient_probe', input: {} },
+      {
+        toolsForRun: [tool],
+        toolCtx: { workspaceDir: process.cwd(), sessionKey: 's' },
+        sessionKey: 's',
+        abortSignal: runAbort.signal,
+        toolTimeoutMs: 2_000,
+        enableHeartbeat: false,
+        heartbeatIntervalMs: 1_000,
+        skipHeartbeatToolNames: new Set(),
+        push: () => {},
+      },
+    );
+
+    assert.equal(outcome.kind, 'completed');
+    assert.equal(outcome.isError, true);
+    // Two backoff waits (500ms + 1500ms) ran and completed normally; neither
+    // should leave an abort listener registered on the long-lived run signal.
+    assert.ok(attempt >= 3, 'all retry attempts must have run');
+    const listeners = getEventListeners(runAbort.signal, 'abort').length;
+    assert.equal(
+      listeners,
+      0,
+      `run signal must have no leaked abort listeners, found ${listeners}`,
+    );
+  });
+});
diff --git a/packages/dmoss-agent/test/execute-tool-retry-structured.spec.mjs b/packages/dmoss-agent/test/execute-tool-retry-structured.spec.mjs
new file mode 100644
index 0000000..e81a2ae
--- /dev/null
+++ b/packages/dmoss-agent/test/execute-tool-retry-structured.spec.mjs
@@ -0,0 +1,53 @@
+import { describe, it } from 'node:test';
+import assert from 'node:assert/strict';
+import { executeOneToolCall } from '../dist/core/tools/execute-tool-call.js';
+
+describe('execute-tool transient retry + structured content', () => {
+  it('does not surface stale structuredContent when a retry attempt rejects', async () => {
+    let attempt = 0;
+    const tool = {
+      name: 'flaky_structured_probe',
+      description: 'structured tool that errors transiently then rejects',
+      metadata: { transientRetry: true, sideEffectClass: 'readonly' },
+      inputSchema: { type: 'object', properties: {} },
+      async executeStructured() {
+        attempt++;
+        if (attempt === 1) {
+          // First attempt: structured isError with a transient message -> eligible for retry
+          return {
+            content: [{ type: 'text', text: 'connection reset by peer' }],
+            isError: true,
+          };
+        }
+        // Retry attempt: reject (no structured content produced)
+        throw new Error('connection reset by peer');
+      },
+    };
+
+    const outcome = await executeOneToolCall(
+      { id: 'call-flaky', name: 'flaky_structured_probe', input: {} },
+      {
+        toolsForRun: [tool],
+        toolCtx: { workspaceDir: process.cwd(), sessionKey: 's' },
+        sessionKey: 's',
+        abortSignal: new AbortController().signal,
+        toolTimeoutMs: 2_000,
+        enableHeartbeat: false,
+        heartbeatIntervalMs: 1_000,
+        skipHeartbeatToolNames: new Set(),
+        push: () => {},
+      },
+    );
+
+    assert.equal(outcome.kind, 'completed');
+    assert.equal(outcome.isError, true);
+    // The retry rejected and produced no structured content, so the result must
+    // not carry the first attempt's blocks alongside the retry's error text.
+    assert.equal(
+      outcome.structuredContent,
+      undefined,
+      'stale structuredContent from a prior attempt must not be returned',
+    );
+    assert.ok(attempt >= 2, 'test must exercise the retry path');
+  });
+});