docker · derekmisler · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/action.yml b/action.yml
@@ -71,6 +71,10 @@ inputs:
     description: "Base delay in seconds between retries (doubles each attempt)"
     required: false
     default: "5"
+  retry-on-timeout:
+    description: "Number of additional retry attempts when the agent times out (exit code 124). Independent of max-retries — both budgets can be consumed in the same run. Default 0 = no timeout retries."
+    required: false
+    default: "0"
   extra-args:
     description: "Additional arguments to pass to docker agent run"
     required: false

diff --git a/review-pr/action.yml b/review-pr/action.yml
@@ -777,6 +777,7 @@ runs:
         extra-args: ${{ inputs.model && format('--model={0}', inputs.model) || '' }}
         add-prompt-files: ${{ inputs.add-prompt-files }}
         max-retries: "1"  # One retry handles transient API failures (e.g. Anthropic 400s) without risking duplicate reviews from full pipeline restarts
+        retry-on-timeout: "1"  # Retry once on timeout — agent may succeed on a second pass after a transient infra hiccup
         skip-summary: "true"
         org-membership-token: ${{ inputs.org-membership-token }}
         auth-org: ${{ inputs.auth-org }}
@@ -848,8 +849,9 @@ runs:
           else
             TIMEOUT_BODY="⏱️ **PR Review Timed Out** — The review agent hit the 2700 s time limit. This usually happens on large or complex diffs. Re-request a review from \`docker-agent\` to retry — if it times out again, consider splitting the PR into smaller pieces."
           fi
-          if ! gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \
-            -f body="$TIMEOUT_BODY" \
+          if ! jq -n --arg body "$TIMEOUT_BODY" --arg event "COMMENT" \
+            '{body: $body, event: $event, comments: []}' \
+          | gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" --input - \
             2>&1; then
             echo "::warning::Failed to post timeout comment to PR"
           fi
@@ -862,8 +864,9 @@ runs:
             STATUS="⚠️ **Review completed with warnings** (exit code: $EXIT_CODE)"
           else
             STATUS="❌ **Review failed** (exit code: $EXIT_CODE)"
-            if ! gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \
-              -f body="❌ **PR Review Failed** — The review agent encountered an error and could not complete the review. [View logs]($RUN_URL)." \
+            if ! jq -n --arg body "❌ **PR Review Failed** — The review agent encountered an error and could not complete the review. [View logs]($RUN_URL)." --arg event "COMMENT" \
+              '{body: $body, event: $event, comments: []}' \
+            | gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" --input - \
               2>&1; then
               echo "::warning::Failed to post fallback comment to PR"
             fi
@@ -873,17 +876,24 @@ runs:
           # Defense-in-depth: if the log exists but no review was posted, post a fallback LGTM comment.
           # This guards against the agent exiting 0 without calling gh api (e.g., zero-findings early exit).
           if [ -n "$VERBOSE_LOG_FILE" ] && [ -f "$VERBOSE_LOG_FILE" ] && ! grep -qE 'pullrequestreview-[0-9]+' "$VERBOSE_LOG_FILE" 2>/dev/null; then
-            # Dedup guard: skip posting if an identical fallback comment already exists.
-            EXISTING=$(gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \
-              --jq '[.[] | select(.body | startswith("🟢 **No issues found**"))] | length' \
+            # Dedup guard: skip posting if an identical fallback PR review already exists.
+            # Also check the old issue-comment endpoint for a one-time migration guard
+            # (prior runs posted to /issues/comments; this prevents a duplicate on the first
+            # run of this new code against PRs that already received an LGTM issue comment).
+            EXISTING=$(gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" \
+              --jq '[.[] | select((.body // "") | startswith("🟢 **No issues found**"))] | length' \
               2>/dev/null || echo "0")
-            if [ "${EXISTING:-0}" -gt 0 ]; then
-              echo "ℹ️  Fallback LGTM comment already exists — skipping duplicate post"
+            EXISTING_OLD=$(gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \
+              --jq '[.[] | select((.body // "") | startswith("🟢 **No issues found**"))] | length' \
+              2>/dev/null || echo "0")
+            if [ "${EXISTING:-0}" -gt 0 ] || [ "${EXISTING_OLD:-0}" -gt 0 ]; then
+              echo "ℹ️  Fallback LGTM review already exists — skipping duplicate post"
             else
-              echo "::warning::Agent exited 0 but no review was posted — posting fallback LGTM comment"
-              gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \
-                -f body="🟢 **No issues found** — LGTM! [View logs]($RUN_URL)." 2>&1 || \
-                echo "::warning::Failed to post fallback LGTM comment to PR"
+              echo "::warning::Agent exited 0 but no review was posted — posting fallback LGTM review"
+              jq -n --arg body "🟢 **No issues found** — LGTM! [View logs]($RUN_URL)." --arg event "COMMENT" \
+                '{body: $body, event: $event, comments: []}' \
+              | gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" --input - 2>&1 || \
+                echo "::warning::Failed to post fallback LGTM review to PR"
             fi
           fi
         fi

diff --git a/src/main/__tests__/exec.test.ts b/src/main/__tests__/exec.test.ts
@@ -69,6 +69,7 @@ function baseOpts(overrides: Partial<Parameters<typeof runAgent>[0]> = {}) {
     timeout: 0,
     maxRetries: 0,
     retryDelay: 0,
+    retryOnTimeout: 0,
     debug: false,
     anthropicApiKey: 'sk-ant-test',
     telemetryTags: 'source=test',
@@ -248,17 +249,55 @@ describe('runAgent', () => {
     expect(writtenData.toString()).toContain('Raw prompt');
   });
 
-  it('returns TIMEOUT_EXIT_CODE (124) without retrying on timeout', async () => {
+  it('returns TIMEOUT_EXIT_CODE (124) without retrying on timeout when retryOnTimeout=0', async () => {
     // Return 124 (our timeout sentinel — simulate the timer firing)
     mockSpawn.mockReturnValue(makeMockChild(TIMEOUT_EXIT_CODE));
 
-    const result = await runAgent(baseOpts({ timeout: 5, maxRetries: 3 }));
+    const result = await runAgent(baseOpts({ timeout: 5, maxRetries: 3, retryOnTimeout: 0 }));
 
     expect(result.exitCode).toBe(TIMEOUT_EXIT_CODE);
     // Only spawned once — no retries after timeout
     expect(mockSpawn).toHaveBeenCalledOnce();
   });
 
+  it('retries once on timeout when retryOnTimeout=1, succeeds on retry', async () => {
+    mockSpawn
+      .mockImplementationOnce(() => makeMockChild(TIMEOUT_EXIT_CODE))
+      .mockImplementation(() => makeMockChild(0));
+
+    const result = await runAgent(baseOpts({ maxRetries: 0, retryDelay: 0, retryOnTimeout: 1 }));
+
+    expect(result.exitCode).toBe(0);
+    // First attempt timed out, second attempt succeeded
+    expect(mockSpawn).toHaveBeenCalledTimes(2);
+  });
+
+  it('stops retrying on timeout after retryOnTimeout attempts', async () => {
+    mockSpawn.mockImplementation(() => makeMockChild(TIMEOUT_EXIT_CODE));
+
+    const result = await runAgent(baseOpts({ maxRetries: 0, retryDelay: 0, retryOnTimeout: 1 }));
+
+    expect(result.exitCode).toBe(TIMEOUT_EXIT_CODE);
+    // 1 initial + 1 timeout retry = 2 total
+    expect(mockSpawn).toHaveBeenCalledTimes(2);
+  });
+
+  it('timeout retry budget is independent of failure retry budget (mixed sequence)', async () => {
+    // Attempt 1: normal failure — consumes one failureRetryCount slot
+    // Attempt 2: timeout — should still get its own retryOnTimeout=1 slot
+    // Attempt 3: success
+    mockSpawn
+      .mockImplementationOnce(() => makeMockChild(1)) // failure
+      .mockImplementationOnce(() => makeMockChild(TIMEOUT_EXIT_CODE)) // timeout
+      .mockImplementation(() => makeMockChild(0)); // success
+
+    const result = await runAgent(baseOpts({ maxRetries: 1, retryDelay: 0, retryOnTimeout: 1 }));
+
+    expect(result.exitCode).toBe(0);
+    // failure retry + timeout retry + final success = 3 spawns
+    expect(mockSpawn).toHaveBeenCalledTimes(3);
+  });
+
   it('kills process with SIGTERM when timeout fires (FIX D)', async () => {
     // Child exits naturally in 5 s; action timeout is 50 ms.
     // The real timer path fires SIGTERM, then the child is killed.

diff --git a/src/main/exec.ts b/src/main/exec.ts
@@ -47,6 +47,8 @@ export interface RunAgentOptions {
   maxRetries: number;
   /** Base delay between retries in seconds (doubles each attempt). */
   retryDelay: number;
+  /** Number of additional retry attempts allowed when the agent times out (exit 124). */
+  retryOnTimeout: number;
   /** Whether debug mode is enabled. */
   debug: boolean;
 
@@ -276,23 +278,30 @@ export async function runAgent(opts: RunAgentOptions): Promise<RunAgentResult> {
 
   const startTime = Date.now();
   let exitCode = 1;
-  let attempt = 0;
+  let totalAttempt = 0;
+  // Track the two retry budgets independently so mixed-failure sequences
+  // (e.g. a non-timeout failure followed by a timeout) consume only from
+  // the correct budget and never silently exhaust the other.
+  let failureRetryCount = 0;
+  let timeoutRetryCount = 0;
   let currentDelay = opts.retryDelay;
 
   while (true) {
-    attempt++;
-
-    if (attempt > 1) {
-      core.info(
-        `🔄 Retry attempt ${attempt - 1} of ${opts.maxRetries} (waiting ${currentDelay}s)...`,
-      );
+    totalAttempt++;
+
+    if (totalAttempt > 1) {
+      const retryLabel =
+        exitCode === TIMEOUT_EXIT_CODE
+          ? `Timeout retry ${timeoutRetryCount} of ${opts.retryOnTimeout}`
+          : `Retry ${failureRetryCount} of ${opts.maxRetries}`;
+      core.info(`🔄 ${retryLabel} (waiting ${currentDelay}s)...`);
       await sleep(currentDelay);
       currentDelay *= 2;
 
       // Reset verbose log separator for retry
       const separator = [
         '',
-        `========== RETRY ATTEMPT ${attempt} (${new Date().toISOString()}) ==========`,
+        `========== RETRY ATTEMPT ${totalAttempt} (${new Date().toISOString()}) ==========`,
         '',
       ].join(os.EOL);
       fs.appendFileSync(opts.verboseLogFile, separator, 'utf-8');
@@ -320,15 +329,22 @@ export async function runAgent(opts: RunAgentOptions): Promise<RunAgentResult> {
 
     if (exitCode === TIMEOUT_EXIT_CODE) {
       core.error(`Agent execution timed out after ${opts.timeout} seconds`);
-      break; // Don't retry timeouts
-    }
-
-    if (attempt > opts.maxRetries) {
-      core.warning(`Agent failed after ${opts.maxRetries} retries (exit code: ${exitCode})`);
-      break;
+      if (timeoutRetryCount >= opts.retryOnTimeout) {
+        break; // Timeout retry budget exhausted
+      }
+      timeoutRetryCount++;
+      core.warning(
+        `Timeout — will retry (${timeoutRetryCount}/${opts.retryOnTimeout} timeout retries used)`,
+      );
+      // fall through to retry
+    } else {
+      if (failureRetryCount >= opts.maxRetries) {
+        core.warning(`Agent failed after ${opts.maxRetries} retries (exit code: ${exitCode})`);
+        break;
+      }
+      failureRetryCount++;
+      core.warning(`Agent failed (exit code: ${exitCode}), will retry...`);
     }
-
-    core.warning(`Agent failed (exit code: ${exitCode}), will retry...`);
   }
 
   const executionTime = Math.round((Date.now() - startTime) / 1000);

diff --git a/src/main/index.ts b/src/main/index.ts
@@ -217,6 +217,7 @@ async function run(): Promise<void> {
     const timeout = parseInt(core.getInput('timeout') || '0', 10);
     const maxRetries = parseInt(core.getInput('max-retries') || '2', 10);
     const retryDelay = parseInt(core.getInput('retry-delay') || '5', 10);
+    const retryOnTimeout = parseInt(core.getInput('retry-on-timeout') || '0', 10);
     const yolo = core.getBooleanInput('yolo');
     const workingDirectory = core.getInput('working-directory') || '.';
     const extraArgs = core.getInput('extra-args');
@@ -342,6 +343,7 @@ async function run(): Promise<void> {
       timeout,
       maxRetries,
       retryDelay,
+      retryOnTimeout,
       debug,
       anthropicApiKey,
       openaiApiKey,