From e45609a8da8b94d867f0136b87b5cb13613cce2c Mon Sep 17 00:00:00 2001 From: Derek Misler Date: Thu, 25 Jun 2026 12:12:30 +0000 Subject: [PATCH 1/4] feat: retry on timeout + post fallbacks as PR reviews MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change 1 — retry-on-timeout: - Add retry-on-timeout input (default 0) to root action.yml - exec.ts: when exit 124, break only if retryOnTimeout budget exhausted; otherwise fall through to retry loop - index.ts: read and pass retryOnTimeout to runAgent() - review-pr/action.yml: set retry-on-timeout: "1" on Run PR Review step so a timed-out review gets one automatic retry - Add two new unit tests: retry succeeds on second attempt; stops after budget Change 2 — fallback comments → PR reviews: - Post all three fallback paths (timeout / failure / defense-in-depth LGTM) via POST /pulls/{pr}/reviews with event:COMMENT instead of POST /issues/{pr}/comments — enables GitHub's re-request review button - Update dedup guard to query /pulls/{pr}/reviews instead of /issues/comments --- action.yml | 4 ++++ review-pr/action.yml | 26 +++++++++++++++----------- src/main/__tests__/exec.test.ts | 27 +++++++++++++++++++++++++-- src/main/exec.ts | 18 ++++++++++++------ src/main/index.ts | 2 ++ 5 files changed, 58 insertions(+), 19 deletions(-) diff --git a/action.yml b/action.yml index ddebf68..5784732 100644 --- a/action.yml +++ b/action.yml @@ -71,6 +71,10 @@ inputs: description: "Base delay in seconds between retries (doubles each attempt)" required: false default: "5" + retry-on-timeout: + description: "Number of additional retry attempts when the agent times out (exit code 124). Default 0 = no timeout retries." + required: false + default: "0" extra-args: description: "Additional arguments to pass to docker agent run" required: false diff --git a/review-pr/action.yml b/review-pr/action.yml index 1867584..9b2c8fe 100644 --- a/review-pr/action.yml +++ b/review-pr/action.yml @@ -777,6 +777,7 @@ runs: extra-args: ${{ inputs.model && format('--model={0}', inputs.model) || '' }} add-prompt-files: ${{ inputs.add-prompt-files }} max-retries: "1" # One retry handles transient API failures (e.g. Anthropic 400s) without risking duplicate reviews from full pipeline restarts + retry-on-timeout: "1" # Retry once on timeout — agent may succeed on a second pass after a transient infra hiccup skip-summary: "true" org-membership-token: ${{ inputs.org-membership-token }} auth-org: ${{ inputs.auth-org }} @@ -848,8 +849,9 @@ runs: else TIMEOUT_BODY="⏱️ **PR Review Timed Out** — The review agent hit the 2700 s time limit. This usually happens on large or complex diffs. Re-request a review from \`docker-agent\` to retry — if it times out again, consider splitting the PR into smaller pieces." fi - if ! gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \ - -f body="$TIMEOUT_BODY" \ + if ! jq -n --arg body "$TIMEOUT_BODY" --arg event "COMMENT" \ + '{body: $body, event: $event, comments: []}' \ + | gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" --input - \ 2>&1; then echo "::warning::Failed to post timeout comment to PR" fi @@ -862,8 +864,9 @@ runs: STATUS="⚠️ **Review completed with warnings** (exit code: $EXIT_CODE)" else STATUS="❌ **Review failed** (exit code: $EXIT_CODE)" - if ! gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \ - -f body="❌ **PR Review Failed** — The review agent encountered an error and could not complete the review. [View logs]($RUN_URL)." \ + if ! jq -n --arg body "❌ **PR Review Failed** — The review agent encountered an error and could not complete the review. [View logs]($RUN_URL)." --arg event "COMMENT" \ + '{body: $body, event: $event, comments: []}' \ + | gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" --input - \ 2>&1; then echo "::warning::Failed to post fallback comment to PR" fi @@ -873,17 +876,18 @@ runs: # Defense-in-depth: if the log exists but no review was posted, post a fallback LGTM comment. # This guards against the agent exiting 0 without calling gh api (e.g., zero-findings early exit). if [ -n "$VERBOSE_LOG_FILE" ] && [ -f "$VERBOSE_LOG_FILE" ] && ! grep -qE 'pullrequestreview-[0-9]+' "$VERBOSE_LOG_FILE" 2>/dev/null; then - # Dedup guard: skip posting if an identical fallback comment already exists. - EXISTING=$(gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \ + # Dedup guard: skip posting if an identical fallback PR review already exists. + EXISTING=$(gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" \ --jq '[.[] | select(.body | startswith("🟢 **No issues found**"))] | length' \ 2>/dev/null || echo "0") if [ "${EXISTING:-0}" -gt 0 ]; then - echo "ℹ️ Fallback LGTM comment already exists — skipping duplicate post" + echo "ℹ️ Fallback LGTM review already exists — skipping duplicate post" else - echo "::warning::Agent exited 0 but no review was posted — posting fallback LGTM comment" - gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \ - -f body="🟢 **No issues found** — LGTM! [View logs]($RUN_URL)." 2>&1 || \ - echo "::warning::Failed to post fallback LGTM comment to PR" + echo "::warning::Agent exited 0 but no review was posted — posting fallback LGTM review" + jq -n --arg body "🟢 **No issues found** — LGTM! [View logs]($RUN_URL)." --arg event "COMMENT" \ + '{body: $body, event: $event, comments: []}' \ + | gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" --input - 2>&1 || \ + echo "::warning::Failed to post fallback LGTM review to PR" fi fi fi diff --git a/src/main/__tests__/exec.test.ts b/src/main/__tests__/exec.test.ts index 6bbe453..5552cc8 100644 --- a/src/main/__tests__/exec.test.ts +++ b/src/main/__tests__/exec.test.ts @@ -69,6 +69,7 @@ function baseOpts(overrides: Partial[0]> = {}) { timeout: 0, maxRetries: 0, retryDelay: 0, + retryOnTimeout: 0, debug: false, anthropicApiKey: 'sk-ant-test', telemetryTags: 'source=test', @@ -248,17 +249,39 @@ describe('runAgent', () => { expect(writtenData.toString()).toContain('Raw prompt'); }); - it('returns TIMEOUT_EXIT_CODE (124) without retrying on timeout', async () => { + it('returns TIMEOUT_EXIT_CODE (124) without retrying on timeout when retryOnTimeout=0', async () => { // Return 124 (our timeout sentinel — simulate the timer firing) mockSpawn.mockReturnValue(makeMockChild(TIMEOUT_EXIT_CODE)); - const result = await runAgent(baseOpts({ timeout: 5, maxRetries: 3 })); + const result = await runAgent(baseOpts({ timeout: 5, maxRetries: 3, retryOnTimeout: 0 })); expect(result.exitCode).toBe(TIMEOUT_EXIT_CODE); // Only spawned once — no retries after timeout expect(mockSpawn).toHaveBeenCalledOnce(); }); + it('retries once on timeout when retryOnTimeout=1, succeeds on retry', async () => { + mockSpawn + .mockImplementationOnce(() => makeMockChild(TIMEOUT_EXIT_CODE)) + .mockImplementation(() => makeMockChild(0)); + + const result = await runAgent(baseOpts({ maxRetries: 0, retryDelay: 0, retryOnTimeout: 1 })); + + expect(result.exitCode).toBe(0); + // First attempt timed out, second attempt succeeded + expect(mockSpawn).toHaveBeenCalledTimes(2); + }); + + it('stops retrying on timeout after retryOnTimeout attempts', async () => { + mockSpawn.mockImplementation(() => makeMockChild(TIMEOUT_EXIT_CODE)); + + const result = await runAgent(baseOpts({ maxRetries: 0, retryDelay: 0, retryOnTimeout: 1 })); + + expect(result.exitCode).toBe(TIMEOUT_EXIT_CODE); + // 1 initial + 1 timeout retry = 2 total + expect(mockSpawn).toHaveBeenCalledTimes(2); + }); + it('kills process with SIGTERM when timeout fires (FIX D)', async () => { // Child exits naturally in 5 s; action timeout is 50 ms. // The real timer path fires SIGTERM, then the child is killed. diff --git a/src/main/exec.ts b/src/main/exec.ts index 807fa29..4336828 100644 --- a/src/main/exec.ts +++ b/src/main/exec.ts @@ -47,6 +47,8 @@ export interface RunAgentOptions { maxRetries: number; /** Base delay between retries in seconds (doubles each attempt). */ retryDelay: number; + /** Number of additional retry attempts allowed when the agent times out (exit 124). */ + retryOnTimeout: number; /** Whether debug mode is enabled. */ debug: boolean; @@ -320,15 +322,19 @@ export async function runAgent(opts: RunAgentOptions): Promise { if (exitCode === TIMEOUT_EXIT_CODE) { core.error(`Agent execution timed out after ${opts.timeout} seconds`); - break; // Don't retry timeouts - } - - if (attempt > opts.maxRetries) { + if (opts.retryOnTimeout <= 0 || attempt > opts.retryOnTimeout) { + break; // No more timeout retries + } + core.warning( + `Timeout on attempt ${attempt} — will retry (${attempt}/${opts.retryOnTimeout} timeout retries used)`, + ); + // fall through to shared retry path below + } else if (attempt > opts.maxRetries) { core.warning(`Agent failed after ${opts.maxRetries} retries (exit code: ${exitCode})`); break; + } else { + core.warning(`Agent failed (exit code: ${exitCode}), will retry...`); } - - core.warning(`Agent failed (exit code: ${exitCode}), will retry...`); } const executionTime = Math.round((Date.now() - startTime) / 1000); diff --git a/src/main/index.ts b/src/main/index.ts index 71382ef..c942fc7 100644 --- a/src/main/index.ts +++ b/src/main/index.ts @@ -217,6 +217,7 @@ async function run(): Promise { const timeout = parseInt(core.getInput('timeout') || '0', 10); const maxRetries = parseInt(core.getInput('max-retries') || '2', 10); const retryDelay = parseInt(core.getInput('retry-delay') || '5', 10); + const retryOnTimeout = parseInt(core.getInput('retry-on-timeout') || '0', 10); const yolo = core.getBooleanInput('yolo'); const workingDirectory = core.getInput('working-directory') || '.'; const extraArgs = core.getInput('extra-args'); @@ -342,6 +343,7 @@ async function run(): Promise { timeout, maxRetries, retryDelay, + retryOnTimeout, debug, anthropicApiKey, openaiApiKey, From 21eb0a7e6604109c88ca8d56cf253d151ded2a96 Mon Sep 17 00:00:00 2001 From: Derek Misler Date: Thu, 25 Jun 2026 12:19:32 +0000 Subject: [PATCH 2/4] fix: independent retry counters + dedup guard migration safety MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit exec.ts: replace shared attempt counter with separate failureRetryCount and timeoutRetryCount so mixed-failure sequences (e.g. a non-timeout failure followed by a timeout) consume only from the correct budget. The previous code used attempt > retryOnTimeout which silently exhausted the timeout budget when a prior failure had already advanced attempt. Add a test covering the mixed failure→timeout→success sequence. review-pr/action.yml: extend the LGTM dedup guard to also check the old issues/comments endpoint alongside the new pulls/reviews endpoint, so the first run of this code against a PR that previously received an LGTM issue comment does not post a duplicate review. --- review-pr/action.yml | 8 +++++++- src/main/__tests__/exec.test.ts | 18 ++++++++++++++++++ src/main/exec.ts | 32 ++++++++++++++++++++------------ 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/review-pr/action.yml b/review-pr/action.yml index 9b2c8fe..67970f4 100644 --- a/review-pr/action.yml +++ b/review-pr/action.yml @@ -877,10 +877,16 @@ runs: # This guards against the agent exiting 0 without calling gh api (e.g., zero-findings early exit). if [ -n "$VERBOSE_LOG_FILE" ] && [ -f "$VERBOSE_LOG_FILE" ] && ! grep -qE 'pullrequestreview-[0-9]+' "$VERBOSE_LOG_FILE" 2>/dev/null; then # Dedup guard: skip posting if an identical fallback PR review already exists. + # Also check the old issue-comment endpoint for a one-time migration guard + # (prior runs posted to /issues/comments; this prevents a duplicate on the first + # run of this new code against PRs that already received an LGTM issue comment). EXISTING=$(gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" \ --jq '[.[] | select(.body | startswith("🟢 **No issues found**"))] | length' \ 2>/dev/null || echo "0") - if [ "${EXISTING:-0}" -gt 0 ]; then + EXISTING_OLD=$(gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \ + --jq '[.[] | select(.body | startswith("🟢 **No issues found**"))] | length' \ + 2>/dev/null || echo "0") + if [ "${EXISTING:-0}" -gt 0 ] || [ "${EXISTING_OLD:-0}" -gt 0 ]; then echo "ℹ️ Fallback LGTM review already exists — skipping duplicate post" else echo "::warning::Agent exited 0 but no review was posted — posting fallback LGTM review" diff --git a/src/main/__tests__/exec.test.ts b/src/main/__tests__/exec.test.ts index 5552cc8..28dd037 100644 --- a/src/main/__tests__/exec.test.ts +++ b/src/main/__tests__/exec.test.ts @@ -282,6 +282,24 @@ describe('runAgent', () => { expect(mockSpawn).toHaveBeenCalledTimes(2); }); + it('timeout retry budget is independent of failure retry budget (mixed sequence)', async () => { + // Attempt 1: normal failure — consumes one failureRetryCount slot + // Attempt 2: timeout — should still get its own retryOnTimeout=1 slot + // Attempt 3: success + mockSpawn + .mockImplementationOnce(() => makeMockChild(1)) // failure + .mockImplementationOnce(() => makeMockChild(TIMEOUT_EXIT_CODE)) // timeout + .mockImplementation(() => makeMockChild(0)); // success + + const result = await runAgent( + baseOpts({ maxRetries: 1, retryDelay: 0, retryOnTimeout: 1 }), + ); + + expect(result.exitCode).toBe(0); + // failure retry + timeout retry + final success = 3 spawns + expect(mockSpawn).toHaveBeenCalledTimes(3); + }); + it('kills process with SIGTERM when timeout fires (FIX D)', async () => { // Child exits naturally in 5 s; action timeout is 50 ms. // The real timer path fires SIGTERM, then the child is killed. diff --git a/src/main/exec.ts b/src/main/exec.ts index 4336828..a7ba717 100644 --- a/src/main/exec.ts +++ b/src/main/exec.ts @@ -278,15 +278,20 @@ export async function runAgent(opts: RunAgentOptions): Promise { const startTime = Date.now(); let exitCode = 1; - let attempt = 0; + let totalAttempt = 0; + // Track the two retry budgets independently so mixed-failure sequences + // (e.g. a non-timeout failure followed by a timeout) consume only from + // the correct budget and never silently exhaust the other. + let failureRetryCount = 0; + let timeoutRetryCount = 0; let currentDelay = opts.retryDelay; while (true) { - attempt++; + totalAttempt++; - if (attempt > 1) { + if (totalAttempt > 1) { core.info( - `🔄 Retry attempt ${attempt - 1} of ${opts.maxRetries} (waiting ${currentDelay}s)...`, + `🔄 Retry attempt ${totalAttempt - 1} (waiting ${currentDelay}s)...`, ); await sleep(currentDelay); currentDelay *= 2; @@ -294,7 +299,7 @@ export async function runAgent(opts: RunAgentOptions): Promise { // Reset verbose log separator for retry const separator = [ '', - `========== RETRY ATTEMPT ${attempt} (${new Date().toISOString()}) ==========`, + `========== RETRY ATTEMPT ${totalAttempt} (${new Date().toISOString()}) ==========`, '', ].join(os.EOL); fs.appendFileSync(opts.verboseLogFile, separator, 'utf-8'); @@ -322,17 +327,20 @@ export async function runAgent(opts: RunAgentOptions): Promise { if (exitCode === TIMEOUT_EXIT_CODE) { core.error(`Agent execution timed out after ${opts.timeout} seconds`); - if (opts.retryOnTimeout <= 0 || attempt > opts.retryOnTimeout) { - break; // No more timeout retries + if (timeoutRetryCount >= opts.retryOnTimeout) { + break; // Timeout retry budget exhausted } + timeoutRetryCount++; core.warning( - `Timeout on attempt ${attempt} — will retry (${attempt}/${opts.retryOnTimeout} timeout retries used)`, + `Timeout — will retry (${timeoutRetryCount}/${opts.retryOnTimeout} timeout retries used)`, ); - // fall through to shared retry path below - } else if (attempt > opts.maxRetries) { - core.warning(`Agent failed after ${opts.maxRetries} retries (exit code: ${exitCode})`); - break; + // fall through to retry } else { + if (failureRetryCount >= opts.maxRetries) { + core.warning(`Agent failed after ${opts.maxRetries} retries (exit code: ${exitCode})`); + break; + } + failureRetryCount++; core.warning(`Agent failed (exit code: ${exitCode}), will retry...`); } } From e1e29b40433b92cecc414f85b88f06416c397d48 Mon Sep 17 00:00:00 2001 From: Derek Misler Date: Thu, 25 Jun 2026 12:20:53 +0000 Subject: [PATCH 3/4] style: fix biome formatting (trailing alignment spaces) --- src/main/__tests__/exec.test.ts | 8 +++----- src/main/exec.ts | 4 +--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/main/__tests__/exec.test.ts b/src/main/__tests__/exec.test.ts index 28dd037..e8c2845 100644 --- a/src/main/__tests__/exec.test.ts +++ b/src/main/__tests__/exec.test.ts @@ -287,13 +287,11 @@ describe('runAgent', () => { // Attempt 2: timeout — should still get its own retryOnTimeout=1 slot // Attempt 3: success mockSpawn - .mockImplementationOnce(() => makeMockChild(1)) // failure + .mockImplementationOnce(() => makeMockChild(1)) // failure .mockImplementationOnce(() => makeMockChild(TIMEOUT_EXIT_CODE)) // timeout - .mockImplementation(() => makeMockChild(0)); // success + .mockImplementation(() => makeMockChild(0)); // success - const result = await runAgent( - baseOpts({ maxRetries: 1, retryDelay: 0, retryOnTimeout: 1 }), - ); + const result = await runAgent(baseOpts({ maxRetries: 1, retryDelay: 0, retryOnTimeout: 1 })); expect(result.exitCode).toBe(0); // failure retry + timeout retry + final success = 3 spawns diff --git a/src/main/exec.ts b/src/main/exec.ts index a7ba717..cef6187 100644 --- a/src/main/exec.ts +++ b/src/main/exec.ts @@ -290,9 +290,7 @@ export async function runAgent(opts: RunAgentOptions): Promise { totalAttempt++; if (totalAttempt > 1) { - core.info( - `🔄 Retry attempt ${totalAttempt - 1} (waiting ${currentDelay}s)...`, - ); + core.info(`🔄 Retry attempt ${totalAttempt - 1} (waiting ${currentDelay}s)...`); await sleep(currentDelay); currentDelay *= 2; From bcc0d514914eac432d8d9fada11f2a7051ee8436 Mon Sep 17 00:00:00 2001 From: Derek Misler Date: Thu, 25 Jun 2026 12:25:02 +0000 Subject: [PATCH 4/4] fix: clearer retry log labels, null-safe jq, action.yml description exec.ts: retry log now shows 'Timeout retry N of M' vs 'Retry N of M' so ops can distinguish which budget fired without reading code. review-pr/action.yml: guard jq selects with (.body // "") to avoid a type error when .body is null (draft reviews, deleted comments), which previously caused the || echo 0 fallback to fire and post a duplicate. action.yml: retry-on-timeout description notes that the budget is independent of max-retries so both can be consumed in the same run. --- action.yml | 2 +- review-pr/action.yml | 4 ++-- src/main/exec.ts | 6 +++++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/action.yml b/action.yml index 5784732..35f9f0c 100644 --- a/action.yml +++ b/action.yml @@ -72,7 +72,7 @@ inputs: required: false default: "5" retry-on-timeout: - description: "Number of additional retry attempts when the agent times out (exit code 124). Default 0 = no timeout retries." + description: "Number of additional retry attempts when the agent times out (exit code 124). Independent of max-retries — both budgets can be consumed in the same run. Default 0 = no timeout retries." required: false default: "0" extra-args: diff --git a/review-pr/action.yml b/review-pr/action.yml index 67970f4..2bf090e 100644 --- a/review-pr/action.yml +++ b/review-pr/action.yml @@ -881,10 +881,10 @@ runs: # (prior runs posted to /issues/comments; this prevents a duplicate on the first # run of this new code against PRs that already received an LGTM issue comment). EXISTING=$(gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" \ - --jq '[.[] | select(.body | startswith("🟢 **No issues found**"))] | length' \ + --jq '[.[] | select((.body // "") | startswith("🟢 **No issues found**"))] | length' \ 2>/dev/null || echo "0") EXISTING_OLD=$(gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \ - --jq '[.[] | select(.body | startswith("🟢 **No issues found**"))] | length' \ + --jq '[.[] | select((.body // "") | startswith("🟢 **No issues found**"))] | length' \ 2>/dev/null || echo "0") if [ "${EXISTING:-0}" -gt 0 ] || [ "${EXISTING_OLD:-0}" -gt 0 ]; then echo "ℹ️ Fallback LGTM review already exists — skipping duplicate post" diff --git a/src/main/exec.ts b/src/main/exec.ts index cef6187..61e1d19 100644 --- a/src/main/exec.ts +++ b/src/main/exec.ts @@ -290,7 +290,11 @@ export async function runAgent(opts: RunAgentOptions): Promise { totalAttempt++; if (totalAttempt > 1) { - core.info(`🔄 Retry attempt ${totalAttempt - 1} (waiting ${currentDelay}s)...`); + const retryLabel = + exitCode === TIMEOUT_EXIT_CODE + ? `Timeout retry ${timeoutRetryCount} of ${opts.retryOnTimeout}` + : `Retry ${failureRetryCount} of ${opts.maxRetries}`; + core.info(`🔄 ${retryLabel} (waiting ${currentDelay}s)...`); await sleep(currentDelay); currentDelay *= 2;