diff --git a/action.yml b/action.yml index ddebf68..35f9f0c 100644 --- a/action.yml +++ b/action.yml @@ -71,6 +71,10 @@ inputs: description: "Base delay in seconds between retries (doubles each attempt)" required: false default: "5" + retry-on-timeout: + description: "Number of additional retry attempts when the agent times out (exit code 124). Independent of max-retries — both budgets can be consumed in the same run. Default 0 = no timeout retries." + required: false + default: "0" extra-args: description: "Additional arguments to pass to docker agent run" required: false diff --git a/review-pr/action.yml b/review-pr/action.yml index 1867584..2bf090e 100644 --- a/review-pr/action.yml +++ b/review-pr/action.yml @@ -777,6 +777,7 @@ runs: extra-args: ${{ inputs.model && format('--model={0}', inputs.model) || '' }} add-prompt-files: ${{ inputs.add-prompt-files }} max-retries: "1" # One retry handles transient API failures (e.g. Anthropic 400s) without risking duplicate reviews from full pipeline restarts + retry-on-timeout: "1" # Retry once on timeout — agent may succeed on a second pass after a transient infra hiccup skip-summary: "true" org-membership-token: ${{ inputs.org-membership-token }} auth-org: ${{ inputs.auth-org }} @@ -848,8 +849,9 @@ runs: else TIMEOUT_BODY="⏱️ **PR Review Timed Out** — The review agent hit the 2700 s time limit. This usually happens on large or complex diffs. Re-request a review from \`docker-agent\` to retry — if it times out again, consider splitting the PR into smaller pieces." fi - if ! gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \ - -f body="$TIMEOUT_BODY" \ + if ! jq -n --arg body "$TIMEOUT_BODY" --arg event "COMMENT" \ + '{body: $body, event: $event, comments: []}' \ + | gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" --input - \ 2>&1; then echo "::warning::Failed to post timeout comment to PR" fi @@ -862,8 +864,9 @@ runs: STATUS="⚠️ **Review completed with warnings** (exit code: $EXIT_CODE)" else STATUS="❌ **Review failed** (exit code: $EXIT_CODE)" - if ! gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \ - -f body="❌ **PR Review Failed** — The review agent encountered an error and could not complete the review. [View logs]($RUN_URL)." \ + if ! jq -n --arg body "❌ **PR Review Failed** — The review agent encountered an error and could not complete the review. [View logs]($RUN_URL)." --arg event "COMMENT" \ + '{body: $body, event: $event, comments: []}' \ + | gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" --input - \ 2>&1; then echo "::warning::Failed to post fallback comment to PR" fi @@ -873,17 +876,24 @@ runs: # Defense-in-depth: if the log exists but no review was posted, post a fallback LGTM comment. # This guards against the agent exiting 0 without calling gh api (e.g., zero-findings early exit). if [ -n "$VERBOSE_LOG_FILE" ] && [ -f "$VERBOSE_LOG_FILE" ] && ! grep -qE 'pullrequestreview-[0-9]+' "$VERBOSE_LOG_FILE" 2>/dev/null; then - # Dedup guard: skip posting if an identical fallback comment already exists. - EXISTING=$(gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \ - --jq '[.[] | select(.body | startswith("🟢 **No issues found**"))] | length' \ + # Dedup guard: skip posting if an identical fallback PR review already exists. + # Also check the old issue-comment endpoint for a one-time migration guard + # (prior runs posted to /issues/comments; this prevents a duplicate on the first + # run of this new code against PRs that already received an LGTM issue comment). + EXISTING=$(gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" \ + --jq '[.[] | select((.body // "") | startswith("🟢 **No issues found**"))] | length' \ 2>/dev/null || echo "0") - if [ "${EXISTING:-0}" -gt 0 ]; then - echo "ℹ️ Fallback LGTM comment already exists — skipping duplicate post" + EXISTING_OLD=$(gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \ + --jq '[.[] | select((.body // "") | startswith("🟢 **No issues found**"))] | length' \ + 2>/dev/null || echo "0") + if [ "${EXISTING:-0}" -gt 0 ] || [ "${EXISTING_OLD:-0}" -gt 0 ]; then + echo "ℹ️ Fallback LGTM review already exists — skipping duplicate post" else - echo "::warning::Agent exited 0 but no review was posted — posting fallback LGTM comment" - gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \ - -f body="🟢 **No issues found** — LGTM! [View logs]($RUN_URL)." 2>&1 || \ - echo "::warning::Failed to post fallback LGTM comment to PR" + echo "::warning::Agent exited 0 but no review was posted — posting fallback LGTM review" + jq -n --arg body "🟢 **No issues found** — LGTM! [View logs]($RUN_URL)." --arg event "COMMENT" \ + '{body: $body, event: $event, comments: []}' \ + | gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" --input - 2>&1 || \ + echo "::warning::Failed to post fallback LGTM review to PR" fi fi fi diff --git a/src/main/__tests__/exec.test.ts b/src/main/__tests__/exec.test.ts index 6bbe453..e8c2845 100644 --- a/src/main/__tests__/exec.test.ts +++ b/src/main/__tests__/exec.test.ts @@ -69,6 +69,7 @@ function baseOpts(overrides: Partial[0]> = {}) { timeout: 0, maxRetries: 0, retryDelay: 0, + retryOnTimeout: 0, debug: false, anthropicApiKey: 'sk-ant-test', telemetryTags: 'source=test', @@ -248,17 +249,55 @@ describe('runAgent', () => { expect(writtenData.toString()).toContain('Raw prompt'); }); - it('returns TIMEOUT_EXIT_CODE (124) without retrying on timeout', async () => { + it('returns TIMEOUT_EXIT_CODE (124) without retrying on timeout when retryOnTimeout=0', async () => { // Return 124 (our timeout sentinel — simulate the timer firing) mockSpawn.mockReturnValue(makeMockChild(TIMEOUT_EXIT_CODE)); - const result = await runAgent(baseOpts({ timeout: 5, maxRetries: 3 })); + const result = await runAgent(baseOpts({ timeout: 5, maxRetries: 3, retryOnTimeout: 0 })); expect(result.exitCode).toBe(TIMEOUT_EXIT_CODE); // Only spawned once — no retries after timeout expect(mockSpawn).toHaveBeenCalledOnce(); }); + it('retries once on timeout when retryOnTimeout=1, succeeds on retry', async () => { + mockSpawn + .mockImplementationOnce(() => makeMockChild(TIMEOUT_EXIT_CODE)) + .mockImplementation(() => makeMockChild(0)); + + const result = await runAgent(baseOpts({ maxRetries: 0, retryDelay: 0, retryOnTimeout: 1 })); + + expect(result.exitCode).toBe(0); + // First attempt timed out, second attempt succeeded + expect(mockSpawn).toHaveBeenCalledTimes(2); + }); + + it('stops retrying on timeout after retryOnTimeout attempts', async () => { + mockSpawn.mockImplementation(() => makeMockChild(TIMEOUT_EXIT_CODE)); + + const result = await runAgent(baseOpts({ maxRetries: 0, retryDelay: 0, retryOnTimeout: 1 })); + + expect(result.exitCode).toBe(TIMEOUT_EXIT_CODE); + // 1 initial + 1 timeout retry = 2 total + expect(mockSpawn).toHaveBeenCalledTimes(2); + }); + + it('timeout retry budget is independent of failure retry budget (mixed sequence)', async () => { + // Attempt 1: normal failure — consumes one failureRetryCount slot + // Attempt 2: timeout — should still get its own retryOnTimeout=1 slot + // Attempt 3: success + mockSpawn + .mockImplementationOnce(() => makeMockChild(1)) // failure + .mockImplementationOnce(() => makeMockChild(TIMEOUT_EXIT_CODE)) // timeout + .mockImplementation(() => makeMockChild(0)); // success + + const result = await runAgent(baseOpts({ maxRetries: 1, retryDelay: 0, retryOnTimeout: 1 })); + + expect(result.exitCode).toBe(0); + // failure retry + timeout retry + final success = 3 spawns + expect(mockSpawn).toHaveBeenCalledTimes(3); + }); + it('kills process with SIGTERM when timeout fires (FIX D)', async () => { // Child exits naturally in 5 s; action timeout is 50 ms. // The real timer path fires SIGTERM, then the child is killed. diff --git a/src/main/exec.ts b/src/main/exec.ts index 807fa29..61e1d19 100644 --- a/src/main/exec.ts +++ b/src/main/exec.ts @@ -47,6 +47,8 @@ export interface RunAgentOptions { maxRetries: number; /** Base delay between retries in seconds (doubles each attempt). */ retryDelay: number; + /** Number of additional retry attempts allowed when the agent times out (exit 124). */ + retryOnTimeout: number; /** Whether debug mode is enabled. */ debug: boolean; @@ -276,23 +278,30 @@ export async function runAgent(opts: RunAgentOptions): Promise { const startTime = Date.now(); let exitCode = 1; - let attempt = 0; + let totalAttempt = 0; + // Track the two retry budgets independently so mixed-failure sequences + // (e.g. a non-timeout failure followed by a timeout) consume only from + // the correct budget and never silently exhaust the other. + let failureRetryCount = 0; + let timeoutRetryCount = 0; let currentDelay = opts.retryDelay; while (true) { - attempt++; - - if (attempt > 1) { - core.info( - `🔄 Retry attempt ${attempt - 1} of ${opts.maxRetries} (waiting ${currentDelay}s)...`, - ); + totalAttempt++; + + if (totalAttempt > 1) { + const retryLabel = + exitCode === TIMEOUT_EXIT_CODE + ? `Timeout retry ${timeoutRetryCount} of ${opts.retryOnTimeout}` + : `Retry ${failureRetryCount} of ${opts.maxRetries}`; + core.info(`🔄 ${retryLabel} (waiting ${currentDelay}s)...`); await sleep(currentDelay); currentDelay *= 2; // Reset verbose log separator for retry const separator = [ '', - `========== RETRY ATTEMPT ${attempt} (${new Date().toISOString()}) ==========`, + `========== RETRY ATTEMPT ${totalAttempt} (${new Date().toISOString()}) ==========`, '', ].join(os.EOL); fs.appendFileSync(opts.verboseLogFile, separator, 'utf-8'); @@ -320,15 +329,22 @@ export async function runAgent(opts: RunAgentOptions): Promise { if (exitCode === TIMEOUT_EXIT_CODE) { core.error(`Agent execution timed out after ${opts.timeout} seconds`); - break; // Don't retry timeouts - } - - if (attempt > opts.maxRetries) { - core.warning(`Agent failed after ${opts.maxRetries} retries (exit code: ${exitCode})`); - break; + if (timeoutRetryCount >= opts.retryOnTimeout) { + break; // Timeout retry budget exhausted + } + timeoutRetryCount++; + core.warning( + `Timeout — will retry (${timeoutRetryCount}/${opts.retryOnTimeout} timeout retries used)`, + ); + // fall through to retry + } else { + if (failureRetryCount >= opts.maxRetries) { + core.warning(`Agent failed after ${opts.maxRetries} retries (exit code: ${exitCode})`); + break; + } + failureRetryCount++; + core.warning(`Agent failed (exit code: ${exitCode}), will retry...`); } - - core.warning(`Agent failed (exit code: ${exitCode}), will retry...`); } const executionTime = Math.round((Date.now() - startTime) / 1000); diff --git a/src/main/index.ts b/src/main/index.ts index 71382ef..c942fc7 100644 --- a/src/main/index.ts +++ b/src/main/index.ts @@ -217,6 +217,7 @@ async function run(): Promise { const timeout = parseInt(core.getInput('timeout') || '0', 10); const maxRetries = parseInt(core.getInput('max-retries') || '2', 10); const retryDelay = parseInt(core.getInput('retry-delay') || '5', 10); + const retryOnTimeout = parseInt(core.getInput('retry-on-timeout') || '0', 10); const yolo = core.getBooleanInput('yolo'); const workingDirectory = core.getInput('working-directory') || '.'; const extraArgs = core.getInput('extra-args'); @@ -342,6 +343,7 @@ async function run(): Promise { timeout, maxRetries, retryDelay, + retryOnTimeout, debug, anthropicApiKey, openaiApiKey,