Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ inputs:
description: "Base delay in seconds between retries (doubles each attempt)"
required: false
default: "5"
retry-on-timeout:
description: "Number of additional retry attempts when the agent times out (exit code 124). Independent of max-retries — both budgets can be consumed in the same run. Default 0 = no timeout retries."
required: false
default: "0"
extra-args:
description: "Additional arguments to pass to docker agent run"
required: false
Expand Down
36 changes: 23 additions & 13 deletions review-pr/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -777,6 +777,7 @@ runs:
extra-args: ${{ inputs.model && format('--model={0}', inputs.model) || '' }}
add-prompt-files: ${{ inputs.add-prompt-files }}
max-retries: "1" # One retry handles transient API failures (e.g. Anthropic 400s) without risking duplicate reviews from full pipeline restarts
retry-on-timeout: "1" # Retry once on timeout — agent may succeed on a second pass after a transient infra hiccup
skip-summary: "true"
org-membership-token: ${{ inputs.org-membership-token }}
auth-org: ${{ inputs.auth-org }}
Expand Down Expand Up @@ -848,8 +849,9 @@ runs:
else
TIMEOUT_BODY="⏱️ **PR Review Timed Out** — The review agent hit the 2700 s time limit. This usually happens on large or complex diffs. Re-request a review from \`docker-agent\` to retry — if it times out again, consider splitting the PR into smaller pieces."
fi
if ! gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \
-f body="$TIMEOUT_BODY" \
if ! jq -n --arg body "$TIMEOUT_BODY" --arg event "COMMENT" \
'{body: $body, event: $event, comments: []}' \
| gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" --input - \
2>&1; then
echo "::warning::Failed to post timeout comment to PR"
fi
Expand All @@ -862,8 +864,9 @@ runs:
STATUS="⚠️ **Review completed with warnings** (exit code: $EXIT_CODE)"
else
STATUS="❌ **Review failed** (exit code: $EXIT_CODE)"
if ! gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \
-f body="❌ **PR Review Failed** — The review agent encountered an error and could not complete the review. [View logs]($RUN_URL)." \
if ! jq -n --arg body "❌ **PR Review Failed** — The review agent encountered an error and could not complete the review. [View logs]($RUN_URL)." --arg event "COMMENT" \
'{body: $body, event: $event, comments: []}' \
| gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" --input - \
2>&1; then
echo "::warning::Failed to post fallback comment to PR"
fi
Expand All @@ -873,17 +876,24 @@ runs:
# Defense-in-depth: if the log exists but no review was posted, post a fallback LGTM comment.
# This guards against the agent exiting 0 without calling gh api (e.g., zero-findings early exit).
if [ -n "$VERBOSE_LOG_FILE" ] && [ -f "$VERBOSE_LOG_FILE" ] && ! grep -qE 'pullrequestreview-[0-9]+' "$VERBOSE_LOG_FILE" 2>/dev/null; then
# Dedup guard: skip posting if an identical fallback comment already exists.
EXISTING=$(gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \
--jq '[.[] | select(.body | startswith("🟢 **No issues found**"))] | length' \
# Dedup guard: skip posting if an identical fallback PR review already exists.
# Also check the old issue-comment endpoint for a one-time migration guard
# (prior runs posted to /issues/comments; this prevents a duplicate on the first
# run of this new code against PRs that already received an LGTM issue comment).
EXISTING=$(gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" \

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[MEDIUM] Dedup guard for fallback LGTM is blind to old issue-comment style posts

The dedup guard now queries GET /pulls/{pr}/reviews for existing LGTM reviews, but the previous code posted fallbacks to POST /issues/{pr}/comments. These are separate GitHub API endpoints returning entirely different data.

On the first run of this new code against a PR that previously received an LGTM issue comment (from the old path), the guard will find zero matching reviews and post a duplicate LGTM PR review — leaving the PR with both an issue comment and a PR review saying the same thing.

Consider adding a one-time cross-check against the issue comments endpoint as well:

EXISTING_OLD=$(gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \
  --jq '[.[] | select(.body | startswith("🟢 **No issues found**"))] | length' \
  2>/dev/null || echo "0")

Or simply accept the one-time duplicate as a known migration artefact (subsequent runs will dedup correctly via the reviews endpoint).

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed with the cross-check approach — the guard now queries both /pulls/{pr}/reviews (new path) and /issues/{pr}/comments (old path), and skips posting if either returns a match. This avoids the one-time duplicate on the migration run without any manual cleanup.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perfect — the cross-check implementation looks solid. The guard now queries both /pulls/{pr}/reviews and /issues/{pr}/comments, and skips posting if either returns a match. This cleanly handles the migration without manual intervention.

The || echo \"0\" fallback on both queries is a nice touch — ensures the skip logic degrades gracefully if either API call fails (e.g., rate limiting).

--jq '[.[] | select((.body // "") | startswith("🟢 **No issues found**"))] | length' \
2>/dev/null || echo "0")
if [ "${EXISTING:-0}" -gt 0 ]; then
echo "ℹ️ Fallback LGTM comment already exists — skipping duplicate post"
EXISTING_OLD=$(gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \
--jq '[.[] | select((.body // "") | startswith("🟢 **No issues found**"))] | length' \
2>/dev/null || echo "0")
if [ "${EXISTING:-0}" -gt 0 ] || [ "${EXISTING_OLD:-0}" -gt 0 ]; then
echo "ℹ️ Fallback LGTM review already exists — skipping duplicate post"
else
echo "::warning::Agent exited 0 but no review was posted — posting fallback LGTM comment"
gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \
-f body="🟢 **No issues found** — LGTM! [View logs]($RUN_URL)." 2>&1 || \
echo "::warning::Failed to post fallback LGTM comment to PR"
echo "::warning::Agent exited 0 but no review was posted — posting fallback LGTM review"
jq -n --arg body "🟢 **No issues found** — LGTM! [View logs]($RUN_URL)." --arg event "COMMENT" \
'{body: $body, event: $event, comments: []}' \
| gh api "repos/$REPOSITORY/pulls/$PR_NUMBER/reviews" --input - 2>&1 || \
echo "::warning::Failed to post fallback LGTM review to PR"
fi
fi
fi
Expand Down
43 changes: 41 additions & 2 deletions src/main/__tests__/exec.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ function baseOpts(overrides: Partial<Parameters<typeof runAgent>[0]> = {}) {
timeout: 0,
maxRetries: 0,
retryDelay: 0,
retryOnTimeout: 0,
debug: false,
anthropicApiKey: 'sk-ant-test',
telemetryTags: 'source=test',
Expand Down Expand Up @@ -248,17 +249,55 @@ describe('runAgent', () => {
expect(writtenData.toString()).toContain('Raw prompt');
});

it('returns TIMEOUT_EXIT_CODE (124) without retrying on timeout', async () => {
it('returns TIMEOUT_EXIT_CODE (124) without retrying on timeout when retryOnTimeout=0', async () => {
// Return 124 (our timeout sentinel — simulate the timer firing)
mockSpawn.mockReturnValue(makeMockChild(TIMEOUT_EXIT_CODE));

const result = await runAgent(baseOpts({ timeout: 5, maxRetries: 3 }));
const result = await runAgent(baseOpts({ timeout: 5, maxRetries: 3, retryOnTimeout: 0 }));

expect(result.exitCode).toBe(TIMEOUT_EXIT_CODE);
// Only spawned once — no retries after timeout
expect(mockSpawn).toHaveBeenCalledOnce();
});

it('retries once on timeout when retryOnTimeout=1, succeeds on retry', async () => {
mockSpawn
.mockImplementationOnce(() => makeMockChild(TIMEOUT_EXIT_CODE))
.mockImplementation(() => makeMockChild(0));

const result = await runAgent(baseOpts({ maxRetries: 0, retryDelay: 0, retryOnTimeout: 1 }));

expect(result.exitCode).toBe(0);
// First attempt timed out, second attempt succeeded
expect(mockSpawn).toHaveBeenCalledTimes(2);
});

it('stops retrying on timeout after retryOnTimeout attempts', async () => {
mockSpawn.mockImplementation(() => makeMockChild(TIMEOUT_EXIT_CODE));

const result = await runAgent(baseOpts({ maxRetries: 0, retryDelay: 0, retryOnTimeout: 1 }));

expect(result.exitCode).toBe(TIMEOUT_EXIT_CODE);
// 1 initial + 1 timeout retry = 2 total
expect(mockSpawn).toHaveBeenCalledTimes(2);
});

it('timeout retry budget is independent of failure retry budget (mixed sequence)', async () => {
// Attempt 1: normal failure — consumes one failureRetryCount slot
// Attempt 2: timeout — should still get its own retryOnTimeout=1 slot
// Attempt 3: success
mockSpawn
.mockImplementationOnce(() => makeMockChild(1)) // failure
.mockImplementationOnce(() => makeMockChild(TIMEOUT_EXIT_CODE)) // timeout
.mockImplementation(() => makeMockChild(0)); // success

const result = await runAgent(baseOpts({ maxRetries: 1, retryDelay: 0, retryOnTimeout: 1 }));

expect(result.exitCode).toBe(0);
// failure retry + timeout retry + final success = 3 spawns
expect(mockSpawn).toHaveBeenCalledTimes(3);
});

it('kills process with SIGTERM when timeout fires (FIX D)', async () => {
// Child exits naturally in 5 s; action timeout is 50 ms.
// The real timer path fires SIGTERM, then the child is killed.
Expand Down
48 changes: 32 additions & 16 deletions src/main/exec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ export interface RunAgentOptions {
maxRetries: number;
/** Base delay between retries in seconds (doubles each attempt). */
retryDelay: number;
/** Number of additional retry attempts allowed when the agent times out (exit 124). */
retryOnTimeout: number;
/** Whether debug mode is enabled. */
debug: boolean;

Expand Down Expand Up @@ -276,23 +278,30 @@ export async function runAgent(opts: RunAgentOptions): Promise<RunAgentResult> {

const startTime = Date.now();
let exitCode = 1;
let attempt = 0;
let totalAttempt = 0;
// Track the two retry budgets independently so mixed-failure sequences
// (e.g. a non-timeout failure followed by a timeout) consume only from
// the correct budget and never silently exhaust the other.
let failureRetryCount = 0;
let timeoutRetryCount = 0;
let currentDelay = opts.retryDelay;

while (true) {
attempt++;

if (attempt > 1) {
core.info(
`🔄 Retry attempt ${attempt - 1} of ${opts.maxRetries} (waiting ${currentDelay}s)...`,
);
totalAttempt++;

if (totalAttempt > 1) {
const retryLabel =
exitCode === TIMEOUT_EXIT_CODE
? `Timeout retry ${timeoutRetryCount} of ${opts.retryOnTimeout}`
: `Retry ${failureRetryCount} of ${opts.maxRetries}`;
core.info(`🔄 ${retryLabel} (waiting ${currentDelay}s)...`);
await sleep(currentDelay);
currentDelay *= 2;

// Reset verbose log separator for retry
const separator = [
'',
`========== RETRY ATTEMPT ${attempt} (${new Date().toISOString()}) ==========`,
`========== RETRY ATTEMPT ${totalAttempt} (${new Date().toISOString()}) ==========`,
'',
].join(os.EOL);
fs.appendFileSync(opts.verboseLogFile, separator, 'utf-8');
Expand Down Expand Up @@ -320,15 +329,22 @@ export async function runAgent(opts: RunAgentOptions): Promise<RunAgentResult> {

if (exitCode === TIMEOUT_EXIT_CODE) {
core.error(`Agent execution timed out after ${opts.timeout} seconds`);
break; // Don't retry timeouts
}

if (attempt > opts.maxRetries) {
core.warning(`Agent failed after ${opts.maxRetries} retries (exit code: ${exitCode})`);
break;
if (timeoutRetryCount >= opts.retryOnTimeout) {
break; // Timeout retry budget exhausted
}
timeoutRetryCount++;
core.warning(
`Timeout — will retry (${timeoutRetryCount}/${opts.retryOnTimeout} timeout retries used)`,
);
// fall through to retry
} else {
if (failureRetryCount >= opts.maxRetries) {
core.warning(`Agent failed after ${opts.maxRetries} retries (exit code: ${exitCode})`);
break;
}
failureRetryCount++;
core.warning(`Agent failed (exit code: ${exitCode}), will retry...`);
}

core.warning(`Agent failed (exit code: ${exitCode}), will retry...`);
}

const executionTime = Math.round((Date.now() - startTime) / 1000);
Expand Down
2 changes: 2 additions & 0 deletions src/main/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ async function run(): Promise<void> {
const timeout = parseInt(core.getInput('timeout') || '0', 10);
const maxRetries = parseInt(core.getInput('max-retries') || '2', 10);
const retryDelay = parseInt(core.getInput('retry-delay') || '5', 10);
const retryOnTimeout = parseInt(core.getInput('retry-on-timeout') || '0', 10);
const yolo = core.getBooleanInput('yolo');
const workingDirectory = core.getInput('working-directory') || '.';
const extraArgs = core.getInput('extra-args');
Expand Down Expand Up @@ -342,6 +343,7 @@ async function run(): Promise<void> {
timeout,
maxRetries,
retryDelay,
retryOnTimeout,
debug,
anthropicApiKey,
openaiApiKey,
Expand Down
Loading