From 9a38c1d40589aa76106bd293bebee1c2d7bbece2 Mon Sep 17 00:00:00 2001 From: Jacob Sussmilch Date: Thu, 5 Mar 2026 16:01:13 +1100 Subject: [PATCH] fix: KEEP-1545 disable SDK retries on all steps and match both error formats Set maxRetries = 0 on 15 steps missing it to prevent the SDK durability layer from retrying and producing spurious failure errors. Update the reconciler to detect both SDK error formats ("exceeded max retries" and "failed after N retries") as defense in depth. --- keeperhub/lib/max-retries-reconciler.ts | 30 +++++-- keeperhub/lib/steps/enrich-explorer-links.ts | 2 + keeperhub/plugins/math/steps/aggregate.ts | 2 + .../plugins/protocol/steps/protocol-read.ts | 2 + .../plugins/protocol/steps/protocol-write.ts | 2 + .../plugins/web3/steps/batch-read-contract.ts | 2 + .../plugins/web3/steps/check-allowance.ts | 2 + keeperhub/plugins/web3/steps/check-balance.ts | 2 + .../plugins/web3/steps/check-token-balance.ts | 2 + .../plugins/web3/steps/get-transaction.ts | 2 + keeperhub/plugins/web3/steps/query-events.ts | 2 + .../plugins/web3/steps/query-transactions.ts | 2 + keeperhub/plugins/web3/steps/read-contract.ts | 2 + .../plugins/web3/steps/transfer-funds.ts | 2 + .../plugins/web3/steps/transfer-token.ts | 2 + .../plugins/web3/steps/write-contract.ts | 2 + tests/unit/max-retries-reconciler.test.ts | 81 +++++++++++++++++-- 17 files changed, 127 insertions(+), 14 deletions(-) diff --git a/keeperhub/lib/max-retries-reconciler.ts b/keeperhub/lib/max-retries-reconciler.ts index 18542f9ec..48e453082 100644 --- a/keeperhub/lib/max-retries-reconciler.ts +++ b/keeperhub/lib/max-retries-reconciler.ts @@ -1,6 +1,6 @@ /** * KEEP-1541: The Workflow DevKit's "use step" durability layer can throw - * "exceeded max retries" even when the step itself succeeded. This happens + * retry-related errors even when the step itself succeeded. This happens * because the SDK's internal state tracking encounters a conflict (e.g., * step already completed, state replay mismatch) AFTER withStepLogging has * already recorded a success log in workflow_execution_logs. The error is @@ -8,8 +8,12 @@ * then causes finalSuccess to be false -- marking the entire workflow as * "error" despite all steps completing successfully. * - * Fix: after all nodes finish, cross-reference failed results that have - * "max retries exceeded" errors against the in-memory success tracker. + * The SDK produces two error formats: + * 1. Pre-check path: 'Step "X" exceeded max retries (N retries)' + * 2. Catch path: 'Step "X" failed after N retries: {error}' + * + * Fix: after all nodes finish, cross-reference failed results that match + * either SDK retry error pattern against the in-memory success tracker. * If a failed node has a recorded success in the tracker, the SDK error * was spurious and we override the result to success. */ @@ -31,17 +35,27 @@ type ReconcileOutput = { overriddenNodeIds: string[]; }; -// SDK error message substring used to identify spurious max-retries failures. -// If the SDK changes this wording, update this constant. +// SDK error message substrings used to identify spurious retry failures. +// The SDK produces two distinct formats depending on the code path. +// If the SDK changes this wording, update these constants. export const MAX_RETRIES_ERROR_MARKER = "exceeded max retries"; +export const FAILED_AFTER_RETRIES_MARKER = "failed after"; + +export function isSdkRetryError(error: string | undefined): boolean { + if (!error) { + return false; + } + return ( + error.includes(MAX_RETRIES_ERROR_MARKER) || + error.includes(FAILED_AFTER_RETRIES_MARKER) + ); +} export function getFailedMaxRetriesNodeIds( results: Record ): string[] { return Object.entries(results) - .filter( - ([, r]) => !r.success && r.error?.includes(MAX_RETRIES_ERROR_MARKER) - ) + .filter(([, r]) => !r.success && isSdkRetryError(r.error)) .map(([nodeId]) => nodeId); } diff --git a/keeperhub/lib/steps/enrich-explorer-links.ts b/keeperhub/lib/steps/enrich-explorer-links.ts index b82262645..d214fb0ab 100644 --- a/keeperhub/lib/steps/enrich-explorer-links.ts +++ b/keeperhub/lib/steps/enrich-explorer-links.ts @@ -38,3 +38,5 @@ export async function enrichExplorerLinks( ); } } + +enrichExplorerLinks.maxRetries = 0; diff --git a/keeperhub/plugins/math/steps/aggregate.ts b/keeperhub/plugins/math/steps/aggregate.ts index 42abada7a..c2e4d119a 100644 --- a/keeperhub/plugins/math/steps/aggregate.ts +++ b/keeperhub/plugins/math/steps/aggregate.ts @@ -626,4 +626,6 @@ export async function aggregateStep( ); } +aggregateStep.maxRetries = 0; + export const _integrationType = PLUGIN_NAME; diff --git a/keeperhub/plugins/protocol/steps/protocol-read.ts b/keeperhub/plugins/protocol/steps/protocol-read.ts index 21f2f8992..89bdb7e91 100644 --- a/keeperhub/plugins/protocol/steps/protocol-read.ts +++ b/keeperhub/plugins/protocol/steps/protocol-read.ts @@ -123,4 +123,6 @@ export async function protocolReadStep( return await withStepLogging(input, () => readContractCore(coreInput)); } +protocolReadStep.maxRetries = 0; + export const _integrationType = "protocol"; diff --git a/keeperhub/plugins/protocol/steps/protocol-write.ts b/keeperhub/plugins/protocol/steps/protocol-write.ts index 5fdbe74e9..552f7ace6 100644 --- a/keeperhub/plugins/protocol/steps/protocol-write.ts +++ b/keeperhub/plugins/protocol/steps/protocol-write.ts @@ -132,4 +132,6 @@ export async function protocolWriteStep( return await withStepLogging(input, () => writeContractCore(coreInput)); } +protocolWriteStep.maxRetries = 0; + export const _integrationType = "protocol"; diff --git a/keeperhub/plugins/web3/steps/batch-read-contract.ts b/keeperhub/plugins/web3/steps/batch-read-contract.ts index d7d2681c9..c6177cab7 100644 --- a/keeperhub/plugins/web3/steps/batch-read-contract.ts +++ b/keeperhub/plugins/web3/steps/batch-read-contract.ts @@ -818,4 +818,6 @@ export async function batchReadContractStep( ); } +batchReadContractStep.maxRetries = 0; + export const _integrationType = "web3"; diff --git a/keeperhub/plugins/web3/steps/check-allowance.ts b/keeperhub/plugins/web3/steps/check-allowance.ts index 8dbde1f5a..d5aeb2d6e 100644 --- a/keeperhub/plugins/web3/steps/check-allowance.ts +++ b/keeperhub/plugins/web3/steps/check-allowance.ts @@ -169,4 +169,6 @@ export async function checkAllowanceStep( return withStepLogging(input, () => stepHandler(input)); } +checkAllowanceStep.maxRetries = 0; + export const _integrationType = "web3"; diff --git a/keeperhub/plugins/web3/steps/check-balance.ts b/keeperhub/plugins/web3/steps/check-balance.ts index 8d2c8195f..ba567d9f7 100644 --- a/keeperhub/plugins/web3/steps/check-balance.ts +++ b/keeperhub/plugins/web3/steps/check-balance.ts @@ -217,4 +217,6 @@ export async function checkBalanceStep( ); } +checkBalanceStep.maxRetries = 0; + export const _integrationType = "web3"; diff --git a/keeperhub/plugins/web3/steps/check-token-balance.ts b/keeperhub/plugins/web3/steps/check-token-balance.ts index 4ce764d49..0eaf85d16 100644 --- a/keeperhub/plugins/web3/steps/check-token-balance.ts +++ b/keeperhub/plugins/web3/steps/check-token-balance.ts @@ -487,4 +487,6 @@ export async function checkTokenBalanceStep( return withStepLogging(input, () => stepHandler(input)); } +checkTokenBalanceStep.maxRetries = 0; + export const _integrationType = "web3"; diff --git a/keeperhub/plugins/web3/steps/get-transaction.ts b/keeperhub/plugins/web3/steps/get-transaction.ts index b2c29dcc5..5da6e0523 100644 --- a/keeperhub/plugins/web3/steps/get-transaction.ts +++ b/keeperhub/plugins/web3/steps/get-transaction.ts @@ -184,4 +184,6 @@ export async function getTransactionStep( ); } +getTransactionStep.maxRetries = 0; + export const _integrationType = "web3"; diff --git a/keeperhub/plugins/web3/steps/query-events.ts b/keeperhub/plugins/web3/steps/query-events.ts index 353bb6d7d..f727be6ca 100644 --- a/keeperhub/plugins/web3/steps/query-events.ts +++ b/keeperhub/plugins/web3/steps/query-events.ts @@ -399,4 +399,6 @@ export async function queryEventsStep( ); } +queryEventsStep.maxRetries = 0; + export const _integrationType = "web3"; diff --git a/keeperhub/plugins/web3/steps/query-transactions.ts b/keeperhub/plugins/web3/steps/query-transactions.ts index 706b37b6a..e4f3426ef 100644 --- a/keeperhub/plugins/web3/steps/query-transactions.ts +++ b/keeperhub/plugins/web3/steps/query-transactions.ts @@ -25,4 +25,6 @@ export async function queryTransactionsStep( ); } +queryTransactionsStep.maxRetries = 0; + export const _integrationType = "web3"; diff --git a/keeperhub/plugins/web3/steps/read-contract.ts b/keeperhub/plugins/web3/steps/read-contract.ts index 3af92308f..3ccf0d3c2 100644 --- a/keeperhub/plugins/web3/steps/read-contract.ts +++ b/keeperhub/plugins/web3/steps/read-contract.ts @@ -55,4 +55,6 @@ export async function readContractStep( ); } +readContractStep.maxRetries = 0; + export const _integrationType = "web3"; diff --git a/keeperhub/plugins/web3/steps/transfer-funds.ts b/keeperhub/plugins/web3/steps/transfer-funds.ts index 3b677e739..be94bef0b 100644 --- a/keeperhub/plugins/web3/steps/transfer-funds.ts +++ b/keeperhub/plugins/web3/steps/transfer-funds.ts @@ -60,4 +60,6 @@ export async function transferFundsStep( ); } +transferFundsStep.maxRetries = 0; + export const _integrationType = "web3"; diff --git a/keeperhub/plugins/web3/steps/transfer-token.ts b/keeperhub/plugins/web3/steps/transfer-token.ts index 49c897421..fc5f37493 100644 --- a/keeperhub/plugins/web3/steps/transfer-token.ts +++ b/keeperhub/plugins/web3/steps/transfer-token.ts @@ -51,4 +51,6 @@ export async function transferTokenStep( return withStepLogging(enrichedInput, () => transferTokenCore(input)); } +transferTokenStep.maxRetries = 0; + export const _integrationType = "web3"; diff --git a/keeperhub/plugins/web3/steps/write-contract.ts b/keeperhub/plugins/web3/steps/write-contract.ts index e05432542..4b3548fef 100644 --- a/keeperhub/plugins/web3/steps/write-contract.ts +++ b/keeperhub/plugins/web3/steps/write-contract.ts @@ -55,4 +55,6 @@ export async function writeContractStep( ); } +writeContractStep.maxRetries = 0; + export const _integrationType = "web3"; diff --git a/tests/unit/max-retries-reconciler.test.ts b/tests/unit/max-retries-reconciler.test.ts index dc90dcce4..a39f6077c 100644 --- a/tests/unit/max-retries-reconciler.test.ts +++ b/tests/unit/max-retries-reconciler.test.ts @@ -2,11 +2,36 @@ import { describe, expect, it } from "vitest"; import { getFailedMaxRetriesNodeIds, + isSdkRetryError, reconcileMaxRetriesFailures, } from "@/keeperhub/lib/max-retries-reconciler"; +describe("isSdkRetryError", () => { + it("should match 'exceeded max retries' error format", () => { + expect( + isSdkRetryError( + 'Step "step//abc//sendWebhook" exceeded max retries (0 retries)' + ) + ).toBe(true); + }); + + it("should match 'failed after N retries' error format", () => { + expect( + isSdkRetryError( + 'Step "step//abc//checkBalance" failed after 3 retries: Error: step_completed event failed' + ) + ).toBe(true); + }); + + it("should not match unrelated errors", () => { + expect(isSdkRetryError("HTTP 500 Internal Server Error")).toBe(false); + expect(isSdkRetryError("Connection refused")).toBe(false); + expect(isSdkRetryError(undefined)).toBe(false); + }); +}); + describe("getFailedMaxRetriesNodeIds", () => { - it("should return node IDs with max-retries errors", () => { + it("should return node IDs with exceeded-max-retries errors", () => { const results = { "node-1": { success: true }, "node-2": { @@ -19,7 +44,20 @@ describe("getFailedMaxRetriesNodeIds", () => { expect(getFailedMaxRetriesNodeIds(results)).toEqual(["node-2"]); }); - it("should return empty array when no max-retries errors", () => { + it("should return node IDs with failed-after-retries errors", () => { + const results = { + "node-1": { success: true }, + "node-2": { + success: false, + error: + 'Step "step//abc//checkBalance" failed after 3 retries: step_completed event failed', + }, + }; + + expect(getFailedMaxRetriesNodeIds(results)).toEqual(["node-2"]); + }); + + it("should return empty array when no retry errors", () => { const results = { "node-1": { success: true }, "node-2": { success: false, error: "Connection refused" }, @@ -30,7 +68,7 @@ describe("getFailedMaxRetriesNodeIds", () => { }); describe("reconcileMaxRetriesFailures", () => { - it("should return empty overrides when no max-retries failures exist", () => { + it("should return empty overrides when no retry failures exist", () => { const results: Record = { "node-1": { success: true }, "node-2": { success: false, error: "HTTP 500 Internal Server Error" }, @@ -46,7 +84,7 @@ describe("reconcileMaxRetriesFailures", () => { expect(overriddenNodeIds).toEqual([]); }); - it("should override to success when node has a tracked success", () => { + it("should override exceeded-max-retries failure with tracked success", () => { const results: Record< string, { success: boolean; error?: string; data?: unknown } @@ -76,6 +114,36 @@ describe("reconcileMaxRetriesFailures", () => { }); }); + it("should override failed-after-retries failure with tracked success", () => { + const results: Record< + string, + { success: boolean; error?: string; data?: unknown } + > = { + "node-1": { + success: false, + error: + 'Step "step//abc//checkBalance" failed after 0 retries: step_completed event failed', + }, + }; + + const successfulSteps = new Map([ + ["node-1", { balance: "1.5" }], + ]); + + const { overriddenNodeIds } = reconcileMaxRetriesFailures({ + results, + successfulSteps, + executionId: "exec-1", + workflowId: "wf-1", + }); + + expect(overriddenNodeIds).toEqual(["node-1"]); + expect(results["node-1"]).toEqual({ + success: true, + data: { balance: "1.5" }, + }); + }); + it("should override to success when node output is undefined", () => { const results: Record< string, @@ -132,7 +200,8 @@ describe("reconcileMaxRetriesFailures", () => { }, "node-2": { success: false, - error: 'Step "step//b//condition" exceeded max retries (0 retries)', + error: + 'Step "step//b//checkBalance" failed after 3 retries: state conflict', }, "node-3": { success: true }, }; @@ -155,7 +224,7 @@ describe("reconcileMaxRetriesFailures", () => { }); }); - it("should not produce overrides for non-max-retries failures", () => { + it("should not produce overrides for non-retry failures", () => { const results: Record< string, { success: boolean; error?: string; data?: unknown }