From e294aff0f9fcd15945a343bd80f41ad81189d4b7 Mon Sep 17 00:00:00 2001
From: Kris Zyp
Date: Thu, 21 May 2026 07:57:09 -0600
Subject: [PATCH 1/2] fix: resolve flaky integration tests in shards 2 and 4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Shard 2 (cloneNode): waitForAvailableStatus doesn't guarantee data has
finished copying to the clone. Add a retry loop for the search_by_id
verification so the test polls until the record appears rather than
failing immediately on a race.

Shard 4 (replicationLoad): the Replication Load Testing suite hit its
120 s timeout on shared CI runners — the suite includes node startup,
cluster formation, 5000 upserts, app deploy with restart, a 35 s sleep,
and 5000 blob requests. Raise the timeout to 300 s.

Co-Authored-By: Claude Sonnet 4.6
---
 integrationTests/cloneNode/cloneNode.test.mjs | 27 +++++++++++++------
 .../cluster/replicationLoad.test.mjs          |  2 +-
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/integrationTests/cloneNode/cloneNode.test.mjs b/integrationTests/cloneNode/cloneNode.test.mjs
index 7f0c80016..ebbd372e7 100644
--- a/integrationTests/cloneNode/cloneNode.test.mjs
+++ b/integrationTests/cloneNode/cloneNode.test.mjs
@@ -205,14 +205,25 @@ suite('Clone Node', (ctx) => {
 		equal(clusterStatusNode2.connections.length, 1, 'Clone node should have 1 connection');
 		equal(clusterStatusNode2.connections?.[0]?.database_sockets.length, 2, 'Clone node should be connected to leader');
 
-		// Verify that data was cloned successfully by querying the clone node for data that was inserted into the leader node before cloning
-		const responseData = await sendOperation(ctx.nodes[1], {
-			operation: 'search_by_id',
-			table: 'test',
-			get_attributes: ['id', 'name'],
-			ids: ['1'],
-		});
-		equal(responseData.length, 1, 'Should find 1 record in clone node');
+		// Verify that data was cloned successfully by querying the clone node for data that was inserted into the leader node before cloning.
+		// "Available" status doesn't guarantee all data has finished copying, so retry until the record appears.
+		let responseData;
+		for (let retries = 0; ; retries++) {
+			try {
+				responseData = await sendOperation(ctx.nodes[1], {
+					operation: 'search_by_id',
+					table: 'test',
+					get_attributes: ['id', 'name'],
+					ids: ['1'],
+				});
+				if (responseData.length === 1) break;
+			} catch {}
+			if (retries >= 20) {
+				equal(responseData?.length ?? 0, 1, 'Should find 1 record in clone node');
+				break;
+			}
+			await sleep(500);
+		}
 		equal(responseData[0].name, 'test-clone', 'Record name should match the original');
 
 		const sshKeys = await sendOperation(ctx.nodes[1], {
diff --git a/integrationTests/cluster/replicationLoad.test.mjs b/integrationTests/cluster/replicationLoad.test.mjs
index caaccb208..df01e594d 100644
--- a/integrationTests/cluster/replicationLoad.test.mjs
+++ b/integrationTests/cluster/replicationLoad.test.mjs
@@ -19,7 +19,7 @@ process.env.HARPER_INTEGRATION_TEST_INSTALL_SCRIPT = join(
 );
 
 const NODE_COUNT = 3;
-suite('Replication Load Testing', { timeout: 120000 }, (ctx) => {
+suite('Replication Load Testing', { timeout: 300000 }, (ctx) => {
 	before(async () => {
 		// start up the nodes
 		ctx.nodes = await Promise.all(

From ee0ef7cfd82f4e1680e697942587c95cfd2e687c Mon Sep 17 00:00:00 2001
From: Kris Zyp
Date: Thu, 21 May 2026 08:20:38 -0600
Subject: [PATCH 2/2] fix: increase Deploy app and test replication inner suite timeout

The inner suite has its own 60 s timeout. Its before() deploys an app
with restart=true and waits a hard-coded 35 s, leaving insufficient
time for the 5000-request blob test. Raise to 180 s.

Co-Authored-By: Claude Sonnet 4.6
---
 integrationTests/cluster/replicationLoad.test.mjs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integrationTests/cluster/replicationLoad.test.mjs b/integrationTests/cluster/replicationLoad.test.mjs
index df01e594d..51a8e59fb 100644
--- a/integrationTests/cluster/replicationLoad.test.mjs
+++ b/integrationTests/cluster/replicationLoad.test.mjs
@@ -267,7 +267,7 @@ suite('Replication Load Testing', { timeout: 300000 }, (ctx) => {
 			}
 		}
 	});
-	suite('Deploy app and test replication', { timeout: 60000 }, () => {
+	suite('Deploy app and test replication', { timeout: 180000 }, () => {
 		before(async () => {
 			const project = 'test-application';
 			const payload = await targz(join(import.meta.dirname, 'fixture'));