From 816d141c16f5f56f89c9c0aee4774fff73482438 Mon Sep 17 00:00:00 2001
From: Aidan Daly <aidandal@amazon.com>
Date: Thu, 25 Jun 2026 06:12:57 +0000
Subject: [PATCH] fix(unit-only): re-tune dataset-eval e2e timing budget and
 export SPAN_INGESTION_DELAY_MS (#1522)

Export SPAN_INGESTION_DELAY_MS so the progress log and the e2e timeout
guard derive from the real 180s span-ingestion floor instead of stale
hardcoded values. Raise the per-it ceiling 300000 -> 420000 ms (within the
600000 ms suite cap), lower retries 18 -> 2, and add an in-test guard
asserting the timeout covers the full retry budget. Correct the misleading
'Waiting for span ingestion (15s)...' log to render the real 180s wait.

Refs aws/agentcore-cli#1522
---
 e2e-tests/dataset-eval-integration.test.ts    | 20 ++++++++++++++++---
 .../shared/__tests__/span-collector.test.ts   | 10 +++++++++-
 .../eval/shared/dataset-session-provider.ts   |  4 ++--
 .../operations/eval/shared/span-collector.ts  |  2 +-
 4 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/e2e-tests/dataset-eval-integration.test.ts b/e2e-tests/dataset-eval-integration.test.ts
index c08406fd7..e9fe02906 100644
--- a/e2e-tests/dataset-eval-integration.test.ts
+++ b/e2e-tests/dataset-eval-integration.test.ts
@@ -10,6 +10,7 @@
  *   - AWS credentials
  *   - npm, git, uv installed
  */
+import { SPAN_INGESTION_DELAY_MS } from '../src/cli/operations/eval/shared/span-collector.js';
 import { parseJsonOutput, retry } from '../src/test-utils/index.js';
 import {
   baseCanRun,
@@ -27,6 +28,10 @@ import { afterAll, beforeAll, describe, expect, it } from 'vitest';
 
 const canRun = baseCanRun && hasAws;
 
+// Per-`it` ceiling for the dataset eval run. Must cover the full retry budget
+// (see assertion in the test body); stays within the 600000ms e2e suite cap.
+const EVAL_IT_TIMEOUT_MS = 420000;
+
 describe.sequential('e2e: dataset eval integration', () => {
   let testDir: string;
   let projectPath: string;
@@ -158,6 +163,15 @@ describe.sequential('e2e: dataset eval integration', () => {
   it.skipIf(!canRun)(
     'runs evaluation using dataset as input',
     async () => {
+      // Each `run eval --dataset` attempt has a ~180s span-ingestion floor
+      // (SPAN_INGESTION_DELAY_MS), so the per-`it` timeout below must cover at
+      // least the retry budget's lower bound: retries * (ingestion floor + gap).
+      const evalRetries = 2;
+      const evalRetryGapMs = 10000;
+      expect(EVAL_IT_TIMEOUT_MS).toBeGreaterThanOrEqual(
+        evalRetries * (SPAN_INGESTION_DELAY_MS + evalRetryGapMs)
+      );
+
       await retry(
         async () => {
           const result = await run([
@@ -178,10 +192,10 @@ describe.sequential('e2e: dataset eval integration', () => {
           expect(json).toHaveProperty('success', true);
           expect(json).toHaveProperty('run');
         },
-        18,
-        10000
+        evalRetries,
+        evalRetryGapMs
       );
     },
-    300000
+    EVAL_IT_TIMEOUT_MS
   );
 });
diff --git a/src/cli/operations/eval/shared/__tests__/span-collector.test.ts b/src/cli/operations/eval/shared/__tests__/span-collector.test.ts
index 6c7e1ded2..c94c73a16 100644
--- a/src/cli/operations/eval/shared/__tests__/span-collector.test.ts
+++ b/src/cli/operations/eval/shared/__tests__/span-collector.test.ts
@@ -1,7 +1,15 @@
-import { collectSpans, extractTraceIds } from '../span-collector';
+import { SPAN_INGESTION_DELAY_MS, collectSpans, extractTraceIds } from '../span-collector';
 import type { DocumentType } from '@smithy/types';
 import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
 
+describe('SPAN_INGESTION_DELAY_MS progress message', () => {
+  it('renders the real wait (180s), not a stale literal', () => {
+    const message = `Waiting for span ingestion (${SPAN_INGESTION_DELAY_MS / 1000}s)...`;
+    expect(message).toContain('180s');
+    expect(message).not.toContain('15s');
+  });
+});
+
 describe('extractTraceIds', () => {
   it('extracts unique traceIds in appearance order', () => {
     const spans = [
diff --git a/src/cli/operations/eval/shared/dataset-session-provider.ts b/src/cli/operations/eval/shared/dataset-session-provider.ts
index 40b3bc19e..94d099723 100644
--- a/src/cli/operations/eval/shared/dataset-session-provider.ts
+++ b/src/cli/operations/eval/shared/dataset-session-provider.ts
@@ -13,7 +13,7 @@ import type { AgentContext } from '../../invoke/resolve-agent-context';
 import { loadDatasetScenarios } from './dataset-loader';
 import { executeScenarios } from './scenario-executor';
 import type { ScenarioInvocationResult } from './scenario-executor';
-import { collectSpans, extractTraceIds } from './span-collector';
+import { SPAN_INGESTION_DELAY_MS, collectSpans, extractTraceIds } from './span-collector';
 import type { PredefinedScenario } from './types';
 import type { DocumentType } from '@smithy/types';
 
@@ -138,7 +138,7 @@ export async function runDatasetScenariosAndCollectSpans(
   const logGroup = runtimeLogGroup(agentContext.runtimeId, agentContext.endpoint);
   const sessionIds = successfulResults.map(r => r.sessionId);
 
-  onProgress?.('collect', 'Waiting for span ingestion (15s)...');
+  onProgress?.('collect', `Waiting for span ingestion (${SPAN_INGESTION_DELAY_MS / 1000}s)...`);
   const { spans: collectedSpans, timedOut } = await collectSpans({
     sessionIds,
     region: agentContext.region,
diff --git a/src/cli/operations/eval/shared/span-collector.ts b/src/cli/operations/eval/shared/span-collector.ts
index deafb709c..f4f2f5e03 100644
--- a/src/cli/operations/eval/shared/span-collector.ts
+++ b/src/cli/operations/eval/shared/span-collector.ts
@@ -13,7 +13,7 @@ import type { DocumentType } from '@smithy/types';
  * Default delay before first span query (CloudWatch ingestion buffer).
  * Matches SDK's evaluation_delay_seconds default (180s).
  */
-const SPAN_INGESTION_DELAY_MS = 180_000;
+export const SPAN_INGESTION_DELAY_MS = 180_000;
 
 /** Maximum time to poll for spans after the ingestion delay. */
 const SPAN_POLL_TIMEOUT_MS = 60_000;