From 816d141c16f5f56f89c9c0aee4774fff73482438 Mon Sep 17 00:00:00 2001 From: Aidan Daly Date: Thu, 25 Jun 2026 06:12:57 +0000 Subject: [PATCH] fix(unit-only): re-tune dataset-eval e2e timing budget and export SPAN_INGESTION_DELAY_MS (#1522) Export SPAN_INGESTION_DELAY_MS so the progress log derives from, and the per-it timeout is checked against, the real 180s span-ingestion floor instead of stale hardcoded values. Raise the per-it ceiling 300000 -> 420000 (within the 600000 suite cap), lower retries 18 -> 2, and add an in-test guard asserting the timeout covers the retry budget at that floor, i.e. retries * (ingestion floor + retry gap). Correct the misleading 'Waiting for span ingestion (15s)...' log to render the real 180s wait. Refs aws/agentcore-cli#1522 --- e2e-tests/dataset-eval-integration.test.ts | 20 ++++++++++++++++--- .../shared/__tests__/span-collector.test.ts | 10 +++++++++- .../eval/shared/dataset-session-provider.ts | 4 ++-- .../operations/eval/shared/span-collector.ts | 2 +- 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/e2e-tests/dataset-eval-integration.test.ts b/e2e-tests/dataset-eval-integration.test.ts index c08406fd7..e9fe02906 100644 --- a/e2e-tests/dataset-eval-integration.test.ts +++ b/e2e-tests/dataset-eval-integration.test.ts @@ -10,6 +10,7 @@ * - AWS credentials * - npm, git, uv installed */ +import { SPAN_INGESTION_DELAY_MS } from '../src/cli/operations/eval/shared/span-collector.js'; import { parseJsonOutput, retry } from '../src/test-utils/index.js'; import { baseCanRun, @@ -27,6 +28,10 @@ import { afterAll, beforeAll, describe, expect, it } from 'vitest'; const canRun = baseCanRun && hasAws; +// Per-`it` ceiling for the dataset eval run. Must cover the full retry budget +// (see assertion in the test body); stays within the 600000ms e2e suite cap. 
+const EVAL_IT_TIMEOUT_MS = 420000; + describe.sequential('e2e: dataset eval integration', () => { let testDir: string; let projectPath: string; @@ -158,6 +163,15 @@ describe.sequential('e2e: dataset eval integration', () => { it.skipIf(!canRun)( 'runs evaluation using dataset as input', async () => { + // Each `run eval --dataset` attempt has a ~180s span-ingestion floor + // (SPAN_INGESTION_DELAY_MS), so the per-`it` timeout below must leave room + // for the whole retry budget: retries * (ingestion floor + gap). + const evalRetries = 2; + const evalRetryGapMs = 10000; + expect(EVAL_IT_TIMEOUT_MS).toBeGreaterThanOrEqual( + evalRetries * (SPAN_INGESTION_DELAY_MS + evalRetryGapMs) + ); + await retry( async () => { const result = await run([ @@ -178,10 +192,10 @@ describe.sequential('e2e: dataset eval integration', () => { expect(json).toHaveProperty('success', true); expect(json).toHaveProperty('run'); }, - 18, - 10000 + evalRetries, + evalRetryGapMs ); }, - 300000 + EVAL_IT_TIMEOUT_MS ); }); diff --git a/src/cli/operations/eval/shared/__tests__/span-collector.test.ts b/src/cli/operations/eval/shared/__tests__/span-collector.test.ts index 6c7e1ded2..c94c73a16 100644 --- a/src/cli/operations/eval/shared/__tests__/span-collector.test.ts +++ b/src/cli/operations/eval/shared/__tests__/span-collector.test.ts @@ -1,7 +1,15 @@ -import { collectSpans, extractTraceIds } from '../span-collector'; +import { SPAN_INGESTION_DELAY_MS, collectSpans, extractTraceIds } from '../span-collector'; import type { DocumentType } from '@smithy/types'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +describe('SPAN_INGESTION_DELAY_MS progress message', () => { + it('renders the real wait (180s), not a stale literal', () => { + const message = `Waiting for span ingestion (${SPAN_INGESTION_DELAY_MS / 1000}s)...`; + expect(message).toContain('180s'); + expect(message).not.toContain('15s'); + }); +}); + describe('extractTraceIds', () => { it('extracts unique traceIds 
in appearance order', () => { const spans = [ diff --git a/src/cli/operations/eval/shared/dataset-session-provider.ts b/src/cli/operations/eval/shared/dataset-session-provider.ts index 40b3bc19e..94d099723 100644 --- a/src/cli/operations/eval/shared/dataset-session-provider.ts +++ b/src/cli/operations/eval/shared/dataset-session-provider.ts @@ -13,7 +13,7 @@ import type { AgentContext } from '../../invoke/resolve-agent-context'; import { loadDatasetScenarios } from './dataset-loader'; import { executeScenarios } from './scenario-executor'; import type { ScenarioInvocationResult } from './scenario-executor'; -import { collectSpans, extractTraceIds } from './span-collector'; +import { SPAN_INGESTION_DELAY_MS, collectSpans, extractTraceIds } from './span-collector'; import type { PredefinedScenario } from './types'; import type { DocumentType } from '@smithy/types'; @@ -138,7 +138,7 @@ export async function runDatasetScenariosAndCollectSpans( const logGroup = runtimeLogGroup(agentContext.runtimeId, agentContext.endpoint); const sessionIds = successfulResults.map(r => r.sessionId); - onProgress?.('collect', 'Waiting for span ingestion (15s)...'); + onProgress?.('collect', `Waiting for span ingestion (${SPAN_INGESTION_DELAY_MS / 1000}s)...`); const { spans: collectedSpans, timedOut } = await collectSpans({ sessionIds, region: agentContext.region, diff --git a/src/cli/operations/eval/shared/span-collector.ts b/src/cli/operations/eval/shared/span-collector.ts index deafb709c..f4f2f5e03 100644 --- a/src/cli/operations/eval/shared/span-collector.ts +++ b/src/cli/operations/eval/shared/span-collector.ts @@ -13,7 +13,7 @@ import type { DocumentType } from '@smithy/types'; * Default delay before first span query (CloudWatch ingestion buffer). * Matches SDK's evaluation_delay_seconds default (180s). */ -const SPAN_INGESTION_DELAY_MS = 180_000; +export const SPAN_INGESTION_DELAY_MS = 180_000; /** Maximum time to poll for spans after the ingestion delay. 
*/ const SPAN_POLL_TIMEOUT_MS = 60_000;