diff --git a/e2e-tests/dataset-eval-integration.test.ts b/e2e-tests/dataset-eval-integration.test.ts
index c08406fd7..e9fe02906 100644
--- a/e2e-tests/dataset-eval-integration.test.ts
+++ b/e2e-tests/dataset-eval-integration.test.ts
@@ -10,6 +10,7 @@
  * - AWS credentials
  * - npm, git, uv installed
  */
+import { SPAN_INGESTION_DELAY_MS } from '../src/cli/operations/eval/shared/span-collector.js';
 import { parseJsonOutput, retry } from '../src/test-utils/index.js';
 import {
   baseCanRun,
@@ -27,6 +28,10 @@ import { afterAll, beforeAll, describe, expect, it } from 'vitest';
 
 const canRun = baseCanRun && hasAws;
 
+// Per-`it` ceiling for the dataset eval run. Must cover the full retry budget
+// (see assertion in the test body); stays within the 600000ms e2e suite cap.
+const EVAL_IT_TIMEOUT_MS = 420000;
+
 describe.sequential('e2e: dataset eval integration', () => {
   let testDir: string;
   let projectPath: string;
@@ -158,6 +163,15 @@ describe.sequential('e2e: dataset eval integration', () => {
   it.skipIf(!canRun)(
     'runs evaluation using dataset as input',
     async () => {
+      // Each `run eval --dataset` attempt has a ~180s span-ingestion floor
+      // (SPAN_INGESTION_DELAY_MS), so the per-`it` timeout below must leave room
+      // for the whole retry budget: retries * (ingestion floor + gap).
+      const evalRetries = 2;
+      const evalRetryGapMs = 10000;
+      expect(EVAL_IT_TIMEOUT_MS).toBeGreaterThanOrEqual(
+        evalRetries * (SPAN_INGESTION_DELAY_MS + evalRetryGapMs)
+      );
+
       await retry(
         async () => {
           const result = await run([
@@ -178,10 +192,10 @@ describe.sequential('e2e: dataset eval integration', () => {
           expect(json).toHaveProperty('success', true);
           expect(json).toHaveProperty('run');
         },
-        18,
-        10000
+        evalRetries,
+        evalRetryGapMs
       );
     },
-    300000
+    EVAL_IT_TIMEOUT_MS
   );
 });
diff --git a/src/cli/operations/eval/shared/__tests__/span-collector.test.ts b/src/cli/operations/eval/shared/__tests__/span-collector.test.ts
index 6c7e1ded2..c94c73a16 100644
--- a/src/cli/operations/eval/shared/__tests__/span-collector.test.ts
+++ b/src/cli/operations/eval/shared/__tests__/span-collector.test.ts
@@ -1,7 +1,15 @@
-import { collectSpans, extractTraceIds } from '../span-collector';
+import { SPAN_INGESTION_DELAY_MS, collectSpans, extractTraceIds } from '../span-collector';
 import type { DocumentType } from '@smithy/types';
 import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
 
+describe('SPAN_INGESTION_DELAY_MS progress message', () => {
+  it('renders the real wait (180s), not a stale literal', () => {
+    const message = `Waiting for span ingestion (${SPAN_INGESTION_DELAY_MS / 1000}s)...`;
+    expect(message).toContain('180s');
+    expect(message).not.toContain('15s');
+  });
+});
+
 describe('extractTraceIds', () => {
   it('extracts unique traceIds in appearance order', () => {
     const spans = [
diff --git a/src/cli/operations/eval/shared/dataset-session-provider.ts b/src/cli/operations/eval/shared/dataset-session-provider.ts
index 40b3bc19e..94d099723 100644
--- a/src/cli/operations/eval/shared/dataset-session-provider.ts
+++ b/src/cli/operations/eval/shared/dataset-session-provider.ts
@@ -13,7 +13,7 @@ import type { AgentContext } from '../../invoke/resolve-agent-context';
 import { loadDatasetScenarios } from './dataset-loader';
 import { executeScenarios } from './scenario-executor';
 import type { ScenarioInvocationResult } from './scenario-executor';
-import { collectSpans, extractTraceIds } from './span-collector';
+import { SPAN_INGESTION_DELAY_MS, collectSpans, extractTraceIds } from './span-collector';
 import type { PredefinedScenario } from './types';
 import type { DocumentType } from '@smithy/types';
 
@@ -138,7 +138,7 @@ export async function runDatasetScenariosAndCollectSpans(
   const logGroup = runtimeLogGroup(agentContext.runtimeId, agentContext.endpoint);
   const sessionIds = successfulResults.map(r => r.sessionId);
 
-  onProgress?.('collect', 'Waiting for span ingestion (15s)...');
+  onProgress?.('collect', `Waiting for span ingestion (${SPAN_INGESTION_DELAY_MS / 1000}s)...`);
   const { spans: collectedSpans, timedOut } = await collectSpans({
     sessionIds,
     region: agentContext.region,
diff --git a/src/cli/operations/eval/shared/span-collector.ts b/src/cli/operations/eval/shared/span-collector.ts
index deafb709c..f4f2f5e03 100644
--- a/src/cli/operations/eval/shared/span-collector.ts
+++ b/src/cli/operations/eval/shared/span-collector.ts
@@ -13,7 +13,7 @@ import type { DocumentType } from '@smithy/types';
  * Default delay before first span query (CloudWatch ingestion buffer).
  * Matches SDK's evaluation_delay_seconds default (180s).
  */
-const SPAN_INGESTION_DELAY_MS = 180_000;
+export const SPAN_INGESTION_DELAY_MS = 180_000;
 
 /** Maximum time to poll for spans after the ingestion delay. */
 const SPAN_POLL_TIMEOUT_MS = 60_000;