Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions e2e-tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,23 @@ Feature lifecycle tests: describe what the test exercises end-to-end

- `dev-lifecycle.test.ts`
- `evals-lifecycle.test.ts`
- `ab-test-config-bundle.test.ts` — A/B test (config-bundle mode): create → run → pause → resume → promote, asserting
live execution state from AWS via `view ab-test`
- `ab-test-target-based.test.ts` — A/B test (target-based mode): two http-runtime gateway-targets on named runtime
endpoints, each scoped by its own online-eval → run → pause → resume → promote (control endpoint version-bumped to
treatment's)
- `httpgateway-all-targets.test.ts` — one `protocolType: None` (HTTP) gateway hosting every deployable target type
(http-runtime, mcp-server, lambda-function-arn, api-gateway, open-api-schema, smithy-model, web-search, passthrough),
deployed in a single stack. `passthrough` is gated, so its add/deploy run with `ENABLE_GATED_FEATURES=1`. Omits
`connector` (Bedrock FMKB, a private-beta CFN resource type).

### Fixtures that provision external AWS resources

Some gateway-target types reference AWS resources that `agentcore deploy` cannot create (an existing Lambda, a REST
API). `fixtures/gateway-targets/setup_target_prereqs.py` creates them idempotently (check-then-create, reused across
runs) and writes their identifiers to a per-run JSON file the test reads — mirroring `fixtures/import/`'s boto3 setup
pattern. If the IAM role lacks permission to create one (e.g. a restricted CI role without `lambda:*`/`apigateway:*`),
the fixture emits `null` for that identifier and the test skips the dependent target rather than failing the suite.

## Important Notes

Expand Down
340 changes: 340 additions & 0 deletions e2e-tests/ab-test-config-bundle.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,340 @@
/**
* E2E test for A/B tests (config-bundle mode) across the AWS boundary.
*
* Flow: create project → add gateway → add config bundle (v1) → deploy →
* update bundle (v2) → deploy → add online-eval (Builtin evaluator) → deploy →
* run ab-test → view (poll RUNNING) → pause → view (PAUSED) → resume →
* view (RUNNING) → promote → archive
*
* A/B tests are fire-and-forget jobs, not project resources, so cleanup must
* `archive` the test explicitly — `remove all` does not touch it.
*
* Live-AWS behaviours this proves (per e2e-tests/README.md): pause / resume /
* promote return live execution state from AWS. `view ab-test --json` re-fetches
* server state; the live execution status (RUNNING/PAUSED/STOPPED) surfaces in
* the `lifecycleStatus` field (handler.refresh maps executionStatus → lifecycleStatus).
*
* Prerequisites: AWS credentials, npm, git, uv.
*/
import { parseJsonOutput, retry } from '../src/test-utils/index.js';
import {
baseCanRun,
hasAws,
installCdkTarball,
runAgentCoreCLI,
teardownE2EProject,
writeAwsTargets,
} from './e2e-helper.js';
import { randomUUID } from 'node:crypto';
import { mkdir, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterAll, beforeAll, describe, expect, it } from 'vitest';

// Suite gate: truthy only when both `baseCanRun` and `hasAws` (imported from
// ./e2e-helper.js) are truthy. `beforeAll` returns early and every test is
// skipped via `it.skipIf(!canRun)` when it is falsy.
const canRun = baseCanRun && hasAws;

describe.sequential('e2e: A/B test lifecycle (config-bundle mode)', () => {
// Paths assigned in beforeAll: the scratch directory and the project path reported by `create`.
let testDir: string;
let projectPath: string;

// Last eight characters of the epoch-millisecond timestamp; varies the agent name between runs.
const suffix = Date.now().toString().slice(-8);
const agentName = 'E2eAbt' + suffix;

// Fixed resource names used by the steps below.
const gatewayName = 'abtgw';
const bundleName = 'E2eAbtBundle';
const onlineEvalName = 'E2eAbtEval';
const abTestName = 'E2eAbtTest';

// Values produced by earlier sequential steps and consumed by later ones.
let controlVersionId: string;
let abTestId: string;

// One-time setup: make a scratch directory, scaffold the project with `create`,
// record the project path from the JSON output, then write the AWS targets and
// install the CDK tarball into that project.
beforeAll(async () => {
  // Nothing to prepare when the suite is gated off.
  if (!canRun) return;

  testDir = join(tmpdir(), `agentcore-e2e-ab-test-${randomUUID()}`);
  await mkdir(testDir, { recursive: true });

  const createArgs = [
    'create',
    '--name',
    agentName,
    '--language',
    'Python',
    '--framework',
    'Strands',
    '--model-provider',
    'Bedrock',
    '--memory',
    'none',
    '--json',
  ];
  const created = await runAgentCoreCLI(createArgs, testDir);
  expect(created.exitCode, `Create failed: ${created.stderr}`).toBe(0);

  const { projectPath: createdPath } = parseJsonOutput(created.stdout) as { projectPath: string };
  projectPath = createdPath;

  await writeAwsTargets(projectPath);
  installCdkTarball(projectPath);
}, 300000);

// Cleanup. A/B tests are jobs, not project resources, so the test is archived
// explicitly before the project teardown. The stages are chained with `finally`
// so a rejected archive or teardown call cannot prevent the later stages
// (project teardown, scratch-directory removal) from running; the original
// error still propagates once they have run.
afterAll(async () => {
  try {
    if (abTestId && projectPath && hasAws) {
      const archived = await runAgentCoreCLI(['archive', 'ab-test', '-i', abTestId, '--json'], projectPath);
      // Best-effort: report a failed archive in the log instead of ignoring the result.
      if (archived.exitCode !== 0) {
        console.log('archive ab-test stdout/stderr:', archived.stdout, archived.stderr);
      }
    }
  } finally {
    try {
      if (projectPath && hasAws) {
        await teardownE2EProject(projectPath, agentName, 'Bedrock');
      }
    } finally {
      if (testDir) await rm(testDir, { recursive: true, force: true, maxRetries: 3, retryDelay: 1000 });
    }
  }
}, 600000);

// Shorthand for invoking the CLI with `projectPath` as the second argument.
const run = (args: string[]) => {
  return runAgentCoreCLI(args, projectPath);
};

// Builds the JSON string passed to `--components`: one entry keyed by the
// `{{runtime:<agentName>}}` placeholder, whose `configuration` holds the given
// system prompt and temperature.
const bundleComponents = (systemPrompt: string, temperature: number) => {
  const runtimeKey = `{{runtime:${agentName}}}`;
  const configuration = { systemPrompt, temperature };
  return JSON.stringify({ [runtimeKey]: { configuration } });
};

// ── Gateway (required: AB tests resolve a deployed gateway ARN) ──────────

// Adds the gateway that the later `run ab-test -g` step refers to by name.
it.skipIf(!canRun)(
  'adds a gateway',
  async () => {
    const gatewayArgs = ['add', 'gateway', '--name', gatewayName, '--protocol-type', 'None', '--json'];
    const outcome = await run(gatewayArgs);
    expect(outcome.exitCode, `Add gateway failed: ${outcome.stdout}`).toBe(0);

    const payload = parseJsonOutput(outcome.stdout) as { success: boolean };
    expect(payload.success).toBe(true);
  },
  60000
);

// ── Config bundle v1 + deploy ────────────────────────────────────────────

// Adds the control bundle on the `mainline` branch, then deploys it.
it.skipIf(!canRun)(
  'adds config bundle (v1) and deploys',
  async () => {
    const addArgs = [
      'add',
      'config-bundle',
      '--name',
      bundleName,
      '--description',
      'AB test bundle',
      '--components',
      bundleComponents('You are control: concise.', 0.5),
      '--branch',
      'mainline',
      '--commit-message',
      'v1 control',
      '--json',
    ];
    const addResult = await run(addArgs);
    expect(addResult.exitCode, `Add config-bundle failed: ${addResult.stdout}`).toBe(0);
    const addPayload = parseJsonOutput(addResult.stdout) as { success: boolean };
    expect(addPayload.success).toBe(true);

    const deployResult = await run(['deploy', '--yes', '--json']);
    // Dump the full output before asserting so a failed deploy can be diagnosed from the log.
    if (deployResult.exitCode !== 0) {
      console.log('Deploy v1 stdout/stderr:', deployResult.stdout, deployResult.stderr);
    }
    expect(deployResult.exitCode, 'Deploy v1 failed').toBe(0);
    const deployPayload = parseJsonOutput(deployResult.stdout) as { success: boolean };
    expect(deployPayload.success).toBe(true);
  },
  600000
);

// ── Config bundle v2 (remove + re-add same name + redeploy = version bump) ─

// Produces a second version of the same bundle: remove it, re-add it under the
// same name with the treatment settings, and redeploy. The checks mirror the v1
// step (success flag asserted on the add and deploy JSON output, stdout/stderr
// logged when the deploy fails) so a problem is reported at this step rather
// than surfacing later as a missing version.
it.skipIf(!canRun)(
  'updates config bundle to v2 (second version of the same bundle) and deploys',
  async () => {
    const remove = await run(['remove', 'config-bundle', '--name', bundleName, '--json']);
    expect(remove.exitCode, `Remove config-bundle failed: ${remove.stdout}`).toBe(0);

    const add = await run([
      'add',
      'config-bundle',
      '--name',
      bundleName,
      '--description',
      'AB test bundle - treatment',
      '--components',
      bundleComponents('You are treatment: detailed and thorough.', 0.9),
      '--branch',
      'mainline',
      '--commit-message',
      'v2 treatment',
      '--json',
    ]);
    expect(add.exitCode, `Re-add config-bundle failed: ${add.stdout}`).toBe(0);
    expect((parseJsonOutput(add.stdout) as { success: boolean }).success).toBe(true);

    const deploy = await run(['deploy', '--yes', '--json']);
    if (deploy.exitCode !== 0) console.log('Deploy v2 stdout/stderr:', deploy.stdout, deploy.stderr);
    expect(deploy.exitCode, `Redeploy failed: ${deploy.stdout}`).toBe(0);
    expect((parseJsonOutput(deploy.stdout) as { success: boolean }).success).toBe(true);
  },
  600000
);

// Lists the bundle's versions and records the control version id for the A/B test step.
it.skipIf(!canRun)(
  'config-bundle versions lists both versions (captures control = oldest)',
  async () => {
    const listing = await run(['config-bundle', 'versions', '--name', bundleName, '--json']);
    expect(listing.exitCode, `cb versions failed: ${listing.stderr}`).toBe(0);

    const { versions } = parseJsonOutput(listing.stdout) as { versions: { versionId: string }[] };
    expect(versions.length).toBeGreaterThanOrEqual(2);

    // The list is newest-first, so the final entry is the oldest version — the control.
    // Treatment is addressed as LATEST, so only the control id needs capturing.
    const oldest = versions[versions.length - 1]!;
    controlVersionId = oldest.versionId;
    expect(controlVersionId).toBeTruthy();
  },
  120000
);

// ── Online-eval (Builtin evaluator — no custom evaluator resource needed) ──

// Adds an online-eval config using a builtin evaluator for the agent runtime, then deploys it.
it.skipIf(!canRun)(
  'adds an online-eval config and deploys',
  async () => {
    const addArgs = [
      'add',
      'online-eval',
      '--name',
      onlineEvalName,
      '--runtime',
      agentName,
      '--evaluator',
      'Builtin.Faithfulness',
      '--sampling-rate',
      '100',
      '--json',
    ];
    const addResult = await run(addArgs);
    expect(addResult.exitCode, `Add online-eval failed: ${addResult.stdout}`).toBe(0);

    const { success, configName } = parseJsonOutput(addResult.stdout) as { success: boolean; configName: string };
    expect(success).toBe(true);
    expect(configName).toBe(onlineEvalName);

    const deployResult = await run(['deploy', '--yes', '--json']);
    // Dump the full output before asserting so a failed deploy can be diagnosed from the log.
    if (deployResult.exitCode !== 0) {
      console.log('Deploy eval stdout/stderr:', deployResult.stdout, deployResult.stderr);
    }
    expect(deployResult.exitCode, 'Deploy online-eval failed').toBe(0);
    const deployPayload = parseJsonOutput(deployResult.stdout) as { success: boolean };
    expect(deployPayload.success).toBe(true);
  },
  600000
);

// ── Create the A/B test ───────────────────────────────────────────────────

// Starts the A/B test in config-bundle mode: control is the version id captured
// by the versions step, treatment is LATEST of the same bundle.
it.skipIf(!canRun)(
  'runs the A/B test (control = oldest version, treatment = LATEST)',
  async () => {
    expect(controlVersionId, 'Control version should have been captured').toBeTruthy();

    // Auto-creates an IAM role and retries on AccessDenied while IAM propagates;
    // retry the whole call to absorb propagation flakiness.
    let runJson: { mode: string; variants: { name: string }[] } | undefined;
    await retry(
      async () => {
        const result = await run([
          'run',
          'ab-test',
          '-n',
          abTestName,
          '-g',
          gatewayName,
          '--mode',
          'config-bundle',
          '--control-bundle',
          bundleName,
          '--control-version',
          controlVersionId,
          '--treatment-bundle',
          bundleName,
          '--treatment-version',
          'LATEST',
          '--online-eval',
          onlineEvalName,
          '--runtime',
          agentName,
          '--json',
        ]);

        if (result.exitCode !== 0) console.log('run ab-test stdout/stderr:', result.stdout, result.stderr);
        expect(result.exitCode, `run ab-test failed: ${result.stdout}`).toBe(0);
        const json = parseJsonOutput(result.stdout) as {
          success: boolean;
          id: string;
          mode: string;
          variants: { name: string }[];
        };
        // Record the id as soon as the output is parsed — ahead of the assertions
        // below, which can throw — so afterAll archives the test even when one of
        // them fails.
        if (json.id) abTestId = json.id;
        expect(json.success).toBe(true);
        expect(json.id).toBeTruthy();
        runJson = json;
      },
      3,
      20000
    );
    // Deterministic checks live outside retry — a mismatch must not re-create the test.
    expect(runJson!.mode).toBe('config-bundle');
    expect(runJson!.variants).toHaveLength(2);
  },
  300000
);

// ── pause / resume / promote — live execution state from AWS ───────────────

// Runs `view ab-test <id> --json`, asserts a zero exit code, and returns the
// `lifecycleStatus` field of the parsed output (where the live execution status
// RUNNING/PAUSED/STOPPED surfaces).
const viewExecutionStatus = async (): Promise<string> => {
  const viewed = await run(['view', 'ab-test', abTestId, '--json']);
  expect(viewed.exitCode, `view ab-test failed: ${viewed.stderr}`).toBe(0);

  const { lifecycleStatus } = parseJsonOutput(viewed.stdout) as { lifecycleStatus: string };
  return lifecycleStatus;
};

// Polls `view` until the reported status is RUNNING.
it.skipIf(!canRun)(
  'view reports the test reaching RUNNING',
  async () => {
    expect(abTestId, 'AB test ID should have been captured').toBeTruthy();

    const assertRunning = async () => {
      expect(await viewExecutionStatus()).toBe('RUNNING');
    };
    // NOTE(review): 12 and 10000 are presumably attempt count and delay in ms — confirm against `retry` in test-utils.
    await retry(assertRunning, 12, 10000);
  },
  180000
);

// Pauses the test, then polls `view` until the reported status is PAUSED.
it.skipIf(!canRun)(
  'pause sets live execution state to PAUSED',
  async () => {
    const paused = await run(['pause', 'ab-test', '-i', abTestId, '--json']);
    expect(paused.exitCode, `pause failed: ${paused.stderr}`).toBe(0);

    const payload = parseJsonOutput(paused.stdout) as { success: boolean; id: string };
    expect(payload.success).toBe(true);

    const assertPaused = async () => expect(await viewExecutionStatus()).toBe('PAUSED');
    await retry(assertPaused, 6, 10000);
  },
  120000
);

// Resumes the test, then polls `view` until the reported status is RUNNING again.
it.skipIf(!canRun)(
  'resume sets live execution state back to RUNNING',
  async () => {
    const resumed = await run(['resume', 'ab-test', '-i', abTestId, '--json']);
    expect(resumed.exitCode, `resume failed: ${resumed.stderr}`).toBe(0);

    const payload = parseJsonOutput(resumed.stdout) as { success: boolean };
    expect(payload.success).toBe(true);

    const assertRunning = async () => expect(await viewExecutionStatus()).toBe('RUNNING');
    await retry(assertRunning, 6, 10000);
  },
  120000
);

// Promotes the test, then polls `view` until the reported status is STOPPED.
it.skipIf(!canRun)(
  'promote stops the test and applies the winning variant to config',
  async () => {
    // promote waits for RUNNING (up to ~120s), stops the test, rewrites the bundle.
    const promoted = await run(['promote', 'ab-test', '-i', abTestId, '--json']);
    if (promoted.exitCode !== 0) {
      console.log('promote stdout/stderr:', promoted.stdout, promoted.stderr);
    }
    expect(promoted.exitCode, `promote failed: ${promoted.stdout}`).toBe(0);

    const payload = parseJsonOutput(promoted.stdout) as { success: boolean; id: string };
    expect(payload.success).toBe(true);

    const assertStopped = async () => expect(await viewExecutionStatus()).toBe('STOPPED');
    await retry(assertStopped, 6, 10000);
  },
  180000
);
});
Loading
Loading