diff --git a/.github/workflows/e2e-scenarios-all.yaml b/.github/workflows/e2e-scenarios-all.yaml index d7de9666db..ce74552eaf 100644 --- a/.github/workflows/e2e-scenarios-all.yaml +++ b/.github/workflows/e2e-scenarios-all.yaml @@ -83,3 +83,10 @@ jobs: scenarios: ${{ matrix.id }} secrets: NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }} + + ubuntu-rebuild-openclaw: + uses: ./.github/workflows/e2e-scenarios.yaml + with: + scenarios: ubuntu-rebuild-openclaw + secrets: + NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }} diff --git a/.github/workflows/e2e-scenarios.yaml b/.github/workflows/e2e-scenarios.yaml index 4d68e695cc..49f317caff 100644 --- a/.github/workflows/e2e-scenarios.yaml +++ b/.github/workflows/e2e-scenarios.yaml @@ -63,6 +63,7 @@ jobs: [ubuntu-gateway-port-conflict-negative]=ubuntu-latest [ubuntu-invalid-nvidia-key-negative]=ubuntu-latest [ubuntu-no-docker-preflight-negative]=ubuntu-latest + [ubuntu-rebuild-openclaw]=ubuntu-latest [ubuntu-repo-cloud-hermes]=ubuntu-latest [ubuntu-repo-cloud-hermes-discord]=ubuntu-latest [ubuntu-repo-cloud-hermes-slack]=ubuntu-latest @@ -84,7 +85,6 @@ jobs: for raw in "${IDS[@]}"; do id="${raw//[[:space:]]/}" [ -n "${id}" ] || continue - npx tsx test/e2e-scenario/scenarios/run.ts --scenarios "${id}" --plan-only >/dev/null runner="${ROUTES[$id]:-}" if [ -z "${runner}" ]; then echo "::error::No runner route for scenario: ${id}" >&2 @@ -138,7 +138,7 @@ jobs: echo "::error::Invalid scenario input: ${SCENARIOS}" >&2 exit 1 fi - npx tsx test/e2e-scenario/scenarios/run.ts --scenarios "${SCENARIOS}" --dry-run + npx tsx test/e2e-scenario/scenarios/run.ts --scenarios "${SCENARIOS}" - name: Resolve workspace paths for WSL if: contains(inputs.scenarios || github.event.inputs.scenarios, 'wsl-repo-cloud-openclaw') @@ -302,7 +302,7 @@ jobs: export E2E_CONTEXT_DIR="`$workdir" npm ci --ignore-scripts set +e - npx tsx test/e2e-scenario/scenarios/run.ts --scenarios "`$scenarios" --dry-run + npx tsx test/e2e-scenario/scenarios/run.ts --scenarios "`$scenarios" status=`$? if [ -d "`$workdir/.e2e" ]; then rm -rf "`$checkout_dir/.e2e" @@ -324,14 +324,14 @@ jobs: exit $LASTEXITCODE } - - name: Append typed dry-run summary + - name: Append typed scenario summary if: always() shell: bash run: | { - echo '## E2E typed scenario dry-run' + echo '## E2E typed scenario run' echo '' - echo 'Mode: `test/e2e-scenario/scenarios/run.ts --dry-run`.' + echo 'Mode: `test/e2e-scenario/scenarios/run.ts --scenarios ` (live).' echo '' if [ -f .e2e/run-plan.json ]; then python3 - <<'PY' @@ -368,14 +368,25 @@ jobs: uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: e2e-scenario-${{ inputs.scenarios || github.event.inputs.scenarios }} + # Explicit subpath list, NOT a blanket .e2e/ + hidden files. + # The framework redacts every byte that flows from spawned + # children into actions/*.log, logs/*.log, and onboard.log via + # orchestrators/redaction.ts::pipeRedacted. Anything outside + # the listed paths (notably the raw context.env file) is + # excluded so secret-bearing key=value lines cannot leak via + # the artifact even if a future helper writes there. + # Diagnostic dumps of context use e2e_context_dump, which + # redacts on emit (runtime/lib/context.sh). path: | .e2e/run-plan.json .e2e/plan.txt .e2e/environment.result.json .e2e/onboarding.result.json .e2e/runtime.result.json - .e2e/ + .e2e/actions/ + .e2e/logs/ + .e2e/onboard.log test/e2e/logs/ if-no-files-found: warn retention-days: 14 - include-hidden-files: true + include-hidden-files: false diff --git a/test/e2e-scenario/docs/README.md b/test/e2e-scenario/docs/README.md index 5d27dd161e..6bfdaa098d 100644 --- a/test/e2e-scenario/docs/README.md +++ b/test/e2e-scenario/docs/README.md @@ -24,9 +24,10 @@ Use the source that matches the task while the migration is in progress: | Task | Current source | | --- | --- | -| Scenario workflow fan-out and dry-run planning | `test/e2e-scenario/scenarios/registry.ts`, `test/e2e-scenario/scenarios/scenarios/baseline.ts`, and `test/e2e-scenario/scenarios/run.ts` | +| Scenario workflow fan-out and live execution | `test/e2e-scenario/scenarios/registry.ts`, `test/e2e-scenario/scenarios/scenarios/baseline.ts`, and `test/e2e-scenario/scenarios/run.ts` | +| Typed expected-state registry (single source of truth) | `test/e2e-scenario/scenarios/expected-states.ts` | | Product-facing desired setup/onboarding state | `test/e2e-scenario/manifests/*.yaml` | -| Shell runner scenario resolution and live scenario execution | `test/e2e-scenario/nemoclaw_scenarios/scenarios.yaml`, `expected-states.yaml`, and `validation_suites/suites.yaml` | +| Shell runner scenario resolution and live scenario execution | `test/e2e-scenario/nemoclaw_scenarios/scenarios.yaml` and `validation_suites/suites.yaml` (legacy YAML resolver path retired) | | Reusable live suite assertions | `test/e2e-scenario/validation_suites/` | | Existing nightly and platform E2E coverage | legacy `test/e2e/test-*.sh` scripts and their workflows | @@ -158,7 +159,6 @@ test/e2e-scenario/ scenarios/ # Typed builders, registry, compiler, assertions, dry-run orchestration nemoclaw_scenarios/ # YAML runtime metadata and setup helpers scenarios.yaml - expected-states.yaml install/ onboard/ fixtures/ diff --git a/test/e2e-scenario/framework-tests/e2e-context-helper.test.ts b/test/e2e-scenario/framework-tests/e2e-context-helper.test.ts index 6a7c97959f..0134d6adc9 100644 --- a/test/e2e-scenario/framework-tests/e2e-context-helper.test.ts +++ b/test/e2e-scenario/framework-tests/e2e-context-helper.test.ts @@ -9,7 +9,6 @@ import path from "node:path"; const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); const CONTEXT_LIB = path.join(REPO_ROOT, "test/e2e-scenario/runtime/lib/context.sh"); -const RUN_SCENARIO = path.join(REPO_ROOT, "test/e2e-scenario/runtime/run-scenario.sh"); function runBash(script: string, env: Record = {}): SpawnSyncReturns { return spawnSync("bash", ["-c", script], { @@ -86,38 +85,4 @@ describe("E2E context helper (runtime/lib/context.sh)", () => { } }); - it("scenario_plan_execution_should_emit_context_under_dry_run", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-ctx-")); - try { - const r = spawnSync( - "bash", - [RUN_SCENARIO, "ubuntu-repo-cloud-openclaw", "--dry-run"], - { - env: { ...process.env, E2E_CONTEXT_DIR: tmp }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 60_000), - cwd: REPO_ROOT, - }, - ); - expect(r.status, r.stderr).toBe(0); - const ctxPath = path.join(tmp, "context.env"); - expect(fs.existsSync(ctxPath), `context.env missing in ${tmp}`).toBe(true); - const ctx = fs.readFileSync(ctxPath, "utf8"); - for (const key of [ - "E2E_SCENARIO", - "E2E_PLATFORM_OS", - "E2E_INSTALL_METHOD", - "E2E_ONBOARDING_PATH", - "E2E_AGENT", - "E2E_PROVIDER", - "E2E_SANDBOX_NAME", - "E2E_GATEWAY_URL", - "E2E_INFERENCE_ROUTE", - ]) { - expect(ctx, `${key} missing from context.env`).toMatch(new RegExp(`^${key}=`, "m")); - } - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); }); diff --git a/test/e2e-scenario/framework-tests/e2e-coverage-report.test.ts b/test/e2e-scenario/framework-tests/e2e-coverage-report.test.ts deleted file mode 100644 index b4a6056db0..0000000000 --- a/test/e2e-scenario/framework-tests/e2e-coverage-report.test.ts +++ /dev/null @@ -1,89 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -import { describe, it, expect } from "vitest"; -import path from "node:path"; - -import { loadMetadataFromDir, loadMetadataFromObjects } from "../runtime/resolver/load.ts"; -import { renderCoverageReport } from "../runtime/resolver/coverage.ts"; - -const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); -const E2E_DIR = path.join(REPO_ROOT, "test/e2e-scenario"); - -describe("coverage report", () => { - it("should_render_single_coverage_table", () => { - const meta = loadMetadataFromDir(E2E_DIR); - const md = renderCoverageReport(meta); - expect(md).toContain("test/e2e-scenario/nemoclaw_scenarios/{scenarios,expected-states}.yaml"); - expect(md).toContain("test/e2e-scenario/validation_suites/suites.yaml"); - // Exactly one primary Scenario Coverage table. - const headers = md.match(/\|\s*Scenario\s*\|\s*Platform\s*\|\s*Install\s*\|\s*Runtime\s*\|\s*Onboarding\s*\|\s*Expected state\s*\|\s*Suites\s*\|/g); - expect(headers).toBeTruthy(); - expect(headers?.length).toBe(1); - // Every scenario should appear as a row. - for (const id of Object.keys(meta.scenarios.setup_scenarios)) { - expect(md).toContain(id); - } - // Rows should be sorted deterministically (alphabetically). - const rowOrder = Object.keys(meta.scenarios.setup_scenarios).sort(); - let pos = 0; - for (const id of rowOrder) { - const idx = md.indexOf(`| ${id} |`, pos); - expect(idx, `row ${id} not found in order. report:\n${md}`).toBeGreaterThanOrEqual(0); - pos = idx; - } - }); - - it("should_flag_scenarios_without_suites", () => { - const meta = loadMetadataFromObjects({ - scenarios: { - platforms: { p: {} }, - installs: { i: {} }, - runtimes: { r: {} }, - onboarding: { o: { agent: "openclaw", provider: "nvidia" } }, - setup_scenarios: { - "empty-suite-scenario": { - dimensions: { platform: "p", install: "i", runtime: "r", onboarding: "o" }, - expected_state: "some-state", - suites: [], - }, - }, - }, - expectedStates: { expected_states: { "some-state": { gateway: { health: "healthy" } } } }, - suites: { suites: {} }, - }); - const md = renderCoverageReport(meta); - expect(md).toMatch(/## Gaps/); - expect(md).toMatch(/empty-suite-scenario.*no suites|no suites.*empty-suite-scenario/s); - }); - - it("should_flag_expected_states_not_used_by_any_scenario", () => { - const meta = loadMetadataFromObjects({ - scenarios: { - platforms: { p: {} }, - installs: { i: {} }, - runtimes: { r: {} }, - onboarding: { o: { agent: "openclaw", provider: "nvidia" } }, - setup_scenarios: { - s1: { - dimensions: { platform: "p", install: "i", runtime: "r", onboarding: "o" }, - expected_state: "used-state", - suites: ["smoke"], - }, - }, - }, - expectedStates: { - expected_states: { - "used-state": { gateway: { health: "healthy" } }, - "unused-state": { gateway: { health: "healthy" } }, - }, - }, - suites: { - suites: { smoke: { steps: [{ id: "a", script: "suites/smoke/a.sh" }] } }, - }, - }); - const md = renderCoverageReport(meta); - expect(md).toMatch(/## Gaps/); - expect(md).toMatch(/unused-state/); - }); -}); diff --git a/test/e2e-scenario/framework-tests/e2e-expected-failure.test.ts b/test/e2e-scenario/framework-tests/e2e-expected-failure.test.ts deleted file mode 100644 index bf2c751d51..0000000000 --- a/test/e2e-scenario/framework-tests/e2e-expected-failure.test.ts +++ /dev/null @@ -1,296 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -/** - * Unit tests for the expected-failure schema, resolver merge, and matcher. - * - * Companion to NemoClaw issue #3608. The scenario-additional-families - * suite covers the end-to-end plan shape; this file focuses on the new - * code paths in isolation so failures point at a single layer. - */ - -import { describe, it, expect } from "vitest"; -import yaml from "js-yaml"; - -import { loadMetadataFromObjects } from "../runtime/resolver/load.ts"; -import { resolveScenario } from "../runtime/resolver/plan.ts"; -import { - matchExpectedFailure, - type ObservedFailure, -} from "../runtime/resolver/expected-failure.ts"; -import type { ExpectedFailure } from "../runtime/resolver/schema.ts"; - -function makeMetadata(opts: { - stateBlock?: Record | null; - scenarioBlock?: Record | null; -}) { - const stateBlock = opts.stateBlock; - const scenarioBlock = opts.scenarioBlock; - const stateYaml: Record = { - cli: { installed: true }, - gateway: { expected: "absent" }, - sandbox: { expected: "absent" }, - }; - if (stateBlock !== undefined && stateBlock !== null) { - stateYaml.expected_failure = stateBlock; - } - const scenarioYaml: Record = { - dimensions: { - platform: "p", - install: "i", - runtime: "r", - onboarding: "o", - }, - expected_state: "neg", - suites: [], - }; - if (scenarioBlock !== undefined && scenarioBlock !== null) { - scenarioYaml.expected_failure = scenarioBlock; - } - return loadMetadataFromObjects({ - scenarios: { - platforms: { p: { os: "ubuntu" } }, - installs: { i: { method: "repo-checkout" } }, - runtimes: { r: { container_engine: "docker", container_daemon: "missing" } }, - onboarding: { o: { agent: "openclaw", provider: "nvidia" } }, - setup_scenarios: { s: scenarioYaml }, - }, - expectedStates: { - expected_states: { neg: stateYaml }, - }, - suites: { suites: {} }, - }); -} - -describe("expected_failure: loader validation", () => { - it("accepts a complete state-level block", () => { - const meta = makeMetadata({ - stateBlock: { - phase: "preflight", - error_class: "docker-missing", - message_pattern: "docker", - forbidden_side_effects: ["sandbox-created"], - }, - }); - const plan = resolveScenario("s", meta); - expect(plan.expected_failure?.phase).toBe("preflight"); - expect(plan.expected_failure?.error_class).toBe("docker-missing"); - }); - - it("rejects unknown phase", () => { - expect(() => - makeMetadata({ - stateBlock: { phase: "bogus", error_class: "docker-missing" }, - }), - ).toThrow(/expected_failure\.phase/); - }); - - it("rejects unknown error_class", () => { - expect(() => - makeMetadata({ - stateBlock: { phase: "preflight", error_class: "moon-missing" }, - }), - ).toThrow(/expected_failure\.error_class/); - }); - - it("rejects invalid message_pattern regex", () => { - expect(() => - makeMetadata({ - stateBlock: { - phase: "preflight", - error_class: "docker-missing", - message_pattern: "(unclosed", - }, - }), - ).toThrow(/message_pattern is not a valid regex/); - }); - - it("rejects unknown forbidden_side_effects entry", () => { - expect(() => - makeMetadata({ - stateBlock: { - phase: "preflight", - error_class: "docker-missing", - forbidden_side_effects: ["paint-the-fence"], - }, - }), - ).toThrow(/forbidden_side_effects entry/); - }); - - it("rejects unknown keys in the block", () => { - expect(() => - makeMetadata({ - stateBlock: { - phase: "preflight", - error_class: "docker-missing", - rogue: true, - }, - }), - ).toThrow(/unknown key 'rogue'/); - }); - - it("requires phase + error_class at the state level", () => { - expect(() => makeMetadata({ stateBlock: { phase: "preflight" } })).toThrow( - /error_class is required/, - ); - }); - - it("rejects a non-mapping expected_states section", () => { - expect(() => - loadMetadataFromObjects({ - scenarios: { - platforms: { p: {} }, - installs: { i: {} }, - runtimes: { r: {} }, - onboarding: { o: { agent: "openclaw", provider: "nvidia" } }, - setup_scenarios: {}, - }, - expectedStates: { expected_states: [] }, - suites: { suites: {} }, - }), - ).toThrow(/expected_states' must be a mapping/); - }); - - it("rejects scenario-level expected_failure when state has none", () => { - expect(() => - resolveScenario( - "s", - makeMetadata({ - stateBlock: null, - scenarioBlock: { phase: "preflight", error_class: "docker-missing" }, - }), - ), - ).toThrow(/expected_failure but expected_state.*does not/); - }); - - it("merges scenario-level override on top of state-level block", () => { - const meta = makeMetadata({ - stateBlock: { - phase: "preflight", - error_class: "docker-missing", - message_pattern: "docker", - forbidden_side_effects: ["sandbox-created"], - }, - scenarioBlock: { - message_pattern: "(?i)daemon", - forbidden_side_effects: ["gateway-started"], - }, - }); - const plan = resolveScenario("s", meta); - expect(plan.expected_failure?.message_pattern).toBe("(?i)daemon"); - expect(plan.expected_failure?.forbidden_side_effects).toEqual(["gateway-started"]); - expect(plan.expected_failure?.phase).toBe("preflight"); - }); -}); - -describe("expected_failure: matcher", () => { - const expected: ExpectedFailure = { - phase: "preflight", - error_class: "docker-missing", - message_pattern: "(?i)docker|daemon", - forbidden_side_effects: ["sandbox-created", "gateway-started"], - }; - - function obs(over: Partial): ObservedFailure { - return { - phase: "preflight", - error_class: "docker-missing", - log: "Cannot connect to the Docker daemon", - observed_side_effects: [], - ...over, - }; - } - - it("passes when phase, class, pattern, and side-effects all match", () => { - const report = matchExpectedFailure(expected, obs({})); - expect(report.ok).toBe(true); - expect(report.checks.every((c) => c.ok)).toBe(true); - }); - - it("fails on phase mismatch", () => { - const report = matchExpectedFailure(expected, obs({ phase: "install" })); - expect(report.ok).toBe(false); - expect(report.checks.find((c) => c.name === "phase")?.ok).toBe(false); - }); - - it("fails on error_class mismatch", () => { - const report = matchExpectedFailure(expected, obs({ error_class: "gpu-missing" })); - expect(report.ok).toBe(false); - expect(report.checks.find((c) => c.name === "error_class")?.ok).toBe(false); - }); - - it("skips error_class check when observation is undefined", () => { - const report = matchExpectedFailure(expected, obs({ error_class: undefined })); - const classCheck = report.checks.find((c) => c.name === "error_class"); - expect(classCheck?.ok).toBe(true); - expect(classCheck?.message).toMatch(/skipped/); - }); - - it("fails when message_pattern does not match the log", () => { - const report = matchExpectedFailure( - expected, - obs({ log: "something else entirely" }), - ); - expect(report.ok).toBe(false); - expect(report.checks.find((c) => c.name === "message_pattern")?.ok).toBe(false); - }); - - it("fails when a forbidden side effect is observed", () => { - const report = matchExpectedFailure( - expected, - obs({ observed_side_effects: ["sandbox-created"] }), - ); - expect(report.ok).toBe(false); - const sideCheck = report.checks.find((c) => c.name === "forbidden_side_effects"); - expect(sideCheck?.ok).toBe(false); - expect(sideCheck?.message).toMatch(/sandbox-created/); - }); - - it("ignores non-forbidden observed side effects", () => { - const trimmed: ExpectedFailure = { - ...expected, - forbidden_side_effects: ["gateway-started"], - }; - const report = matchExpectedFailure( - trimmed, - obs({ observed_side_effects: ["sandbox-created"] }), - ); - expect(report.ok).toBe(true); - }); -}); - -describe("expected_failure: real metadata", () => { - it("loads structurally for ubuntu-no-docker-preflight-negative", () => { - const meta = loadMetadataFromObjects({ - scenarios: yaml.load(` -platforms: { p: { os: ubuntu } } -installs: { i: {} } -runtimes: { r: { container_daemon: missing } } -onboarding: { o: { agent: openclaw, provider: nvidia } } -setup_scenarios: - s: - dimensions: { platform: p, install: i, runtime: r, onboarding: o } - expected_state: neg - suites: [] -`) as object, - expectedStates: yaml.load(` -expected_states: - neg: - cli: { installed: true } - gateway: { expected: absent } - sandbox: { expected: absent } - expected_failure: - phase: preflight - error_class: docker-missing - message_pattern: "(?i)docker|container|daemon|socket|preflight" - forbidden_side_effects: [sandbox-created, gateway-started, credentials-written] -`) as object, - suites: yaml.load(` -suites: {} -`) as object, - }); - const plan = resolveScenario("s", meta); - expect(plan.expected_failure).toBeTruthy(); - expect(plan.expected_failure?.forbidden_side_effects?.length).toBe(3); - }); -}); diff --git a/test/e2e-scenario/framework-tests/e2e-expected-state-validator.test.ts b/test/e2e-scenario/framework-tests/e2e-expected-state-validator.test.ts deleted file mode 100644 index ba1f2b5f31..0000000000 --- a/test/e2e-scenario/framework-tests/e2e-expected-state-validator.test.ts +++ /dev/null @@ -1,235 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -import { describe, it, expect } from "vitest"; -import { spawnSync } from "node:child_process"; -import fs from "node:fs"; -import os from "node:os"; -import path from "node:path"; - -import { - validateExpectedState, - type ProbeResults, -} from "../runtime/resolver/validator.ts"; -import type { ExpectedStateConfig, ResolvedSuite } from "../runtime/resolver/schema.ts"; - -const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); -const RUN_SCENARIO = path.join(REPO_ROOT, "test/e2e-scenario/runtime/run-scenario.sh"); - -function cloudOpenclawReady(): ExpectedStateConfig { - return { - cli: { installed: true }, - gateway: { expected: "present", health: "healthy" }, - sandbox: { expected: "present", status: "running", agent: "openclaw" }, - inference: { - expected: "available", - provider: "nvidia", - route: "inference-local", - mode: "gateway-routed", - }, - credentials: { expected: "present", storage: "gateway-managed" }, - }; -} - -function passingProbes(): ProbeResults { - return { - "cli.installed": true, - "gateway.health": "healthy", - "gateway.expected": "present", - "sandbox.status": "running", - "sandbox.expected": "present", - "sandbox.agent": "openclaw", - "inference.expected": "available", - "inference.provider": "nvidia", - "inference.route": "inference-local", - "inference.mode": "gateway-routed", - "credentials.expected": "present", - "credentials.storage": "gateway-managed", - }; -} - -describe("expected state validator", () => { - it("should_validate_matching_state", () => { - const report = validateExpectedState({ - stateId: "cloud-openclaw-ready", - state: cloudOpenclawReady(), - probes: passingProbes(), - suites: [], - }); - expect(report.ok).toBe(true); - expect(report.checks.every((c) => c.ok)).toBe(true); - }); - - it("should_fail_when_gateway_expected_but_unhealthy", () => { - const probes = passingProbes(); - probes["gateway.health"] = "unhealthy"; - const report = validateExpectedState({ - stateId: "cloud-openclaw-ready", - state: cloudOpenclawReady(), - probes, - suites: [], - }); - expect(report.ok).toBe(false); - const failing = report.checks.find((c) => c.key === "gateway.health"); - expect(failing?.ok).toBe(false); - expect(failing?.expected).toBe("healthy"); - expect(failing?.actual).toBe("unhealthy"); - }); - - it("should_fail_when_sandbox_expected_but_absent", () => { - const probes = passingProbes(); - probes["sandbox.status"] = "absent"; - probes["sandbox.expected"] = "absent"; - const report = validateExpectedState({ - stateId: "cloud-openclaw-ready", - state: cloudOpenclawReady(), - probes, - suites: [], - }); - expect(report.ok).toBe(false); - expect(report.checks.some((c) => c.key === "sandbox.status" && !c.ok)).toBe(true); - }); - - it("should_fail_when_suite_requires_state_unmet_at_runtime", () => { - // Expected state claims inference.expected=available, but the probe - // reports unavailable; the smoke suite happens to pass but an inference - // suite's requires_state should trigger a runtime failure before - // execution. - const state = cloudOpenclawReady(); - const probes = passingProbes(); - probes["inference.expected"] = "unavailable"; - const inferenceSuite: ResolvedSuite = { - id: "inference", - requires_state: { "inference.expected": "available" }, - steps: [{ id: "models-health", script: "suites/inference/cloud/00-models-health.sh" }], - }; - const report = validateExpectedState({ - stateId: "cloud-openclaw-ready", - state, - probes, - suites: [inferenceSuite], - }); - expect(report.ok).toBe(false); - const msg = report.checks - .filter((c) => !c.ok) - .map((c) => `${c.key}=${c.actual ?? ""} (wanted ${c.expected})`) - .join("; "); - expect(msg).toMatch(/inference\.expected/); - expect(msg).toMatch(/available/); - expect(msg).toMatch(/unavailable/); - // Should also reference the suite that made the requirement. - expect(report.checks.some((c) => c.suite === "inference" && !c.ok)).toBe(true); - }); -}); - -describe("runner_should_not_run_suites_when_expected_state_fails", () => { - it("runs expected-state validation and skips suites on failure", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-es-")); - try { - const trace = path.join(tmp, "trace.log"); - // Simulate gateway-unhealthy probe by setting an override env var. - const r = spawnSync( - "bash", - [RUN_SCENARIO, "ubuntu-repo-cloud-openclaw", "--dry-run"], - { - env: { - ...process.env, - E2E_CONTEXT_DIR: tmp, - E2E_TRACE_FILE: trace, - // validator reads these overrides in dry-run mode to fake probes - E2E_PROBE_OVERRIDE_GATEWAY_HEALTH: "unhealthy", - E2E_VALIDATE_EXPECTED_STATE: "1", - }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 60_000), - cwd: REPO_ROOT, - }, - ); - // Dry-run execution should now fail because the expected state - // validation runs and sees gateway.health=unhealthy. - expect(r.status).not.toBe(0); - // Validator must run (its report file should exist) but suites must not. - const reportPath = path.join(tmp, "expected-state-report.json"); - expect(fs.existsSync(reportPath), `missing ${reportPath}`).toBe(true); - const report = JSON.parse(fs.readFileSync(reportPath, "utf8")); - expect(report.ok).toBe(false); - expect(report.checks.some((c: { key: string; ok: boolean }) => c.key === "gateway.health" && !c.ok)).toBe(true); - // And the run's failure output should reference expected-state, not suites. - expect(`${r.stdout}${r.stderr}`).toMatch(/expected.state/i); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); -}); - -// ───────────────────────────────────────────────────────────────────────────── -// Phase 1.F — --validate-only flag on run-scenario.sh -// ───────────────────────────────────────────────────────────────────────────── - -describe("run-scenario --validate-only flag", () => { - it("runs only validator and emits probe results json on stdout without running install/onboard/suites", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-validate-only-")); - try { - const trace = path.join(tmp, "trace.log"); - // Pre-populate a context.env: --validate-only assumes setup has already run. - fs.writeFileSync( - path.join(tmp, "context.env"), - "E2E_SCENARIO=ubuntu-repo-cloud-openclaw\n", - ); - const r = spawnSync( - "bash", - [RUN_SCENARIO, "ubuntu-repo-cloud-openclaw", "--validate-only"], - { - env: { - ...process.env, - E2E_CONTEXT_DIR: tmp, - E2E_TRACE_FILE: trace, - // Supply probe overrides for every key the expected state needs. - E2E_PROBE_OVERRIDE_CLI_INSTALLED: "true", - E2E_PROBE_OVERRIDE_GATEWAY_EXPECTED: "present", - E2E_PROBE_OVERRIDE_GATEWAY_HEALTH: "healthy", - E2E_PROBE_OVERRIDE_SANDBOX_EXPECTED: "present", - E2E_PROBE_OVERRIDE_SANDBOX_STATUS: "running", - E2E_PROBE_OVERRIDE_SANDBOX_AGENT: "openclaw", - E2E_PROBE_OVERRIDE_INFERENCE_EXPECTED: "available", - E2E_PROBE_OVERRIDE_INFERENCE_PROVIDER: "nvidia", - E2E_PROBE_OVERRIDE_INFERENCE_ROUTE: "inference-local", - E2E_PROBE_OVERRIDE_INFERENCE_MODE: "gateway-routed", - E2E_PROBE_OVERRIDE_CREDENTIALS_EXPECTED: "present", - E2E_PROBE_OVERRIDE_CREDENTIALS_STORAGE: "gateway-managed", - E2E_PROBE_OVERRIDE_SECURITY_SHIELDS: "supported", - // `security.policy_engine` has an embedded underscore, which the - // E2E_PROBE_OVERRIDE_* convention cannot express. Use the - // JSON escape hatch for this one. - E2E_PROBE_OVERRIDES_JSON: JSON.stringify({ "security.policy_engine": "supported" }), - }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 60_000), - cwd: REPO_ROOT, - }, - ); - expect(r.status, r.stderr).toBe(0); - // Must NOT have traced install or onboard. - const contents = fs.existsSync(trace) ? fs.readFileSync(trace, "utf8") : ""; - expect(contents).not.toMatch(/install:/); - expect(contents).not.toMatch(/onboard:/); - // Must have emitted an expected-state-report.json (probe results). - const reportPath = path.join(tmp, "expected-state-report.json"); - expect(fs.existsSync(reportPath), `missing ${reportPath}`).toBe(true); - const report = JSON.parse(fs.readFileSync(reportPath, "utf8")); - expect(report.ok).toBe(true); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("is_mutually_exclusive_with_plan_only", () => { - const r = spawnSync( - "bash", - [RUN_SCENARIO, "ubuntu-repo-cloud-openclaw", "--validate-only", "--plan-only"], - { encoding: "utf8", timeout: 15_000, cwd: REPO_ROOT }, - ); - expect(r.status).not.toBe(0); - expect(r.stdout + r.stderr).toMatch(/mutually.exclusive|cannot.*both|--plan-only.*--validate-only|--validate-only.*--plan-only/i); - }); -}); diff --git a/test/e2e-scenario/framework-tests/e2e-expected-state.test.ts b/test/e2e-scenario/framework-tests/e2e-expected-state.test.ts new file mode 100644 index 0000000000..98ffa9378f --- /dev/null +++ b/test/e2e-scenario/framework-tests/e2e-expected-state.test.ts @@ -0,0 +1,319 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it } from "vitest"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; + +import { compileRunPlans } from "../scenarios/compiler.ts"; +import { + getExpectedState, + listExpectedStates, + probesForState, + requireExpectedState, +} from "../scenarios/expected-states.ts"; +import { ScenarioRunner } from "../scenarios/orchestrators/runner.ts"; +import { listScenarios } from "../scenarios/registry.ts"; +import type { ExpectedState, PhaseName, PhaseResult, RunContext, RunPlanPhase } from "../scenarios/types.ts"; + +function freshCtx(): RunContext { + return { contextDir: fs.mkdtempSync(path.join(os.tmpdir(), "e2e-state-")) }; +} + +// The legacy parity tests against `nemoclaw_scenarios/expected-states.yaml` +// were retired alongside the YAML resolver path (see commit 9da75ac0a). +// The typed registry in `scenarios/expected-states.ts` is the single source +// of truth; these id-coverage assertions replace the YAML-mirror checks. +describe("typed expected-state registry id coverage", () => { + it("exposes a non-empty list of registered expected-state ids", () => { + const ids = listExpectedStates().map((s) => s.id); + expect(ids.length).toBeGreaterThan(0); + expect(new Set(ids).size).toBe(ids.length); + }); + + it("requireExpectedState throws on unknown id with available list", () => { + expect(() => requireExpectedState("does-not-exist")).toThrow(/Unknown expected_state/); + }); + + it("getExpectedState returns the state for known ids", () => { + expect(getExpectedState("cloud-openclaw-ready")?.id).toBe("cloud-openclaw-ready"); + }); +}); + +describe("probesForState maps typed expected-state into probe ids", () => { + it("ready cloud state emits cli-installed, gateway-healthy, sandbox-running", () => { + expect(probesForState(requireExpectedState("cloud-openclaw-ready"))).toEqual([ + "cli-installed", + "gateway-healthy", + "sandbox-running", + ]); + }); + + it("preflight-failure state emits cli-installed, gateway-absent, sandbox-absent", () => { + expect(probesForState(requireExpectedState("preflight-failure-no-sandbox"))).toEqual([ + "cli-installed", + "gateway-absent", + "sandbox-absent", + ]); + }); + + it("optional-dimension state emits cli-installed only", () => { + expect(probesForState(requireExpectedState("macos-cli-ready-docker-optional"))).toEqual([ + "cli-installed", + ]); + }); + + it("inference and credentials probes are intentionally NOT emitted yet", () => { + // The typed registry declares inference.expected=available and + // credentials.expected=present for ready states; the compiler does + // not yet emit probe actions for those dimensions because the + // probe scripts aren't written. This test pins that gap so a + // future probe-script PR is forced to update probesForState too. + const state: ExpectedState = { + id: "synthetic", + inference: { expected: "available", provider: "nvidia" }, + credentials: { expected: "present" }, + }; + expect(probesForState(state)).toEqual([]); + }); +}); + +describe("compiler emits state-validation phase actions from expected-state registry", () => { + it("positive scenario gets cli-installed + gateway-healthy + sandbox-running probe actions", () => { + const [plan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); + const stateValidationPhase = plan.phases.find((p) => p.name === "state-validation"); + expect(stateValidationPhase).toBeTruthy(); + expect(stateValidationPhase!.actions.map((a) => a.id)).toEqual([ + "state-validation.cli-installed", + "state-validation.gateway-healthy", + "state-validation.sandbox-running", + ]); + // Probes are typed shell-fn actions that go through the shared + // dispatcher; the orchestrator owns timeouts and redaction. + for (const action of stateValidationPhase!.actions) { + expect(action.kind).toBe("shell-fn"); + expect(action.fn).toBe("e2e_state_probe"); + expect(action.scriptRef).toBe( + "test/e2e-scenario/nemoclaw_scenarios/probes/dispatch.sh", + ); + expect(action.timeoutSeconds).toBe(30); + } + }); + + it("negative scenario gets cli-installed + gateway-absent + sandbox-absent probe actions", () => { + const [plan] = compileRunPlans(["ubuntu-no-docker-preflight-negative"]); + const stateValidationPhase = plan.phases.find((p) => p.name === "state-validation"); + expect(stateValidationPhase).toBeTruthy(); + expect(stateValidationPhase!.actions.map((a) => a.id)).toEqual([ + "state-validation.cli-installed", + "state-validation.gateway-absent", + "state-validation.sandbox-absent", + ]); + }); + + it("compiler hard-errors on a scenario referencing an unknown expected_state id", () => { + expect(() => + compileRunPlans([ + { + id: "synthetic-unknown-state", + assertionGroups: [], + expectedStateId: "definitely-not-a-state", + }, + ]), + ).toThrow(/unknown expected_state/); + }); + + it("phase order is environment -> onboarding -> state-validation -> lifecycle -> runtime", () => { + const [plan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); + // 'lifecycle' is the post-onboard state-mutation phase. Scenarios + // without a `environment.lifecycle` profile (e.g. this one) emit + // an empty action list for the phase but the phase still appears + // in the plan so phase-order invariants stay deterministic. + expect(plan.phases.map((p) => p.name)).toEqual([ + "environment", + "onboarding", + "state-validation", + "lifecycle", + "runtime", + ]); + }); +}); + +describe("ScenarioRunner short-circuit semantics around state-validation", () => { + it("onboarding action failure does NOT block state-validation (negative scenarios verify absent state)", async () => { + const ctx = freshCtx(); + try { + const [plan] = compileRunPlans(["ubuntu-no-docker-preflight-negative"]); + const phase = ( + name: PhaseName, + outcome: PhaseResult, + ): { run: (ctx: RunContext, p: RunPlanPhase) => Promise } => ({ + run: async () => outcome, + }); + + let stateValidationCalled = false; + let runtimeCalled = false; + const runner = new ScenarioRunner({ + environment: phase("environment", { + phase: "environment", + status: "passed", + actions: [], + assertions: [], + }), + onboarding: phase("onboarding", { + phase: "onboarding", + status: "failed", + actions: [ + { + id: "onboarding.profile.cloud-openclaw-no-docker", + status: "failed", + durationMs: 1, + message: "preflight detected docker-missing", + }, + ], + assertions: [], + }), + stateValidation: { + run: async () => { + stateValidationCalled = true; + return { + phase: "state-validation", + status: "passed", + actions: [], + assertions: [], + }; + }, + }, + runtime: { + run: async () => { + runtimeCalled = true; + return { phase: "runtime", status: "passed", actions: [], assertions: [] }; + }, + }, + }); + + const results = await runner.run(ctx, plan); + expect(stateValidationCalled).toBe(true); + expect(runtimeCalled).toBe(false); + // state-validation has its real result; runtime is skipped with + // the blocking-action message. + const stateRes = results.find((r) => r.phase === "state-validation")!; + expect(stateRes.status).toBe("passed"); + const runtimeRes = results.find((r) => r.phase === "runtime")!; + expect(runtimeRes.status).toBe("skipped"); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("environment action failure blocks state-validation AND runtime", async () => { + const ctx = freshCtx(); + try { + const [plan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); + let stateValidationCalled = false; + let runtimeCalled = false; + const runner = new ScenarioRunner({ + environment: { + run: async () => ({ + phase: "environment", + status: "failed", + actions: [ + { + id: "environment.install.repo-current", + status: "failed", + durationMs: 1, + message: "install dispatcher exit 1", + }, + ], + assertions: [], + }), + }, + onboarding: { + run: async () => ({ phase: "onboarding", status: "passed", actions: [], assertions: [] }), + }, + stateValidation: { + run: async () => { + stateValidationCalled = true; + return { + phase: "state-validation", + status: "passed", + actions: [], + assertions: [], + }; + }, + }, + runtime: { + run: async () => { + runtimeCalled = true; + return { phase: "runtime", status: "passed", actions: [], assertions: [] }; + }, + }, + }); + await runner.run(ctx, plan); + expect(stateValidationCalled).toBe(false); + expect(runtimeCalled).toBe(false); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("state-validation action failure blocks runtime", async () => { + const ctx = freshCtx(); + try { + const [plan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); + let runtimeCalled = false; + const runner = new ScenarioRunner({ + environment: { + run: async () => ({ phase: "environment", status: "passed", actions: [], assertions: [] }), + }, + onboarding: { + run: async () => ({ phase: "onboarding", status: "passed", actions: [], assertions: [] }), + }, + stateValidation: { + run: async () => ({ + phase: "state-validation", + status: "failed", + actions: [ + { + id: "state-validation.gateway-healthy", + status: "failed", + durationMs: 1, + message: "gateway unreachable at http://127.0.0.1:18789", + }, + ], + assertions: [], + }), + }, + runtime: { + run: async () => { + runtimeCalled = true; + return { phase: "runtime", status: "passed", actions: [], assertions: [] }; + }, + }, + }); + const results = await runner.run(ctx, plan); + expect(runtimeCalled).toBe(false); + const runtimeRes = results.find((r) => r.phase === "runtime")!; + expect(runtimeRes.status).toBe("skipped"); + expect(runtimeRes.assertions[0].message).toMatch(/state-validation\.gateway-healthy/); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); +}); + +describe("expected-state registry covers every scenario referenced in the typed registry", () => { + it("every ScenarioDefinition.expectedStateId resolves in the typed expected-state registry", () => { + const referenced = new Set(); + for (const scenario of listScenarios()) { + if (scenario.expectedStateId) { + referenced.add(scenario.expectedStateId); + } + } + expect(referenced.size).toBeGreaterThan(0); + for (const id of referenced) { + expect(getExpectedState(id), `expected_state '${id}' must be in the typed registry`).toBeDefined(); + } + }); +}); diff --git a/test/e2e-scenario/framework-tests/e2e-lib-helpers.test.ts b/test/e2e-scenario/framework-tests/e2e-lib-helpers.test.ts index 9dc179fa95..e68eaff830 100644 --- a/test/e2e-scenario/framework-tests/e2e-lib-helpers.test.ts +++ b/test/e2e-scenario/framework-tests/e2e-lib-helpers.test.ts @@ -15,7 +15,6 @@ const ASSERT = path.join(VALIDATION_SUITES, "assert"); const REBUILD_UPGRADE_LIB = path.join(VALIDATION_SUITES, "lib/rebuild_upgrade.sh"); const FIXTURES = path.join(REPO_ROOT, "test/e2e-scenario/nemoclaw_scenarios/fixtures"); const INSTALL_DIR = path.join(REPO_ROOT, "test/e2e-scenario/nemoclaw_scenarios/install"); -const RUN_SCENARIO = path.join(REPO_ROOT, "test/e2e-scenario/runtime/run-scenario.sh"); function runBash(script: string, env: Record = {}): SpawnSyncReturns { return spawnSync("bash", ["-c", script], { @@ -61,51 +60,6 @@ describe("E2E shell helpers", () => { } }); - it("test_should_emit_plan_only_checks_without_live_infrastructure", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-inf-plan-")); - try { - const r = runBash( - ` - set -euo pipefail - . "${RUNTIME_LIB}/context.sh" - . "${VALIDATION_SUITES}/lib/inference_routing.sh" - e2e_context_init - e2e_context_set E2E_SANDBOX_NAME sandbox-1 - e2e_inference_routing_assert_chat_completion "post-onboard.inference-routing.inference-local-chat-completion" - `, - { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }, - ); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout).toContain("post-onboard.inference-routing.inference-local-chat-completion"); - expect(r.stdout).toMatch(/dry-run|plan/i); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("test_should_not_print_secret_values_in_helper_output", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-inf-secret-")); - try { - const r = runBash( - ` - set -euo pipefail - . "${RUNTIME_LIB}/context.sh" - . "${VALIDATION_SUITES}/lib/inference_routing.sh" - e2e_context_init - e2e_context_set E2E_SANDBOX_NAME sandbox-1 - e2e_context_set E2E_PROVIDER_API_KEY super-secret-test-token - e2e_inference_routing_assert_auth_proxy "post-onboard.ollama-auth-proxy.authenticated-request-accepted" "valid" - `, - { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }, - ); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout + r.stderr).not.toContain("super-secret-test-token"); - expect(r.stdout + r.stderr).toMatch(/REDACTED|dry-run|plan/i); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - it("security_policy_credentials_helper_should_load_with_context_library", () => { const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "spc-context-")); try { @@ -117,7 +71,7 @@ describe("E2E shell helpers", () => { spc_require_context E2E_SCENARIO E2E_PROVIDER echo "provider=$(spc_context_get E2E_PROVIDER)" `, - { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }, + { E2E_CONTEXT_DIR: tmp }, ); expect(r.status, r.stderr).toBe(0); expect(r.stdout).toContain("provider=nvidia"); @@ -136,7 +90,7 @@ describe("E2E shell helpers", () => { . "${VALIDATION_SUITES}/lib/security_policy_credentials.sh" spc_require_context E2E_PROVIDER `, - { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }, + { E2E_CONTEXT_DIR: tmp }, ); expect(r.status).not.toBe(0); expect(r.stderr).toContain("E2E_PROVIDER"); @@ -474,38 +428,6 @@ exit 0 } }); - it("scenario_dry_run_should_trace_helper_sequence_in_order", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-trace-")); - try { - const trace = path.join(tmp, "trace.log"); - const r = spawnSync( - "bash", - [RUN_SCENARIO, "ubuntu-repo-cloud-openclaw", "--dry-run"], - { - env: { - ...process.env, - E2E_CONTEXT_DIR: tmp, - E2E_TRACE_FILE: trace, - }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 60_000), - cwd: REPO_ROOT, - }, - ); - expect(r.status, r.stderr).toBe(0); - expect(fs.existsSync(trace), "trace log missing").toBe(true); - const contents = fs.readFileSync(trace, "utf8"); - const order = ["env:noninteractive", "install:", "onboard:", "gateway:check", "sandbox:check"]; - let pos = 0; - for (const marker of order) { - const idx = contents.indexOf(marker, pos); - expect(idx, `trace missing marker in order: ${marker}\nfull:\n${contents}`).toBeGreaterThanOrEqual(0); - pos = idx + marker.length; - } - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); }); // ───────────────────────────────────────────────────────────────────────────── @@ -600,6 +522,82 @@ describe("rebuild/upgrade validation helpers", () => { fs.rmSync(tmp, { recursive: true, force: true }); } }); + + it("policy_preset_check_should_match_endpoint_url_when_preset_name_absent", () => { + // The legacy assertion called `nemoclaw policy status` (a command + // that does not exist) and silently failed. The new assertion calls + // `openshell policy get --full ` and matches preset names + // OR their well-known endpoint hostnames. Verify both paths: a + // policy output containing only endpoint URLs (no bare preset name) + // still passes, mirroring the behavior of the live gateway policy + // dump in test/e2e/test-rebuild-openclaw.sh. + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-ru-policy-")); + try { + fs.writeFileSync( + path.join(tmp, "context.env"), + "E2E_SCENARIO=test\nE2E_AGENT=openclaw\nE2E_SANDBOX_NAME=sb\nE2E_GATEWAY_URL=http://127.0.0.1\n", + ); + const r = runBash( + ` + set -euo pipefail + fake_openshell() { + # Emit a minimal policy dump that contains the preset endpoint + # URLs but NOT the bare preset names. This is the realistic + # case: 'openshell policy get --full' renders network rules + # by hostname, not by preset label. + printf 'allow registry.npmjs.org\\nallow pypi.org\\n' + } + . "${REBUILD_UPGRADE_LIB}" + rebuild_upgrade_assert_policy_presets_preserved + `, + { + E2E_CONTEXT_DIR: tmp, + REBUILD_UPGRADE_OPENSHELL_CMD: "fake_openshell", + E2E_EXPECTED_POLICY_PRESETS: "npm pypi", + }, + ); + expect(r.status, r.stderr).toBe(0); + expect(r.stdout).toContain("suite.rebuild.policy_presets_preserved"); + } finally { + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); + + it("policy_preset_check_should_fail_with_diagnostic_when_preset_missing", () => { + // Negative case: when a declared preset is absent from the live + // policy dump, the assertion must fail AND emit a diagnostic line + // identifying the missing preset and showing the policy head. The + // original implementation failed silently because the underlying + // `nemoclaw policy status` command did not exist; the new + // implementation must produce actionable evidence. + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-ru-policy-miss-")); + try { + fs.writeFileSync( + path.join(tmp, "context.env"), + "E2E_SCENARIO=test\nE2E_AGENT=openclaw\nE2E_SANDBOX_NAME=sb\nE2E_GATEWAY_URL=http://127.0.0.1\n", + ); + const r = runBash( + ` + fake_openshell() { + # Policy dump missing 'pypi' entirely. + printf 'allow registry.npmjs.org\\n' + } + . "${REBUILD_UPGRADE_LIB}" + rebuild_upgrade_assert_policy_presets_preserved + `, + { + E2E_CONTEXT_DIR: tmp, + REBUILD_UPGRADE_OPENSHELL_CMD: "fake_openshell", + E2E_EXPECTED_POLICY_PRESETS: "npm pypi", + }, + ); + expect(r.status).not.toBe(0); + expect(r.stdout + r.stderr).toMatch(/preset 'pypi' not in policy/); + expect(r.stdout + r.stderr).toMatch(/matchers: pypi/); + } finally { + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); }); describe("Phase 1.A logging helpers", () => { @@ -675,7 +673,9 @@ exec "$@" e2e_sandbox_exec sb1 -- false echo "rc=$?" `, - { PATH: `${bin}:${process.env.PATH}` }, + // Force the openshell-direct transport so the stubbed openshell + // (which has no `sandbox ssh-config` subcommand) is exercised. + { PATH: `${bin}:${process.env.PATH}`, E2E_SANDBOX_EXEC_VIA_OPENSHELL: "1" }, ); expect(r.stdout).toMatch(/rc=1/); } finally { @@ -683,21 +683,6 @@ exec "$@" } }); - it("sandbox_exec_should_dry_run_short_circuit_when_e2e_dry_run_set", () => { - // Use a PATH that has bash itself but no nemoclaw — dry-run must - // short-circuit before the CLI lookup. - const r = runBash( - ` - set -euo pipefail - . "${VALIDATION_SUITES}/sandbox-exec.sh" - e2e_sandbox_exec sb1 -- rm -rf / - `, - { E2E_DRY_RUN: "1", PATH: "/usr/bin:/bin" }, - ); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout + r.stderr).toMatch(/dry[- ]run/i); - }); - it("sandbox_exec_stdin_should_quote_args_safely_when_piped", () => { // Verify that $TOKEN is NOT expanded on the host side before being // delivered to the sandbox. We stub openshell to echo back stdin. @@ -717,7 +702,12 @@ exec "$@" . "${VALIDATION_SUITES}/sandbox-exec.sh" printf 'hello $TOKEN' | e2e_sandbox_exec_stdin sb1 -- cat `, - { PATH: `${bin}:${process.env.PATH}`, TOKEN: "SHOULD_NOT_EXPAND" }, + { + PATH: `${bin}:${process.env.PATH}`, + TOKEN: "SHOULD_NOT_EXPAND", + // Stub only handles the openshell-direct transport. + E2E_SANDBOX_EXEC_VIA_OPENSHELL: "1", + }, ); expect(r.status, r.stderr).toBe(0); expect(r.stdout).toContain("hello $TOKEN"); @@ -726,6 +716,111 @@ exec "$@" fs.rmSync(tmp, { recursive: true, force: true }); } }); + + it("sandbox_exec_should_prefer_ssh_config_transport_when_openshell_offers_one", () => { + // Verify the new default: when `openshell sandbox ssh-config ` + // succeeds, the wrapper routes through `ssh -F ` instead of + // `openshell sandbox exec`. + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-sbex-ssh-")); + try { + const bin = path.join(tmp, "bin"); + fs.mkdirSync(bin); + const trace = path.join(tmp, "ssh.trace"); + fs.writeFileSync( + path.join(bin, "openshell"), + `#!/usr/bin/env bash +set -euo pipefail +if [[ "$1" == "sandbox" && "$2" == "ssh-config" ]]; then + printf 'Host openshell-%s\\n HostName 127.0.0.1\\n Port 2222\\n User sandbox\\n' "$3" + exit 0 +fi +echo "unexpected openshell call: $*" >&2 +exit 99 +`, + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(bin, "ssh"), + `#!/usr/bin/env bash +set -euo pipefail +printf '%s\\n' "ssh-args:$*" >> "${trace}" +remote="\${@: -1}" +printf '%s\\n' "remote-cmd:\${remote}" >> "${trace}" +echo ok-from-ssh +exit 0 +`, + { mode: 0o755 }, + ); + const ctxDir = path.join(tmp, "ctx"); + fs.mkdirSync(ctxDir); + const r = runBash( + ` + set -euo pipefail + . "${VALIDATION_SUITES}/sandbox-exec.sh" + e2e_sandbox_exec sb1 -- echo hello + `, + { + PATH: `${bin}:${process.env.PATH}`, + E2E_CONTEXT_DIR: ctxDir, + }, + ); + expect(r.status, r.stderr).toBe(0); + expect(r.stdout).toContain("ok-from-ssh"); + const traceContents = fs.readFileSync(trace, "utf8"); + expect(traceContents).toMatch(/ssh-args:.*-F /); + expect(traceContents).toContain("openshell-sb1"); + expect(traceContents).toMatch(/remote-cmd:echo hello$/m); + const cfg = path.join(ctxDir, ".ssh-config-cache", "sb1.cfg"); + expect(fs.existsSync(cfg)).toBe(true); + } finally { + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); + + it("sandbox_exec_should_fall_back_to_openshell_when_ssh_config_unavailable", () => { + // If `openshell sandbox ssh-config` fails, the wrapper must fall + // back to `openshell sandbox exec`. + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-sbex-fb-")); + try { + const bin = path.join(tmp, "bin"); + fs.mkdirSync(bin); + fs.writeFileSync( + path.join(bin, "openshell"), + `#!/usr/bin/env bash +set -uo pipefail +if [[ "$1" == "sandbox" && "$2" == "ssh-config" ]]; then + exit 1 +fi +if [[ "$1" == "sandbox" && "$2" == "exec" ]]; then + shift 2 + while [[ "$#" -gt 0 && "$1" != "--" ]]; do shift; done + shift || true + exec "$@" +fi +exit 99 +`, + { mode: 0o755 }, + ); + const ctxDir = path.join(tmp, "ctx"); + fs.mkdirSync(ctxDir); + const r = runBash( + ` + set -euo pipefail + . "${VALIDATION_SUITES}/sandbox-exec.sh" + e2e_sandbox_exec sb1 -- echo fallback-ok + `, + { + PATH: `${bin}:${process.env.PATH}`, + E2E_CONTEXT_DIR: ctxDir, + }, + ); + expect(r.status, r.stderr).toBe(0); + expect(r.stdout).toContain("fallback-ok"); + expect(r.stderr).toMatch(/ssh-config unavailable for sb1/); + } finally { + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); }); // ───────────────────────────────────────────────────────────────────────────── @@ -971,53 +1066,6 @@ describe("Issue #3810 messaging provider helper library", () => { }); }); -// ───────────────────────────────────────────────────────────────────────────── -// Phase 1.E — Install-method dispatcher splits -// ───────────────────────────────────────────────────────────────────────────── - -describe("Phase 1.E install dispatcher splits", () => { - function dispatchDryRun(profile: string): SpawnSyncReturns { - return runBash( - ` - set -euo pipefail - . "${INSTALL_DIR}/dispatch.sh" - e2e_install "${profile}" - `, - { E2E_DRY_RUN: "1" }, - ); - } - - it("install_should_dispatch_to_install_repo_helper_for_repo_current_profile", () => { - const r = dispatchDryRun("repo-current"); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout + r.stderr).toMatch(/install-repo/); - expect(r.stdout + r.stderr).not.toMatch(/install-curl|install-ollama|install-launchable/); - }); - - it("install_should_dispatch_to_install_curl_helper_for_public_installer_profile", () => { - const r = dispatchDryRun("public-installer"); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout + r.stderr).toMatch(/install-curl/); - expect(r.stdout + r.stderr).not.toMatch(/install-repo|install-ollama|install-launchable/); - }); - - it("install_should_dispatch_to_install_ollama_helper_for_ollama_profile", () => { - const r = dispatchDryRun("ollama"); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout + r.stderr).toMatch(/install-ollama/); - expect(r.stdout + r.stderr).not.toMatch(/install-repo|install-curl|install-launchable/); - }); - - it("install_should_dispatch_to_install_launchable_helper_for_launchable_profile", () => { - const r = dispatchDryRun("launchable"); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout + r.stderr).toMatch(/install-launchable/); - expect(r.stdout + r.stderr).not.toMatch(/install-repo|install-curl|install-ollama/); - }); -}); - - - describe("baseline onboarding validation helper", () => { it("baseline_helper_should_source_under_strict_shell_options", () => { const r = runBash(`set -euo pipefail; source "${VALIDATION_SUITES}/lib/baseline_onboarding.sh"`); @@ -1083,7 +1131,7 @@ describe("sandbox lifecycle validation helper", () => { try { const bin = path.join(tmp, "bin"); fs.mkdirSync(bin); fs.writeFileSync(path.join(bin, "timeout"), "#!/usr/bin/env bash\necho timed out >&2\nexit 124\n", { mode: 0o755 }); - const r = runBash(`set -e; unset E2E_DRY_RUN; . "${VALIDATION_SUITES}/lib/sandbox_lifecycle.sh"; sandbox_lifecycle_run_with_timeout 1 bash -c 'sleep 5'`, { PATH: `${bin}:${process.env.PATH}` }); + const r = runBash(`set -e; . "${VALIDATION_SUITES}/lib/sandbox_lifecycle.sh"; sandbox_lifecycle_run_with_timeout 1 bash -c 'sleep 5'`, { PATH: `${bin}:${process.env.PATH}` }); expect(r.status).toBe(124); expect(r.stderr).toMatch(/timed out/); } finally { fs.rmSync(tmp, { recursive: true, force: true }); } @@ -1096,7 +1144,7 @@ describe("sandbox lifecycle validation helper", () => { fs.writeFileSync(path.join(bin, "nemoclaw"), `#!/usr/bin/env bash case "$*" in list) echo sb1;; - "sb1 status") echo 'status running gateway healthy sandbox running';; + "sb1 status") printf ' Sandbox: sb1\\n Model: nvidia/x\\n OpenShell: 0.0.44\\n Policies: npm\\n';; "sb1 logs") echo logline;; *) echo "unexpected nemoclaw args: $*" >&2; exit 64;; esac @@ -1105,7 +1153,12 @@ esac echo lifecycle-ok `, { mode: 0o755 }); fs.writeFileSync(path.join(tmp, "context.env"), "E2E_SANDBOX_NAME=sb1\nE2E_GATEWAY_URL=http://127.0.0.1:1\n"); - const r = runBash(`set -euo pipefail; . "${VALIDATION_SUITES}/lib/sandbox_lifecycle.sh"; sandbox_lifecycle_load_context; sandbox_lifecycle_assert_nemoclaw_list_contains_sandbox; sandbox_lifecycle_assert_status_fields_present; sandbox_lifecycle_assert_logs_available; sandbox_lifecycle_assert_openshell_exec_ok`, { E2E_CONTEXT_DIR: tmp, PATH: `${bin}:${process.env.PATH}` }); + // Force the wrapper's openshell-exec fallback transport: this + // stub openshell ignores its argv and always echoes 'lifecycle-ok', + // which would corrupt an ssh-config materialization. The opt-out + // env var keeps the test exercising openshell-exec directly while + // production callers still pick up ssh-config-preferred routing. + const r = runBash(`set -euo pipefail; . "${VALIDATION_SUITES}/lib/sandbox_lifecycle.sh"; sandbox_lifecycle_load_context; sandbox_lifecycle_assert_nemoclaw_list_contains_sandbox; sandbox_lifecycle_assert_status_fields_present; sandbox_lifecycle_assert_logs_available; sandbox_lifecycle_assert_openshell_exec_ok`, { E2E_CONTEXT_DIR: tmp, PATH: `${bin}:${process.env.PATH}`, E2E_SANDBOX_EXEC_VIA_OPENSHELL: "1" }); expect(r.status, r.stderr).toBe(0); expect(r.stdout).toMatch(/validation\.sandbox_operations\.sandbox_listed/); expect(r.stdout).toMatch(/validation\.sandbox_operations\.openshell_exec_ok/); diff --git a/test/e2e-scenario/framework-tests/e2e-metadata-final-hygiene.test.ts b/test/e2e-scenario/framework-tests/e2e-metadata-final-hygiene.test.ts deleted file mode 100644 index 558f0b9d5d..0000000000 --- a/test/e2e-scenario/framework-tests/e2e-metadata-final-hygiene.test.ts +++ /dev/null @@ -1,68 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -/** - * Phase 11: Clean the House - final metadata and documentation hygiene. - * - * These tests are intentionally conservative during the incremental - * migration: they guard the README, assert that every suite script - * referenced in suites.yaml exists and is executable, and assert that - * every scenario either has both an expected state and at least one - * suite or is explicitly marked as negative / disabled. - */ - -import { describe, it, expect } from "vitest"; -import fs from "node:fs"; -import path from "node:path"; - -import { loadMetadataFromDir } from "../runtime/resolver/load.ts"; - -const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); -const E2E_DIR = path.join(REPO_ROOT, "test/e2e-scenario"); -const VALIDATION_SUITES_DIR = path.join(E2E_DIR, "validation_suites"); -describe("Phase 11 final hygiene", () => { - it("all_suite_scripts_should_exist", () => { - const meta = loadMetadataFromDir(E2E_DIR); - const missing: string[] = []; - for (const [suiteId, suite] of Object.entries(meta.suites.suites)) { - for (const step of suite.steps) { - const p = path.join(VALIDATION_SUITES_DIR, step.script); - if (!fs.existsSync(p)) { - missing.push(`${suiteId}/${step.id} -> ${step.script}`); - } else { - const mode = fs.statSync(p).mode; - // owner-executable bit must be set - if ((mode & 0o100) === 0) { - missing.push(`${suiteId}/${step.id} -> ${step.script} (not executable)`); - } - } - } - } - expect(missing, `missing/non-executable suite scripts:\n${missing.join("\n")}`).toEqual([]); - }); - - it("all_scenarios_should_have_expected_state_and_suites", () => { - const meta = loadMetadataFromDir(E2E_DIR); - const problems: string[] = []; - for (const [id, sc] of Object.entries(meta.scenarios.setup_scenarios)) { - if (!sc.expected_state) { - problems.push(`${id}: missing expected_state`); - continue; - } - // Negative scenarios (preflight failures) intentionally have no suites. - const state = meta.expectedStates.expected_states[sc.expected_state] as { - failure?: { expected?: boolean }; - }; - const isNegative = state?.failure?.expected === true; - if (!Array.isArray(sc.suites)) { - problems.push(`${id}: suites must be an array`); - continue; - } - if (sc.suites.length === 0 && !isNegative) { - problems.push(`${id}: no suites and not a negative scenario`); - } - } - expect(problems, problems.join("\n")).toEqual([]); - }); - -}); diff --git a/test/e2e-scenario/framework-tests/e2e-negative-matcher.test.ts b/test/e2e-scenario/framework-tests/e2e-negative-matcher.test.ts new file mode 100644 index 0000000000..363cb3fcc9 --- /dev/null +++ b/test/e2e-scenario/framework-tests/e2e-negative-matcher.test.ts @@ -0,0 +1,399 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it } from "vitest"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; + +import { compileRunPlans } from "../scenarios/compiler.ts"; +import { + evaluateNegativeContract, + negativeContractPhaseResult, +} from "../scenarios/orchestrators/negative-matcher.ts"; +import { ScenarioRunner } from "../scenarios/orchestrators/runner.ts"; +import { listScenarios } from "../scenarios/registry.ts"; +import type { + ExpectedFailureContract, + PhaseName, + PhaseResult, + RunContext, + RunPlan, + RunPlanPhase, +} from "../scenarios/types.ts"; + +function freshCtx(): RunContext { + return { contextDir: fs.mkdtempSync(path.join(os.tmpdir(), "e2e-neg-")) }; +} + +function planWithExpectedFailure(contract: ExpectedFailureContract): RunPlan { + return { + scenarioId: "synthetic-negative", + status: "compiled", + suiteIds: [], + onboardingAssertionIds: [], + phases: [ + { name: "environment", actions: [], assertionGroups: [] }, + { name: "onboarding", actions: [], assertionGroups: [] }, + { name: "runtime", actions: [], assertionGroups: [] }, + ], + runnerRequirements: [], + requiredSecrets: [], + skippedCapabilities: [], + expectedFailure: contract, + sutBoundaries: [{ id: "host-cli", client: "HostCliClient" }], + }; +} + +function phaseResult( + phase: PhaseName, + opts: { + status?: PhaseResult["status"]; + failedActionId?: string; + failedActionMessage?: string; + failedAssertionId?: string; + failedAssertionMessage?: string; + } = {}, +): PhaseResult { + return { + phase, + status: opts.status ?? "passed", + actions: opts.failedActionId + ? [{ id: opts.failedActionId, status: "failed", durationMs: 1, message: opts.failedActionMessage }] + : [], + assertions: opts.failedAssertionId + ? [ + { + id: opts.failedAssertionId, + status: "failed", + attempts: 1, + durationMs: 1, + message: opts.failedAssertionMessage, + }, + ] + : [], + }; +} + +describe("evaluateNegativeContract - phase + errorClass matching", () => { + it("matches when expected phase fails with the declared errorClass", () => { + const plan = planWithExpectedFailure({ + phase: "onboarding", + errorClass: "invalid-nvidia-api-key", + forbiddenSideEffects: ["gateway-started"], + }); + const results: PhaseResult[] = [ + phaseResult("environment", { status: "passed" }), + phaseResult("onboarding", { + status: "failed", + failedActionId: "onboarding.profile.cloud-openclaw-invalid-nvidia-key", + failedActionMessage: "phase action onboarding exit 1: invalid-nvidia-api-key auth failed", + }), + ]; + const result = evaluateNegativeContract(plan, results); + expect(result.matched).toBe(true); + expect(result.outcome).toBe("matched"); + expect(result.observed.failedPhase).toBe("onboarding"); + }); + + it("resolves preflight expected phase to onboarding orchestrator", () => { + const plan = planWithExpectedFailure({ + phase: "preflight", + errorClass: "docker-missing", + }); + const results: PhaseResult[] = [ + phaseResult("environment", { status: "passed" }), + phaseResult("onboarding", { + status: "failed", + failedActionId: "onboarding.profile.cloud-openclaw", + failedActionMessage: "preflight detected docker-missing on the runner host", + }), + ]; + const result = evaluateNegativeContract(plan, results); + expect(result.matched).toBe(true); + expect(result.outcome).toBe("matched"); + }); + + it("fails when no failure was observed at all", () => { + const plan = planWithExpectedFailure({ phase: "onboarding", errorClass: "docker-missing" }); + const results: PhaseResult[] = [ + phaseResult("environment", { status: "passed" }), + phaseResult("onboarding", { status: "passed" }), + phaseResult("runtime", { status: "passed" }), + ]; + const result = evaluateNegativeContract(plan, results); + expect(result.matched).toBe(false); + expect(result.outcome).toBe("no-failure-observed"); + expect(result.message).toMatch(/all phases passed/); + }); + + it("fails when the wrong phase failed", () => { + const plan = planWithExpectedFailure({ phase: "onboarding", errorClass: "docker-missing" }); + const results: PhaseResult[] = [ + phaseResult("environment", { + status: "failed", + failedActionId: "environment.install.ubuntu-repo-no-docker", + failedActionMessage: "install dispatcher exit 1: docker-missing", + }), + ]; + const result = evaluateNegativeContract(plan, results); + expect(result.matched).toBe(false); + expect(result.outcome).toBe("wrong-phase"); + expect(result.message).toMatch(/expected onboarding failure/); + expect(result.observed.failedPhase).toBe("environment"); + }); + + it("fails when the right phase failed for the wrong errorClass", () => { + const plan = planWithExpectedFailure({ + phase: "onboarding", + errorClass: "gateway-port-conflict", + }); + const results: PhaseResult[] = [ + phaseResult("onboarding", { + status: "failed", + failedActionId: "onboarding.profile.cloud-openclaw-gateway-port-conflict", + failedActionMessage: "onboard exit 1: invalid-nvidia-api-key authentication failed", + }), + ]; + const result = evaluateNegativeContract(plan, results); + expect(result.matched).toBe(false); + expect(result.outcome).toBe("wrong-error-class"); + expect(result.message).toMatch(/errorClass mismatch/); + }); + + it("ignores the runtime side-effect probe step when scanning for observed failure", () => { + const plan = planWithExpectedFailure({ phase: "onboarding", errorClass: "docker-missing" }); + const results: PhaseResult[] = [ + phaseResult("environment", { status: "passed" }), + phaseResult("onboarding", { + status: "failed", + failedActionId: "onboarding.profile.cloud-openclaw", + failedActionMessage: "onboard exit 1: docker-missing daemon unreachable", + }), + // runtime phase has only the required pending side-effect step + // that fails closed until the probe lands. The matcher must NOT + // treat that as the observed failure mode. + { + phase: "runtime", + status: "failed", + actions: [], + assertions: [ + { + id: "runtime.expected-failure.no-side-effects", + status: "failed", + attempts: 1, + durationMs: 0, + message: "required pending step not implemented: expectedFailureNoSideEffectsProbe", + }, + ], + }, + ]; + const result = evaluateNegativeContract(plan, results); + expect(result.matched).toBe(true); + expect(result.observed.failedActionId).toBe("onboarding.profile.cloud-openclaw"); + }); + + it("matches errorClass case-insensitively and across separator variants", () => { + const plan = planWithExpectedFailure({ phase: "onboarding", errorClass: "docker-missing" }); + const results: PhaseResult[] = [ + phaseResult("onboarding", { + status: "failed", + failedActionId: "onboarding", + failedActionMessage: "Onboard exit 1: Docker_Missing daemon socket unreachable", + }), + ]; + expect(evaluateNegativeContract(plan, results).matched).toBe(true); + }); + + it("throws if invoked for a plan without expectedFailure", () => { + const plan: RunPlan = { ...planWithExpectedFailure({ phase: "onboarding", errorClass: "x" }), expectedFailure: undefined }; + expect(() => evaluateNegativeContract(plan, [])).toThrow(/no expectedFailure declared/); + }); + + it("synthetic phase result reflects matched status", () => { + const plan = planWithExpectedFailure({ phase: "onboarding", errorClass: "docker-missing" }); + const results: PhaseResult[] = [ + phaseResult("onboarding", { + status: "failed", + failedActionId: "onboarding", + failedActionMessage: "docker-missing", + }), + ]; + const synthetic = negativeContractPhaseResult(evaluateNegativeContract(plan, results)); + expect(synthetic.phase).toBe("negative-contract"); + expect(synthetic.status).toBe("passed"); + expect(synthetic.assertions[0]).toEqual( + expect.objectContaining({ id: "negative-contract.match", status: "passed" }), + ); + }); +}); + +describe("ScenarioRunner appends negative-contract phase", () => { + it("invokes matcher and appends a passing synthetic phase when contract matched", async () => { + const ctx = freshCtx(); + try { + const fakePhase = ( + phase: PhaseName, + outcome: PhaseResult, + ) => ({ + run: async ( + _ctx: RunContext, + _runPhase: RunPlanPhase, + _prior?: PhaseResult[], + ): Promise => outcome, + }); + + const runner = new ScenarioRunner({ + environment: fakePhase("environment", { phase: "environment", status: "passed", actions: [], assertions: [] }), + onboarding: fakePhase("onboarding", { + phase: "onboarding", + status: "failed", + actions: [ + { + id: "onboarding.profile.cloud-openclaw", + status: "failed", + durationMs: 1, + message: "onboard exit 1: docker-missing daemon unreachable", + }, + ], + assertions: [], + }), + runtime: fakePhase("runtime", { phase: "runtime", status: "passed", actions: [], assertions: [] }), + }); + + const plan = planWithExpectedFailure({ phase: "preflight", errorClass: "docker-missing" }); + const results = await runner.run(ctx, plan); + + const contractPhase = results[results.length - 1]; + expect(contractPhase.phase).toBe("negative-contract"); + expect(contractPhase.status).toBe("passed"); + + // Artifact emitted to ctx.contextDir/.e2e/negative-contract.json + const artifact = path.join(ctx.contextDir, ".e2e", "negative-contract.json"); + expect(fs.existsSync(artifact)).toBe(true); + const parsed = JSON.parse(fs.readFileSync(artifact, "utf8")); + expect(parsed.matched).toBe(true); + expect(parsed.outcome).toBe("matched"); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("emits a failed synthetic phase when the wrong phase failed", async () => { + const ctx = freshCtx(); + try { + const fakePhase = (outcome: PhaseResult) => ({ + run: async (): Promise => outcome, + }); + + const runner = new ScenarioRunner({ + environment: fakePhase({ + phase: "environment", + status: "failed", + actions: [ + { + id: "environment.install.ubuntu-repo-no-docker", + status: "failed", + durationMs: 1, + message: "install dispatcher exit 1: dns-resolution-error", + }, + ], + assertions: [], + }), + onboarding: fakePhase({ phase: "onboarding", status: "skipped", actions: [], assertions: [] }), + runtime: fakePhase({ phase: "runtime", status: "skipped", actions: [], assertions: [] }), + }); + + const plan = planWithExpectedFailure({ phase: "onboarding", errorClass: "docker-missing" }); + const results = await runner.run(ctx, plan); + + const contractPhase = results[results.length - 1]; + expect(contractPhase.phase).toBe("negative-contract"); + expect(contractPhase.status).toBe("failed"); + expect(contractPhase.assertions[0].message).toMatch(/expected onboarding failure/); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("does NOT append negative-contract phase for positive scenarios", async () => { + const ctx = freshCtx(); + try { + const [plan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); + expect(plan.expectedFailure).toBeUndefined(); + + const fakePhase = (phase: PhaseName) => ({ + run: async (): Promise => ({ + phase, + status: "passed", + actions: [], + assertions: [], + }), + }); + const runner = new ScenarioRunner({ + environment: fakePhase("environment"), + onboarding: fakePhase("onboarding"), + stateValidation: fakePhase("state-validation"), + lifecycle: fakePhase("lifecycle"), + runtime: fakePhase("runtime"), + }); + + const results = await runner.run(ctx, plan); + expect(results.map((r) => r.phase)).toEqual([ + "environment", + "onboarding", + "state-validation", + "lifecycle", + "runtime", + ]); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); +}); + +describe("registry contract: every negative scenario opts into the side-effect probe", () => { + it("scenario.expectedFailure implies the runtime no-side-effects required pending step", () => { + const negatives = listScenarios().filter((scenario) => scenario.expectedFailure); + expect(negatives.length).toBeGreaterThan(0); + for (const scenario of negatives) { + const runtimeGroups = scenario.assertionGroups.filter((group) => group.phase === "runtime"); + const hasProbeStep = runtimeGroups.some((group) => + group.steps.some( + (step) => + step.id === "runtime.expected-failure.no-side-effects" && + step.implementation?.kind === "pending" && + step.required === true, + ), + ); + expect(hasProbeStep, `scenario ${scenario.id} must include the required side-effect pending step`).toBe(true); + } + }); +}); + +describe("compiler validates the typed expected-failure contract", () => { + it("rejects an invalid phase value", () => { + expect(() => + compileRunPlans([ + { + id: "synthetic-bad-phase", + assertionGroups: [], + // Force the bad shape the compiler must reject. + expectedFailure: { phase: "bogus" as never, errorClass: "x" }, + }, + ]), + ).toThrow(/expectedFailure\.phase invalid/); + }); + + it("rejects an empty errorClass", () => { + expect(() => + compileRunPlans([ + { + id: "synthetic-empty-class", + assertionGroups: [], + expectedFailure: { phase: "onboarding", errorClass: "" }, + }, + ]), + ).toThrow(/errorClass must be a non-empty string/); + }); +}); diff --git a/test/e2e-scenario/framework-tests/e2e-phase-orchestrators.test.ts b/test/e2e-scenario/framework-tests/e2e-phase-orchestrators.test.ts index 497dac3387..52ec95cddb 100644 --- a/test/e2e-scenario/framework-tests/e2e-phase-orchestrators.test.ts +++ b/test/e2e-scenario/framework-tests/e2e-phase-orchestrators.test.ts @@ -3,19 +3,39 @@ import { describe, expect, it } from "vitest"; import fs from "node:fs"; +import os from "node:os"; import path from "node:path"; import { HostCliClient } from "../scenarios/clients/host-cli.ts"; import { compileRunPlans } from "../scenarios/compiler.ts"; import { PhaseOrchestrator } from "../scenarios/orchestrators/phase.ts"; import { ScenarioRunner } from "../scenarios/orchestrators/runner.ts"; -import type { AssertionStep, PhaseName, PhaseResult, RunContext, RunPlanPhase } from "../scenarios/types.ts"; +import type { + AssertionStep, + PhaseAction, + PhaseName, + PhaseResult, + RunContext, + RunPlanPhase, +} from "../scenarios/types.ts"; -function fakeCtx(): RunContext { - return { contextDir: fs.mkdtempSync(path.join(process.cwd(), ".tmp-e2e-phase-")), dryRun: true }; +const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); + +function freshCtx(): RunContext { + return { contextDir: fs.mkdtempSync(path.join(os.tmpdir(), "e2e-phase-")) }; +} + +function shellStep(id: string, phase: PhaseName, ref: string, reliability?: AssertionStep["reliability"]): AssertionStep { + return { + id, + phase, + implementation: { kind: "shell", ref }, + evidencePath: `.e2e/assertions/${id}.log`, + reliability, + }; } -function fakeStep(id: string, phase: PhaseName, ref = "fake-pass"): AssertionStep { +function probeStep(id: string, phase: PhaseName, ref = "no-such-probe"): AssertionStep { return { id, phase, @@ -24,97 +44,916 @@ function fakeStep(id: string, phase: PhaseName, ref = "fake-pass"): AssertionSte }; } -function fakePhase(step: AssertionStep): RunPlanPhase { +function pendingStep(id: string, phase: PhaseName): AssertionStep { + return { + id, + phase, + implementation: { kind: "pending", ref: "not-yet" }, + }; +} + +function makePhase(steps: AssertionStep[]): RunPlanPhase { return { - name: step.phase, + name: steps[0].phase, actions: [], - assertionGroups: [{ id: `group.${step.id}`, phase: step.phase, migrationStatus: "complete", steps: [step] }], + assertionGroups: [{ id: `group.${steps[0].id}`, phase: steps[0].phase, migrationStatus: "complete", steps }], + }; +} + +function writeTempScript(dir: string, name: string, body: string): string { + const p = path.join(dir, name); + fs.writeFileSync(p, `#!/usr/bin/env bash\nset -euo pipefail\n${body}\n`, { mode: 0o755 }); + return p; +} + +function shellAction( + id: string, + phase: PhaseName, + scriptRef: string, + opts: { timeoutSeconds?: number; arg?: string } = {}, +): PhaseAction { + return { + id, + phase, + kind: "shell", + scriptRef, + arg: opts.arg, + timeoutSeconds: opts.timeoutSeconds, + }; +} + +function makePhaseWithActions( + phase: PhaseName, + actions: PhaseAction[], + steps: AssertionStep[], +): RunPlanPhase { + return { + name: phase, + actions, + assertionGroups: + steps.length > 0 + ? [{ id: `group.${steps[0].id}`, phase, migrationStatus: "complete", steps }] + : [], }; } -describe("phase orchestrators", () => { +describe("phase orchestrators - top-level delegation", () => { it("test_should_execute_phase_assertions_from_phase_orchestrators_not_top_level_runner", async () => { - const ctx = fakeCtx(); + const ctx = freshCtx(); try { const [plan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); const calls: string[] = []; const fakeOrchestrator = (phase: PhaseName) => ({ run: async (_ctx: RunContext, runPhase: RunPlanPhase, _prior?: PhaseResult[]): Promise => { calls.push(runPhase.name); - return { phase, status: "passed", assertions: [] }; + return { phase, status: "passed", actions: [], assertions: [] }; }, }); const runner = new ScenarioRunner({ environment: fakeOrchestrator("environment"), onboarding: fakeOrchestrator("onboarding"), + stateValidation: fakeOrchestrator("state-validation"), + lifecycle: fakeOrchestrator("lifecycle"), runtime: fakeOrchestrator("runtime"), }); const results = await runner.run(ctx, plan); - expect(calls).toEqual(["environment", "onboarding", "runtime"]); - expect(results.map((result) => result.phase)).toEqual(["environment", "onboarding", "runtime"]); + expect(calls).toEqual([ + "environment", + "onboarding", + "state-validation", + "lifecycle", + "runtime", + ]); + expect(results.map((result) => result.phase)).toEqual([ + "environment", + "onboarding", + "state-validation", + "lifecycle", + "runtime", + ]); } finally { fs.rmSync(ctx.contextDir, { recursive: true, force: true }); } }); +}); - it("test_should_record_step_status_attempts_duration_classifier_and_evidence", async () => { - const ctx = fakeCtx(); +describe("phase orchestrators - real shell execution", () => { + it("shell_step_passes_when_script_exits_zero", async () => { + const ctx = freshCtx(); try { - const step = fakeStep("runtime.retry-pass", "runtime", "fake-retry-once-pass"); - step.reliability = { retry: { attempts: 2, on: ["gateway-transient"] } }; + const script = writeTempScript(ctx.contextDir, "ok.sh", "echo hello-from-real-shell"); + const ref = path.relative(REPO_ROOT, script); + const step = shellStep("runtime.real-pass", "runtime", ref); const orchestrator = new PhaseOrchestrator("runtime"); - const result = await orchestrator.run(ctx, fakePhase(step)); + const result = await orchestrator.run(ctx, makePhase([step])); expect(result.status).toBe("passed"); expect(result.assertions[0]).toEqual( - expect.objectContaining({ - id: "runtime.retry-pass", - status: "passed", - attempts: 2, - classifier: "gateway-transient", - evidence: ".e2e/assertions/runtime.retry-pass.json", - }), + expect.objectContaining({ id: "runtime.real-pass", status: "passed", attempts: 1 }), ); - expect(result.assertions[0].durationMs).toBeGreaterThanOrEqual(0); + const log = fs.readFileSync(result.assertions[0].evidence!, "utf8"); + expect(log).toContain("hello-from-real-shell"); } finally { fs.rmSync(ctx.contextDir, { recursive: true, force: true }); } }); - it("test_should_enforce_timeout_and_retry_policy_in_orchestrator", async () => { - const ctx = fakeCtx(); + it("shell_step_fails_when_script_exits_nonzero_and_records_stderr_tail", async () => { + const ctx = freshCtx(); try { - const step = fakeStep("runtime.retry-fail", "runtime", "fake-always-transient"); - step.reliability = { timeoutSeconds: 1, retry: { attempts: 2, on: ["provider-transient"] } }; + const script = writeTempScript(ctx.contextDir, "fail.sh", 'echo "boom: real failure" >&2; exit 7'); + const ref = path.relative(REPO_ROOT, script); + const step = shellStep("runtime.real-fail", "runtime", ref); const orchestrator = new PhaseOrchestrator("runtime"); - const result = await orchestrator.run(ctx, fakePhase(step)); + const result = await orchestrator.run(ctx, makePhase([step])); expect(result.status).toBe("failed"); - expect(result.assertions[0]).toEqual( - expect.objectContaining({ - id: "runtime.retry-fail", - status: "failed", - attempts: 2, - classifier: "provider-transient", + expect(result.assertions[0].status).toBe("failed"); + expect(result.assertions[0].message).toMatch(/exit 7/); + expect(result.assertions[0].message).toMatch(/boom: real failure/); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("shell_step_times_out_via_orchestrator_policy_not_script", async () => { + const ctx = freshCtx(); + try { + const script = writeTempScript(ctx.contextDir, "slow.sh", "sleep 30"); + const ref = path.relative(REPO_ROOT, script); + const step = shellStep("runtime.real-timeout", "runtime", ref, { timeoutSeconds: 1 }); + const orchestrator = new PhaseOrchestrator("runtime"); + + const started = Date.now(); + const result = await orchestrator.run(ctx, makePhase([step])); + const elapsed = Date.now() - started; + + expect(result.status).toBe("failed"); + expect(result.assertions[0].message).toMatch(/exceeded 1s/); + expect(elapsed).toBeLessThan(15_000); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }, 20_000); + + it("shell_step_retries_on_classified_transient_then_passes", async () => { + const ctx = freshCtx(); + try { + const counterFile = path.join(ctx.contextDir, "counter"); + fs.writeFileSync(counterFile, "0"); + const script = writeTempScript( + ctx.contextDir, + "gateway-flaky.sh", + `n=$(cat "${counterFile}"); n=$((n+1)); echo "$n" > "${counterFile}"; if [ "$n" -lt 2 ]; then echo "gateway-transient: try again" >&2; exit 1; fi; echo ok`, + ); + const ref = path.relative(REPO_ROOT, script); + const step = shellStep("runtime.gateway-retry", "runtime", ref, { + retry: { attempts: 2, on: ["gateway-transient"] }, + }); + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + + expect(result.status).toBe("passed"); + expect(result.assertions[0].attempts).toBe(2); + expect(result.assertions[0].classifier).toBe("gateway-transient"); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("shell_step_fails_with_clear_message_when_script_missing", async () => { + const ctx = freshCtx(); + try { + const step = shellStep("runtime.missing", "runtime", "test/e2e-scenario/does-not-exist.sh"); + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + + expect(result.status).toBe("failed"); + expect(result.assertions[0].message).toMatch(/script not found/); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("probe_step_without_registered_probe_skips_visibly_never_passes_falsely", async () => { + const ctx = freshCtx(); + try { + const step = probeStep("runtime.probe-pending", "runtime"); + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + + expect(result.assertions[0].status).toBe("skipped"); + expect(result.assertions[0].message).toMatch(/probe not registered/); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("pending_step_skips_visibly_with_pending_marker", async () => { + const ctx = freshCtx(); + try { + const step = pendingStep("runtime.pending", "runtime"); + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + + expect(result.assertions[0].status).toBe("skipped"); + expect(result.assertions[0].message).toMatch(/^pending:/); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); +}); + +describe("phase orchestrators - actions execute before assertions", () => { + it("phase_action_runs_before_assertions_and_records_evidence", async () => { + const ctx = freshCtx(); + try { + const actionScript = writeTempScript(ctx.contextDir, "setup.sh", "echo phase-action-evidence"); + const action = shellAction("environment.setup-ok", "environment", path.relative(REPO_ROOT, actionScript)); + const stepScript = writeTempScript(ctx.contextDir, "after.sh", "echo after-action"); + const step = shellStep("environment.assert-ok", "environment", path.relative(REPO_ROOT, stepScript)); + const orchestrator = new PhaseOrchestrator("environment"); + + const result = await orchestrator.run(ctx, makePhaseWithActions("environment", [action], [step])); + + expect(result.status).toBe("passed"); + expect(result.actions).toHaveLength(1); + expect(result.actions[0]).toEqual( + expect.objectContaining({ id: "environment.setup-ok", status: "passed" }), + ); + expect(result.actions[0].evidence).toBeTruthy(); + const actionLog = fs.readFileSync(result.actions[0].evidence!, "utf8"); + expect(actionLog).toContain("phase-action-evidence"); + expect(result.assertions).toHaveLength(1); + expect(result.assertions[0].status).toBe("passed"); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("phase_action_failure_short_circuits_assertions", async () => { + const ctx = freshCtx(); + try { + const failScript = writeTempScript(ctx.contextDir, "fail.sh", 'echo "setup boom" >&2; exit 5'); + const action = shellAction("environment.setup-fail", "environment", path.relative(REPO_ROOT, failScript)); + const stepScript = writeTempScript(ctx.contextDir, "after.sh", "echo should-not-run"); + const step = shellStep("environment.never-runs", "environment", path.relative(REPO_ROOT, stepScript)); + const orchestrator = new PhaseOrchestrator("environment"); + + const result = await orchestrator.run(ctx, makePhaseWithActions("environment", [action], [step])); + + expect(result.status).toBe("failed"); + expect(result.actions).toHaveLength(1); + expect(result.actions[0].status).toBe("failed"); + expect(result.actions[0].message).toMatch(/exit 5/); + // Assertions must NOT have run, so they must NOT show a misleading + // pass for an environment that was never set up. + expect(result.assertions).toEqual([]); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("phase_action_times_out_via_orchestrator_policy", async () => { + const ctx = freshCtx(); + try { + const slow = writeTempScript(ctx.contextDir, "slow.sh", "sleep 30"); + const action = shellAction("environment.setup-slow", "environment", path.relative(REPO_ROOT, slow), { + timeoutSeconds: 1, + }); + const orchestrator = new PhaseOrchestrator("environment"); + + const started = Date.now(); + const result = await orchestrator.run(ctx, makePhaseWithActions("environment", [action], [])); + + expect(result.status).toBe("failed"); + expect(result.actions[0].status).toBe("failed"); + expect(result.actions[0].message).toMatch(/exceeded 1s/); + // The orchestrator must enforce the timeout, not depend on the + // script self-killing. Allow some headroom but fail if we waited + // anywhere near the script's 30s sleep. + expect(Date.now() - started).toBeLessThan(15_000); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("phase_action_publishes_alias_path_on_success", async () => { + const ctx = freshCtx(); + try { + const actionScript = writeTempScript(ctx.contextDir, "alias.sh", "echo aliased-output"); + const action: PhaseAction = { + id: "onboarding.profile.alias-demo", + phase: "onboarding", + kind: "shell", + scriptRef: path.relative(REPO_ROOT, actionScript), + aliasPath: "onboard.log", + }; + const orchestrator = new PhaseOrchestrator("onboarding"); + + const result = await orchestrator.run(ctx, makePhaseWithActions("onboarding", [action], [])); + + expect(result.actions[0].status).toBe("passed"); + const aliasContents = fs.readFileSync(path.join(ctx.contextDir, "onboard.log"), "utf8"); + expect(aliasContents).toContain("aliased-output"); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("phase_action_evidence_log_is_flushed_before_resolve", async () => { + const ctx = freshCtx(); + try { + const actionScript = writeTempScript(ctx.contextDir, "flush.sh", "echo flushed-phase-action-output"); + const action = shellAction("environment.flush", "environment", path.relative(REPO_ROOT, actionScript)); + const orchestrator = new PhaseOrchestrator("environment"); + + const result = await orchestrator.run(ctx, makePhaseWithActions("environment", [action], [])); + + // Synchronous read must already see the output - the orchestrator + // must wait for the WriteStream's 'finish' before resolving. + const log = fs.readFileSync(result.actions[0].evidence!, "utf8"); + expect(log).toContain("flushed-phase-action-output"); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); +}); + +describe("plan compiler emits phase actions for canonical scenarios", () => { + it("compiler_emits_install_and_onboard_actions_for_canonical_scenarios", async () => { + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + const ids = [ + "ubuntu-repo-cloud-openclaw", + "ubuntu-repo-cloud-hermes", + "gpu-repo-local-ollama-openclaw", + "macos-repo-cloud-openclaw", + "wsl-repo-cloud-openclaw", + "brev-launchable-cloud-openclaw", + "ubuntu-no-docker-preflight-negative", + ]; + const plans = compileRunPlans(ids); + expect(plans).toHaveLength(ids.length); + for (const plan of plans) { + const env = plan.phases.find((p) => p.name === "environment")!; + const onb = plan.phases.find((p) => p.name === "onboarding")!; + expect(env.actions.some((a) => a.id.startsWith("environment.install."))).toBe(true); + expect(onb.actions.some((a) => a.id.startsWith("onboarding.profile."))).toBe(true); + // context.env emission is framework infrastructure (ScenarioRunner), + // not a shell action. The compiler must NOT emit a shell context + // action - if it did we'd be coupling back to the old resolver's + // plan.json shape. + expect(env.actions.map((a) => a.id)).not.toContain("environment.context.emit"); + // Onboarding action must publish a stable alias path so legacy + // shell assertions referencing ${E2E_CONTEXT_DIR}/onboard.log + // keep working without coupling them to action ids. + const onboardingAction = onb.actions.find((a) => a.id.startsWith("onboarding.profile.")); + expect(onboardingAction?.aliasPath).toBe("onboard.log"); + // Every install/onboard action must be a typed shell-fn referencing + // the canonical dispatcher script - no free-form strings. + for (const action of [...env.actions, ...onb.actions]) { + if (action.id.startsWith("environment.install.") || action.id.startsWith("onboarding.profile.")) { + expect(action.kind).toBe("shell-fn"); + expect(action.scriptRef).toMatch(/dispatch\.sh$/); + expect(action.fn).toMatch(/^e2e_(install|onboard)$/); + expect(action.arg).toBeTruthy(); + } + } + } + }); + + it("compiler_routes_docker_missing_runtime_to_no_docker_onboarding_profile", async () => { + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + // Negative scenario declares runtime=docker-missing in scenarios.yaml. + // The compiler must substitute the onboarding profile id from the + // base 'cloud-openclaw' to 'cloud-openclaw-no-docker' so the + // dispatcher routes to the worker that installs the docker shim and + // captures negative-preflight.log. Without this routing, the + // 'onboarding.preflight.expected-failed' assertion has nothing to grep. + const [plan] = compileRunPlans(["ubuntu-no-docker-preflight-negative"]); + const onb = plan.phases.find((p) => p.name === "onboarding")!; + const action = onb.actions.find((a) => a.id.startsWith("onboarding.profile.")); + expect(action?.id).toBe("onboarding.profile.cloud-openclaw-no-docker"); + expect(action?.arg).toBe("cloud-openclaw-no-docker"); + expect(action?.evidencePath).toBe( + ".e2e/actions/onboarding.profile.cloud-openclaw-no-docker.log", + ); + // Secret env must still include NVIDIA_API_KEY so behavior matches + // a real user invocation (CLI loads creds even if preflight aborts). + expect(action?.secretEnv).toContain("NVIDIA_API_KEY"); + // Positive scenarios must NOT pick up the -no-docker suffix. + const [posPlan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); + const posAction = posPlan.phases + .find((p) => p.name === "onboarding")! + .actions.find((a) => a.id.startsWith("onboarding.profile.")); + expect(posAction?.arg).toBe("cloud-openclaw"); + }); + + it("compiler_emits_lifecycle_phase_action_when_scenario_declares_lifecycle_profile", async () => { + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + // Rebuild scenario declares environment.lifecycle = + // 'rebuild-current-version'. The compiler must emit a single + // lifecycle phase action that dispatches to the canonical + // lifecycle dispatcher; without this, runtime-phase rebuild + // assertions run against a sandbox that was never rebuilt. + const [plan] = compileRunPlans(["ubuntu-rebuild-openclaw"]); + const lifecycle = plan.phases.find((p) => p.name === "lifecycle")!; + expect(lifecycle).toBeTruthy(); + expect(lifecycle.actions).toHaveLength(1); + const action = lifecycle.actions[0]; + expect(action.id).toBe("lifecycle.profile.rebuild-current-version"); + expect(action.arg).toBe("rebuild-current-version"); + expect(action.scriptRef).toMatch(/lifecycle\/dispatch\.sh$/); + expect(action.fn).toBe("e2e_lifecycle"); + expect(action.evidencePath).toBe( + ".e2e/actions/lifecycle.profile.rebuild-current-version.log", + ); + // Secret env: nemoclaw rebuild re-reads NVIDIA_API_KEY when the + // post-rebuild sandbox is brought back up. + expect(action.secretEnv).toContain("NVIDIA_API_KEY"); + }); + + it("compiler_emits_no_lifecycle_actions_when_scenario_does_not_declare_lifecycle", async () => { + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + // Default scenarios omit environment.lifecycle. The lifecycle + // phase still appears in the plan (deterministic phase order) + // but emits zero actions and runs no assertions. + const [plan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); + const lifecycle = plan.phases.find((p) => p.name === "lifecycle")!; + expect(lifecycle).toBeTruthy(); + expect(lifecycle.actions).toHaveLength(0); + expect(lifecycle.assertionGroups).toHaveLength(0); + }); + + it("compiler_drops_rebuild_and_upgrade_supplemental_suites_from_cloud_openclaw", async () => { + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + // The 'rebuild' and 'upgrade' suites used to be supplementally + // attached to ubuntu-repo-cloud-openclaw, which produced + // fake-failures (no rebuild ran -> nothing could be preserved). + // Coverage now lives on ubuntu-rebuild-openclaw, which actually + // runs the lifecycle phase. The cloud-openclaw scenario must NOT + // include those suites' assertion groups. + const [plan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); + const runtime = plan.phases.find((p) => p.name === "runtime")!; + const groupIds = runtime.assertionGroups.map((g) => g.id); + expect(groupIds).not.toContain("suite.rebuild"); + expect(groupIds).not.toContain("suite.upgrade"); + }); + + it("compiler_includes_rebuild_and_upgrade_groups_on_ubuntu_rebuild_openclaw", async () => { + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + const [plan] = compileRunPlans(["ubuntu-rebuild-openclaw"]); + const runtime = plan.phases.find((p) => p.name === "runtime")!; + const groupIds = runtime.assertionGroups.map((g) => g.id); + expect(groupIds).toContain("suite.rebuild"); + expect(groupIds).toContain("suite.upgrade"); + }); +}); + +describe("ScenarioRunner seeds context.env and short-circuits across phases", () => { + it("seedContextEnv_writes_normalized_keys_at_top_level_context_env_path", async () => { + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + const { seedContextEnv } = await import("../scenarios/orchestrators/context.ts"); + const ctx = freshCtx(); + try { + const [plan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); + const result = seedContextEnv(ctx, plan); + + // Path matches the shell helper's e2e_context_init: top-level, + // not under .e2e/. Runtime steps source ${E2E_CONTEXT_DIR}/context.env. + expect(result.path).toBe(path.join(ctx.contextDir, "context.env")); + const body = fs.readFileSync(result.path, "utf8"); + // Required keys downstream shell assertions look up. + expect(body).toMatch(/^E2E_SCENARIO=ubuntu-repo-cloud-openclaw$/m); + expect(body).toMatch(/^E2E_PLATFORM_OS=ubuntu$/m); + expect(body).toMatch(/^E2E_AGENT=openclaw$/m); + expect(body).toMatch(/^E2E_PROVIDER=nvidia$/m); + expect(body).toMatch(/^E2E_GATEWAY_URL=http:\/\/127\.0\.0\.1:18789$/m); + expect(body).toMatch(/^E2E_SANDBOX_NAME=e2e-ubuntu-repo-cloud-openclaw$/m); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("hermes_scenario_seeds_hermes_gateway_url", async () => { + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + const { seedContextEnv } = await import("../scenarios/orchestrators/context.ts"); + const ctx = freshCtx(); + try { + const [plan] = compileRunPlans(["ubuntu-repo-cloud-hermes"]); + const result = seedContextEnv(ctx, plan); + const body = fs.readFileSync(result.path, "utf8"); + expect(body).toMatch(/^E2E_AGENT=hermes$/m); + expect(body).toMatch(/^E2E_GATEWAY_URL=http:\/\/127\.0\.0\.1:8642$/m); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("runner_skips_downstream_phases_when_prior_phase_action_fails", async () => { + const { ScenarioRunner } = await import("../scenarios/orchestrators/runner.ts"); + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + const ctx = freshCtx(); + try { + const [plan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); + // Inject a failing environment phase to simulate an install action + // failure. Onboarding and runtime must report skipped, not run + // their own actions or assertions. + const failingEnv = { + run: async () => ({ + phase: "environment" as const, + status: "failed" as const, + actions: [ + { + id: "environment.install.repo-current", + status: "failed" as const, + durationMs: 5, + message: "simulated install failure", + }, + ], + assertions: [], + }), + }; + let onboardingCalled = false; + let runtimeCalled = false; + const onboarding = { + run: async () => { + onboardingCalled = true; + return { phase: "onboarding" as const, status: "passed" as const, actions: [], assertions: [] }; + }, + }; + const runtime = { + run: async () => { + runtimeCalled = true; + return { phase: "runtime" as const, status: "passed" as const, actions: [], assertions: [] }; + }, + }; + let stateValidationCalled = false; + const stateValidation = { + run: async () => { + stateValidationCalled = true; + return { + phase: "state-validation" as const, + status: "passed" as const, + actions: [], + assertions: [], + }; + }, + }; + const runner = new ScenarioRunner({ + environment: failingEnv, + onboarding, + stateValidation, + runtime, + }); + + const results = await runner.run(ctx, plan); + + // Downstream orchestrators must NOT have been invoked. An + // environment failure means install never ran; there is nothing + // for state-validation to probe. + expect(onboardingCalled).toBe(false); + expect(stateValidationCalled).toBe(false); + expect(runtimeCalled).toBe(false); + // Each phase still has a result, and the downstream ones are + // skipped with a message that names the blocking action. + expect(results.map((r) => r.phase)).toEqual([ + "environment", + "onboarding", + "state-validation", + "lifecycle", + "runtime", + ]); + expect(results[1].status).toBe("skipped"); + expect(results[2].status).toBe("skipped"); + expect(results[3].status).toBe("skipped"); + expect(results[4].status).toBe("skipped"); + expect(results[1].assertions[0].message).toMatch(/blocked by prior failure/); + expect(results[1].assertions[0].message).toMatch(/environment.install.repo-current/); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("runner_does_not_short_circuit_on_assertion_failure_only", async () => { + // Assertion failures (as opposed to action failures) must not block + // downstream phases - reviewers need to see all failure layers. + const { ScenarioRunner } = await import("../scenarios/orchestrators/runner.ts"); + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + const ctx = freshCtx(); + try { + const [plan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); + const env = { + run: async () => ({ + phase: "environment" as const, + status: "failed" as const, + actions: [], + assertions: [ + { id: "environment.something", status: "failed" as const, attempts: 1, durationMs: 1 }, + ], }), + }; + let onboardingCalled = false; + const onboarding = { + run: async () => { + onboardingCalled = true; + return { phase: "onboarding" as const, status: "passed" as const, actions: [], assertions: [] }; + }, + }; + const runner = new ScenarioRunner({ + environment: env, + onboarding, + runtime: { + run: async () => ({ phase: "runtime" as const, status: "passed" as const, actions: [], assertions: [] }), + }, + }); + + await runner.run(ctx, plan); + expect(onboardingCalled).toBe(true); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); +}); + +describe("required probe and pending steps fail closed", () => { + it("test_required_probe_step_that_is_unregistered_fails_the_phase", async () => { + const ctx = freshCtx(); + try { + const step: AssertionStep = { + id: "runtime.security.required-probe", + phase: "runtime", + implementation: { kind: "probe", ref: "unregisteredSecurityProbe" }, + evidencePath: ".e2e/assertions/runtime.security.required-probe.json", + required: true, + }; + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + + expect(result.status).toBe("failed"); + expect(result.assertions[0].status).toBe("failed"); + expect(result.assertions[0].message).toMatch(/required probe not registered/); + expect(result.assertions[0].message).toContain("unregisteredSecurityProbe"); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("test_non_required_probe_step_continues_to_skip_visibly", async () => { + const ctx = freshCtx(); + try { + const step: AssertionStep = { + id: "runtime.diagnostics.non-required-probe", + phase: "runtime", + // Use an intentionally-unregistered ref so this test exercises + // the "missing probe" code path. `diagnosticsProbe` is now a + // real built-in registered at orchestrator import time, so + // referring to it here would actually invoke nemoclaw and the + // assertion would fail (or pass) on real CLI behavior — + // unrelated to what this test verifies. + implementation: { kind: "probe", ref: "unregisteredFakeProbe" }, + evidencePath: ".e2e/assertions/runtime.diagnostics.non-required-probe.json", + // required intentionally omitted (defaults to false) + }; + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + + expect(result.assertions[0].status).toBe("skipped"); + expect(result.assertions[0].message).toMatch(/probe not registered/); + // Non-required skipped step does not fail the phase. + expect(result.status).not.toBe("failed"); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("test_required_pending_step_fails_closed", async () => { + const ctx = freshCtx(); + try { + const step: AssertionStep = { + id: "runtime.expected-failure.no-side-effects", + phase: "runtime", + implementation: { kind: "pending", ref: "expectedFailureNoSideEffectsProbe" }, + evidencePath: ".e2e/assertions/runtime.expected-failure.no-side-effects.json", + required: true, + }; + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + + expect(result.status).toBe("failed"); + expect(result.assertions[0].status).toBe("failed"); + expect(result.assertions[0].message).toMatch(/required pending step not implemented/); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("test_security_suite_groups_in_registry_mark_their_steps_as_required", async () => { + const { assertionGroupForSuite } = await import("../scenarios/assertions/registry.ts"); + for (const suiteId of ["security-shields", "security-policy", "security-injection"]) { + const group = assertionGroupForSuite(suiteId); + expect(group, `missing assertion group for suite ${suiteId}`).toBeDefined(); + for (const step of group?.steps ?? []) { + expect( + step.required, + `${suiteId} step ${step.id} must be required so it fails closed`, + ).toBe(true); + } + } + }); + + it("test_expected_failure_no_side_effects_step_in_registry_is_required", async () => { + const { assertionRegistry } = await import("../scenarios/assertions/registry.ts"); + const group = assertionRegistry.groups.find( + (g) => g.id === "runtime.expected-failure.no-side-effects", + ); + expect(group).toBeDefined(); + for (const step of group?.steps ?? []) { + expect(step.required).toBe(true); + } + }); +}); + +describe("framework-owned secret hygiene at the spawn boundary", () => { + it("test_should_not_persist_secret_shaped_child_output_into_evidence", async () => { + const ctx = freshCtx(); + try { + // Child writes secret-shaped tokens (NVIDIA, GitHub, OpenAI, + // Slack, Bearer-prefixed) on both stdout and stderr, then exits + // non-zero so stderrTail also flows into result.message. None of + // those literal tokens may persist anywhere in the evidence. + const body = [ + 'echo "step prints nvapi-1234567890abcdef0123456789"', + 'echo "and ghp_abcdefghijklmnopqrstuvwxyz0123456789"', + 'echo "and sk-abcdefghijklmnopqrstuvwxyz0123456789"', + 'echo "and xoxb-9876543210-fake-bot-token-abc"', + 'echo "Authorization: Bearer eyJhbGciOiJIUzI1NiJ9.payload.signature" 1>&2', + 'exit 7', + ].join("\n"); + const script = writeTempScript(ctx.contextDir, "leak.sh", body); + const ref = path.relative(REPO_ROOT, script); + const step = shellStep("runtime.leak", "runtime", ref); + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + const assertion = result.assertions[0]; + const logBody = fs.readFileSync(path.join(ctx.contextDir, ".e2e", "logs", `${step.id}.log`), "utf8"); + const phaseResultJson = fs.readFileSync( + path.join(ctx.contextDir, ".e2e", "runtime.result.json"), + "utf8", + ); + const surfaces = [logBody, assertion.message ?? "", phaseResultJson]; + + // Every secret-shaped token canonicalized in + // src/lib/security/secret-patterns.ts must be redacted on the + // way to disk, regardless of which surface is read. + const forbiddenPatterns = [ + /nvapi-[A-Za-z0-9_-]{10,}/, + /ghp_[A-Za-z0-9_-]{10,}/, + /sk-[A-Za-z0-9_-]{20,}/, + /(?:xox[bpas]|xapp)-[A-Za-z0-9-]{10,}/, + /Bearer\s+[A-Za-z0-9_.+\/=-]{10,}/i, + ]; + for (const surface of surfaces) { + for (const pat of forbiddenPatterns) { + expect(surface, `evidence surface must not contain ${pat}`).not.toMatch(pat); + } + expect(surface).toMatch(//); + } + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("test_should_drop_non_allowlisted_parent_env_unless_declared_in_secretEnv", async () => { + const ctx = freshCtx(); + const sentinelKey = "SECRET_LEAK_PROBE_TOKEN"; + const previous = process.env[sentinelKey]; + process.env[sentinelKey] = "sentinel-value-that-must-not-leak"; + try { + const script = writeTempScript( + ctx.contextDir, + "env-leak.sh", + `printenv | sort\n`, + ); + const ref = path.relative(REPO_ROOT, script); + // Step does NOT declare SECRET_LEAK_PROBE_TOKEN in secretEnv, + // so the framework must drop it before spawn. + const step = shellStep("runtime.env-drop", "runtime", ref); + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + const logBody = fs.readFileSync(path.join(ctx.contextDir, ".e2e", "logs", `${step.id}.log`), "utf8"); + + expect(result.assertions[0].status).toBe("passed"); + expect(logBody, "non-allowlisted parent env must not reach the child").not.toContain(sentinelKey); + expect(logBody).not.toContain("sentinel-value-that-must-not-leak"); + // Framework allowlist + overlay still arrive: PATH and E2E_PHASE. + expect(logBody).toMatch(/^PATH=/m); + expect(logBody).toMatch(/^E2E_PHASE=runtime$/m); + } finally { + if (previous === undefined) delete process.env[sentinelKey]; + else process.env[sentinelKey] = previous; + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("test_should_pass_declared_secretEnv_through_to_child", async () => { + const ctx = freshCtx(); + const declaredKey = "NEMOCLAW_TEST_API_KEY"; // matches SECRET_ENV_KEY_SHAPE + const previous = process.env[declaredKey]; + process.env[declaredKey] = "declared-secret-value-passes-through"; + try { + const script = writeTempScript( + ctx.contextDir, + "declared.sh", + `printenv ${declaredKey} || echo MISSING\n`, ); + const ref = path.relative(REPO_ROOT, script); + const step: AssertionStep = { + ...shellStep("runtime.env-declared", "runtime", ref), + secretEnv: [declaredKey], + }; + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + const logBody = fs.readFileSync(path.join(ctx.contextDir, ".e2e", "logs", `${step.id}.log`), "utf8"); + + expect(result.assertions[0].status).toBe("passed"); + // Declared secret reaches the child verbatim. + expect(logBody).toContain("declared-secret-value-passes-through"); + // It is NOT redacted in printenv output because nothing about + // the literal value matches a token-shape pattern. (Real + // secrets that match secret-patterns.ts WILL be redacted as a + // second line of defense; this synthetic value is intentionally + // shape-free to isolate the env-passthrough behavior.) } finally { + if (previous === undefined) delete process.env[declaredKey]; + else process.env[declaredKey] = previous; fs.rmSync(ctx.contextDir, { recursive: true, force: true }); } }); + it("test_should_reject_non_secret_shaped_keys_in_secretEnv_at_runtime", async () => { + const { buildChildEnv } = await import("../scenarios/orchestrators/redaction.ts"); + expect(() => + buildChildEnv(process.env, { secretEnv: ["FOO_VAR"], frameworkOverlay: {} }), + ).toThrow(/secret-key shape/); + }); + + it("test_should_declare_NVIDIA_API_KEY_only_for_cloud_onboarding_actions", async () => { + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + const plans = compileRunPlans([ + "ubuntu-repo-cloud-openclaw", + "gpu-repo-local-ollama-openclaw", + ]); + const cloudOnboard = plans[0].phases + .find((p) => p.name === "onboarding") + ?.actions.find((a) => a.id.startsWith("onboarding.profile.")); + const localOnboard = plans[1].phases + .find((p) => p.name === "onboarding") + ?.actions.find((a) => a.id.startsWith("onboarding.profile.")); + expect(cloudOnboard?.secretEnv).toEqual(["NVIDIA_API_KEY"]); + expect(localOnboard?.secretEnv).toEqual([]); + }); +}); + +describe("clients are pass/fail/policy free", () => { it("test_should_keep_clients_free_of_pass_fail_and_retry_semantics", () => { - const source = fs.readFileSync( - path.join(process.cwd(), "test/e2e-scenario/scenarios/clients/host-cli.ts"), - "utf8", - ); const observation = new HostCliClient().observeVersion(); + // The client returns a raw act/observe shape only: the command it would + // run. It must NOT decide pass/fail, attach retry policy, surface a + // classifier, or expose AssertionResult/PhaseResult-shaped fields. expect(observation).toEqual(expect.objectContaining({ command: ["nemoclaw", "--version"] })); - expect(source).not.toMatch(/AssertionResult|PhaseResult|retry|timeout|passed|failed/); + // Raw act/observe fields are allowed (exitCode/stdout/stderr/timing). + // Pass/fail and reliability-policy fields are not. + const forbiddenKeys = [ + "status", + "attempts", + "classifier", + "evidence", + "retry", + "timeout", + "timeoutSeconds", + "phase", + "assertions", + "passed", + "failed", + ]; + for (const key of forbiddenKeys) { + expect(observation).not.toHaveProperty(key); + } }); }); diff --git a/test/e2e-scenario/framework-tests/e2e-probes.test.ts b/test/e2e-scenario/framework-tests/e2e-probes.test.ts new file mode 100644 index 0000000000..db90b47798 --- /dev/null +++ b/test/e2e-scenario/framework-tests/e2e-probes.test.ts @@ -0,0 +1,670 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; +import { + listRegisteredProbes, + lookupProbe, + registerProbe, + resetProbeRegistry, +} from "../scenarios/probes/registry.ts"; +import type { ProbeContext, ProbeOutcome } from "../scenarios/probes/types.ts"; +import { registerBuiltinProbes } from "../scenarios/probes/builtin.ts"; + +const REPO_ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../.."); + +describe("probe registry", () => { + // The orchestrator side-effect-imports builtin.ts at module load, + // so the registry already contains the built-ins. Each test resets + // and re-registers explicitly so order independence holds. + beforeEach(() => { + resetProbeRegistry(); + }); + + afterEach(() => { + // Restore the production wiring so subsequent test files don't + // see an empty registry (vitest shares module state across files + // within a worker). + resetProbeRegistry(); + registerBuiltinProbes(); + }); + + it("registerProbe_lookupProbe_round_trip", () => { + const fn = async (): Promise => ({ status: "passed" }); + registerProbe("myProbe", fn); + expect(lookupProbe("myProbe")).toBe(fn); + }); + + it("lookupProbe_returns_undefined_for_unknown_ref", () => { + expect(lookupProbe("nonexistent")).toBeUndefined(); + }); + + it("registerProbe_rejects_duplicate_registration", () => { + const fn = async (): Promise => ({ status: "passed" }); + registerProbe("dup", fn); + expect(() => registerProbe("dup", fn)).toThrow(/already registered/); + }); + + it("registerProbe_rejects_empty_name", () => { + const fn = async (): Promise => ({ status: "passed" }); + expect(() => registerProbe("", fn)).toThrow(/name is required/); + }); + + it("listRegisteredProbes_returns_sorted_names", () => { + registerProbe("zeta", async () => ({ status: "passed" })); + registerProbe("alpha", async () => ({ status: "passed" })); + registerProbe("mu", async () => ({ status: "passed" })); + expect(listRegisteredProbes()).toEqual(["alpha", "mu", "zeta"]); + }); + + it("registerBuiltinProbes_is_idempotent", () => { + registerBuiltinProbes(); + const first = listRegisteredProbes(); + expect(first).toContain("diagnosticsProbe"); + expect(first).toContain("docsValidationProbe"); + // Calling again must not throw on duplicate names. + expect(() => registerBuiltinProbes()).not.toThrow(); + expect(listRegisteredProbes()).toEqual(first); + }); + + it("registerBuiltinProbes_registers_security_probes", () => { + // shieldsConfig / networkPolicy / injectionBlocked are marked + // `required: true` in scenarios/assertions/registry.ts. The + // orchestrator fails closed when a required probe is missing, + // so registering all three turns the security suites from + // 'silently skipped' into 'actually verified'. + registerBuiltinProbes(); + const registered = listRegisteredProbes(); + expect(registered).toContain("shieldsConfigProbe"); + expect(registered).toContain("networkPolicyProbe"); + expect(registered).toContain("injectionBlockedProbe"); + }); +}); + +// ───────────────────────────────────────────────────────────────────────────── +// diagnosticsProbe — uses a fake `nemoclaw` on PATH so this test runs +// reproducibly without depending on a real nemoclaw install. +// ───────────────────────────────────────────────────────────────────────────── + +function makeProbeCtx(tmp: string, evidenceFile = "diag-evidence.json"): ProbeContext { + // contextDir doubles as the parent of the evidence file when the + // step does not specify an explicit path. Tests pass an explicit + // path here to keep the file under tmp. + return { + contextDir: tmp, + evidencePath: path.join(tmp, evidenceFile), + contextEnv: {}, + sandboxName: null, + gatewayUrl: null, + repoRoot: REPO_ROOT, + }; +} + +function installFakeOnPath( + binDir: string, + name: string, + script: string, +): { restore: () => void } { + fs.mkdirSync(binDir, { recursive: true }); + fs.writeFileSync(path.join(binDir, name), script, { mode: 0o755 }); + const oldPath = process.env.PATH; + process.env.PATH = `${binDir}:${oldPath ?? ""}`; + return { + restore: () => { + process.env.PATH = oldPath; + }, + }; +} + +describe("diagnosticsProbe", () => { + it("passes_when_nemoclaw_debug_quick_writes_a_non_empty_archive", async () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "diag-probe-pass-")); + const fake = installFakeOnPath( + path.join(tmp, "bin"), + "nemoclaw", + `#!/usr/bin/env bash +# Stub: locate the --output value and write a small non-empty archive there. +out="" +while [[ "$#" -gt 0 ]]; do + case "$1" in + --output) out="$2"; shift 2 ;; + *) shift ;; + esac +done +[[ -n "$out" ]] || { echo "no --output" >&2; exit 2; } +printf 'fake-archive-bytes' > "$out" +exit 0 +`, + ); + try { + const { diagnosticsProbe } = await import("../scenarios/probes/diagnostics.ts"); + const outcome = await diagnosticsProbe(makeProbeCtx(tmp)); + expect(outcome.status).toBe("passed"); + expect(outcome.message).toMatch(/bundle ok/); + // Evidence JSON must exist and parse. + const ev = JSON.parse(fs.readFileSync(path.join(tmp, "diag-evidence.json"), "utf8")); + expect(ev.exitCode).toBe(0); + expect(ev.archiveSize).toBeGreaterThan(0); + } finally { + fake.restore(); + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); + + it("fails_when_nemoclaw_exits_nonzero", async () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "diag-probe-fail-")); + const fake = installFakeOnPath( + path.join(tmp, "bin"), + "nemoclaw", + `#!/usr/bin/env bash\necho "boom" >&2\nexit 7\n`, + ); + try { + const { diagnosticsProbe } = await import("../scenarios/probes/diagnostics.ts"); + const outcome = await diagnosticsProbe(makeProbeCtx(tmp)); + expect(outcome.status).toBe("failed"); + expect(outcome.message).toMatch(/exited 7/); + const ev = JSON.parse(fs.readFileSync(path.join(tmp, "diag-evidence.json"), "utf8")); + expect(ev.exitCode).toBe(7); + expect(ev.stderrTail).toContain("boom"); + } finally { + fake.restore(); + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); + + it("fails_when_archive_is_empty", async () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "diag-probe-empty-")); + const fake = installFakeOnPath( + path.join(tmp, "bin"), + "nemoclaw", + `#!/usr/bin/env bash +out="" +while [[ "$#" -gt 0 ]]; do + case "$1" in --output) out="$2"; shift 2 ;; *) shift ;; esac +done +: > "$out" # zero-byte archive +exit 0 +`, + ); + try { + const { diagnosticsProbe } = await import("../scenarios/probes/diagnostics.ts"); + const outcome = await diagnosticsProbe(makeProbeCtx(tmp)); + expect(outcome.status).toBe("failed"); + expect(outcome.message).toMatch(/empty/); + } finally { + fake.restore(); + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); +}); + +// ───────────────────────────────────────────────────────────────────────────── +// docsValidationProbe — substitutes a fake check-docs.sh by overriding +// the repoRoot in the ProbeContext so the resolved path points at a +// scratch dir we control. +// ───────────────────────────────────────────────────────────────────────────── + +describe("docsValidationProbe", () => { + function setupFakeCheckDocs( + tmp: string, + cliExit: number, + linksExit: number, + ): { ctx: ProbeContext } { + const scriptDir = path.join(tmp, "test/e2e/e2e-cloud-experimental"); + fs.mkdirSync(scriptDir, { recursive: true }); + fs.writeFileSync( + path.join(scriptDir, "check-docs.sh"), + `#!/usr/bin/env bash +case "$1" in + --only-cli) exit ${cliExit} ;; + --only-links) exit ${linksExit} ;; + *) echo "unknown: $*" >&2; exit 99 ;; +esac +`, + { mode: 0o755 }, + ); + return { + ctx: { + contextDir: tmp, + evidencePath: path.join(tmp, "docs-evidence.json"), + contextEnv: {}, + sandboxName: null, + gatewayUrl: null, + repoRoot: tmp, // probe resolves check-docs.sh against this + }, + }; + } + + it("passes_when_both_cli_and_links_checks_exit_zero", async () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "docs-probe-pass-")); + try { + const { ctx } = setupFakeCheckDocs(tmp, 0, 0); + const { docsValidationProbe } = await import("../scenarios/probes/docs-validation.ts"); + const outcome = await docsValidationProbe(ctx); + expect(outcome.status).toBe("passed"); + const ev = JSON.parse(fs.readFileSync(ctx.evidencePath, "utf8")); + expect(ev.results).toHaveLength(2); + expect(ev.results[0].phase).toBe("cli-parity"); + expect(ev.results[0].exitCode).toBe(0); + expect(ev.results[1].phase).toBe("links-local"); + expect(ev.results[1].exitCode).toBe(0); + } finally { + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); + + it("fails_when_cli_parity_check_exits_nonzero", async () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "docs-probe-cli-fail-")); + try { + const { ctx } = setupFakeCheckDocs(tmp, 3, 0); + const { docsValidationProbe } = await import("../scenarios/probes/docs-validation.ts"); + const outcome = await docsValidationProbe(ctx); + expect(outcome.status).toBe("failed"); + expect(outcome.message).toMatch(/CLI\/docs parity failed.*exit 3/); + } finally { + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); + + it("fails_when_links_check_exits_nonzero", async () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "docs-probe-links-fail-")); + try { + const { ctx } = setupFakeCheckDocs(tmp, 0, 5); + const { docsValidationProbe } = await import("../scenarios/probes/docs-validation.ts"); + const outcome = await docsValidationProbe(ctx); + expect(outcome.status).toBe("failed"); + expect(outcome.message).toMatch(/markdown link check failed.*exit 5/); + } finally { + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); + + it("fails_with_actionable_message_when_check_docs_script_missing", async () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "docs-probe-missing-")); + try { + const { docsValidationProbe } = await import("../scenarios/probes/docs-validation.ts"); + const ctx: ProbeContext = { + contextDir: tmp, + evidencePath: path.join(tmp, "docs-evidence.json"), + contextEnv: {}, + sandboxName: null, + gatewayUrl: null, + repoRoot: tmp, // no test/e2e/... tree under tmp + }; + const outcome = await docsValidationProbe(ctx); + expect(outcome.status).toBe("failed"); + expect(outcome.message).toMatch(/check-docs\.sh not found/); + } finally { + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); +}); + +// ────────────────────────────────────────────────────────────────────────── +// Security probes — stub `nemoclaw` (host CLI) and `openshell` so the +// canonical sandbox-exec wrapper resolves through the stub. The +// wrapper's openshell-fallback path is exercised because the stub +// does not implement `sandbox ssh-config`. +// ────────────────────────────────────────────────────────────────────────── + +function makeProbeCtxFor( + tmp: string, + sandboxName: string, + contextEnv: Record = {}, +): ProbeContext { + // Write context.env so spawned bash scripts that source the + // wrapper can pick up E2E_SANDBOX_NAME if needed. + const lines = Object.entries({ E2E_SANDBOX_NAME: sandboxName, ...contextEnv }) + .map(([k, v]) => `${k}=${v}`) + .join("\n"); + fs.writeFileSync(path.join(tmp, "context.env"), lines + "\n"); + return { + contextDir: tmp, + evidencePath: path.join(tmp, "probe-evidence.json"), + contextEnv: { E2E_SANDBOX_NAME: sandboxName, ...contextEnv }, + sandboxName, + gatewayUrl: null, + repoRoot: REPO_ROOT, + }; +} + +describe("shieldsConfigProbe", () => { + it("passes_when_shields_status_matches_expected_and_perms_match_state", async () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "shields-probe-pass-")); + const fakeBin = path.join(tmp, "bin"); + fs.mkdirSync(fakeBin); + fs.writeFileSync( + path.join(fakeBin, "nemoclaw"), + `#!/usr/bin/env bash +# nemoclaw shields status +if [[ "$2" == "shields" && "$3" == "status" ]]; then + echo "Shields: DOWN" + exit 0 +fi +exit 99 +`, + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(fakeBin, "openshell"), + `#!/usr/bin/env bash +# Stub openshell. Reject ssh-config so wrapper falls back to sandbox exec. +# Then implement 'sandbox exec --name -- ' by stripping args +# until '--' and running what's left. +if [[ "$1" == "sandbox" && "$2" == "ssh-config" ]]; then + exit 1 +fi +if [[ "$1" == "sandbox" && "$2" == "exec" ]]; then + shift 2 + while [[ "$#" -gt 0 && "$1" != "--" ]]; do shift; done + shift || true + # The 'stat -c %a %U:%G ' invocation: emit a fake permissions + # line that matches a DOWN-state sandbox config (sandbox-owned). + if [[ "$1" == "stat" ]]; then + echo "644 sandbox:sandbox" + exit 0 + fi + exit 0 +fi +exit 99 +`, + { mode: 0o755 }, + ); + const oldPath = process.env.PATH; + process.env.PATH = `${fakeBin}:${oldPath ?? ""}`; + try { + const { shieldsConfigProbe } = await import("../scenarios/probes/shields-config.ts"); + const ctx = makeProbeCtxFor(tmp, "sb1", { + E2E_AGENT: "openclaw", + E2E_SHIELDS_EXPECTED_STATE: "down", + }); + const outcome = await shieldsConfigProbe(ctx); + expect(outcome.status).toBe("passed"); + expect(outcome.message).toMatch(/shields=down/); + const ev = JSON.parse(fs.readFileSync(ctx.evidencePath, "utf8")); + expect(ev.observed).toBe("down"); + expect(ev.expected).toBe("down"); + expect(ev.permissionsLine).toBe("644 sandbox:sandbox"); + } finally { + process.env.PATH = oldPath; + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); + + it("fails_when_observed_state_disagrees_with_expected", async () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "shields-probe-mismatch-")); + const fakeBin = path.join(tmp, "bin"); + fs.mkdirSync(fakeBin); + fs.writeFileSync( + path.join(fakeBin, "nemoclaw"), + `#!/usr/bin/env bash +if [[ "$2" == "shields" && "$3" == "status" ]]; then + echo "Shields: UP" + exit 0 +fi +exit 99 +`, + { mode: 0o755 }, + ); + const oldPath = process.env.PATH; + process.env.PATH = `${fakeBin}:${oldPath ?? ""}`; + try { + const { shieldsConfigProbe } = await import("../scenarios/probes/shields-config.ts"); + const ctx = makeProbeCtxFor(tmp, "sb1", { + E2E_AGENT: "openclaw", + E2E_SHIELDS_EXPECTED_STATE: "down", + }); + const outcome = await shieldsConfigProbe(ctx); + expect(outcome.status).toBe("failed"); + expect(outcome.message).toMatch(/expected shields 'down', observed 'up'/); + } finally { + process.env.PATH = oldPath; + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); + + it("fails_when_perms_dont_match_observed_state", async () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "shields-probe-perms-")); + const fakeBin = path.join(tmp, "bin"); + fs.mkdirSync(fakeBin); + fs.writeFileSync( + path.join(fakeBin, "nemoclaw"), + `#!/usr/bin/env bash +if [[ "$2" == "shields" && "$3" == "status" ]]; then + # Shields claim UP, but the stub openshell will report sandbox-owned + # perms below — a mismatch the probe must catch. + echo "Shields: UP" + exit 0 +fi +exit 99 +`, + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(fakeBin, "openshell"), + `#!/usr/bin/env bash +if [[ "$1" == "sandbox" && "$2" == "ssh-config" ]]; then exit 1; fi +if [[ "$1" == "sandbox" && "$2" == "exec" ]]; then + shift 2 + while [[ "$#" -gt 0 && "$1" != "--" ]]; do shift; done + shift || true + # Sandbox-owned perms: would pass for DOWN, must FAIL for UP. + echo "644 sandbox:sandbox" + exit 0 +fi +exit 99 +`, + { mode: 0o755 }, + ); + const oldPath = process.env.PATH; + process.env.PATH = `${fakeBin}:${oldPath ?? ""}`; + try { + const { shieldsConfigProbe } = await import("../scenarios/probes/shields-config.ts"); + // Don't declare expected state — the probe should still fail on + // perms-vs-observed mismatch alone. + const ctx = makeProbeCtxFor(tmp, "sb1", { E2E_AGENT: "openclaw" }); + const outcome = await shieldsConfigProbe(ctx); + expect(outcome.status).toBe("failed"); + expect(outcome.message).toMatch(/shields are 'up' but .* permissions are/); + } finally { + process.env.PATH = oldPath; + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); +}); + +describe("networkPolicyProbe", () => { + function fakeOpenshellEmittingHttpStatus( + binDir: string, + httpStatus: string, + curlExitCode: number = 0, + ): void { + fs.mkdirSync(binDir, { recursive: true }); + fs.writeFileSync( + path.join(binDir, "openshell"), + `#!/usr/bin/env bash +# Opt out of ssh-config; force wrapper to use 'sandbox exec' fallback. +if [[ "$1" == "sandbox" && "$2" == "ssh-config" ]]; then exit 1; fi +if [[ "$1" == "sandbox" && "$2" == "exec" ]]; then + shift 2 + while [[ "$#" -gt 0 && "$1" != "--" ]]; do shift; done + shift || true + # We're being asked to run curl inside the sandbox. Emit the test's + # chosen status to stdout (mirrors curl -w '%{http_code}') and exit + # with the test's chosen curl exit code. + printf '%s' "${httpStatus}" + exit ${curlExitCode} +fi +exit 99 +`, + { mode: 0o755 }, + ); + } + + it("passes_when_blocked_url_returns_403", async () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "netpolicy-probe-403-")); + fakeOpenshellEmittingHttpStatus(path.join(tmp, "bin"), "403", 0); + const oldPath = process.env.PATH; + process.env.PATH = `${path.join(tmp, "bin")}:${oldPath ?? ""}`; + try { + const { networkPolicyProbe } = await import("../scenarios/probes/network-policy.ts"); + const ctx = makeProbeCtxFor(tmp, "sb1"); + const outcome = await networkPolicyProbe(ctx); + expect(outcome.status).toBe("passed"); + expect(outcome.message).toMatch(/blocked .*http_code=403/); + } finally { + process.env.PATH = oldPath; + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); + + it("passes_when_curl_exits_nonzero_and_no_http_response", async () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "netpolicy-probe-conn-")); + // curl exit 7 = couldn't connect; status '000' = no HTTP response. + fakeOpenshellEmittingHttpStatus(path.join(tmp, "bin"), "000", 7); + const oldPath = process.env.PATH; + process.env.PATH = `${path.join(tmp, "bin")}:${oldPath ?? ""}`; + try { + const { networkPolicyProbe } = await import("../scenarios/probes/network-policy.ts"); + const ctx = makeProbeCtxFor(tmp, "sb1"); + const outcome = await networkPolicyProbe(ctx); + expect(outcome.status).toBe("passed"); + expect(outcome.message).toMatch(/curl exit 7/); + } finally { + process.env.PATH = oldPath; + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); + + it("fails_when_blocked_url_returns_200", async () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "netpolicy-probe-200-")); + fakeOpenshellEmittingHttpStatus(path.join(tmp, "bin"), "200", 0); + const oldPath = process.env.PATH; + process.env.PATH = `${path.join(tmp, "bin")}:${oldPath ?? ""}`; + try { + const { networkPolicyProbe } = await import("../scenarios/probes/network-policy.ts"); + const ctx = makeProbeCtxFor(tmp, "sb1"); + const outcome = await networkPolicyProbe(ctx); + expect(outcome.status).toBe("failed"); + expect(outcome.message).toMatch(/reachable from sandbox.*http_code=200/); + } finally { + process.env.PATH = oldPath; + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); + + it("fails_when_blocked_url_returns_401_indicating_policy_bypass", async () => { + // 401 means the request reached upstream auth, NOT that gateway + // dropped it. The probe must classify this as a policy bypass. + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "netpolicy-probe-401-")); + fakeOpenshellEmittingHttpStatus(path.join(tmp, "bin"), "401", 0); + const oldPath = process.env.PATH; + process.env.PATH = `${path.join(tmp, "bin")}:${oldPath ?? ""}`; + try { + const { networkPolicyProbe } = await import("../scenarios/probes/network-policy.ts"); + const ctx = makeProbeCtxFor(tmp, "sb1"); + const outcome = await networkPolicyProbe(ctx); + expect(outcome.status).toBe("failed"); + expect(outcome.message).toMatch(/reachable from sandbox.*http_code=401/); + } finally { + process.env.PATH = oldPath; + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); +}); + +describe("injectionBlockedProbe", () => { + // For the injection probe we need a stub openshell that simulates a + // sandbox shell honestly: pre-clean, echo back stdin, and respond + // SAFE/EXPLOITED based on whether the marker file exists. We give + // each test its own tmp dir and stub script. + function setupInjectionStub(tmp: string, exploited: boolean): { restore: () => void } { + const binDir = path.join(tmp, "bin"); + fs.mkdirSync(binDir, { recursive: true }); + // Use a state file in tmp to track whether the 'exploit' branch + // should claim the marker exists. + const stateFile = path.join(tmp, "exploit.state"); + fs.writeFileSync(stateFile, exploited ? "yes" : "no"); + fs.writeFileSync( + path.join(binDir, "openshell"), + `#!/usr/bin/env bash +state=\$(cat "${stateFile}" 2>/dev/null || echo no) +if [[ "\$1" == "sandbox" && "\$2" == "ssh-config" ]]; then exit 1; fi +if [[ "\$1" == "sandbox" && "\$2" == "exec" ]]; then + shift 2 + while [[ "\$#" -gt 0 && "\$1" != "--" ]]; do shift; done + shift || true + # Recognize the three operations the probe issues: + # 1. sh -c 'rm -f ' — cleanup; always succeeds + # 2. sh -c 'MSG=\$(cat); printf %s\\n "\$MSG"' — echo back stdin + # 3. sh -c 'test -f && echo EXPLOITED || echo SAFE' + cmd="\$*" + case "\$cmd" in + *"MSG="*"printf"*) + cat + ;; + *"test -f"*"EXPLOITED"*"SAFE"*) + if [[ "\$state" == "yes" ]]; then echo EXPLOITED; else echo SAFE; fi + ;; + *"rm -f"*) + : + ;; + *) + echo "unrecognized cmd: \$cmd" >&2 + exit 64 + ;; + esac + exit 0 +fi +exit 99 +`, + { mode: 0o755 }, + ); + const oldPath = process.env.PATH; + process.env.PATH = `${binDir}:${oldPath ?? ""}`; + return { + restore: () => { + process.env.PATH = oldPath; + }, + }; + } + + it("passes_when_payload_is_preserved_and_marker_absent", async () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "inj-probe-pass-")); + const stub = setupInjectionStub(tmp, false); + try { + const { injectionBlockedProbe } = await import("../scenarios/probes/injection-blocked.ts"); + const ctx = makeProbeCtxFor(tmp, "sb1"); + const outcome = await injectionBlockedProbe(ctx); + expect(outcome.status).toBe("passed"); + const ev = JSON.parse(fs.readFileSync(ctx.evidencePath, "utf8")); + expect(ev.payloadPreservedLiterally).toBe(true); + expect(ev.markerAbsent).toBe(true); + } finally { + stub.restore(); + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); + + it("fails_when_marker_file_was_created_indicating_command_substitution_executed", async () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "inj-probe-fail-")); + const stub = setupInjectionStub(tmp, true); + try { + const { injectionBlockedProbe } = await import("../scenarios/probes/injection-blocked.ts"); + const ctx = makeProbeCtxFor(tmp, "sb1"); + const outcome = await injectionBlockedProbe(ctx); + expect(outcome.status).toBe("failed"); + expect(outcome.message).toMatch(/marker file .* present/); + expect(outcome.message).toMatch(/command substitution executed/); + const ev = JSON.parse(fs.readFileSync(ctx.evidencePath, "utf8")); + expect(ev.markerAbsent).toBe(false); + } finally { + stub.restore(); + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); +}); diff --git a/test/e2e-scenario/framework-tests/e2e-redaction-parity.test.ts b/test/e2e-scenario/framework-tests/e2e-redaction-parity.test.ts new file mode 100644 index 0000000000..aab3b00f98 --- /dev/null +++ b/test/e2e-scenario/framework-tests/e2e-redaction-parity.test.ts @@ -0,0 +1,53 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Parity test: the framework's local secret-pattern set + * (test/e2e-scenario/scenarios/orchestrators/redaction.ts) must stay in + * lockstep with the canonical product source + * (src/lib/security/secret-patterns.ts). + * + * The framework deliberately mirrors rather than imports — see the + * "Framework-local mirror" comment in redaction.ts for why — but the + * mirror is only safe if it is actually a mirror. This test imports + * the RegExp arrays from both modules and compares them by behavior + * (`.source` + `.flags`) rather than by source-text shape, so the + * source-shape budget (ci/source-shape-test-budget.json) stays at 0. + * + * The framework-runtime decoupling is preserved: redaction.ts itself + * does not import from src/lib/security/. Only this test crosses the + * boundary, which is the entire point of a parity test. + */ + +import { describe, expect, it } from "vitest"; + +import { + CONTEXT_PATTERNS as FRAMEWORK_CONTEXT_PATTERNS, + TOKEN_PREFIX_PATTERNS as FRAMEWORK_TOKEN_PREFIX_PATTERNS, +} from "../scenarios/orchestrators/redaction.ts"; +import { + CONTEXT_PATTERNS as PRODUCT_CONTEXT_PATTERNS, + TOKEN_PREFIX_PATTERNS as PRODUCT_TOKEN_PREFIX_PATTERNS, +} from "../../../src/lib/security/secret-patterns.ts"; + +function fingerprint(patterns: readonly RegExp[]): string[] { + return patterns.map((re) => `${re.source}::${re.flags}`); +} + +describe("framework redaction parity with product source-of-truth", () => { + it("framework TOKEN_PREFIX_PATTERNS matches product TOKEN_PREFIX_PATTERNS", () => { + const framework = fingerprint(FRAMEWORK_TOKEN_PREFIX_PATTERNS); + const product = fingerprint(PRODUCT_TOKEN_PREFIX_PATTERNS); + expect(framework.length).toBeGreaterThan(0); + expect(product.length).toBeGreaterThan(0); + expect(framework).toEqual(product); + }); + + it("framework CONTEXT_PATTERNS matches product CONTEXT_PATTERNS", () => { + const framework = fingerprint(FRAMEWORK_CONTEXT_PATTERNS); + const product = fingerprint(PRODUCT_CONTEXT_PATTERNS); + expect(framework.length).toBeGreaterThan(0); + expect(product.length).toBeGreaterThan(0); + expect(framework).toEqual(product); + }); +}); diff --git a/test/e2e-scenario/framework-tests/e2e-scenario-additional-families.test.ts b/test/e2e-scenario/framework-tests/e2e-scenario-additional-families.test.ts deleted file mode 100644 index 8c2e70caae..0000000000 --- a/test/e2e-scenario/framework-tests/e2e-scenario-additional-families.test.ts +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -/** - * Phase 9: Migrate Additional Scenario Families. - * Verifies metadata for new scenarios (macOS, WSL, GPU local Ollama, Brev - * launchable, Ubuntu cloud Hermes, and the no-docker negative preflight) - * plus the deferred schema concepts (scenario-level overrides, negative - * expected state). - */ - -import { describe, it, expect } from "vitest"; -import { spawnSync } from "node:child_process"; -import fs from "node:fs"; -import os from "node:os"; -import path from "node:path"; - -import { loadMetadataFromDir } from "../runtime/resolver/load.ts"; -import { resolveScenario } from "../runtime/resolver/plan.ts"; - -const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); -const E2E_DIR = path.join(REPO_ROOT, "test/e2e-scenario"); -const RUN_SCENARIO = path.join(E2E_DIR, "runtime", "run-scenario.sh"); - -function planOnly(scenarioId: string): { stdout: string; stderr: string; status: number | null; plan: Record } { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-p9-")); - try { - const r = spawnSync("bash", [RUN_SCENARIO, scenarioId, "--plan-only"], { - env: { ...process.env, E2E_CONTEXT_DIR: tmp }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 60_000), - cwd: REPO_ROOT, - }); - let plan = {}; - const pj = path.join(tmp, "plan.json"); - if (fs.existsSync(pj)) { - plan = JSON.parse(fs.readFileSync(pj, "utf8")); - } - return { stdout: r.stdout, stderr: r.stderr, status: r.status, plan }; - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } -} - -describe("Issue 3812: inference/provider suite families", () => { - it("test_should_route_inference_suite_families_to_domain_specific_steps", () => { - const { suites } = loadMetadataFromDir(E2E_DIR); - for (const family of ["inference-routing", "inference-switch", "kimi-compatibility", "ollama-auth-proxy", "model-router"]) { - const scripts = suites.suites[family]?.steps?.map((step) => step.script ?? "") ?? []; - expect(scripts.length, family).toBeGreaterThan(0); - expect(scripts.every((script) => script.startsWith("inference/")), family).toBe(true); - expect(scripts.some((script) => !script.startsWith("inference/cloud/")), family).toBe(true); - } - }); -}); - -describe("Phase 9: additional scenario families - metadata", () => { - it("resolver should resolve all new scenarios", () => { - const meta = loadMetadataFromDir(E2E_DIR); - const ids = [ - "macos-repo-cloud-openclaw", - "wsl-repo-cloud-openclaw", - "gpu-repo-local-ollama-openclaw", - "brev-launchable-cloud-openclaw", - "ubuntu-repo-cloud-hermes", - "ubuntu-no-docker-preflight-negative", - ]; - for (const id of ids) { - const plan = resolveScenario(id, meta); - expect(plan.scenario_id).toBe(id); - expect(plan.expected_state.id).toBeTypeOf("string"); - expect(Array.isArray(plan.suites)).toBe(true); - } - }); -}); - -describe("Phase 9: macOS / WSL plan-only", () => { - it("macos scenario plan identifies macOS platform", () => { - const { status, plan } = planOnly("macos-repo-cloud-openclaw"); - expect(status).toBe(0); - const dims = (plan as { dimensions: { platform: { profile: { os?: string } } } }).dimensions; - expect(dims.platform.profile.os).toBe("macos"); - }); - - it("wsl scenario plan identifies WSL platform", () => { - const { status, plan } = planOnly("wsl-repo-cloud-openclaw"); - expect(status).toBe(0); - const dims = (plan as { dimensions: { platform: { profile: { os?: string } } } }).dimensions; - expect(dims.platform.profile.os).toBe("wsl"); - }); -}); - -describe("Phase 9: GPU local Ollama plan-only", () => { - it("runtime indicates GPU/CDI and provider is ollama", () => { - const { status, plan } = planOnly("gpu-repo-local-ollama-openclaw"); - expect(status).toBe(0); - const dims = (plan as { - dimensions: { - runtime: { profile: { gpu_runtime?: string } }; - onboarding: { profile: { provider?: string } }; - }; - }).dimensions; - expect(dims.runtime.profile.gpu_runtime).toBe("cdi"); - expect(dims.onboarding.profile.provider).toBe("ollama"); - }); -}); - -describe("Phase 9: Brev launchable scenario (overrides schema)", () => { - it("should_support_scenario_overrides_on_brev_launchable", () => { - const meta = loadMetadataFromDir(E2E_DIR); - const plan = resolveScenario("brev-launchable-cloud-openclaw", meta); - expect(plan.overrides).toBeTruthy(); - const overrides = plan.overrides as { - onboarding?: { gateway?: { bind_address?: string } }; - }; - expect(overrides?.onboarding?.gateway?.bind_address).toBeTypeOf("string"); - expect(overrides?.onboarding?.gateway?.bind_address?.length).toBeGreaterThan(0); - }); - - it("plan shows remote target, launchable install, and gateway bind override", () => { - const { status, stdout, plan } = planOnly("brev-launchable-cloud-openclaw"); - expect(status).toBe(0); - const dims = (plan as { - dimensions: { - platform: { profile: { execution_target?: string } }; - install: { id: string }; - }; - }).dimensions; - expect(dims.platform.profile.execution_target).toBe("remote"); - expect(dims.install.id).toBe("launchable"); - expect(stdout).toMatch(/Overrides:/); - expect(stdout).toMatch(/bind_address/); - }); -}); - -describe("Phase 9: negative preflight", () => { - it("should_define_preflight_failure_no_sandbox_state", () => { - const meta = loadMetadataFromDir(E2E_DIR); - const es = meta.expectedStates.expected_states["preflight-failure-no-sandbox"] as - | { - gateway?: { expected?: string }; - sandbox?: { expected?: string }; - failure?: { expected?: boolean }; - } - | undefined; - expect(es, "preflight-failure-no-sandbox should be defined").toBeTruthy(); - expect(es?.gateway?.expected).toBe("absent"); - expect(es?.sandbox?.expected).toBe("absent"); - expect(es?.failure?.expected).toBe(true); - }); - - it("negative scenario plan identifies docker missing and negative state", () => { - const { status, plan } = planOnly("ubuntu-no-docker-preflight-negative"); - expect(status).toBe(0); - const p = plan as { - dimensions: { runtime: { profile: { container_daemon?: string } } }; - expected_state: { id: string }; - expected_failure?: { - phase?: string; - error_class?: string; - message_pattern?: string; - forbidden_side_effects?: string[]; - }; - }; - expect(p.dimensions.runtime.profile.container_daemon).toBe("missing"); - expect(p.expected_state.id).toBe("preflight-failure-no-sandbox"); - expect(p.expected_failure?.phase).toBe("preflight"); - expect(p.expected_failure?.error_class).toBe("docker-missing"); - expect(p.expected_failure?.message_pattern).toBeTypeOf("string"); - expect(p.expected_failure?.forbidden_side_effects).toEqual( - expect.arrayContaining(["sandbox-created", "gateway-started", "credentials-written"]), - ); - }); -}); diff --git a/test/e2e-scenario/framework-tests/e2e-scenario-first-migration.test.ts b/test/e2e-scenario/framework-tests/e2e-scenario-first-migration.test.ts deleted file mode 100644 index 0307ca9103..0000000000 --- a/test/e2e-scenario/framework-tests/e2e-scenario-first-migration.test.ts +++ /dev/null @@ -1,102 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -/** - * Phase 6: Migrate First Scenario - ubuntu-repo-cloud-openclaw. - * Verifies resolver output, plan printout, and dry-run phase ordering. - */ - -import { describe, it, expect } from "vitest"; -import { spawnSync } from "node:child_process"; -import fs from "node:fs"; -import os from "node:os"; -import path from "node:path"; - -import { loadMetadataFromDir } from "../runtime/resolver/load.ts"; -import { resolveScenario } from "../runtime/resolver/plan.ts"; - -const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); -const E2E_DIR = path.join(REPO_ROOT, "test/e2e-scenario"); -const RUN_SCENARIO = path.join(E2E_DIR, "runtime", "run-scenario.sh"); - -describe("Phase 6: ubuntu-repo-cloud-openclaw migration", () => { - it("ubuntu_repo_cloud_openclaw_should_resolve_to_cloud_openclaw_ready", () => { - const meta = loadMetadataFromDir(E2E_DIR); - const plan = resolveScenario("ubuntu-repo-cloud-openclaw", meta); - expect(plan.expected_state.id).toBe("cloud-openclaw-ready"); - const suiteIds = plan.suites.map((s) => s.id); - expect(suiteIds).toContain("smoke"); - expect(suiteIds).toContain("inference"); - }); - - it("ubuntu_repo_cloud_openclaw_plan_should_include_setup_install_onboard", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-first-")); - try { - const r = spawnSync( - "bash", - [RUN_SCENARIO, "ubuntu-repo-cloud-openclaw", "--plan-only"], - { env: { ...process.env, E2E_CONTEXT_DIR: tmp }, encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 60_000), cwd: REPO_ROOT }, - ); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout).toMatch(/install=repo-current/); - expect(r.stdout).toMatch(/runtime=docker-running/); - expect(r.stdout).toMatch(/onboarding=cloud-openclaw/); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("ubuntu_repo_cloud_openclaw_dry_run_should_execute_phases_in_order", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-first-")); - try { - const trace = path.join(tmp, "trace.log"); - const r = spawnSync( - "bash", - [RUN_SCENARIO, "ubuntu-repo-cloud-openclaw", "--dry-run"], - { - env: { ...process.env, E2E_CONTEXT_DIR: tmp, E2E_TRACE_FILE: trace }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 60_000), - cwd: REPO_ROOT, - }, - ); - expect(r.status, r.stderr).toBe(0); - expect(fs.existsSync(trace)).toBe(true); - const contents = fs.readFileSync(trace, "utf8"); - const order = [ - "env:noninteractive", - "install:repo-current", - "onboard:cloud-openclaw", - "gateway:check", - "sandbox:check", - ]; - let pos = 0; - for (const marker of order) { - const idx = contents.indexOf(marker, pos); - expect(idx, `missing marker ${marker}. trace:\n${contents}`).toBeGreaterThanOrEqual(0); - pos = idx + marker.length; - } - // The run should also seed the context and produce plan.json. - expect(fs.existsSync(path.join(tmp, "context.env"))).toBe(true); - expect(fs.existsSync(path.join(tmp, "plan.json"))).toBe(true); - // After dry-run, suite runner should be able to execute the full - // suite sequence against the emitted context. - const suites = spawnSync( - "bash", - [path.join(E2E_DIR, "runtime", "run-suites.sh"), "smoke", "inference"], - { - env: { ...process.env, E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 60_000), - cwd: REPO_ROOT, - }, - ); - expect(suites.status, `suite stderr:${suites.stderr}\nstdout:${suites.stdout}`).toBe(0); - expect(suites.stdout).toMatch(/PASS smoke\/cli-available/); - expect(suites.stdout).toMatch(/PASS inference\/models-health/); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); -}); diff --git a/test/e2e-scenario/framework-tests/e2e-scenario-resolver.test.ts b/test/e2e-scenario/framework-tests/e2e-scenario-resolver.test.ts deleted file mode 100644 index 31965cffcb..0000000000 --- a/test/e2e-scenario/framework-tests/e2e-scenario-resolver.test.ts +++ /dev/null @@ -1,275 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -import { describe, it, expect } from "vitest"; -import { spawnSync } from "node:child_process"; -import fs from "node:fs"; -import os from "node:os"; -import path from "node:path"; -import yaml from "js-yaml"; - -import { resolveScenario, type ResolverInput } from "../runtime/resolver/plan.ts"; -import { loadMetadataFromDir, loadMetadataFromObjects } from "../runtime/resolver/load.ts"; -import { listScenarios } from "../scenarios/registry.ts"; - -const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); -const E2E_DIR = path.join(REPO_ROOT, "test/e2e-scenario"); - -function realMetadata(): ResolverInput { - return loadMetadataFromDir(E2E_DIR); -} - -describe("E2E scenario resolver", () => { - it("should_resolve_valid_scenario", () => { - const meta = realMetadata(); - const plan = resolveScenario("ubuntu-repo-cloud-openclaw", meta); - expect(plan.scenario_id).toBe("ubuntu-repo-cloud-openclaw"); - expect(plan.dimensions.platform.id).toBe("ubuntu-local"); - expect(plan.dimensions.install.id).toBe("repo-current"); - expect(plan.dimensions.runtime.id).toBe("docker-running"); - expect(plan.dimensions.onboarding.id).toBe("cloud-openclaw"); - expect(plan.expected_state.id).toBe("cloud-openclaw-ready"); - const suiteIds = plan.suites.map((s) => s.id); - expect(suiteIds).toEqual(["smoke", "inference", "credentials"]); - // each suite should carry its ordered steps with resolved scripts - expect(plan.suites[0].steps.length).toBeGreaterThan(0); - for (const s of plan.suites) { - for (const step of s.steps) { - expect(step.id).toBeTypeOf("string"); - expect(step.script).toMatch(/\.sh$/); - } - } - }); - - it("should_resolve_onboard_negative_path_migration_scenarios", () => { - const meta = realMetadata(); - const custom = resolveScenario("ubuntu-repo-cloud-openclaw-custom-policies", meta); - expect(custom.dimensions.onboarding.id).toBe("cloud-openclaw-custom-policies"); - expect(custom.expected_state.id).toBe("cloud-openclaw-custom-policies-ready"); - expect(custom.suites.map((s) => s.id)).toContain("onboarding-state"); - - const invalidKey = resolveScenario("ubuntu-invalid-nvidia-key-negative", meta); - expect(invalidKey.expected_state.config.failure).toMatchObject({ - expected: true, - stage: "onboarding", - reason: "invalid-nvidia-api-key", - exit_code: 1, - no_stack_trace: true, - }); - - const portConflict = resolveScenario("ubuntu-gateway-port-conflict-negative", meta); - expect(portConflict.expected_state.config.failure).toMatchObject({ - expected: true, - stage: "onboarding", - reason: "gateway-port-conflict", - exit_code: 1, - no_stack_trace: true, - }); - }); - - it("should_resolve_every_typed_scenario_id_through_yaml_setup_scenarios", () => { - const meta = realMetadata(); - const failures = listScenarios().flatMap((scenario) => { - try { - resolveScenario(scenario.id, meta); - return []; - } catch (error) { - return [`${scenario.id}: ${(error as Error).message}`]; - } - }); - - expect(failures, failures.join("\n")).toEqual([]); - }); - - it("should_fail_for_unknown_scenario", () => { - const meta = realMetadata(); - expect(() => resolveScenario("does-not-exist", meta)).toThrow(/does-not-exist/); - }); - - it("should_fail_for_missing_profile_reference", () => { - const meta = loadMetadataFromObjects({ - scenarios: yaml.load(` -platforms: - ubuntu-local: { os: ubuntu } -installs: - repo-current: { method: repo-checkout } -runtimes: - docker-running: { container_engine: docker } -onboarding: - cloud-openclaw: { path: cloud, agent: openclaw, provider: nvidia } -setup_scenarios: - broken: - dimensions: - platform: missing-platform - install: repo-current - runtime: docker-running - onboarding: cloud-openclaw - expected_state: some-state - suites: [smoke] -`) as object, - expectedStates: yaml.load(` -expected_states: - some-state: - gateway: { health: healthy } - sandbox: { status: running } -`) as object, - suites: yaml.load(` -suites: - smoke: - requires_state: - gateway.health: healthy - sandbox.status: running - steps: - - { id: step, script: suites/smoke/step.sh } -`) as object, - }); - expect(() => resolveScenario("broken", meta)).toThrow(/platform.*missing-platform/); - }); - - it("should_fail_for_missing_expected_state_reference", () => { - const meta = loadMetadataFromObjects({ - scenarios: yaml.load(` -platforms: { p: {} } -installs: { i: {} } -runtimes: { r: {} } -onboarding: { o: { agent: openclaw, provider: nvidia } } -setup_scenarios: - s: - dimensions: { platform: p, install: i, runtime: r, onboarding: o } - expected_state: ghost - suites: [smoke] -`) as object, - expectedStates: yaml.load(` -expected_states: - real: { gateway: { health: healthy } } -`) as object, - suites: yaml.load(` -suites: - smoke: - steps: - - { id: step, script: suites/smoke/step.sh } -`) as object, - }); - expect(() => resolveScenario("s", meta)).toThrow(/expected_state.*ghost/); - }); - - it("should_fail_for_missing_suite_reference", () => { - const meta = loadMetadataFromObjects({ - scenarios: yaml.load(` -platforms: { p: {} } -installs: { i: {} } -runtimes: { r: {} } -onboarding: { o: { agent: openclaw, provider: nvidia } } -setup_scenarios: - s: - dimensions: { platform: p, install: i, runtime: r, onboarding: o } - expected_state: real - suites: [smoke, phantom] -`) as object, - expectedStates: yaml.load(` -expected_states: - real: { gateway: { health: healthy } } -`) as object, - suites: yaml.load(` -suites: - smoke: - steps: - - { id: step, script: suites/smoke/step.sh } -`) as object, - }); - expect(() => resolveScenario("s", meta)).toThrow(/suite.*phantom/); - }); - - it("should_fail_when_suite_requires_state_incompatible_with_scenario_expected_state", () => { - const meta = loadMetadataFromObjects({ - scenarios: yaml.load(` -platforms: { p: {} } -installs: { i: {} } -runtimes: { r: {} } -onboarding: { o: { agent: openclaw, provider: nvidia } } -setup_scenarios: - s: - dimensions: { platform: p, install: i, runtime: r, onboarding: o } - expected_state: gw-unhealthy - suites: [smoke] -`) as object, - expectedStates: yaml.load(` -expected_states: - gw-unhealthy: - gateway: { health: unhealthy } - sandbox: { status: running } -`) as object, - suites: yaml.load(` -suites: - smoke: - requires_state: - gateway.health: healthy - steps: - - { id: step, script: suites/smoke/step.sh } -`) as object, - }); - expect(() => resolveScenario("s", meta)).toThrow( - /smoke.*gateway\.health.*healthy.*unhealthy/s, - ); - }); -}); - -describe("run-scenario.sh --plan-only", () => { - it("run_scenario_plan_only_should_print_plan", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-plan-")); - try { - const result = spawnSync( - "bash", - [ - path.join(E2E_DIR, "runtime", "run-scenario.sh"), - "ubuntu-repo-cloud-openclaw", - "--plan-only", - ], - { - env: { ...process.env, E2E_CONTEXT_DIR: tmp }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 60_000), - cwd: REPO_ROOT, - }, - ); - expect(result.status, result.stderr).toBe(0); - expect(result.stdout).toContain("ubuntu-repo-cloud-openclaw"); - expect(result.stdout).toContain("cloud-openclaw-ready"); - expect(result.stdout).toContain("smoke"); - expect(result.stdout).toContain("inference"); - const planJsonPath = path.join(tmp, "plan.json"); - expect(fs.existsSync(planJsonPath)).toBe(true); - const doc = JSON.parse(fs.readFileSync(planJsonPath, "utf8")); - expect(doc.scenario_id).toBe("ubuntu-repo-cloud-openclaw"); - expect(doc.expected_state.id).toBe("cloud-openclaw-ready"); - expect(Array.isArray(doc.suites)).toBe(true); - expect(doc.suites.map((s: { id: string }) => s.id)).toContain("smoke"); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("run_scenario_plan_only_should_fail_for_unknown_scenario", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-plan-")); - try { - const result = spawnSync( - "bash", - [ - path.join(E2E_DIR, "runtime", "run-scenario.sh"), - "does-not-exist", - "--plan-only", - ], - { - env: { ...process.env, E2E_CONTEXT_DIR: tmp }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 60_000), - cwd: REPO_ROOT, - }, - ); - expect(result.status).not.toBe(0); - expect(`${result.stderr}${result.stdout}`).toMatch(/does-not-exist/); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); -}); diff --git a/test/e2e-scenario/framework-tests/e2e-scenario-schema.test.ts b/test/e2e-scenario/framework-tests/e2e-scenario-schema.test.ts deleted file mode 100644 index b9768cf2dd..0000000000 --- a/test/e2e-scenario/framework-tests/e2e-scenario-schema.test.ts +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -import { describe, it, expect } from "vitest"; -import fs from "node:fs"; -import os from "node:os"; -import path from "node:path"; -import yaml from "js-yaml"; - -import { loadMetadataFromDir } from "../runtime/resolver/load.ts"; - -const E2E_DIR = path.resolve(import.meta.dirname, ".."); -const SCENARIOS_PATH = path.join(E2E_DIR, "nemoclaw_scenarios", "scenarios.yaml"); -const STATES_PATH = path.join(E2E_DIR, "nemoclaw_scenarios", "expected-states.yaml"); -const SUITES_PATH = path.join(E2E_DIR, "validation_suites", "suites.yaml"); - -type AnyRecord = Record; - -function loadYaml(p: string): AnyRecord { - const raw = fs.readFileSync(p, "utf8"); - const doc = yaml.load(raw); - if (!doc || typeof doc !== "object") { - throw new Error(`YAML file ${p} did not parse to an object`); - } - return doc as AnyRecord; -} - -describe("E2E scenario metadata schema", () => { - it("should_parse_all_metadata_files", () => { - expect(fs.existsSync(SCENARIOS_PATH)).toBe(true); - expect(fs.existsSync(STATES_PATH)).toBe(true); - expect(fs.existsSync(SUITES_PATH)).toBe(true); - expect(() => loadYaml(SCENARIOS_PATH)).not.toThrow(); - expect(() => loadYaml(STATES_PATH)).not.toThrow(); - expect(() => loadYaml(SUITES_PATH)).not.toThrow(); - }); - - it("should_have_required_top_level_sections", () => { - const scenarios = loadYaml(SCENARIOS_PATH); - expect(scenarios).toHaveProperty("platforms"); - expect(scenarios).toHaveProperty("installs"); - expect(scenarios).toHaveProperty("runtimes"); - expect(scenarios).toHaveProperty("onboarding"); - expect(scenarios).toHaveProperty("setup_scenarios"); - - const states = loadYaml(STATES_PATH); - expect(states).toHaveProperty("expected_states"); - - const suites = loadYaml(SUITES_PATH); - expect(suites).toHaveProperty("suites"); - }); - - it("should_define_initial_required_scenarios", () => { - const scenarios = loadYaml(SCENARIOS_PATH); - const setup = scenarios.setup_scenarios as AnyRecord; - expect(setup).toBeTypeOf("object"); - expect(setup).toHaveProperty("ubuntu-repo-cloud-openclaw"); - expect(setup).toHaveProperty("ubuntu-repo-cloud-hermes"); - expect(setup).toHaveProperty("gpu-repo-local-ollama-openclaw"); - }); - - it("should_use_singular_expected_state_field", () => { - const scenarios = loadYaml(SCENARIOS_PATH); - const setup = scenarios.setup_scenarios as AnyRecord; - for (const [id, entry] of Object.entries(setup)) { - const s = entry as AnyRecord; - expect(s, `scenario ${id} missing expected_state`).toHaveProperty("expected_state"); - expect(typeof s.expected_state, `scenario ${id}.expected_state must be a string`).toBe( - "string", - ); - expect( - (s as AnyRecord).expected_states, - `scenario ${id} must not have array-style expected_states`, - ).toBeUndefined(); - } - }); - - it("should_define_initial_expected_states", () => { - const states = loadYaml(STATES_PATH); - const es = states.expected_states as AnyRecord; - // Initial three states must exist; Phase 9 adds additional states - // (e.g. preflight-failure-no-sandbox) alongside their first consumer. - for (const id of [ - "cloud-openclaw-ready", - "cloud-hermes-ready", - "local-ollama-openclaw-ready", - ]) { - expect(es, `expected state ${id} should be defined`).toHaveProperty(id); - } - }); - - it("should_define_initial_suites", () => { - const suites = loadYaml(SUITES_PATH); - const s = suites.suites as AnyRecord; - for (const id of [ - "smoke", - "inference", - "credentials", - "local-ollama-inference", - "ollama-proxy", - ]) { - expect(s, `suite ${id} should be defined`).toHaveProperty(id); - } - }); - - it("platform_specific_scenarios_should_declare_runner_requirements", () => { - const scenarios = loadYaml(SCENARIOS_PATH); - const setup = scenarios.setup_scenarios as Record; - for (const id of [ - "macos-repo-cloud-openclaw", - "wsl-repo-cloud-openclaw", - "gpu-repo-local-ollama-openclaw", - "brev-launchable-cloud-openclaw", - ]) { - expect(setup[id]?.runner_requirements, `${id} missing runner requirements`).toEqual( - expect.arrayContaining([expect.any(String)]), - ); - } - }); - - it("should_reject_platform_specific_fixture_without_runner_requirements", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-schema-runner-")); - try { - fs.writeFileSync( - path.join(tmp, "scenarios.yaml"), - ` -platforms: - brev-launchable: - os: ubuntu - execution_target: remote -installs: - launchable: {} -runtimes: - docker-running: {} -onboarding: - cloud-openclaw: - agent: openclaw -setup_scenarios: - bad-brev: - dimensions: - platform: brev-launchable - install: launchable - runtime: docker-running - onboarding: cloud-openclaw - expected_state: ready - suites: [smoke] -`, - ); - fs.writeFileSync(tmp + "/expected-states.yaml", "expected_states:\n ready: {}\n"); - fs.writeFileSync(tmp + "/suites.yaml", "suites:\n smoke:\n steps: []\n"); - expect(() => loadMetadataFromDir(tmp)).toThrow(/runner_requirements|bad-brev/); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); -}); diff --git a/test/e2e-scenario/framework-tests/e2e-scenarios-workflow.test.ts b/test/e2e-scenario/framework-tests/e2e-scenarios-workflow.test.ts index 604ec1c033..106d46b339 100644 --- a/test/e2e-scenario/framework-tests/e2e-scenarios-workflow.test.ts +++ b/test/e2e-scenario/framework-tests/e2e-scenarios-workflow.test.ts @@ -95,8 +95,9 @@ jobs: "run-scenario job must use the resolved runner output", "run-scenario job missing step: Run typed scenarios in WSL", "artifact upload name must include the scenarios input", - "artifact upload must include hidden .e2e files", - "artifact upload path must include .e2e/", + "artifact upload must set include-hidden-files: false (raw context.env must not leak)", + "artifact upload path must include .e2e/actions/ (redacted action evidence)", + "artifact upload path must include .e2e/logs/ (redacted shell-step evidence)", ]), ); } finally { diff --git a/test/e2e-scenario/framework-tests/e2e-suite-runner.test.ts b/test/e2e-scenario/framework-tests/e2e-suite-runner.test.ts deleted file mode 100644 index ded16c1917..0000000000 --- a/test/e2e-scenario/framework-tests/e2e-suite-runner.test.ts +++ /dev/null @@ -1,250 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -import { describe, it, expect } from "vitest"; -import { spawnSync, type SpawnSyncReturns } from "node:child_process"; -import fs from "node:fs"; -import os from "node:os"; -import path from "node:path"; -const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); -const RUN_SUITES = path.join(REPO_ROOT, "test/e2e-scenario/runtime/run-suites.sh"); - -function runSuites(args: string[], env: Record = {}): SpawnSyncReturns { - return spawnSync("bash", [RUN_SUITES, ...args], { - env: { ...process.env, ...env }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 60_000), - cwd: REPO_ROOT, - }); -} - -function seedContext(tmp: string, values: Record): void { - fs.mkdirSync(tmp, { recursive: true }); - const ctx = Object.entries(values) - .map(([k, v]) => `${k}=${v}`) - .join("\n"); - fs.writeFileSync(path.join(tmp, "context.env"), `${ctx}\n`); -} - -function fullContext(): Record { - return { - E2E_SCENARIO: "ubuntu-repo-cloud-openclaw", - E2E_PLATFORM_OS: "ubuntu", - E2E_EXECUTION_TARGET: "local", - E2E_INSTALL_METHOD: "repo-checkout", - E2E_CONTAINER_ENGINE: "docker", - E2E_CONTAINER_DAEMON: "running", - E2E_ONBOARDING_PATH: "cloud", - E2E_AGENT: "openclaw", - E2E_PROVIDER: "nvidia", - E2E_SANDBOX_NAME: "e2e-ubuntu-repo-cloud-openclaw", - E2E_GATEWAY_URL: "http://127.0.0.1:18789", - E2E_INFERENCE_ROUTE: "inference-local", - }; -} - -describe("Issue #3810 messaging suite wiring", () => { - it("should_define_real_steps_for_messaging_provider_suites", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-messaging-suites-")); - try { - const baseContext = { - ...fullContext(), - E2E_PROVIDER: "telegram", - E2E_MESSAGING_PROVIDER: "telegram", - E2E_MESSAGING_BRIDGE_URL: "http://127.0.0.1:18789", - E2E_MESSAGING_CONFIG_CONTENT: "TELEGRAM_BOT_TOKEN=PLACEHOLDER", - }; - seedContext(tmp, baseContext); - const telegram = runSuites(["messaging-telegram"], { - E2E_CONTEXT_DIR: tmp, - E2E_DRY_RUN: "1", - }); - expect(telegram.status, `stderr:${telegram.stderr}\nstdout:${telegram.stdout}`).toBe(0); - seedContext(tmp, { - ...baseContext, - E2E_MESSAGING_PROVIDER: "discord", - E2E_MESSAGING_CONFIG_CONTENT: "DISCORD_BOT_TOKEN=PLACEHOLDER", - }); - const discord = runSuites(["messaging-discord"], { - E2E_CONTEXT_DIR: tmp, - E2E_DRY_RUN: "1", - }); - expect(discord.status, `stderr:${discord.stderr}\nstdout:${discord.stdout}`).toBe(0); - seedContext(tmp, { - ...baseContext, - E2E_MESSAGING_PROVIDER: "slack", - E2E_MESSAGING_CHANNEL: "bot", - E2E_MESSAGING_CONFIG_CONTENT: "SLACK_BOT_TOKEN=PLACEHOLDER", - }); - const slack = runSuites(["messaging-slack"], { - E2E_CONTEXT_DIR: tmp, - E2E_DRY_RUN: "1", - }); - expect(slack.status, `stderr:${slack.stderr}\nstdout:${slack.stdout}`).toBe(0); - const output = `${telegram.stdout}\n${discord.stdout}\n${slack.stdout}`; - for (const id of [ - "messaging-provider-attached", - "messaging-placeholder-configured", - "messaging-no-secret-leak", - "messaging-bridge-reachable", - "telegram-injection-safety", - "discord-gateway-path", - "slack-provider-state", - "slack.runtime-discovery", - ]) { - expect(output).toContain(id); - } - expect(output).not.toContain("cli-available"); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); -}); - -describe("run-suites.sh", () => { - it("security_credentials_suite_should_emit_stable_assertion_ids", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-security-credentials-")); - try { - seedContext(tmp, { ...fullContext(), E2E_CREDENTIALS_EXPECTED: "present" }); - const r = runSuites(["security-credentials"], { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1", HOME: tmp }); - expect(r.status, `stderr:${r.stderr}\nstdout:${r.stdout}`).toBe(0); - expect(r.stdout).toContain("post-onboard.credentials.gateway-list-redacts-values"); - expect(r.stdout).toContain("post-onboard.credentials.no-plaintext-host-store"); - expect(r.stdout).not.toMatch(/no-credentials-leaked|assert\//); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("run_suites_should_run_steps_in_declared_order", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-suite-")); - try { - seedContext(tmp, fullContext()); - const r = runSuites(["smoke"], { - E2E_CONTEXT_DIR: tmp, - E2E_DRY_RUN: "1", - }); - expect(r.status, `stderr:${r.stderr}\nstdout:${r.stdout}`).toBe(0); - // Smoke order is: cli-available, gateway-health, sandbox-listed, sandbox-shell - const order = ["cli-available", "gateway-health", "sandbox-listed", "sandbox-shell"]; - let pos = 0; - for (const marker of order) { - const idx = r.stdout.indexOf(marker, pos); - expect(idx, `missing marker ${marker} after ${pos} in:\n${r.stdout}`).toBeGreaterThanOrEqual(0); - pos = idx + marker.length; - } - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("run_suites_should_fail_on_unknown_suite", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-suite-")); - try { - seedContext(tmp, fullContext()); - const r = runSuites(["does-not-exist"], { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }); - expect(r.status).not.toBe(0); - expect(`${r.stdout}${r.stderr}`).toMatch(/does-not-exist/); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("run_suites_should_stop_on_first_failed_step", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-suite-")); - try { - seedContext(tmp, fullContext()); - // Use a fixture suites file with a failing middle step. - const fixtureSuites = path.join(tmp, "suites.yaml"); - const fixtureDir = path.join(tmp, "suites", "fixture"); - fs.mkdirSync(fixtureDir, { recursive: true }); - fs.writeFileSync(path.join(fixtureDir, "00-a.sh"), "#!/usr/bin/env bash\necho A-RAN\nexit 0\n"); - fs.writeFileSync(path.join(fixtureDir, "01-b.sh"), "#!/usr/bin/env bash\necho B-RAN\nexit 1\n"); - fs.writeFileSync(path.join(fixtureDir, "02-c.sh"), "#!/usr/bin/env bash\necho C-RAN\nexit 0\n"); - fs.chmodSync(path.join(fixtureDir, "00-a.sh"), 0o755); - fs.chmodSync(path.join(fixtureDir, "01-b.sh"), 0o755); - fs.chmodSync(path.join(fixtureDir, "02-c.sh"), 0o755); - fs.writeFileSync( - fixtureSuites, - `suites: - fixture: - steps: - - { id: a, script: suites/fixture/00-a.sh } - - { id: b, script: suites/fixture/01-b.sh } - - { id: c, script: suites/fixture/02-c.sh } -`, - ); - const r = runSuites(["fixture"], { - E2E_CONTEXT_DIR: tmp, - E2E_SUITES_FILE: fixtureSuites, - E2E_SUITES_DIR: tmp, - }); - expect(r.status).not.toBe(0); - expect(r.stdout).toContain("A-RAN"); - expect(r.stdout).toContain("B-RAN"); - expect(r.stdout).not.toContain("C-RAN"); - expect(`${r.stdout}${r.stderr}`).toMatch(/FAIL.*(fixture\/b|step=b)/i); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("smoke_suite_should_require_context", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-suite-")); - try { - // No context.env written to tmp. - const r = runSuites(["smoke"], { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }); - expect(r.status).not.toBe(0); - expect(`${r.stderr}${r.stdout}`).toMatch(/context\.env|E2E_SCENARIO|missing/i); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("rebuild_and_upgrade_suites_should_emit_stable_assertion_ids_in_dry_run", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-suite-")); - try { - seedContext(tmp, fullContext()); - const r = runSuites(["rebuild", "upgrade"], { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }); - expect(r.status, `stderr:${r.stderr}\nstdout:${r.stdout}`).toBe(0); - for (const id of [ - "suite.rebuild.workspace_state_preserved", - "suite.rebuild.agent_version_upgraded", - "suite.rebuild.inference_still_works", - "suite.rebuild.policy_presets_preserved", - "suite.rebuild.hermes_config_preserved", - "suite.upgrade.sandbox_registry_preserved", - "suite.upgrade.gateway_version_upgraded", - "suite.upgrade.survivor_agent_reachable", - ]) { - expect(r.stdout).toContain(id); - } - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("smoke_and_inference_run_with_stub_context", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-suite-")); - try { - seedContext(tmp, fullContext()); - const r = runSuites(["smoke", "inference"], { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }); - expect(r.status, `stderr:${r.stderr}\nstdout:${r.stdout}`).toBe(0); - for (const id of [ - "cli-available", - "gateway-health", - "sandbox-listed", - "sandbox-shell", - "models-health", - "chat-completion", - "sandbox-inference-local", - ]) { - expect(r.stdout).toContain(id); - } - // Summary should call out PASS for each step. - expect(r.stdout).toMatch(/PASS/); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); -}); diff --git a/test/e2e-scenario/manifests/openclaw-nvidia-rebuild.yaml b/test/e2e-scenario/manifests/openclaw-nvidia-rebuild.yaml new file mode 100644 index 0000000000..b8a30d2589 --- /dev/null +++ b/test/e2e-scenario/manifests/openclaw-nvidia-rebuild.yaml @@ -0,0 +1,30 @@ +apiVersion: nemoclaw.io/v1 +kind: NemoClawInstance +metadata: + name: openclaw-nvidia-rebuild +spec: + setup: + install: + source: repo-current + runtime: + containerEngine: docker + containerDaemon: running + platform: + os: ubuntu + executionTarget: local + onboarding: + agent: openclaw + provider: nvidia + modelRoute: inference-local + policyTier: balanced + messaging: [] + # Lifecycle phase opt-in. Routes through + # nemoclaw_scenarios/lifecycle/dispatch.sh to the + # rebuild-current-version worker, which seeds a workspace marker, + # invokes `nemoclaw rebuild --yes`, and publishes the + # marker contract to runtime-phase assertions. + lifecycle: rebuild-current-version + state: + workspaceRef: default + credentialRefs: + - NVIDIA_API_KEY diff --git a/test/e2e-scenario/nemoclaw_scenarios/dispatch-action.sh b/test/e2e-scenario/nemoclaw_scenarios/dispatch-action.sh new file mode 100755 index 0000000000..5aaca1b2c1 --- /dev/null +++ b/test/e2e-scenario/nemoclaw_scenarios/dispatch-action.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Phase-action launcher for the hybrid scenario E2E framework. +# +# The phase orchestrators (EnvironmentOrchestrator, OnboardingOrchestrator) +# call this launcher to invoke a function defined in a sourced shell +# dispatcher (install/dispatch.sh or onboard/dispatch.sh). Those +# dispatchers are intentionally library-style (function definitions +# only); this script gives them a deterministic executable entrypoint +# the typed runner can spawn. +# +# Usage: +# dispatch-action.sh +# +# Examples: +# dispatch-action.sh e2e_install repo-current \ +# test/e2e-scenario/nemoclaw_scenarios/install/dispatch.sh +# +# dispatch-action.sh e2e_onboard cloud-openclaw \ +# test/e2e-scenario/nemoclaw_scenarios/onboard/dispatch.sh +# +# Environment (set by the orchestrator): +# E2E_CONTEXT_DIR artifact directory +# E2E_PHASE environment | onboarding +# E2E_ACTION_ID stable action id, used for trace/log correlation + +set -euo pipefail + +if [[ $# -lt 3 ]]; then + echo "dispatch-action.sh: usage: " >&2 + exit 2 +fi + +ACTION_FN="$1" +ACTION_ARG="$2" +DISPATCHER="$3" + +if [[ ! -f "${DISPATCHER}" ]]; then + echo "dispatch-action.sh: dispatcher script not found: ${DISPATCHER}" >&2 + exit 2 +fi + +# Source the runtime/lib helpers the dispatchers (and their workers) rely on. +RUNTIME_LIB="$(cd "$(dirname "${BASH_SOURCE[0]}")/../runtime/lib" && pwd)" +# shellcheck source=runtime/lib/env.sh +. "${RUNTIME_LIB}/env.sh" +# shellcheck source=runtime/lib/context.sh +. "${RUNTIME_LIB}/context.sh" + +# Apply the standard non-interactive env once, on the very first action of +# the run. Subsequent actions in the same run see the env via process +# inheritance. e2e_env_apply_noninteractive is idempotent. +e2e_env_apply_noninteractive +e2e_env_trace "phase:${E2E_PHASE:-unknown}/action:${E2E_ACTION_ID:-unknown}" + +# IMPORTANT: do NOT call e2e_context_init here. The TS framework +# (ScenarioRunner.seedContextEnv) is the single owner of context.env +# initialization for the run; e2e_context_init opens with `: > ctx` +# which would truncate the file and wipe seeded keys (E2E_SCENARIO, +# E2E_GATEWAY_URL, ...) that runtime assertions require. +# Workers may still call e2e_context_set to extend context.env in place. + +# Source the dispatcher last so its function definitions are in scope +# when we invoke the requested function. +# shellcheck source=/dev/null +. "${DISPATCHER}" + +if ! declare -F "${ACTION_FN}" >/dev/null 2>&1; then + echo "dispatch-action.sh: function not found in dispatcher: ${ACTION_FN}" >&2 + exit 2 +fi + +"${ACTION_FN}" "${ACTION_ARG}" diff --git a/test/e2e-scenario/nemoclaw_scenarios/expected-states.yaml b/test/e2e-scenario/nemoclaw_scenarios/expected-states.yaml deleted file mode 100644 index 8b7d95c11b..0000000000 --- a/test/e2e-scenario/nemoclaw_scenarios/expected-states.yaml +++ /dev/null @@ -1,186 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Expected state configs. -# -# Each entry describes the observable contract that must be true after -# setup/install/onboarding completes for a given scenario. Expected states -# are reusable: multiple setup scenarios can resolve to the same expected -# state when they produce the same completed environment. -# -# Schema keys are intentionally small and structural. Deeper behavior lives -# in suites; expected states answer "is the environment in the shape we -# expect?" not "does every feature still work?". -# -# Negative/preflight expected states (e.g. `preflight-failure-no-sandbox`) -# are introduced in Phase 9 alongside their first consuming scenario. - -expected_states: - cloud-openclaw-ready: - cli: - installed: true - gateway: - expected: present - health: healthy - sandbox: - expected: present - status: running - agent: openclaw - inference: - expected: available - provider: nvidia - route: inference-local - mode: gateway-routed - credentials: - expected: present - storage: gateway-managed - security: - policy_engine: supported - shields: supported - - macos-cli-ready-docker-optional: - cli: - installed: true - gateway: - expected: optional - health: optional - sandbox: - expected: optional - status: optional - agent: openclaw - inference: - expected: optional - provider: nvidia - route: inference-local - mode: gateway-routed - credentials: - expected: optional - storage: gateway-managed - security: - policy_engine: supported - shields: supported - - cloud-openclaw-custom-policies-ready: - cli: - installed: true - gateway: - expected: present - health: healthy - sandbox: - expected: present - status: running - agent: openclaw - inference: - expected: available - provider: nvidia - route: inference-local - mode: gateway-routed - credentials: - expected: present - storage: gateway-managed - onboarding_state: - provider: nvidia-prod - model: nvidia/nemotron-3-super-120b-a12b - policy_presets: npm,pypi - security: - policy_engine: supported - shields: supported - - cloud-hermes-ready: - cli: - installed: true - gateway: - expected: present - health: healthy - sandbox: - expected: present - status: running - agent: hermes - inference: - expected: available - provider: nvidia - route: inference-local - mode: gateway-routed - credentials: - expected: present - storage: gateway-managed - security: - policy_engine: supported - shields: supported - - local-ollama-openclaw-ready: - cli: - installed: true - gateway: - expected: present - health: healthy - sandbox: - expected: present - status: running - agent: openclaw - inference: - expected: available - provider: ollama - route: inference-local - mode: gateway-routed - credentials: - expected: present - storage: gateway-managed - security: - policy_engine: supported - shields: supported - - # Negative preflight state. Setup is expected to fail and the runner - # must confirm that no gateway or sandbox ghost state was left behind. - # The `expected_failure` block (added for #3608) is the structured - # contract the runner matches against; the legacy `failure` block is - # retained as a drift guard while scenarios migrate. - preflight-failure-no-sandbox: - cli: - installed: true - gateway: - expected: absent - sandbox: - expected: absent - failure: - expected: true - stage: preflight - expected_failure: - phase: preflight - error_class: docker-missing - # Docker, container, daemon, socket, or preflight - case insensitive. - message_pattern: "(?i)docker|container|daemon|socket|preflight" - forbidden_side_effects: - - sandbox-created - - gateway-started - - credentials-written - - onboarding-failure-invalid-nvidia-key: - cli: - installed: true - gateway: - expected: absent - sandbox: - expected: absent - failure: - expected: true - stage: onboarding - reason: invalid-nvidia-api-key - exit_code: 1 - message_contains: Invalid NVIDIA API key. Must start with nvapi- - no_stack_trace: true - - onboarding-failure-gateway-port-conflict: - cli: - installed: true - gateway: - expected: absent - sandbox: - expected: absent - failure: - expected: true - stage: onboarding - reason: gateway-port-conflict - exit_code: 1 - message_contains: Port 18080 is not available - no_stack_trace: true diff --git a/test/e2e-scenario/nemoclaw_scenarios/fixtures/older-base-image.sh b/test/e2e-scenario/nemoclaw_scenarios/fixtures/older-base-image.sh index 3d49c03116..d10fbd2c9d 100755 --- a/test/e2e-scenario/nemoclaw_scenarios/fixtures/older-base-image.sh +++ b/test/e2e-scenario/nemoclaw_scenarios/fixtures/older-base-image.sh @@ -12,8 +12,6 @@ # older_base_image_prepare [--registry ghcr.io/nvidia/nemoclaw] # Writes a minimal Dockerfile to a temp location whose first line is # `FROM :`, and prints the Dockerfile path on stdout. -# Honors E2E_DRY_RUN: skips the `docker pull` step (but still writes -# the Dockerfile, which is what callers inspect). # older_base_image_cleanup # Removes the generated Dockerfile and (if present) its build context. @@ -50,11 +48,9 @@ LABEL nemoclaw.e2e.fixture=older-base-image EOF e2e_env_trace "fixture:older-base-image" "${registry}:${tag}" - if ! e2e_env_is_dry_run; then - if command -v docker >/dev/null 2>&1; then - docker pull "${registry}:${tag}" >&2 \ - || echo "older_base_image_prepare: docker pull failed (continuing; build may still succeed on cached layers)" >&2 - fi + if command -v docker >/dev/null 2>&1; then + docker pull "${registry}:${tag}" >&2 \ + || echo "older_base_image_prepare: docker pull failed (continuing; build may still succeed on cached layers)" >&2 fi printf '%s\n' "${dockerfile}" } diff --git a/test/e2e-scenario/nemoclaw_scenarios/install/dispatch.sh b/test/e2e-scenario/nemoclaw_scenarios/install/dispatch.sh index 7ea798cfdf..1a2ec2b0aa 100755 --- a/test/e2e-scenario/nemoclaw_scenarios/install/dispatch.sh +++ b/test/e2e-scenario/nemoclaw_scenarios/install/dispatch.sh @@ -4,7 +4,7 @@ # # Install dispatcher. Routes by install-method / profile id to one of four # split helpers (repo-current.sh, public-curl.sh, ollama.sh, -# launchable.sh). Honors E2E_DRY_RUN. +# launchable.sh). # # Accepts both legacy install-method names (repo-checkout, # curl-install-script) and the new profile-centric names used by diff --git a/test/e2e-scenario/nemoclaw_scenarios/install/launchable.sh b/test/e2e-scenario/nemoclaw_scenarios/install/launchable.sh index 5ec638e90a..09d8aa3bbb 100755 --- a/test/e2e-scenario/nemoclaw_scenarios/install/launchable.sh +++ b/test/e2e-scenario/nemoclaw_scenarios/install/launchable.sh @@ -18,11 +18,6 @@ _E2E_INST_LNCH_RUNTIME_LIB="$(cd "${_E2E_INST_LNCH_DIR}/../../runtime/lib" && pw e2e_install_launchable() { e2e_env_trace "install-launchable" - if e2e_env_is_dry_run; then - echo "[dry-run] install-launchable (skipped)" - return 0 - fi - # Match nightly launchable-smoke-e2e: exercise the launchable bootstrap # script on the current runner instead of assuming a pre-provisioned Brev VM. # The script has no Brev API dependency; it installs Docker/OpenShell/NemoClaw diff --git a/test/e2e-scenario/nemoclaw_scenarios/install/ollama.sh b/test/e2e-scenario/nemoclaw_scenarios/install/ollama.sh index a9d5f81c14..449eae519a 100755 --- a/test/e2e-scenario/nemoclaw_scenarios/install/ollama.sh +++ b/test/e2e-scenario/nemoclaw_scenarios/install/ollama.sh @@ -17,10 +17,6 @@ _E2E_INST_OL_RUNTIME_LIB="$(cd "${_E2E_INST_OL_DIR}/../../runtime/lib" && pwd)" e2e_install_ollama() { e2e_env_trace "install-ollama" - if e2e_env_is_dry_run; then - echo "[dry-run] install-ollama (skipped)" - return 0 - fi local ollama_url="${E2E_OLLAMA_INSTALL_URL:-https://ollama.ai/install.sh}" if ! command -v ollama >/dev/null 2>&1; then if ! curl -fsSL --retry 3 --retry-delay 2 "${ollama_url}" | bash; then diff --git a/test/e2e-scenario/nemoclaw_scenarios/install/public-curl.sh b/test/e2e-scenario/nemoclaw_scenarios/install/public-curl.sh index 143d097f0d..6628e332a2 100755 --- a/test/e2e-scenario/nemoclaw_scenarios/install/public-curl.sh +++ b/test/e2e-scenario/nemoclaw_scenarios/install/public-curl.sh @@ -16,10 +16,6 @@ _E2E_INST_CURL_RUNTIME_LIB="$(cd "${_E2E_INST_CURL_DIR}/../../runtime/lib" && pw e2e_install_curl() { e2e_env_trace "install-curl" - if e2e_env_is_dry_run; then - echo "[dry-run] install-curl (skipped)" - return 0 - fi local url="${E2E_INSTALLER_URL:-https://raw.githubusercontent.com/NVIDIA/NemoClaw/main/scripts/install.sh}" local sha256="${E2E_INSTALLER_SHA256:-}" local tmp diff --git a/test/e2e-scenario/nemoclaw_scenarios/install/repo-current.sh b/test/e2e-scenario/nemoclaw_scenarios/install/repo-current.sh index 8c985dc3f7..000431a4b8 100755 --- a/test/e2e-scenario/nemoclaw_scenarios/install/repo-current.sh +++ b/test/e2e-scenario/nemoclaw_scenarios/install/repo-current.sh @@ -5,7 +5,6 @@ # Install from a checked-out repo (repo-current / repo-checkout profile). # # Split from the install dispatcher to keep scenario setup logic flat and to -# make the per-profile code discoverable by grep. Honors E2E_DRY_RUN. _E2E_INST_REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" _E2E_INST_REPO_RUNTIME_LIB="$(cd "${_E2E_INST_REPO_DIR}/../../runtime/lib" && pwd)" @@ -16,10 +15,6 @@ _E2E_INST_REPO_RUNTIME_LIB="$(cd "${_E2E_INST_REPO_DIR}/../../runtime/lib" && pw e2e_install_repo() { e2e_env_trace "install-repo" - if e2e_env_is_dry_run; then - echo "[dry-run] install-repo (skipped)" - return 0 - fi local repo_root repo_root="$(cd "${_E2E_INST_REPO_DIR}/../../../.." && pwd)" cd "${repo_root}" || return diff --git a/test/e2e-scenario/nemoclaw_scenarios/lifecycle/dispatch.sh b/test/e2e-scenario/nemoclaw_scenarios/lifecycle/dispatch.sh new file mode 100755 index 0000000000..3cb82476cf --- /dev/null +++ b/test/e2e-scenario/nemoclaw_scenarios/lifecycle/dispatch.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Lifecycle dispatcher. Mirrors install/dispatch.sh and onboard/dispatch.sh: +# sources the runtime libs and per-profile worker files, then defines +# `e2e_lifecycle()` which routes by profile id. +# +# Lifecycle workers run AFTER onboarding completes and BEFORE runtime +# assertions execute. They mutate sandbox state (rebuild, upgrade, +# snapshot, ...) and seed context.env keys that runtime assertions in +# validation_suites/lib/rebuild_upgrade.sh consume: +# +# E2E_REBUILD_MARKER_PATH absolute path to the workspace marker +# the worker wrote before rebuild +# E2E_REBUILD_MARKER_EXPECTED exact content of that marker +# E2E_OLD_AGENT_VERSION (optional) version present pre-rebuild +# E2E_AGENT_VERSION_COMMAND (optional) sandbox command to read the +# current agent version +# +# Adding a new profile: +# 1. Drop a worker file here (e.g. snapshot-restore.sh) that defines +# `e2e_lifecycle_`. +# 2. Source it below. +# 3. Add the case branch in e2e_lifecycle(). +# 4. Register the profile id in LIFECYCLE_PROFILE_SECRET_ENV in +# scenarios/compiler.ts so secret env routing keeps working. + +_E2E_LIFECYCLE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +_E2E_LIFECYCLE_RUNTIME_LIB="$(cd "${_E2E_LIFECYCLE_DIR}/../../runtime/lib" && pwd)" +# shellcheck source=../../runtime/lib/env.sh +. "${_E2E_LIFECYCLE_RUNTIME_LIB}/env.sh" +# shellcheck source=../../runtime/lib/context.sh +. "${_E2E_LIFECYCLE_RUNTIME_LIB}/context.sh" +# shellcheck source=rebuild-current-version.sh +. "${_E2E_LIFECYCLE_DIR}/rebuild-current-version.sh" + +e2e_lifecycle() { + local profile="${1:-}" + if [[ -z "${profile}" ]]; then + echo "e2e_lifecycle: missing lifecycle profile id" >&2 + return 2 + fi + e2e_env_trace "lifecycle:${profile}" + case "${profile}" in + rebuild-current-version) + e2e_lifecycle_rebuild_current_version + ;; + *) + echo "e2e_lifecycle: unsupported lifecycle profile: ${profile}" >&2 + return 2 + ;; + esac +} diff --git a/test/e2e-scenario/nemoclaw_scenarios/lifecycle/rebuild-current-version.sh b/test/e2e-scenario/nemoclaw_scenarios/lifecycle/rebuild-current-version.sh new file mode 100755 index 0000000000..359645754a --- /dev/null +++ b/test/e2e-scenario/nemoclaw_scenarios/lifecycle/rebuild-current-version.sh @@ -0,0 +1,132 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Lifecycle worker: rebuild-current-version. +# +# Drives the workspace-state-preservation invariant from +# test/e2e/test-rebuild-openclaw.sh, scoped to the rebuild trigger and +# the contract the runtime-phase rebuild_upgrade.sh assertions consume. +# The legacy test additionally exercised the version-upgrade path +# (build OLD-version base image, create sandbox from it, then rebuild +# to current). That dimension belongs to a future +# `rebuild-from-old-version` lifecycle profile and is intentionally +# out of scope here: this profile validates that +# `nemoclaw rebuild --yes` preserves workspace state across +# a rebuild, which is the core invariant the rebuild_upgrade.sh +# assertions assert. +# +# Sequence: +# 1. Read E2E_SANDBOX_NAME from the context the onboarding phase +# already populated. +# 2. Snapshot the current agent version (informational; runtime +# assertions accept an empty E2E_OLD_AGENT_VERSION as a vacuous +# pass on the version-upgraded check, which is the right default +# until the old-version profile lands). +# 3. Write a unique marker into /sandbox/.openclaw/workspace via the +# canonical e2e_sandbox_exec wrapper. Path mirrors the legacy +# test's MARKER_FILE so the read-side assertion stays unchanged. +# 4. Verify the marker is readable post-write (catch silent write +# failures before rebuild kicks off). +# 5. Run `nemoclaw rebuild --yes` and capture the output. +# 6. Seed E2E_REBUILD_MARKER_PATH and E2E_REBUILD_MARKER_EXPECTED in +# context.env so the runtime-phase +# rebuild_upgrade_assert_marker_preserved assertion can read them. +# 7. Optionally seed E2E_AGENT_VERSION_COMMAND so the version-check +# assertion uses the in-sandbox `openclaw --version` invocation. + +# Source the canonical sandbox-exec wrapper so this worker inherits the +# ssh-config preferred / openshell-exec fallback transport without +# re-implementing the routing logic. +_E2E_LIFECYCLE_RC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +_E2E_LIFECYCLE_RC_VALIDATION_SUITES="$(cd "${_E2E_LIFECYCLE_RC_DIR}/../../validation_suites" && pwd)" +# shellcheck source=../../validation_suites/sandbox-exec.sh +. "${_E2E_LIFECYCLE_RC_VALIDATION_SUITES}/sandbox-exec.sh" + +# Marker file path inside the sandbox. Mirrors the legacy +# test-rebuild-openclaw.sh MARKER_FILE so cross-check against the +# legacy contract stays apples-to-apples. +LIFECYCLE_REBUILD_MARKER_PATH="/sandbox/.openclaw/workspace/rebuild-marker.txt" + +e2e_lifecycle_rebuild_current_version() { + e2e_env_apply_noninteractive + + local sandbox_name marker_content rc=0 + sandbox_name="$(e2e_context_get E2E_SANDBOX_NAME)" + if [[ -z "${sandbox_name}" ]]; then + echo "lifecycle:rebuild-current-version: E2E_SANDBOX_NAME missing in context" >&2 + return 1 + fi + # Random suffix prevents marker-content collisions across re-runs that + # somehow inherit a partially-rebuilt sandbox; the timestamp keeps the + # value greppable in logs. + marker_content="REBUILD_LIFECYCLE_$(date +%s)_${RANDOM}" + + echo "lifecycle:rebuild-current-version: sandbox=${sandbox_name}" + echo "lifecycle:rebuild-current-version: marker_path=${LIFECYCLE_REBUILD_MARKER_PATH}" + echo "lifecycle:rebuild-current-version: marker_content=${marker_content}" + + # Step 2: snapshot current version (best-effort; vacuous if it fails). + local pre_rebuild_version="" + if pre_rebuild_version="$( + E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=30 \ + e2e_sandbox_exec "${sandbox_name}" -- bash -lc 'openclaw --version 2>/dev/null || true' + )"; then + echo "lifecycle:rebuild-current-version: pre_rebuild_version=${pre_rebuild_version}" + fi + + # Step 3: write the marker file. + if ! E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=30 \ + e2e_sandbox_exec "${sandbox_name}" -- sh -c \ + "mkdir -p '$(dirname "${LIFECYCLE_REBUILD_MARKER_PATH}")' && printf '%s' '${marker_content}' > '${LIFECYCLE_REBUILD_MARKER_PATH}'"; then + echo "lifecycle:rebuild-current-version: failed to write marker into sandbox" >&2 + return 1 + fi + + # Step 4: verify marker readable pre-rebuild. This catches sandbox + # filesystem oddities (read-only mounts, perms) before we waste the + # rebuild cycle. + local verify_content="" + verify_content="$( + E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=30 \ + e2e_sandbox_exec "${sandbox_name}" -- cat "${LIFECYCLE_REBUILD_MARKER_PATH}" + )" || rc=$? + if [[ "${rc}" -ne 0 || "${verify_content}" != "${marker_content}" ]]; then + echo "lifecycle:rebuild-current-version: marker readback mismatch (got '${verify_content}', expected '${marker_content}')" >&2 + return 1 + fi + echo "lifecycle:rebuild-current-version: marker seeded and verified" + + # Step 5: trigger the rebuild. Match the legacy contract: + # `--yes` to skip the confirmation prompt; `--verbose` to surface + # progress in the action log so failures are diagnosable from the + # artifact bundle alone. + echo "lifecycle:rebuild-current-version: invoking nemoclaw ${sandbox_name} rebuild --yes --verbose" + if ! nemoclaw "${sandbox_name}" rebuild --yes --verbose; then + rc=$? + echo "lifecycle:rebuild-current-version: nemoclaw rebuild exited ${rc}" >&2 + return "${rc}" + fi + echo "lifecycle:rebuild-current-version: rebuild completed" + + # Step 6: publish the marker contract to runtime-phase assertions. + e2e_context_set E2E_REBUILD_MARKER_PATH "${LIFECYCLE_REBUILD_MARKER_PATH}" + e2e_context_set E2E_REBUILD_MARKER_EXPECTED "${marker_content}" + # Step 7: tell the version-check assertion how to read the agent + # version inside the sandbox. The default in rebuild_upgrade.sh is + # already `openclaw --version`, but seeding it explicitly makes the + # contract obvious in context.env when artifacts are inspected. + e2e_context_set E2E_AGENT_VERSION_COMMAND "openclaw --version" + if [[ -n "${pre_rebuild_version}" ]]; then + # Only set E2E_OLD_AGENT_VERSION when we actually captured a + # non-empty pre-rebuild version. The version-upgraded assertion + # treats an empty value as "no comparison required" and passes + # vacuously, which is the correct behavior for the + # rebuild-current-version profile (no upgrade is expected; we are + # only validating workspace preservation). + e2e_context_set E2E_OLD_AGENT_VERSION "${pre_rebuild_version}" + fi + + echo "lifecycle:rebuild-current-version: context.env updated" + return 0 +} diff --git a/test/e2e-scenario/nemoclaw_scenarios/onboard/cloud-openclaw-no-docker.sh b/test/e2e-scenario/nemoclaw_scenarios/onboard/cloud-openclaw-no-docker.sh new file mode 100755 index 0000000000..9c7b9803f1 --- /dev/null +++ b/test/e2e-scenario/nemoclaw_scenarios/onboard/cloud-openclaw-no-docker.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Onboard worker: cloud-openclaw-no-docker profile. +# +# Drives the negative `ubuntu-no-docker-preflight-negative` scenario by: +# +# 1. Installing a `docker` shim earlier on PATH that exits non-zero +# with a "Cannot connect to the Docker daemon" message. This makes +# `commandExists("docker")` succeed (the binary is present) while +# `docker info` fails — matching the production failure mode users +# see when Docker is installed but the daemon is not running. +# +# 2. Running `nemoclaw onboard --non-interactive` with stdout+stderr +# captured to `${E2E_CONTEXT_DIR}/negative-preflight.log`. The +# `onboarding.preflight.expected-failed` assertion greps that file. +# +# 3. Asserting that nemoclaw exits non-zero (preflight DID fail). If +# onboard unexpectedly succeeds, the action fails so the operator +# sees a clear "expected failure did not happen" signal instead of a +# green light masking a regression. +# +# 4. Returning 0 on the *expected* failure path so the orchestrator +# reports the action as passed and the assertion phase runs against +# the captured log. Without this, the action would be marked failed +# and the dependent assertions would be skipped. +# +# Pattern mirrors test/e2e/e2e-cloud-experimental/test-port8080-conflict.sh, +# which sets up a different failure condition (port 8080 occupied) but +# follows the same capture-output / check-exit / grep-log shape. + +e2e_onboard_cloud_openclaw_no_docker() { + e2e_env_apply_noninteractive + e2e_context_init + + local log shim_dir rc=0 + log="${E2E_CONTEXT_DIR}/negative-preflight.log" + shim_dir="$(mktemp -d -t e2e-no-docker-XXXXXX)" + + cat >"${shim_dir}/docker" <<'SHIM' +#!/usr/bin/env bash +# Negative-preflight docker shim — preserves "docker is installed" while +# breaking "docker info" / "docker version" so preflight fails with the +# real "Cannot connect to the Docker daemon" message. +printf 'Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?\n' >&2 +exit 1 +SHIM + chmod +x "${shim_dir}/docker" + + echo "negative-preflight: shim docker installed at ${shim_dir}/docker" + echo "negative-preflight: log_file=${log}" + echo "negative-preflight: invoking nemoclaw onboard --non-interactive (expected to fail at preflight)" + + PATH="${shim_dir}:${PATH}" \ + nemoclaw onboard --non-interactive --yes-i-accept-third-party-software \ + >"${log}" 2>&1 || rc=$? + + rm -rf "${shim_dir}" + + echo "negative-preflight: nemoclaw onboard exited ${rc}" + if [[ -f "${log}" ]]; then + echo "--- captured log tail (${log}) ---" + tail -50 "${log}" 2>/dev/null || true + echo "--- end captured log ---" + fi + + if [[ "${rc}" -eq 0 ]]; then + echo "negative-preflight: ERROR: nemoclaw onboard unexpectedly exited 0; preflight should have failed when docker is unreachable" >&2 + return 1 + fi + + return 0 +} diff --git a/test/e2e-scenario/nemoclaw_scenarios/onboard/dispatch.sh b/test/e2e-scenario/nemoclaw_scenarios/onboard/dispatch.sh index 2baf698986..fba1004559 100755 --- a/test/e2e-scenario/nemoclaw_scenarios/onboard/dispatch.sh +++ b/test/e2e-scenario/nemoclaw_scenarios/onboard/dispatch.sh @@ -14,6 +14,8 @@ _E2E_ONBOARD_RUNTIME_LIB="$(cd "${_E2E_ONBOARD_DIR}/../../runtime/lib" && pwd)" . "${_E2E_ONBOARD_RUNTIME_LIB}/context.sh" # shellcheck source=cloud-openclaw.sh . "${_E2E_ONBOARD_DIR}/cloud-openclaw.sh" +# shellcheck source=cloud-openclaw-no-docker.sh +. "${_E2E_ONBOARD_DIR}/cloud-openclaw-no-docker.sh" # shellcheck source=cloud-hermes.sh . "${_E2E_ONBOARD_DIR}/cloud-hermes.sh" # shellcheck source=local-ollama-openclaw.sh @@ -26,14 +28,13 @@ e2e_onboard() { return 2 fi e2e_env_trace "onboard:${profile}" - if e2e_env_is_dry_run; then - echo "[dry-run] onboard profile=${profile} (skipped)" - return 0 - fi case "${profile}" in cloud-openclaw) e2e_onboard_cloud_openclaw ;; + cloud-openclaw-no-docker) + e2e_onboard_cloud_openclaw_no_docker + ;; cloud-openclaw-custom-policies) E2E_ONBOARDING_MODEL="${E2E_ONBOARDING_MODEL:-nvidia/nemotron-3-super-120b-a12b}" E2E_ONBOARDING_POLICY_PRESETS="${E2E_ONBOARDING_POLICY_PRESETS:-npm,pypi}" diff --git a/test/e2e-scenario/nemoclaw_scenarios/probes/cli-installed.sh b/test/e2e-scenario/nemoclaw_scenarios/probes/cli-installed.sh new file mode 100755 index 0000000000..77b773e3e6 --- /dev/null +++ b/test/e2e-scenario/nemoclaw_scenarios/probes/cli-installed.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Probe: cli-installed +# +# Asserts that the nemoclaw CLI is reachable on PATH after the +# environment phase's install action completed. + +set -euo pipefail + +if ! command -v nemoclaw >/dev/null 2>&1; then + echo "probe cli-installed: nemoclaw not found on PATH (PATH=${PATH})" >&2 + exit 1 +fi + +# Resolve to a real binary; aliases or shell functions don't count. +nemoclaw_bin="$(command -v nemoclaw)" +if [[ ! -x "${nemoclaw_bin}" ]]; then + echo "probe cli-installed: nemoclaw resolved to non-executable: ${nemoclaw_bin}" >&2 + exit 1 +fi + +printf 'probe cli-installed: ok (%s)\n' "${nemoclaw_bin}" +exit 0 diff --git a/test/e2e-scenario/nemoclaw_scenarios/probes/dispatch.sh b/test/e2e-scenario/nemoclaw_scenarios/probes/dispatch.sh new file mode 100755 index 0000000000..84db7e7fa1 --- /dev/null +++ b/test/e2e-scenario/nemoclaw_scenarios/probes/dispatch.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# State-validation probe dispatcher. +# +# Each probe is a small bash script in this directory invoked by the +# typed StateValidationOrchestrator via the shared dispatch-action.sh +# launcher. The orchestrator owns timeouts, redaction, evidence +# logging, and pass/fail attribution; probes only return 0 (probe +# satisfied) or non-zero with a human-readable message on stderr. +# +# Probes consult ${E2E_CONTEXT_DIR}/context.env for runtime values +# (E2E_GATEWAY_URL, E2E_SANDBOX_NAME) seeded by the framework and +# extended by onboarding. +# +# Library style: dispatch.sh defines a single dispatch function +# (e2e_state_probe) that runs the named probe. The TS phase-action +# uses fn=e2e_state_probe arg=. + +_E2E_PROBES_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# e2e_state_probe +e2e_state_probe() { + local id="$1" + if [[ -z "${id}" ]]; then + echo "e2e_state_probe: missing probe id" >&2 + return 2 + fi + local probe_script="${_E2E_PROBES_DIR}/${id}.sh" + if [[ ! -f "${probe_script}" ]]; then + echo "e2e_state_probe: unknown probe id '${id}' (no script at ${probe_script})" >&2 + return 2 + fi + e2e_env_trace "probe:${id}" + # Probes run in a subshell so a `set -e` failure inside one probe + # does not affect another action in the same orchestrator process. + ( + # shellcheck source=/dev/null + . "${probe_script}" + ) +} diff --git a/test/e2e-scenario/nemoclaw_scenarios/probes/gateway-absent.sh b/test/e2e-scenario/nemoclaw_scenarios/probes/gateway-absent.sh new file mode 100755 index 0000000000..67dc0d9d2f --- /dev/null +++ b/test/e2e-scenario/nemoclaw_scenarios/probes/gateway-absent.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Probe: gateway-absent +# +# Negative-state probe. Asserts that no gateway was started by a +# scenario whose expected_state declares gateway.expected=absent +# (preflight failure, invalid-key onboarding failure, +# gateway-port-conflict onboarding failure). This is the typed +# replacement for the runtime.expected-failure.no-side-effects +# pending step on the gateway-started axis: a real probe that fails +# closed if the gateway IS running. + +set -euo pipefail + +# Order matters: cheap CLI status check first, then port reachability +# fallback. We deliberately do NOT rely on any single signal so a +# scenario that leaves a partially-started gateway behind cannot +# slip through. + +if command -v nemoclaw >/dev/null 2>&1; then + if nemoclaw gateway status >/dev/null 2>&1; then + echo "probe gateway-absent: nemoclaw reports gateway is running, expected absent" >&2 + nemoclaw gateway status >&2 || true + exit 1 + fi +fi + +# Best-effort URL reachability check. context.env may carry a +# gateway URL even for negative scenarios (it is computed from the +# scenario id, not from a successful onboard). +context_env="${E2E_CONTEXT_DIR:-.e2e}/context.env" +if [[ -f "${context_env}" ]]; then + url="$(awk -F= '/^E2E_GATEWAY_URL=/{print substr($0, index($0, "=")+1); exit}' "${context_env}" | tr -d '"')" + if [[ -n "${url}" ]]; then + if curl -fsS -o /dev/null --max-time 3 "${url%/}/health" 2>/dev/null; then + echo "probe gateway-absent: ${url%/}/health responded healthy, expected absent" >&2 + exit 1 + fi + fi +fi + +echo "probe gateway-absent: ok" +exit 0 diff --git a/test/e2e-scenario/nemoclaw_scenarios/probes/gateway-healthy.sh b/test/e2e-scenario/nemoclaw_scenarios/probes/gateway-healthy.sh new file mode 100755 index 0000000000..169ce4ce22 --- /dev/null +++ b/test/e2e-scenario/nemoclaw_scenarios/probes/gateway-healthy.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Probe: gateway-healthy +# +# Asserts the gateway is reachable and reports a healthy HTTP status +# at ${E2E_GATEWAY_URL}/health (with fallback to the base URL). Mirrors +# the legacy validation_suites/assert/gateway-alive.sh::e2e_gateway_assert_healthy +# contract, but is invoked as a typed phase action by the +# StateValidationOrchestrator BEFORE runtime suites run, so suite +# assertions never execute against a missing or wedged gateway. + +set -euo pipefail + +# Defer to the legacy bash helper for the actual probe logic so we keep +# a single implementation of the gateway-health contract during the +# transition. The legacy helper consults context.env for the URL. +_THIS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +GATEWAY_HELPER="$(cd "${_THIS_DIR}/../../validation_suites/assert" && pwd)/gateway-alive.sh" + +if [[ ! -f "${GATEWAY_HELPER}" ]]; then + echo "probe gateway-healthy: legacy helper not found: ${GATEWAY_HELPER}" >&2 + exit 1 +fi + +# shellcheck source=/dev/null +. "${GATEWAY_HELPER}" + +if ! e2e_gateway_assert_healthy; then + exit 1 +fi + +echo "probe gateway-healthy: ok" +exit 0 diff --git a/test/e2e-scenario/nemoclaw_scenarios/probes/sandbox-absent.sh b/test/e2e-scenario/nemoclaw_scenarios/probes/sandbox-absent.sh new file mode 100755 index 0000000000..5f08e39df0 --- /dev/null +++ b/test/e2e-scenario/nemoclaw_scenarios/probes/sandbox-absent.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Probe: sandbox-absent +# +# Negative-state probe. Asserts that no sandbox was created by a +# scenario whose expected_state declares sandbox.expected=absent +# (preflight failure, onboarding failures). Typed replacement for +# the legacy run-scenario.sh inline check +# `openshell sandbox list | grep -Fq "${sandbox_name}"`. + +set -euo pipefail + +# E2E_SANDBOX_NAME is seeded by the framework from the scenario id +# even when onboarding never completed; missing context here is a +# framework bug, not a probe pass. +if [[ -z "${E2E_SANDBOX_NAME:-}" ]]; then + context_env="${E2E_CONTEXT_DIR:-.e2e}/context.env" + if [[ -f "${context_env}" ]]; then + E2E_SANDBOX_NAME="$(awk -F= '/^E2E_SANDBOX_NAME=/{print substr($0, index($0, "=")+1); exit}' "${context_env}" | tr -d '"')" + fi +fi +if [[ -z "${E2E_SANDBOX_NAME:-}" ]]; then + echo "probe sandbox-absent: E2E_SANDBOX_NAME unset; framework did not seed context" >&2 + exit 2 +fi + +# Two independent checks — `nemoclaw list` is the user-facing surface +# and openshell-side listing covers cases where nemoclaw is uninstalled +# or wedged. Either reporting the sandbox fails the probe. +if command -v nemoclaw >/dev/null 2>&1; then + if nemoclaw list 2>/dev/null | grep -qE "(^|[[:space:]])${E2E_SANDBOX_NAME}([[:space:]]|$)"; then + echo "probe sandbox-absent: nemoclaw list reports sandbox '${E2E_SANDBOX_NAME}', expected absent" >&2 + exit 1 + fi +fi + +if command -v openshell >/dev/null 2>&1; then + if openshell sandbox list 2>/dev/null | grep -Fq "${E2E_SANDBOX_NAME}"; then + echo "probe sandbox-absent: openshell reports sandbox '${E2E_SANDBOX_NAME}', expected absent" >&2 + exit 1 + fi +fi + +echo "probe sandbox-absent: ok" +exit 0 diff --git a/test/e2e-scenario/nemoclaw_scenarios/probes/sandbox-running.sh b/test/e2e-scenario/nemoclaw_scenarios/probes/sandbox-running.sh new file mode 100755 index 0000000000..2ff4d5ded3 --- /dev/null +++ b/test/e2e-scenario/nemoclaw_scenarios/probes/sandbox-running.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Probe: sandbox-running +# +# Asserts the sandbox declared by E2E_SANDBOX_NAME (seeded by +# onboarding) is present in `nemoclaw list`. Mirrors the legacy +# validation_suites/assert/sandbox-alive.sh::e2e_sandbox_assert_running +# contract; promoted to a typed phase action so runtime suites cannot +# silently run against an absent sandbox. + +set -euo pipefail + +_THIS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SANDBOX_HELPER="$(cd "${_THIS_DIR}/../../validation_suites/assert" && pwd)/sandbox-alive.sh" + +if [[ ! -f "${SANDBOX_HELPER}" ]]; then + echo "probe sandbox-running: legacy helper not found: ${SANDBOX_HELPER}" >&2 + exit 1 +fi + +# shellcheck source=/dev/null +. "${SANDBOX_HELPER}" + +if ! e2e_sandbox_assert_running; then + exit 1 +fi + +echo "probe sandbox-running: ok" +exit 0 diff --git a/test/e2e-scenario/onboarding_assertions/preflight/00-preflight-passed.sh b/test/e2e-scenario/onboarding_assertions/preflight/00-preflight-passed.sh index 69bda6c47c..fb05606494 100755 --- a/test/e2e-scenario/onboarding_assertions/preflight/00-preflight-passed.sh +++ b/test/e2e-scenario/onboarding_assertions/preflight/00-preflight-passed.sh @@ -9,7 +9,14 @@ if [[ ! -f "${E2E_CONTEXT_DIR:-}/onboard.log" ]]; then exit 1 fi -if grep -Eiq "preflight.*(fail|error)|docker|container|daemon|socket" "${E2E_CONTEXT_DIR}/onboard.log"; then +# The onboarding action already completed (exit 0) for this assertion to +# run; we only need to confirm the captured onboard.log does not contain +# explicit preflight FAILURE markers. The previous regex matched any +# mention of 'docker' / 'container' / 'daemon' / 'socket', which a normal +# successful onboarding always logs. Tighten to actual failure phrases. +if grep -Eiq \ + "preflight[[:space:]]+(failed|error)|cannot connect to[[:space:]]+(the[[:space:]]+)?docker daemon|permission denied[[:space:]]+while trying to connect to.*docker.*sock|onboarding aborted|FATAL: docker|ERROR: docker daemon" \ + "${E2E_CONTEXT_DIR}/onboard.log"; then echo "FAIL: onboarding.preflight.passed - onboard log contains preflight failure evidence" exit 1 fi diff --git a/test/e2e-scenario/runtime/coverage-report.sh b/test/e2e-scenario/runtime/coverage-report.sh deleted file mode 100755 index 8426d0ba30..0000000000 --- a/test/e2e-scenario/runtime/coverage-report.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Render the E2E scenario coverage report as Markdown to stdout. -# -# Usage: -# bash test/e2e-scenario/runtime/coverage-report.sh > coverage.md - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" - -TSX_BIN="${REPO_ROOT}/node_modules/.bin/tsx" -if [[ -x "${TSX_BIN}" ]]; then - "${TSX_BIN}" "${SCRIPT_DIR}/resolver/index.ts" coverage -else - # CodeRabbit review items #3, #10: fall back to --no-install so we rely on - # the lockfile-pinned tsx rather than a network fetch, and fail closed - # with a clear hint if tsx is not installed. - if ! (cd "${REPO_ROOT}" && npx --no-install tsx "${SCRIPT_DIR}/resolver/index.ts" coverage); then - echo "coverage-report: tsx not available. Run 'npm ci' at the repo root to install devDependencies." >&2 - exit 1 - fi -fi diff --git a/test/e2e-scenario/runtime/lib/env.sh b/test/e2e-scenario/runtime/lib/env.sh index ed33fb8a6a..9c33af97cc 100755 --- a/test/e2e-scenario/runtime/lib/env.sh +++ b/test/e2e-scenario/runtime/lib/env.sh @@ -40,8 +40,3 @@ e2e_env_trace() { printf '%s %s\n' "${event}" "$*" >>"${E2E_TRACE_FILE}" fi } - -# e2e_env_is_dry_run: true if E2E_DRY_RUN=1 -e2e_env_is_dry_run() { - [[ "${E2E_DRY_RUN:-0}" == "1" ]] -} diff --git a/test/e2e-scenario/runtime/resolver/coverage.ts b/test/e2e-scenario/runtime/resolver/coverage.ts deleted file mode 100644 index 2a3110f40c..0000000000 --- a/test/e2e-scenario/runtime/resolver/coverage.ts +++ /dev/null @@ -1,170 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -/** - * Render a Markdown coverage report for E2E setup scenarios. - * - * Design (per the simplify pass): one primary table, one row per scenario. - * A `## Gaps` section flags scenarios without suites and expected states - * that no scenario references. Rows are sorted deterministically for - * stable CI diffs. - */ - -import type { ResolverInput } from "./load.ts"; - -export interface CoverageReportOptions { - /** Optional map of scenario id -> last known run status. */ - lastRunStatus?: Record; -} - -export function renderCoverageReport( - meta: ResolverInput, - options: CoverageReportOptions = {}, -): string { - const { scenarios, expectedStates } = meta; - const scenarioIds = Object.keys(scenarios.setup_scenarios).sort(); - const lines: string[] = []; - lines.push("# E2E Setup Scenario Coverage"); - lines.push(""); - lines.push( - "_Generated from `test/e2e-scenario/nemoclaw_scenarios/{scenarios,expected-states}.yaml` and `test/e2e-scenario/validation_suites/suites.yaml`._", - ); - lines.push(""); - lines.push("## Base Scenarios"); - lines.push(""); - lines.push("| Base | Platform | Install | Runtime | Requirements |"); - lines.push("|---|---|---|---|---|"); - for (const [id, base] of Object.entries(scenarios.base_scenarios ?? {}).sort( - ([a], [b]) => a.localeCompare(b), - )) { - lines.push( - `| ${id} | ${base.platform} | ${base.install} | ${base.runtime} | ${(base.runner_requirements ?? []).join(", ") || "_none_"} |`, - ); - } - lines.push(""); - lines.push("## Onboarding Profiles"); - lines.push(""); - lines.push("| Profile | Path | Provider | Agent | Route |"); - lines.push("|---|---|---|---|---|"); - for (const [id, profile] of Object.entries( - scenarios.onboarding_profiles ?? {}, - ).sort(([a], [b]) => a.localeCompare(b))) { - lines.push( - `| ${id} | ${profile.path ?? ""} | ${profile.provider ?? ""} | ${profile.agent ?? ""} | ${profile.inference_route ?? ""} |`, - ); - } - lines.push(""); - lines.push("## Test Plans"); - lines.push(""); - lines.push("| Plan | Base | Onboarding | Expected state | Suites |"); - lines.push("|---|---|---|---|---|"); - for (const [id, plan] of Object.entries(scenarios.test_plans ?? {}).sort( - ([a], [b]) => a.localeCompare(b), - )) { - lines.push( - `| ${id} | ${plan.base} | ${plan.onboarding} | ${plan.expected_state} | ${(plan.suites ?? []).join(", ") || "_(none)_"} |`, - ); - } - lines.push(""); - lines.push("## Suites"); - lines.push(""); - lines.push(`Total suites: ${Object.keys(meta.suites.suites).length}`); - lines.push(""); - lines.push("## Scenarios"); - lines.push(""); - const hasStatus = - options.lastRunStatus && Object.keys(options.lastRunStatus).length > 0; - const header = hasStatus - ? "| Scenario | Platform | Install | Runtime | Onboarding | Expected state | Suites | Last run |" - : "| Scenario | Platform | Install | Runtime | Onboarding | Expected state | Suites |"; - const sep = hasStatus - ? "|---|---|---|---|---|---|---|---|" - : "|---|---|---|---|---|---|---|"; - lines.push(header); - lines.push(sep); - for (const id of scenarioIds) { - const sc = scenarios.setup_scenarios[id]; - if (!sc) continue; - const suites = sc.suites ?? []; - const dimensions = sc.dimensions; - const suiteCell = suites.length === 0 ? "_(none)_" : suites.join(", "); - const row = [ - id, - dimensions?.platform ?? "", - dimensions?.install ?? "", - dimensions?.runtime ?? "", - dimensions?.onboarding ?? "", - sc.expected_state ?? "", - suiteCell, - ]; - if (hasStatus) { - row.push(options.lastRunStatus?.[id] ?? "_unknown_"); - } - lines.push(`| ${row.join(" | ")} |`); - } - lines.push(""); - // Gaps section. - const scenarioEntries = scenarioIds.flatMap((id) => { - const scenario = scenarios.setup_scenarios[id]; - return scenario ? [{ id, scenario }] : []; - }); - const scenariosWithoutSuites = scenarioEntries - .filter(({ scenario }) => (scenario.suites ?? []).length === 0) - .map(({ id }) => id); - const skippedScenarios = scenarioEntries - .map(({ id, scenario }) => ({ - id, - skips: scenario.skipped_capabilities ?? [], - })) - .filter(({ skips }) => skips.length > 0); - const referencedStates = new Set( - scenarioEntries - .map(({ scenario }) => scenario.expected_state) - .filter((state): state is string => Boolean(state)), - ); - const unusedStates = Object.keys(expectedStates.expected_states) - .filter((s) => !referencedStates.has(s)) - .sort(); - - lines.push("## Gaps"); - lines.push(""); - if ( - scenariosWithoutSuites.length === 0 && - unusedStates.length === 0 && - skippedScenarios.length === 0 - ) { - lines.push("_No gaps detected._"); - } else { - if (scenariosWithoutSuites.length > 0) { - lines.push("### Scenarios with no suites"); - lines.push(""); - for (const id of scenariosWithoutSuites.sort()) { - lines.push(`- \`${id}\`: no suites configured`); - } - lines.push(""); - } - if (skippedScenarios.length > 0) { - lines.push("### Explicitly skipped capabilities"); - lines.push(""); - for (const { id, skips } of skippedScenarios) { - for (const skip of skips) { - const suites = - Array.isArray(skip.suites) && skip.suites.length > 0 - ? ` Suites: ${skip.suites.map((suite) => `\`${suite}\``).join(", ")}.` - : ""; - lines.push(`- \`${id}\` / \`${skip.id}\`: ${skip.reason}${suites}`); - } - } - lines.push(""); - } - if (unusedStates.length > 0) { - lines.push("### Unused expected states"); - lines.push(""); - for (const id of unusedStates) { - lines.push(`- \`${id}\`: no scenario references this expected state`); - } - lines.push(""); - } - } - return lines.join("\n"); -} diff --git a/test/e2e-scenario/runtime/resolver/expected-failure.ts b/test/e2e-scenario/runtime/resolver/expected-failure.ts deleted file mode 100644 index 07901e5e15..0000000000 --- a/test/e2e-scenario/runtime/resolver/expected-failure.ts +++ /dev/null @@ -1,167 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -/** - * Expected-failure matcher. - * - * Negative scenarios declare an `expected_failure` contract on their - * expected state. The runner captures the failed setup's log plus a small - * side-effect inventory (sandbox-created, gateway-started, credentials-written) - * and asks this module whether the observation matches the contract. - * - * The contract has four parts: - * - phase: which setup stage produced the failure (informational; the - * runner is responsible for invoking the matcher only when that phase - * actually ran). - * - error_class: stable identifier for the failure mode. - * - message_pattern: regex applied to the captured log when present. - * - forbidden_side_effects: effects that MUST NOT be observed. - * - * Match result is structured (`ExpectedFailureReport`) so the runner can - * write `expected-vs-actual.json` and surface a useful diff in CI. - */ - -import { compileMessagePattern } from "./load.ts"; -import type { - ExpectedFailure, - ExpectedFailurePhase, - ExpectedFailureErrorClass, - ExpectedFailureSideEffect, -} from "./schema.ts"; - -export interface ObservedFailure { - /** Phase the runner attempted; matched against `expected_failure.phase`. */ - phase: ExpectedFailurePhase; - /** - * Structured reason if the runner could derive one (preferred). When - * absent, matching falls back to log-content heuristics in the runner. - */ - error_class?: ExpectedFailureErrorClass; - /** Captured setup log; matched against `expected_failure.message_pattern`. */ - log: string; - /** - * Side effects the runner positively observed after the failure. Each - * effect in `expected_failure.forbidden_side_effects` is checked against - * this set; presence is a failure. - */ - observed_side_effects: ExpectedFailureSideEffect[]; -} - -export interface ExpectedFailureCheck { - name: "phase" | "error_class" | "message_pattern" | "forbidden_side_effects"; - ok: boolean; - expected: string; - actual: string; - message?: string; -} - -export interface ExpectedFailureReport { - ok: boolean; - expected: ExpectedFailure; - observed: ObservedFailure; - checks: ExpectedFailureCheck[]; -} - -export function matchExpectedFailure( - expected: ExpectedFailure, - observed: ObservedFailure, -): ExpectedFailureReport { - const checks: ExpectedFailureCheck[] = []; - - const phaseOk = expected.phase === observed.phase; - checks.push({ - name: "phase", - ok: phaseOk, - expected: expected.phase, - actual: observed.phase, - message: phaseOk - ? undefined - : `phase mismatch: expected '${expected.phase}' but observed '${observed.phase}'`, - }); - - if (observed.error_class !== undefined) { - const classOk = expected.error_class === observed.error_class; - checks.push({ - name: "error_class", - ok: classOk, - expected: expected.error_class, - actual: observed.error_class, - message: classOk - ? undefined - : `error_class mismatch: expected '${expected.error_class}' but observed '${observed.error_class}'`, - }); - } else { - // No structured class from the runner; defer to message_pattern as - // the discriminator. Record a SKIPPED entry so the report makes it - // obvious that the class was not asserted structurally. - checks.push({ - name: "error_class", - ok: true, - expected: expected.error_class, - actual: "", - message: "skipped: runner did not derive a structured error_class", - }); - } - - if (expected.message_pattern) { - let regex: RegExp; - try { - regex = compileMessagePattern(expected.message_pattern); - } catch (err) { - checks.push({ - name: "message_pattern", - ok: false, - expected: expected.message_pattern, - actual: "", - message: `message_pattern is not a valid regex: ${(err as Error).message}`, - }); - return finalize(expected, observed, checks); - } - const ok = regex.test(observed.log); - checks.push({ - name: "message_pattern", - ok, - expected: expected.message_pattern, - actual: ok ? "" : "", - message: ok - ? undefined - : `message_pattern '${expected.message_pattern}' did not match captured log`, - }); - } - - if (expected.forbidden_side_effects?.length) { - const observedSet = new Set(observed.observed_side_effects); - const found = expected.forbidden_side_effects.filter((e) => observedSet.has(e)); - const ok = found.length === 0; - checks.push({ - name: "forbidden_side_effects", - ok, - expected: expected.forbidden_side_effects.join(","), - actual: observed.observed_side_effects.join(",") || "", - message: ok - ? undefined - : `forbidden side effects observed after failure: ${found.join(", ")}`, - }); - } - - return finalize(expected, observed, checks); -} - -function finalize( - expected: ExpectedFailure, - observed: ObservedFailure, - checks: ExpectedFailureCheck[], -): ExpectedFailureReport { - return { ok: checks.every((c) => c.ok), expected, observed, checks }; -} - -export function formatExpectedFailureReport(report: ExpectedFailureReport): string { - const lines: string[] = []; - lines.push(`expected-failure: ${report.ok ? "OK" : "FAILED"}`); - for (const c of report.checks) { - const status = c.ok ? "PASS" : "FAIL"; - lines.push(` ${status} ${c.name} expected=${c.expected} actual=${c.actual}`); - if (c.message) lines.push(` ${c.message}`); - } - return lines.join("\n"); -} diff --git a/test/e2e-scenario/runtime/resolver/index.ts b/test/e2e-scenario/runtime/resolver/index.ts deleted file mode 100644 index 972fd073db..0000000000 --- a/test/e2e-scenario/runtime/resolver/index.ts +++ /dev/null @@ -1,354 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -/** - * CLI entrypoint for the E2E scenario resolver. - * - * Usage: - * tsx test/e2e-scenario/runtime/resolver/index.ts plan [--context-dir ] - * tsx test/e2e-scenario/runtime/resolver/index.ts validate-state [--probes-from-state] - * tsx test/e2e-scenario/runtime/resolver/index.ts match-failure \ - * --log --observed-phase \ - * [--observed-error-class ] [--observed-side-effects ] - * - * Writes `plan.json`, `expected-state-report.json`, or `expected-vs-actual.json` - * under the context dir (default `.e2e/`). Exit codes: - * 0 success, 2 usage error, 1 resolution error, - * 3 expected-state mismatch, 4 expected-failure mismatch. - */ - -import fs from "node:fs"; -import path from "node:path"; -import { fileURLToPath } from "node:url"; - -import { loadMetadataFromDir } from "./load.ts"; -import { resolveScenario, formatPlan } from "./plan.ts"; -import { - validateExpectedState, - formatReport, - type ProbeResults, - type ProbeValue, -} from "./validator.ts"; -import { renderCoverageReport } from "./coverage.ts"; -import { - matchExpectedFailure, - formatExpectedFailureReport, - type ObservedFailure, -} from "./expected-failure.ts"; -import { - EXPECTED_FAILURE_PHASES, - EXPECTED_FAILURE_ERROR_CLASSES, - EXPECTED_FAILURE_SIDE_EFFECTS, - type ExpectedFailurePhase, - type ExpectedFailureErrorClass, - type ExpectedFailureSideEffect, -} from "./schema.ts"; - -function parseArgs(argv: string[]): { - command: string; - scenarioId?: string; - contextDir: string; - metadataDir: string; - probesFromState: boolean; - logPath?: string; - observedPhase?: string; - observedErrorClass?: string; - observedSideEffects?: string; -} { - const args = argv.slice(2); - const command = args.shift() ?? ""; - let scenarioId: string | undefined; - let contextDir = process.env.E2E_CONTEXT_DIR ?? ".e2e"; - let probesFromState = false; - let logPath: string | undefined; - let observedPhase: string | undefined; - let observedErrorClass: string | undefined; - let observedSideEffects: string | undefined; - const scriptDir = path.dirname(fileURLToPath(import.meta.url)); - // resolver/ lives under test/e2e-scenario/runtime/, so the E2E metadata root - // (which loadMetadataFromDir resolves further into nemoclaw_scenarios/ - // and validation_suites/) is two levels up. - let metadataDir = path.resolve(scriptDir, "..", ".."); - while (args.length > 0) { - const a = args.shift(); - if (a === "--context-dir") { - const v = args.shift(); - if (!v) throw new Error("--context-dir requires a value"); - contextDir = v; - } else if (a === "--metadata-dir") { - const v = args.shift(); - if (!v) throw new Error("--metadata-dir requires a value"); - metadataDir = v; - } else if (a === "--probes-from-state") { - // Dry-run affordance: seed probes from the expected state itself so - // the validator can exercise its logic without real probe values. - // Non-dry-run callers MUST NOT pass this flag (CodeRabbit review - // item #9); the resolver will fail closed when required probe keys - // are missing without this flag. - probesFromState = true; - } else if (a === "--log") { - const v = args.shift(); - if (!v) throw new Error("--log requires a value"); - logPath = v; - } else if (a === "--observed-phase") { - const v = args.shift(); - if (!v) throw new Error("--observed-phase requires a value"); - observedPhase = v; - } else if (a === "--observed-error-class") { - const v = args.shift(); - if (!v) throw new Error("--observed-error-class requires a value"); - observedErrorClass = v; - } else if (a === "--observed-side-effects") { - const v = args.shift(); - if (v === undefined) throw new Error("--observed-side-effects requires a value"); - observedSideEffects = v; - } else if (a && !a.startsWith("--") && !scenarioId) { - scenarioId = a; - } else if (a === "--help" || a === "-h") { - // ignore; help handled by caller - } else if (a) { - throw new Error(`unexpected argument: ${a}`); - } - } - return { - command, - scenarioId, - contextDir, - metadataDir, - probesFromState, - logPath, - observedPhase, - observedErrorClass, - observedSideEffects, - }; -} - -function main(): number { - let parsed: ReturnType; - try { - parsed = parseArgs(process.argv); - } catch (err) { - process.stderr.write(`resolver: ${(err as Error).message}\n`); - return 2; - } - const { command, scenarioId, contextDir, metadataDir } = parsed; - if (command === "coverage") { - try { - const meta = loadMetadataFromDir(metadataDir); - const md = renderCoverageReport(meta); - process.stdout.write(`${md}\n`); - return 0; - } catch (err) { - process.stderr.write(`resolver: ${(err as Error).message}\n`); - return 1; - } - } - if (!scenarioId) { - process.stderr.write("resolver: missing scenario id\n"); - return 2; - } - try { - const meta = loadMetadataFromDir(metadataDir); - const plan = resolveScenario(scenarioId, meta); - if (command === "plan") { - fs.mkdirSync(contextDir, { recursive: true }); - const planJsonPath = path.join(contextDir, "plan.json"); - fs.writeFileSync(planJsonPath, `${JSON.stringify(plan, null, 2)}\n`); - process.stdout.write(`${formatPlan(plan)}\n`); - process.stdout.write(`plan.json: ${planJsonPath}\n`); - return 0; - } - if (command === "validate-state") { - // CodeRabbit review item #9: only self-seed probes when the caller - // explicitly opts in (dry-run / test contexts). Non-dry-run callers - // without real probes wired should fail, not quietly self-validate. - const probes = parsed.probesFromState - ? probesFromEnvAndState(plan.expected_state.config) - : probesFromEnvOnly(); - const report = validateExpectedState({ - stateId: plan.expected_state.id, - state: plan.expected_state.config, - probes, - suites: plan.suites, - }); - fs.mkdirSync(contextDir, { recursive: true }); - const reportPath = path.join(contextDir, "expected-state-report.json"); - fs.writeFileSync(reportPath, `${JSON.stringify(report, null, 2)}\n`); - process.stdout.write(`${formatReport(report)}\n`); - process.stdout.write(`expected-state-report: ${reportPath}\n`); - return report.ok ? 0 : 3; - } - if (command === "match-failure") { - if (!plan.expected_failure) { - process.stderr.write( - `resolver: scenario '${scenarioId}' has no expected_failure block; nothing to match\n`, - ); - return 2; - } - if (!parsed.observedPhase) { - process.stderr.write("resolver: match-failure requires --observed-phase\n"); - return 2; - } - if (!EXPECTED_FAILURE_PHASES.includes(parsed.observedPhase as ExpectedFailurePhase)) { - process.stderr.write( - `resolver: --observed-phase must be one of: ${EXPECTED_FAILURE_PHASES.join(", ")}\n`, - ); - return 2; - } - let observedErrorClass: ExpectedFailureErrorClass | undefined; - if (parsed.observedErrorClass !== undefined && parsed.observedErrorClass !== "") { - if ( - !EXPECTED_FAILURE_ERROR_CLASSES.includes( - parsed.observedErrorClass as ExpectedFailureErrorClass, - ) - ) { - process.stderr.write( - `resolver: --observed-error-class must be one of: ${EXPECTED_FAILURE_ERROR_CLASSES.join(", ")}\n`, - ); - return 2; - } - observedErrorClass = parsed.observedErrorClass as ExpectedFailureErrorClass; - } - const observedSideEffects: ExpectedFailureSideEffect[] = (parsed.observedSideEffects ?? "") - .split(",") - .map((s) => s.trim()) - .filter(Boolean) - .map((s) => { - if (!EXPECTED_FAILURE_SIDE_EFFECTS.includes(s as ExpectedFailureSideEffect)) { - throw new Error( - `--observed-side-effects entry '${s}' must be one of: ${EXPECTED_FAILURE_SIDE_EFFECTS.join(", ")}`, - ); - } - return s as ExpectedFailureSideEffect; - }); - if (!parsed.logPath) { - process.stderr.write("resolver: match-failure requires --log\n"); - return 2; - } - const log = fs.readFileSync(parsed.logPath, "utf8"); - const observed: ObservedFailure = { - phase: parsed.observedPhase as ExpectedFailurePhase, - error_class: observedErrorClass, - log, - observed_side_effects: observedSideEffects, - }; - const report = matchExpectedFailure(plan.expected_failure, observed); - // Exclude the (potentially large) log from the JSON artifact so - // expected-vs-actual.json stays human-readable; the log is already - // captured separately under the context dir. - const artifact = { - ok: report.ok, - expected: report.expected, - observed: { - phase: report.observed.phase, - error_class: report.observed.error_class, - observed_side_effects: report.observed.observed_side_effects, - }, - checks: report.checks, - }; - fs.mkdirSync(contextDir, { recursive: true }); - const reportPath = path.join(contextDir, "expected-vs-actual.json"); - fs.writeFileSync(reportPath, `${JSON.stringify(artifact, null, 2)}\n`); - process.stdout.write(`${formatExpectedFailureReport(report)}\n`); - process.stdout.write(`expected-vs-actual: ${reportPath}\n`); - return report.ok ? 0 : 4; - } - process.stderr.write( - `resolver: unknown command '${command}' (expected: plan|validate-state|match-failure )\n`, - ); - return 2; - } catch (err) { - process.stderr.write(`resolver: ${(err as Error).message}\n`); - return 1; - } -} - -function flattenState( - obj: unknown, - prefix: string, - out: Record, -): void { - if (obj === null || typeof obj !== "object") { - out[prefix] = obj as ProbeValue; - return; - } - for (const [k, v] of Object.entries(obj as Record)) { - const next = prefix ? `${prefix}.${k}` : k; - if (v !== null && typeof v === "object" && !Array.isArray(v)) { - flattenState(v, next, out); - } else { - out[next] = v as ProbeValue; - } - } -} - -/** - * Read probe overrides from the environment without seeding from state. - * - * Used in non-dry-run mode: the validator then reports a concrete failure - * for any expected-state key that has no corresponding probe value. - */ -function probesFromEnvOnly(): ProbeResults { - const probes: ProbeResults = {}; - // 1. Prefix-based overrides: E2E_PROBE_OVERRIDE_= where - // maps underscores to dots (e.g. GATEWAY_HEALTH -> gateway.health). - // This works for simple keys but cannot express underscores inside a - // single segment. - const prefix = "E2E_PROBE_OVERRIDE_"; - for (const [envKey, value] of Object.entries(process.env)) { - if (!envKey.startsWith(prefix) || value === undefined) continue; - const key = envKey.slice(prefix.length).toLowerCase().replace(/_/g, "."); - probes[key] = coerceProbeValue(value); - } - // 2. JSON escape hatch for keys with embedded underscores (e.g. - // `security.policy_engine`). Later overrides win over (1). - const overridesJson = process.env.E2E_PROBE_OVERRIDES_JSON; - if (overridesJson) { - try { - const parsed = JSON.parse(overridesJson); - if (parsed && typeof parsed === "object") { - for (const [k, v] of Object.entries(parsed as Record)) { - probes[k] = typeof v === "string" ? coerceProbeValue(v) : (v as ProbeValue); - } - } - } catch (err) { - process.stderr.write( - `resolver: E2E_PROBE_OVERRIDES_JSON parse error: ${(err as Error).message}\n`, - ); - } - } - return probes; -} - -/** - * Build a probe results map. - * - * In dry-run / test mode we do not probe real services; instead we default - * every expected-state leaf to its declared value so the validator passes, - * and then allow targeted overrides via E2E_PROBE_OVERRIDE_=value. - * This lets tests simulate specific failure modes without spinning up a - * real gateway or sandbox. - */ -function probesFromEnvAndState(state: unknown): ProbeResults { - const probes: ProbeResults = {}; - flattenState(state, "", probes); - const prefix = "E2E_PROBE_OVERRIDE_"; - for (const [envKey, value] of Object.entries(process.env)) { - if (!envKey.startsWith(prefix) || value === undefined) continue; - const key = envKey - .slice(prefix.length) - .toLowerCase() - .replace(/_/g, "."); - probes[key] = coerceProbeValue(value); - } - return probes; -} - -function coerceProbeValue(v: string): ProbeValue { - if (v === "true") return true; - if (v === "false") return false; - if (/^-?\d+$/.test(v)) return parseInt(v, 10); - return v; -} - -process.exit(main()); diff --git a/test/e2e-scenario/runtime/resolver/js-yaml.d.ts b/test/e2e-scenario/runtime/resolver/js-yaml.d.ts deleted file mode 100644 index 6ea52a82de..0000000000 --- a/test/e2e-scenario/runtime/resolver/js-yaml.d.ts +++ /dev/null @@ -1,11 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -// Local type shim for js-yaml. The runtime package ships without -// TypeScript declarations; we only use `load` for YAML parsing. -declare module "js-yaml" { - export function load(input: string): unknown; - export function dump(obj: unknown, opts?: Record): string; - const _default: { load: typeof load; dump: typeof dump }; - export default _default; -} diff --git a/test/e2e-scenario/runtime/resolver/load.ts b/test/e2e-scenario/runtime/resolver/load.ts deleted file mode 100644 index 9c8dc3991b..0000000000 --- a/test/e2e-scenario/runtime/resolver/load.ts +++ /dev/null @@ -1,360 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -/** - * Load and lightly-validate the E2E metadata files. - * - * The full reference check happens in `plan.ts` during scenario resolution. - * This module only asserts that each file exists and has the required - * top-level sections so callers get a clear error before touching scenarios. - */ - -import fs from "node:fs"; -import path from "node:path"; -import yaml from "js-yaml"; - -import { - EXPECTED_FAILURE_ERROR_CLASSES, - EXPECTED_FAILURE_PHASES, - EXPECTED_FAILURE_SIDE_EFFECTS, -} from "./schema.ts"; -import type { - ScenariosFile, - ExpectedStatesFile, - SuitesFile, - ExpectedFailurePhase, - ExpectedFailureErrorClass, - ExpectedFailureSideEffect, -} from "./schema.ts"; - -export interface ResolverInput { - scenarios: ScenariosFile; - expectedStates: ExpectedStatesFile; - suites: SuitesFile; - /** Optional source dir, used for resolving suite script paths. */ - sourceDir?: string; -} - -function readYaml(p: string): unknown { - const raw = fs.readFileSync(p, "utf8"); - return yaml.load(raw); -} - -function ensureObject(doc: unknown, file: string): Record { - if (!doc || typeof doc !== "object" || Array.isArray(doc)) { - throw new Error(`metadata file ${file} must parse to a YAML mapping`); - } - return doc as Record; -} - -function requireSections( - doc: Record, - file: string, - sections: string[], -): void { - for (const s of sections) { - if (!(s in doc)) { - throw new Error(`metadata file ${file} is missing required section: ${s}`); - } - } -} - -/** - * Compile a YAML-authored `message_pattern` into a JS `RegExp`. RE2-style - * inline flag prefixes (e.g. `(?i)`, `(?ims)`) are stripped and converted - * to the corresponding `RegExp` flags so authors can write the same shape - * the issue body shows without worrying about the underlying engine. - * - * Exported so the matcher uses identical compilation rules; throws on any - * unsupported flag character or on an invalid pattern. - */ -export function compileMessagePattern(pattern: string): RegExp { - let body = pattern; - let flags = ""; - const inlineFlagMatch = /^\(\?([a-zA-Z]+)\)/.exec(pattern); - if (inlineFlagMatch) { - const allowed = new Set(["i", "m", "s"]); - for (const ch of inlineFlagMatch[1]) { - if (!allowed.has(ch)) { - throw new Error(`unsupported inline regex flag '(?${inlineFlagMatch[1]})'; allowed: i, m, s`); - } - if (!flags.includes(ch)) flags += ch; - } - body = pattern.slice(inlineFlagMatch[0].length); - } - return new RegExp(body, flags); -} - -/** - * Validate an `expected_failure` block. `partial` controls whether every - * required field must be present (state-level blocks: yes; scenario-level - * override: no, since absent fields fall back to the state). - */ -function validateExpectedFailureBlock( - block: unknown, - origin: string, - opts: { partial: boolean }, -): void { - if (!block || typeof block !== "object" || Array.isArray(block)) { - throw new Error(`${origin}.expected_failure must be a mapping`); - } - const b = block as Record; - if (b.phase !== undefined) { - if (typeof b.phase !== "string" || !EXPECTED_FAILURE_PHASES.includes(b.phase as ExpectedFailurePhase)) { - throw new Error( - `${origin}.expected_failure.phase must be one of: ${EXPECTED_FAILURE_PHASES.join(", ")}`, - ); - } - } else if (!opts.partial) { - throw new Error(`${origin}.expected_failure.phase is required`); - } - if (b.error_class !== undefined) { - if ( - typeof b.error_class !== "string" || - !EXPECTED_FAILURE_ERROR_CLASSES.includes(b.error_class as ExpectedFailureErrorClass) - ) { - throw new Error( - `${origin}.expected_failure.error_class must be one of: ${EXPECTED_FAILURE_ERROR_CLASSES.join(", ")}`, - ); - } - } else if (!opts.partial) { - throw new Error(`${origin}.expected_failure.error_class is required`); - } - if (b.message_pattern !== undefined && typeof b.message_pattern !== "string") { - throw new Error(`${origin}.expected_failure.message_pattern must be a string`); - } - if (typeof b.message_pattern === "string") { - try { - compileMessagePattern(b.message_pattern); - } catch (err) { - throw new Error( - `${origin}.expected_failure.message_pattern is not a valid regex: ${(err as Error).message}`, - ); - } - } - if (b.forbidden_side_effects !== undefined) { - if (!Array.isArray(b.forbidden_side_effects)) { - throw new Error(`${origin}.expected_failure.forbidden_side_effects must be a list`); - } - for (const effect of b.forbidden_side_effects) { - if ( - typeof effect !== "string" || - !EXPECTED_FAILURE_SIDE_EFFECTS.includes(effect as ExpectedFailureSideEffect) - ) { - throw new Error( - `${origin}.expected_failure.forbidden_side_effects entry '${String(effect)}' must be one of: ${EXPECTED_FAILURE_SIDE_EFFECTS.join(", ")}`, - ); - } - } - } - const known = new Set(["phase", "error_class", "message_pattern", "forbidden_side_effects"]); - for (const k of Object.keys(b)) { - if (!known.has(k)) { - throw new Error(`${origin}.expected_failure has unknown key '${k}'`); - } - } -} - -function validateScenarios(doc: Record, file: string): ScenariosFile { - requireSections(doc, file, [ - "platforms", - "installs", - "runtimes", - "onboarding", - "setup_scenarios", - ]); - const setup = doc.setup_scenarios as Record; - for (const [id, entry] of Object.entries(setup)) { - if (!entry || typeof entry !== "object") { - throw new Error(`scenario ${id} must be a mapping`); - } - const e = entry as Record; - if ("expected_states" in e) { - throw new Error( - `scenario ${id} uses array-form 'expected_states'; use singular 'expected_state'`, - ); - } - if (typeof e.alias_for_plan === "string") { - continue; - } - if (typeof e.expected_state !== "string") { - throw new Error(`scenario ${id} must declare a string 'expected_state'`); - } - if (!Array.isArray(e.suites)) { - throw new Error(`scenario ${id} must declare a list of 'suites'`); - } - if ("runner_requirements" in e) { - if ( - !Array.isArray(e.runner_requirements) || - e.runner_requirements.some((requirement) => typeof requirement !== "string") - ) { - throw new Error(`scenario ${id}.runner_requirements must be a list of strings`); - } - } - if ("expected_failure" in e) { - validateExpectedFailureBlock(e.expected_failure, `scenario ${id}`, { partial: true }); - } - if ("skipped_capabilities" in e) { - if ( - !Array.isArray(e.skipped_capabilities) || - e.skipped_capabilities.some((skip) => { - if (!skip || typeof skip !== "object" || Array.isArray(skip)) return true; - const s = skip as Record; - return ( - typeof s.id !== "string" || - typeof s.reason !== "string" || - ("suites" in s && (!Array.isArray(s.suites) || s.suites.some((suite) => typeof suite !== "string"))) - ); - }) - ) { - throw new Error(`scenario ${id}.skipped_capabilities must list {id, reason, suites?}`); - } - } - const dims = e.dimensions as Record | undefined; - if (!dims) { - throw new Error(`scenario ${id} must declare 'dimensions'`); - } - for (const key of ["platform", "install", "runtime", "onboarding"]) { - if (typeof dims[key] !== "string") { - throw new Error(`scenario ${id}.dimensions.${key} must be a string`); - } - } - const platformId = dims.platform as string; - const platform = (doc.platforms as Record | undefined>)[ - platformId - ]; - const requiresExplicitRunner = - platform?.execution_target === "remote" || - platform?.os === "macos" || - platform?.os === "wsl" || - platform?.gpu !== undefined || - platform?.hardware !== undefined; - if ( - requiresExplicitRunner && - (!Array.isArray(e.runner_requirements) || e.runner_requirements.length === 0) - ) { - throw new Error(`scenario ${id} must declare runner_requirements for platform ${platformId}`); - } - } - return doc as unknown as ScenariosFile; -} - -function validateExpectedStates( - doc: Record, - file: string, -): ExpectedStatesFile { - requireSections(doc, file, ["expected_states"]); - const rawStates = doc.expected_states; - if (!rawStates || typeof rawStates !== "object" || Array.isArray(rawStates)) { - throw new Error(`metadata file ${file} section 'expected_states' must be a mapping`); - } - const states = rawStates as Record; - for (const [id, entry] of Object.entries(states)) { - if (!entry || typeof entry !== "object") { - throw new Error(`expected_state ${id} must be a mapping`); - } - const e = entry as Record; - if ("expected_failure" in e) { - validateExpectedFailureBlock(e.expected_failure, `expected_state ${id}`, { partial: false }); - } - } - return doc as unknown as ExpectedStatesFile; -} - -function validateSuites(doc: Record, file: string): SuitesFile { - requireSections(doc, file, ["suites"]); - const suites = doc.suites as Record; - for (const [id, entry] of Object.entries(suites)) { - if (!entry || typeof entry !== "object") { - throw new Error(`suite ${id} must be a mapping`); - } - const e = entry as Record; - if (!Array.isArray(e.steps)) { - throw new Error(`suite ${id} must declare a 'steps' array`); - } - for (const step of e.steps) { - if (!step || typeof step !== "object") { - throw new Error(`suite ${id} has a non-mapping step`); - } - const s = step as Record; - if (typeof s.id !== "string" || typeof s.script !== "string") { - throw new Error(`suite ${id} has an invalid step (requires string id and script)`); - } - } - } - return doc as unknown as SuitesFile; -} - -/** - * Resolve the concrete on-disk locations of the three metadata files - * given the E2E root directory (`test/e2e/`). - * - * Post-restructure layout: - * /nemoclaw_scenarios/scenarios.yaml - * /nemoclaw_scenarios/expected-states.yaml - * /validation_suites/suites.yaml - * - * For backward compatibility (and for tests that synthesise a flat - * fixture directory) we also accept a directory that already contains - * all three YAML files side by side. - */ -function resolveMetadataPaths(dir: string): { - scenarios: string; - states: string; - suites: string; -} { - const flatScenarios = path.join(dir, "scenarios.yaml"); - const flatStates = path.join(dir, "expected-states.yaml"); - const flatSuites = path.join(dir, "suites.yaml"); - if ( - fs.existsSync(flatScenarios) && - fs.existsSync(flatStates) && - fs.existsSync(flatSuites) - ) { - return { scenarios: flatScenarios, states: flatStates, suites: flatSuites }; - } - return { - scenarios: path.join(dir, "nemoclaw_scenarios", "scenarios.yaml"), - states: path.join(dir, "nemoclaw_scenarios", "expected-states.yaml"), - suites: path.join(dir, "validation_suites", "suites.yaml"), - }; -} - -export function loadMetadataFromDir(dir: string): ResolverInput { - const { scenarios: scenariosPath, states: statesPath, suites: suitesPath } = - resolveMetadataPaths(dir); - const scenarios = validateScenarios( - ensureObject(readYaml(scenariosPath), scenariosPath), - scenariosPath, - ); - const expectedStates = validateExpectedStates( - ensureObject(readYaml(statesPath), statesPath), - statesPath, - ); - const suites = validateSuites( - ensureObject(readYaml(suitesPath), suitesPath), - suitesPath, - ); - return { scenarios, expectedStates, suites, sourceDir: dir }; -} - -export function loadMetadataFromObjects(input: { - scenarios: object; - expectedStates: object; - suites: object; - sourceDir?: string; -}): ResolverInput { - const scenarios = validateScenarios( - ensureObject(input.scenarios, ""), - "", - ); - const expectedStates = validateExpectedStates( - ensureObject(input.expectedStates, ""), - "", - ); - const suites = validateSuites( - ensureObject(input.suites, ""), - "", - ); - return { scenarios, expectedStates, suites, sourceDir: input.sourceDir }; -} diff --git a/test/e2e-scenario/runtime/resolver/plan.ts b/test/e2e-scenario/runtime/resolver/plan.ts deleted file mode 100644 index c20350eaed..0000000000 --- a/test/e2e-scenario/runtime/resolver/plan.ts +++ /dev/null @@ -1,256 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -/** - * Resolve a setup scenario into a concrete, fully-referenced execution plan. - * - * The resolver: - * 1. looks up the scenario by id, - * 2. resolves each dimension profile, - * 3. resolves the expected state, - * 4. resolves each suite definition, - * 5. validates each suite's `requires_state` against the scenario's expected - * state (fail-fast if any key is missing or has an incompatible value). - * - * The resulting `ResolvedPlan` is serializable to JSON and forms the basis of - * the `.e2e/plan.json` artifact and the human-readable plan printout. - */ - -import type { ResolverInput } from "./load.ts"; -import type { - BaseScenario, - ResolvedPlan, - ResolvedSuite, - SuiteDefinition, - ExpectedFailure, - ExpectedStateConfig, - TestPlan, -} from "./schema.ts"; - -export type { ResolverInput } from "./load.ts"; -export type { ResolvedPlan } from "./schema.ts"; - -function lookupProfile( - collection: Record, - kind: string, - name: string, - scenarioId: string, -): T { - if (!(name in collection)) { - const available = Object.keys(collection).sort().join(", "); - throw new Error( - `scenario '${scenarioId}' references unknown ${kind} '${name}' (available: ${available || ""})`, - ); - } - return collection[name] as T; -} - -function getByDottedPath(obj: unknown, dotted: string): unknown { - const parts = dotted.split("."); - let cur: unknown = obj; - for (const p of parts) { - if (cur === null || cur === undefined || typeof cur !== "object") { - return undefined; - } - cur = (cur as Record)[p]; - } - return cur; -} - -/** - * Merge a state-level `expected_failure` with an optional scenario-level - * override and return a fully-formed `ExpectedFailure`, or `undefined` if - * neither side declares one. Scenario-level fields win over state-level. - * - * After merge, every required field MUST be present. The loader already - * enforces this for state-level blocks; an override-only declaration on a - * positive expected state is rejected here. - */ -function resolveExpectedFailure( - stateConfig: ExpectedStateConfig, - expectedStateId: string, - scenarioId: string, - overrides: Array<{ - block?: Partial; - mode: "fill" | "override"; - origin: string; - }>, -): ExpectedFailure | undefined { - const stateBlock = (stateConfig as { expected_failure?: unknown }).expected_failure as - | Partial - | undefined; - const presentOverrides = overrides.filter((source) => source.block); - if (!stateBlock && presentOverrides.length === 0) return undefined; - if (!stateBlock) { - const origins = presentOverrides.map((source) => source.origin).join(", "); - throw new Error( - `scenario '${scenarioId}' declares expected_failure but expected_state '${expectedStateId}' does not - declare the base contract on the state first (source: ${origins})`, - ); - } - const merged: Partial = { ...stateBlock }; - for (const source of overrides) { - const block = source.block; - if (!block) continue; - for (const key of Object.keys(block) as Array) { - const value = block[key]; - if (value === undefined) continue; - if (source.mode === "fill" && merged[key] !== undefined) continue; - (merged as Record)[key] = value; - } - } - if (!merged.phase || !merged.error_class) { - throw new Error( - `scenario '${scenarioId}' expected_failure resolves with missing required fields (phase, error_class) after merge`, - ); - } - return merged as ExpectedFailure; -} - -function validateSuiteAgainstState( - suiteId: string, - suite: SuiteDefinition, - state: ExpectedStateConfig, - scenarioId: string, -): void { - const requires = suite.requires_state ?? {}; - for (const [key, expected] of Object.entries(requires)) { - const actual = getByDottedPath(state, key); - if (actual === undefined) { - throw new Error( - `scenario '${scenarioId}' selects suite '${suiteId}' which requires state key '${key}=${String(expected)}', but the expected state has no value at '${key}'`, - ); - } - if (actual !== expected) { - throw new Error( - `scenario '${scenarioId}' selects suite '${suiteId}' which requires '${key}=${String(expected)}', but the scenario's expected state has '${key}=${String(actual)}'`, - ); - } - } -} - -export function resolveScenario(scenarioId: string, meta: ResolverInput): ResolvedPlan { - const legacy = meta.scenarios.setup_scenarios[scenarioId]; - const directPlan = meta.scenarios.test_plans?.[scenarioId]; - if (!legacy && !directPlan) { - const available = [ - ...Object.keys(meta.scenarios.setup_scenarios), - ...Object.keys(meta.scenarios.test_plans ?? {}), - ].sort().join(", "); - throw new Error(`unknown scenario '${scenarioId}' (available: ${available || ""})`); - } - const planId = legacy?.alias_for_plan ?? scenarioId; - const layeredPlan = meta.scenarios.test_plans?.[planId]; - const legacyDimensions = legacy?.dimensions; - const baseId = layeredPlan?.base; - const base = baseId ? lookupProfile(meta.scenarios.base_scenarios ?? {}, "base", baseId, scenarioId) : undefined; - const onboardingId = legacy?.alias_for_plan && legacyDimensions?.onboarding ? legacyDimensions.onboarding : (layeredPlan?.onboarding ?? legacyDimensions?.onboarding); - const onboardingCollection = onboardingId && onboardingId in meta.scenarios.onboarding ? meta.scenarios.onboarding : (meta.scenarios.onboarding_profiles ?? meta.scenarios.onboarding); - const onboarding = lookupProfile(onboardingCollection, "onboarding", onboardingId ?? "", scenarioId); - const platformId = base?.platform ?? legacyDimensions?.platform; - const installId = base?.install ?? legacyDimensions?.install; - const runtimeId = base?.runtime ?? legacyDimensions?.runtime; - if (!platformId || !installId || !runtimeId) throw new Error(`scenario '${scenarioId}' is missing layered base or legacy dimensions`); - const platform = lookupProfile(meta.scenarios.platforms, "platform", platformId, scenarioId); - const install = lookupProfile(meta.scenarios.installs, "install", installId, scenarioId); - const runtime = lookupProfile(meta.scenarios.runtimes, "runtime", runtimeId, scenarioId); - const expectedStateId = layeredPlan?.expected_state ?? legacy?.expected_state; - if (!expectedStateId || !(expectedStateId in meta.expectedStates.expected_states)) { - const available = Object.keys(meta.expectedStates.expected_states).sort().join(", "); - throw new Error(`scenario '${scenarioId}' references unknown expected_state '${expectedStateId}' (available: ${available || ""})`); - } - const stateConfig = meta.expectedStates.expected_states[expectedStateId]; - const suiteIds = layeredPlan?.suites ?? legacy?.suites ?? []; - const resolvedSuites: ResolvedSuite[] = []; - for (const suiteId of suiteIds) { - if (!(suiteId in meta.suites.suites)) { - const available = Object.keys(meta.suites.suites).sort().join(", "); - throw new Error( - `scenario '${scenarioId}' references unknown suite '${suiteId}' (available: ${available || ""})`, - ); - } - const def = meta.suites.suites[suiteId]; - validateSuiteAgainstState(suiteId, def, stateConfig, scenarioId); - resolvedSuites.push({ - id: suiteId, - requires_state: def.requires_state ?? {}, - steps: def.steps.map((s) => ({ id: s.id, script: s.script })), - }); - } - const runnerRequirements = [ - ...(base?.runner_requirements ?? []), - ...((layeredPlan as TestPlan | undefined)?.runner_requirements ?? []), - ...(legacy?.runner_requirements ?? []), - ]; - const expectedFailure = resolveExpectedFailure(stateConfig, expectedStateId, scenarioId, [ - { origin: `base '${baseId}'`, block: base?.expected_failure, mode: "fill" }, - { origin: `test_plan '${planId}'`, block: layeredPlan?.expected_failure, mode: "override" }, - { origin: `setup_scenario '${scenarioId}'`, block: legacy?.expected_failure, mode: "override" }, - ]); - return { - scenario_id: scenarioId, - plan_id: layeredPlan ? planId : undefined, - legacy_scenario_id: legacy?.alias_for_plan ? scenarioId : undefined, - base: base && baseId ? { id: baseId, profile: base as BaseScenario } : undefined, - onboarding: onboardingId ? { id: onboardingId, profile: onboarding } : undefined, - onboarding_assertions: layeredPlan?.onboarding_assertions ?? [], - dimensions: { - platform: { id: platformId, profile: platform }, - install: { id: installId, profile: install }, - runtime: { id: runtimeId, profile: runtime }, - onboarding: { id: onboardingId ?? "", profile: onboarding }, - }, - expected_state: { id: expectedStateId, config: stateConfig }, - suites: resolvedSuites, - overrides: layeredPlan?.overrides ?? legacy?.overrides, - runner_requirements: runnerRequirements.length > 0 ? runnerRequirements : undefined, - required_secrets: layeredPlan?.required_secrets, - ...(expectedFailure ? { expected_failure: expectedFailure } : {}), - }; -} - -export function formatPlan(plan: ResolvedPlan): string { - const lines: string[] = []; - lines.push(`Scenario: ${plan.scenario_id}`); - if (plan.plan_id) lines.push(`Test plan: ${plan.plan_id}`); - if (plan.base) lines.push(`Base: ${plan.base.id}`); - if (plan.onboarding) lines.push(`Onboarding: ${plan.onboarding.id}`); - lines.push("Dimensions:"); - lines.push(` platform=${plan.dimensions.platform.id}`); - lines.push(` install=${plan.dimensions.install.id}`); - lines.push(` runtime=${plan.dimensions.runtime.id}`); - lines.push(` onboarding=${plan.dimensions.onboarding.id}`); - lines.push(`Expected state: ${plan.expected_state.id}`); - if (plan.onboarding_assertions && plan.onboarding_assertions.length > 0) { - lines.push("Onboarding assertions:"); - for (const assertion of plan.onboarding_assertions) lines.push(` - ${assertion}`); - } - lines.push("Suites:"); - for (const s of plan.suites) { - lines.push(` - ${s.id}`); - for (const step of s.steps) { - lines.push(` * ${step.id} (${step.script})`); - } - } - if (plan.runner_requirements && plan.runner_requirements.length > 0) { - lines.push("Runner requirements:"); - for (const requirement of plan.runner_requirements) { - lines.push(` - ${requirement}`); - } - } - if (plan.overrides) { - lines.push("Overrides:"); - lines.push(` ${JSON.stringify(plan.overrides)}`); - } - if (plan.expected_failure) { - lines.push("Expected failure:"); - lines.push(` phase=${plan.expected_failure.phase}`); - lines.push(` error_class=${plan.expected_failure.error_class}`); - if (plan.expected_failure.message_pattern) { - lines.push(` message_pattern=${plan.expected_failure.message_pattern}`); - } - if (plan.expected_failure.forbidden_side_effects?.length) { - lines.push(` forbidden_side_effects=${plan.expected_failure.forbidden_side_effects.join(",")}`); - } - } - return lines.join("\n"); -} diff --git a/test/e2e-scenario/runtime/resolver/schema.ts b/test/e2e-scenario/runtime/resolver/schema.ts deleted file mode 100644 index d8354981f6..0000000000 --- a/test/e2e-scenario/runtime/resolver/schema.ts +++ /dev/null @@ -1,206 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -/** - * Types for the E2E scenario metadata schema. - * - * These mirror the shape of `scenarios.yaml`, `expected-states.yaml`, and - * `suites.yaml`. The resolver validates unknown references and returns a - * normalized `ResolvedPlan` suitable for the shell runner and JSON artifact. - */ - -export type AnyRecord = Record; - -export interface PlatformProfile extends AnyRecord { - os?: string; - execution_target?: string; -} -export type InstallProfile = AnyRecord; -export type RuntimeProfile = AnyRecord; -export interface OnboardingProfile extends AnyRecord { - path?: string; - agent?: string; - provider?: string; - inference_route?: string; -} - -/** - * Phases where setup is permitted to fail in negative scenarios. - * - * Aligned with `nemoclaw` setup stages and the wording in NemoClaw issue - * #3608. `preflight` is the only phase whose side-effect probes are wired - * in this initial cut; the rest are accepted by the schema so that future - * negative scenarios can declare them without churning YAML again. - */ -export const EXPECTED_FAILURE_PHASES = [ - "preflight", - "install", - "onboard", - "readiness", - "suite", -] as const; -export type ExpectedFailurePhase = (typeof EXPECTED_FAILURE_PHASES)[number]; - -/** - * Structured failure reason. Open-ended on purpose - new negative scenarios - * may need new classes, but every value here MUST be enumerated so reports - * have a stable vocabulary. - */ -export const EXPECTED_FAILURE_ERROR_CLASSES = [ - "docker-missing", - "credentials-missing", - "gpu-missing", - "unsupported-platform", -] as const; -export type ExpectedFailureErrorClass = (typeof EXPECTED_FAILURE_ERROR_CLASSES)[number]; - -/** - * Side effects that a successful setup would normally leave behind. A - * negative scenario asserts that NONE of the listed effects are observed - * after the failure. - */ -export const EXPECTED_FAILURE_SIDE_EFFECTS = [ - "sandbox-created", - "gateway-started", - "credentials-written", -] as const; -export type ExpectedFailureSideEffect = (typeof EXPECTED_FAILURE_SIDE_EFFECTS)[number]; - -export interface ExpectedFailure { - phase: ExpectedFailurePhase; - error_class: ExpectedFailureErrorClass; - /** RE2/POSIX-compatible regex matched against the captured setup log. */ - message_pattern?: string; - /** Effects that must NOT be observed after the failure. */ - forbidden_side_effects?: ExpectedFailureSideEffect[]; -} - -export interface SkippedCapability extends AnyRecord { - id: string; - reason: string; - suites?: string[]; -} - -export interface BaseScenario extends AnyRecord { - platform: string; - install: string; - runtime: string; - runner_requirements?: string[]; - expected_failure?: Partial; - skipped_capabilities?: SkippedCapability[]; -} - -export interface TestPlan extends AnyRecord { - base: string; - onboarding: string; - expected_state: string; - onboarding_assertions?: string[]; - suites: string[]; - overrides?: AnyRecord; - runner_requirements?: string[]; - required_secrets?: string[]; - expected_failure?: Partial; - skipped_capabilities?: SkippedCapability[]; -} - -export interface SetupScenario { - alias_for_plan?: string; - dimensions?: { - platform: string; - install: string; - runtime: string; - onboarding: string; - }; - expected_state?: string; - suites?: string[]; - overrides?: AnyRecord; - /** Explicit CI/hardware requirements for non-default platforms. */ - runner_requirements?: string[]; - skipped_capabilities?: SkippedCapability[]; - /** - * Per-scenario override of the expected-state failure contract. Fields - * present here win over the state-level `expected_failure`; absent - * fields fall back to the state. Negative scenarios MUST resolve to a - * complete `ExpectedFailure` (state + override merged). - */ - expected_failure?: Partial; - /** - * Guard: the legacy array form `expected_states: [...]` must not reappear. - * If present, the loader fails. - */ - expected_states?: never; -} - -export interface ScenariosFile { - platforms: Record; - installs: Record; - runtimes: Record; - onboarding: Record; - setup_scenarios: Record; - base_scenarios?: Record; - onboarding_profiles?: Record; - test_plans?: Record; - onboarding_assertions?: Record; -} - -export type ExpectedStateConfig = AnyRecord; - -export interface ExpectedStatesFile { - expected_states: Record; -} - -export interface SuiteStep { - id: string; - script: string; -} - -export interface SuiteDefinition { - requires_state?: Record; - steps: SuiteStep[]; -} - -export interface SuitesFile { - suites: Record; -} - -export interface ResolvedDimension { - id: string; - profile: T; -} - -export interface ResolvedSuite { - id: string; - requires_state: Record; - steps: SuiteStep[]; -} - -export interface ResolvedExpectedState { - id: string; - config: ExpectedStateConfig; -} - -export interface ResolvedPlan { - scenario_id: string; - plan_id?: string; - legacy_scenario_id?: string; - base?: ResolvedDimension; - onboarding?: ResolvedDimension; - onboarding_assertions?: string[]; - dimensions: { - platform: ResolvedDimension; - install: ResolvedDimension; - runtime: ResolvedDimension; - onboarding: ResolvedDimension; - }; - expected_state: ResolvedExpectedState; - suites: ResolvedSuite[]; - overrides?: AnyRecord; - runner_requirements?: string[]; - required_secrets?: string[]; - /** - * Present only for negative scenarios that declare an `expected_failure` - * (either at scenario level or via their expected state). Absence means - * the runner expects setup to succeed. - */ - expected_failure?: ExpectedFailure; -} diff --git a/test/e2e-scenario/runtime/resolver/validator.ts b/test/e2e-scenario/runtime/resolver/validator.ts deleted file mode 100644 index 214190f6dc..0000000000 --- a/test/e2e-scenario/runtime/resolver/validator.ts +++ /dev/null @@ -1,123 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -/** - * Expected-state validator. - * - * Walks the expected state tree and compares each leaf to a probe result. - * Also validates per-suite `requires_state` entries at runtime, producing a - * single report whose `ok` field drives whether the runner proceeds to - * execute suites. - */ - -import type { ExpectedStateConfig, ResolvedSuite } from "./schema.ts"; - -export type ProbeValue = string | number | boolean | null; -export type ProbeResults = Record; - -export interface ValidatorInput { - stateId: string; - state: ExpectedStateConfig; - probes: ProbeResults; - suites: ResolvedSuite[]; -} - -export interface ValidatorCheck { - key: string; - expected: ProbeValue; - actual: ProbeValue | undefined; - ok: boolean; - origin: "state" | "suite"; - suite?: string; - message?: string; -} - -export interface ValidatorReport { - state_id: string; - ok: boolean; - checks: ValidatorCheck[]; -} - -function flatten( - obj: unknown, - prefix: string, - out: Record, -): void { - if (obj === null || typeof obj !== "object") { - out[prefix] = obj as ProbeValue; - return; - } - for (const [k, v] of Object.entries(obj as Record)) { - const next = prefix ? `${prefix}.${k}` : k; - if (v !== null && typeof v === "object" && !Array.isArray(v)) { - flatten(v, next, out); - } else { - out[next] = v as ProbeValue; - } - } -} - -function compare( - _key: string, - expected: ProbeValue, - actual: ProbeValue | undefined, -): boolean { - if (actual === undefined) return false; - return expected === actual; -} - -export function validateExpectedState(input: ValidatorInput): ValidatorReport { - const checks: ValidatorCheck[] = []; - const flat: Record = {}; - flatten(input.state, "", flat); - - for (const [key, expected] of Object.entries(flat)) { - const actual = input.probes[key]; - const ok = compare(key, expected, actual); - checks.push({ - key, - expected, - actual, - ok, - origin: "state", - message: ok - ? undefined - : `expected '${key}=${String(expected)}' but got '${String(actual ?? "")}'`, - }); - } - - for (const suite of input.suites) { - const req = suite.requires_state ?? {}; - for (const [key, expected] of Object.entries(req)) { - const actual = input.probes[key]; - const ok = compare(key, expected as ProbeValue, actual); - checks.push({ - key, - expected: expected as ProbeValue, - actual, - ok, - origin: "suite", - suite: suite.id, - message: ok - ? undefined - : `suite '${suite.id}' requires '${key}=${String(expected)}' but got '${String(actual ?? "")}'`, - }); - } - } - - const ok = checks.every((c) => c.ok); - return { state_id: input.stateId, ok, checks }; -} - -export function formatReport(report: ValidatorReport): string { - const lines: string[] = []; - lines.push(`expected-state: ${report.state_id} ${report.ok ? "OK" : "FAILED"}`); - for (const c of report.checks) { - const status = c.ok ? "PASS" : "FAIL"; - const origin = c.origin === "suite" ? `[suite:${c.suite}]` : "[state]"; - lines.push( - ` ${status} ${origin} ${c.key} expected=${String(c.expected)} actual=${String(c.actual ?? "")}`, - ); - } - return lines.join("\n"); -} diff --git a/test/e2e-scenario/runtime/run-scenario.sh b/test/e2e-scenario/runtime/run-scenario.sh index 58042c8523..2477ce79ec 100755 --- a/test/e2e-scenario/runtime/run-scenario.sh +++ b/test/e2e-scenario/runtime/run-scenario.sh @@ -2,482 +2,24 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # -# E2E scenario runner entrypoint. -# -# Usage: -# bash test/e2e-scenario/runtime/run-scenario.sh [--plan-only|--validate-only|--dry-run] -# -# Flags: -# --plan-only Resolve metadata and print the plan only. Writes -# ${E2E_CONTEXT_DIR:-.e2e}/plan.json for artifact upload. -# --validate-only Run the expected-state validator against the current -# context.env without running install/onboard/suites. -# Emits probe results JSON to stdout and writes -# ${E2E_CONTEXT_DIR}/expected-state-report.json. Used by -# the parity-compare workflow to collect per-assertion -# probe results. Mutually exclusive with --plan-only. -# --dry-run (reserved) Run orchestration with real side effects -# replaced by trace-logged stubs. Sets E2E_DRY_RUN=1 for -# helpers. Full dry-run orchestration lands in later phases. -# -# Environment: -# E2E_CONTEXT_DIR Override the scenario artifact directory -# (default: /.e2e/). +# DEPRECATED. The hybrid scenario architecture has a single supported runtime +# entrypoint: test/e2e-scenario/scenarios/run.ts. This bash runner duplicated +# install/onboard/gateway-check/suite-execution that now belongs in TS phase +# orchestrators (EnvironmentOrchestrator, OnboardingOrchestrator, +# RuntimeOrchestrator) and shared clients (HostCliClient, GatewayClient, +# SandboxClient). It is fail-fast so the deprecation is loud, not silent. set -euo pipefail -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -E2E_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" - -SCENARIO_ID="" -PLAN_ONLY=0 -VALIDATE_ONLY=0 -DRY_RUN=0 - -usage() { - cat >&2 <<'USAGE' -Usage: bash test/e2e-scenario/runtime/run-scenario.sh [--plan-only|--validate-only|--dry-run] -USAGE -} - -while [[ $# -gt 0 ]]; do - case "$1" in - --plan-only) - PLAN_ONLY=1 - shift - ;; - --validate-only) - VALIDATE_ONLY=1 - shift - ;; - --dry-run) - DRY_RUN=1 - shift - ;; - -h | --help) - usage - exit 0 - ;; - --*) - echo "run-scenario: unknown flag: $1" >&2 - usage - exit 2 - ;; - *) - if [[ -z "${SCENARIO_ID}" ]]; then - SCENARIO_ID="$1" - else - echo "run-scenario: unexpected positional argument: $1" >&2 - usage - exit 2 - fi - shift - ;; - esac -done - -if [[ -z "${SCENARIO_ID}" ]]; then - echo "run-scenario: missing scenario id" >&2 - usage - exit 2 -fi - -if [[ "${PLAN_ONLY}" -eq 1 && "${VALIDATE_ONLY}" -eq 1 ]]; then - echo "run-scenario: --plan-only and --validate-only are mutually exclusive" >&2 - usage - exit 2 -fi - -export E2E_CONTEXT_DIR="${E2E_CONTEXT_DIR:-${REPO_ROOT}/.e2e}" -mkdir -p "${E2E_CONTEXT_DIR}" - -if [[ "${DRY_RUN}" -eq 1 ]]; then - export E2E_DRY_RUN=1 -fi - -# Prefer the locally-installed tsx if present, otherwise fall back to npx. -TSX_BIN="${REPO_ROOT}/node_modules/.bin/tsx" -if [[ ! -x "${TSX_BIN}" ]]; then - TSX_BIN="" -fi - -run_resolver() { - if [[ -n "${TSX_BIN}" ]]; then - "${TSX_BIN}" "${SCRIPT_DIR}/resolver/index.ts" "$@" - return - fi - # CodeRabbit review item #10: fail closed with a clear hint instead of - # silently pulling tsx from the network via `npx --yes`. - if ! (cd "${REPO_ROOT}" && npx --no-install tsx "${SCRIPT_DIR}/resolver/index.ts" "$@"); then - echo "run-scenario: tsx is required but not installed. Run 'npm ci' at the repo root and retry." >&2 - return 1 - fi -} - -run_resolver plan "${SCENARIO_ID}" --context-dir "${E2E_CONTEXT_DIR}" - -if [[ "${PLAN_ONLY}" -eq 1 ]]; then - exit 0 -fi - -# --validate-only: assume setup has already completed. Skip install / -# onboard / suite execution and dispatch the expected-state validator -# using probes resolved from E2E_PROBE_OVERRIDE_* env vars. Emits the -# probe results JSON report to stdout and writes it to -# ${E2E_CONTEXT_DIR}/expected-state-report.json. -if [[ "${VALIDATE_ONLY}" -eq 1 ]]; then - validate_args=("${SCENARIO_ID}" --context-dir "${E2E_CONTEXT_DIR}") - if ! run_resolver validate-state "${validate_args[@]}"; then - echo "run-scenario: --validate-only: expected-state validation failed" >&2 - exit 3 - fi - exit 0 -fi - -# Source the shared helper library so we can exercise the full -# setup → install → onboard → gateway/sandbox check sequence. In dry-run -# mode each helper short-circuits (and writes to E2E_TRACE_FILE if set). -# shellcheck source=lib/env.sh -. "${SCRIPT_DIR}/lib/env.sh" -# shellcheck source=lib/context.sh -. "${SCRIPT_DIR}/lib/context.sh" -# shellcheck source=lib/negative.sh -. "${SCRIPT_DIR}/lib/negative.sh" -# shellcheck source=lib/port-holder.sh -. "${SCRIPT_DIR}/lib/port-holder.sh" -# shellcheck source=../nemoclaw_scenarios/install/dispatch.sh -. "${E2E_ROOT}/nemoclaw_scenarios/install/dispatch.sh" -# shellcheck source=../nemoclaw_scenarios/onboard/dispatch.sh -. "${E2E_ROOT}/nemoclaw_scenarios/onboard/dispatch.sh" -# shellcheck source=../validation_suites/assert/gateway-alive.sh -. "${E2E_ROOT}/validation_suites/assert/gateway-alive.sh" -# shellcheck source=../validation_suites/assert/sandbox-alive.sh -. "${E2E_ROOT}/validation_suites/assert/sandbox-alive.sh" - -# Apply standard non-interactive env (and trace it). -e2e_env_apply_noninteractive -e2e_env_trace "env:noninteractive" - -# Emit normalized context from the resolved plan. -e2e_context_init -"${E2E_ROOT}/nemoclaw_scenarios/helpers/emit-context-from-plan.sh" "${E2E_CONTEXT_DIR}/plan.json" - -# Extract the install method and onboarding profile from the plan so we can -# dispatch to the right helpers. -read_plan_string() { - local key="$1" - node -e " - const p = JSON.parse(require('fs').readFileSync(process.argv[1], 'utf8')); - const parts = process.argv[2].split('.'); - let cur = p; - for (const part of parts) { if (cur == null) { cur = ''; break; } cur = cur[part]; } - process.stdout.write(cur == null ? '' : String(cur)); - " "${E2E_CONTEXT_DIR}/plan.json" "${key}" -} - -INSTALL_ID="$(read_plan_string dimensions.install.id)" -INSTALL_METHOD="$(read_plan_string dimensions.install.profile.method)" -ONBOARDING_ID="$(read_plan_string dimensions.onboarding.id)" -RUNTIME_ID="$(read_plan_string dimensions.runtime.id)" -RUNTIME_CONTAINER_DAEMON="$(read_plan_string dimensions.runtime.profile.container_daemon)" -EXPECTED_STATE_ID="$(read_plan_string expected_state.id)" -FAILURE_STAGE="$(read_plan_string expected_state.config.failure.stage)" -FAILURE_EXIT_CODE="$(read_plan_string expected_state.config.failure.exit_code)" -FAILURE_MESSAGE_CONTAINS="$(read_plan_string expected_state.config.failure.message_contains)" -FAILURE_NO_STACK_TRACE="$(read_plan_string expected_state.config.failure.no_stack_trace)" - -# Trace the dimension id so scenario-level assertions can identify the -# configured install (e.g. repo-current); e2e_install internally traces -# the resolved method. -e2e_env_trace "install:${INSTALL_ID}" - -install_log="${E2E_CONTEXT_DIR}/install.log" -set +e -e2e_install "${INSTALL_METHOD}" >"${install_log}" 2>&1 -install_status=$? -set -e -if [[ "${install_status}" -ne 0 ]]; then - cat "${install_log}" >&2 - echo "run-scenario: install ${INSTALL_METHOD} failed with status ${install_status}" >&2 - exit "${install_status}" -fi -export PATH="${HOME}/.local/bin:${PATH}" -{ - printf 'PATH=%s\n' "${PATH}" - command -v nemoclaw || true -} >"${E2E_CONTEXT_DIR}/post-install-path.log" 2>&1 -if [[ "${DRY_RUN}" -eq 1 ]]; then - printf 'run-scenario: dry-run skipping post-install nemoclaw PATH verification\n' >&2 -else - nemoclaw_bin="$(command -v nemoclaw || true)" - if [[ -z "${nemoclaw_bin}" ]]; then - cat "${E2E_CONTEXT_DIR}/post-install-path.log" >&2 - echo "run-scenario: nemoclaw not found on PATH after install" >&2 - exit 127 - fi - printf 'run-scenario: using nemoclaw at %s\n' "${nemoclaw_bin}" >&2 -fi - -# Negative scenarios declare an `expected_failure` block on their expected -# state (see NemoClaw issue #3608). The runner forces the failure mode for -# the scenario, captures the setup log, gathers a side-effect inventory, and -# delegates structured matching to `resolver/index.ts match-failure`. The -# matcher writes `expected-vs-actual.json` for CI artifact upload. - -read_plan_failure_field() { - local key="$1" - node -e " - (() => { - const p = JSON.parse(require('fs').readFileSync(process.argv[1], 'utf8')); - const ef = p.expected_failure; - if (!ef) { process.stdout.write(''); return; } - const v = ef[process.argv[2]]; - process.stdout.write(v == null ? '' : Array.isArray(v) ? v.join(',') : String(v)); - })(); - " "${E2E_CONTEXT_DIR}/plan.json" "${key}" -} - -EXPECTED_FAILURE_PHASE="$(read_plan_failure_field phase)" - -if [[ -n "${EXPECTED_FAILURE_PHASE}" ]]; then - expected_error_class="$(read_plan_failure_field error_class)" - negative_log="${E2E_CONTEXT_DIR}/negative-${EXPECTED_FAILURE_PHASE}.log" - sandbox_name="$(e2e_context_get E2E_SANDBOX_NAME)" - - # Snapshot the side-effect baseline BEFORE forcing the failure so we only - # report effects newly introduced by this scenario. A pre-existing gateway - # or credentials file from an earlier run would otherwise look like a fresh - # side effect and falsely fail negative scenarios in dirty environments. - baseline_sandbox=0 - if [[ -n "${sandbox_name}" ]] && openshell sandbox list 2>/dev/null | grep -Fq "${sandbox_name}"; then - baseline_sandbox=1 - fi - baseline_gateway=0 - if nemoclaw gateway status >/dev/null 2>&1; then - baseline_gateway=1 - fi - baseline_credentials=0 - if [[ -s "${HOME}/.nemoclaw/credentials.json" ]]; then - baseline_credentials=1 - fi - - # Force the failure mode declared by the scenario. Only `preflight` / - # `docker-missing` is implemented here; other phases are accepted by the - # schema but their forcing logic lands alongside the first consumer. - case "${EXPECTED_FAILURE_PHASE}:${expected_error_class}" in - preflight:docker-missing) - if [[ "${DRY_RUN}" -eq 1 ]]; then - printf 'Cannot connect to the Docker daemon during preflight\n' >"${negative_log}" - else - if DOCKER_HOST="unix:///tmp/nemoclaw-e2e-missing-docker.sock" \ - e2e_onboard "${ONBOARDING_ID}" >"${negative_log}" 2>&1; then - echo "run-scenario: expected preflight failure, but onboarding succeeded" >&2 - cat "${negative_log}" >&2 - exit 4 - fi - fi - ;; - *) - echo "run-scenario: expected_failure phase=${EXPECTED_FAILURE_PHASE} class=${expected_error_class} has no forcing implementation yet" >&2 - exit 2 - ;; - esac - - # Compute the side-effect delta: only count effects that were absent in the - # baseline and present after the forced failure. - observed_side_effects="" - if [[ "${baseline_sandbox}" -eq 0 ]] && [[ -n "${sandbox_name}" ]] \ - && openshell sandbox list 2>/dev/null | grep -Fq "${sandbox_name}"; then - observed_side_effects="${observed_side_effects:+${observed_side_effects},}sandbox-created" - fi - if [[ "${baseline_gateway}" -eq 0 ]] && nemoclaw gateway status >/dev/null 2>&1; then - observed_side_effects="${observed_side_effects:+${observed_side_effects},}gateway-started" - fi - if [[ "${baseline_credentials}" -eq 0 ]] && [[ -s "${HOME}/.nemoclaw/credentials.json" ]]; then - observed_side_effects="${observed_side_effects:+${observed_side_effects},}credentials-written" - fi - - # `--observed-error-class` is intentionally omitted: the runner does not yet - # derive a structured error class from the actual failure output, and - # reporting the planned class back to the matcher would make the check - # tautological. The matcher logs this as a skipped check. - match_args=( - match-failure "${SCENARIO_ID}" - --context-dir "${E2E_CONTEXT_DIR}" - --log "${negative_log}" - --observed-phase "${EXPECTED_FAILURE_PHASE}" - ) - if [[ -n "${observed_side_effects}" ]]; then - match_args+=(--observed-side-effects "${observed_side_effects}") - fi - if ! run_resolver "${match_args[@]}"; then - echo "run-scenario: expected-failure match failed; see ${E2E_CONTEXT_DIR}/expected-vs-actual.json" >&2 - exit 4 - fi - echo "run-scenario: negative scenario passed (phase=${EXPECTED_FAILURE_PHASE} class=${expected_error_class})" - exit 0 -fi - -if [[ "${EXPECTED_STATE_ID}" == "preflight-failure-no-sandbox" ]]; then - negative_log="${E2E_CONTEXT_DIR}/negative-preflight.log" - sandbox_name="$(e2e_context_get E2E_SANDBOX_NAME)" - if [[ "${DRY_RUN}" -eq 1 ]]; then - printf 'Cannot connect to the Docker daemon during preflight\n' >"${negative_log}" - elif DOCKER_HOST="unix:///tmp/nemoclaw-e2e-missing-docker.sock" e2e_onboard "${ONBOARDING_ID}" >"${negative_log}" 2>&1; then - echo "run-scenario: expected preflight failure, but onboarding succeeded" >&2 - exit 4 - fi - if ! grep -Eiq "docker|container|daemon|socket|preflight" "${negative_log}"; then - echo "run-scenario: negative preflight failed without a clear Docker/preflight reason" >&2 - cat "${negative_log}" >&2 - exit 4 - fi - if openshell sandbox list 2>/dev/null | grep -Fq "${sandbox_name}"; then - echo "run-scenario: negative preflight left behind sandbox ${sandbox_name}" >&2 - exit 4 - fi - echo "run-scenario: negative preflight passed; Docker daemon unavailable and no sandbox was created" - exit 0 -fi - -if [[ "${FAILURE_STAGE}" == "onboarding" ]]; then - negative_log="${E2E_CONTEXT_DIR}/negative-onboarding.log" - sandbox_name="$(e2e_context_get E2E_SANDBOX_NAME)" - port_holder_started=0 - onboard_env=(NEMOCLAW_SANDBOX_NAME="${sandbox_name}" NEMOCLAW_RECREATE_SANDBOX=1 NEMOCLAW_POLICY_MODE=skip) - case "${ONBOARDING_ID}" in - cloud-openclaw-invalid-nvidia-key) - onboard_env+=(NVIDIA_API_KEY=not-a-nvidia-key) - ;; - cloud-openclaw-gateway-port-conflict) - conflict_port="$(read_plan_string dimensions.onboarding.profile.gateway_port)" - : "${conflict_port:=18080}" - if e2e_port_holder_start "${conflict_port}"; then - port_holder_started=1 - else - echo "run-scenario: could not start port holder on ${conflict_port}; continuing against any existing listener" >&2 - fi - onboard_env+=(NEMOCLAW_GATEWAY_PORT="${conflict_port}") - ;; - esac - if [[ "${DRY_RUN}" -eq 1 ]]; then - printf '%s -' "${FAILURE_MESSAGE_CONTAINS}" >"${negative_log}" - negative_status="${FAILURE_EXIT_CODE:-1}" - else - set +e - ( - export "${onboard_env[@]}" - e2e_onboard "${ONBOARDING_ID}" - ) >"${negative_log}" 2>&1 - negative_status=$? - set -e - fi - if [[ "${port_holder_started}" -eq 1 ]]; then - e2e_port_holder_stop - fi - if ! e2e_negative_assert_failure "${negative_log}" "${negative_status}" "${FAILURE_EXIT_CODE:-1}" "${FAILURE_MESSAGE_CONTAINS}" "$([[ "${FAILURE_NO_STACK_TRACE}" == "true" ]] && echo 1 || echo 0)"; then - exit 4 - fi - if openshell sandbox list 2>/dev/null | grep -Fq "${sandbox_name}"; then - echo "run-scenario: negative onboarding left behind sandbox ${sandbox_name}" >&2 - exit 4 - fi - echo "run-scenario: negative onboarding ${ONBOARDING_ID} passed" - exit 0 -fi - -DOCKER_OPTIONAL_UNAVAILABLE=0 -if [[ "${RUNTIME_CONTAINER_DAEMON}" == "optional" ]] && ! docker info >/dev/null 2>&1; then - DOCKER_OPTIONAL_UNAVAILABLE=1 - echo "SKIP: scenario.${SCENARIO_ID}.docker-dependent-suites Docker unavailable for optional runtime ${RUNTIME_ID}; gateway/sandbox/inference coverage skipped" - echo "run-scenario: Docker unavailable for optional runtime ${RUNTIME_ID}; scaling back to platform-only suites" -else - onboard_log="${E2E_CONTEXT_DIR}/onboard.log" - set +e - e2e_onboard "${ONBOARDING_ID}" >"${onboard_log}" 2>&1 - onboard_status=$? - set -e - if [[ "${onboard_status}" -ne 0 ]]; then - cat "${onboard_log}" >&2 - echo "run-scenario: onboarding ${ONBOARDING_ID} failed with status ${onboard_status}" >&2 - exit "${onboard_status}" - fi - if [[ "${RUNTIME_ID}" == "gpu-docker-cdi" ]] && ! e2e_env_is_dry_run; then - echo "run-scenario: GPU Docker CDI uses host-network gateway; validating gateway from suites" - else - e2e_gateway_assert_healthy - fi - e2e_sandbox_assert_running -fi - -# Expected state validation. The validator reads E2E_PROBE_OVERRIDE_* env -# variables to simulate real probe outputs in dry-run/test contexts. -# Live probe wiring lands scenario-by-scenario; by default, live runs move -# straight from setup checks to suites so migrated suite assertions can be -# debugged against the real environment. -if [[ "${E2E_VALIDATE_EXPECTED_STATE:-0}" == "1" || "${DRY_RUN}" -eq 1 ]]; then - validate_args=("${SCENARIO_ID}" --context-dir "${E2E_CONTEXT_DIR}") - if [[ "${DRY_RUN}" -eq 1 ]]; then - # CodeRabbit review item #9: explicitly opt in to seeding probes from - # the expected state in dry-run/test mode. Live runs go through real - # probes and must fail closed if any are missing. - validate_args+=(--probes-from-state) - fi - if ! run_resolver validate-state "${validate_args[@]}"; then - echo "run-scenario: expected-state validation failed; suites will NOT run" >&2 - exit 3 - fi -fi - -if [[ "${DRY_RUN}" -eq 1 ]]; then - echo "run-scenario: dry-run complete; context.env emitted under ${E2E_CONTEXT_DIR}" - exit 0 -fi - -SUITE_IDS=() -while IFS= read -r suite_id; do - SUITE_IDS+=("${suite_id}") -done < <(node -e " - try { - const planPath = process.argv[1]; - const p = JSON.parse(require('fs').readFileSync(planPath, 'utf8')); - if (!Array.isArray(p.suites)) { - throw new Error('missing or invalid suites array'); - } - const filter = process.env.E2E_SUITE_FILTER || ''; - const selected = filter ? filter.split(',').map((s) => s.trim()).filter(Boolean) : p.suites.map((s) => s.id); - for (const id of selected) console.log(id); - } catch (err) { - console.error('run-scenario: failed to parse plan.json ' + process.argv[1] + ': ' + err.message); - process.exit(1); - } -" "${E2E_CONTEXT_DIR}/plan.json") - -if [[ "${#SUITE_IDS[@]}" -eq 0 ]]; then - echo "run-scenario: no suites selected for ${SCENARIO_ID}" >&2 - exit 4 -fi - -if [[ "${DOCKER_OPTIONAL_UNAVAILABLE}" -eq 1 ]]; then - FILTERED_SUITE_IDS=() - for suite_id in "${SUITE_IDS[@]}"; do - case "${suite_id}" in - smoke | inference | credentials | hermes-specific | local-ollama-inference | ollama-proxy | gateway-health | sandbox-shell | cloud-inference | ollama-auth-proxy | security-credentials | messaging-telegram | messaging-discord | messaging-slack | security-shields | inference-routing | sandbox-lifecycle | sandbox-operations | snapshot | rebuild | upgrade | diagnostics | docs-validation | openai-compatible-inference | inference-switch | kimi-compatibility | messaging-token-rotation | security-policy | security-injection | model-router) - echo "SKIP: suite.${suite_id} skipped because optional Docker runtime ${RUNTIME_ID} is unavailable" - ;; - *) - FILTERED_SUITE_IDS+=("${suite_id}") - ;; - esac - done - SUITE_IDS=("${FILTERED_SUITE_IDS[@]}") -fi +cat >&2 <<'MSG' +run-scenario.sh is deprecated. Use the TS runner instead: -if [[ "${#SUITE_IDS[@]}" -eq 0 ]]; then - echo "run-scenario: all suites skipped for ${SCENARIO_ID}" >&2 - exit 0 -fi + npx tsx test/e2e-scenario/scenarios/run.ts --scenarios -bash "${SCRIPT_DIR}/run-suites.sh" "${SUITE_IDS[@]}" +Other run.ts modes (read-only): + --list List canonical scenario ids + --emit-matrix Emit GitHub Actions matrix payload from the registry + --plan-only Local debug: print the compiled plan, do not execute + (must NOT appear in any CI workflow) +MSG +exit 2 diff --git a/test/e2e-scenario/runtime/run-suites.sh b/test/e2e-scenario/runtime/run-suites.sh index e99c069408..dac69cd422 100755 --- a/test/e2e-scenario/runtime/run-suites.sh +++ b/test/e2e-scenario/runtime/run-suites.sh @@ -2,136 +2,20 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # -# Run one or more functional suites against a completed E2E environment. -# -# Usage: -# bash test/e2e-scenario/runtime/run-suites.sh [ ...] -# -# Reads suite metadata from test/e2e-scenario/validation_suites/suites.yaml -# (or $E2E_SUITES_FILE). Each suite script receives .e2e/context.env -# via E2E_CONTEXT_DIR and is expected to source runtime/lib/context.sh if -# it needs specific keys. -# -# Environment: -# E2E_CONTEXT_DIR Directory containing context.env (default: /.e2e) -# E2E_SUITES_FILE Override suites metadata file (for tests) -# E2E_SUITES_DIR Override the directory that suite scripts are resolved -# against (default: test/e2e-scenario/validation_suites/) -# E2E_DRY_RUN When 1, suite scripts run in dry-run mode themselves. -# -# Exit code: 0 if all steps pass; non-zero at the first failing step. +# DEPRECATED. Suite execution is now driven directly by the TS phase +# orchestrator (RuntimeOrchestrator -> PhaseOrchestrator.runShellStep) which +# spawns each migrated assertion step's implementation.ref shell script. +# There is no longer a YAML-walking bash suite runner. set -euo pipefail -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -E2E_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" -VALIDATION_SUITES_DIR="${E2E_ROOT}/validation_suites" - -if (($# == 0)); then - echo "run-suites: at least one suite id required" >&2 - echo "Usage: bash test/e2e-scenario/runtime/run-suites.sh [ ...]" >&2 - exit 2 -fi - -export E2E_CONTEXT_DIR="${E2E_CONTEXT_DIR:-${REPO_ROOT}/.e2e}" -SUITES_FILE="${E2E_SUITES_FILE:-${VALIDATION_SUITES_DIR}/suites.yaml}" -SUITES_DIR="${E2E_SUITES_DIR:-${VALIDATION_SUITES_DIR}}" - -CTX_FILE="${E2E_CONTEXT_DIR}/context.env" -if [[ ! -f "${CTX_FILE}" ]]; then - echo "run-suites: missing ${CTX_FILE}; run-scenario.sh must emit context before running suites" >&2 - exit 1 -fi - -# Sanity-check that the baseline scenario key is present. -if ! grep -q '^E2E_SCENARIO=' "${CTX_FILE}"; then - echo "run-suites: ${CTX_FILE} is missing required key E2E_SCENARIO" >&2 - exit 1 -fi - -# Resolve the suite step list by reading the YAML via node. -resolve_suite() { - local suite_id="$1" - node -e " - const fs = require('fs'); - const path = process.argv[1]; - const wanted = process.argv[2]; - const raw = fs.readFileSync(path, 'utf8'); - // Minimal YAML reader: prefer js-yaml if available; else fall back. - let yaml; - try { yaml = require('js-yaml'); } catch (_) { - process.stderr.write('run-suites: js-yaml required to parse suite metadata\n'); - process.exit(2); - } - const doc = yaml.load(raw); - if (!doc || !doc.suites || !doc.suites[wanted]) { - process.stderr.write('run-suites: unknown suite: ' + wanted + '\n'); - process.exit(3); - } - const steps = doc.suites[wanted].steps || []; - for (const s of steps) { - if (!s || typeof s.id !== 'string' || typeof s.script !== 'string') { - process.stderr.write('run-suites: malformed step in ' + wanted + '\n'); - process.exit(4); - } - process.stdout.write(s.id + '\t' + s.script + '\n'); - } - " "${SUITES_FILE}" "${suite_id}" -} - -declare -a FAILED_STEPS=() -declare -a PASSED_STEPS=() -OVERALL_STATUS=0 - -run_one_suite() { - local suite_id="$1" - echo "== suite: ${suite_id} ==" - local steps - if ! steps="$(resolve_suite "${suite_id}")"; then - OVERALL_STATUS=1 - return 1 - fi - if [[ -z "${steps}" ]]; then - echo " (no steps)" - return 0 - fi - while IFS=$'\t' read -r step_id script; do - [[ -z "${step_id}" ]] && continue - local full="${SUITES_DIR}/${script}" - echo " -> step: ${step_id} (${script})" - if [[ ! -f "${full}" ]]; then - echo " FAIL: script not found at ${full}" >&2 - FAILED_STEPS+=("${suite_id}/${step_id}") - OVERALL_STATUS=1 - return 1 - fi - if ! bash "${full}"; then - echo " FAIL: suite=${suite_id} step=${step_id}" >&2 - FAILED_STEPS+=("${suite_id}/${step_id}") - OVERALL_STATUS=1 - return 1 - fi - echo " PASS: ${step_id}" - PASSED_STEPS+=("${suite_id}/${step_id}") - done <<<"${steps}" -} - -for suite_id in "$@"; do - if ! run_one_suite "${suite_id}"; then - break - fi -done +cat >&2 <<'MSG' +run-suites.sh is deprecated. Suite assertions are now executed by +test/e2e-scenario/scenarios/orchestrators/phase.ts via child_process.spawn, +walking the typed assertionGroups defined in the scenario registry. -echo -echo "== suite summary ==" -# bash 3.2 (macOS) fails on "${arr[@]}" when the array is empty under `set -u`; -# use the `${arr[@]+...}` guard to expand to nothing when empty. -for p in ${PASSED_STEPS[@]+"${PASSED_STEPS[@]}"}; do - echo " PASS ${p}" -done -for f in ${FAILED_STEPS[@]+"${FAILED_STEPS[@]}"}; do - echo " FAIL ${f}" -done +Run scenarios via: -exit "${OVERALL_STATUS}" + npx tsx test/e2e-scenario/scenarios/run.ts --scenarios +MSG +exit 2 diff --git a/test/e2e-scenario/scenarios/assertions/environment.ts b/test/e2e-scenario/scenarios/assertions/environment.ts deleted file mode 100644 index be7a62e6fb..0000000000 --- a/test/e2e-scenario/scenarios/assertions/environment.ts +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -import type { AssertionGroup } from "../types.ts"; - -export function environmentBaseline(): AssertionGroup { - return { - id: "environment.baseline", - phase: "environment", - description: "Skeleton environment baseline assertion group.", - migrationStatus: "complete", - steps: [ - { - id: "environment.plan.skeleton", - phase: "environment", - description: "Placeholder step until live environment orchestration is migrated.", - implementation: { kind: "pending", ref: "phase-1-skeleton" }, - evidencePath: ".e2e/environment.result.json", - }, - ], - }; -} diff --git a/test/e2e-scenario/scenarios/assertions/onboarding.ts b/test/e2e-scenario/scenarios/assertions/onboarding.ts deleted file mode 100644 index 9886a701fb..0000000000 --- a/test/e2e-scenario/scenarios/assertions/onboarding.ts +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -import type { AssertionGroup } from "../types.ts"; - -export function onboardingBaseline(): AssertionGroup { - return { - id: "onboarding.baseline", - phase: "onboarding", - description: "Skeleton onboarding assertion group.", - steps: [ - { - id: "onboarding.plan.skeleton", - phase: "onboarding", - description: "Placeholder step until onboarding assertions are migrated.", - implementation: { kind: "pending", ref: "phase-1-skeleton" }, - evidencePath: ".e2e/onboarding.result.json", - }, - ], - }; -} diff --git a/test/e2e-scenario/scenarios/assertions/registry.ts b/test/e2e-scenario/scenarios/assertions/registry.ts index 5123c6c731..c4457cb9ed 100644 --- a/test/e2e-scenario/scenarios/assertions/registry.ts +++ b/test/e2e-scenario/scenarios/assertions/registry.ts @@ -3,7 +3,6 @@ import fs from "node:fs"; import path from "node:path"; -import { environmentBaseline } from "./environment.ts"; import type { AssertionGroup, AssertionStep, PhaseName, ScenarioDefinition } from "../types.ts"; type Reliability = AssertionStep["reliability"]; @@ -25,22 +24,42 @@ function shellStep(input: ShellStepInput): AssertionStep { }; } -function probeStep(id: string, phase: PhaseName, ref: string, reliability?: Reliability): AssertionStep { +interface ProbeStepOptions { + reliability?: Reliability; + // When true, an unregistered probe fails the phase (and the run) + // instead of skipping. Use for security-sensitive probes the run + // is not safe without. + required?: boolean; +} + +function probeStep( + id: string, + phase: PhaseName, + ref: string, + options: ProbeStepOptions = {}, +): AssertionStep { return { id, phase, implementation: { kind: "probe", ref }, evidencePath: `.e2e/assertions/${id}.json`, - reliability, + reliability: options.reliability, + required: options.required, }; } -function pendingStep(id: string, phase: PhaseName, ref: string): AssertionStep { +function pendingStep( + id: string, + phase: PhaseName, + ref: string, + options: { required?: boolean } = {}, +): AssertionStep { return { id, phase, implementation: { kind: "pending", ref }, evidencePath: `.e2e/assertions/${id}.json`, + required: options.required, }; } @@ -186,7 +205,21 @@ export const runtimeControlGroups: AssertionGroup[] = [ phase: "runtime", description: "Negative scenario runtime check ensuring forbidden side effects did not occur.", migrationStatus: "complete", - steps: [pendingStep("runtime.expected-failure.no-side-effects", "runtime", "expectedFailureNoSideEffectsProbe")], + steps: [ + pendingStep( + "runtime.expected-failure.no-side-effects", + "runtime", + "expectedFailureNoSideEffectsProbe", + // Negative scenarios assert that a declared failure mode + // produced no forbidden side effects. Until the side-effect + // validator is implemented, this step must fail closed for + // any scenario that opts into runtimeControlGroups[0] + // (i.e. scenario.expectedFailure is set). Skipping it would + // let negative scenarios silently "pass" without verifying + // their core contract. + { required: true }, + ), + ], }, ]; @@ -219,9 +252,19 @@ export const validationSuiteGroups: AssertionGroup[] = [ ]), suiteGroup("credentials", credentialsSteps), suiteGroup("security-credentials", credentialsSteps), - suiteGroup("security-shields", [probeStep("security.shields.config", "runtime", "shieldsConfigProbe")]), - suiteGroup("security-policy", [probeStep("security.policy.enforced", "runtime", "networkPolicyProbe")]), - suiteGroup("security-injection", [probeStep("security.injection.blocked", "runtime", "injectionBlockedProbe")]), + // Security-sensitive probes MUST fail closed until the probe + // registry lands. A skipped shields/policy/injection check would + // produce fake-green for the exact suites these scenarios exist to + // protect. + suiteGroup("security-shields", [ + probeStep("security.shields.config", "runtime", "shieldsConfigProbe", { required: true }), + ]), + suiteGroup("security-policy", [ + probeStep("security.policy.enforced", "runtime", "networkPolicyProbe", { required: true }), + ]), + suiteGroup("security-injection", [ + probeStep("security.injection.blocked", "runtime", "injectionBlockedProbe", { required: true }), + ]), suiteGroup("messaging-telegram", [ shellStep({ id: "messaging.telegram.injection-safety", phase: "runtime", ref: "test/e2e-scenario/validation_suites/messaging/telegram/00-telegram-injection-safety.sh", reliability: { timeoutSeconds: 30, retry: { attempts: 2, on: ["external-tunnel"] } } }), shellStep({ id: "messaging.telegram.injection-payload-classes", phase: "runtime", ref: "test/e2e-scenario/validation_suites/messaging/telegram/01-telegram-injection-payload-classes.sh", reliability: { timeoutSeconds: 30, retry: { attempts: 2, on: ["external-tunnel"] } } }), @@ -257,7 +300,7 @@ export const validationSuiteGroups: AssertionGroup[] = [ ]; export const assertionRegistry = { - groups: [environmentBaseline(), ...onboardingAssertionGroups, ...runtimeControlGroups, ...validationSuiteGroups], + groups: [...onboardingAssertionGroups, ...runtimeControlGroups, ...validationSuiteGroups], }; export function assertionGroupForSuite(suiteId: string): AssertionGroup | undefined { @@ -285,8 +328,15 @@ function supplementalSuiteIdsForScenario(scenario: ScenarioDefinition): string[] "sandbox-lifecycle", "sandbox-operations", "snapshot", - "rebuild", - "upgrade", + // 'rebuild' and 'upgrade' are intentionally NOT supplemental + // here. Those suites assert post-rebuild state (marker survival, + // version upgrade, post-rebuild inference) and require a real + // `nemoclaw rebuild` to have run first. The dedicated + // `ubuntu-rebuild-openclaw` scenario opts into them by declaring + // a `rebuild-current-version` lifecycle profile that performs + // the rebuild before the runtime phase. Including them on this + // scenario produced fake-failures (no rebuild ran, so nothing + // could be preserved) and obscured the real coverage gap. "diagnostics", "docs-validation", ); @@ -352,8 +402,11 @@ export function assertionGroupsForScenario(scenario: ScenarioDefinition): Assert return group; }); + // Environment phase work is performed by typed PhaseAction entries + // (context.emit + install.) emitted from compiler.phaseActions(), + // not by assertion groups. No environment-phase assertion group is + // included in scenario plans. const groups: (AssertionGroup | undefined)[] = [ - environmentBaseline(), ...onboardingGroups, ...suiteGroups, ...supplementalGroups, diff --git a/test/e2e-scenario/scenarios/assertions/runtime.ts b/test/e2e-scenario/scenarios/assertions/runtime.ts deleted file mode 100644 index 5ed7031279..0000000000 --- a/test/e2e-scenario/scenarios/assertions/runtime.ts +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -import type { AssertionGroup } from "../types.ts"; - -export function runtimeSmokeSkeleton(): AssertionGroup { - return { - id: "runtime.smoke.skeleton", - phase: "runtime", - description: "Skeleton runtime smoke assertion group.", - steps: [ - { - id: "runtime.plan.skeleton", - phase: "runtime", - description: "Placeholder step until validation suites are migrated.", - implementation: { kind: "pending", ref: "phase-1-skeleton" }, - evidencePath: ".e2e/runtime.result.json", - }, - ], - }; -} diff --git a/test/e2e-scenario/scenarios/builder.ts b/test/e2e-scenario/scenarios/builder.ts index b2b9243a51..d4c2327e84 100644 --- a/test/e2e-scenario/scenarios/builder.ts +++ b/test/e2e-scenario/scenarios/builder.ts @@ -60,7 +60,7 @@ export class ScenarioBuilder { return this; } - expectedFailure(expectedFailure: Record): ScenarioBuilder { + expectedFailure(expectedFailure: import("./types.ts").ExpectedFailureContract): ScenarioBuilder { this.definition.expectedFailure = expectedFailure; return this; } diff --git a/test/e2e-scenario/scenarios/compiler.ts b/test/e2e-scenario/scenarios/compiler.ts index 5046c77dd2..8d46d419d1 100644 --- a/test/e2e-scenario/scenarios/compiler.ts +++ b/test/e2e-scenario/scenarios/compiler.ts @@ -4,11 +4,33 @@ import fs from "node:fs"; import path from "node:path"; import { fileURLToPath } from "node:url"; +import { getExpectedState, probesForState } from "./expected-states.ts"; import { loadManifest } from "./manifests.ts"; import { requireScenarios } from "./registry.ts"; -import type { AssertionGroup, NemoClawInstanceManifest, PhaseName, RunPlan, ScenarioDefinition, SutBoundary } from "./types.ts"; +import type { + AssertionGroup, + ExpectedFailureContract, + ExpectedFailurePhase, + NemoClawInstanceManifest, + PhaseAction, + PhaseName, + RunPlan, + ScenarioDefinition, + SutBoundary, +} from "./types.ts"; -const PHASES: PhaseName[] = ["environment", "onboarding", "runtime"]; +// Phase order. state-validation runs after onboarding and before +// runtime so gateway/sandbox/cli probes gate suite execution: a +// failed probe is a failed phase action, and the existing runner +// short-circuit reports runtime as skipped without re-running +// suite assertions against a missing/wedged environment. +const PHASES: PhaseName[] = [ + "environment", + "onboarding", + "state-validation", + "lifecycle", + "runtime", +]; const REPO_ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../.."); function groupsForPhase(scenario: ScenarioDefinition, phase: PhaseName): AssertionGroup[] { @@ -67,19 +89,198 @@ function validateManifestCompatibility(scenario: ScenarioDefinition, manifest?: } } -function phaseActions(phase: PhaseName, scenario: ScenarioDefinition): string[] { +// Centralized paths to the existing shell helpers. Spec rule: shell +// scripts can remain as implementations, but invocation goes through +// typed assertion/action definitions, not bare workflow YAML or a +// resurrected bash runner. +const INSTALL_DISPATCH = "test/e2e-scenario/nemoclaw_scenarios/install/dispatch.sh"; +const ONBOARD_DISPATCH = "test/e2e-scenario/nemoclaw_scenarios/onboard/dispatch.sh"; +const PROBES_DISPATCH = "test/e2e-scenario/nemoclaw_scenarios/probes/dispatch.sh"; +const LIFECYCLE_DISPATCH = "test/e2e-scenario/nemoclaw_scenarios/lifecycle/dispatch.sh"; + +// Default action timeouts. Install and onboarding can take a while on +// cold runners (Docker pulls, image builds, sandbox bootstrap). +const INSTALL_TIMEOUT_SECONDS = 900; +const ONBOARD_TIMEOUT_SECONDS = 900; +// Lifecycle actions wrap state-mutation flows like `nemoclaw rebuild`, +// which can take longer than onboarding when an image rebuild is +// involved (workspace snapshot + recreate + verify). +const LIFECYCLE_TIMEOUT_SECONDS = 900; +// State-validation probes are cheap (`command -v`, single curl, +// `nemoclaw list`); a tight timeout keeps a wedged probe from +// consuming runner budget. +const PROBE_TIMEOUT_SECONDS = 30; + +// Declared parent-env secrets each onboarding profile actually needs. +// Anything not listed here (and not in the framework allowlist) is +// dropped before spawn by buildChildEnv. Keep this list minimal — +// every entry widens the secret blast radius if the child or one of +// its descendants logs unredacted output. +const ONBOARD_PROFILE_SECRET_ENV: Readonly> = { + // Cloud profiles invoke `nemoclaw onboard` which authenticates to the + // NVIDIA cloud provider via NVIDIA_API_KEY. + "cloud-openclaw": ["NVIDIA_API_KEY"], + "cloud-openclaw-custom-policies": ["NVIDIA_API_KEY"], + "cloud-openclaw-invalid-nvidia-key": ["NVIDIA_API_KEY"], + "cloud-openclaw-gateway-port-conflict": ["NVIDIA_API_KEY"], + // Negative scenario: nemoclaw onboard runs against a docker shim that + // exits non-zero. Onboard never reaches the cloud auth step, but the + // CLI still loads NVIDIA_API_KEY when present — keep it in the secret + // env so behavior matches a real user invocation. + "cloud-openclaw-no-docker": ["NVIDIA_API_KEY"], + "cloud-hermes": ["NVIDIA_API_KEY"], + "cloud-hermes-discord": ["NVIDIA_API_KEY"], + "cloud-hermes-slack": ["NVIDIA_API_KEY"], + // Local profiles do not need any cloud secret. + "local-ollama-openclaw": [], +}; + +function phaseActions(phase: PhaseName, scenario: ScenarioDefinition): PhaseAction[] { if (phase === "environment") { + if (!scenario.environment) { + // Scenarios without any environment dimension (skeleton scenarios) + // legitimately have no actions yet. Don't fail-fast here. + return []; + } + const installId = scenario.environment.install; + if (!installId) { + // Environment is declared but install is missing - that IS a + // malformed scenario; fail fast so the caller sees a clear error + // rather than a phase that silently no-ops setup work. + throw new Error(`Scenario ${scenario.id} is missing environment.install`); + } return [ - `install:${scenario.environment?.install ?? "unknown"}`, - `runtime:${scenario.environment?.runtime ?? "unknown"}`, + { + id: `environment.install.${installId}`, + phase: "environment", + description: `Run e2e_install ${installId} to set up the host control plane.`, + kind: "shell-fn", + scriptRef: INSTALL_DISPATCH, + fn: "e2e_install", + arg: installId, + timeoutSeconds: INSTALL_TIMEOUT_SECONDS, + evidencePath: `.e2e/actions/environment.install.${installId}.log`, + }, ]; } if (phase === "onboarding") { - return [`onboard:${scenario.environment?.onboarding ?? "unknown"}`]; + if (!scenario.environment) { + return []; + } + const baseOnboardingId = scenario.environment.onboarding; + if (!baseOnboardingId) { + throw new Error(`Scenario ${scenario.id} is missing environment.onboarding`); + } + // Negative-runtime scenarios route to a dedicated onboarding profile + // that sets up the failure condition (e.g. docker-missing) BEFORE + // invoking `nemoclaw onboard` and captures the resulting output to + // the log file the assertion phase reads. The profile id convention + // is `-no-docker`. New negative profiles register a worker in + // nemoclaw_scenarios/onboard/dispatch.sh and a secret-env mapping + // above. + const onboardingId = + scenario.environment.runtime === "docker-missing" + ? `${baseOnboardingId}-no-docker` + : baseOnboardingId; + // secretEnv defaults to [] (no parent-env secrets pass through) + // unless the profile is explicitly listed above. Unknown profiles + // get the safest setting and surface the gap loudly the first + // time they actually need a secret to authenticate. + const secretEnv = ONBOARD_PROFILE_SECRET_ENV[onboardingId] ?? []; + return [ + { + id: `onboarding.profile.${onboardingId}`, + phase: "onboarding", + description: `Run e2e_onboard ${onboardingId} to bring the gateway and sandbox online.`, + kind: "shell-fn", + scriptRef: ONBOARD_DISPATCH, + fn: "e2e_onboard", + arg: onboardingId, + timeoutSeconds: ONBOARD_TIMEOUT_SECONDS, + evidencePath: `.e2e/actions/onboarding.profile.${onboardingId}.log`, + // Legacy preflight assertions look for ${E2E_CONTEXT_DIR}/onboard.log; + // publish a stable alias so they keep working without rewiring. + aliasPath: "onboard.log", + secretEnv, + }, + ]; + } + if (phase === "state-validation") { + // State-validation actions are emitted from the typed expected-state + // registry, NOT from the legacy expected-states.yaml. The compiler + // stays a pure function over typed inputs; YAML-vs-typed parity is + // enforced by a framework test, not by re-reading the YAML at + // compile time. + if (!scenario.expectedStateId) { + // Scenarios without an expected state (older skeleton scenarios) + // legitimately have no probes; do not fail-fast. + return []; + } + const state = getExpectedState(scenario.expectedStateId); + if (!state) { + // The compiler treats an unknown expected_state id as a hard + // error: typed scenarios must reference a typed state. The + // legacy YAML resolver has its own validation path; this is a + // separate (and stricter) contract for the typed runner. + throw new Error( + `Scenario ${scenario.id} references unknown expected_state '${scenario.expectedStateId}'`, + ); + } + return probesForState(state).map((probeId) => ({ + id: `state-validation.${probeId}`, + phase: "state-validation", + description: `Probe ${probeId} from expected_state '${state.id}'.`, + kind: "shell-fn", + scriptRef: PROBES_DISPATCH, + fn: "e2e_state_probe", + arg: probeId, + timeoutSeconds: PROBE_TIMEOUT_SECONDS, + evidencePath: `.e2e/actions/state-validation.${probeId}.log`, + })); } - return (scenario.suiteIds ?? []).map((suiteId) => `suite:${suiteId}`); + if (phase === "lifecycle") { + // Lifecycle is the post-onboarding state-mutation phase: rebuild, + // upgrade, snapshot+restore, etc. Scenarios opt in by declaring + // `environment.lifecycle = `; everything else gets + // an empty action list and runs no lifecycle assertions. The + // profile id routes through nemoclaw_scenarios/lifecycle/dispatch.sh + // to a worker that mutates state and seeds context.env keys + // (E2E_REBUILD_MARKER_PATH, E2E_REBUILD_MARKER_EXPECTED, ...) the + // runtime-phase assertions in rebuild_upgrade.sh consume. + if (!scenario.environment?.lifecycle) { + return []; + } + const lifecycleId = scenario.environment.lifecycle; + const secretEnv = LIFECYCLE_PROFILE_SECRET_ENV[lifecycleId] ?? []; + return [ + { + id: `lifecycle.profile.${lifecycleId}`, + phase: "lifecycle", + description: `Run e2e_lifecycle ${lifecycleId} to drive the post-onboard state mutation.`, + kind: "shell-fn", + scriptRef: LIFECYCLE_DISPATCH, + fn: "e2e_lifecycle", + arg: lifecycleId, + timeoutSeconds: LIFECYCLE_TIMEOUT_SECONDS, + evidencePath: `.e2e/actions/lifecycle.profile.${lifecycleId}.log`, + secretEnv, + }, + ]; + } + // Runtime phase has no actions; suites are assertion groups. + return []; } +// Declared parent-env secrets each lifecycle profile needs. Mirrors +// ONBOARD_PROFILE_SECRET_ENV: minimal allowlist; widen only when a +// profile actually invokes a CLI that authenticates upstream. +const LIFECYCLE_PROFILE_SECRET_ENV: Readonly> = { + // `nemoclaw rebuild` re-reads NVIDIA_API_KEY when the post-rebuild + // sandbox is brought back up; keep it in the secret env so behavior + // matches a real user invocation. + "rebuild-current-version": ["NVIDIA_API_KEY"], +}; + const SUT_BOUNDARIES: SutBoundary[] = [ { id: "host-cli", client: "HostCliClient" }, { id: "gateway", client: "GatewayClient" }, @@ -89,6 +290,41 @@ const SUT_BOUNDARIES: SutBoundary[] = [ { id: "state", client: "StateClient" }, ]; +// Negative scenarios advertise their failure mode against one of these +// user-facing phases. "preflight" is intentionally distinct from the +// internal PhaseName union: scenario manifests speak the user's vocab +// ("preflight failed") and the matcher resolves preflight to the +// onboarding phase orchestrator. See orchestrators/negative-matcher.ts. +const EXPECTED_FAILURE_PHASES: readonly ExpectedFailurePhase[] = [ + "environment", + "onboarding", + "runtime", + "preflight", +]; + +function validateExpectedFailure(scenarioId: string, contract: ExpectedFailureContract): void { + if (!EXPECTED_FAILURE_PHASES.includes(contract.phase)) { + throw new Error( + `Scenario ${scenarioId} expectedFailure.phase invalid: ${String(contract.phase)} (allowed: ${EXPECTED_FAILURE_PHASES.join(", ")})`, + ); + } + if (typeof contract.errorClass !== "string" || contract.errorClass.trim().length === 0) { + throw new Error(`Scenario ${scenarioId} expectedFailure.errorClass must be a non-empty string`); + } + if (contract.forbiddenSideEffects !== undefined) { + if (!Array.isArray(contract.forbiddenSideEffects)) { + throw new Error(`Scenario ${scenarioId} expectedFailure.forbiddenSideEffects must be an array`); + } + for (const entry of contract.forbiddenSideEffects) { + if (typeof entry !== "string" || entry.trim().length === 0) { + throw new Error( + `Scenario ${scenarioId} expectedFailure.forbiddenSideEffects entries must be non-empty strings`, + ); + } + } + } +} + export function validateRunPlan(plan: RunPlan): void { if (!plan.scenarioId) { throw new Error("RunPlan missing scenarioId"); @@ -101,6 +337,9 @@ export function validateRunPlan(plan: RunPlan): void { if (plan.sutBoundaries.length === 0) { throw new Error(`RunPlan ${plan.scenarioId} missing SUT boundaries`); } + if (plan.expectedFailure) { + validateExpectedFailure(plan.scenarioId, plan.expectedFailure); + } } export function compileRunPlans(inputs: Array): RunPlan[] { @@ -112,7 +351,7 @@ export function compileRunPlans(inputs: Array): Run const plan: RunPlan = { scenarioId: scenario.id, status: "compiled", - note: "compiled plan-only preview; live execution lands in later phases", + note: "compiled plan; phase orchestrators execute actions then assertions", manifestPath: scenario.manifestPath, manifest, environment: scenario.environment, @@ -182,6 +421,18 @@ export function renderPlanText(plans: RunPlan[]): string { } for (const phase of plan.phases) { lines.push(`Phase: ${phase.name}`); + for (const action of phase.actions) { + const policy: string[] = []; + if (action.timeoutSeconds) { + policy.push(`timeout=${action.timeoutSeconds}s`); + } + const target = action.kind === "shell-fn" + ? `${action.fn ?? ""}${action.arg ? ` ${action.arg}` : ""}`.trim() + : action.scriptRef; + const policySuffix = policy.length > 0 ? ` (${policy.join(", ")})` : ""; + const targetSuffix = target ? ` -> ${target}` : ""; + lines.push(` Action: ${action.id}${policySuffix}${targetSuffix}`); + } for (const group of phase.assertionGroups) { lines.push(` Group: ${group.id}`); for (const step of group.steps) { diff --git a/test/e2e-scenario/scenarios/expected-states.ts b/test/e2e-scenario/scenarios/expected-states.ts new file mode 100644 index 0000000000..539c520f22 --- /dev/null +++ b/test/e2e-scenario/scenarios/expected-states.ts @@ -0,0 +1,133 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { ExpectedState, StateProbeId } from "./types.ts"; + +// Typed mirror of nemoclaw_scenarios/expected-states.yaml. +// +// During the transition this registry is the source of truth for the +// TS runner. expected-states.yaml stays in place for the legacy bash +// resolver; a framework test verifies the typed registry covers the +// YAML's expected-state ids and matches their structural shape on the +// dimensions the typed runner probes today (cli, gateway, sandbox). +// Inference and credentials remain declared in YAML and in this typed +// registry, but the compiler skips emitting probe actions for them +// until the corresponding probe scripts land — see +// nemoclaw_scenarios/probes/. + +const cloudOpenclawReady: ExpectedState = { + id: "cloud-openclaw-ready", + cli: { installed: true }, + gateway: { expected: "present", health: "healthy" }, + sandbox: { expected: "present", status: "running", agent: "openclaw" }, + inference: { expected: "available", provider: "nvidia" }, + credentials: { expected: "present" }, +}; + +const cloudOpenclawCustomPoliciesReady: ExpectedState = { + ...cloudOpenclawReady, + id: "cloud-openclaw-custom-policies-ready", +}; + +const cloudHermesReady: ExpectedState = { + id: "cloud-hermes-ready", + cli: { installed: true }, + gateway: { expected: "present", health: "healthy" }, + sandbox: { expected: "present", status: "running", agent: "hermes" }, + inference: { expected: "available", provider: "nvidia" }, + credentials: { expected: "present" }, +}; + +const localOllamaOpenclawReady: ExpectedState = { + id: "local-ollama-openclaw-ready", + cli: { installed: true }, + gateway: { expected: "present", health: "healthy" }, + sandbox: { expected: "present", status: "running", agent: "openclaw" }, + inference: { expected: "available", provider: "ollama" }, + credentials: { expected: "present" }, +}; + +const macosCliReadyDockerOptional: ExpectedState = { + id: "macos-cli-ready-docker-optional", + cli: { installed: true }, + gateway: { expected: "optional", health: "optional" }, + sandbox: { expected: "optional", status: "optional", agent: "openclaw" }, + inference: { expected: "optional", provider: "nvidia" }, + credentials: { expected: "optional" }, +}; + +const preflightFailureNoSandbox: ExpectedState = { + id: "preflight-failure-no-sandbox", + cli: { installed: true }, + gateway: { expected: "absent" }, + sandbox: { expected: "absent" }, +}; + +const onboardingFailureInvalidNvidiaKey: ExpectedState = { + id: "onboarding-failure-invalid-nvidia-key", + cli: { installed: true }, + gateway: { expected: "absent" }, + sandbox: { expected: "absent" }, +}; + +const onboardingFailureGatewayPortConflict: ExpectedState = { + id: "onboarding-failure-gateway-port-conflict", + cli: { installed: true }, + gateway: { expected: "absent" }, + sandbox: { expected: "absent" }, +}; + +const REGISTRY: readonly ExpectedState[] = [ + cloudOpenclawReady, + cloudOpenclawCustomPoliciesReady, + cloudHermesReady, + localOllamaOpenclawReady, + macosCliReadyDockerOptional, + preflightFailureNoSandbox, + onboardingFailureInvalidNvidiaKey, + onboardingFailureGatewayPortConflict, +]; + +const BY_ID: ReadonlyMap = new Map(REGISTRY.map((state) => [state.id, state])); + +export function listExpectedStates(): readonly ExpectedState[] { + return REGISTRY; +} + +export function getExpectedState(id: string): ExpectedState | undefined { + return BY_ID.get(id); +} + +export function requireExpectedState(id: string): ExpectedState { + const state = BY_ID.get(id); + if (!state) { + const available = Array.from(BY_ID.keys()).join(", "); + throw new Error(`Unknown expected_state id '${id}' (available: ${available})`); + } + return state; +} + +// Translate the typed expected-state contract into the concrete probe +// ids the state-validation orchestrator emits. Inference and +// credentials probes are intentionally omitted today (probe scripts +// not yet implemented); their declarations remain in ExpectedState so +// the contract is visible in plan output and a future change can +// switch on emission without touching scenario data. "optional" +// dimensions emit no probe actions. +export function probesForState(state: ExpectedState): readonly StateProbeId[] { + const probes: StateProbeId[] = []; + if (state.cli?.installed === true) { + probes.push("cli-installed"); + } + if (state.gateway?.expected === "present" && state.gateway.health === "healthy") { + probes.push("gateway-healthy"); + } else if (state.gateway?.expected === "absent") { + probes.push("gateway-absent"); + } + if (state.sandbox?.expected === "present" && state.sandbox.status === "running") { + probes.push("sandbox-running"); + } else if (state.sandbox?.expected === "absent") { + probes.push("sandbox-absent"); + } + return probes; +} diff --git a/test/e2e-scenario/scenarios/matrix.ts b/test/e2e-scenario/scenarios/matrix.ts index dc869941c9..daea207dd4 100644 --- a/test/e2e-scenario/scenarios/matrix.ts +++ b/test/e2e-scenario/scenarios/matrix.ts @@ -26,3 +26,23 @@ export function brevLaunchableRemote(onboarding: string): ScenarioEnvironment { export function ubuntuRepoNoDocker(onboarding: string): ScenarioEnvironment { return { platform: "ubuntu-local", install: "repo-current", runtime: "docker-missing", onboarding }; } + +/** + * ubuntu-local + repo-current + docker-running + a lifecycle profile. + * Use for scenarios whose runtime assertions depend on a post-onboard + * state mutation (rebuild, upgrade, snapshot+restore). The lifecycle + * profile id maps to a worker under nemoclaw_scenarios/lifecycle/ via + * its dispatcher. + */ +export function ubuntuRepoDockerLifecycle( + onboarding: string, + lifecycle: string, +): ScenarioEnvironment { + return { + platform: "ubuntu-local", + install: "repo-current", + runtime: "docker-running", + onboarding, + lifecycle, + }; +} diff --git a/test/e2e-scenario/scenarios/orchestrators/context.ts b/test/e2e-scenario/scenarios/orchestrators/context.ts new file mode 100644 index 0000000000..35394121fc --- /dev/null +++ b/test/e2e-scenario/scenarios/orchestrators/context.ts @@ -0,0 +1,108 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import fs from "node:fs"; +import path from "node:path"; +import type { RunContext, RunPlan } from "../types.ts"; + +// Spec ownership: emitting the normalized context.env that downstream +// shell helpers consume is FRAMEWORK INFRASTRUCTURE, not a phase action. +// Doing it as a shell action coupled the typed runner back to the old +// resolver's plan.json shape; doing it here keeps the typed RunPlan as +// the single source of truth. +// +// We seed context.env with values derivable from the typed RunPlan +// (scenario id, install method, agent/provider/route, default sandbox +// name and gateway URL). Onboarding helpers may overwrite these via +// e2e_context_set (e.g. assigning a real sandbox name, real gateway +// URL after the gateway boots). + +function platformOsFromManifest(plan: RunPlan): string { + const explicit = plan.manifest?.spec.setup.platform.os; + if (typeof explicit === "string" && explicit.length > 0) { + return explicit; + } + // Fall back to the scenario environment platform id ("ubuntu-local", + // "macos-local", "wsl-local", "gpu-runner", "brev-launchable"). + const platform = plan.environment?.platform ?? ""; + if (platform.startsWith("macos")) return "macos"; + if (platform.startsWith("wsl")) return "wsl"; + if (platform.startsWith("brev")) return "ubuntu"; + if (platform.startsWith("gpu")) return "ubuntu"; + return "ubuntu"; +} + +function executionTargetFromManifest(plan: RunPlan): string { + const explicit = plan.manifest?.spec.setup.platform.executionTarget; + if (typeof explicit === "string" && explicit.length > 0) { + return explicit; + } + return plan.environment?.platform === "brev-launchable" ? "remote" : "local"; +} + +function containerEngine(plan: RunPlan): string { + const explicit = plan.manifest?.spec.setup.runtime.containerEngine; + return typeof explicit === "string" && explicit.length > 0 ? explicit : "docker"; +} + +function containerDaemon(plan: RunPlan): string { + const explicit = plan.manifest?.spec.setup.runtime.containerDaemon; + if (typeof explicit === "string" && explicit.length > 0) { + return explicit; + } + return plan.environment?.runtime === "docker-missing" ? "missing" : "running"; +} + +function defaultGatewayUrl(agent: string): string { + // Mirrors the historical defaults from emit-context-from-plan.sh so + // existing shell helpers see the same seed values they used to. + return agent === "hermes" ? "http://127.0.0.1:8642" : "http://127.0.0.1:18789"; +} + +function escapeContextValue(value: string): string { + // The context library accepts plain `KEY=value` lines without quoting. + // Reject newlines (would corrupt the file) and otherwise pass through. + if (value.includes("\n")) { + throw new Error(`context.env value for must not contain newline: ${JSON.stringify(value)}`); + } + return value; +} + +export interface ContextSeedResult { + path: string; + keys: string[]; +} + +export function seedContextEnv(ctx: RunContext, plan: RunPlan): ContextSeedResult { + const onboarding = plan.manifest?.spec.onboarding; + const agent = onboarding?.agent ?? "openclaw"; + const provider = onboarding?.provider ?? "nvidia"; + const inferenceRoute = onboarding?.modelRoute ?? "inference-local"; + const onboardingPath = plan.environment?.onboarding ?? "unknown"; + const installMethod = plan.environment?.install ?? "unknown"; + + const entries: Record = { + E2E_SCENARIO: plan.scenarioId, + E2E_PLATFORM_OS: platformOsFromManifest(plan), + E2E_EXECUTION_TARGET: executionTargetFromManifest(plan), + E2E_INSTALL_METHOD: installMethod, + E2E_CONTAINER_ENGINE: containerEngine(plan), + E2E_CONTAINER_DAEMON: containerDaemon(plan), + E2E_ONBOARDING_PATH: onboardingPath, + E2E_AGENT: agent, + E2E_PROVIDER: provider, + E2E_INFERENCE_ROUTE: inferenceRoute, + E2E_SANDBOX_NAME: `e2e-${plan.scenarioId}`, + E2E_GATEWAY_URL: defaultGatewayUrl(agent), + }; + + // Path matches the shell helper's e2e_context_init: ${E2E_CONTEXT_DIR}/context.env + const contextPath = path.join(ctx.contextDir, "context.env"); + fs.mkdirSync(ctx.contextDir, { recursive: true }); + const lines = Object.entries(entries) + .map(([key, value]) => `${key}=${escapeContextValue(value)}`) + .join("\n"); + fs.writeFileSync(contextPath, `${lines}\n`); + + return { path: contextPath, keys: Object.keys(entries) }; +} diff --git a/test/e2e-scenario/scenarios/orchestrators/lifecycle.ts b/test/e2e-scenario/scenarios/orchestrators/lifecycle.ts new file mode 100644 index 0000000000..509112f171 --- /dev/null +++ b/test/e2e-scenario/scenarios/orchestrators/lifecycle.ts @@ -0,0 +1,25 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { PhaseOrchestrator } from "./phase.ts"; + +/** + * Lifecycle phase orchestrator. + * + * Sits between state-validation and runtime. Drives post-onboard + * state mutations (rebuild, upgrade, snapshot+restore, ...) by + * executing the action(s) the compiler emits when a scenario declares + * `environment.lifecycle = `. The action's worker (under + * test/e2e-scenario/nemoclaw_scenarios/lifecycle/) seeds context.env + * keys (E2E_REBUILD_MARKER_PATH, E2E_REBUILD_MARKER_EXPECTED, ...) + * which the runtime-phase rebuild_upgrade.sh assertions consume. + * + * Scenarios without a lifecycle profile see this phase as a no-op: + * the compiler emits an empty action list, the orchestrator runs no + * assertions, and the runtime phase proceeds as before. + */ +export class LifecycleOrchestrator extends PhaseOrchestrator { + constructor() { + super("lifecycle"); + } +} diff --git a/test/e2e-scenario/scenarios/orchestrators/negative-matcher.ts b/test/e2e-scenario/scenarios/orchestrators/negative-matcher.ts new file mode 100644 index 0000000000..dbbe2b0956 --- /dev/null +++ b/test/e2e-scenario/scenarios/orchestrators/negative-matcher.ts @@ -0,0 +1,236 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { + ExpectedFailureContract, + ExpectedFailurePhase, + PhaseName, + PhaseResult, + RunPlan, +} from "../types.ts"; + +// Pure framework infrastructure: given a compiled RunPlan and the +// observed phase results, decide whether a negative scenario's +// declared failure contract was honored. Does not mutate inputs and +// does not perform I/O. +// +// Spec ownership boundaries: +// - Failure injection (uninstalling docker, planting a bad key, +// occupying a gateway port) is runner-environment prep, NOT this +// matcher's job. The matcher only inspects what actually happened. +// - Forbidden-side-effect verification (did a sandbox actually get +// created when the scenario forbids it?) belongs to the +// `expectedFailureNoSideEffectsProbe` implementation registered as +// a probe step. Until that probe lands, the runtime control group +// keeps the negative scenario visibly red via a `required: true` +// pending step. The matcher reports the contract status for +// phase + errorClass independently of the side-effect probe, and +// exposes whether forbiddenSideEffects were declared so callers can +// integrate both signals. + +export type NegativeContractMatchOutcome = + // Right phase, right errorClass match observed. + | "matched" + // Scenario expected a failure but every phase passed. + | "no-failure-observed" + // Wrong phase failed (e.g., expected onboarding, observed environment). + | "wrong-phase" + // Right phase, but the failure message did not advertise the + // declared errorClass. + | "wrong-error-class"; + +export interface NegativeContractObservation { + failedPhase?: PhaseName; + failedActionId?: string; + failedActionMessage?: string; + failedAssertionId?: string; + failedAssertionMessage?: string; +} + +export interface NegativeContractResult { + matched: boolean; + outcome: NegativeContractMatchOutcome; + expected: ExpectedFailureContract; + observed: NegativeContractObservation; + // Human-readable diagnostic; suitable for evidence logs and CI output. + message: string; +} + +// Internal id reserved for the runtime side-effect pending/probe step +// declared in assertions/registry.ts. The matcher excludes failures of +// that step from "observed failure" detection so the contract evaluation +// is not confused by its own enforcement scaffolding. +// +// As of the state-validation phase landing, forbidden side effects are +// observed by the typed gateway-absent / sandbox-absent probes during +// the state-validation phase, not by this pending step. The exclusion +// is kept to stay correct for any scenario that still references the +// legacy step id. +const SIDE_EFFECT_PROBE_STEP_ID = "runtime.expected-failure.no-side-effects"; + +// State-validation probe ids the matcher must skip when scanning for +// observed failures. For a negative scenario, these probes are real +// post-failure checks (gateway-absent, sandbox-absent) — their pass/fail +// status does NOT determine which phase advertised the original failure +// mode, only whether forbidden side effects occurred. +const STATE_VALIDATION_FORBIDDEN_PROBE_IDS: ReadonlySet = new Set([ + "state-validation.gateway-absent", + "state-validation.sandbox-absent", +]); + +// Map the user-facing expected failure phase to the internal phase +// orchestrator that owns it. Today preflight assertions live under +// onboarding (see assertions/registry.ts: onboarding.preflight.*). +function resolveExpectedPhase(phase: ExpectedFailurePhase): PhaseName { + if (phase === "preflight") { + return "onboarding"; + } + return phase; +} + +function isOwnPhaseResult(phase: PhaseResult["phase"]): phase is PhaseName { + return ( + phase === "environment" || + phase === "onboarding" || + phase === "state-validation" || + phase === "runtime" + ); +} + +function findFirstObservedFailure(results: readonly PhaseResult[]): NegativeContractObservation | undefined { + for (const result of results) { + if (!isOwnPhaseResult(result.phase)) { + continue; + } + // state-validation forbidden-side-effect probes (gateway-absent, + // sandbox-absent) are post-failure verification, not the failure + // mode itself; skip them when locating the originating failure. + // A failed cli-installed probe IS a real observed failure (the + // install action passed but the binary isn't reachable) and is + // not skipped. + const failedAction = result.actions.find( + (action) => + action.status === "failed" && !STATE_VALIDATION_FORBIDDEN_PROBE_IDS.has(action.id), + ); + if (failedAction) { + return { + failedPhase: result.phase, + failedActionId: failedAction.id, + failedActionMessage: failedAction.message, + }; + } + const failedAssertion = result.assertions.find( + (assertion) => assertion.status === "failed" && assertion.id !== SIDE_EFFECT_PROBE_STEP_ID, + ); + if (failedAssertion) { + return { + failedPhase: result.phase, + failedAssertionId: failedAssertion.id, + failedAssertionMessage: failedAssertion.message, + }; + } + } + return undefined; +} + +function errorClassMatches(message: string | undefined, errorClass: string): boolean { + if (!message) { + return false; + } + // Substring-with-case-fold match. Negative scenarios assert their + // failure mode by class name (e.g., "docker-missing", + // "invalid-nvidia-api-key"); we match either the literal class + // string or a normalized form where dashes/underscores/spaces are + // interchangeable. This stays a pure string check so the matcher + // can be fully tested in isolation. + const normalize = (value: string): string => value.toLowerCase().replace(/[\s_-]+/g, "-"); + return normalize(message).includes(normalize(errorClass)); +} + +function describeObservation(observation: NegativeContractObservation): string { + const parts: string[] = []; + if (observation.failedPhase) { + parts.push(`phase=${observation.failedPhase}`); + } + if (observation.failedActionId) { + parts.push(`action=${observation.failedActionId}`); + } + if (observation.failedAssertionId) { + parts.push(`assertion=${observation.failedAssertionId}`); + } + const message = observation.failedActionMessage ?? observation.failedAssertionMessage; + if (message) { + parts.push(`message="${message.slice(0, 240)}"`); + } + return parts.length > 0 ? parts.join(" ") : "no failure observed"; +} + +export function evaluateNegativeContract(plan: RunPlan, results: readonly PhaseResult[]): NegativeContractResult { + const expected = plan.expectedFailure; + if (!expected) { + throw new Error( + `evaluateNegativeContract called for scenario ${plan.scenarioId} which has no expectedFailure declared`, + ); + } + const expectedPhase = resolveExpectedPhase(expected.phase); + const observation = findFirstObservedFailure(results); + + if (!observation) { + return { + matched: false, + outcome: "no-failure-observed", + expected, + observed: {}, + message: `scenario ${plan.scenarioId} expected to fail in ${expected.phase} (errorClass=${expected.errorClass}), but all phases passed`, + }; + } + + if (observation.failedPhase !== expectedPhase) { + return { + matched: false, + outcome: "wrong-phase", + expected, + observed: observation, + message: `scenario ${plan.scenarioId} expected ${expected.phase} failure (errorClass=${expected.errorClass}); observed ${describeObservation(observation)}`, + }; + } + + const observedMessage = observation.failedActionMessage ?? observation.failedAssertionMessage; + if (!errorClassMatches(observedMessage, expected.errorClass)) { + return { + matched: false, + outcome: "wrong-error-class", + expected, + observed: observation, + message: `scenario ${plan.scenarioId} ${expected.phase} failure errorClass mismatch: expected="${expected.errorClass}" observed=${describeObservation(observation)}`, + }; + } + + return { + matched: true, + outcome: "matched", + expected, + observed: observation, + message: `scenario ${plan.scenarioId} negative contract matched: ${expected.phase}/${expected.errorClass} (${describeObservation(observation)})`, + }; +} + +// Convenience: build a synthetic PhaseResult for the runner to append +// to the per-phase results. Keeps run.ts and artifact writers honest +// (one shape, written through the same path as real phase results). +export function negativeContractPhaseResult(result: NegativeContractResult): PhaseResult { + return { + phase: "negative-contract", + status: result.matched ? "passed" : "failed", + actions: [], + assertions: [ + { + id: "negative-contract.match", + status: result.matched ? "passed" : "failed", + attempts: 1, + durationMs: 0, + message: result.message, + }, + ], + }; +} diff --git a/test/e2e-scenario/scenarios/orchestrators/phase.ts b/test/e2e-scenario/scenarios/orchestrators/phase.ts index ae59a58e62..ccde0ba73d 100644 --- a/test/e2e-scenario/scenarios/orchestrators/phase.ts +++ b/test/e2e-scenario/scenarios/orchestrators/phase.ts @@ -1,67 +1,306 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +import { spawn } from "node:child_process"; import fs from "node:fs"; import path from "node:path"; +import { fileURLToPath } from "node:url"; import type { AssertionResult, AssertionStep, + PhaseAction, + PhaseActionResult, PhaseName, PhaseResult, RunContext, RunPlanPhase, TransientClassifier, } from "../types.ts"; +import { lookupProbe } from "../probes/registry.ts"; +import type { ProbeContext } from "../probes/types.ts"; +import { buildChildEnv, pipeRedacted, redactString } from "./redaction.ts"; + +// Auto-register the built-in probes the moment the orchestrator is +// imported. This is a deliberate side-effect import: registry state is +// module-scoped and we want every entry point that runs assertions +// (run.ts, ScenarioRunner, framework tests) to see the same wired set +// without each one repeating the registration. +import { registerBuiltinProbes } from "../probes/builtin.ts"; +registerBuiltinProbes(); + +const REPO_ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../.."); +const DEFAULT_STEP_TIMEOUT_SECONDS = 300; interface StepAttemptOutcome { - status: "passed" | "failed"; + status: "passed" | "failed" | "skipped"; classifier?: TransientClassifier; message?: string; + evidence?: string; } -function transientForRef(ref: string): TransientClassifier { - if (ref.includes("provider") || ref.includes("transient")) { - return "provider-transient"; +// Heuristic transient classifier for shell step refs that don't print +// their own classifier hint. Phase orchestrators own classification; +// clients/scripts do not. +function classifierForRef(ref: string): TransientClassifier { + if (/provider|inference|chat-completion|cloudflared|tunnel/i.test(ref)) { + // Use case-insensitive matching here too; the outer guard is /i, so + // mixed-case refs (Tunnel, Cloudflared) must still classify as + // external-tunnel rather than fall through to provider-transient. + return /tunnel|cloudflared/i.test(ref) ? "external-tunnel" : "provider-transient"; } - if (ref.includes("gateway")) { + if (/gateway/i.test(ref)) { return "gateway-transient"; } + if (/event-capture|tui|chat-events/i.test(ref)) { + return "empty-event-capture"; + } return "runner-infra"; } +/** + * Build the typed ProbeContext handed to a probe runner. Mirrors the + * subset of state that shell steps already get via + * ${E2E_CONTEXT_DIR}/context.env, but parsed up front so probe code + * doesn't reach into the file system itself. + */ +function buildProbeContext(ctx: RunContext, step: AssertionStep): ProbeContext { + const contextEnvPath = path.join(ctx.contextDir, "context.env"); + const contextEnv: Record = {}; + if (fs.existsSync(contextEnvPath)) { + const raw = fs.readFileSync(contextEnvPath, "utf8"); + for (const line of raw.split("\n")) { + const trimmed = line.trim(); + if (!trimmed || trimmed.startsWith("#")) continue; + const eq = trimmed.indexOf("="); + if (eq <= 0) continue; + const key = trimmed.slice(0, eq); + let value = trimmed.slice(eq + 1); + if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) { + value = value.slice(1, -1); + } + contextEnv[key] = value; + } + } + const evidenceRel = step.evidencePath ?? `.e2e/assertions/${step.id}.json`; + const evidencePath = path.isAbsolute(evidenceRel) + ? evidenceRel + : path.join(ctx.contextDir, evidenceRel); + return { + contextDir: ctx.contextDir, + evidencePath, + contextEnv, + sandboxName: contextEnv.E2E_SANDBOX_NAME ?? null, + gatewayUrl: contextEnv.E2E_GATEWAY_URL ?? null, + repoRoot: REPO_ROOT, + }; +} + export class PhaseOrchestrator { constructor(private readonly phaseName: PhaseName) {} async run(ctx: RunContext, phase: RunPlanPhase): Promise { + const actions: PhaseActionResult[] = []; + let actionFailed = false; + for (const action of phase.actions) { + const actionResult = await this.runAction(ctx, action); + actions.push(actionResult); + if (actionResult.status === "failed") { + actionFailed = true; + // Spec failure-layer rule: setup failure must not let assertions + // run and accidentally pass. Stop the phase here. + break; + } + } const assertions: AssertionResult[] = []; - for (const group of phase.assertionGroups) { - for (const step of group.steps) { - assertions.push(await this.runStep(ctx, step)); + if (!actionFailed) { + for (const group of phase.assertionGroups) { + for (const step of group.steps) { + assertions.push(await this.runStep(ctx, step)); + } } } - const status = assertions.some((assertion) => assertion.status === "failed") ? "failed" : "passed"; - const result: PhaseResult = { phase: this.phaseName, status, assertions }; + const assertionsFailed = assertions.some((assertion) => assertion.status === "failed"); + const allSkipped = + !actionFailed && + assertions.length > 0 && + assertions.every((assertion) => assertion.status === "skipped"); + let status: PhaseResult["status"]; + if (actionFailed || assertionsFailed) { + status = "failed"; + } else if (allSkipped || (actions.length === 0 && assertions.length === 0)) { + status = "skipped"; + } else { + status = "passed"; + } + const result: PhaseResult = { phase: this.phaseName, status, actions, assertions }; this.writePhaseResult(ctx, result); return result; } + private async runAction(ctx: RunContext, action: PhaseAction): Promise { + const startedAt = Date.now(); + const scriptPath = path.isAbsolute(action.scriptRef) + ? action.scriptRef + : path.resolve(REPO_ROOT, action.scriptRef); + if (!fs.existsSync(scriptPath)) { + return { + id: action.id, + status: "failed", + durationMs: Date.now() - startedAt, + message: `phase action ${action.id} script not found: ${scriptPath}`, + }; + } + const timeoutSeconds = action.timeoutSeconds ?? DEFAULT_STEP_TIMEOUT_SECONDS; + const logDir = path.join(ctx.contextDir, ".e2e", "actions"); + fs.mkdirSync(logDir, { recursive: true }); + const logPath = path.join(logDir, `${action.id}.log`); + + // Compose the bash invocation. shell-fn sources the dispatcher and + // calls the named function with its single positional arg; shell + // executes the script directly. We always go through bash -lc so + // sourced shell helpers see a normal interactive-style env. + const dispatchAction = path.join(REPO_ROOT, "test/e2e-scenario/nemoclaw_scenarios/dispatch-action.sh"); + const useDispatchLauncher = action.kind === "shell-fn" && fs.existsSync(dispatchAction); + const bashArgs: string[] = useDispatchLauncher + ? [dispatchAction, action.fn ?? "", action.arg ?? "", scriptPath] + : [scriptPath, ...(action.arg ? [action.arg] : [])]; + + // Framework-owned secret hygiene at the spawn boundary. The child + // gets a minimal allowlisted env plus only the secrets this action + // explicitly declared via PhaseAction.secretEnv. See + // orchestrators/redaction.ts for the full contract. + const env = buildChildEnv(process.env, { + secretEnv: action.secretEnv, + frameworkOverlay: { + E2E_CONTEXT_DIR: ctx.contextDir, + E2E_PHASE: action.phase, + E2E_ACTION_ID: action.id, + }, + }); + + return await new Promise((resolve) => { + const child = spawn("bash", bashArgs, { env, cwd: REPO_ROOT, detached: true }); + const pgid = child.pid; + const logStream = fs.createWriteStream(logPath); + let stderrTail = ""; + // Every byte from the child passes through redactString before + // hitting the evidence log or the stderr tail; raw output never + // touches disk or PhaseActionResult.message. + pipeRedacted(child.stdout, logStream); + pipeRedacted(child.stderr, logStream, (redactedChunk) => { + stderrTail = (stderrTail + redactedChunk).slice(-4096); + }); + + const killGroup = (signal: NodeJS.Signals) => { + if (typeof pgid !== "number") { + child.kill(signal); + return; + } + try { + process.kill(-pgid, signal); + } catch { + /* group already gone */ + } + }; + + let timedOut = false; + const timeout = setTimeout(() => { + timedOut = true; + killGroup("SIGTERM"); + setTimeout(() => { + if (!child.killed) { + killGroup("SIGKILL"); + } + }, 5_000).unref(); + }, timeoutSeconds * 1_000); + + const finishLog = (): Promise => + new Promise((res) => { + if ((logStream as unknown as { closed?: boolean }).closed) { + res(); + return; + } + logStream.once("finish", () => res()); + logStream.once("error", () => res()); + logStream.end(); + }); + + child.on("error", (err) => { + clearTimeout(timeout); + void finishLog().then(() => + resolve({ + id: action.id, + status: "failed", + durationMs: Date.now() - startedAt, + evidence: logPath, + message: redactString(`phase action ${action.id} spawn error: ${err.message}`), + }), + ); + }); + + child.on("close", (code, signal) => { + clearTimeout(timeout); + void finishLog().then(() => { + const durationMs = Date.now() - startedAt; + if (timedOut) { + resolve({ + id: action.id, + status: "failed", + durationMs, + evidence: logPath, + message: `phase action ${action.id} exceeded ${timeoutSeconds}s (signal=${signal ?? "SIGTERM"})`, + }); + return; + } + if (code === 0) { + // Publish the action's evidence log under a stable alias for + // legacy assertions that reference fixed filenames + // (onboard.log, install.log, ...). Best-effort; alias copy + // failures do not fail the action. + if (action.aliasPath) { + try { + const aliasFull = path.isAbsolute(action.aliasPath) + ? action.aliasPath + : path.join(ctx.contextDir, action.aliasPath); + fs.mkdirSync(path.dirname(aliasFull), { recursive: true }); + fs.copyFileSync(logPath, aliasFull); + } catch { + /* alias is a convenience; never fail action on copy */ + } + } + resolve({ id: action.id, status: "passed", durationMs, evidence: logPath }); + return; + } + resolve({ + id: action.id, + status: "failed", + durationMs, + evidence: logPath, + message: `phase action ${action.id} exit ${code ?? "null"}: ${stderrTail.split("\n").slice(-3).join(" | ").trim()}`, + }); + }); + }); + }); + } + private async runStep(ctx: RunContext, step: AssertionStep): Promise { const startedAt = Date.now(); const rawAttempts = step.reliability?.retry?.attempts; - const maxAttempts = typeof rawAttempts === "number" && Number.isFinite(rawAttempts) ? Math.max(1, Math.floor(rawAttempts)) : 1; + const maxAttempts = + typeof rawAttempts === "number" && Number.isFinite(rawAttempts) ? Math.max(1, Math.floor(rawAttempts)) : 1; let attempts = 0; let lastOutcome: StepAttemptOutcome = { status: "failed", message: "step did not run" }; for (let attempt = 1; attempt <= maxAttempts; attempt += 1) { attempts = attempt; lastOutcome = await this.executeStep(ctx, step, attempt); - if (lastOutcome.status === "passed") { + if (lastOutcome.status === "passed" || lastOutcome.status === "skipped") { return { id: step.id, - status: "passed", + status: lastOutcome.status, attempts, durationMs: Date.now() - startedAt, classifier: attempt > 1 ? step.reliability?.retry?.on[0] : lastOutcome.classifier, - evidence: step.evidencePath, + evidence: lastOutcome.evidence ?? step.evidencePath, message: lastOutcome.message, }; } @@ -75,7 +314,7 @@ export class PhaseOrchestrator { attempts, durationMs: Date.now() - startedAt, classifier: lastOutcome.classifier, - evidence: step.evidencePath, + evidence: lastOutcome.evidence ?? step.evidencePath, message: lastOutcome.message, }; } @@ -92,26 +331,215 @@ export class PhaseOrchestrator { return step.reliability?.retry?.on.includes(classifier) ?? false; } - private async executeStep(_ctx: RunContext, step: AssertionStep, attempt: number): Promise { - const ref = step.implementation?.ref ?? ""; - if (ref === "fake-pass" || ref === "phase-1-skeleton") { - return { status: "passed" }; + private async executeStep(ctx: RunContext, step: AssertionStep, _attempt: number): Promise { + const kind = step.implementation?.kind; + if (kind === "shell") { + return this.runShellStep(ctx, step); } - if (ref === "fake-retry-once-pass") { - return attempt === 1 - ? { status: "failed", classifier: step.reliability?.retry?.on[0] ?? "gateway-transient" } - : { status: "passed" }; + if (kind === "probe") { + const ref = step.implementation?.ref ?? ""; + const probe = lookupProbe(ref); + if (!probe) { + // Probe is referenced by the typed registry but no + // implementation has been registered yet. Surface as + // skipped — unless the step is marked required, in which + // case fail closed so security-sensitive suites never + // pass on a missing probe. + if (step.required) { + return { + status: "failed", + classifier: "runner-infra", + message: `required probe not registered: ${ref} (step ${step.id})`, + }; + } + return { status: "skipped", message: `probe not registered: ${ref}` }; + } + const probeCtx = buildProbeContext(ctx, step); + try { + const outcome = await probe(probeCtx); + return { + status: outcome.status, + classifier: outcome.classifier, + message: outcome.message, + evidence: outcome.evidence ?? probeCtx.evidencePath, + }; + } catch (err) { + // Probes must not throw — but a thrown error must NEVER + // cause an unobservable failure. Convert to a failed + // outcome with a redacted message so the orchestrator's + // result aggregation still records evidence. + const message = err instanceof Error ? err.message : String(err); + return { + status: "failed", + message: redactString(`probe ${ref} threw: ${message}`), + evidence: probeCtx.evidencePath, + }; + } + } + if (kind === "pending") { + // pending steps surface as skipped with the placeholder ref so + // gaps are visible in plan output and phase results. Required + // pending steps (e.g. expected-failure side-effect validators + // for negative scenarios) fail closed instead — the run cannot + // honestly pass while the contract is unimplemented. + const ref = step.implementation?.ref ?? ""; + if (step.required) { + return { + status: "failed", + classifier: "runner-infra", + message: `required pending step not implemented: ${ref} (step ${step.id})`, + }; + } + return { status: "skipped", message: `pending: ${ref}` }; } - if (ref === "fake-always-transient") { - return { status: "failed", classifier: step.reliability?.retry?.on[0] ?? transientForRef(ref) }; + throw new Error(`Unknown assertion step kind for ${step.id}: ${String(kind)}`); + } + + private async runShellStep(ctx: RunContext, step: AssertionStep): Promise { + const ref = step.implementation?.ref; + if (!ref) { + return { status: "failed", message: `shell step ${step.id} missing implementation.ref` }; } - if (step.implementation?.kind === "shell" && _ctx.dryRun) { - return { status: "passed", message: `dry-run shell ${ref}` }; + const scriptPath = path.isAbsolute(ref) ? ref : path.resolve(REPO_ROOT, ref); + if (!fs.existsSync(scriptPath)) { + return { status: "failed", message: `shell step ${step.id} script not found: ${scriptPath}` }; } - if (step.implementation?.kind === "probe" && _ctx.dryRun) { - return { status: "passed", message: `dry-run probe ${ref}` }; + + const timeoutSeconds = step.reliability?.timeoutSeconds ?? DEFAULT_STEP_TIMEOUT_SECONDS; + const logDir = path.join(ctx.contextDir, ".e2e", "logs"); + fs.mkdirSync(logDir, { recursive: true }); + const logPath = path.join(logDir, `${step.id}.log`); + + // Framework-owned secret hygiene at the spawn boundary (mirrors + // runAction). The shell step's child gets only the framework + // allowlist + scenario context.env keys + step.secretEnv + // declarations. See orchestrators/redaction.ts. + const env = buildChildEnv(process.env, { + secretEnv: step.secretEnv, + frameworkOverlay: { + E2E_CONTEXT_DIR: ctx.contextDir, + E2E_STEP_ID: step.id, + E2E_PHASE: step.phase, + }, + }); + // Surface scenario-derived context (E2E_SCENARIO, E2E_SANDBOX_NAME, + // E2E_GATEWAY_URL, etc.) that the framework wrote at the start of the + // run and that environment+onboarding phases extended via + // e2e_context_set. The shell context library writes to + // ${E2E_CONTEXT_DIR}/context.env, NOT to ${E2E_CONTEXT_DIR}/.e2e/. + const contextEnvPath = path.join(ctx.contextDir, "context.env"); + if (fs.existsSync(contextEnvPath)) { + const contextEnv = fs.readFileSync(contextEnvPath, "utf8"); + for (const line of contextEnv.split("\n")) { + const trimmed = line.trim(); + if (!trimmed || trimmed.startsWith("#")) { + continue; + } + const eq = trimmed.indexOf("="); + if (eq <= 0) { + continue; + } + const key = trimmed.slice(0, eq); + let value = trimmed.slice(eq + 1); + if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) { + value = value.slice(1, -1); + } + env[key] = value; + } } - return { status: "failed", message: `unsupported live step ${step.id}` }; + + return await new Promise((resolve) => { + // detached: true puts the child (and any of its children, e.g. a `sleep` + // spawned by bash) into its own process group. We send signals to the + // negative pid so the whole group dies on timeout. Without this, bash + // ignores SIGTERM until its current foreground command (e.g. sleep) + // returns, and timeouts effectively don't work. + const child = spawn("bash", [scriptPath], { env, cwd: REPO_ROOT, detached: true }); + const pgid = child.pid; + const logStream = fs.createWriteStream(logPath); + let stderrTail = ""; + // Redact at the I/O boundary; raw bytes from the child must not + // reach the evidence log or the stderr tail that flows into + // step result.message. + pipeRedacted(child.stdout, logStream); + pipeRedacted(child.stderr, logStream, (redactedChunk) => { + stderrTail = (stderrTail + redactedChunk).slice(-4096); + }); + + const killGroup = (signal: NodeJS.Signals) => { + if (typeof pgid !== "number") { + child.kill(signal); + return; + } + try { + process.kill(-pgid, signal); + } catch { + /* group already gone */ + } + }; + + let timedOut = false; + const timeout = setTimeout(() => { + timedOut = true; + killGroup("SIGTERM"); + setTimeout(() => { + if (!child.killed) { + killGroup("SIGKILL"); + } + }, 5_000).unref(); + }, timeoutSeconds * 1_000); + + // Wait for the log writeStream to fully flush before resolving so + // callers can synchronously read the evidence file. Without this, the + // 'close' event on the child fires before the WriteStream finishes + // draining, and tests/orchestrators see an empty log file. + const finishLog = (): Promise => + new Promise((res) => { + if ((logStream as unknown as { closed?: boolean }).closed) { + res(); + return; + } + logStream.once("finish", () => res()); + logStream.once("error", () => res()); + logStream.end(); + }); + + child.on("error", (err) => { + clearTimeout(timeout); + void finishLog().then(() => + resolve({ + status: "failed", + message: redactString(`shell step ${step.id} spawn error: ${err.message}`), + evidence: logPath, + }), + ); + }); + + child.on("close", (code, signal) => { + clearTimeout(timeout); + void finishLog().then(() => { + if (timedOut) { + resolve({ + status: "failed", + classifier: "runner-infra", + message: `shell step ${step.id} exceeded ${timeoutSeconds}s (signal=${signal ?? "SIGTERM"})`, + evidence: logPath, + }); + return; + } + if (code === 0) { + resolve({ status: "passed", evidence: logPath }); + return; + } + resolve({ + status: "failed", + classifier: classifierForRef(ref), + message: `shell step ${step.id} exit ${code ?? "null"}: ${stderrTail.split("\n").slice(-3).join(" | ").trim()}`, + evidence: logPath, + }); + }); + }); + }); } private writePhaseResult(ctx: RunContext, result: PhaseResult) { diff --git a/test/e2e-scenario/scenarios/orchestrators/redaction.ts b/test/e2e-scenario/scenarios/orchestrators/redaction.ts new file mode 100644 index 0000000000..347eae12bc --- /dev/null +++ b/test/e2e-scenario/scenarios/orchestrators/redaction.ts @@ -0,0 +1,235 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Framework-owned secret hygiene at the spawn boundary. + * + * Spec ownership: redaction and child-env minimization are FRAMEWORK + * INFRASTRUCTURE, not a per-action / per-script / per-workflow concern. + * Children spawned by PhaseOrchestrator must (a) receive a minimal, + * typed env (framework allowlist + per-action declared `secretEnv` + * passthrough only), and (b) have their stdout/stderr passed through + * redaction before any byte reaches an evidence log or + * PhaseResult.message. There is no opt-out flag, no env switch, no + * helper that bypasses this. One execution mode, secrets always + * redacted in evidence — same one-mode discipline that motivates the + * rest of this PR. + * + * Pattern source-of-truth: src/lib/security/secret-patterns.ts. We + * import the canonical regex sets and apply them here so framework + * redaction stays in lockstep with product-runtime redaction without + * coupling the framework to product runtime modules. + * + * Bash side: test/e2e-scenario/runtime/lib/context.sh::e2e_context_dump + * already redacts on dump via _e2e_context_is_sensitive_key. Bash + * helpers must continue to use that for diagnostic dumps; this module + * only covers the TS-spawned-child I/O path. + * + * Tests: + * test/e2e-scenario/framework-tests/e2e-phase-orchestrators.test.ts + * - test_should_not_persist_secret_shaped_child_output_into_evidence + * - test_should_drop_non_allowlisted_parent_env_unless_declared_in_secretEnv + * - test_should_pass_declared_secretEnv_through_to_child + */ + +import type { Readable, Writable } from "node:stream"; + +const REDACTED = ""; + +// Framework-local mirror of src/lib/security/secret-patterns.ts. The +// framework deliberately does not import from src/lib/security/ so it +// stays decoupled from product runtime modules and the cross-tsconfig +// boundary. A parity test +// (test/e2e-scenario/framework-tests/e2e-redaction-parity.test.ts) +// asserts these regex sources stay in lockstep with the canonical +// product source so adding a token shape there keeps both layers +// honest at once. +// Exported only so the parity test +// (test/e2e-scenario/framework-tests/e2e-redaction-parity.test.ts) can +// import the actual RegExp values rather than parsing source text. +// Production code in this module continues to use them via the local +// binding; nothing in the framework runtime imports these. +export const TOKEN_PREFIX_PATTERNS: RegExp[] = [ + /nvapi-[A-Za-z0-9_-]{10,}/g, + /nvcf-[A-Za-z0-9_-]{10,}/g, + /ghp_[A-Za-z0-9_-]{10,}/g, + /(?:github_pat_)[A-Za-z0-9_]{30,}/g, + /sk-proj-[A-Za-z0-9_-]{10,}/g, + /sk-ant-[A-Za-z0-9_-]{10,}/g, + /sk-[A-Za-z0-9_-]{20,}/g, + /(?:xox[bpas]|xapp)-[A-Za-z0-9-]{10,}/g, + /A(?:K|S)IA[A-Z0-9]{16}/g, + /hf_[A-Za-z0-9]{10,}/g, + /glpat-[A-Za-z0-9_-]{10,}/g, + /gsk_[A-Za-z0-9]{10,}/g, + /pypi-[A-Za-z0-9_-]{10,}/g, + /\bbot\d{8,10}:[A-Za-z0-9_-]{35}\b/g, + /\b\d{8,10}:[A-Za-z0-9_-]{35}\b/g, + /\b[A-Za-z0-9]{24}\.[A-Za-z0-9_-]{6}\.[A-Za-z0-9_-]{27,}\b/g, +]; + +export const CONTEXT_PATTERNS: RegExp[] = [ + /(?<=Bearer\s+)[A-Za-z0-9_.+/=-]{10,}/gi, + /(?<=(?:_KEY|API_KEY|SECRET|TOKEN|PASSWORD|CREDENTIAL)[=: ]['"]?)[A-Za-z0-9_.+/=-]{10,}/gi, +]; + +/** + * Replace every secret-shaped token in `text` with ``. Uses + * the canonical TOKEN_PREFIX_PATTERNS + CONTEXT_PATTERNS sets. + * + * Best-effort against unknown token shapes. The actual defense is the + * env allowlist (buildChildEnv); pattern redaction catches what slips + * through (e.g. error messages that echo a secret value). + */ +export function redactString(text: string): string { + if (!text) return text; + let out = text; + for (const p of TOKEN_PREFIX_PATTERNS) { + p.lastIndex = 0; + out = out.replace(p, REDACTED); + } + for (const p of CONTEXT_PATTERNS) { + p.lastIndex = 0; + out = out.replace(p, REDACTED); + } + return out; +} + +// Env keys the framework guarantees children may always see. Anything +// outside this set, outside FRAMEWORK_ENV_PREFIXES, and not declared +// in PhaseAction.secretEnv / AssertionStep.secretEnv is dropped before +// the child spawns. +const FRAMEWORK_ENV_ALLOWLIST: ReadonlySet = new Set([ + "PATH", + "HOME", + "SHELL", + "USER", + "LOGNAME", + "LANG", + "LC_ALL", + "LC_CTYPE", + "TZ", + "TERM", + "TMPDIR", + "RUNNER_TEMP", + "RUNNER_OS", + "GITHUB_ACTIONS", + "CI", + "NEMOCLAW_NON_INTERACTIVE", + "NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE", +]); + +const FRAMEWORK_ENV_PREFIXES: readonly string[] = ["E2E_", "NEMOCLAW_LOG_"]; + +// Shape required of any declared secretEnv key — must look like a +// secret-bearing variable. Prevents accidental allowlisting of +// non-secret values via the secretEnv channel and keeps the +// "framework-allowlist vs declared-secret" distinction honest. +const SECRET_ENV_KEY_SHAPE = + /^[A-Z][A-Z0-9_]*(?:API[_]?KEY|TOKEN|SECRET|PASSWORD|CREDENTIAL|PASSPHRASE|PRIVATE[_]?KEY|ACCESS[_]?KEY)$/; + +export function isValidSecretEnvKey(key: string): boolean { + return SECRET_ENV_KEY_SHAPE.test(key); +} + +export interface BuildChildEnvOptions { + /** Per-action / per-step declared secret-bearing env keys to pass through. */ + secretEnv?: readonly string[]; + /** Framework-controlled overlay (E2E_CONTEXT_DIR, E2E_PHASE, E2E_*_ID). */ + frameworkOverlay: NodeJS.ProcessEnv; +} + +/** + * Build the child's env from `base` (typically `process.env`) by + * keeping only: + * 1. keys in FRAMEWORK_ENV_ALLOWLIST + * 2. keys starting with one of FRAMEWORK_ENV_PREFIXES + * 3. keys explicitly declared in `opts.secretEnv` (validated shape) + * then layering `opts.frameworkOverlay` on top. + * + * Throws if a `secretEnv` entry doesn't match the secret-key shape; + * better to fail loudly at compile/runtime than silently leak a + * non-secret env var (which would defeat the allowlist purpose). + */ +export function buildChildEnv( + base: NodeJS.ProcessEnv, + opts: BuildChildEnvOptions, +): NodeJS.ProcessEnv { + const out: NodeJS.ProcessEnv = {}; + for (const [key, value] of Object.entries(base)) { + if (value === undefined) continue; + if (FRAMEWORK_ENV_ALLOWLIST.has(key)) { + out[key] = value; + continue; + } + if (FRAMEWORK_ENV_PREFIXES.some((prefix) => key.startsWith(prefix))) { + out[key] = value; + continue; + } + } + for (const key of opts.secretEnv ?? []) { + if (!isValidSecretEnvKey(key)) { + throw new Error( + `secretEnv entry '${key}' does not match the secret-key shape ` + + `(must end with API_KEY, TOKEN, SECRET, PASSWORD, CREDENTIAL, ` + + `PASSPHRASE, PRIVATE_KEY, or ACCESS_KEY). Refusing to allowlist.`, + ); + } + if (base[key] !== undefined) { + out[key] = base[key]; + } + } + Object.assign(out, opts.frameworkOverlay); + // The install action drops nemoclaw / openshell shims under + // ~/.local/bin (see nemoclaw_scenarios/install/repo-current.sh). + // On Ubuntu GH runners ~/.local/bin is on the default PATH; on + // self-hosted GPU runners and inside WSL it often is not, so the + // onboarding action's child runs without nemoclaw on PATH and + // dies with 'nemoclaw: command not found'. Add ~/.local/bin to + // every child's PATH at the framework boundary so the install + // location is consistent across phases. Idempotent equivalent of + // the install-path-refresh.sh nemoclaw_ensure_local_bin_on_path + // helper, applied centrally instead of per-script. + const home = out.HOME ?? base.HOME; + if (typeof home === "string" && home.length > 0) { + const localBin = `${home}/.local/bin`; + const currentPath = out.PATH ?? ""; + if (!currentPath.split(":").includes(localBin)) { + out.PATH = currentPath ? `${localBin}:${currentPath}` : localBin; + } + } + return out; +} + +/** + * Pipe `src` into `log`, redacting every chunk on the way through. + * Optional `onChunk` receives the already-redacted text (used by the + * orchestrator to keep a redacted stderr tail for failure messages). + * + * No raw bytes from the child ever reach `log` or the tail callback. + */ +export function pipeRedacted( + src: Readable, + log: Writable, + onChunk?: (redactedChunk: string) => void, +): void { + src.on("data", (chunk: Buffer) => { + const redacted = redactString(chunk.toString("utf8")); + log.write(redacted); + onChunk?.(redacted); + }); +} + +/** + * Compact array of all framework env keys the child sees by default. + * Exported for tests/diagnostics; do not use to bypass the boundary. + */ +export function frameworkEnvAllowlistSnapshot(): { + keys: string[]; + prefixes: string[]; +} { + return { + keys: [...FRAMEWORK_ENV_ALLOWLIST].sort(), + prefixes: [...FRAMEWORK_ENV_PREFIXES], + }; +} diff --git a/test/e2e-scenario/scenarios/orchestrators/runner.ts b/test/e2e-scenario/scenarios/orchestrators/runner.ts index 6ab3b76c62..02be1b195f 100644 --- a/test/e2e-scenario/scenarios/orchestrators/runner.ts +++ b/test/e2e-scenario/scenarios/orchestrators/runner.ts @@ -1,10 +1,17 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -import type { PhaseResult, RunContext, RunPlan, RunPlanPhase } from "../types.ts"; +import fs from "node:fs"; +import path from "node:path"; + +import type { PhaseActionResult, PhaseResult, RunContext, RunPlan, RunPlanPhase } from "../types.ts"; +import { seedContextEnv } from "./context.ts"; import { EnvironmentOrchestrator } from "./environment.ts"; +import { LifecycleOrchestrator } from "./lifecycle.ts"; +import { evaluateNegativeContract, negativeContractPhaseResult } from "./negative-matcher.ts"; import { OnboardingOrchestrator } from "./onboarding.ts"; import { RuntimeOrchestrator } from "./runtime.ts"; +import { StateValidationOrchestrator } from "./state-validation.ts"; interface PhaseRunner { run(ctx: RunContext, phase: RunPlanPhase, priorResults?: PhaseResult[]): Promise; @@ -13,37 +20,166 @@ interface PhaseRunner { export interface ScenarioRunnerDeps { environment?: PhaseRunner; onboarding?: PhaseRunner; + stateValidation?: PhaseRunner; + lifecycle?: PhaseRunner; runtime?: PhaseRunner; } export class ScenarioRunner { private readonly environment: PhaseRunner; private readonly onboarding: PhaseRunner; + private readonly stateValidation: PhaseRunner; + private readonly lifecycle: PhaseRunner; private readonly runtime: PhaseRunner; constructor(deps: ScenarioRunnerDeps = {}) { this.environment = deps.environment ?? new EnvironmentOrchestrator(); this.onboarding = deps.onboarding ?? new OnboardingOrchestrator(); + this.stateValidation = deps.stateValidation ?? new StateValidationOrchestrator(); + this.lifecycle = deps.lifecycle ?? new LifecycleOrchestrator(); this.runtime = deps.runtime ?? new RuntimeOrchestrator(); } async run(ctx: RunContext, plan: RunPlan): Promise { + // Seed context.env from the typed RunPlan once, before any phase + // runs. Spec ownership: framework infrastructure (the runner), not + // a shell action. Onboarding may extend context.env via + // e2e_context_set; the runtime phase reads whatever is on disk. + seedContextEnv(ctx, plan); + const results: PhaseResult[] = []; for (const phase of plan.phases) { - if (phase.name === "environment") { - results.push(await this.environment.run(ctx, phase, results)); - continue; - } - if (phase.name === "onboarding") { - results.push(await this.onboarding.run(ctx, phase, results)); + const blocked = phaseBlockedBy(phase.name, results); + if (blocked) { + // Cross-phase short-circuit: the previous phase's setup work + // failed, so this phase cannot meaningfully run. Synthesize a + // skipped PhaseResult with a clear reason so artifacts stay + // honest (no false greens, no <1s assertion explosion). + results.push({ + phase: phase.name, + status: "skipped", + actions: [], + assertions: [ + { + id: `${phase.name}.blocked`, + status: "skipped", + attempts: 0, + durationMs: 0, + message: `phase blocked by prior failure: ${blocked.phase} action ${blocked.action.id} failed (${blocked.action.message ?? "no message"})`, + }, + ], + }); continue; } - if (phase.name === "runtime") { - results.push(await this.runtime.run(ctx, phase, results)); - continue; - } - throw new Error(`Unsupported phase: ${String(phase.name)}`); + const orchestrator = this.orchestratorFor(phase.name); + results.push(await orchestrator.run(ctx, phase, results)); } + + // Negative-scenario contract verification. Single decision point: + // if the plan declared expectedFailure, evaluate the matcher and + // append a synthetic phase result. Positive scenarios are + // unaffected. Side-effect verification stays the responsibility of + // the runtime control group's required pending step (kept red + // until the probe lands); the matcher only judges phase + errorClass. + if (plan.expectedFailure) { + const contractResult = evaluateNegativeContract(plan, results); + const synthetic = negativeContractPhaseResult(contractResult); + results.push(synthetic); + writeNegativeContractArtifact(ctx, contractResult, synthetic); + } + return results; } + + private orchestratorFor(name: RunPlanPhase["name"]): PhaseRunner { + if (name === "environment") return this.environment; + if (name === "onboarding") return this.onboarding; + if (name === "state-validation") return this.stateValidation; + if (name === "lifecycle") return this.lifecycle; + if (name === "runtime") return this.runtime; + throw new Error(`Unsupported phase: ${String(name)}`); + } +} + +interface BlockingFailure { + phase: "environment" | "onboarding" | "state-validation" | "lifecycle" | "runtime"; + action: PhaseActionResult; +} + +function writeNegativeContractArtifact( + ctx: RunContext, + contractResult: ReturnType, + synthetic: PhaseResult, +): void { + try { + const outputDir = path.join(ctx.contextDir, ".e2e"); + fs.mkdirSync(outputDir, { recursive: true }); + fs.writeFileSync( + path.join(outputDir, "negative-contract.json"), + `${JSON.stringify(contractResult, null, 2)}\n`, + ); + fs.writeFileSync( + path.join(outputDir, `${synthetic.phase}.result.json`), + `${JSON.stringify(synthetic, null, 2)}\n`, + ); + } catch { + /* artifact emission is best-effort; matcher result already in memory */ + } +} + +// state-validation is the typed diagnostic layer between onboarding +// and runtime. It probes gateway/sandbox/cli post-conditions and is +// the phase that proves a negative scenario's forbidden side effects +// did not occur (gateway-absent, sandbox-absent). For state-validation +// to do its job after a deliberate onboarding failure (negative +// scenarios), an onboarding failure must NOT block it. Only an +// environment-phase failure (install never ran) skips state-validation. +// Runtime stays blocked by any prior phase-action failure, including +// state-validation, so suites never run against a missing or wedged +// environment. +function phaseBlockedBy( + phase: "environment" | "onboarding" | "state-validation" | "lifecycle" | "runtime", + results: PhaseResult[], +): BlockingFailure | undefined { + const firstFailure = firstBlockingActionFailure(results); + if (!firstFailure) { + return undefined; + } + if (phase === "state-validation" && firstFailure.phase !== "environment") { + // state-validation is the diagnostic layer that proves a negative + // scenario's forbidden side effects didn't occur, so an onboarding + // failure must NOT block it. + return undefined; + } + if (phase === "lifecycle" && firstFailure.phase === "state-validation") { + // state-validation failure does not block the lifecycle phase + // either: state-validation results are diagnostic. Lifecycle + // workers depend on onboarding having produced a sandbox, but + // not on state-validation probes having all passed. + return undefined; + } + return firstFailure; +} + +function firstBlockingActionFailure(results: PhaseResult[]): BlockingFailure | undefined { + // A phase action failure (real setup work didn't succeed) blocks + // downstream phases. Assertion failures do NOT block downstream + // phases - they are expected to be reported alongside other phase + // results so reviewers can see all failure layers at once. + for (const result of results) { + if ( + result.phase !== "environment" && + result.phase !== "onboarding" && + result.phase !== "state-validation" && + result.phase !== "lifecycle" && + result.phase !== "runtime" + ) { + continue; + } + const failedAction = result.actions.find((action) => action.status === "failed"); + if (failedAction) { + return { phase: result.phase, action: failedAction }; + } + } + return undefined; } diff --git a/test/e2e-scenario/scenarios/orchestrators/state-validation.ts b/test/e2e-scenario/scenarios/orchestrators/state-validation.ts new file mode 100644 index 0000000000..567d49b3a6 --- /dev/null +++ b/test/e2e-scenario/scenarios/orchestrators/state-validation.ts @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { PhaseOrchestrator } from "./phase.ts"; + +// Typed replacement for the inline gateway/sandbox checks the legacy +// bash runner ran between onboarding and suite execution +// (e2e_gateway_assert_healthy / e2e_sandbox_assert_running) AND the +// post-failure side-effect checks for negative scenarios +// (`openshell sandbox list | grep -Fq ...`). The orchestrator inserts +// itself between onboarding and runtime; its phase actions are real +// probes (typed PhaseAction shell-fn entries the compiler emits from +// scenario.expectedStateId via the typed expected-state registry). +// +// Failure semantics: a probe action failure is just a phase-action +// failure, so the existing ScenarioRunner short-circuit logic kicks +// in and the runtime phase is reported as skipped. No new control +// flow is added; this orchestrator is only here to give the phase a +// dedicated identity in PhaseResult artifacts and in tests. +export class StateValidationOrchestrator extends PhaseOrchestrator { + constructor() { + super("state-validation"); + } +} diff --git a/test/e2e-scenario/scenarios/probes/builtin.ts b/test/e2e-scenario/scenarios/probes/builtin.ts new file mode 100644 index 0000000000..7f78fc06bc --- /dev/null +++ b/test/e2e-scenario/scenarios/probes/builtin.ts @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { diagnosticsProbe } from "./diagnostics.ts"; +import { docsValidationProbe } from "./docs-validation.ts"; +import { injectionBlockedProbe } from "./injection-blocked.ts"; +import { networkPolicyProbe } from "./network-policy.ts"; +import { shieldsConfigProbe } from "./shields-config.ts"; +import { lookupProbe, registerProbe } from "./registry.ts"; + +/** + * Register all built-in probes. Idempotent: re-importing this module + * (e.g. through a different entry point) is a no-op once the probes + * are already in place. + * + * Ownership boundary: + * - Built-in probes here implement the cross-scenario contract that + * the typed registry already references by name (see + * scenarios/assertions/registry.ts). + * - Scenario-specific probes (if any) belong in a per-scenario + * module that calls `registerProbe()` directly. + * + * Security probes (shieldsConfigProbe, networkPolicyProbe, + * injectionBlockedProbe) are marked `required: true` in + * scenarios/assertions/registry.ts. With the implementations + * registered below, the orchestrator runs them and fails the phase + * on real assertion violations — not on a missing implementation. + */ +const BUILTIN_PROBES = { + diagnosticsProbe, + docsValidationProbe, + shieldsConfigProbe, + networkPolicyProbe, + injectionBlockedProbe, +} as const; + +export function registerBuiltinProbes(): void { + for (const [name, fn] of Object.entries(BUILTIN_PROBES)) { + if (lookupProbe(name) === undefined) { + registerProbe(name, fn); + } + } +} diff --git a/test/e2e-scenario/scenarios/probes/diagnostics.ts b/test/e2e-scenario/scenarios/probes/diagnostics.ts new file mode 100644 index 0000000000..e2259a6b77 --- /dev/null +++ b/test/e2e-scenario/scenarios/probes/diagnostics.ts @@ -0,0 +1,156 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { spawn } from "node:child_process"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import type { ProbeContext, ProbeFn, ProbeOutcome } from "./types.ts"; + +/** + * Probe: diagnostics.bundle (`diagnosticsProbe`). + * + * Mirrors test/e2e/test-diagnostics.sh's TC-DIAG-02 case: + * + * 1. Run `nemoclaw debug --quick --output /quick-debug.tar.gz` + * with a 30s budget. + * 2. Assert exit 0. + * 3. Assert the archive exists and is non-empty. + * + * The legacy test also asserts the archive contains no plaintext + * credentials (TC-DIAG-01), but that lives in a separate probe + * (a future `diagnosticsBundleSecretsProbe`) so this one stays + * narrowly focused on bundle production. + * + * Evidence: a JSON document at ProbeContext.evidencePath summarizing + * exit code, archive size, and elapsed seconds. + */ +const DIAGNOSTICS_TIMEOUT_MS = 30_000; + +interface DiagnosticsEvidence { + exitCode: number | null; + signal: NodeJS.Signals | null; + elapsedMs: number; + archivePath: string; + archiveSize: number | null; + stderrTail: string; +} + +function writeEvidence(evidencePath: string, payload: DiagnosticsEvidence): void { + try { + fs.mkdirSync(path.dirname(evidencePath), { recursive: true }); + fs.writeFileSync(evidencePath, JSON.stringify(payload, null, 2)); + } catch { + /* evidence write is best-effort; never fail the probe on IO. */ + } +} + +export const diagnosticsProbe: ProbeFn = async (ctx: ProbeContext): Promise => { + // Pre-flight: nemoclaw must be on PATH; the legacy test treats this + // as a hard prerequisite, not a skip. + // (We rely on the spawned process surfacing ENOENT if it isn't.) + + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-diag-probe-")); + const archivePath = path.join(tmp, "quick-debug.tar.gz"); + const startedAt = Date.now(); + + let exitCode: number | null = null; + let signal: NodeJS.Signals | null = null; + let stderrTail = ""; + + const result = await new Promise<{ code: number | null; signal: NodeJS.Signals | null }>( + (resolve) => { + const child = spawn( + "nemoclaw", + ["debug", "--quick", "--output", archivePath], + // Use the parent env directly: probes run inside the framework + // process and don't need the redacted secret env that shell + // steps build at the spawn boundary. PATH/HOME/E2E_* are + // already in process.env. + { env: process.env, cwd: ctx.repoRoot, stdio: ["ignore", "ignore", "pipe"] }, + ); + const onTimeout = setTimeout(() => { + try { + child.kill("SIGTERM"); + } catch { + /* already gone */ + } + }, DIAGNOSTICS_TIMEOUT_MS); + child.stderr?.on("data", (chunk: Buffer) => { + stderrTail = (stderrTail + chunk.toString("utf8")).slice(-1024); + }); + child.on("error", (err) => { + clearTimeout(onTimeout); + // ENOENT or similar — nemoclaw is not on PATH. Surface as a + // distinct classifier so the operator can see it's an + // environment problem, not a real diagnostics failure. + stderrTail = (stderrTail + `spawn error: ${err.message}`).slice(-1024); + resolve({ code: 127, signal: null }); + }); + child.on("close", (code, sig) => { + clearTimeout(onTimeout); + resolve({ code, signal: sig }); + }); + }, + ); + exitCode = result.code; + signal = result.signal; + const elapsedMs = Date.now() - startedAt; + + let archiveSize: number | null = null; + try { + const stat = fs.statSync(archivePath); + archiveSize = stat.size; + } catch { + archiveSize = null; + } + + const evidence: DiagnosticsEvidence = { + exitCode, + signal, + elapsedMs, + archivePath, + archiveSize, + stderrTail, + }; + writeEvidence(ctx.evidencePath, evidence); + + // Best-effort cleanup of the tmp dir; keep the JSON evidence on + // disk regardless. + try { + fs.rmSync(tmp, { recursive: true, force: true }); + } catch { + /* tmp cleanup is non-fatal */ + } + + if (signal === "SIGTERM") { + return { + status: "failed", + classifier: "runner-infra", + message: `diagnosticsProbe: nemoclaw debug --quick exceeded ${DIAGNOSTICS_TIMEOUT_MS / 1000}s`, + }; + } + if (exitCode !== 0) { + return { + status: "failed", + message: `diagnosticsProbe: nemoclaw debug --quick exited ${exitCode}; stderr: ${stderrTail.slice(-300)}`, + }; + } + if (archiveSize === null) { + return { + status: "failed", + message: `diagnosticsProbe: archive missing at ${archivePath}`, + }; + } + if (archiveSize === 0) { + return { + status: "failed", + message: `diagnosticsProbe: archive at ${archivePath} is empty`, + }; + } + + return { + status: "passed", + message: `diagnosticsProbe: bundle ok (${archiveSize} bytes, ${elapsedMs}ms)`, + }; +}; diff --git a/test/e2e-scenario/scenarios/probes/docs-validation.ts b/test/e2e-scenario/scenarios/probes/docs-validation.ts new file mode 100644 index 0000000000..76ba5127c6 --- /dev/null +++ b/test/e2e-scenario/scenarios/probes/docs-validation.ts @@ -0,0 +1,160 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { spawn } from "node:child_process"; +import fs from "node:fs"; +import path from "node:path"; +import type { ProbeContext, ProbeFn, ProbeOutcome } from "./types.ts"; + +/** + * Probe: docs.validation (`docsValidationProbe`). + * + * Mirrors test/e2e/test-docs-validation.sh: + * + * 1. Run `test/e2e/e2e-cloud-experimental/check-docs.sh --only-cli` + * to verify `nemoclaw --help` matches docs/reference/commands.mdx + * (CLI / docs parity). + * 2. Run `... --only-links --local-only` to verify markdown internal + * links resolve. Remote http(s) probes are skipped by default + * because they are slow and flaky under CI rate limiting (the + * legacy script documents this caveat). + * + * Both checks exit 0 on success. The probe captures both exit codes + * and surfaces a single combined outcome, with a structured evidence + * JSON for diagnosis. + */ + +const CHECK_DOCS_REL = "test/e2e/e2e-cloud-experimental/check-docs.sh"; +const CLI_PARITY_TIMEOUT_MS = 60_000; +const LINK_CHECK_TIMEOUT_MS = 90_000; + +interface DocsCheckResult { + phase: "cli-parity" | "links-local"; + exitCode: number | null; + signal: NodeJS.Signals | null; + elapsedMs: number; + stderrTail: string; + stdoutTail: string; +} + +interface DocsEvidence { + results: DocsCheckResult[]; +} + +function runCheck( + scriptPath: string, + args: readonly string[], + cwd: string, + timeoutMs: number, + phase: DocsCheckResult["phase"], +): Promise { + return new Promise((resolve) => { + const startedAt = Date.now(); + let stdoutTail = ""; + let stderrTail = ""; + const child = spawn("bash", [scriptPath, ...args], { + env: { ...process.env, CHECK_DOC_LINKS_REMOTE: "0" }, + cwd, + stdio: ["ignore", "pipe", "pipe"], + }); + const onTimeout = setTimeout(() => { + try { + child.kill("SIGTERM"); + } catch { + /* already gone */ + } + }, timeoutMs); + child.stdout?.on("data", (chunk: Buffer) => { + stdoutTail = (stdoutTail + chunk.toString("utf8")).slice(-1024); + }); + child.stderr?.on("data", (chunk: Buffer) => { + stderrTail = (stderrTail + chunk.toString("utf8")).slice(-1024); + }); + child.on("error", (err) => { + clearTimeout(onTimeout); + resolve({ + phase, + exitCode: 127, + signal: null, + elapsedMs: Date.now() - startedAt, + stderrTail: `spawn error: ${err.message}`, + stdoutTail, + }); + }); + child.on("close", (code, sig) => { + clearTimeout(onTimeout); + resolve({ + phase, + exitCode: code, + signal: sig, + elapsedMs: Date.now() - startedAt, + stderrTail, + stdoutTail, + }); + }); + }); +} + +function writeEvidence(evidencePath: string, payload: DocsEvidence): void { + try { + fs.mkdirSync(path.dirname(evidencePath), { recursive: true }); + fs.writeFileSync(evidencePath, JSON.stringify(payload, null, 2)); + } catch { + /* evidence write is best-effort */ + } +} + +export const docsValidationProbe: ProbeFn = async (ctx: ProbeContext): Promise => { + const scriptPath = path.resolve(ctx.repoRoot, CHECK_DOCS_REL); + if (!fs.existsSync(scriptPath)) { + return { + status: "failed", + message: `docsValidationProbe: check-docs.sh not found at ${scriptPath}`, + }; + } + + const cliResult = await runCheck( + scriptPath, + ["--only-cli"], + ctx.repoRoot, + CLI_PARITY_TIMEOUT_MS, + "cli-parity", + ); + const linksResult = await runCheck( + scriptPath, + ["--only-links", "--local-only"], + ctx.repoRoot, + LINK_CHECK_TIMEOUT_MS, + "links-local", + ); + + writeEvidence(ctx.evidencePath, { results: [cliResult, linksResult] }); + + // Surface SIGTERM (timeout) as runner-infra so the orchestrator may + // retry on a transient slowness. Hard exit-code failures do not + // retry — a docs/CLI drift is deterministic. + if (cliResult.signal === "SIGTERM" || linksResult.signal === "SIGTERM") { + const which = cliResult.signal === "SIGTERM" ? "cli-parity" : "links-local"; + return { + status: "failed", + classifier: "runner-infra", + message: `docsValidationProbe: ${which} check timed out`, + }; + } + if (cliResult.exitCode !== 0) { + return { + status: "failed", + message: `docsValidationProbe: CLI/docs parity failed (exit ${cliResult.exitCode}); stderr: ${cliResult.stderrTail.slice(-300)}`, + }; + } + if (linksResult.exitCode !== 0) { + return { + status: "failed", + message: `docsValidationProbe: markdown link check failed (exit ${linksResult.exitCode}); stderr: ${linksResult.stderrTail.slice(-300)}`, + }; + } + return { + status: "passed", + message: `docsValidationProbe: ok (cli ${cliResult.elapsedMs}ms, links ${linksResult.elapsedMs}ms)`, + }; +}; diff --git a/test/e2e-scenario/scenarios/probes/injection-blocked.ts b/test/e2e-scenario/scenarios/probes/injection-blocked.ts new file mode 100644 index 0000000000..d1acf8ab3d --- /dev/null +++ b/test/e2e-scenario/scenarios/probes/injection-blocked.ts @@ -0,0 +1,155 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { ProbeContext, ProbeFn, ProbeOutcome } from "./types.ts"; +import { runSandboxCmd, writeProbeEvidence } from "./util.ts"; + +/** + * Probe: security.injection.blocked (`injectionBlockedProbe`). + * + * Mirrors the contract in + * test/e2e-scenario/validation_suites/lib/security_policy_credentials.sh + * `spc_assert_telegram_payload_not_shell_executed`, which itself + * encodes the test-credential-sanitization.sh / test-brave-search-e2e.sh + * lesson: when an external content channel (chat message, web fetch + * response, document body) flows into the agent, it MUST be treated + * as data, never as a shell command. + * + * Implementation: + * 1. Pre-clean a unique marker file inside the sandbox. + * 2. Send a payload designed to exploit naive shell expansion: + * $(touch && echo INJECTED) + * via stdin to a remote `sh -c 'MSG=$(cat); printf "%s\\n" "$MSG"'` + * that's responsible for echoing the bytes back without + * evaluation. + * 3. Verify the echoed output contains the literal payload + * (proves the bytes were preserved, not interpreted). + * 4. Verify the marker file does NOT exist (proves the + * command-substitution operator did not fire on the host + * side or the sandbox-side shell). + * + * If either invariant is violated the probe fails closed: the + * scenario must NEVER pass on a fake-green injection check. + * + * Evidence: structured JSON capturing the payload, the marker path, + * the echo response tail, and the marker existence query result. + */ + +const PER_CALL_SECONDS = 25; + +interface InjectionEvidence { + payload: string; + markerPath: string; + echoExitCode: number | null; + echoStdoutTail: string; + echoStderrTail: string; + markerStateExitCode: number | null; + markerStateStdout: string; + payloadPreservedLiterally: boolean; + markerAbsent: boolean; +} + +function uniqueMarkerPath(): string { + // `/tmp` is sandbox-writable; collisions across parallel scenarios + // are avoided by mixing pid+random to keep the marker scoped to + // this probe invocation. + const rand = Math.floor(Math.random() * 0xffff_ffff).toString(16); + return `/tmp/nemoclaw-injection-probe-${process.pid}-${rand}`; +} + +export const injectionBlockedProbe: ProbeFn = async (ctx: ProbeContext): Promise => { + if (!ctx.sandboxName) { + return { status: "failed", message: "injectionBlockedProbe: E2E_SANDBOX_NAME missing in context.env" }; + } + + const markerPath = uniqueMarkerPath(); + // Single-quote the marker path inside the payload so the marker + // string survives the host-side bash quoting layer; the test is + // the COMMAND SUBSTITUTION operator surviving, not the path. + const payload = `$(touch '${markerPath}' && echo INJECTED)`; + + const evidence: InjectionEvidence = { + payload, + markerPath, + echoExitCode: null, + echoStdoutTail: "", + echoStderrTail: "", + markerStateExitCode: null, + markerStateStdout: "", + payloadPreservedLiterally: false, + markerAbsent: false, + }; + + // Step 1: pre-clean the marker. Best-effort; failure here doesn't + // matter as long as Step 3 confirms the marker is absent at the + // end. + await runSandboxCmd(ctx, ["sh", "-c", `rm -f '${markerPath}'`], { + perCallSeconds: PER_CALL_SECONDS, + }); + + // Step 2: send the payload via stdin to an in-sandbox shell that + // reads it into a variable and echoes it back. The remote command + // string must NOT use $() inside the literal; the host-side bash + // wrapper passes the script verbatim and the sandbox shell reads + // the payload as data. + const echoResult = await runSandboxCmd( + ctx, + ["sh", "-c", 'MSG=$(cat); printf "%s\n" "$MSG"'], + { perCallSeconds: PER_CALL_SECONDS, stdin: payload }, + ); + evidence.echoExitCode = echoResult.exitCode; + evidence.echoStdoutTail = echoResult.stdout; + evidence.echoStderrTail = echoResult.stderr; + + if (echoResult.exitCode !== 0) { + writeProbeEvidence(ctx.evidencePath, evidence); + return { + status: "failed", + classifier: echoResult.signal === "SIGTERM" ? "gateway-transient" : undefined, + message: `injectionBlockedProbe: echo command failed (exit ${echoResult.exitCode}); stderr: ${echoResult.stderr.slice(-300)}`, + }; + } + + evidence.payloadPreservedLiterally = echoResult.stdout.includes(payload); + if (!evidence.payloadPreservedLiterally) { + writeProbeEvidence(ctx.evidencePath, evidence); + return { + status: "failed", + message: `injectionBlockedProbe: payload was not preserved literally; stdout tail: ${echoResult.stdout.slice(-300)}`, + }; + } + + // Step 3: confirm the marker file does NOT exist. A 'SAFE' result + // means the command-substitution payload was treated as data; an + // 'EXPLOITED' result means a shell somewhere in the chain + // evaluated the substitution. + const markerResult = await runSandboxCmd( + ctx, + ["sh", "-c", `test -f '${markerPath}' && echo EXPLOITED || echo SAFE`], + { perCallSeconds: PER_CALL_SECONDS }, + ); + evidence.markerStateExitCode = markerResult.exitCode; + evidence.markerStateStdout = markerResult.stdout; + evidence.markerAbsent = markerResult.stdout.includes("SAFE"); + + // Best-effort cleanup of the marker if it somehow got created + // (an 'EXPLOITED' result is a probe failure but we still don't + // want a stray file lingering between runs). + await runSandboxCmd(ctx, ["sh", "-c", `rm -f '${markerPath}'`], { + perCallSeconds: PER_CALL_SECONDS, + }); + + writeProbeEvidence(ctx.evidencePath, evidence); + + if (!evidence.markerAbsent) { + return { + status: "failed", + message: `injectionBlockedProbe: marker file ${markerPath} present \u2014 command substitution executed; stdout: ${markerResult.stdout.slice(-200)}`, + }; + } + + return { + status: "passed", + message: `injectionBlockedProbe: payload preserved as data, marker ${markerPath} absent`, + }; +}; diff --git a/test/e2e-scenario/scenarios/probes/network-policy.ts b/test/e2e-scenario/scenarios/probes/network-policy.ts new file mode 100644 index 0000000000..c3bb50923c --- /dev/null +++ b/test/e2e-scenario/scenarios/probes/network-policy.ts @@ -0,0 +1,125 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { ProbeContext, ProbeFn, ProbeOutcome } from "./types.ts"; +import { runSandboxCmd, writeProbeEvidence } from "./util.ts"; + +/** + * Probe: security.policy.enforced (`networkPolicyProbe`). + * + * Mirrors the deny-by-default contract from + * test/e2e/test-network-policy.sh TC-NET-01: when no policy preset + * widens egress for a given hostname, a request to that hostname + * from inside the sandbox MUST be rejected by the gateway. A success + * status is a hard failure \u2014 it means the network-policy enforcement + * layer is not catching the request. + * + * Implementation: from inside the sandbox, run `curl` against a + * non-whitelisted URL and inspect: + * - HTTP status code (via curl -w '%{http_code}') + * - curl exit code (curl exit 7 / 28 / etc. when DNS or connect + * is blocked outright) + * + * Expected outcomes: + * - HTTP 403 (gateway proxy rejected the request) + * - HTTP 4xx (any other 4xx that's not 401 \u2014 401 indicates the + * request reached an upstream auth wall, which counts as policy + * bypass, NOT block) + * - curl exit != 0 with HTTP code 000 (DNS / connect error) \u2014 the + * gateway dropped the request before HTTP could be spoken + * + * Anything else (HTTP 2xx, 3xx, 401) means policy is NOT enforcing + * deny-by-default and the probe fails. + * + * Hostname choice: example.com is the canonical "should never be on + * any preset" target the legacy test uses. Probes that need a + * different fixture override via E2E_NETWORK_POLICY_BLOCKED_URL. + */ + +const DEFAULT_BLOCKED_URL = "https://example.com/"; +const CURL_MAX_TIME_S = 10; +const PER_CALL_SECONDS = 25; + +interface NetworkPolicyEvidence { + blockedUrl: string; + curlExitCode: number | null; + curlSignal: string | null; + httpStatus: string | null; + stdoutTail: string; + stderrTail: string; +} + +function isBlockedHttpStatus(code: string): boolean { + if (code === "000") return true; // DNS/connect refused before HTTP + if (code === "401") return false; // reached upstream auth -> NOT blocked + return /^4[0-9][0-9]$/.test(code) || /^5[0-9][0-9]$/.test(code); +} + +export const networkPolicyProbe: ProbeFn = async (ctx: ProbeContext): Promise => { + if (!ctx.sandboxName) { + return { status: "failed", message: "networkPolicyProbe: E2E_SANDBOX_NAME missing in context.env" }; + } + const blockedUrl = ctx.contextEnv.E2E_NETWORK_POLICY_BLOCKED_URL || DEFAULT_BLOCKED_URL; + + // curl -sS keeps stderr informative on failure; -o /dev/null discards + // body so the gateway's HTML reject page doesn't pollute stdout; + // -w prints the status code we parse below. + const result = await runSandboxCmd( + ctx, + [ + "curl", + "-sS", + "-o", + "/dev/null", + "-w", + "%{http_code}", + "--max-time", + String(CURL_MAX_TIME_S), + blockedUrl, + ], + { perCallSeconds: PER_CALL_SECONDS }, + ); + + // curl writes the status code to stdout (or '000' on connect/DNS + // failure). Trim whitespace; some curl builds emit a trailing + // newline. + const httpStatus = result.stdout.trim() || null; + const evidence: NetworkPolicyEvidence = { + blockedUrl, + curlExitCode: result.exitCode, + curlSignal: result.signal, + httpStatus, + stdoutTail: result.stdout, + stderrTail: result.stderr, + }; + writeProbeEvidence(ctx.evidencePath, evidence); + + if (result.signal === "SIGTERM") { + return { + status: "failed", + classifier: "gateway-transient", + message: `networkPolicyProbe: curl into sandbox timed out after ${PER_CALL_SECONDS}s`, + }; + } + + // The probe accepts: + // - curl exit 0 with a 4xx/5xx body (gateway returned a reject) + // - curl exit != 0 with status '000' (gateway dropped the + // connection, curl never got an HTTP response) + if (httpStatus && isBlockedHttpStatus(httpStatus)) { + return { + status: "passed", + message: `networkPolicyProbe: ${blockedUrl} blocked (http_code=${httpStatus}, curl exit ${result.exitCode})`, + }; + } + if (result.exitCode !== 0 && (!httpStatus || httpStatus === "000")) { + return { + status: "passed", + message: `networkPolicyProbe: ${blockedUrl} blocked (curl exit ${result.exitCode}, no HTTP response)`, + }; + } + return { + status: "failed", + message: `networkPolicyProbe: ${blockedUrl} reachable from sandbox (http_code=${httpStatus ?? ""}, curl exit ${result.exitCode}); deny-by-default not enforced`, + }; +}; diff --git a/test/e2e-scenario/scenarios/probes/registry.ts b/test/e2e-scenario/scenarios/probes/registry.ts new file mode 100644 index 0000000000..3c4403cfcc --- /dev/null +++ b/test/e2e-scenario/scenarios/probes/registry.ts @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { ProbeFn } from "./types.ts"; + +/** + * Map of probe-ref name → probe runner. Shell-side AssertionStep + * declarations carry an `implementation: { kind: "probe", ref: }`. + * The orchestrator calls `lookupProbe(ref)` at execution time; if it + * returns undefined the step is reported skipped (or failed for + * `required` probes). + * + * The registry is module-scoped state. Built-in probes are registered + * by importing `./builtin.ts` (which calls registerProbe at module + * load). Tests that need a clean slate can call `resetProbeRegistry()`. + */ +const probes = new Map(); + +/** + * Register a probe implementation under `name`. Re-registering an + * existing name throws — silently shadowing a probe is a contract + * violation that hides behavior from the runner. + */ +export function registerProbe(name: string, fn: ProbeFn): void { + if (!name) { + throw new Error("registerProbe: name is required"); + } + if (probes.has(name)) { + throw new Error(`registerProbe: '${name}' already registered`); + } + probes.set(name, fn); +} + +/** + * Look up a registered probe. Returns undefined when the ref is not + * registered; the caller (phase.ts) decides whether the missing probe + * surfaces as skipped or failed based on AssertionStep.required. + */ +export function lookupProbe(name: string): ProbeFn | undefined { + return probes.get(name); +} + +/** + * Names of every currently-registered probe. Useful in plan rendering + * and tests that assert a build wired its expected probes. + */ +export function listRegisteredProbes(): readonly string[] { + return Array.from(probes.keys()).sort(); +} + +/** Test-only: clear the registry so each test starts from empty. */ +export function resetProbeRegistry(): void { + probes.clear(); +} diff --git a/test/e2e-scenario/scenarios/probes/shields-config.ts b/test/e2e-scenario/scenarios/probes/shields-config.ts new file mode 100644 index 0000000000..6e268f69a5 --- /dev/null +++ b/test/e2e-scenario/scenarios/probes/shields-config.ts @@ -0,0 +1,196 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { ProbeContext, ProbeFn, ProbeOutcome } from "./types.ts"; +import { runHostCmd, runSandboxCmd, writeProbeEvidence } from "./util.ts"; + +/** + * Probe: security.shields.config (`shieldsConfigProbe`). + * + * Mirrors test/e2e-scenario/validation_suites/lib/security_policy_credentials.sh + * `spc_assert_shields_config_consistent`, which itself ports the + * legacy test/e2e/test-shields-config.sh contract: + * + * 1. Ask the host CLI: `nemoclaw shields status` and + * classify the reported state as up | down | not-configured. + * 2. If the scenario declares an expected state via + * `E2E_SHIELDS_EXPECTED_STATE` (or the legacy + * `E2E_SHIELDS_EXPECTED`), assert observed === expected. + * 3. Verify the in-sandbox config file permissions match the + * observed state: + * - up -> root:root + restrictive 4xx mode + * (read-only for owner+group, no write + * for sandbox user) + * - down|not-configured -> sandbox:sandbox (writable by the + * sandbox user, since shields are + * not locking the file) + * + * Config path depends on the agent the scenario onboarded: + * - openclaw -> /sandbox/.openclaw/openclaw.json + * - hermes -> /sandbox/.hermes/.env + * + * Evidence: a JSON document at ProbeContext.evidencePath summarizing + * status output, observed state, expected state (if declared), and + * config-permission stat output. + */ + +const SHIELDS_STATUS_TIMEOUT_MS = 30_000; +const SANDBOX_STAT_PER_CALL_SECONDS = 25; + +type ShieldsState = "up" | "down" | "not-configured"; + +interface ShieldsEvidence { + observed: ShieldsState | null; + expected: ShieldsState | null; + statusExitCode: number | null; + statusStdoutTail: string; + configPath: string | null; + permissionsLine: string | null; + mode: string | null; + owner: string | null; +} + +function classifyStatus(stdout: string): ShieldsState | null { + if (stdout.includes("Shields: UP")) return "up"; + if (stdout.includes("Shields: DOWN")) return "down"; + if (stdout.includes("Shields: NOT CONFIGURED")) return "not-configured"; + return null; +} + +function configPathFor(agent: string | undefined): string | null { + switch (agent) { + case "openclaw": + case undefined: + case "": + return "/sandbox/.openclaw/openclaw.json"; + case "hermes": + return "/sandbox/.hermes/.env"; + default: + return null; + } +} + +function permissionsOk(observed: ShieldsState, mode: string, owner: string): boolean { + if (observed === "up") { + // Locked: owner must be root, mode must be 4xx (no group/world + // writes; legacy lib accepts 4[0-4][0-4]). + return /^4[0-4][0-4]$/.test(mode) && owner === "root:root"; + } + // down | not-configured: sandbox user owns the file so they can + // edit when shields are dropped. + return owner === "sandbox:sandbox"; +} + +function expectedStateFromContext(env: Readonly>): ShieldsState | null { + const raw = (env.E2E_SHIELDS_EXPECTED_STATE || env.E2E_SHIELDS_EXPECTED || "").trim(); + if (!raw) return null; + const norm = raw.replace(/_/g, "-").toLowerCase(); + if (norm === "up" || norm === "down" || norm === "not-configured") return norm; + return null; +} + +export const shieldsConfigProbe: ProbeFn = async (ctx: ProbeContext): Promise => { + if (!ctx.sandboxName) { + return { status: "failed", message: "shieldsConfigProbe: E2E_SANDBOX_NAME missing in context.env" }; + } + + const evidence: ShieldsEvidence = { + observed: null, + expected: expectedStateFromContext(ctx.contextEnv), + statusExitCode: null, + statusStdoutTail: "", + configPath: null, + permissionsLine: null, + mode: null, + owner: null, + }; + + // --- Step 1: nemoclaw shields status --- + const statusResult = await runHostCmd( + "nemoclaw", + [ctx.sandboxName, "shields", "status"], + { timeoutMs: SHIELDS_STATUS_TIMEOUT_MS }, + ); + evidence.statusExitCode = statusResult.exitCode; + evidence.statusStdoutTail = statusResult.stdout; + if (statusResult.signal === "SIGTERM") { + writeProbeEvidence(ctx.evidencePath, evidence); + return { + status: "failed", + classifier: "runner-infra", + message: `shieldsConfigProbe: 'nemoclaw shields status' timed out after ${SHIELDS_STATUS_TIMEOUT_MS}ms`, + }; + } + if (statusResult.exitCode !== 0) { + writeProbeEvidence(ctx.evidencePath, evidence); + return { + status: "failed", + message: `shieldsConfigProbe: 'nemoclaw shields status' exited ${statusResult.exitCode}; stderr: ${statusResult.stderr.slice(-300)}`, + }; + } + const observed = classifyStatus(statusResult.stdout); + evidence.observed = observed; + if (!observed) { + writeProbeEvidence(ctx.evidencePath, evidence); + return { + status: "failed", + message: `shieldsConfigProbe: status output did not report a recognized Shields state; tail: ${statusResult.stdout.slice(-200)}`, + }; + } + if (evidence.expected && evidence.expected !== observed) { + writeProbeEvidence(ctx.evidencePath, evidence); + return { + status: "failed", + message: `shieldsConfigProbe: expected shields '${evidence.expected}', observed '${observed}'`, + }; + } + + // --- Step 2: in-sandbox stat of the config file --- + const configPath = configPathFor(ctx.contextEnv.E2E_AGENT); + if (!configPath) { + writeProbeEvidence(ctx.evidencePath, evidence); + return { + status: "failed", + message: `shieldsConfigProbe: unsupported E2E_AGENT '${ctx.contextEnv.E2E_AGENT}'`, + }; + } + evidence.configPath = configPath; + const statResult = await runSandboxCmd( + ctx, + ["stat", "-c", "%a %U:%G", configPath], + { perCallSeconds: SANDBOX_STAT_PER_CALL_SECONDS }, + ); + if (statResult.exitCode !== 0) { + writeProbeEvidence(ctx.evidencePath, evidence); + return { + status: "failed", + classifier: statResult.signal === "SIGTERM" ? "gateway-transient" : undefined, + message: `shieldsConfigProbe: stat of ${configPath} failed (exit ${statResult.exitCode}); stderr: ${statResult.stderr.slice(-300)}`, + }; + } + const permsLine = statResult.stdout.trim(); + evidence.permissionsLine = permsLine; + const [mode, owner] = permsLine.split(/\s+/, 2); + evidence.mode = mode ?? null; + evidence.owner = owner ?? null; + if (!mode || !owner) { + writeProbeEvidence(ctx.evidencePath, evidence); + return { + status: "failed", + message: `shieldsConfigProbe: could not parse stat output: '${permsLine}'`, + }; + } + if (!permissionsOk(observed, mode, owner)) { + writeProbeEvidence(ctx.evidencePath, evidence); + return { + status: "failed", + message: `shieldsConfigProbe: shields are '${observed}' but ${configPath} permissions are '${permsLine}'`, + }; + } + + writeProbeEvidence(ctx.evidencePath, evidence); + return { + status: "passed", + message: `shieldsConfigProbe: shields=${observed} ${configPath}=${permsLine}`, + }; +}; diff --git a/test/e2e-scenario/scenarios/probes/types.ts b/test/e2e-scenario/scenarios/probes/types.ts new file mode 100644 index 0000000000..4b1edabd08 --- /dev/null +++ b/test/e2e-scenario/scenarios/probes/types.ts @@ -0,0 +1,61 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { TransientClassifier } from "../types.ts"; + +/** + * Context handed to a probe runner. Mirrors the subset of scenario + * state that shell steps already get via `${E2E_CONTEXT_DIR}/context.env`, + * but typed so probe implementations don't have to parse the file + * themselves. + * + * The orchestrator builds this before invoking the probe; probe code + * must NOT mutate `contextEnv` (treat as read-only). + */ +export interface ProbeContext { + /** Repo-relative or absolute path to .e2e/.. context root. */ + contextDir: string; + /** Absolute path to the evidence file the probe SHOULD write. */ + evidencePath: string; + /** Parsed key/value pairs from ${contextDir}/context.env. */ + contextEnv: Readonly>; + /** Convenience accessor for the most-used keys. Null when missing. */ + sandboxName: string | null; + gatewayUrl: string | null; + /** Repo root, so probes that shell out have a canonical cwd. */ + repoRoot: string; +} + +/** + * Structured probe result. Mirrors AssertionStep StepAttemptOutcome + * in `phase.ts` so the orchestrator can adopt it without translation. + * + * Probes MUST emit a structured outcome — never throw out of the + * registered function. Throwing is a contract violation that the + * orchestrator surfaces as a failed assertion with the error message, + * but a well-behaved probe converts thrown errors into a `failed` + * outcome with a redacted message. + */ +export interface ProbeOutcome { + status: "passed" | "failed" | "skipped"; + message?: string; + classifier?: TransientClassifier; + /** + * Optional override for the evidence path. If omitted the orchestrator + * uses `step.evidencePath` (which the probe was already told via + * ProbeContext.evidencePath). + */ + evidence?: string; +} + +/** + * The function shape every registered probe implements. + * + * Convention: + * - Probes are async even when they could be sync, so the registry + * can swap an implementation for a slow IO-bound version without + * ripple effects through the orchestrator. + * - Probes write structured evidence (JSON) to ProbeContext.evidencePath + * so failures are diagnosable from the artifact bundle. + */ +export type ProbeFn = (ctx: ProbeContext) => Promise; diff --git a/test/e2e-scenario/scenarios/probes/util.ts b/test/e2e-scenario/scenarios/probes/util.ts new file mode 100644 index 0000000000..22e192d074 --- /dev/null +++ b/test/e2e-scenario/scenarios/probes/util.ts @@ -0,0 +1,287 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { spawn } from "node:child_process"; +import fs from "node:fs"; +import path from "node:path"; +import type { ProbeContext } from "./types.ts"; + +/** + * Shared utilities for built-in probes. Two responsibilities: + * + * 1. Entering the sandbox via the canonical bash wrapper + * (`validation_suites/sandbox-exec.sh`) instead of re-implementing + * the ssh-config / openshell-exec logic in TS. This keeps the + * transport choice in ONE place \u2014 if the wrapper changes + * (e.g. switches from openshell-exec to ssh-config preferred), + * every probe inherits the new behavior. + * + * 2. Spawning host-side CLIs (`nemoclaw`, `openshell`) with timeouts + * and structured outcome capture. Probes never invoke spawn + * directly so timeout and stdio handling stays consistent. + * + * Probe code MUST treat the returned `stdout`/`stderr` as already-bounded + * (we slice the tail). The full output is never returned or logged from + * here \u2014 evidence files keep the structured fields a probe explicitly + * decides to persist. + */ + +const VALIDATION_SUITES_REL = "test/e2e-scenario/validation_suites"; +const TAIL_BYTES = 2048; + +export interface CmdResult { + exitCode: number | null; + signal: NodeJS.Signals | null; + stdout: string; + stderr: string; + elapsedMs: number; +} + +interface RunOptions { + /** Hard cap; on expiry the helper SIGTERMs the child and resolves. */ + timeoutMs: number; + /** stdin payload for `runSandboxCmdStdin`. UTF-8 only. */ + stdin?: string; + /** Override env. Defaults to process.env. */ + env?: NodeJS.ProcessEnv; + /** Override cwd. Defaults to ProbeContext.repoRoot resolution. */ + cwd?: string; +} + +function tail(buf: string, max = TAIL_BYTES): string { + return buf.length <= max ? buf : buf.slice(-max); +} + +/** + * Reject NUL bytes in any string that flows into a child process. Mirrors + * the defense-in-depth used by src/lib/runner.ts (normalizeSpawnFile / + * normalizeSpawnArgs) so probe-side spawns enforce the same boundary. + */ +function rejectNulByte(value: string, label: string): string { + if (value.includes("\u0000")) { + throw new Error(`${label} must not contain NUL bytes`); + } + return value; +} + +/** + * Spawn a bash script and capture the result. Internal helper used by + * the sandbox-cmd path; not exported because direct bash spawning by + * probes invites the same drift the canonical wrapper exists to + * prevent. + * + * Contract that addresses CodeQL js/shell-command-injection-from-environment: + * + * 1. The `script` parameter is always a string LITERAL at every call + * site — callers do not interpolate user-controlled data into + * the script body. + * 2. `bashArgs` carry all variable data and reach the script via + * bash positional parameters ($1, $2, ...). Bash treats positional + * argv as data, not code, so the values bypass parser expansion. + * 3. Every string in `bashArgs` is NUL-byte-rejected here — NUL is + * the only byte process-spawn cannot survive cleanly. + * 4. The bash binary path is hard-coded; `shell: false` is implicit + * because spawn() does not enable a shell when given an explicit + * argv array. + * + * The lgtm suppression below is justified by this contract; it mirrors + * the established pattern in src/lib/runner.ts where the same rule is + * suppressed for argv arrays passed through `bash -c`. + */ +function spawnBash( + script: string, + opts: RunOptions, + bashArgs: readonly string[] = [], +): Promise { + const safeArgs = bashArgs.map((arg, idx) => + rejectNulByte(String(arg), `spawnBash: bashArgs[${idx + 1}]`), + ); + return new Promise((resolve) => { + const startedAt = Date.now(); + let stdout = ""; + let stderr = ""; + // bash -c reserves the first positional after the script for $0; + // a fixed sentinel keeps the script's own $1..$N aligned with the + // caller-supplied bashArgs. Spawn safety contract is documented on + // spawnBash above (literal script body, NUL-validated positional + // argv, hard-coded bash binary). The lgtm marker MUST be the line + // immediately preceding the spawn() call so CodeQL/LGTM picks it up. + // lgtm[js/shell-command-injection-from-environment] + const child = spawn("bash", ["-c", script, "e2e-probe-spawn", ...safeArgs], { + env: opts.env ?? process.env, + cwd: opts.cwd, + stdio: [opts.stdin === undefined ? "ignore" : "pipe", "pipe", "pipe"], + }); + const onTimeout = setTimeout(() => { + try { + child.kill("SIGTERM"); + } catch { + /* already gone */ + } + }, opts.timeoutMs); + child.stdout?.on("data", (chunk: Buffer) => { + stdout = tail(stdout + chunk.toString("utf8")); + }); + child.stderr?.on("data", (chunk: Buffer) => { + stderr = tail(stderr + chunk.toString("utf8")); + }); + if (opts.stdin !== undefined && child.stdin) { + child.stdin.end(opts.stdin); + } + child.on("error", (err) => { + clearTimeout(onTimeout); + resolve({ + exitCode: 127, + signal: null, + stdout, + stderr: tail(stderr + `spawn error: ${err.message}`), + elapsedMs: Date.now() - startedAt, + }); + }); + child.on("close", (code, sig) => { + clearTimeout(onTimeout); + resolve({ + exitCode: code, + signal: sig, + stdout, + stderr, + elapsedMs: Date.now() - startedAt, + }); + }); + }); +} + +/** + * Run a command inside the scenario's sandbox via the canonical + * `e2e_sandbox_exec` shell wrapper. Picks up the same ssh-config + * preferred / openshell-exec fallback transport, the per-call + * timeout, and the classified diagnostic on hang. + * + * `args` is treated as a single argv vector by the wrapper. Each + * element is passed as a positional bash parameter (not + * interpolated into the script body) so payloads with shell + * metacharacters survive intact and no user-controlled data flows + * into the shell command string. + */ +export async function runSandboxCmd( + ctx: ProbeContext, + args: readonly string[], + opts: { timeoutMs?: number; perCallSeconds?: number; stdin?: string } = {}, +): Promise { + if (!ctx.sandboxName) { + return { + exitCode: 1, + signal: null, + stdout: "", + stderr: "runSandboxCmd: ProbeContext.sandboxName is null (E2E_SANDBOX_NAME unset in context.env)", + elapsedMs: 0, + }; + } + const wrapperPath = path.resolve(ctx.repoRoot, VALIDATION_SUITES_REL, "sandbox-exec.sh"); + if (!fs.existsSync(wrapperPath)) { + return { + exitCode: 1, + signal: null, + stdout: "", + stderr: `runSandboxCmd: wrapper not found at ${wrapperPath}`, + elapsedMs: 0, + }; + } + const fnName = opts.stdin === undefined ? "e2e_sandbox_exec" : "e2e_sandbox_exec_stdin"; + // Per-call wrapper cap (bash-side timeout); outer node-side cap + // sits a few seconds above so node always wins and we get a clean + // CmdResult even if bash hangs mid-output. + const perCall = opts.perCallSeconds ?? 25; + const outerMs = opts.timeoutMs ?? perCall * 1000 + 5_000; + // All user-controlled values (wrapper path from ctx.repoRoot, + // sandbox name, payload argv) are passed as positional bash + // parameters rather than interpolated into the script body. + // Layout: $1=wrapperPath, $2=fnName, $3=sandboxName, $4..$N=argv. + // CodeQL alert 715 — "shell command built from environment + // values" — is cleared by this contract because no user data + // appears in the script string. + const script = `set -uo pipefail +. "$1" +E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=${perCall} "$2" "$3" -- "\${@:4}" +`; + return spawnBash( + script, + { + timeoutMs: outerMs, + stdin: opts.stdin, + env: { ...process.env, E2E_CONTEXT_DIR: ctx.contextDir }, + cwd: ctx.repoRoot, + }, + [wrapperPath, fnName, ctx.sandboxName, ...args], + ); +} + +/** + * Spawn a host-side CLI directly. Use for `nemoclaw` / `openshell` + * commands that operate against the host, not inside the sandbox + * (e.g. `nemoclaw shields status`, `openshell policy get`). + */ +export function runHostCmd( + bin: string, + args: readonly string[], + opts: { timeoutMs?: number; cwd?: string; env?: NodeJS.ProcessEnv } = {}, +): Promise { + return new Promise((resolve) => { + const startedAt = Date.now(); + let stdout = ""; + let stderr = ""; + const child = spawn(bin, [...args], { + env: opts.env ?? process.env, + cwd: opts.cwd, + stdio: ["ignore", "pipe", "pipe"], + }); + const timeoutMs = opts.timeoutMs ?? 30_000; + const onTimeout = setTimeout(() => { + try { + child.kill("SIGTERM"); + } catch { + /* already gone */ + } + }, timeoutMs); + child.stdout?.on("data", (chunk: Buffer) => { + stdout = tail(stdout + chunk.toString("utf8")); + }); + child.stderr?.on("data", (chunk: Buffer) => { + stderr = tail(stderr + chunk.toString("utf8")); + }); + child.on("error", (err) => { + clearTimeout(onTimeout); + resolve({ + exitCode: 127, + signal: null, + stdout, + stderr: tail(stderr + `spawn error: ${err.message}`), + elapsedMs: Date.now() - startedAt, + }); + }); + child.on("close", (code, sig) => { + clearTimeout(onTimeout); + resolve({ + exitCode: code, + signal: sig, + stdout, + stderr, + elapsedMs: Date.now() - startedAt, + }); + }); + }); +} + +/** + * Best-effort write of structured probe evidence. Every built-in + * probe writes its structured outcome to ProbeContext.evidencePath + * via this helper so the artifact bundle has a uniform JSON layout. + */ +export function writeProbeEvidence(evidencePath: string, payload: unknown): void { + try { + fs.mkdirSync(path.dirname(evidencePath), { recursive: true }); + fs.writeFileSync(evidencePath, JSON.stringify(payload, null, 2)); + } catch { + /* evidence is best-effort; never fail the probe on IO */ + } +} diff --git a/test/e2e-scenario/scenarios/run.ts b/test/e2e-scenario/scenarios/run.ts index b50e7836ea..ff9fb056c4 100644 --- a/test/e2e-scenario/scenarios/run.ts +++ b/test/e2e-scenario/scenarios/run.ts @@ -8,14 +8,12 @@ import { compileRunPlans, renderPlanText, writePlanArtifacts } from "./compiler. import { ScenarioRunner } from "./orchestrators/runner.ts"; import { listScenarios } from "./registry.ts"; import { resolveRunnerForScenario } from "./runner-routing.ts"; -import type { ScenarioDefinition } from "./types.ts"; +import type { PhaseResult, ScenarioDefinition } from "./types.ts"; interface Args { list: boolean; - planOnly: boolean; - dryRun: boolean; - validateOnly: boolean; emitMatrix: boolean; + planOnly: boolean; scenarios: string[]; } @@ -34,14 +32,7 @@ export interface ScenarioMatrixEntry { } function parseArgs(argv: string[]): Args { - const args: Args = { - list: false, - planOnly: false, - dryRun: false, - validateOnly: false, - emitMatrix: false, - scenarios: [], - }; + const args: Args = { list: false, emitMatrix: false, planOnly: false, scenarios: [] }; for (let i = 0; i < argv.length; i += 1) { const arg = argv[i]; if (arg === "--list") { @@ -56,14 +47,6 @@ function parseArgs(argv: string[]): Args { args.planOnly = true; continue; } - if (arg === "--dry-run") { - args.dryRun = true; - continue; - } - if (arg === "--validate-only") { - args.validateOnly = true; - continue; - } if (arg === "--scenarios") { const value = argv[i + 1]; if (!value) { @@ -122,6 +105,7 @@ function emitMatrix() { // Single line so GHA's `$GITHUB_OUTPUT` can consume it via // echo "matrix=$(npx tsx ... --emit-matrix)" >> "$GITHUB_OUTPUT" // without needing heredoc multi-line output handling. + // Consumed by the dynamic matrix workflow (PR #4359). process.stdout.write(`${JSON.stringify(buildScenarioMatrix())}\n`); } @@ -136,10 +120,6 @@ async function main() { return; } - const modeCount = [args.planOnly, args.dryRun, args.validateOnly].filter(Boolean).length; - if (modeCount !== 1) { - throw new Error("Use exactly one of --plan-only, --dry-run, or --validate-only with --scenarios "); - } if (args.scenarios.length === 0) { throw new Error("scenario execution requires --scenarios "); } @@ -153,12 +133,73 @@ async function main() { writePlanArtifacts(plans, contextDir); console.log(renderPlanText(plans)); - if (args.dryRun) { - const runner = new ScenarioRunner(); - for (const plan of plans) { - await runner.run({ contextDir, dryRun: true }, plan); + if (args.planOnly) { + // Local debug only. Workflows must not pass --plan-only. + return; + } + + const runner = new ScenarioRunner(); + const allResults: PhaseResult[] = []; + let anyFailed = false; + for (const plan of plans) { + const results = await runner.run({ contextDir }, plan); + allResults.push(...results); + if (planFailed(plan, results)) { + anyFailed = true; } } + + // Surface a compact run summary so phase results don't have to be opened + // to see what passed. + console.log(""); + console.log("Phase results:"); + for (const result of allResults) { + const counts = result.assertions.reduce( + (acc, assertion) => { + acc[assertion.status] = (acc[assertion.status] ?? 0) + 1; + return acc; + }, + {} as Record, + ); + const detail = Object.entries(counts) + .map(([status, count]) => `${status}=${count}`) + .join(" "); + console.log(` ${result.phase}: ${result.status} (${detail || "no steps"})`); + } + + if (anyFailed) { + process.exitCode = 1; + } +} + +// A scenario fails iff: +// positive (no expectedFailure): any phase result failed. +// negative (expectedFailure declared): the synthetic +// negative-contract phase did not match, OR the runtime +// control group's required side-effect step did not pass. +// +// The matcher decides exit code for negatives so that a scenario +// that failed for the right reason in the right phase is no longer +// reported as red just because setup did not complete. Until the +// forbidden-side-effect probe lands, the required pending step in +// runtimeControlGroups keeps negatives visibly red on the side-effect +// axis even when phase + errorClass match. +function planFailed(plan: import("./types.ts").RunPlan, results: PhaseResult[]): boolean { + if (!plan.expectedFailure) { + return results.some((result) => result.status === "failed"); + } + const contractPhase = results.find((result) => result.phase === "negative-contract"); + if (!contractPhase || contractPhase.status !== "passed") { + return true; + } + const runtime = results.find((result) => result.phase === "runtime"); + const sideEffectStep = runtime?.assertions.find( + (assertion) => assertion.id === "runtime.expected-failure.no-side-effects", + ); + if (!sideEffectStep || sideEffectStep.status !== "passed") { + return true; + } + return false; } // Only execute when invoked directly as a script. Importing this module from diff --git a/test/e2e-scenario/scenarios/scenarios/baseline.ts b/test/e2e-scenario/scenarios/scenarios/baseline.ts index 098209017a..cd19c43fe0 100644 --- a/test/e2e-scenario/scenarios/scenarios/baseline.ts +++ b/test/e2e-scenario/scenarios/scenarios/baseline.ts @@ -8,10 +8,11 @@ import { gpuRepoDockerCdi, macosRepoDocker, ubuntuRepoDocker, + ubuntuRepoDockerLifecycle, ubuntuRepoNoDocker, wslRepoDocker, } from "../matrix.ts"; -import type { ScenarioDefinition, ScenarioEnvironment } from "../types.ts"; +import type { ExpectedFailureContract, ScenarioDefinition, ScenarioEnvironment } from "../types.ts"; interface CanonicalScenarioInput { id: string; @@ -24,7 +25,7 @@ interface CanonicalScenarioInput { runnerRequirements?: string[]; requiredSecrets?: string[]; skippedCapabilities?: Array>; - expectedFailure?: Record; + expectedFailure?: ExpectedFailureContract; } function canonicalScenario(input: CanonicalScenarioInput): ScenarioDefinition { @@ -130,6 +131,23 @@ const canonicalScenarioInputs: CanonicalScenarioInput[] = [ forbiddenSideEffects: ["gateway-started", "sandbox-created"], }, }, + { + // Rebuild scenario. Onboards an OpenClaw sandbox normally, then + // the lifecycle phase seeds a workspace marker, runs + // `nemoclaw rebuild --yes`, and publishes the marker contract to + // runtime-phase assertions in rebuild_upgrade.sh. Mirrors the + // workspace-state-preservation invariant from + // test/e2e/test-rebuild-openclaw.sh; the broader version-upgrade + // dimension (build OLD-version base image first) belongs to a + // future `rebuild-from-old-version` lifecycle profile and is + // intentionally out of scope here. + id: "ubuntu-rebuild-openclaw", + manifestName: "openclaw-nvidia-rebuild", + environment: ubuntuRepoDockerLifecycle("cloud-openclaw", "rebuild-current-version"), + expectedStateId: "cloud-openclaw-ready", + suiteIds: ["smoke", "rebuild", "upgrade"], + requiredSecrets: ["NVIDIA_API_KEY"], + }, { id: "ubuntu-repo-openai-compatible-openclaw", manifestName: "openclaw-openai-compatible", diff --git a/test/e2e-scenario/scenarios/types.ts b/test/e2e-scenario/scenarios/types.ts index b29f8458d6..157ffa0ae6 100644 --- a/test/e2e-scenario/scenarios/types.ts +++ b/test/e2e-scenario/scenarios/types.ts @@ -1,7 +1,85 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -export type PhaseName = "environment" | "onboarding" | "runtime"; +export type PhaseName = + | "environment" + | "onboarding" + | "state-validation" + | "lifecycle" + | "runtime"; + +// Synthetic phase appended by the scenario runner when a scenario +// declares plan.expectedFailure. Distinct from PhaseName so a scenario +// builder cannot accidentally declare an assertion or action against +// it. Only the runner emits PhaseResult entries with this name. +export type NegativeContractPhase = "negative-contract"; + +export type PhaseResultName = PhaseName | NegativeContractPhase; + +// Concrete probe ids the state-validation orchestrator emits as phase +// actions. Each id maps to a probe script under +// nemoclaw_scenarios/probes/. Inference and credentials probes are +// declared but not yet implemented; the compiler skips emitting actions +// for them until the probe scripts land. +export type StateProbeId = + | "cli-installed" + | "gateway-healthy" + | "gateway-absent" + | "sandbox-running" + | "sandbox-absent"; + +// User-facing phase the negative-scenario contract advertises. Wider +// than PhaseName because manifests may declare "preflight" failures, +// which the matcher resolves to the onboarding phase orchestrator. +// state-validation is intentionally omitted: it is an internal phase +// the framework inserts after onboarding; scenarios cannot declare +// expected failures against it (those are expressed via +// expectedStateId + the absent/forbidden-side-effect probes). +export type ExpectedFailurePhase = "environment" | "onboarding" | "runtime" | "preflight"; + +export interface ExpectedFailureContract { + phase: ExpectedFailurePhase; + errorClass: string; + forbiddenSideEffects?: readonly string[]; +} + +// Expected-state contract. Mirrors the structural shape of +// nemoclaw_scenarios/expected-states.yaml so the typed registry can +// remain a verifiable mirror of the legacy YAML during transition. +// Each dimension's `expected` field declares whether that aspect of +// the post-setup environment should be present, absent, or optional. +// Optional dimensions emit no probe actions; present/absent dimensions +// emit a real probe that gates the runtime phase. +// +// Spec ownership: the typed registry (scenarios/expected-states.ts) is +// the source of truth for the TS runner; expected-states.yaml stays +// alongside until the legacy resolver is fully retired, with a contract +// test that the typed registry mirrors the YAML. +export type ExpectedPresence = "present" | "absent" | "optional"; +export type ExpectedHealth = "healthy" | "absent" | "optional"; +export type ExpectedSandboxStatus = "running" | "absent" | "optional"; +export type ExpectedInferenceAvail = "available" | "absent" | "optional"; + +export interface ExpectedState { + id: string; + cli?: { installed?: boolean }; + gateway?: { + expected: ExpectedPresence; + health?: ExpectedHealth; + }; + sandbox?: { + expected: ExpectedPresence; + status?: ExpectedSandboxStatus; + agent?: string; + }; + inference?: { + expected: ExpectedInferenceAvail; + provider?: string; + }; + credentials?: { + expected: ExpectedPresence; + }; +} export type TransientClassifier = | "empty-event-capture" @@ -66,6 +144,21 @@ export interface AssertionStep { }; evidencePath?: string; reliability?: AssertionStepReliability; + // Declared parent-env keys this step requires beyond the framework's + // allowlist. Anything not allowlisted and not declared here is + // dropped before spawn. See orchestrators/redaction.ts. Each entry + // must match the secret-key shape; the framework rejects non-secret + // names to keep the allowlist-vs-declared-secret boundary honest. + secretEnv?: readonly string[]; + // When true, a probe/pending step that resolves as "skipped" is + // reclassified as "failed" by the phase orchestrator. Required + // steps fail closed when their underlying implementation isn't + // available yet (probe registry not landed, expected-failure + // side-effect validator not implemented, ...) instead of silently + // producing fake green. Defaults to false; set true for security- + // sensitive suites and expected-failure validators that the run + // is not safe without. + required?: boolean; } export interface AssertionGroup { @@ -83,6 +176,15 @@ export interface ScenarioEnvironment { install: string; runtime: string; onboarding: string; + // Optional lifecycle profile id. When set, the compiler emits a + // dedicated `lifecycle` phase action between state-validation and + // runtime. The action is implemented by a worker under + // nemoclaw_scenarios/lifecycle/, dispatched by + // nemoclaw_scenarios/lifecycle/dispatch.sh, and routes by profile + // id (e.g. "rebuild-current-version"). Scenarios that don't need a + // post-onboard state mutation simply omit this field; their + // lifecycle phase emits no actions and runs no assertions. + lifecycle?: string; } export interface ScenarioDefinition { @@ -97,12 +199,56 @@ export interface ScenarioDefinition { runnerRequirements?: string[]; requiredSecrets?: string[]; skippedCapabilities?: Array>; - expectedFailure?: Record; + expectedFailure?: ExpectedFailureContract; +} + +// A phase action is real, deterministic setup work the phase orchestrator +// performs BEFORE running its assertions: install nemoclaw, run +// onboarding, emit context.env, etc. Actions short-circuit assertions on +// failure (assertions don't run if the action they depend on failed). +// +// Spec ownership: phase orchestrators own actions. The top-level runner +// must not execute actions; clients must not embed action policy. +export interface PhaseAction { + id: string; + phase: PhaseName; + description?: string; + // "shell-fn" sources the bash dispatcher and invokes the named function. + // "shell" runs an executable script (used for context-emit helper). + kind: "shell-fn" | "shell"; + // Repo-relative path to the script. + scriptRef: string; + // For "shell-fn": the bash function to invoke after sourcing scriptRef. + fn?: string; + // Single positional arg passed to the function/script (install method or + // onboarding profile id today). Kept as a single string to keep stable + // ids predictable; multi-arg variants can extend this later. + arg?: string; + // Per-action timeout. No retry by default - install/onboard must fail + // loudly so the regression is visible. Retry stays a property of + // assertion steps, not actions. + timeoutSeconds?: number; + // Repo-relative evidence log path. + evidencePath?: string; + // Optional stable alias the orchestrator copies the evidence log to + // after a successful action. Lets legacy shell assertions that + // reference well-known filenames (e.g. ${E2E_CONTEXT_DIR}/onboard.log) + // keep working without coupling them to the action's stable id. + aliasPath?: string; + // Declared parent-env keys this action requires beyond the + // framework's allowlist (PATH, HOME, E2E_*, NEMOCLAW_*, ...). + // Anything not allowlisted and not declared here is dropped before + // spawn. See orchestrators/redaction.ts. Each entry must match the + // secret-key shape; the framework rejects non-secret names so the + // allowlist-vs-declared-secret boundary stays honest. Cloud install + // declares ["NVIDIA_API_KEY"]; slack onboarding declares the slack + // tokens it actually needs; etc. + secretEnv?: readonly string[]; } export interface RunPlanPhase { name: PhaseName; - actions: string[]; + actions: PhaseAction[]; assertionGroups: AssertionGroup[]; } @@ -120,13 +266,12 @@ export interface RunPlan { runnerRequirements: string[]; requiredSecrets: string[]; skippedCapabilities: Array>; - expectedFailure?: Record; + expectedFailure?: ExpectedFailureContract; sutBoundaries: SutBoundary[]; } export interface RunContext { contextDir: string; - dryRun: boolean; } export interface AssertionResult { @@ -139,8 +284,20 @@ export interface AssertionResult { message?: string; } +export interface PhaseActionResult { + id: string; + status: "passed" | "failed" | "skipped"; + durationMs: number; + evidence?: string; + message?: string; +} + export interface PhaseResult { - phase: PhaseName; + phase: PhaseResultName; status: "passed" | "failed" | "skipped"; + // Action results are recorded distinctly from assertion results so + // failure-layer attribution stays unambiguous: a failure in actions + // means setup never completed; assertions did not have a fair chance. + actions: PhaseActionResult[]; assertions: AssertionResult[]; } diff --git a/test/e2e-scenario/validation_suites/assert/gateway-alive.sh b/test/e2e-scenario/validation_suites/assert/gateway-alive.sh index a498602d35..42f33e1c50 100755 --- a/test/e2e-scenario/validation_suites/assert/gateway-alive.sh +++ b/test/e2e-scenario/validation_suites/assert/gateway-alive.sh @@ -9,6 +9,8 @@ _E2E_GW_LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../runtime/lib" && pwd) . "${_E2E_GW_LIB_DIR}/env.sh" # shellcheck source=../../runtime/lib/context.sh . "${_E2E_GW_LIB_DIR}/context.sh" +# shellcheck source=../sandbox-exec.sh +. "$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)/sandbox-exec.sh" # e2e_gateway_assert_healthy [url] # Defaults to E2E_GATEWAY_URL from context; returns non-zero with a clear @@ -23,10 +25,6 @@ e2e_gateway_assert_healthy() { return 2 fi e2e_env_trace "gateway:check" "${url}" - if e2e_env_is_dry_run; then - echo "[dry-run] gateway check ${url} (skipped)" - return 0 - fi # Prefer /health if available, otherwise just hit the base URL. local http_code http_code="$(curl -fsS -o /dev/null -w '%{http_code}' --max-time 5 "${url%/}/health" 2>/dev/null || echo 000)" @@ -41,7 +39,9 @@ e2e_gateway_assert_healthy() { local sandbox_name sandbox_name="$(e2e_context_get E2E_SANDBOX_NAME)" if [[ -n "${sandbox_name}" ]] && command -v openshell >/dev/null 2>&1; then - http_code="$(openshell sandbox exec -n "${sandbox_name}" -- curl -fsS -o /dev/null -w '%{http_code}' --max-time 5 http://localhost:18789/health 2>/dev/null || echo 000)" + # Wrapper applies a per-call timeout so a wedged ssh handshake here + # cannot consume the orchestrator's whole step budget. + http_code="$(E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=15 e2e_sandbox_exec "${sandbox_name}" -- curl -fsS -o /dev/null -w '%{http_code}' --max-time 5 http://localhost:18789/health 2>/dev/null || echo 000)" if [[ "${http_code}" == "200" || "${http_code}" == "401" ]]; then return 0 fi diff --git a/test/e2e-scenario/validation_suites/assert/sandbox-alive.sh b/test/e2e-scenario/validation_suites/assert/sandbox-alive.sh index b85ef9cd60..473061e972 100755 --- a/test/e2e-scenario/validation_suites/assert/sandbox-alive.sh +++ b/test/e2e-scenario/validation_suites/assert/sandbox-alive.sh @@ -12,7 +12,6 @@ _E2E_SB_LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../runtime/lib" && pwd) # e2e_sandbox_assert_running # Requires E2E_SANDBOX_NAME in context. Real implementation queries -# `nemoclaw list`; honors E2E_DRY_RUN. e2e_sandbox_assert_running() { if ! e2e_context_require E2E_SANDBOX_NAME; then return 1 @@ -20,10 +19,6 @@ e2e_sandbox_assert_running() { local name name="$(e2e_context_get E2E_SANDBOX_NAME)" e2e_env_trace "sandbox:check" "${name}" - if e2e_env_is_dry_run; then - echo "[dry-run] sandbox check ${name} (skipped)" - return 0 - fi if ! command -v nemoclaw >/dev/null 2>&1; then echo "e2e_sandbox_assert_running: nemoclaw CLI not on PATH" >&2 return 1 diff --git a/test/e2e-scenario/validation_suites/hermes/00-hermes-health.sh b/test/e2e-scenario/validation_suites/hermes/00-hermes-health.sh index 0fff0fd9ab..4b8161aea4 100755 --- a/test/e2e-scenario/validation_suites/hermes/00-hermes-health.sh +++ b/test/e2e-scenario/validation_suites/hermes/00-hermes-health.sh @@ -16,10 +16,6 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../runtime/lib" && pwd)" echo "hermes-specific:hermes-health" e2e_context_require E2E_AGENT -if e2e_env_is_dry_run; then - echo "[dry-run] would run Hermes health checks" - exit 0 -fi agent="$(e2e_context_get E2E_AGENT)" if [[ "${agent}" != "hermes" ]]; then echo "hermes-specific: E2E_AGENT should be 'hermes', got '${agent}'" >&2 diff --git a/test/e2e-scenario/validation_suites/hermes/01-history-writable.sh b/test/e2e-scenario/validation_suites/hermes/01-history-writable.sh index 953263d50a..64746aa31c 100755 --- a/test/e2e-scenario/validation_suites/hermes/01-history-writable.sh +++ b/test/e2e-scenario/validation_suites/hermes/01-history-writable.sh @@ -31,10 +31,6 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../runtime/lib" && pwd)" echo "hermes-specific:history-writable" e2e_context_require E2E_AGENT E2E_SANDBOX_NAME -if e2e_env_is_dry_run; then - echo "[dry-run] would probe /sandbox/.hermes/.hermes_history writability under shields up/down" - exit 0 -fi agent="$(e2e_context_get E2E_AGENT)" if [[ "${agent}" != "hermes" ]]; then diff --git a/test/e2e-scenario/validation_suites/inference/cloud/00-models-health.sh b/test/e2e-scenario/validation_suites/inference/cloud/00-models-health.sh index 64e1b086fc..8277f05f38 100755 --- a/test/e2e-scenario/validation_suites/inference/cloud/00-models-health.sh +++ b/test/e2e-scenario/validation_suites/inference/cloud/00-models-health.sh @@ -13,17 +13,16 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" . "${LIB_DIR}/env.sh" # shellcheck source=../../../runtime/lib/context.sh . "${LIB_DIR}/context.sh" +# shellcheck source=../../sandbox-exec.sh +. "${SCRIPT_DIR}/../../sandbox-exec.sh" echo "inference:models-health" e2e_context_require E2E_SANDBOX_NAME -if e2e_env_is_dry_run; then - echo "[dry-run] would GET inference.local/v1/models from inside the sandbox" - exit 0 -fi - name="$(e2e_context_get E2E_SANDBOX_NAME)" -body="$(openshell sandbox exec --name "${name}" -- curl -fsS --max-time 30 "https://inference.local/v1/models")" +# Orchestrator step cap is 30s; wrapper default 25s applies. Inner curl +# --max-time keeps a hung HTTP read from consuming the whole budget. +body="$(e2e_sandbox_exec "${name}" -- curl -fsS --max-time 20 "https://inference.local/v1/models")" if [[ -z "${body}" ]]; then echo "inference:models-health: no response from models endpoint" >&2 exit 1 diff --git a/test/e2e-scenario/validation_suites/inference/cloud/01-chat-completion.sh b/test/e2e-scenario/validation_suites/inference/cloud/01-chat-completion.sh index f54ff8806b..c76e15842d 100755 --- a/test/e2e-scenario/validation_suites/inference/cloud/01-chat-completion.sh +++ b/test/e2e-scenario/validation_suites/inference/cloud/01-chat-completion.sh @@ -12,19 +12,21 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" . "${LIB_DIR}/env.sh" # shellcheck source=../../../runtime/lib/context.sh . "${LIB_DIR}/context.sh" +# shellcheck source=../../sandbox-exec.sh +. "${SCRIPT_DIR}/../../sandbox-exec.sh" echo "inference:chat-completion" e2e_context_require E2E_SANDBOX_NAME -if e2e_env_is_dry_run; then - echo "[dry-run] would POST a chat completion to inference.local from inside the sandbox" - exit 0 -fi - name="$(e2e_context_get E2E_SANDBOX_NAME)" payload='{"model":"nvidia/nemotron-3-super-120b-a12b","messages":[{"role":"user","content":"Reply with exactly one word: PONG"}],"max_tokens":100}' -response="$(openshell sandbox exec --name "${name}" -- curl -fsS --max-time 60 -H 'Content-Type: application/json' \ - -d "${payload}" "https://inference.local/v1/chat/completions")" +# Orchestrator step cap is 60s; widen the wrapper cap to 50s so a hung +# upstream surfaces with a clear diagnostic before SIGTERM. Inner curl +# --max-time stays ~10s under the wrapper cap. +# shellcheck disable=SC2034 # consumed by e2e_sandbox_exec via env +E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=50 \ + response="$(e2e_sandbox_exec "${name}" -- curl -fsS --max-time 40 -H 'Content-Type: application/json' \ + -d "${payload}" "https://inference.local/v1/chat/completions")" # CodeRabbit review item #12: substring expansion instead of `| head` # avoids SIGPIPE-driven false failures under `set -o pipefail`. printf '%s\n' "${response:0:1024}" diff --git a/test/e2e-scenario/validation_suites/inference/cloud/02-inference-local-from-sandbox.sh b/test/e2e-scenario/validation_suites/inference/cloud/02-inference-local-from-sandbox.sh index 6d1343a736..e00b83f75e 100755 --- a/test/e2e-scenario/validation_suites/inference/cloud/02-inference-local-from-sandbox.sh +++ b/test/e2e-scenario/validation_suites/inference/cloud/02-inference-local-from-sandbox.sh @@ -13,18 +13,37 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" . "${LIB_DIR}/env.sh" # shellcheck source=../../../runtime/lib/context.sh . "${LIB_DIR}/context.sh" +# shellcheck source=../../sandbox-exec.sh +. "${SCRIPT_DIR}/../../sandbox-exec.sh" echo "inference:sandbox-inference-local" e2e_context_require E2E_SANDBOX_NAME E2E_INFERENCE_ROUTE -if e2e_env_is_dry_run; then - echo "[dry-run] would resolve inference-local from inside the sandbox" - exit 0 -fi - name="$(e2e_context_get E2E_SANDBOX_NAME)" route="$(e2e_context_get E2E_INFERENCE_ROUTE)" + +# Map the route slug recorded in context.env (e.g. "inference-local") +# to the actual DNS hostname used by the OpenShell DNS+proxy inside +# the sandbox. The legacy test/e2e/ tests (test-cloud-inference-e2e.sh, +# test-bedrock-runtime-compatible-anthropic.sh, test-full-e2e.sh, ...) +# all hit the literal `inference.local` hostname — the sandbox-side +# resolver only knows that name. Interpolating the slug directly +# (`https://inference-local/...`) yields a different, non-existent DNS +# name and the gateway returns 403 because no policy widens egress +# for it. +host="" +case "${route}" in + inference-local) host="inference.local" ;; + *) + echo "inference:sandbox-inference-local: unsupported E2E_INFERENCE_ROUTE '${route}'; add a slug→hostname mapping here" >&2 + exit 2 + ;; +esac + +# Orchestrator step cap is 45s; widen wrapper cap to 35s. # CodeRabbit review item #13: capture then truncate to avoid `| head` racing # curl under `pipefail` and flagging a successful request as failed. -body="$(openshell sandbox exec --name "${name}" -- curl -fsS --max-time 10 "https://${route}/v1/models")" +# shellcheck disable=SC2034 # consumed by e2e_sandbox_exec via env +E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=35 \ + body="$(e2e_sandbox_exec "${name}" -- curl -fsS --max-time 25 "https://${host}/v1/models")" printf '%s\n' "${body:0:512}" diff --git a/test/e2e-scenario/validation_suites/inference/ollama-auth-proxy/00-proxy-reachable.sh b/test/e2e-scenario/validation_suites/inference/ollama-auth-proxy/00-proxy-reachable.sh index 77d4772c17..d172615795 100755 --- a/test/e2e-scenario/validation_suites/inference/ollama-auth-proxy/00-proxy-reachable.sh +++ b/test/e2e-scenario/validation_suites/inference/ollama-auth-proxy/00-proxy-reachable.sh @@ -12,18 +12,16 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" . "${LIB_DIR}/env.sh" # shellcheck source=../../../runtime/lib/context.sh . "${LIB_DIR}/context.sh" +# shellcheck source=../../sandbox-exec.sh +. "${SCRIPT_DIR}/../../sandbox-exec.sh" echo "ollama-proxy:proxy-reachable" e2e_context_require E2E_SANDBOX_NAME -if e2e_env_is_dry_run; then - echo "[dry-run] would verify the Ollama auth proxy is reachable from the sandbox" - exit 0 -fi name="$(e2e_context_get E2E_SANDBOX_NAME)" # The Ollama auth proxy intentionally rejects unauthenticated requests to # /api/tags (legacy test-gpu-e2e.sh accepts 401/403 as proof the proxy is # live and enforcing auth). Do not use curl -f here. -status="$(openshell sandbox exec --name "${name}" -- curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "http://inference-local/api/tags" 2>/dev/null || echo 000)" +status="$(e2e_sandbox_exec "${name}" -- curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "http://inference-local/api/tags" 2>/dev/null || echo 000)" case "${status}" in 200 | 401 | 403) echo "ollama-proxy:proxy-reachable status=${status}" diff --git a/test/e2e-scenario/validation_suites/inference/ollama-gpu/00-ollama-models-health.sh b/test/e2e-scenario/validation_suites/inference/ollama-gpu/00-ollama-models-health.sh index 47e9f1fd43..d61ead2e98 100755 --- a/test/e2e-scenario/validation_suites/inference/ollama-gpu/00-ollama-models-health.sh +++ b/test/e2e-scenario/validation_suites/inference/ollama-gpu/00-ollama-models-health.sh @@ -15,10 +15,6 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" echo "local-ollama-inference:ollama-models-health" e2e_context_require E2E_PROVIDER -if e2e_env_is_dry_run; then - echo "[dry-run] would GET ollama /api/tags via host Ollama" - exit 0 -fi # GPU Ollama scenarios mirror legacy test-gpu-e2e.sh: validate the host # Ollama daemon directly because Docker GPU host networking bypasses the # normal dashboard/gateway forward path. diff --git a/test/e2e-scenario/validation_suites/inference/ollama-gpu/01-ollama-chat-completion.sh b/test/e2e-scenario/validation_suites/inference/ollama-gpu/01-ollama-chat-completion.sh index ad8ff54faa..5d18b4209a 100755 --- a/test/e2e-scenario/validation_suites/inference/ollama-gpu/01-ollama-chat-completion.sh +++ b/test/e2e-scenario/validation_suites/inference/ollama-gpu/01-ollama-chat-completion.sh @@ -15,10 +15,6 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" echo "local-ollama-inference:ollama-chat-completion" e2e_context_require E2E_SANDBOX_NAME -if e2e_env_is_dry_run; then - echo "[dry-run] would POST chat completion from sandbox to host-network Ollama" - exit 0 -fi name="$(e2e_context_get E2E_SANDBOX_NAME)" model="$(curl -fsS --max-time 10 http://127.0.0.1:11434/api/tags \ | node -e "const fs=require('fs'); const data=JSON.parse(fs.readFileSync(0,'utf8')); process.stdout.write(data.models?.[0]?.name || data.models?.[0]?.model || 'default');")" diff --git a/test/e2e-scenario/validation_suites/lib/inference_routing.sh b/test/e2e-scenario/validation_suites/lib/inference_routing.sh index b4f4c1d63f..17db0bbedb 100755 --- a/test/e2e-scenario/validation_suites/lib/inference_routing.sh +++ b/test/e2e-scenario/validation_suites/lib/inference_routing.sh @@ -31,16 +31,6 @@ _e2e_inference_sandbox_name() { e2e_context_get E2E_SANDBOX_NAME } -_e2e_inference_plan() { - local assertion_id="${1:-}" - local detail="${2:-planned inference/provider check}" - e2e_env_trace "inference:plan" "${assertion_id} ${detail}" - echo "[dry-run] ${assertion_id}: ${detail}" - if [[ -f "$(e2e_context_path)" ]]; then - e2e_context_dump | sed -E 's/(TOKEN|SECRET|API_KEY|APIKEY|CREDENTIAL|PASSWORD)([^=]*)=.*/\1\2=REDACTED/' - fi -} - _e2e_inference_curl_json() { local sandbox="$1" local url="$2" @@ -64,10 +54,6 @@ e2e_inference_routing_assert_chat_completion() { local assertion_id="${1:-post-onboard.inference-routing.inference-local-chat-completion}" _e2e_inference_assertion "${assertion_id}" _e2e_inference_require_sandbox - if e2e_env_is_dry_run; then - _e2e_inference_plan "${assertion_id}" "POST https://inference.local/v1/chat/completions with bounded curl" - return 0 - fi local sandbox payload output sandbox="$(_e2e_inference_sandbox_name)" payload='{"model":"default","messages":[{"role":"user","content":"Say ok"}],"max_tokens":8}' @@ -84,10 +70,6 @@ e2e_inference_routing_assert_health() { local url="${2:-https://inference.local/v1/models}" _e2e_inference_assertion "${assertion_id}" _e2e_inference_require_sandbox - if e2e_env_is_dry_run; then - _e2e_inference_plan "${assertion_id}" "GET ${url} with bounded curl" - return 0 - fi local sandbox status sandbox="$(_e2e_inference_sandbox_name)" status="$(_e2e_inference_status "${sandbox}" "${url}")" @@ -103,10 +85,6 @@ e2e_inference_routing_assert_auth_proxy() { local mode="${2:-valid}" _e2e_inference_assertion "${assertion_id}" _e2e_inference_require_sandbox - if e2e_env_is_dry_run; then - _e2e_inference_plan "${assertion_id}" "auth-proxy ${mode} request; sensitive context redacted" - return 0 - fi local sandbox status token sandbox="$(_e2e_inference_sandbox_name)" case "${mode}" in diff --git a/test/e2e-scenario/validation_suites/lib/messaging_providers.sh b/test/e2e-scenario/validation_suites/lib/messaging_providers.sh index 77eb1f1176..01250b784f 100755 --- a/test/e2e-scenario/validation_suites/lib/messaging_providers.sh +++ b/test/e2e-scenario/validation_suites/lib/messaging_providers.sh @@ -104,10 +104,6 @@ e2e_messaging_read_config_surface() { return 0 fi path="$(e2e_messaging_agent_config_path)" - if [[ -n "${E2E_DRY_RUN:-}" ]]; then - printf '%s=PLACEHOLDER\n' "$(e2e_messaging_config_key)" - return 0 - fi if [[ -f "${path}" ]]; then cat "${path}" return 0 @@ -177,9 +173,6 @@ e2e_messaging_assert_literal_payload() { local assertion_id="${1:?assertion id required}" local payload="${2:?payload required}" local observed="${3:-}" - if [[ -z "${observed}" && -n "${E2E_DRY_RUN:-}" ]]; then - observed="${payload}" - fi if [[ -z "${observed}" ]]; then e2e_fail "${assertion_id} missing observed payload output" fi diff --git a/test/e2e-scenario/validation_suites/lib/rebuild_upgrade.sh b/test/e2e-scenario/validation_suites/lib/rebuild_upgrade.sh index c6483c99fb..317e0974f8 100755 --- a/test/e2e-scenario/validation_suites/lib/rebuild_upgrade.sh +++ b/test/e2e-scenario/validation_suites/lib/rebuild_upgrade.sh @@ -10,6 +10,15 @@ _REBUILD_UPGRADE_REPO_ROOT="$(cd "${_REBUILD_UPGRADE_DIR}/../../../.." && pwd)" . "${_REBUILD_UPGRADE_REPO_ROOT}/test/e2e-scenario/runtime/lib/context.sh" # shellcheck source=../../runtime/lib/logging.sh . "${_REBUILD_UPGRADE_REPO_ROOT}/test/e2e-scenario/runtime/lib/logging.sh" +# shellcheck source=../sandbox-exec.sh +. "${_REBUILD_UPGRADE_REPO_ROOT}/test/e2e-scenario/validation_suites/sandbox-exec.sh" + +# Sandbox-exec calls in this lib feed the lifecycle.rebuild/upgrade +# orchestrator steps, which carry 120s caps. Default the per-call wrapper +# cap to 100s so a hung 'openshell sandbox exec'/'ssh -F' surfaces as a +# classified exit 124 well before the orchestrator's SIGTERM. Callers +# may still override per-call. +: "${E2E_SANDBOX_EXEC_TIMEOUT_SECONDS:=100}" rebuild_upgrade_require_context() { e2e_context_require E2E_SCENARIO E2E_AGENT E2E_SANDBOX_NAME E2E_GATEWAY_URL @@ -30,15 +39,30 @@ _rebuild_upgrade_run() { "$@" } +# _rebuild_upgrade_sandbox_exec [args...] +# Routes through the canonical `e2e_sandbox_exec` wrapper (ssh-config +# preferred, openshell-exec fallback, per-call timeout, classified +# diagnostic on hang) for production; honors the legacy +# REBUILD_UPGRADE_SANDBOX_CMD override so tests can inject a fake. The +# override contract preserves the original argv shape +# (` -n -- ...`) so existing test fakes +# (e.g. `REBUILD_UPGRADE_SANDBOX_CMD=fake_sandbox`) keep working. +_rebuild_upgrade_sandbox_exec() { + local sandbox="$1" + shift + if [[ -n "${REBUILD_UPGRADE_SANDBOX_CMD:-}" ]]; then + # shellcheck disable=SC2086 + ${REBUILD_UPGRADE_SANDBOX_CMD} -n "${sandbox}" -- "$@" + return $? + fi + e2e_sandbox_exec "${sandbox}" -- "$@" +} + rebuild_upgrade_assert_sandbox_reachable() { rebuild_upgrade_require_context || return 1 - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.upgrade.survivor_agent_reachable dry-run" - return 0 - fi local sandbox sandbox="$(_rebuild_upgrade_ctx E2E_SANDBOX_NAME)" - if _rebuild_upgrade_run REBUILD_UPGRADE_SANDBOX_CMD openshell sandbox exec -n "${sandbox}" -- true; then + if _rebuild_upgrade_sandbox_exec "${sandbox}" true; then e2e_pass "suite.upgrade.survivor_agent_reachable" else e2e_fail "suite.upgrade.survivor_agent_reachable" @@ -47,15 +71,11 @@ rebuild_upgrade_assert_sandbox_reachable() { rebuild_upgrade_assert_marker_preserved() { rebuild_upgrade_require_context || return 1 - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.rebuild.workspace_state_preserved dry-run" - return 0 - fi local sandbox marker_path expected actual sandbox="$(_rebuild_upgrade_ctx E2E_SANDBOX_NAME)" marker_path="${E2E_REBUILD_MARKER_PATH:-/workspace/.nemoclaw-rebuild-marker}" expected="${E2E_REBUILD_MARKER_EXPECTED:-${E2E_STATE_MARKER_EXPECTED:-}}" - actual="$(_rebuild_upgrade_run REBUILD_UPGRADE_SANDBOX_CMD openshell sandbox exec -n "${sandbox}" -- cat "${marker_path}" 2>/dev/null || true)" + actual="$(_rebuild_upgrade_sandbox_exec "${sandbox}" cat "${marker_path}" 2>/dev/null || true)" if [[ -n "${actual}" && (-z "${expected}" || "${actual}" == "${expected}") ]]; then e2e_pass "suite.rebuild.workspace_state_preserved" else @@ -65,16 +85,12 @@ rebuild_upgrade_assert_marker_preserved() { rebuild_upgrade_assert_agent_version_upgraded() { rebuild_upgrade_require_context || return 1 - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.rebuild.agent_version_upgraded dry-run" - return 0 - fi local sandbox old expected actual cmd sandbox="$(_rebuild_upgrade_ctx E2E_SANDBOX_NAME)" old="${E2E_OLD_AGENT_VERSION:-}" expected="${E2E_EXPECTED_AGENT_VERSION:-}" cmd="${E2E_AGENT_VERSION_COMMAND:-openclaw --version}" - actual="$(_rebuild_upgrade_run REBUILD_UPGRADE_SANDBOX_CMD openshell sandbox exec -n "${sandbox}" -- bash -lc "${cmd}" 2>/dev/null || true)" + actual="$(_rebuild_upgrade_sandbox_exec "${sandbox}" bash -lc "${cmd}" 2>/dev/null || true)" if [[ -n "${actual}" && (-z "${old}" || "${actual}" != *"${old}"*) && (-z "${expected}" || "${actual}" == *"${expected}"*) ]]; then e2e_pass "suite.rebuild.agent_version_upgraded" else @@ -84,14 +100,10 @@ rebuild_upgrade_assert_agent_version_upgraded() { rebuild_upgrade_assert_inference_works() { rebuild_upgrade_require_context || return 1 - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.rebuild.inference_still_works dry-run" - return 0 - fi local sandbox cmd output sandbox="$(_rebuild_upgrade_ctx E2E_SANDBOX_NAME)" cmd="${E2E_INFERENCE_CHECK_COMMAND:-curl -fsS http://inference.local/v1/models}" - output="$(_rebuild_upgrade_run REBUILD_UPGRADE_SANDBOX_CMD openshell sandbox exec -n "${sandbox}" -- bash -lc "${cmd}" 2>/dev/null || true)" + output="$(_rebuild_upgrade_sandbox_exec "${sandbox}" bash -lc "${cmd}" 2>/dev/null || true)" if [[ -n "${output}" ]]; then e2e_pass "suite.rebuild.inference_still_works" else @@ -101,20 +113,48 @@ rebuild_upgrade_assert_inference_works() { rebuild_upgrade_assert_policy_presets_preserved() { rebuild_upgrade_require_context || return 1 - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.rebuild.policy_presets_preserved dry-run" - return 0 - fi - local presets output preset + local id="suite.rebuild.policy_presets_preserved" + local sandbox presets output preset + sandbox="$(_rebuild_upgrade_ctx E2E_SANDBOX_NAME)" presets="${E2E_EXPECTED_POLICY_PRESETS:-npm pypi}" - output="$(_rebuild_upgrade_run REBUILD_UPGRADE_NEMOCLAW_CMD nemoclaw policy status 2>/dev/null || true)" + + # Mirror the legacy test/e2e/test-rebuild-openclaw.sh and + # test-full-e2e.sh pattern: ask the live gateway for the full policy + # via `openshell policy get --full ` and grep for the preset + # name OR a well-known endpoint hostname for that preset. The earlier + # implementation called `nemoclaw policy status`, which does not + # exist as a CLI subcommand — the assertion always failed silently + # because the wrapper swallowed the missing-command stderr via + # `2>/dev/null || true`. + output="$(_rebuild_upgrade_run REBUILD_UPGRADE_OPENSHELL_CMD openshell policy get --full "${sandbox}" 2>&1 || true)" + if [[ -z "${output}" ]]; then + e2e_fail "${id} openshell policy get --full returned no output for sandbox '${sandbox}'" + return 1 + fi + + local preset matchers found m for preset in ${presets}; do - if [[ "${output}" != *"${preset}"* ]]; then - e2e_fail "suite.rebuild.policy_presets_preserved" + case "${preset}" in + npm) matchers=("npm" "registry.npmjs.org") ;; + pypi) matchers=("pypi" "pypi.org" "files.pythonhosted.org") ;; + huggingface) matchers=("huggingface" "huggingface.co") ;; + brew) matchers=("brew" "formulae.brew.sh") ;; + openclaw-pricing) matchers=("openclaw-pricing" "openrouter.ai") ;; + *) matchers=("${preset}") ;; + esac + found=0 + for m in "${matchers[@]}"; do + if [[ "${output}" == *"${m}"* ]]; then + found=1 + break + fi + done + if [[ "${found}" -eq 0 ]]; then + e2e_fail "${id} preset '${preset}' not in policy (matchers: ${matchers[*]}); head: ${output:0:300}" return 1 fi done - e2e_pass "suite.rebuild.policy_presets_preserved" + e2e_pass "${id} presets=${presets}" } rebuild_upgrade_assert_hermes_config_preserved() { @@ -123,13 +163,9 @@ rebuild_upgrade_assert_hermes_config_preserved() { e2e_pass "suite.rebuild.hermes_config_preserved skipped non-hermes" return 0 fi - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.rebuild.hermes_config_preserved dry-run" - return 0 - fi local sandbox output sandbox="$(_rebuild_upgrade_ctx E2E_SANDBOX_NAME)" - output="$(_rebuild_upgrade_run REBUILD_UPGRADE_SANDBOX_CMD openshell sandbox exec -n "${sandbox}" -- bash -lc "grep -R 'platforms.discord\|DISCORD' ~/.hermes . 2>/dev/null" || true)" + output="$(_rebuild_upgrade_sandbox_exec "${sandbox}" bash -lc "grep -R 'platforms.discord\|DISCORD' ~/.hermes . 2>/dev/null" || true)" if [[ "${output}" == *"discord"* || "${output}" == *"DISCORD"* ]]; then e2e_pass "suite.rebuild.hermes_config_preserved" else @@ -139,10 +175,6 @@ rebuild_upgrade_assert_hermes_config_preserved() { rebuild_upgrade_assert_sandbox_registry_preserved() { rebuild_upgrade_require_context || return 1 - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.upgrade.sandbox_registry_preserved dry-run" - return 0 - fi local sandbox output sandbox="$(_rebuild_upgrade_ctx E2E_SANDBOX_NAME)" output="$(_rebuild_upgrade_run REBUILD_UPGRADE_NEMOCLAW_CMD nemoclaw list 2>/dev/null || true)" @@ -155,10 +187,6 @@ rebuild_upgrade_assert_sandbox_registry_preserved() { rebuild_upgrade_assert_gateway_version_upgraded() { rebuild_upgrade_require_context || return 1 - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.upgrade.gateway_version_upgraded dry-run" - return 0 - fi local expected output expected="${E2E_EXPECTED_OPENSHELL_VERSION:-}" output="$(_rebuild_upgrade_run REBUILD_UPGRADE_GATEWAY_CMD curl -fsS "$(_rebuild_upgrade_ctx E2E_GATEWAY_URL)/version" 2>/dev/null || true)" diff --git a/test/e2e-scenario/validation_suites/lib/sandbox_lifecycle.sh b/test/e2e-scenario/validation_suites/lib/sandbox_lifecycle.sh index df942487e7..fa33a4230e 100755 --- a/test/e2e-scenario/validation_suites/lib/sandbox_lifecycle.sh +++ b/test/e2e-scenario/validation_suites/lib/sandbox_lifecycle.sh @@ -5,6 +5,8 @@ _sandbox_lifecycle_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # shellcheck source=../../runtime/lib/context.sh . "${_sandbox_lifecycle_dir}/../../runtime/lib/context.sh" +# shellcheck source=../sandbox-exec.sh +. "${_sandbox_lifecycle_dir}/../sandbox-exec.sh" SANDBOX_LIFECYCLE_LAST_OUTPUT="" @@ -37,11 +39,6 @@ sandbox_lifecycle_run_with_timeout() { local seconds="$1" shift SANDBOX_LIFECYCLE_LAST_OUTPUT="" - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - SANDBOX_LIFECYCLE_LAST_OUTPUT="dry-run: $*" - printf '%s\n' "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" - return 0 - fi if command -v timeout >/dev/null 2>&1; then SANDBOX_LIFECYCLE_LAST_OUTPUT="$(timeout "${seconds}" "$@" 2>&1)" || { local rc=$? @@ -58,13 +55,47 @@ sandbox_lifecycle_run_with_timeout() { printf '%s\n' "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" } +# _sandbox_lifecycle_sandbox_exec [args...] +# +# Routes ssh-into-sandbox calls through the canonical e2e_sandbox_exec +# wrapper (ssh-config preferred transport, openshell-exec fallback, +# classified diagnostic on hang) instead of invoking +# `openshell sandbox exec` directly. Behavior contract for callers: +# - On success: SANDBOX_LIFECYCLE_LAST_OUTPUT contains stdout+stderr; +# stdout is also printed (matches sandbox_lifecycle_run_with_timeout). +# - On failure: returns the wrapper's exit code (124 on hang, real +# command exit otherwise) and prints the captured output to stderr. +# +# Why a separate helper instead of just calling e2e_sandbox_exec at the +# call sites: this lib's existing assert helpers all read +# SANDBOX_LIFECYCLE_LAST_OUTPUT after the timeout helper returns. Keeping +# that contract intact lets us migrate without rewriting every assert. +_sandbox_lifecycle_sandbox_exec() { + local seconds="$1" + shift + SANDBOX_LIFECYCLE_LAST_OUTPUT="" + local rc=0 + SANDBOX_LIFECYCLE_LAST_OUTPUT="$( + E2E_SANDBOX_EXEC_TIMEOUT_SECONDS="${seconds}" \ + e2e_sandbox_exec "${E2E_SANDBOX_NAME}" -- "$@" 2>&1 + )" || rc=$? + if [[ "${rc}" -ne 0 ]]; then + printf '%s\n' "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" >&2 + return "${rc}" + fi + printf '%s\n' "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" +} + sandbox_lifecycle_assert_nemoclaw_list_contains_sandbox() { local id="validation.sandbox_operations.sandbox_listed" sandbox_lifecycle_run_with_timeout 20 nemoclaw list >/dev/null || { sandbox_lifecycle_fail "${id}" "nemoclaw list failed" return 1 } - [[ "${E2E_DRY_RUN:-0}" == "1" || "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" == *"${E2E_SANDBOX_NAME}"* ]] || { + # Match the sandbox name exactly as a whole token; substring match + # would let `sb1` falsely match `sb10`. + awk -v n="${E2E_SANDBOX_NAME}" '$1 == n { found = 1 } END { exit !found }' \ + <<<"${SANDBOX_LIFECYCLE_LAST_OUTPUT}" || { sandbox_lifecycle_fail "${id}" "sandbox not listed: ${E2E_SANDBOX_NAME}" return 1 } @@ -77,16 +108,25 @@ sandbox_lifecycle_assert_status_fields_present() { sandbox_lifecycle_fail "${id}" "nemoclaw status failed" return 1 } - if [[ "${E2E_DRY_RUN:-0}" != "1" ]]; then - local status_output_lower - status_output_lower="$(printf '%s' "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" | tr '[:upper:]' '[:lower:]')" - for field in status gateway sandbox; do - [[ "${status_output_lower}" == *"${field}"* ]] || { - sandbox_lifecycle_fail "${id}" "missing status field: ${field}" - return 1 - } - done + # The real `nemoclaw status` output (src/lib/actions/sandbox/status.ts) + # always emits a 'Sandbox: ' header plus structured fields like + # 'Model:', 'OpenShell:', 'Policies:'. The original assertion required + # literal 'status' and 'gateway' tokens that never appear in normal + # output — it only passed against the test-suite mock. Align with the + # production CLI: require the sandbox name and a couple of substantive + # field labels that are unconditionally printed. + local output="${SANDBOX_LIFECYCLE_LAST_OUTPUT}" + if [[ "${output}" != *"${E2E_SANDBOX_NAME}"* ]]; then + sandbox_lifecycle_fail "${id}" "status output did not mention sandbox '${E2E_SANDBOX_NAME}'" + return 1 fi + local field + for field in Sandbox Model OpenShell; do + [[ "${output}" == *"${field}"* ]] || { + sandbox_lifecycle_fail "${id}" "missing status field: ${field}" + return 1 + } + done sandbox_lifecycle_pass "${id}" "status fields present" } @@ -96,7 +136,7 @@ sandbox_lifecycle_assert_logs_available() { sandbox_lifecycle_fail "${id}" "nemoclaw logs failed" return 1 } - [[ "${E2E_DRY_RUN:-0}" == "1" || -n "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" ]] || { + [[ -n "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" ]] || { sandbox_lifecycle_fail "${id}" "logs empty" return 1 } @@ -105,11 +145,11 @@ sandbox_lifecycle_assert_logs_available() { sandbox_lifecycle_assert_openshell_exec_ok() { local id="validation.sandbox_operations.openshell_exec_ok" - sandbox_lifecycle_run_with_timeout 20 openshell sandbox exec -n "${E2E_SANDBOX_NAME}" -- sh -lc 'echo lifecycle-ok' >/dev/null || { + _sandbox_lifecycle_sandbox_exec 20 sh -lc 'echo lifecycle-ok' >/dev/null || { sandbox_lifecycle_fail "${id}" "openshell exec failed" return 1 } - [[ "${E2E_DRY_RUN:-0}" == "1" || "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" == *"lifecycle-ok"* ]] || { + [[ "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" == *"lifecycle-ok"* ]] || { sandbox_lifecycle_fail "${id}" "unexpected exec output" return 1 } @@ -139,30 +179,36 @@ sandbox_lifecycle_assert_gateway_recovers_after_probe() { } sandbox_lifecycle_assert_snapshot_create_list_restore_marker() { - sandbox_lifecycle_run_with_timeout 30 openshell sandbox exec -n "${E2E_SANDBOX_NAME}" -- sh -lc 'echo lifecycle-marker-before-snapshot > /tmp/nemoclaw-lifecycle-marker' >/dev/null || { + _sandbox_lifecycle_sandbox_exec 30 sh -lc 'echo lifecycle-marker-before-snapshot > /tmp/nemoclaw-lifecycle-marker' >/dev/null || { sandbox_lifecycle_fail validation.sandbox_snapshot.marker_written "failed to write marker" return 1 } sandbox_lifecycle_pass validation.sandbox_snapshot.marker_written "marker written" - sandbox_lifecycle_run_with_timeout 30 nemoclaw snapshot create "${E2E_SANDBOX_NAME}" >/dev/null || { + # Argv shape: `nemoclaw snapshot `. The earlier + # form `nemoclaw snapshot create ` parsed `snapshot` as a + # sandbox name and produced the misleading 'Unknown command: snapshot' + # error. Mirrors test/e2e/test-snapshot-commands.sh argv layout. + sandbox_lifecycle_run_with_timeout 30 nemoclaw "${E2E_SANDBOX_NAME}" snapshot create >/dev/null || { sandbox_lifecycle_fail validation.sandbox_snapshot.create_succeeds "snapshot create failed" return 1 } sandbox_lifecycle_pass validation.sandbox_snapshot.create_succeeds "snapshot create succeeded" - sandbox_lifecycle_run_with_timeout 30 openshell sandbox exec -n "${E2E_SANDBOX_NAME}" -- sh -lc 'echo lifecycle-marker-after-snapshot > /tmp/nemoclaw-lifecycle-marker' >/dev/null || { + _sandbox_lifecycle_sandbox_exec 30 sh -lc 'echo lifecycle-marker-after-snapshot > /tmp/nemoclaw-lifecycle-marker' >/dev/null || { sandbox_lifecycle_fail validation.sandbox_snapshot.restore_rolls_back_marker "failed to mutate marker" return 1 } - sandbox_lifecycle_run_with_timeout 30 nemoclaw snapshot list "${E2E_SANDBOX_NAME}" >/dev/null || { + sandbox_lifecycle_run_with_timeout 30 nemoclaw "${E2E_SANDBOX_NAME}" snapshot list >/dev/null || { sandbox_lifecycle_fail validation.sandbox_snapshot.list_shows_snapshot "snapshot list failed" return 1 } sandbox_lifecycle_pass validation.sandbox_snapshot.list_shows_snapshot "snapshot listed" - sandbox_lifecycle_run_with_timeout 30 nemoclaw snapshot restore "${E2E_SANDBOX_NAME}" latest >/dev/null || { + # `snapshot restore` with no positional arg defaults to latest; + # matches test/e2e/test-snapshot-commands.sh Phase 6. + sandbox_lifecycle_run_with_timeout 30 nemoclaw "${E2E_SANDBOX_NAME}" snapshot restore >/dev/null || { sandbox_lifecycle_fail validation.sandbox_snapshot.restore_rolls_back_marker "snapshot restore failed" return 1 } - sandbox_lifecycle_run_with_timeout 30 openshell sandbox exec -n "${E2E_SANDBOX_NAME}" -- sh -lc 'test -f /tmp/nemoclaw-lifecycle-marker && grep -Fxq lifecycle-marker-before-snapshot /tmp/nemoclaw-lifecycle-marker' >/dev/null || { + _sandbox_lifecycle_sandbox_exec 30 sh -lc 'test -f /tmp/nemoclaw-lifecycle-marker && grep -Fxq lifecycle-marker-before-snapshot /tmp/nemoclaw-lifecycle-marker' >/dev/null || { sandbox_lifecycle_fail validation.sandbox_snapshot.restore_rolls_back_marker "marker did not roll back" return 1 } diff --git a/test/e2e-scenario/validation_suites/lib/security_policy_credentials.sh b/test/e2e-scenario/validation_suites/lib/security_policy_credentials.sh index 3e1872d62a..8d34a5444f 100755 --- a/test/e2e-scenario/validation_suites/lib/security_policy_credentials.sh +++ b/test/e2e-scenario/validation_suites/lib/security_policy_credentials.sh @@ -55,10 +55,6 @@ spc_assert_credentials_expected() { return 1 fi spc_log_provider_metadata "$(spc_context_get E2E_PROVIDER)" "gateway" - if e2e_env_is_dry_run; then - echo "[dry-run] would list gateway credentials without raw values" - return 0 - fi local raw_file listed_raw listed list_rc raw_file="$(mktemp "${TMPDIR:-/tmp}/nemoclaw-credentials-list.XXXXXX")" chmod 600 "${raw_file}" @@ -105,10 +101,6 @@ spc_assert_policy_preset_present() { spc_assertion_id "post-onboard.security-policy.${preset}-preset-applied" spc_require_context E2E_SCENARIO E2E_SANDBOX_NAME echo "policy preset expected: ${preset}" - if e2e_env_is_dry_run; then - echo "[dry-run] would verify policy preset ${preset}" - return 0 - fi local sandbox_name active sandbox_name="$(spc_context_get E2E_SANDBOX_NAME)" if ! active="$(nemoclaw "${sandbox_name}" policy-list 2>&1)"; then @@ -143,10 +135,6 @@ spc_semver_ge() { spc_assert_openshell_credential_rewrite_supported() { spc_assertion_id "post-onboard.gateway.openshell-version-supports-credential-rewrite" spc_require_context E2E_SCENARIO - if e2e_env_is_dry_run; then - echo "[dry-run] would verify OpenShell gateway capability metadata" - return 0 - fi local openshell_bin version_output version minimum_version binary_strings feature minimum_version="0.0.39" openshell_bin="$(command -v openshell 2>/dev/null || true)" @@ -221,10 +209,6 @@ spc_assert_shields_permissions_match_state() { spc_assert_shields_config_consistent() { spc_assertion_id "post-onboard.security-shields.config-consistent" spc_require_context E2E_SCENARIO E2E_SANDBOX_NAME E2E_AGENT - if e2e_env_is_dry_run; then - echo "[dry-run] would verify shields config consistency" - return 0 - fi local sandbox_name status observed expected sandbox_name="$(spc_context_get E2E_SANDBOX_NAME)" if ! status="$(nemoclaw "${sandbox_name}" shields status 2>&1)"; then @@ -262,10 +246,6 @@ spc_assert_telegram_payload_not_shell_executed() { if [[ -n "${fixture_payload}" ]]; then printf 'telegram payload fixture loaded (%s bytes)\n' "${#fixture_payload}" fi - if e2e_env_is_dry_run; then - echo "[dry-run] would submit payload without shell evaluation" - return 0 - fi local sandbox_name marker payload send_output marker_state sandbox_name="$(spc_context_get E2E_SANDBOX_NAME)" marker="/tmp/nemoclaw-telegram-injection-proof-$RANDOM-$$" diff --git a/test/e2e-scenario/validation_suites/messaging/common/03-bridge-reachable.sh b/test/e2e-scenario/validation_suites/messaging/common/03-bridge-reachable.sh index 9fc2156ad0..8ec82f8aeb 100755 --- a/test/e2e-scenario/validation_suites/messaging/common/03-bridge-reachable.sh +++ b/test/e2e-scenario/validation_suites/messaging/common/03-bridge-reachable.sh @@ -5,9 +5,4 @@ set -euo pipefail . "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib/messaging_providers.sh" e2e_messaging_load_context -if [[ -n "${E2E_DRY_RUN:-}" ]]; then - provider="$(e2e_messaging_provider_name)" - e2e_pass "expected-state.messaging.${provider}.bridge-reachable dry-run" - exit 0 -fi e2e_messaging_assert_bridge_reachable diff --git a/test/e2e-scenario/validation_suites/messaging/slack/00-slack-provider-state.sh b/test/e2e-scenario/validation_suites/messaging/slack/00-slack-provider-state.sh index 32cd79093d..a6c02f7f1e 100755 --- a/test/e2e-scenario/validation_suites/messaging/slack/00-slack-provider-state.sh +++ b/test/e2e-scenario/validation_suites/messaging/slack/00-slack-provider-state.sh @@ -3,7 +3,10 @@ # SPDX-License-Identifier: Apache-2.0 set -euo pipefail -. "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib/messaging_providers.sh" +_SLACK_SUITES_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +. "${_SLACK_SUITES_DIR}/lib/messaging_providers.sh" +# shellcheck source=../../sandbox-exec.sh +. "${_SLACK_SUITES_DIR}/sandbox-exec.sh" e2e_messaging_load_context provider="$(e2e_messaging_provider_name)" case "${provider}" in @@ -13,25 +16,25 @@ esac e2e_messaging_assert_provider_attached agent="$(e2e_context_get E2E_AGENT)" if [[ "${agent}" == "openclaw" ]]; then - if [[ -n "${E2E_DRY_RUN:-}" ]]; then - e2e_pass "expected-state.messaging.slack.openclaw-enabled dry-run" - e2e_pass "expected-state.messaging.slack.runtime-discovery dry-run" - else - content="$(e2e_messaging_read_config_surface)" - if ! printf '%s\n' "${content}" | python3 -c ' + content="$(e2e_messaging_read_config_surface)" + if ! printf '%s\n' "${content}" | python3 -c ' import json import sys cfg = json.load(sys.stdin) assert cfg["channels"]["slack"]["enabled"] is True assert cfg["plugins"]["entries"]["slack"]["enabled"] is True '; then - e2e_fail "expected-state.messaging.slack.openclaw-enabled missing channels.slack.enabled or plugins.entries.slack.enabled" - fi - e2e_pass "expected-state.messaging.slack.openclaw-enabled channel and plugin enabled" + e2e_fail "expected-state.messaging.slack.openclaw-enabled missing channels.slack.enabled or plugins.entries.slack.enabled" + fi + e2e_pass "expected-state.messaging.slack.openclaw-enabled channel and plugin enabled" - sandbox_name="$(e2e_context_get E2E_SANDBOX_NAME)" - runtime_json="$(openshell sandbox exec --name "${sandbox_name}" -- timeout 45 openclaw channels list --all --json --no-color 2>/dev/null || true)" - runtime_state="$(printf '%s\n' "${runtime_json}" | python3 -c ' + sandbox_name="$(e2e_context_get E2E_SANDBOX_NAME)" + # Wrapper cap (50s) sits just above the inner `timeout 45` so the inner + # cap is what fires under normal upstream slowness; the wrapper only + # catches the case where openshell itself wedges before delivering the + # `timeout` invocation to the sandbox. + runtime_json="$(E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=50 e2e_sandbox_exec "${sandbox_name}" -- timeout 45 openclaw channels list --all --json --no-color 2>/dev/null || true)" + runtime_state="$(printf '%s\n' "${runtime_json}" | python3 -c ' import json import sys try: @@ -45,11 +48,10 @@ try: except Exception as exc: print("error %s" % exc) ' 2>/dev/null || true)" - if [[ "${runtime_state}" != "yes" ]]; then - e2e_fail "expected-state.messaging.slack.runtime-discovery OpenClaw did not report Slack installed/configured (${runtime_state}; output=${runtime_json:0:300})" - fi - e2e_pass "expected-state.messaging.slack.runtime-discovery OpenClaw reports Slack installed and configured" + if [[ "${runtime_state}" != "yes" ]]; then + e2e_fail "expected-state.messaging.slack.runtime-discovery OpenClaw did not report Slack installed/configured (${runtime_state}; output=${runtime_json:0:300})" fi + e2e_pass "expected-state.messaging.slack.runtime-discovery OpenClaw reports Slack installed and configured" fi if [[ "${agent}" == "hermes" ]]; then # This scenario asserts the static enablement contract Hermes' gateway uses @@ -61,16 +63,14 @@ if [[ "${agent}" == "hermes" ]]; then # and the Bolt app reached the running state. # 3) SLACK_ALLOWED_CHANNELS, when configured, is present in .env so the # allowlist values reach the adapter's environment. - if [[ -n "${E2E_DRY_RUN:-}" ]]; then - e2e_pass "expected-state.messaging.slack.hermes-platforms-enabled dry-run" - e2e_pass "expected-state.messaging.slack.hermes-allowed-channels-scoped dry-run" - e2e_pass "expected-state.messaging.slack.hermes-gateway-running dry-run" - else - sandbox_name="$(e2e_context_get E2E_SANDBOX_NAME)" - # The Hermes venv is the same Python that loads config.yaml at runtime, so - # PyYAML is guaranteed there even when the host runner ships a minimal - # python3. Parsing inside the sandbox removes the awk fallback path. - platforms_state="$(openshell sandbox exec --name "${sandbox_name}" -- /opt/hermes/.venv/bin/python -c ' + sandbox_name="$(e2e_context_get E2E_SANDBOX_NAME)" + # The Hermes venv is the same Python that loads config.yaml at runtime, so + # PyYAML is guaranteed there even when the host runner ships a minimal + # python3. Parsing inside the sandbox removes the awk fallback path. + # Use e2e_sandbox_exec for per-call timeout + ssh-config-preferred / + # openshell-exec fallback. A wedged openshell sandbox exec without the + # wrapper can stall the suite indefinitely in live mode. + platforms_state="$(E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=50 e2e_sandbox_exec "${sandbox_name}" -- /opt/hermes/.venv/bin/python -c ' import sys import yaml @@ -90,62 +90,61 @@ if isinstance(slack, dict) and slack.get("enabled") is True: else: print("no slack=%r" % (slack,)) ' 2>/dev/null || true)" - case "${platforms_state}" in - yes) - e2e_pass "expected-state.messaging.slack.hermes-platforms-enabled platforms.slack.enabled true in config.yaml" - ;; - missing-config) - e2e_fail "expected-state.messaging.slack.hermes-platforms-enabled /sandbox/.hermes/config.yaml not found" - ;; - *) - e2e_fail "expected-state.messaging.slack.hermes-platforms-enabled platforms.slack.enabled not true (${platforms_state})" - ;; - esac + case "${platforms_state}" in + yes) + e2e_pass "expected-state.messaging.slack.hermes-platforms-enabled platforms.slack.enabled true in config.yaml" + ;; + missing-config) + e2e_fail "expected-state.messaging.slack.hermes-platforms-enabled /sandbox/.hermes/config.yaml not found" + ;; + *) + e2e_fail "expected-state.messaging.slack.hermes-platforms-enabled platforms.slack.enabled not true (${platforms_state})" + ;; + esac - env_state="$(openshell sandbox exec --name "${sandbox_name}" -- sh -c 'grep -E "^SLACK_ALLOWED_CHANNELS=" /sandbox/.hermes/.env 2>/dev/null | head -n1' 2>/dev/null || true)" - case "${env_state}" in - SLACK_ALLOWED_CHANNELS=*[!\ ]*) - e2e_pass "expected-state.messaging.slack.hermes-allowed-channels-scoped allowlist present in .env" - ;; - "") - e2e_pass "expected-state.messaging.slack.hermes-allowed-channels-scoped no channel allowlist requested (open scope)" - ;; - *) - e2e_fail "expected-state.messaging.slack.hermes-allowed-channels-scoped malformed SLACK_ALLOWED_CHANNELS entry" - ;; - esac + env_state="$(E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=20 e2e_sandbox_exec "${sandbox_name}" -- sh -c 'grep -E "^SLACK_ALLOWED_CHANNELS=" /sandbox/.hermes/.env 2>/dev/null | head -n1' 2>/dev/null || true)" + case "${env_state}" in + SLACK_ALLOWED_CHANNELS=*[!\ ]*) + e2e_pass "expected-state.messaging.slack.hermes-allowed-channels-scoped allowlist present in .env" + ;; + "") + e2e_pass "expected-state.messaging.slack.hermes-allowed-channels-scoped no channel allowlist requested (open scope)" + ;; + *) + e2e_fail "expected-state.messaging.slack.hermes-allowed-channels-scoped malformed SLACK_ALLOWED_CHANNELS entry" + ;; + esac - # Hermes ships two surfaces that carry the gateway boot trace: - # - /sandbox/.hermes/logs/gateway.log: Hermes' own structured logger. - # - /gateway.log: stdout captured by agents/hermes/start.sh:862,910 - # when `hermes gateway run` is supervised by the entrypoint. - # Tail both; either is acceptable evidence the Slack platform booted. - tmp_dir=/tmp - gateway_log_basename=gateway.log - gateway_log="" - for log_path in "/sandbox/.hermes/logs/${gateway_log_basename}" "${tmp_dir}/${gateway_log_basename}"; do - chunk="$(openshell sandbox exec --name "${sandbox_name}" -- sh -c "tail -n 200 ${log_path} 2>/dev/null || true" 2>/dev/null || true)" - if [[ -n "${chunk}" ]]; then - if [[ -n "${gateway_log}" ]]; then - gateway_log="${gateway_log}"$'\n'"${chunk}" - else - gateway_log="${chunk}" - fi + # Hermes ships two surfaces that carry the gateway boot trace: + # - /sandbox/.hermes/logs/gateway.log: Hermes' own structured logger. + # - /gateway.log: stdout captured by agents/hermes/start.sh:862,910 + # when `hermes gateway run` is supervised by the entrypoint. + # Tail both; either is acceptable evidence the Slack platform booted. + tmp_dir=/tmp + gateway_log_basename=gateway.log + gateway_log="" + for log_path in "/sandbox/.hermes/logs/${gateway_log_basename}" "${tmp_dir}/${gateway_log_basename}"; do + chunk="$(E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=20 e2e_sandbox_exec "${sandbox_name}" -- sh -c "tail -n 200 ${log_path} 2>/dev/null || true" 2>/dev/null || true)" + if [[ -n "${chunk}" ]]; then + if [[ -n "${gateway_log}" ]]; then + gateway_log="${gateway_log}"$'\n'"${chunk}" + else + gateway_log="${chunk}" fi - done - if [[ -z "${gateway_log}" ]]; then - e2e_fail "expected-state.messaging.slack.hermes-gateway-running could not read gateway log from sandbox or entrypoint surface" - fi - if printf '%s\n' "${gateway_log}" | grep -qE '\[Slack\] Socket Mode connected|✓ slack connected|slack_bolt\.AsyncApp.*Bolt app is running'; then - e2e_pass "expected-state.messaging.slack.hermes-gateway-running gateway booted slack platform" - else - sanitized_tail="$(printf '%s\n' "${gateway_log}" | tail -n 20 | sed -E \ - -e 's/xox[bpaors]-[A-Za-z0-9-]+//g' \ - -e 's/xapp-[A-Za-z0-9-]+//g' \ - -e 's/[Tt][0-9A-Z]{8,}//g' \ - -e 's/[UCWBDG][0-9A-Z]{8,}//g')" - e2e_fail "expected-state.messaging.slack.hermes-gateway-running gateway log shows slack platform never started (sanitized tail: ${sanitized_tail})" fi + done + if [[ -z "${gateway_log}" ]]; then + e2e_fail "expected-state.messaging.slack.hermes-gateway-running could not read gateway log from sandbox or entrypoint surface" + fi + if printf '%s\n' "${gateway_log}" | grep -qE '\[Slack\] Socket Mode connected|✓ slack connected|slack_bolt\.AsyncApp.*Bolt app is running'; then + e2e_pass "expected-state.messaging.slack.hermes-gateway-running gateway booted slack platform" + else + sanitized_tail="$(printf '%s\n' "${gateway_log}" | tail -n 20 | sed -E \ + -e 's/xox[bpaors]-[A-Za-z0-9-]+//g' \ + -e 's/xapp-[A-Za-z0-9-]+//g' \ + -e 's/[Tt][0-9A-Z]{8,}//g' \ + -e 's/[UCWBDG][0-9A-Z]{8,}//g')" + e2e_fail "expected-state.messaging.slack.hermes-gateway-running gateway log shows slack platform never started (sanitized tail: ${sanitized_tail})" fi fi e2e_pass "expected-state.messaging.slack.provider-state ${provider} provider state configured" diff --git a/test/e2e-scenario/validation_suites/platform/macos/00-macos-smoke.sh b/test/e2e-scenario/validation_suites/platform/macos/00-macos-smoke.sh index 2f42115f5e..4f2f094c67 100755 --- a/test/e2e-scenario/validation_suites/platform/macos/00-macos-smoke.sh +++ b/test/e2e-scenario/validation_suites/platform/macos/00-macos-smoke.sh @@ -19,11 +19,6 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" echo "platform-macos:macos-smoke" e2e_context_require E2E_PLATFORM_OS -if e2e_env_is_dry_run; then - echo "[dry-run] would run macOS-specific smoke checks" - exit 0 -fi - os="$(e2e_context_get E2E_PLATFORM_OS)" if [[ "${os}" != "macos" ]]; then echo "platform-macos: E2E_PLATFORM_OS should be 'macos', got '${os}'" >&2 diff --git a/test/e2e-scenario/validation_suites/platform/wsl/00-wsl-smoke.sh b/test/e2e-scenario/validation_suites/platform/wsl/00-wsl-smoke.sh index 1aeb39fe7c..ef96795a0c 100755 --- a/test/e2e-scenario/validation_suites/platform/wsl/00-wsl-smoke.sh +++ b/test/e2e-scenario/validation_suites/platform/wsl/00-wsl-smoke.sh @@ -17,11 +17,6 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" echo "platform-wsl:wsl-smoke" e2e_context_require E2E_PLATFORM_OS E2E_SANDBOX_NAME -if e2e_env_is_dry_run; then - echo "[dry-run] would run WSL-specific smoke checks" - exit 0 -fi - os="$(e2e_context_get E2E_PLATFORM_OS)" if [[ "${os}" != "wsl" ]]; then echo "platform-wsl: E2E_PLATFORM_OS should be 'wsl', got '${os}'" >&2 diff --git a/test/e2e-scenario/validation_suites/sandbox-exec.sh b/test/e2e-scenario/validation_suites/sandbox-exec.sh index 0682c4cf2f..44e4288111 100755 --- a/test/e2e-scenario/validation_suites/sandbox-exec.sh +++ b/test/e2e-scenario/validation_suites/sandbox-exec.sh @@ -12,7 +12,6 @@ # Functions: # e2e_sandbox_exec -- [args...] # Run inside via `openshell sandbox exec`. No stdin passed. -# Exit code propagates from . Honors E2E_DRY_RUN. # # e2e_sandbox_exec_stdin -- [args...] # Like e2e_sandbox_exec but pipes the caller's stdin into the @@ -23,6 +22,174 @@ _E2E_SBEX_LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../runtime/lib" && pwd)" # shellcheck source=../runtime/lib/env.sh . "${_E2E_SBEX_LIB_DIR}/env.sh" +# Per-call timeout (seconds) applied to every `openshell sandbox exec` +# invocation routed through this wrapper. Callers MAY override per call: +# E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=50 e2e_sandbox_exec ... +# +# Why a wrapper-level cap exists: +# The orchestrator (phase.ts) enforces step-level timeouts via SIGTERM on +# the script's process group. When openshell ssh-into-sandbox hangs, +# SIGTERM eventually kills the script — but the script has no chance to +# emit a structured diagnostic, so logs end mid-line. An inner per-call +# `timeout` lets the wrapper observe the hang, emit a classified +# diagnostic, and exit cleanly *before* the orchestrator's SIGTERM. +# +# The default (25s) sits below the most common orchestrator step caps +# (30s smoke / kimi, 45s sandbox-local). Steps with longer caps (60s +# chat-completion, 120s rebuild) export a larger value before calling. +: "${E2E_SANDBOX_EXEC_TIMEOUT_SECONDS:=25}" + +# Resolve the timeout binary once. Empty string == not available. +_e2e_sbex_resolve_timeout_cmd() { + if command -v timeout >/dev/null 2>&1; then + printf '%s' timeout + elif command -v gtimeout >/dev/null 2>&1; then + printf '%s' gtimeout + else + printf '%s' '' + fi +} + +# ---------------------------------------------------------------------- +# ssh-config transport (preferred) +# +# `openshell sandbox exec` has been observed to wedge in CI (PR #4380 +# scenario run — host can curl the gateway but `openshell sandbox exec` +# never returns). The legacy test/e2e/ scripts have always entered the +# sandbox via `openshell sandbox ssh-config` + `ssh -F`, which works in +# the same environments. We mirror that pattern here: +# +# 1. On first call per sandbox, materialize an ssh-config under +# ${E2E_CONTEXT_DIR}/.ssh-config-cache/.cfg. +# 2. Subsequent calls reuse the cached config. +# 3. Each ssh invocation gets `-o ConnectTimeout=10`, +# `-o StrictHostKeyChecking=no`, `-o UserKnownHostsFile=/dev/null`, +# `-o LogLevel=ERROR` to mirror the legacy pattern. +# +# Opt-out: set E2E_SANDBOX_EXEC_VIA_OPENSHELL=1 to force the original +# `openshell sandbox exec` transport (e.g. for debugging or for runners +# where ssh-config is unavailable). +# ---------------------------------------------------------------------- + +_e2e_sbex_ssh_cfg_dir() { + local base="${E2E_CONTEXT_DIR:-/tmp}" + printf '%s/.ssh-config-cache' "${base}" +} + +# _e2e_sbex_ssh_config_for +# Prints the path to a populated ssh-config for on stdout. +# Returns non-zero (and prints nothing) if `openshell sandbox ssh-config` +# fails — callers fall back to `openshell sandbox exec`. +_e2e_sbex_ssh_config_for() { + local sandbox="$1" + local dir cfg + dir="$(_e2e_sbex_ssh_cfg_dir)" + mkdir -p "${dir}" || return 1 + cfg="${dir}/${sandbox}.cfg" + if [[ ! -s "${cfg}" ]]; then + if ! openshell sandbox ssh-config "${sandbox}" >"${cfg}" 2>/dev/null; then + rm -f "${cfg}" + return 1 + fi + fi + printf '%s' "${cfg}" +} + +# _e2e_sbex_quote_args +# Outputs the args quoted into a single shell string suitable for +# embedding as the remote command in `ssh host 'cmd args ...'`. +_e2e_sbex_quote_args() { + local arg out="" + for arg in "$@"; do + out+="$(printf '%q' "${arg}") " + done + printf '%s' "${out% }" +} + +# _e2e_sbex_invoke_via_ssh +# stdin_mode is 'pipe' (forward caller stdin) or 'none' (close stdin). +# Returns ssh's exit code (124 if timed out, 137 if SIGKILLed). +_e2e_sbex_invoke_via_ssh() { + local cfg="$1" stdin_mode="$2" seconds="$3" timeout_cmd="$4" + local remote_cmd ssh_args + remote_cmd="$(_e2e_sbex_quote_args "${_E2E_SBEX_CMD[@]}")" + ssh_args=( + -F "${cfg}" + -o ConnectTimeout=10 + -o StrictHostKeyChecking=no + -o UserKnownHostsFile=/dev/null + -o LogLevel=ERROR + "openshell-${_E2E_SBEX_SB_NAME}" + "${remote_cmd}" + ) + if [[ "${stdin_mode}" == "none" ]]; then + if [[ -z "${timeout_cmd}" ]]; then + ssh "${ssh_args[@]}" +# Fallback path that uses `openshell sandbox exec`. +_e2e_sbex_invoke_via_openshell() { + local stdin_mode="$1" seconds="$2" timeout_cmd="$3" + if [[ -z "${timeout_cmd}" ]]; then + openshell sandbox exec --name "${_E2E_SBEX_SB_NAME}" -- "${_E2E_SBEX_CMD[@]}" + else + "${timeout_cmd}" --kill-after=5s "${seconds}" \ + openshell sandbox exec --name "${_E2E_SBEX_SB_NAME}" -- "${_E2E_SBEX_CMD[@]}" + fi +} + +# _e2e_sbex_dispatch +# Shared body for e2e_sandbox_exec / e2e_sandbox_exec_stdin. Picks the +# transport (ssh-config preferred; openshell sandbox exec on opt-out or +# ssh-config failure), applies the per-call timeout, and emits a +# classified diagnostic on hang. +_e2e_sbex_dispatch() { + local stdin_mode="$1" + if ! command -v openshell >/dev/null 2>&1; then + echo "e2e_sandbox_exec: openshell CLI not on PATH" >&2 + return 127 + fi + local timeout_cmd seconds="${E2E_SANDBOX_EXEC_TIMEOUT_SECONDS}" + timeout_cmd="$(_e2e_sbex_resolve_timeout_cmd)" + if [[ -z "${timeout_cmd}" ]]; then + # Make the missing safety net visible so CI can flag it; do not + # abort — the orchestrator's step-level timeout still applies. + echo "e2e_sandbox_exec: 'timeout' not available; running without per-call cap (sandbox=${_E2E_SBEX_SB_NAME})" >&2 + fi + + local cfg="" via="ssh" rc=0 + if [[ "${E2E_SANDBOX_EXEC_VIA_OPENSHELL:-0}" == "1" ]]; then + via="openshell" + elif ! cfg="$(_e2e_sbex_ssh_config_for "${_E2E_SBEX_SB_NAME}")"; then + echo "e2e_sandbox_exec: ssh-config unavailable for ${_E2E_SBEX_SB_NAME}; falling back to 'openshell sandbox exec'" >&2 + via="openshell" + fi + + if [[ "${via}" == "ssh" ]]; then + _e2e_sbex_invoke_via_ssh "${cfg}" "${stdin_mode}" "${seconds}" "${timeout_cmd}" + rc=$? + else + _e2e_sbex_invoke_via_openshell "${stdin_mode}" "${seconds}" "${timeout_cmd}" + rc=$? + fi + + if [[ "${rc}" -eq 124 || "${rc}" -eq 137 ]]; then + echo "e2e_sandbox_exec: ${via} transport hung after ${seconds}s (sandbox=${_E2E_SBEX_SB_NAME}, cmd=${_E2E_SBEX_CMD[0]:-?}; classifier=gateway-transient)" >&2 + fi + return "${rc}" +} + # _e2e_sbex_split_args -- [args...] # Parses the shared calling convention. Prints on stderr on misuse and # returns 2. On success, sets the two global arrays _E2E_SBEX_SB_NAME and @@ -52,15 +219,7 @@ _e2e_sbex_parse() { e2e_sandbox_exec() { _e2e_sbex_parse "$@" || return $? e2e_env_trace "sandbox:exec" "${_E2E_SBEX_SB_NAME}" "${_E2E_SBEX_CMD[*]}" - if e2e_env_is_dry_run; then - echo "[dry-run] sandbox_exec ${_E2E_SBEX_SB_NAME} -- ${_E2E_SBEX_CMD[*]} (skipped)" - return 0 - fi - if ! command -v openshell >/dev/null 2>&1; then - echo "e2e_sandbox_exec: openshell CLI not on PATH" >&2 - return 127 - fi - openshell sandbox exec --name "${_E2E_SBEX_SB_NAME}" -- "${_E2E_SBEX_CMD[@]}" + _e2e_sbex_dispatch none } # e2e_sandbox_exec_stdin -- [args...] @@ -70,15 +229,5 @@ e2e_sandbox_exec() { e2e_sandbox_exec_stdin() { _e2e_sbex_parse "$@" || return $? e2e_env_trace "sandbox:exec_stdin" "${_E2E_SBEX_SB_NAME}" "${_E2E_SBEX_CMD[*]}" - if e2e_env_is_dry_run; then - # Consume stdin so the caller's pipeline doesn't SIGPIPE. - cat >/dev/null 2>&1 || true - echo "[dry-run] sandbox_exec_stdin ${_E2E_SBEX_SB_NAME} -- ${_E2E_SBEX_CMD[*]} (skipped)" - return 0 - fi - if ! command -v openshell >/dev/null 2>&1; then - echo "e2e_sandbox_exec_stdin: openshell CLI not on PATH" >&2 - return 127 - fi - openshell sandbox exec --name "${_E2E_SBEX_SB_NAME}" -- "${_E2E_SBEX_CMD[@]}" + _e2e_sbex_dispatch pipe } diff --git a/test/e2e-scenario/validation_suites/smoke/00-cli-available.sh b/test/e2e-scenario/validation_suites/smoke/00-cli-available.sh index e56925b1f9..ab733f039d 100755 --- a/test/e2e-scenario/validation_suites/smoke/00-cli-available.sh +++ b/test/e2e-scenario/validation_suites/smoke/00-cli-available.sh @@ -18,11 +18,6 @@ echo "smoke:cli-available" e2e_context_require E2E_SCENARIO -if e2e_env_is_dry_run; then - echo "[dry-run] would check that nemoclaw CLI is on PATH" - exit 0 -fi - if ! command -v nemoclaw >/dev/null 2>&1; then echo "smoke:cli-available: nemoclaw CLI not on PATH" >&2 exit 1 diff --git a/test/e2e-scenario/validation_suites/smoke/03-sandbox-shell.sh b/test/e2e-scenario/validation_suites/smoke/03-sandbox-shell.sh index b92dc33e8a..966efeb2d8 100755 --- a/test/e2e-scenario/validation_suites/smoke/03-sandbox-shell.sh +++ b/test/e2e-scenario/validation_suites/smoke/03-sandbox-shell.sh @@ -4,7 +4,6 @@ # # smoke step: sandbox-shell # Verifies that OpenShell can execute a trivial command inside the sandbox. -# Honors E2E_DRY_RUN. set -euo pipefail @@ -14,17 +13,15 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../runtime/lib" && pwd)" . "${LIB_DIR}/env.sh" # shellcheck source=../../runtime/lib/context.sh . "${LIB_DIR}/context.sh" +# shellcheck source=../sandbox-exec.sh +. "${SCRIPT_DIR}/../sandbox-exec.sh" echo "smoke:sandbox-shell" e2e_context_require E2E_SANDBOX_NAME -if e2e_env_is_dry_run; then - echo "[dry-run] would run: openshell sandbox exec --name -- echo ok" - exit 0 -fi - name="$(e2e_context_get E2E_SANDBOX_NAME)" -output="$(openshell sandbox exec --name "${name}" -- echo ok 2>&1)" +# Orchestrator step cap is 30s; wrapper default 25s applies. +output="$(e2e_sandbox_exec "${name}" -- echo ok 2>&1)" echo "${output}" if ! echo "${output}" | grep -q '^ok$'; then echo "smoke:sandbox-shell: did not receive expected 'ok' from sandbox" >&2 diff --git a/tools/e2e-scenarios/workflow-boundary.mts b/tools/e2e-scenarios/workflow-boundary.mts index 26394d1b4c..a06b21f3ea 100644 --- a/tools/e2e-scenarios/workflow-boundary.mts +++ b/tools/e2e-scenarios/workflow-boundary.mts @@ -49,6 +49,13 @@ function requireRunContains(errors: string[], step: WorkflowStep | undefined, ex } } +function requireRunDoesNotContain(errors: string[], step: WorkflowStep | undefined, forbidden: string): void { + if (!step) return; + if (stringValue(step.run).includes(forbidden)) { + errors.push(`step '${step.name ?? ""}' run script must not include ${forbidden}`); + } +} + export function validateE2eScenariosWorkflowBoundary( workflowPath = DEFAULT_WORKFLOW_PATH, ): string[] { @@ -92,7 +99,11 @@ export function validateE2eScenariosWorkflowBoundary( const normalRun = requireStep(errors, steps, "Run typed scenarios"); requireRunContains(errors, normalRun, "npx tsx test/e2e-scenario/scenarios/run.ts"); requireRunContains(errors, normalRun, "--scenarios"); - requireRunContains(errors, normalRun, "--dry-run"); + // The TS runner has one execution mode: live. Workflows must not pass + // --dry-run, --plan-only, or --validate-only — they hide real test runs. + requireRunDoesNotContain(errors, normalRun, "--dry-run"); + requireRunDoesNotContain(errors, normalRun, "--plan-only"); + requireRunDoesNotContain(errors, normalRun, "--validate-only"); const wslInstall = requireStep(errors, steps, "Ensure Ubuntu WSL exists"); requireRunContains(errors, wslInstall, "wsl --install"); @@ -113,7 +124,16 @@ export function validateE2eScenariosWorkflowBoundary( const wslRun = requireStep(errors, steps, "Run typed scenarios in WSL"); requireRunContains(errors, wslRun, "npx tsx test/e2e-scenario/scenarios/run.ts"); requireRunContains(errors, wslRun, "--scenarios"); - requireRunContains(errors, wslRun, "--dry-run"); + // From this PR: the typed runner is the only execution path; the + // bash runner / dry-run / validate-only / plan-only modes are + // removed from CI. + requireRunDoesNotContain(errors, wslRun, "--dry-run"); + requireRunDoesNotContain(errors, wslRun, "--plan-only"); + requireRunDoesNotContain(errors, wslRun, "--validate-only"); + // From main (#4346): the WSL step must use the robust PowerShell + // wrapper that materializes a bash script, copies it into WSL via + // wslpath, and invokes it with `bash -l` so Docker WSL integration + // and Ubuntu first-run races are handled. requireRunContains(errors, wslRun, "$env:WSL_WORKDIR"); requireRunContains(errors, wslRun, "WriteAllText"); requireRunContains(errors, wslRun, "bash -l $wslTmp"); @@ -123,11 +143,28 @@ export function validateE2eScenariosWorkflowBoundary( if (uploadWith.name !== "e2e-scenario-${{ inputs.scenarios || github.event.inputs.scenarios }}") { errors.push("artifact upload name must include the scenarios input"); } - if (uploadWith["include-hidden-files"] !== true) { - errors.push("artifact upload must include hidden .e2e files"); + // Framework-owned secret hygiene: include-hidden-files MUST be false. + // Hidden dotfiles under the workspace can carry raw secrets (notably + // .e2e/context.env, written by e2e_context_set without redaction). + // The redacted surfaces are explicit subpaths under .e2e/ that the + // framework writes via orchestrators/redaction.ts::pipeRedacted. + if (uploadWith["include-hidden-files"] !== false) { + errors.push("artifact upload must set include-hidden-files: false (raw context.env must not leak)"); + } + const uploadPath = stringValue(uploadWith.path); + if (!uploadPath.includes(".e2e/actions/")) { + errors.push("artifact upload path must include .e2e/actions/ (redacted action evidence)"); + } + if (!uploadPath.includes(".e2e/logs/")) { + errors.push("artifact upload path must include .e2e/logs/ (redacted shell-step evidence)"); } - if (!stringValue(uploadWith.path).includes(".e2e/")) { - errors.push("artifact upload path must include .e2e/"); + // Bare blanket '.e2e/' (without a trailing subdir) would re-include + // the raw context.env file. Reject it so the explicit-subpath + // contract stays honest. Subpaths like '.e2e/actions/' are fine. + for (const line of uploadPath.split("\n")) { + if (line.trim() === ".e2e/") { + errors.push("artifact upload path must not list bare .e2e/ (use explicit subpaths to avoid context.env leakage)"); + } } return errors;