From b0a68bdad093984f9ce8d37f391b4e57a0a538ff Mon Sep 17 00:00:00 2001
From: "Arpit Singh (SW-CLOUD)" <arpsingh@nvidia.com>
Date: Tue, 9 Jun 2026 16:44:20 -0700
Subject: [PATCH 1/3] fix(layout): resolve output_dir to an absolute run_dir

Signed-off-by: Arpit Singh (SW-CLOUD) <arpsingh@nvidia.com>
---
 tests/agentic-use/runtimes/shared/layout.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/agentic-use/runtimes/shared/layout.py b/tests/agentic-use/runtimes/shared/layout.py
index a259de71af..07a7a2dd17 100644
--- a/tests/agentic-use/runtimes/shared/layout.py
+++ b/tests/agentic-use/runtimes/shared/layout.py
@@ -45,7 +45,9 @@ def resolve_run_layout(
 ) -> AgenticRunLayout:
     """Resolve or create the on-disk layout for one task attempt."""
     if config is not None and config.output_dir is not None:
-        run_dir = Path(config.output_dir)
+        # Must be absolute: run_dir subpaths are used as Docker bind-mount sources,
+        # and Docker parses a relative `-v` source as a volume name (which cannot contain `/`).
+        run_dir = Path(config.output_dir).resolve()
     else:
         run_dir = new_run_dir(default_jobs_dir(shared), task.id)
 

From c081455e08efb09a75459b674ae4a039c2b0aff8 Mon Sep 17 00:00:00 2001
From: "Arpit Singh (SW-CLOUD)" <arpsingh@nvidia.com>
Date: Tue, 9 Jun 2026 18:59:58 -0700
Subject: [PATCH 2/3] feat(evaluator): add orchestration, environments, gating,
 and coding-agent drivers to agent-eval SDK
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend nemo_evaluator_sdk.agent_eval from "evaluator + contracts" into a full
agent-evaluation pipeline by adding the layers above and below AgentEvaluator.

Orchestration:
- orchestrator.py: AgentEvalOrchestrator ties AgentEvaluator + gating into one
  call. run_tasks(target=runtime) (online) and score_attempts(attempts=...)
  (offline). Backend-agnostic via injected extra_metrics + a prepare_task hook;
  it never introspects the runtime.
- types.py: AgentAttemptSource protocol — the offline counterpart to
  AgentAttemptRuntime (adapt a stored artifact into an AgentEvalAttempt).

Execution layer (dependency-gated, no core import):
- runtimes/environment.py: AgentEnvironmentProvider/Handle with a single
  run(spec, role) (agent/verifier); DockerEnvironmentProvider default, swappable.
- runtimes/environment_spec.py: declarative environment.yaml -> BuildPlan
  (Dockerfile escape hatch); runtimes/docker.py: stdlib subprocess Docker helpers.
- runtimes/coding_agent.py: CliAgentDriver (generic AgentAttemptRuntime for
  stdin-prompt CLIs) + CodingAgentSpec adapter seam; reference Claude/Cursor specs.
- runtimes/layout.py: RunLayout + resolve_run_dir (abs paths for mounts) +
  prepare_run_layout.
- runtimes/verify.py: VerifierOutcome + collect_verifier_outcome +
  apply_verify_to_metadata.

Attempt + scoring:
- attempts.py: resolve_attempt_status (ran-but-failed -> scorable "partial") +
  standard_evidence_descriptors (initial_state/trace/logs/final_state/verifier_logs).
- common_metrics.py: AgentPhaseSuccessMetric and EvidencePresenceMetric, a real
  metric-over-evidence that reads candidate.evidence.filesystem(...).

Results + gating:
- measurements.py: AttemptMeasurements, one typed projection of
  tokens/runtime/reward/provenance from attempt metadata.
- gating.py: summarize_run + evaluate_gate/GateThresholds/GateReport +
  write_gate_report + baseline loading (pass-rate, token regression, runtime
  tie-breaker, cross-commit provenance) -> gate.json.

A CI grep gate (tests/agent_eval/test_import_hygiene.py) keeps agent_eval free of
external/platform imports. tests/agentic-use is rewired as a thin adapter over
these modules via pure re-export shims. Also fixes a pre-existing
SandboxSdk->SandboxSDK typo in test_docker_sandbox_runtime.py.

107 tests pass; ty and import-hygiene gate clean; e2e CLI run reaches
agent_ok=True, overall_score=1.0, gate_passed=True.

Signed-off-by: Arpit Singh (SW-CLOUD) <arpsingh@nvidia.com>
---
 .../nemo_evaluator_sdk/agent_eval/__init__.py |   7 +-
 .../nemo_evaluator_sdk/agent_eval/attempts.py |  90 ++++
 .../agent_eval/common_metrics.py              |  79 +++
 .../nemo_evaluator_sdk/agent_eval/gating.py   | 441 ++++++++++++++++
 .../agent_eval/measurements.py                | 121 +++++
 .../agent_eval/orchestrator.py                | 153 ++++++
 .../agent_eval/runtimes/coding_agent.py       | 291 +++++++++++
 .../agent_eval/runtimes/docker.py             |  89 ++++
 .../agent_eval/runtimes/docker_sandbox.py     |  11 +-
 .../agent_eval/runtimes/environment.py        | 145 ++++++
 .../agent_eval/runtimes/environment_spec.py   | 184 +++++++
 .../agent_eval/runtimes/layout.py             |  63 +++
 .../agent_eval/runtimes/verify.py             |  86 ++++
 .../nemo_evaluator_sdk/agent_eval/types.py    |  12 +
 .../tests/agent_eval/test_coding_agent.py     | 117 +++++
 .../tests/agent_eval/test_common_metrics.py   |  86 ++++
 .../agent_eval/test_docker_sandbox_runtime.py |   6 +-
 .../tests/agent_eval/test_environment.py      |  77 +++
 .../tests/agent_eval/test_gating.py           | 106 ++++
 .../tests/agent_eval/test_import_hygiene.py   |  37 ++
 .../tests/agent_eval/test_measurements.py     |  45 ++
 .../tests/agent_eval/test_orchestrator.py     | 131 +++++
 .../tests/agent_eval/test_verify.py           |  39 ++
 tests/agentic-use/runtimes/COMPLIANCE.md      |  17 +
 tests/agentic-use/runtimes/README.md          |  70 ++-
 tests/agentic-use/runtimes/orchestrator.py    | 110 ++--
 .../agentic-use/runtimes/shared/artifacts.py  |  83 +--
 tests/agentic-use/runtimes/shared/docker.py   | 101 +---
 .../runtimes/shared/environment.py            | 141 ++----
 .../runtimes/shared/environment_spec.py       | 197 +-------
 tests/agentic-use/runtimes/shared/layout.py   |  38 +-
 tests/agentic-use/runtimes/shared/metrics.py  |  25 +-
 .../agentic-use/runtimes/shared/reporting.py  | 471 +-----------------
 .../runtimes/shared/result_adapter.py         |  11 +
 tests/agentic-use/runtimes/shared/verify.py   |  67 +--
 .../tests/test_agentic_runtimes.py            |   2 +
 36 files changed, 2705 insertions(+), 1044 deletions(-)
 create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/attempts.py
 create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/common_metrics.py
 create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/gating.py
 create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/measurements.py
 create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/orchestrator.py
 create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/coding_agent.py
 create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker.py
 create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment.py
 create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment_spec.py
 create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/layout.py
 create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/verify.py
 create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_coding_agent.py
 create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_common_metrics.py
 create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_environment.py
 create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_gating.py
 create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py
 create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_measurements.py
 create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_orchestrator.py
 create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_verify.py

diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py
index b4d9805374..963d869bb5 100644
--- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py
@@ -5,9 +5,11 @@
 
 from nemo_evaluator_sdk.agent_eval.dashboard import render_dashboard, write_dashboard
 from nemo_evaluator_sdk.agent_eval.evaluator import AgentEvaluator
+from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig
 from nemo_evaluator_sdk.agent_eval.persistence import persist_run
 from nemo_evaluator_sdk.agent_eval.types import (
     AgentAttemptRuntime,
+    AgentAttemptSource,
     AgentEvalAttempt,
     AgentEvalDiagnostic,
     AgentEvalMetricOutputCoverage,
@@ -24,9 +26,12 @@
 from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor, LocalFilesystemEvidence
 
 __all__ = [
+    "AgentAttemptRuntime",
+    "AgentAttemptSource",
     "AgentEvalAttempt",
     "AgentEvalDiagnostic",
     "AgentEvalMetricOutputCoverage",
+    "AgentEvalOrchestrator",
     "AgentEvalRunConfig",
     "AgentEvalRunResult",
     "AgentEvalSummary",
@@ -34,11 +39,11 @@
     "AgentEvalTask",
     "AgentEvalTaskResult",
     "AgentEvaluator",
-    "AgentAttemptRuntime",
     "AgentOutput",
     "CandidateEvidence",
     "EvidenceDescriptor",
     "LocalFilesystemEvidence",
+    "OrchestratorConfig",
     "SemanticView",
     "ViewSignal",
     "persist_run",
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/attempts.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/attempts.py
new file mode 100644
index 0000000000..dd85fcea5d
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/attempts.py
@@ -0,0 +1,90 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Helpers for shaping :class:`AgentEvalAttempt` values from runtime artifacts.
+
+These are the runtime-agnostic pieces: the *scorable* status mapping and the
+standard evidence-key builder. Platform-specific attempt construction (reading
+proprietary artifact layouts, extra evidence keys) composes these in the adapter.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttemptStatus
+from nemo_evaluator_sdk.values.evidence import EvidenceDescriptor
+
+
+def resolve_attempt_status(agent_ok: bool) -> AgentEvalAttemptStatus:
+    """Map an agent-phase outcome to a *scorable* attempt status.
+
+    :class:`~nemo_evaluator_sdk.agent_eval.evaluator.AgentEvaluator` excludes
+    ``status=="failed"`` from scoring (it short-circuits to a failed metric
+    result). An agent that ran but did not succeed must still be scored — e.g. as
+    a ``0`` — so pass-rate gating counts it instead of dropping it. We therefore
+    use ``"partial"`` for an executed-but-unsuccessful agent and reserve
+    ``"failed"`` for genuine attempt-*production* failures (which a runtime
+    surfaces by raising, not by emitting an unscorable attempt).
+    """
+    return "completed" if agent_ok else "partial"
+
+
+def standard_evidence_descriptors(
+    *,
+    logs_dir: str | Path,
+    final_state_dir: str | Path,
+    trace_path: str | Path | None = None,
+    initial_state_ref: str | None = None,
+    verifier_logs_dir: str | Path | None = None,
+    primary_log: str | None = None,
+) -> dict[str, EvidenceDescriptor]:
+    """Build the documented evidence map for an agent-eval attempt.
+
+    Standard keys: ``initial_state`` (task input filesystem, when staged),
+    ``trace`` (trajectory, ATIF-normalized when available), ``logs`` (agent log
+    dir), ``final_state`` (workspace), and ``verifier_logs`` (only when present).
+    Callers may add their own extension keys to the returned mapping.
+    """
+    descriptors: dict[str, EvidenceDescriptor] = {}
+
+    if initial_state_ref:
+        descriptors["initial_state"] = EvidenceDescriptor(
+            kind="filesystem",
+            format="dir",
+            ref=str(initial_state_ref),
+            metadata={"role": "initial_state"},
+        )
+
+    if trace_path is not None:
+        trace_name = Path(trace_path).name
+        descriptors["trace"] = EvidenceDescriptor(
+            kind="trace",
+            format="atif" if trace_name.startswith("atif") else "json",
+            ref=str(trace_path),
+        )
+
+    logs_metadata = {"primary_log": primary_log} if primary_log else {}
+    descriptors["logs"] = EvidenceDescriptor(
+        kind="logs",
+        format="dir",
+        ref=str(logs_dir),
+        metadata=logs_metadata,
+    )
+
+    descriptors["final_state"] = EvidenceDescriptor(
+        kind="filesystem",
+        format="dir",
+        ref=str(final_state_dir),
+        metadata={"role": "final_state"},
+    )
+
+    if verifier_logs_dir is not None and Path(verifier_logs_dir).exists():
+        descriptors["verifier_logs"] = EvidenceDescriptor(
+            kind="logs",
+            format="dir",
+            ref=str(verifier_logs_dir),
+            metadata={"role": "verifier"},
+        )
+
+    return descriptors
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/common_metrics.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/common_metrics.py
new file mode 100644
index 0000000000..8cece6a5ad
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/common_metrics.py
@@ -0,0 +1,79 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Reusable agent-eval metrics.
+
+``AgentPhaseSuccessMetric`` reads the agent-phase outcome stamped on attempt
+metadata. ``EvidencePresenceMetric`` is a genuine *metric-over-evidence*: it
+scores by inspecting ``candidate.evidence`` (a filesystem evidence handle)
+rather than a reward written into metadata — the value proposition of scoring
+over evidence instead of trusting a verifier's stamped reward.
+"""
+
+from __future__ import annotations
+
+from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult
+
+
+class AgentPhaseSuccessMetric:
+    """Score 1.0 when the agent phase exited successfully, else 0.0.
+
+    The metric ``type`` is overridable via the ``metric_type`` class attribute so
+    callers can namespace it; the output name stays ``agent_phase_success`` (which
+    gating reads as a reward signal).
+    """
+
+    metric_type: str = "agent_phase_success"
+
+    @property
+    def type(self) -> str:
+        return self.metric_type
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        return [MetricOutputSpec.continuous_score("agent_phase_success")]
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        agent_ok = bool(input.candidate.metadata.get("agent_ok"))
+        return MetricResult(outputs=[MetricOutput(name="agent_phase_success", value=1.0 if agent_ok else 0.0)])
+
+
+class EvidencePresenceMetric:
+    """Score 1.0 when a named filesystem evidence directory exists (and is non-empty).
+
+    Reads ``candidate.evidence`` directly (the canonical metric-over-evidence pattern),
+    so the score reflects what the agent actually produced on disk rather than a reward stamped
+    into metadata by a verifier. The non-empty check is skipped when ``require_non_empty=False``.
+    """
+
+    def __init__(
+        self,
+        *,
+        evidence_name: str = "final_state",
+        output_name: str = "evidence_present",
+        require_non_empty: bool = True,
+    ) -> None:
+        self._evidence_name = evidence_name
+        self._output_name = output_name
+        self._require_non_empty = require_non_empty
+
+    @property
+    def type(self) -> str:
+        return "evidence_presence"
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        return [MetricOutputSpec.continuous_score(self._output_name)]
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        score = 0.0
+        evidence = input.candidate.evidence
+        if evidence is not None and evidence.get(self._evidence_name) is not None:
+            try:
+                handle = await evidence.filesystem(self._evidence_name)
+                if await handle.exists():
+                    if self._require_non_empty:
+                        score = 1.0 if await handle.iter_paths(recursive=True) else 0.0
+                    else:
+                        score = 1.0
+            except (KeyError, ValueError):
+                score = 0.0
+        return MetricResult(outputs=[MetricOutput(name=self._output_name, value=score)])
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/gating.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/gating.py
new file mode 100644
index 0000000000..f6a7d04cfb
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/gating.py
@@ -0,0 +1,441 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Deterministic gating + provenance comparison over an agent-eval run bundle.
+
+Persistence of the run bundle (``tasks.jsonl``/``attempts.jsonl``/
+``results.jsonl``/``summary.json``/``report.html``) is handled by
+``agent_eval.persistence`` / ``write_dashboard``. This module adds the
+candidate-vs-baseline gate (pass-rate, token/cost, runtime tie-breaker) plus
+deterministic provenance checks.
+
+Relationship to :class:`~nemo_evaluator_sdk.agent_eval.types.AgentEvalSummary`:
+that summary reports the *mean score per metric output* over a run. The gate's
+``pass_rate`` here is a different, intentional view — a per-task pass/fail count
+against a reward threshold — so it is computed separately. Token/runtime/
+provenance aggregation is delegated to
+:class:`~nemo_evaluator_sdk.agent_eval.measurements.AttemptMeasurements` so the
+measurement keys are read in exactly one place.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any
+
+from nemo_evaluator_sdk.agent_eval.measurements import AttemptMeasurements
+from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunResult, AgentEvalTaskResult
+
+# Metric outputs, in priority order, that represent a task's pass/reward signal.
+DEFAULT_REWARD_OUTPUTS: tuple[str, ...] = ("verifier_reward", "agent_phase_success")
+
+# Provenance fields collapsed into a single run-level summary.
+_PROVENANCE_FIELDS: tuple[str, ...] = (
+    "commit_sha",
+    "commit_short",
+    "commit_dirty",
+    "branch",
+    "remote_url",
+    "agentic_base_image_digest",
+    "pinned",
+    "pinned_to_commit",
+    "pinned_image_tag",
+)
+
+
+@dataclass(frozen=True)
+class GateThresholds:
+    """Knobs controlling the candidate gate (defaults are the strict CI policy)."""
+
+    min_pass_rate: float = 1.0
+    require_token_metrics: bool = False
+    max_pass_rate_drop: float = 0.0
+    max_token_regression_pct: float = 0.0
+    max_runtime_regression_pct: float = 0.0
+    allow_cross_commit: bool = False
+
+
+@dataclass
+class GateCheck:
+    name: str
+    passed: bool
+    details: str
+
+
+@dataclass
+class GateReport:
+    gate_passed: bool
+    summary: dict[str, Any]
+    checks: list[GateCheck] = field(default_factory=list)
+
+    def to_payload(self) -> dict[str, Any]:
+        return {
+            "gate_passed": self.gate_passed,
+            "summary": self.summary,
+            "checks": [asdict(check) for check in self.checks],
+        }
+
+
+def evaluate_gate(
+    result: AgentEvalRunResult,
+    *,
+    thresholds: GateThresholds | None = None,
+    baseline_summary: dict[str, Any] | None = None,
+    reward_outputs: tuple[str, ...] = DEFAULT_REWARD_OUTPUTS,
+) -> GateReport:
+    """Summarize a run and apply gate checks, optionally against a baseline."""
+    thresholds = thresholds or GateThresholds()
+    summary = summarize_run(result, reward_outputs=reward_outputs)
+    checks = run_gate_checks(summary, thresholds=thresholds, baseline_summary=baseline_summary)
+    return GateReport(gate_passed=all(check.passed for check in checks), summary=summary, checks=checks)
+
+
+def write_gate_report(report: GateReport, output_dir: str | Path, *, filename: str = "gate.json") -> Path:
+    """Persist the gate report alongside the run bundle."""
+    path = Path(output_dir)
+    path.mkdir(parents=True, exist_ok=True)
+    gate_path = path / filename
+    gate_path.write_text(json.dumps(report.to_payload(), indent=2, sort_keys=True) + "\n", encoding="utf-8")
+    return gate_path
+
+
+def load_baseline_summary(path: str | Path) -> dict[str, Any]:
+    """Load + normalize a baseline summary (raw summary or a prior gate.json)."""
+    source = Path(path)
+    payload = json.loads(source.read_text(encoding="utf-8"))
+    if not isinstance(payload, dict):
+        raise ValueError(f"Baseline summary must be a JSON object: {source}")
+    summary = payload.get("summary") if isinstance(payload.get("summary"), dict) else payload
+    _validate_baseline_summary(summary, source)
+    return summary
+
+
+def summarize_run(
+    result: AgentEvalRunResult,
+    *,
+    reward_outputs: tuple[str, ...] = DEFAULT_REWARD_OUTPUTS,
+) -> dict[str, Any]:
+    """Aggregate pass-rate, token, runtime, and provenance for one run.
+
+    Token/runtime/provenance are read via :class:`AttemptMeasurements`; the reward
+    used for pass-rate prefers a scored metric output (``reward_outputs``), then the
+    attempt's recorded reward, then ``0.0``. A task passes when its reward is >= 1.0.
+    """
+    attempts_by_task = {attempt.task_id: attempt for attempt in result.attempts}
+    reward_by_task = _rewards_by_task(result.results, reward_outputs)
+    task_ids = sorted({task.id for task in result.tasks} | set(attempts_by_task))
+
+    passed = 0
+    token_sum = 0
+    token_count = 0
+    token_unavailable: list[str] = []
+    runtime_sum = 0.0
+    runtime_count = 0
+    runtime_unavailable: list[str] = []
+    provenance_inputs: list[dict[str, Any]] = []
+
+    for task_id in task_ids:
+        attempt = attempts_by_task.get(task_id)
+        measurements = AttemptMeasurements.from_metadata(attempt.metadata if attempt is not None else {})
+
+        reward_value = reward_by_task.get(task_id)
+        if reward_value is None:
+            reward_value = measurements.reward if measurements.reward is not None else 0.0
+        if reward_value >= 1.0:
+            passed += 1
+
+        if measurements.total_tokens is not None:
+            token_sum += measurements.total_tokens
+            token_count += 1
+        else:
+            token_unavailable.append(task_id)
+
+        if measurements.runtime_sec is not None:
+            runtime_sum += measurements.runtime_sec
+            runtime_count += 1
+        else:
+            runtime_unavailable.append(task_id)
+
+        if measurements.provenance:
+            provenance_inputs.append(measurements.provenance)
+
+    total = len(task_ids)
+    return {
+        "run_id": result.run_id,
+        "benchmark": result.benchmark,
+        "total_tasks": total,
+        "passed_tasks": passed,
+        "pass_rate": (passed / total) if total else 0.0,
+        "task_names": task_ids,
+        "total_tokens_sum": token_sum if token_count else None,
+        "avg_total_tokens": (token_sum / token_count) if token_count else None,
+        "token_metrics_coverage": (token_count / total) if total else 0.0,
+        "token_metrics_available_tasks": token_count,
+        "token_metrics_unavailable_tasks": sorted(token_unavailable),
+        "runtime_sec_sum": runtime_sum if runtime_count else None,
+        "avg_runtime_sec": (runtime_sum / runtime_count) if runtime_count else None,
+        "runtime_metrics_coverage": (runtime_count / total) if total else 0.0,
+        "runtime_metrics_available_tasks": runtime_count,
+        "runtime_metrics_unavailable_tasks": sorted(runtime_unavailable),
+        "provenance": _aggregate_provenance(provenance_inputs),
+    }
+
+
+def run_gate_checks(
+    summary: dict[str, Any],
+    *,
+    thresholds: GateThresholds,
+    baseline_summary: dict[str, Any] | None = None,
+) -> list[GateCheck]:
+    """Apply absolute + relative (vs baseline) gate checks to a summary."""
+    checks: list[GateCheck] = []
+    total_tasks = int(summary["total_tasks"])
+    pass_rate = float(summary["pass_rate"])
+    provenance = summary.get("provenance") or {}
+
+    checks.append(GateCheck("non_empty_result_set", total_tasks > 0, f"total_tasks={total_tasks}"))
+    checks.append(
+        GateCheck(
+            "min_pass_rate",
+            pass_rate >= thresholds.min_pass_rate,
+            f"pass_rate={pass_rate:.3f}, min_pass_rate={thresholds.min_pass_rate:.3f}",
+        )
+    )
+    checks.append(_commit_consistency_check(provenance))
+
+    if thresholds.require_token_metrics:
+        token_coverage = float(summary["token_metrics_coverage"])
+        runtime_coverage = float(summary["runtime_metrics_coverage"])
+        checks.append(
+            GateCheck(
+                "token_metrics_available_for_all_tasks",
+                token_coverage == 1.0,
+                f"token_metrics_coverage={token_coverage:.3f}",
+            )
+        )
+        checks.append(
+            GateCheck(
+                "runtime_metrics_available_for_all_tasks",
+                runtime_coverage == 1.0,
+                f"runtime_metrics_coverage={runtime_coverage:.3f}",
+            )
+        )
+
+    if baseline_summary is not None:
+        checks.extend(_baseline_checks(summary, baseline_summary, thresholds))
+
+    return checks
+
+
+def _baseline_checks(
+    summary: dict[str, Any],
+    baseline_summary: dict[str, Any],
+    thresholds: GateThresholds,
+) -> list[GateCheck]:
+    checks: list[GateCheck] = []
+    pass_rate = float(summary["pass_rate"])
+    total_tokens_sum = summary["total_tokens_sum"]
+    runtime_sec_sum = summary["runtime_sec_sum"]
+    provenance = summary.get("provenance") or {}
+
+    # Regression checks only make sense when both runs measured the same tasks.
+    baseline_tasks = baseline_summary.get("task_names")
+    candidate_tasks = summary.get("task_names")
+    task_sets_comparable = True
+    if isinstance(baseline_tasks, list) and isinstance(candidate_tasks, list):
+        comparable = sorted(baseline_tasks) == sorted(candidate_tasks)
+        task_sets_comparable = comparable
+        checks.append(
+            GateCheck(
+                "baseline_candidate_task_sets_match",
+                comparable,
+                (
+                    f"both runs measured {len(candidate_tasks)} tasks"
+                    if comparable
+                    else f"baseline={sorted(baseline_tasks)} candidate={sorted(candidate_tasks)}; "
+                    "regression checks short-circuited"
+                ),
+            )
+        )
+    else:
+        checks.append(
+            GateCheck(
+                "baseline_candidate_task_sets_match",
+                True,
+                "task_names not present on baseline and/or candidate; skipping equality guard",
+            )
+        )
+
+    checks.append(_cross_commit_check(provenance, baseline_summary, thresholds.allow_cross_commit))
+
+    if not task_sets_comparable:
+        return checks
+
+    baseline_pass_rate = float(baseline_summary.get("pass_rate", 0.0))
+    checks.append(
+        GateCheck(
+            "no_pass_rate_regression_vs_baseline",
+            pass_rate >= baseline_pass_rate - thresholds.max_pass_rate_drop,
+            f"pass_rate={pass_rate:.3f}, baseline={baseline_pass_rate:.3f}, max_drop={thresholds.max_pass_rate_drop:.3f}",
+        )
+    )
+
+    baseline_tokens = baseline_summary.get("total_tokens_sum")
+    if isinstance(total_tokens_sum, int) and isinstance(baseline_tokens, int):
+        max_allowed = baseline_tokens * (1.0 + thresholds.max_token_regression_pct / 100.0)
+        checks.append(
+            GateCheck(
+                "tokens_not_worse_than_baseline",
+                total_tokens_sum <= max_allowed,
+                f"total_tokens_sum={total_tokens_sum}, baseline={baseline_tokens}, "
+                f"max_regression_pct={thresholds.max_token_regression_pct:.2f}",
+            )
+        )
+    else:
+        checks.append(
+            GateCheck(
+                "tokens_not_worse_than_baseline",
+                False,
+                "Missing token totals for candidate or baseline; cannot run deterministic token comparison.",
+            )
+        )
+
+    # Runtime is only a tie-breaker when token totals match exactly.
+    baseline_runtime = baseline_summary.get("runtime_sec_sum")
+    tokens_tied = (
+        isinstance(total_tokens_sum, int) and isinstance(baseline_tokens, int) and total_tokens_sum == baseline_tokens
+    )
+    if not tokens_tied:
+        checks.append(
+            GateCheck(
+                "runtime_tie_breaker_not_worse_than_baseline",
+                True,
+                "Not applicable (token totals differ from baseline).",
+            )
+        )
+    elif isinstance(runtime_sec_sum, int | float) and isinstance(baseline_runtime, int | float):
+        max_allowed_runtime = float(baseline_runtime) * (1.0 + thresholds.max_runtime_regression_pct / 100.0)
+        checks.append(
+            GateCheck(
+                "runtime_tie_breaker_not_worse_than_baseline",
+                float(runtime_sec_sum) <= max_allowed_runtime,
+                f"runtime_sec_sum={float(runtime_sec_sum):.3f}, baseline={float(baseline_runtime):.3f}, "
+                f"max_regression_pct={thresholds.max_runtime_regression_pct:.2f}",
+            )
+        )
+    else:
+        checks.append(
+            GateCheck(
+                "runtime_tie_breaker_not_worse_than_baseline",
+                False,
+                "Token totals tied with baseline but runtime totals missing; cannot run tie-breaker.",
+            )
+        )
+
+    return checks
+
+
+def _commit_consistency_check(provenance: dict[str, Any]) -> GateCheck:
+    commit_observed = provenance.get("commit_sha_observed")
+    if isinstance(commit_observed, list) and len(commit_observed) > 1:
+        return GateCheck(
+            "commit_sha_consistent_within_run",
+            False,
+            f"Multiple commit_sha values observed across tasks: {commit_observed}. Re-run from a single commit.",
+        )
+    commit_sha = provenance.get("commit_sha")
+    if commit_sha:
+        return GateCheck(
+            "commit_sha_consistent_within_run",
+            True,
+            f"commit={provenance.get('commit_short') or commit_sha[:12]}, branch={provenance.get('branch') or 'detached'}",
+        )
+    return GateCheck(
+        "commit_sha_consistent_within_run",
+        True,
+        "provenance not recorded (legacy artifacts); skipping commit consistency check.",
+    )
+
+
+def _cross_commit_check(
+    provenance: dict[str, Any],
+    baseline_summary: dict[str, Any],
+    allow_cross_commit: bool,
+) -> GateCheck:
+    baseline_commit = (baseline_summary.get("provenance") or {}).get("commit_sha")
+    candidate_commit = provenance.get("commit_sha")
+    if not (baseline_commit and candidate_commit):
+        return GateCheck(
+            "commit_sha_matches_baseline",
+            True,
+            "commit_sha not present on baseline and/or candidate; skipping cross-commit guard.",
+        )
+    commits_match = baseline_commit == candidate_commit
+    if commits_match:
+        detail = f"both runs at commit={baseline_commit[:12]}"
+    elif allow_cross_commit:
+        detail = (
+            f"baseline={baseline_commit[:12]} != candidate={candidate_commit[:12]}; "
+            "comparison allowed by allow_cross_commit (numbers may not be apples-to-apples)."
+        )
+    else:
+        detail = (
+            f"baseline={baseline_commit[:12]} != candidate={candidate_commit[:12]}. "
+            "Re-run candidate at the baseline commit, or set allow_cross_commit."
+        )
+    return GateCheck("commit_sha_matches_baseline", commits_match or allow_cross_commit, detail)
+
+
+def _rewards_by_task(results: list[AgentEvalTaskResult], reward_outputs: tuple[str, ...]) -> dict[str, float]:
+    rewards: dict[str, float] = {}
+    for task_result in results:
+        for output_name in reward_outputs:
+            value = _numeric_output(task_result, output_name)
+            if value is not None:
+                # Highest-priority output wins; don't overwrite with later metrics.
+                rewards.setdefault(task_result.task_id, value)
+                break
+    return rewards
+
+
+def _numeric_output(task_result: AgentEvalTaskResult, name: str) -> float | None:
+    for output in task_result.outputs:
+        if output.name == name:
+            try:
+                return float(output.value)
+            except (TypeError, ValueError):
+                return None
+    return None
+
+
+def _aggregate_provenance(provenances: list[dict[str, Any]]) -> dict[str, Any]:
+    observed: dict[str, set[Any]] = {field_name: set() for field_name in _PROVENANCE_FIELDS}
+    for prov in provenances:
+        for field_name in _PROVENANCE_FIELDS:
+            value = prov.get(field_name)
+            if value is not None:
+                observed[field_name].add(value)
+
+    aggregated: dict[str, Any] = {"available": bool(provenances)}
+    for field_name in _PROVENANCE_FIELDS:
+        values = observed[field_name]
+        if len(values) == 1:
+            aggregated[field_name] = next(iter(values))
+        else:
+            aggregated[field_name] = None
+            if len(values) > 1:
+                aggregated[f"{field_name}_observed"] = sorted(map(str, values))
+    return aggregated
+
+
+def _validate_baseline_summary(summary: dict[str, Any], source: Path) -> None:
+    missing = [key for key in ("pass_rate", "total_tokens_sum", "runtime_sec_sum") if key not in summary]
+    if missing:
+        raise ValueError(
+            f"Baseline summary {source} is missing required key(s): {', '.join(missing)}. "
+            "Expected a raw summary object or a gate.json with a `summary`."
+        )
+    if not isinstance(summary.get("pass_rate"), int | float):
+        raise ValueError(f"Baseline summary {source} has invalid `pass_rate`; expected a number.")
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/measurements.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/measurements.py
new file mode 100644
index 0000000000..0ae2330415
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/measurements.py
@@ -0,0 +1,121 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Typed view over the measurement keys carried on ``AgentEvalAttempt.metadata``.
+
+Gating and reporting read these typed fields instead of reaching into the
+attempt metadata dict by magic string. The keys are still *stored* on
+``metadata`` (so the loose-dict contract continues to work during migration);
+this module is the single, documented place that names them and applies the
+fallbacks (``duration_ms`` → ``runtime_sec``, ``passed`` → ``reward``).
+"""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict, Field
+
+# Token-measurement keys carried on attempt metadata (and in result.json["metrics"]).
+TOKEN_KEYS: tuple[str, ...] = (
+    "prompt_tokens",
+    "completion_tokens",
+    "total_tokens",
+    "cache_creation_tokens",
+    "cache_read_tokens",
+)
+
+
+class AttemptMeasurements(BaseModel):
+    """Typed projection of the numeric measurements and provenance kept on attempt metadata.
+
+    Reporting and gating build instances through :meth:`from_metadata`. Producers can keep
+    writing the loose keys onto ``AgentEvalAttempt.metadata``; :meth:`to_metadata` emits
+    those same keys again. Unknown constructor fields are rejected (``extra="forbid"``).
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    prompt_tokens: int | None = None
+    completion_tokens: int | None = None
+    total_tokens: int | None = None
+    cache_creation_tokens: int | None = None
+    cache_read_tokens: int | None = None
+    runtime_sec: float | None = None
+    reward: float | None = None
+    passed: bool | None = None
+    provenance: dict[str, Any] = Field(default_factory=dict)
+
+    @classmethod
+    def from_metadata(cls, metadata: Mapping[str, Any] | None) -> AttemptMeasurements:
+        """Build the typed view from a loose metadata mapping (``None`` counts as empty).
+
+        The fallbacks live here so callers never repeat them: ``runtime_sec`` may be
+        derived from ``duration_ms / 1000``, and ``reward`` becomes ``1.0``/``0.0`` from
+        ``passed`` when no explicit reward is recorded. Token counts that are not plain
+        ints, a non-bool ``passed`` and a non-mapping ``provenance`` are dropped.
+        """
+        source: Mapping[str, Any] = metadata or {}
+
+        # Only a genuine bool counts as a verdict; anything else means "unknown".
+        raw_passed = source.get("passed")
+        passed = raw_passed if isinstance(raw_passed, bool) else None
+        # Shallow-copy provenance so the model does not alias the caller's mapping.
+        raw_provenance = source.get("provenance")
+        provenance = dict(raw_provenance) if isinstance(raw_provenance, Mapping) else {}
+
+        return cls(
+            # Token counts go through _as_int, which rejects bools and non-ints.
+            **{key: _as_int(source.get(key)) for key in TOKEN_KEYS},
+            runtime_sec=_runtime_sec(source),
+            reward=_reward(source, passed),
+            passed=passed,
+            provenance=provenance,
+        )
+
+    def to_metadata(self) -> dict[str, Any]:
+        """Return the loose-metadata form of this model.
+
+        Keys come out in a fixed order (token counts, ``runtime_sec``, ``reward``,
+        ``passed``, ``provenance``); ``None`` values and empty provenance are omitted.
+        """
+        scalar_keys = (*TOKEN_KEYS, "runtime_sec", "reward", "passed")
+        payload: dict[str, Any] = {
+            key: getattr(self, key) for key in scalar_keys if getattr(self, key) is not None
+        }
+
+        # Shallow copy: top-level edits by the caller do not touch the model's dict.
+        if self.provenance:
+            payload["provenance"] = dict(self.provenance)
+
+        return payload
+
+
+def _as_int(value: Any) -> int | None:
+    """Return ``value`` if it is a true int, else None (bools are never token counts)."""
+    # bool is a subclass of int, so it has to be excluded explicitly.
+    is_count = isinstance(value, int) and not isinstance(value, bool)
+    return value if is_count else None
+
+
+def _runtime_sec(metadata: Mapping[str, Any]) -> float | None:
+    """Return seconds from ``runtime_sec``, else from ``duration_ms`` / 1000, else None."""
+    # (key, divisor) in priority order; bools are ints in Python and are not durations.
+    for key, divisor in (("runtime_sec", 1.0), ("duration_ms", 1000.0)):
+        raw = metadata.get(key)
+        if isinstance(raw, int | float) and not isinstance(raw, bool):
+            return float(raw) / divisor
+    return None
+
+
+def _reward(metadata: Mapping[str, Any], passed: bool | None) -> float | None:
+    """Return the explicit ``reward`` as a float, else 1.0/0.0 from ``passed``, else None."""
+    explicit = metadata.get("reward")
+    if explicit is None:
+        # No recorded reward: derive a binary one from the verdict, if there is one.
+        return None if passed is None else (1.0 if passed else 0.0)
+    try:
+        return float(explicit)
+    except (TypeError, ValueError):  # an unparsable reward does not fall back to ``passed``
+        return None
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/orchestrator.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/orchestrator.py
new file mode 100644
index 0000000000..1fb436f809
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/orchestrator.py
@@ -0,0 +1,153 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Generic orchestration: agent/scoring run + deterministic gate.
+
+Wraps :class:`~nemo_evaluator_sdk.agent_eval.evaluator.AgentEvaluator` with the
+gate from :mod:`nemo_evaluator_sdk.agent_eval.gating`. It is intentionally lean —
+the only collaborators are the tasks and a target (online) or attempts (offline).
+Two seams keep it backend-agnostic:
+
+* **verify-enable is inverted to data**: callers pass ``extra_metrics`` to append
+  (e.g. a verifier-reward metric). The orchestrator never introspects a runtime's
+  config to decide what to score.
+* **environment prep is an injected hook**: ``prepare_task`` (e.g. "build the task
+  image") runs per task before execution, so Docker/build specifics live in the
+  caller, not here.
+
+The common Docker case stays a few lines via :class:`AgentEvalOrchestrator`'s plain
+constructor (config + optional ``extra_metrics``); richer wiring is opt-in.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable, Sequence
+from dataclasses import dataclass
+from pathlib import Path
+
+from nemo_evaluator_sdk.agent_eval.evaluator import AgentEvaluator
+from nemo_evaluator_sdk.agent_eval.gating import (
+    GateThresholds,
+    evaluate_gate,
+    load_baseline_summary,
+    write_gate_report,
+)
+from nemo_evaluator_sdk.agent_eval.types import (
+    AgentAttemptRuntime,
+    AgentEvalAttempt,
+    AgentEvalRunConfig,
+    AgentEvalRunResult,
+    AgentEvalTask,
+)
+from nemo_evaluator_sdk.metrics.protocol import Metric
+
+
+@dataclass(frozen=True)
+class OrchestratorConfig:
+    """Run-level knobs shared by the online and offline paths."""
+
+    parallelism: int = 1  # forwarded to AgentEvalRunConfig.parallelism
+    write_dashboard: bool = True  # forwarded to AgentEvalRunConfig.write_dashboard
+    write_gate: bool = True  # the gate report is written only if the result also has an output_dir
+    gate_thresholds: GateThresholds | None = None  # passed to evaluate_gate(thresholds=...)
+    baseline_summary_path: Path | None = None  # when set, loaded with load_baseline_summary
+
+
+class AgentEvalOrchestrator:
+    """Drive ``AgentEvaluator`` online or offline, then write the gate report if enabled."""
+
+    def __init__(
+        self,
+        *,
+        config: OrchestratorConfig | None = None,
+        extra_metrics: Sequence[Metric] = (),
+    ) -> None:
+        self._extra_metrics = [*extra_metrics]
+        self.config = config or OrchestratorConfig()
+
+    async def run_tasks(
+        self,
+        tasks: Sequence[AgentEvalTask],
+        *,
+        target: AgentAttemptRuntime,
+        benchmark: dict[str, object] | None = None,
+        output_dir: Path | None = None,
+        run_id: str | None = None,
+        prepare_task: Callable[[AgentEvalTask], None] | None = None,
+    ) -> AgentEvalRunResult:
+        """Online path: add extra metrics, call ``prepare_task`` per task, execute, score, gate."""
+        staged = list(map(self._with_extra_metrics, tasks))
+        # The hook is synchronous and is called for every task before the runtime starts.
+        for task in staged if prepare_task is not None else ():
+            prepare_task(task)
+
+        evaluator = AgentEvaluator()
+        run_config = self._run_config(output_dir=output_dir, run_id=run_id, benchmark=benchmark)
+        result = await evaluator.run(tasks=staged, target=target, config=run_config)
+
+        # No-op unless write_gate is on and the result carries an output_dir.
+        self._maybe_write_gate(result)
+        return result
+
+    async def score_attempts(
+        self,
+        tasks: Sequence[AgentEvalTask],
+        *,
+        attempts: Sequence[AgentEvalAttempt],
+        benchmark: dict[str, object] | None = None,
+        output_dir: Path | None = None,
+        run_id: str | None = None,
+    ) -> AgentEvalRunResult:
+        """Offline path: score attempts that were captured earlier; no runtime is invoked here."""
+        staged = list(map(self._with_extra_metrics, tasks))
+        evaluator = AgentEvaluator()
+        captured = list(attempts)
+        run_config = self._run_config(output_dir=output_dir, run_id=run_id, benchmark=benchmark)
+        result = await evaluator.run(tasks=staged, attempts=captured, config=run_config)
+
+        self._maybe_write_gate(result)
+        return result
+
+    def _run_config(
+        self,
+        *,
+        output_dir: Path | None,
+        run_id: str | None,
+        benchmark: dict[str, object] | None,
+    ) -> AgentEvalRunConfig:
+        """Merge the per-call arguments with the orchestrator-wide settings."""
+        # Run-wide knobs come from OrchestratorConfig.
+        shared = {"parallelism": self.config.parallelism, "write_dashboard": self.config.write_dashboard}
+        # ``benchmark`` is copied (None becomes {}) so the caller's dict is never aliased.
+        per_call = {"output_dir": output_dir, "run_id": run_id, "benchmark": dict(benchmark or {})}
+
+        return AgentEvalRunConfig(**per_call, **shared)
+
+    def _with_extra_metrics(self, task: AgentEvalTask) -> AgentEvalTask:
+        """Return ``task`` plus the injected metrics whose type the task does not already carry."""
+        if not self._extra_metrics:
+            return task
+        own_metrics: list[Metric] = list(task.metrics)
+        taken = {type(metric) for metric in own_metrics}
+        additions = [extra for extra in self._extra_metrics if type(extra) not in taken]
+        if additions:
+            return task.model_copy(update={"metrics": [*own_metrics, *additions]})
+        return task
+
+    def _maybe_write_gate(self, result: AgentEvalRunResult) -> None:
+        """Evaluate the gate and write its report, if enabled and an output dir exists."""
+        if not self.config.write_gate or result.output_dir is None:
+            return
+
+        # A baseline is optional; without a configured path the gate gets ``None``.
+        baseline_path = self.config.baseline_summary_path
+        baseline = None if baseline_path is None else load_baseline_summary(baseline_path)
+        report = evaluate_gate(result, thresholds=self.config.gate_thresholds, baseline_summary=baseline)
+        write_gate_report(report, result.output_dir)
+
+
+__all__ = [
+    "AgentEvalOrchestrator",
+    "GateThresholds",
+    "OrchestratorConfig",
+]
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/coding_agent.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/coding_agent.py
new file mode 100644
index 0000000000..a2d7ac9e44
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/coding_agent.py
@@ -0,0 +1,291 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Plug-and-play seam for coding-agent CLIs (codex/claude/cursor/...).
+
+The split that makes these "plug-and-play":
+
+* :class:`CliAgentDriver` is the **driver** — a generic ``AgentAttemptRuntime``
+  that runs a CLI which reads a prompt on stdin and writes its final answer to a
+  file, then captures workspace/stdout/stderr/final-output as evidence. This is
+  the stable, reusable part.
+* :class:`CodingAgentSpec` is the **per-agent adapter** — the bespoke part: how to
+  build the CLI command and (optionally) how to parse that agent's trajectory into
+  extra evidence. Implementing a new agent means subclassing this, not rewriting a
+  runtime.
+
+The shipped :class:`ClaudeCodeSpec` / :class:`CursorAgentSpec` are *reference*
+command builders: the driver and evidence contract are stable, but each CLI's
+exact flags and trajectory format are the integrator's responsibility and may
+drift with upstream releases. Auth is the caller's concern (inject via env);
+nothing here hardcodes credentials.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import shutil
+import subprocess
+from collections.abc import Awaitable, Callable, Sequence
+from dataclasses import dataclass
+from pathlib import Path
+
+from nemo_evaluator_sdk.agent_eval.types import (
+    AgentEvalAttempt,
+    AgentEvalRunConfig,
+    AgentEvalTask,
+    AgentOutput,
+)
+from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor
+
+DEFAULT_CODING_AGENT_TIMEOUT_S = 600
+ProcessFactory = Callable[..., Awaitable[object]]
+
+
+@dataclass(frozen=True)
+class RunArtifacts:
+    """Resolved on-disk paths for one coding-agent attempt (layout per ``CliAgentDriver._artifacts``)."""
+
+    evidence_dir: Path  # per-task root; the driver places every path below inside it
+    workspace_dir: Path  # <evidence_dir>/workspace, handed to the CLI by the shipped specs
+    prompt_path: Path  # <evidence_dir>/prompt.txt, the prompt that is also sent on stdin
+    task_path: Path  # <evidence_dir>/task.json, the task dumped as JSON
+    stdout_path: Path  # <evidence_dir>/stdout.txt, decoded process stdout
+    stderr_path: Path  # <evidence_dir>/stderr.txt, decoded process stderr
+    final_output_path: Path  # <evidence_dir>/final_output.txt, preferred over stdout when present
+
+
+class CodingAgentSpec:
+    """Adapter describing one coding-agent CLI: its prompt, argv, and evidence parsing.
+
+    Subclasses must implement :meth:`build_command`; :meth:`build_prompt`,
+    :meth:`extra_evidence` and :meth:`final_output` have overridable defaults.
+    """
+
+    name: str = "coding_agent"  # used in attempt ids, metadata and the default evidence path
+    binary: str = ""  # executable to launch; CliAgentDriver rejects an empty value
+    model: str | None = None  # recorded as ``agent_model`` in attempt metadata
+
+    def build_prompt(self, task: AgentEvalTask) -> str:
+        """Render the instruction text sent to the CLI on stdin (id, intent, inputs)."""
+        return f"Task id: {task.id}\nIntent: {task.intent}\nInputs: {task.inputs}\n"
+
+    def build_command(self, artifacts: RunArtifacts) -> list[str]:
+        """Return the argv to execute; must be provided by each concrete spec."""
+        raise NotImplementedError
+
+    def extra_evidence(self, artifacts: RunArtifacts) -> dict[str, EvidenceDescriptor]:
+        """Return additional evidence descriptors for this agent; the base adds none."""
+        return dict()
+
+    def final_output(self, artifacts: RunArtifacts, stdout_text: str) -> str:
+        """Return the final-output file's text when that path exists, otherwise stdout."""
+        answer_file = artifacts.final_output_path
+        # The file is read as UTF-8; stdout is returned exactly as it was passed in.
+        return answer_file.read_text(encoding="utf-8") if answer_file.exists() else stdout_text
+
+
+class CliAgentDriver:
+    """Generic ``AgentAttemptRuntime`` for stdin-prompt coding-agent CLIs."""
+
+    def __init__(
+        self,
+        spec: CodingAgentSpec,
+        *,
+        work_root: str | Path | None = None,
+        timeout_s: int = DEFAULT_CODING_AGENT_TIMEOUT_S,
+        process_factory: ProcessFactory | None = None,
+    ) -> None:
+        """Store the spec and options; raise ValueError when ``spec.binary`` is empty."""
+        if not spec.binary:
+            raise ValueError(f"{type(spec).__name__} must set a non-empty `binary`")
+        self._spec = spec
+        self._work_root = Path(work_root).expanduser() if work_root is not None else None
+        self._timeout_s = timeout_s
+        self._process_factory = process_factory or asyncio.create_subprocess_exec
+
+    async def run_tasks(
+        self,
+        tasks: Sequence[AgentEvalTask],
+        config: AgentEvalRunConfig | None = None,
+    ) -> Sequence[AgentEvalAttempt]:
+        """Run every task, at most ``config.parallelism`` at a time; results keep task order."""
+        if self._process_factory is asyncio.create_subprocess_exec and shutil.which(self._spec.binary) is None:
+            raise RuntimeError(f"{self._spec.name} CLI executable {self._spec.binary!r} was not found on PATH")
+
+        resolved = config or AgentEvalRunConfig()
+        semaphore = asyncio.Semaphore(resolved.parallelism)
+
+        async def run_one(index: int, task: AgentEvalTask) -> AgentEvalAttempt:
+            async with semaphore:
+                return await self._run_task(index, task, resolved)
+
+        return await asyncio.gather(*(run_one(index, task) for index, task in enumerate(tasks)))
+
+    async def _run_task(self, index: int, task: AgentEvalTask, config: AgentEvalRunConfig) -> AgentEvalAttempt:
+        """Run one task end to end; a launch error, timeout or non-zero exit yields a ``failed`` attempt."""
+        artifacts = self._artifacts(index, task, config)
+        artifacts.evidence_dir.mkdir(parents=True, exist_ok=True)
+        artifacts.workspace_dir.mkdir(parents=True, exist_ok=True)
+        prompt = self._spec.build_prompt(task)
+        artifacts.prompt_path.write_text(prompt, encoding="utf-8")
+        artifacts.task_path.write_text(task.model_dump_json(indent=2), encoding="utf-8")
+
+        command = self._spec.build_command(artifacts)
+        pipe = subprocess.PIPE
+        process = None  # stays None when the factory itself fails, so there is nothing to reap
+        try:
+            process = await self._process_factory(*command, stdin=pipe, stdout=pipe, stderr=pipe)
+            communicate = process.communicate(prompt.encode("utf-8"))
+            stdout, stderr = await asyncio.wait_for(communicate, timeout=self._timeout_s)
+        except Exception as exc:
+            # wait_for() only cancels communicate(); without this a timed-out CLI keeps running.
+            await self._reap(process)
+            return self._failed_attempt(task, artifacts, exc)
+
+        stdout_text = _decode(stdout)
+        stderr_text = _decode(stderr)
+        artifacts.stdout_path.write_text(stdout_text, encoding="utf-8")
+        artifacts.stderr_path.write_text(stderr_text, encoding="utf-8")
+        return_code = getattr(process, "returncode", 0)
+        if return_code:
+            failure = RuntimeError(f"{self._spec.name} exited with status {return_code}: {stderr_text.strip()}")
+            return self._failed_attempt(task, artifacts, failure)
+
+        descriptors: dict[str, EvidenceDescriptor] = {
+            "workspace": EvidenceDescriptor(kind="filesystem", format="dir", ref=str(artifacts.workspace_dir)),
+            "prompt": EvidenceDescriptor(kind="text", format="txt", ref=str(artifacts.prompt_path)),
+            "task": EvidenceDescriptor(kind="json", format="json", ref=str(artifacts.task_path)),
+            "stdout": EvidenceDescriptor(kind="logs", format="txt", ref=str(artifacts.stdout_path)),
+            "stderr": EvidenceDescriptor(kind="logs", format="txt", ref=str(artifacts.stderr_path)),
+        }
+        descriptors.update(self._spec.extra_evidence(artifacts))
+
+        return AgentEvalAttempt(
+            id=f"{task.id}:{self._spec.name}",
+            task_id=task.id,
+            status="completed",
+            output=AgentOutput(
+                text=self._spec.final_output(artifacts, stdout_text),
+                metadata={
+                    "runtime": self._spec.name,
+                    "agent_model": self._spec.model,
+                    "evidence_dir": str(artifacts.evidence_dir),
+                },
+            ),
+            evidence=CandidateEvidence(descriptors=descriptors, metadata={"runtime": self._spec.name}),
+            metadata={"runtime": self._spec.name, "agent_model": self._spec.model, "generated": True},
+        )
+
+    @staticmethod
+    async def _reap(process: object | None) -> None:
+        """Best-effort kill + wait for a child that is still running (no-op otherwise)."""
+        if process is None or getattr(process, "returncode", None) is not None:
+            return
+        try:
+            process.kill()
+            await process.wait()
+        except (AttributeError, OSError, TypeError):  # test doubles may lack kill()/wait(); child may be gone
+            return
+
+    def _failed_attempt(self, task: AgentEvalTask, artifacts: RunArtifacts, exc: Exception) -> AgentEvalAttempt:
+        """Persist ``exc`` to ``error.json`` and return a ``failed`` attempt that references it."""
+        error_path = artifacts.evidence_dir / "error.json"
+        error_path.write_text(
+            json.dumps({"error_type": exc.__class__.__name__, "error": str(exc)}) + "\n", encoding="utf-8"
+        )
+        return AgentEvalAttempt(
+            id=f"{task.id}:{self._spec.name}",
+            task_id=task.id,
+            status="failed",
+            output=None,
+            evidence=CandidateEvidence(
+                descriptors={"error": EvidenceDescriptor(kind="error", format="json", ref=str(error_path))},
+                metadata={"runtime": self._spec.name},
+            ),
+            metadata={"runtime": self._spec.name, "error_type": exc.__class__.__name__, "error": str(exc)},
+        )
+
+    def _artifacts(self, index: int, task: AgentEvalTask, config: AgentEvalRunConfig) -> RunArtifacts:
+        """Resolve paths under ``work_root``, else ``<output_dir or cwd>/evidence/<spec name>``."""
+        root = self._work_root or Path(config.output_dir or Path.cwd()) / "evidence" / self._spec.name
+        evidence_dir = Path(root) / (_safe_path_name(task.id) or f"task-{index}")
+        return RunArtifacts(
+            evidence_dir=evidence_dir,
+            workspace_dir=evidence_dir / "workspace",
+            prompt_path=evidence_dir / "prompt.txt",
+            task_path=evidence_dir / "task.json",
+            stdout_path=evidence_dir / "stdout.txt",
+            stderr_path=evidence_dir / "stderr.txt",
+            final_output_path=evidence_dir / "final_output.txt",
+        )
+
+
+class ClaudeCodeSpec(CodingAgentSpec):
+    """Reference argv builder for the Claude Code CLI (``claude``).
+
+    Invokes the binary with ``--print --output-format stream-json`` and passes the
+    attempt workspace through ``--add-dir``.
+    """
+
+    name = "claude_code"
+    binary = "claude"
+
+    def __init__(self, *, model: str | None = None, binary: str = "claude") -> None:
+        """Record the optional model name and the executable to launch."""
+        self.binary = binary
+        self.model = model
+
+    def build_command(self, artifacts: RunArtifacts) -> list[str]:
+        """Return the argv; ``--model`` is appended only when a model was configured."""
+        workspace = str(artifacts.workspace_dir)
+        argv = [self.binary, "--print", "--output-format", "stream-json", "--add-dir", workspace]
+        if self.model is None:
+            return argv
+        return [*argv, "--model", self.model]
+
+
+class CursorAgentSpec(CodingAgentSpec):
+    """Reference argv builder for the Cursor Agent CLI (``cursor-agent``).
+
+    Invokes the binary with ``--print --output-format text`` and passes the attempt
+    workspace through ``--workdir``.
+    """
+
+    name = "cursor_agent"
+    binary = "cursor-agent"
+
+    def __init__(self, *, model: str | None = None, binary: str = "cursor-agent") -> None:
+        """Record the optional model name and the executable to launch."""
+        self.binary = binary
+        self.model = model
+
+    def build_command(self, artifacts: RunArtifacts) -> list[str]:
+        """Return the argv; ``--model`` is appended only when a model was configured."""
+        workspace = str(artifacts.workspace_dir)
+        argv = [self.binary, "--print", "--output-format", "text", "--workdir", workspace]
+        if self.model is None:
+            return argv
+        return [*argv, "--model", self.model]
+
+
+def _decode(value: bytes | str | None) -> str:
+    """Coerce captured process output to text: None -> "", str as-is, bytes as lenient UTF-8."""
+    # Text (or nothing) needs no decoding; anything else is decoded, replacing invalid bytes.
+    if value is None or isinstance(value, str):
+        return value or ""
+    return value.decode("utf-8", errors="replace")
+
+
+def _safe_path_name(value: str) -> str:  # other chars -> "-"; strip ".-" ends; cap at 120 chars
+    return "".join(char if char.isalnum() or char in "._-" else "-" for char in value).strip(".-")[:120]
+
+
+__all__ = [
+    "CliAgentDriver",
+    "ClaudeCodeSpec",
+    "CodingAgentSpec",
+    "CursorAgentSpec",
+    "RunArtifacts",
+]
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker.py
new file mode 100644
index 0000000000..482ca6e55e
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker.py
@@ -0,0 +1,89 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Docker CLI helpers for agent-eval runtimes.
+
+These shell out to the ``docker`` CLI (stdlib ``subprocess`` only), so importing
+this module does not require the ``agent-runtimes`` extra — only a working
+``docker`` binary at call time.
+"""
+
+from __future__ import annotations
+
+import os
+import subprocess
+from collections.abc import Sequence
+
+
+def redact_cmd_for_logging(cmd: Sequence[str]) -> list[str]:
+    """Return a copy of ``cmd`` with the value of secret-looking ``NAME=value`` tokens masked."""
+    redacted: list[str] = []
+    sensitive_markers = ("KEY", "TOKEN", "SECRET", "PASSWORD")
+    for token in cmd:
+        if "=" not in token:
+            redacted.append(token)
+            continue
+        left, right = token.split("=", 1)
+        # Last word before "=" is the name; `or [""]` covers an empty/whitespace-only left side.
+        env_key = (left.split() or [""])[-1]
+        if any(marker in env_key.upper() for marker in sensitive_markers):
+            right = "***REDACTED***"
+        redacted.append(f"{left}={right}")
+    return redacted
+
+
+def docker_run(
+    image: str,
+    command: list[str],
+    *,
+    env: dict[str, str] | None = None,
+    mounts: list[tuple[str, str]] | None = None,
+    workdir: str | None = None,
+    remove: bool = True,
+    timeout: int | None = None,
+    extra_args: list[str] | None = None,
+) -> subprocess.CompletedProcess[str]:
+    """Run ``command`` in a container created from ``image`` via the ``docker`` CLI.
+
+    Output is not captured, so it goes to the inherited stdout/stderr. A non-zero exit is
+    reported through ``returncode`` (``check=False``); ``subprocess.TimeoutExpired`` propagates.
+    The final command line is printed first with secret-looking values redacted.
+    """
+    cmd = ["docker", "run", *(["--rm"] if remove else [])]
+    if workdir:
+        cmd.extend(["-w", workdir])
+    for key, value in (env or {}).items():
+        cmd.extend(["-e", f"{key}={value}"])
+    for host_path, container_path in mounts or []:
+        cmd.extend(["-v", f"{host_path}:{container_path}"])
+    # Caller extras first, then DOCKER_EXTRA_ARGS from the environment, split on whitespace.
+    cmd.extend(extra_args or [])
+    cmd.extend(os.environ.get("DOCKER_EXTRA_ARGS", "").split())
+    cmd.extend([image, *command])
+
+    print(f"[agent-eval-runtime] $ {' '.join(redact_cmd_for_logging(cmd))}")
+    return subprocess.run(cmd, check=False, text=True, timeout=timeout)
+
+
+def docker_image_exists(tag: str) -> bool:
+    """Report whether ``docker image inspect <tag>`` succeeds (exit status 0)."""
+    inspect = subprocess.run(["docker", "image", "inspect", tag], capture_output=True, text=True, check=False)
+    return not inspect.returncode
+
+
+def build_dockerfile(dockerfile: os.PathLike[str], context_dir: os.PathLike[str], tag: str) -> None:
+    """Run ``docker build -f <dockerfile> -t <tag> <context_dir>``; a failed build raises CalledProcessError."""
+    argv = ["docker", "build", "-f", str(dockerfile), "-t", tag, str(context_dir)]
+    print(f"[agent-eval-runtime] $ {' '.join(argv)}")
+    subprocess.run(argv, check=True)
+
+
+def build_task_image(task_dir: os.PathLike[str], tag: str) -> None:
+    """Build ``tag`` from ``<task_dir>/environment/Dockerfile``, using that folder as build context."""
+    from pathlib import Path
+    root = Path(task_dir)
+    context_dir = root / "environment"
+    env_dockerfile = context_dir / "Dockerfile"
+    if not env_dockerfile.exists():
+        raise FileNotFoundError(f"No environment/Dockerfile found in {root}")
+    build_dockerfile(env_dockerfile, context_dir, tag)
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker_sandbox.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker_sandbox.py
index 8f84d8ba4f..fc03344c85 100644
--- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker_sandbox.py
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker_sandbox.py
@@ -1,7 +1,16 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Docker-backed sandbox runtime for agent-eval attempts."""
+"""Docker-backed sandbox runtime for agent-eval attempts.
+
+Distinct from :mod:`nemo_evaluator_sdk.agent_eval.runtimes.environment`'s
+``DockerEnvironmentProvider`` on purpose: this runtime drives the OpenAI Agents
+SDK ``SandboxAgent`` (Python ``docker`` + ``agents``, behind the
+``agent-runtimes`` extra) and *owns* the agent loop, whereas the environment
+provider only shells out to the ``docker`` CLI to execute a caller-built command
+inside a prebuilt task image. The two are not merged: this one is an
+``AgentAttemptRuntime``; the other is an execution boundary used *by* runtimes.
+"""
 
 from __future__ import annotations
 
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment.py
new file mode 100644
index 0000000000..a08dfdc179
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment.py
@@ -0,0 +1,145 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Process/filesystem environment boundary for agent-eval runtimes.
+
+This boundary sits *below* :class:`AgentAttemptRuntime` so a runtime never needs
+to know whether the agent/verifier execute under Docker, locally, or another
+filesystem-backed sandbox. It is intentionally a **process/filesystem**
+abstraction, not a fully provider-neutral one: :class:`EnvRunSpec` carries
+``mounts``/``extra_args`` as filesystem-environment hints. Providers that are
+not filesystem-backed may ignore those fields.
+
+A handle exposes a single :meth:`AbstractEnvironmentHandle.run` that takes a
+``role`` ("agent" or "verifier"); :meth:`run_agent`/:meth:`run_verifier` are thin
+role wrappers kept for caller convenience and protocol compatibility.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import subprocess
+from collections.abc import Callable
+from dataclasses import dataclass, field
+from typing import Literal, Protocol, runtime_checkable
+
+from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunConfig, AgentEvalTask
+
+EnvRole = Literal["agent", "verifier"]
+
+
+def default_image_tag(task_id: str) -> str:
+    """Return ``<task_id>:latest``, the default image tag for a task (providers accept a replacement)."""
+    return f"{task_id}:latest"
+
+
+@dataclass(frozen=True)
+class EnvCommandResult:
+    """Exit status of one command run in a prepared environment, plus a timeout flag."""
+
+    exit_code: int
+    timed_out: bool = False
+
+    @property
+    def ok(self) -> bool:  # success means: no timeout and a zero exit status
+        return not self.timed_out and self.exit_code == 0
+
+
+@dataclass
+class EnvRunSpec:
+    """How to execute one command inside an environment handle.
+
+    ``mounts``/``extra_args`` are filesystem-environment hints (e.g. Docker bind
+    mounts and extra CLI args). Non-filesystem providers may ignore them.
+    """
+
+    command: list[str]  # argv to execute inside the environment
+    env: dict[str, str] = field(default_factory=dict)  # environment variables for the command
+    mounts: list[tuple[str, str]] = field(default_factory=list)  # (host_path, container_path) pairs for Docker
+    workdir: str | None = None  # working directory for the command, if any
+    timeout: int | None = None  # seconds; the Docker handle reports expiry as exit code 124
+    extra_args: list[str] = field(default_factory=list)  # extra ``docker run`` arguments (Docker handle)
+
+
+@runtime_checkable
+class AgentEnvironmentHandle(Protocol):
+    """Structural interface of a prepared single-task environment (agent run, verifier run, close)."""
+
+    async def run_agent(self, spec: EnvRunSpec) -> EnvCommandResult: ...  # run the agent-phase command
+
+    async def run_verifier(self, spec: EnvRunSpec) -> EnvCommandResult: ...  # run the verifier-phase command
+
+    async def close(self) -> None: ...  # cleanup hook; AbstractEnvironmentHandle's default does nothing
+
+
+@runtime_checkable
+class AgentEnvironmentProvider(Protocol):
+    """Factory interface: ``prepare`` returns an environment handle for one task."""
+
+    async def prepare(
+        self,
+        task: AgentEvalTask,
+        config: AgentEvalRunConfig | None = None,  # optional run config; the Docker provider ignores it
+    ) -> AgentEnvironmentHandle: ...
+
+
+class AbstractEnvironmentHandle:
+    """Base handle that funnels both roles through a single :meth:`run`.
+
+    Concrete handles override :meth:`run`. ``run_agent``/``run_verifier`` just call it
+    with the role ``"agent"``/``"verifier"``, so backends need not implement each
+    phase method themselves; ``close`` does nothing unless overridden.
+    """
+
+    async def run(self, spec: EnvRunSpec, role: EnvRole) -> EnvCommandResult:
+        raise NotImplementedError  # subclasses must provide the actual execution
+
+    async def run_agent(self, spec: EnvRunSpec) -> EnvCommandResult:
+        return await self.run(spec, "agent")
+
+    async def run_verifier(self, spec: EnvRunSpec) -> EnvCommandResult:
+        return await self.run(spec, "verifier")
+
+    async def close(self) -> None:
+        return None  # nothing to release in the base implementation
+
+
+class DockerEnvironmentHandle(AbstractEnvironmentHandle):
+    """Environment handle that runs each command through ``docker_run`` against one image."""
+
+    def __init__(self, image: str) -> None:
+        self.image = image
+
+    async def run(self, spec: EnvRunSpec, role: EnvRole = "agent") -> EnvCommandResult:
+        """Execute ``spec`` via ``docker_run`` in a worker thread; a timeout maps to exit code 124."""
+        del role  # Docker runs both roles identically against the same image.
+        from nemo_evaluator_sdk.agent_eval.runtimes.docker import docker_run
+
+        options = {
+            "env": spec.env,
+            "mounts": spec.mounts,
+            "workdir": spec.workdir,
+            "timeout": spec.timeout,
+            "extra_args": spec.extra_args,
+        }
+        try:
+            completed = await asyncio.to_thread(docker_run, self.image, spec.command, **options)
+        except subprocess.TimeoutExpired:
+            return EnvCommandResult(exit_code=124, timed_out=True)
+        else:
+            return EnvCommandResult(exit_code=completed.returncode)
+
+
+class DockerEnvironmentProvider:
+    """Provider whose handles target the image named by ``image_tag_fn(task.id)``."""
+
+    def __init__(self, *, image_tag_fn: Callable[[str], str] = default_image_tag) -> None:
+        self._image_tag_fn = image_tag_fn
+
+    async def prepare(
+        self,
+        task: AgentEvalTask,
+        config: AgentEvalRunConfig | None = None,
+    ) -> DockerEnvironmentHandle:
+        image = self._image_tag_fn(task.id)  # ``config`` is accepted for the protocol but unused
+        return DockerEnvironmentHandle(image)
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment_spec.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment_spec.py
new file mode 100644
index 0000000000..a594705907
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment_spec.py
@@ -0,0 +1,184 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Declarative environment authoring for agent-eval tasks.
+
+Moves task authoring away from an implicit "Dockerfile per task" toward a small,
+declarative ``environment.yaml`` spec, while keeping a Dockerfile escape hatch.
+
+Spec shape (``environment.yaml`` in the task dir)::
+
+    environment:
+      image: nemo-platform-agentic-base:2026.06
+      profile: evaluator-platform
+      dependencies:
+        python:
+          - pytest
+          - nemo-evaluator-sdk
+      setup:
+        - seed-providers
+        - create-workspace
+
+Escape hatch::
+
+    environment:
+      dockerfile: environment/Dockerfile
+
+Resolution is deliberately minimal: a spec is turned into a :class:`BuildPlan`
+(a Dockerfile + build context + target tag). The Dockerfile path is used as-is;
+an ``image``-based spec generates a tiny derived Dockerfile (``FROM <image>`` plus
+optional ``pip install``). ``setup`` steps are carried as plan metadata — they are
+runtime concerns handled outside the image build — so this module does not
+execute them.
+
+``yaml`` is imported lazily so that importing this module costs nothing for
+callers that never load a spec.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+
+ENVIRONMENT_SPEC_FILENAME = "environment.yaml"
+DEFAULT_DOCKERFILE_RELPATH = "environment/Dockerfile"
+
+
+@dataclass(frozen=True)
+class EnvironmentSpec:
+    """Declarative environment for one task (or a Dockerfile escape hatch)."""
+
+    image: str | None = None  # base image for an image-based spec
+    profile: str | None = None  # optional profile name
+    python_dependencies: list[str] = field(default_factory=list)  # Python requirement strings
+    setup: list[str] = field(default_factory=list)  # named setup steps; not executed by this class
+    dockerfile: Path | None = None  # path to a hand-written Dockerfile (escape hatch)
+
+    def __post_init__(self) -> None:
+        if self.dockerfile is None and self.image is None:  # at least one source must be given
+            raise ValueError("environment spec requires either 'image' or 'dockerfile'")
+
+
+def load_environment_spec(task_dir: str | Path) -> EnvironmentSpec:
+    """Load a task's environment spec.
+
+    Resolution order:
+    1. ``environment.yaml`` in the task dir (declarative spec, preferred).
+    2. ``environment/Dockerfile`` (backward-compatible escape hatch so existing
+       tasks work without authoring a spec).
+    """
+    root = Path(task_dir)
+    spec_path = root / ENVIRONMENT_SPEC_FILENAME
+    if spec_path.is_file():
+        import yaml  # lazy import: only needed when a spec file actually exists
+
+        return _parse_spec(yaml.safe_load(spec_path.read_text(encoding="utf-8")) or {}, root)
+
+    dockerfile = root / DEFAULT_DOCKERFILE_RELPATH
+    if dockerfile.is_file():
+        return EnvironmentSpec(dockerfile=dockerfile)  # path is kept as given, not resolved
+
+    raise FileNotFoundError(  # neither the spec file nor the default Dockerfile exists
+        f"No environment defined for task {root}: expected {ENVIRONMENT_SPEC_FILENAME} or {DEFAULT_DOCKERFILE_RELPATH}"
+    )
+
+
+def _parse_spec(payload: dict, task_dir: Path) -> EnvironmentSpec:
+    """Build an EnvironmentSpec from parsed YAML; a relative ``dockerfile`` resolves against ``task_dir``."""
+    data = payload.get("environment", payload) if isinstance(payload, dict) else {}
+    if not isinstance(data, dict):
+        raise ValueError(f"Invalid environment spec in {task_dir}: expected a mapping")
+    dockerfile_value = data.get("dockerfile")
+    dockerfile = None
+    if dockerfile_value:
+        dockerfile = Path(dockerfile_value)
+        if not dockerfile.is_absolute():
+            dockerfile = (task_dir / dockerfile).resolve()
+        if not dockerfile.is_file():
+            raise FileNotFoundError(f"environment.dockerfile not found: {dockerfile}")
+
+    dependencies = data.get("dependencies") or {}
+    python_deps = dependencies.get("python") if isinstance(dependencies, dict) else None
+    setup = data.get("setup")  # a bare YAML scalar is one entry; list("abc") would split it into characters
+    return EnvironmentSpec(
+        image=data.get("image"),
+        profile=data.get("profile"),
+        python_dependencies=[python_deps] if isinstance(python_deps, str) else list(python_deps or []),
+        setup=[setup] if isinstance(setup, str) else list(setup or []),
+        dockerfile=dockerfile,
+    )
+
+
+@dataclass(frozen=True)
+class BuildPlan:
+    """A resolved, executable Docker build for one task."""
+
+    image_tag: str  # tag for the image to build
+    dockerfile: Path  # Dockerfile to build from
+    context_dir: Path  # Docker build context directory
+    generated: bool  # whether the Dockerfile was generated rather than hand-written
+    base_image: str | None = None  # base image, when one is recorded for the plan
+    setup: list[str] = field(default_factory=list)  # setup step names carried as metadata
+
+
+def plan_task_build(
+    task_dir: str | Path,
+    image_tag: str,
+    *,
+    spec: EnvironmentSpec | None = None,
+    generated_dir: Path | None = None,
+) -> BuildPlan:
+    """Resolve a task's environment spec into a concrete :class:`BuildPlan`.
+
+    For the Dockerfile escape hatch the existing Dockerfile/context is used. For
+    an ``image``-based spec a minimal derived Dockerfile is written under
+    ``generated_dir`` (defaults to ``<task_dir>/.agentic-build``).
+    """
+    root = Path(task_dir)
+    spec = spec or load_environment_spec(root)  # an explicit spec skips the on-disk lookup
+
+    if spec.dockerfile is not None:  # checked first, so a dockerfile overrides any ``image``
+        return BuildPlan(
+            image_tag=image_tag,
+            dockerfile=spec.dockerfile,
+            context_dir=spec.dockerfile.parent,  # build context is the Dockerfile's own directory
+            generated=False,
+            setup=list(spec.setup),
+        )
+
+    # image-based spec: generate a tiny derived Dockerfile.
+    context_dir = generated_dir if generated_dir is not None else (root / ".agentic-build")
+    context_dir.mkdir(parents=True, exist_ok=True)
+    dockerfile = context_dir / "Dockerfile"
+    dockerfile.write_text(render_derived_dockerfile(spec), encoding="utf-8")  # rewritten on every call
+    return BuildPlan(
+        image_tag=image_tag,
+        dockerfile=dockerfile,
+        context_dir=context_dir,
+        generated=True,
+        base_image=spec.image,
+        setup=list(spec.setup),  # copied so the plan does not share the spec's list
+    )
+
+
+def execute_build_plan(plan: BuildPlan) -> None:
+    """Build the Docker image described by ``plan``."""
+    from nemo_evaluator_sdk.agent_eval.runtimes.docker import build_dockerfile
+
+    build_dockerfile(plan.dockerfile, plan.context_dir, plan.image_tag)  # plan.setup is not run here
+
+
+def render_derived_dockerfile(spec: EnvironmentSpec) -> str:
+    """Render a minimal derived Dockerfile from an image-based spec (pip requirements are shell-quoted)."""
+    import shlex  # shell-form RUN: an unquoted "pkg>=1.0" would be parsed as a redirect into "=1.0"
+    if spec.image is None:
+        raise ValueError("cannot render a derived Dockerfile without a base image")
+    lines = [f"FROM {spec.image}"]
+    if spec.profile:
+        lines.append(f"LABEL com.nvidia.agentic.profile={spec.profile}")
+    if spec.python_dependencies:
+        lines.append(f"RUN pip install --no-cache-dir {' '.join(map(shlex.quote, spec.python_dependencies))}")
+    if spec.setup:
+        # Setup steps are runtime concerns; record them for provenance only.
+        lines.append(f'LABEL com.nvidia.agentic.setup="{",".join(spec.setup)}"')
+    return "\n".join(lines) + "\n"
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/layout.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/layout.py
new file mode 100644
index 0000000000..5c858cb037
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/layout.py
@@ -0,0 +1,63 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Generic on-disk layout for a single agent-eval task run.
+
+A run produces an agent-log dir and a workspace dir under a run dir, plus a
+written instruction file. Callers that need extra directories (e.g. preserved
+platform state) add them on top of :class:`RunLayout`.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class RunLayout:
+    """Filesystem layout for one task run."""
+
+    run_dir: Path  # root directory of this run
+    agent_log_dir: Path  # agent-log directory for the run
+    workspace_dir: Path  # workspace directory for the run
+    instruction_path: Path  # path of the written instruction file
+
+
+def resolve_run_dir(output_dir: str | Path | None, default_factory: Callable[[], Path]) -> Path:
+    """Resolve the run dir to an absolute path.
+
+    The result must be absolute either way: run-dir subpaths are used as Docker
+    bind-mount sources, and Docker treats a relative ``-v`` source as a (slash-free)
+    named volume. ``default_factory`` is only called when ``output_dir`` is ``None``.
+    """
+    if output_dir is not None:
+        return Path(output_dir).resolve()
+    return Path(default_factory()).resolve()  # a factory may hand back a relative path; normalise it too
+
+
+def prepare_run_layout(
+    run_dir: str | Path,
+    instruction_text: str,
+    *,
+    agent_subdir: str = "agent",
+    workspace_subdir: str = "workspace",
+    instruction_name: str = "instruction.md",
+) -> RunLayout:
+    """Materialise the agent-log and workspace dirs under ``run_dir`` and write the instruction file."""
+    base = Path(run_dir)
+    logs, workspace = base / agent_subdir, base / workspace_subdir
+    for directory in (logs, workspace):
+        directory.mkdir(parents=True, exist_ok=True)
+
+    # The instruction file lives alongside the agent logs.
+    instruction_file = logs / instruction_name
+    instruction_file.write_text(instruction_text, encoding="utf-8")
+
+    return RunLayout(
+        run_dir=base,
+        agent_log_dir=logs,
+        workspace_dir=workspace,
+        instruction_path=instruction_file,
+    )
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/verify.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/verify.py
new file mode 100644
index 0000000000..7e1b0fb0c0
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/verify.py
@@ -0,0 +1,86 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Generic verifier-phase mechanic: collect a reward + stamp attempt metadata.
+
+This is the backend-agnostic core. *What* the verifier runs (command, env,
+mounts) and *how* it is invoked are caller concerns — the caller executes its
+verifier through an environment handle, then uses :func:`collect_verifier_outcome`
+to read the reward/stdout convention out of the verifier's log dir, and
+:func:`apply_verify_to_metadata` to stamp the result onto an attempt so a
+reward metric can score it.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+
+@dataclass(frozen=True)
+class VerifierOutcome:
+    """Result of a verifier phase for one task."""
+
+    ran: bool  # whether the verifier ran at all
+    passed: bool  # whether the verifier run counted as a pass
+    reward: int  # integer reward recorded for the run
+    exit_code: int  # verifier exit code
+    stdout: str  # captured verifier stdout text
+    verifier_log_dir: Path | None  # verifier log directory, if there is one
+
+
+def skipped_outcome() -> VerifierOutcome:
+    """Outcome representing a verifier that did not run (``ran=False``, zero reward, no log dir)."""
+    return VerifierOutcome(ran=False, passed=False, reward=0, exit_code=0, stdout="", verifier_log_dir=None)
+
+
+def collect_verifier_outcome(
+    *,
+    ok: bool,
+    exit_code: int,
+    log_dir: str | Path,
+    reward_filename: str = "reward.txt",
+    stdout_filename: str = "test-stdout.txt",
+) -> VerifierOutcome:
+    """Build a :class:`VerifierOutcome` from a verifier run's log dir.
+
+    Reads ``reward.txt`` (``1``/``0``) when present; otherwise derives the reward
+    from ``ok`` and writes the file so reruns are stable. Reads ``test-stdout.txt``
+    when present.
+    """
+    log_dir = Path(log_dir)
+    passed = ok  # NOTE(review): ``passed`` follows ``ok`` even if reward.txt disagrees - confirm intended
+
+    stdout = ""
+    stdout_path = log_dir / stdout_filename
+    if stdout_path.is_file():
+        stdout = stdout_path.read_text(encoding="utf-8", errors="replace")  # tolerate non-UTF-8 output
+
+    reward_path = log_dir / reward_filename
+    if reward_path.is_file():
+        reward = 1 if reward_path.read_text(encoding="utf-8").strip() == "1" else 0  # anything but "1" -> 0
+    else:
+        reward = 1 if passed else 0
+        reward_path.parent.mkdir(parents=True, exist_ok=True)
+        reward_path.write_text("1\n" if passed else "0\n", encoding="utf-8")
+
+    return VerifierOutcome(
+        ran=True,
+        passed=passed,
+        reward=reward,
+        exit_code=exit_code,
+        stdout=stdout,
+        verifier_log_dir=log_dir,
+    )
+
+
+def apply_verify_to_metadata(metadata: dict[str, Any], outcome: VerifierOutcome) -> None:
+    """Record the verifier's status, pass flag, reward and log dir on ``metadata`` (in place)."""
+    if outcome.ran:
+        log_dir = outcome.verifier_log_dir
+        metadata["verify_status"] = "ok" if outcome.passed else "failed"
+        metadata["passed"], metadata["reward"] = outcome.passed, outcome.reward
+        metadata["verifier_log_dir"] = str(log_dir) if log_dir else None
+    else:
+        metadata.setdefault("verify_status", "skipped")  # never overwrite a status that is already set
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py
index 589a4efde1..03509ab038 100644
--- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py
@@ -287,6 +287,18 @@ async def run_tasks(
     ) -> Sequence[AgentEvalAttempt]: ...
 
 
+@runtime_checkable  # isinstance() then checks only that ``load_attempt`` exists, not its signature
+class AgentAttemptSource(Protocol):
+    """Loads a previously captured attempt for a task from a stored artifact.
+
+    The offline counterpart to :class:`AgentAttemptRuntime`: instead of executing
+    the agent, it adapts an already-produced run directory/file into an
+    :class:`AgentEvalAttempt` so it can be (re)scored through ``AgentEvaluator``.
+    """
+
+    def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt: ...
+
+
 def _metric_coverage(
     results: Sequence[AgentEvalTaskResult],
     tasks: Sequence[AgentEvalTask] | None,
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_coding_agent.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_coding_agent.py
new file mode 100644
index 0000000000..66e7715c07
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_coding_agent.py
@@ -0,0 +1,117 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Fixture-based tests for the coding-agent driver seam (no real CLIs)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+from nemo_evaluator_sdk.agent_eval.runtimes.coding_agent import (
+    ClaudeCodeSpec,
+    CliAgentDriver,
+    CodingAgentSpec,
+    CursorAgentSpec,
+    RunArtifacts,
+)
+from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunConfig, AgentEvalTask
+
+
+class _EchoSpec(CodingAgentSpec):
+    name = "echo_agent"
+    binary = "echo-agent"  # never executed: the tests inject a fake process factory
+
+    def build_command(self, artifacts: RunArtifacts) -> list[str]:
+        return [self.binary, "--out", str(artifacts.final_output_path)]  # "--out" is what _factory looks for
+
+    def extra_evidence(self, artifacts: RunArtifacts) -> dict:
+        from nemo_evaluator_sdk.values.evidence import EvidenceDescriptor
+
+        return {"trajectory": EvidenceDescriptor(kind="trace", format="jsonl", ref=str(artifacts.stdout_path))}
+
+
+class _FakeProcess:
+    def __init__(self, *, returncode: int, final_output_path: Path | None, stdout: bytes = b"", stderr: bytes = b""):
+        self.returncode = returncode
+        self._final_output_path = final_output_path  # None means "do not write a final answer"
+        self._stdout = stdout
+        self._stderr = stderr
+
+    async def communicate(self, stdin: bytes | None = None) -> tuple[bytes, bytes]:
+        if self._final_output_path is not None:
+            self._final_output_path.write_text("final answer", encoding="utf-8")  # simulate the agent's output
+        return self._stdout, self._stderr
+
+
+def _factory(*, returncode: int = 0, write_final: bool = True):
+    captured: dict = {}  # filled with the spawned command so tests can inspect it
+
+    async def factory(*command, **kwargs):
+        captured["command"] = list(command)
+        final_path = Path(command[command.index("--out") + 1]) if "--out" in command else None
+        return _FakeProcess(
+            returncode=returncode,
+            final_output_path=final_path if write_final else None,  # write_final=False: no final-output file
+            stdout=b'{"event":"done"}\n',
+        )
+
+    return factory, captured
+
+
+def _task() -> AgentEvalTask:
+    return AgentEvalTask(id="demo/task", intent="do the thing", inputs={"k": "v"})  # id contains "/"
+
+
+@pytest.mark.asyncio
+async def test_driver_produces_completed_attempt_with_evidence(tmp_path: Path) -> None:
+    factory, captured = _factory()
+    driver = CliAgentDriver(_EchoSpec(), work_root=tmp_path, process_factory=factory)
+
+    attempts = await driver.run_tasks([_task()], AgentEvalRunConfig())
+    attempt = attempts[0]
+
+    assert captured["command"][0] == "echo-agent"  # the spec's binary leads the spawned command
+    assert attempt.status == "completed"
+    assert attempt.output is not None and attempt.output.text == "final answer"
+    # Standard + spec-provided evidence keys are present; the prompt file is checked on disk.
+    assert {"workspace", "prompt", "task", "stdout", "stderr", "trajectory"} <= set(attempt.evidence.descriptors)
+    assert (tmp_path / "demo-task" / "prompt.txt").read_text(encoding="utf-8").startswith("Task id: demo/task")
+
+
+@pytest.mark.asyncio
+async def test_driver_marks_failed_on_nonzero_exit(tmp_path: Path) -> None:
+    factory, _ = _factory(returncode=1, write_final=False)  # exit code 1, no final-output file
+    driver = CliAgentDriver(_EchoSpec(), work_root=tmp_path, process_factory=factory)
+
+    attempt = (await driver.run_tasks([_task()]))[0]  # no run config passed here
+    assert attempt.status == "failed"
+    assert attempt.output is None
+    assert "error" in attempt.evidence.descriptors
+    assert (tmp_path / "demo-task" / "error.json").exists()
+
+
+def test_reference_specs_build_expected_commands(tmp_path: Path) -> None:
+    artifacts = RunArtifacts(  # placeholder paths; this test creates none of them
+        evidence_dir=tmp_path,
+        workspace_dir=tmp_path / "workspace",
+        prompt_path=tmp_path / "p",
+        task_path=tmp_path / "t",
+        stdout_path=tmp_path / "o",
+        stderr_path=tmp_path / "e",
+        final_output_path=tmp_path / "f",
+    )
+    claude_cmd = ClaudeCodeSpec(model="claude-x").build_command(artifacts)
+    assert claude_cmd[0] == "claude" and "--model" in claude_cmd and "claude-x" in claude_cmd
+
+    cursor_cmd = CursorAgentSpec().build_command(artifacts)  # no model given
+    assert cursor_cmd[0] == "cursor-agent" and "--model" not in cursor_cmd
+
+
+def test_driver_rejects_spec_without_binary(tmp_path: Path) -> None:
+    class _NoBinary(CodingAgentSpec):  # deliberately leaves ``binary`` unset
+        def build_command(self, artifacts: RunArtifacts) -> list[str]:
+            return []
+
+    with pytest.raises(ValueError, match="non-empty"):  # rejected at construction time
+        CliAgentDriver(_NoBinary(), work_root=tmp_path)
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_common_metrics.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_common_metrics.py
new file mode 100644
index 0000000000..3e5f9361a2
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_common_metrics.py
@@ -0,0 +1,86 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for promoted attempt helpers and reusable metrics."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+from nemo_evaluator_sdk.agent_eval.attempts import resolve_attempt_status, standard_evidence_descriptors
+from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric, EvidencePresenceMetric
+from nemo_evaluator_sdk.metrics.protocol import CandidateOutput, DatasetRow, MetricInput
+from nemo_evaluator_sdk.values.evidence import CandidateEvidence
+
+
+def test_resolve_attempt_status_keeps_failed_agents_scorable() -> None:
+    assert resolve_attempt_status(True) == "completed"
+    assert resolve_attempt_status(False) == "partial"  # "partial", not "failed", per the test name
+
+
+def test_standard_evidence_descriptors_builds_doc_keys(tmp_path: Path) -> None:
+    logs = tmp_path / "agent"
+    workspace = tmp_path / "workspace"
+    verifier = tmp_path / "verifier"
+    logs.mkdir()
+    workspace.mkdir()
+    verifier.mkdir()  # exists -> verifier_logs included
+
+    descriptors = standard_evidence_descriptors(
+        logs_dir=logs,
+        final_state_dir=workspace,
+        trace_path=tmp_path / "atif_trajectory.json",  # never created; only the path is passed
+        initial_state_ref=str(tmp_path / "seed"),  # likewise just a reference string
+        verifier_logs_dir=verifier,
+        primary_log="nat_agent.log",
+    )
+    assert set(descriptors) == {"initial_state", "trace", "logs", "final_state", "verifier_logs"}
+    assert descriptors["trace"].format == "atif"
+    assert descriptors["logs"].metadata["primary_log"] == "nat_agent.log"
+
+    # verifier_logs omitted when the dir is absent.
+    no_verifier = standard_evidence_descriptors(
+        logs_dir=logs, final_state_dir=workspace, verifier_logs_dir=tmp_path / "missing"
+    )
+    assert "verifier_logs" not in no_verifier
+
+
+@pytest.mark.asyncio
+async def test_agent_phase_success_metric_reads_metadata_and_namespaces_type() -> None:
+    metric = AgentPhaseSuccessMetric()
+    assert metric.type == "agent_phase_success"  # default type name
+    ok = await metric.compute_scores(
+        MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(metadata={"agent_ok": True}))
+    )
+    assert ok.outputs[0].value == 1.0
+
+    class Namespaced(AgentPhaseSuccessMetric):  # subclass overrides only ``metric_type``
+        metric_type = "agentic_use_agent_phase"
+
+    assert Namespaced().type == "agentic_use_agent_phase"
+
+
+@pytest.mark.asyncio
+async def test_evidence_presence_metric_scores_over_evidence(tmp_path: Path) -> None:
+    final_state = tmp_path / "workspace"
+    final_state.mkdir()
+    (final_state / "result.txt").write_text("done", encoding="utf-8")
+    evidence = CandidateEvidence(
+        descriptors=standard_evidence_descriptors(logs_dir=tmp_path / "agent", final_state_dir=final_state)
+    )
+
+    metric = EvidencePresenceMetric()
+    present = await metric.compute_scores(
+        MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(evidence=evidence))
+    )
+    assert present.outputs[0].value == 1.0
+
+    # Empty workspace -> non-empty requirement fails; no evidence -> 0.
+    (final_state / "result.txt").unlink()  # the workspace dir stays but is now empty
+    empty = await metric.compute_scores(
+        MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(evidence=evidence))
+    )
+    assert empty.outputs[0].value == 0.0
+    missing = await metric.compute_scores(MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput()))
+    assert missing.outputs[0].value == 0.0
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py
index 5e0446b1eb..c051499030 100644
--- a/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py
@@ -16,7 +16,7 @@
 from nemo_evaluator_sdk.agent_eval.runtimes import docker_sandbox
 from nemo_evaluator_sdk.agent_eval.runtimes.docker_sandbox import (
     DockerSandboxAgentRuntime,
-    SandboxSdk,
+    SandboxSDK,
 )
 
 
@@ -147,8 +147,8 @@ async def run(self, agent: _FakeSandboxAgent, prompt: str, *, run_config: _FakeR
         raise RuntimeError("sandbox run failed")
 
 
-def _fake_sdk() -> SandboxSdk:
-    return SandboxSdk(
+def _fake_sdk() -> SandboxSDK:
+    return SandboxSDK(
         Runner=_FakeRunner(),
         RunConfig=_FakeRunConfig,
         SandboxRunConfig=_FakeSandboxRunConfig,
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_environment.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_environment.py
new file mode 100644
index 0000000000..b7df9a61d4
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_environment.py
@@ -0,0 +1,77 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for the promoted environment boundary + environment authoring."""
+
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+
+import pytest
+from nemo_evaluator_sdk.agent_eval.runtimes import docker as docker_mod
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import (
+    DockerEnvironmentHandle,
+    DockerEnvironmentProvider,
+    EnvRunSpec,
+    default_image_tag,
+)
+from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec, plan_task_build
+from nemo_evaluator_sdk.agent_eval.types import AgentEvalTask
+
+
+@pytest.mark.asyncio
+async def test_docker_handle_routes_roles_through_single_run(monkeypatch: pytest.MonkeyPatch) -> None:
+    calls: list[tuple[str, list[str]]] = []
+
+    def fake_docker_run(image: str, command: list[str], **kwargs: object) -> subprocess.CompletedProcess[str]:
+        calls.append((image, command))  # env/mounts/timeout kwargs are accepted and ignored
+        return subprocess.CompletedProcess(args=command, returncode=0)
+
+    monkeypatch.setattr(docker_mod, "docker_run", fake_docker_run)  # replace the real docker_run
+
+    handle = DockerEnvironmentHandle("img:latest")
+    spec = EnvRunSpec(command=["echo", "hi"])
+    assert (await handle.run_agent(spec)).ok
+    assert (await handle.run_verifier(spec)).ok
+    assert calls == [("img:latest", ["echo", "hi"]), ("img:latest", ["echo", "hi"])]
+
+
+@pytest.mark.asyncio
+async def test_docker_handle_reports_timeout(monkeypatch: pytest.MonkeyPatch) -> None:
+    def fake_docker_run(image: str, command: list[str], **kwargs: object):
+        raise subprocess.TimeoutExpired(cmd=command, timeout=1)  # simulate a timed-out container
+
+    monkeypatch.setattr(docker_mod, "docker_run", fake_docker_run)
+    result = await DockerEnvironmentHandle("img").run(EnvRunSpec(command=["sleep"]), "agent")
+    assert result.timed_out and result.exit_code == 124 and not result.ok  # reported, not raised
+
+
+@pytest.mark.asyncio
+async def test_provider_uses_injected_image_tag_fn() -> None:
+    assert default_image_tag("t") == "t:latest"  # default mapping: "<task id>:latest"
+    provider = DockerEnvironmentProvider(image_tag_fn=lambda task_id: f"custom-{task_id}")
+    handle = await provider.prepare(AgentEvalTask(id="demo", intent="x", inputs={}))
+    assert isinstance(handle, DockerEnvironmentHandle)
+    assert handle.image == "custom-demo"  # the injected function, not the default, chose the tag
+
+
+def test_environment_spec_yaml_dockerfile_and_plan(tmp_path: Path) -> None:
+    (tmp_path / "environment.yaml").write_text(
+        "environment:\n  image: base:1\n  dependencies:\n    python: [pytest]\n  setup: [seed]\n",
+        encoding="utf-8",
+    )
+    spec = load_environment_spec(tmp_path)
+    assert spec.image == "base:1" and spec.python_dependencies == ["pytest"]
+
+    plan = plan_task_build(tmp_path, "img:latest", generated_dir=tmp_path / "build")
+    content = plan.dockerfile.read_text(encoding="utf-8")  # the derived Dockerfile was written to disk
+    assert plan.generated and plan.base_image == "base:1"
+    assert content.startswith("FROM base:1") and "pip install --no-cache-dir pytest" in content
+
+    # The Dockerfile fallback is used when no environment.yaml is present.
+    other = tmp_path / "task2" / "environment"
+    other.mkdir(parents=True)
+    (other / "Dockerfile").write_text("FROM scratch\n", encoding="utf-8")
+    escape = load_environment_spec(tmp_path / "task2")
+    assert escape.dockerfile == other / "Dockerfile" and escape.image is None
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_gating.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_gating.py
new file mode 100644
index 0000000000..613a4cfaa3
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_gating.py
@@ -0,0 +1,106 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for the promoted deterministic gate."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate, summarize_run, write_gate_report
+from nemo_evaluator_sdk.agent_eval.types import (
+    AgentEvalAttempt,
+    AgentEvalRunResult,
+    AgentEvalSummary,
+    AgentEvalTask,
+    AgentEvalTaskResult,
+    AgentOutput,
+)
+from nemo_evaluator_sdk.metrics.protocol import MetricOutput
+
+
+def _make_run_result(
+    *, reward: float, total_tokens: int, runtime_sec: float, commit: str = "abc123"
+) -> AgentEvalRunResult:
+    task = AgentEvalTask(id="demo", intent="do it", inputs={})
+    attempt = AgentEvalAttempt(
+        id="demo:workflow",
+        task_id="demo",
+        status="completed",
+        output=AgentOutput(text="ok"),
+        metadata={
+            "total_tokens": total_tokens,
+            "runtime_sec": runtime_sec,
+            "provenance": {"commit_sha": commit, "commit_short": commit[:7]},  # short sha = first 7 chars
+        },
+    )
+    task_result = AgentEvalTaskResult(
+        id="demo:workflow:agentic_use_verifier_reward",
+        run_id="run-1",
+        task_id="demo",
+        attempt_id="demo:workflow",
+        metric_type="agentic_use_verifier_reward",
+        outputs=[MetricOutput(name="verifier_reward", value=reward)],  # the single scored output
+    )
+    return AgentEvalRunResult(  # one task, one attempt, one metric result
+        run_id="run-1",
+        tasks=[task],
+        attempts=[attempt],
+        results=[task_result],
+        summary=AgentEvalSummary(),
+    )
+
+
+def test_summarize_run_aggregates_pass_tokens_runtime_provenance() -> None:
+    summary = summarize_run(_make_run_result(reward=1.0, total_tokens=120, runtime_sec=4.5))
+    assert summary["total_tasks"] == 1
+    assert summary["pass_rate"] == 1.0  # the single task has reward 1.0
+    assert summary["total_tokens_sum"] == 120
+    assert summary["runtime_sec_sum"] == 4.5
+    assert summary["token_metrics_coverage"] == 1.0
+    assert summary["provenance"]["commit_sha"] == "abc123"  # default commit from _make_run_result
+
+
+def test_evaluate_gate_passes_then_flags_token_regression(tmp_path: Path) -> None:
+    baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0)
+    candidate = _make_run_result(reward=1.0, total_tokens=200, runtime_sec=4.0)  # 2x the tokens
+
+    baseline_report = evaluate_gate(baseline, thresholds=GateThresholds())  # no baseline to compare to
+    assert baseline_report.gate_passed is True
+
+    candidate_report = evaluate_gate(candidate, thresholds=GateThresholds(), baseline_summary=baseline_report.summary)
+    assert candidate_report.gate_passed is False
+    token_check = next(c for c in candidate_report.checks if c.name == "tokens_not_worse_than_baseline")
+    assert token_check.passed is False
+
+    gate_path = write_gate_report(candidate_report, tmp_path)
+    assert gate_path.exists() and "gate_passed" in gate_path.read_text(encoding="utf-8")
+
+
+def test_evaluate_gate_blocks_cross_commit_comparison() -> None:
+    baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="aaa111")
+    candidate = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="bbb222")
+
+    baseline_summary = evaluate_gate(baseline, thresholds=GateThresholds()).summary
+    report = evaluate_gate(candidate, thresholds=GateThresholds(), baseline_summary=baseline_summary)
+    cross = next(c for c in report.checks if c.name == "commit_sha_matches_baseline")
+    assert cross.passed is False and report.gate_passed is False  # only the commit differs
+
+    allowed = evaluate_gate(
+        candidate, thresholds=GateThresholds(allow_cross_commit=True), baseline_summary=baseline_summary
+    )
+    cross_allowed = next(c for c in allowed.checks if c.name == "commit_sha_matches_baseline")
+    assert cross_allowed.passed is True  # explicit opt-in lifts the block
+
+
+def test_summarize_run_uses_measurement_fallbacks() -> None:
+    # duration_ms -> runtime_sec, and metadata reward when no scored metric output.
+    run = _make_run_result(reward=0.0, total_tokens=10, runtime_sec=1.0)
+    run.attempts[0].metadata.pop("runtime_sec")
+    run.attempts[0].metadata["duration_ms"] = 2500
+    run.attempts[0].metadata["reward"] = 1
+    run.results.clear()  # no scored metric outputs -> fall back to metadata reward
+
+    summary = summarize_run(run)
+    assert summary["runtime_sec_sum"] == 2.5  # 2500 ms -> 2.5 s
+    assert summary["pass_rate"] == 1.0  # taken from metadata["reward"], not the cleared results
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py
new file mode 100644
index 0000000000..ed7da3beee
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py
@@ -0,0 +1,37 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Guardrail: the agent_eval package must stay free of NeMo-Platform imports.
+
+The SDK is consumed by ``tests/agentic-use`` (the NeMo-Platform adapter), never
+the reverse. This test fails if any module under ``agent_eval`` imports a
+platform-specific package, which keeps the promotion from leaking coupling into
+the SDK.
+"""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+
+import nemo_evaluator_sdk.agent_eval as agent_eval
+
+AGENT_EVAL_ROOT = Path(agent_eval.__file__).resolve().parent
+
+# Forbidden platform/adapter imports; indent is [ \t]* (not \s*) so a match starts on the import's own line.
+_FORBIDDEN = re.compile(
+    r"^[ \t]*(?:from|import)\s+"
+    r"(nemo_platform|nmp_[A-Za-z0-9_]+|nat_runner|runtimes(?:\.|\s|$)|evaluator_agent_eval)",
+    re.MULTILINE,
+)
+
+
+def test_agent_eval_has_no_platform_imports() -> None:
+    offenders: list[str] = []
+    for path in sorted(AGENT_EVAL_ROOT.rglob("*.py")):
+        text = path.read_text(encoding="utf-8")
+        for match in _FORBIDDEN.finditer(text):
+            line_no = text.count("\n", 0, match.start()) + 1
+            offenders.append(f"{path.relative_to(AGENT_EVAL_ROOT)}:{line_no}: {match.group(0).strip()}")
+
+    assert not offenders, "agent_eval must not import NeMo-Platform packages:\n" + "\n".join(offenders)
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_measurements.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_measurements.py
new file mode 100644
index 0000000000..bc11bce7ef
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_measurements.py
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for the typed AttemptMeasurements contract."""
+
+from __future__ import annotations
+
+from nemo_evaluator_sdk.agent_eval.measurements import AttemptMeasurements
+
+
+def test_from_metadata_reads_tokens_runtime_reward_and_provenance() -> None:
+    measurements = AttemptMeasurements.from_metadata(
+        {
+            "total_tokens": 120,
+            "prompt_tokens": 80,
+            "completion_tokens": 40,
+            "runtime_sec": 4.5,
+            "reward": 1,
+            "passed": True,
+            "provenance": {"commit_sha": "abc123"},
+        }
+    )
+    assert measurements.total_tokens == 120
+    assert measurements.runtime_sec == 4.5
+    assert measurements.reward == 1.0
+    assert measurements.passed is True
+    assert measurements.provenance["commit_sha"] == "abc123"
+
+
+def test_from_metadata_applies_fallbacks_and_ignores_bad_types() -> None:
+    # duration_ms -> runtime_sec, passed -> reward, bool is not a token count.
+    measurements = AttemptMeasurements.from_metadata(
+        {"duration_ms": 2500, "passed": False, "total_tokens": True}
+    )
+    assert measurements.runtime_sec == 2.5
+    assert measurements.reward == 0.0
+    assert measurements.total_tokens is None
+
+    empty = AttemptMeasurements.from_metadata(None)
+    assert empty.reward is None and empty.runtime_sec is None and empty.provenance == {}
+
+
+def test_to_metadata_round_trips_only_set_values() -> None:
+    payload = AttemptMeasurements(total_tokens=10, runtime_sec=1.0, reward=1.0).to_metadata()
+    assert payload == {"total_tokens": 10, "runtime_sec": 1.0, "reward": 1.0}
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_orchestrator.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_orchestrator.py
new file mode 100644
index 0000000000..d5acd5bd3f
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_orchestrator.py
@@ -0,0 +1,131 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for the generic agent-eval orchestrator (online + offline paths)."""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Sequence
+from pathlib import Path
+
+import pytest
+from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric
+from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig
+from nemo_evaluator_sdk.agent_eval.types import (
+    AgentEvalAttempt,
+    AgentEvalRunConfig,
+    AgentEvalTask,
+    AgentOutput,
+)
+from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult
+
+
+class _ExtraMetric:
+    @property
+    def type(self) -> str:
+        return "extra"
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        return [MetricOutputSpec.continuous_score("extra")]
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        return MetricResult(outputs=[MetricOutput(name="extra", value=1.0)])
+
+
+class _FakeRuntime:
+    def __init__(self) -> None:
+        self.prepared_ids: list[str] = []
+
+    async def run_tasks(
+        self, tasks: Sequence[AgentEvalTask], config: AgentEvalRunConfig | None = None
+    ) -> Sequence[AgentEvalAttempt]:
+        return [
+            AgentEvalAttempt(
+                id=f"{task.id}:fake",
+                task_id=task.id,
+                status="completed",
+                output=AgentOutput(text="ok"),
+                metadata={"agent_ok": True},
+            )
+            for task in tasks
+        ]
+
+
+def _task() -> AgentEvalTask:
+    return AgentEvalTask(id="demo", intent="do it", inputs={}, metrics=[AgentPhaseSuccessMetric()])
+
+
+@pytest.mark.asyncio
+async def test_run_tasks_appends_extra_metrics_and_runs_prepare_hook(tmp_path: Path) -> None:
+    runtime = _FakeRuntime()
+    seen: list[str] = []
+    orch = AgentEvalOrchestrator(
+        config=OrchestratorConfig(write_dashboard=False, write_gate=True),
+        extra_metrics=[_ExtraMetric()],
+    )
+
+    result = await orch.run_tasks(
+        [_task()],
+        target=runtime,
+        benchmark={"benchmark": "demo"},
+        output_dir=tmp_path,
+        run_id="run-1",
+        prepare_task=lambda task: seen.append(task.id),
+    )
+
+    assert seen == ["demo"]
+    assert {m.type for m in result.tasks[0].metrics} == {"agent_phase_success", "extra"}
+    assert result.attempts[0].status == "completed"
+    # Gate is written next to the run bundle.
+    assert (tmp_path / "gate.json").exists()
+
+
+@pytest.mark.asyncio
+async def test_score_attempts_offline_does_not_invoke_runtime() -> None:
+    orch = AgentEvalOrchestrator(config=OrchestratorConfig(write_dashboard=False, write_gate=False))
+    attempt = AgentEvalAttempt(
+        id="demo:stored",
+        task_id="demo",
+        status="completed",
+        output=AgentOutput(text="ok"),
+        metadata={"agent_ok": True},
+    )
+    result = await orch.score_attempts([_task()], attempts=[attempt])
+    assert [m.type for m in result.tasks[0].metrics] == ["agent_phase_success"]
+    assert any(r.metric_type == "agent_phase_success" for r in result.results)
+
+
+@pytest.mark.asyncio
+async def test_extra_metrics_deduplicated_by_type() -> None:
+    task = AgentEvalTask(id="demo", intent="i", inputs={}, metrics=[AgentPhaseSuccessMetric(), _ExtraMetric()])
+    orch = AgentEvalOrchestrator(
+        config=OrchestratorConfig(write_dashboard=False, write_gate=False),
+        extra_metrics=[_ExtraMetric()],
+    )
+    attempt = AgentEvalAttempt(id="demo:s", task_id="demo", status="completed", output=AgentOutput(text="ok"))
+    result = await orch.score_attempts([task], attempts=[attempt])
+    types = [m.type for m in result.tasks[0].metrics]
+    assert types.count("extra") == 1
+
+
+def test_result_dir_attempt_source_protocol_shape(tmp_path: Path) -> None:
+    # A minimal AgentAttemptSource implementation satisfies the protocol.
+    from nemo_evaluator_sdk.agent_eval.types import AgentAttemptSource
+
+    class _Source:
+        def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt:
+            payload = json.loads(Path(source).read_text(encoding="utf-8"))
+            return AgentEvalAttempt(
+                id=f"{task.id}:stored",
+                task_id=task.id,
+                status="completed",
+                output=AgentOutput(text=payload["agent"]),
+            )
+
+    src_path = tmp_path / "result.json"
+    src_path.write_text(json.dumps({"agent": "ok"}), encoding="utf-8")
+    source: AgentAttemptSource = _Source()
+    assert isinstance(source, AgentAttemptSource)
+    attempt = source.load_attempt(src_path, task=_task())
+    assert attempt.task_id == "demo"
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_verify.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_verify.py
new file mode 100644
index 0000000000..136fda6075
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_verify.py
@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for the generic verifier mechanic."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from nemo_evaluator_sdk.agent_eval.runtimes.verify import (
+    apply_verify_to_metadata,
+    collect_verifier_outcome,
+    skipped_outcome,
+)
+
+
+def test_collect_reads_reward_file_when_present(tmp_path: Path) -> None:
+    (tmp_path / "reward.txt").write_text("1\n", encoding="utf-8")
+    (tmp_path / "test-stdout.txt").write_text("PASSED", encoding="utf-8")
+    outcome = collect_verifier_outcome(ok=False, exit_code=3, log_dir=tmp_path)
+    # reward.txt is authoritative even when the process exit said not-ok.
+    assert outcome.ran and outcome.reward == 1 and outcome.exit_code == 3
+    assert outcome.stdout == "PASSED"
+
+
+def test_collect_derives_and_writes_reward_when_missing(tmp_path: Path) -> None:
+    outcome = collect_verifier_outcome(ok=True, exit_code=0, log_dir=tmp_path)
+    assert outcome.reward == 1 and outcome.passed is True
+    assert (tmp_path / "reward.txt").read_text(encoding="utf-8").strip() == "1"
+
+
+def test_apply_to_metadata_stamps_and_skips(tmp_path: Path) -> None:
+    meta: dict[str, object] = {}
+    apply_verify_to_metadata(meta, skipped_outcome())
+    assert meta == {"verify_status": "skipped"}
+
+    meta2: dict[str, object] = {}
+    apply_verify_to_metadata(meta2, collect_verifier_outcome(ok=True, exit_code=0, log_dir=tmp_path))
+    assert meta2["verify_status"] == "ok" and meta2["reward"] == 1 and meta2["passed"] is True
diff --git a/tests/agentic-use/runtimes/COMPLIANCE.md b/tests/agentic-use/runtimes/COMPLIANCE.md
index b7d55b1b13..3631361cad 100644
--- a/tests/agentic-use/runtimes/COMPLIANCE.md
+++ b/tests/agentic-use/runtimes/COMPLIANCE.md
@@ -7,6 +7,23 @@ and `AgentAttemptRuntime` in `nemo_evaluator_sdk.agent_eval`).
 Design reference: internal agent-eval SDK doc
 (`https://docs.google.com/document/d/1mA9Kl6LVJFlgbj5CGulUOiaGyliP7QhqBh7jKXFGifM`).
 
+## Adapter-over-SDK note
+
+The generic building blocks have been **promoted into the SDK**
+(`nemo_evaluator_sdk.agent_eval`): the environment boundary
+(`runtimes.environment`/`environment_spec`/`docker`), gating (`gating`), attempt
+helpers (`attempts`), generic layout (`runtimes.layout`), reusable metrics
+(`common_metrics`: `AgentPhaseSuccessMetric` + a real metric-over-evidence
+`EvidencePresenceMetric`), the generic orchestrator (`orchestrator`), the
+`AgentAttemptSource` protocol, the verifier mechanic (`runtimes.verify`), and the
+coding-agent driver seam (`runtimes.coding_agent`). The `shared/*` modules
+referenced below are now **re-export shims** over those SDK homes (see
+`README.md` for the shim→SDK table); only NeMo-Platform specifics
+(`task_loader`, `result_adapter`, `config`, the pytest verifier command, the
+`state` evidence key, `task_image_tag`) remain platform code. A CI grep gate
+(`packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py`) keeps
+`agent_eval/` free of NeMo-Platform imports.
+
 ## Scope split (per SDK design)
 
 | `nat_runner` responsibility | Belongs in `AgentAttemptRuntime`? | Current location |
diff --git a/tests/agentic-use/runtimes/README.md b/tests/agentic-use/runtimes/README.md
index 90317f204a..d5ecff2c38 100644
--- a/tests/agentic-use/runtimes/README.md
+++ b/tests/agentic-use/runtimes/README.md
@@ -1,32 +1,62 @@
 # Agentic-use AgentAttemptRuntime implementations
 
-Backend-specific runtimes extracted from `nat_runner.py` for use with
-`nemo_evaluator_sdk.agent_eval.AgentEvaluator`.
+NeMo-Platform **adapter** over the generic agent-eval framework in
+`nemo_evaluator_sdk.agent_eval`. The backend-agnostic building blocks (environment
+boundary, gating, attempt/evidence helpers, orchestrator, verify mechanic,
+coding-agent driver seam) now live in the SDK; this directory holds only the
+NeMo-Platform glue (the `workflow`/`aut` backends, agentic task/result formats,
+the pytest verifier, the platform Docker build/image-tag) plus a thin factory.
+
+## Architecture: adapter over SDK
+
+The `shared/*` modules below are **thin shims** over their SDK homes — they keep
+existing imports working; generic logic lives in the SDK, platform extras stay here:
+
+| `shared/` shim | SDK home |
+|----------------|----------|
+| `docker.py` | `agent_eval.runtimes.docker` |
+| `environment.py` | `agent_eval.runtimes.environment` (re-supplies the platform image-tag) |
+| `environment_spec.py` | `agent_eval.runtimes.environment_spec` |
+| `reporting.py` | `agent_eval.gating` |
+| `verify.py` | wraps `agent_eval.runtimes.verify` (pytest command/env/mounts stay here) |
+| `metrics.py` | `AgentPhaseSuccessMetric` from `agent_eval.common_metrics` (namespaced); `VerifierRewardMetric` is platform |
+| `artifacts.py` | `resolve_attempt_status` + evidence keys from `agent_eval.attempts`; adds the platform `state` key |
+| `layout.py` | delegates to `agent_eval.runtimes.layout`; adds the platform `state_dir` + `task_image_tag` |
+
+The orchestrator (`orchestrator.py`) is a thin factory over
+`agent_eval.orchestrator.AgentEvalOrchestrator`: it injects the platform image
+build (`prepare_task`), the `run_verify`-derived `VerifierRewardMetric`
+(`extra_metrics`), and the `result.json` `AgentAttemptSource`.
 
 ## Layout
 
 ```text
 runtimes/
-  shared/           # backend-agnostic building blocks:
-                    #   docker.py            Docker exec + build helpers
-                    #   environment.py       AgentEnvironmentProvider/Handle boundary (B2)
-                    #   environment_spec.py  environment.yaml authoring + build plans (B3)
-                    #   layout.py            per-run output layout
-                    #   task_loader.py       agentic-use task -> AgentEvalTask
-                    #   container_env.py     base container env vars
-                    #   artifacts.py         agent artifacts -> AgentEvalAttempt (+ evidence)
-                    #   result_adapter.py    nat_runner result.json -> AgentEvalAttempt (B1/B4)
-                    #   verify.py            live VERIFY via run_verifier
-                    #   reporting.py         summary + candidate/baseline gate (B4)
-                    #   metrics.py           AgentPhaseSuccessMetric, VerifierRewardMetric
-  workflow/         # NatWorkflowAttemptRuntime (implemented)
-  aut/              # AutAgentAttemptRuntime (implemented)
-  claude_code/      # ClaudeCodeAgentAttemptRuntime (scaffold)
-  codex/            # CodexAgentAttemptRuntime (scaffold)
-  cursor_agent/     # CursorAgentAttemptRuntime (scaffold)
-  orchestrator.py   # BUILD (env spec) + AgentEvaluator + gate; verify runs in the runtime
+  shared/           # thin re-export shims over agent_eval.* (see table above)
+                    #   + platform-only: task_loader.py, result_adapter.py,
+                    #     config.py, container_env.py, constants.py
+  workflow/         # NatWorkflowAttemptRuntime (implemented, NeMo construct)
+  aut/              # AutAgentAttemptRuntime (implemented, NeMo construct)
+  claude_code/      # scaffold (stub) — see "Coding-agent runtimes" below
+  codex/            # scaffold (stub)
+  cursor_agent/     # scaffold (stub)
+  orchestrator.py   # thin factory over agent_eval.orchestrator.AgentEvalOrchestrator
 ```
 
+## Coding-agent runtimes (SDK driver seam)
+
+Coding-agent CLIs plug into the SDK via
+`agent_eval.runtimes.coding_agent`: `CliAgentDriver` (the reusable driver) +
+`CodingAgentSpec` (per-agent command builder + trajectory→evidence parser).
+Reference specs (`ClaudeCodeSpec`, `CursorAgentSpec`) are shipped. The profbench codex
+runtime (`agent_eval.runtimes.codex`) remains a separate, standalone-CLI runtime.
+
+The agentic-use `codex`/`claude_code`/`cursor_agent` backends here are still
+stubs: wiring them to run the SDK driver *inside* the `nmp-agentic-base` Docker
+environment (like `workflow`/`aut`) is bespoke per agent and a tracked follow-up.
+`workflow` and `aut` stay in the adapter — they implement `AgentAttemptRuntime`
+but are NeMo constructs, not general SDK runtimes.
+
 ## Example: workflow backend
 
 From the repository root (requires Docker + built task image):
diff --git a/tests/agentic-use/runtimes/orchestrator.py b/tests/agentic-use/runtimes/orchestrator.py
index 94eb00050e..8a355fbdfc 100644
--- a/tests/agentic-use/runtimes/orchestrator.py
+++ b/tests/agentic-use/runtimes/orchestrator.py
@@ -1,7 +1,14 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Orchestrate BUILD + AgentEvaluator + VERIFY for agentic-use tasks."""
+"""Agentic-use adapter over the generic SDK orchestrator.
+
+This is a thin NeMo-Platform factory: the generic run/score/gate loop lives in
+:class:`nemo_evaluator_sdk.agent_eval.orchestrator.AgentEvalOrchestrator`. Here we
+inject the platform specifics it deliberately does not know about — the agentic
+task loader, the Docker image build (``prepare_task``), the ``run_verify``-derived
+``VerifierRewardMetric``, and the ``result.json`` :class:`AgentAttemptSource`.
+"""
 
 from __future__ import annotations
 
@@ -10,7 +17,7 @@
 from pathlib import Path
 from typing import Any
 
-from nemo_evaluator_sdk.agent_eval import AgentEvalRunConfig, AgentEvaluator
+from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig
 from nemo_evaluator_sdk.agent_eval.types import (
     AgentAttemptRuntime,
     AgentEvalRunResult,
@@ -22,8 +29,8 @@
 from runtimes.shared.environment_spec import execute_build_plan, plan_task_build
 from runtimes.shared.layout import task_image_tag
 from runtimes.shared.metrics import VerifierRewardMetric
-from runtimes.shared.reporting import GateThresholds, evaluate_gate, load_baseline_summary, write_gate_report
-from runtimes.shared.result_adapter import attempt_from_result_dir
+from runtimes.shared.reporting import GateThresholds
+from runtimes.shared.result_adapter import ResultDirAttemptSource
 from runtimes.shared.task_loader import agentic_task_from_dir
 
 
@@ -38,7 +45,7 @@ class AgenticOrchestratorConfig:
 
 
 class AgenticEvalOrchestrator:
-    """Run agentic-use tasks through AgentEvaluator and optional verify phase."""
+    """Run agentic-use tasks through the generic orchestrator + optional verify metric."""
 
     def __init__(
         self,
@@ -48,6 +55,16 @@ def __init__(
     ) -> None:
         self.runtime = runtime
         self.config = config or AgenticOrchestratorConfig()
+        self._orchestrator = AgentEvalOrchestrator(
+            config=OrchestratorConfig(
+                parallelism=1,
+                write_dashboard=self.config.write_dashboard,
+                write_gate=self.config.write_gate,
+                gate_thresholds=self.config.gate_thresholds,
+                baseline_summary_path=self.config.baseline_summary_path,
+            ),
+            extra_metrics=self._extra_metrics(),
+        )
 
     async def run_agent_eval(
         self,
@@ -58,25 +75,15 @@ async def run_agent_eval(
     ) -> AgentEvalRunResult:
         """Build the task image when needed, run the agent runtime, return SDK result."""
         task = agentic_task_from_dir(task_name)
-        task = task.model_copy(update={"metrics": self._metrics_for_task(task)})
-        image_tag = task_image_tag(task.id)
-        self._ensure_task_image(task.metadata["task_dir"], image_tag)
-
-        result = await AgentEvaluator().run(
-            tasks=[task],
+        return await self._orchestrator.run_tasks(
+            [task],
             target=self.runtime,
-            config=AgentEvalRunConfig(
-                output_dir=output_dir,
-                run_id=run_id,
-                parallelism=1,
-                write_dashboard=self.config.write_dashboard,
-                benchmark={"benchmark": "agentic-use", "task": task_name},
-            ),
+            benchmark={"benchmark": "agentic-use", "task": task_name},
+            output_dir=output_dir,
+            run_id=run_id,
+            prepare_task=self._ensure_task_image,
         )
 
-        self._maybe_write_gate(result)
-        return result
-
     async def score_captured_attempts(
         self,
         task_name: str,
@@ -87,61 +94,38 @@ async def score_captured_attempts(
     ) -> AgentEvalRunResult:
         """Score already-captured ``result.json`` runs without re-running the agent.
 
-        This is the SDK's first-class *stored-attempt* path: it imports each
-        ``nat_runner`` output directory via :func:`attempt_from_result_dir` and
-        scores them through :class:`AgentEvaluator`, so metrics can be exercised
-        (and runs rescored) with no Docker/agent execution.
+        The SDK's first-class *stored-attempt* path: each ``nat_runner`` output
+        dir is adapted via :class:`ResultDirAttemptSource` and scored through the
+        generic orchestrator, so metrics can be exercised (and runs rescored) with
+        no Docker/agent execution.
         """
         task = agentic_task_from_dir(task_name)
-        task = task.model_copy(update={"metrics": self._metrics_for_task(task)})
-        attempts = [attempt_from_result_dir(result_dir, task=task) for result_dir in result_dirs]
-
-        result = await AgentEvaluator().run(
-            tasks=[task],
+        source = ResultDirAttemptSource()
+        attempts = [source.load_attempt(result_dir, task=task) for result_dir in result_dirs]
+        return await self._orchestrator.score_attempts(
+            [task],
             attempts=attempts,
-            config=AgentEvalRunConfig(
-                output_dir=output_dir,
-                run_id=run_id,
-                parallelism=1,
-                write_dashboard=self.config.write_dashboard,
-                benchmark={"benchmark": "agentic-use", "task": task_name, "mode": "offline"},
-            ),
+            benchmark={"benchmark": "agentic-use", "task": task_name, "mode": "offline"},
+            output_dir=output_dir,
+            run_id=run_id,
         )
 
-        self._maybe_write_gate(result)
-        return result
+    def _extra_metrics(self) -> list[Metric]:
+        """Return :class:`VerifierRewardMetric` as the extra metric only when the runtime runs verify.
 
-    def _maybe_write_gate(self, result: AgentEvalRunResult) -> None:
-        if not (self.config.write_gate and result.output_dir is not None):
-            return
-        baseline = (
-            load_baseline_summary(self.config.baseline_summary_path)
-            if self.config.baseline_summary_path is not None
-            else None
-        )
-        report = evaluate_gate(result, thresholds=self.config.gate_thresholds, baseline_summary=baseline)
-        write_gate_report(report, result.output_dir)
-
-    def _metrics_for_task(self, task: AgentEvalTask) -> list[Metric]:
-        """Honor task-authored metrics; only *append* a compatibility metric.
-
-        Metrics originate on the task (see ``agentic_task_from_dir``). When the
-        live verify phase is enabled we append :class:`VerifierRewardMetric` so
-        the legacy pytest reward is scored too — but we never replace the task's
-        own metric set, and we avoid duplicating a metric the task already
-        declares (the SDK rejects duplicate metric types).
+        The verify-enable decision stays in the adapter (it knows its own runtime
+        config); the generic orchestrator never introspects the runtime.
         """
-        metrics: list[Metric] = list(task.metrics)
-        if self._verify_enabled() and not any(isinstance(metric, VerifierRewardMetric) for metric in metrics):
-            metrics.append(VerifierRewardMetric())
-        return metrics
+        return [VerifierRewardMetric()] if self._verify_enabled() else []
 
     def _verify_enabled(self) -> bool:
         runtime_config = getattr(self.runtime, "config", None)
         shared = getattr(runtime_config, "shared", None)
         return bool(getattr(shared, "run_verify", False))
 
-    def _ensure_task_image(self, task_dir: str | Path, image_tag: str) -> None:
+    def _ensure_task_image(self, task: AgentEvalTask) -> None:
+        image_tag = task_image_tag(task.id)
+        task_dir = task.metadata["task_dir"]
         if self.config.skip_build:
             if not docker_image_exists(image_tag):
                 raise RuntimeError(
diff --git a/tests/agentic-use/runtimes/shared/artifacts.py b/tests/agentic-use/runtimes/shared/artifacts.py
index 4942568635..912c12a7a0 100644
--- a/tests/agentic-use/runtimes/shared/artifacts.py
+++ b/tests/agentic-use/runtimes/shared/artifacts.py
@@ -15,9 +15,9 @@
     AgentAttemptTrace,
     CapturedAgentAttempt,
 )
+from nemo_evaluator_sdk.agent_eval.attempts import resolve_attempt_status, standard_evidence_descriptors
 from nemo_evaluator_sdk.agent_eval.types import (
     AgentEvalAttempt,
-    AgentEvalAttemptStatus,
     AgentEvalTask,
     AgentOutput,
 )
@@ -27,20 +27,11 @@
 from runtimes.shared.layout import AgenticRunLayout
 from runtimes.shared.usage import extract_usage_metrics
 
-
-def resolve_attempt_status(agent_ok: bool) -> AgentEvalAttemptStatus:
-    """Map an agent-phase outcome to a *scorable* attempt status.
-
-    The SDK's :class:`AgentEvaluator` excludes ``status=="failed"`` from scoring
-    (it raises). An agent that ran but failed must still be scored — e.g. as a
-    ``0`` by :class:`AgentPhaseSuccessMetric` — so that pass-rate gating counts
-    it rather than dropping it. We therefore use ``"partial"`` for an
-    executed-but-unsuccessful agent and reserve ``"failed"`` for genuine
-    attempt-*production* failures (which a runtime surfaces by raising, not by
-    emitting an unscorable attempt). This keeps the live builder and the
-    ``result.json`` importer consistent.
-    """
-    return "completed" if agent_ok else "partial"
+__all__ = [
+    "build_agent_eval_attempt",
+    "resolve_attempt_status",
+    "to_captured_agent_attempt",
+]
 
 
 def build_agent_eval_attempt(
@@ -156,48 +147,20 @@ def _evidence_descriptors(
     *,
     initial_state_ref: str | None = None,
 ) -> dict[str, EvidenceDescriptor]:
-    """Build the evidence map specified by the agent-eval SDK design doc.
+    """Compose the SDK's standard evidence keys + the platform ``state`` extension.
 
-    Doc keys: ``initial_state`` (task input filesystem, when staged),
-    ``final_state`` (workspace), ``trace`` (trajectory, ATIF-normalized),
-    ``logs`` (agent log dir), and ``verifier_logs`` (verifier log dir).
-
-    ``state`` is a NeMo-Platform-specific *extension* (not a doc key): it carries
-    the preserved platform/database state across the agent + verifier phases.
+    The doc-standard keys (``initial_state``/``trace``/``logs``/``final_state``/
+    ``verifier_logs``) come from :func:`standard_evidence_descriptors`. ``state``
+    is a NeMo-Platform-specific *extension* (not a doc key): it carries the
+    preserved platform/database state across the agent + verifier phases.
     """
-    descriptors: dict[str, EvidenceDescriptor] = {}
-
-    # task input filesystem → evidence["initial_state"] (only when a seed was staged).
-    if initial_state_ref:
-        descriptors["initial_state"] = EvidenceDescriptor(
-            kind="filesystem",
-            format="dir",
-            ref=initial_state_ref,
-            metadata={"role": "initial_state"},
-        )
-
-    # agent/trajectory.json → evidence["trace"], preferably ATIF-normalized.
-    if artifacts.atif_trajectory_path is not None:
-        descriptors["trace"] = EvidenceDescriptor(
-            kind="trace",
-            format="atif" if artifacts.atif_trajectory_path.name.startswith("atif") else "json",
-            ref=str(artifacts.atif_trajectory_path),
-        )
-
-    # agent/ logs → evidence["logs"].
-    descriptors["logs"] = EvidenceDescriptor(
-        kind="logs",
-        format="dir",
-        ref=str(layout.agent_log_dir),
-        metadata={"primary_log": "nat_agent.log"},
-    )
-
-    # workspace/ → evidence["final_state"] filesystem descriptor.
-    descriptors["final_state"] = EvidenceDescriptor(
-        kind="filesystem",
-        format="dir",
-        ref=str(layout.workspace_dir),
-        metadata={"role": "final_state"},
+    descriptors = standard_evidence_descriptors(
+        logs_dir=layout.agent_log_dir,
+        final_state_dir=layout.workspace_dir,
+        trace_path=artifacts.atif_trajectory_path,
+        initial_state_ref=initial_state_ref,
+        verifier_logs_dir=layout.run_dir / "verifier",
+        primary_log="nat_agent.log",
     )
 
     # Platform extension (non-doc key): preserved platform/db state across phases.
@@ -208,16 +171,6 @@ def _evidence_descriptors(
         metadata={"role": "platform_state", "extension": "nemo-platform"},
     )
 
-    # verifier/ logs → evidence["verifier_logs"] (present once verify phase runs).
-    verifier_log_dir = layout.run_dir / "verifier"
-    if verifier_log_dir.exists():
-        descriptors["verifier_logs"] = EvidenceDescriptor(
-            kind="logs",
-            format="dir",
-            ref=str(verifier_log_dir),
-            metadata={"role": "verifier"},
-        )
-
     return descriptors
 
 
diff --git a/tests/agentic-use/runtimes/shared/docker.py b/tests/agentic-use/runtimes/shared/docker.py
index 431d646806..ce3cc6cc22 100644
--- a/tests/agentic-use/runtimes/shared/docker.py
+++ b/tests/agentic-use/runtimes/shared/docker.py
@@ -1,87 +1,26 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Docker helpers for agentic-use runtimes."""
+"""Compatibility shim — Docker helpers were promoted to the Evaluator SDK.
 
-from __future__ import annotations
-
-import os
-import subprocess
-from collections.abc import Sequence
-
-
-def redact_cmd_for_logging(cmd: Sequence[str]) -> list[str]:
-    """Redact secret values in command logs."""
-    redacted: list[str] = []
-    sensitive_markers = ("KEY", "TOKEN", "SECRET", "PASSWORD")
-    for token in cmd:
-        if "=" not in token:
-            redacted.append(token)
-            continue
-        left, right = token.split("=", 1)
-        env_key = left.split()[-1] if left else left
-        if any(marker in env_key.upper() for marker in sensitive_markers):
-            redacted.append(f"{left}=***REDACTED***")
-        else:
-            redacted.append(f"{left}={right}")
-    return redacted
-
-
-def docker_run(
-    image: str,
-    command: list[str],
-    *,
-    env: dict[str, str] | None = None,
-    mounts: list[tuple[str, str]] | None = None,
-    workdir: str | None = None,
-    remove: bool = True,
-    timeout: int | None = None,
-    extra_args: list[str] | None = None,
-) -> subprocess.CompletedProcess[str]:
-    """Run a command inside a Docker container."""
-    cmd = ["docker", "run"]
-    if remove:
-        cmd.append("--rm")
-    if workdir:
-        cmd += ["-w", workdir]
-
-    for key, value in (env or {}).items():
-        cmd += ["-e", f"{key}={value}"]
-
-    for host_path, container_path in mounts or []:
-        cmd += ["-v", f"{host_path}:{container_path}"]
+Import from ``nemo_evaluator_sdk.agent_eval.runtimes.docker`` directly; this
+module re-exports the same symbols so existing adapter imports keep working.
+"""
 
-    docker_extra = (extra_args or []) + (os.environ.get("DOCKER_EXTRA_ARGS", "").split() or [])
-    cmd += docker_extra
-    cmd.append(image)
-    cmd += command
-
-    print(f"[agentic-runtime] $ {' '.join(redact_cmd_for_logging(cmd))}")
-    kwargs: dict[str, object] = {"check": False, "text": True}
-    if timeout is not None:
-        kwargs["timeout"] = timeout
-    return subprocess.run(cmd, **kwargs)
-
-
-def docker_image_exists(tag: str) -> bool:
-    """Return True when a Docker image tag exists locally."""
-    result = subprocess.run(["docker", "image", "inspect", tag], capture_output=True, text=True, check=False)
-    return result.returncode == 0
-
-
-def build_dockerfile(dockerfile: os.PathLike[str], context_dir: os.PathLike[str], tag: str) -> None:
-    """Build a Docker image from an explicit Dockerfile + build context."""
-    cmd = ["docker", "build", "-f", str(dockerfile), "-t", tag, str(context_dir)]
-    print(f"[agentic-runtime] $ {' '.join(cmd)}")
-    subprocess.run(cmd, check=True)
-
-
-def build_task_image(task_dir: os.PathLike[str], tag: str) -> None:
-    """Build a task-specific Docker image from environment/Dockerfile."""
-    from pathlib import Path
+from __future__ import annotations
 
-    root = Path(task_dir)
-    env_dockerfile = root / "environment" / "Dockerfile"
-    if not env_dockerfile.exists():
-        raise FileNotFoundError(f"No environment/Dockerfile found in {root}")
-    build_dockerfile(env_dockerfile, env_dockerfile.parent, tag)
+from nemo_evaluator_sdk.agent_eval.runtimes.docker import (
+    build_dockerfile,
+    build_task_image,
+    docker_image_exists,
+    docker_run,
+    redact_cmd_for_logging,
+)
+
+__all__ = [
+    "build_dockerfile",
+    "build_task_image",
+    "docker_image_exists",
+    "docker_run",
+    "redact_cmd_for_logging",
+]
diff --git a/tests/agentic-use/runtimes/shared/environment.py b/tests/agentic-use/runtimes/shared/environment.py
index fe23893668..08e55ce2ed 100644
--- a/tests/agentic-use/runtimes/shared/environment.py
+++ b/tests/agentic-use/runtimes/shared/environment.py
@@ -1,125 +1,50 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Environment provider boundary for agentic-use runtimes.
+"""Compatibility shim — the environment boundary was promoted to the Evaluator SDK.
 
-This is the design-doc's ``EnvironmentProvider`` boundary (section B2): it sits
-*below* :class:`AgentAttemptRuntime` so a runtime never needs to know whether
-the agent/verifier execute under Docker, locally, Harbor, or NeMo Gym. Today the
-only implementation is :class:`DockerEnvironmentProvider`, which wraps
-``shared/docker.py``.
-
-Deviation from the doc sketch: the doc proposes ``run_agent(instruction, config)
--> AgentEvalAttempt``. We keep the boundary at "execute a command in the
-prepared environment" (returning an :class:`EnvCommandResult`) because each
-backend builds its own command/env/mounts, and attempt construction is owned by
-``shared/artifacts.py``. This keeps command-building and attempt-shaping out of
-the environment layer so new providers only implement process execution.
+The generic boundary now lives in
+``nemo_evaluator_sdk.agent_eval.runtimes.environment``. The only platform-specific
+piece kept here is the default task→image mapping (``nmp-nat-<id>:latest``): the
+adapter's :class:`DockerEnvironmentProvider` injects :func:`task_image_tag` so
+``DockerEnvironmentProvider()`` keeps producing platform-tagged images.
 """
 
 from __future__ import annotations
 
-import asyncio
-import subprocess
 from collections.abc import Callable
-from dataclasses import dataclass, field
-from typing import Protocol, runtime_checkable
 
-from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunConfig, AgentEvalTask
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import (
+    AbstractEnvironmentHandle,
+    AgentEnvironmentHandle,
+    AgentEnvironmentProvider,
+    DockerEnvironmentHandle,
+    EnvCommandResult,
+    EnvRole,
+    EnvRunSpec,
+    default_image_tag,
+)
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import (
+    DockerEnvironmentProvider as _SDKDockerEnvironmentProvider,
+)
 
-from runtimes.shared.docker import docker_run
 from runtimes.shared.layout import task_image_tag
 
-
-@dataclass(frozen=True)
-class EnvCommandResult:
-    """Outcome of running a single command inside a prepared environment."""
-
-    exit_code: int
-    timed_out: bool = False
-
-    @property
-    def ok(self) -> bool:
-        return self.exit_code == 0 and not self.timed_out
-
-
-@dataclass
-class EnvRunSpec:
-    """How to execute one command inside an environment handle."""
-
-    command: list[str]
-    env: dict[str, str] = field(default_factory=dict)
-    mounts: list[tuple[str, str]] = field(default_factory=list)
-    workdir: str | None = None
-    timeout: int | None = None
-    extra_args: list[str] = field(default_factory=list)
-
-
-@runtime_checkable
-class AgentEnvironmentHandle(Protocol):
-    """A prepared, single-task environment that can run agent/verifier commands."""
-
-    async def run_agent(self, spec: EnvRunSpec) -> EnvCommandResult: ...
-
-    async def run_verifier(self, spec: EnvRunSpec) -> EnvCommandResult: ...
-
-    async def close(self) -> None: ...
+__all__ = [
+    "AbstractEnvironmentHandle",
+    "AgentEnvironmentHandle",
+    "AgentEnvironmentProvider",
+    "DockerEnvironmentHandle",
+    "DockerEnvironmentProvider",
+    "EnvCommandResult",
+    "EnvRole",
+    "EnvRunSpec",
+    "default_image_tag",
+]
 
 
-@runtime_checkable
-class AgentEnvironmentProvider(Protocol):
-    """Creates per-task environment handles. Pluggable: Docker now, Gym later."""
-
-    async def prepare(
-        self,
-        task: AgentEvalTask,
-        config: AgentEvalRunConfig | None = None,
-    ) -> AgentEnvironmentHandle: ...
-
-
-class DockerEnvironmentHandle:
-    """Docker-backed environment handle bound to one task image."""
-
-    def __init__(self, image: str) -> None:
-        self.image = image
-
-    async def run_agent(self, spec: EnvRunSpec) -> EnvCommandResult:
-        return await self._run(spec)
-
-    async def run_verifier(self, spec: EnvRunSpec) -> EnvCommandResult:
-        return await self._run(spec)
-
-    async def _run(self, spec: EnvRunSpec) -> EnvCommandResult:
-        try:
-            result = await asyncio.to_thread(
-                docker_run,
-                self.image,
-                spec.command,
-                env=spec.env,
-                mounts=spec.mounts,
-                workdir=spec.workdir,
-                timeout=spec.timeout,
-                extra_args=spec.extra_args,
-            )
-        except subprocess.TimeoutExpired:
-            return EnvCommandResult(exit_code=124, timed_out=True)
-        return EnvCommandResult(exit_code=result.returncode)
-
-    async def close(self) -> None:
-        # `docker run --rm` cleans up the container; nothing persistent to release.
-        return None
-
-
-class DockerEnvironmentProvider:
-    """Default provider that maps each task to its built Docker image."""
+class DockerEnvironmentProvider(_SDKDockerEnvironmentProvider):
+    """Platform default: map ``task.id`` to ``nmp-nat-<id>:latest``."""
 
     def __init__(self, *, image_tag_fn: Callable[[str], str] = task_image_tag) -> None:
-        self._image_tag_fn = image_tag_fn
-
-    async def prepare(
-        self,
-        task: AgentEvalTask,
-        config: AgentEvalRunConfig | None = None,
-    ) -> DockerEnvironmentHandle:
-        del config
-        return DockerEnvironmentHandle(self._image_tag_fn(task.id))
+        super().__init__(image_tag_fn=image_tag_fn)
diff --git a/tests/agentic-use/runtimes/shared/environment_spec.py b/tests/agentic-use/runtimes/shared/environment_spec.py
index cd5630926f..9cdd3db71f 100644
--- a/tests/agentic-use/runtimes/shared/environment_spec.py
+++ b/tests/agentic-use/runtimes/shared/environment_spec.py
@@ -1,181 +1,32 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Reusable environment authoring for agentic-use tasks (design-doc B3).
+"""Compatibility shim — environment authoring was promoted to the Evaluator SDK.
 
-Moves task authoring away from an implicit "Dockerfile per task" toward a small,
-declarative ``environment.yaml`` spec, while keeping a Dockerfile escape hatch.
-
-Spec shape (``environment.yaml`` in the task dir)::
-
-    environment:
-      image: nemo-platform-agentic-base:2026.06
-      profile: evaluator-platform
-      dependencies:
-        python:
-          - pytest
-          - nemo-evaluator-sdk
-      setup:
-        - seed-providers
-        - create-workspace
-
-Escape hatch::
-
-    environment:
-      dockerfile: environment/Dockerfile
-
-Resolution is deliberately minimal: a spec is turned into a :class:`BuildPlan`
-(a Dockerfile + build context + target tag). The Dockerfile path is used as-is;
-an ``image``-based spec generates a tiny derived Dockerfile (``FROM <image>`` plus
-optional ``pip install``). ``setup`` steps are carried as plan metadata — they are
-runtime concerns (e.g. seed-providers) handled outside the image build — so this
-module does not execute them.
+Import from ``nemo_evaluator_sdk.agent_eval.runtimes.environment_spec`` directly;
+this module re-exports the same symbols so existing adapter imports keep working.
 """
 
 from __future__ import annotations
 
-from dataclasses import dataclass, field
-from pathlib import Path
-
-import yaml
-
-ENVIRONMENT_SPEC_FILENAME = "environment.yaml"
-DEFAULT_DOCKERFILE_RELPATH = "environment/Dockerfile"
-
-
-@dataclass(frozen=True)
-class EnvironmentSpec:
-    """Declarative environment for one task (or a Dockerfile escape hatch)."""
-
-    image: str | None = None
-    profile: str | None = None
-    python_dependencies: list[str] = field(default_factory=list)
-    setup: list[str] = field(default_factory=list)
-    dockerfile: Path | None = None
-
-    def __post_init__(self) -> None:
-        if self.dockerfile is None and self.image is None:
-            raise ValueError("environment spec requires either 'image' or 'dockerfile'")
-
-
-def load_environment_spec(task_dir: str | Path) -> EnvironmentSpec:
-    """Load a task's environment spec.
-
-    Resolution order:
-    1. ``environment.yaml`` in the task dir (declarative spec, preferred).
-    2. ``environment/Dockerfile`` (backward-compatible escape hatch so existing
-       tasks work without authoring a spec).
-    """
-    root = Path(task_dir)
-    spec_path = root / ENVIRONMENT_SPEC_FILENAME
-    if spec_path.is_file():
-        return _parse_spec(yaml.safe_load(spec_path.read_text(encoding="utf-8")) or {}, root)
-
-    dockerfile = root / DEFAULT_DOCKERFILE_RELPATH
-    if dockerfile.is_file():
-        return EnvironmentSpec(dockerfile=dockerfile)
-
-    raise FileNotFoundError(
-        f"No environment defined for task {root}: expected {ENVIRONMENT_SPEC_FILENAME} or {DEFAULT_DOCKERFILE_RELPATH}"
-    )
-
-
-def _parse_spec(payload: dict, task_dir: Path) -> EnvironmentSpec:
-    data = payload.get("environment", payload) if isinstance(payload, dict) else {}
-    if not isinstance(data, dict):
-        raise ValueError(f"Invalid environment spec in {task_dir}: expected a mapping")
-
-    dockerfile_value = data.get("dockerfile")
-    dockerfile = None
-    if dockerfile_value:
-        dockerfile = Path(dockerfile_value)
-        if not dockerfile.is_absolute():
-            dockerfile = (task_dir / dockerfile).resolve()
-        if not dockerfile.is_file():
-            raise FileNotFoundError(f"environment.dockerfile not found: {dockerfile}")
-
-    dependencies = data.get("dependencies") or {}
-    python_deps = dependencies.get("python") if isinstance(dependencies, dict) else None
-
-    return EnvironmentSpec(
-        image=data.get("image"),
-        profile=data.get("profile"),
-        python_dependencies=list(python_deps or []),
-        setup=list(data.get("setup") or []),
-        dockerfile=dockerfile,
-    )
-
-
-@dataclass(frozen=True)
-class BuildPlan:
-    """A resolved, executable Docker build for one task."""
-
-    image_tag: str
-    dockerfile: Path
-    context_dir: Path
-    generated: bool
-    base_image: str | None = None
-    setup: list[str] = field(default_factory=list)
-
-
-def plan_task_build(
-    task_dir: str | Path,
-    image_tag: str,
-    *,
-    spec: EnvironmentSpec | None = None,
-    generated_dir: Path | None = None,
-) -> BuildPlan:
-    """Resolve a task's environment spec into a concrete :class:`BuildPlan`.
-
-    For the Dockerfile escape hatch the existing Dockerfile/context is used. For
-    an ``image``-based spec a minimal derived Dockerfile is written under
-    ``generated_dir`` (defaults to ``<task_dir>/.agentic-build``).
-    """
-    root = Path(task_dir)
-    spec = spec or load_environment_spec(root)
-
-    if spec.dockerfile is not None:
-        return BuildPlan(
-            image_tag=image_tag,
-            dockerfile=spec.dockerfile,
-            context_dir=spec.dockerfile.parent,
-            generated=False,
-            setup=list(spec.setup),
-        )
-
-    # image-based spec: generate a tiny derived Dockerfile.
-    context_dir = generated_dir if generated_dir is not None else (root / ".agentic-build")
-    context_dir.mkdir(parents=True, exist_ok=True)
-    dockerfile = context_dir / "Dockerfile"
-    dockerfile.write_text(render_derived_dockerfile(spec), encoding="utf-8")
-    return BuildPlan(
-        image_tag=image_tag,
-        dockerfile=dockerfile,
-        context_dir=context_dir,
-        generated=True,
-        base_image=spec.image,
-        setup=list(spec.setup),
-    )
-
-
-def execute_build_plan(plan: BuildPlan) -> None:
-    """Build the Docker image described by ``plan``."""
-    from runtimes.shared.docker import build_dockerfile
-
-    build_dockerfile(plan.dockerfile, plan.context_dir, plan.image_tag)
-
-
-def render_derived_dockerfile(spec: EnvironmentSpec) -> str:
-    """Render a minimal derived Dockerfile from an image-based spec."""
-    if spec.image is None:
-        raise ValueError("cannot render a derived Dockerfile without a base image")
-    lines = [f"FROM {spec.image}"]
-    if spec.profile:
-        lines.append(f"LABEL com.nvidia.agentic.profile={spec.profile}")
-    if spec.python_dependencies:
-        deps = " ".join(spec.python_dependencies)
-        lines.append(f"RUN pip install --no-cache-dir {deps}")
-    if spec.setup:
-        # Setup steps are runtime concerns; record them for provenance only.
-        lines.append(f'LABEL com.nvidia.agentic.setup="{",".join(spec.setup)}"')
-    return "\n".join(lines) + "\n"
+from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import (
+    DEFAULT_DOCKERFILE_RELPATH,
+    ENVIRONMENT_SPEC_FILENAME,
+    BuildPlan,
+    EnvironmentSpec,
+    execute_build_plan,
+    load_environment_spec,
+    plan_task_build,
+    render_derived_dockerfile,
+)
+
+__all__ = [
+    "DEFAULT_DOCKERFILE_RELPATH",
+    "ENVIRONMENT_SPEC_FILENAME",
+    "BuildPlan",
+    "EnvironmentSpec",
+    "execute_build_plan",
+    "load_environment_spec",
+    "plan_task_build",
+    "render_derived_dockerfile",
+]
diff --git a/tests/agentic-use/runtimes/shared/layout.py b/tests/agentic-use/runtimes/shared/layout.py
index 07a7a2dd17..86a4c5f4f2 100644
--- a/tests/agentic-use/runtimes/shared/layout.py
+++ b/tests/agentic-use/runtimes/shared/layout.py
@@ -9,6 +9,7 @@
 from datetime import UTC, datetime
 from pathlib import Path
 
+from nemo_evaluator_sdk.agent_eval.runtimes.layout import prepare_run_layout, resolve_run_dir
 from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunConfig, AgentEvalTask
 
 from runtimes.shared.config import AgenticSharedConfig
@@ -16,7 +17,11 @@
 
 @dataclass(frozen=True)
 class AgenticRunLayout:
-    """Filesystem layout for one task run."""
+    """Filesystem layout for one task run.
+
+    Mirrors the SDK's generic ``RunLayout`` fields (it does not subclass it) and adds a
+    platform-specific ``state_dir`` (preserved platform/database state across agent + verifier).
+    """
 
     run_dir: Path
     agent_log_dir: Path
@@ -44,29 +49,22 @@ def resolve_run_layout(
     config: AgentEvalRunConfig | None = None,
 ) -> AgenticRunLayout:
     """Resolve or create the on-disk layout for one task attempt."""
-    if config is not None and config.output_dir is not None:
-        # Must be absolute: run_dir subpaths are used as Docker bind-mount sources,
-        # and Docker treats a relative `-v` source as a (slash-free) named volume.
-        run_dir = Path(config.output_dir).resolve()
-    else:
-        run_dir = new_run_dir(default_jobs_dir(shared), task.id)
-
-    agent_log_dir = run_dir / "agent"
-    workspace_dir = run_dir / "workspace"
-    state_dir = run_dir / "state"
-    agent_log_dir.mkdir(parents=True, exist_ok=True)
-    workspace_dir.mkdir(parents=True, exist_ok=True)
-    state_dir.mkdir(parents=True, exist_ok=True)
+    output_dir = config.output_dir if config is not None else None
+    run_dir = resolve_run_dir(output_dir, lambda: new_run_dir(default_jobs_dir(shared), task.id))
 
-    instruction_path = agent_log_dir / "instruction.md"
-    instruction_path.write_text(task.intent, encoding="utf-8")
+    # Generic agent/workspace dirs + written instruction come from the SDK helper.
+    base = prepare_run_layout(run_dir, task.intent)
+
+    # Platform extension: a preserved state dir for platform/db across phases.
+    state_dir = base.run_dir / "state"
+    state_dir.mkdir(parents=True, exist_ok=True)
 
     return AgenticRunLayout(
-        run_dir=run_dir,
-        agent_log_dir=agent_log_dir,
-        workspace_dir=workspace_dir,
+        run_dir=base.run_dir,
+        agent_log_dir=base.agent_log_dir,
+        workspace_dir=base.workspace_dir,
         state_dir=state_dir,
-        instruction_path=instruction_path,
+        instruction_path=base.instruction_path,
     )
 
 
diff --git a/tests/agentic-use/runtimes/shared/metrics.py b/tests/agentic-use/runtimes/shared/metrics.py
index 7c68a590ec..e7b8496caf 100644
--- a/tests/agentic-use/runtimes/shared/metrics.py
+++ b/tests/agentic-use/runtimes/shared/metrics.py
@@ -1,28 +1,23 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Default metrics for agentic-use agent-eval runs."""
+"""Default metrics for agentic-use agent-eval runs.
+
+``AgentPhaseSuccessMetric`` now lives in the SDK; the subclass here only pins the
+``agentic_use_agent_phase`` metric type. ``VerifierRewardMetric`` stays a platform
+compatibility shim (mirrors the legacy pytest verifier reward).
+"""
 
 from __future__ import annotations
 
+from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric as _SDKAgentPhaseSuccessMetric
 from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult
 
 
-class AgentPhaseSuccessMetric:
-    """Score 1.0 when the agent phase exited successfully, else 0.0."""
+class AgentPhaseSuccessMetric(_SDKAgentPhaseSuccessMetric):
+    """Agentic-use namespaced agent-phase metric (output stays ``agent_phase_success``)."""
 
-    @property
-    def type(self) -> str:
-        return "agentic_use_agent_phase"
-
-    def output_spec(self) -> list[MetricOutputSpec]:
-        return [MetricOutputSpec.continuous_score("agent_phase_success")]
-
-    async def compute_scores(self, input: MetricInput) -> MetricResult:
-        agent_ok = bool(input.candidate.metadata.get("agent_ok"))
-        return MetricResult(
-            outputs=[MetricOutput(name="agent_phase_success", value=1.0 if agent_ok else 0.0)],
-        )
+    metric_type = "agentic_use_agent_phase"
 
 
 class VerifierRewardMetric:
diff --git a/tests/agentic-use/runtimes/shared/reporting.py b/tests/agentic-use/runtimes/shared/reporting.py
index 34b78fbcb7..7e78de3972 100644
--- a/tests/agentic-use/runtimes/shared/reporting.py
+++ b/tests/agentic-use/runtimes/shared/reporting.py
@@ -1,457 +1,34 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Deterministic gating + provenance comparison over an agent-eval run bundle.
+"""Compatibility shim — gating was promoted to the Evaluator SDK.
 
-This closes the design-doc B4 "CI/reporting" gap. Persistence of
-``tasks.jsonl``/``attempts.jsonl``/``results.jsonl``/``summary.json``/``report.html``
-is already handled by the SDK (``agent_eval.persistence.persist_run`` /
-``write_dashboard``); this module adds the missing piece: a candidate-vs-baseline
-gate (pass-rate, token/cost, runtime tie-breaker) plus deterministic provenance
-checks.
-
-The semantics intentionally mirror ``passrate_token_policy_gate.py`` so a summary
-produced here is interchangeable with the legacy gate's baseline summary. The
-difference is the input: this operates on a typed :class:`AgentEvalRunResult`
-(metric scores + attempt metadata) instead of scanning ``result.json`` files.
+Import from ``nemo_evaluator_sdk.agent_eval.gating`` directly; this module
+re-exports the same symbols so existing adapter imports keep working.
 """
 
 from __future__ import annotations
 
-import json
-from dataclasses import asdict, dataclass, field
-from pathlib import Path
-from typing import Any
-
-from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalRunResult, AgentEvalTaskResult
-
-# Metric outputs, in priority order, that represent a task's pass/reward signal.
-DEFAULT_REWARD_OUTPUTS: tuple[str, ...] = ("verifier_reward", "agent_phase_success")
-
-# Provenance fields collapsed into a single run-level summary (matches the
-# legacy gate so baselines are interchangeable).
-_PROVENANCE_FIELDS: tuple[str, ...] = (
-    "commit_sha",
-    "commit_short",
-    "commit_dirty",
-    "branch",
-    "remote_url",
-    "agentic_base_image_digest",
-    "pinned",
-    "pinned_to_commit",
-    "pinned_image_tag",
+from nemo_evaluator_sdk.agent_eval.gating import (
+    DEFAULT_REWARD_OUTPUTS,
+    GateCheck,
+    GateReport,
+    GateThresholds,
+    evaluate_gate,
+    load_baseline_summary,
+    run_gate_checks,
+    summarize_run,
+    write_gate_report,
 )
 
-
-@dataclass(frozen=True)
-class GateThresholds:
-    """Knobs controlling the candidate gate (defaults are the strict CI policy)."""
-
-    min_pass_rate: float = 1.0
-    require_token_metrics: bool = False
-    max_pass_rate_drop: float = 0.0
-    max_token_regression_pct: float = 0.0
-    max_runtime_regression_pct: float = 0.0
-    allow_cross_commit: bool = False
-
-
-@dataclass
-class GateCheck:
-    name: str
-    passed: bool
-    details: str
-
-
-@dataclass
-class GateReport:
-    gate_passed: bool
-    summary: dict[str, Any]
-    checks: list[GateCheck] = field(default_factory=list)
-
-    def to_payload(self) -> dict[str, Any]:
-        return {
-            "gate_passed": self.gate_passed,
-            "summary": self.summary,
-            "checks": [asdict(check) for check in self.checks],
-        }
-
-
-def evaluate_gate(
-    result: AgentEvalRunResult,
-    *,
-    thresholds: GateThresholds | None = None,
-    baseline_summary: dict[str, Any] | None = None,
-    reward_outputs: tuple[str, ...] = DEFAULT_REWARD_OUTPUTS,
-) -> GateReport:
-    """Summarize a run and apply gate checks, optionally against a baseline."""
-    thresholds = thresholds or GateThresholds()
-    summary = summarize_run(result, reward_outputs=reward_outputs)
-    checks = run_gate_checks(summary, thresholds=thresholds, baseline_summary=baseline_summary)
-    return GateReport(gate_passed=all(check.passed for check in checks), summary=summary, checks=checks)
-
-
-def write_gate_report(report: GateReport, output_dir: str | Path, *, filename: str = "gate.json") -> Path:
-    """Persist the gate report alongside the SDK run bundle."""
-    path = Path(output_dir)
-    path.mkdir(parents=True, exist_ok=True)
-    gate_path = path / filename
-    gate_path.write_text(json.dumps(report.to_payload(), indent=2, sort_keys=True) + "\n", encoding="utf-8")
-    return gate_path
-
-
-def load_baseline_summary(path: str | Path) -> dict[str, Any]:
-    """Load + normalize a baseline summary (raw summary or a prior gate.json)."""
-    source = Path(path)
-    payload = json.loads(source.read_text(encoding="utf-8"))
-    if not isinstance(payload, dict):
-        raise ValueError(f"Baseline summary must be a JSON object: {source}")
-    summary = payload.get("summary") if isinstance(payload.get("summary"), dict) else payload
-    _validate_baseline_summary(summary, source)
-    return summary
-
-
-def summarize_run(
-    result: AgentEvalRunResult,
-    *,
-    reward_outputs: tuple[str, ...] = DEFAULT_REWARD_OUTPUTS,
-) -> dict[str, Any]:
-    """Aggregate pass-rate, token, runtime, and provenance for one run."""
-    attempts_by_task: dict[str, AgentEvalAttempt] = {attempt.task_id: attempt for attempt in result.attempts}
-    reward_by_task = _rewards_by_task(result.results, reward_outputs)
-    task_ids = sorted({task.id for task in result.tasks} | set(attempts_by_task))
-
-    passed = 0
-    token_sum = 0
-    token_count = 0
-    token_unavailable: list[str] = []
-    runtime_sum = 0.0
-    runtime_count = 0
-    runtime_unavailable: list[str] = []
-    provenance_inputs: list[dict[str, Any]] = []
-
-    for task_id in task_ids:
-        attempt = attempts_by_task.get(task_id)
-        metadata = attempt.metadata if attempt is not None else {}
-
-        reward_value = _task_reward(task_id, reward_by_task, metadata)
-        if reward_value >= 1.0:
-            passed += 1
-
-        total_tokens = metadata.get("total_tokens")
-        if isinstance(total_tokens, int):
-            token_sum += total_tokens
-            token_count += 1
-        else:
-            token_unavailable.append(task_id)
-
-        runtime_sec = _task_runtime_sec(metadata)
-        if runtime_sec is not None:
-            runtime_sum += runtime_sec
-            runtime_count += 1
-        else:
-            runtime_unavailable.append(task_id)
-
-        prov = metadata.get("provenance")
-        if isinstance(prov, dict):
-            provenance_inputs.append(prov)
-
-    total = len(task_ids)
-    return {
-        "run_id": result.run_id,
-        "benchmark": result.benchmark,
-        "total_tasks": total,
-        "passed_tasks": passed,
-        "pass_rate": (passed / total) if total else 0.0,
-        "task_names": task_ids,
-        "total_tokens_sum": token_sum if token_count else None,
-        "avg_total_tokens": (token_sum / token_count) if token_count else None,
-        "token_metrics_coverage": (token_count / total) if total else 0.0,
-        "token_metrics_available_tasks": token_count,
-        "token_metrics_unavailable_tasks": sorted(token_unavailable),
-        "runtime_sec_sum": runtime_sum if runtime_count else None,
-        "avg_runtime_sec": (runtime_sum / runtime_count) if runtime_count else None,
-        "runtime_metrics_coverage": (runtime_count / total) if total else 0.0,
-        "runtime_metrics_available_tasks": runtime_count,
-        "runtime_metrics_unavailable_tasks": sorted(runtime_unavailable),
-        "provenance": _aggregate_provenance(provenance_inputs),
-    }
-
-
-def run_gate_checks(
-    summary: dict[str, Any],
-    *,
-    thresholds: GateThresholds,
-    baseline_summary: dict[str, Any] | None = None,
-) -> list[GateCheck]:
-    """Apply absolute + relative (vs baseline) gate checks to a summary."""
-    checks: list[GateCheck] = []
-    total_tasks = int(summary["total_tasks"])
-    pass_rate = float(summary["pass_rate"])
-    provenance = summary.get("provenance") or {}
-
-    checks.append(GateCheck("non_empty_result_set", total_tasks > 0, f"total_tasks={total_tasks}"))
-    checks.append(
-        GateCheck(
-            "min_pass_rate",
-            pass_rate >= thresholds.min_pass_rate,
-            f"pass_rate={pass_rate:.3f}, min_pass_rate={thresholds.min_pass_rate:.3f}",
-        )
-    )
-    checks.append(_commit_consistency_check(provenance))
-
-    if thresholds.require_token_metrics:
-        token_coverage = float(summary["token_metrics_coverage"])
-        runtime_coverage = float(summary["runtime_metrics_coverage"])
-        checks.append(
-            GateCheck(
-                "token_metrics_available_for_all_tasks",
-                token_coverage == 1.0,
-                f"token_metrics_coverage={token_coverage:.3f}",
-            )
-        )
-        checks.append(
-            GateCheck(
-                "runtime_metrics_available_for_all_tasks",
-                runtime_coverage == 1.0,
-                f"runtime_metrics_coverage={runtime_coverage:.3f}",
-            )
-        )
-
-    if baseline_summary is not None:
-        checks.extend(_baseline_checks(summary, baseline_summary, thresholds))
-
-    return checks
-
-
-def _baseline_checks(
-    summary: dict[str, Any],
-    baseline_summary: dict[str, Any],
-    thresholds: GateThresholds,
-) -> list[GateCheck]:
-    checks: list[GateCheck] = []
-    pass_rate = float(summary["pass_rate"])
-    total_tokens_sum = summary["total_tokens_sum"]
-    runtime_sec_sum = summary["runtime_sec_sum"]
-    provenance = summary.get("provenance") or {}
-
-    # Regression checks only make sense when both runs measured the same tasks.
-    baseline_tasks = baseline_summary.get("task_names")
-    candidate_tasks = summary.get("task_names")
-    task_sets_comparable = True
-    if isinstance(baseline_tasks, list) and isinstance(candidate_tasks, list):
-        comparable = sorted(baseline_tasks) == sorted(candidate_tasks)
-        task_sets_comparable = comparable
-        checks.append(
-            GateCheck(
-                "baseline_candidate_task_sets_match",
-                comparable,
-                (
-                    f"both runs measured {len(candidate_tasks)} tasks"
-                    if comparable
-                    else f"baseline={sorted(baseline_tasks)} candidate={sorted(candidate_tasks)}; "
-                    "regression checks short-circuited"
-                ),
-            )
-        )
-    else:
-        checks.append(
-            GateCheck(
-                "baseline_candidate_task_sets_match",
-                True,
-                "task_names not present on baseline and/or candidate; skipping equality guard",
-            )
-        )
-
-    checks.append(_cross_commit_check(provenance, baseline_summary, thresholds.allow_cross_commit))
-
-    if not task_sets_comparable:
-        return checks
-
-    baseline_pass_rate = float(baseline_summary.get("pass_rate", 0.0))
-    checks.append(
-        GateCheck(
-            "no_pass_rate_regression_vs_baseline",
-            pass_rate >= baseline_pass_rate - thresholds.max_pass_rate_drop,
-            f"pass_rate={pass_rate:.3f}, baseline={baseline_pass_rate:.3f}, max_drop={thresholds.max_pass_rate_drop:.3f}",
-        )
-    )
-
-    baseline_tokens = baseline_summary.get("total_tokens_sum")
-    if isinstance(total_tokens_sum, int) and isinstance(baseline_tokens, int):
-        max_allowed = baseline_tokens * (1.0 + thresholds.max_token_regression_pct / 100.0)
-        checks.append(
-            GateCheck(
-                "tokens_not_worse_than_baseline",
-                total_tokens_sum <= max_allowed,
-                f"total_tokens_sum={total_tokens_sum}, baseline={baseline_tokens}, "
-                f"max_regression_pct={thresholds.max_token_regression_pct:.2f}",
-            )
-        )
-    else:
-        checks.append(
-            GateCheck(
-                "tokens_not_worse_than_baseline",
-                False,
-                "Missing token totals for candidate or baseline; cannot run deterministic token comparison.",
-            )
-        )
-
-    # Runtime is only a tie-breaker when token totals match exactly.
-    baseline_runtime = baseline_summary.get("runtime_sec_sum")
-    tokens_tied = (
-        isinstance(total_tokens_sum, int) and isinstance(baseline_tokens, int) and total_tokens_sum == baseline_tokens
-    )
-    if not tokens_tied:
-        checks.append(
-            GateCheck(
-                "runtime_tie_breaker_not_worse_than_baseline",
-                True,
-                "Not applicable (token totals differ from baseline).",
-            )
-        )
-    elif isinstance(runtime_sec_sum, int | float) and isinstance(baseline_runtime, int | float):
-        max_allowed_runtime = float(baseline_runtime) * (1.0 + thresholds.max_runtime_regression_pct / 100.0)
-        checks.append(
-            GateCheck(
-                "runtime_tie_breaker_not_worse_than_baseline",
-                float(runtime_sec_sum) <= max_allowed_runtime,
-                f"runtime_sec_sum={float(runtime_sec_sum):.3f}, baseline={float(baseline_runtime):.3f}, "
-                f"max_regression_pct={thresholds.max_runtime_regression_pct:.2f}",
-            )
-        )
-    else:
-        checks.append(
-            GateCheck(
-                "runtime_tie_breaker_not_worse_than_baseline",
-                False,
-                "Token totals tied with baseline but runtime totals missing; cannot run tie-breaker.",
-            )
-        )
-
-    return checks
-
-
-def _commit_consistency_check(provenance: dict[str, Any]) -> GateCheck:
-    commit_observed = provenance.get("commit_sha_observed")
-    if isinstance(commit_observed, list) and len(commit_observed) > 1:
-        return GateCheck(
-            "commit_sha_consistent_within_run",
-            False,
-            f"Multiple commit_sha values observed across tasks: {commit_observed}. Re-run from a single commit.",
-        )
-    commit_sha = provenance.get("commit_sha")
-    if commit_sha:
-        return GateCheck(
-            "commit_sha_consistent_within_run",
-            True,
-            f"commit={provenance.get('commit_short') or commit_sha[:12]}, branch={provenance.get('branch') or 'detached'}",
-        )
-    return GateCheck(
-        "commit_sha_consistent_within_run",
-        True,
-        "provenance not recorded (legacy artifacts); skipping commit consistency check.",
-    )
-
-
-def _cross_commit_check(
-    provenance: dict[str, Any],
-    baseline_summary: dict[str, Any],
-    allow_cross_commit: bool,
-) -> GateCheck:
-    baseline_commit = (baseline_summary.get("provenance") or {}).get("commit_sha")
-    candidate_commit = provenance.get("commit_sha")
-    if not (baseline_commit and candidate_commit):
-        return GateCheck(
-            "commit_sha_matches_baseline",
-            True,
-            "commit_sha not present on baseline and/or candidate; skipping cross-commit guard.",
-        )
-    commits_match = baseline_commit == candidate_commit
-    if commits_match:
-        detail = f"both runs at commit={baseline_commit[:12]}"
-    elif allow_cross_commit:
-        detail = (
-            f"baseline={baseline_commit[:12]} != candidate={candidate_commit[:12]}; "
-            "comparison allowed by allow_cross_commit (numbers may not be apples-to-apples)."
-        )
-    else:
-        detail = (
-            f"baseline={baseline_commit[:12]} != candidate={candidate_commit[:12]}. "
-            "Re-run candidate at the baseline commit, or set allow_cross_commit."
-        )
-    return GateCheck("commit_sha_matches_baseline", commits_match or allow_cross_commit, detail)
-
-
-def _rewards_by_task(results: list[AgentEvalTaskResult], reward_outputs: tuple[str, ...]) -> dict[str, float]:
-    rewards: dict[str, float] = {}
-    for task_result in results:
-        for output_name in reward_outputs:
-            value = _numeric_output(task_result, output_name)
-            if value is not None:
-                # Highest-priority output wins; don't overwrite with later metrics.
-                rewards.setdefault(task_result.task_id, value)
-                break
-    return rewards
-
-
-def _numeric_output(task_result: AgentEvalTaskResult, name: str) -> float | None:
-    for output in task_result.outputs:
-        if output.name == name:
-            try:
-                return float(output.value)
-            except (TypeError, ValueError):
-                return None
-    return None
-
-
-def _task_reward(task_id: str, reward_by_task: dict[str, float], metadata: dict[str, Any]) -> float:
-    if task_id in reward_by_task:
-        return reward_by_task[task_id]
-    reward = metadata.get("reward")
-    if reward is not None:
-        try:
-            return float(reward)
-        except (TypeError, ValueError):
-            return 0.0
-    return 1.0 if metadata.get("passed") is True else 0.0
-
-
-def _task_runtime_sec(metadata: dict[str, Any]) -> float | None:
-    runtime_sec = metadata.get("runtime_sec")
-    if isinstance(runtime_sec, int | float):
-        return float(runtime_sec)
-    duration_ms = metadata.get("duration_ms")
-    if isinstance(duration_ms, int | float):
-        return float(duration_ms) / 1000.0
-    return None
-
-
-def _aggregate_provenance(provenances: list[dict[str, Any]]) -> dict[str, Any]:
-    observed: dict[str, set[Any]] = {field_name: set() for field_name in _PROVENANCE_FIELDS}
-    for prov in provenances:
-        for field_name in _PROVENANCE_FIELDS:
-            value = prov.get(field_name)
-            if value is not None:
-                observed[field_name].add(value)
-
-    aggregated: dict[str, Any] = {"available": bool(provenances)}
-    for field_name in _PROVENANCE_FIELDS:
-        values = observed[field_name]
-        if len(values) == 1:
-            aggregated[field_name] = next(iter(values))
-        else:
-            aggregated[field_name] = None
-            if len(values) > 1:
-                aggregated[f"{field_name}_observed"] = sorted(map(str, values))
-    return aggregated
-
-
-def _validate_baseline_summary(summary: dict[str, Any], source: Path) -> None:
-    missing = [key for key in ("pass_rate", "total_tokens_sum", "runtime_sec_sum") if key not in summary]
-    if missing:
-        raise ValueError(
-            f"Baseline summary {source} is missing required key(s): {', '.join(missing)}. "
-            "Expected a raw summary object or a gate.json with a `summary`."
-        )
-    if not isinstance(summary.get("pass_rate"), int | float):
-        raise ValueError(f"Baseline summary {source} has invalid `pass_rate`; expected a number.")
+__all__ = [
+    "DEFAULT_REWARD_OUTPUTS",
+    "GateCheck",
+    "GateReport",
+    "GateThresholds",
+    "evaluate_gate",
+    "load_baseline_summary",
+    "run_gate_checks",
+    "summarize_run",
+    "write_gate_report",
+]
diff --git a/tests/agentic-use/runtimes/shared/result_adapter.py b/tests/agentic-use/runtimes/shared/result_adapter.py
index e8162f9ded..bb0d3fe567 100644
--- a/tests/agentic-use/runtimes/shared/result_adapter.py
+++ b/tests/agentic-use/runtimes/shared/result_adapter.py
@@ -39,6 +39,17 @@
 )
 
 
+class ResultDirAttemptSource:
+    """``AgentAttemptSource`` adapting ``nat_runner`` ``result.json`` dirs into attempts.
+
+    Implements the SDK :class:`~nemo_evaluator_sdk.agent_eval.types.AgentAttemptSource`
+    protocol so the generic orchestrator's offline path can rescore captured runs.
+    """
+
+    def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt:
+        return attempt_from_result_dir(source, task=task)
+
+
 def attempt_from_result_dir(output_dir: str | Path, *, task: AgentEvalTask | None = None) -> AgentEvalAttempt:
     """Load ``<output_dir>/result.json`` and build an attempt from it."""
     output_dir = Path(output_dir)
diff --git a/tests/agentic-use/runtimes/shared/verify.py b/tests/agentic-use/runtimes/shared/verify.py
index 8be53924b9..f83da8edb3 100644
--- a/tests/agentic-use/runtimes/shared/verify.py
+++ b/tests/agentic-use/runtimes/shared/verify.py
@@ -13,9 +13,14 @@
 from __future__ import annotations
 
 import textwrap
-from dataclasses import dataclass
 from pathlib import Path
-from typing import Any
+
+from nemo_evaluator_sdk.agent_eval.runtimes.verify import (
+    VerifierOutcome,
+    apply_verify_to_metadata,
+    collect_verifier_outcome,
+    skipped_outcome,
+)
 
 from runtimes.shared.constants import (
     DOCKER_SOCKET_CONTAINER_PATH,
@@ -28,17 +33,14 @@
 from runtimes.shared.environment import AgentEnvironmentHandle, EnvRunSpec
 from runtimes.shared.layout import AgenticRunLayout
 
-
-@dataclass(frozen=True)
-class VerifierOutcome:
-    """Result of the live verifier phase for one task."""
-
-    ran: bool
-    passed: bool
-    reward: int
-    exit_code: int
-    stdout: str
-    verifier_log_dir: Path | None
+__all__ = [
+    "VerifierOutcome",
+    "apply_verify_to_metadata",
+    "build_verify_run_spec",
+    "maybe_run_verify",
+    "run_verify",
+    "verifier_log_dir",
+]
 
 
 def verifier_log_dir(layout: AgenticRunLayout) -> Path:
@@ -141,28 +143,10 @@ async def run_verify(
 ) -> VerifierOutcome:
     """Execute the verifier through the environment handle and collect reward."""
     result = await handle.run_verifier(spec)
-    log_dir = verifier_log_dir(layout)
-    passed = result.ok
-
-    stdout = ""
-    stdout_path = log_dir / "test-stdout.txt"
-    if stdout_path.is_file():
-        stdout = stdout_path.read_text(encoding="utf-8", errors="replace")
-
-    reward_path = log_dir / "reward.txt"
-    if reward_path.is_file():
-        reward = 1 if reward_path.read_text(encoding="utf-8").strip() == "1" else 0
-    else:
-        reward = 1 if passed else 0
-        reward_path.write_text("1\n" if passed else "0\n", encoding="utf-8")
-
-    return VerifierOutcome(
-        ran=True,
-        passed=passed,
-        reward=reward,
+    return collect_verifier_outcome(
+        ok=result.ok,
         exit_code=result.exit_code,
-        stdout=stdout,
-        verifier_log_dir=log_dir,
+        log_dir=verifier_log_dir(layout),
     )
 
 
@@ -181,7 +165,7 @@ async def maybe_run_verify(
 ) -> VerifierOutcome:
     """Run the verifier through ``handle`` when enabled and a verifier exists."""
     if not enabled:
-        return VerifierOutcome(ran=False, passed=False, reward=0, exit_code=0, stdout="", verifier_log_dir=None)
+        return skipped_outcome()
     spec = build_verify_run_spec(
         task_dir,
         layout,
@@ -193,16 +177,5 @@ async def maybe_run_verify(
         extra_args=extra_args,
     )
     if spec is None:
-        return VerifierOutcome(ran=False, passed=False, reward=0, exit_code=0, stdout="", verifier_log_dir=None)
+        return skipped_outcome()
     return await run_verify(handle, spec, layout)
-
-
-def apply_verify_to_metadata(metadata: dict[str, Any], outcome: VerifierOutcome) -> None:
-    """Stamp verifier reward/status onto attempt metadata for scoring + gating."""
-    if not outcome.ran:
-        metadata.setdefault("verify_status", "skipped")
-        return
-    metadata["verify_status"] = "ok" if outcome.passed else "failed"
-    metadata["passed"] = outcome.passed
-    metadata["reward"] = outcome.reward
-    metadata["verifier_log_dir"] = str(outcome.verifier_log_dir) if outcome.verifier_log_dir else None
diff --git a/tests/agentic-use/tests/test_agentic_runtimes.py b/tests/agentic-use/tests/test_agentic_runtimes.py
index 935ddf7389..5a3590071d 100644
--- a/tests/agentic-use/tests/test_agentic_runtimes.py
+++ b/tests/agentic-use/tests/test_agentic_runtimes.py
@@ -303,6 +303,8 @@ def _make_run_result(*, reward: float, total_tokens: int, runtime_sec: float, co
         },
     )
     task_result = AgentEvalTaskResult(
+        id="demo:workflow:agentic_use_verifier_reward",
+        run_id="run-1",
         task_id="demo",
         attempt_id="demo:workflow",
         metric_type="agentic_use_verifier_reward",

From afb7dc87ea27f1d704307869c6306c8aaa4e49ae Mon Sep 17 00:00:00 2001
From: "Arpit Singh (SW-CLOUD)" <arpsingh@nvidia.com>
Date: Tue, 9 Jun 2026 23:16:14 -0700
Subject: [PATCH 3/3] refactor(agentic-use): drop SDK re-export shims for
 direct imports

Remove the compatibility shims under tests/agentic-use/runtimes/shared that
re-exported promoted agent_eval SDK symbols, and import those generics directly
from nemo_evaluator_sdk.agent_eval (docker, environment, environment_spec,
gating, verify) at their use sites.

Consolidate the remaining NeMo-Platform-only glue into a single module,
shared/platform.py: the run layout with the platform state_dir, task_image_tag
+ platform DockerEnvironmentProvider, the namespaced AgentPhaseSuccessMetric +
VerifierRewardMetric, agent-log/usage parsing and the shared container env,
attempt construction (live + result.json/ResultDirAttemptSource), the live
VERIFY phase, and the agentic-use task loader. shared/ now holds only
platform.py, config.py, and constants.py.

Update orchestrator/workflow/aut runtimes, the package __init__ re-exports, the
runtime tests, and README/COMPLIANCE docs accordingly. 107 tests pass; ruff
clean.

Signed-off-by: Arpit Singh (SW-CLOUD) <arpsingh@nvidia.com>
---
 tests/agentic-use/runtimes/COMPLIANCE.md      |  58 +-
 tests/agentic-use/runtimes/README.md          |  55 +-
 tests/agentic-use/runtimes/__init__.py        |  46 +-
 tests/agentic-use/runtimes/aut/runtime.py     |  19 +-
 tests/agentic-use/runtimes/orchestrator.py    |  16 +-
 .../agentic-use/runtimes/shared/agent_log.py  |  40 -
 .../agentic-use/runtimes/shared/artifacts.py  | 187 -----
 .../runtimes/shared/container_env.py          |  42 -
 tests/agentic-use/runtimes/shared/docker.py   |  26 -
 .../runtimes/shared/environment.py            |  50 --
 .../runtimes/shared/environment_spec.py       |  32 -
 tests/agentic-use/runtimes/shared/layout.py   |  72 --
 tests/agentic-use/runtimes/shared/metrics.py  |  46 -
 tests/agentic-use/runtimes/shared/platform.py | 791 ++++++++++++++++++
 .../agentic-use/runtimes/shared/reporting.py  |  34 -
 .../runtimes/shared/result_adapter.py         | 156 ----
 .../runtimes/shared/task_loader.py            |  80 --
 tests/agentic-use/runtimes/shared/usage.py    |  32 -
 tests/agentic-use/runtimes/shared/verify.py   | 181 ----
 .../agentic-use/runtimes/workflow/runtime.py  |  17 +-
 .../tests/test_agentic_runtimes.py            |  43 +-
 21 files changed, 924 insertions(+), 1099 deletions(-)
 delete mode 100644 tests/agentic-use/runtimes/shared/agent_log.py
 delete mode 100644 tests/agentic-use/runtimes/shared/artifacts.py
 delete mode 100644 tests/agentic-use/runtimes/shared/container_env.py
 delete mode 100644 tests/agentic-use/runtimes/shared/docker.py
 delete mode 100644 tests/agentic-use/runtimes/shared/environment.py
 delete mode 100644 tests/agentic-use/runtimes/shared/environment_spec.py
 delete mode 100644 tests/agentic-use/runtimes/shared/layout.py
 delete mode 100644 tests/agentic-use/runtimes/shared/metrics.py
 create mode 100644 tests/agentic-use/runtimes/shared/platform.py
 delete mode 100644 tests/agentic-use/runtimes/shared/reporting.py
 delete mode 100644 tests/agentic-use/runtimes/shared/result_adapter.py
 delete mode 100644 tests/agentic-use/runtimes/shared/task_loader.py
 delete mode 100644 tests/agentic-use/runtimes/shared/usage.py
 delete mode 100644 tests/agentic-use/runtimes/shared/verify.py

diff --git a/tests/agentic-use/runtimes/COMPLIANCE.md b/tests/agentic-use/runtimes/COMPLIANCE.md
index 3631361cad..526aa7b0e7 100644
--- a/tests/agentic-use/runtimes/COMPLIANCE.md
+++ b/tests/agentic-use/runtimes/COMPLIANCE.md
@@ -16,23 +16,25 @@ helpers (`attempts`), generic layout (`runtimes.layout`), reusable metrics
 (`common_metrics`: `AgentPhaseSuccessMetric` + a real metric-over-evidence
 `EvidencePresenceMetric`), the generic orchestrator (`orchestrator`), the
 `AgentAttemptSource` protocol, the verifier mechanic (`runtimes.verify`), and the
-coding-agent driver seam (`runtimes.coding_agent`). The `shared/*` modules
-referenced below are now **re-export shims** over those SDK homes (see
-`README.md` for the shim→SDK table); only NeMo-Platform specifics
-(`task_loader`, `result_adapter`, `config`, the pytest verifier command, the
-`state` evidence key, `task_image_tag`) remain platform code. A CI grep gate
-(`packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py`) keeps
-`agent_eval/` free of NeMo-Platform imports.
+coding-agent driver seam (`runtimes.coding_agent`). Those SDK homes are imported
+**directly** by the runtime scripts — there are no re-export shims. The only
+NeMo-Platform specifics that remain (the agentic task loader, `result.json`
+import, attempt construction, the pytest verifier command, the `state` evidence
+key, `task_image_tag` + platform `DockerEnvironmentProvider`, the
+`VerifierRewardMetric`) are consolidated into a single module,
+`shared/platform.py` (alongside `shared/config.py` and `shared/constants.py`).
+A CI grep gate (`packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py`)
+keeps `agent_eval/` free of NeMo-Platform imports.
 
 ## Scope split (per SDK design)
 
 | `nat_runner` responsibility | Belongs in `AgentAttemptRuntime`? | Current location |
 |----------------------------|-----------------------------------|------------------|
 | AGENT phase — run backend in Docker, capture logs/trajectory | **Yes** | `runtimes/<backend>/runtime.py` |
-| BUILD — task image | **No** | `AgenticEvalOrchestrator` via `shared/environment_spec.py` (env spec / Dockerfile) + `shared/docker.py` |
-| VERIFY — pytest `test_outputs.py`, `reward.txt` | **Through env boundary** | `shared/verify.py` via `AgentEnvironmentHandle.run_verifier` (runtimes call it after the agent when `shared.run_verify=True`) |
+| BUILD — task image | **No** | `AgenticEvalOrchestrator` via `agent_eval.runtimes.environment_spec` (env spec / Dockerfile) + `agent_eval.runtimes.docker` |
+| VERIFY — pytest `test_outputs.py`, `reward.txt` | **Through env boundary** | `shared/platform.py` via `AgentEnvironmentHandle.run_verifier` (runtimes call it after the agent when `shared.run_verify=True`) |
 | CLI — task globs, manifests, summaries | **No** | Still `nat_runner.main` (not migrated) |
-| `result.json` contract | **No** (still produced by `nat_runner`) | Importable as an attempt via `shared/result_adapter.py`; scored offline via `AgenticEvalOrchestrator.score_captured_attempts` |
+| `result.json` contract | **No** (still produced by `nat_runner`) | Importable as an attempt via `shared/platform.py`; scored offline via `AgenticEvalOrchestrator.score_captured_attempts` |
 
 ## Task metrics (authored on the task)
 
@@ -63,14 +65,14 @@ metric scoring row.
 | `_prepare_workflow_for_runtime` | `workflow/prep.py` |
 | `_build_aut_agent_cmd` | `aut/command.py` |
 | `_prepare_aut_config_for_runtime` | `aut/prep.py` |
-| `_agent_log_has_workflow_error` | `shared/agent_log.py` |
-| `run_verify_phase` | `shared/verify.py` (`build_verify_run_spec` + `run_verify` via `run_verifier`) |
-| `_docker_run`, `build_task_image` | `shared/docker.py` (`docker_run`, `build_dockerfile`, `build_task_image`) |
-| BUILD env resolution (`environment/Dockerfile`) | `shared/environment_spec.py` (`load_environment_spec`, `plan_task_build`) |
-| `_write_result` (`result.json`) | `shared/result_adapter.py` (import side only; `nat_runner` still writes it) |
-| pass-rate / token / runtime gate | `shared/reporting.py` (mirrors `passrate_token_policy_gate.py`) |
-| `_extract_usage_metrics` | `shared/usage.py` (delegates to `nat_runner` until deduped) |
-| `capture_agent_attempt` shape | `shared/artifacts.py` |
+| `_agent_log_has_workflow_error` | `shared/platform.py` |
+| `run_verify_phase` | `shared/platform.py` (`build_verify_run_spec` + `run_verify` via `run_verifier`) |
+| `_docker_run`, `build_task_image` | `agent_eval.runtimes.docker` (`docker_run`, `build_dockerfile`, `build_task_image`) |
+| BUILD env resolution (`environment/Dockerfile`) | `agent_eval.runtimes.environment_spec` (`load_environment_spec`, `plan_task_build`) |
+| `_write_result` (`result.json`) | `shared/platform.py` (import side only; `nat_runner` still writes it) |
+| pass-rate / token / runtime gate | `agent_eval.gating` (mirrors `passrate_token_policy_gate.py`) |
+| `_extract_usage_metrics` | `shared/platform.py` (delegates to `nat_runner` until deduped) |
+| `capture_agent_attempt` shape | `shared/platform.py` |
 | `run_agent_phase` | **Removed per backend** once all backends migrated |
 
 ## Attempt record contract
@@ -83,14 +85,14 @@ includes canonical `CapturedAgentAttempt` fields:
 - Artifact paths: `agent_log_dir`, `workspace_dir`, `state_dir`, `atif_trajectory_path`
 - Phase outcome: `agent_ok`
 - Verifier outcome (when `run_verify=True`): `verify_status`, `passed`, `reward`,
-  `verifier_log_dir` (stamped by `shared/verify.py::apply_verify_to_metadata`)
+  `verifier_log_dir` (stamped by `apply_verify_to_metadata` from `agent_eval.runtimes.verify`)
 
 Use `to_captured_agent_attempt(task, attempt)` for verify/scoring code that
 expects the portable `CapturedAgentAttempt` type.
 
 ## `nat_runner` artifact → `AgentEvalAttempt` evidence map (per design doc)
 
-`shared/artifacts.py::_evidence_descriptors` emits the documented keys:
+`shared/platform.py::_evidence_descriptors` emits the documented keys:
 
 | `nat_runner` output | `AgentEvalAttempt` mapping | Status |
 |---------------------|----------------------------|--------|
@@ -100,7 +102,7 @@ expects the portable `CapturedAgentAttempt` type.
 | `agent/trajectory.json` | `evidence["trace"]` (ATIF when normalized, else json) | Implemented |
 | `agent/` logs | `evidence["logs"]` (dir, `primary_log=nat_agent.log`) | Implemented |
 | `verifier/` logs | `evidence["verifier_logs"]` (added once verify phase runs) | Implemented (conditional) |
-| `result.json` | attempt status + measurements + provenance + token/cost | Implemented — `shared/result_adapter.py::attempt_from_result` / `attempt_from_result_dir` |
+| `result.json` | attempt status + measurements + provenance + token/cost | Implemented — `shared/platform.py::attempt_from_result` / `attempt_from_result_dir` |
 | final agent log/message | `AgentOutput.text` | Implemented |
 
 `result.json` mapping detail (`attempt_from_result`):
@@ -110,7 +112,7 @@ expects the portable `CapturedAgentAttempt` type.
   attempt-production failures because the SDK's `AgentEvaluator` excludes
   `status=="failed"` from scoring (it raises); an agent that ran but failed must
   stay scorable so pass-rate gating counts it as a `0`. The live builder
-  (`shared/artifacts.py`) and this importer share the same helper.
+  (`shared/platform.py`) and this importer share the same helper.
 - `result["reward"]`/`result["passed"]` → `metadata` measurements (verifier reward
   stays a *measurement*, scored by `VerifierRewardMetric`, not the attempt status).
 - `result["metrics"]` (token/cost) → flattened into `metadata`.
@@ -120,10 +122,10 @@ expects the portable `CapturedAgentAttempt` type.
 
 | Doc section | Status in this package |
 |-------------|------------------------|
-| **B1** wrap `nat_runner` as attempt runtime(s) | In progress — AGENT phase extracted to per-backend runtimes (`workflow`, `aut` done; 3 CLI backends scaffolded); live VERIFY wired through the B2 boundary; `result.json` import path added via `shared/result_adapter.py`, exposed as the first-class **stored-attempt scoring** path via `AgenticEvalOrchestrator.score_captured_attempts` (and `run_agent_eval.py --rescore-dir`) — no Docker/agent execution. Remaining: 3 CLI backends + converging `nat_runner.main` onto the orchestrator. Note: doc proposes one `NatRunnerAttemptRuntime`; we deliberately split per backend per user direction. |
-| **B2** `EnvironmentProvider` boundary | **Implemented** — `shared/environment.py` defines `AgentEnvironmentProvider`/`AgentEnvironmentHandle` below `AgentAttemptRuntime`; `DockerEnvironmentProvider` wraps `shared/docker.py`. `workflow` + `aut` runtimes execute through the boundary (provider is injectable). NeMo Gym/local providers can now be added without touching runtimes. |
-| **B3** standardize environment authoring | **Implemented (minimal)** — `shared/environment_spec.py` adds a declarative `environment.yaml` (`image` + `profile` + python `dependencies` + `setup`) with a `dockerfile:` escape hatch and backward-compatible auto-detection of `environment/Dockerfile`. `plan_task_build` resolves a spec to a `BuildPlan` (image-based specs generate a tiny derived Dockerfile); the orchestrator BUILD step uses it. `setup` steps are carried as plan/label metadata, not executed (runtime concern). |
-| **B4** productize results + CI | **Implemented** — SDK `persist_run` writes `tasks/attempts/results.jsonl`, `summary.json`, `report.html`; `shared/reporting.py` adds candidate-vs-baseline gating (pass-rate, token/cost, runtime tie-breaker) + deterministic provenance checks, persisted as `gate.json` by the orchestrator. `result.json` → attempt adapter + `VerifierRewardMetric` compatibility metric also done. |
+| **B1** wrap `nat_runner` as attempt runtime(s) | In progress — AGENT phase extracted to per-backend runtimes (`workflow`, `aut` done; 3 CLI backends scaffolded); live VERIFY wired through the B2 boundary; `result.json` import path added via `shared/platform.py`, exposed as the first-class **stored-attempt scoring** path via `AgenticEvalOrchestrator.score_captured_attempts` (and `run_agent_eval.py --rescore-dir`) — no Docker/agent execution. Remaining: 3 CLI backends + converging `nat_runner.main` onto the orchestrator. Note: doc proposes one `NatRunnerAttemptRuntime`; we deliberately split per backend per user direction. |
+| **B2** `EnvironmentProvider` boundary | **Implemented** — `agent_eval.runtimes.environment` defines `AgentEnvironmentProvider`/`AgentEnvironmentHandle` below `AgentAttemptRuntime`; the platform `DockerEnvironmentProvider` (`shared/platform.py`) wraps `agent_eval.runtimes.docker` with the `nmp-nat-<id>` image tag. `workflow` + `aut` runtimes execute through the boundary (provider is injectable). NeMo Gym/local providers can now be added without touching runtimes. |
+| **B3** standardize environment authoring | **Implemented (minimal)** — `agent_eval.runtimes.environment_spec` adds a declarative `environment.yaml` (`image` + `profile` + python `dependencies` + `setup`) with a `dockerfile:` escape hatch and backward-compatible auto-detection of `environment/Dockerfile`. `plan_task_build` resolves a spec to a `BuildPlan` (image-based specs generate a tiny derived Dockerfile); the orchestrator BUILD step uses it. `setup` steps are carried as plan/label metadata, not executed (runtime concern). |
+| **B4** productize results + CI | **Implemented** — SDK `persist_run` writes `tasks.jsonl`, `attempts.jsonl`, `results.jsonl`, `summary.json`, `report.html`; `agent_eval.gating` adds candidate-vs-baseline gating (pass-rate, token/cost, runtime tie-breaker) + deterministic provenance checks, persisted as `gate.json` by the orchestrator. `result.json` → attempt adapter + `VerifierRewardMetric` compatibility metric also done. |
 
 ### B4 reporting / gating detail
 
@@ -131,7 +133,7 @@ expects the portable `CapturedAgentAttempt` type.
   calls `agent_eval.persistence.persist_run`, writing `tasks.jsonl`,
   `attempts.jsonl`, `results.jsonl`, `summary.json`, `benchmark.json`, `run.json`,
   and (when `write_dashboard=True`) `report.html`.
-- **Gating** (`shared/reporting.py`): `summarize_run` aggregates pass-rate,
+- **Gating** (`agent_eval.gating`): `summarize_run` aggregates pass-rate,
   token totals/coverage, runtime totals, and run-level provenance from the typed
   `AgentEvalRunResult` (metric scores first, attempt metadata as fallback).
   `evaluate_gate` applies absolute thresholds and candidate-vs-baseline checks:
@@ -149,7 +151,7 @@ expects the portable `CapturedAgentAttempt` type.
 The doc sketches `AgentEnvironmentHandle.run_agent(instruction, config) -> AgentEvalAttempt`.
 We instead use `run_agent(EnvRunSpec) -> EnvCommandResult` (and the symmetric
 `run_verifier`). Rationale: per-backend command/env/mount construction lives in the
-runtime, and attempt construction lives in `shared/artifacts.py`. Keeping the
+runtime, and attempt construction lives in `shared/platform.py`. Keeping the
 environment layer at "execute a command, return exit status" means a new provider
 (local, Harbor, NeMo Gym) only implements process execution — it never needs to
 know about backends or attempt schemas.
diff --git a/tests/agentic-use/runtimes/README.md b/tests/agentic-use/runtimes/README.md
index d5ecff2c38..5b149c10ec 100644
--- a/tests/agentic-use/runtimes/README.md
+++ b/tests/agentic-use/runtimes/README.md
@@ -9,19 +9,26 @@ the pytest verifier, the platform Docker build/image-tag) plus a thin factory.
 
 ## Architecture: adapter over SDK
 
-The `shared/*` modules below are **pure re-export shims** over their SDK homes —
-they exist only so existing imports keep working; the logic lives in the SDK:
-
-| `shared/` shim | SDK home |
-|----------------|----------|
-| `docker.py` | `agent_eval.runtimes.docker` |
-| `environment.py` | `agent_eval.runtimes.environment` (re-supplies the platform image-tag) |
-| `environment_spec.py` | `agent_eval.runtimes.environment_spec` |
-| `reporting.py` | `agent_eval.gating` |
-| `verify.py` | wraps `agent_eval.runtimes.verify` (pytest command/env/mounts stay here) |
-| `metrics.py` | `AgentPhaseSuccessMetric` from `agent_eval.common_metrics` (namespaced); `VerifierRewardMetric` is platform |
-| `artifacts.py` | `resolve_attempt_status` + evidence keys from `agent_eval.attempts`; adds the platform `state` key |
-| `layout.py` | delegates to `agent_eval.runtimes.layout`; adds the platform `state_dir` + `task_image_tag` |
+The backend-agnostic logic lives in `nemo_evaluator_sdk.agent_eval` and is
+imported **directly** by the runtime scripts (no re-export shims). Everything
+generic comes from these SDK homes:
+
+| What | SDK home |
+|------|----------|
+| Docker CLI helpers | `agent_eval.runtimes.docker` |
+| Environment boundary (`AgentEnvironmentProvider`/`Handle`, `EnvRunSpec`) | `agent_eval.runtimes.environment` |
+| Environment authoring (`load_environment_spec`, `plan_task_build`, …) | `agent_eval.runtimes.environment_spec` |
+| Gating (`GateThresholds`, `evaluate_gate`, `summarize_run`, …) | `agent_eval.gating` |
+| Verify mechanic (`apply_verify_to_metadata`, `collect_verifier_outcome`) | `agent_eval.runtimes.verify` |
+| `AgentPhaseSuccessMetric`, attempt-status + evidence helpers | `agent_eval.common_metrics`, `agent_eval.attempts` |
+| Generic orchestrator + run layout | `agent_eval.orchestrator`, `agent_eval.runtimes.layout` |
+
+All NeMo-Platform-specific glue is consolidated into a single module,
+`shared/platform.py`: the run layout with the platform `state_dir`, the
+`nmp-nat-<id>` image tag + `DockerEnvironmentProvider` default, the namespaced
+`AgentPhaseSuccessMetric` + the `VerifierRewardMetric`, agent-log/usage parsing
+and the shared container env, attempt construction (live + `result.json`), the
+live VERIFY phase, and the agentic-use task loader.
 
 The orchestrator (`orchestrator.py`) is a thin factory over
 `agent_eval.orchestrator.AgentEvalOrchestrator`: it injects the platform image
@@ -32,9 +39,10 @@ build (`prepare_task`), the `run_verify`-derived `VerifierRewardMetric`
 
 ```text
 runtimes/
-  shared/           # thin re-export shims over agent_eval.* (see table above)
-                    #   + platform-only: task_loader.py, result_adapter.py,
-                    #     config.py, container_env.py, constants.py
+  shared/           # platform glue only:
+                    #   platform.py  — all NeMo-Platform helpers (one file)
+                    #   config.py    — runtime config dataclasses
+                    #   constants.py — paths / container constants
   workflow/         # NatWorkflowAttemptRuntime (implemented, NeMo construct)
   aut/              # AutAgentAttemptRuntime (implemented, NeMo construct)
   claude_code/      # scaffold (stub) — see "Coding-agent runtimes" below
@@ -106,7 +114,7 @@ Design-doc implementation path (see [COMPLIANCE.md](./COMPLIANCE.md) for detail)
 
 ## B1 — `result.json` import + stored-attempt scoring
 
-`shared/result_adapter.py` imports an existing `nat_runner` run as an attempt:
+`shared/platform.py` imports an existing `nat_runner` run as an attempt:
 
 - `attempt_from_result_dir(output_dir)` reads `<output_dir>/result.json`.
 - `attempt_from_result(result_dict, output_dir=...)` projects a parsed record.
@@ -131,16 +139,17 @@ when `run_verify=True`. `inputs` holds only agent-facing `instruction`;
 
 ## B2 — Environment boundary
 
-Runtimes execute the agent through `shared/environment.py`
+Runtimes execute the agent through the SDK environment boundary
 (`AgentEnvironmentProvider` → `AgentEnvironmentHandle`) rather than calling
-Docker directly. `DockerEnvironmentProvider` is the default; inject another
+Docker directly. The default provider is the platform `DockerEnvironmentProvider`
+(`shared/platform.py`), which uses the `nmp-nat-<id>` image tag; inject another
 provider (local, Harbor, NeMo Gym) via the runtime's `environment=` argument
 without changing backend code.
 
 ## B3 — Environment authoring
 
 Tasks can declare a reusable environment instead of hand-writing a Dockerfile.
-`shared/environment_spec.py` loads `environment.yaml` from the task dir:
+`agent_eval.runtimes.environment_spec` loads `environment.yaml` from the task dir:
 
 ```yaml
 environment:
@@ -171,10 +180,10 @@ as metadata, not executed here (they are runtime concerns).
 
 The SDK persists the run bundle (`tasks.jsonl`, `attempts.jsonl`,
 `results.jsonl`, `summary.json`, `report.html`) when `output_dir` is set.
-`shared/reporting.py` adds the gate on top:
+`agent_eval.gating` adds the gate on top:
 
 ```python
-from runtimes.shared.reporting import GateThresholds, evaluate_gate, load_baseline_summary, write_gate_report
+from nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate, load_baseline_summary, write_gate_report
 
 report = evaluate_gate(
     run_result,
@@ -190,7 +199,7 @@ The orchestrator emits `gate.json` automatically (`AgenticOrchestratorConfig.wri
 
 ## Live VERIFY phase (through the B2 boundary)
 
-`shared/verify.py` runs the task-local `tests/test_outputs.py` pytest verifier
+`shared/platform.py` runs the task-local `tests/test_outputs.py` pytest verifier
 through `AgentEnvironmentHandle.run_verifier`, in the same prepared environment
 and against the same persisted workspace/state as the agent phase. Enable it via
 `AgenticSharedConfig(run_verify=True)`; the runtime stamps `reward`/`passed`/
diff --git a/tests/agentic-use/runtimes/__init__.py b/tests/agentic-use/runtimes/__init__.py
index 1b7e12491f..df392483cf 100644
--- a/tests/agentic-use/runtimes/__init__.py
+++ b/tests/agentic-use/runtimes/__init__.py
@@ -3,20 +3,23 @@
 
 """Backend-specific AgentAttemptRuntime implementations for agentic-use evals."""
 
-from runtimes.aut.runtime import AutAgentAttemptRuntime
-from runtimes.claude_code.runtime import ClaudeCodeAgentAttemptRuntime
-from runtimes.codex.runtime import CodexAgentAttemptRuntime
-from runtimes.cursor_agent.runtime import CursorAgentAttemptRuntime
-from runtimes.orchestrator import AgenticEvalOrchestrator, AgenticOrchestratorConfig, runtime_for_backend
-from runtimes.shared.environment import (
+from nemo_evaluator_sdk.agent_eval.gating import (
+    GateCheck,
+    GateReport,
+    GateThresholds,
+    evaluate_gate,
+    load_baseline_summary,
+    summarize_run,
+    write_gate_report,
+)
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import (
     AgentEnvironmentHandle,
     AgentEnvironmentProvider,
     DockerEnvironmentHandle,
-    DockerEnvironmentProvider,
     EnvCommandResult,
     EnvRunSpec,
 )
-from runtimes.shared.environment_spec import (
+from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import (
     BuildPlan,
     EnvironmentSpec,
     execute_build_plan,
@@ -24,20 +27,19 @@
     plan_task_build,
     render_derived_dockerfile,
 )
-from runtimes.shared.metrics import AgentPhaseSuccessMetric, VerifierRewardMetric
-from runtimes.shared.reporting import (
-    GateCheck,
-    GateReport,
-    GateThresholds,
-    evaluate_gate,
-    load_baseline_summary,
-    summarize_run,
-    write_gate_report,
-)
-from runtimes.shared.result_adapter import attempt_from_result, attempt_from_result_dir
-from runtimes.shared.verify import (
-    VerifierOutcome,
-    apply_verify_to_metadata,
+from nemo_evaluator_sdk.agent_eval.runtimes.verify import VerifierOutcome, apply_verify_to_metadata
+
+from runtimes.aut.runtime import AutAgentAttemptRuntime
+from runtimes.claude_code.runtime import ClaudeCodeAgentAttemptRuntime
+from runtimes.codex.runtime import CodexAgentAttemptRuntime
+from runtimes.cursor_agent.runtime import CursorAgentAttemptRuntime
+from runtimes.orchestrator import AgenticEvalOrchestrator, AgenticOrchestratorConfig, runtime_for_backend
+from runtimes.shared.platform import (
+    AgentPhaseSuccessMetric,
+    DockerEnvironmentProvider,
+    VerifierRewardMetric,
+    attempt_from_result,
+    attempt_from_result_dir,
     build_verify_run_spec,
     maybe_run_verify,
     run_verify,
diff --git a/tests/agentic-use/runtimes/aut/runtime.py b/tests/agentic-use/runtimes/aut/runtime.py
index 64bc8e46bc..4185abe826 100644
--- a/tests/agentic-use/runtimes/aut/runtime.py
+++ b/tests/agentic-use/runtimes/aut/runtime.py
@@ -8,12 +8,12 @@
 from collections.abc import Sequence
 from pathlib import Path
 
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import AgentEnvironmentProvider, EnvRunSpec
+from nemo_evaluator_sdk.agent_eval.runtimes.verify import apply_verify_to_metadata
 from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalRunConfig, AgentEvalTask
 
 from runtimes.aut.command import build_aut_agent_cmd
 from runtimes.aut.prep import prepare_aut_config_for_runtime
-from runtimes.shared.agent_log import agent_log_has_workflow_error
-from runtimes.shared.artifacts import build_agent_eval_attempt
 from runtimes.shared.config import AutRuntimeConfig
 from runtimes.shared.constants import (
     DOCKER_SOCKET_CONTAINER_PATH,
@@ -21,15 +21,16 @@
     INSTRUCTION_CONTAINER_PATH,
     REPO_ROOT,
 )
-from runtimes.shared.container_env import base_container_env
-from runtimes.shared.environment import (
-    AgentEnvironmentProvider,
+from runtimes.shared.platform import (
+    AgenticRunLayout,
     DockerEnvironmentProvider,
-    EnvRunSpec,
+    agent_log_has_workflow_error,
+    base_container_env,
+    build_agent_eval_attempt,
+    maybe_run_verify,
+    resolve_run_layout,
+    task_agent_timeout_sec,
 )
-from runtimes.shared.layout import AgenticRunLayout, resolve_run_layout
-from runtimes.shared.task_loader import task_agent_timeout_sec
-from runtimes.shared.verify import apply_verify_to_metadata, maybe_run_verify
 
 RUNTIME_NAME = "aut"
 AUT_CONFIG_CONTAINER_PATH = "/tmp/aut_agent.yml"
diff --git a/tests/agentic-use/runtimes/orchestrator.py b/tests/agentic-use/runtimes/orchestrator.py
index 8a355fbdfc..74531b1a41 100644
--- a/tests/agentic-use/runtimes/orchestrator.py
+++ b/tests/agentic-use/runtimes/orchestrator.py
@@ -17,7 +17,10 @@
 from pathlib import Path
 from typing import Any
 
+from nemo_evaluator_sdk.agent_eval.gating import GateThresholds
 from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig
+from nemo_evaluator_sdk.agent_eval.runtimes.docker import docker_image_exists
+from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import execute_build_plan, plan_task_build
 from nemo_evaluator_sdk.agent_eval.types import (
     AgentAttemptRuntime,
     AgentEvalRunResult,
@@ -25,13 +28,12 @@
 )
 from nemo_evaluator_sdk.metrics.protocol import Metric
 
-from runtimes.shared.docker import docker_image_exists
-from runtimes.shared.environment_spec import execute_build_plan, plan_task_build
-from runtimes.shared.layout import task_image_tag
-from runtimes.shared.metrics import VerifierRewardMetric
-from runtimes.shared.reporting import GateThresholds
-from runtimes.shared.result_adapter import ResultDirAttemptSource
-from runtimes.shared.task_loader import agentic_task_from_dir
+from runtimes.shared.platform import (
+    ResultDirAttemptSource,
+    VerifierRewardMetric,
+    agentic_task_from_dir,
+    task_image_tag,
+)
 
 
 @dataclass(frozen=True)
diff --git a/tests/agentic-use/runtimes/shared/agent_log.py b/tests/agentic-use/runtimes/shared/agent_log.py
deleted file mode 100644
index 6fc7de0270..0000000000
--- a/tests/agentic-use/runtimes/shared/agent_log.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Agent log parsing helpers shared by backend runtimes."""
-
-from __future__ import annotations
-
-import json
-from typing import Any
-
-
-def iter_agent_log_json_payloads(agent_log: str) -> list[dict[str, Any]]:
-    """Return JSON dict payloads embedded in an agent log, newest-first after the full log."""
-    candidates = [agent_log.strip()]
-    lines = [line.strip() for line in agent_log.splitlines() if line.strip()]
-    if lines:
-        candidates.append(lines[-1])
-        candidates.extend(reversed(lines))
-
-    payloads: list[dict[str, Any]] = []
-    seen: set[str] = set()
-    for candidate in candidates:
-        if not candidate or candidate in seen:
-            continue
-        seen.add(candidate)
-        try:
-            parsed = json.loads(candidate)
-        except json.JSONDecodeError:
-            continue
-        if isinstance(parsed, dict):
-            payloads.append(parsed)
-    return payloads
-
-
-def agent_log_has_workflow_error(agent_log: str) -> bool:
-    """Detect AUT workflow errors returned as successful HTTP JSON payloads."""
-    for payload in iter_agent_log_json_payloads(agent_log):
-        if payload.get("code") == "workflow_error":
-            return True
-    return False
diff --git a/tests/agentic-use/runtimes/shared/artifacts.py b/tests/agentic-use/runtimes/shared/artifacts.py
deleted file mode 100644
index 912c12a7a0..0000000000
--- a/tests/agentic-use/runtimes/shared/artifacts.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Convert captured agent artifacts into AgentEvalAttempt values."""
-
-from __future__ import annotations
-
-from pathlib import Path
-
-from evaluator_agent_eval.artifacts import AgentArtifacts
-from evaluator_agent_eval.schemas import (
-    AgentAttemptInput,
-    AgentAttemptMetadata,
-    AgentAttemptOutput,
-    AgentAttemptTrace,
-    CapturedAgentAttempt,
-)
-from nemo_evaluator_sdk.agent_eval.attempts import resolve_attempt_status, standard_evidence_descriptors
-from nemo_evaluator_sdk.agent_eval.types import (
-    AgentEvalAttempt,
-    AgentEvalTask,
-    AgentOutput,
-)
-from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor
-
-from runtimes.shared.config import AgenticRuntimeName
-from runtimes.shared.layout import AgenticRunLayout
-from runtimes.shared.usage import extract_usage_metrics
-
-__all__ = [
-    "build_agent_eval_attempt",
-    "resolve_attempt_status",
-    "to_captured_agent_attempt",
-]
-
-
-def build_agent_eval_attempt(
-    *,
-    task: AgentEvalTask,
-    layout: AgenticRunLayout,
-    runtime_name: AgenticRuntimeName,
-    agent_model: str,
-    exit_code: int,
-    agent_ok: bool,
-    run_id: str | None = None,
-    repo_revision: str | None = None,
-    duration_ms: int | None = None,
-) -> AgentEvalAttempt:
-    """Build an SDK attempt from on-disk agent artifacts.
-
-    Metadata uses the same canonical keys as :class:`CapturedAgentAttempt`
-    (``agent_runtime``, ``agent_model``, ``exit_code``, …) so verify/scoring
-    helpers can consume attempts without a second adapter.
-    """
-    artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir)
-    log_text = _read_agent_log(layout.agent_log_dir)
-    usage = extract_usage_metrics(log_text)
-    duration = duration_ms if duration_ms is not None else usage.get("duration_ms")
-
-    output_text = artifacts.final_answer.text if artifacts.final_answer.extracted else None
-    raw_log_paths = _raw_log_paths(artifacts.agent_log_dir)
-    initial_state = task.inputs.get("filesystem")
-    descriptors = _evidence_descriptors(
-        layout, artifacts, initial_state_ref=str(initial_state) if initial_state else None
-    )
-
-    metadata: dict[str, object] = {
-        # Canonical CapturedAgentAttempt fields
-        "agent_runtime": runtime_name,
-        "agent_model": agent_model,
-        "agent_runtime_version": None,
-        "repo_revision": repo_revision,
-        "run_id": run_id,
-        "exit_code": exit_code,
-        "duration_ms": duration,
-        # SDK / orchestration extensions
-        "model_id": agent_model,
-        "target_name": agent_model,
-        "attempt_id": f"{task.id}:{runtime_name}",
-        "agent_ok": agent_ok,
-        "agent_log_dir": str(layout.agent_log_dir),
-        "workspace_dir": str(layout.workspace_dir),
-        "state_dir": str(layout.state_dir),
-        "run_dir": str(layout.run_dir),
-        "instruction_path": task.metadata.get("instruction_path"),
-        "final_answer_extracted": artifacts.final_answer.extracted,
-        "final_answer_source": artifacts.final_answer.source,
-        "raw_log_paths": raw_log_paths,
-        "atif_trajectory_path": str(artifacts.atif_trajectory_path) if artifacts.atif_trajectory_path else None,
-        **usage,
-    }
-
-    status = resolve_attempt_status(agent_ok)
-    if output_text:
-        output = AgentOutput(text=output_text)
-    elif agent_ok:
-        output = AgentOutput(text=log_text.strip() or "")
-    else:
-        output = AgentOutput(text=log_text.strip() or "(agent phase failed)")
-
-    return AgentEvalAttempt(
-        id=f"{task.id}:{runtime_name}",
-        task_id=task.id,
-        status=status,
-        output=output,
-        evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None,
-        metadata=metadata,
-    )
-
-
-def to_captured_agent_attempt(task: AgentEvalTask, attempt: AgentEvalAttempt) -> CapturedAgentAttempt:
-    """Project an SDK attempt onto the portable CapturedAgentAttempt schema."""
-    metadata = attempt.metadata
-    trace_path = metadata.get("atif_trajectory_path")
-    return CapturedAgentAttempt(
-        task_id=attempt.task_id,
-        input=AgentAttemptInput(
-            instruction_text=task.intent,
-            instruction_path=str(metadata.get("instruction_path")) if metadata.get("instruction_path") else None,
-        ),
-        output=AgentAttemptOutput(
-            final_text=attempt.output.text if attempt.output is not None else "",
-            final_answer_extracted=bool(metadata.get("final_answer_extracted")),
-            final_answer_source=str(metadata.get("final_answer_source"))
-            if metadata.get("final_answer_source") is not None
-            else None,
-            raw_log_paths=list(metadata.get("raw_log_paths") or []),
-        ),
-        metadata=AgentAttemptMetadata(
-            agent_runtime=str(metadata.get("agent_runtime", "unknown")),
-            agent_model=str(metadata.get("agent_model", "unknown")),
-            agent_runtime_version=str(metadata["agent_runtime_version"])
-            if metadata.get("agent_runtime_version") is not None
-            else None,
-            repo_revision=str(metadata["repo_revision"]) if metadata.get("repo_revision") is not None else None,
-            run_id=str(metadata["run_id"]) if metadata.get("run_id") is not None else None,
-            exit_code=int(metadata["exit_code"]) if isinstance(metadata.get("exit_code"), int) else None,
-            duration_ms=int(metadata["duration_ms"]) if isinstance(metadata.get("duration_ms"), int | float) else None,
-        ),
-        trace=AgentAttemptTrace(atif_path=str(trace_path)) if trace_path else None,
-    )
-
-
-def _evidence_descriptors(
-    layout: AgenticRunLayout,
-    artifacts: AgentArtifacts,
-    *,
-    initial_state_ref: str | None = None,
-) -> dict[str, EvidenceDescriptor]:
-    """Compose the SDK's standard evidence keys + the platform ``state`` extension.
-
-    The doc-standard keys (``initial_state``/``trace``/``logs``/``final_state``/
-    ``verifier_logs``) come from :func:`standard_evidence_descriptors`. ``state``
-    is a NeMo-Platform-specific *extension* (not a doc key): it carries the
-    preserved platform/database state across the agent + verifier phases.
-    """
-    descriptors = standard_evidence_descriptors(
-        logs_dir=layout.agent_log_dir,
-        final_state_dir=layout.workspace_dir,
-        trace_path=artifacts.atif_trajectory_path,
-        initial_state_ref=initial_state_ref,
-        verifier_logs_dir=layout.run_dir / "verifier",
-        primary_log="nat_agent.log",
-    )
-
-    # Platform extension (non-doc key): preserved platform/db state across phases.
-    descriptors["state"] = EvidenceDescriptor(
-        kind="filesystem",
-        format="dir",
-        ref=str(layout.state_dir),
-        metadata={"role": "platform_state", "extension": "nemo-platform"},
-    )
-
-    return descriptors
-
-
-def _raw_log_paths(agent_log_dir: Path) -> list[str]:
-    if not agent_log_dir.is_dir():
-        return []
-    return [str(path.relative_to(agent_log_dir)) for path in sorted(agent_log_dir.iterdir()) if path.is_file()]
-
-
-def _read_agent_log(agent_log_dir: Path) -> str:
-    log_path = agent_log_dir / "nat_agent.log"
-    if log_path.is_file():
-        return log_path.read_text(encoding="utf-8", errors="replace")
-    return ""
diff --git a/tests/agentic-use/runtimes/shared/container_env.py b/tests/agentic-use/runtimes/shared/container_env.py
deleted file mode 100644
index b59a100b54..0000000000
--- a/tests/agentic-use/runtimes/shared/container_env.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Shared container environment helpers."""
-
-from __future__ import annotations
-
-import json
-from typing import Any
-
-from runtimes.shared.config import AgenticSharedConfig
-from runtimes.shared.constants import (
-    DOCKER_SOCKET_CONTAINER_PATH,
-    DOCKER_SOCKET_HOST_PATH,
-    FILES_STORAGE_CONFIG,
-    PLATFORM_CONFIG_PATH,
-)
-
-
-def base_container_env(shared: AgenticSharedConfig, *, timeout_sec: int) -> dict[str, str]:
-    """Environment variables shared by all agentic-use container runs."""
-    env: dict[str, str] = {
-        "NMP_BASE_URL": shared.nmp_base_url,
-        "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace",
-        "DATABASE_DIALECT": "sqlite",
-        "DATABASE_PATH": "/data/nmp-platform.db",
-        "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG,
-        "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH,
-        "NEMO_AGENTS_GATEWAY_READ_TIMEOUT": str(timeout_sec),
-        "NEMO_AGENTS_INVOKE_TIMEOUT": str(timeout_sec),
-        "AUT_INVOKE_HTTP_TIMEOUT": str(timeout_sec),
-    }
-    if DOCKER_SOCKET_HOST_PATH.exists():
-        env["DOCKER_HOST"] = f"unix://{DOCKER_SOCKET_CONTAINER_PATH}"
-    return env
-
-
-def with_candidate_params(env: dict[str, str], agent_params: dict[str, Any]) -> dict[str, str]:
-    if agent_params:
-        env = dict(env)
-        env["NAT_CANDIDATE_PARAMS"] = json.dumps(agent_params, sort_keys=True)
-    return env
diff --git a/tests/agentic-use/runtimes/shared/docker.py b/tests/agentic-use/runtimes/shared/docker.py
deleted file mode 100644
index ce3cc6cc22..0000000000
--- a/tests/agentic-use/runtimes/shared/docker.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Compatibility shim — Docker helpers were promoted to the Evaluator SDK.
-
-Import from ``nemo_evaluator_sdk.agent_eval.runtimes.docker`` directly; this
-module re-exports the same symbols so existing adapter imports keep working.
-"""
-
-from __future__ import annotations
-
-from nemo_evaluator_sdk.agent_eval.runtimes.docker import (
-    build_dockerfile,
-    build_task_image,
-    docker_image_exists,
-    docker_run,
-    redact_cmd_for_logging,
-)
-
-__all__ = [
-    "build_dockerfile",
-    "build_task_image",
-    "docker_image_exists",
-    "docker_run",
-    "redact_cmd_for_logging",
-]
diff --git a/tests/agentic-use/runtimes/shared/environment.py b/tests/agentic-use/runtimes/shared/environment.py
deleted file mode 100644
index 08e55ce2ed..0000000000
--- a/tests/agentic-use/runtimes/shared/environment.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Compatibility shim — the environment boundary was promoted to the Evaluator SDK.
-
-The generic boundary now lives in
-``nemo_evaluator_sdk.agent_eval.runtimes.environment``. The only platform-specific
-piece kept here is the default task→image mapping (``nmp-nat-<id>:latest``): the
-adapter's :class:`DockerEnvironmentProvider` injects :func:`task_image_tag` so
-``DockerEnvironmentProvider()`` keeps producing platform-tagged images.
-"""
-
-from __future__ import annotations
-
-from collections.abc import Callable
-
-from nemo_evaluator_sdk.agent_eval.runtimes.environment import (
-    AbstractEnvironmentHandle,
-    AgentEnvironmentHandle,
-    AgentEnvironmentProvider,
-    DockerEnvironmentHandle,
-    EnvCommandResult,
-    EnvRole,
-    EnvRunSpec,
-    default_image_tag,
-)
-from nemo_evaluator_sdk.agent_eval.runtimes.environment import (
-    DockerEnvironmentProvider as _SDKDockerEnvironmentProvider,
-)
-
-from runtimes.shared.layout import task_image_tag
-
-__all__ = [
-    "AbstractEnvironmentHandle",
-    "AgentEnvironmentHandle",
-    "AgentEnvironmentProvider",
-    "DockerEnvironmentHandle",
-    "DockerEnvironmentProvider",
-    "EnvCommandResult",
-    "EnvRole",
-    "EnvRunSpec",
-    "default_image_tag",
-]
-
-
-class DockerEnvironmentProvider(_SDKDockerEnvironmentProvider):
-    """Platform default: map ``task.id`` to ``nmp-nat-<id>:latest``."""
-
-    def __init__(self, *, image_tag_fn: Callable[[str], str] = task_image_tag) -> None:
-        super().__init__(image_tag_fn=image_tag_fn)
diff --git a/tests/agentic-use/runtimes/shared/environment_spec.py b/tests/agentic-use/runtimes/shared/environment_spec.py
deleted file mode 100644
index 9cdd3db71f..0000000000
--- a/tests/agentic-use/runtimes/shared/environment_spec.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Compatibility shim — environment authoring was promoted to the Evaluator SDK.
-
-Import from ``nemo_evaluator_sdk.agent_eval.runtimes.environment_spec`` directly;
-this module re-exports the same symbols so existing adapter imports keep working.
-"""
-
-from __future__ import annotations
-
-from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import (
-    DEFAULT_DOCKERFILE_RELPATH,
-    ENVIRONMENT_SPEC_FILENAME,
-    BuildPlan,
-    EnvironmentSpec,
-    execute_build_plan,
-    load_environment_spec,
-    plan_task_build,
-    render_derived_dockerfile,
-)
-
-__all__ = [
-    "DEFAULT_DOCKERFILE_RELPATH",
-    "ENVIRONMENT_SPEC_FILENAME",
-    "BuildPlan",
-    "EnvironmentSpec",
-    "execute_build_plan",
-    "load_environment_spec",
-    "plan_task_build",
-    "render_derived_dockerfile",
-]
diff --git a/tests/agentic-use/runtimes/shared/layout.py b/tests/agentic-use/runtimes/shared/layout.py
deleted file mode 100644
index 86a4c5f4f2..0000000000
--- a/tests/agentic-use/runtimes/shared/layout.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Output directory layout for agentic-use runtime runs."""
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-from datetime import UTC, datetime
-from pathlib import Path
-
-from nemo_evaluator_sdk.agent_eval.runtimes.layout import prepare_run_layout, resolve_run_dir
-from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunConfig, AgentEvalTask
-
-from runtimes.shared.config import AgenticSharedConfig
-
-
-@dataclass(frozen=True)
-class AgenticRunLayout:
-    """Filesystem layout for one task run.
-
-    Extends the SDK's generic ``RunLayout`` shape with a platform-specific
-    ``state_dir`` (preserved platform/database state across agent + verifier).
-    """
-
-    run_dir: Path
-    agent_log_dir: Path
-    workspace_dir: Path
-    state_dir: Path
-    instruction_path: Path
-
-
-def default_jobs_dir(shared: AgenticSharedConfig) -> Path:
-    if shared.jobs_dir is not None:
-        return shared.jobs_dir
-    return shared.repo_root / "nat-jobs"
-
-
-def new_run_dir(jobs_dir: Path, task_id: str) -> Path:
-    timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
-    run_dir = jobs_dir / f"{timestamp}-{task_id}"
-    run_dir.mkdir(parents=True, exist_ok=True)
-    return run_dir
-
-
-def resolve_run_layout(
-    task: AgentEvalTask,
-    shared: AgenticSharedConfig,
-    config: AgentEvalRunConfig | None = None,
-) -> AgenticRunLayout:
-    """Resolve or create the on-disk layout for one task attempt."""
-    output_dir = config.output_dir if config is not None else None
-    run_dir = resolve_run_dir(output_dir, lambda: new_run_dir(default_jobs_dir(shared), task.id))
-
-    # Generic agent/workspace dirs + written instruction come from the SDK helper.
-    base = prepare_run_layout(run_dir, task.intent)
-
-    # Platform extension: a preserved state dir for platform/db across phases.
-    state_dir = base.run_dir / "state"
-    state_dir.mkdir(parents=True, exist_ok=True)
-
-    return AgenticRunLayout(
-        run_dir=base.run_dir,
-        agent_log_dir=base.agent_log_dir,
-        workspace_dir=base.workspace_dir,
-        state_dir=state_dir,
-        instruction_path=base.instruction_path,
-    )
-
-
-def task_image_tag(task_id: str) -> str:
-    return f"nmp-nat-{task_id}:latest"
diff --git a/tests/agentic-use/runtimes/shared/metrics.py b/tests/agentic-use/runtimes/shared/metrics.py
deleted file mode 100644
index e7b8496caf..0000000000
--- a/tests/agentic-use/runtimes/shared/metrics.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Default metrics for agentic-use agent-eval runs.
-
-``AgentPhaseSuccessMetric`` is promoted to the SDK; here it is namespaced under
-the ``agentic_use_*`` metric type. ``VerifierRewardMetric`` stays a platform
-compatibility shim (mirrors the legacy pytest verifier reward).
-"""
-
-from __future__ import annotations
-
-from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric as _SDKAgentPhaseSuccessMetric
-from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult
-
-
-class AgentPhaseSuccessMetric(_SDKAgentPhaseSuccessMetric):
-    """Agentic-use namespaced agent-phase metric (output stays ``agent_phase_success``)."""
-
-    metric_type = "agentic_use_agent_phase"
-
-
-class VerifierRewardMetric:
-    """Compatibility metric mirroring the legacy pytest verifier reward.
-
-    Reads the verifier outcome that ``nat_runner`` records in ``result.json``
-    (projected onto attempt metadata as ``reward``/``passed``) so existing
-    ``tests/test_outputs.py`` verifiers can score through the Evaluator SDK
-    while task-specific metrics are authored.
-    """
-
-    @property
-    def type(self) -> str:
-        return "agentic_use_verifier_reward"
-
-    def output_spec(self) -> list[MetricOutputSpec]:
-        return [MetricOutputSpec.continuous_score("verifier_reward")]
-
-    async def compute_scores(self, input: MetricInput) -> MetricResult:
-        metadata = input.candidate.metadata
-        reward = metadata.get("reward")
-        if reward is None:
-            reward = 1.0 if metadata.get("passed") else 0.0
-        return MetricResult(
-            outputs=[MetricOutput(name="verifier_reward", value=float(reward))],
-        )
diff --git a/tests/agentic-use/runtimes/shared/platform.py b/tests/agentic-use/runtimes/shared/platform.py
new file mode 100644
index 0000000000..721d717e6f
--- /dev/null
+++ b/tests/agentic-use/runtimes/shared/platform.py
@@ -0,0 +1,791 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""NeMo-Platform glue that sits on top of the generic agent-eval SDK.
+
+Everything generic (Docker helpers, the environment boundary, environment
+authoring, gating, attempt-status/evidence helpers, the verifier mechanic) now
+lives in ``nemo_evaluator_sdk.agent_eval`` and is imported directly where used.
+
+This single module holds only the pieces that are specific to the agentic-use
+benchmark and therefore do not belong in the SDK:
+
+* run layout with the platform ``state_dir`` and the ``nmp-nat-<id>`` image tag,
+* a ``DockerEnvironmentProvider`` defaulting to that platform image tag,
+* default metrics (``AgentPhaseSuccessMetric`` namespace + ``VerifierRewardMetric``),
+* agent-log/usage parsing and the shared container env,
+* attempt construction from live artifacts and from ``nat_runner`` ``result.json``,
+* the live VERIFY phase wired through the SDK environment boundary,
+* the agentic-use task loader.
+"""
+
+from __future__ import annotations
+
+import json
+import textwrap
+import tomllib
+from collections.abc import Callable
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any, TypedDict
+
+from evaluator_agent_eval.artifacts import AgentArtifacts
+from evaluator_agent_eval.schemas import (
+    AgentAttemptInput,
+    AgentAttemptMetadata,
+    AgentAttemptOutput,
+    AgentAttemptTrace,
+    CapturedAgentAttempt,
+)
+from nemo_evaluator_sdk.agent_eval.attempts import resolve_attempt_status, standard_evidence_descriptors
+from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric as _SDKAgentPhaseSuccessMetric
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import (
+    AgentEnvironmentHandle,
+    EnvRunSpec,
+)
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import (
+    DockerEnvironmentProvider as _SDKDockerEnvironmentProvider,
+)
+from nemo_evaluator_sdk.agent_eval.runtimes.layout import prepare_run_layout, resolve_run_dir
+from nemo_evaluator_sdk.agent_eval.runtimes.verify import (
+    VerifierOutcome,
+    collect_verifier_outcome,
+    skipped_outcome,
+)
+from nemo_evaluator_sdk.agent_eval.types import (
+    AgentEvalAttempt,
+    AgentEvalRunConfig,
+    AgentEvalTask,
+    AgentOutput,
+)
+from nemo_evaluator_sdk.metrics.protocol import (
+    Metric,
+    MetricInput,
+    MetricOutput,
+    MetricOutputSpec,
+    MetricResult,
+)
+from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor
+
+from runtimes.shared.config import AgenticRuntimeName, AgenticSharedConfig
+from runtimes.shared.constants import (
+    AGENTIC_USE_DIR,
+    DOCKER_SOCKET_CONTAINER_PATH,
+    DOCKER_SOCKET_HOST_PATH,
+    EVALUATOR_SDK_SRC,
+    FILES_STORAGE_CONFIG,
+    PLATFORM_CONFIG_PATH,
+    SHARED_DIR,
+)
+
+__all__ = [
+    "AgenticRunLayout",
+    "AgentPhaseSuccessMetric",
+    "DockerEnvironmentProvider",
+    "ResultDirAttemptSource",
+    "VerifierRewardMetric",
+    "agent_log_has_workflow_error",
+    "agentic_task_from_dir",
+    "attempt_from_result",
+    "attempt_from_result_dir",
+    "base_container_env",
+    "build_agent_eval_attempt",
+    "build_verify_run_spec",
+    "extract_usage_metrics",
+    "iter_agent_log_json_payloads",
+    "load_task_toml",
+    "maybe_run_verify",
+    "resolve_run_layout",
+    "run_verify",
+    "task_agent_timeout_sec",
+    "task_image_tag",
+    "to_captured_agent_attempt",
+    "verifier_log_dir",
+    "with_candidate_params",
+]
+
+
+# --------------------------------------------------------------------------- #
+# Run layout + image tagging
+# --------------------------------------------------------------------------- #
+@dataclass(frozen=True)
+class AgenticRunLayout:
+    """Filesystem layout for one task run.
+
+    Extends the SDK's generic ``RunLayout`` shape with a platform-specific
+    ``state_dir`` (preserved platform/database state across agent + verifier).
+    """
+
+    run_dir: Path  # root of the run; the ``state`` and ``verifier`` dirs are derived from it
+    agent_log_dir: Path  # agent logs; ``nat_agent.log`` is read from here
+    workspace_dir: Path  # agent workspace, bind-mounted at /app/workspace for the verifier
+    state_dir: Path  # platform/db state, bind-mounted at /data for the verifier
+    instruction_path: Path  # location of the run's instruction file
+
+
+def task_image_tag(task_id: str) -> str:  # image tag for a task: "nmp-nat-<task_id>:latest"
+    return f"nmp-nat-{task_id}:latest"
+
+
+def default_jobs_dir(shared: AgenticSharedConfig) -> Path:
+    """Return ``shared.jobs_dir`` when it is set, otherwise ``<repo_root>/nat-jobs``."""
+    configured = shared.jobs_dir
+    return configured if configured is not None else shared.repo_root / "nat-jobs"
+
+
+def new_run_dir(jobs_dir: Path, task_id: str) -> Path:
+    """Create ``<jobs_dir>/<UTC timestamp>-<task_id>`` (parents included, existing dir tolerated) and return it."""
+    target = jobs_dir / f"{datetime.now(UTC):%Y%m%dT%H%M%SZ}-{task_id}"
+    target.mkdir(parents=True, exist_ok=True)
+    return target
+
+
+def resolve_run_layout(
+    task: AgentEvalTask,
+    shared: AgenticSharedConfig,
+    config: AgentEvalRunConfig | None = None,
+) -> AgenticRunLayout:
+    """Return the on-disk layout for one task attempt, creating its ``state`` dir.
+
+    ``config.output_dir`` (``None`` without a config) and a factory for a fresh
+    timestamped jobs dir are both handed to the SDK's ``resolve_run_dir``.
+    """
+    requested_dir = None if config is None else config.output_dir
+    run_dir = resolve_run_dir(requested_dir, lambda: new_run_dir(default_jobs_dir(shared), task.id))
+    sdk_layout = prepare_run_layout(run_dir, task.intent)
+    # Platform extension: a state dir preserved for platform/db data across phases.
+    platform_state_dir = sdk_layout.run_dir / "state"
+    platform_state_dir.mkdir(parents=True, exist_ok=True)
+    return AgenticRunLayout(
+        run_dir=sdk_layout.run_dir,
+        agent_log_dir=sdk_layout.agent_log_dir,
+        workspace_dir=sdk_layout.workspace_dir,
+        state_dir=platform_state_dir,
+        instruction_path=sdk_layout.instruction_path,
+    )
+
+
+class DockerEnvironmentProvider(_SDKDockerEnvironmentProvider):
+    """SDK Docker provider defaulting ``image_tag_fn`` to :func:`task_image_tag` (``nmp-nat-<id>:latest``)."""
+
+    def __init__(self, *, image_tag_fn: Callable[[str], str] = task_image_tag) -> None:
+        super().__init__(image_tag_fn=image_tag_fn)
+
+
+# --------------------------------------------------------------------------- #
+# Default metrics
+# --------------------------------------------------------------------------- #
+class AgentPhaseSuccessMetric(_SDKAgentPhaseSuccessMetric):
+    """Agentic-use namespaced agent-phase metric (output stays ``agent_phase_success``)."""
+
+    metric_type = "agentic_use_agent_phase"  # benchmark-specific type id; everything else is inherited
+
+
+class VerifierRewardMetric:
+    """Compatibility metric that surfaces the legacy pytest verifier reward.
+
+    The verifier outcome ``nat_runner`` records in ``result.json`` is projected
+    onto attempt metadata as ``reward``/``passed``; this metric reads it back so
+    existing ``tests/test_outputs.py`` verifiers can score through the
+    Evaluator SDK while task-specific metrics are authored.
+    """
+
+    @property
+    def type(self) -> str:
+        return "agentic_use_verifier_reward"
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        return [MetricOutputSpec.continuous_score("verifier_reward")]
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        """Score from metadata ``reward``; without one, 1.0 if ``passed`` is truthy else 0.0."""
+        attempt_meta = input.candidate.metadata
+        recorded = attempt_meta.get("reward")
+        # ``passed`` is consulted only when no reward was recorded (a reward of 0 is kept).
+        value = float(recorded) if recorded is not None else (1.0 if attempt_meta.get("passed") else 0.0)
+        score = MetricOutput(name="verifier_reward", value=value)
+        return MetricResult(outputs=[score])
+
+
+# --------------------------------------------------------------------------- #
+# Agent-log parsing + token usage
+# --------------------------------------------------------------------------- #
+class TokenMetrics(TypedDict):  # usage/cost fields of one agent run; every value may be ``None``
+    prompt_tokens: int | None
+    completion_tokens: int | None
+    total_tokens: int | None
+    cache_creation_tokens: int | None
+    cache_read_tokens: int | None
+    n_assistant_messages: int | None
+    cost_usd: float | None
+    num_turns: int | None
+    duration_ms: float | None
+
+
+def extract_usage_metrics(agent_log: str) -> dict[str, int | float | None]:
+    """Return a fresh ``dict`` copy of the usage metrics ``nat_runner`` parses from ``agent_log``."""
+    # Function-scope import: ``nat_runner`` is loaded when this is called, not at module import.
+    import nat_runner
+
+    return dict(nat_runner._extract_usage_metrics(agent_log))
+
+
+def iter_agent_log_json_payloads(agent_log: str) -> list[dict[str, Any]]:
+    """Return the JSON objects (``dict`` payloads) embedded in an agent log.
+
+    The whole stripped log is tried first, then every non-blank line from last
+    to first. A given candidate string is parsed at most once, and candidates
+    that are not valid JSON or do not decode to a ``dict`` are skipped.
+    """
+    non_blank = [clean for raw in agent_log.splitlines() if (clean := raw.strip())]
+    # dict.fromkeys drops repeated candidates while keeping first-seen order.
+    unique_candidates = dict.fromkeys([agent_log.strip(), *reversed(non_blank)])
+    found: list[dict[str, Any]] = []
+    for text in unique_candidates:
+        if not text:
+            continue
+        try:
+            decoded = json.loads(text)
+        except json.JSONDecodeError:
+            continue
+        if isinstance(decoded, dict):
+            found.append(decoded)
+    return found
+
+
+def agent_log_has_workflow_error(agent_log: str) -> bool:
+    """Report whether any JSON payload in the log has ``"code": "workflow_error"``.
+
+    Such payloads are AUT workflow errors returned as successful HTTP JSON responses.
+    """
+    return any(entry.get("code") == "workflow_error" for entry in iter_agent_log_json_payloads(agent_log))
+
+
+# --------------------------------------------------------------------------- #
+# Shared container environment
+# --------------------------------------------------------------------------- #
+def base_container_env(shared: AgenticSharedConfig, *, timeout_sec: int) -> dict[str, str]:
+    """Build the env vars shared by all agentic-use container runs; all three timeouts use ``timeout_sec``."""
+    timeout = str(timeout_sec)
+    docker_env = {"DOCKER_HOST": f"unix://{DOCKER_SOCKET_CONTAINER_PATH}"} if DOCKER_SOCKET_HOST_PATH.exists() else {}
+    return {
+        "NMP_BASE_URL": shared.nmp_base_url,
+        "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace",
+        "DATABASE_DIALECT": "sqlite",
+        "DATABASE_PATH": "/data/nmp-platform.db",
+        "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG,
+        "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH,
+        "NEMO_AGENTS_GATEWAY_READ_TIMEOUT": timeout,
+        "NEMO_AGENTS_INVOKE_TIMEOUT": timeout,
+        "AUT_INVOKE_HTTP_TIMEOUT": timeout,
+        **docker_env,  # DOCKER_HOST is added only when the host Docker socket exists
+    }
+
+
+def with_candidate_params(env: dict[str, str], agent_params: dict[str, Any]) -> dict[str, str]:
+    """Return ``env`` itself when ``agent_params`` is empty, else a copy with ``NAT_CANDIDATE_PARAMS`` set."""
+    if not agent_params:
+        return env
+    return {**env, "NAT_CANDIDATE_PARAMS": json.dumps(agent_params, sort_keys=True)}
+
+
+# --------------------------------------------------------------------------- #
+# Attempt construction from live artifacts
+# --------------------------------------------------------------------------- #
+def build_agent_eval_attempt(
+    *,
+    task: AgentEvalTask,
+    layout: AgenticRunLayout,
+    runtime_name: AgenticRuntimeName,
+    agent_model: str,
+    exit_code: int,
+    agent_ok: bool,
+    run_id: str | None = None,
+    repo_revision: str | None = None,
+    duration_ms: int | None = None,
+) -> AgentEvalAttempt:
+    """Build an SDK attempt from on-disk agent artifacts.
+
+    Metadata uses the same canonical keys as :class:`CapturedAgentAttempt`
+    (``agent_runtime``, ``agent_model``, ``exit_code``, …) so verify/scoring
+    helpers can consume attempts without a second adapter. An explicit
+    ``duration_ms`` argument takes precedence over the value parsed from the
+    agent log. The output text is the extracted final answer, else the raw
+    agent log, else ``"(agent phase failed)"`` when the agent phase failed.
+    """
+    artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir)
+    log_text = _read_agent_log(layout.agent_log_dir)
+    usage = extract_usage_metrics(log_text)
+    duration = duration_ms if duration_ms is not None else usage.get("duration_ms")
+
+    output_text = artifacts.final_answer.text if artifacts.final_answer.extracted else None
+    raw_log_paths = _raw_log_paths(artifacts.agent_log_dir)
+    initial_state = task.inputs.get("filesystem")
+    descriptors = _evidence_descriptors(
+        layout, artifacts, initial_state_ref=str(initial_state) if initial_state else None
+    )
+
+    metadata: dict[str, object] = {
+        # Parsed usage goes first so the explicit keys below (notably ``duration_ms``) win.
+        **usage,
+        # Canonical CapturedAgentAttempt fields
+        "agent_runtime": runtime_name,
+        "agent_model": agent_model,
+        "agent_runtime_version": None,
+        "repo_revision": repo_revision,
+        "run_id": run_id,
+        "exit_code": exit_code,
+        "duration_ms": duration,
+        # SDK / orchestration extensions
+        "model_id": agent_model,
+        "target_name": agent_model,
+        "attempt_id": f"{task.id}:{runtime_name}",
+        "agent_ok": agent_ok,
+        "agent_log_dir": str(layout.agent_log_dir),
+        "workspace_dir": str(layout.workspace_dir),
+        "state_dir": str(layout.state_dir),
+        "run_dir": str(layout.run_dir),
+        "instruction_path": task.metadata.get("instruction_path"),
+        "final_answer_extracted": artifacts.final_answer.extracted,
+        "final_answer_source": artifacts.final_answer.source,
+        "raw_log_paths": raw_log_paths,
+        "atif_trajectory_path": str(artifacts.atif_trajectory_path) if artifacts.atif_trajectory_path else None,
+    }
+
+    status = resolve_attempt_status(agent_ok)
+    fallback_text = "" if agent_ok else "(agent phase failed)"
+    output = AgentOutput(text=output_text or log_text.strip() or fallback_text)
+
+    return AgentEvalAttempt(
+        id=f"{task.id}:{runtime_name}",
+        task_id=task.id,
+        status=status,
+        output=output,
+        evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None,
+        metadata=metadata,
+    )
+
+
+def to_captured_agent_attempt(task: AgentEvalTask, attempt: AgentEvalAttempt) -> CapturedAgentAttempt:
+    """Project an SDK attempt onto the portable ``CapturedAgentAttempt`` schema."""
+    meta = attempt.metadata
+    instruction_path, trace_path = meta.get("instruction_path"), meta.get("atif_trajectory_path")
+    answer_source, runtime_version = meta.get("final_answer_source"), meta.get("agent_runtime_version")
+    revision, run_id = meta.get("repo_revision"), meta.get("run_id")
+    exit_code, duration = meta.get("exit_code"), meta.get("duration_ms")
+
+    return CapturedAgentAttempt(
+        task_id=attempt.task_id,
+        input=AgentAttemptInput(
+            instruction_text=task.intent,
+            instruction_path=str(instruction_path) if instruction_path else None,
+        ),
+        output=AgentAttemptOutput(
+            final_text=attempt.output.text if attempt.output is not None else "",
+            final_answer_extracted=bool(meta.get("final_answer_extracted")),
+            final_answer_source=None if answer_source is None else str(answer_source),
+            raw_log_paths=list(meta.get("raw_log_paths") or []),
+        ),
+        metadata=AgentAttemptMetadata(
+            agent_runtime=str(meta.get("agent_runtime", "unknown")),
+            agent_model=str(meta.get("agent_model", "unknown")),
+            agent_runtime_version=None if runtime_version is None else str(runtime_version),
+            repo_revision=None if revision is None else str(revision),
+            run_id=None if run_id is None else str(run_id),
+            exit_code=int(exit_code) if isinstance(exit_code, int) else None,
+            duration_ms=int(duration) if isinstance(duration, int | float) else None,
+        ),
+        trace=AgentAttemptTrace(atif_path=str(trace_path)) if trace_path else None,
+    )
+
+
+def _evidence_descriptors(
+    layout: AgenticRunLayout,
+    artifacts: AgentArtifacts,
+    *,
+    initial_state_ref: str | None = None,
+) -> dict[str, EvidenceDescriptor]:
+    """Return the SDK's standard evidence descriptors plus the platform ``state`` entry.
+
+    :func:`standard_evidence_descriptors` supplies the doc-standard keys
+    (``initial_state``/``trace``/``logs``/``final_state``/``verifier_logs``).
+    ``state`` is a NeMo-Platform-specific *extension* (not a doc key) pointing
+    at the platform/database state preserved across the agent + verifier phases.
+    """
+    # Platform extension (non-doc key): preserved platform/db state across phases.
+    platform_state = EvidenceDescriptor(
+        kind="filesystem",
+        format="dir",
+        ref=str(layout.state_dir),
+        metadata={"role": "platform_state", "extension": "nemo-platform"},
+    )
+
+    evidence = standard_evidence_descriptors(
+        logs_dir=layout.agent_log_dir,
+        final_state_dir=layout.workspace_dir,
+        trace_path=artifacts.atif_trajectory_path,
+        initial_state_ref=initial_state_ref,
+        verifier_logs_dir=layout.run_dir / "verifier",
+        primary_log="nat_agent.log",
+    )
+    evidence["state"] = platform_state
+    return evidence
+
+
+def _raw_log_paths(agent_log_dir: Path) -> list[str]:
+    """List the names of the regular files directly inside ``agent_log_dir`` in sorted path order."""
+    entries = sorted(agent_log_dir.iterdir()) if agent_log_dir.is_dir() else []
+    return [entry.name for entry in entries if entry.is_file()]
+
+
+def _read_agent_log(agent_log_dir: Path) -> str:
+    """Return the text of ``<agent_log_dir>/nat_agent.log``, or ``""`` when it is not a file."""
+    # errors="replace" substitutes undecodable bytes instead of raising.
+    agent_log = agent_log_dir / "nat_agent.log"
+    return agent_log.read_text(encoding="utf-8", errors="replace") if agent_log.is_file() else ""
+
+
+# --------------------------------------------------------------------------- #
+# Attempt construction from nat_runner result.json
+# --------------------------------------------------------------------------- #
+# Keys copied from result.json["metrics"] into attempt metadata (absent keys become None).
+_METRIC_KEYS = (
+    "prompt_tokens",
+    "completion_tokens",
+    "total_tokens",
+    "cache_creation_tokens",
+    "cache_read_tokens",
+    "n_assistant_messages",
+    "cost_usd",
+    "num_turns",
+    "duration_ms",
+    "token_metrics_status",
+    "token_metrics_note",
+)
+
+
+class ResultDirAttemptSource:
+    """``AgentAttemptSource`` adapting ``nat_runner`` ``result.json`` dirs into attempts.
+
+    Implements the SDK :class:`~nemo_evaluator_sdk.agent_eval.types.AgentAttemptSource`
+    protocol so the generic orchestrator's offline path can rescore captured runs.
+    """
+
+    def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt:
+        return attempt_from_result_dir(source, task=task)  # ``source`` is the directory holding result.json
+
+
+def attempt_from_result_dir(output_dir: str | Path, *, task: AgentEvalTask | None = None) -> AgentEvalAttempt:
+    """Build an attempt from ``<output_dir>/result.json``; raise ``FileNotFoundError`` when it is not a file."""
+    run_dir = Path(output_dir)
+    result_file = run_dir / "result.json"
+    if not result_file.is_file():
+        raise FileNotFoundError(f"result.json not found in {run_dir}")
+    # The parsed JSON is forwarded as-is together with the directory it came from.
+    return attempt_from_result(json.loads(result_file.read_text(encoding="utf-8")), output_dir=run_dir, task=task)
+
+
+def attempt_from_result(
+    result: dict[str, Any],
+    *,
+    output_dir: str | Path | None = None,
+    task: AgentEvalTask | None = None,
+) -> AgentEvalAttempt:
+    """Project a ``result.json`` dict onto :class:`AgentEvalAttempt`.
+
+    ``status`` reflects only the ``agent`` phase outcome (``ok`` and ``skipped``
+    count as a usable response). The verifier's pass/fail is recorded as a
+    *measurement* in metadata (``reward``/``passed``) so scoring metrics — not
+    the runtime — remain the source of truth.
+    """
+    task_id = str(result.get("task") or (task.id if task is not None else "unknown"))
+    runtime_label = str(result.get("agent_backend") or "unknown")
+    run_dir = Path(output_dir) if output_dir is not None else Path(str(result.get("output_dir") or "."))
+    layout = _layout_from_result_dir(run_dir)
+
+    agent_phase = str(result.get("agent") or "")
+    agent_ok = agent_phase in {"ok", "skipped"}
+    status = resolve_attempt_status(agent_ok)
+
+    answer_text, final_extracted, final_source = _resolve_output_text(layout)
+    output = AgentOutput(text=answer_text or ("" if agent_ok else "(agent phase failed)"))
+
+    artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir)
+    descriptors = _evidence_descriptors(layout, artifacts)
+
+    metrics = dict(result.get("metrics") or {})
+    provenance = result.get("provenance")
+    metadata: dict[str, Any] = {
+        # Canonical CapturedAgentAttempt-style provenance fields.
+        "agent_runtime": runtime_label,
+        "agent_model": result.get("agent_model"),
+        "run_id": (provenance or {}).get("run_id"),
+        "exit_code": 0 if agent_ok else 1,
+        "duration_ms": metrics.get("duration_ms"),
+        # Phase outcomes from result.json.
+        "agent_ok": agent_ok,
+        "build_status": result.get("build"),
+        "agent_status": result.get("agent"),
+        "verify_status": result.get("verify"),
+        # Measurements (verifier reward is a measurement, not attempt status).
+        "passed": result.get("passed"),
+        "reward": result.get("reward"),
+        "runtime_sec": result.get("runtime_sec"),
+        "verifier_scores": result.get("verifier_scores"),
+        # Provenance + candidate identity.
+        "provenance": provenance,
+        "candidate_id": result.get("candidate_id"),
+        "candidate_params": result.get("candidate_params"),
+        "image": result.get("image"),
+        "output_dir": str(run_dir),
+        # Artifact discovery helpers.
+        "agent_log_dir": str(layout.agent_log_dir),
+        "workspace_dir": str(layout.workspace_dir),
+        "state_dir": str(layout.state_dir),
+        "final_answer_extracted": final_extracted,
+        "final_answer_source": final_source,
+        # Token/cost measurements; keys absent from result.json["metrics"] become None.
+        **{key: metrics.get(key) for key in _METRIC_KEYS},
+    }
+
+    return AgentEvalAttempt(
+        id=f"{task_id}:{runtime_label}",
+        task_id=task_id,
+        status=status,
+        output=output,
+        evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None,
+        metadata=metadata,
+    )
+
+
+def _layout_from_result_dir(output_dir: Path) -> AgenticRunLayout:
+    """Map a result directory onto its fixed ``agent``/``workspace``/``state`` sub-paths (nothing is created)."""
+    return AgenticRunLayout(
+        run_dir=output_dir,
+        agent_log_dir=output_dir / "agent",
+        workspace_dir=output_dir / "workspace",
+        state_dir=output_dir / "state",
+        instruction_path=output_dir / "agent" / "instruction.md",
+    )
+
+
+def _resolve_output_text(layout: AgenticRunLayout) -> tuple[str, bool, str | None]:
+    """Return ``(text, extracted, source)``: the extracted final answer, else the stripped agent log."""
+    if not layout.agent_log_dir.is_dir():
+        return "", False, None
+    answer = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir).final_answer
+    if answer.extracted and answer.text:
+        return answer.text, True, answer.source
+    # No extracted answer: fall back to the raw ``nat_agent.log`` text, which the
+    # shared reader returns as ``""`` when the log file is absent.
+    return _read_agent_log(layout.agent_log_dir).strip(), False, None
+
+
+# --------------------------------------------------------------------------- #
+# Live VERIFY phase through the SDK environment boundary
+# --------------------------------------------------------------------------- #
+def verifier_log_dir(layout: AgenticRunLayout) -> Path:  # verifier logs live in <run_dir>/verifier
+    return layout.run_dir / "verifier"
+
+
+def build_verify_run_spec(
+    task_dir: Path,
+    layout: AgenticRunLayout,
+    *,
+    nmp_base_url: str,
+    agent_backend: str,
+    agent_model: str,
+    smoke_workspace: str | None = None,
+    timeout_sec: int | None = None,
+    extra_args: list[str] | None = None,
+) -> EnvRunSpec | None:
+    """Build the pytest verifier ``EnvRunSpec``, mirroring ``nat_runner.run_verify_phase``.
+
+    Returns ``None`` when the task has no ``tests/test_outputs.py`` (nothing to
+    verify); otherwise the verifier log dir and workspace dir are created first.
+    """
+    tests_dir = task_dir / "tests"
+    if not (tests_dir / "test_outputs.py").exists():
+        return None
+
+    log_dir = verifier_log_dir(layout)
+    log_dir.mkdir(parents=True, exist_ok=True)  # mounted below at /logs/verifier
+    layout.workspace_dir.mkdir(parents=True, exist_ok=True)  # mounted below at /app/workspace
+
+    smoke_seed_cmd = ""
+    smoke_cleanup_cmd = ""
+    if smoke_workspace:  # smoke mode: create the workspace before pytest, delete it after (both ``|| true``)
+        smoke_seed_cmd = textwrap.dedent("""\
+            /app/.venv/bin/nemo workspaces create "${SMOKE_WORKSPACE}" \
+              --description "Seeded by agentic runtime smoke mode" >/dev/null 2>&1 || true
+        """)
+        smoke_cleanup_cmd = textwrap.dedent("""\
+            /app/.venv/bin/nemo workspaces delete "${SMOKE_WORKSPACE}" >/dev/null 2>&1 || true
+        """)
+
+    verify_cmd = [
+        "bash",
+        "-c",  # script: run pytest, tee its output, write 1/0 to reward.txt, exit with pytest's status
+        textwrap.dedent(f"""\
+            export PYTHONPATH="/app/tests/agentic-use/shared:/app/packages/nemo_evaluator_sdk/src:${{PYTHONPATH}}"
+            export NAT_AGENT=1
+            {smoke_seed_cmd}
+            /app/.venv/bin/python -m pytest /tests/test_outputs.py -rA -v 2>&1 | tee /logs/verifier/test-stdout.txt
+            EXIT=${{PIPESTATUS[0]}}
+            {smoke_cleanup_cmd}
+            if [ $EXIT -eq 0 ]; then echo 1; else echo 0; fi > /logs/verifier/reward.txt
+            exit $EXIT
+        """),
+    ]
+
+    env: dict[str, str] = {
+        "NMP_BASE_URL": nmp_base_url,
+        "NAT_AGENT": "1",
+        "NAT_AGENT_BACKEND": agent_backend,
+        "NAT_AGENT_MODEL": agent_model,
+        "AGENTIC_USE_TASK_DIR": "/task",
+        "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace",
+        "SMOKE_WORKSPACE": smoke_workspace or "",  # expanded by the smoke seed/cleanup commands
+        "DATABASE_DIALECT": "sqlite",
+        "DATABASE_PATH": "/data/nmp-platform.db",
+        "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG,
+        "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH,
+    }
+    if DOCKER_SOCKET_HOST_PATH.exists():
+        env["DOCKER_HOST"] = f"unix://{DOCKER_SOCKET_CONTAINER_PATH}"
+
+    mounts: list[tuple[str, str]] = [  # (host source, container target) pairs
+        (str(tests_dir), "/tests"),
+        (str(task_dir), "/task"),
+        (str(layout.workspace_dir), "/app/workspace"),
+        (str(SHARED_DIR), "/app/tests/agentic-use/shared:ro"),
+        (str(EVALUATOR_SDK_SRC), "/app/packages/nemo_evaluator_sdk/src:ro"),
+        (str(layout.agent_log_dir), "/logs/agent"),
+        (str(log_dir), "/logs/verifier"),
+        # Persist platform/db state across AGENT and VERIFY containers.
+        (str(layout.state_dir), "/data"),
+    ]
+    if DOCKER_SOCKET_HOST_PATH.exists():
+        mounts.append((str(DOCKER_SOCKET_HOST_PATH), DOCKER_SOCKET_CONTAINER_PATH))
+
+    return EnvRunSpec(
+        command=verify_cmd,
+        env=env,
+        mounts=mounts,
+        timeout=timeout_sec,
+        extra_args=list(extra_args or []),
+    )
+
+
+async def run_verify(
+    handle: AgentEnvironmentHandle,
+    spec: EnvRunSpec,
+    layout: AgenticRunLayout,
+) -> VerifierOutcome:
+    """Run ``spec`` through ``handle.run_verifier`` and collect the outcome.
+
+    The run's ``ok``/``exit_code`` and the layout's verifier log dir are passed
+    to the SDK's ``collect_verifier_outcome``.
+    """
+    run = await handle.run_verifier(spec)
+    return collect_verifier_outcome(ok=run.ok, exit_code=run.exit_code, log_dir=verifier_log_dir(layout))
+
+
+async def maybe_run_verify(
+    handle: AgentEnvironmentHandle,
+    *,
+    enabled: bool,
+    task_dir: Path,
+    layout: AgenticRunLayout,
+    nmp_base_url: str,
+    agent_backend: str,
+    agent_model: str,
+    smoke_workspace: str | None = None,
+    timeout_sec: int | None = None,
+    extra_args: list[str] | None = None,
+) -> VerifierOutcome:
+    """Run the verifier through ``handle``; return a skipped outcome if disabled or the task has no verifier."""
+    if enabled:
+        # The spec is only built when verification is enabled; ``None`` means nothing to verify.
+        spec = build_verify_run_spec(
+            task_dir,
+            layout,
+            nmp_base_url=nmp_base_url,
+            agent_backend=agent_backend,
+            agent_model=agent_model,
+            smoke_workspace=smoke_workspace,
+            timeout_sec=timeout_sec,
+            extra_args=extra_args,
+        )
+        if spec is not None:
+            return await run_verify(handle, spec, layout)
+    return skipped_outcome()
+
+
+# --------------------------------------------------------------------------- #
+# Agentic-use task loader
+# --------------------------------------------------------------------------- #
+def load_task_toml(task_dir: Path) -> dict[str, object]:
+    """Best-effort load of ``<task_dir>/task.toml``: a missing or unparsable file yields ``{}``."""
+    toml_path = task_dir / "task.toml"
+    if not toml_path.exists():
+        return {}
+    try:
+        parsed = tomllib.loads(toml_path.read_bytes().decode())
+    except Exception:
+        return {}
+    return parsed if isinstance(parsed, dict) else {}
+
+
+def task_agent_timeout_sec(task_dir: Path) -> int | None:
+    """Return ``[agent].timeout_sec`` from ``task.toml`` as whole seconds, or ``None`` if absent/invalid."""
+    agent = load_task_toml(task_dir).get("agent")
+    timeout_value = agent.get("timeout_sec") if isinstance(agent, dict) else None
+    # bool is an int subclass, so TOML ``true`` must be rejected explicitly; inf/nan
+    # and sub-second floats would crash ``int()`` or truncate to a useless 0.
+    if isinstance(timeout_value, bool) or not isinstance(timeout_value, (int, float)):
+        return None
+    return int(timeout_value) if 1 <= timeout_value < float("inf") else None
+
+
+def agentic_task_from_dir(
+    task_dir: str | Path,
+    *,
+    tasks_root: Path | None = None,
+    metrics: list[Metric] | None = None,
+) -> AgentEvalTask:
+    """Build an :class:`AgentEvalTask` from an agentic-use task directory.
+
+    A relative ``task_dir`` is resolved against ``tasks_root`` (default
+    ``AGENTIC_USE_DIR``). ``inputs`` carries only the agent-facing
+    ``instruction``; materialization details such as ``task_dir`` go in
+    ``metadata``. ``metrics`` defaults to a single
+    :class:`AgentPhaseSuccessMetric`. Raises ``FileNotFoundError`` when the
+    directory has no ``instruction.md``.
+    """
+    base_dir = Path(tasks_root or AGENTIC_USE_DIR)
+    resolved = Path(task_dir)
+    if not resolved.is_absolute():
+        resolved = (base_dir / resolved).resolve()
+
+    instruction_file = resolved / "instruction.md"
+    if not instruction_file.exists():
+        raise FileNotFoundError(f"instruction.md not found in {resolved}")
+    instruction_text = instruction_file.read_text(encoding="utf-8")
+
+    # Runtime materialization details stay in metadata, out of the agent-facing inputs.
+    task_metadata = {
+        "benchmark": "agentic-use",
+        "task_toml": load_task_toml(resolved),
+        "instruction_path": str(instruction_file),
+        "task_dir": str(resolved),
+    }
+
+    return AgentEvalTask(
+        id=resolved.name,
+        intent=instruction_text,
+        inputs={"instruction": instruction_text},
+        metrics=[AgentPhaseSuccessMetric()] if metrics is None else metrics,
+        metadata=task_metadata,
+    )
diff --git a/tests/agentic-use/runtimes/shared/reporting.py b/tests/agentic-use/runtimes/shared/reporting.py
deleted file mode 100644
index 7e78de3972..0000000000
--- a/tests/agentic-use/runtimes/shared/reporting.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Compatibility shim — gating was promoted to the Evaluator SDK.
-
-Import from ``nemo_evaluator_sdk.agent_eval.gating`` directly; this module
-re-exports the same symbols so existing adapter imports keep working.
-"""
-
-from __future__ import annotations
-
-from nemo_evaluator_sdk.agent_eval.gating import (
-    DEFAULT_REWARD_OUTPUTS,
-    GateCheck,
-    GateReport,
-    GateThresholds,
-    evaluate_gate,
-    load_baseline_summary,
-    run_gate_checks,
-    summarize_run,
-    write_gate_report,
-)
-
-__all__ = [
-    "DEFAULT_REWARD_OUTPUTS",
-    "GateCheck",
-    "GateReport",
-    "GateThresholds",
-    "evaluate_gate",
-    "load_baseline_summary",
-    "run_gate_checks",
-    "summarize_run",
-    "write_gate_report",
-]
diff --git a/tests/agentic-use/runtimes/shared/result_adapter.py b/tests/agentic-use/runtimes/shared/result_adapter.py
deleted file mode 100644
index bb0d3fe567..0000000000
--- a/tests/agentic-use/runtimes/shared/result_adapter.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Adapt ``nat_runner`` ``result.json`` records into ``AgentEvalAttempt`` values.
-
-This bridges the existing ``nat_runner`` output contract (see
-``nat_runner._write_result``) onto the agent-eval SDK so a run that already
-produced ``result.json`` can be imported as an attempt without re-executing the
-agent. Per the design doc, ``result.json`` carries the attempt *status*,
-*measurements* (reward + token/cost), and *provenance*.
-"""
-
-from __future__ import annotations
-
-import json
-from pathlib import Path
-from typing import Any
-
-from evaluator_agent_eval.artifacts import AgentArtifacts
-from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalTask, AgentOutput
-from nemo_evaluator_sdk.values.evidence import CandidateEvidence
-
-from runtimes.shared.artifacts import _evidence_descriptors, resolve_attempt_status  # reuse documented helpers
-from runtimes.shared.layout import AgenticRunLayout
-
-# Token/cost measurement keys carried in result.json["metrics"].
-_METRIC_KEYS = (
-    "prompt_tokens",
-    "completion_tokens",
-    "total_tokens",
-    "cache_creation_tokens",
-    "cache_read_tokens",
-    "n_assistant_messages",
-    "cost_usd",
-    "num_turns",
-    "duration_ms",
-    "token_metrics_status",
-    "token_metrics_note",
-)
-
-
-class ResultDirAttemptSource:
-    """``AgentAttemptSource`` adapting ``nat_runner`` ``result.json`` dirs into attempts.
-
-    Implements the SDK :class:`~nemo_evaluator_sdk.agent_eval.types.AgentAttemptSource`
-    protocol so the generic orchestrator's offline path can rescore captured runs.
-    """
-
-    def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt:
-        return attempt_from_result_dir(source, task=task)
-
-
-def attempt_from_result_dir(output_dir: str | Path, *, task: AgentEvalTask | None = None) -> AgentEvalAttempt:
-    """Load ``<output_dir>/result.json`` and build an attempt from it."""
-    output_dir = Path(output_dir)
-    result_path = output_dir / "result.json"
-    if not result_path.is_file():
-        raise FileNotFoundError(f"result.json not found in {output_dir}")
-    result = json.loads(result_path.read_text(encoding="utf-8"))
-    return attempt_from_result(result, output_dir=output_dir, task=task)
-
-
-def attempt_from_result(
-    result: dict[str, Any],
-    *,
-    output_dir: str | Path | None = None,
-    task: AgentEvalTask | None = None,
-) -> AgentEvalAttempt:
-    """Project a ``result.json`` dict onto :class:`AgentEvalAttempt`.
-
-    The attempt ``status`` reflects whether the agent produced a usable
-    response (``agent`` phase outcome). Pass/fail from the verifier is recorded
-    as a *measurement* in metadata (``reward``/``passed``) so scoring metrics —
-    not the runtime — remain the source of truth.
-    """
-    task_id = str(result.get("task") or (task.id if task is not None else "unknown"))
-    backend = str(result.get("agent_backend") or "unknown")
-    resolved_dir = Path(output_dir) if output_dir is not None else Path(str(result.get("output_dir") or "."))
-    layout = _layout_from_result_dir(resolved_dir)
-
-    agent_phase = str(result.get("agent") or "")
-    agent_ok = agent_phase in {"ok", "skipped"}
-    status = resolve_attempt_status(agent_ok)
-
-    output_text, final_extracted, final_source = _resolve_output_text(layout)
-    if not output_text:
-        output_text = "" if agent_ok else "(agent phase failed)"
-
-    descriptors = _evidence_descriptors(
-        layout, AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir)
-    )
-
-    metrics = dict(result.get("metrics") or {})
-    metadata: dict[str, Any] = {
-        # Canonical CapturedAgentAttempt-style provenance fields.
-        "agent_runtime": backend,
-        "agent_model": result.get("agent_model"),
-        "run_id": (result.get("provenance") or {}).get("run_id"),
-        "exit_code": 0 if agent_ok else 1,
-        "duration_ms": metrics.get("duration_ms"),
-        # Phase outcomes from result.json.
-        "agent_ok": agent_ok,
-        "build_status": result.get("build"),
-        "agent_status": result.get("agent"),
-        "verify_status": result.get("verify"),
-        # Measurements (verifier reward is a measurement, not attempt status).
-        "passed": result.get("passed"),
-        "reward": result.get("reward"),
-        "runtime_sec": result.get("runtime_sec"),
-        "verifier_scores": result.get("verifier_scores"),
-        # Provenance + candidate identity.
-        "provenance": result.get("provenance"),
-        "candidate_id": result.get("candidate_id"),
-        "candidate_params": result.get("candidate_params"),
-        "image": result.get("image"),
-        "output_dir": str(resolved_dir),
-        # Artifact discovery helpers.
-        "agent_log_dir": str(layout.agent_log_dir),
-        "workspace_dir": str(layout.workspace_dir),
-        "state_dir": str(layout.state_dir),
-        "final_answer_extracted": final_extracted,
-        "final_answer_source": final_source,
-    }
-    metadata.update({key: metrics.get(key) for key in _METRIC_KEYS})
-
-    return AgentEvalAttempt(
-        id=f"{task_id}:{backend}",
-        task_id=task_id,
-        status=status,
-        output=AgentOutput(text=output_text),
-        evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None,
-        metadata=metadata,
-    )
-
-
-def _layout_from_result_dir(output_dir: Path) -> AgenticRunLayout:
-    agent_log_dir = output_dir / "agent"
-    return AgenticRunLayout(
-        run_dir=output_dir,
-        agent_log_dir=agent_log_dir,
-        workspace_dir=output_dir / "workspace",
-        state_dir=output_dir / "state",
-        instruction_path=agent_log_dir / "instruction.md",
-    )
-
-
-def _resolve_output_text(layout: AgenticRunLayout) -> tuple[str, bool, str | None]:
-    if not layout.agent_log_dir.is_dir():
-        return "", False, None
-    artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir)
-    if artifacts.final_answer.extracted and artifacts.final_answer.text:
-        return artifacts.final_answer.text, True, artifacts.final_answer.source
-    log_path = layout.agent_log_dir / "nat_agent.log"
-    if log_path.is_file():
-        return log_path.read_text(encoding="utf-8", errors="replace").strip(), False, None
-    return "", False, None
diff --git a/tests/agentic-use/runtimes/shared/task_loader.py b/tests/agentic-use/runtimes/shared/task_loader.py
deleted file mode 100644
index e64a87e99d..0000000000
--- a/tests/agentic-use/runtimes/shared/task_loader.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Map agentic-use task directories to AgentEvalTask values."""
-
-from __future__ import annotations
-
-import tomllib
-from pathlib import Path
-
-from nemo_evaluator_sdk.agent_eval.types import AgentEvalTask
-from nemo_evaluator_sdk.metrics.protocol import Metric
-
-from runtimes.shared.constants import AGENTIC_USE_DIR
-from runtimes.shared.metrics import AgentPhaseSuccessMetric
-
-
-def load_task_toml(task_dir: Path) -> dict[str, object]:
-    task_toml = task_dir / "task.toml"
-    if not task_toml.exists():
-        return {}
-    try:
-        with task_toml.open("rb") as handle:
-            data = tomllib.load(handle)
-    except Exception:
-        return {}
-    return data if isinstance(data, dict) else {}
-
-
-def task_agent_timeout_sec(task_dir: Path) -> int | None:
-    data = load_task_toml(task_dir)
-    agent = data.get("agent")
-    if not isinstance(agent, dict):
-        return None
-    timeout_value = agent.get("timeout_sec")
-    if isinstance(timeout_value, (int, float)) and timeout_value > 0:
-        return int(timeout_value)
-    return None
-
-
-def agentic_task_from_dir(
-    task_dir: str | Path,
-    *,
-    tasks_root: Path | None = None,
-    metrics: list[Metric] | None = None,
-) -> AgentEvalTask:
-    """Build an :class:`AgentEvalTask` from an agentic-use task directory.
-
-    ``inputs`` carries only agent-facing material (``instruction``) per the SDK
-    design doc; runtime materialization details such as ``task_dir`` live in
-    ``metadata`` so they cannot leak into metric scoring rows. Metrics are
-    authored *on the task* (defaulting to :class:`AgentPhaseSuccessMetric`); the
-    orchestrator only appends compatibility metrics, it does not own the set.
-    """
-    root = Path(tasks_root or AGENTIC_USE_DIR)
-    task_path = Path(task_dir)
-    if not task_path.is_absolute():
-        task_path = (root / task_path).resolve()
-
-    instruction_path = task_path / "instruction.md"
-    if not instruction_path.exists():
-        raise FileNotFoundError(f"instruction.md not found in {task_path}")
-
-    instruction = instruction_path.read_text(encoding="utf-8")
-    task_toml = load_task_toml(task_path)
-
-    return AgentEvalTask(
-        id=task_path.name,
-        intent=instruction,
-        inputs={
-            "instruction": instruction,
-        },
-        metrics=metrics if metrics is not None else [AgentPhaseSuccessMetric()],
-        metadata={
-            "benchmark": "agentic-use",
-            "task_toml": task_toml,
-            "instruction_path": str(instruction_path),
-            "task_dir": str(task_path),
-        },
-    )
diff --git a/tests/agentic-use/runtimes/shared/usage.py b/tests/agentic-use/runtimes/shared/usage.py
deleted file mode 100644
index 89053ffb97..0000000000
--- a/tests/agentic-use/runtimes/shared/usage.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Token usage extraction from agent logs.
-
-Reuses the proven implementation from ``nat_runner.py`` until the legacy
-runner delegates here and the duplicate can be removed.
-"""
-
-from __future__ import annotations
-
-from typing import TypedDict
-
-
-class TokenMetrics(TypedDict):
-    prompt_tokens: int | None
-    completion_tokens: int | None
-    total_tokens: int | None
-    cache_creation_tokens: int | None
-    cache_read_tokens: int | None
-    n_assistant_messages: int | None
-    cost_usd: float | None
-    num_turns: int | None
-    duration_ms: float | None
-
-
-def extract_usage_metrics(agent_log: str) -> dict[str, int | float | None]:
-    """Extract token usage metrics from an agent log."""
-    import nat_runner
-
-    metrics = nat_runner._extract_usage_metrics(agent_log)
-    return dict(metrics)
diff --git a/tests/agentic-use/runtimes/shared/verify.py b/tests/agentic-use/runtimes/shared/verify.py
deleted file mode 100644
index f83da8edb3..0000000000
--- a/tests/agentic-use/runtimes/shared/verify.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Live VERIFY phase executed through the environment boundary.
-
-Ports ``nat_runner.run_verify_phase`` onto :meth:`AgentEnvironmentHandle.run_verifier`
-so the task-local ``tests/test_outputs.py`` pytest verifier runs in the *same*
-prepared environment (and against the same persisted workspace/state) as the
-agent phase. The resulting reward is stamped onto the attempt metadata so the
-``VerifierRewardMetric`` compatibility metric scores it through the Evaluator SDK.
-"""
-
-from __future__ import annotations
-
-import textwrap
-from pathlib import Path
-
-from nemo_evaluator_sdk.agent_eval.runtimes.verify import (
-    VerifierOutcome,
-    apply_verify_to_metadata,
-    collect_verifier_outcome,
-    skipped_outcome,
-)
-
-from runtimes.shared.constants import (
-    DOCKER_SOCKET_CONTAINER_PATH,
-    DOCKER_SOCKET_HOST_PATH,
-    EVALUATOR_SDK_SRC,
-    FILES_STORAGE_CONFIG,
-    PLATFORM_CONFIG_PATH,
-    SHARED_DIR,
-)
-from runtimes.shared.environment import AgentEnvironmentHandle, EnvRunSpec
-from runtimes.shared.layout import AgenticRunLayout
-
-__all__ = [
-    "VerifierOutcome",
-    "apply_verify_to_metadata",
-    "build_verify_run_spec",
-    "maybe_run_verify",
-    "run_verify",
-    "verifier_log_dir",
-]
-
-
-def verifier_log_dir(layout: AgenticRunLayout) -> Path:
-    return layout.run_dir / "verifier"
-
-
-def build_verify_run_spec(
-    task_dir: Path,
-    layout: AgenticRunLayout,
-    *,
-    nmp_base_url: str,
-    agent_backend: str,
-    agent_model: str,
-    smoke_workspace: str | None = None,
-    timeout_sec: int | None = None,
-    extra_args: list[str] | None = None,
-) -> EnvRunSpec | None:
-    """Build the verifier ``EnvRunSpec`` mirroring ``nat_runner.run_verify_phase``.
-
-    Returns ``None`` when the task has no ``tests/test_outputs.py`` (nothing to
-    verify), matching the runner's behavior.
-    """
-    tests_dir = task_dir / "tests"
-    if not (tests_dir / "test_outputs.py").exists():
-        return None
-
-    log_dir = verifier_log_dir(layout)
-    log_dir.mkdir(parents=True, exist_ok=True)
-    layout.workspace_dir.mkdir(parents=True, exist_ok=True)
-
-    smoke_seed_cmd = ""
-    smoke_cleanup_cmd = ""
-    if smoke_workspace:
-        smoke_seed_cmd = textwrap.dedent("""\
-            /app/.venv/bin/nemo workspaces create "${SMOKE_WORKSPACE}" \
-              --description "Seeded by agentic runtime smoke mode" >/dev/null 2>&1 || true
-        """)
-        smoke_cleanup_cmd = textwrap.dedent("""\
-            /app/.venv/bin/nemo workspaces delete "${SMOKE_WORKSPACE}" >/dev/null 2>&1 || true
-        """)
-
-    verify_cmd = [
-        "bash",
-        "-c",
-        textwrap.dedent(f"""\
-            export PYTHONPATH="/app/tests/agentic-use/shared:/app/packages/nemo_evaluator_sdk/src:${{PYTHONPATH}}"
-            export NAT_AGENT=1
-            {smoke_seed_cmd}
-            /app/.venv/bin/python -m pytest /tests/test_outputs.py -rA -v 2>&1 | tee /logs/verifier/test-stdout.txt
-            EXIT=${{PIPESTATUS[0]}}
-            {smoke_cleanup_cmd}
-            if [ $EXIT -eq 0 ]; then echo 1; else echo 0; fi > /logs/verifier/reward.txt
-            exit $EXIT
-        """),
-    ]
-
-    env: dict[str, str] = {
-        "NMP_BASE_URL": nmp_base_url,
-        "NAT_AGENT": "1",
-        "NAT_AGENT_BACKEND": agent_backend,
-        "NAT_AGENT_MODEL": agent_model,
-        "AGENTIC_USE_TASK_DIR": "/task",
-        "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace",
-        "SMOKE_WORKSPACE": smoke_workspace or "",
-        "DATABASE_DIALECT": "sqlite",
-        "DATABASE_PATH": "/data/nmp-platform.db",
-        "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG,
-        "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH,
-    }
-    if DOCKER_SOCKET_HOST_PATH.exists():
-        env["DOCKER_HOST"] = f"unix://{DOCKER_SOCKET_CONTAINER_PATH}"
-
-    mounts: list[tuple[str, str]] = [
-        (str(tests_dir), "/tests"),
-        (str(task_dir), "/task"),
-        (str(layout.workspace_dir), "/app/workspace"),
-        (str(SHARED_DIR), "/app/tests/agentic-use/shared:ro"),
-        (str(EVALUATOR_SDK_SRC), "/app/packages/nemo_evaluator_sdk/src:ro"),
-        (str(layout.agent_log_dir), "/logs/agent"),
-        (str(log_dir), "/logs/verifier"),
-        # Persist platform/db state across AGENT and VERIFY containers.
-        (str(layout.state_dir), "/data"),
-    ]
-    if DOCKER_SOCKET_HOST_PATH.exists():
-        mounts.append((str(DOCKER_SOCKET_HOST_PATH), DOCKER_SOCKET_CONTAINER_PATH))
-
-    return EnvRunSpec(
-        command=verify_cmd,
-        env=env,
-        mounts=mounts,
-        timeout=timeout_sec,
-        extra_args=list(extra_args or []),
-    )
-
-
-async def run_verify(
-    handle: AgentEnvironmentHandle,
-    spec: EnvRunSpec,
-    layout: AgenticRunLayout,
-) -> VerifierOutcome:
-    """Execute the verifier through the environment handle and collect reward."""
-    result = await handle.run_verifier(spec)
-    return collect_verifier_outcome(
-        ok=result.ok,
-        exit_code=result.exit_code,
-        log_dir=verifier_log_dir(layout),
-    )
-
-
-async def maybe_run_verify(
-    handle: AgentEnvironmentHandle,
-    *,
-    enabled: bool,
-    task_dir: Path,
-    layout: AgenticRunLayout,
-    nmp_base_url: str,
-    agent_backend: str,
-    agent_model: str,
-    smoke_workspace: str | None = None,
-    timeout_sec: int | None = None,
-    extra_args: list[str] | None = None,
-) -> VerifierOutcome:
-    """Run the verifier through ``handle`` when enabled and a verifier exists."""
-    if not enabled:
-        return skipped_outcome()
-    spec = build_verify_run_spec(
-        task_dir,
-        layout,
-        nmp_base_url=nmp_base_url,
-        agent_backend=agent_backend,
-        agent_model=agent_model,
-        smoke_workspace=smoke_workspace,
-        timeout_sec=timeout_sec,
-        extra_args=extra_args,
-    )
-    if spec is None:
-        return skipped_outcome()
-    return await run_verify(handle, spec, layout)
diff --git a/tests/agentic-use/runtimes/workflow/runtime.py b/tests/agentic-use/runtimes/workflow/runtime.py
index 1d8c09fecd..55688b3d24 100644
--- a/tests/agentic-use/runtimes/workflow/runtime.py
+++ b/tests/agentic-use/runtimes/workflow/runtime.py
@@ -8,20 +8,21 @@
 from collections.abc import Sequence
 from pathlib import Path
 
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import AgentEnvironmentProvider, EnvRunSpec
+from nemo_evaluator_sdk.agent_eval.runtimes.verify import apply_verify_to_metadata
 from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalRunConfig, AgentEvalTask
 
-from runtimes.shared.artifacts import build_agent_eval_attempt
 from runtimes.shared.config import WorkflowRuntimeConfig
 from runtimes.shared.constants import INSTRUCTION_CONTAINER_PATH, WORKFLOW_CONTAINER_PATH
-from runtimes.shared.container_env import base_container_env
-from runtimes.shared.environment import (
-    AgentEnvironmentProvider,
+from runtimes.shared.platform import (
+    AgenticRunLayout,
     DockerEnvironmentProvider,
-    EnvRunSpec,
+    base_container_env,
+    build_agent_eval_attempt,
+    maybe_run_verify,
+    resolve_run_layout,
+    task_agent_timeout_sec,
 )
-from runtimes.shared.layout import AgenticRunLayout, resolve_run_layout
-from runtimes.shared.task_loader import task_agent_timeout_sec
-from runtimes.shared.verify import apply_verify_to_metadata, maybe_run_verify
 from runtimes.workflow.command import build_workflow_agent_cmd
 from runtimes.workflow.prep import prepare_workflow_for_runtime
 
diff --git a/tests/agentic-use/tests/test_agentic_runtimes.py b/tests/agentic-use/tests/test_agentic_runtimes.py
index 5a3590071d..1903989705 100644
--- a/tests/agentic-use/tests/test_agentic_runtimes.py
+++ b/tests/agentic-use/tests/test_agentic_runtimes.py
@@ -10,10 +10,9 @@
 
 import pytest
 import yaml
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import EnvCommandResult, EnvRunSpec
 from runtimes.shared.config import AgenticSharedConfig, WorkflowRuntimeConfig
-from runtimes.shared.environment import EnvCommandResult, EnvRunSpec
-from runtimes.shared.layout import resolve_run_layout, task_image_tag
-from runtimes.shared.task_loader import agentic_task_from_dir
+from runtimes.shared.platform import agentic_task_from_dir, resolve_run_layout, task_image_tag
 from runtimes.workflow.command import build_workflow_agent_cmd
 from runtimes.workflow.prep import prepare_workflow_for_runtime
 from runtimes.workflow.runtime import NatWorkflowAttemptRuntime
@@ -122,8 +121,7 @@ def test_runtime_for_backend_rejects_unknown() -> None:
 
 
 def test_build_agent_eval_attempt_metadata_matches_captured_schema(tmp_path: Path) -> None:
-    from runtimes.shared.artifacts import build_agent_eval_attempt, to_captured_agent_attempt
-    from runtimes.shared.layout import AgenticRunLayout
+    from runtimes.shared.platform import AgenticRunLayout, build_agent_eval_attempt, to_captured_agent_attempt
 
     task = agentic_task_from_dir(WORKSPACE_BASIC, tasks_root=TASKS_DIR)
     layout = AgenticRunLayout(
@@ -183,7 +181,7 @@ async def test_aut_runtime_run_tasks_with_mocked_env(tmp_path: Path) -> None:
 
 
 def test_attempt_from_result_maps_status_and_measurements(tmp_path: Path) -> None:
-    from runtimes.shared.result_adapter import attempt_from_result
+    from runtimes.shared.platform import attempt_from_result
 
     output_dir = tmp_path / "20260101T000000Z-demo"
     (output_dir / "agent").mkdir(parents=True)
@@ -218,7 +216,7 @@ def test_attempt_from_result_maps_status_and_measurements(tmp_path: Path) -> Non
 
 
 def test_attempt_from_result_marks_unsuccessful_agent_partial(tmp_path: Path) -> None:
-    from runtimes.shared.result_adapter import attempt_from_result
+    from runtimes.shared.platform import attempt_from_result
 
     output_dir = tmp_path / "run"
     (output_dir / "agent").mkdir(parents=True)
@@ -271,7 +269,7 @@ async def test_score_captured_attempts_offline(tmp_path: Path) -> None:
 @pytest.mark.asyncio
 async def test_verifier_reward_metric_reads_metadata() -> None:
     from nemo_evaluator_sdk.metrics.protocol import CandidateOutput, DatasetRow, MetricInput
-    from runtimes.shared.metrics import VerifierRewardMetric
+    from runtimes.shared.platform import VerifierRewardMetric
 
     metric = VerifierRewardMetric()
     candidate = CandidateOutput(output_text="x", metadata={"reward": 1})
@@ -320,7 +318,7 @@ def _make_run_result(*, reward: float, total_tokens: int, runtime_sec: float, co
 
 
 def test_summarize_run_aggregates_pass_tokens_runtime_provenance() -> None:
-    from runtimes.shared.reporting import summarize_run
+    from nemo_evaluator_sdk.agent_eval.gating import summarize_run
 
     summary = summarize_run(_make_run_result(reward=1.0, total_tokens=120, runtime_sec=4.5))
 
@@ -333,7 +331,7 @@ def test_summarize_run_aggregates_pass_tokens_runtime_provenance() -> None:
 
 
 def test_evaluate_gate_passes_then_flags_token_regression(tmp_path: Path) -> None:
-    from runtimes.shared.reporting import GateThresholds, evaluate_gate, write_gate_report
+    from nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate, write_gate_report
 
     baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0)
     candidate = _make_run_result(reward=1.0, total_tokens=200, runtime_sec=4.0)
@@ -356,7 +354,7 @@ def test_evaluate_gate_passes_then_flags_token_regression(tmp_path: Path) -> Non
 
 
 def test_evaluate_gate_blocks_cross_commit_comparison() -> None:
-    from runtimes.shared.reporting import GateThresholds, evaluate_gate
+    from nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate
 
     baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="aaa111")
     candidate = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="bbb222")
@@ -378,8 +376,7 @@ def test_evaluate_gate_blocks_cross_commit_comparison() -> None:
 
 
 def test_build_verify_run_spec_shape(tmp_path: Path) -> None:
-    from runtimes.shared.layout import AgenticRunLayout
-    from runtimes.shared.verify import build_verify_run_spec
+    from runtimes.shared.platform import AgenticRunLayout, build_verify_run_spec
 
     layout = AgenticRunLayout(
         run_dir=tmp_path,
@@ -404,8 +401,7 @@ def test_build_verify_run_spec_shape(tmp_path: Path) -> None:
 
 
 def test_build_verify_run_spec_returns_none_without_tests(tmp_path: Path) -> None:
-    from runtimes.shared.layout import AgenticRunLayout
-    from runtimes.shared.verify import build_verify_run_spec
+    from runtimes.shared.platform import AgenticRunLayout, build_verify_run_spec
 
     task_dir = tmp_path / "no-tests-task"
     task_dir.mkdir()
@@ -422,9 +418,8 @@ def test_build_verify_run_spec_returns_none_without_tests(tmp_path: Path) -> Non
 
 @pytest.mark.asyncio
 async def test_run_verify_reads_reward_file(tmp_path: Path) -> None:
-    from runtimes.shared.environment import EnvCommandResult, EnvRunSpec
-    from runtimes.shared.layout import AgenticRunLayout
-    from runtimes.shared.verify import run_verify
+    from nemo_evaluator_sdk.agent_eval.runtimes.environment import EnvCommandResult, EnvRunSpec
+    from runtimes.shared.platform import AgenticRunLayout, run_verify
 
     layout = AgenticRunLayout(
         run_dir=tmp_path,
@@ -456,7 +451,7 @@ async def close(self) -> None:
 
 @pytest.mark.asyncio
 async def test_workflow_runtime_runs_verify_through_handle(tmp_path: Path) -> None:
-    from runtimes.shared.verify import verifier_log_dir
+    from runtimes.shared.platform import verifier_log_dir
 
     task = agentic_task_from_dir(WORKSPACE_BASIC, tasks_root=TASKS_DIR)
     layout = resolve_run_layout(task, AgenticSharedConfig(jobs_dir=tmp_path))
@@ -493,7 +488,7 @@ async def prepare(self, task: object, config: object = None) -> _Handle:
 
 
 def test_load_environment_spec_prefers_yaml(tmp_path: Path) -> None:
-    from runtimes.shared.environment_spec import load_environment_spec
+    from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec
 
     (tmp_path / "environment.yaml").write_text(
         "environment:\n"
@@ -516,7 +511,7 @@ def test_load_environment_spec_prefers_yaml(tmp_path: Path) -> None:
 
 
 def test_load_environment_spec_falls_back_to_dockerfile(tmp_path: Path) -> None:
-    from runtimes.shared.environment_spec import load_environment_spec
+    from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec
 
     env_dir = tmp_path / "environment"
     env_dir.mkdir()
@@ -528,14 +523,14 @@ def test_load_environment_spec_falls_back_to_dockerfile(tmp_path: Path) -> None:
 
 
 def test_load_environment_spec_missing_raises(tmp_path: Path) -> None:
-    from runtimes.shared.environment_spec import load_environment_spec
+    from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec
 
     with pytest.raises(FileNotFoundError):
         load_environment_spec(tmp_path)
 
 
 def test_plan_task_build_dockerfile_escape_hatch(tmp_path: Path) -> None:
-    from runtimes.shared.environment_spec import plan_task_build
+    from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import plan_task_build
 
     env_dir = tmp_path / "environment"
     env_dir.mkdir()
@@ -549,7 +544,7 @@ def test_plan_task_build_dockerfile_escape_hatch(tmp_path: Path) -> None:
 
 
 def test_plan_task_build_generates_derived_dockerfile(tmp_path: Path) -> None:
-    from runtimes.shared.environment_spec import plan_task_build
+    from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import plan_task_build
 
     (tmp_path / "environment.yaml").write_text(
         "environment:\n  image: base:1\n  dependencies:\n    python: [pytest]\n  setup: [seed-providers]\n",