diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py
index b4d9805374..963d869bb5 100644
--- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py
@@ -5,9 +5,11 @@
 
 from nemo_evaluator_sdk.agent_eval.dashboard import render_dashboard, write_dashboard
 from nemo_evaluator_sdk.agent_eval.evaluator import AgentEvaluator
+from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig
 from nemo_evaluator_sdk.agent_eval.persistence import persist_run
 from nemo_evaluator_sdk.agent_eval.types import (
     AgentAttemptRuntime,
+    AgentAttemptSource,
     AgentEvalAttempt,
     AgentEvalDiagnostic,
     AgentEvalMetricOutputCoverage,
@@ -24,9 +26,12 @@
 from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor, LocalFilesystemEvidence
 
 __all__ = [
+    "AgentAttemptRuntime",
+    "AgentAttemptSource",
     "AgentEvalAttempt",
     "AgentEvalDiagnostic",
     "AgentEvalMetricOutputCoverage",
+    "AgentEvalOrchestrator",
     "AgentEvalRunConfig",
     "AgentEvalRunResult",
     "AgentEvalSummary",
@@ -34,11 +39,11 @@
     "AgentEvalTask",
     "AgentEvalTaskResult",
     "AgentEvaluator",
-    "AgentAttemptRuntime",
     "AgentOutput",
     "CandidateEvidence",
     "EvidenceDescriptor",
     "LocalFilesystemEvidence",
+    "OrchestratorConfig",
     "SemanticView",
     "ViewSignal",
     "persist_run",
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/attempts.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/attempts.py
new file mode 100644
index 0000000000..dd85fcea5d
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/attempts.py
@@ -0,0 +1,90 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Helpers for shaping :class:`AgentEvalAttempt` values from runtime artifacts.
+
+These are the runtime-agnostic pieces: the *scorable* status mapping and the
+standard evidence-key builder. Platform-specific attempt construction (reading
+proprietary artifact layouts, extra evidence keys) composes these in the adapter.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttemptStatus
+from nemo_evaluator_sdk.values.evidence import EvidenceDescriptor
+
+
+def resolve_attempt_status(agent_ok: bool) -> AgentEvalAttemptStatus:
+    """Translate the agent-phase result into an attempt status that is still scored.
+
+    ``"failed"`` is deliberately never returned here: the evaluator leaves
+    ``status=="failed"`` attempts out of scoring, and a runtime reports a genuine
+    attempt-*production* failure by raising instead. An agent that ran but did
+    not succeed is reported as ``"partial"`` so it is still scored (for example
+    as ``0``) and keeps counting toward pass-rate gating.
+    """
+    if agent_ok:
+        return "completed"
+    return "partial"
+
+
+def standard_evidence_descriptors(
+    *,
+    logs_dir: str | Path,
+    final_state_dir: str | Path,
+    trace_path: str | Path | None = None,
+    initial_state_ref: str | None = None,
+    verifier_logs_dir: str | Path | None = None,
+    primary_log: str | None = None,
+) -> dict[str, EvidenceDescriptor]:
+    """Build the documented evidence map for an agent-eval attempt.
+
+    Standard keys: ``initial_state`` (task input filesystem, when staged),
+    ``trace`` (``atif`` when the file name starts with ``atif``, else ``json``),
+    ``logs`` (agent log dir), ``final_state`` (workspace), and ``verifier_logs``
+    (only when an existing directory). Callers may add their own extension keys.
+    """
+    descriptors: dict[str, EvidenceDescriptor] = {}
+
+    if initial_state_ref:
+        descriptors["initial_state"] = EvidenceDescriptor(
+            kind="filesystem",
+            format="dir",
+            ref=str(initial_state_ref),
+            metadata={"role": "initial_state"},
+        )
+
+    if trace_path is not None:
+        trace_name = Path(trace_path).name
+        descriptors["trace"] = EvidenceDescriptor(
+            kind="trace",
+            format="atif" if trace_name.startswith("atif") else "json",
+            ref=str(trace_path),
+        )
+
+    descriptors["logs"] = EvidenceDescriptor(
+        kind="logs",
+        format="dir",
+        ref=str(logs_dir),
+        metadata={"primary_log": primary_log} if primary_log else {},
+    )
+
+    descriptors["final_state"] = EvidenceDescriptor(
+        kind="filesystem",
+        format="dir",
+        ref=str(final_state_dir),
+        metadata={"role": "final_state"},
+    )
+
+    # The descriptor advertises format="dir", so a plain file at this path is not accepted.
+    if verifier_logs_dir is not None and Path(verifier_logs_dir).is_dir():
+        descriptors["verifier_logs"] = EvidenceDescriptor(
+            kind="logs",
+            format="dir",
+            ref=str(verifier_logs_dir),
+            metadata={"role": "verifier"},
+        )
+
+    return descriptors
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/common_metrics.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/common_metrics.py
new file mode 100644
index 0000000000..8cece6a5ad
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/common_metrics.py
@@ -0,0 +1,79 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Reusable agent-eval metrics.
+
+``AgentPhaseSuccessMetric`` reads the agent-phase outcome stamped on attempt
+metadata. ``EvidencePresenceMetric`` is a genuine *metric-over-evidence*: it
+scores by inspecting ``candidate.evidence`` (a filesystem evidence handle)
+rather than a reward written into metadata — the value proposition of scoring
+over evidence instead of trusting a verifier's stamped reward.
+"""
+
+from __future__ import annotations
+
+from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult
+
+
+class AgentPhaseSuccessMetric:
+    """Binary score for the agent phase: 1.0 on success, 0.0 otherwise.
+
+    Override ``metric_type`` to namespace ``type``; the output name is always
+    ``agent_phase_success`` (which gating reads as a reward signal).
+    """
+
+    metric_type: str = "agent_phase_success"
+
+    @property
+    def type(self) -> str:
+        return self.metric_type
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        spec = MetricOutputSpec.continuous_score("agent_phase_success")
+        return [spec]
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        value = 1.0 if input.candidate.metadata.get("agent_ok") else 0.0
+        return MetricResult(outputs=[MetricOutput(name="agent_phase_success", value=value)])
+
+
+class EvidencePresenceMetric:
+    """Metric-over-evidence: 1.0 when a named filesystem evidence entry exists.
+
+    The score comes from ``candidate.evidence`` itself rather than from a reward
+    a verifier stamped into metadata. With ``require_non_empty`` (the default)
+    ``iter_paths(recursive=True)`` on the entry must also return a truthy result.
+    """
+
+    def __init__(
+        self,
+        *,
+        evidence_name: str = "final_state",
+        output_name: str = "evidence_present",
+        require_non_empty: bool = True,
+    ) -> None:
+        self._evidence_name, self._output_name = evidence_name, output_name
+        self._require_non_empty = require_non_empty
+
+    @property
+    def type(self) -> str:
+        return "evidence_presence"
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        spec = MetricOutputSpec.continuous_score(self._output_name)
+        return [spec]
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        name = self._evidence_name
+        evidence = input.candidate.evidence
+        present = False
+        if evidence is not None and evidence.get(name) is not None:
+            try:
+                handle = await evidence.filesystem(name)
+                if await handle.exists():
+                    # Emptiness is only probed when the caller asked for it.
+                    present = not self._require_non_empty or bool(await handle.iter_paths(recursive=True))
+            except (KeyError, ValueError):
+                # KeyError/ValueError raised while reading the evidence score as absent.
+                present = False
+        return MetricResult(outputs=[MetricOutput(name=self._output_name, value=1.0 if present else 0.0)])
diff --git a/tests/agentic-use/runtimes/shared/reporting.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/gating.py
similarity index 85%
rename from tests/agentic-use/runtimes/shared/reporting.py
rename to packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/gating.py
index 34b78fbcb7..f6a7d04cfb 100644
--- a/tests/agentic-use/runtimes/shared/reporting.py
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/gating.py
@@ -1,19 +1,21 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 """Deterministic gating + provenance comparison over an agent-eval run bundle.
 
-This closes the design-doc B4 "CI/reporting" gap. Persistence of
-``tasks.jsonl``/``attempts.jsonl``/``results.jsonl``/``summary.json``/``report.html``
-is already handled by the SDK (``agent_eval.persistence.persist_run`` /
-``write_dashboard``); this module adds the missing piece: a candidate-vs-baseline
-gate (pass-rate, token/cost, runtime tie-breaker) plus deterministic provenance
-checks.
-
-The semantics intentionally mirror ``passrate_token_policy_gate.py`` so a summary
-produced here is interchangeable with the legacy gate's baseline summary. The
-difference is the input: this operates on a typed :class:`AgentEvalRunResult`
-(metric scores + attempt metadata) instead of scanning ``result.json`` files.
+Persistence of the run bundle (``tasks.jsonl``/``attempts.jsonl``/
+``results.jsonl``/``summary.json``/``report.html``) is handled by
+``agent_eval.persistence`` / ``write_dashboard``. This module adds the
+candidate-vs-baseline gate (pass-rate, token/cost, runtime tie-breaker) plus
+deterministic provenance checks.
+
+Relationship to :class:`~nemo_evaluator_sdk.agent_eval.types.AgentEvalSummary`:
+that summary reports the *mean score per metric output* over a run. The gate's
+``pass_rate`` here is a different, intentional view — a per-task pass/fail count
+against a reward threshold — so it is computed separately. Token/runtime/
+provenance aggregation is delegated to
+:class:`~nemo_evaluator_sdk.agent_eval.measurements.AttemptMeasurements` so the
+measurement keys are read in exactly one place.
 """
 
 from __future__ import annotations
@@ -23,13 +25,13 @@
 from pathlib import Path
 from typing import Any
 
-from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalRunResult, AgentEvalTaskResult
+from nemo_evaluator_sdk.agent_eval.measurements import AttemptMeasurements
+from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunResult, AgentEvalTaskResult
 
 # Metric outputs, in priority order, that represent a task's pass/reward signal.
 DEFAULT_REWARD_OUTPUTS: tuple[str, ...] = ("verifier_reward", "agent_phase_success")
 
-# Provenance fields collapsed into a single run-level summary (matches the
-# legacy gate so baselines are interchangeable).
+# Provenance fields collapsed into a single run-level summary.
 _PROVENANCE_FIELDS: tuple[str, ...] = (
     "commit_sha",
     "commit_short",
@@ -91,7 +93,7 @@ def evaluate_gate(
 
 
 def write_gate_report(report: GateReport, output_dir: str | Path, *, filename: str = "gate.json") -> Path:
-    """Persist the gate report alongside the SDK run bundle."""
+    """Persist the gate report alongside the run bundle."""
     path = Path(output_dir)
     path.mkdir(parents=True, exist_ok=True)
     gate_path = path / filename
@@ -115,8 +117,13 @@ def summarize_run(
     *,
     reward_outputs: tuple[str, ...] = DEFAULT_REWARD_OUTPUTS,
 ) -> dict[str, Any]:
-    """Aggregate pass-rate, token, runtime, and provenance for one run."""
-    attempts_by_task: dict[str, AgentEvalAttempt] = {attempt.task_id: attempt for attempt in result.attempts}
+    """Aggregate pass-rate, token, runtime, and provenance for one run.
+
+    Token/runtime/provenance are read via :class:`AttemptMeasurements`; the
+    reward used for pass-rate prefers a scored metric output (``reward_outputs``)
+    and falls back to the attempt's recorded reward.
+    """
+    attempts_by_task = {attempt.task_id: attempt for attempt in result.attempts}
     reward_by_task = _rewards_by_task(result.results, reward_outputs)
     task_ids = sorted({task.id for task in result.tasks} | set(attempts_by_task))
 
@@ -131,29 +138,28 @@ def summarize_run(
 
     for task_id in task_ids:
         attempt = attempts_by_task.get(task_id)
-        metadata = attempt.metadata if attempt is not None else {}
+        measurements = AttemptMeasurements.from_metadata(attempt.metadata if attempt is not None else {})
 
-        reward_value = _task_reward(task_id, reward_by_task, metadata)
+        reward_value = reward_by_task.get(task_id)
+        if reward_value is None:
+            reward_value = measurements.reward if measurements.reward is not None else 0.0
         if reward_value >= 1.0:
             passed += 1
 
-        total_tokens = metadata.get("total_tokens")
-        if isinstance(total_tokens, int):
-            token_sum += total_tokens
+        if measurements.total_tokens is not None:
+            token_sum += measurements.total_tokens
             token_count += 1
         else:
             token_unavailable.append(task_id)
 
-        runtime_sec = _task_runtime_sec(metadata)
-        if runtime_sec is not None:
-            runtime_sum += runtime_sec
+        if measurements.runtime_sec is not None:
+            runtime_sum += measurements.runtime_sec
             runtime_count += 1
         else:
             runtime_unavailable.append(task_id)
 
-        prov = metadata.get("provenance")
-        if isinstance(prov, dict):
-            provenance_inputs.append(prov)
+        if measurements.provenance:
+            provenance_inputs.append(measurements.provenance)
 
     total = len(task_ids)
     return {
@@ -404,28 +410,6 @@ def _numeric_output(task_result: AgentEvalTaskResult, name: str) -> float | None
     return None
 
 
-def _task_reward(task_id: str, reward_by_task: dict[str, float], metadata: dict[str, Any]) -> float:
-    if task_id in reward_by_task:
-        return reward_by_task[task_id]
-    reward = metadata.get("reward")
-    if reward is not None:
-        try:
-            return float(reward)
-        except (TypeError, ValueError):
-            return 0.0
-    return 1.0 if metadata.get("passed") is True else 0.0
-
-
-def _task_runtime_sec(metadata: dict[str, Any]) -> float | None:
-    runtime_sec = metadata.get("runtime_sec")
-    if isinstance(runtime_sec, int | float):
-        return float(runtime_sec)
-    duration_ms = metadata.get("duration_ms")
-    if isinstance(duration_ms, int | float):
-        return float(duration_ms) / 1000.0
-    return None
-
-
 def _aggregate_provenance(provenances: list[dict[str, Any]]) -> dict[str, Any]:
     observed: dict[str, set[Any]] = {field_name: set() for field_name in _PROVENANCE_FIELDS}
     for prov in provenances:
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/measurements.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/measurements.py
new file mode 100644
index 0000000000..0ae2330415
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/measurements.py
@@ -0,0 +1,121 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Typed view over the measurement keys carried on ``AgentEvalAttempt.metadata``.
+
+Gating and reporting read these typed fields instead of reaching into the
+attempt metadata dict by magic string. The keys are still *stored* on
+``metadata`` (so the loose-dict contract continues to work during migration);
+this module is the single, documented place that names them and applies the
+fallbacks (``duration_ms`` → ``runtime_sec``, ``passed`` → ``reward``).
+"""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict, Field
+
+# Token-measurement keys carried on attempt metadata (and in result.json["metrics"]).
+TOKEN_KEYS: tuple[str, ...] = (  # each name must also be an AttemptMeasurements field
+    "prompt_tokens",
+    "completion_tokens",
+    "total_tokens",
+    "cache_creation_tokens",
+    "cache_read_tokens",
+)
+
+
+class AttemptMeasurements(BaseModel):
+    """Typed projection of the measurement keys found on attempt metadata.
+
+    Gating and reporting build it with :meth:`from_metadata`. Producers can go
+    on writing the same loose keys to ``AgentEvalAttempt.metadata`` and use
+    :meth:`to_metadata` to get back to that form. Unknown fields are rejected.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    prompt_tokens: int | None = None
+    completion_tokens: int | None = None
+    total_tokens: int | None = None
+    cache_creation_tokens: int | None = None
+    cache_read_tokens: int | None = None
+    runtime_sec: float | None = None
+    reward: float | None = None
+    passed: bool | None = None
+    provenance: dict[str, Any] = Field(default_factory=dict)
+
+    @classmethod
+    def from_metadata(cls, metadata: Mapping[str, Any] | None) -> AttemptMeasurements:
+        """Build the typed view from a loose (possibly missing) metadata mapping.
+
+        Fallbacks live in the module helpers so callers need not repeat them:
+        ``runtime_sec`` may be derived from ``duration_ms / 1000`` and ``reward``
+        from ``passed`` (``1.0``/``0.0``) when no explicit reward is recorded.
+        Token values that are not real ints are dropped to ``None``.
+        """
+        source: Mapping[str, Any] = metadata if metadata else {}
+
+        # ``passed`` is only kept when it is a real bool; anything else becomes None.
+        raw_passed = source.get("passed")
+        passed_flag = raw_passed if isinstance(raw_passed, bool) else None
+
+        # Provenance is shallow-copied; a non-mapping value becomes an empty dict.
+        raw_provenance = source.get("provenance")
+        provenance_copy = dict(raw_provenance) if isinstance(raw_provenance, Mapping) else {}
+
+        return cls(
+            **{key: _as_int(source.get(key)) for key in TOKEN_KEYS},
+            runtime_sec=_runtime_sec(source),
+            reward=_reward(source, passed_flag),
+            passed=passed_flag,
+            provenance=provenance_copy,
+        )
+
+    def to_metadata(self) -> dict[str, Any]:
+        """Return the loose-metadata form, omitting every unset value."""
+        # Scalars keep a fixed emission order: the token keys first, then
+        # runtime_sec, reward and passed.
+        scalar_keys = (*TOKEN_KEYS, "runtime_sec", "reward", "passed")
+        payload: dict[str, Any] = {}
+        for key in scalar_keys:
+            value = getattr(self, key)
+            if value is None:
+                continue
+            payload[key] = value
+
+        # An empty provenance dict is treated as unset and left out.
+        if self.provenance:
+            payload["provenance"] = dict(self.provenance)
+        return payload
+
+
+def _as_int(value: Any) -> int | None:
+    """Return *value* only when it is a genuine ``int``; otherwise ``None``."""
+    # ``bool`` subclasses ``int``, so True/False must be rejected explicitly.
+    is_count = isinstance(value, int) and not isinstance(value, bool)
+    return value if is_count else None
+
+
+def _runtime_sec(metadata: Mapping[str, Any]) -> float | None:
+    """Runtime in seconds: ``runtime_sec`` if numeric, else ``duration_ms / 1000``."""
+    for key, divisor in (("runtime_sec", None), ("duration_ms", 1000.0)):
+        raw = metadata.get(key)
+        if isinstance(raw, bool) or not isinstance(raw, (int, float)):
+            continue
+        return float(raw) if divisor is None else float(raw) / divisor
+    return None
+
+
+def _reward(metadata: Mapping[str, Any], passed: bool | None) -> float | None:
+    """Explicit ``reward`` as a float, else 1.0/0.0 from *passed*, else ``None``."""
+    explicit = metadata.get("reward")
+    if explicit is None:
+        # No reward recorded: derive one from the pass flag when it is known.
+        return None if passed is None else float(bool(passed))
+    try:
+        return float(explicit)
+    except (TypeError, ValueError):
+        return None  # an unparseable reward does not fall back to ``passed``
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/orchestrator.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/orchestrator.py
new file mode 100644
index 0000000000..1fb436f809
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/orchestrator.py
@@ -0,0 +1,153 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Generic orchestration: agent/scoring run + deterministic gate.
+
+Wraps :class:`~nemo_evaluator_sdk.agent_eval.evaluator.AgentEvaluator` with the
+gate from :mod:`nemo_evaluator_sdk.agent_eval.gating`. It is intentionally lean —
+the only collaborators are the tasks and a target (online) or attempts (offline).
+Two seams keep it backend-agnostic:
+
+* **verify-enable is inverted to data**: callers pass ``extra_metrics`` to append
+  (e.g. a verifier-reward metric). The orchestrator never introspects a runtime's
+  config to decide what to score.
+* **environment prep is an injected hook**: ``prepare_task`` (e.g. "build the task
+  image") runs per task before execution, so Docker/build specifics live in the
+  caller, not here.
+
+The common Docker case stays a few lines via :meth:`AgentEvalOrchestrator`'s plain
+constructor (config + optional ``extra_metrics``); richer wiring is opt-in.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable, Sequence
+from dataclasses import dataclass
+from pathlib import Path
+
+from nemo_evaluator_sdk.agent_eval.evaluator import AgentEvaluator
+from nemo_evaluator_sdk.agent_eval.gating import (
+    GateThresholds,
+    evaluate_gate,
+    load_baseline_summary,
+    write_gate_report,
+)
+from nemo_evaluator_sdk.agent_eval.types import (
+    AgentAttemptRuntime,
+    AgentEvalAttempt,
+    AgentEvalRunConfig,
+    AgentEvalRunResult,
+    AgentEvalTask,
+)
+from nemo_evaluator_sdk.metrics.protocol import Metric
+
+
+@dataclass(frozen=True)
+class OrchestratorConfig:
+    """Immutable run-level knobs shared by the online and offline paths."""
+
+    parallelism: int = 1  # forwarded to AgentEvalRunConfig.parallelism
+    write_dashboard: bool = True  # forwarded to AgentEvalRunConfig.write_dashboard
+    write_gate: bool = True  # the gate report is written only if the run also has an output_dir
+    gate_thresholds: GateThresholds | None = None  # passed as-is to evaluate_gate
+    baseline_summary_path: Path | None = None  # read with load_baseline_summary when set
+
+
+class AgentEvalOrchestrator:
+    """Score tasks via ``AgentEvaluator`` (live runtime or captured attempts), then gate the run."""
+
+    def __init__(
+        self,
+        *,
+        config: OrchestratorConfig | None = None,
+        extra_metrics: Sequence[Metric] = (),
+    ) -> None:
+        self.config = config if config else OrchestratorConfig()
+        self._extra_metrics = [*extra_metrics]
+
+    async def run_tasks(
+        self,
+        tasks: Sequence[AgentEvalTask],
+        *,
+        target: AgentAttemptRuntime,
+        benchmark: dict[str, object] | None = None,
+        output_dir: Path | None = None,
+        run_id: str | None = None,
+        prepare_task: Callable[[AgentEvalTask], None] | None = None,
+    ) -> AgentEvalRunResult:
+        """Online path: run the optional per-task hook, execute on *target*, score, gate."""
+        scored_tasks = list(map(self._with_extra_metrics, tasks))
+        if prepare_task is not None:
+            # The hook receives the metric-augmented task, before anything is executed.
+            for task in scored_tasks:
+                prepare_task(task)
+
+        evaluator = AgentEvaluator()
+        run_config = self._run_config(output_dir=output_dir, run_id=run_id, benchmark=benchmark)
+        outcome = await evaluator.run(tasks=scored_tasks, target=target, config=run_config)
+        self._maybe_write_gate(outcome)
+        return outcome
+
+    async def score_attempts(
+        self,
+        tasks: Sequence[AgentEvalTask],
+        *,
+        attempts: Sequence[AgentEvalAttempt],
+        benchmark: dict[str, object] | None = None,
+        output_dir: Path | None = None,
+        run_id: str | None = None,
+    ) -> AgentEvalRunResult:
+        """Offline path: score attempts that were captured earlier; no agent is executed."""
+        scored_tasks = list(map(self._with_extra_metrics, tasks))
+
+        evaluator = AgentEvaluator()
+        run_config = self._run_config(output_dir=output_dir, run_id=run_id, benchmark=benchmark)
+        # The attempts are handed to the evaluator as a fresh list.
+        outcome = await evaluator.run(tasks=scored_tasks, attempts=[*attempts], config=run_config)
+        self._maybe_write_gate(outcome)
+        return outcome
+
+    def _run_config(
+        self,
+        *,
+        output_dir: Path | None,
+        run_id: str | None,
+        benchmark: dict[str, object] | None,
+    ) -> AgentEvalRunConfig:
+        """Combine the per-call arguments with the run-level knobs on ``self.config``."""
+        knobs = self.config
+        return AgentEvalRunConfig(
+            output_dir=output_dir,
+            run_id=run_id,
+            parallelism=knobs.parallelism,
+            write_dashboard=knobs.write_dashboard,
+            benchmark=dict(benchmark) if benchmark else {},
+        )
+
+    def _with_extra_metrics(self, task: AgentEvalTask) -> AgentEvalTask:
+        """Return *task* plus the injected metrics whose class the task does not already use."""
+        # With no injected metrics the task's own metric list is never read.
+        authored: list[Metric] = [] if not self._extra_metrics else list(task.metrics)
+        taken = set(map(type, authored))
+        injected = [extra for extra in self._extra_metrics if type(extra) not in taken]
+        # Nothing to add: hand back the very same task object instead of a copy.
+        return task.model_copy(update={"metrics": authored + injected}) if injected else task
+
+    def _maybe_write_gate(self, result: AgentEvalRunResult) -> None:
+        """Evaluate and persist the gate report when enabled and the run has an output dir."""
+        settings = self.config
+        if not settings.write_gate or result.output_dir is None:
+            return
+        baseline = None
+        if settings.baseline_summary_path is not None:
+            baseline = load_baseline_summary(settings.baseline_summary_path)
+        report = evaluate_gate(result, thresholds=settings.gate_thresholds, baseline_summary=baseline)
+        write_gate_report(report, result.output_dir)
+
+
+__all__ = [
+    "AgentEvalOrchestrator",
+    "GateThresholds",
+    "OrchestratorConfig",
+]
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/coding_agent.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/coding_agent.py
new file mode 100644
index 0000000000..a2d7ac9e44
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/coding_agent.py
@@ -0,0 +1,291 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Plug-and-play seam for coding-agent CLIs (codex/claude/cursor/...).
+
+The split that makes these "plug-and-play":
+
+* :class:`CliAgentDriver` is the **driver** — a generic ``AgentAttemptRuntime``
+  that runs a CLI which reads a prompt on stdin and writes its final answer to a
+  file, then captures workspace/stdout/stderr/final-output as evidence. This is
+  the stable, reusable part.
+* :class:`CodingAgentSpec` is the **per-agent adapter** — the bespoke part: how to
+  build the CLI command and (optionally) how to parse that agent's trajectory into
+  extra evidence. Implementing a new agent means subclassing this, not rewriting a
+  runtime.
+
+The shipped :class:`ClaudeCodeSpec` / :class:`CursorAgentSpec` are *reference*
+command builders: the driver and evidence contract are stable, but each CLI's
+exact flags and trajectory format are the integrator's responsibility and may
+drift with upstream releases. Auth is the caller's concern (inject via env);
+nothing here hardcodes credentials.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import shutil
+import subprocess
+from collections.abc import Awaitable, Callable, Sequence
+from dataclasses import dataclass
+from pathlib import Path
+
+from nemo_evaluator_sdk.agent_eval.types import (
+    AgentEvalAttempt,
+    AgentEvalRunConfig,
+    AgentEvalTask,
+    AgentOutput,
+)
+from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor
+
+# Default per-attempt timeout in seconds; used as ``CliAgentDriver``'s ``timeout_s`` default.
+DEFAULT_CODING_AGENT_TIMEOUT_S = 600
+# Async callable that launches the agent process. ``CliAgentDriver`` falls back to
+# ``asyncio.create_subprocess_exec`` when none is injected.
+ProcessFactory = Callable[..., Awaitable[object]]
+
+
+@dataclass(frozen=True)
+class RunArtifacts:
+    """Resolved on-disk paths for one coding-agent attempt.
+
+    An immutable bundle of paths; creating an instance does not touch the
+    filesystem. The per-field notes describe the layout ``CliAgentDriver``
+    produces.
+    """
+
+    # Per-task directory that contains every path below.
+    evidence_dir: Path
+    # ``workspace`` subdirectory handed to the agent command.
+    workspace_dir: Path
+    # ``prompt.txt``: the prompt text that is also sent on stdin.
+    prompt_path: Path
+    # ``task.json``: JSON dump of the task.
+    task_path: Path
+    # ``stdout.txt``: decoded stdout of the agent process.
+    stdout_path: Path
+    # ``stderr.txt``: decoded stderr of the agent process.
+    stderr_path: Path
+    # ``final_output.txt``: optional file the agent may write its final answer to.
+    final_output_path: Path
+
+
+class CodingAgentSpec:
+    """Adapter describing how one coding-agent CLI is prompted and launched.
+
+    Subclasses must provide :meth:`build_command`. The remaining hooks
+    (:meth:`build_prompt`, :meth:`extra_evidence`, :meth:`final_output`) have
+    working defaults and may be overridden for agent-specific behavior.
+    """
+
+    name: str = "coding_agent"
+    binary: str = ""
+    model: str | None = None
+
+    def build_prompt(self, task: AgentEvalTask) -> str:
+        """Render the task's id, intent, and inputs as a three-line prompt."""
+        prompt = f"Task id: {task.id}\nIntent: {task.intent}\nInputs: {task.inputs}\n"
+        return prompt
+
+    def build_command(self, artifacts: RunArtifacts) -> list[str]:
+        """Return the argv used to launch the agent (prompt arrives on stdin).
+
+        Raises:
+            NotImplementedError: always, in this base class.
+        """
+        raise NotImplementedError
+
+    def extra_evidence(self, artifacts: RunArtifacts) -> dict[str, EvidenceDescriptor]:
+        """Return additional evidence descriptors for the attempt; none by default."""
+        return dict()
+
+    def final_output(self, artifacts: RunArtifacts, stdout_text: str) -> str:
+        """Return the agent's final answer.
+
+        The UTF-8 contents of ``artifacts.final_output_path`` win when that path
+        exists; otherwise ``stdout_text`` is returned unchanged.
+        """
+        try:
+            return artifacts.final_output_path.read_text(encoding="utf-8")
+        except FileNotFoundError:
+            return stdout_text
+
+
+class CliAgentDriver:
+    """Generic ``AgentAttemptRuntime`` for stdin-prompt coding-agent CLIs.
+
+    For each task the driver creates an evidence directory, writes the prompt
+    and task JSON into it, launches the spec's command with the prompt on stdin,
+    and converts the outcome into an :class:`AgentEvalAttempt`. Launch errors,
+    timeouts, and non-zero exit codes become ``status="failed"`` attempts backed
+    by an ``error.json`` descriptor rather than propagating to the caller.
+    """
+
+    def __init__(
+        self,
+        spec: CodingAgentSpec,
+        *,
+        work_root: str | Path | None = None,
+        timeout_s: int = DEFAULT_CODING_AGENT_TIMEOUT_S,
+        process_factory: ProcessFactory | None = None,
+    ) -> None:
+        """Configure the driver.
+
+        Args:
+            spec: Per-agent adapter; must define a non-empty ``binary``.
+            work_root: Root for per-task evidence dirs (``~`` is expanded). When
+                omitted, the root is derived from the run config at run time.
+            timeout_s: Seconds allowed for one agent process to finish.
+            process_factory: Async process launcher; defaults to
+                ``asyncio.create_subprocess_exec``.
+
+        Raises:
+            ValueError: if ``spec.binary`` is empty.
+        """
+        if not spec.binary:
+            raise ValueError(f"{type(spec).__name__} must set a non-empty `binary`")
+        self._spec = spec
+        self._work_root = Path(work_root).expanduser() if work_root is not None else None
+        self._timeout_s = timeout_s
+        self._process_factory = process_factory or asyncio.create_subprocess_exec
+
+    async def run_tasks(
+        self,
+        tasks: Sequence[AgentEvalTask],
+        config: AgentEvalRunConfig | None = None,
+    ) -> Sequence[AgentEvalAttempt]:
+        """Run every task, at most ``config.parallelism`` at a time.
+
+        Attempts are returned in the same order as ``tasks``.
+
+        Raises:
+            RuntimeError: if the real subprocess launcher is in use and the
+                spec's binary cannot be found on ``PATH``.
+        """
+        # Only probe PATH when a real process will be spawned; injected factories
+        # (e.g. test fakes) do not need the executable to exist.
+        if self._process_factory is asyncio.create_subprocess_exec and shutil.which(self._spec.binary) is None:
+            raise RuntimeError(f"{self._spec.name} CLI executable {self._spec.binary!r} was not found on PATH")
+
+        resolved = config or AgentEvalRunConfig()
+        semaphore = asyncio.Semaphore(resolved.parallelism)
+
+        async def run_one(index: int, task: AgentEvalTask) -> AgentEvalAttempt:
+            async with semaphore:
+                return await self._run_task(index, task, resolved)
+
+        return await asyncio.gather(*(run_one(index, task) for index, task in enumerate(tasks)))
+
+    async def _run_task(self, index: int, task: AgentEvalTask, config: AgentEvalRunConfig) -> AgentEvalAttempt:
+        """Execute one task and return its completed or failed attempt.
+
+        A child process that is still running when the attempt times out, fails,
+        or is cancelled is killed and reaped so it cannot outlive the attempt.
+        """
+        artifacts = self._artifacts(index, task, config)
+        artifacts.evidence_dir.mkdir(parents=True, exist_ok=True)
+        artifacts.workspace_dir.mkdir(parents=True, exist_ok=True)
+        # A final-output file left by an earlier run in the same directory would
+        # otherwise be reported as this attempt's answer.
+        artifacts.final_output_path.unlink(missing_ok=True)
+
+        prompt = self._spec.build_prompt(task)
+        artifacts.prompt_path.write_text(prompt, encoding="utf-8")
+        artifacts.task_path.write_text(task.model_dump_json(indent=2), encoding="utf-8")
+
+        command = self._spec.build_command(artifacts)
+        process = None
+        try:
+            process = await self._process_factory(
+                *command,
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+            stdout, stderr = await asyncio.wait_for(
+                process.communicate(prompt.encode("utf-8")),
+                timeout=self._timeout_s,
+            )
+        except asyncio.TimeoutError:
+            await self._terminate(process)
+            # The bare timeout exception stringifies to "", which made error.json
+            # useless; report which agent timed out and after how long.
+            timeout_error = TimeoutError(f"{self._spec.name} timed out after {self._timeout_s}s")
+            return self._failed_attempt(task, artifacts, timeout_error)
+        except asyncio.CancelledError:
+            # Cancellation must still propagate, but not leave the child running.
+            await self._terminate(process)
+            raise
+        except Exception as exc:
+            await self._terminate(process)
+            return self._failed_attempt(task, artifacts, exc)
+
+        stdout_text = _decode(stdout)
+        stderr_text = _decode(stderr)
+        artifacts.stdout_path.write_text(stdout_text, encoding="utf-8")
+        artifacts.stderr_path.write_text(stderr_text, encoding="utf-8")
+
+        # Injected process objects may not expose ``returncode``; treat that as success.
+        return_code = getattr(process, "returncode", 0)
+        if return_code:
+            return self._failed_attempt(
+                task,
+                artifacts,
+                RuntimeError(f"{self._spec.name} exited with status {return_code}: {stderr_text.strip()}"),
+            )
+
+        descriptors: dict[str, EvidenceDescriptor] = {
+            "workspace": EvidenceDescriptor(kind="filesystem", format="dir", ref=str(artifacts.workspace_dir)),
+            "prompt": EvidenceDescriptor(kind="text", format="txt", ref=str(artifacts.prompt_path)),
+            "task": EvidenceDescriptor(kind="json", format="json", ref=str(artifacts.task_path)),
+            "stdout": EvidenceDescriptor(kind="logs", format="txt", ref=str(artifacts.stdout_path)),
+            "stderr": EvidenceDescriptor(kind="logs", format="txt", ref=str(artifacts.stderr_path)),
+        }
+        # Spec-provided descriptors are merged last, so they may override the defaults.
+        descriptors.update(self._spec.extra_evidence(artifacts))
+
+        return AgentEvalAttempt(
+            id=f"{task.id}:{self._spec.name}",
+            task_id=task.id,
+            status="completed",
+            output=AgentOutput(
+                text=self._spec.final_output(artifacts, stdout_text),
+                metadata={
+                    "runtime": self._spec.name,
+                    "agent_model": self._spec.model,
+                    "evidence_dir": str(artifacts.evidence_dir),
+                },
+            ),
+            evidence=CandidateEvidence(descriptors=descriptors, metadata={"runtime": self._spec.name}),
+            metadata={
+                "runtime": self._spec.name,
+                "agent_model": self._spec.model,
+                "generated": True,
+            },
+        )
+
+    @staticmethod
+    async def _terminate(process: object | None) -> None:
+        """Kill and reap ``process`` if it was started and has not exited yet.
+
+        Objects without ``returncode``/``kill``/``wait`` (e.g. test fakes) are
+        left alone.
+        """
+        if process is None or getattr(process, "returncode", 0) is not None:
+            return
+        kill = getattr(process, "kill", None)
+        if kill is None:
+            return
+        try:
+            kill()
+        except ProcessLookupError:
+            # The child exited between the returncode check and the kill.
+            return
+        wait = getattr(process, "wait", None)
+        if wait is not None:
+            await wait()
+
+    def _failed_attempt(self, task: AgentEvalTask, artifacts: RunArtifacts, exc: Exception) -> AgentEvalAttempt:
+        """Write ``error.json`` for ``exc`` and return a ``status="failed"`` attempt."""
+        error_path = artifacts.evidence_dir / "error.json"
+        error_path.write_text(
+            json.dumps({"error_type": exc.__class__.__name__, "error": str(exc)}) + "\n", encoding="utf-8"
+        )
+        return AgentEvalAttempt(
+            id=f"{task.id}:{self._spec.name}",
+            task_id=task.id,
+            status="failed",
+            output=None,
+            evidence=CandidateEvidence(
+                descriptors={"error": EvidenceDescriptor(kind="error", format="json", ref=str(error_path))},
+                metadata={"runtime": self._spec.name},
+            ),
+            metadata={
+                "runtime": self._spec.name,
+                "error_type": exc.__class__.__name__,
+                "error": str(exc),
+            },
+        )
+
+    def _artifacts(self, index: int, task: AgentEvalTask, config: AgentEvalRunConfig) -> RunArtifacts:
+        """Compute the attempt's paths without creating anything on disk.
+
+        The root is ``work_root`` when configured, else
+        ``<output_dir or cwd>/evidence/<spec name>``. The task directory is the
+        sanitized task id, falling back to ``task-<index>`` when sanitizing
+        leaves nothing.
+        """
+        root = self._work_root or ((config.output_dir or Path.cwd()) / "evidence" / self._spec.name)
+        evidence_dir = Path(root) / (_safe_path_name(task.id) or f"task-{index}")
+        return RunArtifacts(
+            evidence_dir=evidence_dir,
+            workspace_dir=evidence_dir / "workspace",
+            prompt_path=evidence_dir / "prompt.txt",
+            task_path=evidence_dir / "task.json",
+            stdout_path=evidence_dir / "stdout.txt",
+            stderr_path=evidence_dir / "stderr.txt",
+            final_output_path=evidence_dir / "final_output.txt",
+        )
+
+
+class ClaudeCodeSpec(CodingAgentSpec):
+    """Reference command builder for the Claude Code CLI (``claude``).
+
+    NOTE(review): some Claude Code releases reportedly require ``--verbose`` when
+    ``--print`` is combined with ``--output-format stream-json`` — confirm against
+    the CLI version in use.
+    """
+
+    name = "claude_code"
+    binary = "claude"
+
+    def __init__(self, *, model: str | None = None, binary: str = "claude") -> None:
+        """Store the optional model name and the executable to invoke."""
+        self.binary = binary
+        self.model = model
+
+    def build_command(self, artifacts: RunArtifacts) -> list[str]:
+        """Build the argv: print mode, stream-json output, workspace via ``--add-dir``.
+
+        ``--model <model>`` is appended only when a model was configured.
+        """
+        base_args = [
+            self.binary,
+            "--print",
+            "--output-format",
+            "stream-json",
+            "--add-dir",
+            str(artifacts.workspace_dir),
+        ]
+        model_args = [] if self.model is None else ["--model", self.model]
+        return [*base_args, *model_args]
+
+
+class CursorAgentSpec(CodingAgentSpec):
+    """Reference command builder for the Cursor Agent CLI (``cursor-agent``)."""
+
+    name = "cursor_agent"
+    binary = "cursor-agent"
+
+    def __init__(self, *, model: str | None = None, binary: str = "cursor-agent") -> None:
+        """Store the optional model name and the executable to invoke."""
+        self.binary = binary
+        self.model = model
+
+    def build_command(self, artifacts: RunArtifacts) -> list[str]:
+        """Build the argv: print mode, text output, workspace via ``--workdir``.
+
+        ``--model <model>`` is appended only when a model was configured.
+        """
+        base_args = [
+            self.binary,
+            "--print",
+            "--output-format",
+            "text",
+            "--workdir",
+            str(artifacts.workspace_dir),
+        ]
+        model_args = [] if self.model is None else ["--model", self.model]
+        return [*base_args, *model_args]
+
+
+def _decode(value: bytes | str | None) -> str:
+    """Coerce process output to text.
+
+    ``None`` becomes ``""``, ``str`` is returned unchanged, and bytes are decoded
+    as UTF-8 with undecodable sequences replaced rather than raising.
+    """
+    if isinstance(value, str):
+        return value
+    return "" if value is None else value.decode("utf-8", errors="replace")
+
+
+def _safe_path_name(value: str) -> str:
+    """Turn ``value`` into a single safe path component.
+
+    Every character that is neither alphanumeric nor one of ``._-`` is replaced
+    with ``-``; leading/trailing ``.`` and ``-`` are then stripped and the result
+    is cut to at most 120 characters. May return an empty string.
+    """
+    sanitized_chars = []
+    for char in value:
+        keep = char.isalnum() or char in "._-"
+        sanitized_chars.append(char if keep else "-")
+    return "".join(sanitized_chars).strip(".-")[:120]
+
+
+# Names exported by ``from ... import *``; the underscore-prefixed helpers stay private.
+__all__ = [
+    "CliAgentDriver",
+    "ClaudeCodeSpec",
+    "CodingAgentSpec",
+    "CursorAgentSpec",
+    "RunArtifacts",
+]
diff --git a/tests/agentic-use/runtimes/shared/docker.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker.py
similarity index 80%
rename from tests/agentic-use/runtimes/shared/docker.py
rename to packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker.py
index 431d646806..482ca6e55e 100644
--- a/tests/agentic-use/runtimes/shared/docker.py
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker.py
@@ -1,7 +1,12 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Docker helpers for agentic-use runtimes."""
+"""Docker CLI helpers for agent-eval runtimes.
+
+These shell out to the ``docker`` CLI (stdlib ``subprocess`` only), so importing
+this module does not require the ``agent-runtimes`` extra — only a working
+``docker`` binary at call time.
+"""
 
 from __future__ import annotations
 
@@ -56,11 +61,8 @@ def docker_run(
     cmd.append(image)
     cmd += command
 
-    print(f"[agentic-runtime] $ {' '.join(redact_cmd_for_logging(cmd))}")
-    kwargs: dict[str, object] = {"check": False, "text": True}
-    if timeout is not None:
-        kwargs["timeout"] = timeout
-    return subprocess.run(cmd, **kwargs)
+    print(f"[agent-eval-runtime] $ {' '.join(redact_cmd_for_logging(cmd))}")
+    return subprocess.run(cmd, check=False, text=True, timeout=timeout)
 
 
 def docker_image_exists(tag: str) -> bool:
@@ -72,12 +74,12 @@ def docker_image_exists(tag: str) -> bool:
 def build_dockerfile(dockerfile: os.PathLike[str], context_dir: os.PathLike[str], tag: str) -> None:
     """Build a Docker image from an explicit Dockerfile + build context."""
     cmd = ["docker", "build", "-f", str(dockerfile), "-t", tag, str(context_dir)]
-    print(f"[agentic-runtime] $ {' '.join(cmd)}")
+    print(f"[agent-eval-runtime] $ {' '.join(cmd)}")
     subprocess.run(cmd, check=True)
 
 
 def build_task_image(task_dir: os.PathLike[str], tag: str) -> None:
-    """Build a task-specific Docker image from environment/Dockerfile."""
+    """Build a task-specific Docker image from ``environment/Dockerfile``."""
     from pathlib import Path
 
     root = Path(task_dir)
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker_sandbox.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker_sandbox.py
index 8f84d8ba4f..fc03344c85 100644
--- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker_sandbox.py
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker_sandbox.py
@@ -1,7 +1,16 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Docker-backed sandbox runtime for agent-eval attempts."""
+"""Docker-backed sandbox runtime for agent-eval attempts.
+
+Distinct from :mod:`nemo_evaluator_sdk.agent_eval.runtimes.environment`'s
+``DockerEnvironmentProvider`` on purpose: this runtime drives the OpenAI Agents
+SDK ``SandboxAgent`` (Python ``docker`` + ``agents``, behind the
+``agent-runtimes`` extra) and *owns* the agent loop, whereas the environment
+provider only shells out to the ``docker`` CLI to execute a caller-built command
+inside a prebuilt task image. The two are not merged: this one is an
+``AgentAttemptRuntime``; the other is an execution boundary used *by* runtimes.
+"""
 
 from __future__ import annotations
 
diff --git a/tests/agentic-use/runtimes/shared/environment.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment.py
similarity index 54%
rename from tests/agentic-use/runtimes/shared/environment.py
rename to packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment.py
index fe23893668..a08dfdc179 100644
--- a/tests/agentic-use/runtimes/shared/environment.py
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment.py
@@ -1,20 +1,18 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Environment provider boundary for agentic-use runtimes.
+"""Process/filesystem environment boundary for agent-eval runtimes.
 
-This is the design-doc's ``EnvironmentProvider`` boundary (section B2): it sits
-*below* :class:`AgentAttemptRuntime` so a runtime never needs to know whether
-the agent/verifier execute under Docker, locally, Harbor, or NeMo Gym. Today the
-only implementation is :class:`DockerEnvironmentProvider`, which wraps
-``shared/docker.py``.
+This boundary sits *below* :class:`AgentAttemptRuntime` so a runtime never needs
+to know whether the agent/verifier execute under Docker, locally, or another
+filesystem-backed sandbox. It is intentionally a **process/filesystem**
+abstraction, not a fully provider-neutral one: :class:`EnvRunSpec` carries
+``mounts``/``extra_args`` as filesystem-environment hints. Providers that are
+not filesystem-backed may ignore those fields.
 
-Deviation from the doc sketch: the doc proposes ``run_agent(instruction, config)
--> AgentEvalAttempt``. We keep the boundary at "execute a command in the
-prepared environment" (returning an :class:`EnvCommandResult`) because each
-backend builds its own command/env/mounts, and attempt construction is owned by
-``shared/artifacts.py``. This keeps command-building and attempt-shaping out of
-the environment layer so new providers only implement process execution.
+A handle exposes a single :meth:`AbstractEnvironmentHandle.run` that takes a
+``role`` ("agent" or "verifier"); :meth:`run_agent`/:meth:`run_verifier` are thin
+role wrappers kept for caller convenience and protocol compatibility.
 """
 
 from __future__ import annotations
@@ -23,12 +21,16 @@
 import subprocess
 from collections.abc import Callable
 from dataclasses import dataclass, field
-from typing import Protocol, runtime_checkable
+from typing import Literal, Protocol, runtime_checkable
 
 from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunConfig, AgentEvalTask
 
-from runtimes.shared.docker import docker_run
-from runtimes.shared.layout import task_image_tag
+EnvRole = Literal["agent", "verifier"]
+
+
+def default_image_tag(task_id: str) -> str:
+    """Map a task id to ``<task_id>:latest``; the default when no mapping is injected."""
+    tag = f"{task_id}:latest"
+    return tag
 
 
 @dataclass(frozen=True)
@@ -45,7 +47,11 @@ def ok(self) -> bool:
 
 @dataclass
 class EnvRunSpec:
-    """How to execute one command inside an environment handle."""
+    """How to execute one command inside an environment handle.
+
+    ``mounts``/``extra_args`` are filesystem-environment hints (e.g. Docker bind
+    mounts and extra CLI args). Non-filesystem providers may ignore them.
+    """
 
     command: list[str]
     env: dict[str, str] = field(default_factory=dict)
@@ -68,7 +74,7 @@ async def close(self) -> None: ...
 
 @runtime_checkable
 class AgentEnvironmentProvider(Protocol):
-    """Creates per-task environment handles. Pluggable: Docker now, Gym later."""
+    """Creates per-task environment handles. Pluggable: Docker now, others later."""
 
     async def prepare(
         self,
@@ -77,19 +83,37 @@ async def prepare(
     ) -> AgentEnvironmentHandle: ...
 
 
-class DockerEnvironmentHandle:
-    """Docker-backed environment handle bound to one task image."""
+class AbstractEnvironmentHandle:
+    """Base handle that routes both roles through a single :meth:`run`.
 
-    def __init__(self, image: str) -> None:
-        self.image = image
+    Concrete handles implement :meth:`run`; ``run_agent``/``run_verifier`` are
+    role-specialized wrappers so the duplicated phase methods don't have to be
+    reimplemented per backend.
+    """
+
+    async def run(self, spec: EnvRunSpec, role: EnvRole) -> EnvCommandResult:
+        raise NotImplementedError
 
     async def run_agent(self, spec: EnvRunSpec) -> EnvCommandResult:
-        return await self._run(spec)
+        return await self.run(spec, "agent")
 
     async def run_verifier(self, spec: EnvRunSpec) -> EnvCommandResult:
-        return await self._run(spec)
+        return await self.run(spec, "verifier")
+
+    async def close(self) -> None:
+        return None
+
+
+class DockerEnvironmentHandle(AbstractEnvironmentHandle):
+    """Docker-backed environment handle bound to one task image."""
+
+    def __init__(self, image: str) -> None:
+        self.image = image
+
+    async def run(self, spec: EnvRunSpec, role: EnvRole = "agent") -> EnvCommandResult:
+        del role  # Docker runs both roles identically against the same image.
+        from nemo_evaluator_sdk.agent_eval.runtimes.docker import docker_run
 
-    async def _run(self, spec: EnvRunSpec) -> EnvCommandResult:
         try:
             result = await asyncio.to_thread(
                 docker_run,
@@ -105,15 +129,11 @@ async def _run(self, spec: EnvRunSpec) -> EnvCommandResult:
             return EnvCommandResult(exit_code=124, timed_out=True)
         return EnvCommandResult(exit_code=result.returncode)
 
-    async def close(self) -> None:
-        # `docker run --rm` cleans up the container; nothing persistent to release.
-        return None
-
 
 class DockerEnvironmentProvider:
     """Default provider that maps each task to its built Docker image."""
 
-    def __init__(self, *, image_tag_fn: Callable[[str], str] = task_image_tag) -> None:
+    def __init__(self, *, image_tag_fn: Callable[[str], str] = default_image_tag) -> None:
         self._image_tag_fn = image_tag_fn
 
     async def prepare(
diff --git a/tests/agentic-use/runtimes/shared/environment_spec.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment_spec.py
similarity index 92%
rename from tests/agentic-use/runtimes/shared/environment_spec.py
rename to packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment_spec.py
index cd5630926f..a594705907 100644
--- a/tests/agentic-use/runtimes/shared/environment_spec.py
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment_spec.py
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Reusable environment authoring for agentic-use tasks (design-doc B3).
+"""Declarative environment authoring for agent-eval tasks.
 
 Moves task authoring away from an implicit "Dockerfile per task" toward a small,
 declarative ``environment.yaml`` spec, while keeping a Dockerfile escape hatch.
@@ -28,8 +28,11 @@
 (a Dockerfile + build context + target tag). The Dockerfile path is used as-is;
 an ``image``-based spec generates a tiny derived Dockerfile (``FROM <image>`` plus
 optional ``pip install``). ``setup`` steps are carried as plan metadata — they are
-runtime concerns (e.g. seed-providers) handled outside the image build — so this
-module does not execute them.
+runtime concerns handled outside the image build — so this module does not
+execute them.
+
+``yaml`` is imported lazily so that importing this module costs nothing for
+callers that never load a spec.
 """
 
 from __future__ import annotations
@@ -37,8 +40,6 @@
 from dataclasses import dataclass, field
 from pathlib import Path
 
-import yaml
-
 ENVIRONMENT_SPEC_FILENAME = "environment.yaml"
 DEFAULT_DOCKERFILE_RELPATH = "environment/Dockerfile"
 
@@ -69,6 +70,8 @@ def load_environment_spec(task_dir: str | Path) -> EnvironmentSpec:
     root = Path(task_dir)
     spec_path = root / ENVIRONMENT_SPEC_FILENAME
     if spec_path.is_file():
+        import yaml
+
         return _parse_spec(yaml.safe_load(spec_path.read_text(encoding="utf-8")) or {}, root)
 
     dockerfile = root / DEFAULT_DOCKERFILE_RELPATH
@@ -160,7 +163,7 @@ def plan_task_build(
 
 def execute_build_plan(plan: BuildPlan) -> None:
     """Build the Docker image described by ``plan``."""
-    from runtimes.shared.docker import build_dockerfile
+    from nemo_evaluator_sdk.agent_eval.runtimes.docker import build_dockerfile
 
     build_dockerfile(plan.dockerfile, plan.context_dir, plan.image_tag)
 
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/layout.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/layout.py
new file mode 100644
index 0000000000..5c858cb037
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/layout.py
@@ -0,0 +1,63 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Generic on-disk layout for a single agent-eval task run.
+
+A run produces an agent-log dir and a workspace dir under a run dir, plus a
+written instruction file. Callers that need extra directories (e.g. preserved
+platform state) add them on top of :class:`RunLayout`.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class RunLayout:
+    """Filesystem layout for one task run.
+
+    Immutable path bundle; the per-field notes describe what
+    ``prepare_run_layout`` puts in each field.
+    """
+
+    # Root directory of the run.
+    run_dir: Path
+    # Subdirectory of ``run_dir`` for agent logs; also holds the instruction file.
+    agent_log_dir: Path
+    # Subdirectory of ``run_dir`` used as the workspace.
+    workspace_dir: Path
+    # Instruction file written inside ``agent_log_dir``.
+    instruction_path: Path
+
+
+def resolve_run_dir(output_dir: str | Path | None, default_factory: Callable[[], Path]) -> Path:
+    """Resolve the run dir to an absolute path.
+
+    An explicit ``output_dir`` must be made absolute: run-dir subpaths are used as
+    Docker bind-mount sources, and Docker treats a relative ``-v`` source as a
+    (slash-free) named volume rather than a host directory. The same guarantee is
+    applied to the fallback, so a ``default_factory`` that returns a relative path
+    cannot leak one to callers.
+
+    Args:
+        output_dir: Caller-chosen run directory, or ``None`` to use the default.
+        default_factory: Called only when ``output_dir`` is ``None``.
+
+    Returns:
+        An absolute path. An already-absolute default is returned as given.
+    """
+    if output_dir is not None:
+        return Path(output_dir).resolve()
+    default_dir = Path(default_factory())
+    return default_dir if default_dir.is_absolute() else default_dir.resolve()
+
+
+def prepare_run_layout(
+    run_dir: str | Path,
+    instruction_text: str,
+    *,
+    agent_subdir: str = "agent",
+    workspace_subdir: str = "workspace",
+    instruction_name: str = "instruction.md",
+) -> RunLayout:
+    """Materialize the run layout on disk and return its paths.
+
+    Creates ``run_dir/<agent_subdir>`` and ``run_dir/<workspace_subdir>``
+    (including parents; existing directories are fine), then writes
+    ``instruction_text`` as UTF-8 to ``<agent_subdir>/<instruction_name>``,
+    overwriting any previous file.
+    """
+    root = Path(run_dir)
+    agent_log_dir, workspace_dir = root / agent_subdir, root / workspace_subdir
+    for directory in (agent_log_dir, workspace_dir):
+        directory.mkdir(parents=True, exist_ok=True)
+
+    instruction_path = agent_log_dir / instruction_name
+    instruction_path.write_text(instruction_text, encoding="utf-8")
+
+    return RunLayout(
+        run_dir=root,
+        agent_log_dir=agent_log_dir,
+        workspace_dir=workspace_dir,
+        instruction_path=instruction_path,
+    )
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/verify.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/verify.py
new file mode 100644
index 0000000000..7e1b0fb0c0
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/verify.py
@@ -0,0 +1,86 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Generic verifier-phase mechanic: collect a reward + stamp attempt metadata.
+
+This is the backend-agnostic core. *What* the verifier runs (command, env,
+mounts) and *how* it is invoked are caller concerns — the caller executes its
+verifier through an environment handle, then uses :func:`collect_verifier_outcome`
+to read the reward/stdout convention out of the verifier's log dir, and
+:func:`apply_verify_to_metadata` to stamp the result onto an attempt so a
+reward metric can score it.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+
+@dataclass(frozen=True)
+class VerifierOutcome:
+    """Result of a verifier phase for one task."""
+
+    # False only for the "verifier did not run" outcome.
+    ran: bool
+    # Whether the verifier run was reported as successful by the caller.
+    passed: bool
+    # 1 or 0, taken from the reward file when present, else derived from success.
+    reward: int
+    # Exit code of the verifier run as supplied by the caller.
+    exit_code: int
+    # Captured verifier stdout text ("" when no stdout file was found).
+    stdout: str
+    # Directory the outcome was collected from; None when the verifier did not run.
+    verifier_log_dir: Path | None
+
+
+def skipped_outcome() -> VerifierOutcome:
+    """Return the outcome used when the verifier never ran: not passed, zero reward, no logs."""
+    return VerifierOutcome(
+        ran=False,
+        passed=False,
+        reward=0,
+        exit_code=0,
+        stdout="",
+        verifier_log_dir=None,
+    )
+
+
+def collect_verifier_outcome(
+    *,
+    ok: bool,
+    exit_code: int,
+    log_dir: str | Path,
+    reward_filename: str = "reward.txt",
+    stdout_filename: str = "test-stdout.txt",
+) -> VerifierOutcome:
+    """Assemble a :class:`VerifierOutcome` from a finished verifier run.
+
+    The stdout file is read (invalid UTF-8 replaced) when it exists. The reward
+    file, when it exists, yields reward ``1`` only if its stripped content is
+    exactly ``"1"``; when it is missing, the reward is derived from ``ok`` and the
+    file is written so later reads see the same value.
+
+    NOTE(review): ``passed`` always mirrors ``ok``, so it can disagree with a
+    reward read from an existing file — confirm that is intended.
+    """
+    log_dir = Path(log_dir)
+    stdout_path = log_dir / stdout_filename
+    reward_path = log_dir / reward_filename
+
+    stdout = stdout_path.read_text(encoding="utf-8", errors="replace") if stdout_path.is_file() else ""
+
+    if not reward_path.is_file():
+        # No reward recorded by the verifier: derive it and persist it.
+        reward = 1 if ok else 0
+        reward_path.parent.mkdir(parents=True, exist_ok=True)
+        reward_path.write_text("1\n" if ok else "0\n", encoding="utf-8")
+    elif reward_path.read_text(encoding="utf-8").strip() == "1":
+        reward = 1
+    else:
+        reward = 0
+
+    return VerifierOutcome(
+        ran=True,
+        passed=ok,
+        reward=reward,
+        exit_code=exit_code,
+        stdout=stdout,
+        verifier_log_dir=log_dir,
+    )
+
+
+def apply_verify_to_metadata(metadata: dict[str, Any], outcome: VerifierOutcome) -> None:
+    """Record the verifier result on ``metadata`` in place.
+
+    A verifier that did not run only fills in ``verify_status="skipped"`` when no
+    status is present yet. Otherwise ``verify_status``, ``passed``, ``reward`` and
+    ``verifier_log_dir`` (stringified, or ``None``) are overwritten.
+    """
+    if outcome.ran:
+        log_dir = outcome.verifier_log_dir
+        metadata.update(
+            {
+                "verify_status": "ok" if outcome.passed else "failed",
+                "passed": outcome.passed,
+                "reward": outcome.reward,
+                "verifier_log_dir": str(log_dir) if log_dir else None,
+            }
+        )
+        return
+    metadata.setdefault("verify_status", "skipped")
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py
index 589a4efde1..03509ab038 100644
--- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py
@@ -287,6 +287,18 @@ async def run_tasks(
     ) -> Sequence[AgentEvalAttempt]: ...
 
 
+@runtime_checkable
+class AgentAttemptSource(Protocol):
+    """Loads a previously captured attempt for a task from a stored artifact.
+
+    The offline counterpart to :class:`AgentAttemptRuntime`: instead of executing
+    the agent, it adapts an already-produced run directory/file into an
+    :class:`AgentEvalAttempt` so it can be (re)scored through ``AgentEvaluator``.
+
+    Structural protocol: any object with a compatible ``load_attempt`` method
+    conforms. Because it is ``runtime_checkable``, ``isinstance`` checks only
+    that the method exists, not its signature.
+    """
+
+    # ``source`` locates the stored artifact; ``task`` (keyword-only) is the task
+    # the returned attempt belongs to.
+    def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt: ...
+
+
 def _metric_coverage(
     results: Sequence[AgentEvalTaskResult],
     tasks: Sequence[AgentEvalTask] | None,
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_coding_agent.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_coding_agent.py
new file mode 100644
index 0000000000..66e7715c07
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_coding_agent.py
@@ -0,0 +1,117 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Fixture-based tests for the coding-agent driver seam (no real CLIs)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+from nemo_evaluator_sdk.agent_eval.runtimes.coding_agent import (
+    ClaudeCodeSpec,
+    CliAgentDriver,
+    CodingAgentSpec,
+    CursorAgentSpec,
+    RunArtifacts,
+)
+from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunConfig, AgentEvalTask
+
+
+class _EchoSpec(CodingAgentSpec):
+    name = "echo_agent"
+    binary = "echo-agent"
+
+    def build_command(self, artifacts: RunArtifacts) -> list[str]:
+        return [self.binary, "--out", str(artifacts.final_output_path)]
+
+    def extra_evidence(self, artifacts: RunArtifacts) -> dict:
+        from nemo_evaluator_sdk.values.evidence import EvidenceDescriptor
+
+        return {"trajectory": EvidenceDescriptor(kind="trace", format="jsonl", ref=str(artifacts.stdout_path))}
+
+
+class _FakeProcess:
+    def __init__(self, *, returncode: int, final_output_path: Path | None, stdout: bytes = b"", stderr: bytes = b""):
+        self.returncode = returncode
+        self._final_output_path = final_output_path
+        self._stdout = stdout
+        self._stderr = stderr
+
+    async def communicate(self, stdin: bytes | None = None) -> tuple[bytes, bytes]:
+        if self._final_output_path is not None:
+            self._final_output_path.write_text("final answer", encoding="utf-8")
+        return self._stdout, self._stderr
+
+
+def _factory(*, returncode: int = 0, write_final: bool = True):
+    captured: dict = {}
+
+    async def factory(*command, **kwargs):
+        captured["command"] = list(command)
+        final_path = Path(command[command.index("--out") + 1]) if "--out" in command else None
+        return _FakeProcess(
+            returncode=returncode,
+            final_output_path=final_path if write_final else None,
+            stdout=b'{"event":"done"}\n',
+        )
+
+    return factory, captured
+
+
+def _task() -> AgentEvalTask:
+    return AgentEvalTask(id="demo/task", intent="do the thing", inputs={"k": "v"})
+
+
+@pytest.mark.asyncio
+async def test_driver_produces_completed_attempt_with_evidence(tmp_path: Path) -> None:
+    factory, captured = _factory()
+    driver = CliAgentDriver(_EchoSpec(), work_root=tmp_path, process_factory=factory)
+
+    attempts = await driver.run_tasks([_task()], AgentEvalRunConfig())
+    attempt = attempts[0]
+
+    assert captured["command"][0] == "echo-agent"
+    assert attempt.status == "completed"
+    assert attempt.output is not None and attempt.output.text == "final answer"
+    # Standard + spec-provided evidence keys are present; the prompt file was written to disk.
+    assert {"workspace", "prompt", "task", "stdout", "stderr", "trajectory"} <= set(attempt.evidence.descriptors)
+    assert (tmp_path / "demo-task" / "prompt.txt").read_text(encoding="utf-8").startswith("Task id: demo/task")
+
+
+@pytest.mark.asyncio
+async def test_driver_marks_failed_on_nonzero_exit(tmp_path: Path) -> None:
+    factory, _ = _factory(returncode=1, write_final=False)
+    driver = CliAgentDriver(_EchoSpec(), work_root=tmp_path, process_factory=factory)
+
+    attempt = (await driver.run_tasks([_task()]))[0]
+    assert attempt.status == "failed"
+    assert attempt.output is None
+    assert "error" in attempt.evidence.descriptors
+    assert (tmp_path / "demo-task" / "error.json").exists()
+
+
+def test_reference_specs_build_expected_commands(tmp_path: Path) -> None:
+    artifacts = RunArtifacts(
+        evidence_dir=tmp_path,
+        workspace_dir=tmp_path / "workspace",
+        prompt_path=tmp_path / "p",
+        task_path=tmp_path / "t",
+        stdout_path=tmp_path / "o",
+        stderr_path=tmp_path / "e",
+        final_output_path=tmp_path / "f",
+    )
+    claude_cmd = ClaudeCodeSpec(model="claude-x").build_command(artifacts)
+    assert claude_cmd[0] == "claude" and "--model" in claude_cmd and "claude-x" in claude_cmd
+
+    cursor_cmd = CursorAgentSpec().build_command(artifacts)
+    assert cursor_cmd[0] == "cursor-agent" and "--model" not in cursor_cmd
+
+
+def test_driver_rejects_spec_without_binary(tmp_path: Path) -> None:
+    class _NoBinary(CodingAgentSpec):
+        def build_command(self, artifacts: RunArtifacts) -> list[str]:
+            return []
+
+    with pytest.raises(ValueError, match="non-empty"):
+        CliAgentDriver(_NoBinary(), work_root=tmp_path)
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_common_metrics.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_common_metrics.py
new file mode 100644
index 0000000000..3e5f9361a2
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_common_metrics.py
@@ -0,0 +1,86 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for promoted attempt helpers and reusable metrics."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+from nemo_evaluator_sdk.agent_eval.attempts import resolve_attempt_status, standard_evidence_descriptors
+from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric, EvidencePresenceMetric
+from nemo_evaluator_sdk.metrics.protocol import CandidateOutput, DatasetRow, MetricInput
+from nemo_evaluator_sdk.values.evidence import CandidateEvidence
+
+
+def test_resolve_attempt_status_keeps_failed_agents_scorable() -> None:
+    assert resolve_attempt_status(True) == "completed"
+    assert resolve_attempt_status(False) == "partial"
+
+
+def test_standard_evidence_descriptors_builds_doc_keys(tmp_path: Path) -> None:
+    logs = tmp_path / "agent"
+    workspace = tmp_path / "workspace"
+    verifier = tmp_path / "verifier"
+    logs.mkdir()
+    workspace.mkdir()
+    verifier.mkdir()  # exists -> verifier_logs included
+
+    descriptors = standard_evidence_descriptors(
+        logs_dir=logs,
+        final_state_dir=workspace,
+        trace_path=tmp_path / "atif_trajectory.json",
+        initial_state_ref=str(tmp_path / "seed"),
+        verifier_logs_dir=verifier,
+        primary_log="nat_agent.log",
+    )
+    assert set(descriptors) == {"initial_state", "trace", "logs", "final_state", "verifier_logs"}
+    assert descriptors["trace"].format == "atif"
+    assert descriptors["logs"].metadata["primary_log"] == "nat_agent.log"
+
+    # verifier_logs omitted when the dir is absent.
+    no_verifier = standard_evidence_descriptors(
+        logs_dir=logs, final_state_dir=workspace, verifier_logs_dir=tmp_path / "missing"
+    )
+    assert "verifier_logs" not in no_verifier
+
+
+@pytest.mark.asyncio
+async def test_agent_phase_success_metric_reads_metadata_and_namespaces_type() -> None:
+    metric = AgentPhaseSuccessMetric()
+    assert metric.type == "agent_phase_success"
+    ok = await metric.compute_scores(
+        MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(metadata={"agent_ok": True}))
+    )
+    assert ok.outputs[0].value == 1.0
+
+    class Namespaced(AgentPhaseSuccessMetric):
+        metric_type = "agentic_use_agent_phase"
+
+    assert Namespaced().type == "agentic_use_agent_phase"
+
+
+@pytest.mark.asyncio
+async def test_evidence_presence_metric_scores_over_evidence(tmp_path: Path) -> None:
+    final_state = tmp_path / "workspace"
+    final_state.mkdir()
+    (final_state / "result.txt").write_text("done", encoding="utf-8")
+    evidence = CandidateEvidence(
+        descriptors=standard_evidence_descriptors(logs_dir=tmp_path / "agent", final_state_dir=final_state)
+    )
+
+    metric = EvidencePresenceMetric()
+    present = await metric.compute_scores(
+        MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(evidence=evidence))
+    )
+    assert present.outputs[0].value == 1.0
+
+    # Empty workspace -> non-empty requirement fails; no evidence -> 0.
+    (final_state / "result.txt").unlink()
+    empty = await metric.compute_scores(
+        MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(evidence=evidence))
+    )
+    assert empty.outputs[0].value == 0.0
+    missing = await metric.compute_scores(MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput()))
+    assert missing.outputs[0].value == 0.0
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py
index 5e0446b1eb..c051499030 100644
--- a/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py
@@ -16,7 +16,7 @@
 from nemo_evaluator_sdk.agent_eval.runtimes import docker_sandbox
 from nemo_evaluator_sdk.agent_eval.runtimes.docker_sandbox import (
     DockerSandboxAgentRuntime,
-    SandboxSdk,
+    SandboxSDK,
 )
 
 
@@ -147,8 +147,8 @@ async def run(self, agent: _FakeSandboxAgent, prompt: str, *, run_config: _FakeR
         raise RuntimeError("sandbox run failed")
 
 
-def _fake_sdk() -> SandboxSdk:
-    return SandboxSdk(
+def _fake_sdk() -> SandboxSDK:
+    return SandboxSDK(
         Runner=_FakeRunner(),
         RunConfig=_FakeRunConfig,
         SandboxRunConfig=_FakeSandboxRunConfig,
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_environment.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_environment.py
new file mode 100644
index 0000000000..b7df9a61d4
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_environment.py
@@ -0,0 +1,77 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for the promoted environment boundary + environment authoring."""
+
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+
+import pytest
+from nemo_evaluator_sdk.agent_eval.runtimes import docker as docker_mod
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import (
+    DockerEnvironmentHandle,
+    DockerEnvironmentProvider,
+    EnvRunSpec,
+    default_image_tag,
+)
+from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec, plan_task_build
+from nemo_evaluator_sdk.agent_eval.types import AgentEvalTask
+
+
+@pytest.mark.asyncio
+async def test_docker_handle_routes_roles_through_single_run(monkeypatch: pytest.MonkeyPatch) -> None:
+    calls: list[tuple[str, list[str]]] = []
+
+    def fake_docker_run(image: str, command: list[str], **kwargs: object) -> subprocess.CompletedProcess[str]:
+        calls.append((image, command))
+        return subprocess.CompletedProcess(args=command, returncode=0)
+
+    monkeypatch.setattr(docker_mod, "docker_run", fake_docker_run)
+
+    handle = DockerEnvironmentHandle("img:latest")
+    spec = EnvRunSpec(command=["echo", "hi"])
+    assert (await handle.run_agent(spec)).ok
+    assert (await handle.run_verifier(spec)).ok
+    assert calls == [("img:latest", ["echo", "hi"]), ("img:latest", ["echo", "hi"])]
+
+
+@pytest.mark.asyncio
+async def test_docker_handle_reports_timeout(monkeypatch: pytest.MonkeyPatch) -> None:
+    def fake_docker_run(image: str, command: list[str], **kwargs: object):
+        raise subprocess.TimeoutExpired(cmd=command, timeout=1)
+
+    monkeypatch.setattr(docker_mod, "docker_run", fake_docker_run)
+    result = await DockerEnvironmentHandle("img").run(EnvRunSpec(command=["sleep"]), "agent")
+    assert result.timed_out and result.exit_code == 124 and not result.ok
+
+
+@pytest.mark.asyncio
+async def test_provider_uses_injected_image_tag_fn() -> None:
+    assert default_image_tag("t") == "t:latest"
+    provider = DockerEnvironmentProvider(image_tag_fn=lambda task_id: f"custom-{task_id}")
+    handle = await provider.prepare(AgentEvalTask(id="demo", intent="x", inputs={}))
+    assert isinstance(handle, DockerEnvironmentHandle)
+    assert handle.image == "custom-demo"
+
+
+def test_environment_spec_yaml_dockerfile_and_plan(tmp_path: Path) -> None:
+    (tmp_path / "environment.yaml").write_text(
+        "environment:\n  image: base:1\n  dependencies:\n    python: [pytest]\n  setup: [seed]\n",
+        encoding="utf-8",
+    )
+    spec = load_environment_spec(tmp_path)
+    assert spec.image == "base:1" and spec.python_dependencies == ["pytest"]
+
+    plan = plan_task_build(tmp_path, "img:latest", generated_dir=tmp_path / "build")
+    content = plan.dockerfile.read_text(encoding="utf-8")
+    assert plan.generated and plan.base_image == "base:1"
+    assert content.startswith("FROM base:1") and "pip install --no-cache-dir pytest" in content
+
+    # Dockerfile escape hatch wins when no yaml present.
+    other = tmp_path / "task2" / "environment"
+    other.mkdir(parents=True)
+    (other / "Dockerfile").write_text("FROM scratch\n", encoding="utf-8")
+    escape = load_environment_spec(tmp_path / "task2")
+    assert escape.dockerfile == other / "Dockerfile" and escape.image is None
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_gating.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_gating.py
new file mode 100644
index 0000000000..613a4cfaa3
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_gating.py
@@ -0,0 +1,106 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for the promoted deterministic gate."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate, summarize_run, write_gate_report
+from nemo_evaluator_sdk.agent_eval.types import (
+    AgentEvalAttempt,
+    AgentEvalRunResult,
+    AgentEvalSummary,
+    AgentEvalTask,
+    AgentEvalTaskResult,
+    AgentOutput,
+)
+from nemo_evaluator_sdk.metrics.protocol import MetricOutput
+
+
+def _make_run_result(
+    *, reward: float, total_tokens: int, runtime_sec: float, commit: str = "abc123"
+) -> AgentEvalRunResult:
+    task = AgentEvalTask(id="demo", intent="do it", inputs={})
+    attempt = AgentEvalAttempt(
+        id="demo:workflow",
+        task_id="demo",
+        status="completed",
+        output=AgentOutput(text="ok"),
+        metadata={
+            "total_tokens": total_tokens,
+            "runtime_sec": runtime_sec,
+            "provenance": {"commit_sha": commit, "commit_short": commit[:7]},
+        },
+    )
+    task_result = AgentEvalTaskResult(
+        id="demo:workflow:agentic_use_verifier_reward",
+        run_id="run-1",
+        task_id="demo",
+        attempt_id="demo:workflow",
+        metric_type="agentic_use_verifier_reward",
+        outputs=[MetricOutput(name="verifier_reward", value=reward)],
+    )
+    return AgentEvalRunResult(
+        run_id="run-1",
+        tasks=[task],
+        attempts=[attempt],
+        results=[task_result],
+        summary=AgentEvalSummary(),
+    )
+
+
+def test_summarize_run_aggregates_pass_tokens_runtime_provenance() -> None:
+    summary = summarize_run(_make_run_result(reward=1.0, total_tokens=120, runtime_sec=4.5))
+    assert summary["total_tasks"] == 1
+    assert summary["pass_rate"] == 1.0
+    assert summary["total_tokens_sum"] == 120
+    assert summary["runtime_sec_sum"] == 4.5
+    assert summary["token_metrics_coverage"] == 1.0
+    assert summary["provenance"]["commit_sha"] == "abc123"
+
+
+def test_evaluate_gate_passes_then_flags_token_regression(tmp_path: Path) -> None:
+    baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0)
+    candidate = _make_run_result(reward=1.0, total_tokens=200, runtime_sec=4.0)
+
+    baseline_report = evaluate_gate(baseline, thresholds=GateThresholds())
+    assert baseline_report.gate_passed is True
+
+    candidate_report = evaluate_gate(candidate, thresholds=GateThresholds(), baseline_summary=baseline_report.summary)
+    assert candidate_report.gate_passed is False
+    token_check = next(c for c in candidate_report.checks if c.name == "tokens_not_worse_than_baseline")
+    assert token_check.passed is False
+
+    gate_path = write_gate_report(candidate_report, tmp_path)
+    assert gate_path.exists() and "gate_passed" in gate_path.read_text(encoding="utf-8")
+
+
+def test_evaluate_gate_blocks_cross_commit_comparison() -> None:
+    baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="aaa111")
+    candidate = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="bbb222")
+
+    baseline_summary = evaluate_gate(baseline, thresholds=GateThresholds()).summary
+    report = evaluate_gate(candidate, thresholds=GateThresholds(), baseline_summary=baseline_summary)
+    cross = next(c for c in report.checks if c.name == "commit_sha_matches_baseline")
+    assert cross.passed is False and report.gate_passed is False
+
+    allowed = evaluate_gate(
+        candidate, thresholds=GateThresholds(allow_cross_commit=True), baseline_summary=baseline_summary
+    )
+    cross_allowed = next(c for c in allowed.checks if c.name == "commit_sha_matches_baseline")
+    assert cross_allowed.passed is True
+
+
+def test_summarize_run_uses_measurement_fallbacks() -> None:
+    # duration_ms -> runtime_sec, and metadata reward when no scored metric output.
+    run = _make_run_result(reward=0.0, total_tokens=10, runtime_sec=1.0)
+    run.attempts[0].metadata.pop("runtime_sec")
+    run.attempts[0].metadata["duration_ms"] = 2500
+    run.attempts[0].metadata["reward"] = 1
+    run.results.clear()  # no scored metric outputs -> fall back to metadata reward
+
+    summary = summarize_run(run)
+    assert summary["runtime_sec_sum"] == 2.5
+    assert summary["pass_rate"] == 1.0
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py
new file mode 100644
index 0000000000..ed7da3beee
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py
@@ -0,0 +1,37 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Guardrail: the agent_eval package must stay free of NeMo-Platform imports.
+
+The SDK is consumed by ``tests/agentic-use`` (the NeMo-Platform adapter), never
+the reverse. This test fails if any module under ``agent_eval`` imports a
+platform-specific package, which keeps the promotion from leaking coupling into
+the SDK.
+"""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+
+import nemo_evaluator_sdk.agent_eval as agent_eval
+
+AGENT_EVAL_ROOT = Path(agent_eval.__file__).resolve().parent
+
+# Import statements that would couple the SDK to the platform / adapter.
+_FORBIDDEN = re.compile(
+    r"^\s*(?:from|import)\s+"
+    r"(nemo_platform|nmp_[A-Za-z0-9_]+|nat_runner|runtimes(?:\.|\s|$)|evaluator_agent_eval)",
+    re.MULTILINE,
+)
+
+
+def test_agent_eval_has_no_platform_imports() -> None:
+    offenders: list[str] = []
+    for path in sorted(AGENT_EVAL_ROOT.rglob("*.py")):
+        text = path.read_text(encoding="utf-8")
+        for match in _FORBIDDEN.finditer(text):
+            line_no = text.count("\n", 0, match.start()) + 1
+            offenders.append(f"{path.relative_to(AGENT_EVAL_ROOT)}:{line_no}: {match.group(0).strip()}")
+
+    assert not offenders, "agent_eval must not import NeMo-Platform packages:\n" + "\n".join(offenders)
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_measurements.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_measurements.py
new file mode 100644
index 0000000000..bc11bce7ef
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_measurements.py
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for the typed AttemptMeasurements contract."""
+
+from __future__ import annotations
+
+from nemo_evaluator_sdk.agent_eval.measurements import AttemptMeasurements
+
+
+def test_from_metadata_reads_tokens_runtime_reward_and_provenance() -> None:
+    measurements = AttemptMeasurements.from_metadata(
+        {
+            "total_tokens": 120,
+            "prompt_tokens": 80,
+            "completion_tokens": 40,
+            "runtime_sec": 4.5,
+            "reward": 1,
+            "passed": True,
+            "provenance": {"commit_sha": "abc123"},
+        }
+    )
+    assert measurements.total_tokens == 120
+    assert measurements.runtime_sec == 4.5
+    assert measurements.reward == 1.0
+    assert measurements.passed is True
+    assert measurements.provenance["commit_sha"] == "abc123"
+
+
+def test_from_metadata_applies_fallbacks_and_ignores_bad_types() -> None:
+    # duration_ms -> runtime_sec, passed -> reward, bool is not a token count.
+    measurements = AttemptMeasurements.from_metadata(
+        {"duration_ms": 2500, "passed": False, "total_tokens": True}
+    )
+    assert measurements.runtime_sec == 2.5
+    assert measurements.reward == 0.0
+    assert measurements.total_tokens is None
+
+    empty = AttemptMeasurements.from_metadata(None)
+    assert empty.reward is None and empty.runtime_sec is None and empty.provenance == {}
+
+
+def test_to_metadata_round_trips_only_set_values() -> None:
+    payload = AttemptMeasurements(total_tokens=10, runtime_sec=1.0, reward=1.0).to_metadata()
+    assert payload == {"total_tokens": 10, "runtime_sec": 1.0, "reward": 1.0}
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_orchestrator.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_orchestrator.py
new file mode 100644
index 0000000000..d5acd5bd3f
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_orchestrator.py
@@ -0,0 +1,131 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for the generic agent-eval orchestrator (online + offline paths)."""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Sequence
+from pathlib import Path
+
+import pytest
+from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric
+from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig
+from nemo_evaluator_sdk.agent_eval.types import (
+    AgentEvalAttempt,
+    AgentEvalRunConfig,
+    AgentEvalTask,
+    AgentOutput,
+)
+from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult
+
+
+class _ExtraMetric:
+    @property
+    def type(self) -> str:
+        return "extra"
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        return [MetricOutputSpec.continuous_score("extra")]
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        return MetricResult(outputs=[MetricOutput(name="extra", value=1.0)])
+
+
+class _FakeRuntime:
+    def __init__(self) -> None:
+        self.prepared_ids: list[str] = []
+
+    async def run_tasks(
+        self, tasks: Sequence[AgentEvalTask], config: AgentEvalRunConfig | None = None
+    ) -> Sequence[AgentEvalAttempt]:
+        return [
+            AgentEvalAttempt(
+                id=f"{task.id}:fake",
+                task_id=task.id,
+                status="completed",
+                output=AgentOutput(text="ok"),
+                metadata={"agent_ok": True},
+            )
+            for task in tasks
+        ]
+
+
+def _task() -> AgentEvalTask:
+    return AgentEvalTask(id="demo", intent="do it", inputs={}, metrics=[AgentPhaseSuccessMetric()])
+
+
+@pytest.mark.asyncio
+async def test_run_tasks_appends_extra_metrics_and_runs_prepare_hook(tmp_path: Path) -> None:
+    runtime = _FakeRuntime()
+    seen: list[str] = []
+    orch = AgentEvalOrchestrator(
+        config=OrchestratorConfig(write_dashboard=False, write_gate=True),
+        extra_metrics=[_ExtraMetric()],
+    )
+
+    result = await orch.run_tasks(
+        [_task()],
+        target=runtime,
+        benchmark={"benchmark": "demo"},
+        output_dir=tmp_path,
+        run_id="run-1",
+        prepare_task=lambda task: seen.append(task.id),
+    )
+
+    assert seen == ["demo"]
+    assert {m.type for m in result.tasks[0].metrics} == {"agent_phase_success", "extra"}
+    assert result.attempts[0].status == "completed"
+    # Gate is written next to the run bundle.
+    assert (tmp_path / "gate.json").exists()
+
+
+@pytest.mark.asyncio
+async def test_score_attempts_offline_does_not_invoke_runtime() -> None:
+    orch = AgentEvalOrchestrator(config=OrchestratorConfig(write_dashboard=False, write_gate=False))
+    attempt = AgentEvalAttempt(
+        id="demo:stored",
+        task_id="demo",
+        status="completed",
+        output=AgentOutput(text="ok"),
+        metadata={"agent_ok": True},
+    )
+    result = await orch.score_attempts([_task()], attempts=[attempt])
+    assert [m.type for m in result.tasks[0].metrics] == ["agent_phase_success"]
+    assert any(r.metric_type == "agent_phase_success" for r in result.results)
+
+
+@pytest.mark.asyncio
+async def test_extra_metrics_deduplicated_by_type() -> None:
+    task = AgentEvalTask(id="demo", intent="i", inputs={}, metrics=[AgentPhaseSuccessMetric(), _ExtraMetric()])
+    orch = AgentEvalOrchestrator(
+        config=OrchestratorConfig(write_dashboard=False, write_gate=False),
+        extra_metrics=[_ExtraMetric()],
+    )
+    attempt = AgentEvalAttempt(id="demo:s", task_id="demo", status="completed", output=AgentOutput(text="ok"))
+    result = await orch.score_attempts([task], attempts=[attempt])
+    types = [m.type for m in result.tasks[0].metrics]
+    assert types.count("extra") == 1
+
+
+def test_result_dir_attempt_source_protocol_shape(tmp_path: Path) -> None:
+    # A minimal AgentAttemptSource implementation satisfies the protocol.
+    from nemo_evaluator_sdk.agent_eval.types import AgentAttemptSource
+
+    class _Source:
+        def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt:
+            payload = json.loads(Path(source).read_text(encoding="utf-8"))
+            return AgentEvalAttempt(
+                id=f"{task.id}:stored",
+                task_id=task.id,
+                status="completed",
+                output=AgentOutput(text=payload["agent"]),
+            )
+
+    src_path = tmp_path / "result.json"
+    src_path.write_text(json.dumps({"agent": "ok"}), encoding="utf-8")
+    source: AgentAttemptSource = _Source()
+    assert isinstance(source, AgentAttemptSource)
+    attempt = source.load_attempt(src_path, task=_task())
+    assert attempt.task_id == "demo"
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_verify.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_verify.py
new file mode 100644
index 0000000000..136fda6075
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_verify.py
@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for the generic verifier mechanic."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from nemo_evaluator_sdk.agent_eval.runtimes.verify import (
+    apply_verify_to_metadata,
+    collect_verifier_outcome,
+    skipped_outcome,
+)
+
+
+def test_collect_reads_reward_file_when_present(tmp_path: Path) -> None:
+    (tmp_path / "reward.txt").write_text("1\n", encoding="utf-8")
+    (tmp_path / "test-stdout.txt").write_text("PASSED", encoding="utf-8")
+    outcome = collect_verifier_outcome(ok=False, exit_code=3, log_dir=tmp_path)
+    # reward.txt is authoritative even when the process exit said not-ok.
+    assert outcome.ran and outcome.reward == 1 and outcome.exit_code == 3
+    assert outcome.stdout == "PASSED"
+
+
+def test_collect_derives_and_writes_reward_when_missing(tmp_path: Path) -> None:
+    outcome = collect_verifier_outcome(ok=True, exit_code=0, log_dir=tmp_path)
+    assert outcome.reward == 1 and outcome.passed is True
+    assert (tmp_path / "reward.txt").read_text(encoding="utf-8").strip() == "1"
+
+
+def test_apply_to_metadata_stamps_and_skips(tmp_path: Path) -> None:
+    meta: dict[str, object] = {}
+    apply_verify_to_metadata(meta, skipped_outcome())
+    assert meta == {"verify_status": "skipped"}
+
+    meta2: dict[str, object] = {}
+    apply_verify_to_metadata(meta2, collect_verifier_outcome(ok=True, exit_code=0, log_dir=tmp_path))
+    assert meta2["verify_status"] == "ok" and meta2["reward"] == 1 and meta2["passed"] is True
diff --git a/tests/agentic-use/runtimes/COMPLIANCE.md b/tests/agentic-use/runtimes/COMPLIANCE.md
index b7d55b1b13..526aa7b0e7 100644
--- a/tests/agentic-use/runtimes/COMPLIANCE.md
+++ b/tests/agentic-use/runtimes/COMPLIANCE.md
@@ -7,15 +7,34 @@ and `AgentAttemptRuntime` in `nemo_evaluator_sdk.agent_eval`).
 Design reference: internal agent-eval SDK doc
 (`https://docs.google.com/document/d/1mA9Kl6LVJFlgbj5CGulUOiaGyliP7QhqBh7jKXFGifM`).
 
+## Adapter-over-SDK note
+
+The generic building blocks have been **promoted into the SDK**
+(`nemo_evaluator_sdk.agent_eval`): the environment boundary
+(`runtimes.environment`/`environment_spec`/`docker`), gating (`gating`), attempt
+helpers (`attempts`), generic layout (`runtimes.layout`), reusable metrics
+(`common_metrics`: `AgentPhaseSuccessMetric` + a real metric-over-evidence
+`EvidencePresenceMetric`), the generic orchestrator (`orchestrator`), the
+`AgentAttemptSource` protocol, the verifier mechanic (`runtimes.verify`), and the
+coding-agent driver seam (`runtimes.coding_agent`). Those SDK homes are imported
+**directly** by the runtime scripts — there are no re-export shims. The only
+NeMo-Platform specifics that remain (the agentic task loader, `result.json`
+import, attempt construction, the pytest verifier command, the `state` evidence
+key, `task_image_tag` + platform `DockerEnvironmentProvider`, the
+`VerifierRewardMetric`) are consolidated into a single module,
+`shared/platform.py` (alongside `shared/config.py` and `shared/constants.py`).
+A CI regex-scan guardrail test (`packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py`)
+keeps `agent_eval/` free of NeMo-Platform imports.
+
 ## Scope split (per SDK design)
 
 | `nat_runner` responsibility | Belongs in `AgentAttemptRuntime`? | Current location |
 |----------------------------|-----------------------------------|------------------|
 | AGENT phase — run backend in Docker, capture logs/trajectory | **Yes** | `runtimes/<backend>/runtime.py` |
-| BUILD — task image | **No** | `AgenticEvalOrchestrator` via `shared/environment_spec.py` (env spec / Dockerfile) + `shared/docker.py` |
-| VERIFY — pytest `test_outputs.py`, `reward.txt` | **Through env boundary** | `shared/verify.py` via `AgentEnvironmentHandle.run_verifier` (runtimes call it after the agent when `shared.run_verify=True`) |
+| BUILD — task image | **No** | `AgenticEvalOrchestrator` via `agent_eval.runtimes.environment_spec` (env spec / Dockerfile) + `agent_eval.runtimes.docker` |
+| VERIFY — pytest `test_outputs.py`, `reward.txt` | **Through env boundary** | `shared/platform.py` via `AgentEnvironmentHandle.run_verifier` (runtimes call it after the agent when `shared.run_verify=True`) |
 | CLI — task globs, manifests, summaries | **No** | Still `nat_runner.main` (not migrated) |
-| `result.json` contract | **No** (still produced by `nat_runner`) | Importable as an attempt via `shared/result_adapter.py`; scored offline via `AgenticEvalOrchestrator.score_captured_attempts` |
+| `result.json` contract | **No** (still produced by `nat_runner`) | Importable as an attempt via `shared/platform.py`; scored offline via `AgenticEvalOrchestrator.score_captured_attempts` |
 
 ## Task metrics (authored on the task)
 
@@ -46,14 +65,14 @@ metric scoring row.
 | `_prepare_workflow_for_runtime` | `workflow/prep.py` |
 | `_build_aut_agent_cmd` | `aut/command.py` |
 | `_prepare_aut_config_for_runtime` | `aut/prep.py` |
-| `_agent_log_has_workflow_error` | `shared/agent_log.py` |
-| `run_verify_phase` | `shared/verify.py` (`build_verify_run_spec` + `run_verify` via `run_verifier`) |
-| `_docker_run`, `build_task_image` | `shared/docker.py` (`docker_run`, `build_dockerfile`, `build_task_image`) |
-| BUILD env resolution (`environment/Dockerfile`) | `shared/environment_spec.py` (`load_environment_spec`, `plan_task_build`) |
-| `_write_result` (`result.json`) | `shared/result_adapter.py` (import side only; `nat_runner` still writes it) |
-| pass-rate / token / runtime gate | `shared/reporting.py` (mirrors `passrate_token_policy_gate.py`) |
-| `_extract_usage_metrics` | `shared/usage.py` (delegates to `nat_runner` until deduped) |
-| `capture_agent_attempt` shape | `shared/artifacts.py` |
+| `_agent_log_has_workflow_error` | `shared/platform.py` |
+| `run_verify_phase` | `shared/platform.py` (`build_verify_run_spec` + `run_verify` via `run_verifier`) |
+| `_docker_run`, `build_task_image` | `agent_eval.runtimes.docker` (`docker_run`, `build_dockerfile`, `build_task_image`) |
+| BUILD env resolution (`environment/Dockerfile`) | `agent_eval.runtimes.environment_spec` (`load_environment_spec`, `plan_task_build`) |
+| `_write_result` (`result.json`) | `shared/platform.py` (import side only; `nat_runner` still writes it) |
+| pass-rate / token / runtime gate | `agent_eval.gating` (mirrors `passrate_token_policy_gate.py`) |
+| `_extract_usage_metrics` | `shared/platform.py` (delegates to `nat_runner` until deduped) |
+| `capture_agent_attempt` shape | `shared/platform.py` |
 | `run_agent_phase` | **Removed per backend** once all backends migrated |
 
 ## Attempt record contract
@@ -66,14 +85,14 @@ includes canonical `CapturedAgentAttempt` fields:
 - Artifact paths: `agent_log_dir`, `workspace_dir`, `state_dir`, `atif_trajectory_path`
 - Phase outcome: `agent_ok`
 - Verifier outcome (when `run_verify=True`): `verify_status`, `passed`, `reward`,
-  `verifier_log_dir` (stamped by `shared/verify.py::apply_verify_to_metadata`)
+  `verifier_log_dir` (stamped by `apply_verify_to_metadata` from `agent_eval.runtimes.verify`)
 
 Use `to_captured_agent_attempt(task, attempt)` for verify/scoring code that
 expects the portable `CapturedAgentAttempt` type.
 
 ## `nat_runner` artifact → `AgentEvalAttempt` evidence map (per design doc)
 
-`shared/artifacts.py::_evidence_descriptors` emits the documented keys:
+`shared/platform.py::_evidence_descriptors` emits the documented keys:
 
 | `nat_runner` output | `AgentEvalAttempt` mapping | Status |
 |---------------------|----------------------------|--------|
@@ -83,7 +102,7 @@ expects the portable `CapturedAgentAttempt` type.
 | `agent/trajectory.json` | `evidence["trace"]` (ATIF when normalized, else json) | Implemented |
 | `agent/` logs | `evidence["logs"]` (dir, `primary_log=nat_agent.log`) | Implemented |
 | `verifier/` logs | `evidence["verifier_logs"]` (added once verify phase runs) | Implemented (conditional) |
-| `result.json` | attempt status + measurements + provenance + token/cost | Implemented — `shared/result_adapter.py::attempt_from_result` / `attempt_from_result_dir` |
+| `result.json` | attempt status + measurements + provenance + token/cost | Implemented — `shared/platform.py::attempt_from_result` / `attempt_from_result_dir` |
 | final agent log/message | `AgentOutput.text` | Implemented |
 
 `result.json` mapping detail (`attempt_from_result`):
@@ -93,7 +112,7 @@ expects the portable `CapturedAgentAttempt` type.
   attempt-production failures because the SDK's `AgentEvaluator` excludes
   `status=="failed"` from scoring (it raises); an agent that ran but failed must
   stay scorable so pass-rate gating counts it as a `0`. The live builder
-  (`shared/artifacts.py`) and this importer share the same helper.
+  (`shared/platform.py`) and this importer share the same helper.
 - `result["reward"]`/`result["passed"]` → `metadata` measurements (verifier reward
   stays a *measurement*, scored by `VerifierRewardMetric`, not the attempt status).
 - `result["metrics"]` (token/cost) → flattened into `metadata`.
@@ -103,10 +122,10 @@ expects the portable `CapturedAgentAttempt` type.
 
 | Doc section | Status in this package |
 |-------------|------------------------|
-| **B1** wrap `nat_runner` as attempt runtime(s) | In progress — AGENT phase extracted to per-backend runtimes (`workflow`, `aut` done; 3 CLI backends scaffolded); live VERIFY wired through the B2 boundary; `result.json` import path added via `shared/result_adapter.py`, exposed as the first-class **stored-attempt scoring** path via `AgenticEvalOrchestrator.score_captured_attempts` (and `run_agent_eval.py --rescore-dir`) — no Docker/agent execution. Remaining: 3 CLI backends + converging `nat_runner.main` onto the orchestrator. Note: doc proposes one `NatRunnerAttemptRuntime`; we deliberately split per backend per user direction. |
-| **B2** `EnvironmentProvider` boundary | **Implemented** — `shared/environment.py` defines `AgentEnvironmentProvider`/`AgentEnvironmentHandle` below `AgentAttemptRuntime`; `DockerEnvironmentProvider` wraps `shared/docker.py`. `workflow` + `aut` runtimes execute through the boundary (provider is injectable). NeMo Gym/local providers can now be added without touching runtimes. |
-| **B3** standardize environment authoring | **Implemented (minimal)** — `shared/environment_spec.py` adds a declarative `environment.yaml` (`image` + `profile` + python `dependencies` + `setup`) with a `dockerfile:` escape hatch and backward-compatible auto-detection of `environment/Dockerfile`. `plan_task_build` resolves a spec to a `BuildPlan` (image-based specs generate a tiny derived Dockerfile); the orchestrator BUILD step uses it. `setup` steps are carried as plan/label metadata, not executed (runtime concern). |
-| **B4** productize results + CI | **Implemented** — SDK `persist_run` writes `tasks/attempts/results.jsonl`, `summary.json`, `report.html`; `shared/reporting.py` adds candidate-vs-baseline gating (pass-rate, token/cost, runtime tie-breaker) + deterministic provenance checks, persisted as `gate.json` by the orchestrator. `result.json` → attempt adapter + `VerifierRewardMetric` compatibility metric also done. |
+| **B1** wrap `nat_runner` as attempt runtime(s) | In progress — AGENT phase extracted to per-backend runtimes (`workflow`, `aut` done; 3 CLI backends scaffolded); live VERIFY wired through the B2 boundary; `result.json` import path added via `shared/platform.py`, exposed as the first-class **stored-attempt scoring** path via `AgenticEvalOrchestrator.score_captured_attempts` (and `run_agent_eval.py --rescore-dir`) — no Docker/agent execution. Remaining: 3 CLI backends + converging `nat_runner.main` onto the orchestrator. Note: doc proposes one `NatRunnerAttemptRuntime`; we deliberately split per backend per user direction. |
+| **B2** `EnvironmentProvider` boundary | **Implemented** — `agent_eval.runtimes.environment` defines `AgentEnvironmentProvider`/`AgentEnvironmentHandle` below `AgentAttemptRuntime`; the platform `DockerEnvironmentProvider` (`shared/platform.py`) wraps `agent_eval.runtimes.docker` with the `nmp-nat-<id>` image tag. `workflow` + `aut` runtimes execute through the boundary (provider is injectable). NeMo Gym/local providers can now be added without touching runtimes. |
+| **B3** standardize environment authoring | **Implemented (minimal)** — `agent_eval.runtimes.environment_spec` adds a declarative `environment.yaml` (`image` + `profile` + python `dependencies` + `setup`) with a `dockerfile:` escape hatch and backward-compatible auto-detection of `environment/Dockerfile`. `plan_task_build` resolves a spec to a `BuildPlan` (image-based specs generate a tiny derived Dockerfile); the orchestrator BUILD step uses it. `setup` steps are carried as plan/label metadata, not executed (runtime concern). |
+| **B4** productize results + CI | **Implemented** — SDK `persist_run` writes `tasks.jsonl`, `attempts.jsonl`, `results.jsonl`, `summary.json`, `report.html`; `agent_eval.gating` adds candidate-vs-baseline gating (pass-rate, token/cost, runtime tie-breaker) + deterministic provenance checks, persisted as `gate.json` by the orchestrator. `result.json` → attempt adapter + `VerifierRewardMetric` compatibility metric also done. |
 
 ### B4 reporting / gating detail
 
@@ -114,7 +133,7 @@ expects the portable `CapturedAgentAttempt` type.
   calls `agent_eval.persistence.persist_run`, writing `tasks.jsonl`,
   `attempts.jsonl`, `results.jsonl`, `summary.json`, `benchmark.json`, `run.json`,
   and (when `write_dashboard=True`) `report.html`.
-- **Gating** (`shared/reporting.py`): `summarize_run` aggregates pass-rate,
+- **Gating** (`agent_eval.gating`): `summarize_run` aggregates pass-rate,
   token totals/coverage, runtime totals, and run-level provenance from the typed
   `AgentEvalRunResult` (metric scores first, attempt metadata as fallback).
   `evaluate_gate` applies absolute thresholds and candidate-vs-baseline checks:
@@ -132,7 +151,7 @@ expects the portable `CapturedAgentAttempt` type.
 The doc sketches `AgentEnvironmentHandle.run_agent(instruction, config) -> AgentEvalAttempt`.
 We instead use `run_agent(EnvRunSpec) -> EnvCommandResult` (and the symmetric
 `run_verifier`). Rationale: per-backend command/env/mount construction lives in the
-runtime, and attempt construction lives in `shared/artifacts.py`. Keeping the
+runtime, and attempt construction lives in `shared/platform.py`. Keeping the
 environment layer at "execute a command, return exit status" means a new provider
 (local, Harbor, NeMo Gym) only implements process execution — it never needs to
 know about backends or attempt schemas.
diff --git a/tests/agentic-use/runtimes/README.md b/tests/agentic-use/runtimes/README.md
index 90317f204a..5b149c10ec 100644
--- a/tests/agentic-use/runtimes/README.md
+++ b/tests/agentic-use/runtimes/README.md
@@ -1,32 +1,70 @@
 # Agentic-use AgentAttemptRuntime implementations
 
-Backend-specific runtimes extracted from `nat_runner.py` for use with
-`nemo_evaluator_sdk.agent_eval.AgentEvaluator`.
+This directory is the NeMo-Platform **adapter** over the generic agent-eval
+framework in `nemo_evaluator_sdk.agent_eval`. The backend-agnostic building blocks
+(environment boundary, gating, attempt/evidence helpers, orchestrator, verify
+mechanic, coding-agent driver seam) now live in the SDK; this directory holds only
+the NeMo-Platform glue (the `workflow`/`aut` backends, agentic task/result formats,
+the pytest verifier, the platform Docker build/image-tag) plus a thin orchestrator factory.
+
+## Architecture: adapter over SDK
+
+The backend-agnostic logic lives in `nemo_evaluator_sdk.agent_eval` and is
+imported **directly** by the runtime scripts (no re-export shims). Everything
+generic comes from these SDK homes:
+
+| What | SDK home |
+|------|----------|
+| Docker CLI helpers | `agent_eval.runtimes.docker` |
+| Environment boundary (`AgentEnvironmentProvider`/`Handle`, `EnvRunSpec`) | `agent_eval.runtimes.environment` |
+| Environment authoring (`load_environment_spec`, `plan_task_build`, …) | `agent_eval.runtimes.environment_spec` |
+| Gating (`GateThresholds`, `evaluate_gate`, `summarize_run`, …) | `agent_eval.gating` |
+| Verify mechanic (`apply_verify_to_metadata`, `collect_verifier_outcome`) | `agent_eval.runtimes.verify` |
+| `AgentPhaseSuccessMetric`, attempt-status + evidence helpers | `agent_eval.common_metrics`, `agent_eval.attempts` |
+| Generic orchestrator + run layout | `agent_eval.orchestrator`, `agent_eval.runtimes.layout` |
+
+All NeMo-Platform-specific glue is consolidated into a single module,
+`shared/platform.py` (alongside `shared/config.py` and `shared/constants.py`):
+the run layout with the platform `state_dir`; the `nmp-nat-<id>` image tag and
+`DockerEnvironmentProvider` default; the namespaced `AgentPhaseSuccessMetric` and
+the `VerifierRewardMetric`; agent-log/usage parsing and the shared container env;
+attempt construction (live and `result.json`); the live VERIFY phase; and the agentic-use task loader.
+
+The orchestrator (`orchestrator.py`) is a thin factory over
+`agent_eval.orchestrator.AgentEvalOrchestrator`: it injects the platform image
+build (`prepare_task`), the `run_verify`-derived `VerifierRewardMetric`
+(`extra_metrics`), and the `result.json` `AgentAttemptSource`.
 
 ## Layout
 
 ```text
 runtimes/
-  shared/           # backend-agnostic building blocks:
-                    #   docker.py            Docker exec + build helpers
-                    #   environment.py       AgentEnvironmentProvider/Handle boundary (B2)
-                    #   environment_spec.py  environment.yaml authoring + build plans (B3)
-                    #   layout.py            per-run output layout
-                    #   task_loader.py       agentic-use task -> AgentEvalTask
-                    #   container_env.py     base container env vars
-                    #   artifacts.py         agent artifacts -> AgentEvalAttempt (+ evidence)
-                    #   result_adapter.py    nat_runner result.json -> AgentEvalAttempt (B1/B4)
-                    #   verify.py            live VERIFY via run_verifier
-                    #   reporting.py         summary + candidate/baseline gate (B4)
-                    #   metrics.py           AgentPhaseSuccessMetric, VerifierRewardMetric
-  workflow/         # NatWorkflowAttemptRuntime (implemented)
-  aut/              # AutAgentAttemptRuntime (implemented)
-  claude_code/      # ClaudeCodeAgentAttemptRuntime (scaffold)
-  codex/            # CodexAgentAttemptRuntime (scaffold)
-  cursor_agent/     # CursorAgentAttemptRuntime (scaffold)
-  orchestrator.py   # BUILD (env spec) + AgentEvaluator + gate; verify runs in the runtime
+  shared/           # platform glue only:
+                    #   platform.py  — all NeMo-Platform helpers (one file)
+                    #   config.py    — runtime config dataclasses
+                    #   constants.py — paths / container constants
+  workflow/         # NatWorkflowAttemptRuntime (implemented, NeMo construct)
+  aut/              # AutAgentAttemptRuntime (implemented, NeMo construct)
+  claude_code/      # scaffold (stub) — see "Coding-agent runtimes" below
+  codex/            # scaffold (stub)
+  cursor_agent/     # scaffold (stub)
+  orchestrator.py   # thin factory over agent_eval.orchestrator.AgentEvalOrchestrator
 ```
 
+## Coding-agent runtimes (SDK driver seam)
+
+Coding-agent CLIs plug into the SDK via
+`agent_eval.runtimes.coding_agent`: `CliAgentDriver` (the reusable driver) +
+`CodingAgentSpec` (per-agent command builder + trajectory→evidence parser).
+Reference specs `ClaudeCodeSpec` and `CursorAgentSpec` are shipped. The profbench
+codex runtime (`agent_eval.runtimes.codex`) remains a separate, standalone-CLI runtime.
+
+The agentic-use `codex`/`claude_code`/`cursor_agent` backends here are still
+stubs: wiring them to run the SDK driver *inside* the `nmp-agentic-base` Docker
+environment (like `workflow`/`aut`) is bespoke per agent and a tracked follow-up.
+`workflow` and `aut` stay in the adapter — they implement `AgentAttemptRuntime`
+but are NeMo constructs, not general SDK runtimes.
+
 ## Example: workflow backend
 
 From the repository root (requires Docker + built task image):
@@ -76,7 +114,7 @@ Design-doc implementation path (see [COMPLIANCE.md](./COMPLIANCE.md) for detail)
 
 ## B1 — `result.json` import + stored-attempt scoring
 
-`shared/result_adapter.py` imports an existing `nat_runner` run as an attempt:
+`shared/platform.py` imports an existing `nat_runner` run as an attempt:
 
 - `attempt_from_result_dir(output_dir)` reads `<output_dir>/result.json`.
 - `attempt_from_result(result_dict, output_dir=...)` projects a parsed record.
@@ -101,16 +139,17 @@ when `run_verify=True`. `inputs` holds only agent-facing `instruction`;
 
 ## B2 — Environment boundary
 
-Runtimes execute the agent through `shared/environment.py`
+Runtimes execute the agent through the SDK environment boundary
 (`AgentEnvironmentProvider` → `AgentEnvironmentHandle`) rather than calling
-Docker directly. `DockerEnvironmentProvider` is the default; inject another
+Docker directly. The platform `DockerEnvironmentProvider` (`shared/platform.py`,
+defaulting to the `nmp-nat-<id>` image tag) is the default; inject another
 provider (local, Harbor, NeMo Gym) via the runtime's `environment=` argument
 without changing backend code.
 
 ## B3 — Environment authoring
 
 Tasks can declare a reusable environment instead of hand-writing a Dockerfile.
-`shared/environment_spec.py` loads `environment.yaml` from the task dir:
+`agent_eval.runtimes.environment_spec` loads `environment.yaml` from the task dir:
 
 ```yaml
 environment:
@@ -141,10 +180,10 @@ as metadata, not executed here (they are runtime concerns).
 
 The SDK persists the run bundle (`tasks.jsonl`, `attempts.jsonl`,
 `results.jsonl`, `summary.json`, `report.html`) when `output_dir` is set.
-`shared/reporting.py` adds the gate on top:
+`agent_eval.gating` adds the gate on top:
 
 ```python
-from runtimes.shared.reporting import GateThresholds, evaluate_gate, load_baseline_summary, write_gate_report
+from nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate, load_baseline_summary, write_gate_report
 
 report = evaluate_gate(
     run_result,
@@ -160,7 +199,7 @@ The orchestrator emits `gate.json` automatically (`AgenticOrchestratorConfig.wri
 
 ## Live VERIFY phase (through the B2 boundary)
 
-`shared/verify.py` runs the task-local `tests/test_outputs.py` pytest verifier
+`shared/platform.py` runs the task-local `tests/test_outputs.py` pytest verifier
 through `AgentEnvironmentHandle.run_verifier`, in the same prepared environment
 and against the same persisted workspace/state as the agent phase. Enable it via
 `AgenticSharedConfig(run_verify=True)`; the runtime stamps `reward`/`passed`/
diff --git a/tests/agentic-use/runtimes/__init__.py b/tests/agentic-use/runtimes/__init__.py
index 1b7e12491f..df392483cf 100644
--- a/tests/agentic-use/runtimes/__init__.py
+++ b/tests/agentic-use/runtimes/__init__.py
@@ -3,20 +3,23 @@
 
 """Backend-specific AgentAttemptRuntime implementations for agentic-use evals."""
 
-from runtimes.aut.runtime import AutAgentAttemptRuntime
-from runtimes.claude_code.runtime import ClaudeCodeAgentAttemptRuntime
-from runtimes.codex.runtime import CodexAgentAttemptRuntime
-from runtimes.cursor_agent.runtime import CursorAgentAttemptRuntime
-from runtimes.orchestrator import AgenticEvalOrchestrator, AgenticOrchestratorConfig, runtime_for_backend
-from runtimes.shared.environment import (
+from nemo_evaluator_sdk.agent_eval.gating import (
+    GateCheck,
+    GateReport,
+    GateThresholds,
+    evaluate_gate,
+    load_baseline_summary,
+    summarize_run,
+    write_gate_report,
+)
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import (
     AgentEnvironmentHandle,
     AgentEnvironmentProvider,
     DockerEnvironmentHandle,
-    DockerEnvironmentProvider,
     EnvCommandResult,
     EnvRunSpec,
 )
-from runtimes.shared.environment_spec import (
+from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import (
     BuildPlan,
     EnvironmentSpec,
     execute_build_plan,
@@ -24,20 +27,19 @@
     plan_task_build,
     render_derived_dockerfile,
 )
-from runtimes.shared.metrics import AgentPhaseSuccessMetric, VerifierRewardMetric
-from runtimes.shared.reporting import (
-    GateCheck,
-    GateReport,
-    GateThresholds,
-    evaluate_gate,
-    load_baseline_summary,
-    summarize_run,
-    write_gate_report,
-)
-from runtimes.shared.result_adapter import attempt_from_result, attempt_from_result_dir
-from runtimes.shared.verify import (
-    VerifierOutcome,
-    apply_verify_to_metadata,
+from nemo_evaluator_sdk.agent_eval.runtimes.verify import VerifierOutcome, apply_verify_to_metadata
+
+from runtimes.aut.runtime import AutAgentAttemptRuntime
+from runtimes.claude_code.runtime import ClaudeCodeAgentAttemptRuntime
+from runtimes.codex.runtime import CodexAgentAttemptRuntime
+from runtimes.cursor_agent.runtime import CursorAgentAttemptRuntime
+from runtimes.orchestrator import AgenticEvalOrchestrator, AgenticOrchestratorConfig, runtime_for_backend
+from runtimes.shared.platform import (
+    AgentPhaseSuccessMetric,
+    DockerEnvironmentProvider,
+    VerifierRewardMetric,
+    attempt_from_result,
+    attempt_from_result_dir,
     build_verify_run_spec,
     maybe_run_verify,
     run_verify,
diff --git a/tests/agentic-use/runtimes/aut/runtime.py b/tests/agentic-use/runtimes/aut/runtime.py
index 64bc8e46bc..4185abe826 100644
--- a/tests/agentic-use/runtimes/aut/runtime.py
+++ b/tests/agentic-use/runtimes/aut/runtime.py
@@ -8,12 +8,12 @@
 from collections.abc import Sequence
 from pathlib import Path
 
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import AgentEnvironmentProvider, EnvRunSpec
+from nemo_evaluator_sdk.agent_eval.runtimes.verify import apply_verify_to_metadata
 from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalRunConfig, AgentEvalTask
 
 from runtimes.aut.command import build_aut_agent_cmd
 from runtimes.aut.prep import prepare_aut_config_for_runtime
-from runtimes.shared.agent_log import agent_log_has_workflow_error
-from runtimes.shared.artifacts import build_agent_eval_attempt
 from runtimes.shared.config import AutRuntimeConfig
 from runtimes.shared.constants import (
     DOCKER_SOCKET_CONTAINER_PATH,
@@ -21,15 +21,16 @@
     INSTRUCTION_CONTAINER_PATH,
     REPO_ROOT,
 )
-from runtimes.shared.container_env import base_container_env
-from runtimes.shared.environment import (
-    AgentEnvironmentProvider,
+from runtimes.shared.platform import (
+    AgenticRunLayout,
     DockerEnvironmentProvider,
-    EnvRunSpec,
+    agent_log_has_workflow_error,
+    base_container_env,
+    build_agent_eval_attempt,
+    maybe_run_verify,
+    resolve_run_layout,
+    task_agent_timeout_sec,
 )
-from runtimes.shared.layout import AgenticRunLayout, resolve_run_layout
-from runtimes.shared.task_loader import task_agent_timeout_sec
-from runtimes.shared.verify import apply_verify_to_metadata, maybe_run_verify
 
 RUNTIME_NAME = "aut"
 AUT_CONFIG_CONTAINER_PATH = "/tmp/aut_agent.yml"
diff --git a/tests/agentic-use/runtimes/orchestrator.py b/tests/agentic-use/runtimes/orchestrator.py
index 94eb00050e..74531b1a41 100644
--- a/tests/agentic-use/runtimes/orchestrator.py
+++ b/tests/agentic-use/runtimes/orchestrator.py
@@ -1,7 +1,14 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Orchestrate BUILD + AgentEvaluator + VERIFY for agentic-use tasks."""
+"""Agentic-use adapter over the generic SDK orchestrator.
+
+This is a thin NeMo-Platform factory: the generic run/score/gate loop lives in
+:class:`nemo_evaluator_sdk.agent_eval.orchestrator.AgentEvalOrchestrator`. Here we
+inject the platform specifics it deliberately does not know about — the agentic
+task loader, the Docker image build (``prepare_task``), the ``run_verify``-derived
+``VerifierRewardMetric``, and the ``result.json`` :class:`AgentAttemptSource`.
+"""
 
 from __future__ import annotations
 
@@ -10,7 +17,10 @@
 from pathlib import Path
 from typing import Any
 
-from nemo_evaluator_sdk.agent_eval import AgentEvalRunConfig, AgentEvaluator
+from nemo_evaluator_sdk.agent_eval.gating import GateThresholds
+from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig
+from nemo_evaluator_sdk.agent_eval.runtimes.docker import docker_image_exists
+from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import execute_build_plan, plan_task_build
 from nemo_evaluator_sdk.agent_eval.types import (
     AgentAttemptRuntime,
     AgentEvalRunResult,
@@ -18,13 +28,12 @@
 )
 from nemo_evaluator_sdk.metrics.protocol import Metric
 
-from runtimes.shared.docker import docker_image_exists
-from runtimes.shared.environment_spec import execute_build_plan, plan_task_build
-from runtimes.shared.layout import task_image_tag
-from runtimes.shared.metrics import VerifierRewardMetric
-from runtimes.shared.reporting import GateThresholds, evaluate_gate, load_baseline_summary, write_gate_report
-from runtimes.shared.result_adapter import attempt_from_result_dir
-from runtimes.shared.task_loader import agentic_task_from_dir
+from runtimes.shared.platform import (
+    ResultDirAttemptSource,
+    VerifierRewardMetric,
+    agentic_task_from_dir,
+    task_image_tag,
+)
 
 
 @dataclass(frozen=True)
@@ -38,7 +47,7 @@ class AgenticOrchestratorConfig:
 
 
 class AgenticEvalOrchestrator:
-    """Run agentic-use tasks through AgentEvaluator and optional verify phase."""
+    """Run agentic-use tasks through the generic orchestrator + optional verify metric."""
 
     def __init__(
         self,
@@ -48,6 +57,16 @@ def __init__(
     ) -> None:
         self.runtime = runtime
         self.config = config or AgenticOrchestratorConfig()
+        self._orchestrator = AgentEvalOrchestrator(
+            config=OrchestratorConfig(
+                parallelism=1,
+                write_dashboard=self.config.write_dashboard,
+                write_gate=self.config.write_gate,
+                gate_thresholds=self.config.gate_thresholds,
+                baseline_summary_path=self.config.baseline_summary_path,
+            ),
+            extra_metrics=self._extra_metrics(),
+        )
 
     async def run_agent_eval(
         self,
@@ -58,25 +77,15 @@ async def run_agent_eval(
     ) -> AgentEvalRunResult:
         """Build the task image when needed, run the agent runtime, return SDK result."""
         task = agentic_task_from_dir(task_name)
-        task = task.model_copy(update={"metrics": self._metrics_for_task(task)})
-        image_tag = task_image_tag(task.id)
-        self._ensure_task_image(task.metadata["task_dir"], image_tag)
-
-        result = await AgentEvaluator().run(
-            tasks=[task],
+        return await self._orchestrator.run_tasks(
+            [task],
             target=self.runtime,
-            config=AgentEvalRunConfig(
-                output_dir=output_dir,
-                run_id=run_id,
-                parallelism=1,
-                write_dashboard=self.config.write_dashboard,
-                benchmark={"benchmark": "agentic-use", "task": task_name},
-            ),
+            benchmark={"benchmark": "agentic-use", "task": task_name},
+            output_dir=output_dir,
+            run_id=run_id,
+            prepare_task=self._ensure_task_image,
         )
 
-        self._maybe_write_gate(result)
-        return result
-
     async def score_captured_attempts(
         self,
         task_name: str,
@@ -87,61 +96,38 @@ async def score_captured_attempts(
     ) -> AgentEvalRunResult:
         """Score already-captured ``result.json`` runs without re-running the agent.
 
-        This is the SDK's first-class *stored-attempt* path: it imports each
-        ``nat_runner`` output directory via :func:`attempt_from_result_dir` and
-        scores them through :class:`AgentEvaluator`, so metrics can be exercised
-        (and runs rescored) with no Docker/agent execution.
+        The SDK's first-class *stored-attempt* path: each ``nat_runner`` output
+        dir is adapted via :class:`ResultDirAttemptSource` and scored through the
+        generic orchestrator, so metrics can be exercised (and runs rescored) with
+        no Docker/agent execution.
         """
         task = agentic_task_from_dir(task_name)
-        task = task.model_copy(update={"metrics": self._metrics_for_task(task)})
-        attempts = [attempt_from_result_dir(result_dir, task=task) for result_dir in result_dirs]
-
-        result = await AgentEvaluator().run(
-            tasks=[task],
+        source = ResultDirAttemptSource()
+        attempts = [source.load_attempt(result_dir, task=task) for result_dir in result_dirs]
+        return await self._orchestrator.score_attempts(
+            [task],
             attempts=attempts,
-            config=AgentEvalRunConfig(
-                output_dir=output_dir,
-                run_id=run_id,
-                parallelism=1,
-                write_dashboard=self.config.write_dashboard,
-                benchmark={"benchmark": "agentic-use", "task": task_name, "mode": "offline"},
-            ),
+            benchmark={"benchmark": "agentic-use", "task": task_name, "mode": "offline"},
+            output_dir=output_dir,
+            run_id=run_id,
         )
 
-        self._maybe_write_gate(result)
-        return result
+    def _extra_metrics(self) -> list[Metric]:
+        """Append :class:`VerifierRewardMetric` only when the runtime runs verify.
 
-    def _maybe_write_gate(self, result: AgentEvalRunResult) -> None:
-        if not (self.config.write_gate and result.output_dir is not None):
-            return
-        baseline = (
-            load_baseline_summary(self.config.baseline_summary_path)
-            if self.config.baseline_summary_path is not None
-            else None
-        )
-        report = evaluate_gate(result, thresholds=self.config.gate_thresholds, baseline_summary=baseline)
-        write_gate_report(report, result.output_dir)
-
-    def _metrics_for_task(self, task: AgentEvalTask) -> list[Metric]:
-        """Honor task-authored metrics; only *append* a compatibility metric.
-
-        Metrics originate on the task (see ``agentic_task_from_dir``). When the
-        live verify phase is enabled we append :class:`VerifierRewardMetric` so
-        the legacy pytest reward is scored too — but we never replace the task's
-        own metric set, and we avoid duplicating a metric the task already
-        declares (the SDK rejects duplicate metric types).
+        The verify-enable decision stays in the adapter (it knows its own runtime
+        config); the generic orchestrator never introspects the runtime.
         """
-        metrics: list[Metric] = list(task.metrics)
-        if self._verify_enabled() and not any(isinstance(metric, VerifierRewardMetric) for metric in metrics):
-            metrics.append(VerifierRewardMetric())
-        return metrics
+        return [VerifierRewardMetric()] if self._verify_enabled() else []
 
     def _verify_enabled(self) -> bool:
         runtime_config = getattr(self.runtime, "config", None)
         shared = getattr(runtime_config, "shared", None)
         return bool(getattr(shared, "run_verify", False))
 
-    def _ensure_task_image(self, task_dir: str | Path, image_tag: str) -> None:
+    def _ensure_task_image(self, task: AgentEvalTask) -> None:
+        image_tag = task_image_tag(task.id)
+        task_dir = task.metadata["task_dir"]
         if self.config.skip_build:
             if not docker_image_exists(image_tag):
                 raise RuntimeError(
diff --git a/tests/agentic-use/runtimes/shared/agent_log.py b/tests/agentic-use/runtimes/shared/agent_log.py
deleted file mode 100644
index 6fc7de0270..0000000000
--- a/tests/agentic-use/runtimes/shared/agent_log.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Agent log parsing helpers shared by backend runtimes."""
-
-from __future__ import annotations
-
-import json
-from typing import Any
-
-
-def iter_agent_log_json_payloads(agent_log: str) -> list[dict[str, Any]]:
-    """Return JSON dict payloads embedded in an agent log, newest-first after the full log."""
-    candidates = [agent_log.strip()]
-    lines = [line.strip() for line in agent_log.splitlines() if line.strip()]
-    if lines:
-        candidates.append(lines[-1])
-        candidates.extend(reversed(lines))
-
-    payloads: list[dict[str, Any]] = []
-    seen: set[str] = set()
-    for candidate in candidates:
-        if not candidate or candidate in seen:
-            continue
-        seen.add(candidate)
-        try:
-            parsed = json.loads(candidate)
-        except json.JSONDecodeError:
-            continue
-        if isinstance(parsed, dict):
-            payloads.append(parsed)
-    return payloads
-
-
-def agent_log_has_workflow_error(agent_log: str) -> bool:
-    """Detect AUT workflow errors returned as successful HTTP JSON payloads."""
-    for payload in iter_agent_log_json_payloads(agent_log):
-        if payload.get("code") == "workflow_error":
-            return True
-    return False
diff --git a/tests/agentic-use/runtimes/shared/artifacts.py b/tests/agentic-use/runtimes/shared/artifacts.py
deleted file mode 100644
index 4942568635..0000000000
--- a/tests/agentic-use/runtimes/shared/artifacts.py
+++ /dev/null
@@ -1,234 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Convert captured agent artifacts into AgentEvalAttempt values."""
-
-from __future__ import annotations
-
-from pathlib import Path
-
-from evaluator_agent_eval.artifacts import AgentArtifacts
-from evaluator_agent_eval.schemas import (
-    AgentAttemptInput,
-    AgentAttemptMetadata,
-    AgentAttemptOutput,
-    AgentAttemptTrace,
-    CapturedAgentAttempt,
-)
-from nemo_evaluator_sdk.agent_eval.types import (
-    AgentEvalAttempt,
-    AgentEvalAttemptStatus,
-    AgentEvalTask,
-    AgentOutput,
-)
-from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor
-
-from runtimes.shared.config import AgenticRuntimeName
-from runtimes.shared.layout import AgenticRunLayout
-from runtimes.shared.usage import extract_usage_metrics
-
-
-def resolve_attempt_status(agent_ok: bool) -> AgentEvalAttemptStatus:
-    """Map an agent-phase outcome to a *scorable* attempt status.
-
-    The SDK's :class:`AgentEvaluator` excludes ``status=="failed"`` from scoring
-    (it raises). An agent that ran but failed must still be scored — e.g. as a
-    ``0`` by :class:`AgentPhaseSuccessMetric` — so that pass-rate gating counts
-    it rather than dropping it. We therefore use ``"partial"`` for an
-    executed-but-unsuccessful agent and reserve ``"failed"`` for genuine
-    attempt-*production* failures (which a runtime surfaces by raising, not by
-    emitting an unscorable attempt). This keeps the live builder and the
-    ``result.json`` importer consistent.
-    """
-    return "completed" if agent_ok else "partial"
-
-
-def build_agent_eval_attempt(
-    *,
-    task: AgentEvalTask,
-    layout: AgenticRunLayout,
-    runtime_name: AgenticRuntimeName,
-    agent_model: str,
-    exit_code: int,
-    agent_ok: bool,
-    run_id: str | None = None,
-    repo_revision: str | None = None,
-    duration_ms: int | None = None,
-) -> AgentEvalAttempt:
-    """Build an SDK attempt from on-disk agent artifacts.
-
-    Metadata uses the same canonical keys as :class:`CapturedAgentAttempt`
-    (``agent_runtime``, ``agent_model``, ``exit_code``, …) so verify/scoring
-    helpers can consume attempts without a second adapter.
-    """
-    artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir)
-    log_text = _read_agent_log(layout.agent_log_dir)
-    usage = extract_usage_metrics(log_text)
-    duration = duration_ms if duration_ms is not None else usage.get("duration_ms")
-
-    output_text = artifacts.final_answer.text if artifacts.final_answer.extracted else None
-    raw_log_paths = _raw_log_paths(artifacts.agent_log_dir)
-    initial_state = task.inputs.get("filesystem")
-    descriptors = _evidence_descriptors(
-        layout, artifacts, initial_state_ref=str(initial_state) if initial_state else None
-    )
-
-    metadata: dict[str, object] = {
-        # Canonical CapturedAgentAttempt fields
-        "agent_runtime": runtime_name,
-        "agent_model": agent_model,
-        "agent_runtime_version": None,
-        "repo_revision": repo_revision,
-        "run_id": run_id,
-        "exit_code": exit_code,
-        "duration_ms": duration,
-        # SDK / orchestration extensions
-        "model_id": agent_model,
-        "target_name": agent_model,
-        "attempt_id": f"{task.id}:{runtime_name}",
-        "agent_ok": agent_ok,
-        "agent_log_dir": str(layout.agent_log_dir),
-        "workspace_dir": str(layout.workspace_dir),
-        "state_dir": str(layout.state_dir),
-        "run_dir": str(layout.run_dir),
-        "instruction_path": task.metadata.get("instruction_path"),
-        "final_answer_extracted": artifacts.final_answer.extracted,
-        "final_answer_source": artifacts.final_answer.source,
-        "raw_log_paths": raw_log_paths,
-        "atif_trajectory_path": str(artifacts.atif_trajectory_path) if artifacts.atif_trajectory_path else None,
-        **usage,
-    }
-
-    status = resolve_attempt_status(agent_ok)
-    if output_text:
-        output = AgentOutput(text=output_text)
-    elif agent_ok:
-        output = AgentOutput(text=log_text.strip() or "")
-    else:
-        output = AgentOutput(text=log_text.strip() or "(agent phase failed)")
-
-    return AgentEvalAttempt(
-        id=f"{task.id}:{runtime_name}",
-        task_id=task.id,
-        status=status,
-        output=output,
-        evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None,
-        metadata=metadata,
-    )
-
-
-def to_captured_agent_attempt(task: AgentEvalTask, attempt: AgentEvalAttempt) -> CapturedAgentAttempt:
-    """Project an SDK attempt onto the portable CapturedAgentAttempt schema."""
-    metadata = attempt.metadata
-    trace_path = metadata.get("atif_trajectory_path")
-    return CapturedAgentAttempt(
-        task_id=attempt.task_id,
-        input=AgentAttemptInput(
-            instruction_text=task.intent,
-            instruction_path=str(metadata.get("instruction_path")) if metadata.get("instruction_path") else None,
-        ),
-        output=AgentAttemptOutput(
-            final_text=attempt.output.text if attempt.output is not None else "",
-            final_answer_extracted=bool(metadata.get("final_answer_extracted")),
-            final_answer_source=str(metadata.get("final_answer_source"))
-            if metadata.get("final_answer_source") is not None
-            else None,
-            raw_log_paths=list(metadata.get("raw_log_paths") or []),
-        ),
-        metadata=AgentAttemptMetadata(
-            agent_runtime=str(metadata.get("agent_runtime", "unknown")),
-            agent_model=str(metadata.get("agent_model", "unknown")),
-            agent_runtime_version=str(metadata["agent_runtime_version"])
-            if metadata.get("agent_runtime_version") is not None
-            else None,
-            repo_revision=str(metadata["repo_revision"]) if metadata.get("repo_revision") is not None else None,
-            run_id=str(metadata["run_id"]) if metadata.get("run_id") is not None else None,
-            exit_code=int(metadata["exit_code"]) if isinstance(metadata.get("exit_code"), int) else None,
-            duration_ms=int(metadata["duration_ms"]) if isinstance(metadata.get("duration_ms"), int | float) else None,
-        ),
-        trace=AgentAttemptTrace(atif_path=str(trace_path)) if trace_path else None,
-    )
-
-
-def _evidence_descriptors(
-    layout: AgenticRunLayout,
-    artifacts: AgentArtifacts,
-    *,
-    initial_state_ref: str | None = None,
-) -> dict[str, EvidenceDescriptor]:
-    """Build the evidence map specified by the agent-eval SDK design doc.
-
-    Doc keys: ``initial_state`` (task input filesystem, when staged),
-    ``final_state`` (workspace), ``trace`` (trajectory, ATIF-normalized),
-    ``logs`` (agent log dir), and ``verifier_logs`` (verifier log dir).
-
-    ``state`` is a NeMo-Platform-specific *extension* (not a doc key): it carries
-    the preserved platform/database state across the agent + verifier phases.
-    """
-    descriptors: dict[str, EvidenceDescriptor] = {}
-
-    # task input filesystem → evidence["initial_state"] (only when a seed was staged).
-    if initial_state_ref:
-        descriptors["initial_state"] = EvidenceDescriptor(
-            kind="filesystem",
-            format="dir",
-            ref=initial_state_ref,
-            metadata={"role": "initial_state"},
-        )
-
-    # agent/trajectory.json → evidence["trace"], preferably ATIF-normalized.
-    if artifacts.atif_trajectory_path is not None:
-        descriptors["trace"] = EvidenceDescriptor(
-            kind="trace",
-            format="atif" if artifacts.atif_trajectory_path.name.startswith("atif") else "json",
-            ref=str(artifacts.atif_trajectory_path),
-        )
-
-    # agent/ logs → evidence["logs"].
-    descriptors["logs"] = EvidenceDescriptor(
-        kind="logs",
-        format="dir",
-        ref=str(layout.agent_log_dir),
-        metadata={"primary_log": "nat_agent.log"},
-    )
-
-    # workspace/ → evidence["final_state"] filesystem descriptor.
-    descriptors["final_state"] = EvidenceDescriptor(
-        kind="filesystem",
-        format="dir",
-        ref=str(layout.workspace_dir),
-        metadata={"role": "final_state"},
-    )
-
-    # Platform extension (non-doc key): preserved platform/db state across phases.
-    descriptors["state"] = EvidenceDescriptor(
-        kind="filesystem",
-        format="dir",
-        ref=str(layout.state_dir),
-        metadata={"role": "platform_state", "extension": "nemo-platform"},
-    )
-
-    # verifier/ logs → evidence["verifier_logs"] (present once verify phase runs).
-    verifier_log_dir = layout.run_dir / "verifier"
-    if verifier_log_dir.exists():
-        descriptors["verifier_logs"] = EvidenceDescriptor(
-            kind="logs",
-            format="dir",
-            ref=str(verifier_log_dir),
-            metadata={"role": "verifier"},
-        )
-
-    return descriptors
-
-
-def _raw_log_paths(agent_log_dir: Path) -> list[str]:
-    if not agent_log_dir.is_dir():
-        return []
-    return [str(path.relative_to(agent_log_dir)) for path in sorted(agent_log_dir.iterdir()) if path.is_file()]
-
-
-def _read_agent_log(agent_log_dir: Path) -> str:
-    log_path = agent_log_dir / "nat_agent.log"
-    if log_path.is_file():
-        return log_path.read_text(encoding="utf-8", errors="replace")
-    return ""
diff --git a/tests/agentic-use/runtimes/shared/container_env.py b/tests/agentic-use/runtimes/shared/container_env.py
deleted file mode 100644
index b59a100b54..0000000000
--- a/tests/agentic-use/runtimes/shared/container_env.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Shared container environment helpers."""
-
-from __future__ import annotations
-
-import json
-from typing import Any
-
-from runtimes.shared.config import AgenticSharedConfig
-from runtimes.shared.constants import (
-    DOCKER_SOCKET_CONTAINER_PATH,
-    DOCKER_SOCKET_HOST_PATH,
-    FILES_STORAGE_CONFIG,
-    PLATFORM_CONFIG_PATH,
-)
-
-
-def base_container_env(shared: AgenticSharedConfig, *, timeout_sec: int) -> dict[str, str]:
-    """Environment variables shared by all agentic-use container runs."""
-    env: dict[str, str] = {
-        "NMP_BASE_URL": shared.nmp_base_url,
-        "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace",
-        "DATABASE_DIALECT": "sqlite",
-        "DATABASE_PATH": "/data/nmp-platform.db",
-        "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG,
-        "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH,
-        "NEMO_AGENTS_GATEWAY_READ_TIMEOUT": str(timeout_sec),
-        "NEMO_AGENTS_INVOKE_TIMEOUT": str(timeout_sec),
-        "AUT_INVOKE_HTTP_TIMEOUT": str(timeout_sec),
-    }
-    if DOCKER_SOCKET_HOST_PATH.exists():
-        env["DOCKER_HOST"] = f"unix://{DOCKER_SOCKET_CONTAINER_PATH}"
-    return env
-
-
-def with_candidate_params(env: dict[str, str], agent_params: dict[str, Any]) -> dict[str, str]:
-    if agent_params:
-        env = dict(env)
-        env["NAT_CANDIDATE_PARAMS"] = json.dumps(agent_params, sort_keys=True)
-    return env
diff --git a/tests/agentic-use/runtimes/shared/layout.py b/tests/agentic-use/runtimes/shared/layout.py
deleted file mode 100644
index a259de71af..0000000000
--- a/tests/agentic-use/runtimes/shared/layout.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Output directory layout for agentic-use runtime runs."""
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-from datetime import UTC, datetime
-from pathlib import Path
-
-from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunConfig, AgentEvalTask
-
-from runtimes.shared.config import AgenticSharedConfig
-
-
-@dataclass(frozen=True)
-class AgenticRunLayout:
-    """Filesystem layout for one task run."""
-
-    run_dir: Path
-    agent_log_dir: Path
-    workspace_dir: Path
-    state_dir: Path
-    instruction_path: Path
-
-
-def default_jobs_dir(shared: AgenticSharedConfig) -> Path:
-    if shared.jobs_dir is not None:
-        return shared.jobs_dir
-    return shared.repo_root / "nat-jobs"
-
-
-def new_run_dir(jobs_dir: Path, task_id: str) -> Path:
-    timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
-    run_dir = jobs_dir / f"{timestamp}-{task_id}"
-    run_dir.mkdir(parents=True, exist_ok=True)
-    return run_dir
-
-
-def resolve_run_layout(
-    task: AgentEvalTask,
-    shared: AgenticSharedConfig,
-    config: AgentEvalRunConfig | None = None,
-) -> AgenticRunLayout:
-    """Resolve or create the on-disk layout for one task attempt."""
-    if config is not None and config.output_dir is not None:
-        run_dir = Path(config.output_dir)
-    else:
-        run_dir = new_run_dir(default_jobs_dir(shared), task.id)
-
-    agent_log_dir = run_dir / "agent"
-    workspace_dir = run_dir / "workspace"
-    state_dir = run_dir / "state"
-    agent_log_dir.mkdir(parents=True, exist_ok=True)
-    workspace_dir.mkdir(parents=True, exist_ok=True)
-    state_dir.mkdir(parents=True, exist_ok=True)
-
-    instruction_path = agent_log_dir / "instruction.md"
-    instruction_path.write_text(task.intent, encoding="utf-8")
-
-    return AgenticRunLayout(
-        run_dir=run_dir,
-        agent_log_dir=agent_log_dir,
-        workspace_dir=workspace_dir,
-        state_dir=state_dir,
-        instruction_path=instruction_path,
-    )
-
-
-def task_image_tag(task_id: str) -> str:
-    return f"nmp-nat-{task_id}:latest"
diff --git a/tests/agentic-use/runtimes/shared/metrics.py b/tests/agentic-use/runtimes/shared/metrics.py
deleted file mode 100644
index 7c68a590ec..0000000000
--- a/tests/agentic-use/runtimes/shared/metrics.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Default metrics for agentic-use agent-eval runs."""
-
-from __future__ import annotations
-
-from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult
-
-
-class AgentPhaseSuccessMetric:
-    """Score 1.0 when the agent phase exited successfully, else 0.0."""
-
-    @property
-    def type(self) -> str:
-        return "agentic_use_agent_phase"
-
-    def output_spec(self) -> list[MetricOutputSpec]:
-        return [MetricOutputSpec.continuous_score("agent_phase_success")]
-
-    async def compute_scores(self, input: MetricInput) -> MetricResult:
-        agent_ok = bool(input.candidate.metadata.get("agent_ok"))
-        return MetricResult(
-            outputs=[MetricOutput(name="agent_phase_success", value=1.0 if agent_ok else 0.0)],
-        )
-
-
-class VerifierRewardMetric:
-    """Compatibility metric mirroring the legacy pytest verifier reward.
-
-    Reads the verifier outcome that ``nat_runner`` records in ``result.json``
-    (projected onto attempt metadata as ``reward``/``passed``) so existing
-    ``tests/test_outputs.py`` verifiers can score through the Evaluator SDK
-    while task-specific metrics are authored.
-    """
-
-    @property
-    def type(self) -> str:
-        return "agentic_use_verifier_reward"
-
-    def output_spec(self) -> list[MetricOutputSpec]:
-        return [MetricOutputSpec.continuous_score("verifier_reward")]
-
-    async def compute_scores(self, input: MetricInput) -> MetricResult:
-        metadata = input.candidate.metadata
-        reward = metadata.get("reward")
-        if reward is None:
-            reward = 1.0 if metadata.get("passed") else 0.0
-        return MetricResult(
-            outputs=[MetricOutput(name="verifier_reward", value=float(reward))],
-        )
diff --git a/tests/agentic-use/runtimes/shared/platform.py b/tests/agentic-use/runtimes/shared/platform.py
new file mode 100644
index 0000000000..721d717e6f
--- /dev/null
+++ b/tests/agentic-use/runtimes/shared/platform.py
@@ -0,0 +1,791 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""NeMo-Platform glue that sits on top of the generic agent-eval SDK.
+
+Everything generic (Docker helpers, the environment boundary, environment
+authoring, gating, attempt-status/evidence helpers, the verifier mechanic) now
+lives in ``nemo_evaluator_sdk.agent_eval`` and is imported directly where used.
+
+This single module holds only the pieces that are specific to the agentic-use
+benchmark and therefore do not belong in the SDK:
+
+* run layout with the platform ``state_dir`` and the ``nmp-nat-<id>`` image tag,
+* a ``DockerEnvironmentProvider`` defaulting to that platform image tag,
+* default metrics (``AgentPhaseSuccessMetric`` namespace + ``VerifierRewardMetric``),
+* agent-log/usage parsing and the shared container env,
+* attempt construction from live artifacts and from ``nat_runner`` ``result.json``,
+* the live VERIFY phase wired through the SDK environment boundary,
+* the agentic-use task loader.
+"""
+
+from __future__ import annotations
+
+import json
+import textwrap
+import tomllib
+from collections.abc import Callable
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any, TypedDict
+
+from evaluator_agent_eval.artifacts import AgentArtifacts
+from evaluator_agent_eval.schemas import (
+    AgentAttemptInput,
+    AgentAttemptMetadata,
+    AgentAttemptOutput,
+    AgentAttemptTrace,
+    CapturedAgentAttempt,
+)
+from nemo_evaluator_sdk.agent_eval.attempts import resolve_attempt_status, standard_evidence_descriptors
+from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric as _SDKAgentPhaseSuccessMetric
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import (
+    AgentEnvironmentHandle,
+    EnvRunSpec,
+)
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import (
+    DockerEnvironmentProvider as _SDKDockerEnvironmentProvider,
+)
+from nemo_evaluator_sdk.agent_eval.runtimes.layout import prepare_run_layout, resolve_run_dir
+from nemo_evaluator_sdk.agent_eval.runtimes.verify import (
+    VerifierOutcome,
+    collect_verifier_outcome,
+    skipped_outcome,
+)
+from nemo_evaluator_sdk.agent_eval.types import (
+    AgentEvalAttempt,
+    AgentEvalRunConfig,
+    AgentEvalTask,
+    AgentOutput,
+)
+from nemo_evaluator_sdk.metrics.protocol import (
+    Metric,
+    MetricInput,
+    MetricOutput,
+    MetricOutputSpec,
+    MetricResult,
+)
+from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor
+
+from runtimes.shared.config import AgenticRuntimeName, AgenticSharedConfig
+from runtimes.shared.constants import (
+    AGENTIC_USE_DIR,
+    DOCKER_SOCKET_CONTAINER_PATH,
+    DOCKER_SOCKET_HOST_PATH,
+    EVALUATOR_SDK_SRC,
+    FILES_STORAGE_CONFIG,
+    PLATFORM_CONFIG_PATH,
+    SHARED_DIR,
+)
+
+__all__ = [
+    "AgenticRunLayout",
+    "AgentPhaseSuccessMetric",
+    "DockerEnvironmentProvider",
+    "ResultDirAttemptSource",
+    "VerifierRewardMetric",
+    "agent_log_has_workflow_error",
+    "agentic_task_from_dir",
+    "attempt_from_result",
+    "attempt_from_result_dir",
+    "base_container_env",
+    "build_agent_eval_attempt",
+    "build_verify_run_spec",
+    "extract_usage_metrics",
+    "iter_agent_log_json_payloads",
+    "load_task_toml",
+    "maybe_run_verify",
+    "resolve_run_layout",
+    "run_verify",
+    "task_agent_timeout_sec",
+    "task_image_tag",
+    "to_captured_agent_attempt",
+    "verifier_log_dir",
+    "with_candidate_params",
+]
+
+
+# --------------------------------------------------------------------------- #
+# Run layout + image tagging
+# --------------------------------------------------------------------------- #
+@dataclass(frozen=True)
+class AgenticRunLayout:
+    """Immutable filesystem layout for one task run.
+
+    Extends the SDK's generic ``RunLayout`` shape with a platform-specific
+    ``state_dir`` (preserved platform/database state across agent + verifier).
+    """
+
+    run_dir: Path  # root directory of the run
+    agent_log_dir: Path  # directory holding the agent logs
+    workspace_dir: Path  # agent workspace directory
+    state_dir: Path  # platform extension: ``<run_dir>/state`` when built by ``resolve_run_layout``
+    instruction_path: Path  # instruction file path taken from the SDK layout
+
+
+def task_image_tag(task_id: str) -> str:  # Docker image tag for a task: ``nmp-nat-<task_id>:latest``
+    return f"nmp-nat-{task_id}:latest"
+
+
+def default_jobs_dir(shared: AgenticSharedConfig) -> Path:
+    """Return the configured jobs dir, falling back to ``<repo_root>/nat-jobs``."""
+    configured = shared.jobs_dir
+    return shared.repo_root / "nat-jobs" if configured is None else configured
+
+
+def new_run_dir(jobs_dir: Path, task_id: str) -> Path:
+    """Create (if missing) and return ``<jobs_dir>/<UTC timestamp>-<task_id>``."""
+    target = jobs_dir / f"{datetime.now(UTC):%Y%m%dT%H%M%SZ}-{task_id}"
+    target.mkdir(parents=True, exist_ok=True)
+    return target
+
+
+def resolve_run_layout(
+    task: AgentEvalTask,
+    shared: AgenticSharedConfig,
+    config: AgentEvalRunConfig | None = None,
+) -> AgenticRunLayout:
+    """Return the directory layout for one task attempt, creating it on disk.
+
+    The generic agent/workspace dirs and the written instruction come from the
+    SDK's ``prepare_run_layout``; the ``state`` dir is the platform extension.
+    """
+    requested = None if config is None else config.output_dir
+    root = resolve_run_dir(requested, lambda: new_run_dir(default_jobs_dir(shared), task.id))
+    generic = prepare_run_layout(root, task.intent)
+    # Platform extension: preserved platform/db state lives under the run dir.
+    platform_state = generic.run_dir / "state"
+    platform_state.mkdir(parents=True, exist_ok=True)
+    return AgenticRunLayout(
+        run_dir=generic.run_dir,
+        agent_log_dir=generic.agent_log_dir,
+        workspace_dir=generic.workspace_dir,
+        state_dir=platform_state,
+        instruction_path=generic.instruction_path,
+    )
+
+
+class DockerEnvironmentProvider(_SDKDockerEnvironmentProvider):
+    """SDK Docker provider whose default ``image_tag_fn`` is ``task_image_tag`` (``nmp-nat-<id>:latest``)."""
+
+    def __init__(self, *, image_tag_fn: Callable[[str], str] = task_image_tag) -> None:
+        super().__init__(image_tag_fn=image_tag_fn)  # forward the (possibly overridden) tag function
+
+
+# --------------------------------------------------------------------------- #
+# Default metrics
+# --------------------------------------------------------------------------- #
+class AgentPhaseSuccessMetric(_SDKAgentPhaseSuccessMetric):
+    """SDK agent-phase metric re-namespaced for agentic-use (output stays ``agent_phase_success``)."""
+
+    metric_type = "agentic_use_agent_phase"  # NOTE(review): presumably read by the SDK base as ``type`` — confirm
+
+
+class VerifierRewardMetric:
+    """Score the legacy pytest verifier outcome as ``verifier_reward``.
+
+    The value comes from attempt metadata: ``reward`` when it is present,
+    otherwise ``1.0``/``0.0`` from the truthiness of ``passed``. This lets
+    existing ``tests/test_outputs.py`` verifiers score through the Evaluator
+    SDK while task-specific metrics are authored.
+    """
+
+    @property
+    def type(self) -> str:
+        return "agentic_use_verifier_reward"
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        return [MetricOutputSpec.continuous_score("verifier_reward")]
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        attempt_meta = input.candidate.metadata
+        recorded = attempt_meta.get("reward")
+        if recorded is not None:
+            score = float(recorded)
+        else:
+            score = 1.0 if attempt_meta.get("passed") else 0.0
+        return MetricResult(outputs=[MetricOutput(name="verifier_reward", value=score)])
+
+
+# --------------------------------------------------------------------------- #
+# Agent-log parsing + token usage
+# --------------------------------------------------------------------------- #
+class TokenMetrics(TypedDict):  # NOTE(review): presumably the shape of the parsed usage dict — confirm
+    prompt_tokens: int | None
+    completion_tokens: int | None
+    total_tokens: int | None
+    cache_creation_tokens: int | None
+    cache_read_tokens: int | None
+    n_assistant_messages: int | None
+    cost_usd: float | None
+    num_turns: int | None
+    duration_ms: float | None
+
+
+def extract_usage_metrics(agent_log: str) -> dict[str, int | float | None]:
+    """Parse usage metrics out of ``agent_log`` via ``nat_runner`` and return them as a plain dict copy."""
+    # Imported at call time rather than at module top, as in the original layout of this helper.
+    from nat_runner import _extract_usage_metrics as _parse_usage
+
+    return dict(_parse_usage(agent_log))
+
+
+def iter_agent_log_json_payloads(agent_log: str) -> list[dict[str, Any]]:
+    """Collect JSON-object payloads found in an agent log.
+
+    The whole (stripped) log is tried first, then each non-empty line from last
+    to first; duplicate candidates are parsed once and non-dict JSON is skipped.
+    """
+    nonblank = [stripped for raw in agent_log.splitlines() if (stripped := raw.strip())]
+    # dict.fromkeys de-duplicates while keeping first-seen order.
+    ordered = dict.fromkeys([agent_log.strip(), *reversed(nonblank)])
+
+    found: list[dict[str, Any]] = []
+    for text in ordered:
+        if not text:
+            continue
+        try:
+            decoded = json.loads(text)
+        except json.JSONDecodeError:
+            continue
+        if isinstance(decoded, dict):
+            found.append(decoded)
+    return found
+
+
+def agent_log_has_workflow_error(agent_log: str) -> bool:
+    """Return True when any JSON payload in the log has ``code == "workflow_error"``.
+
+    AUT workflow errors can arrive as successful HTTP JSON payloads, so the log is scanned.
+    """
+    return any(entry.get("code") == "workflow_error" for entry in iter_agent_log_json_payloads(agent_log))
+
+
+# --------------------------------------------------------------------------- #
+# Shared container environment
+# --------------------------------------------------------------------------- #
+def base_container_env(shared: AgenticSharedConfig, *, timeout_sec: int) -> dict[str, str]:
+    """Return the environment variables shared by all agentic-use container runs."""
+    timeout = str(timeout_sec)
+    timeout_vars = ("NEMO_AGENTS_GATEWAY_READ_TIMEOUT", "NEMO_AGENTS_INVOKE_TIMEOUT", "AUT_INVOKE_HTTP_TIMEOUT")
+    container_env: dict[str, str] = {
+        "NMP_BASE_URL": shared.nmp_base_url,
+        "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace",
+        "DATABASE_DIALECT": "sqlite",
+        "DATABASE_PATH": "/data/nmp-platform.db",
+        "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG,
+        "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH,
+        **dict.fromkeys(timeout_vars, timeout),
+    }
+    if DOCKER_SOCKET_HOST_PATH.exists():
+        container_env["DOCKER_HOST"] = f"unix://{DOCKER_SOCKET_CONTAINER_PATH}"
+    return container_env
+
+
+def with_candidate_params(env: dict[str, str], agent_params: dict[str, Any]) -> dict[str, str]:
+    """Return ``env`` itself when there are no params, else a copy with ``NAT_CANDIDATE_PARAMS`` set."""
+    if not agent_params:
+        return env
+    return {**env, "NAT_CANDIDATE_PARAMS": json.dumps(agent_params, sort_keys=True)}
+
+
+# --------------------------------------------------------------------------- #
+# Attempt construction from live artifacts
+# --------------------------------------------------------------------------- #
+def build_agent_eval_attempt(
+    *,
+    task: AgentEvalTask,
+    layout: AgenticRunLayout,
+    runtime_name: AgenticRuntimeName,
+    agent_model: str,
+    exit_code: int,
+    agent_ok: bool,
+    run_id: str | None = None,
+    repo_revision: str | None = None,
+    duration_ms: int | None = None,
+) -> AgentEvalAttempt:
+    """Build an SDK attempt from on-disk agent artifacts.
+
+    Metadata uses the same canonical keys as :class:`CapturedAgentAttempt`
+    (``agent_runtime``, ``agent_model``, ``exit_code``, …) so verify/scoring
+    helpers can consume attempts without a second adapter.
+
+    Usage metrics are merged first so explicit fields (e.g. a caller-supplied ``duration_ms``) win on collision.
+    """
+    artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir)
+    log_text = _read_agent_log(layout.agent_log_dir)
+    usage = extract_usage_metrics(log_text)
+    duration = duration_ms if duration_ms is not None else usage.get("duration_ms")
+
+    output_text = artifacts.final_answer.text if artifacts.final_answer.extracted else None
+    raw_log_paths = _raw_log_paths(artifacts.agent_log_dir)
+    initial_state = task.inputs.get("filesystem")
+    descriptors = _evidence_descriptors(
+        layout, artifacts, initial_state_ref=str(initial_state) if initial_state else None
+    )
+
+    metadata: dict[str, object] = {
+        # Parsed usage goes first: it may also carry ``duration_ms`` and must not clobber the value resolved above.
+        **usage,
+        # Canonical CapturedAgentAttempt fields
+        "agent_runtime": runtime_name,
+        "agent_model": agent_model,
+        "agent_runtime_version": None,
+        "repo_revision": repo_revision,
+        "run_id": run_id,
+        "exit_code": exit_code,
+        "duration_ms": duration,
+        # SDK / orchestration extensions
+        "model_id": agent_model,
+        "target_name": agent_model,
+        "attempt_id": f"{task.id}:{runtime_name}",
+        "agent_ok": agent_ok,
+        "agent_log_dir": str(layout.agent_log_dir),
+        "workspace_dir": str(layout.workspace_dir),
+        "state_dir": str(layout.state_dir),
+        "run_dir": str(layout.run_dir),
+        "instruction_path": task.metadata.get("instruction_path"),
+        "final_answer_extracted": artifacts.final_answer.extracted,
+        "final_answer_source": artifacts.final_answer.source,
+        "raw_log_paths": raw_log_paths,
+        "atif_trajectory_path": str(artifacts.atif_trajectory_path) if artifacts.atif_trajectory_path else None,
+    }
+
+    status = resolve_attempt_status(agent_ok)
+    # Prefer the extracted final answer, then the raw log; only a failed agent with no log gets the placeholder.
+    fallback_text = "" if agent_ok else "(agent phase failed)"
+    output = AgentOutput(text=output_text or log_text.strip() or fallback_text)
+
+    return AgentEvalAttempt(
+        id=f"{task.id}:{runtime_name}",
+        task_id=task.id,
+        status=status,
+        output=output,
+        evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None,
+        metadata=metadata,
+    )
+
+
+def to_captured_agent_attempt(task: AgentEvalTask, attempt: AgentEvalAttempt) -> CapturedAgentAttempt:
+    """Map an SDK attempt and its task onto the portable ``CapturedAgentAttempt`` schema."""
+    meta = attempt.metadata
+    trace_ref = meta.get("atif_trajectory_path")
+
+    def _str_or_none(key: str) -> str | None:
+        return None if meta.get(key) is None else str(meta[key])
+
+    return CapturedAgentAttempt(
+        task_id=attempt.task_id,
+        input=AgentAttemptInput(
+            instruction_text=task.intent,
+            instruction_path=str(meta.get("instruction_path")) if meta.get("instruction_path") else None,
+        ),
+        output=AgentAttemptOutput(
+            final_text="" if attempt.output is None else attempt.output.text,
+            final_answer_extracted=bool(meta.get("final_answer_extracted")),
+            final_answer_source=_str_or_none("final_answer_source"),
+            raw_log_paths=list(meta.get("raw_log_paths") or []),
+        ),
+        metadata=AgentAttemptMetadata(
+            agent_runtime=str(meta.get("agent_runtime", "unknown")),
+            agent_model=str(meta.get("agent_model", "unknown")),
+            agent_runtime_version=_str_or_none("agent_runtime_version"),
+            repo_revision=_str_or_none("repo_revision"),
+            run_id=_str_or_none("run_id"),
+            exit_code=int(meta["exit_code"]) if isinstance(meta.get("exit_code"), int) else None,
+            duration_ms=int(meta["duration_ms"]) if isinstance(meta.get("duration_ms"), int | float) else None,
+        ),
+        trace=AgentAttemptTrace(atif_path=str(trace_ref)) if trace_ref else None,
+    )
+
+
+def _evidence_descriptors(
+    layout: AgenticRunLayout,
+    artifacts: AgentArtifacts,
+    *,
+    initial_state_ref: str | None = None,
+) -> dict[str, EvidenceDescriptor]:
+    """Build the evidence descriptor map for one run: standard SDK keys plus platform ``state``.
+
+    :func:`standard_evidence_descriptors` supplies the doc-standard keys
+    (``initial_state``/``trace``/``logs``/``final_state``/``verifier_logs``). The extra
+    ``state`` entry is a NeMo-Platform-specific *extension* (not a doc key) pointing at the
+    platform/database state directory preserved across the agent + verifier phases.
+    """
+    # Platform extension (non-doc key): preserved platform/db state across phases.
+    platform_state = EvidenceDescriptor(
+        kind="filesystem",
+        format="dir",
+        ref=str(layout.state_dir),
+        metadata={"role": "platform_state", "extension": "nemo-platform"},
+    )
+
+    evidence = standard_evidence_descriptors(
+        logs_dir=layout.agent_log_dir,
+        final_state_dir=layout.workspace_dir,
+        trace_path=artifacts.atif_trajectory_path,
+        initial_state_ref=initial_state_ref,
+        verifier_logs_dir=layout.run_dir / "verifier",
+        primary_log="nat_agent.log",
+    )
+    evidence["state"] = platform_state
+    return evidence
+
+
+def _raw_log_paths(agent_log_dir: Path) -> list[str]:
+    """Return the sorted, directory-relative paths of the files directly under ``agent_log_dir``."""
+    entries = sorted(agent_log_dir.iterdir()) if agent_log_dir.is_dir() else []
+    return [str(entry.relative_to(agent_log_dir)) for entry in entries if entry.is_file()]
+
+
+def _read_agent_log(agent_log_dir: Path) -> str:
+    """Return the text of ``nat_agent.log`` in ``agent_log_dir``, or ``""`` when that file is missing."""
+    primary_log = agent_log_dir / "nat_agent.log"
+    # Undecodable bytes are substituted (errors="replace") instead of raising.
+    return primary_log.read_text(encoding="utf-8", errors="replace") if primary_log.is_file() else ""
+
+
+# --------------------------------------------------------------------------- #
+# Attempt construction from nat_runner result.json
+# --------------------------------------------------------------------------- #
+# Token/cost measurement keys copied from result.json["metrics"] into attempt metadata (absent keys -> None).
+_METRIC_KEYS = (
+    "prompt_tokens",
+    "completion_tokens",
+    "total_tokens",
+    "cache_creation_tokens",
+    "cache_read_tokens",
+    "n_assistant_messages",
+    "cost_usd",
+    "num_turns",
+    "duration_ms",
+    "token_metrics_status",
+    "token_metrics_note",
+)
+
+
+class ResultDirAttemptSource:
+    """Attempt source that rebuilds attempts from ``nat_runner`` ``result.json`` directories.
+
+    Satisfies the SDK :class:`~nemo_evaluator_sdk.agent_eval.types.AgentAttemptSource`
+    protocol, letting the generic orchestrator's offline path rescore captured runs.
+    """
+
+    def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt:
+        return attempt_from_result_dir(output_dir=source, task=task)
+
+
+def attempt_from_result_dir(output_dir: str | Path, *, task: AgentEvalTask | None = None) -> AgentEvalAttempt:
+    """Build an attempt from ``<output_dir>/result.json``; raise ``FileNotFoundError`` when it is absent."""
+    run_dir = Path(output_dir)
+    result_file = run_dir / "result.json"
+    if result_file.is_file():
+        payload = json.loads(result_file.read_text(encoding="utf-8"))
+        return attempt_from_result(payload, output_dir=run_dir, task=task)
+    raise FileNotFoundError(f"result.json not found in {run_dir}")
+
+
+def attempt_from_result(
+    result: dict[str, Any],
+    *,
+    output_dir: str | Path | None = None,
+    task: AgentEvalTask | None = None,
+) -> AgentEvalAttempt:
+    """Project a ``result.json`` dict onto :class:`AgentEvalAttempt`.
+
+    ``status`` reflects only the ``agent`` phase (``ok``/``skipped`` are usable). Verifier
+    pass/fail is a *measurement* in metadata (``reward``/``passed``) so scoring metrics —
+    not the runtime — remain the source of truth. ``raw_log_paths`` and
+    ``atif_trajectory_path`` mirror the live builder so captured projections keep logs/trace.
+    """
+    task_id = str(result.get("task") or (task.id if task is not None else "unknown"))
+    backend = str(result.get("agent_backend") or "unknown")
+    resolved_dir = Path(output_dir) if output_dir is not None else Path(str(result.get("output_dir") or "."))
+    layout = _layout_from_result_dir(resolved_dir)
+
+    agent_phase = str(result.get("agent") or "")
+    agent_ok = agent_phase in {"ok", "skipped"}
+    status = resolve_attempt_status(agent_ok)
+
+    output_text, final_extracted, final_source = _resolve_output_text(layout)
+    output_text = output_text or ("" if agent_ok else "(agent phase failed)")
+
+    artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir)
+    descriptors = _evidence_descriptors(layout, artifacts)
+
+    metrics = dict(result.get("metrics") or {})
+    metadata: dict[str, Any] = {
+        # Canonical CapturedAgentAttempt-style provenance fields.
+        "agent_runtime": backend,
+        "agent_model": result.get("agent_model"),
+        "run_id": (result.get("provenance") or {}).get("run_id"),
+        "exit_code": 0 if agent_ok else 1,
+        "duration_ms": metrics.get("duration_ms"),
+        # Phase outcomes from result.json.
+        "agent_ok": agent_ok,
+        "build_status": result.get("build"),
+        "agent_status": result.get("agent"),
+        "verify_status": result.get("verify"),
+        # Measurements (verifier reward is a measurement, not attempt status).
+        "passed": result.get("passed"),
+        "reward": result.get("reward"),
+        "runtime_sec": result.get("runtime_sec"),
+        "verifier_scores": result.get("verifier_scores"),
+        # Provenance + candidate identity.
+        "provenance": result.get("provenance"),
+        "candidate_id": result.get("candidate_id"),
+        "candidate_params": result.get("candidate_params"),
+        "image": result.get("image"),
+        "output_dir": str(resolved_dir),
+        # Artifact discovery helpers.
+        "agent_log_dir": str(layout.agent_log_dir),
+        "workspace_dir": str(layout.workspace_dir),
+        "state_dir": str(layout.state_dir),
+        "final_answer_extracted": final_extracted,
+        "final_answer_source": final_source,
+        "raw_log_paths": _raw_log_paths(layout.agent_log_dir),
+        "atif_trajectory_path": str(artifacts.atif_trajectory_path) if artifacts.atif_trajectory_path else None,
+    }
+    metadata.update({key: metrics.get(key) for key in _METRIC_KEYS})
+
+    return AgentEvalAttempt(
+        id=f"{task_id}:{backend}",
+        task_id=task_id,
+        status=status,
+        output=AgentOutput(text=output_text),
+        evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None,
+        metadata=metadata,
+    )
+
+
+def _layout_from_result_dir(output_dir: Path) -> AgenticRunLayout:
+    """Map a result directory onto its fixed ``agent``/``workspace``/``state`` sub-directory layout."""
+    return AgenticRunLayout(
+        run_dir=output_dir,
+        agent_log_dir=output_dir / "agent",
+        workspace_dir=output_dir / "workspace",
+        state_dir=output_dir / "state",
+        instruction_path=output_dir / "agent" / "instruction.md",
+    )
+
+
+def _resolve_output_text(layout: AgenticRunLayout) -> tuple[str, bool, str | None]:
+    """Return ``(text, final_answer_extracted, final_answer_source)`` for a captured run's agent output."""
+    if not layout.agent_log_dir.is_dir():
+        return "", False, None
+    final_answer = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir).final_answer
+    if final_answer.extracted and final_answer.text:
+        return final_answer.text, True, final_answer.source
+    # No extracted answer: fall back to the stripped primary agent log, which is "" when
+    # nat_agent.log is absent.
+    return _read_agent_log(layout.agent_log_dir).strip(), False, None
+
+
+# --------------------------------------------------------------------------- #
+# Live VERIFY phase through the SDK environment boundary
+# --------------------------------------------------------------------------- #
+def verifier_log_dir(layout: AgenticRunLayout) -> Path:
+    return layout.run_dir / "verifier"  # host dir that build_verify_run_spec mounts at /logs/verifier
+
+
+def build_verify_run_spec(
+    task_dir: Path,
+    layout: AgenticRunLayout,
+    *,
+    nmp_base_url: str,
+    agent_backend: str,
+    agent_model: str,
+    smoke_workspace: str | None = None,
+    timeout_sec: int | None = None,
+    extra_args: list[str] | None = None,
+) -> EnvRunSpec | None:
+    """Build the verifier ``EnvRunSpec`` mirroring ``nat_runner.run_verify_phase``.
+
+    Returns ``None`` when the task has no ``tests/test_outputs.py`` (nothing to verify),
+    matching the runner's behavior. Otherwise creates the verifier-log and workspace dirs.
+    """
+    tests_dir = task_dir / "tests"
+    if not (tests_dir / "test_outputs.py").exists():
+        return None
+
+    log_dir = verifier_log_dir(layout)
+    log_dir.mkdir(parents=True, exist_ok=True)
+    layout.workspace_dir.mkdir(parents=True, exist_ok=True)
+
+    smoke_seed_cmd = ""
+    smoke_cleanup_cmd = ""
+    if smoke_workspace:
+        smoke_seed_cmd = textwrap.dedent("""\
+            /app/.venv/bin/nemo workspaces create "${SMOKE_WORKSPACE}" \
+              --description "Seeded by agentic runtime smoke mode" >/dev/null 2>&1 || true
+        """)
+        smoke_cleanup_cmd = textwrap.dedent("""\
+            /app/.venv/bin/nemo workspaces delete "${SMOKE_WORKSPACE}" >/dev/null 2>&1 || true
+        """)
+
+    verify_cmd = [
+        "bash",
+        "-c",
+        textwrap.dedent(f"""\
+            export PYTHONPATH="/app/tests/agentic-use/shared:/app/packages/nemo_evaluator_sdk/src:${{PYTHONPATH}}"
+            export NAT_AGENT=1
+            {smoke_seed_cmd}
+            /app/.venv/bin/python -m pytest /tests/test_outputs.py -rA -v 2>&1 | tee /logs/verifier/test-stdout.txt
+            EXIT=${{PIPESTATUS[0]}}
+            {smoke_cleanup_cmd}
+            if [ $EXIT -eq 0 ]; then echo 1; else echo 0; fi > /logs/verifier/reward.txt
+            exit $EXIT
+        """),
+    ]
+
+    env: dict[str, str] = {
+        "NMP_BASE_URL": nmp_base_url,
+        "NAT_AGENT": "1",
+        "NAT_AGENT_BACKEND": agent_backend,
+        "NAT_AGENT_MODEL": agent_model,
+        "AGENTIC_USE_TASK_DIR": "/task",
+        "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace",
+        "SMOKE_WORKSPACE": smoke_workspace or "",
+        "DATABASE_DIALECT": "sqlite",
+        "DATABASE_PATH": "/data/nmp-platform.db",
+        "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG,
+        "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH,
+    }
+    if DOCKER_SOCKET_HOST_PATH.exists():
+        env["DOCKER_HOST"] = f"unix://{DOCKER_SOCKET_CONTAINER_PATH}"
+
+    mounts: list[tuple[str, str]] = [
+        (str(tests_dir), "/tests"),
+        (str(task_dir), "/task"),
+        (str(layout.workspace_dir), "/app/workspace"),
+        (str(SHARED_DIR), "/app/tests/agentic-use/shared:ro"),
+        (str(EVALUATOR_SDK_SRC), "/app/packages/nemo_evaluator_sdk/src:ro"),
+        (str(layout.agent_log_dir), "/logs/agent"),
+        (str(log_dir), "/logs/verifier"),
+        # Persist platform/db state across AGENT and VERIFY containers.
+        (str(layout.state_dir), "/data"),
+    ]
+    if DOCKER_SOCKET_HOST_PATH.exists():
+        mounts.append((str(DOCKER_SOCKET_HOST_PATH), DOCKER_SOCKET_CONTAINER_PATH))
+
+    return EnvRunSpec(
+        command=verify_cmd,
+        env=env,
+        mounts=mounts,
+        timeout=timeout_sec,
+        extra_args=list(extra_args or []),
+    )
+
+
+async def run_verify(
+    handle: AgentEnvironmentHandle,
+    spec: EnvRunSpec,
+    layout: AgenticRunLayout,
+) -> VerifierOutcome:
+    """Run the verifier spec on ``handle`` and turn its result into a ``VerifierOutcome``.
+
+    The outcome is collected from the run's ``verifier`` log directory (see ``verifier_log_dir``).
+    """
+    completed = await handle.run_verifier(spec)
+    log_dir = verifier_log_dir(layout)
+    return collect_verifier_outcome(ok=completed.ok, exit_code=completed.exit_code, log_dir=log_dir)
+
+
+async def maybe_run_verify(
+    handle: AgentEnvironmentHandle,
+    *,
+    enabled: bool,
+    task_dir: Path,
+    layout: AgenticRunLayout,
+    nmp_base_url: str,
+    agent_backend: str,
+    agent_model: str,
+    smoke_workspace: str | None = None,
+    timeout_sec: int | None = None,
+    extra_args: list[str] | None = None,
+) -> VerifierOutcome:
+    """Run the task verifier via ``handle`` if requested and present; otherwise report it as skipped."""
+    if enabled:
+        spec = build_verify_run_spec(
+            task_dir,
+            layout,
+            nmp_base_url=nmp_base_url,
+            agent_backend=agent_backend,
+            agent_model=agent_model,
+            smoke_workspace=smoke_workspace,
+            timeout_sec=timeout_sec,
+            extra_args=extra_args,
+        )
+        if spec is not None:
+            return await run_verify(handle, spec, layout)
+    # No verifier ran: disabled by the caller, or the task ships no tests/test_outputs.py.
+    return skipped_outcome()
+
+
+# --------------------------------------------------------------------------- #
+# Agentic-use task loader
+# --------------------------------------------------------------------------- #
+def load_task_toml(task_dir: Path) -> dict[str, object]:
+    """Parse ``<task_dir>/task.toml``, returning ``{}`` when it is missing, unreadable, or malformed.
+
+    Best-effort: only ``OSError`` and ``ValueError`` (TOML/UTF-8 decode errors) are swallowed.
+    """
+    try:
+        with (task_dir / "task.toml").open("rb") as handle:
+            return tomllib.load(handle)
+    except (OSError, ValueError):
+        return {}
+
+
+def task_agent_timeout_sec(task_dir: Path) -> int | None:
+    """Return the positive ``[agent].timeout_sec`` from ``task.toml`` as whole seconds, else ``None``."""
+    agent = load_task_toml(task_dir).get("agent")
+    timeout_value = agent.get("timeout_sec") if isinstance(agent, dict) else None
+    # bool is an int subclass: ``timeout_sec = true`` must not be read as a one-second timeout.
+    if isinstance(timeout_value, bool) or not isinstance(timeout_value, (int, float)):
+        return None
+    # NaN fails the range check; inf is excluded because int(inf) raises; sub-second values round up to 1.
+    return max(1, int(timeout_value)) if 0 < timeout_value < float("inf") else None
+
+
+def agentic_task_from_dir(
+    task_dir: str | Path,
+    *,
+    tasks_root: Path | None = None,
+    metrics: list[Metric] | None = None,
+) -> AgentEvalTask:
+    """Build an :class:`AgentEvalTask` from an agentic-use task directory.
+
+    Only agent-facing material (``instruction``) goes into ``inputs``, per the SDK design
+    doc; runtime materialization details such as ``task_dir`` are kept in ``metadata`` so
+    they cannot leak into metric scoring rows. Metrics are authored *on the task* and
+    default to :class:`AgentPhaseSuccessMetric`; the orchestrator only appends
+    compatibility metrics, it does not own the set.
+    """
+    tasks_base = Path(tasks_root or AGENTIC_USE_DIR)
+    candidate = Path(task_dir)
+    # Relative task dirs are resolved against the tasks root; absolute ones are used as given.
+    task_path = candidate if candidate.is_absolute() else (tasks_base / candidate).resolve()
+
+    instruction_path = task_path / "instruction.md"
+    if not instruction_path.exists():
+        raise FileNotFoundError(f"instruction.md not found in {task_path}")
+
+    instruction = instruction_path.read_text(encoding="utf-8")
+    task_toml = load_task_toml(task_path)
+    task_metrics = [AgentPhaseSuccessMetric()] if metrics is None else metrics
+
+    return AgentEvalTask(
+        id=task_path.name,
+        intent=instruction,
+        # Agent-facing material only; runtime details live in metadata.
+        inputs={"instruction": instruction},
+        metrics=task_metrics,
+        metadata={
+            "benchmark": "agentic-use",
+            "task_toml": task_toml,
+            "instruction_path": str(instruction_path),
+            "task_dir": str(task_path),
+        },
+    )
diff --git a/tests/agentic-use/runtimes/shared/result_adapter.py b/tests/agentic-use/runtimes/shared/result_adapter.py
deleted file mode 100644
index e8162f9ded..0000000000
--- a/tests/agentic-use/runtimes/shared/result_adapter.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Adapt ``nat_runner`` ``result.json`` records into ``AgentEvalAttempt`` values.
-
-This bridges the existing ``nat_runner`` output contract (see
-``nat_runner._write_result``) onto the agent-eval SDK so a run that already
-produced ``result.json`` can be imported as an attempt without re-executing the
-agent. Per the design doc, ``result.json`` carries the attempt *status*,
-*measurements* (reward + token/cost), and *provenance*.
-"""
-
-from __future__ import annotations
-
-import json
-from pathlib import Path
-from typing import Any
-
-from evaluator_agent_eval.artifacts import AgentArtifacts
-from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalTask, AgentOutput
-from nemo_evaluator_sdk.values.evidence import CandidateEvidence
-
-from runtimes.shared.artifacts import _evidence_descriptors, resolve_attempt_status  # reuse documented helpers
-from runtimes.shared.layout import AgenticRunLayout
-
-# Token/cost measurement keys carried in result.json["metrics"].
-_METRIC_KEYS = (
-    "prompt_tokens",
-    "completion_tokens",
-    "total_tokens",
-    "cache_creation_tokens",
-    "cache_read_tokens",
-    "n_assistant_messages",
-    "cost_usd",
-    "num_turns",
-    "duration_ms",
-    "token_metrics_status",
-    "token_metrics_note",
-)
-
-
-def attempt_from_result_dir(output_dir: str | Path, *, task: AgentEvalTask | None = None) -> AgentEvalAttempt:
-    """Load ``<output_dir>/result.json`` and build an attempt from it."""
-    output_dir = Path(output_dir)
-    result_path = output_dir / "result.json"
-    if not result_path.is_file():
-        raise FileNotFoundError(f"result.json not found in {output_dir}")
-    result = json.loads(result_path.read_text(encoding="utf-8"))
-    return attempt_from_result(result, output_dir=output_dir, task=task)
-
-
-def attempt_from_result(
-    result: dict[str, Any],
-    *,
-    output_dir: str | Path | None = None,
-    task: AgentEvalTask | None = None,
-) -> AgentEvalAttempt:
-    """Project a ``result.json`` dict onto :class:`AgentEvalAttempt`.
-
-    The attempt ``status`` reflects whether the agent produced a usable
-    response (``agent`` phase outcome). Pass/fail from the verifier is recorded
-    as a *measurement* in metadata (``reward``/``passed``) so scoring metrics —
-    not the runtime — remain the source of truth.
-    """
-    task_id = str(result.get("task") or (task.id if task is not None else "unknown"))
-    backend = str(result.get("agent_backend") or "unknown")
-    resolved_dir = Path(output_dir) if output_dir is not None else Path(str(result.get("output_dir") or "."))
-    layout = _layout_from_result_dir(resolved_dir)
-
-    agent_phase = str(result.get("agent") or "")
-    agent_ok = agent_phase in {"ok", "skipped"}
-    status = resolve_attempt_status(agent_ok)
-
-    output_text, final_extracted, final_source = _resolve_output_text(layout)
-    if not output_text:
-        output_text = "" if agent_ok else "(agent phase failed)"
-
-    descriptors = _evidence_descriptors(
-        layout, AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir)
-    )
-
-    metrics = dict(result.get("metrics") or {})
-    metadata: dict[str, Any] = {
-        # Canonical CapturedAgentAttempt-style provenance fields.
-        "agent_runtime": backend,
-        "agent_model": result.get("agent_model"),
-        "run_id": (result.get("provenance") or {}).get("run_id"),
-        "exit_code": 0 if agent_ok else 1,
-        "duration_ms": metrics.get("duration_ms"),
-        # Phase outcomes from result.json.
-        "agent_ok": agent_ok,
-        "build_status": result.get("build"),
-        "agent_status": result.get("agent"),
-        "verify_status": result.get("verify"),
-        # Measurements (verifier reward is a measurement, not attempt status).
-        "passed": result.get("passed"),
-        "reward": result.get("reward"),
-        "runtime_sec": result.get("runtime_sec"),
-        "verifier_scores": result.get("verifier_scores"),
-        # Provenance + candidate identity.
-        "provenance": result.get("provenance"),
-        "candidate_id": result.get("candidate_id"),
-        "candidate_params": result.get("candidate_params"),
-        "image": result.get("image"),
-        "output_dir": str(resolved_dir),
-        # Artifact discovery helpers.
-        "agent_log_dir": str(layout.agent_log_dir),
-        "workspace_dir": str(layout.workspace_dir),
-        "state_dir": str(layout.state_dir),
-        "final_answer_extracted": final_extracted,
-        "final_answer_source": final_source,
-    }
-    metadata.update({key: metrics.get(key) for key in _METRIC_KEYS})
-
-    return AgentEvalAttempt(
-        id=f"{task_id}:{backend}",
-        task_id=task_id,
-        status=status,
-        output=AgentOutput(text=output_text),
-        evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None,
-        metadata=metadata,
-    )
-
-
-def _layout_from_result_dir(output_dir: Path) -> AgenticRunLayout:
-    agent_log_dir = output_dir / "agent"
-    return AgenticRunLayout(
-        run_dir=output_dir,
-        agent_log_dir=agent_log_dir,
-        workspace_dir=output_dir / "workspace",
-        state_dir=output_dir / "state",
-        instruction_path=agent_log_dir / "instruction.md",
-    )
-
-
-def _resolve_output_text(layout: AgenticRunLayout) -> tuple[str, bool, str | None]:
-    if not layout.agent_log_dir.is_dir():
-        return "", False, None
-    artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir)
-    if artifacts.final_answer.extracted and artifacts.final_answer.text:
-        return artifacts.final_answer.text, True, artifacts.final_answer.source
-    log_path = layout.agent_log_dir / "nat_agent.log"
-    if log_path.is_file():
-        return log_path.read_text(encoding="utf-8", errors="replace").strip(), False, None
-    return "", False, None
diff --git a/tests/agentic-use/runtimes/shared/task_loader.py b/tests/agentic-use/runtimes/shared/task_loader.py
deleted file mode 100644
index e64a87e99d..0000000000
--- a/tests/agentic-use/runtimes/shared/task_loader.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Map agentic-use task directories to AgentEvalTask values."""
-
-from __future__ import annotations
-
-import tomllib
-from pathlib import Path
-
-from nemo_evaluator_sdk.agent_eval.types import AgentEvalTask
-from nemo_evaluator_sdk.metrics.protocol import Metric
-
-from runtimes.shared.constants import AGENTIC_USE_DIR
-from runtimes.shared.metrics import AgentPhaseSuccessMetric
-
-
-def load_task_toml(task_dir: Path) -> dict[str, object]:
-    task_toml = task_dir / "task.toml"
-    if not task_toml.exists():
-        return {}
-    try:
-        with task_toml.open("rb") as handle:
-            data = tomllib.load(handle)
-    except Exception:
-        return {}
-    return data if isinstance(data, dict) else {}
-
-
-def task_agent_timeout_sec(task_dir: Path) -> int | None:
-    data = load_task_toml(task_dir)
-    agent = data.get("agent")
-    if not isinstance(agent, dict):
-        return None
-    timeout_value = agent.get("timeout_sec")
-    if isinstance(timeout_value, (int, float)) and timeout_value > 0:
-        return int(timeout_value)
-    return None
-
-
-def agentic_task_from_dir(
-    task_dir: str | Path,
-    *,
-    tasks_root: Path | None = None,
-    metrics: list[Metric] | None = None,
-) -> AgentEvalTask:
-    """Build an :class:`AgentEvalTask` from an agentic-use task directory.
-
-    ``inputs`` carries only agent-facing material (``instruction``) per the SDK
-    design doc; runtime materialization details such as ``task_dir`` live in
-    ``metadata`` so they cannot leak into metric scoring rows. Metrics are
-    authored *on the task* (defaulting to :class:`AgentPhaseSuccessMetric`); the
-    orchestrator only appends compatibility metrics, it does not own the set.
-    """
-    root = Path(tasks_root or AGENTIC_USE_DIR)
-    task_path = Path(task_dir)
-    if not task_path.is_absolute():
-        task_path = (root / task_path).resolve()
-
-    instruction_path = task_path / "instruction.md"
-    if not instruction_path.exists():
-        raise FileNotFoundError(f"instruction.md not found in {task_path}")
-
-    instruction = instruction_path.read_text(encoding="utf-8")
-    task_toml = load_task_toml(task_path)
-
-    return AgentEvalTask(
-        id=task_path.name,
-        intent=instruction,
-        inputs={
-            "instruction": instruction,
-        },
-        metrics=metrics if metrics is not None else [AgentPhaseSuccessMetric()],
-        metadata={
-            "benchmark": "agentic-use",
-            "task_toml": task_toml,
-            "instruction_path": str(instruction_path),
-            "task_dir": str(task_path),
-        },
-    )
diff --git a/tests/agentic-use/runtimes/shared/usage.py b/tests/agentic-use/runtimes/shared/usage.py
deleted file mode 100644
index 89053ffb97..0000000000
--- a/tests/agentic-use/runtimes/shared/usage.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Token usage extraction from agent logs.
-
-Reuses the proven implementation from ``nat_runner.py`` until the legacy
-runner delegates here and the duplicate can be removed.
-"""
-
-from __future__ import annotations
-
-from typing import TypedDict
-
-
-class TokenMetrics(TypedDict):
-    prompt_tokens: int | None
-    completion_tokens: int | None
-    total_tokens: int | None
-    cache_creation_tokens: int | None
-    cache_read_tokens: int | None
-    n_assistant_messages: int | None
-    cost_usd: float | None
-    num_turns: int | None
-    duration_ms: float | None
-
-
-def extract_usage_metrics(agent_log: str) -> dict[str, int | float | None]:
-    """Extract token usage metrics from an agent log."""
-    import nat_runner
-
-    metrics = nat_runner._extract_usage_metrics(agent_log)
-    return dict(metrics)
diff --git a/tests/agentic-use/runtimes/shared/verify.py b/tests/agentic-use/runtimes/shared/verify.py
deleted file mode 100644
index 8be53924b9..0000000000
--- a/tests/agentic-use/runtimes/shared/verify.py
+++ /dev/null
@@ -1,208 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Live VERIFY phase executed through the environment boundary.
-
-Ports ``nat_runner.run_verify_phase`` onto :meth:`AgentEnvironmentHandle.run_verifier`
-so the task-local ``tests/test_outputs.py`` pytest verifier runs in the *same*
-prepared environment (and against the same persisted workspace/state) as the
-agent phase. The resulting reward is stamped onto the attempt metadata so the
-``VerifierRewardMetric`` compatibility metric scores it through the Evaluator SDK.
-"""
-
-from __future__ import annotations
-
-import textwrap
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any
-
-from runtimes.shared.constants import (
-    DOCKER_SOCKET_CONTAINER_PATH,
-    DOCKER_SOCKET_HOST_PATH,
-    EVALUATOR_SDK_SRC,
-    FILES_STORAGE_CONFIG,
-    PLATFORM_CONFIG_PATH,
-    SHARED_DIR,
-)
-from runtimes.shared.environment import AgentEnvironmentHandle, EnvRunSpec
-from runtimes.shared.layout import AgenticRunLayout
-
-
-@dataclass(frozen=True)
-class VerifierOutcome:
-    """Result of the live verifier phase for one task."""
-
-    ran: bool
-    passed: bool
-    reward: int
-    exit_code: int
-    stdout: str
-    verifier_log_dir: Path | None
-
-
-def verifier_log_dir(layout: AgenticRunLayout) -> Path:
-    return layout.run_dir / "verifier"
-
-
-def build_verify_run_spec(
-    task_dir: Path,
-    layout: AgenticRunLayout,
-    *,
-    nmp_base_url: str,
-    agent_backend: str,
-    agent_model: str,
-    smoke_workspace: str | None = None,
-    timeout_sec: int | None = None,
-    extra_args: list[str] | None = None,
-) -> EnvRunSpec | None:
-    """Build the verifier ``EnvRunSpec`` mirroring ``nat_runner.run_verify_phase``.
-
-    Returns ``None`` when the task has no ``tests/test_outputs.py`` (nothing to
-    verify), matching the runner's behavior.
-    """
-    tests_dir = task_dir / "tests"
-    if not (tests_dir / "test_outputs.py").exists():
-        return None
-
-    log_dir = verifier_log_dir(layout)
-    log_dir.mkdir(parents=True, exist_ok=True)
-    layout.workspace_dir.mkdir(parents=True, exist_ok=True)
-
-    smoke_seed_cmd = ""
-    smoke_cleanup_cmd = ""
-    if smoke_workspace:
-        smoke_seed_cmd = textwrap.dedent("""\
-            /app/.venv/bin/nemo workspaces create "${SMOKE_WORKSPACE}" \
-              --description "Seeded by agentic runtime smoke mode" >/dev/null 2>&1 || true
-        """)
-        smoke_cleanup_cmd = textwrap.dedent("""\
-            /app/.venv/bin/nemo workspaces delete "${SMOKE_WORKSPACE}" >/dev/null 2>&1 || true
-        """)
-
-    verify_cmd = [
-        "bash",
-        "-c",
-        textwrap.dedent(f"""\
-            export PYTHONPATH="/app/tests/agentic-use/shared:/app/packages/nemo_evaluator_sdk/src:${{PYTHONPATH}}"
-            export NAT_AGENT=1
-            {smoke_seed_cmd}
-            /app/.venv/bin/python -m pytest /tests/test_outputs.py -rA -v 2>&1 | tee /logs/verifier/test-stdout.txt
-            EXIT=${{PIPESTATUS[0]}}
-            {smoke_cleanup_cmd}
-            if [ $EXIT -eq 0 ]; then echo 1; else echo 0; fi > /logs/verifier/reward.txt
-            exit $EXIT
-        """),
-    ]
-
-    env: dict[str, str] = {
-        "NMP_BASE_URL": nmp_base_url,
-        "NAT_AGENT": "1",
-        "NAT_AGENT_BACKEND": agent_backend,
-        "NAT_AGENT_MODEL": agent_model,
-        "AGENTIC_USE_TASK_DIR": "/task",
-        "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace",
-        "SMOKE_WORKSPACE": smoke_workspace or "",
-        "DATABASE_DIALECT": "sqlite",
-        "DATABASE_PATH": "/data/nmp-platform.db",
-        "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG,
-        "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH,
-    }
-    if DOCKER_SOCKET_HOST_PATH.exists():
-        env["DOCKER_HOST"] = f"unix://{DOCKER_SOCKET_CONTAINER_PATH}"
-
-    mounts: list[tuple[str, str]] = [
-        (str(tests_dir), "/tests"),
-        (str(task_dir), "/task"),
-        (str(layout.workspace_dir), "/app/workspace"),
-        (str(SHARED_DIR), "/app/tests/agentic-use/shared:ro"),
-        (str(EVALUATOR_SDK_SRC), "/app/packages/nemo_evaluator_sdk/src:ro"),
-        (str(layout.agent_log_dir), "/logs/agent"),
-        (str(log_dir), "/logs/verifier"),
-        # Persist platform/db state across AGENT and VERIFY containers.
-        (str(layout.state_dir), "/data"),
-    ]
-    if DOCKER_SOCKET_HOST_PATH.exists():
-        mounts.append((str(DOCKER_SOCKET_HOST_PATH), DOCKER_SOCKET_CONTAINER_PATH))
-
-    return EnvRunSpec(
-        command=verify_cmd,
-        env=env,
-        mounts=mounts,
-        timeout=timeout_sec,
-        extra_args=list(extra_args or []),
-    )
-
-
-async def run_verify(
-    handle: AgentEnvironmentHandle,
-    spec: EnvRunSpec,
-    layout: AgenticRunLayout,
-) -> VerifierOutcome:
-    """Execute the verifier through the environment handle and collect reward."""
-    result = await handle.run_verifier(spec)
-    log_dir = verifier_log_dir(layout)
-    passed = result.ok
-
-    stdout = ""
-    stdout_path = log_dir / "test-stdout.txt"
-    if stdout_path.is_file():
-        stdout = stdout_path.read_text(encoding="utf-8", errors="replace")
-
-    reward_path = log_dir / "reward.txt"
-    if reward_path.is_file():
-        reward = 1 if reward_path.read_text(encoding="utf-8").strip() == "1" else 0
-    else:
-        reward = 1 if passed else 0
-        reward_path.write_text("1\n" if passed else "0\n", encoding="utf-8")
-
-    return VerifierOutcome(
-        ran=True,
-        passed=passed,
-        reward=reward,
-        exit_code=result.exit_code,
-        stdout=stdout,
-        verifier_log_dir=log_dir,
-    )
-
-
-async def maybe_run_verify(
-    handle: AgentEnvironmentHandle,
-    *,
-    enabled: bool,
-    task_dir: Path,
-    layout: AgenticRunLayout,
-    nmp_base_url: str,
-    agent_backend: str,
-    agent_model: str,
-    smoke_workspace: str | None = None,
-    timeout_sec: int | None = None,
-    extra_args: list[str] | None = None,
-) -> VerifierOutcome:
-    """Run the verifier through ``handle`` when enabled and a verifier exists."""
-    if not enabled:
-        return VerifierOutcome(ran=False, passed=False, reward=0, exit_code=0, stdout="", verifier_log_dir=None)
-    spec = build_verify_run_spec(
-        task_dir,
-        layout,
-        nmp_base_url=nmp_base_url,
-        agent_backend=agent_backend,
-        agent_model=agent_model,
-        smoke_workspace=smoke_workspace,
-        timeout_sec=timeout_sec,
-        extra_args=extra_args,
-    )
-    if spec is None:
-        return VerifierOutcome(ran=False, passed=False, reward=0, exit_code=0, stdout="", verifier_log_dir=None)
-    return await run_verify(handle, spec, layout)
-
-
-def apply_verify_to_metadata(metadata: dict[str, Any], outcome: VerifierOutcome) -> None:
-    """Stamp verifier reward/status onto attempt metadata for scoring + gating."""
-    if not outcome.ran:
-        metadata.setdefault("verify_status", "skipped")
-        return
-    metadata["verify_status"] = "ok" if outcome.passed else "failed"
-    metadata["passed"] = outcome.passed
-    metadata["reward"] = outcome.reward
-    metadata["verifier_log_dir"] = str(outcome.verifier_log_dir) if outcome.verifier_log_dir else None
diff --git a/tests/agentic-use/runtimes/workflow/runtime.py b/tests/agentic-use/runtimes/workflow/runtime.py
index 1d8c09fecd..55688b3d24 100644
--- a/tests/agentic-use/runtimes/workflow/runtime.py
+++ b/tests/agentic-use/runtimes/workflow/runtime.py
@@ -8,20 +8,21 @@
 from collections.abc import Sequence
 from pathlib import Path
 
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import AgentEnvironmentProvider, EnvRunSpec
+from nemo_evaluator_sdk.agent_eval.runtimes.verify import apply_verify_to_metadata
 from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalRunConfig, AgentEvalTask
 
-from runtimes.shared.artifacts import build_agent_eval_attempt
 from runtimes.shared.config import WorkflowRuntimeConfig
 from runtimes.shared.constants import INSTRUCTION_CONTAINER_PATH, WORKFLOW_CONTAINER_PATH
-from runtimes.shared.container_env import base_container_env
-from runtimes.shared.environment import (
-    AgentEnvironmentProvider,
+from runtimes.shared.platform import (
+    AgenticRunLayout,
     DockerEnvironmentProvider,
-    EnvRunSpec,
+    base_container_env,
+    build_agent_eval_attempt,
+    maybe_run_verify,
+    resolve_run_layout,
+    task_agent_timeout_sec,
 )
-from runtimes.shared.layout import AgenticRunLayout, resolve_run_layout
-from runtimes.shared.task_loader import task_agent_timeout_sec
-from runtimes.shared.verify import apply_verify_to_metadata, maybe_run_verify
 from runtimes.workflow.command import build_workflow_agent_cmd
 from runtimes.workflow.prep import prepare_workflow_for_runtime
 
diff --git a/tests/agentic-use/tests/test_agentic_runtimes.py b/tests/agentic-use/tests/test_agentic_runtimes.py
index 935ddf7389..1903989705 100644
--- a/tests/agentic-use/tests/test_agentic_runtimes.py
+++ b/tests/agentic-use/tests/test_agentic_runtimes.py
@@ -10,10 +10,9 @@
 
 import pytest
 import yaml
+from nemo_evaluator_sdk.agent_eval.runtimes.environment import EnvCommandResult, EnvRunSpec
 from runtimes.shared.config import AgenticSharedConfig, WorkflowRuntimeConfig
-from runtimes.shared.environment import EnvCommandResult, EnvRunSpec
-from runtimes.shared.layout import resolve_run_layout, task_image_tag
-from runtimes.shared.task_loader import agentic_task_from_dir
+from runtimes.shared.platform import agentic_task_from_dir, resolve_run_layout, task_image_tag
 from runtimes.workflow.command import build_workflow_agent_cmd
 from runtimes.workflow.prep import prepare_workflow_for_runtime
 from runtimes.workflow.runtime import NatWorkflowAttemptRuntime
@@ -122,8 +121,7 @@ def test_runtime_for_backend_rejects_unknown() -> None:
 
 
 def test_build_agent_eval_attempt_metadata_matches_captured_schema(tmp_path: Path) -> None:
-    from runtimes.shared.artifacts import build_agent_eval_attempt, to_captured_agent_attempt
-    from runtimes.shared.layout import AgenticRunLayout
+    from runtimes.shared.platform import AgenticRunLayout, build_agent_eval_attempt, to_captured_agent_attempt
 
     task = agentic_task_from_dir(WORKSPACE_BASIC, tasks_root=TASKS_DIR)
     layout = AgenticRunLayout(
@@ -183,7 +181,7 @@ async def test_aut_runtime_run_tasks_with_mocked_env(tmp_path: Path) -> None:
 
 
 def test_attempt_from_result_maps_status_and_measurements(tmp_path: Path) -> None:
-    from runtimes.shared.result_adapter import attempt_from_result
+    from runtimes.shared.platform import attempt_from_result
 
     output_dir = tmp_path / "20260101T000000Z-demo"
     (output_dir / "agent").mkdir(parents=True)
@@ -218,7 +216,7 @@ def test_attempt_from_result_maps_status_and_measurements(tmp_path: Path) -> Non
 
 
 def test_attempt_from_result_marks_unsuccessful_agent_partial(tmp_path: Path) -> None:
-    from runtimes.shared.result_adapter import attempt_from_result
+    from runtimes.shared.platform import attempt_from_result
 
     output_dir = tmp_path / "run"
     (output_dir / "agent").mkdir(parents=True)
@@ -271,7 +269,7 @@ async def test_score_captured_attempts_offline(tmp_path: Path) -> None:
 @pytest.mark.asyncio
 async def test_verifier_reward_metric_reads_metadata() -> None:
     from nemo_evaluator_sdk.metrics.protocol import CandidateOutput, DatasetRow, MetricInput
-    from runtimes.shared.metrics import VerifierRewardMetric
+    from runtimes.shared.platform import VerifierRewardMetric
 
     metric = VerifierRewardMetric()
     candidate = CandidateOutput(output_text="x", metadata={"reward": 1})
@@ -303,6 +301,8 @@ def _make_run_result(*, reward: float, total_tokens: int, runtime_sec: float, co
         },
     )
     task_result = AgentEvalTaskResult(
+        id="demo:workflow:agentic_use_verifier_reward",
+        run_id="run-1",
         task_id="demo",
         attempt_id="demo:workflow",
         metric_type="agentic_use_verifier_reward",
@@ -318,7 +318,7 @@ def _make_run_result(*, reward: float, total_tokens: int, runtime_sec: float, co
 
 
 def test_summarize_run_aggregates_pass_tokens_runtime_provenance() -> None:
-    from runtimes.shared.reporting import summarize_run
+    from nemo_evaluator_sdk.agent_eval.gating import summarize_run
 
     summary = summarize_run(_make_run_result(reward=1.0, total_tokens=120, runtime_sec=4.5))
 
@@ -331,7 +331,7 @@ def test_summarize_run_aggregates_pass_tokens_runtime_provenance() -> None:
 
 
 def test_evaluate_gate_passes_then_flags_token_regression(tmp_path: Path) -> None:
-    from runtimes.shared.reporting import GateThresholds, evaluate_gate, write_gate_report
+    from nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate, write_gate_report
 
     baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0)
     candidate = _make_run_result(reward=1.0, total_tokens=200, runtime_sec=4.0)
@@ -354,7 +354,7 @@ def test_evaluate_gate_passes_then_flags_token_regression(tmp_path: Path) -> Non
 
 
 def test_evaluate_gate_blocks_cross_commit_comparison() -> None:
-    from runtimes.shared.reporting import GateThresholds, evaluate_gate
+    from nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate
 
     baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="aaa111")
     candidate = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="bbb222")
@@ -376,8 +376,7 @@ def test_evaluate_gate_blocks_cross_commit_comparison() -> None:
 
 
 def test_build_verify_run_spec_shape(tmp_path: Path) -> None:
-    from runtimes.shared.layout import AgenticRunLayout
-    from runtimes.shared.verify import build_verify_run_spec
+    from runtimes.shared.platform import AgenticRunLayout, build_verify_run_spec
 
     layout = AgenticRunLayout(
         run_dir=tmp_path,
@@ -402,8 +401,7 @@ def test_build_verify_run_spec_shape(tmp_path: Path) -> None:
 
 
 def test_build_verify_run_spec_returns_none_without_tests(tmp_path: Path) -> None:
-    from runtimes.shared.layout import AgenticRunLayout
-    from runtimes.shared.verify import build_verify_run_spec
+    from runtimes.shared.platform import AgenticRunLayout, build_verify_run_spec
 
     task_dir = tmp_path / "no-tests-task"
     task_dir.mkdir()
@@ -420,9 +418,8 @@ def test_build_verify_run_spec_returns_none_without_tests(tmp_path: Path) -> Non
 
 @pytest.mark.asyncio
 async def test_run_verify_reads_reward_file(tmp_path: Path) -> None:
-    from runtimes.shared.environment import EnvCommandResult, EnvRunSpec
-    from runtimes.shared.layout import AgenticRunLayout
-    from runtimes.shared.verify import run_verify
+    from nemo_evaluator_sdk.agent_eval.runtimes.environment import EnvCommandResult, EnvRunSpec
+    from runtimes.shared.platform import AgenticRunLayout, run_verify
 
     layout = AgenticRunLayout(
         run_dir=tmp_path,
@@ -454,7 +451,7 @@ async def close(self) -> None:
 
 @pytest.mark.asyncio
 async def test_workflow_runtime_runs_verify_through_handle(tmp_path: Path) -> None:
-    from runtimes.shared.verify import verifier_log_dir
+    from runtimes.shared.platform import verifier_log_dir
 
     task = agentic_task_from_dir(WORKSPACE_BASIC, tasks_root=TASKS_DIR)
     layout = resolve_run_layout(task, AgenticSharedConfig(jobs_dir=tmp_path))
@@ -491,7 +488,7 @@ async def prepare(self, task: object, config: object = None) -> _Handle:
 
 
 def test_load_environment_spec_prefers_yaml(tmp_path: Path) -> None:
-    from runtimes.shared.environment_spec import load_environment_spec
+    from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec
 
     (tmp_path / "environment.yaml").write_text(
         "environment:\n"
@@ -514,7 +511,7 @@ def test_load_environment_spec_prefers_yaml(tmp_path: Path) -> None:
 
 
 def test_load_environment_spec_falls_back_to_dockerfile(tmp_path: Path) -> None:
-    from runtimes.shared.environment_spec import load_environment_spec
+    from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec
 
     env_dir = tmp_path / "environment"
     env_dir.mkdir()
@@ -526,14 +523,14 @@ def test_load_environment_spec_falls_back_to_dockerfile(tmp_path: Path) -> None:
 
 
 def test_load_environment_spec_missing_raises(tmp_path: Path) -> None:
-    from runtimes.shared.environment_spec import load_environment_spec
+    from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec
 
     with pytest.raises(FileNotFoundError):
         load_environment_spec(tmp_path)
 
 
 def test_plan_task_build_dockerfile_escape_hatch(tmp_path: Path) -> None:
-    from runtimes.shared.environment_spec import plan_task_build
+    from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import plan_task_build
 
     env_dir = tmp_path / "environment"
     env_dir.mkdir()
@@ -547,7 +544,7 @@ def test_plan_task_build_dockerfile_escape_hatch(tmp_path: Path) -> None:
 
 
 def test_plan_task_build_generates_derived_dockerfile(tmp_path: Path) -> None:
-    from runtimes.shared.environment_spec import plan_task_build
+    from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import plan_task_build
 
     (tmp_path / "environment.yaml").write_text(
         "environment:\n  image: base:1\n  dependencies:\n    python: [pytest]\n  setup: [seed-providers]\n",