NVIDIA-NeMo · arpitsardhana · Jun 9, 2026 · Jun 10, 2026 · Jun 10, 2026 · SandyChapman
@@ -5,9 +5,11 @@
 
 from nemo_evaluator_sdk.agent_eval.dashboard import render_dashboard, write_dashboard
 from nemo_evaluator_sdk.agent_eval.evaluator import AgentEvaluator
+from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig
 from nemo_evaluator_sdk.agent_eval.persistence import persist_run
 from nemo_evaluator_sdk.agent_eval.types import (
     AgentAttemptRuntime,
+    AgentAttemptSource,
     AgentEvalAttempt,
     AgentEvalDiagnostic,
     AgentEvalMetricOutputCoverage,
@@ -24,21 +26,24 @@
 from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor, LocalFilesystemEvidence
 
 __all__ = [
+    "AgentAttemptRuntime",
+    "AgentAttemptSource",
     "AgentEvalAttempt",
     "AgentEvalDiagnostic",
     "AgentEvalMetricOutputCoverage",
+    "AgentEvalOrchestrator",
     "AgentEvalRunConfig",
     "AgentEvalRunResult",
     "AgentEvalSummary",
     "AgentEvalTarget",
     "AgentEvalTask",
     "AgentEvalTaskResult",
     "AgentEvaluator",
-    "AgentAttemptRuntime",
     "AgentOutput",
     "CandidateEvidence",
     "EvidenceDescriptor",
     "LocalFilesystemEvidence",
+    "OrchestratorConfig",
     "SemanticView",
     "ViewSignal",
     "persist_run",

@@ -0,0 +1,90 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Helpers for shaping :class:`AgentEvalAttempt` values from runtime artifacts.
+
+These are the runtime-agnostic pieces: the *scorable* status mapping and the
+standard evidence-key builder. Platform-specific attempt construction (reading
+proprietary artifact layouts, extra evidence keys) composes these in the adapter.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttemptStatus
+from nemo_evaluator_sdk.values.evidence import EvidenceDescriptor
+
+
+def resolve_attempt_status(agent_ok: bool) -> AgentEvalAttemptStatus:
+    """Map an agent-phase outcome to a *scorable* attempt status.
+
+    :class:`~nemo_evaluator_sdk.agent_eval.evaluator.AgentEvaluator` excludes
+    ``status=="failed"`` from scoring (it short-circuits to a failed metric
+    result). An agent that ran but did not succeed must still be scored — e.g. as
+    a ``0`` — so pass-rate gating counts it instead of dropping it. We therefore
+    use ``"partial"`` for an executed-but-unsuccessful agent and reserve
+    ``"failed"`` for genuine attempt-*production* failures (which a runtime
+    surfaces by raising, not by emitting an unscorable attempt).
+
+    :param agent_ok: Whether the agent phase succeeded.
+    :returns: ``"completed"`` when ``agent_ok`` is true, otherwise ``"partial"``.
+        This function never returns ``"failed"``.
+    """
+    return "completed" if agent_ok else "partial"
+
+
+def standard_evidence_descriptors(
+    *,
+    logs_dir: str | Path,
+    final_state_dir: str | Path,
+    trace_path: str | Path | None = None,
+    initial_state_ref: str | None = None,
+    verifier_logs_dir: str | Path | None = None,
+    primary_log: str | None = None,
+) -> dict[str, EvidenceDescriptor]:
+    """Build the documented evidence map for an agent-eval attempt.
+
+    Standard keys: ``initial_state`` (task input filesystem, when staged),
+    ``trace`` (trajectory, ATIF-normalized when available), ``logs`` (agent log
+    dir), ``final_state`` (workspace), and ``verifier_logs`` (only when present).
+    Callers may add their own extension keys to the returned mapping.
+
+    All arguments are keyword-only.
+
+    :param logs_dir: Stringified into the ``ref`` of the ``logs`` descriptor,
+        which is always emitted.
+    :param final_state_dir: Stringified into the ``ref`` of the ``final_state``
+        descriptor, which is always emitted.
+    :param trace_path: When not ``None``, emitted as ``trace``. Its ``format`` is
+        ``"atif"`` if the file name starts with ``atif``, otherwise ``"json"``.
+    :param initial_state_ref: When truthy, emitted as ``initial_state``.
+    :param verifier_logs_dir: Emitted as ``verifier_logs`` only when it is not
+        ``None`` and the path exists at call time.
+    :param primary_log: When truthy, stored under the ``primary_log`` metadata
+        key of the ``logs`` descriptor; otherwise that metadata is empty.
+    :returns: A new dict keyed by evidence name, inserted in the order
+        ``initial_state``, ``trace``, ``logs``, ``final_state``,
+        ``verifier_logs`` (optional keys omitted when not applicable).
+    """
+    descriptors: dict[str, EvidenceDescriptor] = {}
+
+    # Truthiness check: an empty-string ref is skipped just like ``None``.
+    if initial_state_ref:
+        descriptors["initial_state"] = EvidenceDescriptor(
+            kind="filesystem",
+            format="dir",
+            ref=str(initial_state_ref),
+            metadata={"role": "initial_state"},
+        )
+
+    if trace_path is not None:
+        # The format is inferred from the file-name prefix only; the file itself
+        # is not opened or inspected here.
+        trace_name = Path(trace_path).name
+        descriptors["trace"] = EvidenceDescriptor(
+            kind="trace",
+            format="atif" if trace_name.startswith("atif") else "json",
+            ref=str(trace_path),
+        )
+
+    logs_metadata = {"primary_log": primary_log} if primary_log else {}
+    descriptors["logs"] = EvidenceDescriptor(
+        kind="logs",
+        format="dir",
+        ref=str(logs_dir),
+        metadata=logs_metadata,
+    )
+
+    descriptors["final_state"] = EvidenceDescriptor(
+        kind="filesystem",
+        format="dir",
+        ref=str(final_state_dir),
+        metadata={"role": "final_state"},
+    )
+
+    # This is the only branch that touches the filesystem: the descriptor is
+    # added only if the verifier log path exists when this function runs.
+    if verifier_logs_dir is not None and Path(verifier_logs_dir).exists():
+        descriptors["verifier_logs"] = EvidenceDescriptor(
+            kind="logs",
+            format="dir",
+            ref=str(verifier_logs_dir),
+            metadata={"role": "verifier"},
+        )
+
+    return descriptors
@@ -0,0 +1,79 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Reusable agent-eval metrics.
+
+``AgentPhaseSuccessMetric`` reads the agent-phase outcome stamped on attempt
+metadata. ``EvidencePresenceMetric`` is a genuine *metric-over-evidence*: it
+scores by inspecting ``candidate.evidence`` (a filesystem evidence handle)
+rather than a reward written into metadata — the value proposition of scoring
+over evidence instead of trusting a verifier's stamped reward.
+"""
+
+from __future__ import annotations
+
+from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult
+
+
+class AgentPhaseSuccessMetric:
+    """Score 1.0 when the agent phase exited successfully, else 0.0.
+
+    The metric ``type`` is overridable via the ``metric_type`` class attribute so
+    callers can namespace it; the output name stays ``agent_phase_success`` (which
+    gating reads as a reward signal).
+    """
+
+    # Value returned by the ``type`` property; override to namespace the metric.
+    metric_type: str = "agent_phase_success"
+
+    @property
+    def type(self) -> str:
+        """Return the metric type identifier, taken from ``metric_type``."""
+        return self.metric_type
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        """Declare the single continuous-score output, ``agent_phase_success``."""
+        return [MetricOutputSpec.continuous_score("agent_phase_success")]
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        """Score the candidate from the ``agent_ok`` entry of its metadata.
+
+        The value is coerced with ``bool()``, so a missing key or any falsy value
+        scores ``0.0`` and any truthy value (not only ``True``) scores ``1.0``.
+
+        :param input: Metric input whose ``candidate.metadata`` is read.
+        :returns: A result with one ``agent_phase_success`` output.
+        """
+        agent_ok = bool(input.candidate.metadata.get("agent_ok"))
+        return MetricResult(outputs=[MetricOutput(name="agent_phase_success", value=1.0 if agent_ok else 0.0)])
+
+
+class EvidencePresenceMetric:
+    """Score 1.0 when a named filesystem evidence directory exists (and is non-empty).
+
+    Reads ``candidate.evidence`` directly — the canonical metric-over-evidence
+    pattern — so the score reflects what the agent actually produced on disk,
+    not a reward stamped into metadata by a verifier.
+    """
+
+    def __init__(
+        self,
+        *,
+        evidence_name: str = "final_state",
+        output_name: str = "evidence_present",
+        require_non_empty: bool = True,
+    ) -> None:
+        """Configure which evidence entry is checked and how it is reported.
+
+        :param evidence_name: Key looked up in ``candidate.evidence``.
+        :param output_name: Name of the single metric output that is emitted.
+        :param require_non_empty: When true, an existing handle must also yield a
+            truthy ``iter_paths(recursive=True)`` result to score ``1.0``.
+        """
+        self._evidence_name = evidence_name
+        self._output_name = output_name
+        self._require_non_empty = require_non_empty
+
+    @property
+    def type(self) -> str:
+        """Return the fixed metric type identifier, ``evidence_presence``."""
+        return "evidence_presence"
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        """Declare the single continuous-score output named by ``output_name``."""
+        return [MetricOutputSpec.continuous_score(self._output_name)]
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        """Score ``1.0`` when the configured evidence exists, else ``0.0``.
+
+        The score stays ``0.0`` when the candidate has no evidence, when the
+        named entry is absent, when the handle reports it does not exist, or
+        when the lookup raises ``KeyError`` or ``ValueError``.
+
+        :param input: Metric input whose ``candidate.evidence`` is inspected.
+        :returns: A result with one output named by ``output_name``.
+        """
+        score = 0.0
+        evidence = input.candidate.evidence
+        if evidence is not None and evidence.get(self._evidence_name) is not None:
+            try:
+                handle = await evidence.filesystem(self._evidence_name)
+                if await handle.exists():
+                    if self._require_non_empty:
+                        # NOTE(review): relies on the awaited result being falsy
+                        # when there are no paths (presumably a list or similar
+                        # collection). If ``iter_paths`` returns a lazy iterator
+                        # this would always be truthy — confirm against the API.
+                        score = 1.0 if await handle.iter_paths(recursive=True) else 0.0
+                    else:
+                        score = 1.0
+            except (KeyError, ValueError):
+                # Deliberate best-effort: these errors are treated as "absent".
+                score = 0.0
+        return MetricResult(outputs=[MetricOutput(name=self._output_name, value=score)])
@@ -1,19 +1,21 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 """Deterministic gating + provenance comparison over an agent-eval run bundle.
 
-This closes the design-doc B4 "CI/reporting" gap. Persistence of
-``tasks.jsonl``/``attempts.jsonl``/``results.jsonl``/``summary.json``/``report.html``
-is already handled by the SDK (``agent_eval.persistence.persist_run`` /
-``write_dashboard``); this module adds the missing piece: a candidate-vs-baseline
-gate (pass-rate, token/cost, runtime tie-breaker) plus deterministic provenance
-checks.
-
-The semantics intentionally mirror ``passrate_token_policy_gate.py`` so a summary
-produced here is interchangeable with the legacy gate's baseline summary. The
-difference is the input: this operates on a typed :class:`AgentEvalRunResult`
-(metric scores + attempt metadata) instead of scanning ``result.json`` files.
+Persistence of the run bundle (``tasks.jsonl``/``attempts.jsonl``/
+``results.jsonl``/``summary.json``/``report.html``) is handled by
+``agent_eval.persistence`` / ``write_dashboard``. This module adds the
+candidate-vs-baseline gate (pass-rate, token/cost, runtime tie-breaker) plus
+deterministic provenance checks.
+
+Relationship to :class:`~nemo_evaluator_sdk.agent_eval.types.AgentEvalSummary`:
+that summary reports the *mean score per metric output* over a run. The gate's
+``pass_rate`` here is a different, intentional view — a per-task pass/fail count
+against a reward threshold — so it is computed separately. Token/runtime/
+provenance aggregation is delegated to
+:class:`~nemo_evaluator_sdk.agent_eval.measurements.AttemptMeasurements` so the
+measurement keys are read in exactly one place.
 """
 
 from __future__ import annotations
@@ -23,13 +25,13 @@
 from pathlib import Path
 from typing import Any
 
-from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalRunResult, AgentEvalTaskResult
+from nemo_evaluator_sdk.agent_eval.measurements import AttemptMeasurements
+from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunResult, AgentEvalTaskResult
 
 # Metric outputs, in priority order, that represent a task's pass/reward signal.
 DEFAULT_REWARD_OUTPUTS: tuple[str, ...] = ("verifier_reward", "agent_phase_success")
 
-# Provenance fields collapsed into a single run-level summary (matches the
-# legacy gate so baselines are interchangeable).
+# Provenance fields collapsed into a single run-level summary.
 _PROVENANCE_FIELDS: tuple[str, ...] = (
     "commit_sha",
     "commit_short",
@@ -91,7 +93,7 @@ def evaluate_gate(
 
 
 def write_gate_report(report: GateReport, output_dir: str | Path, *, filename: str = "gate.json") -> Path:
-    """Persist the gate report alongside the SDK run bundle."""
+    """Persist the gate report alongside the run bundle."""
     path = Path(output_dir)
     path.mkdir(parents=True, exist_ok=True)
     gate_path = path / filename
@@ -115,8 +117,13 @@ def summarize_run(
     *,
     reward_outputs: tuple[str, ...] = DEFAULT_REWARD_OUTPUTS,
 ) -> dict[str, Any]:
-    """Aggregate pass-rate, token, runtime, and provenance for one run."""
-    attempts_by_task: dict[str, AgentEvalAttempt] = {attempt.task_id: attempt for attempt in result.attempts}
+    """Aggregate pass-rate, token, runtime, and provenance for one run.
+
+    Token/runtime/provenance are read via :class:`AttemptMeasurements`; the
+    reward used for pass-rate prefers a scored metric output (``reward_outputs``)
+    and falls back to the attempt's recorded reward.
+    """
+    attempts_by_task = {attempt.task_id: attempt for attempt in result.attempts}
     reward_by_task = _rewards_by_task(result.results, reward_outputs)
     task_ids = sorted({task.id for task in result.tasks} | set(attempts_by_task))
 
@@ -131,29 +138,28 @@ def summarize_run(
 
     for task_id in task_ids:
         attempt = attempts_by_task.get(task_id)
-        metadata = attempt.metadata if attempt is not None else {}
+        measurements = AttemptMeasurements.from_metadata(attempt.metadata if attempt is not None else {})
 
-        reward_value = _task_reward(task_id, reward_by_task, metadata)
+        reward_value = reward_by_task.get(task_id)
+        if reward_value is None:
+            reward_value = measurements.reward if measurements.reward is not None else 0.0
         if reward_value >= 1.0:
             passed += 1
 
-        total_tokens = metadata.get("total_tokens")
-        if isinstance(total_tokens, int):
-            token_sum += total_tokens
+        if measurements.total_tokens is not None:
+            token_sum += measurements.total_tokens
             token_count += 1
         else:
             token_unavailable.append(task_id)
 
-        runtime_sec = _task_runtime_sec(metadata)
-        if runtime_sec is not None:
-            runtime_sum += runtime_sec
+        if measurements.runtime_sec is not None:
+            runtime_sum += measurements.runtime_sec
             runtime_count += 1
         else:
             runtime_unavailable.append(task_id)
 
-        prov = metadata.get("provenance")
-        if isinstance(prov, dict):
-            provenance_inputs.append(prov)
+        if measurements.provenance:
+            provenance_inputs.append(measurements.provenance)
 
     total = len(task_ids)
     return {
@@ -404,28 +410,6 @@ def _numeric_output(task_result: AgentEvalTaskResult, name: str) -> float | None
     return None
 
 
-def _task_reward(task_id: str, reward_by_task: dict[str, float], metadata: dict[str, Any]) -> float:
-    if task_id in reward_by_task:
-        return reward_by_task[task_id]
-    reward = metadata.get("reward")
-    if reward is not None:
-        try:
-            return float(reward)
-        except (TypeError, ValueError):
-            return 0.0
-    return 1.0 if metadata.get("passed") is True else 0.0
-
-
-def _task_runtime_sec(metadata: dict[str, Any]) -> float | None:
-    runtime_sec = metadata.get("runtime_sec")
-    if isinstance(runtime_sec, int | float):
-        return float(runtime_sec)
-    duration_ms = metadata.get("duration_ms")
-    if isinstance(duration_ms, int | float):
-        return float(duration_ms) / 1000.0
-    return None
-
-
 def _aggregate_provenance(provenances: list[dict[str, Any]]) -> dict[str, Any]:
     observed: dict[str, set[Any]] = {field_name: set() for field_name in _PROVENANCE_FIELDS}
     for prov in provenances: