Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@

from nemo_evaluator_sdk.agent_eval.dashboard import render_dashboard, write_dashboard
from nemo_evaluator_sdk.agent_eval.evaluator import AgentEvaluator
from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig
from nemo_evaluator_sdk.agent_eval.persistence import persist_run
from nemo_evaluator_sdk.agent_eval.types import (
AgentAttemptRuntime,
AgentAttemptSource,
AgentEvalAttempt,
AgentEvalDiagnostic,
AgentEvalMetricOutputCoverage,
Expand All @@ -24,21 +26,24 @@
from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor, LocalFilesystemEvidence

__all__ = [
"AgentAttemptRuntime",
"AgentAttemptSource",
"AgentEvalAttempt",
"AgentEvalDiagnostic",
"AgentEvalMetricOutputCoverage",
"AgentEvalOrchestrator",
"AgentEvalRunConfig",
"AgentEvalRunResult",
"AgentEvalSummary",
"AgentEvalTarget",
"AgentEvalTask",
"AgentEvalTaskResult",
"AgentEvaluator",
"AgentAttemptRuntime",
"AgentOutput",
"CandidateEvidence",
"EvidenceDescriptor",
"LocalFilesystemEvidence",
"OrchestratorConfig",
"SemanticView",
"ViewSignal",
"persist_run",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Helpers for shaping :class:`AgentEvalAttempt` values from runtime artifacts.

These are the runtime-agnostic pieces: the *scorable* status mapping and the
standard evidence-key builder. Platform-specific attempt construction (reading
proprietary artifact layouts, extra evidence keys) composes these in the adapter.
"""

from __future__ import annotations

from pathlib import Path

from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttemptStatus
from nemo_evaluator_sdk.values.evidence import EvidenceDescriptor


def resolve_attempt_status(agent_ok: bool) -> AgentEvalAttemptStatus:
    """Translate the agent-phase outcome into a status that is still scored.

    A successful agent phase yields ``"completed"``; an agent that ran but did
    not succeed yields ``"partial"``. ``"failed"`` is deliberately never
    returned here:
    :class:`~nemo_evaluator_sdk.agent_eval.evaluator.AgentEvaluator` leaves
    ``status=="failed"`` attempts out of scoring (short-circuiting to a failed
    metric result), whereas an executed-but-unsuccessful agent has to be scored
    (e.g. as ``0``) so that pass-rate gating counts it rather than dropping it.
    ``"failed"`` stays reserved for genuine attempt-*production* failures,
    which a runtime reports by raising instead of emitting an unscorable
    attempt.
    """
    if agent_ok:
        return "completed"
    # Ran but did not succeed: keep the attempt scorable.
    return "partial"


def standard_evidence_descriptors(
    *,
    logs_dir: str | Path,
    final_state_dir: str | Path,
    trace_path: str | Path | None = None,
    initial_state_ref: str | None = None,
    verifier_logs_dir: str | Path | None = None,
    primary_log: str | None = None,
) -> dict[str, EvidenceDescriptor]:
    """Build the documented evidence map for an agent-eval attempt.

    Standard keys, in insertion order:

    * ``initial_state`` - task input filesystem; added only when
      ``initial_state_ref`` is a non-empty string.
    * ``trace`` - trajectory; added only when ``trace_path`` is given. The
      format is ``"atif"`` when the file name starts with ``atif``, otherwise
      ``"json"``.
    * ``logs`` - agent log directory; always added. ``primary_log``, when
      non-empty, is recorded in the descriptor metadata.
    * ``final_state`` - workspace directory; always added.
    * ``verifier_logs`` - added only when ``verifier_logs_dir`` is given and is
      an existing directory on the local filesystem.

    Callers may add their own extension keys to the returned mapping.
    """
    descriptors: dict[str, EvidenceDescriptor] = {}

    if initial_state_ref:
        descriptors["initial_state"] = EvidenceDescriptor(
            kind="filesystem",
            format="dir",
            ref=str(initial_state_ref),
            metadata={"role": "initial_state"},
        )

    if trace_path is not None:
        # Only the file name (not its parent directories) selects the format.
        trace_name = Path(trace_path).name
        descriptors["trace"] = EvidenceDescriptor(
            kind="trace",
            format="atif" if trace_name.startswith("atif") else "json",
            ref=str(trace_path),
        )

    logs_metadata = {"primary_log": primary_log} if primary_log else {}
    descriptors["logs"] = EvidenceDescriptor(
        kind="logs",
        format="dir",
        ref=str(logs_dir),
        metadata=logs_metadata,
    )

    descriptors["final_state"] = EvidenceDescriptor(
        kind="filesystem",
        format="dir",
        ref=str(final_state_dir),
        metadata={"role": "final_state"},
    )

    # The descriptor advertises format="dir", so require an actual directory:
    # a stray regular file at this path must not be published as a log dir.
    if verifier_logs_dir is not None and Path(verifier_logs_dir).is_dir():
        descriptors["verifier_logs"] = EvidenceDescriptor(
            kind="logs",
            format="dir",
            ref=str(verifier_logs_dir),
            metadata={"role": "verifier"},
        )

    return descriptors
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Reusable agent-eval metrics.

``AgentPhaseSuccessMetric`` reads the agent-phase outcome stamped on attempt
metadata. ``EvidencePresenceMetric`` is a genuine *metric-over-evidence*: it
scores by inspecting ``candidate.evidence`` (a filesystem evidence handle)
rather than a reward written into metadata — the value proposition of scoring
over evidence instead of trusting a verifier's stamped reward.
"""

from __future__ import annotations

import logging

from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult


class AgentPhaseSuccessMetric:
    """Binary agent-phase metric: 1.0 on success, 0.0 otherwise.

    The outcome is read from the ``agent_ok`` entry of the candidate's
    metadata; a missing or falsy entry scores 0.0. The metric ``type`` can be
    namespaced by overriding the ``metric_type`` class attribute, while the
    output name is always ``agent_phase_success`` (which gating reads as a
    reward signal).
    """

    # Value reported by ``type``; override to namespace the metric.
    metric_type: str = "agent_phase_success"

    @property
    def type(self) -> str:
        """Return the metric type identifier taken from ``metric_type``."""
        return self.metric_type

    def output_spec(self) -> list[MetricOutputSpec]:
        """Declare the single continuous ``agent_phase_success`` output."""
        # NOTE(review): review feedback proposed a boolean output spec with a
        # bool value instead; the continuous 1.0/0.0 form is kept here.
        # Confirm downstream gating accepts boolean outputs before switching.
        success_spec = MetricOutputSpec.continuous_score("agent_phase_success")
        return [success_spec]

    async def compute_scores(self, input: MetricInput) -> MetricResult:
        """Score the candidate from the ``agent_ok`` flag in its metadata."""
        succeeded = bool(input.candidate.metadata.get("agent_ok"))
        # float(True) == 1.0 and float(False) == 0.0.
        success_output = MetricOutput(name="agent_phase_success", value=float(succeeded))
        return MetricResult(outputs=[success_output])


class EvidencePresenceMetric:
    """Score 1.0 when a named filesystem evidence directory exists (and is non-empty).

    Reads ``candidate.evidence`` directly, so the score reflects what is found
    through the evidence handle rather than a reward stamped into metadata.
    Every path that produces a 0.0 score logs its reason, so a zero result can
    be diagnosed from the logs.

    Constructor arguments (all keyword-only):

    * ``evidence_name`` - key of the evidence entry to inspect.
    * ``output_name`` - name of the emitted metric output.
    * ``require_non_empty`` - when true, an existing but empty directory
      scores 0.0.
    """

    def __init__(
        self,
        *,
        evidence_name: str = "final_state",
        output_name: str = "evidence_present",
        require_non_empty: bool = True,
    ) -> None:
        self._evidence_name = evidence_name
        self._output_name = output_name
        self._require_non_empty = require_non_empty

    @property
    def type(self) -> str:
        """Return the fixed metric type identifier."""
        return "evidence_presence"

    def output_spec(self) -> list[MetricOutputSpec]:
        """Declare the single continuous output named by ``output_name``."""
        return [MetricOutputSpec.continuous_score(self._output_name)]

    async def compute_scores(self, input: MetricInput) -> MetricResult:
        """Return 1.0 when the configured evidence is present, else 0.0."""
        score = 1.0 if await self._evidence_present(input) else 0.0
        return MetricResult(outputs=[MetricOutput(name=self._output_name, value=score)])

    async def _evidence_present(self, metric_input: MetricInput) -> bool:
        """Tell whether the configured evidence satisfies the presence check.

        Logs the specific reason whenever the answer is ``False``.
        """
        log = logging.getLogger(__name__)
        name = self._evidence_name
        evidence = metric_input.candidate.evidence

        if evidence is None or evidence.get(name) is None:
            log.debug("evidence %r is not attached to the candidate; scoring 0", name)
            return False

        try:
            handle = await evidence.filesystem(name)
            if not await handle.exists():
                log.debug("evidence %r does not exist; scoring 0", name)
                return False
            if self._require_non_empty and not await handle.iter_paths(recursive=True):
                log.debug("evidence %r exists but is empty; scoring 0", name)
                return False
        except (KeyError, ValueError) as exc:
            # Best effort: unresolvable evidence scores 0 instead of failing
            # the metric, but the cause is surfaced rather than swallowed.
            log.warning("evidence %r could not be resolved (%r); scoring 0", name, exc)
            return False

        return True
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Deterministic gating + provenance comparison over an agent-eval run bundle.

This closes the design-doc B4 "CI/reporting" gap. Persistence of
``tasks.jsonl``/``attempts.jsonl``/``results.jsonl``/``summary.json``/``report.html``
is already handled by the SDK (``agent_eval.persistence.persist_run`` /
``write_dashboard``); this module adds the missing piece: a candidate-vs-baseline
gate (pass-rate, token/cost, runtime tie-breaker) plus deterministic provenance
checks.

The semantics intentionally mirror ``passrate_token_policy_gate.py`` so a summary
produced here is interchangeable with the legacy gate's baseline summary. The
difference is the input: this operates on a typed :class:`AgentEvalRunResult`
(metric scores + attempt metadata) instead of scanning ``result.json`` files.
Persistence of the run bundle (``tasks.jsonl``/``attempts.jsonl``/
``results.jsonl``/``summary.json``/``report.html``) is handled by
``agent_eval.persistence`` / ``write_dashboard``. This module adds the
candidate-vs-baseline gate (pass-rate, token/cost, runtime tie-breaker) plus
deterministic provenance checks.

Relationship to :class:`~nemo_evaluator_sdk.agent_eval.types.AgentEvalSummary`:
that summary reports the *mean score per metric output* over a run. The gate's
``pass_rate`` here is a different, intentional view — a per-task pass/fail count
against a reward threshold — so it is computed separately. Token/runtime/
provenance aggregation is delegated to
:class:`~nemo_evaluator_sdk.agent_eval.measurements.AttemptMeasurements` so the
measurement keys are read in exactly one place.
"""

from __future__ import annotations
Expand All @@ -23,13 +25,13 @@
from pathlib import Path
from typing import Any

from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalRunResult, AgentEvalTaskResult
from nemo_evaluator_sdk.agent_eval.measurements import AttemptMeasurements
from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunResult, AgentEvalTaskResult

# Metric outputs, in priority order, that represent a task's pass/reward signal.
DEFAULT_REWARD_OUTPUTS: tuple[str, ...] = ("verifier_reward", "agent_phase_success")

# Provenance fields collapsed into a single run-level summary (matches the
# legacy gate so baselines are interchangeable).
# Provenance fields collapsed into a single run-level summary.
_PROVENANCE_FIELDS: tuple[str, ...] = (
"commit_sha",
"commit_short",
Expand Down Expand Up @@ -91,7 +93,7 @@ def evaluate_gate(


def write_gate_report(report: GateReport, output_dir: str | Path, *, filename: str = "gate.json") -> Path:
"""Persist the gate report alongside the SDK run bundle."""
"""Persist the gate report alongside the run bundle."""
path = Path(output_dir)
path.mkdir(parents=True, exist_ok=True)
gate_path = path / filename
Expand All @@ -115,8 +117,13 @@ def summarize_run(
*,
reward_outputs: tuple[str, ...] = DEFAULT_REWARD_OUTPUTS,
) -> dict[str, Any]:
"""Aggregate pass-rate, token, runtime, and provenance for one run."""
attempts_by_task: dict[str, AgentEvalAttempt] = {attempt.task_id: attempt for attempt in result.attempts}
"""Aggregate pass-rate, token, runtime, and provenance for one run.

Token/runtime/provenance are read via :class:`AttemptMeasurements`; the
reward used for pass-rate prefers a scored metric output (``reward_outputs``)
and falls back to the attempt's recorded reward.
"""
attempts_by_task = {attempt.task_id: attempt for attempt in result.attempts}
reward_by_task = _rewards_by_task(result.results, reward_outputs)
task_ids = sorted({task.id for task in result.tasks} | set(attempts_by_task))

Expand All @@ -131,29 +138,28 @@ def summarize_run(

for task_id in task_ids:
attempt = attempts_by_task.get(task_id)
metadata = attempt.metadata if attempt is not None else {}
measurements = AttemptMeasurements.from_metadata(attempt.metadata if attempt is not None else {})

reward_value = _task_reward(task_id, reward_by_task, metadata)
reward_value = reward_by_task.get(task_id)
if reward_value is None:
reward_value = measurements.reward if measurements.reward is not None else 0.0
if reward_value >= 1.0:
passed += 1

total_tokens = metadata.get("total_tokens")
if isinstance(total_tokens, int):
token_sum += total_tokens
if measurements.total_tokens is not None:
token_sum += measurements.total_tokens
token_count += 1
else:
token_unavailable.append(task_id)

runtime_sec = _task_runtime_sec(metadata)
if runtime_sec is not None:
runtime_sum += runtime_sec
if measurements.runtime_sec is not None:
runtime_sum += measurements.runtime_sec
runtime_count += 1
else:
runtime_unavailable.append(task_id)

prov = metadata.get("provenance")
if isinstance(prov, dict):
provenance_inputs.append(prov)
if measurements.provenance:
provenance_inputs.append(measurements.provenance)

total = len(task_ids)
return {
Expand Down Expand Up @@ -404,28 +410,6 @@ def _numeric_output(task_result: AgentEvalTaskResult, name: str) -> float | None
return None


def _task_reward(task_id: str, reward_by_task: dict[str, float], metadata: dict[str, Any]) -> float:
if task_id in reward_by_task:
return reward_by_task[task_id]
reward = metadata.get("reward")
if reward is not None:
try:
return float(reward)
except (TypeError, ValueError):
return 0.0
return 1.0 if metadata.get("passed") is True else 0.0


def _task_runtime_sec(metadata: dict[str, Any]) -> float | None:
runtime_sec = metadata.get("runtime_sec")
if isinstance(runtime_sec, int | float):
return float(runtime_sec)
duration_ms = metadata.get("duration_ms")
if isinstance(duration_ms, int | float):
return float(duration_ms) / 1000.0
return None


def _aggregate_provenance(provenances: list[dict[str, Any]]) -> dict[str, Any]:
observed: dict[str, set[Any]] = {field_name: set() for field_name in _PROVENANCE_FIELDS}
for prov in provenances:
Expand Down
Loading
Loading