diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py index b4d9805374..963d869bb5 100644 --- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py @@ -5,9 +5,11 @@ from nemo_evaluator_sdk.agent_eval.dashboard import render_dashboard, write_dashboard from nemo_evaluator_sdk.agent_eval.evaluator import AgentEvaluator +from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig from nemo_evaluator_sdk.agent_eval.persistence import persist_run from nemo_evaluator_sdk.agent_eval.types import ( AgentAttemptRuntime, + AgentAttemptSource, AgentEvalAttempt, AgentEvalDiagnostic, AgentEvalMetricOutputCoverage, @@ -24,9 +26,12 @@ from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor, LocalFilesystemEvidence __all__ = [ + "AgentAttemptRuntime", + "AgentAttemptSource", "AgentEvalAttempt", "AgentEvalDiagnostic", "AgentEvalMetricOutputCoverage", + "AgentEvalOrchestrator", "AgentEvalRunConfig", "AgentEvalRunResult", "AgentEvalSummary", @@ -34,11 +39,11 @@ "AgentEvalTask", "AgentEvalTaskResult", "AgentEvaluator", - "AgentAttemptRuntime", "AgentOutput", "CandidateEvidence", "EvidenceDescriptor", "LocalFilesystemEvidence", + "OrchestratorConfig", "SemanticView", "ViewSignal", "persist_run", diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/attempts.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/attempts.py new file mode 100644 index 0000000000..dd85fcea5d --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/attempts.py @@ -0,0 +1,90 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Helpers for shaping :class:`AgentEvalAttempt` values from runtime artifacts. + +These are the runtime-agnostic pieces: the *scorable* status mapping and the +standard evidence-key builder. Platform-specific attempt construction (reading +proprietary artifact layouts, extra evidence keys) composes these in the adapter. +""" + +from __future__ import annotations + +from pathlib import Path + +from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttemptStatus +from nemo_evaluator_sdk.values.evidence import EvidenceDescriptor + + +def resolve_attempt_status(agent_ok: bool) -> AgentEvalAttemptStatus: + """Map an agent-phase outcome to a *scorable* attempt status. + + :class:`~nemo_evaluator_sdk.agent_eval.evaluator.AgentEvaluator` excludes + ``status=="failed"`` from scoring (it short-circuits to a failed metric + result). An agent that ran but did not succeed must still be scored — e.g. as + a ``0`` — so pass-rate gating counts it instead of dropping it. We therefore + use ``"partial"`` for an executed-but-unsuccessful agent and reserve + ``"failed"`` for genuine attempt-*production* failures (which a runtime + surfaces by raising, not by emitting an unscorable attempt). + """ + return "completed" if agent_ok else "partial" + + +def standard_evidence_descriptors( + *, + logs_dir: str | Path, + final_state_dir: str | Path, + trace_path: str | Path | None = None, + initial_state_ref: str | None = None, + verifier_logs_dir: str | Path | None = None, + primary_log: str | None = None, +) -> dict[str, EvidenceDescriptor]: + """Build the documented evidence map for an agent-eval attempt. + + Standard keys: ``initial_state`` (task input filesystem, when staged), + ``trace`` (trajectory, ATIF-normalized when available), ``logs`` (agent log + dir), ``final_state`` (workspace), and ``verifier_logs`` (only when present). + Callers may add their own extension keys to the returned mapping. 
+ """ + descriptors: dict[str, EvidenceDescriptor] = {} + + if initial_state_ref: + descriptors["initial_state"] = EvidenceDescriptor( + kind="filesystem", + format="dir", + ref=str(initial_state_ref), + metadata={"role": "initial_state"}, + ) + + if trace_path is not None: + trace_name = Path(trace_path).name + descriptors["trace"] = EvidenceDescriptor( + kind="trace", + format="atif" if trace_name.startswith("atif") else "json", + ref=str(trace_path), + ) + + logs_metadata = {"primary_log": primary_log} if primary_log else {} + descriptors["logs"] = EvidenceDescriptor( + kind="logs", + format="dir", + ref=str(logs_dir), + metadata=logs_metadata, + ) + + descriptors["final_state"] = EvidenceDescriptor( + kind="filesystem", + format="dir", + ref=str(final_state_dir), + metadata={"role": "final_state"}, + ) + + if verifier_logs_dir is not None and Path(verifier_logs_dir).exists(): + descriptors["verifier_logs"] = EvidenceDescriptor( + kind="logs", + format="dir", + ref=str(verifier_logs_dir), + metadata={"role": "verifier"}, + ) + + return descriptors diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/common_metrics.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/common_metrics.py new file mode 100644 index 0000000000..8cece6a5ad --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/common_metrics.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Reusable agent-eval metrics. + +``AgentPhaseSuccessMetric`` reads the agent-phase outcome stamped on attempt +metadata. ``EvidencePresenceMetric`` is a genuine *metric-over-evidence*: it +scores by inspecting ``candidate.evidence`` (a filesystem evidence handle) +rather than a reward written into metadata — the value proposition of scoring +over evidence instead of trusting a verifier's stamped reward. 
+""" + +from __future__ import annotations + +from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult + + +class AgentPhaseSuccessMetric: + """Score 1.0 when the agent phase exited successfully, else 0.0. + + The metric ``type`` is overridable via the ``metric_type`` class attribute so + callers can namespace it; the output name stays ``agent_phase_success`` (which + gating reads as a reward signal). + """ + + metric_type: str = "agent_phase_success" + + @property + def type(self) -> str: + return self.metric_type + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("agent_phase_success")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + agent_ok = bool(input.candidate.metadata.get("agent_ok")) + return MetricResult(outputs=[MetricOutput(name="agent_phase_success", value=1.0 if agent_ok else 0.0)]) + + +class EvidencePresenceMetric: + """Score 1.0 when a named filesystem evidence directory exists (and is non-empty). + + Reads ``candidate.evidence`` directly — the canonical metric-over-evidence + pattern — so the score reflects what the agent actually produced on disk, + not a reward stamped into metadata by a verifier. 
+ """ + + def __init__( + self, + *, + evidence_name: str = "final_state", + output_name: str = "evidence_present", + require_non_empty: bool = True, + ) -> None: + self._evidence_name = evidence_name + self._output_name = output_name + self._require_non_empty = require_non_empty + + @property + def type(self) -> str: + return "evidence_presence" + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score(self._output_name)] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + score = 0.0 + evidence = input.candidate.evidence + if evidence is not None and evidence.get(self._evidence_name) is not None: + try: + handle = await evidence.filesystem(self._evidence_name) + if await handle.exists(): + if self._require_non_empty: + score = 1.0 if await handle.iter_paths(recursive=True) else 0.0 + else: + score = 1.0 + except (KeyError, ValueError): + score = 0.0 + return MetricResult(outputs=[MetricOutput(name=self._output_name, value=score)]) diff --git a/tests/agentic-use/runtimes/shared/reporting.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/gating.py similarity index 85% rename from tests/agentic-use/runtimes/shared/reporting.py rename to packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/gating.py index 34b78fbcb7..f6a7d04cfb 100644 --- a/tests/agentic-use/runtimes/shared/reporting.py +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/gating.py @@ -1,19 +1,21 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 """Deterministic gating + provenance comparison over an agent-eval run bundle. -This closes the design-doc B4 "CI/reporting" gap. 
Persistence of -``tasks.jsonl``/``attempts.jsonl``/``results.jsonl``/``summary.json``/``report.html`` -is already handled by the SDK (``agent_eval.persistence.persist_run`` / -``write_dashboard``); this module adds the missing piece: a candidate-vs-baseline -gate (pass-rate, token/cost, runtime tie-breaker) plus deterministic provenance -checks. - -The semantics intentionally mirror ``passrate_token_policy_gate.py`` so a summary -produced here is interchangeable with the legacy gate's baseline summary. The -difference is the input: this operates on a typed :class:`AgentEvalRunResult` -(metric scores + attempt metadata) instead of scanning ``result.json`` files. +Persistence of the run bundle (``tasks.jsonl``/``attempts.jsonl``/ +``results.jsonl``/``summary.json``/``report.html``) is handled by +``agent_eval.persistence`` / ``write_dashboard``. This module adds the candidate +-vs-baseline gate (pass-rate, token/cost, runtime tie-breaker) plus deterministic +provenance checks. + +Relationship to :class:`~nemo_evaluator_sdk.agent_eval.types.AgentEvalSummary`: +that summary reports the *mean score per metric output* over a run. The gate's +``pass_rate`` here is a different, intentional view — a per-task pass/fail count +against a reward threshold — so it is computed separately. Token/runtime/ +provenance aggregation is delegated to +:class:`~nemo_evaluator_sdk.agent_eval.measurements.AttemptMeasurements` so the +measurement keys are read in exactly one place. """ from __future__ import annotations @@ -23,13 +25,13 @@ from pathlib import Path from typing import Any -from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalRunResult, AgentEvalTaskResult +from nemo_evaluator_sdk.agent_eval.measurements import AttemptMeasurements +from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunResult, AgentEvalTaskResult # Metric outputs, in priority order, that represent a task's pass/reward signal. DEFAULT_REWARD_OUTPUTS: tuple[str, ...] 
= ("verifier_reward", "agent_phase_success") -# Provenance fields collapsed into a single run-level summary (matches the -# legacy gate so baselines are interchangeable). +# Provenance fields collapsed into a single run-level summary. _PROVENANCE_FIELDS: tuple[str, ...] = ( "commit_sha", "commit_short", @@ -91,7 +93,7 @@ def evaluate_gate( def write_gate_report(report: GateReport, output_dir: str | Path, *, filename: str = "gate.json") -> Path: - """Persist the gate report alongside the SDK run bundle.""" + """Persist the gate report alongside the run bundle.""" path = Path(output_dir) path.mkdir(parents=True, exist_ok=True) gate_path = path / filename @@ -115,8 +117,13 @@ def summarize_run( *, reward_outputs: tuple[str, ...] = DEFAULT_REWARD_OUTPUTS, ) -> dict[str, Any]: - """Aggregate pass-rate, token, runtime, and provenance for one run.""" - attempts_by_task: dict[str, AgentEvalAttempt] = {attempt.task_id: attempt for attempt in result.attempts} + """Aggregate pass-rate, token, runtime, and provenance for one run. + + Token/runtime/provenance are read via :class:`AttemptMeasurements`; the + reward used for pass-rate prefers a scored metric output (``reward_outputs``) + and falls back to the attempt's recorded reward. 
+ """ + attempts_by_task = {attempt.task_id: attempt for attempt in result.attempts} reward_by_task = _rewards_by_task(result.results, reward_outputs) task_ids = sorted({task.id for task in result.tasks} | set(attempts_by_task)) @@ -131,29 +138,28 @@ def summarize_run( for task_id in task_ids: attempt = attempts_by_task.get(task_id) - metadata = attempt.metadata if attempt is not None else {} + measurements = AttemptMeasurements.from_metadata(attempt.metadata if attempt is not None else {}) - reward_value = _task_reward(task_id, reward_by_task, metadata) + reward_value = reward_by_task.get(task_id) + if reward_value is None: + reward_value = measurements.reward if measurements.reward is not None else 0.0 if reward_value >= 1.0: passed += 1 - total_tokens = metadata.get("total_tokens") - if isinstance(total_tokens, int): - token_sum += total_tokens + if measurements.total_tokens is not None: + token_sum += measurements.total_tokens token_count += 1 else: token_unavailable.append(task_id) - runtime_sec = _task_runtime_sec(metadata) - if runtime_sec is not None: - runtime_sum += runtime_sec + if measurements.runtime_sec is not None: + runtime_sum += measurements.runtime_sec runtime_count += 1 else: runtime_unavailable.append(task_id) - prov = metadata.get("provenance") - if isinstance(prov, dict): - provenance_inputs.append(prov) + if measurements.provenance: + provenance_inputs.append(measurements.provenance) total = len(task_ids) return { @@ -404,28 +410,6 @@ def _numeric_output(task_result: AgentEvalTaskResult, name: str) -> float | None return None -def _task_reward(task_id: str, reward_by_task: dict[str, float], metadata: dict[str, Any]) -> float: - if task_id in reward_by_task: - return reward_by_task[task_id] - reward = metadata.get("reward") - if reward is not None: - try: - return float(reward) - except (TypeError, ValueError): - return 0.0 - return 1.0 if metadata.get("passed") is True else 0.0 - - -def _task_runtime_sec(metadata: dict[str, Any]) -> float | 
None: - runtime_sec = metadata.get("runtime_sec") - if isinstance(runtime_sec, int | float): - return float(runtime_sec) - duration_ms = metadata.get("duration_ms") - if isinstance(duration_ms, int | float): - return float(duration_ms) / 1000.0 - return None - - def _aggregate_provenance(provenances: list[dict[str, Any]]) -> dict[str, Any]: observed: dict[str, set[Any]] = {field_name: set() for field_name in _PROVENANCE_FIELDS} for prov in provenances: diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/measurements.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/measurements.py new file mode 100644 index 0000000000..0ae2330415 --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/measurements.py @@ -0,0 +1,121 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Typed view over the measurement keys carried on ``AgentEvalAttempt.metadata``. + +Gating and reporting read these typed fields instead of reaching into the +attempt metadata dict by magic string. The keys are still *stored* on +``metadata`` (so the loose-dict contract continues to work during migration); +this module is the single, documented place that names them and applies the +fallbacks (``duration_ms`` → ``runtime_sec``, ``passed`` → ``reward``). +""" + +from __future__ import annotations + +from collections.abc import Mapping +from typing import Any + +from pydantic import BaseModel, ConfigDict, Field + +# Token-measurement keys carried on attempt metadata (and in result.json["metrics"]). +TOKEN_KEYS: tuple[str, ...] = ( + "prompt_tokens", + "completion_tokens", + "total_tokens", + "cache_creation_tokens", + "cache_read_tokens", +) + + +class AttemptMeasurements(BaseModel): + """Numeric measurements + provenance projected from attempt metadata. + + This is the public, typed attempt-measurement contract. 
Reporting/gating + consume it via :meth:`from_metadata`; producers may keep writing the same + keys onto ``AgentEvalAttempt.metadata`` and round-trip via :meth:`to_metadata`. + """ + + model_config = ConfigDict(extra="forbid") + + prompt_tokens: int | None = None + completion_tokens: int | None = None + total_tokens: int | None = None + cache_creation_tokens: int | None = None + cache_read_tokens: int | None = None + runtime_sec: float | None = None + reward: float | None = None + passed: bool | None = None + provenance: dict[str, Any] = Field(default_factory=dict) + + @classmethod + def from_metadata(cls, metadata: Mapping[str, Any] | None) -> AttemptMeasurements: + """Project loose attempt metadata onto the typed contract. + + Applies the historical fallbacks so callers don't re-implement them: + ``runtime_sec`` falls back to ``duration_ms / 1000``; ``reward`` falls + back to ``1.0``/``0.0`` derived from ``passed`` when no explicit reward + is recorded. + """ + metadata = metadata or {} + + tokens = {key: _as_int(metadata.get(key)) for key in TOKEN_KEYS} + runtime_sec = _runtime_sec(metadata) + passed = metadata.get("passed") + passed = bool(passed) if isinstance(passed, bool) else None + reward = _reward(metadata, passed) + provenance = metadata.get("provenance") + provenance = dict(provenance) if isinstance(provenance, Mapping) else {} + + return cls( + **tokens, + runtime_sec=runtime_sec, + reward=reward, + passed=passed, + provenance=provenance, + ) + + def to_metadata(self) -> dict[str, Any]: + """Project back onto the loose metadata keys (only set values).""" + payload: dict[str, Any] = {} + for key in TOKEN_KEYS: + value = getattr(self, key) + if value is not None: + payload[key] = value + if self.runtime_sec is not None: + payload["runtime_sec"] = self.runtime_sec + if self.reward is not None: + payload["reward"] = self.reward + if self.passed is not None: + payload["passed"] = self.passed + if self.provenance: + payload["provenance"] = 
dict(self.provenance) + return payload + + +def _as_int(value: Any) -> int | None: + # bool is an int subclass; never treat True/False as a token count. + if isinstance(value, bool): + return None + return value if isinstance(value, int) else None + + +def _runtime_sec(metadata: Mapping[str, Any]) -> float | None: + runtime_sec = metadata.get("runtime_sec") + if isinstance(runtime_sec, int | float) and not isinstance(runtime_sec, bool): + return float(runtime_sec) + duration_ms = metadata.get("duration_ms") + if isinstance(duration_ms, int | float) and not isinstance(duration_ms, bool): + return float(duration_ms) / 1000.0 + return None + + +def _reward(metadata: Mapping[str, Any], passed: bool | None) -> float | None: + reward = metadata.get("reward") + if reward is not None: + try: + return float(reward) + except (TypeError, ValueError): + return None + if passed is not None: + return 1.0 if passed else 0.0 + return None diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/orchestrator.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/orchestrator.py new file mode 100644 index 0000000000..1fb436f809 --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/orchestrator.py @@ -0,0 +1,153 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Generic orchestration: agent/scoring run + deterministic gate. + +Wraps :class:`~nemo_evaluator_sdk.agent_eval.evaluator.AgentEvaluator` with the +gate from :mod:`nemo_evaluator_sdk.agent_eval.gating`. It is intentionally lean — +the only collaborators are the tasks and a target (online) or attempts (offline). +Two seams keep it backend-agnostic: + +* **verify-enable is inverted to data**: callers pass ``extra_metrics`` to append + (e.g. a verifier-reward metric). The orchestrator never introspects a runtime's + config to decide what to score. 
+* **environment prep is an injected hook**: ``prepare_task`` (e.g. "build the task + image") runs per task before execution, so Docker/build specifics live in the + caller, not here. + +The common Docker case stays a few lines via :meth:`AgentEvalOrchestrator`'s plain +constructor (config + optional ``extra_metrics``); richer wiring is opt-in. +""" + +from __future__ import annotations + +from collections.abc import Callable, Sequence +from dataclasses import dataclass +from pathlib import Path + +from nemo_evaluator_sdk.agent_eval.evaluator import AgentEvaluator +from nemo_evaluator_sdk.agent_eval.gating import ( + GateThresholds, + evaluate_gate, + load_baseline_summary, + write_gate_report, +) +from nemo_evaluator_sdk.agent_eval.types import ( + AgentAttemptRuntime, + AgentEvalAttempt, + AgentEvalRunConfig, + AgentEvalRunResult, + AgentEvalTask, +) +from nemo_evaluator_sdk.metrics.protocol import Metric + + +@dataclass(frozen=True) +class OrchestratorConfig: + """Run-level knobs shared by the online and offline paths.""" + + parallelism: int = 1 + write_dashboard: bool = True + write_gate: bool = True + gate_thresholds: GateThresholds | None = None + baseline_summary_path: Path | None = None + + +class AgentEvalOrchestrator: + """Run tasks through ``AgentEvaluator`` (online or offline) and apply the gate.""" + + def __init__( + self, + *, + config: OrchestratorConfig | None = None, + extra_metrics: Sequence[Metric] = (), + ) -> None: + self.config = config or OrchestratorConfig() + self._extra_metrics = list(extra_metrics) + + async def run_tasks( + self, + tasks: Sequence[AgentEvalTask], + *, + target: AgentAttemptRuntime, + benchmark: dict[str, object] | None = None, + output_dir: Path | None = None, + run_id: str | None = None, + prepare_task: Callable[[AgentEvalTask], None] | None = None, + ) -> AgentEvalRunResult: + """Online path: optionally prep each task, run the runtime, score, gate.""" + prepared = [self._with_extra_metrics(task) for task in tasks] + 
if prepare_task is not None: + for task in prepared: + prepare_task(task) + + result = await AgentEvaluator().run( + tasks=prepared, + target=target, + config=self._run_config(output_dir=output_dir, run_id=run_id, benchmark=benchmark), + ) + self._maybe_write_gate(result) + return result + + async def score_attempts( + self, + tasks: Sequence[AgentEvalTask], + *, + attempts: Sequence[AgentEvalAttempt], + benchmark: dict[str, object] | None = None, + output_dir: Path | None = None, + run_id: str | None = None, + ) -> AgentEvalRunResult: + """Offline path: score already-captured attempts (no agent execution).""" + prepared = [self._with_extra_metrics(task) for task in tasks] + result = await AgentEvaluator().run( + tasks=prepared, + attempts=list(attempts), + config=self._run_config(output_dir=output_dir, run_id=run_id, benchmark=benchmark), + ) + self._maybe_write_gate(result) + return result + + def _run_config( + self, + *, + output_dir: Path | None, + run_id: str | None, + benchmark: dict[str, object] | None, + ) -> AgentEvalRunConfig: + return AgentEvalRunConfig( + output_dir=output_dir, + run_id=run_id, + parallelism=self.config.parallelism, + write_dashboard=self.config.write_dashboard, + benchmark=dict(benchmark or {}), + ) + + def _with_extra_metrics(self, task: AgentEvalTask) -> AgentEvalTask: + """Append injected metrics, honoring task-authored metrics and avoiding duplicate types.""" + if not self._extra_metrics: + return task + metrics: list[Metric] = list(task.metrics) + existing_types = {type(metric) for metric in metrics} + appended = [metric for metric in self._extra_metrics if type(metric) not in existing_types] + if not appended: + return task + return task.model_copy(update={"metrics": metrics + appended}) + + def _maybe_write_gate(self, result: AgentEvalRunResult) -> None: + if not (self.config.write_gate and result.output_dir is not None): + return + baseline = ( + load_baseline_summary(self.config.baseline_summary_path) + if 
self.config.baseline_summary_path is not None + else None + ) + report = evaluate_gate(result, thresholds=self.config.gate_thresholds, baseline_summary=baseline) + write_gate_report(report, result.output_dir) + + +__all__ = [ + "AgentEvalOrchestrator", + "GateThresholds", + "OrchestratorConfig", +] diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/coding_agent.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/coding_agent.py new file mode 100644 index 0000000000..a2d7ac9e44 --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/coding_agent.py @@ -0,0 +1,291 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Plug-and-play seam for coding-agent CLIs (codex/claude/cursor/...). + +The split that makes these "plug-and-play": + +* :class:`CliAgentDriver` is the **driver** — a generic ``AgentAttemptRuntime`` + that runs a CLI which reads a prompt on stdin and writes its final answer to a + file, then captures workspace/stdout/stderr/final-output as evidence. This is + the stable, reusable part. +* :class:`CodingAgentSpec` is the **per-agent adapter** — the bespoke part: how to + build the CLI command and (optionally) how to parse that agent's trajectory into + extra evidence. Implementing a new agent means subclassing this, not rewriting a + runtime. + +The shipped :class:`ClaudeCodeSpec` / :class:`CursorAgentSpec` are *reference* +command builders: the driver and evidence contract are stable, but each CLI's +exact flags and trajectory format are the integrator's responsibility and may +drift with upstream releases. Auth is the caller's concern (inject via env); +nothing here hardcodes credentials. 
+""" + +from __future__ import annotations + +import asyncio +import json +import shutil +import subprocess +from collections.abc import Awaitable, Callable, Sequence +from dataclasses import dataclass +from pathlib import Path + +from nemo_evaluator_sdk.agent_eval.types import ( + AgentEvalAttempt, + AgentEvalRunConfig, + AgentEvalTask, + AgentOutput, +) +from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor + +DEFAULT_CODING_AGENT_TIMEOUT_S = 600 +ProcessFactory = Callable[..., Awaitable[object]] + + +@dataclass(frozen=True) +class RunArtifacts: + """Resolved on-disk paths for one coding-agent attempt.""" + + evidence_dir: Path + workspace_dir: Path + prompt_path: Path + task_path: Path + stdout_path: Path + stderr_path: Path + final_output_path: Path + + +class CodingAgentSpec: + """Per-agent adapter: prompt, command, and trajectory→evidence parsing. + + Subclass and implement :meth:`build_command`. Override :meth:`build_prompt`, + :meth:`extra_evidence`, or :meth:`final_output` for agent-specific behavior. + """ + + name: str = "coding_agent" + binary: str = "" + model: str | None = None + + def build_prompt(self, task: AgentEvalTask) -> str: + """Default instruction prompt (override per agent if needed).""" + return f"Task id: {task.id}\nIntent: {task.intent}\nInputs: {task.inputs}\n" + + def build_command(self, artifacts: RunArtifacts) -> list[str]: + """Return the argv to launch; the prompt is delivered on stdin.""" + raise NotImplementedError + + def extra_evidence(self, artifacts: RunArtifacts) -> dict[str, EvidenceDescriptor]: + """Optional per-agent evidence (e.g. a parsed trajectory). 
Default: none."""
+        return {}
+
+    def final_output(self, artifacts: RunArtifacts, stdout_text: str) -> str:
+        """Final answer text: prefer the written final-output file, else stdout."""
+        if artifacts.final_output_path.exists():
+            return artifacts.final_output_path.read_text(encoding="utf-8")
+        return stdout_text
+
+
+class CliAgentDriver:
+    """Generic ``AgentAttemptRuntime`` for stdin-prompt coding-agent CLIs.
+
+    For each task the driver writes the prompt and task JSON into a per-task
+    evidence directory, launches the spec's command with the prompt on stdin,
+    and records stdout/stderr next to them. A non-zero exit status, a timeout,
+    or any error while spawning/communicating yields a ``status="failed"``
+    attempt carrying an ``error.json`` descriptor instead of raising.
+    """
+
+    def __init__(
+        self,
+        spec: CodingAgentSpec,
+        *,
+        work_root: str | Path | None = None,
+        timeout_s: int = DEFAULT_CODING_AGENT_TIMEOUT_S,
+        process_factory: ProcessFactory | None = None,
+    ) -> None:
+        """Bind the driver to one agent spec.
+
+        Args:
+            spec: Per-agent adapter; must define a non-empty ``binary``.
+            work_root: Root for per-task evidence directories. When omitted the
+                root is derived from the run config in :meth:`_artifacts`.
+            timeout_s: Wall-clock limit for one CLI invocation, in seconds.
+            process_factory: Replacement for ``asyncio.create_subprocess_exec``
+                (the PATH check in :meth:`run_tasks` is skipped when one is given).
+
+        Raises:
+            ValueError: If ``spec.binary`` is empty.
+        """
+        if not spec.binary:
+            raise ValueError(f"{type(spec).__name__} must set a non-empty `binary`")
+        self._spec = spec
+        self._work_root = Path(work_root).expanduser() if work_root is not None else None
+        self._timeout_s = timeout_s
+        self._process_factory = process_factory or asyncio.create_subprocess_exec
+
+    async def run_tasks(
+        self,
+        tasks: Sequence[AgentEvalTask],
+        config: AgentEvalRunConfig | None = None,
+    ) -> Sequence[AgentEvalAttempt]:
+        """Run every task through the CLI, at most ``config.parallelism`` at a time.
+
+        Returns one attempt per task, in task order.
+
+        Raises:
+            RuntimeError: If the default process factory is in use and the
+                spec's binary cannot be found on ``PATH``.
+        """
+        if self._process_factory is asyncio.create_subprocess_exec and shutil.which(self._spec.binary) is None:
+            raise RuntimeError(f"{self._spec.name} CLI executable {self._spec.binary!r} was not found on PATH")
+
+        resolved = config or AgentEvalRunConfig()
+        semaphore = asyncio.Semaphore(resolved.parallelism)
+
+        async def run_one(index: int, task: AgentEvalTask) -> AgentEvalAttempt:
+            async with semaphore:
+                return await self._run_task(index, task, resolved)
+
+        return await asyncio.gather(*(run_one(index, task) for index, task in enumerate(tasks)))
+
+    async def _run_task(self, index: int, task: AgentEvalTask, config: AgentEvalRunConfig) -> AgentEvalAttempt:
+        """Execute one task and convert the outcome into an attempt."""
+        artifacts = self._artifacts(index, task, config)
+        artifacts.evidence_dir.mkdir(parents=True, exist_ok=True)
+        artifacts.workspace_dir.mkdir(parents=True, exist_ok=True)
+
+        prompt = self._spec.build_prompt(task)
+        artifacts.prompt_path.write_text(prompt, encoding="utf-8")
+        artifacts.task_path.write_text(task.model_dump_json(indent=2), encoding="utf-8")
+
+        command = self._spec.build_command(artifacts)
+        process: object | None = None
+        try:
+            process = await self._process_factory(
+                *command,
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+            stdout, stderr = await asyncio.wait_for(
+                process.communicate(prompt.encode("utf-8")),
+                timeout=self._timeout_s,
+            )
+        except asyncio.TimeoutError:
+            # wait_for() only cancels communicate(); without an explicit kill the
+            # CLI keeps running (and writing into the workspace) after we return.
+            await self._terminate(process)
+            return self._failed_attempt(
+                task,
+                artifacts,
+                TimeoutError(f"{self._spec.name} timed out after {self._timeout_s}s"),
+            )
+        except Exception as exc:
+            # Best-effort by design: any spawn/communicate failure becomes a failed
+            # attempt, but a child that did start must not be left behind.
+            await self._terminate(process)
+            return self._failed_attempt(task, artifacts, exc)
+
+        stdout_text = _decode(stdout)
+        stderr_text = _decode(stderr)
+        artifacts.stdout_path.write_text(stdout_text, encoding="utf-8")
+        artifacts.stderr_path.write_text(stderr_text, encoding="utf-8")
+
+        return_code = getattr(process, "returncode", 0)
+        if return_code:
+            return self._failed_attempt(
+                task,
+                artifacts,
+                RuntimeError(f"{self._spec.name} exited with status {return_code}: {stderr_text.strip()}"),
+            )
+
+        descriptors: dict[str, EvidenceDescriptor] = {
+            "workspace": EvidenceDescriptor(kind="filesystem", format="dir", ref=str(artifacts.workspace_dir)),
+            "prompt": EvidenceDescriptor(kind="text", format="txt", ref=str(artifacts.prompt_path)),
+            "task": EvidenceDescriptor(kind="json", format="json", ref=str(artifacts.task_path)),
+            "stdout": EvidenceDescriptor(kind="logs", format="txt", ref=str(artifacts.stdout_path)),
+            "stderr": EvidenceDescriptor(kind="logs", format="txt", ref=str(artifacts.stderr_path)),
+        }
+        # Spec-provided descriptors are merged last, so they may override the standard keys.
+        descriptors.update(self._spec.extra_evidence(artifacts))
+
+        return AgentEvalAttempt(
+            id=f"{task.id}:{self._spec.name}",
+            task_id=task.id,
+            status="completed",
+            output=AgentOutput(
+                text=self._spec.final_output(artifacts, stdout_text),
+                metadata={
+                    "runtime": self._spec.name,
+                    "agent_model": self._spec.model,
+                    "evidence_dir": str(artifacts.evidence_dir),
+                },
+            ),
+            evidence=CandidateEvidence(descriptors=descriptors, metadata={"runtime": self._spec.name}),
+            metadata={
+                "runtime": self._spec.name,
+                "agent_model": self._spec.model,
+                "generated": True,
+            },
+        )
+
+    @staticmethod
+    async def _terminate(process: object | None) -> None:
+        """Kill and reap a still-running child so it cannot outlive its attempt.
+
+        Attributes are looked up defensively because ``process_factory`` may
+        return a test double that implements only part of the process API.
+        """
+        if process is None or getattr(process, "returncode", None) is not None:
+            return
+        kill = getattr(process, "kill", None)
+        if kill is None:
+            return
+        try:
+            kill()
+        except ProcessLookupError:
+            # The child exited between the returncode check and kill().
+            return
+        wait = getattr(process, "wait", None)
+        if wait is not None:
+            await wait()
+
+    def _failed_attempt(self, task: AgentEvalTask, artifacts: RunArtifacts, exc: Exception) -> AgentEvalAttempt:
+        """Write ``error.json`` into the evidence dir and return a ``failed`` attempt describing ``exc``."""
+        error_path = artifacts.evidence_dir / "error.json"
+        error_path.write_text(
+            json.dumps({"error_type": exc.__class__.__name__, "error": str(exc)}) + "\n", encoding="utf-8"
+        )
+        return AgentEvalAttempt(
+            id=f"{task.id}:{self._spec.name}",
+            task_id=task.id,
+            status="failed",
+            output=None,
+            evidence=CandidateEvidence(
+                descriptors={"error": EvidenceDescriptor(kind="error", format="json", ref=str(error_path))},
+                metadata={"runtime": self._spec.name},
+            ),
+            metadata={
+                "runtime": self._spec.name,
+                "error_type": exc.__class__.__name__,
+                "error": str(exc),
+            },
+        )
+
+    def _artifacts(self, index: int, task: AgentEvalTask, config: AgentEvalRunConfig) -> RunArtifacts:
+        """Resolve the per-task paths.
+
+        The root is ``work_root`` when given, else ``<output_dir or cwd>/evidence/<spec name>``;
+        the task directory is the sanitized task id, or ``task-<index>`` when that is empty.
+        """
+        root = self._work_root or ((config.output_dir or Path.cwd()) / "evidence" / self._spec.name)
+        evidence_dir = Path(root) / (_safe_path_name(task.id) or f"task-{index}")
+        return RunArtifacts(
+            evidence_dir=evidence_dir,
+            workspace_dir=evidence_dir / "workspace",
+            prompt_path=evidence_dir / "prompt.txt",
+            task_path=evidence_dir / "task.json",
+            stdout_path=evidence_dir / "stdout.txt",
+            stderr_path=evidence_dir / "stderr.txt",
+            final_output_path=evidence_dir / "final_output.txt",
+        )
+
+
+class ClaudeCodeSpec(CodingAgentSpec):
+    """Reference command builder for the Claude Code CLI (``claude``)."""
+
+    name = "claude_code"
+    binary = "claude"
+
+    def __init__(self, *, model: str | None = None, binary: str = "claude") -> None:
+        self.model = model
+        self.binary = binary
+
+    def build_command(self, artifacts: RunArtifacts) -> list[str]:
+        command = [
+            self.binary,
+            "--print",
+            "--output-format",
+            "stream-json",
+            "--add-dir",
+            str(artifacts.workspace_dir),
+        ]
+        if self.model is not None:
+            command.extend(["--model", self.model])
+        return command
+
+
+class CursorAgentSpec(CodingAgentSpec):
+    """Reference command 
builder for the Cursor Agent CLI (``cursor-agent``).""" + + name = "cursor_agent" + binary = "cursor-agent" + + def __init__(self, *, model: str | None = None, binary: str = "cursor-agent") -> None: + self.model = model + self.binary = binary + + def build_command(self, artifacts: RunArtifacts) -> list[str]: + command = [ + self.binary, + "--print", + "--output-format", + "text", + "--workdir", + str(artifacts.workspace_dir), + ] + if self.model is not None: + command.extend(["--model", self.model]) + return command + + +def _decode(value: bytes | str | None) -> str: + if value is None: + return "" + if isinstance(value, str): + return value + return value.decode("utf-8", errors="replace") + + +def _safe_path_name(value: str) -> str: + return "".join(char if char.isalnum() or char in "._-" else "-" for char in value).strip(".-")[:120] + + +__all__ = [ + "CliAgentDriver", + "ClaudeCodeSpec", + "CodingAgentSpec", + "CursorAgentSpec", + "RunArtifacts", +] diff --git a/tests/agentic-use/runtimes/shared/docker.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker.py similarity index 80% rename from tests/agentic-use/runtimes/shared/docker.py rename to packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker.py index 431d646806..482ca6e55e 100644 --- a/tests/agentic-use/runtimes/shared/docker.py +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker.py @@ -1,7 +1,12 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -"""Docker helpers for agentic-use runtimes.""" +"""Docker CLI helpers for agent-eval runtimes. + +These shell out to the ``docker`` CLI (stdlib ``subprocess`` only), so importing +this module does not require the ``agent-runtimes`` extra — only a working +``docker`` binary at call time. 
+""" from __future__ import annotations @@ -56,11 +61,8 @@ def docker_run( cmd.append(image) cmd += command - print(f"[agentic-runtime] $ {' '.join(redact_cmd_for_logging(cmd))}") - kwargs: dict[str, object] = {"check": False, "text": True} - if timeout is not None: - kwargs["timeout"] = timeout - return subprocess.run(cmd, **kwargs) + print(f"[agent-eval-runtime] $ {' '.join(redact_cmd_for_logging(cmd))}") + return subprocess.run(cmd, check=False, text=True, timeout=timeout) def docker_image_exists(tag: str) -> bool: @@ -72,12 +74,12 @@ def docker_image_exists(tag: str) -> bool: def build_dockerfile(dockerfile: os.PathLike[str], context_dir: os.PathLike[str], tag: str) -> None: """Build a Docker image from an explicit Dockerfile + build context.""" cmd = ["docker", "build", "-f", str(dockerfile), "-t", tag, str(context_dir)] - print(f"[agentic-runtime] $ {' '.join(cmd)}") + print(f"[agent-eval-runtime] $ {' '.join(cmd)}") subprocess.run(cmd, check=True) def build_task_image(task_dir: os.PathLike[str], tag: str) -> None: - """Build a task-specific Docker image from environment/Dockerfile.""" + """Build a task-specific Docker image from ``environment/Dockerfile``.""" from pathlib import Path root = Path(task_dir) diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker_sandbox.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker_sandbox.py index 8f84d8ba4f..fc03344c85 100644 --- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker_sandbox.py +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker_sandbox.py @@ -1,7 +1,16 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -"""Docker-backed sandbox runtime for agent-eval attempts.""" +"""Docker-backed sandbox runtime for agent-eval attempts. 
+ +Distinct from :mod:`nemo_evaluator_sdk.agent_eval.runtimes.environment`'s +``DockerEnvironmentProvider`` on purpose: this runtime drives the OpenAI Agents +SDK ``SandboxAgent`` (Python ``docker`` + ``agents``, behind the +``agent-runtimes`` extra) and *owns* the agent loop, whereas the environment +provider only shells out to the ``docker`` CLI to execute a caller-built command +inside a prebuilt task image. The two are not merged: this one is an +``AgentAttemptRuntime``; the other is an execution boundary used *by* runtimes. +""" from __future__ import annotations diff --git a/tests/agentic-use/runtimes/shared/environment.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment.py similarity index 54% rename from tests/agentic-use/runtimes/shared/environment.py rename to packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment.py index fe23893668..a08dfdc179 100644 --- a/tests/agentic-use/runtimes/shared/environment.py +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment.py @@ -1,20 +1,18 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -"""Environment provider boundary for agentic-use runtimes. +"""Process/filesystem environment boundary for agent-eval runtimes. -This is the design-doc's ``EnvironmentProvider`` boundary (section B2): it sits -*below* :class:`AgentAttemptRuntime` so a runtime never needs to know whether -the agent/verifier execute under Docker, locally, Harbor, or NeMo Gym. Today the -only implementation is :class:`DockerEnvironmentProvider`, which wraps -``shared/docker.py``. 
+This boundary sits *below* :class:`AgentAttemptRuntime` so a runtime never needs +to know whether the agent/verifier execute under Docker, locally, or another +filesystem-backed sandbox. It is intentionally a **process/filesystem** +abstraction, not a fully provider-neutral one: :class:`EnvRunSpec` carries +``mounts``/``extra_args`` as filesystem-environment hints. Providers that are +not filesystem-backed may ignore those fields. -Deviation from the doc sketch: the doc proposes ``run_agent(instruction, config) --> AgentEvalAttempt``. We keep the boundary at "execute a command in the -prepared environment" (returning an :class:`EnvCommandResult`) because each -backend builds its own command/env/mounts, and attempt construction is owned by -``shared/artifacts.py``. This keeps command-building and attempt-shaping out of -the environment layer so new providers only implement process execution. +A handle exposes a single :meth:`AbstractEnvironmentHandle.run` that takes a +``role`` ("agent" or "verifier"); :meth:`run_agent`/:meth:`run_verifier` are thin +role wrappers kept for caller convenience and protocol compatibility. """ from __future__ import annotations @@ -23,12 +21,16 @@ import subprocess from collections.abc import Callable from dataclasses import dataclass, field -from typing import Protocol, runtime_checkable +from typing import Literal, Protocol, runtime_checkable from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunConfig, AgentEvalTask -from runtimes.shared.docker import docker_run -from runtimes.shared.layout import task_image_tag +EnvRole = Literal["agent", "verifier"] + + +def default_image_tag(task_id: str) -> str: + """Default task → image-tag mapping (callers may inject their own).""" + return f"{task_id}:latest" @dataclass(frozen=True) @@ -45,7 +47,11 @@ def ok(self) -> bool: @dataclass class EnvRunSpec: - """How to execute one command inside an environment handle.""" + """How to execute one command inside an environment handle. 
+ + ``mounts``/``extra_args`` are filesystem-environment hints (e.g. Docker bind + mounts and extra CLI args). Non-filesystem providers may ignore them. + """ command: list[str] env: dict[str, str] = field(default_factory=dict) @@ -68,7 +74,7 @@ async def close(self) -> None: ... @runtime_checkable class AgentEnvironmentProvider(Protocol): - """Creates per-task environment handles. Pluggable: Docker now, Gym later.""" + """Creates per-task environment handles. Pluggable: Docker now, others later.""" async def prepare( self, @@ -77,19 +83,37 @@ async def prepare( ) -> AgentEnvironmentHandle: ... -class DockerEnvironmentHandle: - """Docker-backed environment handle bound to one task image.""" +class AbstractEnvironmentHandle: + """Base handle that routes both roles through a single :meth:`run`. - def __init__(self, image: str) -> None: - self.image = image + Concrete handles implement :meth:`run`; ``run_agent``/``run_verifier`` are + role-specialized wrappers so the duplicated phase methods don't have to be + reimplemented per backend. + """ + + async def run(self, spec: EnvRunSpec, role: EnvRole) -> EnvCommandResult: + raise NotImplementedError async def run_agent(self, spec: EnvRunSpec) -> EnvCommandResult: - return await self._run(spec) + return await self.run(spec, "agent") async def run_verifier(self, spec: EnvRunSpec) -> EnvCommandResult: - return await self._run(spec) + return await self.run(spec, "verifier") + + async def close(self) -> None: + return None + + +class DockerEnvironmentHandle(AbstractEnvironmentHandle): + """Docker-backed environment handle bound to one task image.""" + + def __init__(self, image: str) -> None: + self.image = image + + async def run(self, spec: EnvRunSpec, role: EnvRole = "agent") -> EnvCommandResult: + del role # Docker runs both roles identically against the same image. 
+ from nemo_evaluator_sdk.agent_eval.runtimes.docker import docker_run - async def _run(self, spec: EnvRunSpec) -> EnvCommandResult: try: result = await asyncio.to_thread( docker_run, @@ -105,15 +129,11 @@ async def _run(self, spec: EnvRunSpec) -> EnvCommandResult: return EnvCommandResult(exit_code=124, timed_out=True) return EnvCommandResult(exit_code=result.returncode) - async def close(self) -> None: - # `docker run --rm` cleans up the container; nothing persistent to release. - return None - class DockerEnvironmentProvider: """Default provider that maps each task to its built Docker image.""" - def __init__(self, *, image_tag_fn: Callable[[str], str] = task_image_tag) -> None: + def __init__(self, *, image_tag_fn: Callable[[str], str] = default_image_tag) -> None: self._image_tag_fn = image_tag_fn async def prepare( diff --git a/tests/agentic-use/runtimes/shared/environment_spec.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment_spec.py similarity index 92% rename from tests/agentic-use/runtimes/shared/environment_spec.py rename to packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment_spec.py index cd5630926f..a594705907 100644 --- a/tests/agentic-use/runtimes/shared/environment_spec.py +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment_spec.py @@ -1,7 +1,7 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -"""Reusable environment authoring for agentic-use tasks (design-doc B3). +"""Declarative environment authoring for agent-eval tasks. Moves task authoring away from an implicit "Dockerfile per task" toward a small, declarative ``environment.yaml`` spec, while keeping a Dockerfile escape hatch. @@ -28,8 +28,11 @@ (a Dockerfile + build context + target tag). 
The Dockerfile path is used as-is; an ``image``-based spec generates a tiny derived Dockerfile (``FROM `` plus optional ``pip install``). ``setup`` steps are carried as plan metadata — they are -runtime concerns (e.g. seed-providers) handled outside the image build — so this -module does not execute them. +runtime concerns handled outside the image build — so this module does not +execute them. + +``yaml`` is imported lazily so that importing this module costs nothing for +callers that never load a spec. """ from __future__ import annotations @@ -37,8 +40,6 @@ from dataclasses import dataclass, field from pathlib import Path -import yaml - ENVIRONMENT_SPEC_FILENAME = "environment.yaml" DEFAULT_DOCKERFILE_RELPATH = "environment/Dockerfile" @@ -69,6 +70,8 @@ def load_environment_spec(task_dir: str | Path) -> EnvironmentSpec: root = Path(task_dir) spec_path = root / ENVIRONMENT_SPEC_FILENAME if spec_path.is_file(): + import yaml + return _parse_spec(yaml.safe_load(spec_path.read_text(encoding="utf-8")) or {}, root) dockerfile = root / DEFAULT_DOCKERFILE_RELPATH @@ -160,7 +163,7 @@ def plan_task_build( def execute_build_plan(plan: BuildPlan) -> None: """Build the Docker image described by ``plan``.""" - from runtimes.shared.docker import build_dockerfile + from nemo_evaluator_sdk.agent_eval.runtimes.docker import build_dockerfile build_dockerfile(plan.dockerfile, plan.context_dir, plan.image_tag) diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/layout.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/layout.py new file mode 100644 index 0000000000..5c858cb037 --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/layout.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Generic on-disk layout for a single agent-eval task run. 
+ +A run produces an agent-log dir and a workspace dir under a run dir, plus a +written instruction file. Callers that need extra directories (e.g. preserved +platform state) add them on top of :class:`RunLayout`. +""" + +from __future__ import annotations + +from collections.abc import Callable +from dataclasses import dataclass +from pathlib import Path + + +@dataclass(frozen=True) +class RunLayout: + """Filesystem layout for one task run.""" + + run_dir: Path + agent_log_dir: Path + workspace_dir: Path + instruction_path: Path + + +def resolve_run_dir(output_dir: str | Path | None, default_factory: Callable[[], Path]) -> Path: + """Resolve the run dir to an absolute path. + + An explicit ``output_dir`` must be made absolute: run-dir subpaths are used as + Docker bind-mount sources, and Docker treats a relative ``-v`` source as a + (slash-free) named volume rather than a host directory. + """ + if output_dir is not None: + return Path(output_dir).resolve() + return default_factory() + + +def prepare_run_layout( + run_dir: str | Path, + instruction_text: str, + *, + agent_subdir: str = "agent", + workspace_subdir: str = "workspace", + instruction_name: str = "instruction.md", +) -> RunLayout: + """Create the agent/workspace dirs under ``run_dir`` and write the instruction.""" + run_dir = Path(run_dir) + agent_log_dir = run_dir / agent_subdir + workspace_dir = run_dir / workspace_subdir + agent_log_dir.mkdir(parents=True, exist_ok=True) + workspace_dir.mkdir(parents=True, exist_ok=True) + + instruction_path = agent_log_dir / instruction_name + instruction_path.write_text(instruction_text, encoding="utf-8") + + return RunLayout( + run_dir=run_dir, + agent_log_dir=agent_log_dir, + workspace_dir=workspace_dir, + instruction_path=instruction_path, + ) diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/verify.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/verify.py new file mode 100644 index 
0000000000..7e1b0fb0c0 --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/verify.py @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Generic verifier-phase mechanic: collect a reward + stamp attempt metadata. + +This is the backend-agnostic core. *What* the verifier runs (command, env, +mounts) and *how* it is invoked are caller concerns — the caller executes its +verifier through an environment handle, then uses :func:`collect_verifier_outcome` +to read the reward/stdout convention out of the verifier's log dir, and +:func:`apply_verify_to_metadata` to stamp the result onto an attempt so a +reward metric can score it. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any + + +@dataclass(frozen=True) +class VerifierOutcome: + """Result of a verifier phase for one task.""" + + ran: bool + passed: bool + reward: int + exit_code: int + stdout: str + verifier_log_dir: Path | None + + +def skipped_outcome() -> VerifierOutcome: + """Outcome representing a verifier that did not run.""" + return VerifierOutcome(ran=False, passed=False, reward=0, exit_code=0, stdout="", verifier_log_dir=None) + + +def collect_verifier_outcome( + *, + ok: bool, + exit_code: int, + log_dir: str | Path, + reward_filename: str = "reward.txt", + stdout_filename: str = "test-stdout.txt", +) -> VerifierOutcome: + """Build a :class:`VerifierOutcome` from a verifier run's log dir. + + Reads ``reward.txt`` (``1``/``0``) when present; otherwise derives the reward + from ``ok`` and writes the file so reruns are stable. Reads ``test-stdout.txt`` + when present. 
+ """ + log_dir = Path(log_dir) + passed = ok + + stdout = "" + stdout_path = log_dir / stdout_filename + if stdout_path.is_file(): + stdout = stdout_path.read_text(encoding="utf-8", errors="replace") + + reward_path = log_dir / reward_filename + if reward_path.is_file(): + reward = 1 if reward_path.read_text(encoding="utf-8").strip() == "1" else 0 + else: + reward = 1 if passed else 0 + reward_path.parent.mkdir(parents=True, exist_ok=True) + reward_path.write_text("1\n" if passed else "0\n", encoding="utf-8") + + return VerifierOutcome( + ran=True, + passed=passed, + reward=reward, + exit_code=exit_code, + stdout=stdout, + verifier_log_dir=log_dir, + ) + + +def apply_verify_to_metadata(metadata: dict[str, Any], outcome: VerifierOutcome) -> None: + """Stamp verifier reward/status onto attempt metadata for scoring + gating.""" + if not outcome.ran: + metadata.setdefault("verify_status", "skipped") + return + metadata["verify_status"] = "ok" if outcome.passed else "failed" + metadata["passed"] = outcome.passed + metadata["reward"] = outcome.reward + metadata["verifier_log_dir"] = str(outcome.verifier_log_dir) if outcome.verifier_log_dir else None diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py index 589a4efde1..03509ab038 100644 --- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py @@ -287,6 +287,18 @@ async def run_tasks( ) -> Sequence[AgentEvalAttempt]: ... +@runtime_checkable +class AgentAttemptSource(Protocol): + """Loads a previously captured attempt for a task from a stored artifact. + + The offline counterpart to :class:`AgentAttemptRuntime`: instead of executing + the agent, it adapts an already-produced run directory/file into an + :class:`AgentEvalAttempt` so it can be (re)scored through ``AgentEvaluator``. 
+ """ + + def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt: ... + + def _metric_coverage( results: Sequence[AgentEvalTaskResult], tasks: Sequence[AgentEvalTask] | None, diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_coding_agent.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_coding_agent.py new file mode 100644 index 0000000000..66e7715c07 --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_coding_agent.py @@ -0,0 +1,117 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Fixture-based tests for the coding-agent driver seam (no real CLIs).""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +from nemo_evaluator_sdk.agent_eval.runtimes.coding_agent import ( + ClaudeCodeSpec, + CliAgentDriver, + CodingAgentSpec, + CursorAgentSpec, + RunArtifacts, +) +from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunConfig, AgentEvalTask + + +class _EchoSpec(CodingAgentSpec): + name = "echo_agent" + binary = "echo-agent" + + def build_command(self, artifacts: RunArtifacts) -> list[str]: + return [self.binary, "--out", str(artifacts.final_output_path)] + + def extra_evidence(self, artifacts: RunArtifacts) -> dict: + from nemo_evaluator_sdk.values.evidence import EvidenceDescriptor + + return {"trajectory": EvidenceDescriptor(kind="trace", format="jsonl", ref=str(artifacts.stdout_path))} + + +class _FakeProcess: + def __init__(self, *, returncode: int, final_output_path: Path | None, stdout: bytes = b"", stderr: bytes = b""): + self.returncode = returncode + self._final_output_path = final_output_path + self._stdout = stdout + self._stderr = stderr + + async def communicate(self, stdin: bytes | None = None) -> tuple[bytes, bytes]: + if self._final_output_path is not None: + self._final_output_path.write_text("final answer", encoding="utf-8") + return self._stdout, 
self._stderr + + +def _factory(*, returncode: int = 0, write_final: bool = True): + captured: dict = {} + + async def factory(*command, **kwargs): + captured["command"] = list(command) + final_path = Path(command[command.index("--out") + 1]) if "--out" in command else None + return _FakeProcess( + returncode=returncode, + final_output_path=final_path if write_final else None, + stdout=b'{"event":"done"}\n', + ) + + return factory, captured + + +def _task() -> AgentEvalTask: + return AgentEvalTask(id="demo/task", intent="do the thing", inputs={"k": "v"}) + + +@pytest.mark.asyncio +async def test_driver_produces_completed_attempt_with_evidence(tmp_path: Path) -> None: + factory, captured = _factory() + driver = CliAgentDriver(_EchoSpec(), work_root=tmp_path, process_factory=factory) + + attempts = await driver.run_tasks([_task()], AgentEvalRunConfig()) + attempt = attempts[0] + + assert captured["command"][0] == "echo-agent" + assert attempt.status == "completed" + assert attempt.output is not None and attempt.output.text == "final answer" + # Standard + spec-provided evidence keys are present and paths exist on disk. 
+ assert {"workspace", "prompt", "task", "stdout", "stderr", "trajectory"} <= set(attempt.evidence.descriptors) + assert (tmp_path / "demo-task" / "prompt.txt").read_text(encoding="utf-8").startswith("Task id: demo/task") + + +@pytest.mark.asyncio +async def test_driver_marks_failed_on_nonzero_exit(tmp_path: Path) -> None: + factory, _ = _factory(returncode=1, write_final=False) + driver = CliAgentDriver(_EchoSpec(), work_root=tmp_path, process_factory=factory) + + attempt = (await driver.run_tasks([_task()]))[0] + assert attempt.status == "failed" + assert attempt.output is None + assert "error" in attempt.evidence.descriptors + assert (tmp_path / "demo-task" / "error.json").exists() + + +def test_reference_specs_build_expected_commands(tmp_path: Path) -> None: + artifacts = RunArtifacts( + evidence_dir=tmp_path, + workspace_dir=tmp_path / "workspace", + prompt_path=tmp_path / "p", + task_path=tmp_path / "t", + stdout_path=tmp_path / "o", + stderr_path=tmp_path / "e", + final_output_path=tmp_path / "f", + ) + claude_cmd = ClaudeCodeSpec(model="claude-x").build_command(artifacts) + assert claude_cmd[0] == "claude" and "--model" in claude_cmd and "claude-x" in claude_cmd + + cursor_cmd = CursorAgentSpec().build_command(artifacts) + assert cursor_cmd[0] == "cursor-agent" and "--model" not in cursor_cmd + + +def test_driver_rejects_spec_without_binary(tmp_path: Path) -> None: + class _NoBinary(CodingAgentSpec): + def build_command(self, artifacts: RunArtifacts) -> list[str]: + return [] + + with pytest.raises(ValueError, match="non-empty"): + CliAgentDriver(_NoBinary(), work_root=tmp_path) diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_common_metrics.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_common_metrics.py new file mode 100644 index 0000000000..3e5f9361a2 --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_common_metrics.py @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for promoted attempt helpers and reusable metrics.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +from nemo_evaluator_sdk.agent_eval.attempts import resolve_attempt_status, standard_evidence_descriptors +from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric, EvidencePresenceMetric +from nemo_evaluator_sdk.metrics.protocol import CandidateOutput, DatasetRow, MetricInput +from nemo_evaluator_sdk.values.evidence import CandidateEvidence + + +def test_resolve_attempt_status_keeps_failed_agents_scorable() -> None: + assert resolve_attempt_status(True) == "completed" + assert resolve_attempt_status(False) == "partial" + + +def test_standard_evidence_descriptors_builds_doc_keys(tmp_path: Path) -> None: + logs = tmp_path / "agent" + workspace = tmp_path / "workspace" + verifier = tmp_path / "verifier" + logs.mkdir() + workspace.mkdir() + verifier.mkdir() # exists -> verifier_logs included + + descriptors = standard_evidence_descriptors( + logs_dir=logs, + final_state_dir=workspace, + trace_path=tmp_path / "atif_trajectory.json", + initial_state_ref=str(tmp_path / "seed"), + verifier_logs_dir=verifier, + primary_log="nat_agent.log", + ) + assert set(descriptors) == {"initial_state", "trace", "logs", "final_state", "verifier_logs"} + assert descriptors["trace"].format == "atif" + assert descriptors["logs"].metadata["primary_log"] == "nat_agent.log" + + # verifier_logs omitted when the dir is absent. 
+ no_verifier = standard_evidence_descriptors( + logs_dir=logs, final_state_dir=workspace, verifier_logs_dir=tmp_path / "missing" + ) + assert "verifier_logs" not in no_verifier + + +@pytest.mark.asyncio +async def test_agent_phase_success_metric_reads_metadata_and_namespaces_type() -> None: + metric = AgentPhaseSuccessMetric() + assert metric.type == "agent_phase_success" + ok = await metric.compute_scores( + MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(metadata={"agent_ok": True})) + ) + assert ok.outputs[0].value == 1.0 + + class Namespaced(AgentPhaseSuccessMetric): + metric_type = "agentic_use_agent_phase" + + assert Namespaced().type == "agentic_use_agent_phase" + + +@pytest.mark.asyncio +async def test_evidence_presence_metric_scores_over_evidence(tmp_path: Path) -> None: + final_state = tmp_path / "workspace" + final_state.mkdir() + (final_state / "result.txt").write_text("done", encoding="utf-8") + evidence = CandidateEvidence( + descriptors=standard_evidence_descriptors(logs_dir=tmp_path / "agent", final_state_dir=final_state) + ) + + metric = EvidencePresenceMetric() + present = await metric.compute_scores( + MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(evidence=evidence)) + ) + assert present.outputs[0].value == 1.0 + + # Empty workspace -> non-empty requirement fails; no evidence -> 0. 
+ (final_state / "result.txt").unlink() + empty = await metric.compute_scores( + MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(evidence=evidence)) + ) + assert empty.outputs[0].value == 0.0 + missing = await metric.compute_scores(MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput())) + assert missing.outputs[0].value == 0.0 diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py index 5e0446b1eb..c051499030 100644 --- a/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py @@ -16,7 +16,7 @@ from nemo_evaluator_sdk.agent_eval.runtimes import docker_sandbox from nemo_evaluator_sdk.agent_eval.runtimes.docker_sandbox import ( DockerSandboxAgentRuntime, - SandboxSdk, + SandboxSDK, ) @@ -147,8 +147,8 @@ async def run(self, agent: _FakeSandboxAgent, prompt: str, *, run_config: _FakeR raise RuntimeError("sandbox run failed") -def _fake_sdk() -> SandboxSdk: - return SandboxSdk( +def _fake_sdk() -> SandboxSDK: + return SandboxSDK( Runner=_FakeRunner(), RunConfig=_FakeRunConfig, SandboxRunConfig=_FakeSandboxRunConfig, diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_environment.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_environment.py new file mode 100644 index 0000000000..b7df9a61d4 --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_environment.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tests for the promoted environment boundary + environment authoring.""" + +from __future__ import annotations + +import subprocess +from pathlib import Path + +import pytest +from nemo_evaluator_sdk.agent_eval.runtimes import docker as docker_mod +from nemo_evaluator_sdk.agent_eval.runtimes.environment import ( + DockerEnvironmentHandle, + DockerEnvironmentProvider, + EnvRunSpec, + default_image_tag, +) +from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec, plan_task_build +from nemo_evaluator_sdk.agent_eval.types import AgentEvalTask + + +@pytest.mark.asyncio +async def test_docker_handle_routes_roles_through_single_run(monkeypatch: pytest.MonkeyPatch) -> None: + calls: list[tuple[str, list[str]]] = [] + + def fake_docker_run(image: str, command: list[str], **kwargs: object) -> subprocess.CompletedProcess[str]: + calls.append((image, command)) + return subprocess.CompletedProcess(args=command, returncode=0) + + monkeypatch.setattr(docker_mod, "docker_run", fake_docker_run) + + handle = DockerEnvironmentHandle("img:latest") + spec = EnvRunSpec(command=["echo", "hi"]) + assert (await handle.run_agent(spec)).ok + assert (await handle.run_verifier(spec)).ok + assert calls == [("img:latest", ["echo", "hi"]), ("img:latest", ["echo", "hi"])] + + +@pytest.mark.asyncio +async def test_docker_handle_reports_timeout(monkeypatch: pytest.MonkeyPatch) -> None: + def fake_docker_run(image: str, command: list[str], **kwargs: object): + raise subprocess.TimeoutExpired(cmd=command, timeout=1) + + monkeypatch.setattr(docker_mod, "docker_run", fake_docker_run) + result = await DockerEnvironmentHandle("img").run(EnvRunSpec(command=["sleep"]), "agent") + assert result.timed_out and result.exit_code == 124 and not result.ok + + +@pytest.mark.asyncio +async def test_provider_uses_injected_image_tag_fn() -> None: + assert default_image_tag("t") == "t:latest" + provider = 
DockerEnvironmentProvider(image_tag_fn=lambda task_id: f"custom-{task_id}") + handle = await provider.prepare(AgentEvalTask(id="demo", intent="x", inputs={})) + assert isinstance(handle, DockerEnvironmentHandle) + assert handle.image == "custom-demo" + + +def test_environment_spec_yaml_dockerfile_and_plan(tmp_path: Path) -> None: + (tmp_path / "environment.yaml").write_text( + "environment:\n image: base:1\n dependencies:\n python: [pytest]\n setup: [seed]\n", + encoding="utf-8", + ) + spec = load_environment_spec(tmp_path) + assert spec.image == "base:1" and spec.python_dependencies == ["pytest"] + + plan = plan_task_build(tmp_path, "img:latest", generated_dir=tmp_path / "build") + content = plan.dockerfile.read_text(encoding="utf-8") + assert plan.generated and plan.base_image == "base:1" + assert content.startswith("FROM base:1") and "pip install --no-cache-dir pytest" in content + + # Dockerfile escape hatch wins when no yaml present. + other = tmp_path / "task2" / "environment" + other.mkdir(parents=True) + (other / "Dockerfile").write_text("FROM scratch\n", encoding="utf-8") + escape = load_environment_spec(tmp_path / "task2") + assert escape.dockerfile == other / "Dockerfile" and escape.image is None diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_gating.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_gating.py new file mode 100644 index 0000000000..613a4cfaa3 --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_gating.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tests for the promoted deterministic gate.""" + +from __future__ import annotations + +from pathlib import Path + +from nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate, summarize_run, write_gate_report +from nemo_evaluator_sdk.agent_eval.types import ( + AgentEvalAttempt, + AgentEvalRunResult, + AgentEvalSummary, + AgentEvalTask, + AgentEvalTaskResult, + AgentOutput, +) +from nemo_evaluator_sdk.metrics.protocol import MetricOutput + + +def _make_run_result( + *, reward: float, total_tokens: int, runtime_sec: float, commit: str = "abc123" +) -> AgentEvalRunResult: + task = AgentEvalTask(id="demo", intent="do it", inputs={}) + attempt = AgentEvalAttempt( + id="demo:workflow", + task_id="demo", + status="completed", + output=AgentOutput(text="ok"), + metadata={ + "total_tokens": total_tokens, + "runtime_sec": runtime_sec, + "provenance": {"commit_sha": commit, "commit_short": commit[:7]}, + }, + ) + task_result = AgentEvalTaskResult( + id="demo:workflow:agentic_use_verifier_reward", + run_id="run-1", + task_id="demo", + attempt_id="demo:workflow", + metric_type="agentic_use_verifier_reward", + outputs=[MetricOutput(name="verifier_reward", value=reward)], + ) + return AgentEvalRunResult( + run_id="run-1", + tasks=[task], + attempts=[attempt], + results=[task_result], + summary=AgentEvalSummary(), + ) + + +def test_summarize_run_aggregates_pass_tokens_runtime_provenance() -> None: + summary = summarize_run(_make_run_result(reward=1.0, total_tokens=120, runtime_sec=4.5)) + assert summary["total_tasks"] == 1 + assert summary["pass_rate"] == 1.0 + assert summary["total_tokens_sum"] == 120 + assert summary["runtime_sec_sum"] == 4.5 + assert summary["token_metrics_coverage"] == 1.0 + assert summary["provenance"]["commit_sha"] == "abc123" + + +def test_evaluate_gate_passes_then_flags_token_regression(tmp_path: Path) -> None: + baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0) + 
candidate = _make_run_result(reward=1.0, total_tokens=200, runtime_sec=4.0) + + baseline_report = evaluate_gate(baseline, thresholds=GateThresholds()) + assert baseline_report.gate_passed is True + + candidate_report = evaluate_gate(candidate, thresholds=GateThresholds(), baseline_summary=baseline_report.summary) + assert candidate_report.gate_passed is False + token_check = next(c for c in candidate_report.checks if c.name == "tokens_not_worse_than_baseline") + assert token_check.passed is False + + gate_path = write_gate_report(candidate_report, tmp_path) + assert gate_path.exists() and "gate_passed" in gate_path.read_text(encoding="utf-8") + + +def test_evaluate_gate_blocks_cross_commit_comparison() -> None: + baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="aaa111") + candidate = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="bbb222") + + baseline_summary = evaluate_gate(baseline, thresholds=GateThresholds()).summary + report = evaluate_gate(candidate, thresholds=GateThresholds(), baseline_summary=baseline_summary) + cross = next(c for c in report.checks if c.name == "commit_sha_matches_baseline") + assert cross.passed is False and report.gate_passed is False + + allowed = evaluate_gate( + candidate, thresholds=GateThresholds(allow_cross_commit=True), baseline_summary=baseline_summary + ) + cross_allowed = next(c for c in allowed.checks if c.name == "commit_sha_matches_baseline") + assert cross_allowed.passed is True + + +def test_summarize_run_uses_measurement_fallbacks() -> None: + # duration_ms -> runtime_sec, and metadata reward when no scored metric output. 
+ run = _make_run_result(reward=0.0, total_tokens=10, runtime_sec=1.0) + run.attempts[0].metadata.pop("runtime_sec") + run.attempts[0].metadata["duration_ms"] = 2500 + run.attempts[0].metadata["reward"] = 1 + run.results.clear() # no scored metric outputs -> fall back to metadata reward + + summary = summarize_run(run) + assert summary["runtime_sec_sum"] == 2.5 + assert summary["pass_rate"] == 1.0 diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py new file mode 100644 index 0000000000..ed7da3beee --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Guardrail: the agent_eval package must stay free of NeMo-Platform imports. + +The SDK is consumed by ``tests/agentic-use`` (the NeMo-Platform adapter), never +the reverse. This test fails if any module under ``agent_eval`` imports a +platform-specific package, which keeps the promotion from leaking coupling into +the SDK. +""" + +from __future__ import annotations + +import re +from pathlib import Path + +import nemo_evaluator_sdk.agent_eval as agent_eval + +AGENT_EVAL_ROOT = Path(agent_eval.__file__).resolve().parent + +# Import statements that would couple the SDK to the platform / adapter. 
+_FORBIDDEN = re.compile( + r"^[ \t]*(?:from|import)\s+" # horizontal whitespace only: \s* could start the match on an earlier blank line and skew line_no + r"(nemo_platform|nmp_[A-Za-z0-9_]+|nat_runner|runtimes(?:\.|\s|$)|evaluator_agent_eval)", + re.MULTILINE, +) + + +def test_agent_eval_has_no_platform_imports() -> None: + offenders: list[str] = [] + for path in sorted(AGENT_EVAL_ROOT.rglob("*.py")): + text = path.read_text(encoding="utf-8") + for match in _FORBIDDEN.finditer(text): + line_no = text.count("\n", 0, match.start()) + 1 + offenders.append(f"{path.relative_to(AGENT_EVAL_ROOT)}:{line_no}: {match.group(0).strip()}") + + assert not offenders, "agent_eval must not import NeMo-Platform packages:\n" + "\n".join(offenders) diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_measurements.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_measurements.py new file mode 100644 index 0000000000..bc11bce7ef --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_measurements.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for the typed AttemptMeasurements contract.""" + +from __future__ import annotations + +from nemo_evaluator_sdk.agent_eval.measurements import AttemptMeasurements + + +def test_from_metadata_reads_tokens_runtime_reward_and_provenance() -> None: + measurements = AttemptMeasurements.from_metadata( + { + "total_tokens": 120, + "prompt_tokens": 80, + "completion_tokens": 40, + "runtime_sec": 4.5, + "reward": 1, + "passed": True, + "provenance": {"commit_sha": "abc123"}, + } + ) + assert measurements.total_tokens == 120 + assert measurements.runtime_sec == 4.5 + assert measurements.reward == 1.0 + assert measurements.passed is True + assert measurements.provenance["commit_sha"] == "abc123" + + +def test_from_metadata_applies_fallbacks_and_ignores_bad_types() -> None: + # duration_ms -> runtime_sec, passed -> reward, bool is not a token count.
+ measurements = AttemptMeasurements.from_metadata( + {"duration_ms": 2500, "passed": False, "total_tokens": True} + ) + assert measurements.runtime_sec == 2.5 + assert measurements.reward == 0.0 + assert measurements.total_tokens is None + + empty = AttemptMeasurements.from_metadata(None) + assert empty.reward is None and empty.runtime_sec is None and empty.provenance == {} + + +def test_to_metadata_round_trips_only_set_values() -> None: + payload = AttemptMeasurements(total_tokens=10, runtime_sec=1.0, reward=1.0).to_metadata() + assert payload == {"total_tokens": 10, "runtime_sec": 1.0, "reward": 1.0} diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_orchestrator.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_orchestrator.py new file mode 100644 index 0000000000..d5acd5bd3f --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_orchestrator.py @@ -0,0 +1,131 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tests for the generic agent-eval orchestrator (online + offline paths).""" + +from __future__ import annotations + +import json +from collections.abc import Sequence +from pathlib import Path + +import pytest +from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric +from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig +from nemo_evaluator_sdk.agent_eval.types import ( + AgentEvalAttempt, + AgentEvalRunConfig, + AgentEvalTask, + AgentOutput, +) +from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult + + +class _ExtraMetric: + @property + def type(self) -> str: + return "extra" + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("extra")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + return MetricResult(outputs=[MetricOutput(name="extra", value=1.0)]) + + +class _FakeRuntime: + def __init__(self) -> None: + self.prepared_ids: list[str] = [] + + async def run_tasks( + self, tasks: Sequence[AgentEvalTask], config: AgentEvalRunConfig | None = None + ) -> Sequence[AgentEvalAttempt]: + return [ + AgentEvalAttempt( + id=f"{task.id}:fake", + task_id=task.id, + status="completed", + output=AgentOutput(text="ok"), + metadata={"agent_ok": True}, + ) + for task in tasks + ] + + +def _task() -> AgentEvalTask: + return AgentEvalTask(id="demo", intent="do it", inputs={}, metrics=[AgentPhaseSuccessMetric()]) + + +@pytest.mark.asyncio +async def test_run_tasks_appends_extra_metrics_and_runs_prepare_hook(tmp_path: Path) -> None: + runtime = _FakeRuntime() + seen: list[str] = [] + orch = AgentEvalOrchestrator( + config=OrchestratorConfig(write_dashboard=False, write_gate=True), + extra_metrics=[_ExtraMetric()], + ) + + result = await orch.run_tasks( + [_task()], + target=runtime, + benchmark={"benchmark": "demo"}, + output_dir=tmp_path, + 
run_id="run-1", + prepare_task=lambda task: seen.append(task.id), + ) + + assert seen == ["demo"] + assert {m.type for m in result.tasks[0].metrics} == {"agent_phase_success", "extra"} + assert result.attempts[0].status == "completed" + # Gate is written next to the run bundle. + assert (tmp_path / "gate.json").exists() + + +@pytest.mark.asyncio +async def test_score_attempts_offline_does_not_invoke_runtime() -> None: + orch = AgentEvalOrchestrator(config=OrchestratorConfig(write_dashboard=False, write_gate=False)) + attempt = AgentEvalAttempt( + id="demo:stored", + task_id="demo", + status="completed", + output=AgentOutput(text="ok"), + metadata={"agent_ok": True}, + ) + result = await orch.score_attempts([_task()], attempts=[attempt]) + assert [m.type for m in result.tasks[0].metrics] == ["agent_phase_success"] + assert any(r.metric_type == "agent_phase_success" for r in result.results) + + +@pytest.mark.asyncio +async def test_extra_metrics_deduplicated_by_type() -> None: + task = AgentEvalTask(id="demo", intent="i", inputs={}, metrics=[AgentPhaseSuccessMetric(), _ExtraMetric()]) + orch = AgentEvalOrchestrator( + config=OrchestratorConfig(write_dashboard=False, write_gate=False), + extra_metrics=[_ExtraMetric()], + ) + attempt = AgentEvalAttempt(id="demo:s", task_id="demo", status="completed", output=AgentOutput(text="ok")) + result = await orch.score_attempts([task], attempts=[attempt]) + types = [m.type for m in result.tasks[0].metrics] + assert types.count("extra") == 1 + + +def test_result_dir_attempt_source_protocol_shape(tmp_path: Path) -> None: + # A minimal AgentAttemptSource implementation satisfies the protocol. 
+ from nemo_evaluator_sdk.agent_eval.types import AgentAttemptSource + + class _Source: + def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt: + payload = json.loads(Path(source).read_text(encoding="utf-8")) + return AgentEvalAttempt( + id=f"{task.id}:stored", + task_id=task.id, + status="completed", + output=AgentOutput(text=payload["agent"]), + ) + + src_path = tmp_path / "result.json" + src_path.write_text(json.dumps({"agent": "ok"}), encoding="utf-8") + source: AgentAttemptSource = _Source() + assert isinstance(source, AgentAttemptSource) + attempt = source.load_attempt(src_path, task=_task()) + assert attempt.task_id == "demo" diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_verify.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_verify.py new file mode 100644 index 0000000000..136fda6075 --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_verify.py @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for the generic verifier mechanic.""" + +from __future__ import annotations + +from pathlib import Path + +from nemo_evaluator_sdk.agent_eval.runtimes.verify import ( + apply_verify_to_metadata, + collect_verifier_outcome, + skipped_outcome, +) + + +def test_collect_reads_reward_file_when_present(tmp_path: Path) -> None: + (tmp_path / "reward.txt").write_text("1\n", encoding="utf-8") + (tmp_path / "test-stdout.txt").write_text("PASSED", encoding="utf-8") + outcome = collect_verifier_outcome(ok=False, exit_code=3, log_dir=tmp_path) + # reward.txt is authoritative even when the process exit said not-ok. 
+ assert outcome.ran and outcome.reward == 1 and outcome.exit_code == 3 + assert outcome.stdout == "PASSED" + + +def test_collect_derives_and_writes_reward_when_missing(tmp_path: Path) -> None: + outcome = collect_verifier_outcome(ok=True, exit_code=0, log_dir=tmp_path) + assert outcome.reward == 1 and outcome.passed is True + assert (tmp_path / "reward.txt").read_text(encoding="utf-8").strip() == "1" + + +def test_apply_to_metadata_stamps_and_skips(tmp_path: Path) -> None: + meta: dict[str, object] = {} + apply_verify_to_metadata(meta, skipped_outcome()) + assert meta == {"verify_status": "skipped"} + + meta2: dict[str, object] = {} + apply_verify_to_metadata(meta2, collect_verifier_outcome(ok=True, exit_code=0, log_dir=tmp_path)) + assert meta2["verify_status"] == "ok" and meta2["reward"] == 1 and meta2["passed"] is True diff --git a/tests/agentic-use/runtimes/COMPLIANCE.md b/tests/agentic-use/runtimes/COMPLIANCE.md index b7d55b1b13..526aa7b0e7 100644 --- a/tests/agentic-use/runtimes/COMPLIANCE.md +++ b/tests/agentic-use/runtimes/COMPLIANCE.md @@ -7,15 +7,34 @@ and `AgentAttemptRuntime` in `nemo_evaluator_sdk.agent_eval`). Design reference: internal agent-eval SDK doc (`https://docs.google.com/document/d/1mA9Kl6LVJFlgbj5CGulUOiaGyliP7QhqBh7jKXFGifM`). +## Adapter-over-SDK note + +The generic building blocks have been **promoted into the SDK** +(`nemo_evaluator_sdk.agent_eval`): the environment boundary +(`runtimes.environment`/`environment_spec`/`docker`), gating (`gating`), attempt +helpers (`attempts`), generic layout (`runtimes.layout`), reusable metrics +(`common_metrics`: `AgentPhaseSuccessMetric` + a real metric-over-evidence +`EvidencePresenceMetric`), the generic orchestrator (`orchestrator`), the +`AgentAttemptSource` protocol, the verifier mechanic (`runtimes.verify`), and the +coding-agent driver seam (`runtimes.coding_agent`). Those SDK homes are imported +**directly** by the runtime scripts — there are no re-export shims. 
The only +NeMo-Platform specifics that remain (the agentic task loader, `result.json` +import, attempt construction, the pytest verifier command, the `state` evidence +key, `task_image_tag` + platform `DockerEnvironmentProvider`, the +`VerifierRewardMetric`) are consolidated into a single module, +`shared/platform.py` (alongside `shared/config.py` and `shared/constants.py`). +A CI grep gate (`packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py`) +keeps `agent_eval/` free of NeMo-Platform imports. + ## Scope split (per SDK design) | `nat_runner` responsibility | Belongs in `AgentAttemptRuntime`? | Current location | |----------------------------|-----------------------------------|------------------| | AGENT phase — run backend in Docker, capture logs/trajectory | **Yes** | `runtimes/<backend>/runtime.py` | -| BUILD — task image | **No** | `AgenticEvalOrchestrator` via `shared/environment_spec.py` (env spec / Dockerfile) + `shared/docker.py` | -| VERIFY — pytest `test_outputs.py`, `reward.txt` | **Through env boundary** | `shared/verify.py` via `AgentEnvironmentHandle.run_verifier` (runtimes call it after the agent when `shared.run_verify=True`) | +| BUILD — task image | **No** | `AgenticEvalOrchestrator` via `agent_eval.runtimes.environment_spec` (env spec / Dockerfile) + `agent_eval.runtimes.docker` | +| VERIFY — pytest `test_outputs.py`, `reward.txt` | **Through env boundary** | `shared/platform.py` via `AgentEnvironmentHandle.run_verifier` (runtimes call it after the agent when `shared.run_verify=True`) | | CLI — task globs, manifests, summaries | **No** | Still `nat_runner.main` (not migrated) | -| `result.json` contract | **No** (still produced by `nat_runner`) | Importable as an attempt via `shared/result_adapter.py`; scored offline via `AgenticEvalOrchestrator.score_captured_attempts` | +| `result.json` contract | **No** (still produced by `nat_runner`) | Importable as an attempt via `shared/platform.py`; scored offline via 
`AgenticEvalOrchestrator.score_captured_attempts` | ## Task metrics (authored on the task) @@ -46,14 +65,14 @@ metric scoring row. | `_prepare_workflow_for_runtime` | `workflow/prep.py` | | `_build_aut_agent_cmd` | `aut/command.py` | | `_prepare_aut_config_for_runtime` | `aut/prep.py` | -| `_agent_log_has_workflow_error` | `shared/agent_log.py` | -| `run_verify_phase` | `shared/verify.py` (`build_verify_run_spec` + `run_verify` via `run_verifier`) | -| `_docker_run`, `build_task_image` | `shared/docker.py` (`docker_run`, `build_dockerfile`, `build_task_image`) | -| BUILD env resolution (`environment/Dockerfile`) | `shared/environment_spec.py` (`load_environment_spec`, `plan_task_build`) | -| `_write_result` (`result.json`) | `shared/result_adapter.py` (import side only; `nat_runner` still writes it) | -| pass-rate / token / runtime gate | `shared/reporting.py` (mirrors `passrate_token_policy_gate.py`) | -| `_extract_usage_metrics` | `shared/usage.py` (delegates to `nat_runner` until deduped) | -| `capture_agent_attempt` shape | `shared/artifacts.py` | +| `_agent_log_has_workflow_error` | `shared/platform.py` | +| `run_verify_phase` | `shared/platform.py` (`build_verify_run_spec` + `run_verify` via `run_verifier`) | +| `_docker_run`, `build_task_image` | `agent_eval.runtimes.docker` (`docker_run`, `build_dockerfile`, `build_task_image`) | +| BUILD env resolution (`environment/Dockerfile`) | `agent_eval.runtimes.environment_spec` (`load_environment_spec`, `plan_task_build`) | +| `_write_result` (`result.json`) | `shared/platform.py` (import side only; `nat_runner` still writes it) | +| pass-rate / token / runtime gate | `agent_eval.gating` (mirrors `passrate_token_policy_gate.py`) | +| `_extract_usage_metrics` | `shared/platform.py` (delegates to `nat_runner` until deduped) | +| `capture_agent_attempt` shape | `shared/platform.py` | | `run_agent_phase` | **Removed per backend** once all backends migrated | ## Attempt record contract @@ -66,14 +85,14 @@ includes 
canonical `CapturedAgentAttempt` fields: - Artifact paths: `agent_log_dir`, `workspace_dir`, `state_dir`, `atif_trajectory_path` - Phase outcome: `agent_ok` - Verifier outcome (when `run_verify=True`): `verify_status`, `passed`, `reward`, - `verifier_log_dir` (stamped by `shared/verify.py::apply_verify_to_metadata`) + `verifier_log_dir` (stamped by `apply_verify_to_metadata` from `agent_eval.runtimes.verify`) Use `to_captured_agent_attempt(task, attempt)` for verify/scoring code that expects the portable `CapturedAgentAttempt` type. ## `nat_runner` artifact → `AgentEvalAttempt` evidence map (per design doc) -`shared/artifacts.py::_evidence_descriptors` emits the documented keys: +`shared/platform.py::_evidence_descriptors` emits the documented keys: | `nat_runner` output | `AgentEvalAttempt` mapping | Status | |---------------------|----------------------------|--------| @@ -83,7 +102,7 @@ expects the portable `CapturedAgentAttempt` type. | `agent/trajectory.json` | `evidence["trace"]` (ATIF when normalized, else json) | Implemented | | `agent/` logs | `evidence["logs"]` (dir, `primary_log=nat_agent.log`) | Implemented | | `verifier/` logs | `evidence["verifier_logs"]` (added once verify phase runs) | Implemented (conditional) | -| `result.json` | attempt status + measurements + provenance + token/cost | Implemented — `shared/result_adapter.py::attempt_from_result` / `attempt_from_result_dir` | +| `result.json` | attempt status + measurements + provenance + token/cost | Implemented — `shared/platform.py::attempt_from_result` / `attempt_from_result_dir` | | final agent log/message | `AgentOutput.text` | Implemented | `result.json` mapping detail (`attempt_from_result`): @@ -93,7 +112,7 @@ expects the portable `CapturedAgentAttempt` type. attempt-production failures because the SDK's `AgentEvaluator` excludes `status=="failed"` from scoring (it raises); an agent that ran but failed must stay scorable so pass-rate gating counts it as a `0`. 
The live builder - (`shared/artifacts.py`) and this importer share the same helper. + (`shared/platform.py`) and this importer share the same helper. - `result["reward"]`/`result["passed"]` → `metadata` measurements (verifier reward stays a *measurement*, scored by `VerifierRewardMetric`, not the attempt status). - `result["metrics"]` (token/cost) → flattened into `metadata`. @@ -103,10 +122,10 @@ expects the portable `CapturedAgentAttempt` type. | Doc section | Status in this package | |-------------|------------------------| -| **B1** wrap `nat_runner` as attempt runtime(s) | In progress — AGENT phase extracted to per-backend runtimes (`workflow`, `aut` done; 3 CLI backends scaffolded); live VERIFY wired through the B2 boundary; `result.json` import path added via `shared/result_adapter.py`, exposed as the first-class **stored-attempt scoring** path via `AgenticEvalOrchestrator.score_captured_attempts` (and `run_agent_eval.py --rescore-dir`) — no Docker/agent execution. Remaining: 3 CLI backends + converging `nat_runner.main` onto the orchestrator. Note: doc proposes one `NatRunnerAttemptRuntime`; we deliberately split per backend per user direction. | -| **B2** `EnvironmentProvider` boundary | **Implemented** — `shared/environment.py` defines `AgentEnvironmentProvider`/`AgentEnvironmentHandle` below `AgentAttemptRuntime`; `DockerEnvironmentProvider` wraps `shared/docker.py`. `workflow` + `aut` runtimes execute through the boundary (provider is injectable). NeMo Gym/local providers can now be added without touching runtimes. | -| **B3** standardize environment authoring | **Implemented (minimal)** — `shared/environment_spec.py` adds a declarative `environment.yaml` (`image` + `profile` + python `dependencies` + `setup`) with a `dockerfile:` escape hatch and backward-compatible auto-detection of `environment/Dockerfile`. `plan_task_build` resolves a spec to a `BuildPlan` (image-based specs generate a tiny derived Dockerfile); the orchestrator BUILD step uses it. 
`setup` steps are carried as plan/label metadata, not executed (runtime concern). | -| **B4** productize results + CI | **Implemented** — SDK `persist_run` writes `tasks/attempts/results.jsonl`, `summary.json`, `report.html`; `shared/reporting.py` adds candidate-vs-baseline gating (pass-rate, token/cost, runtime tie-breaker) + deterministic provenance checks, persisted as `gate.json` by the orchestrator. `result.json` → attempt adapter + `VerifierRewardMetric` compatibility metric also done. | +| **B1** wrap `nat_runner` as attempt runtime(s) | In progress — AGENT phase extracted to per-backend runtimes (`workflow`, `aut` done; 3 CLI backends scaffolded); live VERIFY wired through the B2 boundary; `result.json` import path added via `shared/platform.py`, exposed as the first-class **stored-attempt scoring** path via `AgenticEvalOrchestrator.score_captured_attempts` (and `run_agent_eval.py --rescore-dir`) — no Docker/agent execution. Remaining: 3 CLI backends + converging `nat_runner.main` onto the orchestrator. Note: doc proposes one `NatRunnerAttemptRuntime`; we deliberately split per backend per user direction. | +| **B2** `EnvironmentProvider` boundary | **Implemented** — `agent_eval.runtimes.environment` defines `AgentEnvironmentProvider`/`AgentEnvironmentHandle` below `AgentAttemptRuntime`; the platform `DockerEnvironmentProvider` (`shared/platform.py`) wraps `agent_eval.runtimes.docker` with the `nmp-nat-` image tag. `workflow` + `aut` runtimes execute through the boundary (provider is injectable). NeMo Gym/local providers can now be added without touching runtimes. | +| **B3** standardize environment authoring | **Implemented (minimal)** — `agent_eval.runtimes.environment_spec` adds a declarative `environment.yaml` (`image` + `profile` + python `dependencies` + `setup`) with a `dockerfile:` escape hatch and backward-compatible auto-detection of `environment/Dockerfile`. 
`plan_task_build` resolves a spec to a `BuildPlan` (image-based specs generate a tiny derived Dockerfile); the orchestrator BUILD step uses it. `setup` steps are carried as plan/label metadata, not executed (runtime concern). | +| **B4** productize results + CI | **Implemented** — SDK `persist_run` writes `tasks/attempts/results.jsonl`, `summary.json`, `report.html`; `agent_eval.gating` adds candidate-vs-baseline gating (pass-rate, token/cost, runtime tie-breaker) + deterministic provenance checks, persisted as `gate.json` by the orchestrator. `result.json` → attempt adapter + `VerifierRewardMetric` compatibility metric also done. | ### B4 reporting / gating detail @@ -114,7 +133,7 @@ expects the portable `CapturedAgentAttempt` type. calls `agent_eval.persistence.persist_run`, writing `tasks.jsonl`, `attempts.jsonl`, `results.jsonl`, `summary.json`, `benchmark.json`, `run.json`, and (when `write_dashboard=True`) `report.html`. -- **Gating** (`shared/reporting.py`): `summarize_run` aggregates pass-rate, +- **Gating** (`agent_eval.gating`): `summarize_run` aggregates pass-rate, token totals/coverage, runtime totals, and run-level provenance from the typed `AgentEvalRunResult` (metric scores first, attempt metadata as fallback). `evaluate_gate` applies absolute thresholds and candidate-vs-baseline checks: @@ -132,7 +151,7 @@ expects the portable `CapturedAgentAttempt` type. The doc sketches `AgentEnvironmentHandle.run_agent(instruction, config) -> AgentEvalAttempt`. We instead use `run_agent(EnvRunSpec) -> EnvCommandResult` (and the symmetric `run_verifier`). Rationale: per-backend command/env/mount construction lives in the -runtime, and attempt construction lives in `shared/artifacts.py`. Keeping the +runtime, and attempt construction lives in `shared/platform.py`. 
Keeping the environment layer at "execute a command, return exit status" means a new provider (local, Harbor, NeMo Gym) only implements process execution — it never needs to know about backends or attempt schemas. diff --git a/tests/agentic-use/runtimes/README.md b/tests/agentic-use/runtimes/README.md index 90317f204a..5b149c10ec 100644 --- a/tests/agentic-use/runtimes/README.md +++ b/tests/agentic-use/runtimes/README.md @@ -1,32 +1,70 @@ # Agentic-use AgentAttemptRuntime implementations -Backend-specific runtimes extracted from `nat_runner.py` for use with -`nemo_evaluator_sdk.agent_eval.AgentEvaluator`. +NeMo-Platform **adapter** over the generic agent-eval framework in +`nemo_evaluator_sdk.agent_eval`. The backend-agnostic building blocks (environment +boundary, gating, attempt/evidence helpers, orchestrator, verify mechanic, +coding-agent driver seam) now live in the SDK; this directory holds only the +NeMo-Platform glue (the `workflow`/`aut` backends, agentic task/result formats, +the pytest verifier, the platform Docker build/image-tag) plus a thin factory. + +## Architecture: adapter over SDK + +The backend-agnostic logic lives in `nemo_evaluator_sdk.agent_eval` and is +imported **directly** by the runtime scripts (no re-export shims). 
Everything +generic comes from these SDK homes: + +| What | SDK home | +|------|----------| +| Docker CLI helpers | `agent_eval.runtimes.docker` | +| Environment boundary (`AgentEnvironmentProvider`/`Handle`, `EnvRunSpec`) | `agent_eval.runtimes.environment` | +| Environment authoring (`load_environment_spec`, `plan_task_build`, …) | `agent_eval.runtimes.environment_spec` | +| Gating (`GateThresholds`, `evaluate_gate`, `summarize_run`, …) | `agent_eval.gating` | +| Verify mechanic (`apply_verify_to_metadata`, `collect_verifier_outcome`) | `agent_eval.runtimes.verify` | +| `AgentPhaseSuccessMetric`, attempt-status + evidence helpers | `agent_eval.common_metrics`, `agent_eval.attempts` | +| Generic orchestrator + run layout | `agent_eval.orchestrator`, `agent_eval.runtimes.layout` | + +All NeMo-Platform-specific glue is consolidated into a single module, +`shared/platform.py`: the run layout with the platform `state_dir`, the +`nmp-nat-` image tag + `DockerEnvironmentProvider` default, the namespaced +`AgentPhaseSuccessMetric` + the `VerifierRewardMetric`, agent-log/usage parsing +and the shared container env, attempt construction (live + `result.json`), the +live VERIFY phase, and the agentic-use task loader. + +The orchestrator (`orchestrator.py`) is a thin factory over +`agent_eval.orchestrator.AgentEvalOrchestrator`: it injects the platform image +build (`prepare_task`), the `run_verify`-derived `VerifierRewardMetric` +(`extra_metrics`), and the `result.json` `AgentAttemptSource`. 
## Layout ```text runtimes/ - shared/ # backend-agnostic building blocks: - # docker.py Docker exec + build helpers - # environment.py AgentEnvironmentProvider/Handle boundary (B2) - # environment_spec.py environment.yaml authoring + build plans (B3) - # layout.py per-run output layout - # task_loader.py agentic-use task -> AgentEvalTask - # container_env.py base container env vars - # artifacts.py agent artifacts -> AgentEvalAttempt (+ evidence) - # result_adapter.py nat_runner result.json -> AgentEvalAttempt (B1/B4) - # verify.py live VERIFY via run_verifier - # reporting.py summary + candidate/baseline gate (B4) - # metrics.py AgentPhaseSuccessMetric, VerifierRewardMetric - workflow/ # NatWorkflowAttemptRuntime (implemented) - aut/ # AutAgentAttemptRuntime (implemented) - claude_code/ # ClaudeCodeAgentAttemptRuntime (scaffold) - codex/ # CodexAgentAttemptRuntime (scaffold) - cursor_agent/ # CursorAgentAttemptRuntime (scaffold) - orchestrator.py # BUILD (env spec) + AgentEvaluator + gate; verify runs in the runtime + shared/ # platform glue only: + # platform.py — all NeMo-Platform helpers (one file) + # config.py — runtime config dataclasses + # constants.py — paths / container constants + workflow/ # NatWorkflowAttemptRuntime (implemented, NeMo construct) + aut/ # AutAgentAttemptRuntime (implemented, NeMo construct) + claude_code/ # scaffold (stub) — see "Coding-agent runtimes" below + codex/ # scaffold (stub) + cursor_agent/ # scaffold (stub) + orchestrator.py # thin factory over agent_eval.orchestrator.AgentEvalOrchestrator ``` +## Coding-agent runtimes (SDK driver seam) + +Coding-agent CLIs plug into the SDK via +`agent_eval.runtimes.coding_agent`: `CliAgentDriver` (the reusable driver) + +`CodingAgentSpec` (per-agent command builder + trajectory→evidence parser). +Reference `ClaudeCodeSpec`/`CursorAgentSpec` are shipped. The profbench codex +runtime (`agent_eval.runtimes.codex`) remains a separate, standalone-CLI runtime. 
+ +The agentic-use `codex`/`claude_code`/`cursor_agent` backends here are still +stubs: wiring them to run the SDK driver *inside* the `nmp-agentic-base` Docker +environment (like `workflow`/`aut`) is bespoke per agent and a tracked follow-up. +`workflow` and `aut` stay in the adapter — they implement `AgentAttemptRuntime` +but are NeMo constructs, not general SDK runtimes. + ## Example: workflow backend From the repository root (requires Docker + built task image): @@ -76,7 +114,7 @@ Design-doc implementation path (see [COMPLIANCE.md](./COMPLIANCE.md) for detail) ## B1 — `result.json` import + stored-attempt scoring -`shared/result_adapter.py` imports an existing `nat_runner` run as an attempt: +`shared/platform.py` imports an existing `nat_runner` run as an attempt: - `attempt_from_result_dir(output_dir)` reads `<output_dir>/result.json`. - `attempt_from_result(result_dict, output_dir=...)` projects a parsed record. @@ -101,16 +139,17 @@ when `run_verify=True`. `inputs` holds only agent-facing `instruction`; ## B2 — Environment boundary -Runtimes execute the agent through `shared/environment.py` +Runtimes execute the agent through the SDK environment boundary (`AgentEnvironmentProvider` → `AgentEnvironmentHandle`) rather than calling -Docker directly. `DockerEnvironmentProvider` is the default; inject another +Docker directly. The platform `DockerEnvironmentProvider` (`shared/platform.py`, +defaulting to the `nmp-nat-` image tag) is the default; inject another provider (local, Harbor, NeMo Gym) via the runtime's `environment=` argument without changing backend code. ## B3 — Environment authoring Tasks can declare a reusable environment instead of hand-writing a Dockerfile. -`shared/environment_spec.py` loads `environment.yaml` from the task dir: +`agent_eval.runtimes.environment_spec` loads `environment.yaml` from the task dir: ```yaml environment: @@ -141,10 +180,10 @@ as metadata, not executed here (they are runtime concerns). 
The SDK persists the run bundle (`tasks.jsonl`, `attempts.jsonl`, `results.jsonl`, `summary.json`, `report.html`) when `output_dir` is set. -`shared/reporting.py` adds the gate on top: +`agent_eval.gating` adds the gate on top: ```python -from runtimes.shared.reporting import GateThresholds, evaluate_gate, load_baseline_summary, write_gate_report +from nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate, load_baseline_summary, write_gate_report report = evaluate_gate( run_result, @@ -160,7 +199,7 @@ The orchestrator emits `gate.json` automatically (`AgenticOrchestratorConfig.wri ## Live VERIFY phase (through the B2 boundary) -`shared/verify.py` runs the task-local `tests/test_outputs.py` pytest verifier +`shared/platform.py` runs the task-local `tests/test_outputs.py` pytest verifier through `AgentEnvironmentHandle.run_verifier`, in the same prepared environment and against the same persisted workspace/state as the agent phase. Enable it via `AgenticSharedConfig(run_verify=True)`; the runtime stamps `reward`/`passed`/ diff --git a/tests/agentic-use/runtimes/__init__.py b/tests/agentic-use/runtimes/__init__.py index 1b7e12491f..df392483cf 100644 --- a/tests/agentic-use/runtimes/__init__.py +++ b/tests/agentic-use/runtimes/__init__.py @@ -3,20 +3,23 @@ """Backend-specific AgentAttemptRuntime implementations for agentic-use evals.""" -from runtimes.aut.runtime import AutAgentAttemptRuntime -from runtimes.claude_code.runtime import ClaudeCodeAgentAttemptRuntime -from runtimes.codex.runtime import CodexAgentAttemptRuntime -from runtimes.cursor_agent.runtime import CursorAgentAttemptRuntime -from runtimes.orchestrator import AgenticEvalOrchestrator, AgenticOrchestratorConfig, runtime_for_backend -from runtimes.shared.environment import ( +from nemo_evaluator_sdk.agent_eval.gating import ( + GateCheck, + GateReport, + GateThresholds, + evaluate_gate, + load_baseline_summary, + summarize_run, + write_gate_report, +) +from 
nemo_evaluator_sdk.agent_eval.runtimes.environment import ( AgentEnvironmentHandle, AgentEnvironmentProvider, DockerEnvironmentHandle, - DockerEnvironmentProvider, EnvCommandResult, EnvRunSpec, ) -from runtimes.shared.environment_spec import ( +from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import ( BuildPlan, EnvironmentSpec, execute_build_plan, @@ -24,20 +27,19 @@ plan_task_build, render_derived_dockerfile, ) -from runtimes.shared.metrics import AgentPhaseSuccessMetric, VerifierRewardMetric -from runtimes.shared.reporting import ( - GateCheck, - GateReport, - GateThresholds, - evaluate_gate, - load_baseline_summary, - summarize_run, - write_gate_report, -) -from runtimes.shared.result_adapter import attempt_from_result, attempt_from_result_dir -from runtimes.shared.verify import ( - VerifierOutcome, - apply_verify_to_metadata, +from nemo_evaluator_sdk.agent_eval.runtimes.verify import VerifierOutcome, apply_verify_to_metadata + +from runtimes.aut.runtime import AutAgentAttemptRuntime +from runtimes.claude_code.runtime import ClaudeCodeAgentAttemptRuntime +from runtimes.codex.runtime import CodexAgentAttemptRuntime +from runtimes.cursor_agent.runtime import CursorAgentAttemptRuntime +from runtimes.orchestrator import AgenticEvalOrchestrator, AgenticOrchestratorConfig, runtime_for_backend +from runtimes.shared.platform import ( + AgentPhaseSuccessMetric, + DockerEnvironmentProvider, + VerifierRewardMetric, + attempt_from_result, + attempt_from_result_dir, build_verify_run_spec, maybe_run_verify, run_verify, diff --git a/tests/agentic-use/runtimes/aut/runtime.py b/tests/agentic-use/runtimes/aut/runtime.py index 64bc8e46bc..4185abe826 100644 --- a/tests/agentic-use/runtimes/aut/runtime.py +++ b/tests/agentic-use/runtimes/aut/runtime.py @@ -8,12 +8,12 @@ from collections.abc import Sequence from pathlib import Path +from nemo_evaluator_sdk.agent_eval.runtimes.environment import AgentEnvironmentProvider, EnvRunSpec +from 
nemo_evaluator_sdk.agent_eval.runtimes.verify import apply_verify_to_metadata from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalRunConfig, AgentEvalTask from runtimes.aut.command import build_aut_agent_cmd from runtimes.aut.prep import prepare_aut_config_for_runtime -from runtimes.shared.agent_log import agent_log_has_workflow_error -from runtimes.shared.artifacts import build_agent_eval_attempt from runtimes.shared.config import AutRuntimeConfig from runtimes.shared.constants import ( DOCKER_SOCKET_CONTAINER_PATH, @@ -21,15 +21,16 @@ INSTRUCTION_CONTAINER_PATH, REPO_ROOT, ) -from runtimes.shared.container_env import base_container_env -from runtimes.shared.environment import ( - AgentEnvironmentProvider, +from runtimes.shared.platform import ( + AgenticRunLayout, DockerEnvironmentProvider, - EnvRunSpec, + agent_log_has_workflow_error, + base_container_env, + build_agent_eval_attempt, + maybe_run_verify, + resolve_run_layout, + task_agent_timeout_sec, ) -from runtimes.shared.layout import AgenticRunLayout, resolve_run_layout -from runtimes.shared.task_loader import task_agent_timeout_sec -from runtimes.shared.verify import apply_verify_to_metadata, maybe_run_verify RUNTIME_NAME = "aut" AUT_CONFIG_CONTAINER_PATH = "/tmp/aut_agent.yml" diff --git a/tests/agentic-use/runtimes/orchestrator.py b/tests/agentic-use/runtimes/orchestrator.py index 94eb00050e..74531b1a41 100644 --- a/tests/agentic-use/runtimes/orchestrator.py +++ b/tests/agentic-use/runtimes/orchestrator.py @@ -1,7 +1,14 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -"""Orchestrate BUILD + AgentEvaluator + VERIFY for agentic-use tasks.""" +"""Agentic-use adapter over the generic SDK orchestrator. + +This is a thin NeMo-Platform factory: the generic run/score/gate loop lives in +:class:`nemo_evaluator_sdk.agent_eval.orchestrator.AgentEvalOrchestrator`. 
Here we +inject the platform specifics it deliberately does not know about — the agentic +task loader, the Docker image build (``prepare_task``), the ``run_verify``-derived +``VerifierRewardMetric``, and the ``result.json`` :class:`AgentAttemptSource`. +""" from __future__ import annotations @@ -10,7 +17,10 @@ from pathlib import Path from typing import Any -from nemo_evaluator_sdk.agent_eval import AgentEvalRunConfig, AgentEvaluator +from nemo_evaluator_sdk.agent_eval.gating import GateThresholds +from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig +from nemo_evaluator_sdk.agent_eval.runtimes.docker import docker_image_exists +from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import execute_build_plan, plan_task_build from nemo_evaluator_sdk.agent_eval.types import ( AgentAttemptRuntime, AgentEvalRunResult, @@ -18,13 +28,12 @@ ) from nemo_evaluator_sdk.metrics.protocol import Metric -from runtimes.shared.docker import docker_image_exists -from runtimes.shared.environment_spec import execute_build_plan, plan_task_build -from runtimes.shared.layout import task_image_tag -from runtimes.shared.metrics import VerifierRewardMetric -from runtimes.shared.reporting import GateThresholds, evaluate_gate, load_baseline_summary, write_gate_report -from runtimes.shared.result_adapter import attempt_from_result_dir -from runtimes.shared.task_loader import agentic_task_from_dir +from runtimes.shared.platform import ( + ResultDirAttemptSource, + VerifierRewardMetric, + agentic_task_from_dir, + task_image_tag, +) @dataclass(frozen=True) @@ -38,7 +47,7 @@ class AgenticOrchestratorConfig: class AgenticEvalOrchestrator: - """Run agentic-use tasks through AgentEvaluator and optional verify phase.""" + """Run agentic-use tasks through the generic orchestrator + optional verify metric.""" def __init__( self, @@ -48,6 +57,16 @@ def __init__( ) -> None: self.runtime = runtime self.config = config or AgenticOrchestratorConfig() + 
self._orchestrator = AgentEvalOrchestrator( + config=OrchestratorConfig( + parallelism=1, + write_dashboard=self.config.write_dashboard, + write_gate=self.config.write_gate, + gate_thresholds=self.config.gate_thresholds, + baseline_summary_path=self.config.baseline_summary_path, + ), + extra_metrics=self._extra_metrics(), + ) async def run_agent_eval( self, @@ -58,25 +77,15 @@ async def run_agent_eval( ) -> AgentEvalRunResult: """Build the task image when needed, run the agent runtime, return SDK result.""" task = agentic_task_from_dir(task_name) - task = task.model_copy(update={"metrics": self._metrics_for_task(task)}) - image_tag = task_image_tag(task.id) - self._ensure_task_image(task.metadata["task_dir"], image_tag) - - result = await AgentEvaluator().run( - tasks=[task], + return await self._orchestrator.run_tasks( + [task], target=self.runtime, - config=AgentEvalRunConfig( - output_dir=output_dir, - run_id=run_id, - parallelism=1, - write_dashboard=self.config.write_dashboard, - benchmark={"benchmark": "agentic-use", "task": task_name}, - ), + benchmark={"benchmark": "agentic-use", "task": task_name}, + output_dir=output_dir, + run_id=run_id, + prepare_task=self._ensure_task_image, ) - self._maybe_write_gate(result) - return result - async def score_captured_attempts( self, task_name: str, @@ -87,61 +96,38 @@ async def score_captured_attempts( ) -> AgentEvalRunResult: """Score already-captured ``result.json`` runs without re-running the agent. - This is the SDK's first-class *stored-attempt* path: it imports each - ``nat_runner`` output directory via :func:`attempt_from_result_dir` and - scores them through :class:`AgentEvaluator`, so metrics can be exercised - (and runs rescored) with no Docker/agent execution. 
+ The SDK's first-class *stored-attempt* path: each ``nat_runner`` output + dir is adapted via :class:`ResultDirAttemptSource` and scored through the + generic orchestrator, so metrics can be exercised (and runs rescored) with + no Docker/agent execution. """ task = agentic_task_from_dir(task_name) - task = task.model_copy(update={"metrics": self._metrics_for_task(task)}) - attempts = [attempt_from_result_dir(result_dir, task=task) for result_dir in result_dirs] - - result = await AgentEvaluator().run( - tasks=[task], + source = ResultDirAttemptSource() + attempts = [source.load_attempt(result_dir, task=task) for result_dir in result_dirs] + return await self._orchestrator.score_attempts( + [task], attempts=attempts, - config=AgentEvalRunConfig( - output_dir=output_dir, - run_id=run_id, - parallelism=1, - write_dashboard=self.config.write_dashboard, - benchmark={"benchmark": "agentic-use", "task": task_name, "mode": "offline"}, - ), + benchmark={"benchmark": "agentic-use", "task": task_name, "mode": "offline"}, + output_dir=output_dir, + run_id=run_id, ) - self._maybe_write_gate(result) - return result + def _extra_metrics(self) -> list[Metric]: + """Append :class:`VerifierRewardMetric` only when the runtime runs verify. - def _maybe_write_gate(self, result: AgentEvalRunResult) -> None: - if not (self.config.write_gate and result.output_dir is not None): - return - baseline = ( - load_baseline_summary(self.config.baseline_summary_path) - if self.config.baseline_summary_path is not None - else None - ) - report = evaluate_gate(result, thresholds=self.config.gate_thresholds, baseline_summary=baseline) - write_gate_report(report, result.output_dir) - - def _metrics_for_task(self, task: AgentEvalTask) -> list[Metric]: - """Honor task-authored metrics; only *append* a compatibility metric. - - Metrics originate on the task (see ``agentic_task_from_dir``). 
When the - live verify phase is enabled we append :class:`VerifierRewardMetric` so - the legacy pytest reward is scored too — but we never replace the task's - own metric set, and we avoid duplicating a metric the task already - declares (the SDK rejects duplicate metric types). + The verify-enable decision stays in the adapter (it knows its own runtime + config); the generic orchestrator never introspects the runtime. """ - metrics: list[Metric] = list(task.metrics) - if self._verify_enabled() and not any(isinstance(metric, VerifierRewardMetric) for metric in metrics): - metrics.append(VerifierRewardMetric()) - return metrics + return [VerifierRewardMetric()] if self._verify_enabled() else [] def _verify_enabled(self) -> bool: runtime_config = getattr(self.runtime, "config", None) shared = getattr(runtime_config, "shared", None) return bool(getattr(shared, "run_verify", False)) - def _ensure_task_image(self, task_dir: str | Path, image_tag: str) -> None: + def _ensure_task_image(self, task: AgentEvalTask) -> None: + image_tag = task_image_tag(task.id) + task_dir = task.metadata["task_dir"] if self.config.skip_build: if not docker_image_exists(image_tag): raise RuntimeError( diff --git a/tests/agentic-use/runtimes/shared/agent_log.py b/tests/agentic-use/runtimes/shared/agent_log.py deleted file mode 100644 index 6fc7de0270..0000000000 --- a/tests/agentic-use/runtimes/shared/agent_log.py +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -"""Agent log parsing helpers shared by backend runtimes.""" - -from __future__ import annotations - -import json -from typing import Any - - -def iter_agent_log_json_payloads(agent_log: str) -> list[dict[str, Any]]: - """Return JSON dict payloads embedded in an agent log, newest-first after the full log.""" - candidates = [agent_log.strip()] - lines = [line.strip() for line in agent_log.splitlines() if line.strip()] - if lines: - candidates.append(lines[-1]) - candidates.extend(reversed(lines)) - - payloads: list[dict[str, Any]] = [] - seen: set[str] = set() - for candidate in candidates: - if not candidate or candidate in seen: - continue - seen.add(candidate) - try: - parsed = json.loads(candidate) - except json.JSONDecodeError: - continue - if isinstance(parsed, dict): - payloads.append(parsed) - return payloads - - -def agent_log_has_workflow_error(agent_log: str) -> bool: - """Detect AUT workflow errors returned as successful HTTP JSON payloads.""" - for payload in iter_agent_log_json_payloads(agent_log): - if payload.get("code") == "workflow_error": - return True - return False diff --git a/tests/agentic-use/runtimes/shared/artifacts.py b/tests/agentic-use/runtimes/shared/artifacts.py deleted file mode 100644 index 4942568635..0000000000 --- a/tests/agentic-use/runtimes/shared/artifacts.py +++ /dev/null @@ -1,234 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -"""Convert captured agent artifacts into AgentEvalAttempt values.""" - -from __future__ import annotations - -from pathlib import Path - -from evaluator_agent_eval.artifacts import AgentArtifacts -from evaluator_agent_eval.schemas import ( - AgentAttemptInput, - AgentAttemptMetadata, - AgentAttemptOutput, - AgentAttemptTrace, - CapturedAgentAttempt, -) -from nemo_evaluator_sdk.agent_eval.types import ( - AgentEvalAttempt, - AgentEvalAttemptStatus, - AgentEvalTask, - AgentOutput, -) -from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor - -from runtimes.shared.config import AgenticRuntimeName -from runtimes.shared.layout import AgenticRunLayout -from runtimes.shared.usage import extract_usage_metrics - - -def resolve_attempt_status(agent_ok: bool) -> AgentEvalAttemptStatus: - """Map an agent-phase outcome to a *scorable* attempt status. - - The SDK's :class:`AgentEvaluator` excludes ``status=="failed"`` from scoring - (it raises). An agent that ran but failed must still be scored — e.g. as a - ``0`` by :class:`AgentPhaseSuccessMetric` — so that pass-rate gating counts - it rather than dropping it. We therefore use ``"partial"`` for an - executed-but-unsuccessful agent and reserve ``"failed"`` for genuine - attempt-*production* failures (which a runtime surfaces by raising, not by - emitting an unscorable attempt). This keeps the live builder and the - ``result.json`` importer consistent. - """ - return "completed" if agent_ok else "partial" - - -def build_agent_eval_attempt( - *, - task: AgentEvalTask, - layout: AgenticRunLayout, - runtime_name: AgenticRuntimeName, - agent_model: str, - exit_code: int, - agent_ok: bool, - run_id: str | None = None, - repo_revision: str | None = None, - duration_ms: int | None = None, -) -> AgentEvalAttempt: - """Build an SDK attempt from on-disk agent artifacts. 
- - Metadata uses the same canonical keys as :class:`CapturedAgentAttempt` - (``agent_runtime``, ``agent_model``, ``exit_code``, …) so verify/scoring - helpers can consume attempts without a second adapter. - """ - artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir) - log_text = _read_agent_log(layout.agent_log_dir) - usage = extract_usage_metrics(log_text) - duration = duration_ms if duration_ms is not None else usage.get("duration_ms") - - output_text = artifacts.final_answer.text if artifacts.final_answer.extracted else None - raw_log_paths = _raw_log_paths(artifacts.agent_log_dir) - initial_state = task.inputs.get("filesystem") - descriptors = _evidence_descriptors( - layout, artifacts, initial_state_ref=str(initial_state) if initial_state else None - ) - - metadata: dict[str, object] = { - # Canonical CapturedAgentAttempt fields - "agent_runtime": runtime_name, - "agent_model": agent_model, - "agent_runtime_version": None, - "repo_revision": repo_revision, - "run_id": run_id, - "exit_code": exit_code, - "duration_ms": duration, - # SDK / orchestration extensions - "model_id": agent_model, - "target_name": agent_model, - "attempt_id": f"{task.id}:{runtime_name}", - "agent_ok": agent_ok, - "agent_log_dir": str(layout.agent_log_dir), - "workspace_dir": str(layout.workspace_dir), - "state_dir": str(layout.state_dir), - "run_dir": str(layout.run_dir), - "instruction_path": task.metadata.get("instruction_path"), - "final_answer_extracted": artifacts.final_answer.extracted, - "final_answer_source": artifacts.final_answer.source, - "raw_log_paths": raw_log_paths, - "atif_trajectory_path": str(artifacts.atif_trajectory_path) if artifacts.atif_trajectory_path else None, - **usage, - } - - status = resolve_attempt_status(agent_ok) - if output_text: - output = AgentOutput(text=output_text) - elif agent_ok: - output = AgentOutput(text=log_text.strip() or "") - else: - output = AgentOutput(text=log_text.strip() or "(agent phase 
failed)") - - return AgentEvalAttempt( - id=f"{task.id}:{runtime_name}", - task_id=task.id, - status=status, - output=output, - evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None, - metadata=metadata, - ) - - -def to_captured_agent_attempt(task: AgentEvalTask, attempt: AgentEvalAttempt) -> CapturedAgentAttempt: - """Project an SDK attempt onto the portable CapturedAgentAttempt schema.""" - metadata = attempt.metadata - trace_path = metadata.get("atif_trajectory_path") - return CapturedAgentAttempt( - task_id=attempt.task_id, - input=AgentAttemptInput( - instruction_text=task.intent, - instruction_path=str(metadata.get("instruction_path")) if metadata.get("instruction_path") else None, - ), - output=AgentAttemptOutput( - final_text=attempt.output.text if attempt.output is not None else "", - final_answer_extracted=bool(metadata.get("final_answer_extracted")), - final_answer_source=str(metadata.get("final_answer_source")) - if metadata.get("final_answer_source") is not None - else None, - raw_log_paths=list(metadata.get("raw_log_paths") or []), - ), - metadata=AgentAttemptMetadata( - agent_runtime=str(metadata.get("agent_runtime", "unknown")), - agent_model=str(metadata.get("agent_model", "unknown")), - agent_runtime_version=str(metadata["agent_runtime_version"]) - if metadata.get("agent_runtime_version") is not None - else None, - repo_revision=str(metadata["repo_revision"]) if metadata.get("repo_revision") is not None else None, - run_id=str(metadata["run_id"]) if metadata.get("run_id") is not None else None, - exit_code=int(metadata["exit_code"]) if isinstance(metadata.get("exit_code"), int) else None, - duration_ms=int(metadata["duration_ms"]) if isinstance(metadata.get("duration_ms"), int | float) else None, - ), - trace=AgentAttemptTrace(atif_path=str(trace_path)) if trace_path else None, - ) - - -def _evidence_descriptors( - layout: AgenticRunLayout, - artifacts: AgentArtifacts, - *, - initial_state_ref: str | None = None, -) -> 
dict[str, EvidenceDescriptor]: - """Build the evidence map specified by the agent-eval SDK design doc. - - Doc keys: ``initial_state`` (task input filesystem, when staged), - ``final_state`` (workspace), ``trace`` (trajectory, ATIF-normalized), - ``logs`` (agent log dir), and ``verifier_logs`` (verifier log dir). - - ``state`` is a NeMo-Platform-specific *extension* (not a doc key): it carries - the preserved platform/database state across the agent + verifier phases. - """ - descriptors: dict[str, EvidenceDescriptor] = {} - - # task input filesystem → evidence["initial_state"] (only when a seed was staged). - if initial_state_ref: - descriptors["initial_state"] = EvidenceDescriptor( - kind="filesystem", - format="dir", - ref=initial_state_ref, - metadata={"role": "initial_state"}, - ) - - # agent/trajectory.json → evidence["trace"], preferably ATIF-normalized. - if artifacts.atif_trajectory_path is not None: - descriptors["trace"] = EvidenceDescriptor( - kind="trace", - format="atif" if artifacts.atif_trajectory_path.name.startswith("atif") else "json", - ref=str(artifacts.atif_trajectory_path), - ) - - # agent/ logs → evidence["logs"]. - descriptors["logs"] = EvidenceDescriptor( - kind="logs", - format="dir", - ref=str(layout.agent_log_dir), - metadata={"primary_log": "nat_agent.log"}, - ) - - # workspace/ → evidence["final_state"] filesystem descriptor. - descriptors["final_state"] = EvidenceDescriptor( - kind="filesystem", - format="dir", - ref=str(layout.workspace_dir), - metadata={"role": "final_state"}, - ) - - # Platform extension (non-doc key): preserved platform/db state across phases. - descriptors["state"] = EvidenceDescriptor( - kind="filesystem", - format="dir", - ref=str(layout.state_dir), - metadata={"role": "platform_state", "extension": "nemo-platform"}, - ) - - # verifier/ logs → evidence["verifier_logs"] (present once verify phase runs). 
- verifier_log_dir = layout.run_dir / "verifier" - if verifier_log_dir.exists(): - descriptors["verifier_logs"] = EvidenceDescriptor( - kind="logs", - format="dir", - ref=str(verifier_log_dir), - metadata={"role": "verifier"}, - ) - - return descriptors - - -def _raw_log_paths(agent_log_dir: Path) -> list[str]: - if not agent_log_dir.is_dir(): - return [] - return [str(path.relative_to(agent_log_dir)) for path in sorted(agent_log_dir.iterdir()) if path.is_file()] - - -def _read_agent_log(agent_log_dir: Path) -> str: - log_path = agent_log_dir / "nat_agent.log" - if log_path.is_file(): - return log_path.read_text(encoding="utf-8", errors="replace") - return "" diff --git a/tests/agentic-use/runtimes/shared/container_env.py b/tests/agentic-use/runtimes/shared/container_env.py deleted file mode 100644 index b59a100b54..0000000000 --- a/tests/agentic-use/runtimes/shared/container_env.py +++ /dev/null @@ -1,42 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -"""Shared container environment helpers.""" - -from __future__ import annotations - -import json -from typing import Any - -from runtimes.shared.config import AgenticSharedConfig -from runtimes.shared.constants import ( - DOCKER_SOCKET_CONTAINER_PATH, - DOCKER_SOCKET_HOST_PATH, - FILES_STORAGE_CONFIG, - PLATFORM_CONFIG_PATH, -) - - -def base_container_env(shared: AgenticSharedConfig, *, timeout_sec: int) -> dict[str, str]: - """Environment variables shared by all agentic-use container runs.""" - env: dict[str, str] = { - "NMP_BASE_URL": shared.nmp_base_url, - "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace", - "DATABASE_DIALECT": "sqlite", - "DATABASE_PATH": "/data/nmp-platform.db", - "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG, - "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH, - "NEMO_AGENTS_GATEWAY_READ_TIMEOUT": str(timeout_sec), - "NEMO_AGENTS_INVOKE_TIMEOUT": str(timeout_sec), - "AUT_INVOKE_HTTP_TIMEOUT": str(timeout_sec), - } - if DOCKER_SOCKET_HOST_PATH.exists(): - env["DOCKER_HOST"] = f"unix://{DOCKER_SOCKET_CONTAINER_PATH}" - return env - - -def with_candidate_params(env: dict[str, str], agent_params: dict[str, Any]) -> dict[str, str]: - if agent_params: - env = dict(env) - env["NAT_CANDIDATE_PARAMS"] = json.dumps(agent_params, sort_keys=True) - return env diff --git a/tests/agentic-use/runtimes/shared/layout.py b/tests/agentic-use/runtimes/shared/layout.py deleted file mode 100644 index a259de71af..0000000000 --- a/tests/agentic-use/runtimes/shared/layout.py +++ /dev/null @@ -1,72 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -"""Output directory layout for agentic-use runtime runs.""" - -from __future__ import annotations - -from dataclasses import dataclass -from datetime import UTC, datetime -from pathlib import Path - -from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunConfig, AgentEvalTask - -from runtimes.shared.config import AgenticSharedConfig - - -@dataclass(frozen=True) -class AgenticRunLayout: - """Filesystem layout for one task run.""" - - run_dir: Path - agent_log_dir: Path - workspace_dir: Path - state_dir: Path - instruction_path: Path - - -def default_jobs_dir(shared: AgenticSharedConfig) -> Path: - if shared.jobs_dir is not None: - return shared.jobs_dir - return shared.repo_root / "nat-jobs" - - -def new_run_dir(jobs_dir: Path, task_id: str) -> Path: - timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") - run_dir = jobs_dir / f"{timestamp}-{task_id}" - run_dir.mkdir(parents=True, exist_ok=True) - return run_dir - - -def resolve_run_layout( - task: AgentEvalTask, - shared: AgenticSharedConfig, - config: AgentEvalRunConfig | None = None, -) -> AgenticRunLayout: - """Resolve or create the on-disk layout for one task attempt.""" - if config is not None and config.output_dir is not None: - run_dir = Path(config.output_dir) - else: - run_dir = new_run_dir(default_jobs_dir(shared), task.id) - - agent_log_dir = run_dir / "agent" - workspace_dir = run_dir / "workspace" - state_dir = run_dir / "state" - agent_log_dir.mkdir(parents=True, exist_ok=True) - workspace_dir.mkdir(parents=True, exist_ok=True) - state_dir.mkdir(parents=True, exist_ok=True) - - instruction_path = agent_log_dir / "instruction.md" - instruction_path.write_text(task.intent, encoding="utf-8") - - return AgenticRunLayout( - run_dir=run_dir, - agent_log_dir=agent_log_dir, - workspace_dir=workspace_dir, - state_dir=state_dir, - instruction_path=instruction_path, - ) - - -def task_image_tag(task_id: str) -> str: - return f"nmp-nat-{task_id}:latest" diff --git 
a/tests/agentic-use/runtimes/shared/metrics.py b/tests/agentic-use/runtimes/shared/metrics.py deleted file mode 100644 index 7c68a590ec..0000000000 --- a/tests/agentic-use/runtimes/shared/metrics.py +++ /dev/null @@ -1,51 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Default metrics for agentic-use agent-eval runs.""" - -from __future__ import annotations - -from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult - - -class AgentPhaseSuccessMetric: - """Score 1.0 when the agent phase exited successfully, else 0.0.""" - - @property - def type(self) -> str: - return "agentic_use_agent_phase" - - def output_spec(self) -> list[MetricOutputSpec]: - return [MetricOutputSpec.continuous_score("agent_phase_success")] - - async def compute_scores(self, input: MetricInput) -> MetricResult: - agent_ok = bool(input.candidate.metadata.get("agent_ok")) - return MetricResult( - outputs=[MetricOutput(name="agent_phase_success", value=1.0 if agent_ok else 0.0)], - ) - - -class VerifierRewardMetric: - """Compatibility metric mirroring the legacy pytest verifier reward. - - Reads the verifier outcome that ``nat_runner`` records in ``result.json`` - (projected onto attempt metadata as ``reward``/``passed``) so existing - ``tests/test_outputs.py`` verifiers can score through the Evaluator SDK - while task-specific metrics are authored. 
- """ - - @property - def type(self) -> str: - return "agentic_use_verifier_reward" - - def output_spec(self) -> list[MetricOutputSpec]: - return [MetricOutputSpec.continuous_score("verifier_reward")] - - async def compute_scores(self, input: MetricInput) -> MetricResult: - metadata = input.candidate.metadata - reward = metadata.get("reward") - if reward is None: - reward = 1.0 if metadata.get("passed") else 0.0 - return MetricResult( - outputs=[MetricOutput(name="verifier_reward", value=float(reward))], - ) diff --git a/tests/agentic-use/runtimes/shared/platform.py b/tests/agentic-use/runtimes/shared/platform.py new file mode 100644 index 0000000000..721d717e6f --- /dev/null +++ b/tests/agentic-use/runtimes/shared/platform.py @@ -0,0 +1,791 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""NeMo-Platform glue that sits on top of the generic agent-eval SDK. + +Everything generic (Docker helpers, the environment boundary, environment +authoring, gating, attempt-status/evidence helpers, the verifier mechanic) now +lives in ``nemo_evaluator_sdk.agent_eval`` and is imported directly where used. + +This single module holds only the pieces that are specific to the agentic-use +benchmark and therefore do not belong in the SDK: + +* run layout with the platform ``state_dir`` and the ``nmp-nat-`` image tag, +* a ``DockerEnvironmentProvider`` defaulting to that platform image tag, +* default metrics (``AgentPhaseSuccessMetric`` namespace + ``VerifierRewardMetric``), +* agent-log/usage parsing and the shared container env, +* attempt construction from live artifacts and from ``nat_runner`` ``result.json``, +* the live VERIFY phase wired through the SDK environment boundary, +* the agentic-use task loader. 
+""" + +from __future__ import annotations + +import json +import textwrap +import tomllib +from collections.abc import Callable +from dataclasses import dataclass +from datetime import UTC, datetime +from pathlib import Path +from typing import Any, TypedDict + +from evaluator_agent_eval.artifacts import AgentArtifacts +from evaluator_agent_eval.schemas import ( + AgentAttemptInput, + AgentAttemptMetadata, + AgentAttemptOutput, + AgentAttemptTrace, + CapturedAgentAttempt, +) +from nemo_evaluator_sdk.agent_eval.attempts import resolve_attempt_status, standard_evidence_descriptors +from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric as _SDKAgentPhaseSuccessMetric +from nemo_evaluator_sdk.agent_eval.runtimes.environment import ( + AgentEnvironmentHandle, + EnvRunSpec, +) +from nemo_evaluator_sdk.agent_eval.runtimes.environment import ( + DockerEnvironmentProvider as _SDKDockerEnvironmentProvider, +) +from nemo_evaluator_sdk.agent_eval.runtimes.layout import prepare_run_layout, resolve_run_dir +from nemo_evaluator_sdk.agent_eval.runtimes.verify import ( + VerifierOutcome, + collect_verifier_outcome, + skipped_outcome, +) +from nemo_evaluator_sdk.agent_eval.types import ( + AgentEvalAttempt, + AgentEvalRunConfig, + AgentEvalTask, + AgentOutput, +) +from nemo_evaluator_sdk.metrics.protocol import ( + Metric, + MetricInput, + MetricOutput, + MetricOutputSpec, + MetricResult, +) +from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor + +from runtimes.shared.config import AgenticRuntimeName, AgenticSharedConfig +from runtimes.shared.constants import ( + AGENTIC_USE_DIR, + DOCKER_SOCKET_CONTAINER_PATH, + DOCKER_SOCKET_HOST_PATH, + EVALUATOR_SDK_SRC, + FILES_STORAGE_CONFIG, + PLATFORM_CONFIG_PATH, + SHARED_DIR, +) + +__all__ = [ + "AgenticRunLayout", + "AgentPhaseSuccessMetric", + "DockerEnvironmentProvider", + "ResultDirAttemptSource", + "VerifierRewardMetric", + "agent_log_has_workflow_error", + 
"agentic_task_from_dir", + "attempt_from_result", + "attempt_from_result_dir", + "base_container_env", + "build_agent_eval_attempt", + "build_verify_run_spec", + "extract_usage_metrics", + "iter_agent_log_json_payloads", + "load_task_toml", + "maybe_run_verify", + "resolve_run_layout", + "run_verify", + "task_agent_timeout_sec", + "task_image_tag", + "to_captured_agent_attempt", + "verifier_log_dir", + "with_candidate_params", +] + + +# --------------------------------------------------------------------------- # +# Run layout + image tagging +# --------------------------------------------------------------------------- # +@dataclass(frozen=True) +class AgenticRunLayout: + """Filesystem layout for one task run. + + Extends the SDK's generic ``RunLayout`` shape with a platform-specific + ``state_dir`` (preserved platform/database state across agent + verifier). + """ + + run_dir: Path + agent_log_dir: Path + workspace_dir: Path + state_dir: Path + instruction_path: Path + + +def task_image_tag(task_id: str) -> str: + return f"nmp-nat-{task_id}:latest" + + +def default_jobs_dir(shared: AgenticSharedConfig) -> Path: + if shared.jobs_dir is not None: + return shared.jobs_dir + return shared.repo_root / "nat-jobs" + + +def new_run_dir(jobs_dir: Path, task_id: str) -> Path: + timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") + run_dir = jobs_dir / f"{timestamp}-{task_id}" + run_dir.mkdir(parents=True, exist_ok=True) + return run_dir + + +def resolve_run_layout( + task: AgentEvalTask, + shared: AgenticSharedConfig, + config: AgentEvalRunConfig | None = None, +) -> AgenticRunLayout: + """Resolve or create the on-disk layout for one task attempt.""" + output_dir = config.output_dir if config is not None else None + run_dir = resolve_run_dir(output_dir, lambda: new_run_dir(default_jobs_dir(shared), task.id)) + + # Generic agent/workspace dirs + written instruction come from the SDK helper. 
+ base = prepare_run_layout(run_dir, task.intent) + + # Platform extension: a preserved state dir for platform/db across phases. + state_dir = base.run_dir / "state" + state_dir.mkdir(parents=True, exist_ok=True) + + return AgenticRunLayout( + run_dir=base.run_dir, + agent_log_dir=base.agent_log_dir, + workspace_dir=base.workspace_dir, + state_dir=state_dir, + instruction_path=base.instruction_path, + ) + + +class DockerEnvironmentProvider(_SDKDockerEnvironmentProvider): + """Platform default: map ``task.id`` to ``nmp-nat-:latest``.""" + + def __init__(self, *, image_tag_fn: Callable[[str], str] = task_image_tag) -> None: + super().__init__(image_tag_fn=image_tag_fn) + + +# --------------------------------------------------------------------------- # +# Default metrics +# --------------------------------------------------------------------------- # +class AgentPhaseSuccessMetric(_SDKAgentPhaseSuccessMetric): + """Agentic-use namespaced agent-phase metric (output stays ``agent_phase_success``).""" + + metric_type = "agentic_use_agent_phase" + + +class VerifierRewardMetric: + """Compatibility metric mirroring the legacy pytest verifier reward. + + Reads the verifier outcome that ``nat_runner`` records in ``result.json`` + (projected onto attempt metadata as ``reward``/``passed``) so existing + ``tests/test_outputs.py`` verifiers can score through the Evaluator SDK + while task-specific metrics are authored. 
+ """ + + @property + def type(self) -> str: + return "agentic_use_verifier_reward" + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("verifier_reward")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + metadata = input.candidate.metadata + reward = metadata.get("reward") + if reward is None: + reward = 1.0 if metadata.get("passed") else 0.0 + return MetricResult( + outputs=[MetricOutput(name="verifier_reward", value=float(reward))], + ) + + +# --------------------------------------------------------------------------- # +# Agent-log parsing + token usage +# --------------------------------------------------------------------------- # +class TokenMetrics(TypedDict): + prompt_tokens: int | None + completion_tokens: int | None + total_tokens: int | None + cache_creation_tokens: int | None + cache_read_tokens: int | None + n_assistant_messages: int | None + cost_usd: float | None + num_turns: int | None + duration_ms: float | None + + +def extract_usage_metrics(agent_log: str) -> dict[str, int | float | None]: + """Extract token usage metrics from an agent log.""" + import nat_runner + + metrics = nat_runner._extract_usage_metrics(agent_log) + return dict(metrics) + + +def iter_agent_log_json_payloads(agent_log: str) -> list[dict[str, Any]]: + """Return JSON dict payloads embedded in an agent log, newest-first after the full log.""" + candidates = [agent_log.strip()] + lines = [line.strip() for line in agent_log.splitlines() if line.strip()] + if lines: + candidates.append(lines[-1]) + candidates.extend(reversed(lines)) + + payloads: list[dict[str, Any]] = [] + seen: set[str] = set() + for candidate in candidates: + if not candidate or candidate in seen: + continue + seen.add(candidate) + try: + parsed = json.loads(candidate) + except json.JSONDecodeError: + continue + if isinstance(parsed, dict): + payloads.append(parsed) + return payloads + + +def agent_log_has_workflow_error(agent_log: str) -> 
bool: + """Detect AUT workflow errors returned as successful HTTP JSON payloads.""" + for payload in iter_agent_log_json_payloads(agent_log): + if payload.get("code") == "workflow_error": + return True + return False + + +# --------------------------------------------------------------------------- # +# Shared container environment +# --------------------------------------------------------------------------- # +def base_container_env(shared: AgenticSharedConfig, *, timeout_sec: int) -> dict[str, str]: + """Environment variables shared by all agentic-use container runs.""" + env: dict[str, str] = { + "NMP_BASE_URL": shared.nmp_base_url, + "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace", + "DATABASE_DIALECT": "sqlite", + "DATABASE_PATH": "/data/nmp-platform.db", + "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG, + "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH, + "NEMO_AGENTS_GATEWAY_READ_TIMEOUT": str(timeout_sec), + "NEMO_AGENTS_INVOKE_TIMEOUT": str(timeout_sec), + "AUT_INVOKE_HTTP_TIMEOUT": str(timeout_sec), + } + if DOCKER_SOCKET_HOST_PATH.exists(): + env["DOCKER_HOST"] = f"unix://{DOCKER_SOCKET_CONTAINER_PATH}" + return env + + +def with_candidate_params(env: dict[str, str], agent_params: dict[str, Any]) -> dict[str, str]: + if agent_params: + env = dict(env) + env["NAT_CANDIDATE_PARAMS"] = json.dumps(agent_params, sort_keys=True) + return env + + +# --------------------------------------------------------------------------- # +# Attempt construction from live artifacts +# --------------------------------------------------------------------------- # +def build_agent_eval_attempt( + *, + task: AgentEvalTask, + layout: AgenticRunLayout, + runtime_name: AgenticRuntimeName, + agent_model: str, + exit_code: int, + agent_ok: bool, + run_id: str | None = None, + repo_revision: str | None = None, + duration_ms: int | None = None, +) -> AgentEvalAttempt: + """Build an SDK attempt from on-disk agent artifacts. 
+ + Metadata uses the same canonical keys as :class:`CapturedAgentAttempt` + (``agent_runtime``, ``agent_model``, ``exit_code``, …) so verify/scoring + helpers can consume attempts without a second adapter. + """ + artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir) + log_text = _read_agent_log(layout.agent_log_dir) + usage = extract_usage_metrics(log_text) + duration = duration_ms if duration_ms is not None else usage.get("duration_ms") + + output_text = artifacts.final_answer.text if artifacts.final_answer.extracted else None + raw_log_paths = _raw_log_paths(artifacts.agent_log_dir) + initial_state = task.inputs.get("filesystem") + descriptors = _evidence_descriptors( + layout, artifacts, initial_state_ref=str(initial_state) if initial_state else None + ) + + metadata: dict[str, object] = { + # Canonical CapturedAgentAttempt fields + "agent_runtime": runtime_name, + "agent_model": agent_model, + "agent_runtime_version": None, + "repo_revision": repo_revision, + "run_id": run_id, + "exit_code": exit_code, + "duration_ms": duration, + # SDK / orchestration extensions + "model_id": agent_model, + "target_name": agent_model, + "attempt_id": f"{task.id}:{runtime_name}", + "agent_ok": agent_ok, + "agent_log_dir": str(layout.agent_log_dir), + "workspace_dir": str(layout.workspace_dir), + "state_dir": str(layout.state_dir), + "run_dir": str(layout.run_dir), + "instruction_path": task.metadata.get("instruction_path"), + "final_answer_extracted": artifacts.final_answer.extracted, + "final_answer_source": artifacts.final_answer.source, + "raw_log_paths": raw_log_paths, + "atif_trajectory_path": str(artifacts.atif_trajectory_path) if artifacts.atif_trajectory_path else None, + **usage, + } + + status = resolve_attempt_status(agent_ok) + if output_text: + output = AgentOutput(text=output_text) + elif agent_ok: + output = AgentOutput(text=log_text.strip() or "") + else: + output = AgentOutput(text=log_text.strip() or "(agent phase 
failed)") + + return AgentEvalAttempt( + id=f"{task.id}:{runtime_name}", + task_id=task.id, + status=status, + output=output, + evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None, + metadata=metadata, + ) + + +def to_captured_agent_attempt(task: AgentEvalTask, attempt: AgentEvalAttempt) -> CapturedAgentAttempt: + """Project an SDK attempt onto the portable CapturedAgentAttempt schema.""" + metadata = attempt.metadata + trace_path = metadata.get("atif_trajectory_path") + return CapturedAgentAttempt( + task_id=attempt.task_id, + input=AgentAttemptInput( + instruction_text=task.intent, + instruction_path=str(metadata.get("instruction_path")) if metadata.get("instruction_path") else None, + ), + output=AgentAttemptOutput( + final_text=attempt.output.text if attempt.output is not None else "", + final_answer_extracted=bool(metadata.get("final_answer_extracted")), + final_answer_source=str(metadata.get("final_answer_source")) + if metadata.get("final_answer_source") is not None + else None, + raw_log_paths=list(metadata.get("raw_log_paths") or []), + ), + metadata=AgentAttemptMetadata( + agent_runtime=str(metadata.get("agent_runtime", "unknown")), + agent_model=str(metadata.get("agent_model", "unknown")), + agent_runtime_version=str(metadata["agent_runtime_version"]) + if metadata.get("agent_runtime_version") is not None + else None, + repo_revision=str(metadata["repo_revision"]) if metadata.get("repo_revision") is not None else None, + run_id=str(metadata["run_id"]) if metadata.get("run_id") is not None else None, + exit_code=int(metadata["exit_code"]) if isinstance(metadata.get("exit_code"), int) else None, + duration_ms=int(metadata["duration_ms"]) if isinstance(metadata.get("duration_ms"), int | float) else None, + ), + trace=AgentAttemptTrace(atif_path=str(trace_path)) if trace_path else None, + ) + + +def _evidence_descriptors( + layout: AgenticRunLayout, + artifacts: AgentArtifacts, + *, + initial_state_ref: str | None = None, +) -> 
dict[str, EvidenceDescriptor]: + """Compose the SDK's standard evidence keys + the platform ``state`` extension. + + The doc-standard keys (``initial_state``/``trace``/``logs``/``final_state``/ + ``verifier_logs``) come from :func:`standard_evidence_descriptors`. ``state`` + is a NeMo-Platform-specific *extension* (not a doc key): it carries the + preserved platform/database state across the agent + verifier phases. + """ + descriptors = standard_evidence_descriptors( + logs_dir=layout.agent_log_dir, + final_state_dir=layout.workspace_dir, + trace_path=artifacts.atif_trajectory_path, + initial_state_ref=initial_state_ref, + verifier_logs_dir=layout.run_dir / "verifier", + primary_log="nat_agent.log", + ) + + # Platform extension (non-doc key): preserved platform/db state across phases. + descriptors["state"] = EvidenceDescriptor( + kind="filesystem", + format="dir", + ref=str(layout.state_dir), + metadata={"role": "platform_state", "extension": "nemo-platform"}, + ) + + return descriptors + + +def _raw_log_paths(agent_log_dir: Path) -> list[str]: + if not agent_log_dir.is_dir(): + return [] + return [str(path.relative_to(agent_log_dir)) for path in sorted(agent_log_dir.iterdir()) if path.is_file()] + + +def _read_agent_log(agent_log_dir: Path) -> str: + log_path = agent_log_dir / "nat_agent.log" + if log_path.is_file(): + return log_path.read_text(encoding="utf-8", errors="replace") + return "" + + +# --------------------------------------------------------------------------- # +# Attempt construction from nat_runner result.json +# --------------------------------------------------------------------------- # +# Token/cost measurement keys carried in result.json["metrics"]. 
+_METRIC_KEYS = ( + "prompt_tokens", + "completion_tokens", + "total_tokens", + "cache_creation_tokens", + "cache_read_tokens", + "n_assistant_messages", + "cost_usd", + "num_turns", + "duration_ms", + "token_metrics_status", + "token_metrics_note", +) + + +class ResultDirAttemptSource: + """``AgentAttemptSource`` adapting ``nat_runner`` ``result.json`` dirs into attempts. + + Implements the SDK :class:`~nemo_evaluator_sdk.agent_eval.types.AgentAttemptSource` + protocol so the generic orchestrator's offline path can rescore captured runs. + """ + + def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt: + return attempt_from_result_dir(source, task=task) + + +def attempt_from_result_dir(output_dir: str | Path, *, task: AgentEvalTask | None = None) -> AgentEvalAttempt: + """Load ``/result.json`` and build an attempt from it.""" + output_dir = Path(output_dir) + result_path = output_dir / "result.json" + if not result_path.is_file(): + raise FileNotFoundError(f"result.json not found in {output_dir}") + result = json.loads(result_path.read_text(encoding="utf-8")) + return attempt_from_result(result, output_dir=output_dir, task=task) + + +def attempt_from_result( + result: dict[str, Any], + *, + output_dir: str | Path | None = None, + task: AgentEvalTask | None = None, +) -> AgentEvalAttempt: + """Project a ``result.json`` dict onto :class:`AgentEvalAttempt`. + + The attempt ``status`` reflects whether the agent produced a usable + response (``agent`` phase outcome). Pass/fail from the verifier is recorded + as a *measurement* in metadata (``reward``/``passed``) so scoring metrics — + not the runtime — remain the source of truth. 
+ """ + task_id = str(result.get("task") or (task.id if task is not None else "unknown")) + backend = str(result.get("agent_backend") or "unknown") + resolved_dir = Path(output_dir) if output_dir is not None else Path(str(result.get("output_dir") or ".")) + layout = _layout_from_result_dir(resolved_dir) + + agent_phase = str(result.get("agent") or "") + agent_ok = agent_phase in {"ok", "skipped"} + status = resolve_attempt_status(agent_ok) + + output_text, final_extracted, final_source = _resolve_output_text(layout) + if not output_text: + output_text = "" if agent_ok else "(agent phase failed)" + + descriptors = _evidence_descriptors( + layout, AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir) + ) + + metrics = dict(result.get("metrics") or {}) + metadata: dict[str, Any] = { + # Canonical CapturedAgentAttempt-style provenance fields. + "agent_runtime": backend, + "agent_model": result.get("agent_model"), + "run_id": (result.get("provenance") or {}).get("run_id"), + "exit_code": 0 if agent_ok else 1, + "duration_ms": metrics.get("duration_ms"), + # Phase outcomes from result.json. + "agent_ok": agent_ok, + "build_status": result.get("build"), + "agent_status": result.get("agent"), + "verify_status": result.get("verify"), + # Measurements (verifier reward is a measurement, not attempt status). + "passed": result.get("passed"), + "reward": result.get("reward"), + "runtime_sec": result.get("runtime_sec"), + "verifier_scores": result.get("verifier_scores"), + # Provenance + candidate identity. + "provenance": result.get("provenance"), + "candidate_id": result.get("candidate_id"), + "candidate_params": result.get("candidate_params"), + "image": result.get("image"), + "output_dir": str(resolved_dir), + # Artifact discovery helpers. 
+ "agent_log_dir": str(layout.agent_log_dir), + "workspace_dir": str(layout.workspace_dir), + "state_dir": str(layout.state_dir), + "final_answer_extracted": final_extracted, + "final_answer_source": final_source, + } + metadata.update({key: metrics.get(key) for key in _METRIC_KEYS}) + + return AgentEvalAttempt( + id=f"{task_id}:{backend}", + task_id=task_id, + status=status, + output=AgentOutput(text=output_text), + evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None, + metadata=metadata, + ) + + +def _layout_from_result_dir(output_dir: Path) -> AgenticRunLayout: + agent_log_dir = output_dir / "agent" + return AgenticRunLayout( + run_dir=output_dir, + agent_log_dir=agent_log_dir, + workspace_dir=output_dir / "workspace", + state_dir=output_dir / "state", + instruction_path=agent_log_dir / "instruction.md", + ) + + +def _resolve_output_text(layout: AgenticRunLayout) -> tuple[str, bool, str | None]: + if not layout.agent_log_dir.is_dir(): + return "", False, None + artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir) + if artifacts.final_answer.extracted and artifacts.final_answer.text: + return artifacts.final_answer.text, True, artifacts.final_answer.source + log_path = layout.agent_log_dir / "nat_agent.log" + if log_path.is_file(): + return log_path.read_text(encoding="utf-8", errors="replace").strip(), False, None + return "", False, None + + +# --------------------------------------------------------------------------- # +# Live VERIFY phase through the SDK environment boundary +# --------------------------------------------------------------------------- # +def verifier_log_dir(layout: AgenticRunLayout) -> Path: + return layout.run_dir / "verifier" + + +def build_verify_run_spec( + task_dir: Path, + layout: AgenticRunLayout, + *, + nmp_base_url: str, + agent_backend: str, + agent_model: str, + smoke_workspace: str | None = None, + timeout_sec: int | None = None, + extra_args: list[str] | None = 
None, +) -> EnvRunSpec | None: + """Build the verifier ``EnvRunSpec`` mirroring ``nat_runner.run_verify_phase``. + + Returns ``None`` when the task has no ``tests/test_outputs.py`` (nothing to + verify), matching the runner's behavior. + """ + tests_dir = task_dir / "tests" + if not (tests_dir / "test_outputs.py").exists(): + return None + + log_dir = verifier_log_dir(layout) + log_dir.mkdir(parents=True, exist_ok=True) + layout.workspace_dir.mkdir(parents=True, exist_ok=True) + + smoke_seed_cmd = "" + smoke_cleanup_cmd = "" + if smoke_workspace: + smoke_seed_cmd = textwrap.dedent("""\ + /app/.venv/bin/nemo workspaces create "${SMOKE_WORKSPACE}" \ + --description "Seeded by agentic runtime smoke mode" >/dev/null 2>&1 || true + """) + smoke_cleanup_cmd = textwrap.dedent("""\ + /app/.venv/bin/nemo workspaces delete "${SMOKE_WORKSPACE}" >/dev/null 2>&1 || true + """) + + verify_cmd = [ + "bash", + "-c", + textwrap.dedent(f"""\ + export PYTHONPATH="/app/tests/agentic-use/shared:/app/packages/nemo_evaluator_sdk/src:${{PYTHONPATH}}" + export NAT_AGENT=1 + {smoke_seed_cmd} + /app/.venv/bin/python -m pytest /tests/test_outputs.py -rA -v 2>&1 | tee /logs/verifier/test-stdout.txt + EXIT=${{PIPESTATUS[0]}} + {smoke_cleanup_cmd} + if [ $EXIT -eq 0 ]; then echo 1; else echo 0; fi > /logs/verifier/reward.txt + exit $EXIT + """), + ] + + env: dict[str, str] = { + "NMP_BASE_URL": nmp_base_url, + "NAT_AGENT": "1", + "NAT_AGENT_BACKEND": agent_backend, + "NAT_AGENT_MODEL": agent_model, + "AGENTIC_USE_TASK_DIR": "/task", + "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace", + "SMOKE_WORKSPACE": smoke_workspace or "", + "DATABASE_DIALECT": "sqlite", + "DATABASE_PATH": "/data/nmp-platform.db", + "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG, + "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH, + } + if DOCKER_SOCKET_HOST_PATH.exists(): + env["DOCKER_HOST"] = f"unix://{DOCKER_SOCKET_CONTAINER_PATH}" + + mounts: list[tuple[str, str]] = [ + (str(tests_dir), "/tests"), + (str(task_dir), 
"/task"), + (str(layout.workspace_dir), "/app/workspace"), + (str(SHARED_DIR), "/app/tests/agentic-use/shared:ro"), + (str(EVALUATOR_SDK_SRC), "/app/packages/nemo_evaluator_sdk/src:ro"), + (str(layout.agent_log_dir), "/logs/agent"), + (str(log_dir), "/logs/verifier"), + # Persist platform/db state across AGENT and VERIFY containers. + (str(layout.state_dir), "/data"), + ] + if DOCKER_SOCKET_HOST_PATH.exists(): + mounts.append((str(DOCKER_SOCKET_HOST_PATH), DOCKER_SOCKET_CONTAINER_PATH)) + + return EnvRunSpec( + command=verify_cmd, + env=env, + mounts=mounts, + timeout=timeout_sec, + extra_args=list(extra_args or []), + ) + + +async def run_verify( + handle: AgentEnvironmentHandle, + spec: EnvRunSpec, + layout: AgenticRunLayout, +) -> VerifierOutcome: + """Execute the verifier through the environment handle and collect reward.""" + result = await handle.run_verifier(spec) + return collect_verifier_outcome( + ok=result.ok, + exit_code=result.exit_code, + log_dir=verifier_log_dir(layout), + ) + + +async def maybe_run_verify( + handle: AgentEnvironmentHandle, + *, + enabled: bool, + task_dir: Path, + layout: AgenticRunLayout, + nmp_base_url: str, + agent_backend: str, + agent_model: str, + smoke_workspace: str | None = None, + timeout_sec: int | None = None, + extra_args: list[str] | None = None, +) -> VerifierOutcome: + """Run the verifier through ``handle`` when enabled and a verifier exists.""" + if not enabled: + return skipped_outcome() + spec = build_verify_run_spec( + task_dir, + layout, + nmp_base_url=nmp_base_url, + agent_backend=agent_backend, + agent_model=agent_model, + smoke_workspace=smoke_workspace, + timeout_sec=timeout_sec, + extra_args=extra_args, + ) + if spec is None: + return skipped_outcome() + return await run_verify(handle, spec, layout) + + +# --------------------------------------------------------------------------- # +# Agentic-use task loader +# --------------------------------------------------------------------------- # +def 
load_task_toml(task_dir: Path) -> dict[str, object]: + task_toml = task_dir / "task.toml" + if not task_toml.exists(): + return {} + try: + with task_toml.open("rb") as handle: + data = tomllib.load(handle) + except Exception: + return {} + return data if isinstance(data, dict) else {} + + +def task_agent_timeout_sec(task_dir: Path) -> int | None: + data = load_task_toml(task_dir) + agent = data.get("agent") + if not isinstance(agent, dict): + return None + timeout_value = agent.get("timeout_sec") + if isinstance(timeout_value, (int, float)) and timeout_value > 0: + return int(timeout_value) + return None + + +def agentic_task_from_dir( + task_dir: str | Path, + *, + tasks_root: Path | None = None, + metrics: list[Metric] | None = None, +) -> AgentEvalTask: + """Build an :class:`AgentEvalTask` from an agentic-use task directory. + + ``inputs`` carries only agent-facing material (``instruction``) per the SDK + design doc; runtime materialization details such as ``task_dir`` live in + ``metadata`` so they cannot leak into metric scoring rows. Metrics are + authored *on the task* (defaulting to :class:`AgentPhaseSuccessMetric`); the + orchestrator only appends compatibility metrics, it does not own the set. 
+ """ + root = Path(tasks_root or AGENTIC_USE_DIR) + task_path = Path(task_dir) + if not task_path.is_absolute(): + task_path = (root / task_path).resolve() + + instruction_path = task_path / "instruction.md" + if not instruction_path.exists(): + raise FileNotFoundError(f"instruction.md not found in {task_path}") + + instruction = instruction_path.read_text(encoding="utf-8") + task_toml = load_task_toml(task_path) + + return AgentEvalTask( + id=task_path.name, + intent=instruction, + inputs={ + "instruction": instruction, + }, + metrics=metrics if metrics is not None else [AgentPhaseSuccessMetric()], + metadata={ + "benchmark": "agentic-use", + "task_toml": task_toml, + "instruction_path": str(instruction_path), + "task_dir": str(task_path), + }, + ) diff --git a/tests/agentic-use/runtimes/shared/result_adapter.py b/tests/agentic-use/runtimes/shared/result_adapter.py deleted file mode 100644 index e8162f9ded..0000000000 --- a/tests/agentic-use/runtimes/shared/result_adapter.py +++ /dev/null @@ -1,145 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Adapt ``nat_runner`` ``result.json`` records into ``AgentEvalAttempt`` values. - -This bridges the existing ``nat_runner`` output contract (see -``nat_runner._write_result``) onto the agent-eval SDK so a run that already -produced ``result.json`` can be imported as an attempt without re-executing the -agent. Per the design doc, ``result.json`` carries the attempt *status*, -*measurements* (reward + token/cost), and *provenance*. 
-""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any - -from evaluator_agent_eval.artifacts import AgentArtifacts -from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalTask, AgentOutput -from nemo_evaluator_sdk.values.evidence import CandidateEvidence - -from runtimes.shared.artifacts import _evidence_descriptors, resolve_attempt_status # reuse documented helpers -from runtimes.shared.layout import AgenticRunLayout - -# Token/cost measurement keys carried in result.json["metrics"]. -_METRIC_KEYS = ( - "prompt_tokens", - "completion_tokens", - "total_tokens", - "cache_creation_tokens", - "cache_read_tokens", - "n_assistant_messages", - "cost_usd", - "num_turns", - "duration_ms", - "token_metrics_status", - "token_metrics_note", -) - - -def attempt_from_result_dir(output_dir: str | Path, *, task: AgentEvalTask | None = None) -> AgentEvalAttempt: - """Load ``/result.json`` and build an attempt from it.""" - output_dir = Path(output_dir) - result_path = output_dir / "result.json" - if not result_path.is_file(): - raise FileNotFoundError(f"result.json not found in {output_dir}") - result = json.loads(result_path.read_text(encoding="utf-8")) - return attempt_from_result(result, output_dir=output_dir, task=task) - - -def attempt_from_result( - result: dict[str, Any], - *, - output_dir: str | Path | None = None, - task: AgentEvalTask | None = None, -) -> AgentEvalAttempt: - """Project a ``result.json`` dict onto :class:`AgentEvalAttempt`. - - The attempt ``status`` reflects whether the agent produced a usable - response (``agent`` phase outcome). Pass/fail from the verifier is recorded - as a *measurement* in metadata (``reward``/``passed``) so scoring metrics — - not the runtime — remain the source of truth. 
- """ - task_id = str(result.get("task") or (task.id if task is not None else "unknown")) - backend = str(result.get("agent_backend") or "unknown") - resolved_dir = Path(output_dir) if output_dir is not None else Path(str(result.get("output_dir") or ".")) - layout = _layout_from_result_dir(resolved_dir) - - agent_phase = str(result.get("agent") or "") - agent_ok = agent_phase in {"ok", "skipped"} - status = resolve_attempt_status(agent_ok) - - output_text, final_extracted, final_source = _resolve_output_text(layout) - if not output_text: - output_text = "" if agent_ok else "(agent phase failed)" - - descriptors = _evidence_descriptors( - layout, AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir) - ) - - metrics = dict(result.get("metrics") or {}) - metadata: dict[str, Any] = { - # Canonical CapturedAgentAttempt-style provenance fields. - "agent_runtime": backend, - "agent_model": result.get("agent_model"), - "run_id": (result.get("provenance") or {}).get("run_id"), - "exit_code": 0 if agent_ok else 1, - "duration_ms": metrics.get("duration_ms"), - # Phase outcomes from result.json. - "agent_ok": agent_ok, - "build_status": result.get("build"), - "agent_status": result.get("agent"), - "verify_status": result.get("verify"), - # Measurements (verifier reward is a measurement, not attempt status). - "passed": result.get("passed"), - "reward": result.get("reward"), - "runtime_sec": result.get("runtime_sec"), - "verifier_scores": result.get("verifier_scores"), - # Provenance + candidate identity. - "provenance": result.get("provenance"), - "candidate_id": result.get("candidate_id"), - "candidate_params": result.get("candidate_params"), - "image": result.get("image"), - "output_dir": str(resolved_dir), - # Artifact discovery helpers. 
- "agent_log_dir": str(layout.agent_log_dir), - "workspace_dir": str(layout.workspace_dir), - "state_dir": str(layout.state_dir), - "final_answer_extracted": final_extracted, - "final_answer_source": final_source, - } - metadata.update({key: metrics.get(key) for key in _METRIC_KEYS}) - - return AgentEvalAttempt( - id=f"{task_id}:{backend}", - task_id=task_id, - status=status, - output=AgentOutput(text=output_text), - evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None, - metadata=metadata, - ) - - -def _layout_from_result_dir(output_dir: Path) -> AgenticRunLayout: - agent_log_dir = output_dir / "agent" - return AgenticRunLayout( - run_dir=output_dir, - agent_log_dir=agent_log_dir, - workspace_dir=output_dir / "workspace", - state_dir=output_dir / "state", - instruction_path=agent_log_dir / "instruction.md", - ) - - -def _resolve_output_text(layout: AgenticRunLayout) -> tuple[str, bool, str | None]: - if not layout.agent_log_dir.is_dir(): - return "", False, None - artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir) - if artifacts.final_answer.extracted and artifacts.final_answer.text: - return artifacts.final_answer.text, True, artifacts.final_answer.source - log_path = layout.agent_log_dir / "nat_agent.log" - if log_path.is_file(): - return log_path.read_text(encoding="utf-8", errors="replace").strip(), False, None - return "", False, None diff --git a/tests/agentic-use/runtimes/shared/task_loader.py b/tests/agentic-use/runtimes/shared/task_loader.py deleted file mode 100644 index e64a87e99d..0000000000 --- a/tests/agentic-use/runtimes/shared/task_loader.py +++ /dev/null @@ -1,80 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -"""Map agentic-use task directories to AgentEvalTask values.""" - -from __future__ import annotations - -import tomllib -from pathlib import Path - -from nemo_evaluator_sdk.agent_eval.types import AgentEvalTask -from nemo_evaluator_sdk.metrics.protocol import Metric - -from runtimes.shared.constants import AGENTIC_USE_DIR -from runtimes.shared.metrics import AgentPhaseSuccessMetric - - -def load_task_toml(task_dir: Path) -> dict[str, object]: - task_toml = task_dir / "task.toml" - if not task_toml.exists(): - return {} - try: - with task_toml.open("rb") as handle: - data = tomllib.load(handle) - except Exception: - return {} - return data if isinstance(data, dict) else {} - - -def task_agent_timeout_sec(task_dir: Path) -> int | None: - data = load_task_toml(task_dir) - agent = data.get("agent") - if not isinstance(agent, dict): - return None - timeout_value = agent.get("timeout_sec") - if isinstance(timeout_value, (int, float)) and timeout_value > 0: - return int(timeout_value) - return None - - -def agentic_task_from_dir( - task_dir: str | Path, - *, - tasks_root: Path | None = None, - metrics: list[Metric] | None = None, -) -> AgentEvalTask: - """Build an :class:`AgentEvalTask` from an agentic-use task directory. - - ``inputs`` carries only agent-facing material (``instruction``) per the SDK - design doc; runtime materialization details such as ``task_dir`` live in - ``metadata`` so they cannot leak into metric scoring rows. Metrics are - authored *on the task* (defaulting to :class:`AgentPhaseSuccessMetric`); the - orchestrator only appends compatibility metrics, it does not own the set. 
- """ - root = Path(tasks_root or AGENTIC_USE_DIR) - task_path = Path(task_dir) - if not task_path.is_absolute(): - task_path = (root / task_path).resolve() - - instruction_path = task_path / "instruction.md" - if not instruction_path.exists(): - raise FileNotFoundError(f"instruction.md not found in {task_path}") - - instruction = instruction_path.read_text(encoding="utf-8") - task_toml = load_task_toml(task_path) - - return AgentEvalTask( - id=task_path.name, - intent=instruction, - inputs={ - "instruction": instruction, - }, - metrics=metrics if metrics is not None else [AgentPhaseSuccessMetric()], - metadata={ - "benchmark": "agentic-use", - "task_toml": task_toml, - "instruction_path": str(instruction_path), - "task_dir": str(task_path), - }, - ) diff --git a/tests/agentic-use/runtimes/shared/usage.py b/tests/agentic-use/runtimes/shared/usage.py deleted file mode 100644 index 89053ffb97..0000000000 --- a/tests/agentic-use/runtimes/shared/usage.py +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Token usage extraction from agent logs. - -Reuses the proven implementation from ``nat_runner.py`` until the legacy -runner delegates here and the duplicate can be removed. 
-""" - -from __future__ import annotations - -from typing import TypedDict - - -class TokenMetrics(TypedDict): - prompt_tokens: int | None - completion_tokens: int | None - total_tokens: int | None - cache_creation_tokens: int | None - cache_read_tokens: int | None - n_assistant_messages: int | None - cost_usd: float | None - num_turns: int | None - duration_ms: float | None - - -def extract_usage_metrics(agent_log: str) -> dict[str, int | float | None]: - """Extract token usage metrics from an agent log.""" - import nat_runner - - metrics = nat_runner._extract_usage_metrics(agent_log) - return dict(metrics) diff --git a/tests/agentic-use/runtimes/shared/verify.py b/tests/agentic-use/runtimes/shared/verify.py deleted file mode 100644 index 8be53924b9..0000000000 --- a/tests/agentic-use/runtimes/shared/verify.py +++ /dev/null @@ -1,208 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Live VERIFY phase executed through the environment boundary. - -Ports ``nat_runner.run_verify_phase`` onto :meth:`AgentEnvironmentHandle.run_verifier` -so the task-local ``tests/test_outputs.py`` pytest verifier runs in the *same* -prepared environment (and against the same persisted workspace/state) as the -agent phase. The resulting reward is stamped onto the attempt metadata so the -``VerifierRewardMetric`` compatibility metric scores it through the Evaluator SDK. 
-""" - -from __future__ import annotations - -import textwrap -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -from runtimes.shared.constants import ( - DOCKER_SOCKET_CONTAINER_PATH, - DOCKER_SOCKET_HOST_PATH, - EVALUATOR_SDK_SRC, - FILES_STORAGE_CONFIG, - PLATFORM_CONFIG_PATH, - SHARED_DIR, -) -from runtimes.shared.environment import AgentEnvironmentHandle, EnvRunSpec -from runtimes.shared.layout import AgenticRunLayout - - -@dataclass(frozen=True) -class VerifierOutcome: - """Result of the live verifier phase for one task.""" - - ran: bool - passed: bool - reward: int - exit_code: int - stdout: str - verifier_log_dir: Path | None - - -def verifier_log_dir(layout: AgenticRunLayout) -> Path: - return layout.run_dir / "verifier" - - -def build_verify_run_spec( - task_dir: Path, - layout: AgenticRunLayout, - *, - nmp_base_url: str, - agent_backend: str, - agent_model: str, - smoke_workspace: str | None = None, - timeout_sec: int | None = None, - extra_args: list[str] | None = None, -) -> EnvRunSpec | None: - """Build the verifier ``EnvRunSpec`` mirroring ``nat_runner.run_verify_phase``. - - Returns ``None`` when the task has no ``tests/test_outputs.py`` (nothing to - verify), matching the runner's behavior. 
- """ - tests_dir = task_dir / "tests" - if not (tests_dir / "test_outputs.py").exists(): - return None - - log_dir = verifier_log_dir(layout) - log_dir.mkdir(parents=True, exist_ok=True) - layout.workspace_dir.mkdir(parents=True, exist_ok=True) - - smoke_seed_cmd = "" - smoke_cleanup_cmd = "" - if smoke_workspace: - smoke_seed_cmd = textwrap.dedent("""\ - /app/.venv/bin/nemo workspaces create "${SMOKE_WORKSPACE}" \ - --description "Seeded by agentic runtime smoke mode" >/dev/null 2>&1 || true - """) - smoke_cleanup_cmd = textwrap.dedent("""\ - /app/.venv/bin/nemo workspaces delete "${SMOKE_WORKSPACE}" >/dev/null 2>&1 || true - """) - - verify_cmd = [ - "bash", - "-c", - textwrap.dedent(f"""\ - export PYTHONPATH="/app/tests/agentic-use/shared:/app/packages/nemo_evaluator_sdk/src:${{PYTHONPATH}}" - export NAT_AGENT=1 - {smoke_seed_cmd} - /app/.venv/bin/python -m pytest /tests/test_outputs.py -rA -v 2>&1 | tee /logs/verifier/test-stdout.txt - EXIT=${{PIPESTATUS[0]}} - {smoke_cleanup_cmd} - if [ $EXIT -eq 0 ]; then echo 1; else echo 0; fi > /logs/verifier/reward.txt - exit $EXIT - """), - ] - - env: dict[str, str] = { - "NMP_BASE_URL": nmp_base_url, - "NAT_AGENT": "1", - "NAT_AGENT_BACKEND": agent_backend, - "NAT_AGENT_MODEL": agent_model, - "AGENTIC_USE_TASK_DIR": "/task", - "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace", - "SMOKE_WORKSPACE": smoke_workspace or "", - "DATABASE_DIALECT": "sqlite", - "DATABASE_PATH": "/data/nmp-platform.db", - "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG, - "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH, - } - if DOCKER_SOCKET_HOST_PATH.exists(): - env["DOCKER_HOST"] = f"unix://{DOCKER_SOCKET_CONTAINER_PATH}" - - mounts: list[tuple[str, str]] = [ - (str(tests_dir), "/tests"), - (str(task_dir), "/task"), - (str(layout.workspace_dir), "/app/workspace"), - (str(SHARED_DIR), "/app/tests/agentic-use/shared:ro"), - (str(EVALUATOR_SDK_SRC), "/app/packages/nemo_evaluator_sdk/src:ro"), - (str(layout.agent_log_dir), "/logs/agent"), - 
(str(log_dir), "/logs/verifier"), - # Persist platform/db state across AGENT and VERIFY containers. - (str(layout.state_dir), "/data"), - ] - if DOCKER_SOCKET_HOST_PATH.exists(): - mounts.append((str(DOCKER_SOCKET_HOST_PATH), DOCKER_SOCKET_CONTAINER_PATH)) - - return EnvRunSpec( - command=verify_cmd, - env=env, - mounts=mounts, - timeout=timeout_sec, - extra_args=list(extra_args or []), - ) - - -async def run_verify( - handle: AgentEnvironmentHandle, - spec: EnvRunSpec, - layout: AgenticRunLayout, -) -> VerifierOutcome: - """Execute the verifier through the environment handle and collect reward.""" - result = await handle.run_verifier(spec) - log_dir = verifier_log_dir(layout) - passed = result.ok - - stdout = "" - stdout_path = log_dir / "test-stdout.txt" - if stdout_path.is_file(): - stdout = stdout_path.read_text(encoding="utf-8", errors="replace") - - reward_path = log_dir / "reward.txt" - if reward_path.is_file(): - reward = 1 if reward_path.read_text(encoding="utf-8").strip() == "1" else 0 - else: - reward = 1 if passed else 0 - reward_path.write_text("1\n" if passed else "0\n", encoding="utf-8") - - return VerifierOutcome( - ran=True, - passed=passed, - reward=reward, - exit_code=result.exit_code, - stdout=stdout, - verifier_log_dir=log_dir, - ) - - -async def maybe_run_verify( - handle: AgentEnvironmentHandle, - *, - enabled: bool, - task_dir: Path, - layout: AgenticRunLayout, - nmp_base_url: str, - agent_backend: str, - agent_model: str, - smoke_workspace: str | None = None, - timeout_sec: int | None = None, - extra_args: list[str] | None = None, -) -> VerifierOutcome: - """Run the verifier through ``handle`` when enabled and a verifier exists.""" - if not enabled: - return VerifierOutcome(ran=False, passed=False, reward=0, exit_code=0, stdout="", verifier_log_dir=None) - spec = build_verify_run_spec( - task_dir, - layout, - nmp_base_url=nmp_base_url, - agent_backend=agent_backend, - agent_model=agent_model, - smoke_workspace=smoke_workspace, - 
timeout_sec=timeout_sec, - extra_args=extra_args, - ) - if spec is None: - return VerifierOutcome(ran=False, passed=False, reward=0, exit_code=0, stdout="", verifier_log_dir=None) - return await run_verify(handle, spec, layout) - - -def apply_verify_to_metadata(metadata: dict[str, Any], outcome: VerifierOutcome) -> None: - """Stamp verifier reward/status onto attempt metadata for scoring + gating.""" - if not outcome.ran: - metadata.setdefault("verify_status", "skipped") - return - metadata["verify_status"] = "ok" if outcome.passed else "failed" - metadata["passed"] = outcome.passed - metadata["reward"] = outcome.reward - metadata["verifier_log_dir"] = str(outcome.verifier_log_dir) if outcome.verifier_log_dir else None diff --git a/tests/agentic-use/runtimes/workflow/runtime.py b/tests/agentic-use/runtimes/workflow/runtime.py index 1d8c09fecd..55688b3d24 100644 --- a/tests/agentic-use/runtimes/workflow/runtime.py +++ b/tests/agentic-use/runtimes/workflow/runtime.py @@ -8,20 +8,21 @@ from collections.abc import Sequence from pathlib import Path +from nemo_evaluator_sdk.agent_eval.runtimes.environment import AgentEnvironmentProvider, EnvRunSpec +from nemo_evaluator_sdk.agent_eval.runtimes.verify import apply_verify_to_metadata from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalRunConfig, AgentEvalTask -from runtimes.shared.artifacts import build_agent_eval_attempt from runtimes.shared.config import WorkflowRuntimeConfig from runtimes.shared.constants import INSTRUCTION_CONTAINER_PATH, WORKFLOW_CONTAINER_PATH -from runtimes.shared.container_env import base_container_env -from runtimes.shared.environment import ( - AgentEnvironmentProvider, +from runtimes.shared.platform import ( + AgenticRunLayout, DockerEnvironmentProvider, - EnvRunSpec, + base_container_env, + build_agent_eval_attempt, + maybe_run_verify, + resolve_run_layout, + task_agent_timeout_sec, ) -from runtimes.shared.layout import AgenticRunLayout, resolve_run_layout -from 
runtimes.shared.task_loader import task_agent_timeout_sec -from runtimes.shared.verify import apply_verify_to_metadata, maybe_run_verify from runtimes.workflow.command import build_workflow_agent_cmd from runtimes.workflow.prep import prepare_workflow_for_runtime diff --git a/tests/agentic-use/tests/test_agentic_runtimes.py b/tests/agentic-use/tests/test_agentic_runtimes.py index 935ddf7389..1903989705 100644 --- a/tests/agentic-use/tests/test_agentic_runtimes.py +++ b/tests/agentic-use/tests/test_agentic_runtimes.py @@ -10,10 +10,9 @@ import pytest import yaml +from nemo_evaluator_sdk.agent_eval.runtimes.environment import EnvCommandResult, EnvRunSpec from runtimes.shared.config import AgenticSharedConfig, WorkflowRuntimeConfig -from runtimes.shared.environment import EnvCommandResult, EnvRunSpec -from runtimes.shared.layout import resolve_run_layout, task_image_tag -from runtimes.shared.task_loader import agentic_task_from_dir +from runtimes.shared.platform import agentic_task_from_dir, resolve_run_layout, task_image_tag from runtimes.workflow.command import build_workflow_agent_cmd from runtimes.workflow.prep import prepare_workflow_for_runtime from runtimes.workflow.runtime import NatWorkflowAttemptRuntime @@ -122,8 +121,7 @@ def test_runtime_for_backend_rejects_unknown() -> None: def test_build_agent_eval_attempt_metadata_matches_captured_schema(tmp_path: Path) -> None: - from runtimes.shared.artifacts import build_agent_eval_attempt, to_captured_agent_attempt - from runtimes.shared.layout import AgenticRunLayout + from runtimes.shared.platform import AgenticRunLayout, build_agent_eval_attempt, to_captured_agent_attempt task = agentic_task_from_dir(WORKSPACE_BASIC, tasks_root=TASKS_DIR) layout = AgenticRunLayout( @@ -183,7 +181,7 @@ async def test_aut_runtime_run_tasks_with_mocked_env(tmp_path: Path) -> None: def test_attempt_from_result_maps_status_and_measurements(tmp_path: Path) -> None: - from runtimes.shared.result_adapter import attempt_from_result + 
from runtimes.shared.platform import attempt_from_result output_dir = tmp_path / "20260101T000000Z-demo" (output_dir / "agent").mkdir(parents=True) @@ -218,7 +216,7 @@ def test_attempt_from_result_maps_status_and_measurements(tmp_path: Path) -> Non def test_attempt_from_result_marks_unsuccessful_agent_partial(tmp_path: Path) -> None: - from runtimes.shared.result_adapter import attempt_from_result + from runtimes.shared.platform import attempt_from_result output_dir = tmp_path / "run" (output_dir / "agent").mkdir(parents=True) @@ -271,7 +269,7 @@ async def test_score_captured_attempts_offline(tmp_path: Path) -> None: @pytest.mark.asyncio async def test_verifier_reward_metric_reads_metadata() -> None: from nemo_evaluator_sdk.metrics.protocol import CandidateOutput, DatasetRow, MetricInput - from runtimes.shared.metrics import VerifierRewardMetric + from runtimes.shared.platform import VerifierRewardMetric metric = VerifierRewardMetric() candidate = CandidateOutput(output_text="x", metadata={"reward": 1}) @@ -303,6 +301,8 @@ def _make_run_result(*, reward: float, total_tokens: int, runtime_sec: float, co }, ) task_result = AgentEvalTaskResult( + id="demo:workflow:agentic_use_verifier_reward", + run_id="run-1", task_id="demo", attempt_id="demo:workflow", metric_type="agentic_use_verifier_reward", @@ -318,7 +318,7 @@ def _make_run_result(*, reward: float, total_tokens: int, runtime_sec: float, co def test_summarize_run_aggregates_pass_tokens_runtime_provenance() -> None: - from runtimes.shared.reporting import summarize_run + from nemo_evaluator_sdk.agent_eval.gating import summarize_run summary = summarize_run(_make_run_result(reward=1.0, total_tokens=120, runtime_sec=4.5)) @@ -331,7 +331,7 @@ def test_summarize_run_aggregates_pass_tokens_runtime_provenance() -> None: def test_evaluate_gate_passes_then_flags_token_regression(tmp_path: Path) -> None: - from runtimes.shared.reporting import GateThresholds, evaluate_gate, write_gate_report + from 
nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate, write_gate_report baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0) candidate = _make_run_result(reward=1.0, total_tokens=200, runtime_sec=4.0) @@ -354,7 +354,7 @@ def test_evaluate_gate_passes_then_flags_token_regression(tmp_path: Path) -> Non def test_evaluate_gate_blocks_cross_commit_comparison() -> None: - from runtimes.shared.reporting import GateThresholds, evaluate_gate + from nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="aaa111") candidate = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="bbb222") @@ -376,8 +376,7 @@ def test_evaluate_gate_blocks_cross_commit_comparison() -> None: def test_build_verify_run_spec_shape(tmp_path: Path) -> None: - from runtimes.shared.layout import AgenticRunLayout - from runtimes.shared.verify import build_verify_run_spec + from runtimes.shared.platform import AgenticRunLayout, build_verify_run_spec layout = AgenticRunLayout( run_dir=tmp_path, @@ -402,8 +401,7 @@ def test_build_verify_run_spec_shape(tmp_path: Path) -> None: def test_build_verify_run_spec_returns_none_without_tests(tmp_path: Path) -> None: - from runtimes.shared.layout import AgenticRunLayout - from runtimes.shared.verify import build_verify_run_spec + from runtimes.shared.platform import AgenticRunLayout, build_verify_run_spec task_dir = tmp_path / "no-tests-task" task_dir.mkdir() @@ -420,9 +418,8 @@ def test_build_verify_run_spec_returns_none_without_tests(tmp_path: Path) -> Non @pytest.mark.asyncio async def test_run_verify_reads_reward_file(tmp_path: Path) -> None: - from runtimes.shared.environment import EnvCommandResult, EnvRunSpec - from runtimes.shared.layout import AgenticRunLayout - from runtimes.shared.verify import run_verify + from nemo_evaluator_sdk.agent_eval.runtimes.environment import EnvCommandResult, EnvRunSpec 
+ from runtimes.shared.platform import AgenticRunLayout, run_verify layout = AgenticRunLayout( run_dir=tmp_path, @@ -454,7 +451,7 @@ async def close(self) -> None: @pytest.mark.asyncio async def test_workflow_runtime_runs_verify_through_handle(tmp_path: Path) -> None: - from runtimes.shared.verify import verifier_log_dir + from runtimes.shared.platform import verifier_log_dir task = agentic_task_from_dir(WORKSPACE_BASIC, tasks_root=TASKS_DIR) layout = resolve_run_layout(task, AgenticSharedConfig(jobs_dir=tmp_path)) @@ -491,7 +488,7 @@ async def prepare(self, task: object, config: object = None) -> _Handle: def test_load_environment_spec_prefers_yaml(tmp_path: Path) -> None: - from runtimes.shared.environment_spec import load_environment_spec + from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec (tmp_path / "environment.yaml").write_text( "environment:\n" @@ -514,7 +511,7 @@ def test_load_environment_spec_prefers_yaml(tmp_path: Path) -> None: def test_load_environment_spec_falls_back_to_dockerfile(tmp_path: Path) -> None: - from runtimes.shared.environment_spec import load_environment_spec + from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec env_dir = tmp_path / "environment" env_dir.mkdir() @@ -526,14 +523,14 @@ def test_load_environment_spec_falls_back_to_dockerfile(tmp_path: Path) -> None: def test_load_environment_spec_missing_raises(tmp_path: Path) -> None: - from runtimes.shared.environment_spec import load_environment_spec + from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec with pytest.raises(FileNotFoundError): load_environment_spec(tmp_path) def test_plan_task_build_dockerfile_escape_hatch(tmp_path: Path) -> None: - from runtimes.shared.environment_spec import plan_task_build + from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import plan_task_build env_dir = tmp_path / "environment" env_dir.mkdir() @@ -547,7 +544,7 @@ def 
test_plan_task_build_dockerfile_escape_hatch(tmp_path: Path) -> None: def test_plan_task_build_generates_derived_dockerfile(tmp_path: Path) -> None: - from runtimes.shared.environment_spec import plan_task_build + from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import plan_task_build (tmp_path / "environment.yaml").write_text( "environment:\n image: base:1\n dependencies:\n python: [pytest]\n setup: [seed-providers]\n",