From b0a68bdad093984f9ce8d37f391b4e57a0a538ff Mon Sep 17 00:00:00 2001 From: "Arpit Singh (SW-CLOUD)" Date: Tue, 9 Jun 2026 16:44:20 -0700 Subject: [PATCH 1/3] fix layout Signed-off-by: Arpit Singh (SW-CLOUD) --- tests/agentic-use/runtimes/shared/layout.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/agentic-use/runtimes/shared/layout.py b/tests/agentic-use/runtimes/shared/layout.py index a259de71af..07a7a2dd17 100644 --- a/tests/agentic-use/runtimes/shared/layout.py +++ b/tests/agentic-use/runtimes/shared/layout.py @@ -45,7 +45,9 @@ def resolve_run_layout( ) -> AgenticRunLayout: """Resolve or create the on-disk layout for one task attempt.""" if config is not None and config.output_dir is not None: - run_dir = Path(config.output_dir) + # Must be absolute: run_dir subpaths are used as Docker bind-mount sources, + # and Docker treats a relative `-v` source as a (slash-free) named volume. + run_dir = Path(config.output_dir).resolve() else: run_dir = new_run_dir(default_jobs_dir(shared), task.id) From c081455e08efb09a75459b674ae4a039c2b0aff8 Mon Sep 17 00:00:00 2001 From: "Arpit Singh (SW-CLOUD)" Date: Tue, 9 Jun 2026 18:59:58 -0700 Subject: [PATCH 2/3] feat(evaluator): add orchestration, environments, gating, and coding-agent drivers to agent-eval SDK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend nemo_evaluator_sdk.agent_eval from "evaluator + contracts" into a full agent-evaluation pipeline by adding the layers above and below AgentEvaluator. Orchestration: - orchestrator.py: AgentEvalOrchestrator ties AgentEvaluator + gating into one call. run_tasks(target=runtime) (online) and score_attempts(attempts=...) (offline). Backend-agnostic via injected extra_metrics + a prepare_task hook; it never introspects the runtime. - types.py: AgentAttemptSource protocol — the offline counterpart to AgentAttemptRuntime (adapt a stored artifact into an AgentEvalAttempt). 
Execution layer (dependency-gated, no core import): - runtimes/environment.py: AgentEnvironmentProvider/Handle with a single run(spec, role) (agent/verifier); DockerEnvironmentProvider default, swappable. - runtimes/environment_spec.py: declarative environment.yaml -> BuildPlan (Dockerfile escape hatch); runtimes/docker.py: stdlib subprocess Docker helpers. - runtimes/coding_agent.py: CliAgentDriver (generic AgentAttemptRuntime for stdin-prompt CLIs) + CodingAgentSpec adapter seam; reference Claude/Cursor specs. - runtimes/layout.py: RunLayout + resolve_run_dir (abs paths for mounts) + prepare_run_layout. - runtimes/verify.py: VerifierOutcome + collect_verifier_outcome + apply_verify_to_metadata. Attempt + scoring: - attempts.py: resolve_attempt_status (ran-but-failed -> scorable "partial") + standard_evidence_descriptors (initial_state/trace/logs/final_state/verifier_logs). - common_metrics.py: AgentPhaseSuccessMetric and EvidencePresenceMetric, a real metric-over-evidence that reads candidate.evidence.filesystem(...). Results + gating: - measurements.py: AttemptMeasurements, one typed projection of tokens/runtime/reward/provenance from attempt metadata. - gating.py: summarize_run + evaluate_gate/GateThresholds/GateReport + write_gate_report + baseline loading (pass-rate, token regression, runtime tie-breaker, cross-commit provenance) -> gate.json. A CI grep gate (tests/agent_eval/test_import_hygiene.py) keeps agent_eval free of external/platform imports. tests/agentic-use is rewired as a thin adapter over these modules via pure re-export shims. Also fixes a pre-existing SandboxSdk->SandboxSDK typo in test_docker_sandbox_runtime.py. 107 tests pass; ty and import-hygiene gate clean; e2e CLI run reaches agent_ok=True, overall_score=1.0, gate_passed=True. 
Signed-off-by: Arpit Singh (SW-CLOUD) --- .../nemo_evaluator_sdk/agent_eval/__init__.py | 7 +- .../nemo_evaluator_sdk/agent_eval/attempts.py | 90 ++++ .../agent_eval/common_metrics.py | 79 +++ .../nemo_evaluator_sdk/agent_eval/gating.py | 441 ++++++++++++++++ .../agent_eval/measurements.py | 121 +++++ .../agent_eval/orchestrator.py | 153 ++++++ .../agent_eval/runtimes/coding_agent.py | 291 +++++++++++ .../agent_eval/runtimes/docker.py | 89 ++++ .../agent_eval/runtimes/docker_sandbox.py | 11 +- .../agent_eval/runtimes/environment.py | 145 ++++++ .../agent_eval/runtimes/environment_spec.py | 184 +++++++ .../agent_eval/runtimes/layout.py | 63 +++ .../agent_eval/runtimes/verify.py | 86 ++++ .../nemo_evaluator_sdk/agent_eval/types.py | 12 + .../tests/agent_eval/test_coding_agent.py | 117 +++++ .../tests/agent_eval/test_common_metrics.py | 86 ++++ .../agent_eval/test_docker_sandbox_runtime.py | 6 +- .../tests/agent_eval/test_environment.py | 77 +++ .../tests/agent_eval/test_gating.py | 106 ++++ .../tests/agent_eval/test_import_hygiene.py | 37 ++ .../tests/agent_eval/test_measurements.py | 45 ++ .../tests/agent_eval/test_orchestrator.py | 131 +++++ .../tests/agent_eval/test_verify.py | 39 ++ tests/agentic-use/runtimes/COMPLIANCE.md | 17 + tests/agentic-use/runtimes/README.md | 70 ++- tests/agentic-use/runtimes/orchestrator.py | 110 ++-- .../agentic-use/runtimes/shared/artifacts.py | 83 +-- tests/agentic-use/runtimes/shared/docker.py | 101 +--- .../runtimes/shared/environment.py | 141 ++---- .../runtimes/shared/environment_spec.py | 197 +------- tests/agentic-use/runtimes/shared/layout.py | 38 +- tests/agentic-use/runtimes/shared/metrics.py | 25 +- .../agentic-use/runtimes/shared/reporting.py | 471 +----------------- .../runtimes/shared/result_adapter.py | 11 + tests/agentic-use/runtimes/shared/verify.py | 67 +-- .../tests/test_agentic_runtimes.py | 2 + 36 files changed, 2705 insertions(+), 1044 deletions(-) create mode 100644 
packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/attempts.py create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/common_metrics.py create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/gating.py create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/measurements.py create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/orchestrator.py create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/coding_agent.py create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/docker.py create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment.py create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment_spec.py create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/layout.py create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/verify.py create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_coding_agent.py create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_common_metrics.py create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_environment.py create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_gating.py create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_measurements.py create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_orchestrator.py create mode 100644 packages/nemo_evaluator_sdk/tests/agent_eval/test_verify.py diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py index b4d9805374..963d869bb5 100644 --- 
a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/__init__.py @@ -5,9 +5,11 @@ from nemo_evaluator_sdk.agent_eval.dashboard import render_dashboard, write_dashboard from nemo_evaluator_sdk.agent_eval.evaluator import AgentEvaluator +from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig from nemo_evaluator_sdk.agent_eval.persistence import persist_run from nemo_evaluator_sdk.agent_eval.types import ( AgentAttemptRuntime, + AgentAttemptSource, AgentEvalAttempt, AgentEvalDiagnostic, AgentEvalMetricOutputCoverage, @@ -24,9 +26,12 @@ from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor, LocalFilesystemEvidence __all__ = [ + "AgentAttemptRuntime", + "AgentAttemptSource", "AgentEvalAttempt", "AgentEvalDiagnostic", "AgentEvalMetricOutputCoverage", + "AgentEvalOrchestrator", "AgentEvalRunConfig", "AgentEvalRunResult", "AgentEvalSummary", @@ -34,11 +39,11 @@ "AgentEvalTask", "AgentEvalTaskResult", "AgentEvaluator", - "AgentAttemptRuntime", "AgentOutput", "CandidateEvidence", "EvidenceDescriptor", "LocalFilesystemEvidence", + "OrchestratorConfig", "SemanticView", "ViewSignal", "persist_run", diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/attempts.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/attempts.py new file mode 100644 index 0000000000..dd85fcea5d --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/attempts.py @@ -0,0 +1,90 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Helpers for shaping :class:`AgentEvalAttempt` values from runtime artifacts. + +These are the runtime-agnostic pieces: the *scorable* status mapping and the +standard evidence-key builder. 
Platform-specific attempt construction (reading +proprietary artifact layouts, extra evidence keys) composes these in the adapter. +""" + +from __future__ import annotations + +from pathlib import Path + +from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttemptStatus +from nemo_evaluator_sdk.values.evidence import EvidenceDescriptor + + +def resolve_attempt_status(agent_ok: bool) -> AgentEvalAttemptStatus: + """Map an agent-phase outcome to a *scorable* attempt status. + + :class:`~nemo_evaluator_sdk.agent_eval.evaluator.AgentEvaluator` excludes + ``status=="failed"`` from scoring (it short-circuits to a failed metric + result). An agent that ran but did not succeed must still be scored — e.g. as + a ``0`` — so pass-rate gating counts it instead of dropping it. We therefore + use ``"partial"`` for an executed-but-unsuccessful agent and reserve + ``"failed"`` for genuine attempt-*production* failures (which a runtime + surfaces by raising, not by emitting an unscorable attempt). + """ + return "completed" if agent_ok else "partial" + + +def standard_evidence_descriptors( + *, + logs_dir: str | Path, + final_state_dir: str | Path, + trace_path: str | Path | None = None, + initial_state_ref: str | None = None, + verifier_logs_dir: str | Path | None = None, + primary_log: str | None = None, +) -> dict[str, EvidenceDescriptor]: + """Build the documented evidence map for an agent-eval attempt. + + Standard keys: ``initial_state`` (task input filesystem, when staged), + ``trace`` (trajectory, ATIF-normalized when available), ``logs`` (agent log + dir), ``final_state`` (workspace), and ``verifier_logs`` (only when present). + Callers may add their own extension keys to the returned mapping. 
+ """ + descriptors: dict[str, EvidenceDescriptor] = {} + + if initial_state_ref: + descriptors["initial_state"] = EvidenceDescriptor( + kind="filesystem", + format="dir", + ref=str(initial_state_ref), + metadata={"role": "initial_state"}, + ) + + if trace_path is not None: + trace_name = Path(trace_path).name + descriptors["trace"] = EvidenceDescriptor( + kind="trace", + format="atif" if trace_name.startswith("atif") else "json", + ref=str(trace_path), + ) + + logs_metadata = {"primary_log": primary_log} if primary_log else {} + descriptors["logs"] = EvidenceDescriptor( + kind="logs", + format="dir", + ref=str(logs_dir), + metadata=logs_metadata, + ) + + descriptors["final_state"] = EvidenceDescriptor( + kind="filesystem", + format="dir", + ref=str(final_state_dir), + metadata={"role": "final_state"}, + ) + + if verifier_logs_dir is not None and Path(verifier_logs_dir).exists(): + descriptors["verifier_logs"] = EvidenceDescriptor( + kind="logs", + format="dir", + ref=str(verifier_logs_dir), + metadata={"role": "verifier"}, + ) + + return descriptors diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/common_metrics.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/common_metrics.py new file mode 100644 index 0000000000..8cece6a5ad --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/common_metrics.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Reusable agent-eval metrics. + +``AgentPhaseSuccessMetric`` reads the agent-phase outcome stamped on attempt +metadata. ``EvidencePresenceMetric`` is a genuine *metric-over-evidence*: it +scores by inspecting ``candidate.evidence`` (a filesystem evidence handle) +rather than a reward written into metadata — the value proposition of scoring +over evidence instead of trusting a verifier's stamped reward. 
+""" + +from __future__ import annotations + +from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult + + +class AgentPhaseSuccessMetric: + """Score 1.0 when the agent phase exited successfully, else 0.0. + + The metric ``type`` is overridable via the ``metric_type`` class attribute so + callers can namespace it; the output name stays ``agent_phase_success`` (which + gating reads as a reward signal). + """ + + metric_type: str = "agent_phase_success" + + @property + def type(self) -> str: + return self.metric_type + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("agent_phase_success")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + agent_ok = bool(input.candidate.metadata.get("agent_ok")) + return MetricResult(outputs=[MetricOutput(name="agent_phase_success", value=1.0 if agent_ok else 0.0)]) + + +class EvidencePresenceMetric: + """Score 1.0 when a named filesystem evidence directory exists (and is non-empty). + + Reads ``candidate.evidence`` directly — the canonical metric-over-evidence + pattern — so the score reflects what the agent actually produced on disk, + not a reward stamped into metadata by a verifier. 
+ """ + + def __init__( + self, + *, + evidence_name: str = "final_state", + output_name: str = "evidence_present", + require_non_empty: bool = True, + ) -> None: + self._evidence_name = evidence_name + self._output_name = output_name + self._require_non_empty = require_non_empty + + @property + def type(self) -> str: + return "evidence_presence" + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score(self._output_name)] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + score = 0.0 + evidence = input.candidate.evidence + if evidence is not None and evidence.get(self._evidence_name) is not None: + try: + handle = await evidence.filesystem(self._evidence_name) + if await handle.exists(): + if self._require_non_empty: + score = 1.0 if await handle.iter_paths(recursive=True) else 0.0 + else: + score = 1.0 + except (KeyError, ValueError): + score = 0.0 + return MetricResult(outputs=[MetricOutput(name=self._output_name, value=score)]) diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/gating.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/gating.py new file mode 100644 index 0000000000..f6a7d04cfb --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/gating.py @@ -0,0 +1,441 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Deterministic gating + provenance comparison over an agent-eval run bundle. + +Persistence of the run bundle (``tasks.jsonl``/``attempts.jsonl``/ +``results.jsonl``/``summary.json``/``report.html``) is handled by +``agent_eval.persistence`` / ``write_dashboard``. This module adds the candidate +-vs-baseline gate (pass-rate, token/cost, runtime tie-breaker) plus deterministic +provenance checks. 
+ +Relationship to :class:`~nemo_evaluator_sdk.agent_eval.types.AgentEvalSummary`: +that summary reports the *mean score per metric output* over a run. The gate's +``pass_rate`` here is a different, intentional view — a per-task pass/fail count +against a reward threshold — so it is computed separately. Token/runtime/ +provenance aggregation is delegated to +:class:`~nemo_evaluator_sdk.agent_eval.measurements.AttemptMeasurements` so the +measurement keys are read in exactly one place. +""" + +from __future__ import annotations + +import json +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any + +from nemo_evaluator_sdk.agent_eval.measurements import AttemptMeasurements +from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunResult, AgentEvalTaskResult + +# Metric outputs, in priority order, that represent a task's pass/reward signal. +DEFAULT_REWARD_OUTPUTS: tuple[str, ...] = ("verifier_reward", "agent_phase_success") + +# Provenance fields collapsed into a single run-level summary. +_PROVENANCE_FIELDS: tuple[str, ...] 
= ( + "commit_sha", + "commit_short", + "commit_dirty", + "branch", + "remote_url", + "agentic_base_image_digest", + "pinned", + "pinned_to_commit", + "pinned_image_tag", +) + + +@dataclass(frozen=True) +class GateThresholds: + """Knobs controlling the candidate gate (defaults are the strict CI policy).""" + + min_pass_rate: float = 1.0 + require_token_metrics: bool = False + max_pass_rate_drop: float = 0.0 + max_token_regression_pct: float = 0.0 + max_runtime_regression_pct: float = 0.0 + allow_cross_commit: bool = False + + +@dataclass +class GateCheck: + name: str + passed: bool + details: str + + +@dataclass +class GateReport: + gate_passed: bool + summary: dict[str, Any] + checks: list[GateCheck] = field(default_factory=list) + + def to_payload(self) -> dict[str, Any]: + return { + "gate_passed": self.gate_passed, + "summary": self.summary, + "checks": [asdict(check) for check in self.checks], + } + + +def evaluate_gate( + result: AgentEvalRunResult, + *, + thresholds: GateThresholds | None = None, + baseline_summary: dict[str, Any] | None = None, + reward_outputs: tuple[str, ...] 
= DEFAULT_REWARD_OUTPUTS, +) -> GateReport: + """Summarize a run and apply gate checks, optionally against a baseline.""" + thresholds = thresholds or GateThresholds() + summary = summarize_run(result, reward_outputs=reward_outputs) + checks = run_gate_checks(summary, thresholds=thresholds, baseline_summary=baseline_summary) + return GateReport(gate_passed=all(check.passed for check in checks), summary=summary, checks=checks) + + +def write_gate_report(report: GateReport, output_dir: str | Path, *, filename: str = "gate.json") -> Path: + """Persist the gate report alongside the run bundle.""" + path = Path(output_dir) + path.mkdir(parents=True, exist_ok=True) + gate_path = path / filename + gate_path.write_text(json.dumps(report.to_payload(), indent=2, sort_keys=True) + "\n", encoding="utf-8") + return gate_path + + +def load_baseline_summary(path: str | Path) -> dict[str, Any]: + """Load + normalize a baseline summary (raw summary or a prior gate.json).""" + source = Path(path) + payload = json.loads(source.read_text(encoding="utf-8")) + if not isinstance(payload, dict): + raise ValueError(f"Baseline summary must be a JSON object: {source}") + summary = payload.get("summary") if isinstance(payload.get("summary"), dict) else payload + _validate_baseline_summary(summary, source) + return summary + + +def summarize_run( + result: AgentEvalRunResult, + *, + reward_outputs: tuple[str, ...] = DEFAULT_REWARD_OUTPUTS, +) -> dict[str, Any]: + """Aggregate pass-rate, token, runtime, and provenance for one run. + + Token/runtime/provenance are read via :class:`AttemptMeasurements`; the + reward used for pass-rate prefers a scored metric output (``reward_outputs``) + and falls back to the attempt's recorded reward. 
+ """ + attempts_by_task = {attempt.task_id: attempt for attempt in result.attempts} + reward_by_task = _rewards_by_task(result.results, reward_outputs) + task_ids = sorted({task.id for task in result.tasks} | set(attempts_by_task)) + + passed = 0 + token_sum = 0 + token_count = 0 + token_unavailable: list[str] = [] + runtime_sum = 0.0 + runtime_count = 0 + runtime_unavailable: list[str] = [] + provenance_inputs: list[dict[str, Any]] = [] + + for task_id in task_ids: + attempt = attempts_by_task.get(task_id) + measurements = AttemptMeasurements.from_metadata(attempt.metadata if attempt is not None else {}) + + reward_value = reward_by_task.get(task_id) + if reward_value is None: + reward_value = measurements.reward if measurements.reward is not None else 0.0 + if reward_value >= 1.0: + passed += 1 + + if measurements.total_tokens is not None: + token_sum += measurements.total_tokens + token_count += 1 + else: + token_unavailable.append(task_id) + + if measurements.runtime_sec is not None: + runtime_sum += measurements.runtime_sec + runtime_count += 1 + else: + runtime_unavailable.append(task_id) + + if measurements.provenance: + provenance_inputs.append(measurements.provenance) + + total = len(task_ids) + return { + "run_id": result.run_id, + "benchmark": result.benchmark, + "total_tasks": total, + "passed_tasks": passed, + "pass_rate": (passed / total) if total else 0.0, + "task_names": task_ids, + "total_tokens_sum": token_sum if token_count else None, + "avg_total_tokens": (token_sum / token_count) if token_count else None, + "token_metrics_coverage": (token_count / total) if total else 0.0, + "token_metrics_available_tasks": token_count, + "token_metrics_unavailable_tasks": sorted(token_unavailable), + "runtime_sec_sum": runtime_sum if runtime_count else None, + "avg_runtime_sec": (runtime_sum / runtime_count) if runtime_count else None, + "runtime_metrics_coverage": (runtime_count / total) if total else 0.0, + "runtime_metrics_available_tasks": runtime_count, 
+ "runtime_metrics_unavailable_tasks": sorted(runtime_unavailable), + "provenance": _aggregate_provenance(provenance_inputs), + } + + +def run_gate_checks( + summary: dict[str, Any], + *, + thresholds: GateThresholds, + baseline_summary: dict[str, Any] | None = None, +) -> list[GateCheck]: + """Apply absolute + relative (vs baseline) gate checks to a summary.""" + checks: list[GateCheck] = [] + total_tasks = int(summary["total_tasks"]) + pass_rate = float(summary["pass_rate"]) + provenance = summary.get("provenance") or {} + + checks.append(GateCheck("non_empty_result_set", total_tasks > 0, f"total_tasks={total_tasks}")) + checks.append( + GateCheck( + "min_pass_rate", + pass_rate >= thresholds.min_pass_rate, + f"pass_rate={pass_rate:.3f}, min_pass_rate={thresholds.min_pass_rate:.3f}", + ) + ) + checks.append(_commit_consistency_check(provenance)) + + if thresholds.require_token_metrics: + token_coverage = float(summary["token_metrics_coverage"]) + runtime_coverage = float(summary["runtime_metrics_coverage"]) + checks.append( + GateCheck( + "token_metrics_available_for_all_tasks", + token_coverage == 1.0, + f"token_metrics_coverage={token_coverage:.3f}", + ) + ) + checks.append( + GateCheck( + "runtime_metrics_available_for_all_tasks", + runtime_coverage == 1.0, + f"runtime_metrics_coverage={runtime_coverage:.3f}", + ) + ) + + if baseline_summary is not None: + checks.extend(_baseline_checks(summary, baseline_summary, thresholds)) + + return checks + + +def _baseline_checks( + summary: dict[str, Any], + baseline_summary: dict[str, Any], + thresholds: GateThresholds, +) -> list[GateCheck]: + checks: list[GateCheck] = [] + pass_rate = float(summary["pass_rate"]) + total_tokens_sum = summary["total_tokens_sum"] + runtime_sec_sum = summary["runtime_sec_sum"] + provenance = summary.get("provenance") or {} + + # Regression checks only make sense when both runs measured the same tasks. 
+ baseline_tasks = baseline_summary.get("task_names") + candidate_tasks = summary.get("task_names") + task_sets_comparable = True + if isinstance(baseline_tasks, list) and isinstance(candidate_tasks, list): + comparable = sorted(baseline_tasks) == sorted(candidate_tasks) + task_sets_comparable = comparable + checks.append( + GateCheck( + "baseline_candidate_task_sets_match", + comparable, + ( + f"both runs measured {len(candidate_tasks)} tasks" + if comparable + else f"baseline={sorted(baseline_tasks)} candidate={sorted(candidate_tasks)}; " + "regression checks short-circuited" + ), + ) + ) + else: + checks.append( + GateCheck( + "baseline_candidate_task_sets_match", + True, + "task_names not present on baseline and/or candidate; skipping equality guard", + ) + ) + + checks.append(_cross_commit_check(provenance, baseline_summary, thresholds.allow_cross_commit)) + + if not task_sets_comparable: + return checks + + baseline_pass_rate = float(baseline_summary.get("pass_rate", 0.0)) + checks.append( + GateCheck( + "no_pass_rate_regression_vs_baseline", + pass_rate >= baseline_pass_rate - thresholds.max_pass_rate_drop, + f"pass_rate={pass_rate:.3f}, baseline={baseline_pass_rate:.3f}, max_drop={thresholds.max_pass_rate_drop:.3f}", + ) + ) + + baseline_tokens = baseline_summary.get("total_tokens_sum") + if isinstance(total_tokens_sum, int) and isinstance(baseline_tokens, int): + max_allowed = baseline_tokens * (1.0 + thresholds.max_token_regression_pct / 100.0) + checks.append( + GateCheck( + "tokens_not_worse_than_baseline", + total_tokens_sum <= max_allowed, + f"total_tokens_sum={total_tokens_sum}, baseline={baseline_tokens}, " + f"max_regression_pct={thresholds.max_token_regression_pct:.2f}", + ) + ) + else: + checks.append( + GateCheck( + "tokens_not_worse_than_baseline", + False, + "Missing token totals for candidate or baseline; cannot run deterministic token comparison.", + ) + ) + + # Runtime is only a tie-breaker when token totals match exactly. 
+ baseline_runtime = baseline_summary.get("runtime_sec_sum") + tokens_tied = ( + isinstance(total_tokens_sum, int) and isinstance(baseline_tokens, int) and total_tokens_sum == baseline_tokens + ) + if not tokens_tied: + checks.append( + GateCheck( + "runtime_tie_breaker_not_worse_than_baseline", + True, + "Not applicable (token totals differ from baseline).", + ) + ) + elif isinstance(runtime_sec_sum, int | float) and isinstance(baseline_runtime, int | float): + max_allowed_runtime = float(baseline_runtime) * (1.0 + thresholds.max_runtime_regression_pct / 100.0) + checks.append( + GateCheck( + "runtime_tie_breaker_not_worse_than_baseline", + float(runtime_sec_sum) <= max_allowed_runtime, + f"runtime_sec_sum={float(runtime_sec_sum):.3f}, baseline={float(baseline_runtime):.3f}, " + f"max_regression_pct={thresholds.max_runtime_regression_pct:.2f}", + ) + ) + else: + checks.append( + GateCheck( + "runtime_tie_breaker_not_worse_than_baseline", + False, + "Token totals tied with baseline but runtime totals missing; cannot run tie-breaker.", + ) + ) + + return checks + + +def _commit_consistency_check(provenance: dict[str, Any]) -> GateCheck: + commit_observed = provenance.get("commit_sha_observed") + if isinstance(commit_observed, list) and len(commit_observed) > 1: + return GateCheck( + "commit_sha_consistent_within_run", + False, + f"Multiple commit_sha values observed across tasks: {commit_observed}. 
Re-run from a single commit.", + ) + commit_sha = provenance.get("commit_sha") + if commit_sha: + return GateCheck( + "commit_sha_consistent_within_run", + True, + f"commit={provenance.get('commit_short') or commit_sha[:12]}, branch={provenance.get('branch') or 'detached'}", + ) + return GateCheck( + "commit_sha_consistent_within_run", + True, + "provenance not recorded (legacy artifacts); skipping commit consistency check.", + ) + + +def _cross_commit_check( + provenance: dict[str, Any], + baseline_summary: dict[str, Any], + allow_cross_commit: bool, +) -> GateCheck: + baseline_commit = (baseline_summary.get("provenance") or {}).get("commit_sha") + candidate_commit = provenance.get("commit_sha") + if not (baseline_commit and candidate_commit): + return GateCheck( + "commit_sha_matches_baseline", + True, + "commit_sha not present on baseline and/or candidate; skipping cross-commit guard.", + ) + commits_match = baseline_commit == candidate_commit + if commits_match: + detail = f"both runs at commit={baseline_commit[:12]}" + elif allow_cross_commit: + detail = ( + f"baseline={baseline_commit[:12]} != candidate={candidate_commit[:12]}; " + "comparison allowed by allow_cross_commit (numbers may not be apples-to-apples)." + ) + else: + detail = ( + f"baseline={baseline_commit[:12]} != candidate={candidate_commit[:12]}. " + "Re-run candidate at the baseline commit, or set allow_cross_commit." + ) + return GateCheck("commit_sha_matches_baseline", commits_match or allow_cross_commit, detail) + + +def _rewards_by_task(results: list[AgentEvalTaskResult], reward_outputs: tuple[str, ...]) -> dict[str, float]: + rewards: dict[str, float] = {} + for task_result in results: + for output_name in reward_outputs: + value = _numeric_output(task_result, output_name) + if value is not None: + # Highest-priority output wins; don't overwrite with later metrics. 
+ rewards.setdefault(task_result.task_id, value) + break + return rewards + + +def _numeric_output(task_result: AgentEvalTaskResult, name: str) -> float | None: + for output in task_result.outputs: + if output.name == name: + try: + return float(output.value) + except (TypeError, ValueError): + return None + return None + + +def _aggregate_provenance(provenances: list[dict[str, Any]]) -> dict[str, Any]: + observed: dict[str, set[Any]] = {field_name: set() for field_name in _PROVENANCE_FIELDS} + for prov in provenances: + for field_name in _PROVENANCE_FIELDS: + value = prov.get(field_name) + if value is not None: + observed[field_name].add(value) + + aggregated: dict[str, Any] = {"available": bool(provenances)} + for field_name in _PROVENANCE_FIELDS: + values = observed[field_name] + if len(values) == 1: + aggregated[field_name] = next(iter(values)) + else: + aggregated[field_name] = None + if len(values) > 1: + aggregated[f"{field_name}_observed"] = sorted(map(str, values)) + return aggregated + + +def _validate_baseline_summary(summary: dict[str, Any], source: Path) -> None: + missing = [key for key in ("pass_rate", "total_tokens_sum", "runtime_sec_sum") if key not in summary] + if missing: + raise ValueError( + f"Baseline summary {source} is missing required key(s): {', '.join(missing)}. " + "Expected a raw summary object or a gate.json with a `summary`." + ) + if not isinstance(summary.get("pass_rate"), int | float): + raise ValueError(f"Baseline summary {source} has invalid `pass_rate`; expected a number.") diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/measurements.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/measurements.py new file mode 100644 index 0000000000..0ae2330415 --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/measurements.py @@ -0,0 +1,121 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
class AttemptMeasurements(BaseModel):
    """Typed projection of the measurement keys stored on attempt metadata.

    Token counts, runtime, reward, pass flag and provenance are read from the
    loose ``AgentEvalAttempt.metadata`` mapping via :meth:`from_metadata` and can
    be written back with :meth:`to_metadata`. Unknown fields are rejected
    (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    prompt_tokens: int | None = None
    completion_tokens: int | None = None
    total_tokens: int | None = None
    cache_creation_tokens: int | None = None
    cache_read_tokens: int | None = None
    runtime_sec: float | None = None
    reward: float | None = None
    passed: bool | None = None
    provenance: dict[str, Any] = Field(default_factory=dict)

    @classmethod
    def from_metadata(cls, metadata: Mapping[str, Any] | None) -> AttemptMeasurements:
        """Build the typed view from a loose metadata mapping (``None`` means empty).

        The fallbacks live in the module helpers: ``runtime_sec`` may be derived
        from ``duration_ms`` and ``reward`` from ``passed``. Only a real ``bool``
        is accepted for ``passed``, and only a mapping for ``provenance``.
        """
        source = metadata or {}

        raw_passed = source.get("passed")
        passed_flag = raw_passed if isinstance(raw_passed, bool) else None

        raw_provenance = source.get("provenance")
        provenance_copy = dict(raw_provenance) if isinstance(raw_provenance, Mapping) else {}

        token_fields = {key: _as_int(source.get(key)) for key in TOKEN_KEYS}
        return cls(
            **token_fields,
            runtime_sec=_runtime_sec(source),
            reward=_reward(source, passed_flag),
            passed=passed_flag,
            provenance=provenance_copy,
        )

    def to_metadata(self) -> dict[str, Any]:
        """Return the loose-metadata form, omitting unset fields and empty provenance."""
        payload: dict[str, Any] = {}
        # Same key order as the field declarations: tokens first, then scalars.
        for key in (*TOKEN_KEYS, "runtime_sec", "reward", "passed"):
            value = getattr(self, key)
            if value is not None:
                payload[key] = value
        if self.provenance:
            payload["provenance"] = dict(self.provenance)
        return payload
+ if isinstance(value, bool): + return None + return value if isinstance(value, int) else None + + +def _runtime_sec(metadata: Mapping[str, Any]) -> float | None: + runtime_sec = metadata.get("runtime_sec") + if isinstance(runtime_sec, int | float) and not isinstance(runtime_sec, bool): + return float(runtime_sec) + duration_ms = metadata.get("duration_ms") + if isinstance(duration_ms, int | float) and not isinstance(duration_ms, bool): + return float(duration_ms) / 1000.0 + return None + + +def _reward(metadata: Mapping[str, Any], passed: bool | None) -> float | None: + reward = metadata.get("reward") + if reward is not None: + try: + return float(reward) + except (TypeError, ValueError): + return None + if passed is not None: + return 1.0 if passed else 0.0 + return None diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/orchestrator.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/orchestrator.py new file mode 100644 index 0000000000..1fb436f809 --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/orchestrator.py @@ -0,0 +1,153 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Generic orchestration: agent/scoring run + deterministic gate. + +Wraps :class:`~nemo_evaluator_sdk.agent_eval.evaluator.AgentEvaluator` with the +gate from :mod:`nemo_evaluator_sdk.agent_eval.gating`. It is intentionally lean — +the only collaborators are the tasks and a target (online) or attempts (offline). +Two seams keep it backend-agnostic: + +* **verify-enable is inverted to data**: callers pass ``extra_metrics`` to append + (e.g. a verifier-reward metric). The orchestrator never introspects a runtime's + config to decide what to score. +* **environment prep is an injected hook**: ``prepare_task`` (e.g. "build the task + image") runs per task before execution, so Docker/build specifics live in the + caller, not here. 
class AgentEvalOrchestrator:
    """Run tasks through ``AgentEvaluator`` (online or offline) and apply the gate.

    ``extra_metrics`` are appended to every task's own metrics before scoring.
    After each run a gate report is written when ``config.write_gate`` is set
    and the run result has an output directory.
    """

    def __init__(
        self,
        *,
        config: OrchestratorConfig | None = None,
        extra_metrics: Sequence[Metric] = (),
    ) -> None:
        """Store the run configuration and the metrics to inject into every task."""
        self.config = config or OrchestratorConfig()
        self._extra_metrics = list(extra_metrics)

    async def run_tasks(
        self,
        tasks: Sequence[AgentEvalTask],
        *,
        target: AgentAttemptRuntime,
        benchmark: dict[str, object] | None = None,
        output_dir: Path | None = None,
        run_id: str | None = None,
        prepare_task: Callable[[AgentEvalTask], None] | None = None,
    ) -> AgentEvalRunResult:
        """Online path: optionally prep each task, run the runtime, score, gate.

        ``prepare_task`` is called synchronously, once per task and in task
        order, before any task is executed.
        """
        prepared = [self._with_extra_metrics(task) for task in tasks]
        if prepare_task is not None:
            for task in prepared:
                prepare_task(task)

        result = await AgentEvaluator().run(
            tasks=prepared,
            target=target,
            config=self._run_config(output_dir=output_dir, run_id=run_id, benchmark=benchmark),
        )
        self._maybe_write_gate(result)
        return result

    async def score_attempts(
        self,
        tasks: Sequence[AgentEvalTask],
        *,
        attempts: Sequence[AgentEvalAttempt],
        benchmark: dict[str, object] | None = None,
        output_dir: Path | None = None,
        run_id: str | None = None,
    ) -> AgentEvalRunResult:
        """Offline path: score already-captured attempts (no agent execution)."""
        prepared = [self._with_extra_metrics(task) for task in tasks]
        result = await AgentEvaluator().run(
            tasks=prepared,
            attempts=list(attempts),
            config=self._run_config(output_dir=output_dir, run_id=run_id, benchmark=benchmark),
        )
        self._maybe_write_gate(result)
        return result

    def _run_config(
        self,
        *,
        output_dir: Path | None,
        run_id: str | None,
        benchmark: dict[str, object] | None,
    ) -> AgentEvalRunConfig:
        """Combine per-call arguments with the orchestrator-level knobs."""
        return AgentEvalRunConfig(
            output_dir=output_dir,
            run_id=run_id,
            parallelism=self.config.parallelism,
            write_dashboard=self.config.write_dashboard,
            # Copy so the evaluator never shares the caller's dict.
            benchmark=dict(benchmark or {}),
        )

    def _with_extra_metrics(self, task: AgentEvalTask) -> AgentEvalTask:
        """Append injected metrics, honoring task-authored metrics and avoiding duplicate types.

        A metric type already present on the task wins over an injected one, and
        when several injected metrics share a type only the first is appended.
        The task is returned unchanged when nothing is appended.
        """
        if not self._extra_metrics:
            return task
        metrics: list[Metric] = list(task.metrics)
        seen_types = {type(metric) for metric in metrics}
        appended: list[Metric] = []
        for metric in self._extra_metrics:
            metric_type = type(metric)
            if metric_type in seen_types:
                continue
            # Track injected types too, so two extras of one type are not both added.
            seen_types.add(metric_type)
            appended.append(metric)
        if not appended:
            return task
        return task.model_copy(update={"metrics": metrics + appended})

    def _maybe_write_gate(self, result: AgentEvalRunResult) -> None:
        """Evaluate and persist the gate report when gating is enabled and there is an output dir."""
        if not (self.config.write_gate and result.output_dir is not None):
            return
        baseline = (
            load_baseline_summary(self.config.baseline_summary_path)
            if self.config.baseline_summary_path is not None
            else None
        )
        report = evaluate_gate(result, thresholds=self.config.gate_thresholds, baseline_summary=baseline)
        write_gate_report(report, result.output_dir)
+""" + +from __future__ import annotations + +import asyncio +import json +import shutil +import subprocess +from collections.abc import Awaitable, Callable, Sequence +from dataclasses import dataclass +from pathlib import Path + +from nemo_evaluator_sdk.agent_eval.types import ( + AgentEvalAttempt, + AgentEvalRunConfig, + AgentEvalTask, + AgentOutput, +) +from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor + +DEFAULT_CODING_AGENT_TIMEOUT_S = 600 +ProcessFactory = Callable[..., Awaitable[object]] + + +@dataclass(frozen=True) +class RunArtifacts: + """Resolved on-disk paths for one coding-agent attempt.""" + + evidence_dir: Path + workspace_dir: Path + prompt_path: Path + task_path: Path + stdout_path: Path + stderr_path: Path + final_output_path: Path + + +class CodingAgentSpec: + """Per-agent adapter: prompt, command, and trajectory→evidence parsing. + + Subclass and implement :meth:`build_command`. Override :meth:`build_prompt`, + :meth:`extra_evidence`, or :meth:`final_output` for agent-specific behavior. + """ + + name: str = "coding_agent" + binary: str = "" + model: str | None = None + + def build_prompt(self, task: AgentEvalTask) -> str: + """Default instruction prompt (override per agent if needed).""" + return f"Task id: {task.id}\nIntent: {task.intent}\nInputs: {task.inputs}\n" + + def build_command(self, artifacts: RunArtifacts) -> list[str]: + """Return the argv to launch; the prompt is delivered on stdin.""" + raise NotImplementedError + + def extra_evidence(self, artifacts: RunArtifacts) -> dict[str, EvidenceDescriptor]: + """Optional per-agent evidence (e.g. a parsed trajectory). 
class CliAgentDriver:
    """Generic ``AgentAttemptRuntime`` for stdin-prompt coding-agent CLIs.

    For each task the driver writes the prompt and the task JSON into a per-task
    evidence directory, launches the spec's command with the prompt on stdin,
    and returns an ``AgentEvalAttempt`` whose evidence descriptors point at the
    captured files. Launch errors, timeouts and non-zero exits are reported as
    ``failed`` attempts carrying an ``error.json`` descriptor rather than raised.
    """

    def __init__(
        self,
        spec: CodingAgentSpec,
        *,
        work_root: str | Path | None = None,
        timeout_s: int = DEFAULT_CODING_AGENT_TIMEOUT_S,
        process_factory: ProcessFactory | None = None,
    ) -> None:
        """Configure the driver.

        Args:
            spec: Per-agent adapter that builds the prompt and command.
            work_root: Root for per-task evidence dirs; when omitted the root is
                derived from the run config's ``output_dir`` (or the cwd).
            timeout_s: Per-task limit for the CLI to finish, in seconds.
            process_factory: Replacement for ``asyncio.create_subprocess_exec``.

        Raises:
            ValueError: If ``spec.binary`` is empty.
        """
        if not spec.binary:
            raise ValueError(f"{type(spec).__name__} must set a non-empty `binary`")
        self._spec = spec
        self._work_root = Path(work_root).expanduser() if work_root is not None else None
        self._timeout_s = timeout_s
        self._process_factory = process_factory or asyncio.create_subprocess_exec

    async def run_tasks(
        self,
        tasks: Sequence[AgentEvalTask],
        config: AgentEvalRunConfig | None = None,
    ) -> Sequence[AgentEvalAttempt]:
        """Run every task, at most ``config.parallelism`` at a time, in task order.

        Raises:
            RuntimeError: If the real subprocess factory is in use and the spec's
                binary is not on ``PATH``.
        """
        # Only probe PATH when a real process would be spawned; an injected
        # factory may not need the binary at all.
        if self._process_factory is asyncio.create_subprocess_exec and shutil.which(self._spec.binary) is None:
            raise RuntimeError(f"{self._spec.name} CLI executable {self._spec.binary!r} was not found on PATH")

        resolved = config or AgentEvalRunConfig()
        semaphore = asyncio.Semaphore(resolved.parallelism)

        async def run_one(index: int, task: AgentEvalTask) -> AgentEvalAttempt:
            async with semaphore:
                return await self._run_task(index, task, resolved)

        return await asyncio.gather(*(run_one(index, task) for index, task in enumerate(tasks)))

    async def _run_task(self, index: int, task: AgentEvalTask, config: AgentEvalRunConfig) -> AgentEvalAttempt:
        """Execute one task and convert the outcome into an attempt."""
        artifacts = self._artifacts(index, task, config)
        artifacts.evidence_dir.mkdir(parents=True, exist_ok=True)
        artifacts.workspace_dir.mkdir(parents=True, exist_ok=True)

        prompt = self._spec.build_prompt(task)
        artifacts.prompt_path.write_text(prompt, encoding="utf-8")
        artifacts.task_path.write_text(task.model_dump_json(indent=2), encoding="utf-8")

        command = self._spec.build_command(artifacts)
        process = None
        try:
            process = await self._process_factory(
                *command,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            stdout, stderr = await asyncio.wait_for(
                process.communicate(prompt.encode("utf-8")),
                timeout=self._timeout_s,
            )
        except asyncio.CancelledError:
            # Cancellation must propagate, but not leave the CLI running.
            if process is not None:
                await self._reap(process)
            raise
        except Exception as exc:
            # wait_for() only cancels communicate(); the child itself keeps
            # running after a timeout unless it is killed explicitly.
            if process is not None:
                await self._reap(process)
            return self._failed_attempt(task, artifacts, exc)

        stdout_text = _decode(stdout)
        stderr_text = _decode(stderr)
        artifacts.stdout_path.write_text(stdout_text, encoding="utf-8")
        artifacts.stderr_path.write_text(stderr_text, encoding="utf-8")

        # Injected process objects may not expose returncode; treat that as success.
        return_code = getattr(process, "returncode", 0)
        if return_code:
            return self._failed_attempt(
                task,
                artifacts,
                RuntimeError(f"{self._spec.name} exited with status {return_code}: {stderr_text.strip()}"),
            )

        descriptors: dict[str, EvidenceDescriptor] = {
            "workspace": EvidenceDescriptor(kind="filesystem", format="dir", ref=str(artifacts.workspace_dir)),
            "prompt": EvidenceDescriptor(kind="text", format="txt", ref=str(artifacts.prompt_path)),
            "task": EvidenceDescriptor(kind="json", format="json", ref=str(artifacts.task_path)),
            "stdout": EvidenceDescriptor(kind="logs", format="txt", ref=str(artifacts.stdout_path)),
            "stderr": EvidenceDescriptor(kind="logs", format="txt", ref=str(artifacts.stderr_path)),
        }
        # Spec-provided evidence may override the standard descriptors by key.
        descriptors.update(self._spec.extra_evidence(artifacts))

        return AgentEvalAttempt(
            id=f"{task.id}:{self._spec.name}",
            task_id=task.id,
            status="completed",
            output=AgentOutput(
                text=self._spec.final_output(artifacts, stdout_text),
                metadata={
                    "runtime": self._spec.name,
                    "agent_model": self._spec.model,
                    "evidence_dir": str(artifacts.evidence_dir),
                },
            ),
            evidence=CandidateEvidence(descriptors=descriptors, metadata={"runtime": self._spec.name}),
            metadata={
                "runtime": self._spec.name,
                "agent_model": self._spec.model,
                "generated": True,
            },
        )

    @staticmethod
    async def _reap(process: object) -> None:
        """Kill a still-running child and wait for it to exit (best effort).

        Does nothing when the process already has a return code or when an
        injected process object lacks ``kill``/``wait``.
        """
        if getattr(process, "returncode", None) is not None:
            return
        kill = getattr(process, "kill", None)
        wait = getattr(process, "wait", None)
        if kill is None or wait is None:
            return
        try:
            kill()
        except ProcessLookupError:
            # The child exited between the returncode check and the kill.
            return
        await wait()

    def _failed_attempt(self, task: AgentEvalTask, artifacts: RunArtifacts, exc: Exception) -> AgentEvalAttempt:
        """Record ``exc`` in ``error.json`` and return a ``failed`` attempt pointing at it."""
        error_path = artifacts.evidence_dir / "error.json"
        error_path.write_text(
            json.dumps({"error_type": exc.__class__.__name__, "error": str(exc)}) + "\n", encoding="utf-8"
        )
        return AgentEvalAttempt(
            id=f"{task.id}:{self._spec.name}",
            task_id=task.id,
            status="failed",
            output=None,
            evidence=CandidateEvidence(
                descriptors={"error": EvidenceDescriptor(kind="error", format="json", ref=str(error_path))},
                metadata={"runtime": self._spec.name},
            ),
            metadata={
                "runtime": self._spec.name,
                "error_type": exc.__class__.__name__,
                "error": str(exc),
            },
        )

    def _artifacts(self, index: int, task: AgentEvalTask, config: AgentEvalRunConfig) -> RunArtifacts:
        """Resolve the per-task evidence paths.

        The directory name is the sanitized task id, or ``task-<index>`` when
        sanitizing leaves nothing.
        """
        root = self._work_root or ((config.output_dir or Path.cwd()) / "evidence" / self._spec.name)
        evidence_dir = Path(root) / (_safe_path_name(task.id) or f"task-{index}")
        return RunArtifacts(
            evidence_dir=evidence_dir,
            workspace_dir=evidence_dir / "workspace",
            prompt_path=evidence_dir / "prompt.txt",
            task_path=evidence_dir / "task.json",
            stdout_path=evidence_dir / "stdout.txt",
            stderr_path=evidence_dir / "stderr.txt",
            final_output_path=evidence_dir / "final_output.txt",
        )
def _safe_path_name(value: str) -> str:
    """Turn an arbitrary identifier into a single safe path component.

    Every character that is not alphanumeric or one of ``._-`` becomes ``-``,
    leading/trailing ``.`` and ``-`` are removed, and the result is capped at
    120 characters. The result may be empty (e.g. for ``"..."``).
    """
    sanitized = "".join(char if char.isalnum() or char in "._-" else "-" for char in value)
    # Strip again after truncating: the cut can land right after a separator,
    # which would reintroduce the trailing "." / "-" the first strip removed.
    return sanitized.strip(".-")[:120].rstrip(".-")
def redact_cmd_for_logging(cmd: Sequence[str]) -> list[str]:
    """Return a copy of ``cmd`` that is safe to print.

    Only ``NAME=value`` tokens are inspected. When the last whitespace-separated
    word before the first ``=`` contains ``KEY``, ``TOKEN``, ``SECRET`` or
    ``PASSWORD`` (case-insensitive), the value is replaced with
    ``***REDACTED***``. All other tokens are returned unchanged.
    """
    sensitive_markers = ("KEY", "TOKEN", "SECRET", "PASSWORD")
    redacted: list[str] = []
    for token in cmd:
        left, separator, _value = token.partition("=")
        if not separator:
            redacted.append(token)
            continue
        # The name is the last word before "=", e.g. "-e API_KEY=x" -> "API_KEY".
        # An empty or whitespace-only left side has no name to match against.
        words = left.split()
        env_key = words[-1].upper() if words else ""
        if any(marker in env_key for marker in sensitive_markers):
            redacted.append(f"{left}=***REDACTED***")
        else:
            redacted.append(token)
    return redacted
def build_task_image(task_dir: os.PathLike[str], tag: str) -> None:
    """Build a task-specific Docker image from ``environment/Dockerfile``.

    The ``environment`` directory is used as the build context.

    Args:
        task_dir: Task directory that contains ``environment/Dockerfile``.
        tag: Image tag to build.

    Raises:
        FileNotFoundError: If ``environment/Dockerfile`` is missing or is not a
            regular file.
    """
    from pathlib import Path

    root = Path(task_dir)
    env_dockerfile = root / "environment" / "Dockerfile"
    # is_file(), not exists(): a directory named "Dockerfile" must be rejected
    # here rather than surfacing later as an opaque `docker build` failure.
    if not env_dockerfile.is_file():
        raise FileNotFoundError(f"No environment/Dockerfile found in {root}")
    build_dockerfile(env_dockerfile, env_dockerfile.parent, tag)
+""" from __future__ import annotations diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment.py new file mode 100644 index 0000000000..a08dfdc179 --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment.py @@ -0,0 +1,145 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Process/filesystem environment boundary for agent-eval runtimes. + +This boundary sits *below* :class:`AgentAttemptRuntime` so a runtime never needs +to know whether the agent/verifier execute under Docker, locally, or another +filesystem-backed sandbox. It is intentionally a **process/filesystem** +abstraction, not a fully provider-neutral one: :class:`EnvRunSpec` carries +``mounts``/``extra_args`` as filesystem-environment hints. Providers that are +not filesystem-backed may ignore those fields. + +A handle exposes a single :meth:`AbstractEnvironmentHandle.run` that takes a +``role`` ("agent" or "verifier"); :meth:`run_agent`/:meth:`run_verifier` are thin +role wrappers kept for caller convenience and protocol compatibility. 
@dataclass(frozen=True)
class EnvCommandResult:
    """Result of one command executed inside a prepared environment."""

    # Exit status of the command.
    exit_code: int
    # Set when the command was cut off by its timeout.
    timed_out: bool = False

    @property
    def ok(self) -> bool:
        """Whether the command finished without timing out and exited with 0."""
        if self.timed_out:
            return False
        return self.exit_code == 0
class DockerEnvironmentHandle(AbstractEnvironmentHandle):
    """Environment handle that executes commands in one task's Docker image."""

    def __init__(self, image: str) -> None:
        self.image = image

    async def run(self, spec: EnvRunSpec, role: EnvRole = "agent") -> EnvCommandResult:
        """Run ``spec.command`` in the image via ``docker_run`` on a worker thread.

        ``role`` is accepted for interface compatibility and ignored: both roles
        execute the same way against the same image. A timeout is reported as
        exit code 124 with ``timed_out=True``.
        """
        del role  # Docker runs both roles identically against the same image.
        from nemo_evaluator_sdk.agent_eval.runtimes.docker import docker_run

        run_options = {
            "env": spec.env,
            "mounts": spec.mounts,
            "workdir": spec.workdir,
            "timeout": spec.timeout,
            "extra_args": spec.extra_args,
        }
        try:
            completed = await asyncio.to_thread(docker_run, self.image, spec.command, **run_options)
        except subprocess.TimeoutExpired:
            return EnvCommandResult(exit_code=124, timed_out=True)
        return EnvCommandResult(exit_code=completed.returncode)
mode 100644 index 0000000000..a594705907 --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/environment_spec.py @@ -0,0 +1,184 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Declarative environment authoring for agent-eval tasks. + +Moves task authoring away from an implicit "Dockerfile per task" toward a small, +declarative ``environment.yaml`` spec, while keeping a Dockerfile escape hatch. + +Spec shape (``environment.yaml`` in the task dir):: + + environment: + image: nemo-platform-agentic-base:2026.06 + profile: evaluator-platform + dependencies: + python: + - pytest + - nemo-evaluator-sdk + setup: + - seed-providers + - create-workspace + +Escape hatch:: + + environment: + dockerfile: environment/Dockerfile + +Resolution is deliberately minimal: a spec is turned into a :class:`BuildPlan` +(a Dockerfile + build context + target tag). The Dockerfile path is used as-is; +an ``image``-based spec generates a tiny derived Dockerfile (``FROM <image>`` plus +optional ``pip install``). ``setup`` steps are carried as plan metadata — they are +runtime concerns handled outside the image build — so this module does not +execute them. + +``yaml`` is imported lazily so that importing this module costs nothing for +callers that never load a spec.
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path + +ENVIRONMENT_SPEC_FILENAME = "environment.yaml" +DEFAULT_DOCKERFILE_RELPATH = "environment/Dockerfile" + + +@dataclass(frozen=True) +class EnvironmentSpec: + """Declarative environment for one task (or a Dockerfile escape hatch).""" + + image: str | None = None + profile: str | None = None + python_dependencies: list[str] = field(default_factory=list) + setup: list[str] = field(default_factory=list) + dockerfile: Path | None = None + + def __post_init__(self) -> None: + if self.dockerfile is None and self.image is None: + raise ValueError("environment spec requires either 'image' or 'dockerfile'") + + +def load_environment_spec(task_dir: str | Path) -> EnvironmentSpec: + """Load a task's environment spec. + + Resolution order: + 1. ``environment.yaml`` in the task dir (declarative spec, preferred). + 2. ``environment/Dockerfile`` (backward-compatible escape hatch so existing + tasks work without authoring a spec). 
def _parse_spec(payload: dict, task_dir: Path) -> EnvironmentSpec:
    """Convert a parsed ``environment.yaml`` payload into an :class:`EnvironmentSpec`.

    The spec may sit under a top-level ``environment`` key or be the payload
    itself. A relative ``dockerfile`` is resolved against ``task_dir``.
    ``dependencies.python`` and ``setup`` may each be a list or a single string.

    Raises:
        ValueError: If the payload or its ``environment`` entry is not a mapping,
            or if a list-valued field has an unsupported type.
        FileNotFoundError: If ``dockerfile`` is set but is not an existing file.
    """
    if not isinstance(payload, dict):
        raise ValueError(f"Invalid environment spec in {task_dir}: expected a mapping")
    data = payload.get("environment", payload)
    if not isinstance(data, dict):
        raise ValueError(f"Invalid environment spec in {task_dir}: expected a mapping")

    dockerfile = None
    dockerfile_value = data.get("dockerfile")
    if dockerfile_value:
        dockerfile = Path(dockerfile_value)
        if not dockerfile.is_absolute():
            dockerfile = (task_dir / dockerfile).resolve()
        if not dockerfile.is_file():
            raise FileNotFoundError(f"environment.dockerfile not found: {dockerfile}")

    dependencies = data.get("dependencies") or {}
    python_deps = dependencies.get("python") if isinstance(dependencies, dict) else None

    return EnvironmentSpec(
        image=data.get("image"),
        profile=data.get("profile"),
        python_dependencies=_string_items(python_deps, "dependencies.python", task_dir),
        setup=_string_items(data.get("setup"), "setup", task_dir),
        dockerfile=dockerfile,
    )


def _string_items(value: object, field_name: str, task_dir: Path) -> list[str]:
    """Normalize a YAML scalar-or-sequence field to a list of strings."""
    if not value:
        return []
    if isinstance(value, str):
        # list("pytest") would explode the string into single characters.
        return [value]
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value]
    raise ValueError(f"Invalid environment spec in {task_dir}: `{field_name}` must be a list of strings")
def render_derived_dockerfile(spec: EnvironmentSpec) -> str:
    """Render a minimal derived Dockerfile from an image-based spec.

    The output is ``FROM <image>``, an optional profile ``LABEL``, an optional
    ``RUN pip install`` for the Python dependencies, and an optional ``LABEL``
    recording the setup step names.

    Raises:
        ValueError: If ``spec.image`` is ``None``.
    """
    import shlex

    if spec.image is None:
        raise ValueError("cannot render a derived Dockerfile without a base image")
    lines = [f"FROM {spec.image}"]
    if spec.profile:
        lines.append(f"LABEL com.nvidia.agentic.profile={spec.profile}")
    if spec.python_dependencies:
        # Shell-form RUN goes through /bin/sh, so an unquoted requirement such as
        # "numpy>=1.26" would be parsed as a redirection. quote() leaves plain
        # names untouched and single-quotes anything the shell would interpret.
        deps = " ".join(shlex.quote(dep) for dep in spec.python_dependencies)
        lines.append(f"RUN pip install --no-cache-dir {deps}")
    if spec.setup:
        # Setup steps are runtime concerns; record them for provenance only.
        lines.append(f'LABEL com.nvidia.agentic.setup="{",".join(spec.setup)}"')
    return "\n".join(lines) + "\n"
+ """ + if output_dir is not None: + return Path(output_dir).resolve() + return default_factory() + + +def prepare_run_layout( + run_dir: str | Path, + instruction_text: str, + *, + agent_subdir: str = "agent", + workspace_subdir: str = "workspace", + instruction_name: str = "instruction.md", +) -> RunLayout: + """Create the agent/workspace dirs under ``run_dir`` and write the instruction.""" + run_dir = Path(run_dir) + agent_log_dir = run_dir / agent_subdir + workspace_dir = run_dir / workspace_subdir + agent_log_dir.mkdir(parents=True, exist_ok=True) + workspace_dir.mkdir(parents=True, exist_ok=True) + + instruction_path = agent_log_dir / instruction_name + instruction_path.write_text(instruction_text, encoding="utf-8") + + return RunLayout( + run_dir=run_dir, + agent_log_dir=agent_log_dir, + workspace_dir=workspace_dir, + instruction_path=instruction_path, + ) diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/verify.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/verify.py new file mode 100644 index 0000000000..7e1b0fb0c0 --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/runtimes/verify.py @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Generic verifier-phase mechanic: collect a reward + stamp attempt metadata. + +This is the backend-agnostic core. *What* the verifier runs (command, env, +mounts) and *how* it is invoked are caller concerns — the caller executes its +verifier through an environment handle, then uses :func:`collect_verifier_outcome` +to read the reward/stdout convention out of the verifier's log dir, and +:func:`apply_verify_to_metadata` to stamp the result onto an attempt so a +reward metric can score it. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any + + +@dataclass(frozen=True) +class VerifierOutcome: + """Result of a verifier phase for one task.""" + + ran: bool + passed: bool + reward: int + exit_code: int + stdout: str + verifier_log_dir: Path | None + + +def skipped_outcome() -> VerifierOutcome: + """Outcome representing a verifier that did not run.""" + return VerifierOutcome(ran=False, passed=False, reward=0, exit_code=0, stdout="", verifier_log_dir=None) + + +def collect_verifier_outcome( + *, + ok: bool, + exit_code: int, + log_dir: str | Path, + reward_filename: str = "reward.txt", + stdout_filename: str = "test-stdout.txt", +) -> VerifierOutcome: + """Build a :class:`VerifierOutcome` from a verifier run's log dir. + + Reads ``reward.txt`` (``1``/``0``) when present; otherwise derives the reward + from ``ok`` and writes the file so reruns are stable. Reads ``test-stdout.txt`` + when present. + """ + log_dir = Path(log_dir) + passed = ok + + stdout = "" + stdout_path = log_dir / stdout_filename + if stdout_path.is_file(): + stdout = stdout_path.read_text(encoding="utf-8", errors="replace") + + reward_path = log_dir / reward_filename + if reward_path.is_file(): + reward = 1 if reward_path.read_text(encoding="utf-8").strip() == "1" else 0 + else: + reward = 1 if passed else 0 + reward_path.parent.mkdir(parents=True, exist_ok=True) + reward_path.write_text("1\n" if passed else "0\n", encoding="utf-8") + + return VerifierOutcome( + ran=True, + passed=passed, + reward=reward, + exit_code=exit_code, + stdout=stdout, + verifier_log_dir=log_dir, + ) + + +def apply_verify_to_metadata(metadata: dict[str, Any], outcome: VerifierOutcome) -> None: + """Stamp verifier reward/status onto attempt metadata for scoring + gating.""" + if not outcome.ran: + metadata.setdefault("verify_status", "skipped") + return + metadata["verify_status"] = "ok" if outcome.passed else "failed" + 
metadata["passed"] = outcome.passed + metadata["reward"] = outcome.reward + metadata["verifier_log_dir"] = str(outcome.verifier_log_dir) if outcome.verifier_log_dir else None diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py index 589a4efde1..03509ab038 100644 --- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/types.py @@ -287,6 +287,18 @@ async def run_tasks( ) -> Sequence[AgentEvalAttempt]: ... +@runtime_checkable +class AgentAttemptSource(Protocol): + """Loads a previously captured attempt for a task from a stored artifact. + + The offline counterpart to :class:`AgentAttemptRuntime`: instead of executing + the agent, it adapts an already-produced run directory/file into an + :class:`AgentEvalAttempt` so it can be (re)scored through ``AgentEvaluator``. + """ + + def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt: ... + + def _metric_coverage( results: Sequence[AgentEvalTaskResult], tasks: Sequence[AgentEvalTask] | None, diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_coding_agent.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_coding_agent.py new file mode 100644 index 0000000000..66e7715c07 --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_coding_agent.py @@ -0,0 +1,117 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Fixture-based tests for the coding-agent driver seam (no real CLIs).""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +from nemo_evaluator_sdk.agent_eval.runtimes.coding_agent import ( + ClaudeCodeSpec, + CliAgentDriver, + CodingAgentSpec, + CursorAgentSpec, + RunArtifacts, +) +from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunConfig, AgentEvalTask + + +class _EchoSpec(CodingAgentSpec): + name = "echo_agent" + binary = "echo-agent" + + def build_command(self, artifacts: RunArtifacts) -> list[str]: + return [self.binary, "--out", str(artifacts.final_output_path)] + + def extra_evidence(self, artifacts: RunArtifacts) -> dict: + from nemo_evaluator_sdk.values.evidence import EvidenceDescriptor + + return {"trajectory": EvidenceDescriptor(kind="trace", format="jsonl", ref=str(artifacts.stdout_path))} + + +class _FakeProcess: + def __init__(self, *, returncode: int, final_output_path: Path | None, stdout: bytes = b"", stderr: bytes = b""): + self.returncode = returncode + self._final_output_path = final_output_path + self._stdout = stdout + self._stderr = stderr + + async def communicate(self, stdin: bytes | None = None) -> tuple[bytes, bytes]: + if self._final_output_path is not None: + self._final_output_path.write_text("final answer", encoding="utf-8") + return self._stdout, self._stderr + + +def _factory(*, returncode: int = 0, write_final: bool = True): + captured: dict = {} + + async def factory(*command, **kwargs): + captured["command"] = list(command) + final_path = Path(command[command.index("--out") + 1]) if "--out" in command else None + return _FakeProcess( + returncode=returncode, + final_output_path=final_path if write_final else None, + stdout=b'{"event":"done"}\n', + ) + + return factory, captured + + +def _task() -> AgentEvalTask: + return AgentEvalTask(id="demo/task", intent="do the thing", inputs={"k": "v"}) + + +@pytest.mark.asyncio +async def 
test_driver_produces_completed_attempt_with_evidence(tmp_path: Path) -> None: + factory, captured = _factory() + driver = CliAgentDriver(_EchoSpec(), work_root=tmp_path, process_factory=factory) + + attempts = await driver.run_tasks([_task()], AgentEvalRunConfig()) + attempt = attempts[0] + + assert captured["command"][0] == "echo-agent" + assert attempt.status == "completed" + assert attempt.output is not None and attempt.output.text == "final answer" + # Standard + spec-provided evidence keys are present and paths exist on disk. + assert {"workspace", "prompt", "task", "stdout", "stderr", "trajectory"} <= set(attempt.evidence.descriptors) + assert (tmp_path / "demo-task" / "prompt.txt").read_text(encoding="utf-8").startswith("Task id: demo/task") + + +@pytest.mark.asyncio +async def test_driver_marks_failed_on_nonzero_exit(tmp_path: Path) -> None: + factory, _ = _factory(returncode=1, write_final=False) + driver = CliAgentDriver(_EchoSpec(), work_root=tmp_path, process_factory=factory) + + attempt = (await driver.run_tasks([_task()]))[0] + assert attempt.status == "failed" + assert attempt.output is None + assert "error" in attempt.evidence.descriptors + assert (tmp_path / "demo-task" / "error.json").exists() + + +def test_reference_specs_build_expected_commands(tmp_path: Path) -> None: + artifacts = RunArtifacts( + evidence_dir=tmp_path, + workspace_dir=tmp_path / "workspace", + prompt_path=tmp_path / "p", + task_path=tmp_path / "t", + stdout_path=tmp_path / "o", + stderr_path=tmp_path / "e", + final_output_path=tmp_path / "f", + ) + claude_cmd = ClaudeCodeSpec(model="claude-x").build_command(artifacts) + assert claude_cmd[0] == "claude" and "--model" in claude_cmd and "claude-x" in claude_cmd + + cursor_cmd = CursorAgentSpec().build_command(artifacts) + assert cursor_cmd[0] == "cursor-agent" and "--model" not in cursor_cmd + + +def test_driver_rejects_spec_without_binary(tmp_path: Path) -> None: + class _NoBinary(CodingAgentSpec): + def build_command(self, 
artifacts: RunArtifacts) -> list[str]: + return [] + + with pytest.raises(ValueError, match="non-empty"): + CliAgentDriver(_NoBinary(), work_root=tmp_path) diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_common_metrics.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_common_metrics.py new file mode 100644 index 0000000000..3e5f9361a2 --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_common_metrics.py @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for promoted attempt helpers and reusable metrics.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +from nemo_evaluator_sdk.agent_eval.attempts import resolve_attempt_status, standard_evidence_descriptors +from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric, EvidencePresenceMetric +from nemo_evaluator_sdk.metrics.protocol import CandidateOutput, DatasetRow, MetricInput +from nemo_evaluator_sdk.values.evidence import CandidateEvidence + + +def test_resolve_attempt_status_keeps_failed_agents_scorable() -> None: + assert resolve_attempt_status(True) == "completed" + assert resolve_attempt_status(False) == "partial" + + +def test_standard_evidence_descriptors_builds_doc_keys(tmp_path: Path) -> None: + logs = tmp_path / "agent" + workspace = tmp_path / "workspace" + verifier = tmp_path / "verifier" + logs.mkdir() + workspace.mkdir() + verifier.mkdir() # exists -> verifier_logs included + + descriptors = standard_evidence_descriptors( + logs_dir=logs, + final_state_dir=workspace, + trace_path=tmp_path / "atif_trajectory.json", + initial_state_ref=str(tmp_path / "seed"), + verifier_logs_dir=verifier, + primary_log="nat_agent.log", + ) + assert set(descriptors) == {"initial_state", "trace", "logs", "final_state", "verifier_logs"} + assert descriptors["trace"].format == "atif" + assert 
descriptors["logs"].metadata["primary_log"] == "nat_agent.log" + + # verifier_logs omitted when the dir is absent. + no_verifier = standard_evidence_descriptors( + logs_dir=logs, final_state_dir=workspace, verifier_logs_dir=tmp_path / "missing" + ) + assert "verifier_logs" not in no_verifier + + +@pytest.mark.asyncio +async def test_agent_phase_success_metric_reads_metadata_and_namespaces_type() -> None: + metric = AgentPhaseSuccessMetric() + assert metric.type == "agent_phase_success" + ok = await metric.compute_scores( + MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(metadata={"agent_ok": True})) + ) + assert ok.outputs[0].value == 1.0 + + class Namespaced(AgentPhaseSuccessMetric): + metric_type = "agentic_use_agent_phase" + + assert Namespaced().type == "agentic_use_agent_phase" + + +@pytest.mark.asyncio +async def test_evidence_presence_metric_scores_over_evidence(tmp_path: Path) -> None: + final_state = tmp_path / "workspace" + final_state.mkdir() + (final_state / "result.txt").write_text("done", encoding="utf-8") + evidence = CandidateEvidence( + descriptors=standard_evidence_descriptors(logs_dir=tmp_path / "agent", final_state_dir=final_state) + ) + + metric = EvidencePresenceMetric() + present = await metric.compute_scores( + MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(evidence=evidence)) + ) + assert present.outputs[0].value == 1.0 + + # Empty workspace -> non-empty requirement fails; no evidence -> 0. 
+ (final_state / "result.txt").unlink() + empty = await metric.compute_scores( + MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(evidence=evidence)) + ) + assert empty.outputs[0].value == 0.0 + missing = await metric.compute_scores(MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput())) + assert missing.outputs[0].value == 0.0 diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py index 5e0446b1eb..c051499030 100644 --- a/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_docker_sandbox_runtime.py @@ -16,7 +16,7 @@ from nemo_evaluator_sdk.agent_eval.runtimes import docker_sandbox from nemo_evaluator_sdk.agent_eval.runtimes.docker_sandbox import ( DockerSandboxAgentRuntime, - SandboxSdk, + SandboxSDK, ) @@ -147,8 +147,8 @@ async def run(self, agent: _FakeSandboxAgent, prompt: str, *, run_config: _FakeR raise RuntimeError("sandbox run failed") -def _fake_sdk() -> SandboxSdk: - return SandboxSdk( +def _fake_sdk() -> SandboxSDK: + return SandboxSDK( Runner=_FakeRunner(), RunConfig=_FakeRunConfig, SandboxRunConfig=_FakeSandboxRunConfig, diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_environment.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_environment.py new file mode 100644 index 0000000000..b7df9a61d4 --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_environment.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tests for the promoted environment boundary + environment authoring.""" + +from __future__ import annotations + +import subprocess +from pathlib import Path + +import pytest +from nemo_evaluator_sdk.agent_eval.runtimes import docker as docker_mod +from nemo_evaluator_sdk.agent_eval.runtimes.environment import ( + DockerEnvironmentHandle, + DockerEnvironmentProvider, + EnvRunSpec, + default_image_tag, +) +from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec, plan_task_build +from nemo_evaluator_sdk.agent_eval.types import AgentEvalTask + + +@pytest.mark.asyncio +async def test_docker_handle_routes_roles_through_single_run(monkeypatch: pytest.MonkeyPatch) -> None: + calls: list[tuple[str, list[str]]] = [] + + def fake_docker_run(image: str, command: list[str], **kwargs: object) -> subprocess.CompletedProcess[str]: + calls.append((image, command)) + return subprocess.CompletedProcess(args=command, returncode=0) + + monkeypatch.setattr(docker_mod, "docker_run", fake_docker_run) + + handle = DockerEnvironmentHandle("img:latest") + spec = EnvRunSpec(command=["echo", "hi"]) + assert (await handle.run_agent(spec)).ok + assert (await handle.run_verifier(spec)).ok + assert calls == [("img:latest", ["echo", "hi"]), ("img:latest", ["echo", "hi"])] + + +@pytest.mark.asyncio +async def test_docker_handle_reports_timeout(monkeypatch: pytest.MonkeyPatch) -> None: + def fake_docker_run(image: str, command: list[str], **kwargs: object): + raise subprocess.TimeoutExpired(cmd=command, timeout=1) + + monkeypatch.setattr(docker_mod, "docker_run", fake_docker_run) + result = await DockerEnvironmentHandle("img").run(EnvRunSpec(command=["sleep"]), "agent") + assert result.timed_out and result.exit_code == 124 and not result.ok + + +@pytest.mark.asyncio +async def test_provider_uses_injected_image_tag_fn() -> None: + assert default_image_tag("t") == "t:latest" + provider = 
DockerEnvironmentProvider(image_tag_fn=lambda task_id: f"custom-{task_id}") + handle = await provider.prepare(AgentEvalTask(id="demo", intent="x", inputs={})) + assert isinstance(handle, DockerEnvironmentHandle) + assert handle.image == "custom-demo" + + +def test_environment_spec_yaml_dockerfile_and_plan(tmp_path: Path) -> None: + (tmp_path / "environment.yaml").write_text( + "environment:\n image: base:1\n dependencies:\n python: [pytest]\n setup: [seed]\n", + encoding="utf-8", + ) + spec = load_environment_spec(tmp_path) + assert spec.image == "base:1" and spec.python_dependencies == ["pytest"] + + plan = plan_task_build(tmp_path, "img:latest", generated_dir=tmp_path / "build") + content = plan.dockerfile.read_text(encoding="utf-8") + assert plan.generated and plan.base_image == "base:1" + assert content.startswith("FROM base:1") and "pip install --no-cache-dir pytest" in content + + # Dockerfile escape hatch wins when no yaml present. + other = tmp_path / "task2" / "environment" + other.mkdir(parents=True) + (other / "Dockerfile").write_text("FROM scratch\n", encoding="utf-8") + escape = load_environment_spec(tmp_path / "task2") + assert escape.dockerfile == other / "Dockerfile" and escape.image is None diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_gating.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_gating.py new file mode 100644 index 0000000000..613a4cfaa3 --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_gating.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tests for the promoted deterministic gate.""" + +from __future__ import annotations + +from pathlib import Path + +from nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate, summarize_run, write_gate_report +from nemo_evaluator_sdk.agent_eval.types import ( + AgentEvalAttempt, + AgentEvalRunResult, + AgentEvalSummary, + AgentEvalTask, + AgentEvalTaskResult, + AgentOutput, +) +from nemo_evaluator_sdk.metrics.protocol import MetricOutput + + +def _make_run_result( + *, reward: float, total_tokens: int, runtime_sec: float, commit: str = "abc123" +) -> AgentEvalRunResult: + task = AgentEvalTask(id="demo", intent="do it", inputs={}) + attempt = AgentEvalAttempt( + id="demo:workflow", + task_id="demo", + status="completed", + output=AgentOutput(text="ok"), + metadata={ + "total_tokens": total_tokens, + "runtime_sec": runtime_sec, + "provenance": {"commit_sha": commit, "commit_short": commit[:7]}, + }, + ) + task_result = AgentEvalTaskResult( + id="demo:workflow:agentic_use_verifier_reward", + run_id="run-1", + task_id="demo", + attempt_id="demo:workflow", + metric_type="agentic_use_verifier_reward", + outputs=[MetricOutput(name="verifier_reward", value=reward)], + ) + return AgentEvalRunResult( + run_id="run-1", + tasks=[task], + attempts=[attempt], + results=[task_result], + summary=AgentEvalSummary(), + ) + + +def test_summarize_run_aggregates_pass_tokens_runtime_provenance() -> None: + summary = summarize_run(_make_run_result(reward=1.0, total_tokens=120, runtime_sec=4.5)) + assert summary["total_tasks"] == 1 + assert summary["pass_rate"] == 1.0 + assert summary["total_tokens_sum"] == 120 + assert summary["runtime_sec_sum"] == 4.5 + assert summary["token_metrics_coverage"] == 1.0 + assert summary["provenance"]["commit_sha"] == "abc123" + + +def test_evaluate_gate_passes_then_flags_token_regression(tmp_path: Path) -> None: + baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0) + 
candidate = _make_run_result(reward=1.0, total_tokens=200, runtime_sec=4.0) + + baseline_report = evaluate_gate(baseline, thresholds=GateThresholds()) + assert baseline_report.gate_passed is True + + candidate_report = evaluate_gate(candidate, thresholds=GateThresholds(), baseline_summary=baseline_report.summary) + assert candidate_report.gate_passed is False + token_check = next(c for c in candidate_report.checks if c.name == "tokens_not_worse_than_baseline") + assert token_check.passed is False + + gate_path = write_gate_report(candidate_report, tmp_path) + assert gate_path.exists() and "gate_passed" in gate_path.read_text(encoding="utf-8") + + +def test_evaluate_gate_blocks_cross_commit_comparison() -> None: + baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="aaa111") + candidate = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="bbb222") + + baseline_summary = evaluate_gate(baseline, thresholds=GateThresholds()).summary + report = evaluate_gate(candidate, thresholds=GateThresholds(), baseline_summary=baseline_summary) + cross = next(c for c in report.checks if c.name == "commit_sha_matches_baseline") + assert cross.passed is False and report.gate_passed is False + + allowed = evaluate_gate( + candidate, thresholds=GateThresholds(allow_cross_commit=True), baseline_summary=baseline_summary + ) + cross_allowed = next(c for c in allowed.checks if c.name == "commit_sha_matches_baseline") + assert cross_allowed.passed is True + + +def test_summarize_run_uses_measurement_fallbacks() -> None: + # duration_ms -> runtime_sec, and metadata reward when no scored metric output. 
+ run = _make_run_result(reward=0.0, total_tokens=10, runtime_sec=1.0) + run.attempts[0].metadata.pop("runtime_sec") + run.attempts[0].metadata["duration_ms"] = 2500 + run.attempts[0].metadata["reward"] = 1 + run.results.clear() # no scored metric outputs -> fall back to metadata reward + + summary = summarize_run(run) + assert summary["runtime_sec_sum"] == 2.5 + assert summary["pass_rate"] == 1.0 diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py new file mode 100644 index 0000000000..ed7da3beee --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Guardrail: the agent_eval package must stay free of NeMo-Platform imports. + +The SDK is consumed by ``tests/agentic-use`` (the NeMo-Platform adapter), never +the reverse. This test fails if any module under ``agent_eval`` imports a +platform-specific package, which keeps the promotion from leaking coupling into +the SDK. +""" + +from __future__ import annotations + +import re +from pathlib import Path + +import nemo_evaluator_sdk.agent_eval as agent_eval + +AGENT_EVAL_ROOT = Path(agent_eval.__file__).resolve().parent + +# Import statements that would couple the SDK to the platform / adapter. 
+_FORBIDDEN = re.compile( + r"^\s*(?:from|import)\s+" + r"(nemo_platform|nmp_[A-Za-z0-9_]+|nat_runner|runtimes(?:\.|\s|$)|evaluator_agent_eval)", + re.MULTILINE, +) + + +def test_agent_eval_has_no_platform_imports() -> None: + offenders: list[str] = [] + for path in sorted(AGENT_EVAL_ROOT.rglob("*.py")): + text = path.read_text(encoding="utf-8") + for match in _FORBIDDEN.finditer(text): + line_no = text.count("\n", 0, match.start()) + 1 + offenders.append(f"{path.relative_to(AGENT_EVAL_ROOT)}:{line_no}: {match.group(0).strip()}") + + assert not offenders, "agent_eval must not import NeMo-Platform packages:\n" + "\n".join(offenders) diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_measurements.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_measurements.py new file mode 100644 index 0000000000..bc11bce7ef --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_measurements.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for the typed AttemptMeasurements contract.""" + +from __future__ import annotations + +from nemo_evaluator_sdk.agent_eval.measurements import AttemptMeasurements + + +def test_from_metadata_reads_tokens_runtime_reward_and_provenance() -> None: + measurements = AttemptMeasurements.from_metadata( + { + "total_tokens": 120, + "prompt_tokens": 80, + "completion_tokens": 40, + "runtime_sec": 4.5, + "reward": 1, + "passed": True, + "provenance": {"commit_sha": "abc123"}, + } + ) + assert measurements.total_tokens == 120 + assert measurements.runtime_sec == 4.5 + assert measurements.reward == 1.0 + assert measurements.passed is True + assert measurements.provenance["commit_sha"] == "abc123" + + +def test_from_metadata_applies_fallbacks_and_ignores_bad_types() -> None: + # duration_ms -> runtime_sec, passed -> reward, bool is not a token count. 
+ measurements = AttemptMeasurements.from_metadata( + {"duration_ms": 2500, "passed": False, "total_tokens": True} + ) + assert measurements.runtime_sec == 2.5 + assert measurements.reward == 0.0 + assert measurements.total_tokens is None + + empty = AttemptMeasurements.from_metadata(None) + assert empty.reward is None and empty.runtime_sec is None and empty.provenance == {} + + +def test_to_metadata_round_trips_only_set_values() -> None: + payload = AttemptMeasurements(total_tokens=10, runtime_sec=1.0, reward=1.0).to_metadata() + assert payload == {"total_tokens": 10, "runtime_sec": 1.0, "reward": 1.0} diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_orchestrator.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_orchestrator.py new file mode 100644 index 0000000000..d5acd5bd3f --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_orchestrator.py @@ -0,0 +1,131 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tests for the generic agent-eval orchestrator (online + offline paths).""" + +from __future__ import annotations + +import json +from collections.abc import Sequence +from pathlib import Path + +import pytest +from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric +from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig +from nemo_evaluator_sdk.agent_eval.types import ( + AgentEvalAttempt, + AgentEvalRunConfig, + AgentEvalTask, + AgentOutput, +) +from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult + + +class _ExtraMetric: + @property + def type(self) -> str: + return "extra" + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("extra")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + return MetricResult(outputs=[MetricOutput(name="extra", value=1.0)]) + + +class _FakeRuntime: + def __init__(self) -> None: + self.prepared_ids: list[str] = [] + + async def run_tasks( + self, tasks: Sequence[AgentEvalTask], config: AgentEvalRunConfig | None = None + ) -> Sequence[AgentEvalAttempt]: + return [ + AgentEvalAttempt( + id=f"{task.id}:fake", + task_id=task.id, + status="completed", + output=AgentOutput(text="ok"), + metadata={"agent_ok": True}, + ) + for task in tasks + ] + + +def _task() -> AgentEvalTask: + return AgentEvalTask(id="demo", intent="do it", inputs={}, metrics=[AgentPhaseSuccessMetric()]) + + +@pytest.mark.asyncio +async def test_run_tasks_appends_extra_metrics_and_runs_prepare_hook(tmp_path: Path) -> None: + runtime = _FakeRuntime() + seen: list[str] = [] + orch = AgentEvalOrchestrator( + config=OrchestratorConfig(write_dashboard=False, write_gate=True), + extra_metrics=[_ExtraMetric()], + ) + + result = await orch.run_tasks( + [_task()], + target=runtime, + benchmark={"benchmark": "demo"}, + output_dir=tmp_path, + 
run_id="run-1", + prepare_task=lambda task: seen.append(task.id), + ) + + assert seen == ["demo"] + assert {m.type for m in result.tasks[0].metrics} == {"agent_phase_success", "extra"} + assert result.attempts[0].status == "completed" + # Gate is written next to the run bundle. + assert (tmp_path / "gate.json").exists() + + +@pytest.mark.asyncio +async def test_score_attempts_offline_does_not_invoke_runtime() -> None: + orch = AgentEvalOrchestrator(config=OrchestratorConfig(write_dashboard=False, write_gate=False)) + attempt = AgentEvalAttempt( + id="demo:stored", + task_id="demo", + status="completed", + output=AgentOutput(text="ok"), + metadata={"agent_ok": True}, + ) + result = await orch.score_attempts([_task()], attempts=[attempt]) + assert [m.type for m in result.tasks[0].metrics] == ["agent_phase_success"] + assert any(r.metric_type == "agent_phase_success" for r in result.results) + + +@pytest.mark.asyncio +async def test_extra_metrics_deduplicated_by_type() -> None: + task = AgentEvalTask(id="demo", intent="i", inputs={}, metrics=[AgentPhaseSuccessMetric(), _ExtraMetric()]) + orch = AgentEvalOrchestrator( + config=OrchestratorConfig(write_dashboard=False, write_gate=False), + extra_metrics=[_ExtraMetric()], + ) + attempt = AgentEvalAttempt(id="demo:s", task_id="demo", status="completed", output=AgentOutput(text="ok")) + result = await orch.score_attempts([task], attempts=[attempt]) + types = [m.type for m in result.tasks[0].metrics] + assert types.count("extra") == 1 + + +def test_result_dir_attempt_source_protocol_shape(tmp_path: Path) -> None: + # A minimal AgentAttemptSource implementation satisfies the protocol. 
+ from nemo_evaluator_sdk.agent_eval.types import AgentAttemptSource + + class _Source: + def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt: + payload = json.loads(Path(source).read_text(encoding="utf-8")) + return AgentEvalAttempt( + id=f"{task.id}:stored", + task_id=task.id, + status="completed", + output=AgentOutput(text=payload["agent"]), + ) + + src_path = tmp_path / "result.json" + src_path.write_text(json.dumps({"agent": "ok"}), encoding="utf-8") + source: AgentAttemptSource = _Source() + assert isinstance(source, AgentAttemptSource) + attempt = source.load_attempt(src_path, task=_task()) + assert attempt.task_id == "demo" diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_verify.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_verify.py new file mode 100644 index 0000000000..136fda6075 --- /dev/null +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_verify.py @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for the generic verifier mechanic.""" + +from __future__ import annotations + +from pathlib import Path + +from nemo_evaluator_sdk.agent_eval.runtimes.verify import ( + apply_verify_to_metadata, + collect_verifier_outcome, + skipped_outcome, +) + + +def test_collect_reads_reward_file_when_present(tmp_path: Path) -> None: + (tmp_path / "reward.txt").write_text("1\n", encoding="utf-8") + (tmp_path / "test-stdout.txt").write_text("PASSED", encoding="utf-8") + outcome = collect_verifier_outcome(ok=False, exit_code=3, log_dir=tmp_path) + # reward.txt is authoritative even when the process exit said not-ok. 
+ assert outcome.ran and outcome.reward == 1 and outcome.exit_code == 3 + assert outcome.stdout == "PASSED" + + +def test_collect_derives_and_writes_reward_when_missing(tmp_path: Path) -> None: + outcome = collect_verifier_outcome(ok=True, exit_code=0, log_dir=tmp_path) + assert outcome.reward == 1 and outcome.passed is True + assert (tmp_path / "reward.txt").read_text(encoding="utf-8").strip() == "1" + + +def test_apply_to_metadata_stamps_and_skips(tmp_path: Path) -> None: + meta: dict[str, object] = {} + apply_verify_to_metadata(meta, skipped_outcome()) + assert meta == {"verify_status": "skipped"} + + meta2: dict[str, object] = {} + apply_verify_to_metadata(meta2, collect_verifier_outcome(ok=True, exit_code=0, log_dir=tmp_path)) + assert meta2["verify_status"] == "ok" and meta2["reward"] == 1 and meta2["passed"] is True diff --git a/tests/agentic-use/runtimes/COMPLIANCE.md b/tests/agentic-use/runtimes/COMPLIANCE.md index b7d55b1b13..3631361cad 100644 --- a/tests/agentic-use/runtimes/COMPLIANCE.md +++ b/tests/agentic-use/runtimes/COMPLIANCE.md @@ -7,6 +7,23 @@ and `AgentAttemptRuntime` in `nemo_evaluator_sdk.agent_eval`). Design reference: internal agent-eval SDK doc (`https://docs.google.com/document/d/1mA9Kl6LVJFlgbj5CGulUOiaGyliP7QhqBh7jKXFGifM`). +## Adapter-over-SDK note + +The generic building blocks have been **promoted into the SDK** +(`nemo_evaluator_sdk.agent_eval`): the environment boundary +(`runtimes.environment`/`environment_spec`/`docker`), gating (`gating`), attempt +helpers (`attempts`), generic layout (`runtimes.layout`), reusable metrics +(`common_metrics`: `AgentPhaseSuccessMetric` + a real metric-over-evidence +`EvidencePresenceMetric`), the generic orchestrator (`orchestrator`), the +`AgentAttemptSource` protocol, the verifier mechanic (`runtimes.verify`), and the +coding-agent driver seam (`runtimes.coding_agent`). 
The `shared/*` modules +referenced below are now **re-export shims** over those SDK homes (see +`README.md` for the shim→SDK table); only NeMo-Platform specifics +(`task_loader`, `result_adapter`, `config`, the pytest verifier command, the +`state` evidence key, `task_image_tag`) remain platform code. A CI grep gate +(`packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py`) keeps +`agent_eval/` free of NeMo-Platform imports. + ## Scope split (per SDK design) | `nat_runner` responsibility | Belongs in `AgentAttemptRuntime`? | Current location | diff --git a/tests/agentic-use/runtimes/README.md b/tests/agentic-use/runtimes/README.md index 90317f204a..d5ecff2c38 100644 --- a/tests/agentic-use/runtimes/README.md +++ b/tests/agentic-use/runtimes/README.md @@ -1,32 +1,62 @@ # Agentic-use AgentAttemptRuntime implementations -Backend-specific runtimes extracted from `nat_runner.py` for use with -`nemo_evaluator_sdk.agent_eval.AgentEvaluator`. +NeMo-Platform **adapter** over the generic agent-eval framework in +`nemo_evaluator_sdk.agent_eval`. The backend-agnostic building blocks (environment +boundary, gating, attempt/evidence helpers, orchestrator, verify mechanic, +coding-agent driver seam) now live in the SDK; this directory holds only the +NeMo-Platform glue (the `workflow`/`aut` backends, agentic task/result formats, +the pytest verifier, the platform Docker build/image-tag) plus a thin factory. 
+ +## Architecture: adapter over SDK + +The `shared/*` modules below are **pure re-export shims** over their SDK homes — +they exist only so existing imports keep working; the logic lives in the SDK: + +| `shared/` shim | SDK home | +|----------------|----------| +| `docker.py` | `agent_eval.runtimes.docker` | +| `environment.py` | `agent_eval.runtimes.environment` (re-supplies the platform image-tag) | +| `environment_spec.py` | `agent_eval.runtimes.environment_spec` | +| `reporting.py` | `agent_eval.gating` | +| `verify.py` | wraps `agent_eval.runtimes.verify` (pytest command/env/mounts stay here) | +| `metrics.py` | `AgentPhaseSuccessMetric` from `agent_eval.common_metrics` (namespaced); `VerifierRewardMetric` is platform | +| `artifacts.py` | `resolve_attempt_status` + evidence keys from `agent_eval.attempts`; adds the platform `state` key | +| `layout.py` | delegates to `agent_eval.runtimes.layout`; adds the platform `state_dir` + `task_image_tag` | + +The orchestrator (`orchestrator.py`) is a thin factory over +`agent_eval.orchestrator.AgentEvalOrchestrator`: it injects the platform image +build (`prepare_task`), the `run_verify`-derived `VerifierRewardMetric` +(`extra_metrics`), and the `result.json` `AgentAttemptSource`. 
## Layout ```text runtimes/ - shared/ # backend-agnostic building blocks: - # docker.py Docker exec + build helpers - # environment.py AgentEnvironmentProvider/Handle boundary (B2) - # environment_spec.py environment.yaml authoring + build plans (B3) - # layout.py per-run output layout - # task_loader.py agentic-use task -> AgentEvalTask - # container_env.py base container env vars - # artifacts.py agent artifacts -> AgentEvalAttempt (+ evidence) - # result_adapter.py nat_runner result.json -> AgentEvalAttempt (B1/B4) - # verify.py live VERIFY via run_verifier - # reporting.py summary + candidate/baseline gate (B4) - # metrics.py AgentPhaseSuccessMetric, VerifierRewardMetric - workflow/ # NatWorkflowAttemptRuntime (implemented) - aut/ # AutAgentAttemptRuntime (implemented) - claude_code/ # ClaudeCodeAgentAttemptRuntime (scaffold) - codex/ # CodexAgentAttemptRuntime (scaffold) - cursor_agent/ # CursorAgentAttemptRuntime (scaffold) - orchestrator.py # BUILD (env spec) + AgentEvaluator + gate; verify runs in the runtime + shared/ # thin re-export shims over agent_eval.* (see table above) + # + platform-only: task_loader.py, result_adapter.py, + # config.py, container_env.py, constants.py + workflow/ # NatWorkflowAttemptRuntime (implemented, NeMo construct) + aut/ # AutAgentAttemptRuntime (implemented, NeMo construct) + claude_code/ # scaffold (stub) — see "Coding-agent runtimes" below + codex/ # scaffold (stub) + cursor_agent/ # scaffold (stub) + orchestrator.py # thin factory over agent_eval.orchestrator.AgentEvalOrchestrator ``` +## Coding-agent runtimes (SDK driver seam) + +Coding-agent CLIs plug into the SDK via +`agent_eval.runtimes.coding_agent`: `CliAgentDriver` (the reusable driver) + +`CodingAgentSpec` (per-agent command builder + trajectory→evidence parser). +Reference `ClaudeCodeSpec`/`CursorAgentSpec` are shipped. The profbench codex +runtime (`agent_eval.runtimes.codex`) remains a separate, standalone-CLI runtime. 
+ +The agentic-use `codex`/`claude_code`/`cursor_agent` backends here are still +stubs: wiring them to run the SDK driver *inside* the `nmp-agentic-base` Docker +environment (like `workflow`/`aut`) is bespoke per agent and a tracked follow-up. +`workflow` and `aut` stay in the adapter — they implement `AgentAttemptRuntime` +but are NeMo constructs, not general SDK runtimes. + ## Example: workflow backend From the repository root (requires Docker + built task image): diff --git a/tests/agentic-use/runtimes/orchestrator.py b/tests/agentic-use/runtimes/orchestrator.py index 94eb00050e..8a355fbdfc 100644 --- a/tests/agentic-use/runtimes/orchestrator.py +++ b/tests/agentic-use/runtimes/orchestrator.py @@ -1,7 +1,14 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -"""Orchestrate BUILD + AgentEvaluator + VERIFY for agentic-use tasks.""" +"""Agentic-use adapter over the generic SDK orchestrator. + +This is a thin NeMo-Platform factory: the generic run/score/gate loop lives in +:class:`nemo_evaluator_sdk.agent_eval.orchestrator.AgentEvalOrchestrator`. Here we +inject the platform specifics it deliberately does not know about — the agentic +task loader, the Docker image build (``prepare_task``), the ``run_verify``-derived +``VerifierRewardMetric``, and the ``result.json`` :class:`AgentAttemptSource`. 
+""" from __future__ import annotations @@ -10,7 +17,7 @@ from pathlib import Path from typing import Any -from nemo_evaluator_sdk.agent_eval import AgentEvalRunConfig, AgentEvaluator +from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig from nemo_evaluator_sdk.agent_eval.types import ( AgentAttemptRuntime, AgentEvalRunResult, @@ -22,8 +29,8 @@ from runtimes.shared.environment_spec import execute_build_plan, plan_task_build from runtimes.shared.layout import task_image_tag from runtimes.shared.metrics import VerifierRewardMetric -from runtimes.shared.reporting import GateThresholds, evaluate_gate, load_baseline_summary, write_gate_report -from runtimes.shared.result_adapter import attempt_from_result_dir +from runtimes.shared.reporting import GateThresholds +from runtimes.shared.result_adapter import ResultDirAttemptSource from runtimes.shared.task_loader import agentic_task_from_dir @@ -38,7 +45,7 @@ class AgenticOrchestratorConfig: class AgenticEvalOrchestrator: - """Run agentic-use tasks through AgentEvaluator and optional verify phase.""" + """Run agentic-use tasks through the generic orchestrator + optional verify metric.""" def __init__( self, @@ -48,6 +55,16 @@ def __init__( ) -> None: self.runtime = runtime self.config = config or AgenticOrchestratorConfig() + self._orchestrator = AgentEvalOrchestrator( + config=OrchestratorConfig( + parallelism=1, + write_dashboard=self.config.write_dashboard, + write_gate=self.config.write_gate, + gate_thresholds=self.config.gate_thresholds, + baseline_summary_path=self.config.baseline_summary_path, + ), + extra_metrics=self._extra_metrics(), + ) async def run_agent_eval( self, @@ -58,25 +75,15 @@ async def run_agent_eval( ) -> AgentEvalRunResult: """Build the task image when needed, run the agent runtime, return SDK result.""" task = agentic_task_from_dir(task_name) - task = task.model_copy(update={"metrics": self._metrics_for_task(task)}) - image_tag = task_image_tag(task.id) 
- self._ensure_task_image(task.metadata["task_dir"], image_tag) - - result = await AgentEvaluator().run( - tasks=[task], + return await self._orchestrator.run_tasks( + [task], target=self.runtime, - config=AgentEvalRunConfig( - output_dir=output_dir, - run_id=run_id, - parallelism=1, - write_dashboard=self.config.write_dashboard, - benchmark={"benchmark": "agentic-use", "task": task_name}, - ), + benchmark={"benchmark": "agentic-use", "task": task_name}, + output_dir=output_dir, + run_id=run_id, + prepare_task=self._ensure_task_image, ) - self._maybe_write_gate(result) - return result - async def score_captured_attempts( self, task_name: str, @@ -87,61 +94,38 @@ async def score_captured_attempts( ) -> AgentEvalRunResult: """Score already-captured ``result.json`` runs without re-running the agent. - This is the SDK's first-class *stored-attempt* path: it imports each - ``nat_runner`` output directory via :func:`attempt_from_result_dir` and - scores them through :class:`AgentEvaluator`, so metrics can be exercised - (and runs rescored) with no Docker/agent execution. + The SDK's first-class *stored-attempt* path: each ``nat_runner`` output + dir is adapted via :class:`ResultDirAttemptSource` and scored through the + generic orchestrator, so metrics can be exercised (and runs rescored) with + no Docker/agent execution. 
""" task = agentic_task_from_dir(task_name) - task = task.model_copy(update={"metrics": self._metrics_for_task(task)}) - attempts = [attempt_from_result_dir(result_dir, task=task) for result_dir in result_dirs] - - result = await AgentEvaluator().run( - tasks=[task], + source = ResultDirAttemptSource() + attempts = [source.load_attempt(result_dir, task=task) for result_dir in result_dirs] + return await self._orchestrator.score_attempts( + [task], attempts=attempts, - config=AgentEvalRunConfig( - output_dir=output_dir, - run_id=run_id, - parallelism=1, - write_dashboard=self.config.write_dashboard, - benchmark={"benchmark": "agentic-use", "task": task_name, "mode": "offline"}, - ), + benchmark={"benchmark": "agentic-use", "task": task_name, "mode": "offline"}, + output_dir=output_dir, + run_id=run_id, ) - self._maybe_write_gate(result) - return result + def _extra_metrics(self) -> list[Metric]: + """Append :class:`VerifierRewardMetric` only when the runtime runs verify. - def _maybe_write_gate(self, result: AgentEvalRunResult) -> None: - if not (self.config.write_gate and result.output_dir is not None): - return - baseline = ( - load_baseline_summary(self.config.baseline_summary_path) - if self.config.baseline_summary_path is not None - else None - ) - report = evaluate_gate(result, thresholds=self.config.gate_thresholds, baseline_summary=baseline) - write_gate_report(report, result.output_dir) - - def _metrics_for_task(self, task: AgentEvalTask) -> list[Metric]: - """Honor task-authored metrics; only *append* a compatibility metric. - - Metrics originate on the task (see ``agentic_task_from_dir``). When the - live verify phase is enabled we append :class:`VerifierRewardMetric` so - the legacy pytest reward is scored too — but we never replace the task's - own metric set, and we avoid duplicating a metric the task already - declares (the SDK rejects duplicate metric types). 
+ The verify-enable decision stays in the adapter (it knows its own runtime + config); the generic orchestrator never introspects the runtime. """ - metrics: list[Metric] = list(task.metrics) - if self._verify_enabled() and not any(isinstance(metric, VerifierRewardMetric) for metric in metrics): - metrics.append(VerifierRewardMetric()) - return metrics + return [VerifierRewardMetric()] if self._verify_enabled() else [] def _verify_enabled(self) -> bool: runtime_config = getattr(self.runtime, "config", None) shared = getattr(runtime_config, "shared", None) return bool(getattr(shared, "run_verify", False)) - def _ensure_task_image(self, task_dir: str | Path, image_tag: str) -> None: + def _ensure_task_image(self, task: AgentEvalTask) -> None: + image_tag = task_image_tag(task.id) + task_dir = task.metadata["task_dir"] if self.config.skip_build: if not docker_image_exists(image_tag): raise RuntimeError( diff --git a/tests/agentic-use/runtimes/shared/artifacts.py b/tests/agentic-use/runtimes/shared/artifacts.py index 4942568635..912c12a7a0 100644 --- a/tests/agentic-use/runtimes/shared/artifacts.py +++ b/tests/agentic-use/runtimes/shared/artifacts.py @@ -15,9 +15,9 @@ AgentAttemptTrace, CapturedAgentAttempt, ) +from nemo_evaluator_sdk.agent_eval.attempts import resolve_attempt_status, standard_evidence_descriptors from nemo_evaluator_sdk.agent_eval.types import ( AgentEvalAttempt, - AgentEvalAttemptStatus, AgentEvalTask, AgentOutput, ) @@ -27,20 +27,11 @@ from runtimes.shared.layout import AgenticRunLayout from runtimes.shared.usage import extract_usage_metrics - -def resolve_attempt_status(agent_ok: bool) -> AgentEvalAttemptStatus: - """Map an agent-phase outcome to a *scorable* attempt status. - - The SDK's :class:`AgentEvaluator` excludes ``status=="failed"`` from scoring - (it raises). An agent that ran but failed must still be scored — e.g. as a - ``0`` by :class:`AgentPhaseSuccessMetric` — so that pass-rate gating counts - it rather than dropping it. 
We therefore use ``"partial"`` for an - executed-but-unsuccessful agent and reserve ``"failed"`` for genuine - attempt-*production* failures (which a runtime surfaces by raising, not by - emitting an unscorable attempt). This keeps the live builder and the - ``result.json`` importer consistent. - """ - return "completed" if agent_ok else "partial" +__all__ = [ + "build_agent_eval_attempt", + "resolve_attempt_status", + "to_captured_agent_attempt", +] def build_agent_eval_attempt( @@ -156,48 +147,20 @@ def _evidence_descriptors( *, initial_state_ref: str | None = None, ) -> dict[str, EvidenceDescriptor]: - """Build the evidence map specified by the agent-eval SDK design doc. + """Compose the SDK's standard evidence keys + the platform ``state`` extension. - Doc keys: ``initial_state`` (task input filesystem, when staged), - ``final_state`` (workspace), ``trace`` (trajectory, ATIF-normalized), - ``logs`` (agent log dir), and ``verifier_logs`` (verifier log dir). - - ``state`` is a NeMo-Platform-specific *extension* (not a doc key): it carries - the preserved platform/database state across the agent + verifier phases. + The doc-standard keys (``initial_state``/``trace``/``logs``/``final_state``/ + ``verifier_logs``) come from :func:`standard_evidence_descriptors`. ``state`` + is a NeMo-Platform-specific *extension* (not a doc key): it carries the + preserved platform/database state across the agent + verifier phases. """ - descriptors: dict[str, EvidenceDescriptor] = {} - - # task input filesystem → evidence["initial_state"] (only when a seed was staged). - if initial_state_ref: - descriptors["initial_state"] = EvidenceDescriptor( - kind="filesystem", - format="dir", - ref=initial_state_ref, - metadata={"role": "initial_state"}, - ) - - # agent/trajectory.json → evidence["trace"], preferably ATIF-normalized. 
- if artifacts.atif_trajectory_path is not None: - descriptors["trace"] = EvidenceDescriptor( - kind="trace", - format="atif" if artifacts.atif_trajectory_path.name.startswith("atif") else "json", - ref=str(artifacts.atif_trajectory_path), - ) - - # agent/ logs → evidence["logs"]. - descriptors["logs"] = EvidenceDescriptor( - kind="logs", - format="dir", - ref=str(layout.agent_log_dir), - metadata={"primary_log": "nat_agent.log"}, - ) - - # workspace/ → evidence["final_state"] filesystem descriptor. - descriptors["final_state"] = EvidenceDescriptor( - kind="filesystem", - format="dir", - ref=str(layout.workspace_dir), - metadata={"role": "final_state"}, + descriptors = standard_evidence_descriptors( + logs_dir=layout.agent_log_dir, + final_state_dir=layout.workspace_dir, + trace_path=artifacts.atif_trajectory_path, + initial_state_ref=initial_state_ref, + verifier_logs_dir=layout.run_dir / "verifier", + primary_log="nat_agent.log", ) # Platform extension (non-doc key): preserved platform/db state across phases. @@ -208,16 +171,6 @@ def _evidence_descriptors( metadata={"role": "platform_state", "extension": "nemo-platform"}, ) - # verifier/ logs → evidence["verifier_logs"] (present once verify phase runs). - verifier_log_dir = layout.run_dir / "verifier" - if verifier_log_dir.exists(): - descriptors["verifier_logs"] = EvidenceDescriptor( - kind="logs", - format="dir", - ref=str(verifier_log_dir), - metadata={"role": "verifier"}, - ) - return descriptors diff --git a/tests/agentic-use/runtimes/shared/docker.py b/tests/agentic-use/runtimes/shared/docker.py index 431d646806..ce3cc6cc22 100644 --- a/tests/agentic-use/runtimes/shared/docker.py +++ b/tests/agentic-use/runtimes/shared/docker.py @@ -1,87 +1,26 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -"""Docker helpers for agentic-use runtimes.""" +"""Compatibility shim — Docker helpers were promoted to the Evaluator SDK. 
-from __future__ import annotations - -import os -import subprocess -from collections.abc import Sequence - - -def redact_cmd_for_logging(cmd: Sequence[str]) -> list[str]: - """Redact secret values in command logs.""" - redacted: list[str] = [] - sensitive_markers = ("KEY", "TOKEN", "SECRET", "PASSWORD") - for token in cmd: - if "=" not in token: - redacted.append(token) - continue - left, right = token.split("=", 1) - env_key = left.split()[-1] if left else left - if any(marker in env_key.upper() for marker in sensitive_markers): - redacted.append(f"{left}=***REDACTED***") - else: - redacted.append(f"{left}={right}") - return redacted - - -def docker_run( - image: str, - command: list[str], - *, - env: dict[str, str] | None = None, - mounts: list[tuple[str, str]] | None = None, - workdir: str | None = None, - remove: bool = True, - timeout: int | None = None, - extra_args: list[str] | None = None, -) -> subprocess.CompletedProcess[str]: - """Run a command inside a Docker container.""" - cmd = ["docker", "run"] - if remove: - cmd.append("--rm") - if workdir: - cmd += ["-w", workdir] - - for key, value in (env or {}).items(): - cmd += ["-e", f"{key}={value}"] - - for host_path, container_path in mounts or []: - cmd += ["-v", f"{host_path}:{container_path}"] +Import from ``nemo_evaluator_sdk.agent_eval.runtimes.docker`` directly; this +module re-exports the same symbols so existing adapter imports keep working. 
+""" - docker_extra = (extra_args or []) + (os.environ.get("DOCKER_EXTRA_ARGS", "").split() or []) - cmd += docker_extra - cmd.append(image) - cmd += command - - print(f"[agentic-runtime] $ {' '.join(redact_cmd_for_logging(cmd))}") - kwargs: dict[str, object] = {"check": False, "text": True} - if timeout is not None: - kwargs["timeout"] = timeout - return subprocess.run(cmd, **kwargs) - - -def docker_image_exists(tag: str) -> bool: - """Return True when a Docker image tag exists locally.""" - result = subprocess.run(["docker", "image", "inspect", tag], capture_output=True, text=True, check=False) - return result.returncode == 0 - - -def build_dockerfile(dockerfile: os.PathLike[str], context_dir: os.PathLike[str], tag: str) -> None: - """Build a Docker image from an explicit Dockerfile + build context.""" - cmd = ["docker", "build", "-f", str(dockerfile), "-t", tag, str(context_dir)] - print(f"[agentic-runtime] $ {' '.join(cmd)}") - subprocess.run(cmd, check=True) - - -def build_task_image(task_dir: os.PathLike[str], tag: str) -> None: - """Build a task-specific Docker image from environment/Dockerfile.""" - from pathlib import Path +from __future__ import annotations - root = Path(task_dir) - env_dockerfile = root / "environment" / "Dockerfile" - if not env_dockerfile.exists(): - raise FileNotFoundError(f"No environment/Dockerfile found in {root}") - build_dockerfile(env_dockerfile, env_dockerfile.parent, tag) +from nemo_evaluator_sdk.agent_eval.runtimes.docker import ( + build_dockerfile, + build_task_image, + docker_image_exists, + docker_run, + redact_cmd_for_logging, +) + +__all__ = [ + "build_dockerfile", + "build_task_image", + "docker_image_exists", + "docker_run", + "redact_cmd_for_logging", +] diff --git a/tests/agentic-use/runtimes/shared/environment.py b/tests/agentic-use/runtimes/shared/environment.py index fe23893668..08e55ce2ed 100644 --- a/tests/agentic-use/runtimes/shared/environment.py +++ b/tests/agentic-use/runtimes/shared/environment.py @@ 
-1,125 +1,50 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -"""Environment provider boundary for agentic-use runtimes. +"""Compatibility shim — the environment boundary was promoted to the Evaluator SDK. -This is the design-doc's ``EnvironmentProvider`` boundary (section B2): it sits -*below* :class:`AgentAttemptRuntime` so a runtime never needs to know whether -the agent/verifier execute under Docker, locally, Harbor, or NeMo Gym. Today the -only implementation is :class:`DockerEnvironmentProvider`, which wraps -``shared/docker.py``. - -Deviation from the doc sketch: the doc proposes ``run_agent(instruction, config) --> AgentEvalAttempt``. We keep the boundary at "execute a command in the -prepared environment" (returning an :class:`EnvCommandResult`) because each -backend builds its own command/env/mounts, and attempt construction is owned by -``shared/artifacts.py``. This keeps command-building and attempt-shaping out of -the environment layer so new providers only implement process execution. +The generic boundary now lives in +``nemo_evaluator_sdk.agent_eval.runtimes.environment``. The only platform-specific +piece kept here is the default task→image mapping (``nmp-nat-<task_id>:latest``): the +adapter's :class:`DockerEnvironmentProvider` injects :func:`task_image_tag` so +``DockerEnvironmentProvider()`` keeps producing platform-tagged images.
""" from __future__ import annotations -import asyncio -import subprocess from collections.abc import Callable -from dataclasses import dataclass, field -from typing import Protocol, runtime_checkable -from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunConfig, AgentEvalTask +from nemo_evaluator_sdk.agent_eval.runtimes.environment import ( + AbstractEnvironmentHandle, + AgentEnvironmentHandle, + AgentEnvironmentProvider, + DockerEnvironmentHandle, + EnvCommandResult, + EnvRole, + EnvRunSpec, + default_image_tag, +) +from nemo_evaluator_sdk.agent_eval.runtimes.environment import ( + DockerEnvironmentProvider as _SDKDockerEnvironmentProvider, +) -from runtimes.shared.docker import docker_run from runtimes.shared.layout import task_image_tag - -@dataclass(frozen=True) -class EnvCommandResult: - """Outcome of running a single command inside a prepared environment.""" - - exit_code: int - timed_out: bool = False - - @property - def ok(self) -> bool: - return self.exit_code == 0 and not self.timed_out - - -@dataclass -class EnvRunSpec: - """How to execute one command inside an environment handle.""" - - command: list[str] - env: dict[str, str] = field(default_factory=dict) - mounts: list[tuple[str, str]] = field(default_factory=list) - workdir: str | None = None - timeout: int | None = None - extra_args: list[str] = field(default_factory=list) - - -@runtime_checkable -class AgentEnvironmentHandle(Protocol): - """A prepared, single-task environment that can run agent/verifier commands.""" - - async def run_agent(self, spec: EnvRunSpec) -> EnvCommandResult: ... - - async def run_verifier(self, spec: EnvRunSpec) -> EnvCommandResult: ... - - async def close(self) -> None: ... 
+__all__ = [ + "AbstractEnvironmentHandle", + "AgentEnvironmentHandle", + "AgentEnvironmentProvider", + "DockerEnvironmentHandle", + "DockerEnvironmentProvider", + "EnvCommandResult", + "EnvRole", + "EnvRunSpec", + "default_image_tag", +] -@runtime_checkable -class AgentEnvironmentProvider(Protocol): - """Creates per-task environment handles. Pluggable: Docker now, Gym later.""" - - async def prepare( - self, - task: AgentEvalTask, - config: AgentEvalRunConfig | None = None, - ) -> AgentEnvironmentHandle: ... - - -class DockerEnvironmentHandle: - """Docker-backed environment handle bound to one task image.""" - - def __init__(self, image: str) -> None: - self.image = image - - async def run_agent(self, spec: EnvRunSpec) -> EnvCommandResult: - return await self._run(spec) - - async def run_verifier(self, spec: EnvRunSpec) -> EnvCommandResult: - return await self._run(spec) - - async def _run(self, spec: EnvRunSpec) -> EnvCommandResult: - try: - result = await asyncio.to_thread( - docker_run, - self.image, - spec.command, - env=spec.env, - mounts=spec.mounts, - workdir=spec.workdir, - timeout=spec.timeout, - extra_args=spec.extra_args, - ) - except subprocess.TimeoutExpired: - return EnvCommandResult(exit_code=124, timed_out=True) - return EnvCommandResult(exit_code=result.returncode) - - async def close(self) -> None: - # `docker run --rm` cleans up the container; nothing persistent to release. 
- return None - - -class DockerEnvironmentProvider: - """Default provider that maps each task to its built Docker image.""" +class DockerEnvironmentProvider(_SDKDockerEnvironmentProvider): + """Platform default: map ``task.id`` to ``nmp-nat-<task_id>:latest``.""" def __init__(self, *, image_tag_fn: Callable[[str], str] = task_image_tag) -> None: - self._image_tag_fn = image_tag_fn - - async def prepare( - self, - task: AgentEvalTask, - config: AgentEvalRunConfig | None = None, - ) -> DockerEnvironmentHandle: - del config - return DockerEnvironmentHandle(self._image_tag_fn(task.id)) + super().__init__(image_tag_fn=image_tag_fn) diff --git a/tests/agentic-use/runtimes/shared/environment_spec.py b/tests/agentic-use/runtimes/shared/environment_spec.py index cd5630926f..9cdd3db71f 100644 --- a/tests/agentic-use/runtimes/shared/environment_spec.py +++ b/tests/agentic-use/runtimes/shared/environment_spec.py @@ -1,181 +1,32 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -"""Reusable environment authoring for agentic-use tasks (design-doc B3). +"""Compatibility shim — environment authoring was promoted to the Evaluator SDK. -Moves task authoring away from an implicit "Dockerfile per task" toward a small, -declarative ``environment.yaml`` spec, while keeping a Dockerfile escape hatch. - -Spec shape (``environment.yaml`` in the task dir):: - - environment: - image: nemo-platform-agentic-base:2026.06 - profile: evaluator-platform - dependencies: - python: - - pytest - - nemo-evaluator-sdk - setup: - - seed-providers - - create-workspace - -Escape hatch:: - - environment: - dockerfile: environment/Dockerfile - -Resolution is deliberately minimal: a spec is turned into a :class:`BuildPlan` -(a Dockerfile + build context + target tag). The Dockerfile path is used as-is; -an ``image``-based spec generates a tiny derived Dockerfile (``FROM <image>`` plus -optional ``pip install``).
``setup`` steps are carried as plan metadata — they are -runtime concerns (e.g. seed-providers) handled outside the image build — so this -module does not execute them. +Import from ``nemo_evaluator_sdk.agent_eval.runtimes.environment_spec`` directly; +this module re-exports the same symbols so existing adapter imports keep working. """ from __future__ import annotations -from dataclasses import dataclass, field -from pathlib import Path - -import yaml - -ENVIRONMENT_SPEC_FILENAME = "environment.yaml" -DEFAULT_DOCKERFILE_RELPATH = "environment/Dockerfile" - - -@dataclass(frozen=True) -class EnvironmentSpec: - """Declarative environment for one task (or a Dockerfile escape hatch).""" - - image: str | None = None - profile: str | None = None - python_dependencies: list[str] = field(default_factory=list) - setup: list[str] = field(default_factory=list) - dockerfile: Path | None = None - - def __post_init__(self) -> None: - if self.dockerfile is None and self.image is None: - raise ValueError("environment spec requires either 'image' or 'dockerfile'") - - -def load_environment_spec(task_dir: str | Path) -> EnvironmentSpec: - """Load a task's environment spec. - - Resolution order: - 1. ``environment.yaml`` in the task dir (declarative spec, preferred). - 2. ``environment/Dockerfile`` (backward-compatible escape hatch so existing - tasks work without authoring a spec). 
- """ - root = Path(task_dir) - spec_path = root / ENVIRONMENT_SPEC_FILENAME - if spec_path.is_file(): - return _parse_spec(yaml.safe_load(spec_path.read_text(encoding="utf-8")) or {}, root) - - dockerfile = root / DEFAULT_DOCKERFILE_RELPATH - if dockerfile.is_file(): - return EnvironmentSpec(dockerfile=dockerfile) - - raise FileNotFoundError( - f"No environment defined for task {root}: expected {ENVIRONMENT_SPEC_FILENAME} or {DEFAULT_DOCKERFILE_RELPATH}" - ) - - -def _parse_spec(payload: dict, task_dir: Path) -> EnvironmentSpec: - data = payload.get("environment", payload) if isinstance(payload, dict) else {} - if not isinstance(data, dict): - raise ValueError(f"Invalid environment spec in {task_dir}: expected a mapping") - - dockerfile_value = data.get("dockerfile") - dockerfile = None - if dockerfile_value: - dockerfile = Path(dockerfile_value) - if not dockerfile.is_absolute(): - dockerfile = (task_dir / dockerfile).resolve() - if not dockerfile.is_file(): - raise FileNotFoundError(f"environment.dockerfile not found: {dockerfile}") - - dependencies = data.get("dependencies") or {} - python_deps = dependencies.get("python") if isinstance(dependencies, dict) else None - - return EnvironmentSpec( - image=data.get("image"), - profile=data.get("profile"), - python_dependencies=list(python_deps or []), - setup=list(data.get("setup") or []), - dockerfile=dockerfile, - ) - - -@dataclass(frozen=True) -class BuildPlan: - """A resolved, executable Docker build for one task.""" - - image_tag: str - dockerfile: Path - context_dir: Path - generated: bool - base_image: str | None = None - setup: list[str] = field(default_factory=list) - - -def plan_task_build( - task_dir: str | Path, - image_tag: str, - *, - spec: EnvironmentSpec | None = None, - generated_dir: Path | None = None, -) -> BuildPlan: - """Resolve a task's environment spec into a concrete :class:`BuildPlan`. - - For the Dockerfile escape hatch the existing Dockerfile/context is used. 
For - an ``image``-based spec a minimal derived Dockerfile is written under - ``generated_dir`` (defaults to ``/.agentic-build``). - """ - root = Path(task_dir) - spec = spec or load_environment_spec(root) - - if spec.dockerfile is not None: - return BuildPlan( - image_tag=image_tag, - dockerfile=spec.dockerfile, - context_dir=spec.dockerfile.parent, - generated=False, - setup=list(spec.setup), - ) - - # image-based spec: generate a tiny derived Dockerfile. - context_dir = generated_dir if generated_dir is not None else (root / ".agentic-build") - context_dir.mkdir(parents=True, exist_ok=True) - dockerfile = context_dir / "Dockerfile" - dockerfile.write_text(render_derived_dockerfile(spec), encoding="utf-8") - return BuildPlan( - image_tag=image_tag, - dockerfile=dockerfile, - context_dir=context_dir, - generated=True, - base_image=spec.image, - setup=list(spec.setup), - ) - - -def execute_build_plan(plan: BuildPlan) -> None: - """Build the Docker image described by ``plan``.""" - from runtimes.shared.docker import build_dockerfile - - build_dockerfile(plan.dockerfile, plan.context_dir, plan.image_tag) - - -def render_derived_dockerfile(spec: EnvironmentSpec) -> str: - """Render a minimal derived Dockerfile from an image-based spec.""" - if spec.image is None: - raise ValueError("cannot render a derived Dockerfile without a base image") - lines = [f"FROM {spec.image}"] - if spec.profile: - lines.append(f"LABEL com.nvidia.agentic.profile={spec.profile}") - if spec.python_dependencies: - deps = " ".join(spec.python_dependencies) - lines.append(f"RUN pip install --no-cache-dir {deps}") - if spec.setup: - # Setup steps are runtime concerns; record them for provenance only. 
- lines.append(f'LABEL com.nvidia.agentic.setup="{",".join(spec.setup)}"') - return "\n".join(lines) + "\n" +from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import ( + DEFAULT_DOCKERFILE_RELPATH, + ENVIRONMENT_SPEC_FILENAME, + BuildPlan, + EnvironmentSpec, + execute_build_plan, + load_environment_spec, + plan_task_build, + render_derived_dockerfile, +) + +__all__ = [ + "DEFAULT_DOCKERFILE_RELPATH", + "ENVIRONMENT_SPEC_FILENAME", + "BuildPlan", + "EnvironmentSpec", + "execute_build_plan", + "load_environment_spec", + "plan_task_build", + "render_derived_dockerfile", +] diff --git a/tests/agentic-use/runtimes/shared/layout.py b/tests/agentic-use/runtimes/shared/layout.py index 07a7a2dd17..86a4c5f4f2 100644 --- a/tests/agentic-use/runtimes/shared/layout.py +++ b/tests/agentic-use/runtimes/shared/layout.py @@ -9,6 +9,7 @@ from datetime import UTC, datetime from pathlib import Path +from nemo_evaluator_sdk.agent_eval.runtimes.layout import prepare_run_layout, resolve_run_dir from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunConfig, AgentEvalTask from runtimes.shared.config import AgenticSharedConfig @@ -16,7 +17,11 @@ @dataclass(frozen=True) class AgenticRunLayout: - """Filesystem layout for one task run.""" + """Filesystem layout for one task run. + + Extends the SDK's generic ``RunLayout`` shape with a platform-specific + ``state_dir`` (preserved platform/database state across agent + verifier). + """ run_dir: Path agent_log_dir: Path @@ -44,29 +49,22 @@ def resolve_run_layout( config: AgentEvalRunConfig | None = None, ) -> AgenticRunLayout: """Resolve or create the on-disk layout for one task attempt.""" - if config is not None and config.output_dir is not None: - # Must be absolute: run_dir subpaths are used as Docker bind-mount sources, - # and Docker treats a relative `-v` source as a (slash-free) named volume. 
- run_dir = Path(config.output_dir).resolve() - else: - run_dir = new_run_dir(default_jobs_dir(shared), task.id) - - agent_log_dir = run_dir / "agent" - workspace_dir = run_dir / "workspace" - state_dir = run_dir / "state" - agent_log_dir.mkdir(parents=True, exist_ok=True) - workspace_dir.mkdir(parents=True, exist_ok=True) - state_dir.mkdir(parents=True, exist_ok=True) + output_dir = config.output_dir if config is not None else None + run_dir = resolve_run_dir(output_dir, lambda: new_run_dir(default_jobs_dir(shared), task.id)) - instruction_path = agent_log_dir / "instruction.md" - instruction_path.write_text(task.intent, encoding="utf-8") + # Generic agent/workspace dirs + written instruction come from the SDK helper. + base = prepare_run_layout(run_dir, task.intent) + + # Platform extension: a preserved state dir for platform/db across phases. + state_dir = base.run_dir / "state" + state_dir.mkdir(parents=True, exist_ok=True) return AgenticRunLayout( - run_dir=run_dir, - agent_log_dir=agent_log_dir, - workspace_dir=workspace_dir, + run_dir=base.run_dir, + agent_log_dir=base.agent_log_dir, + workspace_dir=base.workspace_dir, state_dir=state_dir, - instruction_path=instruction_path, + instruction_path=base.instruction_path, ) diff --git a/tests/agentic-use/runtimes/shared/metrics.py b/tests/agentic-use/runtimes/shared/metrics.py index 7c68a590ec..e7b8496caf 100644 --- a/tests/agentic-use/runtimes/shared/metrics.py +++ b/tests/agentic-use/runtimes/shared/metrics.py @@ -1,28 +1,23 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -"""Default metrics for agentic-use agent-eval runs.""" +"""Default metrics for agentic-use agent-eval runs. + +``AgentPhaseSuccessMetric`` is promoted to the SDK; here it is namespaced under +the ``agentic_use_*`` metric type. ``VerifierRewardMetric`` stays a platform +compatibility shim (mirrors the legacy pytest verifier reward). 
+""" from __future__ import annotations +from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric as _SDKAgentPhaseSuccessMetric from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult -class AgentPhaseSuccessMetric: - """Score 1.0 when the agent phase exited successfully, else 0.0.""" +class AgentPhaseSuccessMetric(_SDKAgentPhaseSuccessMetric): + """Agentic-use namespaced agent-phase metric (output stays ``agent_phase_success``).""" - @property - def type(self) -> str: - return "agentic_use_agent_phase" - - def output_spec(self) -> list[MetricOutputSpec]: - return [MetricOutputSpec.continuous_score("agent_phase_success")] - - async def compute_scores(self, input: MetricInput) -> MetricResult: - agent_ok = bool(input.candidate.metadata.get("agent_ok")) - return MetricResult( - outputs=[MetricOutput(name="agent_phase_success", value=1.0 if agent_ok else 0.0)], - ) + metric_type = "agentic_use_agent_phase" class VerifierRewardMetric: diff --git a/tests/agentic-use/runtimes/shared/reporting.py b/tests/agentic-use/runtimes/shared/reporting.py index 34b78fbcb7..7e78de3972 100644 --- a/tests/agentic-use/runtimes/shared/reporting.py +++ b/tests/agentic-use/runtimes/shared/reporting.py @@ -1,457 +1,34 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -"""Deterministic gating + provenance comparison over an agent-eval run bundle. +"""Compatibility shim — gating was promoted to the Evaluator SDK. -This closes the design-doc B4 "CI/reporting" gap. Persistence of -``tasks.jsonl``/``attempts.jsonl``/``results.jsonl``/``summary.json``/``report.html`` -is already handled by the SDK (``agent_eval.persistence.persist_run`` / -``write_dashboard``); this module adds the missing piece: a candidate-vs-baseline -gate (pass-rate, token/cost, runtime tie-breaker) plus deterministic provenance -checks. 
- -The semantics intentionally mirror ``passrate_token_policy_gate.py`` so a summary -produced here is interchangeable with the legacy gate's baseline summary. The -difference is the input: this operates on a typed :class:`AgentEvalRunResult` -(metric scores + attempt metadata) instead of scanning ``result.json`` files. +Import from ``nemo_evaluator_sdk.agent_eval.gating`` directly; this module +re-exports the same symbols so existing adapter imports keep working. """ from __future__ import annotations -import json -from dataclasses import asdict, dataclass, field -from pathlib import Path -from typing import Any - -from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalRunResult, AgentEvalTaskResult - -# Metric outputs, in priority order, that represent a task's pass/reward signal. -DEFAULT_REWARD_OUTPUTS: tuple[str, ...] = ("verifier_reward", "agent_phase_success") - -# Provenance fields collapsed into a single run-level summary (matches the -# legacy gate so baselines are interchangeable). -_PROVENANCE_FIELDS: tuple[str, ...] 
= ( - "commit_sha", - "commit_short", - "commit_dirty", - "branch", - "remote_url", - "agentic_base_image_digest", - "pinned", - "pinned_to_commit", - "pinned_image_tag", +from nemo_evaluator_sdk.agent_eval.gating import ( + DEFAULT_REWARD_OUTPUTS, + GateCheck, + GateReport, + GateThresholds, + evaluate_gate, + load_baseline_summary, + run_gate_checks, + summarize_run, + write_gate_report, ) - -@dataclass(frozen=True) -class GateThresholds: - """Knobs controlling the candidate gate (defaults are the strict CI policy).""" - - min_pass_rate: float = 1.0 - require_token_metrics: bool = False - max_pass_rate_drop: float = 0.0 - max_token_regression_pct: float = 0.0 - max_runtime_regression_pct: float = 0.0 - allow_cross_commit: bool = False - - -@dataclass -class GateCheck: - name: str - passed: bool - details: str - - -@dataclass -class GateReport: - gate_passed: bool - summary: dict[str, Any] - checks: list[GateCheck] = field(default_factory=list) - - def to_payload(self) -> dict[str, Any]: - return { - "gate_passed": self.gate_passed, - "summary": self.summary, - "checks": [asdict(check) for check in self.checks], - } - - -def evaluate_gate( - result: AgentEvalRunResult, - *, - thresholds: GateThresholds | None = None, - baseline_summary: dict[str, Any] | None = None, - reward_outputs: tuple[str, ...] 
= DEFAULT_REWARD_OUTPUTS, -) -> GateReport: - """Summarize a run and apply gate checks, optionally against a baseline.""" - thresholds = thresholds or GateThresholds() - summary = summarize_run(result, reward_outputs=reward_outputs) - checks = run_gate_checks(summary, thresholds=thresholds, baseline_summary=baseline_summary) - return GateReport(gate_passed=all(check.passed for check in checks), summary=summary, checks=checks) - - -def write_gate_report(report: GateReport, output_dir: str | Path, *, filename: str = "gate.json") -> Path: - """Persist the gate report alongside the SDK run bundle.""" - path = Path(output_dir) - path.mkdir(parents=True, exist_ok=True) - gate_path = path / filename - gate_path.write_text(json.dumps(report.to_payload(), indent=2, sort_keys=True) + "\n", encoding="utf-8") - return gate_path - - -def load_baseline_summary(path: str | Path) -> dict[str, Any]: - """Load + normalize a baseline summary (raw summary or a prior gate.json).""" - source = Path(path) - payload = json.loads(source.read_text(encoding="utf-8")) - if not isinstance(payload, dict): - raise ValueError(f"Baseline summary must be a JSON object: {source}") - summary = payload.get("summary") if isinstance(payload.get("summary"), dict) else payload - _validate_baseline_summary(summary, source) - return summary - - -def summarize_run( - result: AgentEvalRunResult, - *, - reward_outputs: tuple[str, ...] 
= DEFAULT_REWARD_OUTPUTS, -) -> dict[str, Any]: - """Aggregate pass-rate, token, runtime, and provenance for one run.""" - attempts_by_task: dict[str, AgentEvalAttempt] = {attempt.task_id: attempt for attempt in result.attempts} - reward_by_task = _rewards_by_task(result.results, reward_outputs) - task_ids = sorted({task.id for task in result.tasks} | set(attempts_by_task)) - - passed = 0 - token_sum = 0 - token_count = 0 - token_unavailable: list[str] = [] - runtime_sum = 0.0 - runtime_count = 0 - runtime_unavailable: list[str] = [] - provenance_inputs: list[dict[str, Any]] = [] - - for task_id in task_ids: - attempt = attempts_by_task.get(task_id) - metadata = attempt.metadata if attempt is not None else {} - - reward_value = _task_reward(task_id, reward_by_task, metadata) - if reward_value >= 1.0: - passed += 1 - - total_tokens = metadata.get("total_tokens") - if isinstance(total_tokens, int): - token_sum += total_tokens - token_count += 1 - else: - token_unavailable.append(task_id) - - runtime_sec = _task_runtime_sec(metadata) - if runtime_sec is not None: - runtime_sum += runtime_sec - runtime_count += 1 - else: - runtime_unavailable.append(task_id) - - prov = metadata.get("provenance") - if isinstance(prov, dict): - provenance_inputs.append(prov) - - total = len(task_ids) - return { - "run_id": result.run_id, - "benchmark": result.benchmark, - "total_tasks": total, - "passed_tasks": passed, - "pass_rate": (passed / total) if total else 0.0, - "task_names": task_ids, - "total_tokens_sum": token_sum if token_count else None, - "avg_total_tokens": (token_sum / token_count) if token_count else None, - "token_metrics_coverage": (token_count / total) if total else 0.0, - "token_metrics_available_tasks": token_count, - "token_metrics_unavailable_tasks": sorted(token_unavailable), - "runtime_sec_sum": runtime_sum if runtime_count else None, - "avg_runtime_sec": (runtime_sum / runtime_count) if runtime_count else None, - "runtime_metrics_coverage": (runtime_count / 
total) if total else 0.0, - "runtime_metrics_available_tasks": runtime_count, - "runtime_metrics_unavailable_tasks": sorted(runtime_unavailable), - "provenance": _aggregate_provenance(provenance_inputs), - } - - -def run_gate_checks( - summary: dict[str, Any], - *, - thresholds: GateThresholds, - baseline_summary: dict[str, Any] | None = None, -) -> list[GateCheck]: - """Apply absolute + relative (vs baseline) gate checks to a summary.""" - checks: list[GateCheck] = [] - total_tasks = int(summary["total_tasks"]) - pass_rate = float(summary["pass_rate"]) - provenance = summary.get("provenance") or {} - - checks.append(GateCheck("non_empty_result_set", total_tasks > 0, f"total_tasks={total_tasks}")) - checks.append( - GateCheck( - "min_pass_rate", - pass_rate >= thresholds.min_pass_rate, - f"pass_rate={pass_rate:.3f}, min_pass_rate={thresholds.min_pass_rate:.3f}", - ) - ) - checks.append(_commit_consistency_check(provenance)) - - if thresholds.require_token_metrics: - token_coverage = float(summary["token_metrics_coverage"]) - runtime_coverage = float(summary["runtime_metrics_coverage"]) - checks.append( - GateCheck( - "token_metrics_available_for_all_tasks", - token_coverage == 1.0, - f"token_metrics_coverage={token_coverage:.3f}", - ) - ) - checks.append( - GateCheck( - "runtime_metrics_available_for_all_tasks", - runtime_coverage == 1.0, - f"runtime_metrics_coverage={runtime_coverage:.3f}", - ) - ) - - if baseline_summary is not None: - checks.extend(_baseline_checks(summary, baseline_summary, thresholds)) - - return checks - - -def _baseline_checks( - summary: dict[str, Any], - baseline_summary: dict[str, Any], - thresholds: GateThresholds, -) -> list[GateCheck]: - checks: list[GateCheck] = [] - pass_rate = float(summary["pass_rate"]) - total_tokens_sum = summary["total_tokens_sum"] - runtime_sec_sum = summary["runtime_sec_sum"] - provenance = summary.get("provenance") or {} - - # Regression checks only make sense when both runs measured the same tasks. 
- baseline_tasks = baseline_summary.get("task_names") - candidate_tasks = summary.get("task_names") - task_sets_comparable = True - if isinstance(baseline_tasks, list) and isinstance(candidate_tasks, list): - comparable = sorted(baseline_tasks) == sorted(candidate_tasks) - task_sets_comparable = comparable - checks.append( - GateCheck( - "baseline_candidate_task_sets_match", - comparable, - ( - f"both runs measured {len(candidate_tasks)} tasks" - if comparable - else f"baseline={sorted(baseline_tasks)} candidate={sorted(candidate_tasks)}; " - "regression checks short-circuited" - ), - ) - ) - else: - checks.append( - GateCheck( - "baseline_candidate_task_sets_match", - True, - "task_names not present on baseline and/or candidate; skipping equality guard", - ) - ) - - checks.append(_cross_commit_check(provenance, baseline_summary, thresholds.allow_cross_commit)) - - if not task_sets_comparable: - return checks - - baseline_pass_rate = float(baseline_summary.get("pass_rate", 0.0)) - checks.append( - GateCheck( - "no_pass_rate_regression_vs_baseline", - pass_rate >= baseline_pass_rate - thresholds.max_pass_rate_drop, - f"pass_rate={pass_rate:.3f}, baseline={baseline_pass_rate:.3f}, max_drop={thresholds.max_pass_rate_drop:.3f}", - ) - ) - - baseline_tokens = baseline_summary.get("total_tokens_sum") - if isinstance(total_tokens_sum, int) and isinstance(baseline_tokens, int): - max_allowed = baseline_tokens * (1.0 + thresholds.max_token_regression_pct / 100.0) - checks.append( - GateCheck( - "tokens_not_worse_than_baseline", - total_tokens_sum <= max_allowed, - f"total_tokens_sum={total_tokens_sum}, baseline={baseline_tokens}, " - f"max_regression_pct={thresholds.max_token_regression_pct:.2f}", - ) - ) - else: - checks.append( - GateCheck( - "tokens_not_worse_than_baseline", - False, - "Missing token totals for candidate or baseline; cannot run deterministic token comparison.", - ) - ) - - # Runtime is only a tie-breaker when token totals match exactly. 
- baseline_runtime = baseline_summary.get("runtime_sec_sum") - tokens_tied = ( - isinstance(total_tokens_sum, int) and isinstance(baseline_tokens, int) and total_tokens_sum == baseline_tokens - ) - if not tokens_tied: - checks.append( - GateCheck( - "runtime_tie_breaker_not_worse_than_baseline", - True, - "Not applicable (token totals differ from baseline).", - ) - ) - elif isinstance(runtime_sec_sum, int | float) and isinstance(baseline_runtime, int | float): - max_allowed_runtime = float(baseline_runtime) * (1.0 + thresholds.max_runtime_regression_pct / 100.0) - checks.append( - GateCheck( - "runtime_tie_breaker_not_worse_than_baseline", - float(runtime_sec_sum) <= max_allowed_runtime, - f"runtime_sec_sum={float(runtime_sec_sum):.3f}, baseline={float(baseline_runtime):.3f}, " - f"max_regression_pct={thresholds.max_runtime_regression_pct:.2f}", - ) - ) - else: - checks.append( - GateCheck( - "runtime_tie_breaker_not_worse_than_baseline", - False, - "Token totals tied with baseline but runtime totals missing; cannot run tie-breaker.", - ) - ) - - return checks - - -def _commit_consistency_check(provenance: dict[str, Any]) -> GateCheck: - commit_observed = provenance.get("commit_sha_observed") - if isinstance(commit_observed, list) and len(commit_observed) > 1: - return GateCheck( - "commit_sha_consistent_within_run", - False, - f"Multiple commit_sha values observed across tasks: {commit_observed}. 
Re-run from a single commit.", - ) - commit_sha = provenance.get("commit_sha") - if commit_sha: - return GateCheck( - "commit_sha_consistent_within_run", - True, - f"commit={provenance.get('commit_short') or commit_sha[:12]}, branch={provenance.get('branch') or 'detached'}", - ) - return GateCheck( - "commit_sha_consistent_within_run", - True, - "provenance not recorded (legacy artifacts); skipping commit consistency check.", - ) - - -def _cross_commit_check( - provenance: dict[str, Any], - baseline_summary: dict[str, Any], - allow_cross_commit: bool, -) -> GateCheck: - baseline_commit = (baseline_summary.get("provenance") or {}).get("commit_sha") - candidate_commit = provenance.get("commit_sha") - if not (baseline_commit and candidate_commit): - return GateCheck( - "commit_sha_matches_baseline", - True, - "commit_sha not present on baseline and/or candidate; skipping cross-commit guard.", - ) - commits_match = baseline_commit == candidate_commit - if commits_match: - detail = f"both runs at commit={baseline_commit[:12]}" - elif allow_cross_commit: - detail = ( - f"baseline={baseline_commit[:12]} != candidate={candidate_commit[:12]}; " - "comparison allowed by allow_cross_commit (numbers may not be apples-to-apples)." - ) - else: - detail = ( - f"baseline={baseline_commit[:12]} != candidate={candidate_commit[:12]}. " - "Re-run candidate at the baseline commit, or set allow_cross_commit." - ) - return GateCheck("commit_sha_matches_baseline", commits_match or allow_cross_commit, detail) - - -def _rewards_by_task(results: list[AgentEvalTaskResult], reward_outputs: tuple[str, ...]) -> dict[str, float]: - rewards: dict[str, float] = {} - for task_result in results: - for output_name in reward_outputs: - value = _numeric_output(task_result, output_name) - if value is not None: - # Highest-priority output wins; don't overwrite with later metrics. 
- rewards.setdefault(task_result.task_id, value) - break - return rewards - - -def _numeric_output(task_result: AgentEvalTaskResult, name: str) -> float | None: - for output in task_result.outputs: - if output.name == name: - try: - return float(output.value) - except (TypeError, ValueError): - return None - return None - - -def _task_reward(task_id: str, reward_by_task: dict[str, float], metadata: dict[str, Any]) -> float: - if task_id in reward_by_task: - return reward_by_task[task_id] - reward = metadata.get("reward") - if reward is not None: - try: - return float(reward) - except (TypeError, ValueError): - return 0.0 - return 1.0 if metadata.get("passed") is True else 0.0 - - -def _task_runtime_sec(metadata: dict[str, Any]) -> float | None: - runtime_sec = metadata.get("runtime_sec") - if isinstance(runtime_sec, int | float): - return float(runtime_sec) - duration_ms = metadata.get("duration_ms") - if isinstance(duration_ms, int | float): - return float(duration_ms) / 1000.0 - return None - - -def _aggregate_provenance(provenances: list[dict[str, Any]]) -> dict[str, Any]: - observed: dict[str, set[Any]] = {field_name: set() for field_name in _PROVENANCE_FIELDS} - for prov in provenances: - for field_name in _PROVENANCE_FIELDS: - value = prov.get(field_name) - if value is not None: - observed[field_name].add(value) - - aggregated: dict[str, Any] = {"available": bool(provenances)} - for field_name in _PROVENANCE_FIELDS: - values = observed[field_name] - if len(values) == 1: - aggregated[field_name] = next(iter(values)) - else: - aggregated[field_name] = None - if len(values) > 1: - aggregated[f"{field_name}_observed"] = sorted(map(str, values)) - return aggregated - - -def _validate_baseline_summary(summary: dict[str, Any], source: Path) -> None: - missing = [key for key in ("pass_rate", "total_tokens_sum", "runtime_sec_sum") if key not in summary] - if missing: - raise ValueError( - f"Baseline summary {source} is missing required key(s): {', '.join(missing)}. 
" - "Expected a raw summary object or a gate.json with a `summary`." - ) - if not isinstance(summary.get("pass_rate"), int | float): - raise ValueError(f"Baseline summary {source} has invalid `pass_rate`; expected a number.") +__all__ = [ + "DEFAULT_REWARD_OUTPUTS", + "GateCheck", + "GateReport", + "GateThresholds", + "evaluate_gate", + "load_baseline_summary", + "run_gate_checks", + "summarize_run", + "write_gate_report", +] diff --git a/tests/agentic-use/runtimes/shared/result_adapter.py b/tests/agentic-use/runtimes/shared/result_adapter.py index e8162f9ded..bb0d3fe567 100644 --- a/tests/agentic-use/runtimes/shared/result_adapter.py +++ b/tests/agentic-use/runtimes/shared/result_adapter.py @@ -39,6 +39,17 @@ ) +class ResultDirAttemptSource: + """``AgentAttemptSource`` adapting ``nat_runner`` ``result.json`` dirs into attempts. + + Implements the SDK :class:`~nemo_evaluator_sdk.agent_eval.types.AgentAttemptSource` + protocol so the generic orchestrator's offline path can rescore captured runs. 
+ """ + + def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt: + return attempt_from_result_dir(source, task=task) + + def attempt_from_result_dir(output_dir: str | Path, *, task: AgentEvalTask | None = None) -> AgentEvalAttempt: """Load ``/result.json`` and build an attempt from it.""" output_dir = Path(output_dir) diff --git a/tests/agentic-use/runtimes/shared/verify.py b/tests/agentic-use/runtimes/shared/verify.py index 8be53924b9..f83da8edb3 100644 --- a/tests/agentic-use/runtimes/shared/verify.py +++ b/tests/agentic-use/runtimes/shared/verify.py @@ -13,9 +13,14 @@ from __future__ import annotations import textwrap -from dataclasses import dataclass from pathlib import Path -from typing import Any + +from nemo_evaluator_sdk.agent_eval.runtimes.verify import ( + VerifierOutcome, + apply_verify_to_metadata, + collect_verifier_outcome, + skipped_outcome, +) from runtimes.shared.constants import ( DOCKER_SOCKET_CONTAINER_PATH, @@ -28,17 +33,14 @@ from runtimes.shared.environment import AgentEnvironmentHandle, EnvRunSpec from runtimes.shared.layout import AgenticRunLayout - -@dataclass(frozen=True) -class VerifierOutcome: - """Result of the live verifier phase for one task.""" - - ran: bool - passed: bool - reward: int - exit_code: int - stdout: str - verifier_log_dir: Path | None +__all__ = [ + "VerifierOutcome", + "apply_verify_to_metadata", + "build_verify_run_spec", + "maybe_run_verify", + "run_verify", + "verifier_log_dir", +] def verifier_log_dir(layout: AgenticRunLayout) -> Path: @@ -141,28 +143,10 @@ async def run_verify( ) -> VerifierOutcome: """Execute the verifier through the environment handle and collect reward.""" result = await handle.run_verifier(spec) - log_dir = verifier_log_dir(layout) - passed = result.ok - - stdout = "" - stdout_path = log_dir / "test-stdout.txt" - if stdout_path.is_file(): - stdout = stdout_path.read_text(encoding="utf-8", errors="replace") - - reward_path = log_dir / "reward.txt" - if 
reward_path.is_file(): - reward = 1 if reward_path.read_text(encoding="utf-8").strip() == "1" else 0 - else: - reward = 1 if passed else 0 - reward_path.write_text("1\n" if passed else "0\n", encoding="utf-8") - - return VerifierOutcome( - ran=True, - passed=passed, - reward=reward, + return collect_verifier_outcome( + ok=result.ok, exit_code=result.exit_code, - stdout=stdout, - verifier_log_dir=log_dir, + log_dir=verifier_log_dir(layout), ) @@ -181,7 +165,7 @@ async def maybe_run_verify( ) -> VerifierOutcome: """Run the verifier through ``handle`` when enabled and a verifier exists.""" if not enabled: - return VerifierOutcome(ran=False, passed=False, reward=0, exit_code=0, stdout="", verifier_log_dir=None) + return skipped_outcome() spec = build_verify_run_spec( task_dir, layout, @@ -193,16 +177,5 @@ async def maybe_run_verify( extra_args=extra_args, ) if spec is None: - return VerifierOutcome(ran=False, passed=False, reward=0, exit_code=0, stdout="", verifier_log_dir=None) + return skipped_outcome() return await run_verify(handle, spec, layout) - - -def apply_verify_to_metadata(metadata: dict[str, Any], outcome: VerifierOutcome) -> None: - """Stamp verifier reward/status onto attempt metadata for scoring + gating.""" - if not outcome.ran: - metadata.setdefault("verify_status", "skipped") - return - metadata["verify_status"] = "ok" if outcome.passed else "failed" - metadata["passed"] = outcome.passed - metadata["reward"] = outcome.reward - metadata["verifier_log_dir"] = str(outcome.verifier_log_dir) if outcome.verifier_log_dir else None diff --git a/tests/agentic-use/tests/test_agentic_runtimes.py b/tests/agentic-use/tests/test_agentic_runtimes.py index 935ddf7389..5a3590071d 100644 --- a/tests/agentic-use/tests/test_agentic_runtimes.py +++ b/tests/agentic-use/tests/test_agentic_runtimes.py @@ -303,6 +303,8 @@ def _make_run_result(*, reward: float, total_tokens: int, runtime_sec: float, co }, ) task_result = AgentEvalTaskResult( + 
id="demo:workflow:agentic_use_verifier_reward", + run_id="run-1", task_id="demo", attempt_id="demo:workflow", metric_type="agentic_use_verifier_reward", From afb7dc87ea27f1d704307869c6306c8aaa4e49ae Mon Sep 17 00:00:00 2001 From: "Arpit Singh (SW-CLOUD)" Date: Tue, 9 Jun 2026 23:16:14 -0700 Subject: [PATCH 3/3] refactor(agentic-use): drop SDK re-export shims for direct imports Remove the compatibility shims under tests/agentic-use/runtimes/shared that re-exported promoted agent_eval SDK symbols, and import those generics directly from nemo_evaluator_sdk.agent_eval (docker, environment, environment_spec, gating, verify) at their use sites. Consolidate the remaining NeMo-Platform-only glue into a single module, shared/platform.py: the run layout with the platform state_dir, task_image_tag + platform DockerEnvironmentProvider, the namespaced AgentPhaseSuccessMetric + VerifierRewardMetric, agent-log/usage parsing and the shared container env, attempt construction (live + result.json/ResultDirAttemptSource), the live VERIFY phase, and the agentic-use task loader. shared/ now holds only platform.py, config.py, and constants.py. Update orchestrator/workflow/aut runtimes, the package __init__ re-exports, the runtime tests, and README/COMPLIANCE docs accordingly. 107 tests pass; ruff clean. 
Signed-off-by: Arpit Singh (SW-CLOUD) --- tests/agentic-use/runtimes/COMPLIANCE.md | 58 +- tests/agentic-use/runtimes/README.md | 55 +- tests/agentic-use/runtimes/__init__.py | 46 +- tests/agentic-use/runtimes/aut/runtime.py | 19 +- tests/agentic-use/runtimes/orchestrator.py | 16 +- .../agentic-use/runtimes/shared/agent_log.py | 40 - .../agentic-use/runtimes/shared/artifacts.py | 187 ----- .../runtimes/shared/container_env.py | 42 - tests/agentic-use/runtimes/shared/docker.py | 26 - .../runtimes/shared/environment.py | 50 -- .../runtimes/shared/environment_spec.py | 32 - tests/agentic-use/runtimes/shared/layout.py | 72 -- tests/agentic-use/runtimes/shared/metrics.py | 46 - tests/agentic-use/runtimes/shared/platform.py | 791 ++++++++++++++++++ .../agentic-use/runtimes/shared/reporting.py | 34 - .../runtimes/shared/result_adapter.py | 156 ---- .../runtimes/shared/task_loader.py | 80 -- tests/agentic-use/runtimes/shared/usage.py | 32 - tests/agentic-use/runtimes/shared/verify.py | 181 ---- .../agentic-use/runtimes/workflow/runtime.py | 17 +- .../tests/test_agentic_runtimes.py | 43 +- 21 files changed, 924 insertions(+), 1099 deletions(-) delete mode 100644 tests/agentic-use/runtimes/shared/agent_log.py delete mode 100644 tests/agentic-use/runtimes/shared/artifacts.py delete mode 100644 tests/agentic-use/runtimes/shared/container_env.py delete mode 100644 tests/agentic-use/runtimes/shared/docker.py delete mode 100644 tests/agentic-use/runtimes/shared/environment.py delete mode 100644 tests/agentic-use/runtimes/shared/environment_spec.py delete mode 100644 tests/agentic-use/runtimes/shared/layout.py delete mode 100644 tests/agentic-use/runtimes/shared/metrics.py create mode 100644 tests/agentic-use/runtimes/shared/platform.py delete mode 100644 tests/agentic-use/runtimes/shared/reporting.py delete mode 100644 tests/agentic-use/runtimes/shared/result_adapter.py delete mode 100644 tests/agentic-use/runtimes/shared/task_loader.py delete mode 100644 
tests/agentic-use/runtimes/shared/usage.py delete mode 100644 tests/agentic-use/runtimes/shared/verify.py diff --git a/tests/agentic-use/runtimes/COMPLIANCE.md b/tests/agentic-use/runtimes/COMPLIANCE.md index 3631361cad..526aa7b0e7 100644 --- a/tests/agentic-use/runtimes/COMPLIANCE.md +++ b/tests/agentic-use/runtimes/COMPLIANCE.md @@ -16,23 +16,25 @@ helpers (`attempts`), generic layout (`runtimes.layout`), reusable metrics (`common_metrics`: `AgentPhaseSuccessMetric` + a real metric-over-evidence `EvidencePresenceMetric`), the generic orchestrator (`orchestrator`), the `AgentAttemptSource` protocol, the verifier mechanic (`runtimes.verify`), and the -coding-agent driver seam (`runtimes.coding_agent`). The `shared/*` modules -referenced below are now **re-export shims** over those SDK homes (see -`README.md` for the shim→SDK table); only NeMo-Platform specifics -(`task_loader`, `result_adapter`, `config`, the pytest verifier command, the -`state` evidence key, `task_image_tag`) remain platform code. A CI grep gate -(`packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py`) keeps -`agent_eval/` free of NeMo-Platform imports. +coding-agent driver seam (`runtimes.coding_agent`). Those SDK homes are imported +**directly** by the runtime scripts — there are no re-export shims. The only +NeMo-Platform specifics that remain (the agentic task loader, `result.json` +import, attempt construction, the pytest verifier command, the `state` evidence +key, `task_image_tag` + platform `DockerEnvironmentProvider`, the +`VerifierRewardMetric`) are consolidated into a single module, +`shared/platform.py` (alongside `shared/config.py` and `shared/constants.py`). +A CI grep gate (`packages/nemo_evaluator_sdk/tests/agent_eval/test_import_hygiene.py`) +keeps `agent_eval/` free of NeMo-Platform imports. ## Scope split (per SDK design) | `nat_runner` responsibility | Belongs in `AgentAttemptRuntime`? 
| Current location | |----------------------------|-----------------------------------|------------------| | AGENT phase — run backend in Docker, capture logs/trajectory | **Yes** | `runtimes/<backend>/runtime.py` | -| BUILD — task image | **No** | `AgenticEvalOrchestrator` via `shared/environment_spec.py` (env spec / Dockerfile) + `shared/docker.py` | -| VERIFY — pytest `test_outputs.py`, `reward.txt` | **Through env boundary** | `shared/verify.py` via `AgentEnvironmentHandle.run_verifier` (runtimes call it after the agent when `shared.run_verify=True`) | +| BUILD — task image | **No** | `AgenticEvalOrchestrator` via `agent_eval.runtimes.environment_spec` (env spec / Dockerfile) + `agent_eval.runtimes.docker` | +| VERIFY — pytest `test_outputs.py`, `reward.txt` | **Through env boundary** | `shared/platform.py` via `AgentEnvironmentHandle.run_verifier` (runtimes call it after the agent when `shared.run_verify=True`) | | CLI — task globs, manifests, summaries | **No** | Still `nat_runner.main` (not migrated) | -| `result.json` contract | **No** (still produced by `nat_runner`) | Importable as an attempt via `shared/result_adapter.py`; scored offline via `AgenticEvalOrchestrator.score_captured_attempts` | +| `result.json` contract | **No** (still produced by `nat_runner`) | Importable as an attempt via `shared/platform.py`; scored offline via `AgenticEvalOrchestrator.score_captured_attempts` | ## Task metrics (authored on the task) @@ -63,14 +65,14 @@ metric scoring row.
| `_prepare_workflow_for_runtime` | `workflow/prep.py` | | `_build_aut_agent_cmd` | `aut/command.py` | | `_prepare_aut_config_for_runtime` | `aut/prep.py` | -| `_agent_log_has_workflow_error` | `shared/agent_log.py` | -| `run_verify_phase` | `shared/verify.py` (`build_verify_run_spec` + `run_verify` via `run_verifier`) | -| `_docker_run`, `build_task_image` | `shared/docker.py` (`docker_run`, `build_dockerfile`, `build_task_image`) | -| BUILD env resolution (`environment/Dockerfile`) | `shared/environment_spec.py` (`load_environment_spec`, `plan_task_build`) | -| `_write_result` (`result.json`) | `shared/result_adapter.py` (import side only; `nat_runner` still writes it) | -| pass-rate / token / runtime gate | `shared/reporting.py` (mirrors `passrate_token_policy_gate.py`) | -| `_extract_usage_metrics` | `shared/usage.py` (delegates to `nat_runner` until deduped) | -| `capture_agent_attempt` shape | `shared/artifacts.py` | +| `_agent_log_has_workflow_error` | `shared/platform.py` | +| `run_verify_phase` | `shared/platform.py` (`build_verify_run_spec` + `run_verify` via `run_verifier`) | +| `_docker_run`, `build_task_image` | `agent_eval.runtimes.docker` (`docker_run`, `build_dockerfile`, `build_task_image`) | +| BUILD env resolution (`environment/Dockerfile`) | `agent_eval.runtimes.environment_spec` (`load_environment_spec`, `plan_task_build`) | +| `_write_result` (`result.json`) | `shared/platform.py` (import side only; `nat_runner` still writes it) | +| pass-rate / token / runtime gate | `agent_eval.gating` (mirrors `passrate_token_policy_gate.py`) | +| `_extract_usage_metrics` | `shared/platform.py` (delegates to `nat_runner` until deduped) | +| `capture_agent_attempt` shape | `shared/platform.py` | | `run_agent_phase` | **Removed per backend** once all backends migrated | ## Attempt record contract @@ -83,14 +85,14 @@ includes canonical `CapturedAgentAttempt` fields: - Artifact paths: `agent_log_dir`, `workspace_dir`, `state_dir`, `atif_trajectory_path` - Phase 
outcome: `agent_ok` - Verifier outcome (when `run_verify=True`): `verify_status`, `passed`, `reward`, - `verifier_log_dir` (stamped by `shared/verify.py::apply_verify_to_metadata`) + `verifier_log_dir` (stamped by `apply_verify_to_metadata` from `agent_eval.runtimes.verify`) Use `to_captured_agent_attempt(task, attempt)` for verify/scoring code that expects the portable `CapturedAgentAttempt` type. ## `nat_runner` artifact → `AgentEvalAttempt` evidence map (per design doc) -`shared/artifacts.py::_evidence_descriptors` emits the documented keys: +`shared/platform.py::_evidence_descriptors` emits the documented keys: | `nat_runner` output | `AgentEvalAttempt` mapping | Status | |---------------------|----------------------------|--------| @@ -100,7 +102,7 @@ expects the portable `CapturedAgentAttempt` type. | `agent/trajectory.json` | `evidence["trace"]` (ATIF when normalized, else json) | Implemented | | `agent/` logs | `evidence["logs"]` (dir, `primary_log=nat_agent.log`) | Implemented | | `verifier/` logs | `evidence["verifier_logs"]` (added once verify phase runs) | Implemented (conditional) | -| `result.json` | attempt status + measurements + provenance + token/cost | Implemented — `shared/result_adapter.py::attempt_from_result` / `attempt_from_result_dir` | +| `result.json` | attempt status + measurements + provenance + token/cost | Implemented — `shared/platform.py::attempt_from_result` / `attempt_from_result_dir` | | final agent log/message | `AgentOutput.text` | Implemented | `result.json` mapping detail (`attempt_from_result`): @@ -110,7 +112,7 @@ expects the portable `CapturedAgentAttempt` type. attempt-production failures because the SDK's `AgentEvaluator` excludes `status=="failed"` from scoring (it raises); an agent that ran but failed must stay scorable so pass-rate gating counts it as a `0`. The live builder - (`shared/artifacts.py`) and this importer share the same helper. + (`shared/platform.py`) and this importer share the same helper. 
- `result["reward"]`/`result["passed"]` → `metadata` measurements (verifier reward stays a *measurement*, scored by `VerifierRewardMetric`, not the attempt status). - `result["metrics"]` (token/cost) → flattened into `metadata`. @@ -120,10 +122,10 @@ expects the portable `CapturedAgentAttempt` type. | Doc section | Status in this package | |-------------|------------------------| -| **B1** wrap `nat_runner` as attempt runtime(s) | In progress — AGENT phase extracted to per-backend runtimes (`workflow`, `aut` done; 3 CLI backends scaffolded); live VERIFY wired through the B2 boundary; `result.json` import path added via `shared/result_adapter.py`, exposed as the first-class **stored-attempt scoring** path via `AgenticEvalOrchestrator.score_captured_attempts` (and `run_agent_eval.py --rescore-dir`) — no Docker/agent execution. Remaining: 3 CLI backends + converging `nat_runner.main` onto the orchestrator. Note: doc proposes one `NatRunnerAttemptRuntime`; we deliberately split per backend per user direction. | -| **B2** `EnvironmentProvider` boundary | **Implemented** — `shared/environment.py` defines `AgentEnvironmentProvider`/`AgentEnvironmentHandle` below `AgentAttemptRuntime`; `DockerEnvironmentProvider` wraps `shared/docker.py`. `workflow` + `aut` runtimes execute through the boundary (provider is injectable). NeMo Gym/local providers can now be added without touching runtimes. | -| **B3** standardize environment authoring | **Implemented (minimal)** — `shared/environment_spec.py` adds a declarative `environment.yaml` (`image` + `profile` + python `dependencies` + `setup`) with a `dockerfile:` escape hatch and backward-compatible auto-detection of `environment/Dockerfile`. `plan_task_build` resolves a spec to a `BuildPlan` (image-based specs generate a tiny derived Dockerfile); the orchestrator BUILD step uses it. `setup` steps are carried as plan/label metadata, not executed (runtime concern). 
| -| **B4** productize results + CI | **Implemented** — SDK `persist_run` writes `tasks/attempts/results.jsonl`, `summary.json`, `report.html`; `shared/reporting.py` adds candidate-vs-baseline gating (pass-rate, token/cost, runtime tie-breaker) + deterministic provenance checks, persisted as `gate.json` by the orchestrator. `result.json` → attempt adapter + `VerifierRewardMetric` compatibility metric also done. | +| **B1** wrap `nat_runner` as attempt runtime(s) | In progress — AGENT phase extracted to per-backend runtimes (`workflow`, `aut` done; 3 CLI backends scaffolded); live VERIFY wired through the B2 boundary; `result.json` import path added via `shared/platform.py`, exposed as the first-class **stored-attempt scoring** path via `AgenticEvalOrchestrator.score_captured_attempts` (and `run_agent_eval.py --rescore-dir`) — no Docker/agent execution. Remaining: 3 CLI backends + converging `nat_runner.main` onto the orchestrator. Note: doc proposes one `NatRunnerAttemptRuntime`; we deliberately split per backend per user direction. | +| **B2** `EnvironmentProvider` boundary | **Implemented** — `agent_eval.runtimes.environment` defines `AgentEnvironmentProvider`/`AgentEnvironmentHandle` below `AgentAttemptRuntime`; the platform `DockerEnvironmentProvider` (`shared/platform.py`) wraps `agent_eval.runtimes.docker` with the `nmp-nat-` image tag. `workflow` + `aut` runtimes execute through the boundary (provider is injectable). NeMo Gym/local providers can now be added without touching runtimes. | +| **B3** standardize environment authoring | **Implemented (minimal)** — `agent_eval.runtimes.environment_spec` adds a declarative `environment.yaml` (`image` + `profile` + python `dependencies` + `setup`) with a `dockerfile:` escape hatch and backward-compatible auto-detection of `environment/Dockerfile`. `plan_task_build` resolves a spec to a `BuildPlan` (image-based specs generate a tiny derived Dockerfile); the orchestrator BUILD step uses it. 
`setup` steps are carried as plan/label metadata, not executed (runtime concern). | +| **B4** productize results + CI | **Implemented** — SDK `persist_run` writes `tasks/attempts/results.jsonl`, `summary.json`, `report.html`; `agent_eval.gating` adds candidate-vs-baseline gating (pass-rate, token/cost, runtime tie-breaker) + deterministic provenance checks, persisted as `gate.json` by the orchestrator. `result.json` → attempt adapter + `VerifierRewardMetric` compatibility metric also done. | ### B4 reporting / gating detail @@ -131,7 +133,7 @@ expects the portable `CapturedAgentAttempt` type. calls `agent_eval.persistence.persist_run`, writing `tasks.jsonl`, `attempts.jsonl`, `results.jsonl`, `summary.json`, `benchmark.json`, `run.json`, and (when `write_dashboard=True`) `report.html`. -- **Gating** (`shared/reporting.py`): `summarize_run` aggregates pass-rate, +- **Gating** (`agent_eval.gating`): `summarize_run` aggregates pass-rate, token totals/coverage, runtime totals, and run-level provenance from the typed `AgentEvalRunResult` (metric scores first, attempt metadata as fallback). `evaluate_gate` applies absolute thresholds and candidate-vs-baseline checks: @@ -149,7 +151,7 @@ expects the portable `CapturedAgentAttempt` type. The doc sketches `AgentEnvironmentHandle.run_agent(instruction, config) -> AgentEvalAttempt`. We instead use `run_agent(EnvRunSpec) -> EnvCommandResult` (and the symmetric `run_verifier`). Rationale: per-backend command/env/mount construction lives in the -runtime, and attempt construction lives in `shared/artifacts.py`. Keeping the +runtime, and attempt construction lives in `shared/platform.py`. Keeping the environment layer at "execute a command, return exit status" means a new provider (local, Harbor, NeMo Gym) only implements process execution — it never needs to know about backends or attempt schemas. 
diff --git a/tests/agentic-use/runtimes/README.md b/tests/agentic-use/runtimes/README.md index d5ecff2c38..5b149c10ec 100644 --- a/tests/agentic-use/runtimes/README.md +++ b/tests/agentic-use/runtimes/README.md @@ -9,19 +9,26 @@ the pytest verifier, the platform Docker build/image-tag) plus a thin factory. ## Architecture: adapter over SDK -The `shared/*` modules below are **pure re-export shims** over their SDK homes — -they exist only so existing imports keep working; the logic lives in the SDK: - -| `shared/` shim | SDK home | -|----------------|----------| -| `docker.py` | `agent_eval.runtimes.docker` | -| `environment.py` | `agent_eval.runtimes.environment` (re-supplies the platform image-tag) | -| `environment_spec.py` | `agent_eval.runtimes.environment_spec` | -| `reporting.py` | `agent_eval.gating` | -| `verify.py` | wraps `agent_eval.runtimes.verify` (pytest command/env/mounts stay here) | -| `metrics.py` | `AgentPhaseSuccessMetric` from `agent_eval.common_metrics` (namespaced); `VerifierRewardMetric` is platform | -| `artifacts.py` | `resolve_attempt_status` + evidence keys from `agent_eval.attempts`; adds the platform `state` key | -| `layout.py` | delegates to `agent_eval.runtimes.layout`; adds the platform `state_dir` + `task_image_tag` | +The backend-agnostic logic lives in `nemo_evaluator_sdk.agent_eval` and is +imported **directly** by the runtime scripts (no re-export shims). 
Everything +generic comes from these SDK homes: + +| What | SDK home | +|------|----------| +| Docker CLI helpers | `agent_eval.runtimes.docker` | +| Environment boundary (`AgentEnvironmentProvider`/`Handle`, `EnvRunSpec`) | `agent_eval.runtimes.environment` | +| Environment authoring (`load_environment_spec`, `plan_task_build`, …) | `agent_eval.runtimes.environment_spec` | +| Gating (`GateThresholds`, `evaluate_gate`, `summarize_run`, …) | `agent_eval.gating` | +| Verify mechanic (`apply_verify_to_metadata`, `collect_verifier_outcome`) | `agent_eval.runtimes.verify` | +| `AgentPhaseSuccessMetric`, attempt-status + evidence helpers | `agent_eval.common_metrics`, `agent_eval.attempts` | +| Generic orchestrator + run layout | `agent_eval.orchestrator`, `agent_eval.runtimes.layout` | + +All NeMo-Platform-specific glue is consolidated into a single module, +`shared/platform.py`: the run layout with the platform `state_dir`, the +`nmp-nat-` image tag + `DockerEnvironmentProvider` default, the namespaced +`AgentPhaseSuccessMetric` + the `VerifierRewardMetric`, agent-log/usage parsing +and the shared container env, attempt construction (live + `result.json`), the +live VERIFY phase, and the agentic-use task loader. 
The orchestrator (`orchestrator.py`) is a thin factory over `agent_eval.orchestrator.AgentEvalOrchestrator`: it injects the platform image @@ -32,9 +39,10 @@ build (`prepare_task`), the `run_verify`-derived `VerifierRewardMetric` ```text runtimes/ - shared/ # thin re-export shims over agent_eval.* (see table above) - # + platform-only: task_loader.py, result_adapter.py, - # config.py, container_env.py, constants.py + shared/ # platform glue only: + # platform.py — all NeMo-Platform helpers (one file) + # config.py — runtime config dataclasses + # constants.py — paths / container constants workflow/ # NatWorkflowAttemptRuntime (implemented, NeMo construct) aut/ # AutAgentAttemptRuntime (implemented, NeMo construct) claude_code/ # scaffold (stub) — see "Coding-agent runtimes" below @@ -106,7 +114,7 @@ Design-doc implementation path (see [COMPLIANCE.md](./COMPLIANCE.md) for detail) ## B1 — `result.json` import + stored-attempt scoring -`shared/result_adapter.py` imports an existing `nat_runner` run as an attempt: +`shared/platform.py` imports an existing `nat_runner` run as an attempt: - `attempt_from_result_dir(output_dir)` reads `<output_dir>/result.json`. - `attempt_from_result(result_dict, output_dir=...)` projects a parsed record. @@ -131,16 +139,17 @@ when `run_verify=True`. `inputs` holds only agent-facing `instruction`; ## B2 — Environment boundary -Runtimes execute the agent through `shared/environment.py` +Runtimes execute the agent through the SDK environment boundary (`AgentEnvironmentProvider` → `AgentEnvironmentHandle`) rather than calling -Docker directly. `DockerEnvironmentProvider` is the default; inject another +Docker directly. The platform `DockerEnvironmentProvider` (`shared/platform.py`, +defaulting to the `nmp-nat-<task_id>` image tag) is the default; inject another provider (local, Harbor, NeMo Gym) via the runtime's `environment=` argument without changing backend code.
## B3 — Environment authoring Tasks can declare a reusable environment instead of hand-writing a Dockerfile. -`shared/environment_spec.py` loads `environment.yaml` from the task dir: +`agent_eval.runtimes.environment_spec` loads `environment.yaml` from the task dir: ```yaml environment: @@ -171,10 +180,10 @@ as metadata, not executed here (they are runtime concerns). The SDK persists the run bundle (`tasks.jsonl`, `attempts.jsonl`, `results.jsonl`, `summary.json`, `report.html`) when `output_dir` is set. -`shared/reporting.py` adds the gate on top: +`agent_eval.gating` adds the gate on top: ```python -from runtimes.shared.reporting import GateThresholds, evaluate_gate, load_baseline_summary, write_gate_report +from nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate, load_baseline_summary, write_gate_report report = evaluate_gate( run_result, @@ -190,7 +199,7 @@ The orchestrator emits `gate.json` automatically (`AgenticOrchestratorConfig.wri ## Live VERIFY phase (through the B2 boundary) -`shared/verify.py` runs the task-local `tests/test_outputs.py` pytest verifier +`shared/platform.py` runs the task-local `tests/test_outputs.py` pytest verifier through `AgentEnvironmentHandle.run_verifier`, in the same prepared environment and against the same persisted workspace/state as the agent phase. 
Enable it via `AgenticSharedConfig(run_verify=True)`; the runtime stamps `reward`/`passed`/ diff --git a/tests/agentic-use/runtimes/__init__.py b/tests/agentic-use/runtimes/__init__.py index 1b7e12491f..df392483cf 100644 --- a/tests/agentic-use/runtimes/__init__.py +++ b/tests/agentic-use/runtimes/__init__.py @@ -3,20 +3,23 @@ """Backend-specific AgentAttemptRuntime implementations for agentic-use evals.""" -from runtimes.aut.runtime import AutAgentAttemptRuntime -from runtimes.claude_code.runtime import ClaudeCodeAgentAttemptRuntime -from runtimes.codex.runtime import CodexAgentAttemptRuntime -from runtimes.cursor_agent.runtime import CursorAgentAttemptRuntime -from runtimes.orchestrator import AgenticEvalOrchestrator, AgenticOrchestratorConfig, runtime_for_backend -from runtimes.shared.environment import ( +from nemo_evaluator_sdk.agent_eval.gating import ( + GateCheck, + GateReport, + GateThresholds, + evaluate_gate, + load_baseline_summary, + summarize_run, + write_gate_report, +) +from nemo_evaluator_sdk.agent_eval.runtimes.environment import ( AgentEnvironmentHandle, AgentEnvironmentProvider, DockerEnvironmentHandle, - DockerEnvironmentProvider, EnvCommandResult, EnvRunSpec, ) -from runtimes.shared.environment_spec import ( +from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import ( BuildPlan, EnvironmentSpec, execute_build_plan, @@ -24,20 +27,19 @@ plan_task_build, render_derived_dockerfile, ) -from runtimes.shared.metrics import AgentPhaseSuccessMetric, VerifierRewardMetric -from runtimes.shared.reporting import ( - GateCheck, - GateReport, - GateThresholds, - evaluate_gate, - load_baseline_summary, - summarize_run, - write_gate_report, -) -from runtimes.shared.result_adapter import attempt_from_result, attempt_from_result_dir -from runtimes.shared.verify import ( - VerifierOutcome, - apply_verify_to_metadata, +from nemo_evaluator_sdk.agent_eval.runtimes.verify import VerifierOutcome, apply_verify_to_metadata + +from runtimes.aut.runtime import 
AutAgentAttemptRuntime +from runtimes.claude_code.runtime import ClaudeCodeAgentAttemptRuntime +from runtimes.codex.runtime import CodexAgentAttemptRuntime +from runtimes.cursor_agent.runtime import CursorAgentAttemptRuntime +from runtimes.orchestrator import AgenticEvalOrchestrator, AgenticOrchestratorConfig, runtime_for_backend +from runtimes.shared.platform import ( + AgentPhaseSuccessMetric, + DockerEnvironmentProvider, + VerifierRewardMetric, + attempt_from_result, + attempt_from_result_dir, build_verify_run_spec, maybe_run_verify, run_verify, diff --git a/tests/agentic-use/runtimes/aut/runtime.py b/tests/agentic-use/runtimes/aut/runtime.py index 64bc8e46bc..4185abe826 100644 --- a/tests/agentic-use/runtimes/aut/runtime.py +++ b/tests/agentic-use/runtimes/aut/runtime.py @@ -8,12 +8,12 @@ from collections.abc import Sequence from pathlib import Path +from nemo_evaluator_sdk.agent_eval.runtimes.environment import AgentEnvironmentProvider, EnvRunSpec +from nemo_evaluator_sdk.agent_eval.runtimes.verify import apply_verify_to_metadata from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalRunConfig, AgentEvalTask from runtimes.aut.command import build_aut_agent_cmd from runtimes.aut.prep import prepare_aut_config_for_runtime -from runtimes.shared.agent_log import agent_log_has_workflow_error -from runtimes.shared.artifacts import build_agent_eval_attempt from runtimes.shared.config import AutRuntimeConfig from runtimes.shared.constants import ( DOCKER_SOCKET_CONTAINER_PATH, @@ -21,15 +21,16 @@ INSTRUCTION_CONTAINER_PATH, REPO_ROOT, ) -from runtimes.shared.container_env import base_container_env -from runtimes.shared.environment import ( - AgentEnvironmentProvider, +from runtimes.shared.platform import ( + AgenticRunLayout, DockerEnvironmentProvider, - EnvRunSpec, + agent_log_has_workflow_error, + base_container_env, + build_agent_eval_attempt, + maybe_run_verify, + resolve_run_layout, + task_agent_timeout_sec, ) -from runtimes.shared.layout 
import AgenticRunLayout, resolve_run_layout -from runtimes.shared.task_loader import task_agent_timeout_sec -from runtimes.shared.verify import apply_verify_to_metadata, maybe_run_verify RUNTIME_NAME = "aut" AUT_CONFIG_CONTAINER_PATH = "/tmp/aut_agent.yml" diff --git a/tests/agentic-use/runtimes/orchestrator.py b/tests/agentic-use/runtimes/orchestrator.py index 8a355fbdfc..74531b1a41 100644 --- a/tests/agentic-use/runtimes/orchestrator.py +++ b/tests/agentic-use/runtimes/orchestrator.py @@ -17,7 +17,10 @@ from pathlib import Path from typing import Any +from nemo_evaluator_sdk.agent_eval.gating import GateThresholds from nemo_evaluator_sdk.agent_eval.orchestrator import AgentEvalOrchestrator, OrchestratorConfig +from nemo_evaluator_sdk.agent_eval.runtimes.docker import docker_image_exists +from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import execute_build_plan, plan_task_build from nemo_evaluator_sdk.agent_eval.types import ( AgentAttemptRuntime, AgentEvalRunResult, @@ -25,13 +28,12 @@ ) from nemo_evaluator_sdk.metrics.protocol import Metric -from runtimes.shared.docker import docker_image_exists -from runtimes.shared.environment_spec import execute_build_plan, plan_task_build -from runtimes.shared.layout import task_image_tag -from runtimes.shared.metrics import VerifierRewardMetric -from runtimes.shared.reporting import GateThresholds -from runtimes.shared.result_adapter import ResultDirAttemptSource -from runtimes.shared.task_loader import agentic_task_from_dir +from runtimes.shared.platform import ( + ResultDirAttemptSource, + VerifierRewardMetric, + agentic_task_from_dir, + task_image_tag, +) @dataclass(frozen=True) diff --git a/tests/agentic-use/runtimes/shared/agent_log.py b/tests/agentic-use/runtimes/shared/agent_log.py deleted file mode 100644 index 6fc7de0270..0000000000 --- a/tests/agentic-use/runtimes/shared/agent_log.py +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Agent log parsing helpers shared by backend runtimes.""" - -from __future__ import annotations - -import json -from typing import Any - - -def iter_agent_log_json_payloads(agent_log: str) -> list[dict[str, Any]]: - """Return JSON dict payloads embedded in an agent log, newest-first after the full log.""" - candidates = [agent_log.strip()] - lines = [line.strip() for line in agent_log.splitlines() if line.strip()] - if lines: - candidates.append(lines[-1]) - candidates.extend(reversed(lines)) - - payloads: list[dict[str, Any]] = [] - seen: set[str] = set() - for candidate in candidates: - if not candidate or candidate in seen: - continue - seen.add(candidate) - try: - parsed = json.loads(candidate) - except json.JSONDecodeError: - continue - if isinstance(parsed, dict): - payloads.append(parsed) - return payloads - - -def agent_log_has_workflow_error(agent_log: str) -> bool: - """Detect AUT workflow errors returned as successful HTTP JSON payloads.""" - for payload in iter_agent_log_json_payloads(agent_log): - if payload.get("code") == "workflow_error": - return True - return False diff --git a/tests/agentic-use/runtimes/shared/artifacts.py b/tests/agentic-use/runtimes/shared/artifacts.py deleted file mode 100644 index 912c12a7a0..0000000000 --- a/tests/agentic-use/runtimes/shared/artifacts.py +++ /dev/null @@ -1,187 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -"""Convert captured agent artifacts into AgentEvalAttempt values.""" - -from __future__ import annotations - -from pathlib import Path - -from evaluator_agent_eval.artifacts import AgentArtifacts -from evaluator_agent_eval.schemas import ( - AgentAttemptInput, - AgentAttemptMetadata, - AgentAttemptOutput, - AgentAttemptTrace, - CapturedAgentAttempt, -) -from nemo_evaluator_sdk.agent_eval.attempts import resolve_attempt_status, standard_evidence_descriptors -from nemo_evaluator_sdk.agent_eval.types import ( - AgentEvalAttempt, - AgentEvalTask, - AgentOutput, -) -from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor - -from runtimes.shared.config import AgenticRuntimeName -from runtimes.shared.layout import AgenticRunLayout -from runtimes.shared.usage import extract_usage_metrics - -__all__ = [ - "build_agent_eval_attempt", - "resolve_attempt_status", - "to_captured_agent_attempt", -] - - -def build_agent_eval_attempt( - *, - task: AgentEvalTask, - layout: AgenticRunLayout, - runtime_name: AgenticRuntimeName, - agent_model: str, - exit_code: int, - agent_ok: bool, - run_id: str | None = None, - repo_revision: str | None = None, - duration_ms: int | None = None, -) -> AgentEvalAttempt: - """Build an SDK attempt from on-disk agent artifacts. - - Metadata uses the same canonical keys as :class:`CapturedAgentAttempt` - (``agent_runtime``, ``agent_model``, ``exit_code``, …) so verify/scoring - helpers can consume attempts without a second adapter. 
- """ - artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir) - log_text = _read_agent_log(layout.agent_log_dir) - usage = extract_usage_metrics(log_text) - duration = duration_ms if duration_ms is not None else usage.get("duration_ms") - - output_text = artifacts.final_answer.text if artifacts.final_answer.extracted else None - raw_log_paths = _raw_log_paths(artifacts.agent_log_dir) - initial_state = task.inputs.get("filesystem") - descriptors = _evidence_descriptors( - layout, artifacts, initial_state_ref=str(initial_state) if initial_state else None - ) - - metadata: dict[str, object] = { - # Canonical CapturedAgentAttempt fields - "agent_runtime": runtime_name, - "agent_model": agent_model, - "agent_runtime_version": None, - "repo_revision": repo_revision, - "run_id": run_id, - "exit_code": exit_code, - "duration_ms": duration, - # SDK / orchestration extensions - "model_id": agent_model, - "target_name": agent_model, - "attempt_id": f"{task.id}:{runtime_name}", - "agent_ok": agent_ok, - "agent_log_dir": str(layout.agent_log_dir), - "workspace_dir": str(layout.workspace_dir), - "state_dir": str(layout.state_dir), - "run_dir": str(layout.run_dir), - "instruction_path": task.metadata.get("instruction_path"), - "final_answer_extracted": artifacts.final_answer.extracted, - "final_answer_source": artifacts.final_answer.source, - "raw_log_paths": raw_log_paths, - "atif_trajectory_path": str(artifacts.atif_trajectory_path) if artifacts.atif_trajectory_path else None, - **usage, - } - - status = resolve_attempt_status(agent_ok) - if output_text: - output = AgentOutput(text=output_text) - elif agent_ok: - output = AgentOutput(text=log_text.strip() or "") - else: - output = AgentOutput(text=log_text.strip() or "(agent phase failed)") - - return AgentEvalAttempt( - id=f"{task.id}:{runtime_name}", - task_id=task.id, - status=status, - output=output, - evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None, - 
metadata=metadata, - ) - - -def to_captured_agent_attempt(task: AgentEvalTask, attempt: AgentEvalAttempt) -> CapturedAgentAttempt: - """Project an SDK attempt onto the portable CapturedAgentAttempt schema.""" - metadata = attempt.metadata - trace_path = metadata.get("atif_trajectory_path") - return CapturedAgentAttempt( - task_id=attempt.task_id, - input=AgentAttemptInput( - instruction_text=task.intent, - instruction_path=str(metadata.get("instruction_path")) if metadata.get("instruction_path") else None, - ), - output=AgentAttemptOutput( - final_text=attempt.output.text if attempt.output is not None else "", - final_answer_extracted=bool(metadata.get("final_answer_extracted")), - final_answer_source=str(metadata.get("final_answer_source")) - if metadata.get("final_answer_source") is not None - else None, - raw_log_paths=list(metadata.get("raw_log_paths") or []), - ), - metadata=AgentAttemptMetadata( - agent_runtime=str(metadata.get("agent_runtime", "unknown")), - agent_model=str(metadata.get("agent_model", "unknown")), - agent_runtime_version=str(metadata["agent_runtime_version"]) - if metadata.get("agent_runtime_version") is not None - else None, - repo_revision=str(metadata["repo_revision"]) if metadata.get("repo_revision") is not None else None, - run_id=str(metadata["run_id"]) if metadata.get("run_id") is not None else None, - exit_code=int(metadata["exit_code"]) if isinstance(metadata.get("exit_code"), int) else None, - duration_ms=int(metadata["duration_ms"]) if isinstance(metadata.get("duration_ms"), int | float) else None, - ), - trace=AgentAttemptTrace(atif_path=str(trace_path)) if trace_path else None, - ) - - -def _evidence_descriptors( - layout: AgenticRunLayout, - artifacts: AgentArtifacts, - *, - initial_state_ref: str | None = None, -) -> dict[str, EvidenceDescriptor]: - """Compose the SDK's standard evidence keys + the platform ``state`` extension. 
- - The doc-standard keys (``initial_state``/``trace``/``logs``/``final_state``/ - ``verifier_logs``) come from :func:`standard_evidence_descriptors`. ``state`` - is a NeMo-Platform-specific *extension* (not a doc key): it carries the - preserved platform/database state across the agent + verifier phases. - """ - descriptors = standard_evidence_descriptors( - logs_dir=layout.agent_log_dir, - final_state_dir=layout.workspace_dir, - trace_path=artifacts.atif_trajectory_path, - initial_state_ref=initial_state_ref, - verifier_logs_dir=layout.run_dir / "verifier", - primary_log="nat_agent.log", - ) - - # Platform extension (non-doc key): preserved platform/db state across phases. - descriptors["state"] = EvidenceDescriptor( - kind="filesystem", - format="dir", - ref=str(layout.state_dir), - metadata={"role": "platform_state", "extension": "nemo-platform"}, - ) - - return descriptors - - -def _raw_log_paths(agent_log_dir: Path) -> list[str]: - if not agent_log_dir.is_dir(): - return [] - return [str(path.relative_to(agent_log_dir)) for path in sorted(agent_log_dir.iterdir()) if path.is_file()] - - -def _read_agent_log(agent_log_dir: Path) -> str: - log_path = agent_log_dir / "nat_agent.log" - if log_path.is_file(): - return log_path.read_text(encoding="utf-8", errors="replace") - return "" diff --git a/tests/agentic-use/runtimes/shared/container_env.py b/tests/agentic-use/runtimes/shared/container_env.py deleted file mode 100644 index b59a100b54..0000000000 --- a/tests/agentic-use/runtimes/shared/container_env.py +++ /dev/null @@ -1,42 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -"""Shared container environment helpers.""" - -from __future__ import annotations - -import json -from typing import Any - -from runtimes.shared.config import AgenticSharedConfig -from runtimes.shared.constants import ( - DOCKER_SOCKET_CONTAINER_PATH, - DOCKER_SOCKET_HOST_PATH, - FILES_STORAGE_CONFIG, - PLATFORM_CONFIG_PATH, -) - - -def base_container_env(shared: AgenticSharedConfig, *, timeout_sec: int) -> dict[str, str]: - """Environment variables shared by all agentic-use container runs.""" - env: dict[str, str] = { - "NMP_BASE_URL": shared.nmp_base_url, - "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace", - "DATABASE_DIALECT": "sqlite", - "DATABASE_PATH": "/data/nmp-platform.db", - "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG, - "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH, - "NEMO_AGENTS_GATEWAY_READ_TIMEOUT": str(timeout_sec), - "NEMO_AGENTS_INVOKE_TIMEOUT": str(timeout_sec), - "AUT_INVOKE_HTTP_TIMEOUT": str(timeout_sec), - } - if DOCKER_SOCKET_HOST_PATH.exists(): - env["DOCKER_HOST"] = f"unix://{DOCKER_SOCKET_CONTAINER_PATH}" - return env - - -def with_candidate_params(env: dict[str, str], agent_params: dict[str, Any]) -> dict[str, str]: - if agent_params: - env = dict(env) - env["NAT_CANDIDATE_PARAMS"] = json.dumps(agent_params, sort_keys=True) - return env diff --git a/tests/agentic-use/runtimes/shared/docker.py b/tests/agentic-use/runtimes/shared/docker.py deleted file mode 100644 index ce3cc6cc22..0000000000 --- a/tests/agentic-use/runtimes/shared/docker.py +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Compatibility shim — Docker helpers were promoted to the Evaluator SDK. - -Import from ``nemo_evaluator_sdk.agent_eval.runtimes.docker`` directly; this -module re-exports the same symbols so existing adapter imports keep working. 
-""" - -from __future__ import annotations - -from nemo_evaluator_sdk.agent_eval.runtimes.docker import ( - build_dockerfile, - build_task_image, - docker_image_exists, - docker_run, - redact_cmd_for_logging, -) - -__all__ = [ - "build_dockerfile", - "build_task_image", - "docker_image_exists", - "docker_run", - "redact_cmd_for_logging", -] diff --git a/tests/agentic-use/runtimes/shared/environment.py b/tests/agentic-use/runtimes/shared/environment.py deleted file mode 100644 index 08e55ce2ed..0000000000 --- a/tests/agentic-use/runtimes/shared/environment.py +++ /dev/null @@ -1,50 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Compatibility shim — the environment boundary was promoted to the Evaluator SDK. - -The generic boundary now lives in -``nemo_evaluator_sdk.agent_eval.runtimes.environment``. The only platform-specific -piece kept here is the default task→image mapping (``nmp-nat-:latest``): the -adapter's :class:`DockerEnvironmentProvider` injects :func:`task_image_tag` so -``DockerEnvironmentProvider()`` keeps producing platform-tagged images. 
-""" - -from __future__ import annotations - -from collections.abc import Callable - -from nemo_evaluator_sdk.agent_eval.runtimes.environment import ( - AbstractEnvironmentHandle, - AgentEnvironmentHandle, - AgentEnvironmentProvider, - DockerEnvironmentHandle, - EnvCommandResult, - EnvRole, - EnvRunSpec, - default_image_tag, -) -from nemo_evaluator_sdk.agent_eval.runtimes.environment import ( - DockerEnvironmentProvider as _SDKDockerEnvironmentProvider, -) - -from runtimes.shared.layout import task_image_tag - -__all__ = [ - "AbstractEnvironmentHandle", - "AgentEnvironmentHandle", - "AgentEnvironmentProvider", - "DockerEnvironmentHandle", - "DockerEnvironmentProvider", - "EnvCommandResult", - "EnvRole", - "EnvRunSpec", - "default_image_tag", -] - - -class DockerEnvironmentProvider(_SDKDockerEnvironmentProvider): - """Platform default: map ``task.id`` to ``nmp-nat-:latest``.""" - - def __init__(self, *, image_tag_fn: Callable[[str], str] = task_image_tag) -> None: - super().__init__(image_tag_fn=image_tag_fn) diff --git a/tests/agentic-use/runtimes/shared/environment_spec.py b/tests/agentic-use/runtimes/shared/environment_spec.py deleted file mode 100644 index 9cdd3db71f..0000000000 --- a/tests/agentic-use/runtimes/shared/environment_spec.py +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Compatibility shim — environment authoring was promoted to the Evaluator SDK. - -Import from ``nemo_evaluator_sdk.agent_eval.runtimes.environment_spec`` directly; -this module re-exports the same symbols so existing adapter imports keep working. 
-""" - -from __future__ import annotations - -from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import ( - DEFAULT_DOCKERFILE_RELPATH, - ENVIRONMENT_SPEC_FILENAME, - BuildPlan, - EnvironmentSpec, - execute_build_plan, - load_environment_spec, - plan_task_build, - render_derived_dockerfile, -) - -__all__ = [ - "DEFAULT_DOCKERFILE_RELPATH", - "ENVIRONMENT_SPEC_FILENAME", - "BuildPlan", - "EnvironmentSpec", - "execute_build_plan", - "load_environment_spec", - "plan_task_build", - "render_derived_dockerfile", -] diff --git a/tests/agentic-use/runtimes/shared/layout.py b/tests/agentic-use/runtimes/shared/layout.py deleted file mode 100644 index 86a4c5f4f2..0000000000 --- a/tests/agentic-use/runtimes/shared/layout.py +++ /dev/null @@ -1,72 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Output directory layout for agentic-use runtime runs.""" - -from __future__ import annotations - -from dataclasses import dataclass -from datetime import UTC, datetime -from pathlib import Path - -from nemo_evaluator_sdk.agent_eval.runtimes.layout import prepare_run_layout, resolve_run_dir -from nemo_evaluator_sdk.agent_eval.types import AgentEvalRunConfig, AgentEvalTask - -from runtimes.shared.config import AgenticSharedConfig - - -@dataclass(frozen=True) -class AgenticRunLayout: - """Filesystem layout for one task run. - - Extends the SDK's generic ``RunLayout`` shape with a platform-specific - ``state_dir`` (preserved platform/database state across agent + verifier). 
- """ - - run_dir: Path - agent_log_dir: Path - workspace_dir: Path - state_dir: Path - instruction_path: Path - - -def default_jobs_dir(shared: AgenticSharedConfig) -> Path: - if shared.jobs_dir is not None: - return shared.jobs_dir - return shared.repo_root / "nat-jobs" - - -def new_run_dir(jobs_dir: Path, task_id: str) -> Path: - timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") - run_dir = jobs_dir / f"{timestamp}-{task_id}" - run_dir.mkdir(parents=True, exist_ok=True) - return run_dir - - -def resolve_run_layout( - task: AgentEvalTask, - shared: AgenticSharedConfig, - config: AgentEvalRunConfig | None = None, -) -> AgenticRunLayout: - """Resolve or create the on-disk layout for one task attempt.""" - output_dir = config.output_dir if config is not None else None - run_dir = resolve_run_dir(output_dir, lambda: new_run_dir(default_jobs_dir(shared), task.id)) - - # Generic agent/workspace dirs + written instruction come from the SDK helper. - base = prepare_run_layout(run_dir, task.intent) - - # Platform extension: a preserved state dir for platform/db across phases. - state_dir = base.run_dir / "state" - state_dir.mkdir(parents=True, exist_ok=True) - - return AgenticRunLayout( - run_dir=base.run_dir, - agent_log_dir=base.agent_log_dir, - workspace_dir=base.workspace_dir, - state_dir=state_dir, - instruction_path=base.instruction_path, - ) - - -def task_image_tag(task_id: str) -> str: - return f"nmp-nat-{task_id}:latest" diff --git a/tests/agentic-use/runtimes/shared/metrics.py b/tests/agentic-use/runtimes/shared/metrics.py deleted file mode 100644 index e7b8496caf..0000000000 --- a/tests/agentic-use/runtimes/shared/metrics.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Default metrics for agentic-use agent-eval runs. 
- -``AgentPhaseSuccessMetric`` is promoted to the SDK; here it is namespaced under -the ``agentic_use_*`` metric type. ``VerifierRewardMetric`` stays a platform -compatibility shim (mirrors the legacy pytest verifier reward). -""" - -from __future__ import annotations - -from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric as _SDKAgentPhaseSuccessMetric -from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult - - -class AgentPhaseSuccessMetric(_SDKAgentPhaseSuccessMetric): - """Agentic-use namespaced agent-phase metric (output stays ``agent_phase_success``).""" - - metric_type = "agentic_use_agent_phase" - - -class VerifierRewardMetric: - """Compatibility metric mirroring the legacy pytest verifier reward. - - Reads the verifier outcome that ``nat_runner`` records in ``result.json`` - (projected onto attempt metadata as ``reward``/``passed``) so existing - ``tests/test_outputs.py`` verifiers can score through the Evaluator SDK - while task-specific metrics are authored. - """ - - @property - def type(self) -> str: - return "agentic_use_verifier_reward" - - def output_spec(self) -> list[MetricOutputSpec]: - return [MetricOutputSpec.continuous_score("verifier_reward")] - - async def compute_scores(self, input: MetricInput) -> MetricResult: - metadata = input.candidate.metadata - reward = metadata.get("reward") - if reward is None: - reward = 1.0 if metadata.get("passed") else 0.0 - return MetricResult( - outputs=[MetricOutput(name="verifier_reward", value=float(reward))], - ) diff --git a/tests/agentic-use/runtimes/shared/platform.py b/tests/agentic-use/runtimes/shared/platform.py new file mode 100644 index 0000000000..721d717e6f --- /dev/null +++ b/tests/agentic-use/runtimes/shared/platform.py @@ -0,0 +1,791 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""NeMo-Platform glue that sits on top of the generic agent-eval SDK. + +Everything generic (Docker helpers, the environment boundary, environment +authoring, gating, attempt-status/evidence helpers, the verifier mechanic) now +lives in ``nemo_evaluator_sdk.agent_eval`` and is imported directly where used. + +This single module holds only the pieces that are specific to the agentic-use +benchmark and therefore do not belong in the SDK: + +* run layout with the platform ``state_dir`` and the ``nmp-nat-`` image tag, +* a ``DockerEnvironmentProvider`` defaulting to that platform image tag, +* default metrics (``AgentPhaseSuccessMetric`` namespace + ``VerifierRewardMetric``), +* agent-log/usage parsing and the shared container env, +* attempt construction from live artifacts and from ``nat_runner`` ``result.json``, +* the live VERIFY phase wired through the SDK environment boundary, +* the agentic-use task loader. +""" + +from __future__ import annotations + +import json +import textwrap +import tomllib +from collections.abc import Callable +from dataclasses import dataclass +from datetime import UTC, datetime +from pathlib import Path +from typing import Any, TypedDict + +from evaluator_agent_eval.artifacts import AgentArtifacts +from evaluator_agent_eval.schemas import ( + AgentAttemptInput, + AgentAttemptMetadata, + AgentAttemptOutput, + AgentAttemptTrace, + CapturedAgentAttempt, +) +from nemo_evaluator_sdk.agent_eval.attempts import resolve_attempt_status, standard_evidence_descriptors +from nemo_evaluator_sdk.agent_eval.common_metrics import AgentPhaseSuccessMetric as _SDKAgentPhaseSuccessMetric +from nemo_evaluator_sdk.agent_eval.runtimes.environment import ( + AgentEnvironmentHandle, + EnvRunSpec, +) +from nemo_evaluator_sdk.agent_eval.runtimes.environment import ( + DockerEnvironmentProvider as _SDKDockerEnvironmentProvider, +) +from nemo_evaluator_sdk.agent_eval.runtimes.layout import prepare_run_layout, 
resolve_run_dir +from nemo_evaluator_sdk.agent_eval.runtimes.verify import ( + VerifierOutcome, + collect_verifier_outcome, + skipped_outcome, +) +from nemo_evaluator_sdk.agent_eval.types import ( + AgentEvalAttempt, + AgentEvalRunConfig, + AgentEvalTask, + AgentOutput, +) +from nemo_evaluator_sdk.metrics.protocol import ( + Metric, + MetricInput, + MetricOutput, + MetricOutputSpec, + MetricResult, +) +from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor + +from runtimes.shared.config import AgenticRuntimeName, AgenticSharedConfig +from runtimes.shared.constants import ( + AGENTIC_USE_DIR, + DOCKER_SOCKET_CONTAINER_PATH, + DOCKER_SOCKET_HOST_PATH, + EVALUATOR_SDK_SRC, + FILES_STORAGE_CONFIG, + PLATFORM_CONFIG_PATH, + SHARED_DIR, +) + +__all__ = [ + "AgenticRunLayout", + "AgentPhaseSuccessMetric", + "DockerEnvironmentProvider", + "ResultDirAttemptSource", + "VerifierRewardMetric", + "agent_log_has_workflow_error", + "agentic_task_from_dir", + "attempt_from_result", + "attempt_from_result_dir", + "base_container_env", + "build_agent_eval_attempt", + "build_verify_run_spec", + "extract_usage_metrics", + "iter_agent_log_json_payloads", + "load_task_toml", + "maybe_run_verify", + "resolve_run_layout", + "run_verify", + "task_agent_timeout_sec", + "task_image_tag", + "to_captured_agent_attempt", + "verifier_log_dir", + "with_candidate_params", +] + + +# --------------------------------------------------------------------------- # +# Run layout + image tagging +# --------------------------------------------------------------------------- # +@dataclass(frozen=True) +class AgenticRunLayout: + """Filesystem layout for one task run. + + Extends the SDK's generic ``RunLayout`` shape with a platform-specific + ``state_dir`` (preserved platform/database state across agent + verifier). 
+ """ + + run_dir: Path + agent_log_dir: Path + workspace_dir: Path + state_dir: Path + instruction_path: Path + + +def task_image_tag(task_id: str) -> str: + return f"nmp-nat-{task_id}:latest" + + +def default_jobs_dir(shared: AgenticSharedConfig) -> Path: + if shared.jobs_dir is not None: + return shared.jobs_dir + return shared.repo_root / "nat-jobs" + + +def new_run_dir(jobs_dir: Path, task_id: str) -> Path: + timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ") + run_dir = jobs_dir / f"{timestamp}-{task_id}" + run_dir.mkdir(parents=True, exist_ok=True) + return run_dir + + +def resolve_run_layout( + task: AgentEvalTask, + shared: AgenticSharedConfig, + config: AgentEvalRunConfig | None = None, +) -> AgenticRunLayout: + """Resolve or create the on-disk layout for one task attempt.""" + output_dir = config.output_dir if config is not None else None + run_dir = resolve_run_dir(output_dir, lambda: new_run_dir(default_jobs_dir(shared), task.id)) + + # Generic agent/workspace dirs + written instruction come from the SDK helper. + base = prepare_run_layout(run_dir, task.intent) + + # Platform extension: a preserved state dir for platform/db across phases. 
+ state_dir = base.run_dir / "state" + state_dir.mkdir(parents=True, exist_ok=True) + + return AgenticRunLayout( + run_dir=base.run_dir, + agent_log_dir=base.agent_log_dir, + workspace_dir=base.workspace_dir, + state_dir=state_dir, + instruction_path=base.instruction_path, + ) + + +class DockerEnvironmentProvider(_SDKDockerEnvironmentProvider): + """Platform default: map ``task.id`` to ``nmp-nat-:latest``.""" + + def __init__(self, *, image_tag_fn: Callable[[str], str] = task_image_tag) -> None: + super().__init__(image_tag_fn=image_tag_fn) + + +# --------------------------------------------------------------------------- # +# Default metrics +# --------------------------------------------------------------------------- # +class AgentPhaseSuccessMetric(_SDKAgentPhaseSuccessMetric): + """Agentic-use namespaced agent-phase metric (output stays ``agent_phase_success``).""" + + metric_type = "agentic_use_agent_phase" + + +class VerifierRewardMetric: + """Compatibility metric mirroring the legacy pytest verifier reward. + + Reads the verifier outcome that ``nat_runner`` records in ``result.json`` + (projected onto attempt metadata as ``reward``/``passed``) so existing + ``tests/test_outputs.py`` verifiers can score through the Evaluator SDK + while task-specific metrics are authored. 
+ """ + + @property + def type(self) -> str: + return "agentic_use_verifier_reward" + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("verifier_reward")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + metadata = input.candidate.metadata + reward = metadata.get("reward") + if reward is None: + reward = 1.0 if metadata.get("passed") else 0.0 + return MetricResult( + outputs=[MetricOutput(name="verifier_reward", value=float(reward))], + ) + + +# --------------------------------------------------------------------------- # +# Agent-log parsing + token usage +# --------------------------------------------------------------------------- # +class TokenMetrics(TypedDict): + prompt_tokens: int | None + completion_tokens: int | None + total_tokens: int | None + cache_creation_tokens: int | None + cache_read_tokens: int | None + n_assistant_messages: int | None + cost_usd: float | None + num_turns: int | None + duration_ms: float | None + + +def extract_usage_metrics(agent_log: str) -> dict[str, int | float | None]: + """Extract token usage metrics from an agent log.""" + import nat_runner + + metrics = nat_runner._extract_usage_metrics(agent_log) + return dict(metrics) + + +def iter_agent_log_json_payloads(agent_log: str) -> list[dict[str, Any]]: + """Return JSON dict payloads embedded in an agent log, newest-first after the full log.""" + candidates = [agent_log.strip()] + lines = [line.strip() for line in agent_log.splitlines() if line.strip()] + if lines: + candidates.append(lines[-1]) + candidates.extend(reversed(lines)) + + payloads: list[dict[str, Any]] = [] + seen: set[str] = set() + for candidate in candidates: + if not candidate or candidate in seen: + continue + seen.add(candidate) + try: + parsed = json.loads(candidate) + except json.JSONDecodeError: + continue + if isinstance(parsed, dict): + payloads.append(parsed) + return payloads + + +def agent_log_has_workflow_error(agent_log: str) -> 
bool: + """Detect AUT workflow errors returned as successful HTTP JSON payloads.""" + for payload in iter_agent_log_json_payloads(agent_log): + if payload.get("code") == "workflow_error": + return True + return False + + +# --------------------------------------------------------------------------- # +# Shared container environment +# --------------------------------------------------------------------------- # +def base_container_env(shared: AgenticSharedConfig, *, timeout_sec: int) -> dict[str, str]: + """Environment variables shared by all agentic-use container runs.""" + env: dict[str, str] = { + "NMP_BASE_URL": shared.nmp_base_url, + "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace", + "DATABASE_DIALECT": "sqlite", + "DATABASE_PATH": "/data/nmp-platform.db", + "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG, + "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH, + "NEMO_AGENTS_GATEWAY_READ_TIMEOUT": str(timeout_sec), + "NEMO_AGENTS_INVOKE_TIMEOUT": str(timeout_sec), + "AUT_INVOKE_HTTP_TIMEOUT": str(timeout_sec), + } + if DOCKER_SOCKET_HOST_PATH.exists(): + env["DOCKER_HOST"] = f"unix://{DOCKER_SOCKET_CONTAINER_PATH}" + return env + + +def with_candidate_params(env: dict[str, str], agent_params: dict[str, Any]) -> dict[str, str]: + if agent_params: + env = dict(env) + env["NAT_CANDIDATE_PARAMS"] = json.dumps(agent_params, sort_keys=True) + return env + + +# --------------------------------------------------------------------------- # +# Attempt construction from live artifacts +# --------------------------------------------------------------------------- # +def build_agent_eval_attempt( + *, + task: AgentEvalTask, + layout: AgenticRunLayout, + runtime_name: AgenticRuntimeName, + agent_model: str, + exit_code: int, + agent_ok: bool, + run_id: str | None = None, + repo_revision: str | None = None, + duration_ms: int | None = None, +) -> AgentEvalAttempt: + """Build an SDK attempt from on-disk agent artifacts. 
+ + Metadata uses the same canonical keys as :class:`CapturedAgentAttempt` + (``agent_runtime``, ``agent_model``, ``exit_code``, …) so verify/scoring + helpers can consume attempts without a second adapter. + """ + artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir) + log_text = _read_agent_log(layout.agent_log_dir) + usage = extract_usage_metrics(log_text) + duration = duration_ms if duration_ms is not None else usage.get("duration_ms") + + output_text = artifacts.final_answer.text if artifacts.final_answer.extracted else None + raw_log_paths = _raw_log_paths(artifacts.agent_log_dir) + initial_state = task.inputs.get("filesystem") + descriptors = _evidence_descriptors( + layout, artifacts, initial_state_ref=str(initial_state) if initial_state else None + ) + + metadata: dict[str, object] = { + # Canonical CapturedAgentAttempt fields + "agent_runtime": runtime_name, + "agent_model": agent_model, + "agent_runtime_version": None, + "repo_revision": repo_revision, + "run_id": run_id, + "exit_code": exit_code, + "duration_ms": duration, + # SDK / orchestration extensions + "model_id": agent_model, + "target_name": agent_model, + "attempt_id": f"{task.id}:{runtime_name}", + "agent_ok": agent_ok, + "agent_log_dir": str(layout.agent_log_dir), + "workspace_dir": str(layout.workspace_dir), + "state_dir": str(layout.state_dir), + "run_dir": str(layout.run_dir), + "instruction_path": task.metadata.get("instruction_path"), + "final_answer_extracted": artifacts.final_answer.extracted, + "final_answer_source": artifacts.final_answer.source, + "raw_log_paths": raw_log_paths, + "atif_trajectory_path": str(artifacts.atif_trajectory_path) if artifacts.atif_trajectory_path else None, + **usage, + } + + status = resolve_attempt_status(agent_ok) + if output_text: + output = AgentOutput(text=output_text) + elif agent_ok: + output = AgentOutput(text=log_text.strip() or "") + else: + output = AgentOutput(text=log_text.strip() or "(agent phase 
failed)") + + return AgentEvalAttempt( + id=f"{task.id}:{runtime_name}", + task_id=task.id, + status=status, + output=output, + evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None, + metadata=metadata, + ) + + +def to_captured_agent_attempt(task: AgentEvalTask, attempt: AgentEvalAttempt) -> CapturedAgentAttempt: + """Project an SDK attempt onto the portable CapturedAgentAttempt schema.""" + metadata = attempt.metadata + trace_path = metadata.get("atif_trajectory_path") + return CapturedAgentAttempt( + task_id=attempt.task_id, + input=AgentAttemptInput( + instruction_text=task.intent, + instruction_path=str(metadata.get("instruction_path")) if metadata.get("instruction_path") else None, + ), + output=AgentAttemptOutput( + final_text=attempt.output.text if attempt.output is not None else "", + final_answer_extracted=bool(metadata.get("final_answer_extracted")), + final_answer_source=str(metadata.get("final_answer_source")) + if metadata.get("final_answer_source") is not None + else None, + raw_log_paths=list(metadata.get("raw_log_paths") or []), + ), + metadata=AgentAttemptMetadata( + agent_runtime=str(metadata.get("agent_runtime", "unknown")), + agent_model=str(metadata.get("agent_model", "unknown")), + agent_runtime_version=str(metadata["agent_runtime_version"]) + if metadata.get("agent_runtime_version") is not None + else None, + repo_revision=str(metadata["repo_revision"]) if metadata.get("repo_revision") is not None else None, + run_id=str(metadata["run_id"]) if metadata.get("run_id") is not None else None, + exit_code=int(metadata["exit_code"]) if isinstance(metadata.get("exit_code"), int) else None, + duration_ms=int(metadata["duration_ms"]) if isinstance(metadata.get("duration_ms"), int | float) else None, + ), + trace=AgentAttemptTrace(atif_path=str(trace_path)) if trace_path else None, + ) + + +def _evidence_descriptors( + layout: AgenticRunLayout, + artifacts: AgentArtifacts, + *, + initial_state_ref: str | None = None, +) -> 
dict[str, EvidenceDescriptor]: + """Compose the SDK's standard evidence keys + the platform ``state`` extension. + + The doc-standard keys (``initial_state``/``trace``/``logs``/``final_state``/ + ``verifier_logs``) come from :func:`standard_evidence_descriptors`. ``state`` + is a NeMo-Platform-specific *extension* (not a doc key): it carries the + preserved platform/database state across the agent + verifier phases. + """ + descriptors = standard_evidence_descriptors( + logs_dir=layout.agent_log_dir, + final_state_dir=layout.workspace_dir, + trace_path=artifacts.atif_trajectory_path, + initial_state_ref=initial_state_ref, + verifier_logs_dir=layout.run_dir / "verifier", + primary_log="nat_agent.log", + ) + + # Platform extension (non-doc key): preserved platform/db state across phases. + descriptors["state"] = EvidenceDescriptor( + kind="filesystem", + format="dir", + ref=str(layout.state_dir), + metadata={"role": "platform_state", "extension": "nemo-platform"}, + ) + + return descriptors + + +def _raw_log_paths(agent_log_dir: Path) -> list[str]: + if not agent_log_dir.is_dir(): + return [] + return [str(path.relative_to(agent_log_dir)) for path in sorted(agent_log_dir.iterdir()) if path.is_file()] + + +def _read_agent_log(agent_log_dir: Path) -> str: + log_path = agent_log_dir / "nat_agent.log" + if log_path.is_file(): + return log_path.read_text(encoding="utf-8", errors="replace") + return "" + + +# --------------------------------------------------------------------------- # +# Attempt construction from nat_runner result.json +# --------------------------------------------------------------------------- # +# Token/cost measurement keys carried in result.json["metrics"]. 
+_METRIC_KEYS = ( + "prompt_tokens", + "completion_tokens", + "total_tokens", + "cache_creation_tokens", + "cache_read_tokens", + "n_assistant_messages", + "cost_usd", + "num_turns", + "duration_ms", + "token_metrics_status", + "token_metrics_note", +) + + +class ResultDirAttemptSource: + """``AgentAttemptSource`` adapting ``nat_runner`` ``result.json`` dirs into attempts. + + Implements the SDK :class:`~nemo_evaluator_sdk.agent_eval.types.AgentAttemptSource` + protocol so the generic orchestrator's offline path can rescore captured runs. + """ + + def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt: + return attempt_from_result_dir(source, task=task) + + +def attempt_from_result_dir(output_dir: str | Path, *, task: AgentEvalTask | None = None) -> AgentEvalAttempt: + """Load ``/result.json`` and build an attempt from it.""" + output_dir = Path(output_dir) + result_path = output_dir / "result.json" + if not result_path.is_file(): + raise FileNotFoundError(f"result.json not found in {output_dir}") + result = json.loads(result_path.read_text(encoding="utf-8")) + return attempt_from_result(result, output_dir=output_dir, task=task) + + +def attempt_from_result( + result: dict[str, Any], + *, + output_dir: str | Path | None = None, + task: AgentEvalTask | None = None, +) -> AgentEvalAttempt: + """Project a ``result.json`` dict onto :class:`AgentEvalAttempt`. + + The attempt ``status`` reflects whether the agent produced a usable + response (``agent`` phase outcome). Pass/fail from the verifier is recorded + as a *measurement* in metadata (``reward``/``passed``) so scoring metrics — + not the runtime — remain the source of truth. 
+ """ + task_id = str(result.get("task") or (task.id if task is not None else "unknown")) + backend = str(result.get("agent_backend") or "unknown") + resolved_dir = Path(output_dir) if output_dir is not None else Path(str(result.get("output_dir") or ".")) + layout = _layout_from_result_dir(resolved_dir) + + agent_phase = str(result.get("agent") or "") + agent_ok = agent_phase in {"ok", "skipped"} + status = resolve_attempt_status(agent_ok) + + output_text, final_extracted, final_source = _resolve_output_text(layout) + if not output_text: + output_text = "" if agent_ok else "(agent phase failed)" + + descriptors = _evidence_descriptors( + layout, AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir) + ) + + metrics = dict(result.get("metrics") or {}) + metadata: dict[str, Any] = { + # Canonical CapturedAgentAttempt-style provenance fields. + "agent_runtime": backend, + "agent_model": result.get("agent_model"), + "run_id": (result.get("provenance") or {}).get("run_id"), + "exit_code": 0 if agent_ok else 1, + "duration_ms": metrics.get("duration_ms"), + # Phase outcomes from result.json. + "agent_ok": agent_ok, + "build_status": result.get("build"), + "agent_status": result.get("agent"), + "verify_status": result.get("verify"), + # Measurements (verifier reward is a measurement, not attempt status). + "passed": result.get("passed"), + "reward": result.get("reward"), + "runtime_sec": result.get("runtime_sec"), + "verifier_scores": result.get("verifier_scores"), + # Provenance + candidate identity. + "provenance": result.get("provenance"), + "candidate_id": result.get("candidate_id"), + "candidate_params": result.get("candidate_params"), + "image": result.get("image"), + "output_dir": str(resolved_dir), + # Artifact discovery helpers. 
+ "agent_log_dir": str(layout.agent_log_dir), + "workspace_dir": str(layout.workspace_dir), + "state_dir": str(layout.state_dir), + "final_answer_extracted": final_extracted, + "final_answer_source": final_source, + } + metadata.update({key: metrics.get(key) for key in _METRIC_KEYS}) + + return AgentEvalAttempt( + id=f"{task_id}:{backend}", + task_id=task_id, + status=status, + output=AgentOutput(text=output_text), + evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None, + metadata=metadata, + ) + + +def _layout_from_result_dir(output_dir: Path) -> AgenticRunLayout: + agent_log_dir = output_dir / "agent" + return AgenticRunLayout( + run_dir=output_dir, + agent_log_dir=agent_log_dir, + workspace_dir=output_dir / "workspace", + state_dir=output_dir / "state", + instruction_path=agent_log_dir / "instruction.md", + ) + + +def _resolve_output_text(layout: AgenticRunLayout) -> tuple[str, bool, str | None]: + if not layout.agent_log_dir.is_dir(): + return "", False, None + artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir) + if artifacts.final_answer.extracted and artifacts.final_answer.text: + return artifacts.final_answer.text, True, artifacts.final_answer.source + log_path = layout.agent_log_dir / "nat_agent.log" + if log_path.is_file(): + return log_path.read_text(encoding="utf-8", errors="replace").strip(), False, None + return "", False, None + + +# --------------------------------------------------------------------------- # +# Live VERIFY phase through the SDK environment boundary +# --------------------------------------------------------------------------- # +def verifier_log_dir(layout: AgenticRunLayout) -> Path: + return layout.run_dir / "verifier" + + +def build_verify_run_spec( + task_dir: Path, + layout: AgenticRunLayout, + *, + nmp_base_url: str, + agent_backend: str, + agent_model: str, + smoke_workspace: str | None = None, + timeout_sec: int | None = None, + extra_args: list[str] | None = 
None, +) -> EnvRunSpec | None: + """Build the verifier ``EnvRunSpec`` mirroring ``nat_runner.run_verify_phase``. + + Returns ``None`` when the task has no ``tests/test_outputs.py`` (nothing to + verify), matching the runner's behavior. + """ + tests_dir = task_dir / "tests" + if not (tests_dir / "test_outputs.py").exists(): + return None + + log_dir = verifier_log_dir(layout) + log_dir.mkdir(parents=True, exist_ok=True) + layout.workspace_dir.mkdir(parents=True, exist_ok=True) + + smoke_seed_cmd = "" + smoke_cleanup_cmd = "" + if smoke_workspace: + smoke_seed_cmd = textwrap.dedent("""\ + /app/.venv/bin/nemo workspaces create "${SMOKE_WORKSPACE}" \ + --description "Seeded by agentic runtime smoke mode" >/dev/null 2>&1 || true + """) + smoke_cleanup_cmd = textwrap.dedent("""\ + /app/.venv/bin/nemo workspaces delete "${SMOKE_WORKSPACE}" >/dev/null 2>&1 || true + """) + + verify_cmd = [ + "bash", + "-c", + textwrap.dedent(f"""\ + export PYTHONPATH="/app/tests/agentic-use/shared:/app/packages/nemo_evaluator_sdk/src:${{PYTHONPATH}}" + export NAT_AGENT=1 + {smoke_seed_cmd} + /app/.venv/bin/python -m pytest /tests/test_outputs.py -rA -v 2>&1 | tee /logs/verifier/test-stdout.txt + EXIT=${{PIPESTATUS[0]}} + {smoke_cleanup_cmd} + if [ $EXIT -eq 0 ]; then echo 1; else echo 0; fi > /logs/verifier/reward.txt + exit $EXIT + """), + ] + + env: dict[str, str] = { + "NMP_BASE_URL": nmp_base_url, + "NAT_AGENT": "1", + "NAT_AGENT_BACKEND": agent_backend, + "NAT_AGENT_MODEL": agent_model, + "AGENTIC_USE_TASK_DIR": "/task", + "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace", + "SMOKE_WORKSPACE": smoke_workspace or "", + "DATABASE_DIALECT": "sqlite", + "DATABASE_PATH": "/data/nmp-platform.db", + "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG, + "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH, + } + if DOCKER_SOCKET_HOST_PATH.exists(): + env["DOCKER_HOST"] = f"unix://{DOCKER_SOCKET_CONTAINER_PATH}" + + mounts: list[tuple[str, str]] = [ + (str(tests_dir), "/tests"), + (str(task_dir), 
"/task"), + (str(layout.workspace_dir), "/app/workspace"), + (str(SHARED_DIR), "/app/tests/agentic-use/shared:ro"), + (str(EVALUATOR_SDK_SRC), "/app/packages/nemo_evaluator_sdk/src:ro"), + (str(layout.agent_log_dir), "/logs/agent"), + (str(log_dir), "/logs/verifier"), + # Persist platform/db state across AGENT and VERIFY containers. + (str(layout.state_dir), "/data"), + ] + if DOCKER_SOCKET_HOST_PATH.exists(): + mounts.append((str(DOCKER_SOCKET_HOST_PATH), DOCKER_SOCKET_CONTAINER_PATH)) + + return EnvRunSpec( + command=verify_cmd, + env=env, + mounts=mounts, + timeout=timeout_sec, + extra_args=list(extra_args or []), + ) + + +async def run_verify( + handle: AgentEnvironmentHandle, + spec: EnvRunSpec, + layout: AgenticRunLayout, +) -> VerifierOutcome: + """Execute the verifier through the environment handle and collect reward.""" + result = await handle.run_verifier(spec) + return collect_verifier_outcome( + ok=result.ok, + exit_code=result.exit_code, + log_dir=verifier_log_dir(layout), + ) + + +async def maybe_run_verify( + handle: AgentEnvironmentHandle, + *, + enabled: bool, + task_dir: Path, + layout: AgenticRunLayout, + nmp_base_url: str, + agent_backend: str, + agent_model: str, + smoke_workspace: str | None = None, + timeout_sec: int | None = None, + extra_args: list[str] | None = None, +) -> VerifierOutcome: + """Run the verifier through ``handle`` when enabled and a verifier exists.""" + if not enabled: + return skipped_outcome() + spec = build_verify_run_spec( + task_dir, + layout, + nmp_base_url=nmp_base_url, + agent_backend=agent_backend, + agent_model=agent_model, + smoke_workspace=smoke_workspace, + timeout_sec=timeout_sec, + extra_args=extra_args, + ) + if spec is None: + return skipped_outcome() + return await run_verify(handle, spec, layout) + + +# --------------------------------------------------------------------------- # +# Agentic-use task loader +# --------------------------------------------------------------------------- # +def 
load_task_toml(task_dir: Path) -> dict[str, object]: + task_toml = task_dir / "task.toml" + if not task_toml.exists(): + return {} + try: + with task_toml.open("rb") as handle: + data = tomllib.load(handle) + except Exception: + return {} + return data if isinstance(data, dict) else {} + + +def task_agent_timeout_sec(task_dir: Path) -> int | None: + data = load_task_toml(task_dir) + agent = data.get("agent") + if not isinstance(agent, dict): + return None + timeout_value = agent.get("timeout_sec") + if isinstance(timeout_value, (int, float)) and timeout_value > 0: + return int(timeout_value) + return None + + +def agentic_task_from_dir( + task_dir: str | Path, + *, + tasks_root: Path | None = None, + metrics: list[Metric] | None = None, +) -> AgentEvalTask: + """Build an :class:`AgentEvalTask` from an agentic-use task directory. + + ``inputs`` carries only agent-facing material (``instruction``) per the SDK + design doc; runtime materialization details such as ``task_dir`` live in + ``metadata`` so they cannot leak into metric scoring rows. Metrics are + authored *on the task* (defaulting to :class:`AgentPhaseSuccessMetric`); the + orchestrator only appends compatibility metrics, it does not own the set. 
+ """ + root = Path(tasks_root or AGENTIC_USE_DIR) + task_path = Path(task_dir) + if not task_path.is_absolute(): + task_path = (root / task_path).resolve() + + instruction_path = task_path / "instruction.md" + if not instruction_path.exists(): + raise FileNotFoundError(f"instruction.md not found in {task_path}") + + instruction = instruction_path.read_text(encoding="utf-8") + task_toml = load_task_toml(task_path) + + return AgentEvalTask( + id=task_path.name, + intent=instruction, + inputs={ + "instruction": instruction, + }, + metrics=metrics if metrics is not None else [AgentPhaseSuccessMetric()], + metadata={ + "benchmark": "agentic-use", + "task_toml": task_toml, + "instruction_path": str(instruction_path), + "task_dir": str(task_path), + }, + ) diff --git a/tests/agentic-use/runtimes/shared/reporting.py b/tests/agentic-use/runtimes/shared/reporting.py deleted file mode 100644 index 7e78de3972..0000000000 --- a/tests/agentic-use/runtimes/shared/reporting.py +++ /dev/null @@ -1,34 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Compatibility shim — gating was promoted to the Evaluator SDK. - -Import from ``nemo_evaluator_sdk.agent_eval.gating`` directly; this module -re-exports the same symbols so existing adapter imports keep working. 
-""" - -from __future__ import annotations - -from nemo_evaluator_sdk.agent_eval.gating import ( - DEFAULT_REWARD_OUTPUTS, - GateCheck, - GateReport, - GateThresholds, - evaluate_gate, - load_baseline_summary, - run_gate_checks, - summarize_run, - write_gate_report, -) - -__all__ = [ - "DEFAULT_REWARD_OUTPUTS", - "GateCheck", - "GateReport", - "GateThresholds", - "evaluate_gate", - "load_baseline_summary", - "run_gate_checks", - "summarize_run", - "write_gate_report", -] diff --git a/tests/agentic-use/runtimes/shared/result_adapter.py b/tests/agentic-use/runtimes/shared/result_adapter.py deleted file mode 100644 index bb0d3fe567..0000000000 --- a/tests/agentic-use/runtimes/shared/result_adapter.py +++ /dev/null @@ -1,156 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Adapt ``nat_runner`` ``result.json`` records into ``AgentEvalAttempt`` values. - -This bridges the existing ``nat_runner`` output contract (see -``nat_runner._write_result``) onto the agent-eval SDK so a run that already -produced ``result.json`` can be imported as an attempt without re-executing the -agent. Per the design doc, ``result.json`` carries the attempt *status*, -*measurements* (reward + token/cost), and *provenance*. -""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any - -from evaluator_agent_eval.artifacts import AgentArtifacts -from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalTask, AgentOutput -from nemo_evaluator_sdk.values.evidence import CandidateEvidence - -from runtimes.shared.artifacts import _evidence_descriptors, resolve_attempt_status # reuse documented helpers -from runtimes.shared.layout import AgenticRunLayout - -# Token/cost measurement keys carried in result.json["metrics"]. 
-_METRIC_KEYS = ( - "prompt_tokens", - "completion_tokens", - "total_tokens", - "cache_creation_tokens", - "cache_read_tokens", - "n_assistant_messages", - "cost_usd", - "num_turns", - "duration_ms", - "token_metrics_status", - "token_metrics_note", -) - - -class ResultDirAttemptSource: - """``AgentAttemptSource`` adapting ``nat_runner`` ``result.json`` dirs into attempts. - - Implements the SDK :class:`~nemo_evaluator_sdk.agent_eval.types.AgentAttemptSource` - protocol so the generic orchestrator's offline path can rescore captured runs. - """ - - def load_attempt(self, source: str | Path, *, task: AgentEvalTask) -> AgentEvalAttempt: - return attempt_from_result_dir(source, task=task) - - -def attempt_from_result_dir(output_dir: str | Path, *, task: AgentEvalTask | None = None) -> AgentEvalAttempt: - """Load ``/result.json`` and build an attempt from it.""" - output_dir = Path(output_dir) - result_path = output_dir / "result.json" - if not result_path.is_file(): - raise FileNotFoundError(f"result.json not found in {output_dir}") - result = json.loads(result_path.read_text(encoding="utf-8")) - return attempt_from_result(result, output_dir=output_dir, task=task) - - -def attempt_from_result( - result: dict[str, Any], - *, - output_dir: str | Path | None = None, - task: AgentEvalTask | None = None, -) -> AgentEvalAttempt: - """Project a ``result.json`` dict onto :class:`AgentEvalAttempt`. - - The attempt ``status`` reflects whether the agent produced a usable - response (``agent`` phase outcome). Pass/fail from the verifier is recorded - as a *measurement* in metadata (``reward``/``passed``) so scoring metrics — - not the runtime — remain the source of truth. 
- """ - task_id = str(result.get("task") or (task.id if task is not None else "unknown")) - backend = str(result.get("agent_backend") or "unknown") - resolved_dir = Path(output_dir) if output_dir is not None else Path(str(result.get("output_dir") or ".")) - layout = _layout_from_result_dir(resolved_dir) - - agent_phase = str(result.get("agent") or "") - agent_ok = agent_phase in {"ok", "skipped"} - status = resolve_attempt_status(agent_ok) - - output_text, final_extracted, final_source = _resolve_output_text(layout) - if not output_text: - output_text = "" if agent_ok else "(agent phase failed)" - - descriptors = _evidence_descriptors( - layout, AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir) - ) - - metrics = dict(result.get("metrics") or {}) - metadata: dict[str, Any] = { - # Canonical CapturedAgentAttempt-style provenance fields. - "agent_runtime": backend, - "agent_model": result.get("agent_model"), - "run_id": (result.get("provenance") or {}).get("run_id"), - "exit_code": 0 if agent_ok else 1, - "duration_ms": metrics.get("duration_ms"), - # Phase outcomes from result.json. - "agent_ok": agent_ok, - "build_status": result.get("build"), - "agent_status": result.get("agent"), - "verify_status": result.get("verify"), - # Measurements (verifier reward is a measurement, not attempt status). - "passed": result.get("passed"), - "reward": result.get("reward"), - "runtime_sec": result.get("runtime_sec"), - "verifier_scores": result.get("verifier_scores"), - # Provenance + candidate identity. - "provenance": result.get("provenance"), - "candidate_id": result.get("candidate_id"), - "candidate_params": result.get("candidate_params"), - "image": result.get("image"), - "output_dir": str(resolved_dir), - # Artifact discovery helpers. 
- "agent_log_dir": str(layout.agent_log_dir), - "workspace_dir": str(layout.workspace_dir), - "state_dir": str(layout.state_dir), - "final_answer_extracted": final_extracted, - "final_answer_source": final_source, - } - metadata.update({key: metrics.get(key) for key in _METRIC_KEYS}) - - return AgentEvalAttempt( - id=f"{task_id}:{backend}", - task_id=task_id, - status=status, - output=AgentOutput(text=output_text), - evidence=CandidateEvidence(descriptors=descriptors) if descriptors else None, - metadata=metadata, - ) - - -def _layout_from_result_dir(output_dir: Path) -> AgenticRunLayout: - agent_log_dir = output_dir / "agent" - return AgenticRunLayout( - run_dir=output_dir, - agent_log_dir=agent_log_dir, - workspace_dir=output_dir / "workspace", - state_dir=output_dir / "state", - instruction_path=agent_log_dir / "instruction.md", - ) - - -def _resolve_output_text(layout: AgenticRunLayout) -> tuple[str, bool, str | None]: - if not layout.agent_log_dir.is_dir(): - return "", False, None - artifacts = AgentArtifacts.from_dir(layout.agent_log_dir, workspace_dir=layout.workspace_dir) - if artifacts.final_answer.extracted and artifacts.final_answer.text: - return artifacts.final_answer.text, True, artifacts.final_answer.source - log_path = layout.agent_log_dir / "nat_agent.log" - if log_path.is_file(): - return log_path.read_text(encoding="utf-8", errors="replace").strip(), False, None - return "", False, None diff --git a/tests/agentic-use/runtimes/shared/task_loader.py b/tests/agentic-use/runtimes/shared/task_loader.py deleted file mode 100644 index e64a87e99d..0000000000 --- a/tests/agentic-use/runtimes/shared/task_loader.py +++ /dev/null @@ -1,80 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -"""Map agentic-use task directories to AgentEvalTask values.""" - -from __future__ import annotations - -import tomllib -from pathlib import Path - -from nemo_evaluator_sdk.agent_eval.types import AgentEvalTask -from nemo_evaluator_sdk.metrics.protocol import Metric - -from runtimes.shared.constants import AGENTIC_USE_DIR -from runtimes.shared.metrics import AgentPhaseSuccessMetric - - -def load_task_toml(task_dir: Path) -> dict[str, object]: - task_toml = task_dir / "task.toml" - if not task_toml.exists(): - return {} - try: - with task_toml.open("rb") as handle: - data = tomllib.load(handle) - except Exception: - return {} - return data if isinstance(data, dict) else {} - - -def task_agent_timeout_sec(task_dir: Path) -> int | None: - data = load_task_toml(task_dir) - agent = data.get("agent") - if not isinstance(agent, dict): - return None - timeout_value = agent.get("timeout_sec") - if isinstance(timeout_value, (int, float)) and timeout_value > 0: - return int(timeout_value) - return None - - -def agentic_task_from_dir( - task_dir: str | Path, - *, - tasks_root: Path | None = None, - metrics: list[Metric] | None = None, -) -> AgentEvalTask: - """Build an :class:`AgentEvalTask` from an agentic-use task directory. - - ``inputs`` carries only agent-facing material (``instruction``) per the SDK - design doc; runtime materialization details such as ``task_dir`` live in - ``metadata`` so they cannot leak into metric scoring rows. Metrics are - authored *on the task* (defaulting to :class:`AgentPhaseSuccessMetric`); the - orchestrator only appends compatibility metrics, it does not own the set. 
- """ - root = Path(tasks_root or AGENTIC_USE_DIR) - task_path = Path(task_dir) - if not task_path.is_absolute(): - task_path = (root / task_path).resolve() - - instruction_path = task_path / "instruction.md" - if not instruction_path.exists(): - raise FileNotFoundError(f"instruction.md not found in {task_path}") - - instruction = instruction_path.read_text(encoding="utf-8") - task_toml = load_task_toml(task_path) - - return AgentEvalTask( - id=task_path.name, - intent=instruction, - inputs={ - "instruction": instruction, - }, - metrics=metrics if metrics is not None else [AgentPhaseSuccessMetric()], - metadata={ - "benchmark": "agentic-use", - "task_toml": task_toml, - "instruction_path": str(instruction_path), - "task_dir": str(task_path), - }, - ) diff --git a/tests/agentic-use/runtimes/shared/usage.py b/tests/agentic-use/runtimes/shared/usage.py deleted file mode 100644 index 89053ffb97..0000000000 --- a/tests/agentic-use/runtimes/shared/usage.py +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Token usage extraction from agent logs. - -Reuses the proven implementation from ``nat_runner.py`` until the legacy -runner delegates here and the duplicate can be removed. 
-""" - -from __future__ import annotations - -from typing import TypedDict - - -class TokenMetrics(TypedDict): - prompt_tokens: int | None - completion_tokens: int | None - total_tokens: int | None - cache_creation_tokens: int | None - cache_read_tokens: int | None - n_assistant_messages: int | None - cost_usd: float | None - num_turns: int | None - duration_ms: float | None - - -def extract_usage_metrics(agent_log: str) -> dict[str, int | float | None]: - """Extract token usage metrics from an agent log.""" - import nat_runner - - metrics = nat_runner._extract_usage_metrics(agent_log) - return dict(metrics) diff --git a/tests/agentic-use/runtimes/shared/verify.py b/tests/agentic-use/runtimes/shared/verify.py deleted file mode 100644 index f83da8edb3..0000000000 --- a/tests/agentic-use/runtimes/shared/verify.py +++ /dev/null @@ -1,181 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Live VERIFY phase executed through the environment boundary. - -Ports ``nat_runner.run_verify_phase`` onto :meth:`AgentEnvironmentHandle.run_verifier` -so the task-local ``tests/test_outputs.py`` pytest verifier runs in the *same* -prepared environment (and against the same persisted workspace/state) as the -agent phase. The resulting reward is stamped onto the attempt metadata so the -``VerifierRewardMetric`` compatibility metric scores it through the Evaluator SDK. 
-""" - -from __future__ import annotations - -import textwrap -from pathlib import Path - -from nemo_evaluator_sdk.agent_eval.runtimes.verify import ( - VerifierOutcome, - apply_verify_to_metadata, - collect_verifier_outcome, - skipped_outcome, -) - -from runtimes.shared.constants import ( - DOCKER_SOCKET_CONTAINER_PATH, - DOCKER_SOCKET_HOST_PATH, - EVALUATOR_SDK_SRC, - FILES_STORAGE_CONFIG, - PLATFORM_CONFIG_PATH, - SHARED_DIR, -) -from runtimes.shared.environment import AgentEnvironmentHandle, EnvRunSpec -from runtimes.shared.layout import AgenticRunLayout - -__all__ = [ - "VerifierOutcome", - "apply_verify_to_metadata", - "build_verify_run_spec", - "maybe_run_verify", - "run_verify", - "verifier_log_dir", -] - - -def verifier_log_dir(layout: AgenticRunLayout) -> Path: - return layout.run_dir / "verifier" - - -def build_verify_run_spec( - task_dir: Path, - layout: AgenticRunLayout, - *, - nmp_base_url: str, - agent_backend: str, - agent_model: str, - smoke_workspace: str | None = None, - timeout_sec: int | None = None, - extra_args: list[str] | None = None, -) -> EnvRunSpec | None: - """Build the verifier ``EnvRunSpec`` mirroring ``nat_runner.run_verify_phase``. - - Returns ``None`` when the task has no ``tests/test_outputs.py`` (nothing to - verify), matching the runner's behavior. 
- """ - tests_dir = task_dir / "tests" - if not (tests_dir / "test_outputs.py").exists(): - return None - - log_dir = verifier_log_dir(layout) - log_dir.mkdir(parents=True, exist_ok=True) - layout.workspace_dir.mkdir(parents=True, exist_ok=True) - - smoke_seed_cmd = "" - smoke_cleanup_cmd = "" - if smoke_workspace: - smoke_seed_cmd = textwrap.dedent("""\ - /app/.venv/bin/nemo workspaces create "${SMOKE_WORKSPACE}" \ - --description "Seeded by agentic runtime smoke mode" >/dev/null 2>&1 || true - """) - smoke_cleanup_cmd = textwrap.dedent("""\ - /app/.venv/bin/nemo workspaces delete "${SMOKE_WORKSPACE}" >/dev/null 2>&1 || true - """) - - verify_cmd = [ - "bash", - "-c", - textwrap.dedent(f"""\ - export PYTHONPATH="/app/tests/agentic-use/shared:/app/packages/nemo_evaluator_sdk/src:${{PYTHONPATH}}" - export NAT_AGENT=1 - {smoke_seed_cmd} - /app/.venv/bin/python -m pytest /tests/test_outputs.py -rA -v 2>&1 | tee /logs/verifier/test-stdout.txt - EXIT=${{PIPESTATUS[0]}} - {smoke_cleanup_cmd} - if [ $EXIT -eq 0 ]; then echo 1; else echo 0; fi > /logs/verifier/reward.txt - exit $EXIT - """), - ] - - env: dict[str, str] = { - "NMP_BASE_URL": nmp_base_url, - "NAT_AGENT": "1", - "NAT_AGENT_BACKEND": agent_backend, - "NAT_AGENT_MODEL": agent_model, - "AGENTIC_USE_TASK_DIR": "/task", - "AGENTIC_USE_WORKSPACE_DIR": "/app/workspace", - "SMOKE_WORKSPACE": smoke_workspace or "", - "DATABASE_DIALECT": "sqlite", - "DATABASE_PATH": "/data/nmp-platform.db", - "NMP_FILES_DEFAULT_STORAGE_CONFIG": FILES_STORAGE_CONFIG, - "NMP_CONFIG_FILE_PATH": PLATFORM_CONFIG_PATH, - } - if DOCKER_SOCKET_HOST_PATH.exists(): - env["DOCKER_HOST"] = f"unix://{DOCKER_SOCKET_CONTAINER_PATH}" - - mounts: list[tuple[str, str]] = [ - (str(tests_dir), "/tests"), - (str(task_dir), "/task"), - (str(layout.workspace_dir), "/app/workspace"), - (str(SHARED_DIR), "/app/tests/agentic-use/shared:ro"), - (str(EVALUATOR_SDK_SRC), "/app/packages/nemo_evaluator_sdk/src:ro"), - (str(layout.agent_log_dir), "/logs/agent"), - 
(str(log_dir), "/logs/verifier"), - # Persist platform/db state across AGENT and VERIFY containers. - (str(layout.state_dir), "/data"), - ] - if DOCKER_SOCKET_HOST_PATH.exists(): - mounts.append((str(DOCKER_SOCKET_HOST_PATH), DOCKER_SOCKET_CONTAINER_PATH)) - - return EnvRunSpec( - command=verify_cmd, - env=env, - mounts=mounts, - timeout=timeout_sec, - extra_args=list(extra_args or []), - ) - - -async def run_verify( - handle: AgentEnvironmentHandle, - spec: EnvRunSpec, - layout: AgenticRunLayout, -) -> VerifierOutcome: - """Execute the verifier through the environment handle and collect reward.""" - result = await handle.run_verifier(spec) - return collect_verifier_outcome( - ok=result.ok, - exit_code=result.exit_code, - log_dir=verifier_log_dir(layout), - ) - - -async def maybe_run_verify( - handle: AgentEnvironmentHandle, - *, - enabled: bool, - task_dir: Path, - layout: AgenticRunLayout, - nmp_base_url: str, - agent_backend: str, - agent_model: str, - smoke_workspace: str | None = None, - timeout_sec: int | None = None, - extra_args: list[str] | None = None, -) -> VerifierOutcome: - """Run the verifier through ``handle`` when enabled and a verifier exists.""" - if not enabled: - return skipped_outcome() - spec = build_verify_run_spec( - task_dir, - layout, - nmp_base_url=nmp_base_url, - agent_backend=agent_backend, - agent_model=agent_model, - smoke_workspace=smoke_workspace, - timeout_sec=timeout_sec, - extra_args=extra_args, - ) - if spec is None: - return skipped_outcome() - return await run_verify(handle, spec, layout) diff --git a/tests/agentic-use/runtimes/workflow/runtime.py b/tests/agentic-use/runtimes/workflow/runtime.py index 1d8c09fecd..55688b3d24 100644 --- a/tests/agentic-use/runtimes/workflow/runtime.py +++ b/tests/agentic-use/runtimes/workflow/runtime.py @@ -8,20 +8,21 @@ from collections.abc import Sequence from pathlib import Path +from nemo_evaluator_sdk.agent_eval.runtimes.environment import AgentEnvironmentProvider, EnvRunSpec +from 
nemo_evaluator_sdk.agent_eval.runtimes.verify import apply_verify_to_metadata from nemo_evaluator_sdk.agent_eval.types import AgentEvalAttempt, AgentEvalRunConfig, AgentEvalTask -from runtimes.shared.artifacts import build_agent_eval_attempt from runtimes.shared.config import WorkflowRuntimeConfig from runtimes.shared.constants import INSTRUCTION_CONTAINER_PATH, WORKFLOW_CONTAINER_PATH -from runtimes.shared.container_env import base_container_env -from runtimes.shared.environment import ( - AgentEnvironmentProvider, +from runtimes.shared.platform import ( + AgenticRunLayout, DockerEnvironmentProvider, - EnvRunSpec, + base_container_env, + build_agent_eval_attempt, + maybe_run_verify, + resolve_run_layout, + task_agent_timeout_sec, ) -from runtimes.shared.layout import AgenticRunLayout, resolve_run_layout -from runtimes.shared.task_loader import task_agent_timeout_sec -from runtimes.shared.verify import apply_verify_to_metadata, maybe_run_verify from runtimes.workflow.command import build_workflow_agent_cmd from runtimes.workflow.prep import prepare_workflow_for_runtime diff --git a/tests/agentic-use/tests/test_agentic_runtimes.py b/tests/agentic-use/tests/test_agentic_runtimes.py index 5a3590071d..1903989705 100644 --- a/tests/agentic-use/tests/test_agentic_runtimes.py +++ b/tests/agentic-use/tests/test_agentic_runtimes.py @@ -10,10 +10,9 @@ import pytest import yaml +from nemo_evaluator_sdk.agent_eval.runtimes.environment import EnvCommandResult, EnvRunSpec from runtimes.shared.config import AgenticSharedConfig, WorkflowRuntimeConfig -from runtimes.shared.environment import EnvCommandResult, EnvRunSpec -from runtimes.shared.layout import resolve_run_layout, task_image_tag -from runtimes.shared.task_loader import agentic_task_from_dir +from runtimes.shared.platform import agentic_task_from_dir, resolve_run_layout, task_image_tag from runtimes.workflow.command import build_workflow_agent_cmd from runtimes.workflow.prep import prepare_workflow_for_runtime from 
runtimes.workflow.runtime import NatWorkflowAttemptRuntime @@ -122,8 +121,7 @@ def test_runtime_for_backend_rejects_unknown() -> None: def test_build_agent_eval_attempt_metadata_matches_captured_schema(tmp_path: Path) -> None: - from runtimes.shared.artifacts import build_agent_eval_attempt, to_captured_agent_attempt - from runtimes.shared.layout import AgenticRunLayout + from runtimes.shared.platform import AgenticRunLayout, build_agent_eval_attempt, to_captured_agent_attempt task = agentic_task_from_dir(WORKSPACE_BASIC, tasks_root=TASKS_DIR) layout = AgenticRunLayout( @@ -183,7 +181,7 @@ async def test_aut_runtime_run_tasks_with_mocked_env(tmp_path: Path) -> None: def test_attempt_from_result_maps_status_and_measurements(tmp_path: Path) -> None: - from runtimes.shared.result_adapter import attempt_from_result + from runtimes.shared.platform import attempt_from_result output_dir = tmp_path / "20260101T000000Z-demo" (output_dir / "agent").mkdir(parents=True) @@ -218,7 +216,7 @@ def test_attempt_from_result_maps_status_and_measurements(tmp_path: Path) -> Non def test_attempt_from_result_marks_unsuccessful_agent_partial(tmp_path: Path) -> None: - from runtimes.shared.result_adapter import attempt_from_result + from runtimes.shared.platform import attempt_from_result output_dir = tmp_path / "run" (output_dir / "agent").mkdir(parents=True) @@ -271,7 +269,7 @@ async def test_score_captured_attempts_offline(tmp_path: Path) -> None: @pytest.mark.asyncio async def test_verifier_reward_metric_reads_metadata() -> None: from nemo_evaluator_sdk.metrics.protocol import CandidateOutput, DatasetRow, MetricInput - from runtimes.shared.metrics import VerifierRewardMetric + from runtimes.shared.platform import VerifierRewardMetric metric = VerifierRewardMetric() candidate = CandidateOutput(output_text="x", metadata={"reward": 1}) @@ -320,7 +318,7 @@ def _make_run_result(*, reward: float, total_tokens: int, runtime_sec: float, co def 
test_summarize_run_aggregates_pass_tokens_runtime_provenance() -> None: - from runtimes.shared.reporting import summarize_run + from nemo_evaluator_sdk.agent_eval.gating import summarize_run summary = summarize_run(_make_run_result(reward=1.0, total_tokens=120, runtime_sec=4.5)) @@ -333,7 +331,7 @@ def test_summarize_run_aggregates_pass_tokens_runtime_provenance() -> None: def test_evaluate_gate_passes_then_flags_token_regression(tmp_path: Path) -> None: - from runtimes.shared.reporting import GateThresholds, evaluate_gate, write_gate_report + from nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate, write_gate_report baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0) candidate = _make_run_result(reward=1.0, total_tokens=200, runtime_sec=4.0) @@ -356,7 +354,7 @@ def test_evaluate_gate_passes_then_flags_token_regression(tmp_path: Path) -> Non def test_evaluate_gate_blocks_cross_commit_comparison() -> None: - from runtimes.shared.reporting import GateThresholds, evaluate_gate + from nemo_evaluator_sdk.agent_eval.gating import GateThresholds, evaluate_gate baseline = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="aaa111") candidate = _make_run_result(reward=1.0, total_tokens=100, runtime_sec=4.0, commit="bbb222") @@ -378,8 +376,7 @@ def test_evaluate_gate_blocks_cross_commit_comparison() -> None: def test_build_verify_run_spec_shape(tmp_path: Path) -> None: - from runtimes.shared.layout import AgenticRunLayout - from runtimes.shared.verify import build_verify_run_spec + from runtimes.shared.platform import AgenticRunLayout, build_verify_run_spec layout = AgenticRunLayout( run_dir=tmp_path, @@ -404,8 +401,7 @@ def test_build_verify_run_spec_shape(tmp_path: Path) -> None: def test_build_verify_run_spec_returns_none_without_tests(tmp_path: Path) -> None: - from runtimes.shared.layout import AgenticRunLayout - from runtimes.shared.verify import build_verify_run_spec + from runtimes.shared.platform 
import AgenticRunLayout, build_verify_run_spec task_dir = tmp_path / "no-tests-task" task_dir.mkdir() @@ -422,9 +418,8 @@ def test_build_verify_run_spec_returns_none_without_tests(tmp_path: Path) -> Non @pytest.mark.asyncio async def test_run_verify_reads_reward_file(tmp_path: Path) -> None: - from runtimes.shared.environment import EnvCommandResult, EnvRunSpec - from runtimes.shared.layout import AgenticRunLayout - from runtimes.shared.verify import run_verify + from nemo_evaluator_sdk.agent_eval.runtimes.environment import EnvCommandResult, EnvRunSpec + from runtimes.shared.platform import AgenticRunLayout, run_verify layout = AgenticRunLayout( run_dir=tmp_path, @@ -456,7 +451,7 @@ async def close(self) -> None: @pytest.mark.asyncio async def test_workflow_runtime_runs_verify_through_handle(tmp_path: Path) -> None: - from runtimes.shared.verify import verifier_log_dir + from runtimes.shared.platform import verifier_log_dir task = agentic_task_from_dir(WORKSPACE_BASIC, tasks_root=TASKS_DIR) layout = resolve_run_layout(task, AgenticSharedConfig(jobs_dir=tmp_path)) @@ -493,7 +488,7 @@ async def prepare(self, task: object, config: object = None) -> _Handle: def test_load_environment_spec_prefers_yaml(tmp_path: Path) -> None: - from runtimes.shared.environment_spec import load_environment_spec + from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec (tmp_path / "environment.yaml").write_text( "environment:\n" @@ -516,7 +511,7 @@ def test_load_environment_spec_prefers_yaml(tmp_path: Path) -> None: def test_load_environment_spec_falls_back_to_dockerfile(tmp_path: Path) -> None: - from runtimes.shared.environment_spec import load_environment_spec + from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec env_dir = tmp_path / "environment" env_dir.mkdir() @@ -528,14 +523,14 @@ def test_load_environment_spec_falls_back_to_dockerfile(tmp_path: Path) -> None: def 
test_load_environment_spec_missing_raises(tmp_path: Path) -> None: - from runtimes.shared.environment_spec import load_environment_spec + from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import load_environment_spec with pytest.raises(FileNotFoundError): load_environment_spec(tmp_path) def test_plan_task_build_dockerfile_escape_hatch(tmp_path: Path) -> None: - from runtimes.shared.environment_spec import plan_task_build + from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import plan_task_build env_dir = tmp_path / "environment" env_dir.mkdir() @@ -549,7 +544,7 @@ def test_plan_task_build_dockerfile_escape_hatch(tmp_path: Path) -> None: def test_plan_task_build_generates_derived_dockerfile(tmp_path: Path) -> None: - from runtimes.shared.environment_spec import plan_task_build + from nemo_evaluator_sdk.agent_eval.runtimes.environment_spec import plan_task_build (tmp_path / "environment.yaml").write_text( "environment:\n image: base:1\n dependencies:\n python: [pytest]\n setup: [seed-providers]\n",