Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
284 changes: 284 additions & 0 deletions packages/nemo_evaluator_sdk/examples/requirements_code_snippets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,284 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Requirement-oriented snippets derived from ``examples.py``.

This file exists so external requirement tables can link to concrete evaluator
SDK code examples with stable GitHub line ranges.
"""

from __future__ import annotations

from nemo_evaluator_sdk.execution.evaluator import Evaluator
from nemo_evaluator_sdk.metrics.exact_match import ExactMatchMetric
from nemo_evaluator_sdk.metrics.f1 import F1Metric
from nemo_evaluator_sdk.metrics.llm_judge import LLMJudgeMetric
from nemo_evaluator_sdk.metrics.protocol import Metric, MetricInput, MetricOutput, MetricOutputSpec, MetricResult
from nemo_evaluator_sdk.metrics.string_check import StringCheckMetric
from nemo_evaluator_sdk.values import (
InferenceParams,
JSONScoreParser,
Model,
RangeScore,
RunConfig,
SecretRef,
)
from nemo_evaluator_sdk.values.multi_metric_results import BenchmarkEvaluationResult

# Alias for BenchmarkEvaluationResult to align with the requirements' "evaluation
# suite" wording. This is the same class object under a second name, not a subclass
# or wrapper.
EvaluationSuiteResult = BenchmarkEvaluationResult

# System prompt for the helpfulness judge: one sentence per entry, joined by single spaces.
HELPFULNESS_PROMPT_V1 = " ".join(
    (
        "You are an evaluator.",
        "Rate the response's helpfulness from 0-4.",
        'Return only a JSON object with this shape: {"helpfulness": <integer>}.',
    )
)

# Offline rows already carry the candidate answer. Each tuple is
# (prompt, reference answer, recorded answer); the recorded answer fills both
# "actual" and "response", and the reference doubles as the required phrase.
# The second row is a deliberate mismatch (Berlin vs. London).
OFFLINE_SUITE_DATASET = [
    {
        "prompt": prompt,
        "reference": reference,
        "actual": actual,
        "response": actual,
        "required_phrase": reference,
    }
    for prompt, reference, actual in (
        ("What is the capital of France?", "Paris", "Paris"),
        ("What is the capital of England?", "London", "Berlin"),
    )
]

# Online rows have no recorded answer; the prompt asks for one word to be returned.
# Each tuple is (word requested in the prompt, expected word).
# NOTE(review): the second row requests "Oslo" but expects "London" - presumably a
# deliberate failing row, mirroring the offline dataset; confirm.
ONLINE_SUITE_DATASET = [
    {
        "prompt": f"Return exactly this word with no punctuation: {requested_word}",
        "reference": expected_word,
        "required_phrase": expected_word,
    }
    for requested_word, expected_word in (
        ("Paris", "Paris"),
        ("Oslo", "London"),
    )
]

# Single-turn chat template: the row's "prompt" field is rendered as the user message.
ONLINE_CHAT_PROMPT_TEMPLATE = {
    "messages": [
        {"role": "user", "content": "{{item.prompt}}"},
    ],
}

# Shared model definition passed to the LLM-judge snippets below.
# NOTE(review): the API key is referenced through SecretRef(root="NVIDIA_API_KEY"),
# presumably resolved at run time rather than stored here - confirm.
MODEL = Model(
    name="nvidia/nemotron-3-nano-30b-a3b",
    url="https://integrate.api.nvidia.com/v1/chat/completions",
    api_key_secret=SecretRef(root="NVIDIA_API_KEY"),
)


def suite_metrics() -> list[Metric]:
    """Return the two deterministic scorers shared by the suite snippets.

    Returns:
        A list holding, in order, an exact-match metric comparing ``item.reference``
        with ``item.actual`` and a ``contains`` string check of ``item.actual``
        against ``item.required_phrase``.
    """
    return [
        ExactMatchMetric(reference="{{item.reference}}", candidate="{{item.actual}}"),
        StringCheckMetric(
            operation="contains",
            left_template="{{item.actual}}",
            right_template="{{item.required_phrase}}",
        ),
    ]


def create_reference_free_judge_metric(judge_model: Model) -> LLMJudgeMetric:
    """Create an LLM-as-judge metric that rates helpfulness on a 0-4 range.

    The judge receives ``HELPFULNESS_PROMPT_V1`` as its system message. Its single
    ``helpfulness`` score is parsed with ``JSONScoreParser(json_path="helpfulness")``.

    Args:
        judge_model: Model used to produce the judgement.

    Returns:
        An ``LLMJudgeMetric`` configured with one ``RangeScore`` named ``helpfulness``.
    """
    helpfulness_score = RangeScore(
        name="helpfulness",
        minimum=0,
        maximum=4,
        parser=JSONScoreParser(json_path="helpfulness"),
        description="How well does the response help the user?",
    )
    judge_inference = InferenceParams(temperature=0.0, max_tokens=32768)

    system_message = {"role": "system", "content": HELPFULNESS_PROMPT_V1}
    # The template reads ``sample.output_text`` and defaults to the row's
    # ``response`` field, so the same judge serves online and offline rows.
    user_message = {
        "role": "user",
        "content": (
            "User prompt: {{item.prompt}}\n\n"
            "Assistant response: {{sample.output_text | default(item.response)}}\n\n"
            "Rate this response."
        ),
    }

    return LLMJudgeMetric(
        model=judge_model,
        scores=[helpfulness_score],
        inference=judge_inference,
        prompt_template={"messages": [system_message, user_message]},
    )


class CustomPythonExactMatchMetric:
    """Pure-Python custom scorer exposing the same methods as ``LLMJudgeMetric``.

    This is a deterministic code metric: it emits one continuous score, named after
    ``type``, that is 1.0 when the prediction equals the row's reference and 0.0
    otherwise.
    """

    # Metric identifier; also used as the name of the single emitted score.
    type = "custom-python-exact-match"

    def output_spec(self) -> list[MetricOutputSpec]:
        """Declare the single continuous score this metric produces."""
        return [MetricOutputSpec.continuous_score(self.type)]

    async def compute_scores(self, input: MetricInput) -> MetricResult:
        """Score one row by exact string equality against the ground truth.

        The prediction is the candidate's ``output_text``. When that is ``None``,
        the row's recorded ``"actual"`` field is used instead. The ground truth is
        always the row's ``"reference"`` field.

        Args:
            input: Metric input providing ``candidate.output_text`` and ``row.data``.

        Returns:
            A ``MetricResult`` with one output named ``type``: 1.0 on an exact
            match, 0.0 on a mismatch or when no prediction is available.
        """
        row_data = input.row.data

        prediction = input.candidate.output_text
        if prediction is None:
            # No generated text: fall back to the answer recorded in the row.
            prediction = row_data.get("actual")

        # "actual" holds a model answer, never the ground truth, so the comparison
        # target must always come from "reference".
        reference = row_data.get("reference")

        # A missing prediction never counts as a match, even if the reference is
        # also missing.
        score = 1.0 if prediction is not None and prediction == reference else 0.0
        return MetricResult(outputs=[MetricOutput(name=self.type, value=score)])


# 1a) Evaluation Suite: Dataset + Taskset
async def run_candidate_against_fixed_suite() -> EvaluationSuiteResult:
    """Evaluate the offline dataset with a fixed pair of scorers.

    Keeping both the dataset and the scorer set fixed makes results from different
    candidates directly comparable.

    Returns:
        The result returned by ``Evaluator().run`` for ``OFFLINE_SUITE_DATASET``.
    """
    exact_match = ExactMatchMetric(reference="{{item.reference}}", candidate="{{item.actual}}")
    required_phrase_check = StringCheckMetric(
        operation="contains",
        left_template="{{item.actual}}",
        right_template="{{item.required_phrase}}",
    )

    evaluator = Evaluator()
    suite_result = await evaluator.run(
        metrics=[exact_match, required_phrase_check],
        dataset=OFFLINE_SUITE_DATASET,
        config=RunConfig(parallelism=4),
    )
    return suite_result


# 1b) Per-trial & Aggregate Results
async def collect_per_trial_and_aggregate_results() -> dict[str, object]:
    """Run the fixed suite once and expose row-level and aggregate views of it.

    Returns:
        A dict with ``per_trial`` (records for the ``rows`` view), ``aggregate``
        (records for the ``aggregate`` view) and ``exact_match_aggregate`` (the
        aggregate scores of the ``exact-match`` metric).
    """
    suite_result = await run_candidate_against_fixed_suite()

    per_trial_records = suite_result.to_records(view="rows")
    aggregate_records = suite_result.to_records(view="aggregate")
    exact_match_scores = suite_result.metric_result("exact-match").aggregate_scores.scores

    return {
        "per_trial": per_trial_records,
        "aggregate": aggregate_records,
        "exact_match_aggregate": exact_match_scores,
    }


# 1c) Service Status
# Service-related code snippets use the `nemo_evaluator.sdk` plugin SDK.
# The `nemo_evaluator.sdk` plugin SDK is a client that is used within the context of nemo-platform.
# The `nemo_evaluator_sdk` package is the standalone evaluator that is used independently of nemo-platform.
# More details: https://jubilant-adventure-g4rv38m.pages.github.io/main/evaluator/#key-differences-from-standalone-library
async def submit_plugin_job_and_check_status() -> dict[str, object]:
    """Submit an exact-match job through the plugin SDK and report its status twice.

    The status is read once right after submission and again after
    ``wait_until_done`` returns. The platform client is closed whether or not the
    job calls succeed.

    Returns:
        A dict with ``job_name``, ``submitted_status`` and ``terminal_status``.
    """
    from nemo_evaluator.sdk import AsyncEvaluator
    from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager
    from nemo_platform import NeMoPlatform

    platform = NeMoPlatform(base_url="http://localhost:8080", workspace="default")
    plugin_evaluator: AsyncEvaluator = platform.evaluator

    try:
        submitted_job = await plugin_evaluator.submit(
            metric=ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.model_output}}"),
            dataset=[
                {"expected": "blue", "model_output": "blue"},
                {"expected": "Jupiter", "model_output": "Saturn"},
            ],
            config=RunConfig(parallelism=2, limit_samples=2),
            metric_bundle_packager=CloudpickleMetricBundlePackager(),
        )

        # First status read happens before waiting on the job.
        status_after_submit = await submitted_job.get_job_status()
        await submitted_job.wait_until_done(poll_interval_seconds=5)

        summary: dict[str, object] = {
            "job_name": submitted_job.name,
            "submitted_status": status_after_submit,
        }
        # Second status read happens after the wait has returned.
        summary["terminal_status"] = await submitted_job.get_job_status()
        return summary
    finally:
        platform.close()


# 2a) Support for Local & Remote mode eval
async def run_local_and_submit_remote_with_plugin_sdk() -> dict[str, object]:
    """Use plugin SDK ``run`` locally and ``submit`` for service-backed execution.

    The same metric, dataset and run configuration are used for both paths. The
    platform client is closed whether or not either call succeeds.

    Returns:
        A dict with ``local_result`` (the completed result of the local run),
        ``remote_job_name`` and ``remote_status`` (the submitted job's status
        read right after submission).
    """
    import inspect

    from nemo_evaluator.sdk import AsyncEvaluator
    from nemo_evaluator.shared.metric_bundles.cloudpickle import CloudpickleMetricBundlePackager
    from nemo_platform import NeMoPlatform

    dataset = [
        {"expected": "blue", "model_output": "blue"},
        {"expected": "Jupiter", "model_output": "Saturn"},
    ]
    metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.model_output}}")
    config = RunConfig(parallelism=2, limit_samples=2)

    client = NeMoPlatform(base_url="http://localhost:8080", workspace="default")
    evaluator_client: AsyncEvaluator = client.evaluator

    try:
        local_result = evaluator_client.run(metric=metric, dataset=dataset, config=config)
        # The client is annotated as AsyncEvaluator, so ``run`` may hand back an
        # awaitable. Resolve it here so callers get the finished result instead of
        # a pending coroutine object; a plain (sync) result passes through untouched.
        if inspect.isawaitable(local_result):
            local_result = await local_result

        remote_job = await evaluator_client.submit(
            metric=metric,
            dataset=dataset,
            config=config,
            metric_bundle_packager=CloudpickleMetricBundlePackager(),
        )
        return {
            "local_result": local_result,
            "remote_job_name": remote_job.name,
            "remote_status": await remote_job.get_job_status(),
        }
    finally:
        client.close()


# 3a) Runner independent of scorer
# 3b) Change in scorer should not impact runner
async def run_supplied_scorers() -> None:
    """Show that the runner simply executes whichever scorers it is handed.

    The scorer list is built separately and then passed to ``Evaluator().run``.
    The run's result is discarded.
    """
    # Scorer definition: swap this list to change what is measured.
    scorers = [ExactMatchMetric(reference="{{item.reference}}", candidate="{{item.actual}}")]

    # Runner invocation: unchanged regardless of the scorers above.
    runner = Evaluator()
    await runner.run(
        metrics=scorers,
        dataset=OFFLINE_SUITE_DATASET,
        config=RunConfig(parallelism=4),
    )


# 4a) Common interface for judge & python code based scorer
# 4b) Common schema for judge & python based scorer
def scorer_interface_examples() -> None:
    """Verify that the LLM-judge and pure-Python scorers both pass ``isinstance(..., Metric)``.

    Raises:
        TypeError: If either scorer is not recognised as a ``Metric``.
    """

    def _require_metric(candidate: object) -> None:
        # Guard shared by both checks; the message names the offending class.
        if isinstance(candidate, Metric):
            return
        raise TypeError(f"{type(candidate).__name__} does not implement Metric")

    # The judge is built and checked first, then the Python scorer.
    _require_metric(create_reference_free_judge_metric(MODEL))
    _require_metric(CustomPythonExactMatchMetric())


# 5a) Support a variety of scorer types: reference-based (ground-truth) metrics,
# rubric-based reference-free LLM-as-judge, deterministic code metrics
# All available metrics: https://github.com/NVIDIA-NeMo/nemo-platform/tree/main/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/metrics
def scorer_style_examples() -> dict[str, Metric]:
    """Build one concrete metric per scorer style, keyed by a short label.

    Returns:
        A dict with the keys ``exact_match``, ``f1``, ``llm_judge`` and
        ``custom_python``, inserted in that order.
    """
    examples: dict[str, Metric] = {}
    # Reference-based (ground-truth) metrics.
    examples["exact_match"] = ExactMatchMetric(reference="{{item.reference}}", candidate="{{item.actual}}")
    examples["f1"] = F1Metric(reference="{{item.reference}}", candidate="{{item.actual}}")
    # Rubric-based, reference-free LLM-as-judge.
    examples["llm_judge"] = create_reference_free_judge_metric(MODEL)
    # Deterministic code metric.
    examples["custom_python"] = CustomPythonExactMatchMetric()
    return examples


# 5b) Support a variety of scorer types: all scorer styles are expressible and runnable within one suite
async def run_mixed_scorer_suite() -> EvaluationSuiteResult:
    """Evaluate the offline dataset with every scorer style in a single run.

    Returns:
        The result returned by ``Evaluator().run`` for the metrics produced by
        ``scorer_style_examples()``.
    """
    mixed_metrics = [*scorer_style_examples().values()]
    runner = Evaluator()
    return await runner.run(
        metrics=mixed_metrics,
        dataset=OFFLINE_SUITE_DATASET,
        config=RunConfig(parallelism=4),
    )
Loading