From 6cece4827218516bd81b84242c22cc093f724eda Mon Sep 17 00:00:00 2001
From: David Gardner
Date: Fri, 1 May 2026 14:50:08 -0700
Subject: [PATCH 1/5] Add missing example install command

Signed-off-by: David Gardner
---
 docs/source/improve-workflows/evaluate.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/docs/source/improve-workflows/evaluate.md b/docs/source/improve-workflows/evaluate.md
index d695d813cf..fcbe1392d7 100644
--- a/docs/source/improve-workflows/evaluate.md
+++ b/docs/source/improve-workflows/evaluate.md
@@ -93,7 +93,12 @@ To evaluate a workflow, you can use the `nat eval` command. The `nat eval` comma
 
 Note: If you would like to set up visualization dashboards for this initial evaluation, please refer to the **Visualizing Evaluation Results** section below.
 
-To run and evaluate the simple example workflow, use the following command:
+To run and evaluate the simple web query example workflow, first install the example with:
+```bash
+uv pip install -e examples/evaluation_and_profiling/simple_web_query_eval
+```
+
+Then, use the following command:
 ```bash
 nat eval --config_file=examples/evaluation_and_profiling/simple_web_query_eval/configs/eval_config.yml
 ```

From 7e258933d9b794acf5b5b58c9c3c07b744361d9b Mon Sep 17 00:00:00 2001
From: David Gardner
Date: Mon, 4 May 2026 09:45:59 -0700
Subject: [PATCH 2/5] Raise an explicit value error when the input dataset is empty; this avoids a vague pandas error later

Signed-off-by: David Gardner
---
 .../nvidia_nat_eval/src/nat/plugins/eval/runtime/evaluate.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/evaluate.py b/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/evaluate.py
index 3ce4930812..0d28d43906 100644
--- a/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/evaluate.py
+++ b/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/evaluate.py
@@ -347,6 +347,9 @@ async def profile_workflow(self) -> ProfilerResults:
 
         all_stats = [item.trajectory for item in self.eval_input.eval_input_items]
 
+        if len(all_stats) == 0 or all(len(stats) == 0 for stats in all_stats):
+            raise ValueError("No trajectories found for profiling.")
+
         profiler_runner = ProfilerRunner(self.eval_config.general.profiler,
                                          self.eval_config.general.output_dir,
                                          write_output=self.config.write_output)

From b76f701a51fb054e7ab49f63ceaba324e96aa878 Mon Sep 17 00:00:00 2001
From: David Gardner
Date: Mon, 4 May 2026 09:54:16 -0700
Subject: [PATCH 3/5] Update yaml snippets to match what is currently in the source tree.

Update judge LLM leadership board to match Ragas and update the GitHub URL for Ragas
Update documentation to reflect that the default value for `cleanup` is true

Signed-off-by: David Gardner
---
 docs/source/improve-workflows/evaluate.md | 34 ++++++++++++++---------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/docs/source/improve-workflows/evaluate.md b/docs/source/improve-workflows/evaluate.md
index fcbe1392d7..b332b6ae80 100644
--- a/docs/source/improve-workflows/evaluate.md
+++ b/docs/source/improve-workflows/evaluate.md
@@ -114,7 +114,7 @@ If you encounter rate limiting (`[429] Too Many Requests`) during evaluation, yo
 llms:
   nim_rag_eval_llm:
     _type: nim
-    model_name: meta/llama-3.1-70b-instruct
+    model_name: nvidia/nemotron-3-nano
     max_tokens: 8
     base_url: http://localhost:8000/v1
 ```
@@ -124,11 +124,13 @@
 ## Understanding the Evaluation Configuration
 The `eval` section in the configuration file specifies the dataset and the evaluators to use. The following is an example of an `eval` section in a configuration file:
-`examples/evaluation_and_profiling/simple_web_query_eval/configs/eval_config.yml`:
+`examples/evaluation_and_profiling/simple_web_query_eval/configs/eval_config.yml` (some attributes have been omitted for brevity):
 ```yaml
 eval:
   general:
-    output_dir: ./.tmp/nat/examples/getting_started/simple_web_query/
+    output:
+      dir: ./.tmp/nat/examples/evaluation_and_profiling/simple_web_query_eval/eval/
+      cleanup: true
     dataset:
       _type: json
       file_path: examples/evaluation_and_profiling/simple_web_query_eval/data/langsmith.json
@@ -265,23 +267,30 @@ These metrics use a judge LLM for evaluating the generated output and retrieved
 llms:
   nim_rag_eval_llm:
     _type: nim
-    model_name: meta/llama-3.1-70b-instruct
+    model_name: nvidia/nemotron-3-nano-30b-a3b
     max_tokens: 8
+    chat_template_kwargs:
+      enable_thinking: false
 ```
 
 For these metrics, it is recommended to use 8 tokens for the judge LLM. The judge LLM returns a floating point score between 0 and 1 for each metric where 1.0 indicates a perfect match between the expected output and the generated output. Evaluation is dependent on the judge LLM's ability to accurately evaluate the generated output and retrieved context. This is the leadership board for the judge LLM:
 ```
   1)- nvidia/Llama-3_3-Nemotron-Super-49B-v1
   2)- mistralai/mixtral-8x22b-instruct-v0.1
   3)- mistralai/mixtral-8x7b-instruct-v0.1
   4)- meta/llama-3.1-70b-instruct
   5)- meta/llama-3.3-70b-instruct
+  6)- meta/llama-3.1-405b-instruct
+  7)- mistralai/mistral-nemo-12b-instruct
+  8)- nvidia/llama-3.1-nemotron-70b-instruct
+  9)- meta/llama-3.1-8b-instruct
+  10)- google/gemma-2-2b-it
 ```
 
-For a complete list of up-to-date judge LLMs, refer to the [Ragas NV metrics leadership board](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_nv_metrics.py)
+For a complete list of up-to-date judge LLMs, refer to the [Ragas NV metrics leadership board](https://github.com/vibrantlabsai/ragas/blob/v0.4.3/src/ragas/metrics/_nv_metrics.py)
 
-For more information on the prompt used by the judge LLM, refer to the [Ragas NV metrics](https://github.com/explodinggradients/ragas/blob/v0.2.14/src/ragas/metrics/_nv_metrics.py). The prompt for these metrics is not configurable. If you need a custom prompt, you can use the [Tunable RAG Evaluator](#tunable-rag-evaluator) or implement your own evaluator using the [Custom Evaluator](../extend/custom-components/custom-evaluator.md) documentation.
+For more information on the prompt used by the judge LLM, refer to the [Ragas NV metrics](https://github.com/vibrantlabsai/ragas/blob/v0.4.3/src/ragas/metrics/_nv_metrics.py). The prompt for these metrics is not configurable. If you need a custom prompt, you can use the [Tunable RAG Evaluator](#tunable-rag-evaluator) or implement your own evaluator using the [Custom Evaluator](../extend/custom-components/custom-evaluator.md) documentation.
 
 ### Trajectory Evaluator
 This evaluator uses the intermediate steps generated by the workflow to evaluate the workflow trajectory. The evaluator configuration includes the evaluator type and any additional parameters required by the evaluator.
@@ -351,7 +360,7 @@ eval:
 ```
 
 :::{note}
-If `cleanup` is set to `true`, the entire output directory will be removed after the evaluation is complete. This is useful for temporary evaluations where you don't need to retain the output files. Use this option with caution, as it will delete all evaluation results including workflow outputs and evaluator outputs.
+If `cleanup` is set to `true`, the entire output directory will be removed prior to performing the evaluation.
 :::
 
@@ -1238,7 +1247,6 @@ eval:
   output:
     dir: ./.tmp/nat/examples/simple_output/
     cleanup: true
 ```
-Output directory cleanup is disabled by default for easy troubleshooting.
 
 #### Job eviction from output directory
 When running multiple evaluations, especially with `append_job_id_to_output_dir` enabled, the output directory can accumulate a large number of job folders over time. You can control this growth using a job eviction policy.

From a99faed5ee3e8bfb7362b23d937adebff9366674 Mon Sep 17 00:00:00 2001
From: David Gardner
Date: Mon, 4 May 2026 11:31:32 -0700
Subject: [PATCH 4/5] When output is undefined, but output_dir is, instantiate EvalOutputConfig(dir=output_dir)

Signed-off-by: David Gardner
---
 .../src/nat/data_models/evaluate_config.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py b/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py
index f7b6a62c8a..7c74007d68 100644
--- a/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py
+++ b/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py
@@ -21,6 +21,8 @@
 from pydantic import BaseModel
 from pydantic import Discriminator
 from pydantic import Field
+from pydantic import ValidationInfo
+from pydantic import field_validator
 from pydantic import model_validator
 
 from nat.data_models.common import TypedBaseModel
@@ -132,12 +134,16 @@ class EvalGeneralConfig(BaseModel):
                                              "this creates a fresh workflow instance per eval item, resetting all stateful tools to their "
                                             "initial state. Set to False to disable this behavior.")
 
-    # overwrite the output_dir with the output config if present
+    # If output_dir is defined and output is not, define an EvalOutputConfig with output_dir as the dir
     @model_validator(mode="before")
     @classmethod
     def override_output_dir(cls, values):
-        if values.get("output") and values["output"].get("dir"):
-            values["output_dir"] = values["output"]["dir"]
+        output_config = values.get("output")
+        if output_config is None:
+            output_dir = values.get("output_dir")
+            if output_dir is not None:
+                values["output"] = EvalOutputConfig(dir=output_dir)
+
         return values
 
     @classmethod

From 48c0facecca76d190282512e3185505f4c8bf4b6 Mon Sep 17 00:00:00 2001
From: David Gardner
Date: Mon, 4 May 2026 11:56:06 -0700
Subject: [PATCH 5/5] Remove unused imports

Signed-off-by: David Gardner
---
 packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py b/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py
index 7c74007d68..7e2126640b 100644
--- a/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py
+++ b/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py
@@ -21,8 +21,6 @@
 from pydantic import BaseModel
 from pydantic import Discriminator
 from pydantic import Field
-from pydantic import ValidationInfo
-from pydantic import field_validator
 from pydantic import model_validator
 
 from nat.data_models.common import TypedBaseModel
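
The following sketch (not part of the patch series) illustrates the validator behavior introduced in PATCH 4/5 and kept by PATCH 5/5: a legacy configuration that only sets `output_dir` should come out of validation with the nested `output` section populated. It assumes the remaining `EvalGeneralConfig` fields have usable defaults and that the `nvidia_nat_core` package is installed so `nat.data_models.evaluate_config` is importable.

```python
# Illustrative sketch only; not part of the patches above.
# Assumes EvalGeneralConfig's other fields have defaults; the dir value may be
# coerced (for example to a pathlib.Path) depending on the field's annotation.
from nat.data_models.evaluate_config import EvalGeneralConfig

# Legacy-style input that predates the nested `output` config.
config = EvalGeneralConfig(output_dir="./.tmp/nat/eval_output")

# The mode="before" validator promotes output_dir into EvalOutputConfig(dir=...),
# so downstream code can rely on config.output being present.
assert config.output is not None
print(config.output.dir)  # expected to point at ./.tmp/nat/eval_output
```

Because the validator runs with `mode="before"`, the promotion happens on the raw input mapping, so it should apply to configurations loaded from YAML files as well as to models constructed directly in Python.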