From 6cece4827218516bd81b84242c22cc093f724eda Mon Sep 17 00:00:00 2001
From: David Gardner
Date: Fri, 1 May 2026 14:50:08 -0700
Subject: [PATCH 1/5] Add missing example install command

Signed-off-by: David Gardner
---
 docs/source/improve-workflows/evaluate.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/docs/source/improve-workflows/evaluate.md b/docs/source/improve-workflows/evaluate.md
index d695d813cf..fcbe1392d7 100644
--- a/docs/source/improve-workflows/evaluate.md
+++ b/docs/source/improve-workflows/evaluate.md
@@ -93,7 +93,12 @@ To evaluate a workflow, you can use the `nat eval` command. The `nat eval` comma
 
 Note: If you would like to set up visualization dashboards for this initial evaluation, please refer to the **Visualizing Evaluation Results** section below.
 
-To run and evaluate the simple example workflow, use the following command:
+To run and evaluate the simple web query example workflow, first install the example with:
+```bash
+uv pip install -e examples/evaluation_and_profiling/simple_web_query_eval
+```
+
+Then, use the following command:
 ```bash
 nat eval --config_file=examples/evaluation_and_profiling/simple_web_query_eval/configs/eval_config.yml
 ```

From 7e258933d9b794acf5b5b58c9c3c07b744361d9b Mon Sep 17 00:00:00 2001
From: David Gardner
Date: Mon, 4 May 2026 09:45:59 -0700
Subject: [PATCH 2/5] Raise an explicit value error when the input dataset is empty; this avoids a vague pandas error later

Signed-off-by: David Gardner
---
 .../nvidia_nat_eval/src/nat/plugins/eval/runtime/evaluate.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/evaluate.py b/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/evaluate.py
index 3ce4930812..0d28d43906 100644
--- a/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/evaluate.py
+++ b/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/evaluate.py
@@ -347,6 +347,9 @@ async def profile_workflow(self) -> ProfilerResults:
 
         all_stats = [item.trajectory for item in self.eval_input.eval_input_items]
 
+        if len(all_stats) == 0 or all(len(stats) == 0 for stats in all_stats):
+            raise ValueError("No trajectories found for profiling.")
+
         profiler_runner = ProfilerRunner(self.eval_config.general.profiler,
                                          self.eval_config.general.output_dir,
                                          write_output=self.config.write_output)

From b76f701a51fb054e7ab49f63ceaba324e96aa878 Mon Sep 17 00:00:00 2001
From: David Gardner
Date: Mon, 4 May 2026 09:54:16 -0700
Subject: [PATCH 3/5] Update yaml snippets to match what is currently in the source tree.

Update judge LLM leadership board to match Ragas and update the GitHub URL for Ragas
Update documentation to reflect that the default value for `cleanup` is true

Signed-off-by: David Gardner
---
 docs/source/improve-workflows/evaluate.md | 34 ++++++++++++++---------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/docs/source/improve-workflows/evaluate.md b/docs/source/improve-workflows/evaluate.md
index fcbe1392d7..b332b6ae80 100644
--- a/docs/source/improve-workflows/evaluate.md
+++ b/docs/source/improve-workflows/evaluate.md
@@ -114,7 +114,7 @@ If you encounter rate limiting (`[429] Too Many Requests`) during evaluation, yo
 llms:
   nim_rag_eval_llm:
     _type: nim
-    model_name: meta/llama-3.1-70b-instruct
+    model_name: nvidia/nemotron-3-nano
     max_tokens: 8
     base_url: http://localhost:8000/v1
 ```
@@ -124,11 +124,13 @@
 ## Understanding the Evaluation Configuration
 The `eval` section in the configuration file specifies the dataset and the evaluators to use. The following is an example of an `eval` section in a configuration file:
-`examples/evaluation_and_profiling/simple_web_query_eval/configs/eval_config.yml`:
+`examples/evaluation_and_profiling/simple_web_query_eval/configs/eval_config.yml` (some attributes have been omitted for brevity):
 ```yaml
 eval:
   general:
-    output_dir: ./.tmp/nat/examples/getting_started/simple_web_query/
+    output:
+      dir: ./.tmp/nat/examples/evaluation_and_profiling/simple_web_query_eval/eval/
+      cleanup: true
     dataset:
       _type: json
       file_path: examples/evaluation_and_profiling/simple_web_query_eval/data/langsmith.json
@@ -265,23 +267,30 @@ These metrics use a judge LLM for evaluating the generated output and retrieved
 llms:
   nim_rag_eval_llm:
     _type: nim
-    model_name: meta/llama-3.1-70b-instruct
+    model_name: nvidia/nemotron-3-nano-30b-a3b
     max_tokens: 8
+    chat_template_kwargs:
+      enable_thinking: false
 ```
 
 For these metrics, it is recommended to use 8 tokens for the judge LLM. The judge LLM returns a floating point score between 0 and 1 for each metric where 1.0 indicates a perfect match between the expected output and the generated output. Evaluation is dependent on the judge LLM's ability to accurately evaluate the generated output and retrieved context. This is the leadership board for the judge LLM:
 ```
   1)- nvidia/Llama-3_3-Nemotron-Super-49B-v1
   2)- mistralai/mixtral-8x22b-instruct-v0.1
   3)- mistralai/mixtral-8x7b-instruct-v0.1
   4)- meta/llama-3.1-70b-instruct
   5)- meta/llama-3.3-70b-instruct
+  6)- meta/llama-3.1-405b-instruct
+  7)- mistralai/mistral-nemo-12b-instruct
+  8)- nvidia/llama-3.1-nemotron-70b-instruct
+  9)- meta/llama-3.1-8b-instruct
+  10)- google/gemma-2-2b-it
 ```
 
-For a complete list of up-to-date judge LLMs, refer to the [Ragas NV metrics leadership board](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_nv_metrics.py)
+For a complete list of up-to-date judge LLMs, refer to the [Ragas NV metrics leadership board](https://github.com/vibrantlabsai/ragas/blob/v0.4.3/src/ragas/metrics/_nv_metrics.py)
 
-For more information on the prompt used by the judge LLM, refer to the [Ragas NV metrics](https://github.com/explodinggradients/ragas/blob/v0.2.14/src/ragas/metrics/_nv_metrics.py). The prompt for these metrics is not configurable. If you need a custom prompt, you can use the [Tunable RAG Evaluator](#tunable-rag-evaluator) or implement your own evaluator using the [Custom Evaluator](../extend/custom-components/custom-evaluator.md) documentation.
+For more information on the prompt used by the judge LLM, refer to the [Ragas NV metrics](https://github.com/vibrantlabsai/ragas/blob/v0.4.3/src/ragas/metrics/_nv_metrics.py). The prompt for these metrics is not configurable. If you need a custom prompt, you can use the [Tunable RAG Evaluator](#tunable-rag-evaluator) or implement your own evaluator using the [Custom Evaluator](../extend/custom-components/custom-evaluator.md) documentation.
 
 ### Trajectory Evaluator
 This evaluator uses the intermediate steps generated by the workflow to evaluate the workflow trajectory. The evaluator configuration includes the evaluator type and any additional parameters required by the evaluator.
@@ -351,7 +360,7 @@ eval:
 ```
 
 :::{note}
-If `cleanup` is set to `true`, the entire output directory will be removed after the evaluation is complete. This is useful for temporary evaluations where you don't need to retain the output files. Use this option with caution, as it will delete all evaluation results including workflow outputs and evaluator outputs.
+If `cleanup` is set to `true`, the entire output directory will be removed prior to performing the evaluation.
 :::
 
@@ -1238,7 +1247,6 @@ eval:
   output:
     dir: ./.tmp/nat/examples/simple_output/
     cleanup: true
 ```
-Output directory cleanup is disabled by default for easy troubleshooting.
 
 #### Job eviction from output directory
 When running multiple evaluations, especially with `append_job_id_to_output_dir` enabled, the output directory can accumulate a large number of job folders over time. You can control this growth using a job eviction policy.

From a99faed5ee3e8bfb7362b23d937adebff9366674 Mon Sep 17 00:00:00 2001
From: David Gardner
Date: Mon, 4 May 2026 11:31:32 -0700
Subject: [PATCH 4/5] When output is undefined, but output_dir is, instantiate EvalOutputConfig(dir=output_dir)

Signed-off-by: David Gardner
---
 .../src/nat/data_models/evaluate_config.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py b/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py
index f7b6a62c8a..7c74007d68 100644
--- a/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py
+++ b/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py
@@ -21,6 +21,8 @@
 from pydantic import BaseModel
 from pydantic import Discriminator
 from pydantic import Field
+from pydantic import ValidationInfo
+from pydantic import field_validator
 from pydantic import model_validator
 
 from nat.data_models.common import TypedBaseModel
@@ -132,12 +134,16 @@ class EvalGeneralConfig(BaseModel):
                                              "this creates a fresh workflow instance per eval item, resetting all stateful tools to their "
                                             "initial state. Set to False to disable this behavior.")
 
-    # overwrite the output_dir with the output config if present
+    # If output_dir is defined and output is not, define an EvalOutputConfig with output_dir as the dir
     @model_validator(mode="before")
     @classmethod
     def override_output_dir(cls, values):
-        if values.get("output") and values["output"].get("dir"):
-            values["output_dir"] = values["output"]["dir"]
+        output_config = values.get("output")
+        if output_config is None:
+            output_dir = values.get("output_dir")
+            if output_dir is not None:
+                values["output"] = EvalOutputConfig(dir=output_dir)
+
         return values
 
     @classmethod

From 48c0facecca76d190282512e3185505f4c8bf4b6 Mon Sep 17 00:00:00 2001
From: David Gardner
Date: Mon, 4 May 2026 11:56:06 -0700
Subject: [PATCH 5/5] Remove unused imports

Signed-off-by: David Gardner
---
 packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py b/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py
index 7c74007d68..7e2126640b 100644
--- a/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py
+++ b/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py
@@ -21,8 +21,6 @@
 from pydantic import BaseModel
 from pydantic import Discriminator
 from pydantic import Field
-from pydantic import ValidationInfo
-from pydantic import field_validator
 from pydantic import model_validator
 
 from nat.data_models.common import TypedBaseModel
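
The following sketch (not part of the patch series) illustrates the validator behavior introduced in PATCH 4/5 and kept by PATCH 5/5: a legacy configuration that only sets `output_dir` should come out of validation with the nested `output` section populated. It assumes the remaining `EvalGeneralConfig` fields have usable defaults and that the `nvidia_nat_core` package is installed so `nat.data_models.evaluate_config` is importable.

```python
# Illustrative sketch only; not part of the patches above.
# Assumes EvalGeneralConfig's other fields have defaults; the dir value may be
# coerced (for example to a pathlib.Path) depending on the field's annotation.
from nat.data_models.evaluate_config import EvalGeneralConfig

# Legacy-style input that predates the nested `output` config.
config = EvalGeneralConfig(output_dir="./.tmp/nat/eval_output")

# The mode="before" validator promotes output_dir into EvalOutputConfig(dir=...),
# so downstream code can rely on config.output being present.
assert config.output is not None
print(config.output.dir)  # expected to point at ./.tmp/nat/eval_output
```

Because the validator runs with `mode="before"`, the promotion happens on the raw input mapping, so it should apply to configurations loaded from YAML files as well as to models constructed directly in Python.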