From 07601e9538570689874f4241832b114b4a9c88f7 Mon Sep 17 00:00:00 2001
From: Miyoung Choi <miyoungc@nvidia.com>
Date: Thu, 28 May 2026 15:14:58 -0700
Subject: [PATCH] chore: skill eval fix batch 1

---
 .../evals/evals.json                          |  21 +++-
 .../evals/evals.json                          | 105 +++++++++++++++---
 .../evals/evals.json                          |  21 +++-
 .../evals/evals.json                          | 105 +++++++++++++++---
 4 files changed, 216 insertions(+), 36 deletions(-)

diff --git a/.agents/skills/nemoclaw-user-agent-skills/evals/evals.json b/.agents/skills/nemoclaw-user-agent-skills/evals/evals.json
index 6109419721..aaed26c0eb 100644
--- a/.agents/skills/nemoclaw-user-agent-skills/evals/evals.json
+++ b/.agents/skills/nemoclaw-user-agent-skills/evals/evals.json
@@ -3,18 +3,33 @@
     "id": "docs-resources-agent-skills-001",
     "question": "I'm looking at NemoClaw agent skills. Help me find a skill that can guide installation, policy, inference, or operations so I can delegate the right workflow to my AI coding assistant.",
     "expected_skill": "nemoclaw-user-agent-skills",
-    "ground_truth": "A NemoClaw-specific answer that helps the user find a skill that can guide installation, policy, inference, or operations and gives enough concrete guidance, decision criteria, verification steps, or risk framing to delegate the right workflow to my AI coding assistant."
+    "ground_truth": "A NemoClaw-specific answer that helps the user find a skill that can guide installation, policy, inference, or operations and gives enough concrete guidance, decision criteria, verification steps, or risk framing to delegate the right workflow to their AI coding assistant.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-agent-skills` skill.",
+      "Uses `references/agent-skills.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-resources-agent-skills-002",
     "question": "I'm choosing among multiple NemoClaw skills. Help me understand what each skill is designed to do so I can avoid using a broad assistant when a targeted skill exists.",
     "expected_skill": "nemoclaw-user-agent-skills",
-    "ground_truth": "A NemoClaw-specific answer that helps the user understand what each skill is designed to do and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid using a broad assistant when a targeted skill exists."
+    "ground_truth": "A NemoClaw-specific answer that helps the user understand what each skill is designed to do and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid using a broad assistant when a targeted skill exists.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-agent-skills` skill.",
+      "Uses `references/agent-skills.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-resources-agent-skills-003",
     "question": "I'm letting an agent follow NemoClaw-specific instructions. Help me see why the skill guidance is trustworthy and scoped so I can use agent assistance without losing operational control.",
     "expected_skill": "nemoclaw-user-agent-skills",
-    "ground_truth": "A NemoClaw-specific answer that helps the user see why the skill guidance is trustworthy and scoped and gives enough concrete guidance, decision criteria, verification steps, or risk framing to use agent assistance without losing operational control."
+    "ground_truth": "A NemoClaw-specific answer that helps the user see why the skill guidance is trustworthy and scoped and gives enough concrete guidance, decision criteria, verification steps, or risk framing to use agent assistance without losing operational control.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-agent-skills` skill.",
+      "Uses `references/agent-skills.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   }
 ]
diff --git a/.agents/skills/nemoclaw-user-configure-inference/evals/evals.json b/.agents/skills/nemoclaw-user-configure-inference/evals/evals.json
index a0bd47ac29..47d3ac8937 100644
--- a/.agents/skills/nemoclaw-user-configure-inference/evals/evals.json
+++ b/.agents/skills/nemoclaw-user-configure-inference/evals/evals.json
@@ -3,90 +3,165 @@
     "id": "docs-inference-inference-options-001",
     "question": "I'm choosing an inference option during onboarding. Help me compare hosted providers, local servers, and compatible endpoints so I can select a model path that fits my privacy, cost, and reliability needs.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user compare hosted providers, local servers, and compatible endpoints and gives enough concrete guidance, decision criteria, verification steps, or risk framing to select a model path that fits my privacy, cost, and reliability needs."
+    "ground_truth": "A NemoClaw-specific answer that helps the user compare hosted providers, local servers, and compatible endpoints and gives enough concrete guidance, decision criteria, verification steps, or risk framing to select a model path that fits their privacy, cost, and reliability needs.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/inference-options.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-inference-options-002",
     "question": "I'm preparing provider credentials. Help me know which provider capabilities and secrets onboarding requires so I can complete setup without avoidable credential errors.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user know which provider capabilities and secrets onboarding requires and gives enough concrete guidance, decision criteria, verification steps, or risk framing to complete setup without avoidable credential errors."
+    "ground_truth": "A NemoClaw-specific answer that helps the user know which provider capabilities and secrets onboarding requires and gives enough concrete guidance, decision criteria, verification steps, or risk framing to complete setup without avoidable credential errors.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/inference-options.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-inference-options-003",
     "question": "I'm evaluating routed inference. Help me understand how the sandbox calls models through the gateway so I can trust that model credentials stay outside the sandbox.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user understand how the sandbox calls models through the gateway and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust that model credentials stay outside the sandbox."
+    "ground_truth": "A NemoClaw-specific answer that helps the user understand how the sandbox calls models through the gateway and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust that model credentials stay outside the sandbox.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/inference-options.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-use-local-inference-001",
     "question": "I'm connecting a local inference server. Help me route NemoClaw model traffic to Ollama, vLLM, TensorRT-LLM, NIM, or another compatible endpoint so I can meet privacy, latency, or cost goals.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user route NemoClaw model traffic to Ollama, vLLM, TensorRT-LLM, NIM, or another compatible endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to meet privacy, latency, or cost goals."
+    "ground_truth": "A NemoClaw-specific answer that helps the user route NemoClaw model traffic to Ollama, vLLM, TensorRT-LLM, NIM, or another compatible endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to meet privacy, latency, or cost goals.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/use-local-inference-details.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-use-local-inference-002",
     "question": "I'm debugging local endpoint reachability. Help me separate NemoClaw routing issues from model-server issues so I can fix the right component first.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user separate NemoClaw routing issues from model-server issues and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix the right component first."
+    "ground_truth": "A NemoClaw-specific answer that helps the user separate NemoClaw routing issues from model-server issues and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix the right component first.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/use-local-inference-details.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-use-local-inference-003",
     "question": "I'm configuring traffic through `inference.local`. Help me understand the required host, port, and model settings so I can make sandboxed inference calls resolve to my local server.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user understand the required host, port, and model settings and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make sandboxed inference calls resolve to my local server."
+    "ground_truth": "A NemoClaw-specific answer that helps the user understand the required host, port, and model settings and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make sandboxed inference calls resolve to their local server.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/use-local-inference-details.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-switch-inference-providers-001",
     "question": "I'm switching inference models during a running session. Help me change model behavior without restarting the sandbox so I can adapt to task, cost, or reliability needs quickly.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user change model behavior without restarting the sandbox and gives enough concrete guidance, decision criteria, verification steps, or risk framing to adapt to task, cost, or reliability needs quickly."
+    "ground_truth": "A NemoClaw-specific answer that helps the user change model behavior without restarting the sandbox and gives enough concrete guidance, decision criteria, verification steps, or risk framing to adapt to task, cost, or reliability needs quickly.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/switch-inference-providers.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-switch-inference-providers-002",
     "question": "I'm confirming a runtime model change. Help me verify the agent is using the new active model so I can avoid mistaking host configuration changes for live routing changes.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user verify the agent is using the new active model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid mistaking host configuration changes for live routing changes."
+    "ground_truth": "A NemoClaw-specific answer that helps the user verify the agent is using the new active model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid mistaking host configuration changes for live routing changes.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/switch-inference-providers.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-switch-inference-providers-003",
     "question": "I'm trying a different model during active work. Help me know how to roll back to the previous model so I can experiment without disrupting the assistant workflow.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user know how to roll back to the previous model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to experiment without disrupting the assistant workflow."
+    "ground_truth": "A NemoClaw-specific answer that helps the user know how to roll back to the previous model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to experiment without disrupting the assistant workflow.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/switch-inference-providers.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-set-up-sub-agent-001",
     "question": "I'm configuring a task-specific sub-agent. Help me assign a specialized model to work the default agent should not handle so I can improve task fit without changing the whole assistant.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user assign a specialized model to work the default agent should not handle and gives enough concrete guidance, decision criteria, verification steps, or risk framing to improve task fit without changing the whole assistant."
+    "ground_truth": "A NemoClaw-specific answer that helps the user assign a specialized model to work the default agent should not handle and gives enough concrete guidance, decision criteria, verification steps, or risk framing to improve task fit without changing the whole assistant.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/set-up-sub-agent.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-set-up-sub-agent-002",
     "question": "I'm editing sub-agent model configuration. Help me understand where files, credentials, and workspace settings live so I can avoid leaking secrets or changing the wrong agent.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user understand where files, credentials, and workspace settings live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid leaking secrets or changing the wrong agent."
+    "ground_truth": "A NemoClaw-specific answer that helps the user understand where files, credentials, and workspace settings live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid leaking secrets or changing the wrong agent.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/set-up-sub-agent.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-set-up-sub-agent-003",
     "question": "I'm testing a new sub-agent. Help me send a prompt that exercises the intended routing so I can prove it uses the expected provider and model.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user send a prompt that exercises the intended routing and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prove it uses the expected provider and model."
+    "ground_truth": "A NemoClaw-specific answer that helps the user send a prompt that exercises the intended routing and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prove it uses the expected provider and model.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/set-up-sub-agent.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-tool-calling-reliability-001",
     "question": "I'm seeing tool calls leak as plain text. Help me diagnose whether the model, server, or parser is incompatible so I can restore reliable tool execution.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user diagnose whether the model, server, or parser is incompatible and gives enough concrete guidance, decision criteria, verification steps, or risk framing to restore reliable tool execution."
+    "ground_truth": "A NemoClaw-specific answer that helps the user diagnose whether the model, server, or parser is incompatible and gives enough concrete guidance, decision criteria, verification steps, or risk framing to restore reliable tool execution.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/tool-calling-reliability.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-tool-calling-reliability-002",
     "question": "I'm comparing local inference runtimes. Help me understand whether Ollama, vLLM, or parser settings better support tool calls so I can choose a runtime that matches the agent's tool needs.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user understand whether Ollama, vLLM, or parser settings better support tool calls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose a runtime that matches the agent's tool needs."
+    "ground_truth": "A NemoClaw-specific answer that helps the user understand whether Ollama, vLLM, or parser settings better support tool calls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose a runtime that matches the agent's tool needs.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/tool-calling-reliability.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-tool-calling-reliability-003",
     "question": "I'm letting an always-on assistant use tools unattended. Help me define the reliability bar for local tool calling so I can avoid silent failures or unsafe plain-text tool outputs.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user define the reliability bar for local tool calling and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid silent failures or unsafe plain-text tool outputs."
+    "ground_truth": "A NemoClaw-specific answer that helps the user define the reliability bar for local tool calling and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid silent failures or unsafe plain-text tool outputs.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/tool-calling-reliability.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   }
 ]
diff --git a/skills/nemoclaw-user-agent-skills/evals/evals.json b/skills/nemoclaw-user-agent-skills/evals/evals.json
index 6109419721..aaed26c0eb 100644
--- a/skills/nemoclaw-user-agent-skills/evals/evals.json
+++ b/skills/nemoclaw-user-agent-skills/evals/evals.json
@@ -3,18 +3,33 @@
     "id": "docs-resources-agent-skills-001",
     "question": "I'm looking at NemoClaw agent skills. Help me find a skill that can guide installation, policy, inference, or operations so I can delegate the right workflow to my AI coding assistant.",
     "expected_skill": "nemoclaw-user-agent-skills",
-    "ground_truth": "A NemoClaw-specific answer that helps the user find a skill that can guide installation, policy, inference, or operations and gives enough concrete guidance, decision criteria, verification steps, or risk framing to delegate the right workflow to my AI coding assistant."
+    "ground_truth": "A NemoClaw-specific answer that helps the user find a skill that can guide installation, policy, inference, or operations and gives enough concrete guidance, decision criteria, verification steps, or risk framing to delegate the right workflow to their AI coding assistant.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-agent-skills` skill.",
+      "Uses `references/agent-skills.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-resources-agent-skills-002",
     "question": "I'm choosing among multiple NemoClaw skills. Help me understand what each skill is designed to do so I can avoid using a broad assistant when a targeted skill exists.",
     "expected_skill": "nemoclaw-user-agent-skills",
-    "ground_truth": "A NemoClaw-specific answer that helps the user understand what each skill is designed to do and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid using a broad assistant when a targeted skill exists."
+    "ground_truth": "A NemoClaw-specific answer that helps the user understand what each skill is designed to do and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid using a broad assistant when a targeted skill exists.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-agent-skills` skill.",
+      "Uses `references/agent-skills.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-resources-agent-skills-003",
     "question": "I'm letting an agent follow NemoClaw-specific instructions. Help me see why the skill guidance is trustworthy and scoped so I can use agent assistance without losing operational control.",
     "expected_skill": "nemoclaw-user-agent-skills",
-    "ground_truth": "A NemoClaw-specific answer that helps the user see why the skill guidance is trustworthy and scoped and gives enough concrete guidance, decision criteria, verification steps, or risk framing to use agent assistance without losing operational control."
+    "ground_truth": "A NemoClaw-specific answer that helps the user see why the skill guidance is trustworthy and scoped and gives enough concrete guidance, decision criteria, verification steps, or risk framing to use agent assistance without losing operational control.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-agent-skills` skill.",
+      "Uses `references/agent-skills.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   }
 ]
diff --git a/skills/nemoclaw-user-configure-inference/evals/evals.json b/skills/nemoclaw-user-configure-inference/evals/evals.json
index a0bd47ac29..47d3ac8937 100644
--- a/skills/nemoclaw-user-configure-inference/evals/evals.json
+++ b/skills/nemoclaw-user-configure-inference/evals/evals.json
@@ -3,90 +3,165 @@
     "id": "docs-inference-inference-options-001",
     "question": "I'm choosing an inference option during onboarding. Help me compare hosted providers, local servers, and compatible endpoints so I can select a model path that fits my privacy, cost, and reliability needs.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user compare hosted providers, local servers, and compatible endpoints and gives enough concrete guidance, decision criteria, verification steps, or risk framing to select a model path that fits my privacy, cost, and reliability needs."
+    "ground_truth": "A NemoClaw-specific answer that helps the user compare hosted providers, local servers, and compatible endpoints and gives enough concrete guidance, decision criteria, verification steps, or risk framing to select a model path that fits their privacy, cost, and reliability needs.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/inference-options.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-inference-options-002",
     "question": "I'm preparing provider credentials. Help me know which provider capabilities and secrets onboarding requires so I can complete setup without avoidable credential errors.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user know which provider capabilities and secrets onboarding requires and gives enough concrete guidance, decision criteria, verification steps, or risk framing to complete setup without avoidable credential errors."
+    "ground_truth": "A NemoClaw-specific answer that helps the user know which provider capabilities and secrets onboarding requires and gives enough concrete guidance, decision criteria, verification steps, or risk framing to complete setup without avoidable credential errors.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/inference-options.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-inference-options-003",
     "question": "I'm evaluating routed inference. Help me understand how the sandbox calls models through the gateway so I can trust that model credentials stay outside the sandbox.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user understand how the sandbox calls models through the gateway and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust that model credentials stay outside the sandbox."
+    "ground_truth": "A NemoClaw-specific answer that helps the user understand how the sandbox calls models through the gateway and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust that model credentials stay outside the sandbox.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/inference-options.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-use-local-inference-001",
     "question": "I'm connecting a local inference server. Help me route NemoClaw model traffic to Ollama, vLLM, TensorRT-LLM, NIM, or another compatible endpoint so I can meet privacy, latency, or cost goals.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user route NemoClaw model traffic to Ollama, vLLM, TensorRT-LLM, NIM, or another compatible endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to meet privacy, latency, or cost goals."
+    "ground_truth": "A NemoClaw-specific answer that helps the user route NemoClaw model traffic to Ollama, vLLM, TensorRT-LLM, NIM, or another compatible endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to meet privacy, latency, or cost goals.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/use-local-inference-details.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-use-local-inference-002",
     "question": "I'm debugging local endpoint reachability. Help me separate NemoClaw routing issues from model-server issues so I can fix the right component first.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user separate NemoClaw routing issues from model-server issues and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix the right component first."
+    "ground_truth": "A NemoClaw-specific answer that helps the user separate NemoClaw routing issues from model-server issues and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix the right component first.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/use-local-inference-details.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-use-local-inference-003",
     "question": "I'm configuring traffic through `inference.local`. Help me understand the required host, port, and model settings so I can make sandboxed inference calls resolve to my local server.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user understand the required host, port, and model settings and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make sandboxed inference calls resolve to my local server."
+    "ground_truth": "A NemoClaw-specific answer that helps the user understand the required host, port, and model settings and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make sandboxed inference calls resolve to their local server.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/use-local-inference-details.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-switch-inference-providers-001",
     "question": "I'm switching inference models during a running session. Help me change model behavior without restarting the sandbox so I can adapt to task, cost, or reliability needs quickly.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user change model behavior without restarting the sandbox and gives enough concrete guidance, decision criteria, verification steps, or risk framing to adapt to task, cost, or reliability needs quickly."
+    "ground_truth": "A NemoClaw-specific answer that helps the user change model behavior without restarting the sandbox and gives enough concrete guidance, decision criteria, verification steps, or risk framing to adapt to task, cost, or reliability needs quickly.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/switch-inference-providers.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-switch-inference-providers-002",
     "question": "I'm confirming a runtime model change. Help me verify the agent is using the new active model so I can avoid mistaking host configuration changes for live routing changes.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user verify the agent is using the new active model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid mistaking host configuration changes for live routing changes."
+    "ground_truth": "A NemoClaw-specific answer that helps the user verify the agent is using the new active model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid mistaking host configuration changes for live routing changes.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/switch-inference-providers.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-switch-inference-providers-003",
     "question": "I'm trying a different model during active work. Help me know how to roll back to the previous model so I can experiment without disrupting the assistant workflow.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user know how to roll back to the previous model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to experiment without disrupting the assistant workflow."
+    "ground_truth": "A NemoClaw-specific answer that helps the user know how to roll back to the previous model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to experiment without disrupting the assistant workflow.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/switch-inference-providers.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-set-up-sub-agent-001",
     "question": "I'm configuring a task-specific sub-agent. Help me assign a specialized model to work the default agent should not handle so I can improve task fit without changing the whole assistant.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user assign a specialized model to work the default agent should not handle and gives enough concrete guidance, decision criteria, verification steps, or risk framing to improve task fit without changing the whole assistant."
+    "ground_truth": "A NemoClaw-specific answer that helps the user assign a specialized model to work the default agent should not handle and gives enough concrete guidance, decision criteria, verification steps, or risk framing to improve task fit without changing the whole assistant.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/set-up-sub-agent.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-set-up-sub-agent-002",
     "question": "I'm editing sub-agent model configuration. Help me understand where files, credentials, and workspace settings live so I can avoid leaking secrets or changing the wrong agent.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user understand where files, credentials, and workspace settings live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid leaking secrets or changing the wrong agent."
+    "ground_truth": "A NemoClaw-specific answer that helps the user understand where files, credentials, and workspace settings live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid leaking secrets or changing the wrong agent.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/set-up-sub-agent.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-set-up-sub-agent-003",
     "question": "I'm testing a new sub-agent. Help me send a prompt that exercises the intended routing so I can prove it uses the expected provider and model.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user send a prompt that exercises the intended routing and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prove it uses the expected provider and model."
+    "ground_truth": "A NemoClaw-specific answer that helps the user send a prompt that exercises the intended routing and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prove it uses the expected provider and model.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/set-up-sub-agent.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-tool-calling-reliability-001",
     "question": "I'm seeing tool calls leak as plain text. Help me diagnose whether the model, server, or parser is incompatible so I can restore reliable tool execution.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user diagnose whether the model, server, or parser is incompatible and gives enough concrete guidance, decision criteria, verification steps, or risk framing to restore reliable tool execution."
+    "ground_truth": "A NemoClaw-specific answer that helps the user diagnose whether the model, server, or parser is incompatible and gives enough concrete guidance, decision criteria, verification steps, or risk framing to restore reliable tool execution.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/tool-calling-reliability.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-tool-calling-reliability-002",
     "question": "I'm comparing local inference runtimes. Help me understand whether Ollama, vLLM, or parser settings better support tool calls so I can choose a runtime that matches the agent's tool needs.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user understand whether Ollama, vLLM, or parser settings better support tool calls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose a runtime that matches the agent's tool needs."
+    "ground_truth": "A NemoClaw-specific answer that helps the user understand whether Ollama, vLLM, or parser settings better support tool calls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose a runtime that matches the agent's tool needs.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/tool-calling-reliability.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-inference-tool-calling-reliability-003",
     "question": "I'm letting an always-on assistant use tools unattended. Help me define the reliability bar for local tool calling so I can avoid silent failures or unsafe plain-text tool outputs.",
     "expected_skill": "nemoclaw-user-configure-inference",
-    "ground_truth": "A NemoClaw-specific answer that helps the user define the reliability bar for local tool calling and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid silent failures or unsafe plain-text tool outputs."
+    "ground_truth": "A NemoClaw-specific answer that helps the user define the reliability bar for local tool calling and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid silent failures or unsafe plain-text tool outputs.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-inference` skill.",
+      "Uses `references/tool-calling-reliability.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   }
 ]