NVIDIA · miyoungc · May 28, 2026
diff --git a/.agents/skills/nemoclaw-user-agent-skills/evals/evals.json b/.agents/skills/nemoclaw-user-agent-skills/evals/evals.json
@@ -3,18 +3,33 @@
     "id": "docs-resources-agent-skills-001",
     "question": "I'm looking at NemoClaw agent skills. Help me find a skill that can guide installation, policy, inference, or operations so I can delegate the right workflow to my AI coding assistant.",
     "expected_skill": "nemoclaw-user-agent-skills",
-    "ground_truth": "A NemoClaw-specific answer that helps the user find a skill that can guide installation, policy, inference, or operations and gives enough concrete guidance, decision criteria, verification steps, or risk framing to delegate the right workflow to my AI coding assistant."
+    "ground_truth": "A NemoClaw-specific answer that helps the user find a skill that can guide installation, policy, inference, or operations and gives enough concrete guidance, decision criteria, verification steps, or risk framing to delegate the right workflow to their AI coding assistant.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-agent-skills` skill.",
+      "Uses `references/agent-skills.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-resources-agent-skills-002",
     "question": "I'm choosing among multiple NemoClaw skills. Help me understand what each skill is designed to do so I can avoid using a broad assistant when a targeted skill exists.",
     "expected_skill": "nemoclaw-user-agent-skills",
-    "ground_truth": "A NemoClaw-specific answer that helps the user understand what each skill is designed to do and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid using a broad assistant when a targeted skill exists."
+    "ground_truth": "A NemoClaw-specific answer that helps the user understand what each skill is designed to do and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid using a broad assistant when a targeted skill exists.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-agent-skills` skill.",
+      "Uses `references/agent-skills.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-resources-agent-skills-003",
     "question": "I'm letting an agent follow NemoClaw-specific instructions. Help me see why the skill guidance is trustworthy and scoped so I can use agent assistance without losing operational control.",
     "expected_skill": "nemoclaw-user-agent-skills",
-    "ground_truth": "A NemoClaw-specific answer that helps the user see why the skill guidance is trustworthy and scoped and gives enough concrete guidance, decision criteria, verification steps, or risk framing to use agent assistance without losing operational control."
+    "ground_truth": "A NemoClaw-specific answer that helps the user see why the skill guidance is trustworthy and scoped and gives enough concrete guidance, decision criteria, verification steps, or risk framing to use agent assistance without losing operational control.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-agent-skills` skill.",
+      "Uses `references/agent-skills.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   }
 ]
diff --git a/.agents/skills/nemoclaw-user-configure-inference/evals/evals.json b/.agents/skills/nemoclaw-user-configure-inference/evals/evals.json
diff --git a/.agents/skills/nemoclaw-user-configure-security/evals/evals.json b/.agents/skills/nemoclaw-user-configure-security/evals/evals.json
@@ -3,54 +3,99 @@
     "id": "docs-security-best-practices-001",
     "question": "I'm evaluating NemoClaw security best practices. Help me understand the risk posture of each configurable control so I can justify the setup to my team or security reviewers.",
     "expected_skill": "nemoclaw-user-configure-security",
-    "ground_truth": "A NemoClaw-specific answer that helps the user understand the risk posture of each configurable control and gives enough concrete guidance, decision criteria, verification steps, or risk framing to justify the setup to my team or security reviewers."
+    "ground_truth": "A NemoClaw-specific answer that helps the user understand the risk posture of each configurable control and gives enough concrete guidance, decision criteria, verification steps, or risk framing to justify the setup to their team or security reviewers.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-security` skill.",
+      "Uses `references/best-practices.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-security-best-practices-002",
     "question": "I'm balancing developer convenience with lockdown. Help me compare the trade-offs of changing security controls so I can choose a posture that fits the environment.",
     "expected_skill": "nemoclaw-user-configure-security",
-    "ground_truth": "A NemoClaw-specific answer that helps the user compare the trade-offs of changing security controls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose a posture that fits the environment."
+    "ground_truth": "A NemoClaw-specific answer that helps the user compare the trade-offs of changing security controls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose a posture that fits the environment.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-security` skill.",
+      "Uses `references/best-practices.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-security-best-practices-003",
     "question": "I'm preparing for production-like use. Help me see which defaults are acceptable and which require changes so I can avoid shipping with accidental weak spots.",
     "expected_skill": "nemoclaw-user-configure-security",
-    "ground_truth": "A NemoClaw-specific answer that helps the user see which defaults are acceptable and which require changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid shipping with accidental weak spots."
+    "ground_truth": "A NemoClaw-specific answer that helps the user see which defaults are acceptable and which require changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid shipping with accidental weak spots.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-security` skill.",
+      "Uses `references/best-practices.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-security-credential-storage-001",
     "question": "I'm inspecting NemoClaw credential storage. Help me verify how secrets are stored and protected so I can decide whether the setup meets my secret-handling expectations.",
     "expected_skill": "nemoclaw-user-configure-security",
-    "ground_truth": "A NemoClaw-specific answer that helps the user verify how secrets are stored and protected and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether the setup meets my secret-handling expectations."
+    "ground_truth": "A NemoClaw-specific answer that helps the user verify how secrets are stored and protected and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether the setup meets their secret-handling expectations.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-security` skill.",
+      "Uses `references/credential-storage.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-security-credential-storage-002",
     "question": "I'm tracing where credentials live. Help me distinguish host, gateway, and sandbox storage boundaries so I can avoid assuming secrets are available in the wrong place.",
     "expected_skill": "nemoclaw-user-configure-security",
-    "ground_truth": "A NemoClaw-specific answer that helps the user distinguish host, gateway, and sandbox storage boundaries and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid assuming secrets are available in the wrong place."
+    "ground_truth": "A NemoClaw-specific answer that helps the user distinguish host, gateway, and sandbox storage boundaries and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid assuming secrets are available in the wrong place.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-security` skill.",
+      "Uses `references/credential-storage.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-security-credential-storage-003",
     "question": "I'm rotating or inspecting credentials. Help me follow a workflow that does not print secrets in logs or docs so I can recover or update access safely.",
     "expected_skill": "nemoclaw-user-configure-security",
-    "ground_truth": "A NemoClaw-specific answer that helps the user follow a workflow that does not print secrets in logs or docs and gives enough concrete guidance, decision criteria, verification steps, or risk framing to recover or update access safely."
+    "ground_truth": "A NemoClaw-specific answer that helps the user follow a workflow that does not print secrets in logs or docs and gives enough concrete guidance, decision criteria, verification steps, or risk framing to recover or update access safely.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-security` skill.",
+      "Uses `references/credential-storage.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-security-openclaw-controls-001",
     "question": "I'm reading about controls outside NemoClaw's scope. Help me understand which security responsibilities remain with OpenClaw so I can avoid treating sandbox isolation as a complete application security model.",
     "expected_skill": "nemoclaw-user-configure-security",
-    "ground_truth": "A NemoClaw-specific answer that helps the user understand which security responsibilities remain with OpenClaw and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid treating sandbox isolation as a complete application security model."
+    "ground_truth": "A NemoClaw-specific answer that helps the user understand which security responsibilities remain with OpenClaw and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid treating sandbox isolation as a complete application security model.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-security` skill.",
+      "Uses `references/openclaw-controls.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-security-openclaw-controls-002",
     "question": "I'm assessing application-layer agent risk. Help me identify the controls NemoClaw does not add so I can plan separate mitigations for authentication, prompt handling, and agent behavior.",
     "expected_skill": "nemoclaw-user-configure-security",
-    "ground_truth": "A NemoClaw-specific answer that helps the user identify the controls NemoClaw does not add and gives enough concrete guidance, decision criteria, verification steps, or risk framing to plan separate mitigations for authentication, prompt handling, and agent behavior."
+    "ground_truth": "A NemoClaw-specific answer that helps the user identify the controls NemoClaw does not add and gives enough concrete guidance, decision criteria, verification steps, or risk framing to plan separate mitigations for authentication, prompt handling, and agent behavior.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-security` skill.",
+      "Uses `references/openclaw-controls.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   },
   {
     "id": "docs-security-openclaw-controls-003",
     "question": "I'm documenting the security boundary. Help me explain where NemoClaw protection ends so I can set accurate expectations for reviewers and operators.",
     "expected_skill": "nemoclaw-user-configure-security",
-    "ground_truth": "A NemoClaw-specific answer that helps the user explain where NemoClaw protection ends and gives enough concrete guidance, decision criteria, verification steps, or risk framing to set accurate expectations for reviewers and operators."
+    "ground_truth": "A NemoClaw-specific answer that helps the user explain where NemoClaw protection ends and gives enough concrete guidance, decision criteria, verification steps, or risk framing to set accurate expectations for reviewers and operators.",
+    "expected_behavior": [
+      "Uses the `nemoclaw-user-configure-security` skill.",
+      "Uses `references/openclaw-controls.md` from that skill.",
+      "Answers the stated NemoClaw task directly."
+    ]
   }
 ]