From 8bc92c54f95ff0f0f890520f160627b5487da282 Mon Sep 17 00:00:00 2001 From: Miyoung Choi Date: Thu, 28 May 2026 14:49:39 -0700 Subject: [PATCH] chore: fix for eval schema validation --- .../evals/evals.json | 21 +++- .../evals/evals.json | 105 +++++++++++++++--- .../evals/evals.json | 63 +++++++++-- .../evals/evals.json | 84 ++++++++++++-- .../evals/evals.json | 84 ++++++++++++-- .../evals/evals.json | 63 +++++++++-- .../evals/evals.json | 105 +++++++++++++++--- .../evals/evals.json | 21 +++- .../nemoclaw-user-overview/evals/evals.json | 105 +++++++++++++++--- .../nemoclaw-user-reference/evals/evals.json | 105 +++++++++++++++--- .../evals/evals.json | 21 +++- .../evals/evals.json | 105 +++++++++++++++--- .../evals/evals.json | 63 +++++++++-- .../evals/evals.json | 84 ++++++++++++-- .../evals/evals.json | 84 ++++++++++++-- .../evals/evals.json | 63 +++++++++-- .../evals/evals.json | 105 +++++++++++++++--- .../evals/evals.json | 21 +++- .../nemoclaw-user-overview/evals/evals.json | 105 +++++++++++++++--- .../nemoclaw-user-reference/evals/evals.json | 105 +++++++++++++++--- 20 files changed, 1296 insertions(+), 216 deletions(-) diff --git a/.agents/skills/nemoclaw-user-agent-skills/evals/evals.json b/.agents/skills/nemoclaw-user-agent-skills/evals/evals.json index 6109419721..aaed26c0eb 100644 --- a/.agents/skills/nemoclaw-user-agent-skills/evals/evals.json +++ b/.agents/skills/nemoclaw-user-agent-skills/evals/evals.json @@ -3,18 +3,33 @@ "id": "docs-resources-agent-skills-001", "question": "I'm looking at NemoClaw agent skills. Help me find a skill that can guide installation, policy, inference, or operations so I can delegate the right workflow to my AI coding assistant.", "expected_skill": "nemoclaw-user-agent-skills", - "ground_truth": "A NemoClaw-specific answer that helps the user find a skill that can guide installation, policy, inference, or operations and gives enough concrete guidance, decision criteria, verification steps, or risk framing to delegate the right workflow to my AI coding assistant." + "ground_truth": "A NemoClaw-specific answer that helps the user find a skill that can guide installation, policy, inference, or operations and gives enough concrete guidance, decision criteria, verification steps, or risk framing to delegate the right workflow to my AI coding assistant.", + "expected_behavior": [ + "Uses the `nemoclaw-user-agent-skills` skill.", + "Uses `references/agent-skills.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-resources-agent-skills-002", "question": "I'm choosing among multiple NemoClaw skills. Help me understand what each skill is designed to do so I can avoid using a broad assistant when a targeted skill exists.", "expected_skill": "nemoclaw-user-agent-skills", - "ground_truth": "A NemoClaw-specific answer that helps the user understand what each skill is designed to do and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid using a broad assistant when a targeted skill exists." + "ground_truth": "A NemoClaw-specific answer that helps the user understand what each skill is designed to do and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid using a broad assistant when a targeted skill exists.", + "expected_behavior": [ + "Uses the `nemoclaw-user-agent-skills` skill.", + "Uses `references/agent-skills.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-resources-agent-skills-003", "question": "I'm letting an agent follow NemoClaw-specific instructions. Help me see why the skill guidance is trustworthy and scoped so I can use agent assistance without losing operational control.", "expected_skill": "nemoclaw-user-agent-skills", - "ground_truth": "A NemoClaw-specific answer that helps the user see why the skill guidance is trustworthy and scoped and gives enough concrete guidance, decision criteria, verification steps, or risk framing to use agent assistance without losing operational control." + "ground_truth": "A NemoClaw-specific answer that helps the user see why the skill guidance is trustworthy and scoped and gives enough concrete guidance, decision criteria, verification steps, or risk framing to use agent assistance without losing operational control.", + "expected_behavior": [ + "Uses the `nemoclaw-user-agent-skills` skill.", + "Uses `references/agent-skills.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/.agents/skills/nemoclaw-user-configure-inference/evals/evals.json b/.agents/skills/nemoclaw-user-configure-inference/evals/evals.json index a0bd47ac29..47d3ac8937 100644 --- a/.agents/skills/nemoclaw-user-configure-inference/evals/evals.json +++ b/.agents/skills/nemoclaw-user-configure-inference/evals/evals.json @@ -3,90 +3,165 @@ "id": "docs-inference-inference-options-001", "question": "I'm choosing an inference option during onboarding. Help me compare hosted providers, local servers, and compatible endpoints so I can select a model path that fits my privacy, cost, and reliability needs.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user compare hosted providers, local servers, and compatible endpoints and gives enough concrete guidance, decision criteria, verification steps, or risk framing to select a model path that fits my privacy, cost, and reliability needs." + "ground_truth": "A NemoClaw-specific answer that helps the user compare hosted providers, local servers, and compatible endpoints and gives enough concrete guidance, decision criteria, verification steps, or risk framing to select a model path that fits my privacy, cost, and reliability needs.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/inference-options.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-inference-options-002", "question": "I'm preparing provider credentials. Help me know which provider capabilities and secrets onboarding requires so I can complete setup without avoidable credential errors.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user know which provider capabilities and secrets onboarding requires and gives enough concrete guidance, decision criteria, verification steps, or risk framing to complete setup without avoidable credential errors." + "ground_truth": "A NemoClaw-specific answer that helps the user know which provider capabilities and secrets onboarding requires and gives enough concrete guidance, decision criteria, verification steps, or risk framing to complete setup without avoidable credential errors.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/inference-options.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-inference-options-003", "question": "I'm evaluating routed inference. Help me understand how the sandbox calls models through the gateway so I can trust that model credentials stay outside the sandbox.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand how the sandbox calls models through the gateway and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust that model credentials stay outside the sandbox." + "ground_truth": "A NemoClaw-specific answer that helps the user understand how the sandbox calls models through the gateway and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust that model credentials stay outside the sandbox.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/inference-options.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-use-local-inference-001", "question": "I'm connecting a local inference server. Help me route NemoClaw model traffic to Ollama, vLLM, TensorRT-LLM, NIM, or another compatible endpoint so I can meet privacy, latency, or cost goals.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user route NemoClaw model traffic to Ollama, vLLM, TensorRT-LLM, NIM, or another compatible endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to meet privacy, latency, or cost goals." + "ground_truth": "A NemoClaw-specific answer that helps the user route NemoClaw model traffic to Ollama, vLLM, TensorRT-LLM, NIM, or another compatible endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to meet privacy, latency, or cost goals.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/use-local-inference-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-use-local-inference-002", "question": "I'm debugging local endpoint reachability. Help me separate NemoClaw routing issues from model-server issues so I can fix the right component first.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user separate NemoClaw routing issues from model-server issues and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix the right component first." + "ground_truth": "A NemoClaw-specific answer that helps the user separate NemoClaw routing issues from model-server issues and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix the right component first.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/use-local-inference-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-use-local-inference-003", "question": "I'm configuring traffic through `inference.local`. Help me understand the required host, port, and model settings so I can make sandboxed inference calls resolve to my local server.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand the required host, port, and model settings and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make sandboxed inference calls resolve to my local server." + "ground_truth": "A NemoClaw-specific answer that helps the user understand the required host, port, and model settings and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make sandboxed inference calls resolve to my local server.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/use-local-inference-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-switch-inference-providers-001", "question": "I'm switching inference models during a running session. Help me change model behavior without restarting the sandbox so I can adapt to task, cost, or reliability needs quickly.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user change model behavior without restarting the sandbox and gives enough concrete guidance, decision criteria, verification steps, or risk framing to adapt to task, cost, or reliability needs quickly." + "ground_truth": "A NemoClaw-specific answer that helps the user change model behavior without restarting the sandbox and gives enough concrete guidance, decision criteria, verification steps, or risk framing to adapt to task, cost, or reliability needs quickly.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/switch-inference-providers.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-switch-inference-providers-002", "question": "I'm confirming a runtime model change. Help me verify the agent is using the new active model so I can avoid mistaking host configuration changes for live routing changes.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user verify the agent is using the new active model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid mistaking host configuration changes for live routing changes." + "ground_truth": "A NemoClaw-specific answer that helps the user verify the agent is using the new active model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid mistaking host configuration changes for live routing changes.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/switch-inference-providers.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-switch-inference-providers-003", "question": "I'm trying a different model during active work. Help me know how to roll back to the previous model so I can experiment without disrupting the assistant workflow.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user know how to roll back to the previous model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to experiment without disrupting the assistant workflow." + "ground_truth": "A NemoClaw-specific answer that helps the user know how to roll back to the previous model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to experiment without disrupting the assistant workflow.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/switch-inference-providers.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-set-up-sub-agent-001", "question": "I'm configuring a task-specific sub-agent. Help me assign a specialized model to work the default agent should not handle so I can improve task fit without changing the whole assistant.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user assign a specialized model to work the default agent should not handle and gives enough concrete guidance, decision criteria, verification steps, or risk framing to improve task fit without changing the whole assistant." + "ground_truth": "A NemoClaw-specific answer that helps the user assign a specialized model to work the default agent should not handle and gives enough concrete guidance, decision criteria, verification steps, or risk framing to improve task fit without changing the whole assistant.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/set-up-sub-agent.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-set-up-sub-agent-002", "question": "I'm editing sub-agent model configuration. Help me understand where files, credentials, and workspace settings live so I can avoid leaking secrets or changing the wrong agent.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand where files, credentials, and workspace settings live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid leaking secrets or changing the wrong agent." + "ground_truth": "A NemoClaw-specific answer that helps the user understand where files, credentials, and workspace settings live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid leaking secrets or changing the wrong agent.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/set-up-sub-agent.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-set-up-sub-agent-003", "question": "I'm testing a new sub-agent. Help me send a prompt that exercises the intended routing so I can prove it uses the expected provider and model.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user send a prompt that exercises the intended routing and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prove it uses the expected provider and model." + "ground_truth": "A NemoClaw-specific answer that helps the user send a prompt that exercises the intended routing and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prove it uses the expected provider and model.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/set-up-sub-agent.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-tool-calling-reliability-001", "question": "I'm seeing tool calls leak as plain text. Help me diagnose whether the model, server, or parser is incompatible so I can restore reliable tool execution.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user diagnose whether the model, server, or parser is incompatible and gives enough concrete guidance, decision criteria, verification steps, or risk framing to restore reliable tool execution." + "ground_truth": "A NemoClaw-specific answer that helps the user diagnose whether the model, server, or parser is incompatible and gives enough concrete guidance, decision criteria, verification steps, or risk framing to restore reliable tool execution.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/tool-calling-reliability.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-tool-calling-reliability-002", "question": "I'm comparing local inference runtimes. Help me understand whether Ollama, vLLM, or parser settings better support tool calls so I can choose a runtime that matches the agent's tool needs.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand whether Ollama, vLLM, or parser settings better support tool calls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose a runtime that matches the agent's tool needs." + "ground_truth": "A NemoClaw-specific answer that helps the user understand whether Ollama, vLLM, or parser settings better support tool calls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose a runtime that matches the agent's tool needs.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/tool-calling-reliability.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-tool-calling-reliability-003", "question": "I'm letting an always-on assistant use tools unattended. Help me define the reliability bar for local tool calling so I can avoid silent failures or unsafe plain-text tool outputs.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user define the reliability bar for local tool calling and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid silent failures or unsafe plain-text tool outputs." + "ground_truth": "A NemoClaw-specific answer that helps the user define the reliability bar for local tool calling and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid silent failures or unsafe plain-text tool outputs.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/tool-calling-reliability.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/.agents/skills/nemoclaw-user-configure-security/evals/evals.json b/.agents/skills/nemoclaw-user-configure-security/evals/evals.json index 9e17d64983..a3757d87da 100644 --- a/.agents/skills/nemoclaw-user-configure-security/evals/evals.json +++ b/.agents/skills/nemoclaw-user-configure-security/evals/evals.json @@ -3,54 +3,99 @@ "id": "docs-security-best-practices-001", "question": "I'm evaluating NemoClaw security best practices. Help me understand the risk posture of each configurable control so I can justify the setup to my team or security reviewers.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user understand the risk posture of each configurable control and gives enough concrete guidance, decision criteria, verification steps, or risk framing to justify the setup to my team or security reviewers." + "ground_truth": "A NemoClaw-specific answer that helps the user understand the risk posture of each configurable control and gives enough concrete guidance, decision criteria, verification steps, or risk framing to justify the setup to my team or security reviewers.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/best-practices.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-best-practices-002", "question": "I'm balancing developer convenience with lockdown. Help me compare the trade-offs of changing security controls so I can choose a posture that fits the environment.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user compare the trade-offs of changing security controls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose a posture that fits the environment." + "ground_truth": "A NemoClaw-specific answer that helps the user compare the trade-offs of changing security controls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose a posture that fits the environment.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/best-practices.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-best-practices-003", "question": "I'm preparing for production-like use. Help me see which defaults are acceptable and which require changes so I can avoid shipping with accidental weak spots.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user see which defaults are acceptable and which require changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid shipping with accidental weak spots." + "ground_truth": "A NemoClaw-specific answer that helps the user see which defaults are acceptable and which require changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid shipping with accidental weak spots.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/best-practices.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-credential-storage-001", "question": "I'm inspecting NemoClaw credential storage. Help me verify how secrets are stored and protected so I can decide whether the setup meets my secret-handling expectations.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user verify how secrets are stored and protected and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether the setup meets my secret-handling expectations." + "ground_truth": "A NemoClaw-specific answer that helps the user verify how secrets are stored and protected and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether the setup meets my secret-handling expectations.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/credential-storage.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-credential-storage-002", "question": "I'm tracing where credentials live. Help me distinguish host, gateway, and sandbox storage boundaries so I can avoid assuming secrets are available in the wrong place.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user distinguish host, gateway, and sandbox storage boundaries and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid assuming secrets are available in the wrong place." + "ground_truth": "A NemoClaw-specific answer that helps the user distinguish host, gateway, and sandbox storage boundaries and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid assuming secrets are available in the wrong place.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/credential-storage.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-credential-storage-003", "question": "I'm rotating or inspecting credentials. Help me follow a workflow that does not print secrets in logs or docs so I can recover or update access safely.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user follow a workflow that does not print secrets in logs or docs and gives enough concrete guidance, decision criteria, verification steps, or risk framing to recover or update access safely." + "ground_truth": "A NemoClaw-specific answer that helps the user follow a workflow that does not print secrets in logs or docs and gives enough concrete guidance, decision criteria, verification steps, or risk framing to recover or update access safely.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/credential-storage.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-openclaw-controls-001", "question": "I'm reading about controls outside NemoClaw's scope. Help me understand which security responsibilities remain with OpenClaw so I can avoid treating sandbox isolation as a complete application security model.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user understand which security responsibilities remain with OpenClaw and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid treating sandbox isolation as a complete application security model." + "ground_truth": "A NemoClaw-specific answer that helps the user understand which security responsibilities remain with OpenClaw and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid treating sandbox isolation as a complete application security model.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/openclaw-controls.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-openclaw-controls-002", "question": "I'm assessing application-layer agent risk. Help me identify the controls NemoClaw does not add so I can plan separate mitigations for authentication, prompt handling, and agent behavior.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user identify the controls NemoClaw does not add and gives enough concrete guidance, decision criteria, verification steps, or risk framing to plan separate mitigations for authentication, prompt handling, and agent behavior." + "ground_truth": "A NemoClaw-specific answer that helps the user identify the controls NemoClaw does not add and gives enough concrete guidance, decision criteria, verification steps, or risk framing to plan separate mitigations for authentication, prompt handling, and agent behavior.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/openclaw-controls.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-openclaw-controls-003", "question": "I'm documenting the security boundary. Help me explain where NemoClaw protection ends so I can set accurate expectations for reviewers and operators.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user explain where NemoClaw protection ends and gives enough concrete guidance, decision criteria, verification steps, or risk framing to set accurate expectations for reviewers and operators." + "ground_truth": "A NemoClaw-specific answer that helps the user explain where NemoClaw protection ends and gives enough concrete guidance, decision criteria, verification steps, or risk framing to set accurate expectations for reviewers and operators.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/openclaw-controls.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/.agents/skills/nemoclaw-user-deploy-remote/evals/evals.json b/.agents/skills/nemoclaw-user-deploy-remote/evals/evals.json index 41af478f9e..1cf519758c 100644 --- a/.agents/skills/nemoclaw-user-deploy-remote/evals/evals.json +++ b/.agents/skills/nemoclaw-user-deploy-remote/evals/evals.json @@ -3,72 +3,132 @@ "id": "docs-deployment-deploy-to-remote-gpu-001", "question": "I'm deploying NemoClaw to a remote GPU instance. Help me move the sandboxed assistant off my local machine so I can support persistent or GPU-backed operation.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user move the sandboxed assistant off my local machine and gives enough concrete guidance, decision criteria, verification steps, or risk framing to support persistent or GPU-backed operation." + "ground_truth": "A NemoClaw-specific answer that helps the user move the sandboxed assistant off my local machine and gives enough concrete guidance, decision criteria, verification steps, or risk framing to support persistent or GPU-backed operation.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `SKILL.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-deploy-to-remote-gpu-002", "question": "I'm using the legacy Brev compatibility flow. Help me understand what the flow still does and where it is deprecated so I can avoid depending on an outdated path blindly.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user understand what the flow still does and where it is deprecated and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid depending on an outdated path blindly." + "ground_truth": "A NemoClaw-specific answer that helps the user understand what the flow still does and where it is deprecated and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid depending on an outdated path blindly.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `SKILL.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-deploy-to-remote-gpu-003", "question": "I'm after remote deployment succeeds. Help me find the connection, operation, and recovery details so I can operate the sandbox after initial setup.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user find the connection, operation, and recovery details and gives enough concrete guidance, decision criteria, verification steps, or risk framing to operate the sandbox after initial setup." + "ground_truth": "A NemoClaw-specific answer that helps the user find the connection, operation, and recovery details and gives enough concrete guidance, decision criteria, verification steps, or risk framing to operate the sandbox after initial setup.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `SKILL.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-brev-web-ui-001", "question": "I'm launching NemoClaw from the Brev web UI. Help me avoid local CLI setup and local GPU requirements so I can start a hosted sandbox quickly.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user avoid local CLI setup and local GPU requirements and gives enough concrete guidance, decision criteria, verification steps, or risk framing to start a hosted sandbox quickly." + "ground_truth": "A NemoClaw-specific answer that helps the user avoid local CLI setup and local GPU requirements and gives enough concrete guidance, decision criteria, verification steps, or risk framing to start a hosted sandbox quickly.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/brev-web-ui.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-brev-web-ui-002", "question": "I'm reviewing hosted launch choices. Help me understand each web UI option before creating the instance so I can choose settings that match my expected sandbox workflow.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user understand each web UI option before creating the instance and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose settings that match my expected sandbox workflow." + "ground_truth": "A NemoClaw-specific answer that helps the user understand each web UI option before creating the instance and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose settings that match my expected sandbox workflow.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/brev-web-ui.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-brev-web-ui-003", "question": "I'm the hosted sandbox is created. Help me confirm where to connect and how to start using it so I can move from provisioning to actual agent work.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user confirm where to connect and how to start using it and gives enough concrete guidance, decision criteria, verification steps, or risk framing to move from provisioning to actual agent work." + "ground_truth": "A NemoClaw-specific answer that helps the user confirm where to connect and how to start using it and gives enough concrete guidance, decision criteria, verification steps, or risk framing to move from provisioning to actual agent work.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/brev-web-ui.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-install-openclaw-plugins-001", "question": "I'm installing an OpenClaw plugin in a NemoClaw-managed sandbox. Help me add a new agent capability inside the sandbox so I can extend the assistant without weakening the host boundary.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user add a new agent capability inside the sandbox and gives enough concrete guidance, decision criteria, verification steps, or risk framing to extend the assistant without weakening the host boundary." + "ground_truth": "A NemoClaw-specific answer that helps the user add a new agent capability inside the sandbox and gives enough concrete guidance, decision criteria, verification steps, or risk framing to extend the assistant without weakening the host boundary.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/install-openclaw-plugins.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-install-openclaw-plugins-002", "question": "I'm deciding where to install a plugin. Help me distinguish host environment changes from sandbox environment changes so I can modify the right filesystem and runtime.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user distinguish host environment changes from sandbox environment changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to modify the right filesystem and runtime." + "ground_truth": "A NemoClaw-specific answer that helps the user distinguish host environment changes from sandbox environment changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to modify the right filesystem and runtime.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/install-openclaw-plugins.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-install-openclaw-plugins-003", "question": "I'm verifying a plugin installation. Help me confirm the agent can discover and use the plugin so I can trust that the capability works inside NemoClaw's security model.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user confirm the agent can discover and use the plugin and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust that the capability works inside NemoClaw's security model." + "ground_truth": "A NemoClaw-specific answer that helps the user confirm the agent can discover and use the plugin and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust that the capability works inside NemoClaw's security model.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/install-openclaw-plugins.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-sandbox-hardening-001", "question": "I'm reviewing sandbox image hardening. Help me understand which container risks NemoClaw reduces so I can decide whether unattended agents are acceptable in my environment.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user understand which container risks NemoClaw reduces and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether unattended agents are acceptable in my environment." + "ground_truth": "A NemoClaw-specific answer that helps the user understand which container risks NemoClaw reduces and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether unattended agents are acceptable in my environment.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/sandbox-hardening.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-sandbox-hardening-002", "question": "I'm mapping NemoClaw to an organizational security baseline. Help me identify capability drops, least privilege, and runtime protections so I can document how the sandbox meets or misses required controls.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user identify capability drops, least privilege, and runtime protections and gives enough concrete guidance, decision criteria, verification steps, or risk framing to document how the sandbox meets or misses required controls." + "ground_truth": "A NemoClaw-specific answer that helps the user identify capability drops, least privilege, and runtime protections and gives enough concrete guidance, decision criteria, verification steps, or risk framing to document how the sandbox meets or misses required controls.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/sandbox-hardening.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-sandbox-hardening-003", "question": "I'm considering production use. Help me see the limitations and residual risks of the hardened image so I can avoid overstating what container hardening guarantees.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user see the limitations and residual risks of the hardened image and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid overstating what container hardening guarantees." + "ground_truth": "A NemoClaw-specific answer that helps the user see the limitations and residual risks of the hardened image and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid overstating what container hardening guarantees.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/sandbox-hardening.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/.agents/skills/nemoclaw-user-get-started/evals/evals.json b/.agents/skills/nemoclaw-user-get-started/evals/evals.json index 946c2dfea5..6cbfddba24 100644 --- a/.agents/skills/nemoclaw-user-get-started/evals/evals.json +++ b/.agents/skills/nemoclaw-user-get-started/evals/evals.json @@ -3,72 +3,132 @@ "id": "docs-get-started-prerequisites-001", "question": "I'm checking prerequisites before installation. Help me verify my host has the required hardware, software, and platform support so I can avoid a failed first setup.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user verify my host has the required hardware, software, and platform support and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid a failed first setup." + "ground_truth": "A NemoClaw-specific answer that helps the user verify my host has the required hardware, software, and platform support and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid a failed first setup.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/prerequisites.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-prerequisites-002", "question": "I'm using a machine with limited CPU, memory, disk, or Docker capacity. Help me understand the practical minimums and known bottlenecks so I can prepare the machine before onboarding starts.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user understand the practical minimums and known bottlenecks and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prepare the machine before onboarding starts." + "ground_truth": "A NemoClaw-specific answer that helps the user understand the practical minimums and known bottlenecks and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prepare the machine before onboarding starts.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/prerequisites.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-prerequisites-003", "question": "I'm choosing between local, Windows WSL, and remote GPU setup. Help me compare supported platform paths so I can start with the environment most likely to work.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user compare supported platform paths and gives enough concrete guidance, decision criteria, verification steps, or risk framing to start with the environment most likely to work." + "ground_truth": "A NemoClaw-specific answer that helps the user compare supported platform paths and gives enough concrete guidance, decision criteria, verification steps, or risk framing to start with the environment most likely to work.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/prerequisites.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-windows-preparation-001", "question": "I'm preparing a Windows machine for NemoClaw. Help me enable WSL 2, Ubuntu, and Docker Desktop correctly so I can enter the standard quickstart from a working Linux environment.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user enable WSL 2, Ubuntu, and Docker Desktop correctly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to enter the standard quickstart from a working Linux environment." + "ground_truth": "A NemoClaw-specific answer that helps the user enable WSL 2, Ubuntu, and Docker Desktop correctly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to enter the standard quickstart from a working Linux environment.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/windows-preparation.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-windows-preparation-002", "question": "I'm unsure whether Windows-specific setup is complete. Help me check the WSL and Docker integration steps that commonly block installs so I can fix host issues before running the NemoClaw installer.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user check the WSL and Docker integration steps that commonly block installs and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix host issues before running the NemoClaw installer." + "ground_truth": "A NemoClaw-specific answer that helps the user check the WSL and Docker integration steps that commonly block installs and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix host issues before running the NemoClaw installer.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/windows-preparation.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-windows-preparation-003", "question": "I'm ready to leave the Windows preparation guide. Help me confirm the Ubuntu shell can run the required commands so I can follow the quickstart without mixing Windows and Linux instructions.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user confirm the Ubuntu shell can run the required commands and gives enough concrete guidance, decision criteria, verification steps, or risk framing to follow the quickstart without mixing Windows and Linux instructions." + "ground_truth": "A NemoClaw-specific answer that helps the user confirm the Ubuntu shell can run the required commands and gives enough concrete guidance, decision criteria, verification steps, or risk framing to follow the quickstart without mixing Windows and Linux instructions.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/windows-preparation.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-quickstart-001", "question": "I'm running the OpenClaw quickstart. Help me install NemoClaw and create my first sandboxed agent so I can send a prompt to a working OpenClaw assistant.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user install NemoClaw and create my first sandboxed agent and gives enough concrete guidance, decision criteria, verification steps, or risk framing to send a prompt to a working OpenClaw assistant." + "ground_truth": "A NemoClaw-specific answer that helps the user install NemoClaw and create my first sandboxed agent and gives enough concrete guidance, decision criteria, verification steps, or risk framing to send a prompt to a working OpenClaw assistant.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/quickstart-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-quickstart-002", "question": "I'm encountering installer prompts or host preflight checks. Help me understand what the installer is asking for and why so I can continue setup without granting unnecessary access.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user understand what the installer is asking for and why and gives enough concrete guidance, decision criteria, verification steps, or risk framing to continue setup without granting unnecessary access." + "ground_truth": "A NemoClaw-specific answer that helps the user understand what the installer is asking for and why and gives enough concrete guidance, decision criteria, verification steps, or risk framing to continue setup without granting unnecessary access.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/quickstart-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-quickstart-003", "question": "I'm finishing onboarding. Help me verify the sandbox, inference route, and OpenClaw agent are connected so I can know the setup succeeded end to end.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user verify the sandbox, inference route, and OpenClaw agent are connected and gives enough concrete guidance, decision criteria, verification steps, or risk framing to know the setup succeeded end to end." + "ground_truth": "A NemoClaw-specific answer that helps the user verify the sandbox, inference route, and OpenClaw agent are connected and gives enough concrete guidance, decision criteria, verification steps, or risk framing to know the setup succeeded end to end.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/quickstart-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-quickstart-hermes-001", "question": "I'm choosing Hermes instead of OpenClaw. Help me launch a sandboxed Hermes API endpoint so I can serve the agent workflow my downstream clients expect.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user launch a sandboxed Hermes API endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to serve the agent workflow my downstream clients expect." + "ground_truth": "A NemoClaw-specific answer that helps the user launch a sandboxed Hermes API endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to serve the agent workflow my downstream clients expect.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/quickstart-hermes.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-quickstart-hermes-002", "question": "I'm moving through onboarding for Hermes. Help me confirm NemoClaw selected Hermes-specific setup and configuration so I can avoid accidentally creating the default OpenClaw environment.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user confirm NemoClaw selected Hermes-specific setup and configuration and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid accidentally creating the default OpenClaw environment." + "ground_truth": "A NemoClaw-specific answer that helps the user confirm NemoClaw selected Hermes-specific setup and configuration and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid accidentally creating the default OpenClaw environment.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/quickstart-hermes.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-quickstart-hermes-003", "question": "I'm checking the sandboxed Hermes endpoint. Help me run a small request that proves the endpoint is live so I can hand it to clients or tests with confidence.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user run a small request that proves the endpoint is live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to hand it to clients or tests with confidence." + "ground_truth": "A NemoClaw-specific answer that helps the user run a small request that proves the endpoint is live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to hand it to clients or tests with confidence.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/quickstart-hermes.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/.agents/skills/nemoclaw-user-manage-policy/evals/evals.json b/.agents/skills/nemoclaw-user-manage-policy/evals/evals.json index 26bb32a8fd..c3b1734a1f 100644 --- a/.agents/skills/nemoclaw-user-manage-policy/evals/evals.json +++ b/.agents/skills/nemoclaw-user-manage-policy/evals/evals.json @@ -3,54 +3,99 @@ "id": "docs-network-policy-customize-network-policy-001", "question": "I'm customizing sandbox network policy. Help me allow the agent to reach a required external service so I can enable the integration while preserving least privilege.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user allow the agent to reach a required external service and gives enough concrete guidance, decision criteria, verification steps, or risk framing to enable the integration while preserving least privilege." + "ground_truth": "A NemoClaw-specific answer that helps the user allow the agent to reach a required external service and gives enough concrete guidance, decision criteria, verification steps, or risk framing to enable the integration while preserving least privilege.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/customize-network-policy-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-customize-network-policy-002", "question": "I'm writing an egress rule. Help me specify the minimum necessary host, port, and protocol so I can avoid opening broader access than the agent needs.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user specify the minimum necessary host, port, and protocol and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid opening broader access than the agent needs." + "ground_truth": "A NemoClaw-specific answer that helps the user specify the minimum necessary host, port, and protocol and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid opening broader access than the agent needs.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/customize-network-policy-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-customize-network-policy-003", "question": "I'm validating a policy change. Help me test that the intended integration works and unrelated egress remains blocked so I can ship a safer policy update.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user test that the intended integration works and unrelated egress remains blocked and gives enough concrete guidance, decision criteria, verification steps, or risk framing to ship a safer policy update." + "ground_truth": "A NemoClaw-specific answer that helps the user test that the intended integration works and unrelated egress remains blocked and gives enough concrete guidance, decision criteria, verification steps, or risk framing to ship a safer policy update.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/customize-network-policy-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-approve-network-requests-001", "question": "I'm reviewing a blocked network request. Help me understand why the agent wants to reach that endpoint so I can approve only requests that support the current job.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user understand why the agent wants to reach that endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to approve only requests that support the current job." + "ground_truth": "A NemoClaw-specific answer that helps the user understand why the agent wants to reach that endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to approve only requests that support the current job.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/approve-network-requests.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-approve-network-requests-002", "question": "I'm using the approval UI. Help me spot unexpected or prompt-injection-driven egress so I can deny suspicious access before it becomes policy.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user spot unexpected or prompt-injection-driven egress and gives enough concrete guidance, decision criteria, verification steps, or risk framing to deny suspicious access before it becomes policy." + "ground_truth": "A NemoClaw-specific answer that helps the user spot unexpected or prompt-injection-driven egress and gives enough concrete guidance, decision criteria, verification steps, or risk framing to deny suspicious access before it becomes policy.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/approve-network-requests.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-approve-network-requests-003", "question": "I'm after approving or denying a request. Help me understand audit, rollback, and policy update behavior so I can keep operator decisions traceable.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user understand audit, rollback, and policy update behavior and gives enough concrete guidance, decision criteria, verification steps, or risk framing to keep operator decisions traceable." + "ground_truth": "A NemoClaw-specific answer that helps the user understand audit, rollback, and policy update behavior and gives enough concrete guidance, decision criteria, verification steps, or risk framing to keep operator decisions traceable.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/approve-network-requests.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-integration-policy-examples-001", "question": "I'm following an integration policy example. Help me enable a common third-party workflow quickly so I can avoid writing a policy from scratch.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user enable a common third-party workflow quickly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid writing a policy from scratch." + "ground_truth": "A NemoClaw-specific answer that helps the user enable a common third-party workflow quickly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid writing a policy from scratch.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/integration-policy-examples.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-integration-policy-examples-002", "question": "I'm adapting an example to my organization. Help me replace sample hosts and ports with exact production endpoints so I can create a policy that matches our real integration.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user replace sample hosts and ports with exact production endpoints and gives enough concrete guidance, decision criteria, verification steps, or risk framing to create a policy that matches our real integration." + "ground_truth": "A NemoClaw-specific answer that helps the user replace sample hosts and ports with exact production endpoints and gives enough concrete guidance, decision criteria, verification steps, or risk framing to create a policy that matches our real integration.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/integration-policy-examples.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-integration-policy-examples-003", "question": "I'm copying an example into a stricter environment. Help me identify broad rules or assumptions that need tightening so I can avoid weakening production egress controls.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user identify broad rules or assumptions that need tightening and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid weakening production egress controls." + "ground_truth": "A NemoClaw-specific answer that helps the user identify broad rules or assumptions that need tightening and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid weakening production egress controls.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/integration-policy-examples.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/.agents/skills/nemoclaw-user-manage-sandboxes/evals/evals.json b/.agents/skills/nemoclaw-user-manage-sandboxes/evals/evals.json index ff6af55509..e875fa8c95 100644 --- a/.agents/skills/nemoclaw-user-manage-sandboxes/evals/evals.json +++ b/.agents/skills/nemoclaw-user-manage-sandboxes/evals/evals.json @@ -3,90 +3,165 @@ "id": "docs-manage-sandboxes-lifecycle-001", "question": "I'm managing a NemoClaw sandbox. Help me check status, health, logs, ports, providers, upgrades, and uninstall paths so I can operate the sandbox safely after quickstart.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user check status, health, logs, ports, providers, upgrades, and uninstall paths and gives enough concrete guidance, decision criteria, verification steps, or risk framing to operate the sandbox safely after quickstart." + "ground_truth": "A NemoClaw-specific answer that helps the user check status, health, logs, ports, providers, upgrades, and uninstall paths and gives enough concrete guidance, decision criteria, verification steps, or risk framing to operate the sandbox safely after quickstart.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/lifecycle-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-lifecycle-002", "question": "I'm choosing a lifecycle command. Help me understand which commands inspect, restart, rebuild, or destroy state so I can avoid accidental data loss.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user understand which commands inspect, restart, rebuild, or destroy state and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid accidental data loss." + "ground_truth": "A NemoClaw-specific answer that helps the user understand which commands inspect, restart, rebuild, or destroy state and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid accidental data loss.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/lifecycle-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-lifecycle-003", "question": "I'm planning an upgrade, rebuild, or uninstall. Help me know when to preserve workspace files first so I can recover useful agent state after disruptive changes.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user know when to preserve workspace files first and gives enough concrete guidance, decision criteria, verification steps, or risk framing to recover useful agent state after disruptive changes." + "ground_truth": "A NemoClaw-specific answer that helps the user know when to preserve workspace files first and gives enough concrete guidance, decision criteria, verification steps, or risk framing to recover useful agent state after disruptive changes.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/lifecycle-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-runtime-controls-001", "question": "I'm changing a running sandbox. Help me know which controls can change without rebuild or re-onboarding so I can make safe adjustments with minimal downtime.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user know which controls can change without rebuild or re-onboarding and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make safe adjustments with minimal downtime." + "ground_truth": "A NemoClaw-specific answer that helps the user know which controls can change without rebuild or re-onboarding and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make safe adjustments with minimal downtime.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/runtime-controls.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-runtime-controls-002", "question": "I'm reviewing a runtime control. Help me classify it as hot-reloadable, rebuild-only, or onboarding-only so I can choose the correct operational path.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user classify it as hot-reloadable, rebuild-only, or onboarding-only and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose the correct operational path." + "ground_truth": "A NemoClaw-specific answer that helps the user classify it as hot-reloadable, rebuild-only, or onboarding-only and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose the correct operational path.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/runtime-controls.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-runtime-controls-003", "question": "I'm responding to an incident or risky agent behavior. Help me use `shields up`, `shields down`, and `shields status` correctly so I can tighten or inspect controls without confusion.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user use `shields up`, `shields down`, and `shields status` correctly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to tighten or inspect controls without confusion." + "ground_truth": "A NemoClaw-specific answer that helps the user use `shields up`, `shields down`, and `shields status` correctly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to tighten or inspect controls without confusion.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/runtime-controls.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-backup-restore-001", "question": "I'm backing up workspace files before a destructive operation. Help me preserve agent memory, identity, and useful configuration so I can rebuild or migrate without losing important state.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user preserve agent memory, identity, and useful configuration and gives enough concrete guidance, decision criteria, verification steps, or risk framing to rebuild or migrate without losing important state." + "ground_truth": "A NemoClaw-specific answer that helps the user preserve agent memory, identity, and useful configuration and gives enough concrete guidance, decision criteria, verification steps, or risk framing to rebuild or migrate without losing important state.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/backup-restore.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-backup-restore-002", "question": "I'm handling a workspace archive. Help me understand credential stripping and integrity checks so I can trust the archive without exposing secrets.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user understand credential stripping and integrity checks and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust the archive without exposing secrets." + "ground_truth": "A NemoClaw-specific answer that helps the user understand credential stripping and integrity checks and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust the archive without exposing secrets.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/backup-restore.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-backup-restore-003", "question": "I'm restoring workspace files. Help me verify the agent's useful memory returned so I can continue work without reintroducing sensitive host data.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user verify the agent's useful memory returned and gives enough concrete guidance, decision criteria, verification steps, or risk framing to continue work without reintroducing sensitive host data." + "ground_truth": "A NemoClaw-specific answer that helps the user verify the agent's useful memory returned and gives enough concrete guidance, decision criteria, verification steps, or risk framing to continue work without reintroducing sensitive host data.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/backup-restore.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-workspace-files-001", "question": "I'm inspecting workspace files. Help me understand where personality, identity, and configuration live so I can predict how the agent will behave across sessions.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user understand where personality, identity, and configuration live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to predict how the agent will behave across sessions." + "ground_truth": "A NemoClaw-specific answer that helps the user understand where personality, identity, and configuration live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to predict how the agent will behave across sessions.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/workspace-files.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-workspace-files-002", "question": "I'm adding durable instructions for the agent. Help me know which files persist and who owns them so I can put guidance in the right place.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user know which files persist and who owns them and gives enough concrete guidance, decision criteria, verification steps, or risk framing to put guidance in the right place." + "ground_truth": "A NemoClaw-specific answer that helps the user know which files persist and who owns them and gives enough concrete guidance, decision criteria, verification steps, or risk framing to put guidance in the right place.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/workspace-files.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-workspace-files-003", "question": "I'm restarting, rebuilding, or migrating a sandbox. Help me understand how each action affects workspace state so I can avoid losing or duplicating important files.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user understand how each action affects workspace state and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid losing or duplicating important files." + "ground_truth": "A NemoClaw-specific answer that helps the user understand how each action affects workspace state and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid losing or duplicating important files.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/workspace-files.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-messaging-channels-001", "question": "I'm connecting a messaging channel. Help me let users reach the sandboxed agent through Telegram, Discord, Slack, or another channel so I can support real-world always-on interactions.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user let users reach the sandboxed agent through Telegram, Discord, Slack, or another channel and gives enough concrete guidance, decision criteria, verification steps, or risk framing to support real-world always-on interactions." + "ground_truth": "A NemoClaw-specific answer that helps the user let users reach the sandboxed agent through Telegram, Discord, Slack, or another channel and gives enough concrete guidance, decision criteria, verification steps, or risk framing to support real-world always-on interactions.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/messaging-channels.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-messaging-channels-002", "question": "I'm configuring channel credentials and processes. Help me understand what OpenShell supervises and where secrets live so I can trust the messaging integration operationally.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user understand what OpenShell supervises and where secrets live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust the messaging integration operationally." + "ground_truth": "A NemoClaw-specific answer that helps the user understand what OpenShell supervises and where secrets live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust the messaging integration operationally.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/messaging-channels.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-messaging-channels-003", "question": "I'm testing a new messaging channel. Help me send and receive a message through the full path so I can prove the channel, gateway, and sandboxed agent are wired correctly.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user send and receive a message through the full path and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prove the channel, gateway, and sandboxed agent are wired correctly." + "ground_truth": "A NemoClaw-specific answer that helps the user send and receive a message through the full path and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prove the channel, gateway, and sandboxed agent are wired correctly.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/messaging-channels.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/.agents/skills/nemoclaw-user-monitor-sandbox/evals/evals.json b/.agents/skills/nemoclaw-user-monitor-sandbox/evals/evals.json index 260e8ec64e..231936529b 100644 --- a/.agents/skills/nemoclaw-user-monitor-sandbox/evals/evals.json +++ b/.agents/skills/nemoclaw-user-monitor-sandbox/evals/evals.json @@ -3,18 +3,33 @@ "id": "docs-monitoring-monitor-sandbox-activity-001", "question": "I'm monitoring sandbox activity. Help me understand what the agent and sandbox are doing now so I can detect unhealthy or unexpected behavior early.", "expected_skill": "nemoclaw-user-monitor-sandbox", - "ground_truth": "A NemoClaw-specific answer that helps the user understand what the agent and sandbox are doing now and gives enough concrete guidance, decision criteria, verification steps, or risk framing to detect unhealthy or unexpected behavior early." + "ground_truth": "A NemoClaw-specific answer that helps the user understand what the agent and sandbox are doing now and gives enough concrete guidance, decision criteria, verification steps, or risk framing to detect unhealthy or unexpected behavior early.", + "expected_behavior": [ + "Uses the `nemoclaw-user-monitor-sandbox` skill.", + "Uses `SKILL.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-monitoring-monitor-sandbox-activity-002", "question": "I'm diagnosing a runtime failure. Help me use health, logs, and traces to locate the failing layer so I can separate host, gateway, sandbox, policy, and inference issues.", "expected_skill": "nemoclaw-user-monitor-sandbox", - "ground_truth": "A NemoClaw-specific answer that helps the user use health, logs, and traces to locate the failing layer and gives enough concrete guidance, decision criteria, verification steps, or risk framing to separate host, gateway, sandbox, policy, and inference issues." + "ground_truth": "A NemoClaw-specific answer that helps the user use health, logs, and traces to locate the failing layer and gives enough concrete guidance, decision criteria, verification steps, or risk framing to separate host, gateway, sandbox, policy, and inference issues.", + "expected_behavior": [ + "Uses the `nemoclaw-user-monitor-sandbox` skill.", + "Uses `SKILL.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-monitoring-monitor-sandbox-activity-003", "question": "I'm collecting debugging evidence. Help me gather enough information without weakening controls so I can investigate safely and share useful diagnostics.", "expected_skill": "nemoclaw-user-monitor-sandbox", - "ground_truth": "A NemoClaw-specific answer that helps the user gather enough information without weakening controls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to investigate safely and share useful diagnostics." + "ground_truth": "A NemoClaw-specific answer that helps the user gather enough information without weakening controls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to investigate safely and share useful diagnostics.", + "expected_behavior": [ + "Uses the `nemoclaw-user-monitor-sandbox` skill.", + "Uses `SKILL.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/.agents/skills/nemoclaw-user-overview/evals/evals.json b/.agents/skills/nemoclaw-user-overview/evals/evals.json index fc8c3e0aca..f4b0b599ee 100644 --- a/.agents/skills/nemoclaw-user-overview/evals/evals.json +++ b/.agents/skills/nemoclaw-user-overview/evals/evals.json @@ -3,90 +3,165 @@ "id": "docs-index-001", "question": "I'm first arriving at the NemoClaw docs. Help me understand what NemoClaw helps me run and why it exists so I can decide whether it is worth installing before I spend time on setup.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user understand what NemoClaw helps me run and why it exists and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether it is worth installing before I spend time on setup." + "ground_truth": "A NemoClaw-specific answer that helps the user understand what NemoClaw helps me run and why it exists and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether it is worth installing before I spend time on setup.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/overview.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-index-002", "question": "I'm evaluating whether an always-on assistant can run safely in my environment. Help me see the core safety, lifecycle, and inference-routing promises up front so I can judge whether the stack matches my risk tolerance.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user see the core safety, lifecycle, and inference-routing promises up front and gives enough concrete guidance, decision criteria, verification steps, or risk framing to judge whether the stack matches my risk tolerance." + "ground_truth": "A NemoClaw-specific answer that helps the user see the core safety, lifecycle, and inference-routing promises up front and gives enough concrete guidance, decision criteria, verification steps, or risk framing to judge whether the stack matches my risk tolerance.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/overview.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-index-003", "question": "I'm considering the one-command install path. Help me know what the command will install, configure, and launch so I can take the next step without feeling like I am accepting an opaque shortcut.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user know what the command will install, configure, and launch and gives enough concrete guidance, decision criteria, verification steps, or risk framing to take the next step without feeling like I am accepting an opaque shortcut." + "ground_truth": "A NemoClaw-specific answer that helps the user know what the command will install, configure, and launch and gives enough concrete guidance, decision criteria, verification steps, or risk framing to take the next step without feeling like I am accepting an opaque shortcut.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/overview.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-overview-001", "question": "I'm explaining NemoClaw to a teammate. Help me summarize the product, stack, and value in plain language so I can align on whether NemoClaw is relevant to our agent workflow.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user summarize the product, stack, and value in plain language and gives enough concrete guidance, decision criteria, verification steps, or risk framing to align on whether NemoClaw is relevant to our agent workflow." + "ground_truth": "A NemoClaw-specific answer that helps the user summarize the product, stack, and value in plain language and gives enough concrete guidance, decision criteria, verification steps, or risk framing to align on whether NemoClaw is relevant to our agent workflow.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/overview.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-overview-002", "question": "I'm worried about security, cost, or operations risks from unattended agents. Help me understand which guardrails NemoClaw adds so I can decide whether sandboxed execution addresses my main concerns.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user understand which guardrails NemoClaw adds and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether sandboxed execution addresses my main concerns." + "ground_truth": "A NemoClaw-specific answer that helps the user understand which guardrails NemoClaw adds and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether sandboxed execution addresses my main concerns.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/overview.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-overview-003", "question": "I'm comparing NemoClaw with direct OpenClaw or OpenShell usage. Help me see the capabilities NemoClaw owns so I can classify it as the right reference stack rather than generic setup glue.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user see the capabilities NemoClaw owns and gives enough concrete guidance, decision criteria, verification steps, or risk framing to classify it as the right reference stack rather than generic setup glue." + "ground_truth": "A NemoClaw-specific answer that helps the user see the capabilities NemoClaw owns and gives enough concrete guidance, decision criteria, verification steps, or risk framing to classify it as the right reference stack rather than generic setup glue.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/overview.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-ecosystem-001", "question": "I'm comparing OpenClaw, OpenShell, and NemoClaw. Help me understand the role of each layer so I can choose the right adoption path for my project.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user understand the role of each layer and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose the right adoption path for my project." + "ground_truth": "A NemoClaw-specific answer that helps the user understand the role of each layer and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose the right adoption path for my project.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/ecosystem.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-ecosystem-002", "question": "I'm deciding whether to use the reference integration. Help me identify when NemoClaw is enough versus when I need direct OpenShell integration so I can avoid unnecessary platform work.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user identify when NemoClaw is enough versus when I need direct OpenShell integration and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid unnecessary platform work." + "ground_truth": "A NemoClaw-specific answer that helps the user identify when NemoClaw is enough versus when I need direct OpenShell integration and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid unnecessary platform work.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/ecosystem.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-ecosystem-003", "question": "I'm planning a deployment with multiple moving parts. Help me separate agent, runtime, and orchestration responsibilities so I can assign ownership and troubleshoot the right layer later.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user separate agent, runtime, and orchestration responsibilities and gives enough concrete guidance, decision criteria, verification steps, or risk framing to assign ownership and troubleshoot the right layer later." + "ground_truth": "A NemoClaw-specific answer that helps the user separate agent, runtime, and orchestration responsibilities and gives enough concrete guidance, decision criteria, verification steps, or risk framing to assign ownership and troubleshoot the right layer later.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/ecosystem.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-how-it-works-001", "question": "I'm studying the NemoClaw architecture. Help me understand how the CLI, plugin, blueprint, and sandbox interact so I can reason about failures and maintenance work with confidence.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user understand how the CLI, plugin, blueprint, and sandbox interact and gives enough concrete guidance, decision criteria, verification steps, or risk framing to reason about failures and maintenance work with confidence." + "ground_truth": "A NemoClaw-specific answer that helps the user understand how the CLI, plugin, blueprint, and sandbox interact and gives enough concrete guidance, decision criteria, verification steps, or risk framing to reason about failures and maintenance work with confidence.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/how-it-works.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-how-it-works-002", "question": "I'm debugging a broken setup. Help me identify which lifecycle boundary owns the failure so I can fix the problem without changing unrelated layers.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user identify which lifecycle boundary owns the failure and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix the problem without changing unrelated layers." + "ground_truth": "A NemoClaw-specific answer that helps the user identify which lifecycle boundary owns the failure and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix the problem without changing unrelated layers.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/how-it-works.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-how-it-works-003", "question": "I'm deciding whether blueprint-driven setup is repeatable enough. Help me see how versions, digests, and sandbox creation fit together so I can trust the process for team or fleet usage.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user see how versions, digests, and sandbox creation fit together and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust the process for team or fleet usage." + "ground_truth": "A NemoClaw-specific answer that helps the user see how versions, digests, and sandbox creation fit together and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust the process for team or fleet usage.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/how-it-works.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-release-notes-001", "question": "I'm reading NemoClaw release notes. Help me understand what changed since my installed version so I can assess upgrade risk before touching a working sandbox.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user understand what changed since my installed version and gives enough concrete guidance, decision criteria, verification steps, or risk framing to assess upgrade risk before touching a working sandbox." + "ground_truth": "A NemoClaw-specific answer that helps the user understand what changed since my installed version and gives enough concrete guidance, decision criteria, verification steps, or risk framing to assess upgrade risk before touching a working sandbox.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/release-notes.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-release-notes-002", "question": "I'm maintaining an existing sandbox. Help me spot compatibility notes, migrations, or behavior changes so I can decide whether to update now or wait.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user spot compatibility notes, migrations, or behavior changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether to update now or wait." + "ground_truth": "A NemoClaw-specific answer that helps the user spot compatibility notes, migrations, or behavior changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether to update now or wait.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/release-notes.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-release-notes-003", "question": "I'm evaluating NemoClaw for a longer-running assistant workflow. Help me see the pace and nature of recent changes so I can judge whether the project feels stable enough for my use case.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user see the pace and nature of recent changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to judge whether the project feels stable enough for my use case." + "ground_truth": "A NemoClaw-specific answer that helps the user see the pace and nature of recent changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to judge whether the project feels stable enough for my use case.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/release-notes.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/.agents/skills/nemoclaw-user-reference/evals/evals.json b/.agents/skills/nemoclaw-user-reference/evals/evals.json index d303e6d5cd..3ac767ebcc 100644 --- a/.agents/skills/nemoclaw-user-reference/evals/evals.json +++ b/.agents/skills/nemoclaw-user-reference/evals/evals.json @@ -3,90 +3,165 @@ "id": "docs-reference-architecture-001", "question": "I'm using the architecture reference. Help me verify implementation and operations details so I can make changes or debug behavior from the right mental model.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user verify implementation and operations details and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make changes or debug behavior from the right mental model." + "ground_truth": "A NemoClaw-specific answer that helps the user verify implementation and operations details and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make changes or debug behavior from the right mental model.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/architecture.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-architecture-002", "question": "I'm locating the right layer for a change. Help me connect structure, lifecycle, and ownership so I can edit the plugin, blueprint, or sandbox layer intentionally.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user connect structure, lifecycle, and ownership and gives enough concrete guidance, decision criteria, verification steps, or risk framing to edit the plugin, blueprint, or sandbox layer intentionally." + "ground_truth": "A NemoClaw-specific answer that helps the user connect structure, lifecycle, and ownership and gives enough concrete guidance, decision criteria, verification steps, or risk framing to edit the plugin, blueprint, or sandbox layer intentionally.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/architecture.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-architecture-003", "question": "I'm modifying architecture-sensitive files. Help me understand the invariants that must remain true so I can avoid breaking sandbox creation or operation.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand the invariants that must remain true and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid breaking sandbox creation or operation." + "ground_truth": "A NemoClaw-specific answer that helps the user understand the invariants that must remain true and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid breaking sandbox creation or operation.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/architecture.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-cli-selection-guide-001", "question": "I'm choosing between `nemoclaw` and `openshell`. Help me pick the command surface that owns my operation so I can complete the task without breaking NemoClaw management.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user pick the command surface that owns my operation and gives enough concrete guidance, decision criteria, verification steps, or risk framing to complete the task without breaking NemoClaw management." + "ground_truth": "A NemoClaw-specific answer that helps the user pick the command surface that owns my operation and gives enough concrete guidance, decision criteria, verification steps, or risk framing to complete the task without breaking NemoClaw management.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/cli-selection-guide.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-cli-selection-guide-002", "question": "I'm tempted to run a low-level OpenShell command. Help me understand when it bypasses NemoClaw lifecycle expectations so I can avoid unsupported or confusing states.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand when it bypasses NemoClaw lifecycle expectations and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid unsupported or confusing states." + "ground_truth": "A NemoClaw-specific answer that helps the user understand when it bypasses NemoClaw lifecycle expectations and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid unsupported or confusing states.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/cli-selection-guide.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-cli-selection-guide-003", "question": "I'm doing onboarding, policy, debugging, or low-level sandbox work. Help me apply a simple CLI selection rule so I can move quickly without memorizing every command boundary.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user apply a simple CLI selection rule and gives enough concrete guidance, decision criteria, verification steps, or risk framing to move quickly without memorizing every command boundary." + "ground_truth": "A NemoClaw-specific answer that helps the user apply a simple CLI selection rule and gives enough concrete guidance, decision criteria, verification steps, or risk framing to move quickly without memorizing every command boundary.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/cli-selection-guide.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-commands-001", "question": "I'm opening the CLI command reference. Help me find the exact action, flag, or recovery path I need so I can run the right command without scanning source code.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user find the exact action, flag, or recovery path I need and gives enough concrete guidance, decision criteria, verification steps, or risk framing to run the right command without scanning source code." + "ground_truth": "A NemoClaw-specific answer that helps the user find the exact action, flag, or recovery path I need and gives enough concrete guidance, decision criteria, verification steps, or risk framing to run the right command without scanning source code.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/commands.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-commands-002", "question": "I'm choosing among setup, lifecycle, policy, inference, and diagnostic commands. Help me see command groups and examples clearly so I can locate the right command faster.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user see command groups and examples clearly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to locate the right command faster." + "ground_truth": "A NemoClaw-specific answer that helps the user see command groups and examples clearly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to locate the right command faster.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/commands.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-commands-003", "question": "I'm using `nemoclaw` in scripts or CI. Help me understand example output and exit behavior so I can automate safely and handle failures correctly.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand example output and exit behavior and gives enough concrete guidance, decision criteria, verification steps, or risk framing to automate safely and handle failures correctly." + "ground_truth": "A NemoClaw-specific answer that helps the user understand example output and exit behavior and gives enough concrete guidance, decision criteria, verification steps, or risk framing to automate safely and handle failures correctly.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/commands.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-network-policies-001", "question": "I'm reading the network policy reference. Help me understand baseline access decisions so I can assess the sandbox's default exposure.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand baseline access decisions and gives enough concrete guidance, decision criteria, verification steps, or risk framing to assess the sandbox's default exposure." + "ground_truth": "A NemoClaw-specific answer that helps the user understand baseline access decisions and gives enough concrete guidance, decision criteria, verification steps, or risk framing to assess the sandbox's default exposure.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/network-policies.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-network-policies-002", "question": "I'm reviewing default rules and filesystem constraints. Help me know what is allowed before any customization so I can decide what needs to change for my environment.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user know what is allowed before any customization and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide what needs to change for my environment." + "ground_truth": "A NemoClaw-specific answer that helps the user know what is allowed before any customization and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide what needs to change for my environment.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/network-policies.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-network-policies-003", "question": "I'm tracing a denied request. Help me understand how operator approval turns into policy so I can audit and reason about egress changes.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand how operator approval turns into policy and gives enough concrete guidance, decision criteria, verification steps, or risk framing to audit and reason about egress changes." + "ground_truth": "A NemoClaw-specific answer that helps the user understand how operator approval turns into policy and gives enough concrete guidance, decision criteria, verification steps, or risk framing to audit and reason about egress changes.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/network-policies.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-troubleshooting-001", "question": "I'm troubleshooting a NemoClaw problem. Help me map my symptom to the fastest safe recovery step so I can get unstuck without making the environment worse.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user map my symptom to the fastest safe recovery step and gives enough concrete guidance, decision criteria, verification steps, or risk framing to get unstuck without making the environment worse." + "ground_truth": "A NemoClaw-specific answer that helps the user map my symptom to the fastest safe recovery step and gives enough concrete guidance, decision criteria, verification steps, or risk framing to get unstuck without making the environment worse.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/troubleshooting.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-troubleshooting-002", "question": "I'm uncertain which subsystem failed. Help me run diagnostics that separate install, onboarding, runtime, policy, and inference issues so I can focus on the likely cause.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user run diagnostics that separate install, onboarding, runtime, policy, and inference issues and gives enough concrete guidance, decision criteria, verification steps, or risk framing to focus on the likely cause." + "ground_truth": "A NemoClaw-specific answer that helps the user run diagnostics that separate install, onboarding, runtime, policy, and inference issues and gives enough concrete guidance, decision criteria, verification steps, or risk framing to focus on the likely cause.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/troubleshooting.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-troubleshooting-003", "question": "I'm asking the community or filing an issue. Help me collect the right logs, versions, and error details so I can give maintainers enough information to help.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user collect the right logs, versions, and error details and gives enough concrete guidance, decision criteria, verification steps, or risk framing to give maintainers enough information to help." + "ground_truth": "A NemoClaw-specific answer that helps the user collect the right logs, versions, and error details and gives enough concrete guidance, decision criteria, verification steps, or risk framing to give maintainers enough information to help.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/troubleshooting.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/skills/nemoclaw-user-agent-skills/evals/evals.json b/skills/nemoclaw-user-agent-skills/evals/evals.json index 6109419721..aaed26c0eb 100644 --- a/skills/nemoclaw-user-agent-skills/evals/evals.json +++ b/skills/nemoclaw-user-agent-skills/evals/evals.json @@ -3,18 +3,33 @@ "id": "docs-resources-agent-skills-001", "question": "I'm looking at NemoClaw agent skills. Help me find a skill that can guide installation, policy, inference, or operations so I can delegate the right workflow to my AI coding assistant.", "expected_skill": "nemoclaw-user-agent-skills", - "ground_truth": "A NemoClaw-specific answer that helps the user find a skill that can guide installation, policy, inference, or operations and gives enough concrete guidance, decision criteria, verification steps, or risk framing to delegate the right workflow to my AI coding assistant." + "ground_truth": "A NemoClaw-specific answer that helps the user find a skill that can guide installation, policy, inference, or operations and gives enough concrete guidance, decision criteria, verification steps, or risk framing to delegate the right workflow to my AI coding assistant.", + "expected_behavior": [ + "Uses the `nemoclaw-user-agent-skills` skill.", + "Uses `references/agent-skills.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-resources-agent-skills-002", "question": "I'm choosing among multiple NemoClaw skills. Help me understand what each skill is designed to do so I can avoid using a broad assistant when a targeted skill exists.", "expected_skill": "nemoclaw-user-agent-skills", - "ground_truth": "A NemoClaw-specific answer that helps the user understand what each skill is designed to do and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid using a broad assistant when a targeted skill exists." + "ground_truth": "A NemoClaw-specific answer that helps the user understand what each skill is designed to do and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid using a broad assistant when a targeted skill exists.", + "expected_behavior": [ + "Uses the `nemoclaw-user-agent-skills` skill.", + "Uses `references/agent-skills.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-resources-agent-skills-003", "question": "I'm letting an agent follow NemoClaw-specific instructions. Help me see why the skill guidance is trustworthy and scoped so I can use agent assistance without losing operational control.", "expected_skill": "nemoclaw-user-agent-skills", - "ground_truth": "A NemoClaw-specific answer that helps the user see why the skill guidance is trustworthy and scoped and gives enough concrete guidance, decision criteria, verification steps, or risk framing to use agent assistance without losing operational control." + "ground_truth": "A NemoClaw-specific answer that helps the user see why the skill guidance is trustworthy and scoped and gives enough concrete guidance, decision criteria, verification steps, or risk framing to use agent assistance without losing operational control.", + "expected_behavior": [ + "Uses the `nemoclaw-user-agent-skills` skill.", + "Uses `references/agent-skills.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/skills/nemoclaw-user-configure-inference/evals/evals.json b/skills/nemoclaw-user-configure-inference/evals/evals.json index a0bd47ac29..47d3ac8937 100644 --- a/skills/nemoclaw-user-configure-inference/evals/evals.json +++ b/skills/nemoclaw-user-configure-inference/evals/evals.json @@ -3,90 +3,165 @@ "id": "docs-inference-inference-options-001", "question": "I'm choosing an inference option during onboarding. Help me compare hosted providers, local servers, and compatible endpoints so I can select a model path that fits my privacy, cost, and reliability needs.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user compare hosted providers, local servers, and compatible endpoints and gives enough concrete guidance, decision criteria, verification steps, or risk framing to select a model path that fits my privacy, cost, and reliability needs." + "ground_truth": "A NemoClaw-specific answer that helps the user compare hosted providers, local servers, and compatible endpoints and gives enough concrete guidance, decision criteria, verification steps, or risk framing to select a model path that fits my privacy, cost, and reliability needs.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/inference-options.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-inference-options-002", "question": "I'm preparing provider credentials. Help me know which provider capabilities and secrets onboarding requires so I can complete setup without avoidable credential errors.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user know which provider capabilities and secrets onboarding requires and gives enough concrete guidance, decision criteria, verification steps, or risk framing to complete setup without avoidable credential errors." + "ground_truth": "A NemoClaw-specific answer that helps the user know which provider capabilities and secrets onboarding requires and gives enough concrete guidance, decision criteria, verification steps, or risk framing to complete setup without avoidable credential errors.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/inference-options.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-inference-options-003", "question": "I'm evaluating routed inference. Help me understand how the sandbox calls models through the gateway so I can trust that model credentials stay outside the sandbox.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand how the sandbox calls models through the gateway and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust that model credentials stay outside the sandbox." + "ground_truth": "A NemoClaw-specific answer that helps the user understand how the sandbox calls models through the gateway and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust that model credentials stay outside the sandbox.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/inference-options.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-use-local-inference-001", "question": "I'm connecting a local inference server. Help me route NemoClaw model traffic to Ollama, vLLM, TensorRT-LLM, NIM, or another compatible endpoint so I can meet privacy, latency, or cost goals.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user route NemoClaw model traffic to Ollama, vLLM, TensorRT-LLM, NIM, or another compatible endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to meet privacy, latency, or cost goals." + "ground_truth": "A NemoClaw-specific answer that helps the user route NemoClaw model traffic to Ollama, vLLM, TensorRT-LLM, NIM, or another compatible endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to meet privacy, latency, or cost goals.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/use-local-inference-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-use-local-inference-002", "question": "I'm debugging local endpoint reachability. Help me separate NemoClaw routing issues from model-server issues so I can fix the right component first.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user separate NemoClaw routing issues from model-server issues and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix the right component first." + "ground_truth": "A NemoClaw-specific answer that helps the user separate NemoClaw routing issues from model-server issues and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix the right component first.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/use-local-inference-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-use-local-inference-003", "question": "I'm configuring traffic through `inference.local`. Help me understand the required host, port, and model settings so I can make sandboxed inference calls resolve to my local server.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand the required host, port, and model settings and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make sandboxed inference calls resolve to my local server." + "ground_truth": "A NemoClaw-specific answer that helps the user understand the required host, port, and model settings and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make sandboxed inference calls resolve to my local server.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/use-local-inference-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-switch-inference-providers-001", "question": "I'm switching inference models during a running session. Help me change model behavior without restarting the sandbox so I can adapt to task, cost, or reliability needs quickly.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user change model behavior without restarting the sandbox and gives enough concrete guidance, decision criteria, verification steps, or risk framing to adapt to task, cost, or reliability needs quickly." + "ground_truth": "A NemoClaw-specific answer that helps the user change model behavior without restarting the sandbox and gives enough concrete guidance, decision criteria, verification steps, or risk framing to adapt to task, cost, or reliability needs quickly.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/switch-inference-providers.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-switch-inference-providers-002", "question": "I'm confirming a runtime model change. Help me verify the agent is using the new active model so I can avoid mistaking host configuration changes for live routing changes.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user verify the agent is using the new active model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid mistaking host configuration changes for live routing changes." + "ground_truth": "A NemoClaw-specific answer that helps the user verify the agent is using the new active model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid mistaking host configuration changes for live routing changes.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/switch-inference-providers.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-switch-inference-providers-003", "question": "I'm trying a different model during active work. Help me know how to roll back to the previous model so I can experiment without disrupting the assistant workflow.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user know how to roll back to the previous model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to experiment without disrupting the assistant workflow." + "ground_truth": "A NemoClaw-specific answer that helps the user know how to roll back to the previous model and gives enough concrete guidance, decision criteria, verification steps, or risk framing to experiment without disrupting the assistant workflow.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/switch-inference-providers.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-set-up-sub-agent-001", "question": "I'm configuring a task-specific sub-agent. Help me assign a specialized model to work the default agent should not handle so I can improve task fit without changing the whole assistant.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user assign a specialized model to work the default agent should not handle and gives enough concrete guidance, decision criteria, verification steps, or risk framing to improve task fit without changing the whole assistant." + "ground_truth": "A NemoClaw-specific answer that helps the user assign a specialized model to work the default agent should not handle and gives enough concrete guidance, decision criteria, verification steps, or risk framing to improve task fit without changing the whole assistant.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/set-up-sub-agent.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-set-up-sub-agent-002", "question": "I'm editing sub-agent model configuration. Help me understand where files, credentials, and workspace settings live so I can avoid leaking secrets or changing the wrong agent.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand where files, credentials, and workspace settings live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid leaking secrets or changing the wrong agent." + "ground_truth": "A NemoClaw-specific answer that helps the user understand where files, credentials, and workspace settings live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid leaking secrets or changing the wrong agent.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/set-up-sub-agent.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-set-up-sub-agent-003", "question": "I'm testing a new sub-agent. Help me send a prompt that exercises the intended routing so I can prove it uses the expected provider and model.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user send a prompt that exercises the intended routing and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prove it uses the expected provider and model." + "ground_truth": "A NemoClaw-specific answer that helps the user send a prompt that exercises the intended routing and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prove it uses the expected provider and model.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/set-up-sub-agent.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-tool-calling-reliability-001", "question": "I'm seeing tool calls leak as plain text. Help me diagnose whether the model, server, or parser is incompatible so I can restore reliable tool execution.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user diagnose whether the model, server, or parser is incompatible and gives enough concrete guidance, decision criteria, verification steps, or risk framing to restore reliable tool execution." + "ground_truth": "A NemoClaw-specific answer that helps the user diagnose whether the model, server, or parser is incompatible and gives enough concrete guidance, decision criteria, verification steps, or risk framing to restore reliable tool execution.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/tool-calling-reliability.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-tool-calling-reliability-002", "question": "I'm comparing local inference runtimes. Help me understand whether Ollama, vLLM, or parser settings better support tool calls so I can choose a runtime that matches the agent's tool needs.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand whether Ollama, vLLM, or parser settings better support tool calls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose a runtime that matches the agent's tool needs." + "ground_truth": "A NemoClaw-specific answer that helps the user understand whether Ollama, vLLM, or parser settings better support tool calls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose a runtime that matches the agent's tool needs.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/tool-calling-reliability.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-inference-tool-calling-reliability-003", "question": "I'm letting an always-on assistant use tools unattended. Help me define the reliability bar for local tool calling so I can avoid silent failures or unsafe plain-text tool outputs.", "expected_skill": "nemoclaw-user-configure-inference", - "ground_truth": "A NemoClaw-specific answer that helps the user define the reliability bar for local tool calling and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid silent failures or unsafe plain-text tool outputs." + "ground_truth": "A NemoClaw-specific answer that helps the user define the reliability bar for local tool calling and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid silent failures or unsafe plain-text tool outputs.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-inference` skill.", + "Uses `references/tool-calling-reliability.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/skills/nemoclaw-user-configure-security/evals/evals.json b/skills/nemoclaw-user-configure-security/evals/evals.json index 9e17d64983..a3757d87da 100644 --- a/skills/nemoclaw-user-configure-security/evals/evals.json +++ b/skills/nemoclaw-user-configure-security/evals/evals.json @@ -3,54 +3,99 @@ "id": "docs-security-best-practices-001", "question": "I'm evaluating NemoClaw security best practices. Help me understand the risk posture of each configurable control so I can justify the setup to my team or security reviewers.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user understand the risk posture of each configurable control and gives enough concrete guidance, decision criteria, verification steps, or risk framing to justify the setup to my team or security reviewers." + "ground_truth": "A NemoClaw-specific answer that helps the user understand the risk posture of each configurable control and gives enough concrete guidance, decision criteria, verification steps, or risk framing to justify the setup to my team or security reviewers.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/best-practices.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-best-practices-002", "question": "I'm balancing developer convenience with lockdown. Help me compare the trade-offs of changing security controls so I can choose a posture that fits the environment.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user compare the trade-offs of changing security controls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose a posture that fits the environment." + "ground_truth": "A NemoClaw-specific answer that helps the user compare the trade-offs of changing security controls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose a posture that fits the environment.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/best-practices.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-best-practices-003", "question": "I'm preparing for production-like use. Help me see which defaults are acceptable and which require changes so I can avoid shipping with accidental weak spots.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user see which defaults are acceptable and which require changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid shipping with accidental weak spots." + "ground_truth": "A NemoClaw-specific answer that helps the user see which defaults are acceptable and which require changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid shipping with accidental weak spots.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/best-practices.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-credential-storage-001", "question": "I'm inspecting NemoClaw credential storage. Help me verify how secrets are stored and protected so I can decide whether the setup meets my secret-handling expectations.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user verify how secrets are stored and protected and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether the setup meets my secret-handling expectations." + "ground_truth": "A NemoClaw-specific answer that helps the user verify how secrets are stored and protected and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether the setup meets my secret-handling expectations.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/credential-storage.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-credential-storage-002", "question": "I'm tracing where credentials live. Help me distinguish host, gateway, and sandbox storage boundaries so I can avoid assuming secrets are available in the wrong place.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user distinguish host, gateway, and sandbox storage boundaries and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid assuming secrets are available in the wrong place." + "ground_truth": "A NemoClaw-specific answer that helps the user distinguish host, gateway, and sandbox storage boundaries and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid assuming secrets are available in the wrong place.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/credential-storage.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-credential-storage-003", "question": "I'm rotating or inspecting credentials. Help me follow a workflow that does not print secrets in logs or docs so I can recover or update access safely.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user follow a workflow that does not print secrets in logs or docs and gives enough concrete guidance, decision criteria, verification steps, or risk framing to recover or update access safely." + "ground_truth": "A NemoClaw-specific answer that helps the user follow a workflow that does not print secrets in logs or docs and gives enough concrete guidance, decision criteria, verification steps, or risk framing to recover or update access safely.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/credential-storage.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-openclaw-controls-001", "question": "I'm reading about controls outside NemoClaw's scope. Help me understand which security responsibilities remain with OpenClaw so I can avoid treating sandbox isolation as a complete application security model.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user understand which security responsibilities remain with OpenClaw and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid treating sandbox isolation as a complete application security model." + "ground_truth": "A NemoClaw-specific answer that helps the user understand which security responsibilities remain with OpenClaw and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid treating sandbox isolation as a complete application security model.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/openclaw-controls.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-openclaw-controls-002", "question": "I'm assessing application-layer agent risk. Help me identify the controls NemoClaw does not add so I can plan separate mitigations for authentication, prompt handling, and agent behavior.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user identify the controls NemoClaw does not add and gives enough concrete guidance, decision criteria, verification steps, or risk framing to plan separate mitigations for authentication, prompt handling, and agent behavior." + "ground_truth": "A NemoClaw-specific answer that helps the user identify the controls NemoClaw does not add and gives enough concrete guidance, decision criteria, verification steps, or risk framing to plan separate mitigations for authentication, prompt handling, and agent behavior.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/openclaw-controls.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-security-openclaw-controls-003", "question": "I'm documenting the security boundary. Help me explain where NemoClaw protection ends so I can set accurate expectations for reviewers and operators.", "expected_skill": "nemoclaw-user-configure-security", - "ground_truth": "A NemoClaw-specific answer that helps the user explain where NemoClaw protection ends and gives enough concrete guidance, decision criteria, verification steps, or risk framing to set accurate expectations for reviewers and operators." + "ground_truth": "A NemoClaw-specific answer that helps the user explain where NemoClaw protection ends and gives enough concrete guidance, decision criteria, verification steps, or risk framing to set accurate expectations for reviewers and operators.", + "expected_behavior": [ + "Uses the `nemoclaw-user-configure-security` skill.", + "Uses `references/openclaw-controls.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/skills/nemoclaw-user-deploy-remote/evals/evals.json b/skills/nemoclaw-user-deploy-remote/evals/evals.json index 41af478f9e..1cf519758c 100644 --- a/skills/nemoclaw-user-deploy-remote/evals/evals.json +++ b/skills/nemoclaw-user-deploy-remote/evals/evals.json @@ -3,72 +3,132 @@ "id": "docs-deployment-deploy-to-remote-gpu-001", "question": "I'm deploying NemoClaw to a remote GPU instance. Help me move the sandboxed assistant off my local machine so I can support persistent or GPU-backed operation.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user move the sandboxed assistant off my local machine and gives enough concrete guidance, decision criteria, verification steps, or risk framing to support persistent or GPU-backed operation." + "ground_truth": "A NemoClaw-specific answer that helps the user move the sandboxed assistant off my local machine and gives enough concrete guidance, decision criteria, verification steps, or risk framing to support persistent or GPU-backed operation.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `SKILL.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-deploy-to-remote-gpu-002", "question": "I'm using the legacy Brev compatibility flow. Help me understand what the flow still does and where it is deprecated so I can avoid depending on an outdated path blindly.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user understand what the flow still does and where it is deprecated and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid depending on an outdated path blindly." + "ground_truth": "A NemoClaw-specific answer that helps the user understand what the flow still does and where it is deprecated and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid depending on an outdated path blindly.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `SKILL.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-deploy-to-remote-gpu-003", "question": "I'm after remote deployment succeeds. Help me find the connection, operation, and recovery details so I can operate the sandbox after initial setup.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user find the connection, operation, and recovery details and gives enough concrete guidance, decision criteria, verification steps, or risk framing to operate the sandbox after initial setup." + "ground_truth": "A NemoClaw-specific answer that helps the user find the connection, operation, and recovery details and gives enough concrete guidance, decision criteria, verification steps, or risk framing to operate the sandbox after initial setup.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `SKILL.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-brev-web-ui-001", "question": "I'm launching NemoClaw from the Brev web UI. Help me avoid local CLI setup and local GPU requirements so I can start a hosted sandbox quickly.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user avoid local CLI setup and local GPU requirements and gives enough concrete guidance, decision criteria, verification steps, or risk framing to start a hosted sandbox quickly." + "ground_truth": "A NemoClaw-specific answer that helps the user avoid local CLI setup and local GPU requirements and gives enough concrete guidance, decision criteria, verification steps, or risk framing to start a hosted sandbox quickly.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/brev-web-ui.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-brev-web-ui-002", "question": "I'm reviewing hosted launch choices. Help me understand each web UI option before creating the instance so I can choose settings that match my expected sandbox workflow.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user understand each web UI option before creating the instance and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose settings that match my expected sandbox workflow." + "ground_truth": "A NemoClaw-specific answer that helps the user understand each web UI option before creating the instance and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose settings that match my expected sandbox workflow.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/brev-web-ui.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-brev-web-ui-003", "question": "I'm the hosted sandbox is created. Help me confirm where to connect and how to start using it so I can move from provisioning to actual agent work.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user confirm where to connect and how to start using it and gives enough concrete guidance, decision criteria, verification steps, or risk framing to move from provisioning to actual agent work." + "ground_truth": "A NemoClaw-specific answer that helps the user confirm where to connect and how to start using it and gives enough concrete guidance, decision criteria, verification steps, or risk framing to move from provisioning to actual agent work.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/brev-web-ui.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-install-openclaw-plugins-001", "question": "I'm installing an OpenClaw plugin in a NemoClaw-managed sandbox. Help me add a new agent capability inside the sandbox so I can extend the assistant without weakening the host boundary.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user add a new agent capability inside the sandbox and gives enough concrete guidance, decision criteria, verification steps, or risk framing to extend the assistant without weakening the host boundary." + "ground_truth": "A NemoClaw-specific answer that helps the user add a new agent capability inside the sandbox and gives enough concrete guidance, decision criteria, verification steps, or risk framing to extend the assistant without weakening the host boundary.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/install-openclaw-plugins.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-install-openclaw-plugins-002", "question": "I'm deciding where to install a plugin. Help me distinguish host environment changes from sandbox environment changes so I can modify the right filesystem and runtime.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user distinguish host environment changes from sandbox environment changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to modify the right filesystem and runtime." + "ground_truth": "A NemoClaw-specific answer that helps the user distinguish host environment changes from sandbox environment changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to modify the right filesystem and runtime.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/install-openclaw-plugins.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-install-openclaw-plugins-003", "question": "I'm verifying a plugin installation. Help me confirm the agent can discover and use the plugin so I can trust that the capability works inside NemoClaw's security model.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user confirm the agent can discover and use the plugin and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust that the capability works inside NemoClaw's security model." + "ground_truth": "A NemoClaw-specific answer that helps the user confirm the agent can discover and use the plugin and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust that the capability works inside NemoClaw's security model.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/install-openclaw-plugins.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-sandbox-hardening-001", "question": "I'm reviewing sandbox image hardening. Help me understand which container risks NemoClaw reduces so I can decide whether unattended agents are acceptable in my environment.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user understand which container risks NemoClaw reduces and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether unattended agents are acceptable in my environment." + "ground_truth": "A NemoClaw-specific answer that helps the user understand which container risks NemoClaw reduces and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether unattended agents are acceptable in my environment.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/sandbox-hardening.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-sandbox-hardening-002", "question": "I'm mapping NemoClaw to an organizational security baseline. Help me identify capability drops, least privilege, and runtime protections so I can document how the sandbox meets or misses required controls.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user identify capability drops, least privilege, and runtime protections and gives enough concrete guidance, decision criteria, verification steps, or risk framing to document how the sandbox meets or misses required controls." + "ground_truth": "A NemoClaw-specific answer that helps the user identify capability drops, least privilege, and runtime protections and gives enough concrete guidance, decision criteria, verification steps, or risk framing to document how the sandbox meets or misses required controls.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/sandbox-hardening.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-deployment-sandbox-hardening-003", "question": "I'm considering production use. Help me see the limitations and residual risks of the hardened image so I can avoid overstating what container hardening guarantees.", "expected_skill": "nemoclaw-user-deploy-remote", - "ground_truth": "A NemoClaw-specific answer that helps the user see the limitations and residual risks of the hardened image and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid overstating what container hardening guarantees." + "ground_truth": "A NemoClaw-specific answer that helps the user see the limitations and residual risks of the hardened image and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid overstating what container hardening guarantees.", + "expected_behavior": [ + "Uses the `nemoclaw-user-deploy-remote` skill.", + "Uses `references/sandbox-hardening.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/skills/nemoclaw-user-get-started/evals/evals.json b/skills/nemoclaw-user-get-started/evals/evals.json index 946c2dfea5..6cbfddba24 100644 --- a/skills/nemoclaw-user-get-started/evals/evals.json +++ b/skills/nemoclaw-user-get-started/evals/evals.json @@ -3,72 +3,132 @@ "id": "docs-get-started-prerequisites-001", "question": "I'm checking prerequisites before installation. Help me verify my host has the required hardware, software, and platform support so I can avoid a failed first setup.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user verify my host has the required hardware, software, and platform support and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid a failed first setup." + "ground_truth": "A NemoClaw-specific answer that helps the user verify my host has the required hardware, software, and platform support and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid a failed first setup.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/prerequisites.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-prerequisites-002", "question": "I'm using a machine with limited CPU, memory, disk, or Docker capacity. Help me understand the practical minimums and known bottlenecks so I can prepare the machine before onboarding starts.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user understand the practical minimums and known bottlenecks and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prepare the machine before onboarding starts." + "ground_truth": "A NemoClaw-specific answer that helps the user understand the practical minimums and known bottlenecks and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prepare the machine before onboarding starts.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/prerequisites.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-prerequisites-003", "question": "I'm choosing between local, Windows WSL, and remote GPU setup. Help me compare supported platform paths so I can start with the environment most likely to work.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user compare supported platform paths and gives enough concrete guidance, decision criteria, verification steps, or risk framing to start with the environment most likely to work." + "ground_truth": "A NemoClaw-specific answer that helps the user compare supported platform paths and gives enough concrete guidance, decision criteria, verification steps, or risk framing to start with the environment most likely to work.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/prerequisites.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-windows-preparation-001", "question": "I'm preparing a Windows machine for NemoClaw. Help me enable WSL 2, Ubuntu, and Docker Desktop correctly so I can enter the standard quickstart from a working Linux environment.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user enable WSL 2, Ubuntu, and Docker Desktop correctly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to enter the standard quickstart from a working Linux environment." + "ground_truth": "A NemoClaw-specific answer that helps the user enable WSL 2, Ubuntu, and Docker Desktop correctly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to enter the standard quickstart from a working Linux environment.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/windows-preparation.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-windows-preparation-002", "question": "I'm unsure whether Windows-specific setup is complete. Help me check the WSL and Docker integration steps that commonly block installs so I can fix host issues before running the NemoClaw installer.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user check the WSL and Docker integration steps that commonly block installs and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix host issues before running the NemoClaw installer." + "ground_truth": "A NemoClaw-specific answer that helps the user check the WSL and Docker integration steps that commonly block installs and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix host issues before running the NemoClaw installer.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/windows-preparation.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-windows-preparation-003", "question": "I'm ready to leave the Windows preparation guide. Help me confirm the Ubuntu shell can run the required commands so I can follow the quickstart without mixing Windows and Linux instructions.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user confirm the Ubuntu shell can run the required commands and gives enough concrete guidance, decision criteria, verification steps, or risk framing to follow the quickstart without mixing Windows and Linux instructions." + "ground_truth": "A NemoClaw-specific answer that helps the user confirm the Ubuntu shell can run the required commands and gives enough concrete guidance, decision criteria, verification steps, or risk framing to follow the quickstart without mixing Windows and Linux instructions.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/windows-preparation.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-quickstart-001", "question": "I'm running the OpenClaw quickstart. Help me install NemoClaw and create my first sandboxed agent so I can send a prompt to a working OpenClaw assistant.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user install NemoClaw and create my first sandboxed agent and gives enough concrete guidance, decision criteria, verification steps, or risk framing to send a prompt to a working OpenClaw assistant." + "ground_truth": "A NemoClaw-specific answer that helps the user install NemoClaw and create my first sandboxed agent and gives enough concrete guidance, decision criteria, verification steps, or risk framing to send a prompt to a working OpenClaw assistant.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/quickstart-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-quickstart-002", "question": "I'm encountering installer prompts or host preflight checks. Help me understand what the installer is asking for and why so I can continue setup without granting unnecessary access.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user understand what the installer is asking for and why and gives enough concrete guidance, decision criteria, verification steps, or risk framing to continue setup without granting unnecessary access." + "ground_truth": "A NemoClaw-specific answer that helps the user understand what the installer is asking for and why and gives enough concrete guidance, decision criteria, verification steps, or risk framing to continue setup without granting unnecessary access.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/quickstart-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-quickstart-003", "question": "I'm finishing onboarding. Help me verify the sandbox, inference route, and OpenClaw agent are connected so I can know the setup succeeded end to end.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user verify the sandbox, inference route, and OpenClaw agent are connected and gives enough concrete guidance, decision criteria, verification steps, or risk framing to know the setup succeeded end to end." + "ground_truth": "A NemoClaw-specific answer that helps the user verify the sandbox, inference route, and OpenClaw agent are connected and gives enough concrete guidance, decision criteria, verification steps, or risk framing to know the setup succeeded end to end.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/quickstart-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-quickstart-hermes-001", "question": "I'm choosing Hermes instead of OpenClaw. Help me launch a sandboxed Hermes API endpoint so I can serve the agent workflow my downstream clients expect.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user launch a sandboxed Hermes API endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to serve the agent workflow my downstream clients expect." + "ground_truth": "A NemoClaw-specific answer that helps the user launch a sandboxed Hermes API endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to serve the agent workflow my downstream clients expect.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/quickstart-hermes.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-quickstart-hermes-002", "question": "I'm moving through onboarding for Hermes. Help me confirm NemoClaw selected Hermes-specific setup and configuration so I can avoid accidentally creating the default OpenClaw environment.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user confirm NemoClaw selected Hermes-specific setup and configuration and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid accidentally creating the default OpenClaw environment." + "ground_truth": "A NemoClaw-specific answer that helps the user confirm NemoClaw selected Hermes-specific setup and configuration and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid accidentally creating the default OpenClaw environment.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/quickstart-hermes.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-get-started-quickstart-hermes-003", "question": "I'm checking the sandboxed Hermes endpoint. Help me run a small request that proves the endpoint is live so I can hand it to clients or tests with confidence.", "expected_skill": "nemoclaw-user-get-started", - "ground_truth": "A NemoClaw-specific answer that helps the user run a small request that proves the endpoint is live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to hand it to clients or tests with confidence." + "ground_truth": "A NemoClaw-specific answer that helps the user run a small request that proves the endpoint is live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to hand it to clients or tests with confidence.", + "expected_behavior": [ + "Uses the `nemoclaw-user-get-started` skill.", + "Uses `references/quickstart-hermes.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/skills/nemoclaw-user-manage-policy/evals/evals.json b/skills/nemoclaw-user-manage-policy/evals/evals.json index 26bb32a8fd..c3b1734a1f 100644 --- a/skills/nemoclaw-user-manage-policy/evals/evals.json +++ b/skills/nemoclaw-user-manage-policy/evals/evals.json @@ -3,54 +3,99 @@ "id": "docs-network-policy-customize-network-policy-001", "question": "I'm customizing sandbox network policy. Help me allow the agent to reach a required external service so I can enable the integration while preserving least privilege.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user allow the agent to reach a required external service and gives enough concrete guidance, decision criteria, verification steps, or risk framing to enable the integration while preserving least privilege." + "ground_truth": "A NemoClaw-specific answer that helps the user allow the agent to reach a required external service and gives enough concrete guidance, decision criteria, verification steps, or risk framing to enable the integration while preserving least privilege.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/customize-network-policy-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-customize-network-policy-002", "question": "I'm writing an egress rule. Help me specify the minimum necessary host, port, and protocol so I can avoid opening broader access than the agent needs.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user specify the minimum necessary host, port, and protocol and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid opening broader access than the agent needs." + "ground_truth": "A NemoClaw-specific answer that helps the user specify the minimum necessary host, port, and protocol and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid opening broader access than the agent needs.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/customize-network-policy-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-customize-network-policy-003", "question": "I'm validating a policy change. Help me test that the intended integration works and unrelated egress remains blocked so I can ship a safer policy update.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user test that the intended integration works and unrelated egress remains blocked and gives enough concrete guidance, decision criteria, verification steps, or risk framing to ship a safer policy update." + "ground_truth": "A NemoClaw-specific answer that helps the user test that the intended integration works and unrelated egress remains blocked and gives enough concrete guidance, decision criteria, verification steps, or risk framing to ship a safer policy update.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/customize-network-policy-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-approve-network-requests-001", "question": "I'm reviewing a blocked network request. Help me understand why the agent wants to reach that endpoint so I can approve only requests that support the current job.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user understand why the agent wants to reach that endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to approve only requests that support the current job." + "ground_truth": "A NemoClaw-specific answer that helps the user understand why the agent wants to reach that endpoint and gives enough concrete guidance, decision criteria, verification steps, or risk framing to approve only requests that support the current job.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/approve-network-requests.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-approve-network-requests-002", "question": "I'm using the approval UI. Help me spot unexpected or prompt-injection-driven egress so I can deny suspicious access before it becomes policy.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user spot unexpected or prompt-injection-driven egress and gives enough concrete guidance, decision criteria, verification steps, or risk framing to deny suspicious access before it becomes policy." + "ground_truth": "A NemoClaw-specific answer that helps the user spot unexpected or prompt-injection-driven egress and gives enough concrete guidance, decision criteria, verification steps, or risk framing to deny suspicious access before it becomes policy.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/approve-network-requests.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-approve-network-requests-003", "question": "I'm after approving or denying a request. Help me understand audit, rollback, and policy update behavior so I can keep operator decisions traceable.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user understand audit, rollback, and policy update behavior and gives enough concrete guidance, decision criteria, verification steps, or risk framing to keep operator decisions traceable." + "ground_truth": "A NemoClaw-specific answer that helps the user understand audit, rollback, and policy update behavior and gives enough concrete guidance, decision criteria, verification steps, or risk framing to keep operator decisions traceable.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/approve-network-requests.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-integration-policy-examples-001", "question": "I'm following an integration policy example. Help me enable a common third-party workflow quickly so I can avoid writing a policy from scratch.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user enable a common third-party workflow quickly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid writing a policy from scratch." + "ground_truth": "A NemoClaw-specific answer that helps the user enable a common third-party workflow quickly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid writing a policy from scratch.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/integration-policy-examples.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-integration-policy-examples-002", "question": "I'm adapting an example to my organization. Help me replace sample hosts and ports with exact production endpoints so I can create a policy that matches our real integration.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user replace sample hosts and ports with exact production endpoints and gives enough concrete guidance, decision criteria, verification steps, or risk framing to create a policy that matches our real integration." + "ground_truth": "A NemoClaw-specific answer that helps the user replace sample hosts and ports with exact production endpoints and gives enough concrete guidance, decision criteria, verification steps, or risk framing to create a policy that matches our real integration.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/integration-policy-examples.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-network-policy-integration-policy-examples-003", "question": "I'm copying an example into a stricter environment. Help me identify broad rules or assumptions that need tightening so I can avoid weakening production egress controls.", "expected_skill": "nemoclaw-user-manage-policy", - "ground_truth": "A NemoClaw-specific answer that helps the user identify broad rules or assumptions that need tightening and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid weakening production egress controls." + "ground_truth": "A NemoClaw-specific answer that helps the user identify broad rules or assumptions that need tightening and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid weakening production egress controls.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-policy` skill.", + "Uses `references/integration-policy-examples.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/skills/nemoclaw-user-manage-sandboxes/evals/evals.json b/skills/nemoclaw-user-manage-sandboxes/evals/evals.json index ff6af55509..e875fa8c95 100644 --- a/skills/nemoclaw-user-manage-sandboxes/evals/evals.json +++ b/skills/nemoclaw-user-manage-sandboxes/evals/evals.json @@ -3,90 +3,165 @@ "id": "docs-manage-sandboxes-lifecycle-001", "question": "I'm managing a NemoClaw sandbox. Help me check status, health, logs, ports, providers, upgrades, and uninstall paths so I can operate the sandbox safely after quickstart.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user check status, health, logs, ports, providers, upgrades, and uninstall paths and gives enough concrete guidance, decision criteria, verification steps, or risk framing to operate the sandbox safely after quickstart." + "ground_truth": "A NemoClaw-specific answer that helps the user check status, health, logs, ports, providers, upgrades, and uninstall paths and gives enough concrete guidance, decision criteria, verification steps, or risk framing to operate the sandbox safely after quickstart.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/lifecycle-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-lifecycle-002", "question": "I'm choosing a lifecycle command. Help me understand which commands inspect, restart, rebuild, or destroy state so I can avoid accidental data loss.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user understand which commands inspect, restart, rebuild, or destroy state and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid accidental data loss." + "ground_truth": "A NemoClaw-specific answer that helps the user understand which commands inspect, restart, rebuild, or destroy state and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid accidental data loss.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/lifecycle-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-lifecycle-003", "question": "I'm planning an upgrade, rebuild, or uninstall. Help me know when to preserve workspace files first so I can recover useful agent state after disruptive changes.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user know when to preserve workspace files first and gives enough concrete guidance, decision criteria, verification steps, or risk framing to recover useful agent state after disruptive changes." + "ground_truth": "A NemoClaw-specific answer that helps the user know when to preserve workspace files first and gives enough concrete guidance, decision criteria, verification steps, or risk framing to recover useful agent state after disruptive changes.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/lifecycle-details.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-runtime-controls-001", "question": "I'm changing a running sandbox. Help me know which controls can change without rebuild or re-onboarding so I can make safe adjustments with minimal downtime.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user know which controls can change without rebuild or re-onboarding and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make safe adjustments with minimal downtime." + "ground_truth": "A NemoClaw-specific answer that helps the user know which controls can change without rebuild or re-onboarding and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make safe adjustments with minimal downtime.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/runtime-controls.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-runtime-controls-002", "question": "I'm reviewing a runtime control. Help me classify it as hot-reloadable, rebuild-only, or onboarding-only so I can choose the correct operational path.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user classify it as hot-reloadable, rebuild-only, or onboarding-only and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose the correct operational path." + "ground_truth": "A NemoClaw-specific answer that helps the user classify it as hot-reloadable, rebuild-only, or onboarding-only and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose the correct operational path.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/runtime-controls.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-runtime-controls-003", "question": "I'm responding to an incident or risky agent behavior. Help me use `shields up`, `shields down`, and `shields status` correctly so I can tighten or inspect controls without confusion.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user use `shields up`, `shields down`, and `shields status` correctly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to tighten or inspect controls without confusion." + "ground_truth": "A NemoClaw-specific answer that helps the user use `shields up`, `shields down`, and `shields status` correctly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to tighten or inspect controls without confusion.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/runtime-controls.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-backup-restore-001", "question": "I'm backing up workspace files before a destructive operation. Help me preserve agent memory, identity, and useful configuration so I can rebuild or migrate without losing important state.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user preserve agent memory, identity, and useful configuration and gives enough concrete guidance, decision criteria, verification steps, or risk framing to rebuild or migrate without losing important state." + "ground_truth": "A NemoClaw-specific answer that helps the user preserve agent memory, identity, and useful configuration and gives enough concrete guidance, decision criteria, verification steps, or risk framing to rebuild or migrate without losing important state.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/backup-restore.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-backup-restore-002", "question": "I'm handling a workspace archive. Help me understand credential stripping and integrity checks so I can trust the archive without exposing secrets.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user understand credential stripping and integrity checks and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust the archive without exposing secrets." + "ground_truth": "A NemoClaw-specific answer that helps the user understand credential stripping and integrity checks and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust the archive without exposing secrets.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/backup-restore.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-backup-restore-003", "question": "I'm restoring workspace files. Help me verify the agent's useful memory returned so I can continue work without reintroducing sensitive host data.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user verify the agent's useful memory returned and gives enough concrete guidance, decision criteria, verification steps, or risk framing to continue work without reintroducing sensitive host data." + "ground_truth": "A NemoClaw-specific answer that helps the user verify the agent's useful memory returned and gives enough concrete guidance, decision criteria, verification steps, or risk framing to continue work without reintroducing sensitive host data.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/backup-restore.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-workspace-files-001", "question": "I'm inspecting workspace files. Help me understand where personality, identity, and configuration live so I can predict how the agent will behave across sessions.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user understand where personality, identity, and configuration live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to predict how the agent will behave across sessions." + "ground_truth": "A NemoClaw-specific answer that helps the user understand where personality, identity, and configuration live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to predict how the agent will behave across sessions.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/workspace-files.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-workspace-files-002", "question": "I'm adding durable instructions for the agent. Help me know which files persist and who owns them so I can put guidance in the right place.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user know which files persist and who owns them and gives enough concrete guidance, decision criteria, verification steps, or risk framing to put guidance in the right place." + "ground_truth": "A NemoClaw-specific answer that helps the user know which files persist and who owns them and gives enough concrete guidance, decision criteria, verification steps, or risk framing to put guidance in the right place.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/workspace-files.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-workspace-files-003", "question": "I'm restarting, rebuilding, or migrating a sandbox. Help me understand how each action affects workspace state so I can avoid losing or duplicating important files.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user understand how each action affects workspace state and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid losing or duplicating important files." + "ground_truth": "A NemoClaw-specific answer that helps the user understand how each action affects workspace state and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid losing or duplicating important files.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/workspace-files.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-messaging-channels-001", "question": "I'm connecting a messaging channel. Help me let users reach the sandboxed agent through Telegram, Discord, Slack, or another channel so I can support real-world always-on interactions.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user let users reach the sandboxed agent through Telegram, Discord, Slack, or another channel and gives enough concrete guidance, decision criteria, verification steps, or risk framing to support real-world always-on interactions." + "ground_truth": "A NemoClaw-specific answer that helps the user let users reach the sandboxed agent through Telegram, Discord, Slack, or another channel and gives enough concrete guidance, decision criteria, verification steps, or risk framing to support real-world always-on interactions.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/messaging-channels.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-messaging-channels-002", "question": "I'm configuring channel credentials and processes. Help me understand what OpenShell supervises and where secrets live so I can trust the messaging integration operationally.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user understand what OpenShell supervises and where secrets live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust the messaging integration operationally." + "ground_truth": "A NemoClaw-specific answer that helps the user understand what OpenShell supervises and where secrets live and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust the messaging integration operationally.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/messaging-channels.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-manage-sandboxes-messaging-channels-003", "question": "I'm testing a new messaging channel. Help me send and receive a message through the full path so I can prove the channel, gateway, and sandboxed agent are wired correctly.", "expected_skill": "nemoclaw-user-manage-sandboxes", - "ground_truth": "A NemoClaw-specific answer that helps the user send and receive a message through the full path and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prove the channel, gateway, and sandboxed agent are wired correctly." + "ground_truth": "A NemoClaw-specific answer that helps the user send and receive a message through the full path and gives enough concrete guidance, decision criteria, verification steps, or risk framing to prove the channel, gateway, and sandboxed agent are wired correctly.", + "expected_behavior": [ + "Uses the `nemoclaw-user-manage-sandboxes` skill.", + "Uses `references/messaging-channels.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/skills/nemoclaw-user-monitor-sandbox/evals/evals.json b/skills/nemoclaw-user-monitor-sandbox/evals/evals.json index 260e8ec64e..231936529b 100644 --- a/skills/nemoclaw-user-monitor-sandbox/evals/evals.json +++ b/skills/nemoclaw-user-monitor-sandbox/evals/evals.json @@ -3,18 +3,33 @@ "id": "docs-monitoring-monitor-sandbox-activity-001", "question": "I'm monitoring sandbox activity. Help me understand what the agent and sandbox are doing now so I can detect unhealthy or unexpected behavior early.", "expected_skill": "nemoclaw-user-monitor-sandbox", - "ground_truth": "A NemoClaw-specific answer that helps the user understand what the agent and sandbox are doing now and gives enough concrete guidance, decision criteria, verification steps, or risk framing to detect unhealthy or unexpected behavior early." + "ground_truth": "A NemoClaw-specific answer that helps the user understand what the agent and sandbox are doing now and gives enough concrete guidance, decision criteria, verification steps, or risk framing to detect unhealthy or unexpected behavior early.", + "expected_behavior": [ + "Uses the `nemoclaw-user-monitor-sandbox` skill.", + "Uses `SKILL.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-monitoring-monitor-sandbox-activity-002", "question": "I'm diagnosing a runtime failure. Help me use health, logs, and traces to locate the failing layer so I can separate host, gateway, sandbox, policy, and inference issues.", "expected_skill": "nemoclaw-user-monitor-sandbox", - "ground_truth": "A NemoClaw-specific answer that helps the user use health, logs, and traces to locate the failing layer and gives enough concrete guidance, decision criteria, verification steps, or risk framing to separate host, gateway, sandbox, policy, and inference issues." + "ground_truth": "A NemoClaw-specific answer that helps the user use health, logs, and traces to locate the failing layer and gives enough concrete guidance, decision criteria, verification steps, or risk framing to separate host, gateway, sandbox, policy, and inference issues.", + "expected_behavior": [ + "Uses the `nemoclaw-user-monitor-sandbox` skill.", + "Uses `SKILL.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-monitoring-monitor-sandbox-activity-003", "question": "I'm collecting debugging evidence. Help me gather enough information without weakening controls so I can investigate safely and share useful diagnostics.", "expected_skill": "nemoclaw-user-monitor-sandbox", - "ground_truth": "A NemoClaw-specific answer that helps the user gather enough information without weakening controls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to investigate safely and share useful diagnostics." + "ground_truth": "A NemoClaw-specific answer that helps the user gather enough information without weakening controls and gives enough concrete guidance, decision criteria, verification steps, or risk framing to investigate safely and share useful diagnostics.", + "expected_behavior": [ + "Uses the `nemoclaw-user-monitor-sandbox` skill.", + "Uses `SKILL.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/skills/nemoclaw-user-overview/evals/evals.json b/skills/nemoclaw-user-overview/evals/evals.json index fc8c3e0aca..f4b0b599ee 100644 --- a/skills/nemoclaw-user-overview/evals/evals.json +++ b/skills/nemoclaw-user-overview/evals/evals.json @@ -3,90 +3,165 @@ "id": "docs-index-001", "question": "I'm first arriving at the NemoClaw docs. Help me understand what NemoClaw helps me run and why it exists so I can decide whether it is worth installing before I spend time on setup.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user understand what NemoClaw helps me run and why it exists and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether it is worth installing before I spend time on setup." + "ground_truth": "A NemoClaw-specific answer that helps the user understand what NemoClaw helps me run and why it exists and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether it is worth installing before I spend time on setup.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/overview.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-index-002", "question": "I'm evaluating whether an always-on assistant can run safely in my environment. Help me see the core safety, lifecycle, and inference-routing promises up front so I can judge whether the stack matches my risk tolerance.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user see the core safety, lifecycle, and inference-routing promises up front and gives enough concrete guidance, decision criteria, verification steps, or risk framing to judge whether the stack matches my risk tolerance." + "ground_truth": "A NemoClaw-specific answer that helps the user see the core safety, lifecycle, and inference-routing promises up front and gives enough concrete guidance, decision criteria, verification steps, or risk framing to judge whether the stack matches my risk tolerance.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/overview.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-index-003", "question": "I'm considering the one-command install path. Help me know what the command will install, configure, and launch so I can take the next step without feeling like I am accepting an opaque shortcut.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user know what the command will install, configure, and launch and gives enough concrete guidance, decision criteria, verification steps, or risk framing to take the next step without feeling like I am accepting an opaque shortcut." + "ground_truth": "A NemoClaw-specific answer that helps the user know what the command will install, configure, and launch and gives enough concrete guidance, decision criteria, verification steps, or risk framing to take the next step without feeling like I am accepting an opaque shortcut.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/overview.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-overview-001", "question": "I'm explaining NemoClaw to a teammate. Help me summarize the product, stack, and value in plain language so I can align on whether NemoClaw is relevant to our agent workflow.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user summarize the product, stack, and value in plain language and gives enough concrete guidance, decision criteria, verification steps, or risk framing to align on whether NemoClaw is relevant to our agent workflow." + "ground_truth": "A NemoClaw-specific answer that helps the user summarize the product, stack, and value in plain language and gives enough concrete guidance, decision criteria, verification steps, or risk framing to align on whether NemoClaw is relevant to our agent workflow.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/overview.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-overview-002", "question": "I'm worried about security, cost, or operations risks from unattended agents. Help me understand which guardrails NemoClaw adds so I can decide whether sandboxed execution addresses my main concerns.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user understand which guardrails NemoClaw adds and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether sandboxed execution addresses my main concerns." + "ground_truth": "A NemoClaw-specific answer that helps the user understand which guardrails NemoClaw adds and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether sandboxed execution addresses my main concerns.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/overview.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-overview-003", "question": "I'm comparing NemoClaw with direct OpenClaw or OpenShell usage. Help me see the capabilities NemoClaw owns so I can classify it as the right reference stack rather than generic setup glue.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user see the capabilities NemoClaw owns and gives enough concrete guidance, decision criteria, verification steps, or risk framing to classify it as the right reference stack rather than generic setup glue." + "ground_truth": "A NemoClaw-specific answer that helps the user see the capabilities NemoClaw owns and gives enough concrete guidance, decision criteria, verification steps, or risk framing to classify it as the right reference stack rather than generic setup glue.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/overview.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-ecosystem-001", "question": "I'm comparing OpenClaw, OpenShell, and NemoClaw. Help me understand the role of each layer so I can choose the right adoption path for my project.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user understand the role of each layer and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose the right adoption path for my project." + "ground_truth": "A NemoClaw-specific answer that helps the user understand the role of each layer and gives enough concrete guidance, decision criteria, verification steps, or risk framing to choose the right adoption path for my project.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/ecosystem.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-ecosystem-002", "question": "I'm deciding whether to use the reference integration. Help me identify when NemoClaw is enough versus when I need direct OpenShell integration so I can avoid unnecessary platform work.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user identify when NemoClaw is enough versus when I need direct OpenShell integration and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid unnecessary platform work." + "ground_truth": "A NemoClaw-specific answer that helps the user identify when NemoClaw is enough versus when I need direct OpenShell integration and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid unnecessary platform work.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/ecosystem.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-ecosystem-003", "question": "I'm planning a deployment with multiple moving parts. Help me separate agent, runtime, and orchestration responsibilities so I can assign ownership and troubleshoot the right layer later.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user separate agent, runtime, and orchestration responsibilities and gives enough concrete guidance, decision criteria, verification steps, or risk framing to assign ownership and troubleshoot the right layer later." + "ground_truth": "A NemoClaw-specific answer that helps the user separate agent, runtime, and orchestration responsibilities and gives enough concrete guidance, decision criteria, verification steps, or risk framing to assign ownership and troubleshoot the right layer later.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/ecosystem.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-how-it-works-001", "question": "I'm studying the NemoClaw architecture. Help me understand how the CLI, plugin, blueprint, and sandbox interact so I can reason about failures and maintenance work with confidence.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user understand how the CLI, plugin, blueprint, and sandbox interact and gives enough concrete guidance, decision criteria, verification steps, or risk framing to reason about failures and maintenance work with confidence." + "ground_truth": "A NemoClaw-specific answer that helps the user understand how the CLI, plugin, blueprint, and sandbox interact and gives enough concrete guidance, decision criteria, verification steps, or risk framing to reason about failures and maintenance work with confidence.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/how-it-works.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-how-it-works-002", "question": "I'm debugging a broken setup. Help me identify which lifecycle boundary owns the failure so I can fix the problem without changing unrelated layers.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user identify which lifecycle boundary owns the failure and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix the problem without changing unrelated layers." + "ground_truth": "A NemoClaw-specific answer that helps the user identify which lifecycle boundary owns the failure and gives enough concrete guidance, decision criteria, verification steps, or risk framing to fix the problem without changing unrelated layers.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/how-it-works.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-how-it-works-003", "question": "I'm deciding whether blueprint-driven setup is repeatable enough. Help me see how versions, digests, and sandbox creation fit together so I can trust the process for team or fleet usage.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user see how versions, digests, and sandbox creation fit together and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust the process for team or fleet usage." + "ground_truth": "A NemoClaw-specific answer that helps the user see how versions, digests, and sandbox creation fit together and gives enough concrete guidance, decision criteria, verification steps, or risk framing to trust the process for team or fleet usage.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/how-it-works.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-release-notes-001", "question": "I'm reading NemoClaw release notes. Help me understand what changed since my installed version so I can assess upgrade risk before touching a working sandbox.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user understand what changed since my installed version and gives enough concrete guidance, decision criteria, verification steps, or risk framing to assess upgrade risk before touching a working sandbox." + "ground_truth": "A NemoClaw-specific answer that helps the user understand what changed since my installed version and gives enough concrete guidance, decision criteria, verification steps, or risk framing to assess upgrade risk before touching a working sandbox.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/release-notes.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-release-notes-002", "question": "I'm maintaining an existing sandbox. Help me spot compatibility notes, migrations, or behavior changes so I can decide whether to update now or wait.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user spot compatibility notes, migrations, or behavior changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether to update now or wait." + "ground_truth": "A NemoClaw-specific answer that helps the user spot compatibility notes, migrations, or behavior changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide whether to update now or wait.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/release-notes.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-about-release-notes-003", "question": "I'm evaluating NemoClaw for a longer-running assistant workflow. Help me see the pace and nature of recent changes so I can judge whether the project feels stable enough for my use case.", "expected_skill": "nemoclaw-user-overview", - "ground_truth": "A NemoClaw-specific answer that helps the user see the pace and nature of recent changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to judge whether the project feels stable enough for my use case." + "ground_truth": "A NemoClaw-specific answer that helps the user see the pace and nature of recent changes and gives enough concrete guidance, decision criteria, verification steps, or risk framing to judge whether the project feels stable enough for my use case.", + "expected_behavior": [ + "Uses the `nemoclaw-user-overview` skill.", + "Uses `references/release-notes.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ] diff --git a/skills/nemoclaw-user-reference/evals/evals.json b/skills/nemoclaw-user-reference/evals/evals.json index d303e6d5cd..3ac767ebcc 100644 --- a/skills/nemoclaw-user-reference/evals/evals.json +++ b/skills/nemoclaw-user-reference/evals/evals.json @@ -3,90 +3,165 @@ "id": "docs-reference-architecture-001", "question": "I'm using the architecture reference. Help me verify implementation and operations details so I can make changes or debug behavior from the right mental model.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user verify implementation and operations details and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make changes or debug behavior from the right mental model." + "ground_truth": "A NemoClaw-specific answer that helps the user verify implementation and operations details and gives enough concrete guidance, decision criteria, verification steps, or risk framing to make changes or debug behavior from the right mental model.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/architecture.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-architecture-002", "question": "I'm locating the right layer for a change. Help me connect structure, lifecycle, and ownership so I can edit the plugin, blueprint, or sandbox layer intentionally.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user connect structure, lifecycle, and ownership and gives enough concrete guidance, decision criteria, verification steps, or risk framing to edit the plugin, blueprint, or sandbox layer intentionally." + "ground_truth": "A NemoClaw-specific answer that helps the user connect structure, lifecycle, and ownership and gives enough concrete guidance, decision criteria, verification steps, or risk framing to edit the plugin, blueprint, or sandbox layer intentionally.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/architecture.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-architecture-003", "question": "I'm modifying architecture-sensitive files. Help me understand the invariants that must remain true so I can avoid breaking sandbox creation or operation.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand the invariants that must remain true and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid breaking sandbox creation or operation." + "ground_truth": "A NemoClaw-specific answer that helps the user understand the invariants that must remain true and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid breaking sandbox creation or operation.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/architecture.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-cli-selection-guide-001", "question": "I'm choosing between `nemoclaw` and `openshell`. Help me pick the command surface that owns my operation so I can complete the task without breaking NemoClaw management.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user pick the command surface that owns my operation and gives enough concrete guidance, decision criteria, verification steps, or risk framing to complete the task without breaking NemoClaw management." + "ground_truth": "A NemoClaw-specific answer that helps the user pick the command surface that owns my operation and gives enough concrete guidance, decision criteria, verification steps, or risk framing to complete the task without breaking NemoClaw management.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/cli-selection-guide.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-cli-selection-guide-002", "question": "I'm tempted to run a low-level OpenShell command. Help me understand when it bypasses NemoClaw lifecycle expectations so I can avoid unsupported or confusing states.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand when it bypasses NemoClaw lifecycle expectations and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid unsupported or confusing states." + "ground_truth": "A NemoClaw-specific answer that helps the user understand when it bypasses NemoClaw lifecycle expectations and gives enough concrete guidance, decision criteria, verification steps, or risk framing to avoid unsupported or confusing states.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/cli-selection-guide.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-cli-selection-guide-003", "question": "I'm doing onboarding, policy, debugging, or low-level sandbox work. Help me apply a simple CLI selection rule so I can move quickly without memorizing every command boundary.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user apply a simple CLI selection rule and gives enough concrete guidance, decision criteria, verification steps, or risk framing to move quickly without memorizing every command boundary." + "ground_truth": "A NemoClaw-specific answer that helps the user apply a simple CLI selection rule and gives enough concrete guidance, decision criteria, verification steps, or risk framing to move quickly without memorizing every command boundary.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/cli-selection-guide.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-commands-001", "question": "I'm opening the CLI command reference. Help me find the exact action, flag, or recovery path I need so I can run the right command without scanning source code.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user find the exact action, flag, or recovery path I need and gives enough concrete guidance, decision criteria, verification steps, or risk framing to run the right command without scanning source code." + "ground_truth": "A NemoClaw-specific answer that helps the user find the exact action, flag, or recovery path I need and gives enough concrete guidance, decision criteria, verification steps, or risk framing to run the right command without scanning source code.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/commands.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-commands-002", "question": "I'm choosing among setup, lifecycle, policy, inference, and diagnostic commands. Help me see command groups and examples clearly so I can locate the right command faster.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user see command groups and examples clearly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to locate the right command faster." + "ground_truth": "A NemoClaw-specific answer that helps the user see command groups and examples clearly and gives enough concrete guidance, decision criteria, verification steps, or risk framing to locate the right command faster.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/commands.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-commands-003", "question": "I'm using `nemoclaw` in scripts or CI. Help me understand example output and exit behavior so I can automate safely and handle failures correctly.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand example output and exit behavior and gives enough concrete guidance, decision criteria, verification steps, or risk framing to automate safely and handle failures correctly." + "ground_truth": "A NemoClaw-specific answer that helps the user understand example output and exit behavior and gives enough concrete guidance, decision criteria, verification steps, or risk framing to automate safely and handle failures correctly.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/commands.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-network-policies-001", "question": "I'm reading the network policy reference. Help me understand baseline access decisions so I can assess the sandbox's default exposure.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand baseline access decisions and gives enough concrete guidance, decision criteria, verification steps, or risk framing to assess the sandbox's default exposure." + "ground_truth": "A NemoClaw-specific answer that helps the user understand baseline access decisions and gives enough concrete guidance, decision criteria, verification steps, or risk framing to assess the sandbox's default exposure.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/network-policies.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-network-policies-002", "question": "I'm reviewing default rules and filesystem constraints. Help me know what is allowed before any customization so I can decide what needs to change for my environment.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user know what is allowed before any customization and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide what needs to change for my environment." + "ground_truth": "A NemoClaw-specific answer that helps the user know what is allowed before any customization and gives enough concrete guidance, decision criteria, verification steps, or risk framing to decide what needs to change for my environment.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/network-policies.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-network-policies-003", "question": "I'm tracing a denied request. Help me understand how operator approval turns into policy so I can audit and reason about egress changes.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user understand how operator approval turns into policy and gives enough concrete guidance, decision criteria, verification steps, or risk framing to audit and reason about egress changes." + "ground_truth": "A NemoClaw-specific answer that helps the user understand how operator approval turns into policy and gives enough concrete guidance, decision criteria, verification steps, or risk framing to audit and reason about egress changes.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/network-policies.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-troubleshooting-001", "question": "I'm troubleshooting a NemoClaw problem. Help me map my symptom to the fastest safe recovery step so I can get unstuck without making the environment worse.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user map my symptom to the fastest safe recovery step and gives enough concrete guidance, decision criteria, verification steps, or risk framing to get unstuck without making the environment worse." + "ground_truth": "A NemoClaw-specific answer that helps the user map my symptom to the fastest safe recovery step and gives enough concrete guidance, decision criteria, verification steps, or risk framing to get unstuck without making the environment worse.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/troubleshooting.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-troubleshooting-002", "question": "I'm uncertain which subsystem failed. Help me run diagnostics that separate install, onboarding, runtime, policy, and inference issues so I can focus on the likely cause.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user run diagnostics that separate install, onboarding, runtime, policy, and inference issues and gives enough concrete guidance, decision criteria, verification steps, or risk framing to focus on the likely cause." + "ground_truth": "A NemoClaw-specific answer that helps the user run diagnostics that separate install, onboarding, runtime, policy, and inference issues and gives enough concrete guidance, decision criteria, verification steps, or risk framing to focus on the likely cause.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/troubleshooting.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] }, { "id": "docs-reference-troubleshooting-003", "question": "I'm asking the community or filing an issue. Help me collect the right logs, versions, and error details so I can give maintainers enough information to help.", "expected_skill": "nemoclaw-user-reference", - "ground_truth": "A NemoClaw-specific answer that helps the user collect the right logs, versions, and error details and gives enough concrete guidance, decision criteria, verification steps, or risk framing to give maintainers enough information to help." + "ground_truth": "A NemoClaw-specific answer that helps the user collect the right logs, versions, and error details and gives enough concrete guidance, decision criteria, verification steps, or risk framing to give maintainers enough information to help.", + "expected_behavior": [ + "Uses the `nemoclaw-user-reference` skill.", + "Uses `references/troubleshooting.md` from that skill.", + "Answers the stated NemoClaw task directly." + ] } ]