Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions eng/skill-validator/src/Check/CheckCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ public static class CheckCommand
? StringComparer.OrdinalIgnoreCase
: StringComparer.Ordinal;

// A skill with `disable-model-invocation: true` in its frontmatter is
// dropped from the Copilot CLI's model-facing skill menu and therefore does
// not consume the skill-menu character budget tracked by
// SkillProfiler.MaxRenderedSkillMenuLength. The flag is parsed once during
// discovery and surfaced on SkillInfo.DisableModelInvocation.

public static Command Create()
{
var pluginOpt = new Option<string[]>("--plugin") { Description = "Plugin directories to check (discovers skills, agents, plugin.json)", AllowMultipleArgumentsPerToken = true };
Expand Down Expand Up @@ -219,14 +225,26 @@ private static async Task<int> RunPluginCheck(CheckConfig config, CheckReportBui

foreach (var (pluginDirectoryPath, skills) in pluginSkills)
{
int totalChars = skills.Sum(s => s.Description.Length);
if (totalChars <= SkillProfiler.MaxAggregateDescriptionLength)
// Sum each model-invocable skill's RENDERED menu cost — the full
// <skill> block the Copilot CLI emits (name + description + location
// + markup), via SkillProfiler.RenderedSkillMenuCost — so this
// mirrors the real SKILL_CHAR_BUDGET rather than just the raw
// description length.
//
// Skills hidden from the model-facing skill menu via
// `disable-model-invocation: true` do not consume that budget, so
// they are excluded from the aggregate (see
// SkillProfiler.MaxRenderedSkillMenuLength).
int totalChars = skills
.Where(s => !s.DisableModelInvocation)
.Sum(SkillProfiler.RenderedSkillMenuCost);
if (totalChars <= SkillProfiler.MaxRenderedSkillMenuLength)
continue;

var pluginResult = builder.Plugins.FirstOrDefault(p => string.Equals(p.DirectoryPath, pluginDirectoryPath, s_pathComparison));
var pluginLabel = pluginResult?.Name
?? Path.GetFileName(pluginDirectoryPath.TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar));
var message = $"Plugin '{pluginLabel}' aggregate description size is {totalChars:N0} characters — maximum is {SkillProfiler.MaxAggregateDescriptionLength:N0}.";
var message = $"Plugin '{pluginLabel}' rendered skill-menu size is {totalChars:N0} characters — maximum is {SkillProfiler.MaxRenderedSkillMenuLength:N0}.";
if (pluginResult is not null)
pluginResult.Errors.Add(message);
else
Expand Down
88 changes: 68 additions & 20 deletions eng/skill-validator/src/Check/SkillProfiler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,33 +32,55 @@ public static partial class SkillProfiler
// vocabularies, but BPE counts are close enough across models for complexity classification.
private static readonly Lazy<TiktokenTokenizer> s_bpeTokenizer = new(() => TiktokenTokenizer.CreateForModel("gpt-4"));

// Per-plugin aggregate description size cap. NOTE: this is a local repo
// policy, NOT a documented Copilot/agentskills constraint. The agentskills.io
// specification (https://agentskills.io/specification) defines per-skill
// limits — description (1024 chars, #description-field), compatibility
// (500 chars, #compatibility-field), and name (64 chars, #name-field) —
// but does NOT define any aggregate per-plugin cap. The original 15,000
// was introduced in #238 / discussed in #222 ("15K characters was
// mentioned, we could choose smaller") as an informal guardrail against
// bloated metadata costs at startup.
// Per-plugin rendered skill-menu budget (NOT a raw description-length sum —
// see RenderedSkillMenuCost and the notes below). This mirrors a REAL GitHub
// Copilot CLI constraint: the CLI renders the model-facing
// <available_skills> list under a hard character budget of 15,000
// (the agent SDK's SKILL_CHAR_BUDGET, default 15e3 — confirmed in CLI
// 1.0.36 and 1.0.61). Skills are listed alphabetically by name and emitted
// WITH their full <description> only until that budget is exhausted; every
// skill past the cut-off collapses to a bare name with NO description and
// therefore can no longer be reliably model-activated. So once a plugin's
// rendered skill-menu footprint approaches ~15K, its alphabetically-later
// skills silently lose their descriptions — and their discoverability — in
// plugin / marketplace contexts. (This is exactly why dotnet-test's
// `run-tests` and `test-*` skills stopped activating in plugin eval runs.)
//
// TODO: validate this guardrail against literature (skill-routing studies)
// and run experiments measuring whether large aggregate description footprints
// actually degrade selection accuracy or just cost more tokens up-front.
// Until then, keep the cap aligned with current enforcement as a hard
// validation failure, while leaving enough headroom for reasonable plugin growth.
// History / correction: this was previously documented here as "a local
// repo policy, NOT a documented Copilot/agentskills constraint" and the cap
// was raised 15,000 -> 20,000 -> 22,000 to admit plugin growth. That masked
// the silent menu truncation instead of fixing it. The agentskills.io
// specification (https://agentskills.io/specification) does only define
// per-skill limits — description (1024 chars, #description-field),
// compatibility (500 chars, #compatibility-field), name (64 chars,
// #name-field) — and no aggregate cap, but the CLI's runtime skill-menu
// budget makes 15,000 the effective ceiling regardless.
//
// Raised 20,000 -> 22,000: the dotnet-test plugin (the largest and most
// active) reached ~20,400 aggregate chars after adding the
// find-untested-sources-polyglot skill, legitimate growth that exceeded the
// previous cap. Bumped to restore ~1.6k headroom rather than degrade the
// routing keywords of existing skills. Prior precedent: 15,000 -> 20,000.
internal const int MaxAggregateDescriptionLength = 22_000;
// Notes:
// * The CLI budget is measured over the fully-rendered <skill> blocks
// (name + description + location + markup), NOT the raw descriptions —
// those blocks are ~90-100 chars larger per skill. The aggregate below
// mirrors that rendering via RenderedSkillMenuCost(...) (with XML
// escaping applied), so "passing check" faithfully implies the plugin's
// model-invocable menu stays under the real budget instead of being a
// lenient description-only proxy that could still overflow and silently
// truncate alphabetically-later skills.
// * Skills marked `disable-model-invocation: true` are dropped from the
// CLI menu entirely and do not consume the budget; the aggregate below
// excludes them to match.
internal const int MaxRenderedSkillMenuLength = 15_000;
private const int MaxNameLength = 64;
internal const int MinDescriptionLength = 10;
private const int MaxCompatibilityLength = 500;
private const long MaxAssetFileSize = 5 * 1024 * 1024; // 5 MB

// Location label the Copilot CLI renders for each skill in the
// <available_skills> menu. The value is environment-dependent ("project",
// "user", "Custom", ...) but short and roughly constant across skills; we
// use a representative value so RenderedSkillMenuCost models the real
// per-skill footprint.
internal const string SkillMenuLocation = "project";

public static SkillProfile AnalyzeSkill(SkillInfo skill, CheckOptions? options = null)
{
var allowRepoTraversal = options?.AllowRepoTraversal ?? false;
Expand Down Expand Up @@ -343,6 +365,32 @@ public static IReadOnlyList<string> FormatDiagnosisHints(SkillProfile profile)
..profile.Warnings.Select(w => $" • {w}")];
}

/// <summary>
/// Estimate the number of characters a single skill contributes to the
/// Copilot CLI's model-facing skill menu. This mirrors the runtime rendering
/// in github/copilot-agent-runtime (src/skills/skillToolDescription.ts): each
/// skill is emitted as an XML <c>&lt;skill&gt;</c> block — with XML-escaped
/// name and description and a <c>&lt;location&gt;</c> label — followed by a
/// newline separator. Counting the whole block (not just the description)
/// keeps <see cref="MaxRenderedSkillMenuLength"/> a conservative proxy for
/// the real 15,000-char <c>SKILL_CHAR_BUDGET</c>, so a plugin that passes the
/// aggregate check cannot silently overflow the menu and truncate
/// alphabetically-later skills.
/// </summary>
internal static int RenderedSkillMenuCost(SkillInfo skill)
{
var block =
$"<skill>\n <name>{EscapeXml(skill.Name)}</name>\n <description>{EscapeXml(skill.Description)}</description>\n <location>{SkillMenuLocation}</location>\n</skill>";
return block.Length + 1; // +1 for the newline separator between skills
}

private static string EscapeXml(string s) =>
s.Replace("&", "&amp;")
.Replace("<", "&lt;")
.Replace(">", "&gt;")
.Replace("\"", "&quot;")
.Replace("'", "&apos;");

[GeneratedRegex(@"^#{1,4}\s+", RegexOptions.Multiline)]
private static partial Regex SectionRegex();

Expand Down
11 changes: 10 additions & 1 deletion eng/skill-validator/src/Shared/Models.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using YamlDotNet.Serialization;

namespace SkillValidator.Shared;

// --- MCP server definition (from plugin.json) ---
Expand All @@ -18,7 +20,8 @@ public sealed record SkillInfo(
string Path,
string SkillMdPath,
string SkillMdContent,
string? Compatibility = null);
string? Compatibility = null,
bool DisableModelInvocation = false);

// --- Agent info ---

Expand Down Expand Up @@ -48,6 +51,12 @@ public sealed record SkillFrontmatter
public string? Name { get; set; }
public string? Description { get; set; }
public string? Compatibility { get; set; }

// The frontmatter key is hyphenated (`disable-model-invocation`) and does not
// follow the underscore naming convention, so map it explicitly. A skill with
// this set to true is dropped from the Copilot CLI's model-facing skill menu.
[YamlMember(Alias = "disable-model-invocation", ApplyNamingConventions = false)]
public bool DisableModelInvocation { get; set; }
}

public sealed record AgentFrontmatter
Expand Down
3 changes: 2 additions & 1 deletion eng/skill-validator/src/Shared/SkillDiscovery.cs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ public static async Task<IReadOnlyList<SkillInfo>> DiscoverSkillsRecursive(strin
Path: dirPath,
SkillMdPath: skillMdPath,
SkillMdContent: skillMdContent,
Compatibility: compatibility);
Compatibility: compatibility,
DisableModelInvocation: metadata.DisableModelInvocation);
}

internal static (SkillFrontmatter Metadata, string Body) ParseFrontmatter(string content)
Expand Down
31 changes: 31 additions & 0 deletions eng/skill-validator/src/docs/InvestigatingResults.md
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,37 @@ Several scenario-level options in `eval.yaml` are relevant when diagnosing failu
- Make sure the description includes keywords from the scenario
- Check the scenario itself has sufficient information that the agent can reason that it needs the skill. (It should not cheat and suggest the skill.)

> **Plugin-arm-only non-activation (skill-menu budget overflow).** If a skill
> activates reliably in the **isolated** arm but consistently fails to activate
> in the **plugin** arm (`skillActivationIsolated.activated: true` but
> `skillActivationPlugin.activated: false`, with empty `detectedSkills`), the
> cause is usually *not* the description text — it may never be shown. The
> Copilot CLI renders the model-facing `<available_skills>` menu under a hard
> **15,000-character budget** (the agent SDK's `SKILL_CHAR_BUDGET`, default
> `15e3`). Skills are listed **alphabetically by name** and emitted with their
> full `<description>` only until the budget is exhausted; every skill past the
> cut-off collapses to a **bare name with no description** and can no longer be
> reliably model-activated. In a large plugin, an alphabetically-late skill
> (e.g. `run-tests`, `test-*`) silently loses its description in the plugin
> menu even though it is fine in isolation.
>
> Fixes for this case (description tuning will *not* help — the text is not in
> the menu):
> - Mark reference / agent-orchestrated skills that are never meant to be
> model-invoked from a user prompt with `disable-model-invocation: true`.
> The CLI drops them from the menu entirely, freeing budget for the skills
> that should be discoverable. (They remain invocable by explicit name.)
> - Reduce the plugin's aggregate skill-menu footprint so its model-invocable
> skills fit under the budget. The `check` command enforces this via
> `SkillProfiler.MaxRenderedSkillMenuLength` (15,000), summing each
> model-invocable skill's **rendered `<skill>` block** (name + description +
> location + markup, via `SkillProfiler.RenderedSkillMenuCost`) — not just the
> raw description — and counting only skills *without*
> `disable-model-invocation: true`. Counting the rendered block makes passing
> `check` a faithful proxy for "fits in the real CLI menu budget".
> - As a last resort, consolidate overlapping skills so the plugin exposes
> fewer model-invocable entries.

### 6. Rubric penalizes valid alternatives

**Symptoms:**
Expand Down
32 changes: 27 additions & 5 deletions eng/skill-validator/tests/Check/CheckCommandTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,32 @@ public async Task UnderAggregateLimit_Passes()
}

[Fact]
public async Task AtAggregateLimit_Passes()
public void RenderedSkillMenuCost_CountsEscapedNameDescriptionLocationAndMarkup()
{
// Create skills whose descriptions sum exactly to the limit, each under per-skill max (1024)
int limit = SkillProfiler.MaxAggregateDescriptionLength;
var skill = new SkillInfo(
Name: "my-skill",
Description: "Tom & Jerry <tag>",
Path: "",
SkillMdPath: "",
SkillMdContent: "");

// Mirrors github/copilot-agent-runtime skillToolDescription.ts: the full
// <skill> block (XML-escaped name + description, plus location/markup)
// followed by a single newline separator.
string expectedBlock =
$"<skill>\n <name>my-skill</name>\n <description>Tom &amp; Jerry &lt;tag&gt;</description>\n <location>{SkillProfiler.SkillMenuLocation}</location>\n</skill>";

Assert.Equal(expectedBlock.Length + 1, SkillProfiler.RenderedSkillMenuCost(skill));
}

[Fact]
public async Task DescriptionsSummingToLimit_Fails_BecauseRenderedOverheadIsCounted()
{
// Descriptions ALONE sum to exactly the cap. The previous check (which
// counted only Description.Length) treated this as "at limit → pass",
// but the real CLI budget also includes each skill's name, location and
// <skill> markup, so the rendered total exceeds the cap and must fail.
int limit = SkillProfiler.MaxRenderedSkillMenuLength;
int perSkill = 1024;
int skillCount = limit / perSkill;
int remainder = limit - (skillCount * perSkill);
Expand All @@ -69,15 +91,15 @@ public async Task AtAggregateLimit_Passes()
{
var config = new CheckConfig { PluginPaths = [Path.Combine(root, "test-plugin")] };
var result = await CheckCommand.Run(config);
Assert.Equal(0, result);
Assert.Equal(1, result);
}
finally { Directory.Delete(root, true); }
}

[Fact]
public async Task OverAggregateLimit_Fails()
{
int limit = SkillProfiler.MaxAggregateDescriptionLength;
int limit = SkillProfiler.MaxRenderedSkillMenuLength;
int perSkill = 1024;
// Enough skills to exceed the aggregate limit
int skillCount = (limit / perSkill) + 1;
Expand Down
38 changes: 38 additions & 0 deletions eng/skill-validator/tests/Shared/DiscoveryTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,44 @@ public async Task DiscoverSkillsRecursiveReturnsEmptyForMissingDir()
}
}

public class ParseFrontmatterTests
{
[Fact]
public void DisableModelInvocation_True_WhenTopLevelKeySet()
{
var content = "---\nname: my-skill\ndescription: A skill.\ndisable-model-invocation: true\n---\nBody";
var (metadata, _) = SkillDiscovery.ParseFrontmatter(content);
Assert.True(metadata.DisableModelInvocation);
}

[Fact]
public void DisableModelInvocation_False_WhenKeyAbsent()
{
var content = "---\nname: my-skill\ndescription: A skill.\n---\nBody";
var (metadata, _) = SkillDiscovery.ParseFrontmatter(content);
Assert.False(metadata.DisableModelInvocation);
}

[Fact]
public void DisableModelInvocation_False_WhenKeyAppearsInsideBlockScalarDescription()
{
// Regression: a previous regex-based check matched any line in the YAML,
// so a block-scalar description that merely mentions the key on its own
// line was wrongly treated as disabling model invocation. Proper YAML
// parsing must not be fooled by indented block-scalar content.
var content =
"---\n" +
"name: my-skill\n" +
"description: |\n" +
" This skill explains config options.\n" +
" disable-model-invocation: true\n" +
"---\n" +
"Body";
var (metadata, _) = SkillDiscovery.ParseFrontmatter(content);
Assert.False(metadata.DisableModelInvocation);
}
}

public class PluginDiscoveryTests
{
[Fact]
Expand Down
2 changes: 1 addition & 1 deletion plugins/dotnet-test/skills/assertion-quality/SKILL.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
name: assertion-quality
description: "Analyzes the variety and depth of assertions across test suites in any language. Use when the user asks to evaluate assertion quality, find shallow tests, identify assertion-free tests (no assertions or only trivial ones like Assert.IsNotNull / toBeTruthy() / assert x is not None), flag self-referential or tautological assertions (output equals input on round-trip operations), measure assertion diversity, or audit whether tests verify different facets of behavior. Polyglot: .NET, Python (pytest), TS/JS (Jest/Vitest), Java, Go, Ruby, Rust, Swift, Kotlin, PowerShell, C++. DO NOT USE FOR: writing new tests (use code-testing-agent / writing-mstest-tests), checking whether tests would catch a bug if code changed (mutation reasoning — use test-gap-analysis), anti-patterns like flakiness or duplication, or a general severity-ranked anti-pattern audit even when focused on self-referential / tautological assertions and not asking for assertion-diversity metrics (use test-anti-patterns); fixing assertions."
description: "Analyzes the variety and depth of assertions across test suites in any language. Use when the user asks to evaluate assertion quality, find shallow tests, identify assertion-free tests (no assertions or only trivial ones like Assert.IsNotNull / toBeTruthy()), flag self-referential or tautological assertions, measure assertion diversity, or audit whether tests verify different facets of behavior. Polyglot: .NET, Python, TS/JS, Java, Go, Ruby, Rust, Swift, Kotlin, PowerShell, C++. DO NOT USE FOR: writing new tests (use code-testing-agent / writing-mstest-tests), mutation reasoning about whether tests would catch a bug (use test-gap-analysis), or a general severity-ranked anti-pattern audit (use test-anti-patterns); fixing assertions."
license: MIT
---

Expand Down
Loading
Loading