dotnet · Evangelink · Jun 25, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 23, 2026
@@ -14,6 +14,12 @@ public static class CheckCommand
         ? StringComparer.OrdinalIgnoreCase
         : StringComparer.Ordinal;
 
+    // A skill with `disable-model-invocation: true` in its frontmatter is
+    // dropped from the Copilot CLI's model-facing skill menu and therefore does
+    // not consume the skill-menu character budget tracked by
+    // SkillProfiler.MaxRenderedSkillMenuLength. The flag is parsed once during
+    // discovery and surfaced on SkillInfo.DisableModelInvocation.
+
     public static Command Create()
     {
         var pluginOpt = new Option<string[]>("--plugin") { Description = "Plugin directories to check (discovers skills, agents, plugin.json)", AllowMultipleArgumentsPerToken = true };
@@ -219,14 +225,26 @@ private static async Task<int> RunPluginCheck(CheckConfig config, CheckReportBui
 
         foreach (var (pluginDirectoryPath, skills) in pluginSkills)
         {
-            int totalChars = skills.Sum(s => s.Description.Length);
-            if (totalChars <= SkillProfiler.MaxAggregateDescriptionLength)
+            // Sum each model-invocable skill's RENDERED menu cost — the full
+            // <skill> block the Copilot CLI emits (name + description + location
+            // + markup), via SkillProfiler.RenderedSkillMenuCost — so this
+            // mirrors the real SKILL_CHAR_BUDGET rather than just the raw
+            // description length.
+            //
+            // Skills hidden from the model-facing skill menu via
+            // `disable-model-invocation: true` do not consume that budget, so
+            // they are excluded from the aggregate (see
+            // SkillProfiler.MaxRenderedSkillMenuLength).
+            int totalChars = skills
+                .Where(s => !s.DisableModelInvocation)
+                .Sum(SkillProfiler.RenderedSkillMenuCost);
+            if (totalChars <= SkillProfiler.MaxRenderedSkillMenuLength)
                 continue;
 
             var pluginResult = builder.Plugins.FirstOrDefault(p => string.Equals(p.DirectoryPath, pluginDirectoryPath, s_pathComparison));
             var pluginLabel = pluginResult?.Name
                 ?? Path.GetFileName(pluginDirectoryPath.TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar));
-            var message = $"Plugin '{pluginLabel}' aggregate description size is {totalChars:N0} characters — maximum is {SkillProfiler.MaxAggregateDescriptionLength:N0}.";
+            var message = $"Plugin '{pluginLabel}' rendered skill-menu size is {totalChars:N0} characters — maximum is {SkillProfiler.MaxRenderedSkillMenuLength:N0}.";
             if (pluginResult is not null)
                 pluginResult.Errors.Add(message);
             else

@@ -32,33 +32,55 @@ public static partial class SkillProfiler
     // vocabularies, but BPE counts are close enough across models for complexity classification.
     private static readonly Lazy<TiktokenTokenizer> s_bpeTokenizer = new(() => TiktokenTokenizer.CreateForModel("gpt-4"));
 
-    // Per-plugin aggregate description size cap. NOTE: this is a local repo
-    // policy, NOT a documented Copilot/agentskills constraint. The agentskills.io
-    // specification (https://agentskills.io/specification) defines per-skill
-    // limits — description (1024 chars, #description-field), compatibility
-    // (500 chars, #compatibility-field), and name (64 chars, #name-field) —
-    // but does NOT define any aggregate per-plugin cap. The original 15,000
-    // was introduced in #238 / discussed in #222 ("15K characters was
-    // mentioned, we could choose smaller") as an informal guardrail against
-    // bloated metadata costs at startup.
+    // Per-plugin rendered skill-menu budget (NOT a raw description-length sum —
+    // see RenderedSkillMenuCost and the notes below). This mirrors a REAL GitHub
+    // Copilot CLI constraint: the CLI renders the model-facing
+    // <available_skills> list under a hard character budget of 15,000
+    // (the agent SDK's SKILL_CHAR_BUDGET, default 15e3 — confirmed in CLI
+    // 1.0.36 and 1.0.61). Skills are listed alphabetically by name and emitted
+    // WITH their full <description> only until that budget is exhausted; every
+    // skill past the cut-off collapses to a bare name with NO description and
+    // therefore can no longer be reliably model-activated. So once a plugin's
+    // rendered skill-menu footprint approaches ~15K, its alphabetically-later
+    // skills silently lose their descriptions — and their discoverability — in
+    // plugin / marketplace contexts. (This is exactly why dotnet-test's
+    // `run-tests` and `test-*` skills stopped activating in plugin eval runs.)
     //
-    // TODO: validate this guardrail against literature (skill-routing studies)
-    // and run experiments measuring whether large aggregate description footprints
-    // actually degrade selection accuracy or just cost more tokens up-front.
-    // Until then, keep the cap aligned with current enforcement as a hard
-    // validation failure, while leaving enough headroom for reasonable plugin growth.
+    // History / correction: this was previously documented here as "a local
+    // repo policy, NOT a documented Copilot/agentskills constraint" and the cap
+    // was raised 15,000 -> 20,000 -> 22,000 to admit plugin growth. That masked
+    // the silent menu truncation instead of fixing it. The agentskills.io
+    // specification (https://agentskills.io/specification) does only define
+    // per-skill limits — description (1024 chars, #description-field),
+    // compatibility (500 chars, #compatibility-field), name (64 chars,
+    // #name-field) — and no aggregate cap, but the CLI's runtime skill-menu
+    // budget makes 15,000 the effective ceiling regardless.
     //
-    // Raised 20,000 -> 22,000: the dotnet-test plugin (the largest and most
-    // active) reached ~20,400 aggregate chars after adding the
-    // find-untested-sources-polyglot skill, legitimate growth that exceeded the
-    // previous cap. Bumped to restore ~1.6k headroom rather than degrade the
-    // routing keywords of existing skills. Prior precedent: 15,000 -> 20,000.
-    internal const int MaxAggregateDescriptionLength = 22_000;
+    // Notes:
+    //  * The CLI budget is measured over the fully-rendered <skill> blocks
+    //    (name + description + location + markup), NOT the raw descriptions —
+    //    those blocks are ~90-100 chars larger per skill. The aggregate below
+    //    mirrors that rendering via RenderedSkillMenuCost(...) (with XML
+    //    escaping applied), so "passing check" faithfully implies the plugin's
+    //    model-invocable menu stays under the real budget instead of being a
+    //    lenient description-only proxy that could still overflow and silently
+    //    truncate alphabetically-later skills.
+    //  * Skills marked `disable-model-invocation: true` are dropped from the
+    //    CLI menu entirely and do not consume the budget; the aggregate below
+    //    excludes them to match.
+    internal const int MaxRenderedSkillMenuLength = 15_000;
     private const int MaxNameLength = 64;
     internal const int MinDescriptionLength = 10;
     private const int MaxCompatibilityLength = 500;
     private const long MaxAssetFileSize = 5 * 1024 * 1024; // 5 MB
 
+    // Location label the Copilot CLI renders for each skill in the
+    // <available_skills> menu. The value is environment-dependent ("project",
+    // "user", "Custom", ...) but short and roughly constant across skills; we
+    // use a representative value so RenderedSkillMenuCost models the real
+    // per-skill footprint.
+    internal const string SkillMenuLocation = "project";
+
     public static SkillProfile AnalyzeSkill(SkillInfo skill, CheckOptions? options = null)
     {
         var allowRepoTraversal = options?.AllowRepoTraversal ?? false;
@@ -343,6 +365,32 @@ public static IReadOnlyList<string> FormatDiagnosisHints(SkillProfile profile)
             ..profile.Warnings.Select(w => $"  • {w}")];
     }
 
+    /// <summary>
+    /// Estimate the number of characters a single skill contributes to the
+    /// Copilot CLI's model-facing skill menu. This mirrors the runtime rendering
+    /// in github/copilot-agent-runtime (src/skills/skillToolDescription.ts): each
+    /// skill is emitted as an XML <c>&lt;skill&gt;</c> block — with XML-escaped
+    /// name and description and a <c>&lt;location&gt;</c> label — followed by a
+    /// newline separator. Counting the whole block (not just the description)
+    /// keeps <see cref="MaxRenderedSkillMenuLength"/> a conservative proxy for
+    /// the real 15,000-char <c>SKILL_CHAR_BUDGET</c>, so a plugin that passes the
+    /// aggregate check cannot silently overflow the menu and truncate
+    /// alphabetically-later skills.
+    /// </summary>
+    internal static int RenderedSkillMenuCost(SkillInfo skill)
+    {
+        var block =
+            $"<skill>\n  <name>{EscapeXml(skill.Name)}</name>\n  <description>{EscapeXml(skill.Description)}</description>\n  <location>{SkillMenuLocation}</location>\n</skill>";
+        return block.Length + 1; // +1 for the newline separator between skills
+    }
+
+    private static string EscapeXml(string s) =>
+        s.Replace("&", "&amp;")
+         .Replace("<", "&lt;")
+         .Replace(">", "&gt;")
+         .Replace("\"", "&quot;")
+         .Replace("'", "&apos;");
+
     [GeneratedRegex(@"^#{1,4}\s+", RegexOptions.Multiline)]
     private static partial Regex SectionRegex();
 

@@ -1,3 +1,5 @@
+using YamlDotNet.Serialization;
+
 namespace SkillValidator.Shared;
 
 // --- MCP server definition (from plugin.json) ---
@@ -18,7 +20,8 @@ public sealed record SkillInfo(
     string Path,
     string SkillMdPath,
     string SkillMdContent,
-    string? Compatibility = null);
+    string? Compatibility = null,
+    bool DisableModelInvocation = false);
 
 // --- Agent info ---
 
@@ -48,6 +51,12 @@ public sealed record SkillFrontmatter
     public string? Name { get; set; }
     public string? Description { get; set; }
     public string? Compatibility { get; set; }
+
+    // The frontmatter key is hyphenated (`disable-model-invocation`) and does not
+    // follow the underscore naming convention, so map it explicitly. A skill with
+    // this set to true is dropped from the Copilot CLI's model-facing skill menu.
+    [YamlMember(Alias = "disable-model-invocation", ApplyNamingConventions = false)]
+    public bool DisableModelInvocation { get; set; }
 }
 
 public sealed record AgentFrontmatter

@@ -77,7 +77,8 @@ public static async Task<IReadOnlyList<SkillInfo>> DiscoverSkillsRecursive(strin
             Path: dirPath,
             SkillMdPath: skillMdPath,
             SkillMdContent: skillMdContent,
-            Compatibility: compatibility);
+            Compatibility: compatibility,
+            DisableModelInvocation: metadata.DisableModelInvocation);
     }
 
     internal static (SkillFrontmatter Metadata, string Body) ParseFrontmatter(string content)

@@ -216,6 +216,37 @@ Several scenario-level options in `eval.yaml` are relevant when diagnosing failu
 - Make sure the description includes keywords from the scenario
 - Check the scenario itself has sufficient information that the agent can reason that it needs the skill. (It should not cheat and suggest the skill.)
 
+> **Plugin-arm-only non-activation (skill-menu budget overflow).** If a skill
+> activates reliably in the **isolated** arm but consistently fails to activate
+> in the **plugin** arm (`skillActivationIsolated.activated: true` but
+> `skillActivationPlugin.activated: false`, with empty `detectedSkills`), the
+> cause is usually *not* the description text — it may never be shown. The
+> Copilot CLI renders the model-facing `<available_skills>` menu under a hard
+> **15,000-character budget** (the agent SDK's `SKILL_CHAR_BUDGET`, default
+> `15e3`). Skills are listed **alphabetically by name** and emitted with their
+> full `<description>` only until the budget is exhausted; every skill past the
+> cut-off collapses to a **bare name with no description** and can no longer be
+> reliably model-activated. In a large plugin, an alphabetically-late skill
+> (e.g. `run-tests`, `test-*`) silently loses its description in the plugin
+> menu even though it is fine in isolation.
+>
+> Fixes for this case (description tuning will *not* help — the text is not in
+> the menu):
+> - Mark reference / agent-orchestrated skills that are never meant to be
+>   model-invoked from a user prompt with `disable-model-invocation: true`.
+>   The CLI drops them from the menu entirely, freeing budget for the skills
+>   that should be discoverable. (They remain invocable by explicit name.)
+> - Reduce the plugin's aggregate skill-menu footprint so its model-invocable
+>   skills fit under the budget. The `check` command enforces this via
+>   `SkillProfiler.MaxRenderedSkillMenuLength` (15,000), summing each
+>   model-invocable skill's **rendered `<skill>` block** (name + description +
+>   location + markup, via `SkillProfiler.RenderedSkillMenuCost`) — not just the
+>   raw description — and counting only skills *without*
+>   `disable-model-invocation: true`. Counting the rendered block makes passing
+>   `check` a faithful proxy for "fits in the real CLI menu budget".
+> - As a last resort, consolidate overlapping skills so the plugin exposes
+>   fewer model-invocable entries.
+
 ### 6. Rubric penalizes valid alternatives
 
 **Symptoms:**

@@ -50,10 +50,32 @@ public async Task UnderAggregateLimit_Passes()
     }
 
     [Fact]
-    public async Task AtAggregateLimit_Passes()
+    public void RenderedSkillMenuCost_CountsEscapedNameDescriptionLocationAndMarkup()
     {
-        // Create skills whose descriptions sum exactly to the limit, each under per-skill max (1024)
-        int limit = SkillProfiler.MaxAggregateDescriptionLength;
+        var skill = new SkillInfo(
+            Name: "my-skill",
+            Description: "Tom & Jerry <tag>",
+            Path: "",
+            SkillMdPath: "",
+            SkillMdContent: "");
+
+        // Mirrors github/copilot-agent-runtime skillToolDescription.ts: the full
+        // <skill> block (XML-escaped name + description, plus location/markup)
+        // followed by a single newline separator.
+        string expectedBlock =
+            $"<skill>\n  <name>my-skill</name>\n  <description>Tom &amp; Jerry &lt;tag&gt;</description>\n  <location>{SkillProfiler.SkillMenuLocation}</location>\n</skill>";
+
+        Assert.Equal(expectedBlock.Length + 1, SkillProfiler.RenderedSkillMenuCost(skill));
+    }
+
+    [Fact]
+    public async Task DescriptionsSummingToLimit_Fails_BecauseRenderedOverheadIsCounted()
+    {
+        // Descriptions ALONE sum to exactly the cap. The previous check (which
+        // counted only Description.Length) treated this as "at limit → pass",
+        // but the real CLI budget also includes each skill's name, location and
+        // <skill> markup, so the rendered total exceeds the cap and must fail.
+        int limit = SkillProfiler.MaxRenderedSkillMenuLength;
         int perSkill = 1024;
         int skillCount = limit / perSkill;
         int remainder = limit - (skillCount * perSkill);
@@ -69,15 +91,15 @@ public async Task AtAggregateLimit_Passes()
         {
             var config = new CheckConfig { PluginPaths = [Path.Combine(root, "test-plugin")] };
             var result = await CheckCommand.Run(config);
-            Assert.Equal(0, result);
+            Assert.Equal(1, result);
         }
         finally { Directory.Delete(root, true); }
     }
 
     [Fact]
     public async Task OverAggregateLimit_Fails()
     {
-        int limit = SkillProfiler.MaxAggregateDescriptionLength;
+        int limit = SkillProfiler.MaxRenderedSkillMenuLength;
         int perSkill = 1024;
         // Enough skills to exceed the aggregate limit
         int skillCount = (limit / perSkill) + 1;

@@ -82,6 +82,44 @@ public async Task DiscoverSkillsRecursiveReturnsEmptyForMissingDir()
     }
 }
 
+public class ParseFrontmatterTests
+{
+    [Fact]
+    public void DisableModelInvocation_True_WhenTopLevelKeySet()
+    {
+        var content = "---\nname: my-skill\ndescription: A skill.\ndisable-model-invocation: true\n---\nBody";
+        var (metadata, _) = SkillDiscovery.ParseFrontmatter(content);
+        Assert.True(metadata.DisableModelInvocation);
+    }
+
+    [Fact]
+    public void DisableModelInvocation_False_WhenKeyAbsent()
+    {
+        var content = "---\nname: my-skill\ndescription: A skill.\n---\nBody";
+        var (metadata, _) = SkillDiscovery.ParseFrontmatter(content);
+        Assert.False(metadata.DisableModelInvocation);
+    }
+
+    [Fact]
+    public void DisableModelInvocation_False_WhenKeyAppearsInsideBlockScalarDescription()
+    {
+        // Regression: a previous regex-based check matched any line in the YAML,
+        // so a block-scalar description that merely mentions the key on its own
+        // line was wrongly treated as disabling model invocation. Proper YAML
+        // parsing must not be fooled by indented block-scalar content.
+        var content =
+            "---\n" +
+            "name: my-skill\n" +
+            "description: |\n" +
+            "  This skill explains config options.\n" +
+            "  disable-model-invocation: true\n" +
+            "---\n" +
+            "Body";
+        var (metadata, _) = SkillDiscovery.ParseFrontmatter(content);
+        Assert.False(metadata.DisableModelInvocation);
+    }
+}
+
 public class PluginDiscoveryTests
 {
     [Fact]

diff --git a/plugins/dotnet-test/skills/assertion-quality/SKILL.md b/plugins/dotnet-test/skills/assertion-quality/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: assertion-quality
-description: "Analyzes the variety and depth of assertions across test suites in any language. Use when the user asks to evaluate assertion quality, find shallow tests, identify assertion-free tests (no assertions or only trivial ones like Assert.IsNotNull / toBeTruthy() / assert x is not None), flag self-referential or tautological assertions (output equals input on round-trip operations), measure assertion diversity, or audit whether tests verify different facets of behavior. Polyglot: .NET, Python (pytest), TS/JS (Jest/Vitest), Java, Go, Ruby, Rust, Swift, Kotlin, PowerShell, C++. DO NOT USE FOR: writing new tests (use code-testing-agent / writing-mstest-tests), checking whether tests would catch a bug if code changed (mutation reasoning — use test-gap-analysis), anti-patterns like flakiness or duplication, or a general severity-ranked anti-pattern audit even when focused on self-referential / tautological assertions and not asking for assertion-diversity metrics (use test-anti-patterns); fixing assertions."
+description: "Analyzes the variety and depth of assertions across test suites in any language. Use when the user asks to evaluate assertion quality, find shallow tests, identify assertion-free tests (no assertions or only trivial ones like Assert.IsNotNull / toBeTruthy()), flag self-referential or tautological assertions, measure assertion diversity, or audit whether tests verify different facets of behavior. Polyglot: .NET, Python, TS/JS, Java, Go, Ruby, Rust, Swift, Kotlin, PowerShell, C++. DO NOT USE FOR: writing new tests (use code-testing-agent / writing-mstest-tests), mutation reasoning about whether tests would catch a bug (use test-gap-analysis), or a general severity-ranked anti-pattern audit (use test-anti-patterns); fixing assertions."
 license: MIT
 ---