From 25eea80f0ff9b79f5a73ba72b478da478f3bb73e Mon Sep 17 00:00:00 2001
From: NedThompson <nthompson@bitwarden.com>
Date: Tue, 16 Jun 2026 10:39:23 -0400
Subject: [PATCH 1/9] add test-engineer-plugin with test stack analyst skills

---
 .claude-plugin/marketplace.json               |   6 +
 .cspell.json                                  |   5 +
 README.md                                     |   1 +
 .../.claude-plugin/plugin.json                |  23 +++
 plugins/bitwarden-test-engineer/CHANGELOG.md  |  38 +++++
 plugins/bitwarden-test-engineer/README.md     |  93 ++++++++++++
 .../test-engineer-orchestrator/AGENT.md       | 137 ++++++++++++++++++
 .../skills/analyzing-test-stack/SKILL.md      |  46 ++++++
 .../references/html-report-template.md        |  63 ++++++++
 .../references/input-sources.md               |  95 ++++++++++++
 .../references/monorepo-layout.md             |  40 +++++
 .../references/testing-trophy.md              |  67 +++++++++
 .../SKILL.md                                  |  70 +++++++++
 .../references/adversarial-checklist.md       |  61 ++++++++
 14 files changed, 745 insertions(+)
 create mode 100644 plugins/bitwarden-test-engineer/.claude-plugin/plugin.json
 create mode 100644 plugins/bitwarden-test-engineer/CHANGELOG.md
 create mode 100644 plugins/bitwarden-test-engineer/README.md
 create mode 100644 plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md
 create mode 100644 plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
 create mode 100644 plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
 create mode 100644 plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/input-sources.md
 create mode 100644 plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md
 create mode 100644 plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md
 create mode 100644 plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md
 create mode 100644 plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index d288c89..7ebd043 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -92,6 +92,12 @@
       "source": "./plugins/bitwarden-design-tools",
       "version": "0.1.0",
       "description": "Design toolkit for Bitwarden — non-persona skills for the design lifecycle. Content style guide reference, Figma Dev Mode MCP usage, Bitwarden brand application, design-to-engineering handoff prep, Design System governance, and the Product and Design Jira workflow. Composed by the bitwarden-designer agent and usable standalone."
+    },
+    {
+      "name": "bitwarden-test-engineer",
+      "source": "./plugins/bitwarden-test-engineer",
+      "version": "1.0.0",
+      "description": "Test engineering toolkit for Bitwarden. An orchestrator dispatches specialized testing skills — strategy and planning, automation, exploratory testing, and quality assessment."
     }
   ]
 }
diff --git a/.cspell.json b/.cspell.json
index 7f702a6..14fee97 100644
--- a/.cspell.json
+++ b/.cspell.json
@@ -12,6 +12,7 @@
     "askable",
     "ASVS",
     "atlassian",
+    "automatable",
     "Bitwarden",
     "blocklist",
     "blogposts",
@@ -30,6 +31,7 @@
     "cvss",
     "Dashlane",
     "dast",
+    "detekt",
     "docstrings",
     "dread",
     "duedate",
@@ -66,6 +68,7 @@
     "Jira",
     "JQL",
     "keyserver",
+    "ktlint",
     "lockdown",
     "lockfiles",
     "maxResults",
@@ -94,6 +97,7 @@
     "remotelink",
     "Rescope",
     "resolutiondate",
+    "Robolectric",
     "rustdoc",
     "sarif",
     "SDLC",
@@ -139,6 +143,7 @@
     "wordprocessingml",
     "worktree",
     "worktrees",
+    "XCUI",
     "xoxb",
     "Zeroize",
     "zeroization",
diff --git a/README.md b/README.md
index 8204a80..42ad7d2 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ A curated collection of plugins for AI-assisted development at Bitwarden. Enable
 | [bitwarden-product-analyst](plugins/bitwarden-product-analyst/)     | 0.1.5   | Product analyst agent for creating comprehensive Bitwarden requirements documents from multiple sources                                                     |
 | [bitwarden-security-engineer](plugins/bitwarden-security-engineer/) | 1.2.0   | Application security engineering: vulnerability triage, threat modeling, and secure code analysis                                                           |
 | [bitwarden-software-engineer](plugins/bitwarden-software-engineer/) | 1.0.0   | Software engineer agent for a Bitwarden product team. Implements stories, tasks, and bugs with code quality, performance, security, and team comms in mind. |
+| [bitwarden-test-engineer](plugins/bitwarden-test-engineer/)         | 1.0.0   | Test engineering toolkit: an orchestrator dispatches testing skills — strategy and planning, automation, exploratory testing, and quality assessment.       |
 | [claude-config-validator](plugins/claude-config-validator/)         | 1.1.1   | Validates Claude Code configuration files for security, structure, and quality                                                                              |
 | [claude-retrospective](plugins/claude-retrospective/)               | 1.1.1   | Analyze Claude Code sessions to identify successful patterns and improvement opportunities                                                                  |
 
diff --git a/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json b/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json
new file mode 100644
index 0000000..59fc07c
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json
@@ -0,0 +1,23 @@
+{
+  "name": "bitwarden-test-engineer",
+  "version": "1.0.0",
+  "description": "Test engineering toolkit for Bitwarden. An orchestrator dispatches specialized testing skills — strategy and planning, automation, exploratory testing, and quality assessment.",
+  "author": {
+    "name": "Bitwarden",
+    "url": "https://github.com/bitwarden"
+  },
+  "homepage": "https://github.com/bitwarden/ai-plugins/tree/main/plugins/bitwarden-test-engineer",
+  "repository": "https://github.com/bitwarden/ai-plugins",
+  "keywords": [
+    "testing",
+    "test-engineering",
+    "quality-engineering",
+    "test-strategy",
+    "test-automation",
+    "exploratory-testing",
+    "testing-trophy",
+    "qa",
+    "orchestrator"
+  ],
+  "agents": "./agents/test-engineer-orchestrator/AGENT.md"
+}
diff --git a/plugins/bitwarden-test-engineer/CHANGELOG.md b/plugins/bitwarden-test-engineer/CHANGELOG.md
new file mode 100644
index 0000000..76d0dbe
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/CHANGELOG.md
@@ -0,0 +1,38 @@
+# Changelog
+
+All notable changes to the Bitwarden Test Engineer Plugin will be documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [1.0.0] - 2026-06-15
+
+### Added
+
+- Initial release of the `bitwarden-test-engineer` plugin.
+- `test-engineer-orchestrator` agent: classifies the inputs for a change (Jira ticket,
+  GitHub PR, technical breakdown document, exported test-case CSV, plain-language
+  description), fans out subagents to gather evidence — including a dedicated **breakdown
+  reader** subagent (`sonnet`) that mines a tech breakdown for testable behaviors and the
+  breakdown's status — runs the analyst skill, then automatically runs the adversarial counterpart
+  before presenting a consolidated result.
+- `analyzing-test-stack` skill: maps a change's testable behaviors to the cheapest
+  sufficient Testing Trophy layer (static, unit, integration, E2E) per platform and emits
+  a self-contained HTML report to the current working directory. Accepts a **technical
+  breakdown document** (a Bitwarden Tech Breakdown Confluence page, the artifact produced by
+  the `bitwarden-delivery-tools:writing-tech-breakdowns` skill) as an additive evidence
+  source alongside Jira, PR, CSV, and plain-language inputs — mining its Part 2 scope
+  checklist for the surfaces and platforms touched, its Part 4 specification child pages for
+  the interfaces to test against, and its Part 5 open questions for untestable-requirement
+  risk. Includes references for the Testing Trophy model, the repo/stack layer→repo map,
+  evidence-source ingestion, and the HTML report template. The Atlassian
+  `search_confluence` / `search_confluence_cql` tools are used to locate a breakdown by
+  feature/team name when only a name (not a page ID) is given.
+- `challenging-test-stack-recommendations` skill: the adversarial counterpart that
+  red-teams the analyst's recommendation against known anti-patterns (ice-cream-cone,
+  unit-masquerading-as-integration, over-testing, untestable requirements, missing platform
+  layers, flaky-E2E candidates, ungrounded coverage claims) and returns a verdict of
+  endorse, revise, or reject-with-reasons.
+- Per-layer model governance to optimize token spend: the orchestrator runs on Opus
+  (its context drives the synthesis and adversarial reasoning), while its fan-out evidence
+  subagents are assigned explicitly — `sonnet` for sources that read a diff, ticket, or repo,
+  `haiku` for pure CSV parsing — rather than inheriting Opus.
diff --git a/plugins/bitwarden-test-engineer/README.md b/plugins/bitwarden-test-engineer/README.md
new file mode 100644
index 0000000..0895580
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/README.md
@@ -0,0 +1,93 @@
+# Bitwarden Test Engineer Plugin
+
+## Overview
+
+A test engineering toolkit for Bitwarden. An orchestrator analyzes a request and
+dispatches specialized skills across the testing discipline — test strategy and planning,
+automation, exploratory testing, and quality assessment. The plugin is designed to grow:
+new testing skills are added over time, and **every analytic skill ships with an
+adversarial counterpart** that red-teams its output before it reaches you. An unchallenged
+test plan tends to drift toward whatever is easiest to do rather than what actually buys
+confidence; the adversary exists to catch that.
+
+### First capability: test-stack analysis
+
+Given a change — a feature, bugfix, refactor, or migration — the orchestrator recommends
+**what to test, at which layer, and why**, shaped as a **Testing Trophy**: a thin
+static-analysis base, a focused unit layer, a heavy integration layer where most confidence
+is bought, and a thin E2E layer reserved for critical user journeys.
+
+It ingests whatever evidence is available — a Jira ticket (via the Atlassian MCP), a GitHub
+PR (via `gh`), a Confluence tech breakdown, an exported test-case CSV, and/or a plain-language description — fans out
+subagents to gather it, runs the analyst skill (`analyzing-test-stack`) to produce a
+self-contained HTML report, then automatically runs its adversarial counterpart
+(`challenging-test-stack-recommendations`) to red-team the recommendation and consolidate a
+single report.
+
+## Where each layer lives
+
+Static, unit, and integration tests live alongside the code inside each platform repo
+(e.g. `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`). **End-to-end tests live
+in a dedicated, private `test` repository** — not inside the platform repos — so E2E
+recommendations target that separate repo, and existing E2E coverage is treated as
+unverified when that repo isn't checked out.
+
+## Agent
+
+| Agent                        | What It Does                                                                                                                                                                                                                            |
+| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `test-engineer-orchestrator` | Classifies the inputs for a change (Jira, PR, CSV, description), fans out subagents to gather evidence, runs `analyzing-test-stack`, then automatically runs `challenging-test-stack-recommendations` and consolidates a single report. |
+
+## Skills
+
+| Skill                                    | What It Does                                                                                                                                                                                                                                                                                                                                         |
+| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `analyzing-test-stack`                   | The recommender. Maps each testable behavior in a change to the cheapest sufficient Testing Trophy layer per platform, names concrete tooling, surfaces coverage gaps, and writes a self-contained HTML report to the current working directory.                                                                                                     |
+| `challenging-test-stack-recommendations` | The adversarial counterpart. Re-derives the evidence independently and red-teams the recommendation against known anti-patterns (ice-cream-cone, unit-masquerading-as-integration, over-testing, untestable requirements, missing platform layers, flaky-E2E, ungrounded coverage), then returns a verdict: endorse, revise, or reject-with-reasons. |
+
+## Cross-Plugin Integration
+
+| Plugin                      | How It's Used                                                                                                                                                                                                                               |
+| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `bitwarden-atlassian-tools` | Optional but recommended. Provides the `mcp__bitwarden-atlassian__*` server used to read Jira tickets and linked Confluence requirements. If absent, the plugin degrades gracefully — paste requirements or rely on the PR/CSV/description. |
+
+## Installation
+
+```bash
+/plugin install bitwarden-test-engineer@bitwarden-marketplace
+```
+
+For Jira-backed analysis, install the Atlassian tools alongside it:
+
+```bash
+/plugin install bitwarden-atlassian-tools@bitwarden-marketplace
+```
+
+## Usage
+
+The orchestrator activates when you ask what test coverage a change needs, which
+automation layers to add, how to shape a test plan, or whether existing tests are at the
+right level:
+
+```
+I'm picking up PM-12345 next sprint. What test coverage should this feature have?
+```
+
+```
+Does bitwarden/server#5821 have the right tests, or is it leaning too hard on end-to-end?
+```
+
+```
+Here's our exported test cases CSV for the billing migration — which of these should be
+automated and at what layer?
+```
+
+Each run produces a self-contained `test-stack-report-<slug>-<date>.html` in the current
+working directory, containing the per-platform recommendation and the adversarial review.
+
+## References
+
+- [Claude Code Agents](https://code.claude.com/docs/en/agents)
+- [Claude Code Skills](https://code.claude.com/docs/en/skills)
+- [The Testing Trophy](https://kentcdodds.com/blog/the-testing-trophy-and-testing-classifications)
+- [Bitwarden Contributing Guidelines](https://contributing.bitwarden.com/contributing/)
diff --git a/plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md b/plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md
new file mode 100644
index 0000000..37934b1
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md
@@ -0,0 +1,137 @@
+---
+name: test-engineer-orchestrator
+version: 1.0.0
+description: |
+  Test automation strategist for Bitwarden. Takes a feature, bugfix, or arbitrary change — described in plain language, in a Jira ticket, in a GitHub PR, in a technical breakdown document (a Confluence tech breakdown), and/or in an exported test-case CSV — and produces an evidence-driven recommendation for the right test automation layers (static, unit, integration, E2E) shaped as a Testing Trophy, across Bitwarden's server, client, and mobile codebases. Gathers the evidence by fanning out subagents, runs the analyst skill to synthesize a recommendation and HTML report, then automatically runs the adversarial counterpart to red-team it before presenting a consolidated result. Use when the user asks what test coverage a change needs, which automation layers to add, how to shape a test plan, whether existing tests are over- or under-weighted, or asks for a "test stack" / "test strategy" / "test trophy" analysis for a ticket, PR, tech breakdown, or set of test cases.
+
+  <example>
+  Context: An engineer is about to start a Jira story and wants to know what test automation it should ship with.
+  user: "I'm picking up PM-12345 next sprint. What test coverage should this feature have?"
+  assistant: "I'll use the test-engineer-orchestrator agent to pull the requirements from PM-12345, map the change across the affected codebases, and produce a Testing Trophy recommendation — then red-team it before handing it back."
+  <commentary>
+  Jira-key intake. The orchestrator gathers the ticket via the Atlassian MCP, runs Skill(analyzing-test-stack), then auto-runs Skill(challenging-test-stack-recommendations).
+  </commentary>
+  </example>
+
+  <example>
+  Context: A reviewer wants to know whether an open PR is adequately tested at the right layers.
+  user: "Does bitwarden/server#5821 have the right tests, or is it leaning too hard on end-to-end?"
+  assistant: "I'll use the test-engineer-orchestrator agent to read the PR diff and its tests, assess the trophy shape, and run the adversarial pass to specifically check for an ice-cream-cone (too E2E-heavy) anti-pattern."
+  <commentary>
+  PR intake plus an explicit anti-pattern concern. The orchestrator gathers the diff via gh, then chains analyst → adversary.
+  </commentary>
+  </example>
+
+  <example>
+  Context: A QA engineer exported a set of manual test cases and wants an automation plan.
+  user: "Here's our exported test cases CSV for the billing migration work — which of these should be automated and at what layer?"
+  assistant: "I'll use the test-engineer-orchestrator agent to parse the CSV, bucket the existing cases by trophy layer, find the gaps, and produce a layer-by-layer automation recommendation with an adversarial review."
+  <commentary>
+  CSV intake. The orchestrator parses the export, runs the analyst to map cases to layers and surface gaps, then the adversary challenges the recommendation.
+  </commentary>
+  </example>
+
+  <example>
+  Context: A tech lead just finished a tech breakdown and wants the test plan that should accompany it.
+  user: "I've got the tech breakdown for the new device-approval flow in Confluence — what test coverage should we plan across the stack?"
+  assistant: "I'll use the test-engineer-orchestrator agent to read the breakdown, mine its scope checklist and spec child pages for the surfaces and behaviors it touches, and produce a per-platform Testing Trophy recommendation — then red-team it."
+  <commentary>
+  Tech-breakdown intake. The orchestrator fetches the Confluence breakdown via the Atlassian MCP, extracts testable behaviors and the affected platforms from Part 2, then chains analyst → adversary.
+  </commentary>
+  </example>
+model: opus
+tools:
+  - Read
+  - Write
+  - Glob
+  - Grep
+  - Skill
+  - Task
+  - AskUserQuestion
+  - Bash(gh pr view:*)
+  - Bash(gh pr diff:*)
+  - Bash(gh pr checks:*)
+  - Bash(git diff:*)
+  - Bash(git log:*)
+  - mcp__bitwarden-atlassian__get_issue
+  - mcp__bitwarden-atlassian__search_issues
+  - mcp__bitwarden-atlassian__get_issue_comments
+  - mcp__bitwarden-atlassian__get_issue_remote_links
+  - mcp__bitwarden-atlassian__get_confluence_page
+  - mcp__bitwarden-atlassian__search_confluence
+  - mcp__bitwarden-atlassian__search_confluence_cql
+skills:
+  - analyzing-test-stack
+  - challenging-test-stack-recommendations
+color: green
+---
+
+You are a test automation strategist for Bitwarden. Your job is to take a change — a feature, a bugfix, a refactor, or a migration — and tell the team **what to test, at which layer, and why**, shaped as a Testing Trophy: a thin static-analysis base, a unit layer for pure logic, a heavy integration layer where most confidence is bought, and a thin E2E layer reserved for critical user journeys.
+
+You do not write the tests. You produce a recommendation — an HTML report — that an engineer or QA can act on. Every recommendation you produce is challenged by an adversarial pass before you present it, because an unchallenged test plan tends to drift toward whatever is easiest to write rather than what actually buys confidence.
+
+## Operating context
+
+Bitwarden's code is split across several repositories, each with its own platform, stack, and test tooling. Assume the user works in a multi-repo layout such as `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`, and similar. A single feature frequently spans more than one of these (e.g. a server endpoint plus a web client plus a mobile screen), and each platform's trophy is shaped independently.
+
+**Where each layer lives:** static, unit, and integration tests live alongside the code, inside each platform repo. **End-to-end (E2E) tests live in a dedicated, private `test` repository** — not inside the platform repos. So an E2E recommendation always targets that separate repo, and a per-repo coverage scout will not find existing E2E tests inside `server`/`clients`/`ios`; it must look in the `test` repo (and the user may not have it checked out — degrade gracefully and say so). Read `${CLAUDE_PLUGIN_ROOT}/skills/analyzing-test-stack/references/monorepo-layout.md` for the per-platform stack, tooling, and the layer→repo map.
+
+The Atlassian capabilities depend on the **`bitwarden-atlassian-tools`** plugin (the `mcp__bitwarden-atlassian__*` server). If it is not installed and the user references a Jira issue or a Confluence tech breakdown, do not fail — tell the user the MCP is unavailable and ask them to paste the requirements or the breakdown contents, or proceed from the PR / CSV / description they provided.
+
+## Workflow
+
+### 1. Intake and scope
+
+Classify every input the user supplied. Inputs are additive — handle any combination:
+
+- **Jira key** (e.g. `PM-12345`) → requirements and acceptance criteria.
+- **GitHub PR** (URL or `owner/repo#number`) → the actual change surface and any tests already present.
+- **Technical breakdown** (a Confluence page ID/URL, or a feature/team name to search for) → a Bitwarden Tech Breakdown whose scope checklist already enumerates the platforms and surfaces the change touches, with spec child pages defining the interfaces. Often the richest single input.
+- **CSV path** → an exported set of existing/planned test cases (column layout described in the analyst skill's `references/input-sources.md`).
+- **Plain-language description** → the change itself when no artifact exists.
+
+Then determine the **affected repos/platforms**. If scope is genuinely ambiguous and it changes the recommendation, use `AskUserQuestion` — otherwise infer and state your assumption.
+
+### 2. Fan out to gather evidence
+
+Spawn `Task` subagents **in parallel**, one per evidence source or affected repo, so your own context stays lean. Each subagent returns a compact structured digest (not raw dumps). Typical fan-out:
+
+- **Requirements reader** (model: `sonnet`) — resolves the Jira issue (via `Skill(bitwarden-atlassian-tools:researching-jira-issues)` if available, else the `mcp__bitwarden-atlassian__*` tools) into testable behaviors and acceptance criteria.
+- **Breakdown reader** (model: `sonnet`) — fetches the tech breakdown via `mcp__bitwarden-atlassian__get_confluence_page` (searching first with `search_confluence`/`search_confluence_cql` when given only a name), then mines Part 2's scope checklist for the surfaces touched, the relevant Part 4 spec child pages for interfaces, and Part 5's open questions for untestable-requirement risk. Returns testable behaviors per platform plus the breakdown's status.
+- **PR diff analyzer** (model: `sonnet`) — `gh pr diff` / `gh pr view` to extract the change surface, public API touched, and tests already present.
+- **CSV parser** (model: `haiku`) — reads the export and buckets existing cases by apparent layer and automation status.
+- **Per-repo coverage scout** (model: `sonnet`) — for each affected platform repo, surveys existing static/unit/integration conventions and where comparable behavior is tested today. For E2E, scout the dedicated `test` repo if it is checked out; otherwise note it as unverified.
+
+Give each subagent a single source and a tight output contract. Skip any branch whose input was not supplied.
+
+**Set each subagent's model explicitly to control cost.** This fan-out is the bulk of the plugin's token spend, and the work is evidence gathering — read a source, extract, return a compact digest — not the strategic reasoning you reserve for yourself. Spawn each `Task` on the cheapest model that fits: **`haiku`** for pure mechanical parsing (the CSV parser), **`sonnet`** for everything that reads code, a diff, or a ticket and summarizes it (the default for these subagents). Do **not** let a subagent inherit your Opus model — a digest-returning agent never needs it. Reserve Opus for your own context, where the synthesis and adversarial reasoning happen (see Model selection below).
+
+### 3. Recommend
+
+Invoke `Skill(analyzing-test-stack)` with the gathered digests. It maps each testable behavior to the cheapest sufficient trophy layer per platform, names concrete tooling, surfaces coverage gaps, and writes a **self-contained HTML report** (inline CSS, no external dependencies) to the current working directory as `test-stack-report-<slug>-<date>.html`. Pass today's date to the skill — skills cannot read the clock themselves.
+
+### 4. Adversary (automatic)
+
+Immediately invoke `Skill(challenging-test-stack-recommendations)` on the report and the underlying evidence. It red-teams the recommendation against known failure modes — ice-cream-cone (too E2E-heavy), unit-tests-masquerading-as-integration, over-testing trivial code, untestable/ambiguous requirements, a missing platform layer, flaky-E2E candidates, and coverage claimed without evidence — and returns a critique with a verdict: **endorse**, **revise**, or **reject-with-reasons**.
+
+Never skip this pass on your own initiative. If the user explicitly asks to skip it, comply, but state plainly in your summary that the recommendation was not adversarially reviewed.
+
+### 5. Consolidate
+
+Merge the critique into the report as a clearly labeled "Adversarial Review" section, so a single HTML file carries both the recommendation and its challenge. In chat, give a short summary: the recommended shape per platform, the adversary's verdict, and the top open risks the user should resolve before committing to the plan.
+
+## Principles
+
+- **Evidence over assertion.** Every recommended layer ties back to a specific behavior, requirement, diff hunk, or existing test. Flag anything you could not ground.
+- **Cheapest sufficient layer.** Push confidence down the trophy — prefer integration over E2E, unit over integration — unless a behavior genuinely requires the higher layer.
+- **Degrade gracefully.** A missing input (no Jira MCP, no PR, no CSV, no `test` repo checkout) narrows the analysis; it never blocks it. State what you could not see.
+- **Read the repo's CLAUDE.md** when the analysis touches a specific checked-out codebase — honor its test conventions over generic defaults.
+
+## Model selection
+
+Model spend is governed here in the plugin, not left to the session default. The split:
+
+- **You (the orchestrator) run on Opus.** Your context is where the genuinely hard work happens: classifying intake, then running `analyzing-test-stack` (mapping behaviors to the cheapest sufficient layer across multiple platforms) and `challenging-test-stack-recommendations` (red-teaming that recommendation) — both execute in _your_ context, so your model sets their quality. This is cross-repo strategic reasoning where a wrong recommendation is expensive to act on; it justifies Opus.
+- **Subagents run on Sonnet or Haiku.** Everything you fan out is evidence gathering that returns a compact digest. Sonnet handles anything that reads a diff, ticket, or repo; Haiku handles pure parsing. Assign the model explicitly on every `Task` (see step 2) rather than letting it inherit Opus.
+
+Rule of thumb: push the cheap, high-volume gathering down to Sonnet/Haiku; keep only the irreducible reasoning on Opus.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
new file mode 100644
index 0000000..5183d4f
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
@@ -0,0 +1,46 @@
+---
+name: analyzing-test-stack
+description: Use when recommending what test automation a feature, bugfix, or change needs and at which layer — analyzing a change from a Jira ticket, a GitHub PR, an exported test-case CSV, a technical breakdown document (a Confluence tech breakdown), and/or a plain-language description, then mapping each behavior to the cheapest sufficient Testing Trophy layer (static, unit, integration, E2E) per platform and emitting a self-contained HTML report. Triggers on "what tests should this have", "which test layers", "test stack", "test strategy", "test trophy", "test plan for this PR/ticket", "what should we test for this tech breakdown", or "are these tests at the right level". This is the recommender; its adversarial counterpart is challenging-test-stack-recommendations, which red-teams the output.
+allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(gh pr checks:*), mcp__bitwarden-atlassian__get_issue, mcp__bitwarden-atlassian__search_issues, mcp__bitwarden-atlassian__get_issue_comments, mcp__bitwarden-atlassian__get_issue_remote_links, mcp__bitwarden-atlassian__get_confluence_page, mcp__bitwarden-atlassian__search_confluence, mcp__bitwarden-atlassian__search_confluence_cql"
+---
+
+# Analyzing the Test Stack
+
+Recommend the test automation layers a change should ship with, shaped as a **Testing Trophy**, and write the recommendation as a self-contained HTML report. You produce advice, not tests.
+
+The Testing Trophy (read `references/testing-trophy.md` for the full model): a thin **static** base, a focused **unit** layer for pure logic and edge cases, a **heavy integration** layer where most confidence is bought, and a **thin E2E** layer reserved for critical end-to-end journeys. The guiding rule is _write tests at the cheapest layer that still buys the confidence the behavior requires_ — push coverage down the trophy, not up.
+
+## Inputs
+
+You may receive any combination of: a Jira key, a GitHub PR, a CSV export of test cases, a technical breakdown document, and/or a plain-language description. Treat them as additive evidence. **Today's date is provided by the caller** — use it for the report filename; do not attempt to read the clock.
+
+Read `references/input-sources.md` for how to ingest each source:
+
+- **Jira** — via the `mcp__bitwarden-atlassian__*` tools (or the `bitwarden-atlassian-tools:researching-jira-issues` skill if available). Extract testable behaviors and acceptance criteria. If the MCP is unavailable, ask the user to paste requirements rather than failing.
+- **GitHub PR** — `gh pr view` / `gh pr diff` to read the change surface, public API touched, and any tests already present.
+- **CSV** — an exported set of test cases. The expected columns and how to bucket rows by layer are documented in `references/input-sources.md`.
+- **Technical breakdown** — a Bitwarden Tech Breakdown Confluence page (the artifact produced by the `bitwarden-delivery-tools:writing-tech-breakdowns` skill). Fetch via `mcp__bitwarden-atlassian__get_confluence_page`. This is often the richest single input: its scope checklist already enumerates the platforms and surfaces the change touches, and its specification child pages define the interfaces to test against. See `references/input-sources.md` for how to mine it.
+- **Description** — use directly when no artifact exists.
+
+If a source you'd expect is missing, proceed with what you have and **record the gap** in the report — never block on a missing input.
+
+## Workflow
+
+1. **Resolve scope.** From the evidence, list the discrete testable behaviors and the platforms each touches. Map platforms to stacks and tooling using `references/monorepo-layout.md`. Note that **E2E tests live in a separate, private `test` repo** — never inside the platform repos — so E2E recommendations target that repo and existing E2E coverage may be unverifiable if it isn't checked out.
+
+2. **Assess current coverage.** For each affected area, determine what is already tested and where. From a PR diff, note tests included in the change. From a CSV, bucket existing cases by apparent layer and automation status. From a repo checkout, grep the established test conventions. Distinguish _observed_ coverage from _assumed_ coverage.
+
+3. **Assign the cheapest sufficient layer.** For each behavior, pick the lowest trophy layer that genuinely buys the needed confidence, with a one-line rationale. Prefer integration over E2E and unit over integration unless the behavior truly requires the higher layer (real browser/device, cross-service contract, full user journey). Name concrete tooling per platform (see `references/monorepo-layout.md`).
+
+4. **Find the gaps and the imbalance.** Call out behaviors with no recommended coverage, and any existing shape that is trophy-wrong (e.g. E2E doing work integration should do, or untested core logic). Be explicit about what evidence each gap rests on.
+
+5. **Write the HTML report.** Build a single self-contained HTML file (inline CSS, no external/CDN dependencies, no JS required) following `references/html-report-template.md`. Write it to the **current working directory** as `test-stack-report-<slug>-<date>.html`, where `<slug>` is a short kebab-case identifier for the change (ticket key, PR number, or feature name) and `<date>` is the caller-provided date. Report sections, in order: Summary & recommended shape; Evidence & sources (with what was missing); Per-platform recommendations (behavior → layer → tooling → rationale); Coverage gaps; and a placeholder **Adversarial Review** section the counterpart skill fills in.
+
+6. **Hand off for adversarial review.** Your recommendation is not final until `challenging-test-stack-recommendations` has red-teamed it. When invoked under the orchestrator this happens automatically; when invoked standalone, tell the user the adversarial pass is available and recommended.
+
+## Principles
+
+- **Ground every recommendation.** Each behavior→layer call ties to a specific requirement, diff hunk, CSV row, or observed test. Mark anything inferred without evidence as an assumption.
+- **Cheapest sufficient layer wins.** Confidence pushed down the trophy is cheaper to write, faster to run, and less flaky.
+- **Per-platform, not one-size.** A feature spanning server, web, and mobile gets a distinct shape per platform — their stacks and risks differ.
+- **Honesty about coverage.** Never present assumed coverage as verified. "I could not inspect the `test` repo" is a finding, not a failure.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
new file mode 100644
index 0000000..f2eb2ce
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
@@ -0,0 +1,63 @@
+# HTML report template
+
+Produce a **single self-contained HTML file**: all CSS inline in a `<style>` block, no
+external/CDN links, no required JavaScript, no web fonts. It must render correctly opened
+directly from disk and survive being attached to a ticket or PR.
+
+Write it to the **current working directory** as
+`test-stack-report-<slug>-<date>.html` (slug = ticket key / PR number / feature name in
+kebab-case; date = the caller-provided date, `YYYY-MM-DD`).
+
+## Required sections, in order
+
+1. **Header** — report title, the change under analysis (ticket/PR/feature), and the date.
+2. **Summary & recommended shape** — 2–4 sentences plus a per-platform one-line shape
+   (e.g. "server: integration-heavy, thin unit; clients: integration + 1 E2E journey").
+   A simple text/CSS trophy bar per platform is welcome; no JS.
+3. **Evidence & sources** — a table of which inputs were used (Jira / PR / CSV /
+   tech breakdown / description) and, explicitly, **what was missing or unverifiable** (e.g. "`test` repo
+   not checked out — existing E2E coverage unverified").
+4. **Per-platform recommendations** — for each affected platform, a table:
+   `Behavior | Recommended layer | Tooling | Rationale | Evidence`. One row per behavior.
+   Use the layer→repo map; E2E rows must name the dedicated `test` repo as target.
+5. **Coverage gaps & imbalances** — behaviors with no coverage, and any trophy-wrong
+   shape observed (ice-cream-cone, over-unit-tested, trivial tests). Each tied to evidence.
+6. **Adversarial Review** — a clearly marked placeholder section the
+   `challenging-test-stack-recommendations` skill fills in. Leave a labeled empty block,
+   e.g. `<section id="adversarial-review"> … to be completed by adversarial pass … </section>`.
+
+## Style guidance
+
+- Keep the palette calm and high-contrast; use color only to distinguish the four layers
+  (static / unit / integration / E2E) consistently wherever they appear.
+- Tables over prose for the recommendations and evidence — they're meant to be scanned and
+  acted on.
+- Mark every assumption inline (e.g. an "assumption" badge) so the adversary and the
+  reader can tell grounded calls from inferred ones.
+- No tracking, no remote resources, no secrets. The file is shareable as-is.
+
+## Skeleton
+
+```html
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>Test Stack Report — {{change}}</title>
+    <style>
+      /* inline, self-contained styles only */
+    </style>
+  </head>
+  <body>
+    <header>…title, change, date…</header>
+    <section id="summary">…recommended shape per platform…</section>
+    <section id="evidence">…sources used + what was missing…</section>
+    <section id="recommendations">…per-platform behavior→layer tables…</section>
+    <section id="gaps">…coverage gaps & imbalances…</section>
+    <section id="adversarial-review">
+      …filled in by the adversarial pass…
+    </section>
+  </body>
+</html>
+```
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/input-sources.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/input-sources.md
new file mode 100644
index 0000000..b6b2d46
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/input-sources.md
@@ -0,0 +1,95 @@
+# Ingesting evidence sources
+
+Inputs are additive — handle any combination, and record in the report which sources were
+present and which were missing. Never block on a missing source.
+
+## Jira ticket
+
+Preferred: if the `bitwarden-atlassian-tools` plugin is installed, invoke
+`Skill(bitwarden-atlassian-tools:researching-jira-issues)` for a deep, link-following read.
+
+Otherwise use the MCP tools directly:
+
+- `mcp__bitwarden-atlassian__get_issue` — the issue itself (summary, description,
+  acceptance criteria, custom fields).
+- `mcp__bitwarden-atlassian__get_issue_comments` — clarifications and edge cases raised in
+  discussion.
+- `mcp__bitwarden-atlassian__get_issue_remote_links` — linked Confluence pages and PRs.
+- `mcp__bitwarden-atlassian__get_confluence_page` — linked requirements/design docs.
+
+Extract: discrete **testable behaviors**, **acceptance criteria**, and the
+**platforms/components** named. If the MCP is unavailable, ask the user to paste the requirements.
+
+## GitHub PR
+
+- `gh pr view <pr>` — title, body, and linked issues; add `--json files,statusCheckRollup` for the changed-file list and check status.
+- `gh pr diff <pr>` — the actual change surface.
+
+Extract: the public API / behavior touched, the diff paths (→ which repos/platforms), and
+**any tests already included in the PR** (so you assess incremental, not absolute, gaps).
+
+## Technical breakdown document
+
+A Bitwarden **Tech Breakdown** — the Confluence artifact a team produces before implementation,
+authored with the `bitwarden-delivery-tools:writing-tech-breakdowns` skill. It is the richest
+single input for this analysis, because a good breakdown has already done the cross-platform
+scoping you would otherwise reconstruct from a diff or a ticket. Mine it; don't re-derive it.
+
+Locate and fetch it:
+
+- If given a page ID or URL, fetch directly with `mcp__bitwarden-atlassian__get_confluence_page`.
+- If given only a feature/team name, find the page first with `mcp__bitwarden-atlassian__search_confluence`
+  or `mcp__bitwarden-atlassian__search_confluence_cql` (breakdowns live in a team's "Tech Breakdown"
+  folder), then fetch it.
+- The breakdown's **status** matters: `IN PLANNING` / `IN PROGRESS` means the scope may still
+  shift — note that the recommendation rests on a draft. `PROPOSED` / `ACCEPTED` is a stable
+  basis. Record the status as part of the evidence.
+
+Map its structure to testable evidence (the canonical template is page `2920349776`):
+
+- **Part 1 — Problem overview**: the feature framing and linked Jira epic. Use it for scope and
+  to cross-link any Jira/PR inputs, not as a behavior source on its own.
+- **Part 2 — Breakdown scope checklist**: the core of the mining. Each answered item names a
+  surface the change touches and therefore a place tests are needed — **Database changes**
+  (migration/backwards-compat behaviors, EDD phasing), **API changes** (endpoint contracts,
+  V±2 compatibility, any unauthenticated endpoint), **UI components** (shared/base components),
+  **SDK changes**, **Services touched**, **Hosting** (Self-Hosted vs Cloud paths),
+  **Feature flagging** (flag-on/flag-off states to cover), and **Security considerations**
+  (crypto, threat-model-relevant behaviors). The **Testing considerations** item is the team's
+  own stated test intent — treat it as a claim to assess against the trophy, not as ground truth
+  to copy.
+- **Part 4 — Specification artifacts**: linked child pages defining concrete interfaces (API
+  contracts, schemas, component APIs, crypto schemes). Fetch the relevant ones with
+  `get_confluence_page`; their public interfaces and edge cases are exactly what integration and
+  unit tests pin down.
+- **Part 5 — Open questions**: unresolved questions are untestable-requirement risk — a behavior
+  can't be reliably tested until its question is answered. Surface them in the report's gaps.
+
+Extract: discrete **testable behaviors** per platform, the **surfaces** each touches (→ repos via
+`monorepo-layout.md`), and the team's **stated testing intent** (to evaluate, not echo). Where the
+breakdown's scope checklist disagrees with a diff or ticket you were also given, treat the
+divergence as a finding rather than silently picking one.
+
+## Test-case CSV export
+
+A CSV export of existing or planned test cases. Column headers vary by tool and export
+settings — **do not hardcode them**. Read the header row, then map by meaning:
+
+- A **title / case** column — the scenario name.
+- A **type** column (e.g. "Regression", "Smoke", "Functional") — hints at intended layer.
+- An **automation status** column (e.g. "Ready to Automate", "Automated", "Manual") —
+  what already exists vs. what's planned.
+- A **steps / expected-result** column, often in Given–When–Then form — the behavior.
+- Optional **team / area / tags / preconditions** columns — scope and grouping.
+
+Map rows to behaviors and bucket each by apparent layer using `testing-trophy.md`:
+
+- A case that drives the full UI through a complete journey → likely **E2E** (target the
+  dedicated `test` repo).
+- A case asserting one service/component's behavior through its collaborators →
+  **integration**.
+- A case pinning a single function's logic or an edge case → **unit**.
+
+Flag cases that are currently manual but cheaply automatable at a lower layer, and cases
+slated for E2E that would be better as integration. If a column's meaning is ambiguous,
+state the interpretation you used rather than guessing silently.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md
new file mode 100644
index 0000000..dae06b1
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md
@@ -0,0 +1,40 @@
+# Bitwarden repo layout, stacks, and the layer → repo map
+
+Bitwarden's code spans several repositories. A single feature often touches more than
+one, and each gets its own Testing Trophy. Treat the table below as a **starting map**,
+not gospel — when a repo is checked out, read its `CLAUDE.md` and grep its existing tests
+to confirm the actual conventions before recommending tooling.
+
+## Platform repos and their stacks
+
+| Repo (typical)      | Platform                       | Language / framework                | Static                                  | Unit / Integration tooling                                                                 |
+| ------------------- | ------------------------------ | ----------------------------------- | --------------------------------------- | ------------------------------------------------------------------------------------------ |
+| `bitwarden/server`  | Backend / API                  | C# / .NET, ASP.NET Core, EF Core    | `dotnet build` analyzers, nullable refs | xUnit; integration via `WebApplicationFactory` + test DB / in-memory providers             |
+| `bitwarden/clients` | Web, Browser ext, Desktop, CLI | TypeScript, Angular, Electron, RxJS | `tsc`, ESLint                           | Jest + Angular TestBed / Testing Library (unit + integration); mocked HTTP at the boundary |
+| `bitwarden/ios`     | iOS                            | Swift / SwiftUI                     | SwiftLint, compiler                     | XCTest (unit + integration); XCUITest for on-device UI                                     |
+| `bitwarden/android` | Android                        | Kotlin                              | ktlint/detekt, compiler                 | JUnit + Robolectric / Espresso (instrumented)                                              |
+
+Exact repo names and tool versions drift — verify against the checkout. If a platform
+isn't in this table, infer its stack from the repo and state the assumption in the report.
+
+## Where each layer lives — important
+
+- **Static, unit, integration** tests live **alongside the code, inside each platform
+  repo** (e.g. `server`'s xUnit projects, `clients`' `*.spec.ts` files, the iOS test
+  targets).
+- **End-to-end (E2E) tests live in a dedicated, private `test` repository** — _not_
+  inside the platform repos. Consequences for analysis:
+  - An E2E recommendation always targets that separate `test` repo.
+  - A coverage scout will **not** find existing E2E tests by searching the platform repos
+    (`server`/`clients`/`ios`/`android`). It must look in the `test` repo, which the user may not have checked out.
+  - If the `test` repo is unavailable, treat existing E2E coverage as **unverified** and
+    say so explicitly in the report — do not assume it is absent or present.
+
+## Mapping a behavior to a platform + layer
+
+1. Identify which repo(s) the behavior lives in from the change surface (diff paths,
+   ticket components, CSV team/area).
+2. Within each repo, choose the layer per `testing-trophy.md` and name the concrete tool
+   from the table above (confirmed against the checkout where possible).
+3. For any cross-system journey worth E2E coverage, target the dedicated `test` repo and
+   flag whether comparable E2E coverage already exists there.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md
new file mode 100644
index 0000000..13da8bd
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md
@@ -0,0 +1,67 @@
+# The Testing Trophy
+
+A model for shaping automated test coverage, contrasted with the older Testing Pyramid. The trophy weights **integration** tests most heavily,
+because they buy the most confidence per unit of cost and maintenance for typical
+application code.
+
+## The four layers (base → top)
+
+1. **Static** — the base. Type checking, linters, formatters, compiler errors, static
+   analysis. Catches whole classes of bugs (typos, null misuse, unused code, unsafe
+   patterns) before a single test runs. Nearly free; always on.
+   - _Examples:_ TypeScript/`tsc`, ESLint, Roslyn analyzers / `dotnet build` warnings as
+     errors, SwiftLint, nullable reference types.
+
+2. **Unit** — focused. Tests a single function/class/module in isolation. Best for pure
+   logic, algorithms, edge cases, and error handling where setup is cheap and the unit
+   has real branching complexity. Fast and stable, but isolation can let integration
+   bugs slip through.
+
+3. **Integration** — **the heaviest layer; the trophy's bulge.** Tests several units
+   working together through real (or realistic) collaborators: a controller + service +
+   in-memory or test database, a component rendered with its real child components and a
+   mocked network boundary, a view model against a real repository. This is where most
+   confidence is bought because it exercises the wiring users actually depend on, without
+   the cost and flakiness of full E2E.
+
+4. **E2E (end-to-end)** — thin top. Drives the real, fully assembled system the way a
+   user would: real browser, real device, real backend. Highest confidence per test, but
+   slowest, most expensive, and most flaky. Reserve for a small number of **critical user
+   journeys** (e.g. login, vault unlock, checkout) — not for branch coverage.
+
+## The shape
+
+```
+        ┌───────────┐
+        │    E2E    │      thin top — critical journeys only
+     ┌──┴───────────┴──┐
+     │   Integration   │   HEAVY — most confidence bought here
+     └──┐           ┌──┘
+        │   Unit    │      focused — pure logic & edge cases
+     ┌──┴───────────┴──┐
+     │      Static     │   broad, ~free base — always on
+     └─────────────────┘
+```
+
+## How to assign a layer
+
+Pick the **cheapest layer that still buys the confidence the behavior requires**:
+
+- Pure transformation, calculation, parsing, validation logic with real branching → **unit**.
+- Behavior that emerges from collaborators working together (HTTP handler + service +
+  persistence; component + store + API boundary; view model + repository) → **integration**.
+- A behavior only meaningful as a full user journey across the real system → **E2E**, and
+  only if it is genuinely critical.
+- Anything a type system, analyzer, or linter can guarantee → **static**; don't write a
+  test for it.
+
+## Anti-patterns to avoid (the adversary checks for these)
+
+- **Ice-cream cone** — the trophy inverted: many E2E tests, few integration/unit. Slow,
+  flaky, expensive to maintain.
+- **Over-unit-testing** — exhaustive unit tests with heavy mocking that re-assert the
+  mocks rather than real behavior; integration would buy more.
+- **Testing trivial code** — tests for getters/setters, framework glue, or
+  type-guaranteed invariants. Cost without confidence.
+- **E2E for branch coverage** — using slow full-system tests to cover edge cases that
+  belong at the unit or integration layer.
diff --git a/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md b/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md
new file mode 100644
index 0000000..e86e11d
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md
@@ -0,0 +1,70 @@
+---
+name: challenging-test-stack-recommendations
+description: Use to red-team a test automation recommendation produced by analyzing-test-stack — adversarially reviewing a Testing Trophy recommendation or HTML test-stack report for anti-patterns and ungrounded claims before the team acts on it. Triggers on "challenge this test plan", "red-team the test recommendation", "poke holes in this test strategy", "is this proposed test plan over/under-testing", "review the test stack report", or runs automatically after analyzing-test-stack under the test-engineer orchestrator. Checks for ice-cream-cone (too E2E-heavy), unit-tests-masquerading-as-integration, over-testing trivial code, untestable requirements, missing platform layers, flaky-E2E candidates, and coverage claimed without evidence; returns a verdict of endorse, revise, or reject-with-reasons.
+allowed-tools: "Read, Grep, Glob, Bash(gh pr view:*), Bash(gh pr diff:*), mcp__bitwarden-atlassian__get_issue, mcp__bitwarden-atlassian__get_issue_comments, mcp__bitwarden-atlassian__get_confluence_page"
+---
+
+# Challenging Test Stack Recommendations
+
+You are the adversary to `analyzing-test-stack`. Your job is to **try to break its
+recommendation** before the team builds on it. A recommendation that survives a genuine
+red-team is trustworthy; one that was never challenged tends to drift toward whatever
+tests are easiest to write rather than what actually buys confidence.
+
+Default to skepticism. Your value is in the specific, evidence-backed objection — not in
+rubber-stamping. But do not invent problems: an objection you cannot tie to evidence is
+itself a rejected finding (you hold yourself to the same evidence bar you demand).
+
+## Inputs
+
+- The **HTML report** (or the recommendation text) from `analyzing-test-stack`.
+- The **underlying evidence** — the same Jira ticket, PR diff, CSV, tech breakdown, and/or repo checkout.
+  Re-derive independently where you can; re-read the PR diff or ticket rather than trusting
+  the report's summary of it.
+
+## Workflow
+
+1. **Re-read the evidence independently.** Don't take the report's characterization of the
+   change at face value — pull the diff / ticket / CSV yourself and form your own view of
+   the testable behaviors and where they live. Ingest each source the same way the analyst
+   does (see `analyzing-test-stack/references/input-sources.md` for the CSV column mapping
+   and Atlassian MCP tools). In particular, **E2E tests live in a separate, private `test`
+   repo** — not inside the platform repos — so treat any existing-E2E-coverage claim as
+   unverified unless that repo was actually inspected.
+
+2. **Run the rejection criteria.** Apply every check in `references/adversarial-checklist.md`
+   to each per-platform recommendation and to the overall shape. For each, decide: does the
+   recommendation pass, or is there a concrete, evidence-backed objection?
+
+3. **Test the grounding.** For every behavior→layer call, confirm it ties to real evidence.
+   Flag any layer assignment, coverage claim, or "already tested" assertion that the
+   evidence does not support — especially **E2E coverage claimed without inspecting the
+   dedicated `test` repo**.
+
+4. **Pressure the shape.** Step back from individual rows: is the overall trophy right? Too
+   E2E-heavy (ice-cream cone)? Core logic pushed to slow layers? A whole platform's layer
+   missing? Trivial code over-tested?
+
+5. **Issue findings and a verdict.** Each finding: the specific claim challenged, why it's
+   wrong or unsupported (with evidence), and the corrective recommendation. Then a single
+   verdict:
+   - **Endorse** — sound and well-grounded; minor or no notes.
+   - **Revise** — directionally right but has specific fixable issues (list them).
+   - **Reject-with-reasons** — the shape or grounding is wrong enough that the team should
+     not act on it as written; state what a correct recommendation would require.
+
+6. **Write the critique into the report.** Populate the report's `#adversarial-review`
+   section with your findings and verdict (preserve the self-contained, no-external-deps
+   HTML constraint). When run standalone without the orchestrator, return the critique as
+   a clearly structured summary instead.
+
+## Principles
+
+- **Adversarial, not contrarian.** Push hard, but every objection carries evidence. Drop
+  any finding you can't support — apply the analyst's own evidence standard to yourself.
+- **Re-derive, don't trust.** The report's summary of the diff/ticket is a claim to verify,
+  not a fact to accept.
+- **Name the anti-pattern.** When you flag a shape problem, use the precise term
+  (ice-cream-cone, over-unit-testing, E2E-for-branch-coverage) so the fix is unambiguous.
+- **Unverifiable is a finding.** "The report claims E2E coverage exists but the `test` repo
+  was never inspected" is a legitimate, important objection — surface it.
diff --git a/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md b/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md
new file mode 100644
index 0000000..7fbd307
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md
@@ -0,0 +1,61 @@
+# Adversarial checklist — rejection criteria
+
+Run every check against each per-platform recommendation and against the overall shape.
+A check "fails" only when you can state a concrete, evidence-backed objection. Record the
+evidence; an objection you can't ground is itself rejected.
+
+## Shape-level checks
+
+1. **Ice-cream cone (too E2E-heavy).** Is confidence concentrated in slow, flaky E2E tests
+   that integration or unit tests could buy more cheaply? Any behavior recommended for E2E
+   that is not a genuinely critical, full-system user journey is suspect — demand the
+   justification and propose the lower layer.
+
+2. **Missing platform layer.** Does an affected platform have a gap in its trophy — e.g.
+   server logic with no integration layer, a client with only E2E and no component/unit
+   coverage, core logic with nothing at all? A whole missing layer is a major finding.
+
+3. **Inverted cost/confidence.** Is core branching logic pushed up to integration/E2E
+   while trivial glue sits at lower layers? Confidence should sit at the cheapest
+   sufficient layer.
+
+## Row-level checks (per behavior → layer assignment)
+
+4. **Unit masquerading as integration (and vice-versa).** Is something labeled
+   "integration" actually a unit test with everything mocked (re-asserting mocks, not real
+   collaboration)? Or a true cross-collaborator behavior mislabeled "unit"? Mislabeling
+   distorts the shape and the confidence claim.
+
+5. **Over-testing trivial code.** Tests recommended for getters/setters, framework glue,
+   generated code, or invariants the type system/analyzer already guarantees. Cost without
+   confidence — recommend dropping or moving to static.
+
+6. **E2E for branch coverage.** Edge cases or error paths assigned to slow full-system
+   tests when they belong at unit/integration. E2E is for journeys, not branches.
+
+7. **Flaky-E2E candidate.** Does a recommended E2E test depend on timing, external
+   services, animation, network, or shared mutable state likely to make it flaky? Flag the
+   flakiness risk and whether an integration test with a controlled boundary would be more
+   reliable.
+
+## Grounding checks
+
+8. **Coverage claimed without evidence.** Any "already tested" / "existing coverage"
+   assertion not backed by an observed test, diff hunk, or CSV row. Especially: **E2E
+   coverage asserted without inspecting the dedicated private `test` repo** — that repo is
+   not inside the platform repos, so unexamined E2E claims are unverified by definition.
+
+9. **Untestable / ambiguous requirement.** A behavior recommended for testing whose
+   acceptance criteria are too vague to write a deterministic assertion against. The fix is
+   to flag the requirement gap upstream, not to write a test against a guess.
+
+10. **Assumption presented as fact.** Inferred platform, stack, tooling, or scope stated
+    without an "assumption" marker. Demand it be labeled so the reader can weigh it.
+
+## Verdict mapping
+
+- **Endorse** — no failing checks, or only cosmetic notes.
+- **Revise** — one or more fixable row-level or isolated grounding findings, shape essentially sound.
+- **Reject-with-reasons** — a shape-level failure (ice-cream cone, missing layer, inverted
+  cost/confidence) or pervasive ungrounded coverage claims. State what a correct
+  recommendation would require.

From 471fd43e835754042f72913710a2b75a477ce1e0 Mon Sep 17 00:00:00 2001
From: Ned Thompson <nthompson@bitwarden.com>
Date: Wed, 17 Jun 2026 12:36:29 -0400
Subject: [PATCH 2/9] split stack analysis from coverage report

---
 .claude-plugin/marketplace.json               |   2 +-
 .cspell.json                                  |   5 +
 README.md                                     |   2 +-
 .../.claude-plugin/plugin.json                |   4 +-
 plugins/bitwarden-test-engineer/CHANGELOG.md  |  56 +-
 plugins/bitwarden-test-engineer/README.md     |  50 +-
 .../bitwarden-test-engineer/agents/AGENT.md   | 136 +++++
 .../test-engineer-orchestrator/AGENT.md       | 137 -----
 .../references/input-sources.md               |  73 ++-
 .../references/report-style-tokens.md         | 496 ++++++++++++++++++
 .../skills/analyzing-test-stack/SKILL.md      |  33 +-
 .../references/html-report-template.md        | 173 ++++--
 .../references/monorepo-layout.md             |  45 +-
 .../references/severity-risk.md               |  81 +++
 .../references/testing-trophy.md              |  26 +-
 .../skills/assessing-test-coverage/SKILL.md   |  51 ++
 .../references/coverage-report-template.md    | 155 ++++++
 .../references/finding-coverage.md            | 119 +++++
 .../SKILL.md                                  |  70 ---
 .../references/adversarial-checklist.md       |  61 ---
 20 files changed, 1374 insertions(+), 401 deletions(-)
 create mode 100644 plugins/bitwarden-test-engineer/agents/AGENT.md
 delete mode 100644 plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md
 rename plugins/bitwarden-test-engineer/{skills/analyzing-test-stack => }/references/input-sources.md (51%)
 create mode 100644 plugins/bitwarden-test-engineer/references/report-style-tokens.md
 create mode 100644 plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
 create mode 100644 plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md
 create mode 100644 plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
 create mode 100644 plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
 delete mode 100644 plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md
 delete mode 100644 plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index c02023f..a457038 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -97,7 +97,7 @@
       "name": "bitwarden-test-engineer",
       "source": "./plugins/bitwarden-test-engineer",
       "version": "1.0.0",
-      "description": "Test engineering toolkit for Bitwarden. An orchestrator dispatches specialized testing skills — strategy and planning, automation, exploratory testing, and quality assessment."
+      "description": "Test engineering toolkit for Bitwarden. A generalist test-engineer agent dispatches specialized testing skills — strategy and planning, automation, exploratory testing, and quality assessment."
     }
   ]
 }
diff --git a/.cspell.json b/.cspell.json
index 14fee97..b8189cd 100644
--- a/.cspell.json
+++ b/.cspell.json
@@ -26,6 +26,7 @@
     "codeBlock",
     "CODEOWNERS",
     "Confluence",
+    "Consolas",
     "CQL",
     "customfield",
     "cvss",
@@ -73,6 +74,7 @@
     "lockfiles",
     "maxResults",
     "mcp",
+    "Menlo",
     "metacharacters",
     "modelcontextprotocol",
     "msword",
@@ -103,6 +105,7 @@
     "SDLC",
     "sast",
     "sbom",
+    "Segoe",
     "semver",
     "shellcheck",
     "shortlog",
@@ -124,12 +127,14 @@
     "tarpit",
     "thumbsup",
     "tinyui",
+    "tnum",
     "touchpoint",
     "touchpoints",
     "triaging",
     "unassigning",
     "unassigns",
     "ungroup",
+    "unlinkable",
     "unresponded",
     "unsanitized",
     "userflow",
diff --git a/README.md b/README.md
index c693611..61b5442 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ A curated collection of plugins for AI-assisted development at Bitwarden. Enable
 | [bitwarden-product-analyst](plugins/bitwarden-product-analyst/)     | 0.1.5   | Product analyst agent for creating comprehensive Bitwarden requirements documents from multiple sources                                                     |
 | [bitwarden-security-engineer](plugins/bitwarden-security-engineer/) | 1.2.0   | Application security engineering: vulnerability triage, threat modeling, and secure code analysis                                                           |
 | [bitwarden-software-engineer](plugins/bitwarden-software-engineer/) | 1.0.0   | Software engineer agent for a Bitwarden product team. Implements stories, tasks, and bugs with code quality, performance, security, and team comms in mind. |
-| [bitwarden-test-engineer](plugins/bitwarden-test-engineer/)         | 1.0.0   | Test engineering toolkit: an orchestrator dispatches testing skills strategy and planning, automation, exploratory testing, and quality assessment.         |
+| [bitwarden-test-engineer](plugins/bitwarden-test-engineer/)         | 1.0.0   | Test engineering toolkit: a generalist agent dispatches testing skills — strategy and planning, automation, exploratory testing, and quality assessment.    |
 | [claude-config-validator](plugins/claude-config-validator/)         | 1.1.1   | Validates Claude Code configuration files for security, structure, and quality                                                                              |
 | [claude-retrospective](plugins/claude-retrospective/)               | 1.1.1   | Analyze Claude Code sessions to identify successful patterns and improvement opportunities                                                                  |
 
diff --git a/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json b/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json
index 59fc07c..2d60354 100644
--- a/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json
+++ b/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json
@@ -1,7 +1,7 @@
 {
   "name": "bitwarden-test-engineer",
   "version": "1.0.0",
-  "description": "Test engineering toolkit for Bitwarden. An orchestrator dispatches specialized testing skills — strategy and planning, automation, exploratory testing, and quality assessment.",
+  "description": "Test engineering toolkit for Bitwarden. A generalist test-engineer agent dispatches specialized testing skills — strategy and planning, automation, exploratory testing, and quality assessment.",
   "author": {
     "name": "Bitwarden",
     "url": "https://github.com/bitwarden"
@@ -19,5 +19,5 @@
     "qa",
     "orchestrator"
   ],
-  "agents": "./agents/test-engineer-orchestrator/AGENT.md"
+  "agents": "./agents/AGENT.md"
 }
diff --git a/plugins/bitwarden-test-engineer/CHANGELOG.md b/plugins/bitwarden-test-engineer/CHANGELOG.md
index 76d0dbe..12cf16b 100644
--- a/plugins/bitwarden-test-engineer/CHANGELOG.md
+++ b/plugins/bitwarden-test-engineer/CHANGELOG.md
@@ -9,13 +9,34 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 
 - Initial release of the `bitwarden-test-engineer` plugin.
-- `test-engineer-orchestrator` agent: classifies the inputs for a change (Jira ticket,
+- `bitwarden-test-engineer` agent: classifies the inputs for a change (Jira ticket,
   GitHub PR, technical breakdown document, exported test-case CSV, plain-language
   description), fans out subagents to gather evidence — including a dedicated **breakdown
   reader** subagent (`sonnet`) that mines a tech breakdown for testable behaviors and its
-  status — runs the analyst skill, then automatically runs the adversarial counterpart
-  before presenting a consolidated result.
-- `analyzing-test-stack` skill: maps a change's testable behaviors to the cheapest
+  status — then runs the analyst skill and presents its recommendation. When
+  inspecting a checked-out repo, subagents read its Claude config (root `CLAUDE.md`,
+  `.claude/`, nested `CLAUDE.md`) for test conventions before opening test files, and
+  establish existing coverage PR-first (tests in linked/merged PRs) with a targeted lookup
+  for pre-existing tests — never a repo-wide grep. The agent runs a dedicated **assess
+  existing coverage** step (per-repo coverage scouts applying `assessing-test-coverage`)
+  after evidence gathering and before invoking `analyzing-test-stack`, passing the merged
+  coverage inventory into the recommendation.
+- `assessing-test-coverage` skill: a backward-looking inventory of what a change is
+  **already tested** by. Scoped to the change surface (PR-first, then a targeted lookup —
+  never a repo-wide sweep), it discovers each repo's test conventions config-first, buckets
+  every observed test by layer, cites it as a stable GitHub permalink (commit SHA, not
+  branch), records untested behaviors as `unverified` gaps, and writes its own self-contained
+  HTML **coverage report** (`test-coverage-report-<slug>-<date>.html`) following
+  `references/coverage-report-template.md`. Usable standalone to audit current coverage, and
+  consumed by `analyzing-test-stack`. Owns convention discovery, existing-test finding, and
+  the GitHub permalink citation rules (in `references/finding-coverage.md`) — concerns kept
+  separate from the trophy recommendation.
+- Plugin-level shared `references/`: `input-sources.md` (evidence-source ingestion, used by
+  both skills and the agent) and `report-style-tokens.md` (the single off-brand data-report
+  styling system that both the coverage report and the test-stack report inline verbatim, so the
+  two read as one instrument).
+- `analyzing-test-stack` skill: consumes the coverage inventory from `assessing-test-coverage`,
+  then maps a change's testable behaviors to the cheapest
   sufficient Testing Trophy layer (static, unit, integration, E2E) per platform and emits
   a self-contained HTML report to the current working directory. Accepts a **technical
   breakdown document** (a Bitwarden Tech Breakdown Confluence page, the artifact produced by
@@ -23,16 +44,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   source alongside Jira, PR, CSV, and plain-language inputs — mining its Part 2 scope
   checklist for the surfaces and platforms touched, its Part 4 specification child pages for
   the interfaces to test against, and its Part 5 open questions for untestable-requirement
-  risk. Includes references for the Testing Trophy model, the repo/stack layer→repo map,
-  evidence-source ingestion, and the HTML report template. The Atlassian
-  `search_confluence` / `search_confluence_cql` tools back locating a breakdown by
-  feature/team name when only a name (not a page ID) is given.
-- `challenging-test-stack-recommendations` skill: the adversarial counterpart that
-  red-teams the analyst's recommendation against known anti-patterns (ice-cream-cone,
-  unit-masquerading-as-integration, over-testing, untestable requirements, missing platform
-  layers, flaky-E2E candidates, ungrounded coverage claims) and returns a verdict of
-  endorse, revise, or reject-with-reasons.
-- Per-layer model governance to optimize token spend: the orchestrator runs on Opus
-  (its context drives the synthesis and adversarial reasoning), while its fan-out evidence
-  subagents are assigned explicitly — `sonnet` for sources that read a diff, ticket, or repo,
-  `haiku` for pure CSV parsing — rather than inheriting Opus.
+  risk. The report surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone,
+  over-testing, missing platform layers), recording ungrounded findings as `unverified`
+  gaps. Includes references for the Testing Trophy model, the repo/stack
+  layer→repo map, evidence-source ingestion, and the HTML report
+  template. The Atlassian `search_confluence` / `search_confluence_cql` tools back locating a
+  breakdown by feature/team name when only a name (not a page ID) is given.
+- Top-of-report `#overview` synthesis section, written by the analyst: a 2–4 sentence recap
+  of the recommended shape per platform, the top 3 open risks (drawn from
+  `#gaps`), and anchor links into the detail sections, so readers see the bottom line without
+  scrolling. The overview is additive — per-behavior detail stays in `#recommendations`/`#gaps`.
+- Per-layer model governance to optimize token spend: the agent runs on Opus
+  (its context drives the analysis and the recommendation), while the fan-out
+  evidence subagents are assigned explicitly — `sonnet` for sources that read a diff, ticket,
+  or repo, `haiku` for pure CSV parsing — rather than inheriting Opus.
diff --git a/plugins/bitwarden-test-engineer/README.md b/plugins/bitwarden-test-engineer/README.md
index 0895580..f18e06e 100644
--- a/plugins/bitwarden-test-engineer/README.md
+++ b/plugins/bitwarden-test-engineer/README.md
@@ -2,31 +2,28 @@
 
 ## Overview
 
-A test engineering toolkit for Bitwarden. An orchestrator analyzes a request and
-dispatches specialized skills across the testing discipline — test strategy and planning,
+A test engineering toolkit for Bitwarden. A generalist test-engineer agent analyzes a
+request and dispatches specialized skills across the testing discipline — test strategy and planning,
 automation, exploratory testing, and quality assessment. The plugin is designed to grow:
-new testing skills are added over time, and **every analytic skill ships with an
-adversarial counterpart** that red-teams its output before it reaches you. An unchallenged
-test plan tends to drift toward whatever is easiest to do rather than what actually buys
-confidence; the adversary exists to catch that.
+new testing skills are added over time.
 
 ### First capability: test-stack analysis
 
-Given a change — a feature, bugfix, refactor, or migration — the orchestrator recommends
-**what to test, at which layer, and why**, shaped as a **Testing Trophy**: a thin
-static-analysis base, a focused unit layer, a heavy integration layer where most confidence
-is bought, and a thin E2E layer reserved for critical user journeys.
+Given a change — a feature, bugfix, refactor, or migration — the agent recommends
+**what to test, at which layer, and why**, shaped as a **Testing Trophy**: a focused
+unit layer, a heavy integration layer where most confidence is bought, and a thin E2E
+layer reserved for critical user journeys.
 
 It ingests whatever evidence is available — a Jira ticket (via the Atlassian MCP), a GitHub
 PR (via `gh`), an exported test-case CSV, and/or a plain-language description — fans out
-subagents to gather it, runs the analyst skill (`analyzing-test-stack`) to produce a
-self-contained HTML report, then automatically runs its adversarial counterpart
-(`challenging-test-stack-recommendations`) to red-team the recommendation and consolidate a
-single report.
+subagents to gather it, assesses what is **already tested** (the `assessing-test-coverage`
+skill, which inventories existing tests, cites each as a GitHub permalink, and writes a
+coverage report), then runs the analyst skill (`analyzing-test-stack`), which produces the
+test-stack recommendation. Both skills emit a self-contained HTML report.
 
 ## Where each layer lives
 
-Static, unit, and integration tests live alongside the code inside each platform repo
+Unit and integration tests live alongside the code inside each platform repo
 (e.g. `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`). **End-to-end tests live
 in a dedicated, private `test` repository** — not inside the platform repos — so E2E
 recommendations target that separate repo, and existing E2E coverage is treated as
@@ -34,16 +31,16 @@ unverified when that repo isn't checked out.
 
 ## Agent
 
-| Agent                        | What It Does                                                                                                                                                                                                                            |
-| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `test-engineer-orchestrator` | Classifies the inputs for a change (Jira, PR, CSV, description), fans out subagents to gather evidence, runs `analyzing-test-stack`, then automatically runs `challenging-test-stack-recommendations` and consolidates a single report. |
+| Agent                     | What It Does                                                                                                                                                                                                                                                                         |
+| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `bitwarden-test-engineer` | Classifies the inputs for a change (Jira, PR, CSV, description), fans out subagents to gather evidence, assesses existing coverage (`assessing-test-coverage`), then runs `analyzing-test-stack` — emitting a self-contained coverage report and a self-contained test-stack report. |
 
 ## Skills
 
-| Skill                                    | What It Does                                                                                                                                                                                                                                                                                                                                         |
-| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `analyzing-test-stack`                   | The recommender. Maps each testable behavior in a change to the cheapest sufficient Testing Trophy layer per platform, names concrete tooling, surfaces coverage gaps, and writes a self-contained HTML report to the current working directory.                                                                                                     |
-| `challenging-test-stack-recommendations` | The adversarial counterpart. Re-derives the evidence independently and red-teams the recommendation against known anti-patterns (ice-cream-cone, unit-masquerading-as-integration, over-testing, untestable requirements, missing platform layers, flaky-E2E, ungrounded coverage), then returns a verdict: endorse, revise, or reject-with-reasons. |
+| Skill                     | What It Does                                                                                                                                                                                                                                                                                                                                                                                    |
+| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `assessing-test-coverage` | The backward-looking inventory. Determines what is **already tested** for a change — scoped to the change surface, PR-first then a targeted lookup — buckets each observed test by layer, cites it as a stable GitHub permalink, flags untested behaviors as gaps, and writes a self-contained HTML coverage report. Feeds `analyzing-test-stack`; usable standalone to audit current coverage. |
+| `analyzing-test-stack`    | The recommender. Consumes the coverage inventory, then maps each testable behavior in a change to the cheapest sufficient Testing Trophy layer per platform, names concrete tooling, surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, over-testing, missing platform layers), and writes a self-contained HTML report to the current working directory.                          |
 
 ## Cross-Plugin Integration
 
@@ -65,7 +62,7 @@ For Jira-backed analysis, install the Atlassian tools alongside it:
 
 ## Usage
 
-The orchestrator activates when you ask what test coverage a change needs, which
+The agent activates when you ask what test coverage a change needs, which
 automation layers to add, how to shape a test plan, or whether existing tests are at the
 right level:
 
@@ -82,8 +79,11 @@ Here's our exported test cases CSV for the billing migration — which of these
 automated and at what layer?
 ```
 
-Each run produces a self-contained `test-stack-report-<slug>-<date>.html` in the current
-working directory, containing the per-platform recommendation and the adversarial review.
+Each run produces two self-contained HTML files in the current working directory: a
+`test-coverage-report-<slug>-<date>.html` (what is already tested — observed tests per layer,
+each cited as a GitHub permalink, plus gaps) and a `test-stack-report-<slug>-<date>.html` (the
+per-platform recommendation and its coverage-gap findings). Both share one off-brand
+data-report visual system so they read as the same instrument.
 
 ## References
 
diff --git a/plugins/bitwarden-test-engineer/agents/AGENT.md b/plugins/bitwarden-test-engineer/agents/AGENT.md
new file mode 100644
index 0000000..fdc1dd6
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/agents/AGENT.md
@@ -0,0 +1,136 @@
+---
+name: bitwarden-test-engineer
+version: 1.0.0
+description: |
+  Test automation strategist for Bitwarden. Takes a feature, bugfix, or arbitrary change — described in plain language, in a Jira ticket, in a GitHub PR, in a technical breakdown document (a Confluence tech breakdown), and/or in an exported test-case CSV — and produces an evidence-driven recommendation for the right test automation layers (unit, integration, E2E) shaped as a Testing Trophy and risk-weighted by each behavior's defect severity (impact, not urgency), across Bitwarden's server, client, and mobile codebases. Gathers the evidence by fanning out subagents, assesses what is already tested (the `assessing-test-coverage` skill), then runs the analyst skill (`analyzing-test-stack`), which emits a self-contained HTML report. Use when the user asks what test coverage a change needs, which automation layers to add, how to shape a test plan, whether existing tests are over- or under-weighted, how to prioritize test coverage by risk, what tests a Critical/High bug needs, or asks for a "test stack" / "test strategy" / "test trophy" / "risk-based coverage" analysis for a ticket, PR, tech breakdown, or set of test cases.
+
+  <example>
+  Context: An engineer is about to start a Jira story and wants to know what test automation it should ship with.
+  user: "I'm picking up PM-12345 next sprint. What test coverage should this feature have?"
+  assistant: "I'll use the bitwarden-test-engineer agent to pull the requirements from PM-12345, map the change across the affected codebases, and produce a Testing Trophy recommendation."
+  <commentary>
+  Jira-key intake. The agent gathers the ticket via the Atlassian MCP, then runs Skill(analyzing-test-stack) to produce the report.
+  </commentary>
+  </example>
+
+  <example>
+  Context: A reviewer wants to know whether an open PR is adequately tested at the right layers.
+  user: "Does bitwarden/server#5821 have the right tests, or is it leaning too hard on end-to-end?"
+  assistant: "I'll use the bitwarden-test-engineer agent to read the PR diff and its tests, assess the trophy shape, and check specifically for an ice-cream-cone (too E2E-heavy) anti-pattern."
+  <commentary>
+  PR intake plus an explicit anti-pattern concern. The agent gathers the diff via gh, then runs the analyst, which assesses the trophy shape including the ice-cream-cone check.
+  </commentary>
+  </example>
+
+  <example>
+  Context: A QA engineer exported a set of manual test cases and wants an automation plan.
+  user: "Here's our exported test cases CSV for the billing migration work — which of these should be automated and at what layer?"
+  assistant: "I'll use the bitwarden-test-engineer agent to parse the CSV, bucket the existing cases by trophy layer, find the gaps, and produce a layer-by-layer automation recommendation."
+  <commentary>
+  CSV intake. The agent parses the export, then runs the analyst to map cases to layers and surface gaps.
+  </commentary>
+  </example>
+
+  <example>
+  Context: A tech lead just finished a tech breakdown and wants the test plan that should accompany it.
+  user: "I've got the tech breakdown for the new device-approval flow in Confluence — what test coverage should we plan across the stack?"
+  assistant: "I'll use the bitwarden-test-engineer agent to read the breakdown, mine its scope checklist and spec child pages for the surfaces and behaviors it touches, and produce a per-platform Testing Trophy recommendation."
+  <commentary>
+  Tech-breakdown intake. The agent fetches the Confluence breakdown via the Atlassian MCP, extracts testable behaviors and the affected platforms from Part 2, then runs the analyst to emit the report.
+  </commentary>
+  </example>
+model: opus
+tools:
+  - Read
+  - Write
+  - Glob
+  - Grep
+  - Skill
+  - Task
+  - AskUserQuestion
+  - Bash(gh pr view:*)
+  - Bash(gh pr diff:*)
+  - Bash(gh pr checks:*)
+  - Bash(git diff:*)
+  - Bash(git log:*)
+  - Bash(git rev-parse:*)
+  - Bash(git remote get-url:*)
+  - Bash(git -C * rev-parse:*)
+  - Bash(git -C * remote get-url:*)
+  - mcp__bitwarden-atlassian__get_issue
+  - mcp__bitwarden-atlassian__search_issues
+  - mcp__bitwarden-atlassian__get_issue_comments
+  - mcp__bitwarden-atlassian__get_issue_remote_links
+  - mcp__bitwarden-atlassian__get_confluence_page
+  - mcp__bitwarden-atlassian__search_confluence
+  - mcp__bitwarden-atlassian__search_confluence_cql
+skills:
+  - assessing-test-coverage
+  - analyzing-test-stack
+color: green
+---
+
+You are a test automation strategist for Bitwarden. Your job is to take a change — a feature, a bugfix, a refactor, or a migration — and tell the team **what to test, at which layer, and why**, shaped as a Testing Trophy: a unit layer for pure logic, a heavy integration layer where most confidence is bought, and a thin E2E layer reserved for critical user journeys.
+
+You do not write the tests. You produce a recommendation — an HTML report — that an engineer or QA can act on. Ground every layer call in evidence and keep the trophy shape honest, because a test plan tends to drift toward whatever is easiest to write rather than what actually buys confidence.
+
+## Operating context
+
+Bitwarden's code is split across several repositories, each with its own platform, stack, and test tooling. Assume the user works in a multi-repo layout such as `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`, and similar. A single feature frequently spans more than one of these (e.g. a server endpoint plus a web client plus a mobile screen), and each platform's trophy is shaped independently.
+
+**Where each layer lives:** unit and integration live alongside the code in each platform repo; **E2E lives in the dedicated `test` repo** (sibling of the platform repos). See `${CLAUDE_PLUGIN_ROOT}/skills/analyzing-test-stack/references/monorepo-layout.md` for the per-platform stack, tooling, and the layer→repo map.
+
+The Atlassian capabilities depend on the **`bitwarden-atlassian-tools`** plugin (the `mcp__bitwarden-atlassian__*` server). If it is not installed and the user references a Jira issue or a Confluence tech breakdown, do not fail — tell the user the MCP is unavailable and ask them to paste the requirements or the breakdown contents, or proceed from the PR / CSV / description they provided.
+
+## Workflow
+
+### 1. Intake and scope
+
+Classify every input the user supplied — Jira key, GitHub PR, Confluence tech breakdown (page ID/URL or feature/team name to search), CSV path, plain-language description. Inputs are additive; handle any combination. Per-source ingestion (Epic expansion, breakdown mining, CSV column mapping) is specified in `${CLAUDE_PLUGIN_ROOT}/references/input-sources.md` — don't re-derive it here.
+
+Then determine the **affected repos/platforms**. If scope is genuinely ambiguous and it changes the recommendation, use `AskUserQuestion` — otherwise infer and state your assumption.
+
+### 2. Fan out to gather evidence
+
+Spawn `Task` subagents **in parallel**, one per evidence source or affected repo, so your own context stays lean. Each subagent returns a compact structured digest (not raw dumps). Typical fan-out:
+
+- **Requirements reader** (model: `sonnet`) — resolves the Jira issue into testable behaviors and acceptance criteria, expanding Epics/Features to their children and feeding any linked PR URLs to the PR diff analyzer downstream. Captures the **severity** assigned on a bug/defect ticket so the recommendation can be risk-weighted. Follows the recipe in `${CLAUDE_PLUGIN_ROOT}/references/input-sources.md` → _Epic intake_.
+- **Breakdown reader** (model: `sonnet`) — fetches the tech breakdown via `mcp__bitwarden-atlassian__get_confluence_page` (searching first with `search_confluence`/`search_confluence_cql` when given only a name), then mines Part 2's scope checklist for the surfaces touched, the relevant Part 4 spec child pages for interfaces, and Part 5's open questions for untestable-requirement risk. Returns testable behaviors per platform plus the breakdown's status.
+- **PR diff analyzer** (model: `sonnet`) — `gh pr diff` / `gh pr view` to extract the change surface, public API touched, and tests already present.
+- **CSV parser** (model: `haiku`) — reads the export and buckets existing cases by apparent layer and automation status.
+
+Give each subagent a single source and a tight output contract. Skip any branch whose input was not supplied.
+
+**Set each subagent's model explicitly** — `haiku` for the CSV parser, `sonnet` for the rest. Never let a digest-returning subagent inherit Opus. See _Model selection_ below for the rationale.
+
+### 3. Assess existing coverage
+
+Once the change surface is known (the diff paths/symbols and named components from step 2), determine what is **already tested** before recommending anything new. Fan out a **per-repo coverage scout** (model: `sonnet`) for each affected platform repo, each applying the `assessing-test-coverage` skill: read the repo's Claude config for conventions, establish coverage **PR-first then via a targeted lookup scoped to the change surface** (never a repo-wide sweep), inspect the sibling `test` repo for E2E, and return a **permalink record per cited test** (`{ path, start_line, end_line, owner_repo, sha, layer, permalink }`, or `{ path, unlinkable_reason }` when an ingredient is missing) plus `unverified` gaps. The output contract, the PR-first/targeted-lookup discipline, and the SHA/`owner/repo` permalink recipe all live in `${CLAUDE_PLUGIN_ROOT}/skills/assessing-test-coverage/references/finding-coverage.md` — the scouts follow it; don't restate it here. Merge the scouts' records into a single coverage inventory.
+
+This step depends on step 2's change surface, so run it after the evidence fan-out (not interleaved). Scouts capture the SHA via `git -C <repo> rev-parse HEAD` and `owner/repo` via `git -C <repo> remote get-url origin`. Then invoke `Skill(assessing-test-coverage)` with the merged inventory and today's date: it writes a **self-contained HTML coverage report** to the current working directory as `test-coverage-report-<slug>-<date>.html` (the backward-looking inventory — observed tests per layer with permalinks, plus `unverified` gaps) and returns the inventory records for step 4. The scouts do the gathering; the skill assembles the report. Pass today's date — skills cannot read the clock.
+
+### 4. Recommend
+
+Invoke `Skill(analyzing-test-stack)` with the gathered digests **and the coverage inventory from step 3**. It maps each testable behavior to the cheapest sufficient trophy layer per platform, **risk-weighted by each behavior's severity** (the impact a defect would carry — read from a bug's Jira severity field or assessed against Bitwarden's severity guide; see the skill's `references/severity-risk.md`), names concrete tooling, surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, mislabeled layers, ungrounded coverage claims) ordered by severity, and writes a **self-contained HTML report** (inline CSS, no external dependencies) to the current working directory as `test-stack-report-<slug>-<date>.html`. The analyst writes the report's `#overview` itself. Pass today's date to the skill — skills cannot read the clock themselves.
+
+### 5. Present
+
+The run produces **two self-contained HTML files** in the current working directory: the `test-coverage-report-*.html` (what is already tested, from step 3) and the `test-stack-report-*.html` (the recommendation, from step 4). Mirror the test-stack report's `#overview` in chat: the recommended shape per platform, the top open risks the user should resolve before committing to the plan, and any coverage the analyst could not verify. Point the user at both files — the coverage report for the existing-test detail, the test-stack report for the per-behavior recommendation.
+
+## Principles
+
+- **Evidence over assertion.** Every recommended layer ties back to a specific behavior, requirement, diff hunk, or existing test. Flag anything you could not ground.
+- **Cheapest sufficient layer.** Push confidence down the trophy — prefer integration over E2E, unit over integration — unless a behavior genuinely requires the higher layer.
+- **Risk-weighted by severity.** Coverage rigor scales with the impact a defect would carry, not with how urgently it ships. Critical behaviors (core flows, data integrity, security) owe their failure modes full coverage and lead the gap list; Low behaviors earn minimal coverage and never an E2E test. Severity (impact) ≠ priority (urgency).
+- **Degrade gracefully.** A missing input (no Jira MCP, no PR, no CSV, no `test` repo checkout) narrows the analysis; it never blocks it. State what you could not see.
+- **Read repo config first.** When the analysis touches a checked-out codebase, the coverage scouts read its Claude config (root `CLAUDE.md`, `.claude/`, and nested `CLAUDE.md` for the touched subdirs) before opening test files, and honor its test conventions over generic defaults. Explore test files only as a fallback for conventions the config doesn't cover. See `${CLAUDE_PLUGIN_ROOT}/skills/assessing-test-coverage/references/finding-coverage.md` → _Discovering a repo's test conventions_.
+- **Coverage before recommendation.** Assess what already exists (step 3) before mapping new layers (step 4); the recommendation is incremental against observed coverage, not absolute.
+
+## Model selection
+
+Model spend is governed here in the plugin, not left to the session default. The split:
+
+- **You (the test-engineer agent) run on Opus.** Your context is where the genuinely hard work happens: classifying intake, then running `analyzing-test-stack` — mapping behaviors to the cheapest sufficient layer across multiple platforms — all in _your_ context, so your model sets its quality. This is cross-repo strategic reasoning where a wrong recommendation is expensive to act on; it justifies Opus.
+- **Subagents run on Sonnet or Haiku.** Everything you fan out is evidence gathering that returns a compact digest. Sonnet handles anything that reads a diff, ticket, or repo; Haiku handles pure parsing. Assign the model explicitly on every `Task` (see step 2) rather than letting it inherit Opus.
+
+Rule of thumb: push the cheap, high-volume gathering down to Sonnet/Haiku; keep only the irreducible reasoning on Opus.
diff --git a/plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md b/plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md
deleted file mode 100644
index 37934b1..0000000
--- a/plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md
+++ /dev/null
@@ -1,137 +0,0 @@
----
-name: test-engineer-orchestrator
-version: 1.0.0
-description: |
-  Test automation strategist for Bitwarden. Takes a feature, bugfix, or arbitrary change — described in plain language, in a Jira ticket, in a GitHub PR, in a technical breakdown document (a Confluence tech breakdown), and/or in an exported test-case CSV — and produces an evidence-driven recommendation for the right test automation layers (static, unit, integration, E2E) shaped as a Testing Trophy, across Bitwarden's server, client, and mobile codebases. Gathers the evidence by fanning out subagents, runs the analyst skill to synthesize a recommendation and HTML report, then automatically runs the adversarial counterpart to red-team it before presenting a consolidated result. Use when the user asks what test coverage a change needs, which automation layers to add, how to shape a test plan, whether existing tests are over- or under-weighted, or asks for a "test stack" / "test strategy" / "test trophy" analysis for a ticket, PR, tech breakdown, or set of test cases.
-
-  <example>
-  Context: An engineer is about to start a Jira story and wants to know what test automation it should ship with.
-  user: "I'm picking up PM-12345 next sprint. What test coverage should this feature have?"
-  assistant: "I'll use the test-engineer-orchestrator agent to pull the requirements from PM-12345, map the change across the affected codebases, and produce a Testing Trophy recommendation — then red-team it before handing it back."
-  <commentary>
-  Jira-key intake. The orchestrator gathers the ticket via the Atlassian MCP, runs Skill(analyzing-test-stack), then auto-runs Skill(challenging-test-stack-recommendations).
-  </commentary>
-  </example>
-
-  <example>
-  Context: A reviewer wants to know whether an open PR is adequately tested at the right layers.
-  user: "Does bitwarden/server#5821 have the right tests, or is it leaning too hard on end-to-end?"
-  assistant: "I'll use the test-engineer-orchestrator agent to read the PR diff and its tests, assess the trophy shape, and run the adversarial pass to specifically check for an ice-cream-cone (too E2E-heavy) anti-pattern."
-  <commentary>
-  PR intake plus an explicit anti-pattern concern. The orchestrator gathers the diff via gh, then chains analyst → adversary.
-  </commentary>
-  </example>
-
-  <example>
-  Context: A QA engineer exported a set of manual test cases and wants an automation plan.
-  user: "Here's our exported test cases CSV for the billing migration work — which of these should be automated and at what layer?"
-  assistant: "I'll use the test-engineer-orchestrator agent to parse the CSV, bucket the existing cases by trophy layer, find the gaps, and produce a layer-by-layer automation recommendation with an adversarial review."
-  <commentary>
-  CSV intake. The orchestrator parses the export, runs the analyst to map cases to layers and surface gaps, then the adversary challenges the recommendation.
-  </commentary>
-  </example>
-
-  <example>
-  Context: A tech lead just finished a tech breakdown and wants the test plan that should accompany it.
-  user: "I've got the tech breakdown for the new device-approval flow in Confluence — what test coverage should we plan across the stack?"
-  assistant: "I'll use the test-engineer-orchestrator agent to read the breakdown, mine its scope checklist and spec child pages for the surfaces and behaviors it touches, and produce a per-platform Testing Trophy recommendation — then red-team it."
-  <commentary>
-  Tech-breakdown intake. The orchestrator fetches the Confluence breakdown via the Atlassian MCP, extracts testable behaviors and the affected platforms from Part 2, then chains analyst → adversary.
-  </commentary>
-  </example>
-model: opus
-tools:
-  - Read
-  - Write
-  - Glob
-  - Grep
-  - Skill
-  - Task
-  - AskUserQuestion
-  - Bash(gh pr view:*)
-  - Bash(gh pr diff:*)
-  - Bash(gh pr checks:*)
-  - Bash(git diff:*)
-  - Bash(git log:*)
-  - mcp__bitwarden-atlassian__get_issue
-  - mcp__bitwarden-atlassian__search_issues
-  - mcp__bitwarden-atlassian__get_issue_comments
-  - mcp__bitwarden-atlassian__get_issue_remote_links
-  - mcp__bitwarden-atlassian__get_confluence_page
-  - mcp__bitwarden-atlassian__search_confluence
-  - mcp__bitwarden-atlassian__search_confluence_cql
-skills:
-  - analyzing-test-stack
-  - challenging-test-stack-recommendations
-color: green
----
-
-You are a test automation strategist for Bitwarden. Your job is to take a change — a feature, a bugfix, a refactor, or a migration — and tell the team **what to test, at which layer, and why**, shaped as a Testing Trophy: a thin static-analysis base, a unit layer for pure logic, a heavy integration layer where most confidence is bought, and a thin E2E layer reserved for critical user journeys.
-
-You do not write the tests. You produce a recommendation — an HTML report — that an engineer or QA can act on. Every recommendation you produce is challenged by an adversarial pass before you present it, because an unchallenged test plan tends to drift toward whatever is easiest to write rather than what actually buys confidence.
-
-## Operating context
-
-Bitwarden's code is split across several repositories, each with its own platform, stack, and test tooling. Assume the user works in a multi-repo layout such as `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`, and similar. A single feature frequently spans more than one of these (e.g. a server endpoint plus a web client plus a mobile screen), and each platform's trophy is shaped independently.
-
-**Where each layer lives:** static, unit, and integration tests live alongside the code, inside each platform repo. **End-to-end (E2E) tests live in a dedicated, private `test` repository** — not inside the platform repos. So an E2E recommendation always targets that separate repo, and a per-repo coverage scout will not find existing E2E tests inside `server`/`clients`/`ios`; it must look in the `test` repo (and the user may not have it checked out — degrade gracefully and say so). Read `${CLAUDE_PLUGIN_ROOT}/skills/analyzing-test-stack/references/monorepo-layout.md` for the per-platform stack, tooling, and the layer→repo map.
-
-The Atlassian capabilities depend on the **`bitwarden-atlassian-tools`** plugin (the `mcp__bitwarden-atlassian__*` server). If it is not installed and the user references a Jira issue or a Confluence tech breakdown, do not fail — tell the user the MCP is unavailable and ask them to paste the requirements or the breakdown contents, or proceed from the PR / CSV / description they provided.
-
-## Workflow
-
-### 1. Intake and scope
-
-Classify every input the user supplied. Inputs are additive — handle any combination:
-
-- **Jira key** (e.g. `PM-12345`) → requirements and acceptance criteria.
-- **GitHub PR** (URL or `owner/repo#number`) → the actual change surface and any tests already present.
-- **Technical breakdown** (a Confluence page ID/URL, or a feature/team name to search for) → a Bitwarden Tech Breakdown whose scope checklist already enumerates the platforms and surfaces the change touches, with spec child pages defining the interfaces. Often the richest single input.
-- **CSV path** → an exported set of existing/planned test cases (column layout described in the analyst skill's `references/input-sources.md`).
-- **Plain-language description** → the change itself when no artifact exists.
-
-Then determine the **affected repos/platforms**. If scope is genuinely ambiguous and it changes the recommendation, use `AskUserQuestion` — otherwise infer and state your assumption.
-
-### 2. Fan out to gather evidence
-
-Spawn `Task` subagents **in parallel**, one per evidence source or affected repo, so your own context stays lean. Each subagent returns a compact structured digest (not raw dumps). Typical fan-out:
-
-- **Requirements reader** (model: `sonnet`) — resolves the Jira issue (via `Skill(bitwarden-atlassian-tools:researching-jira-issues)` if available, else the `mcp__bitwarden-atlassian__*` tools) into testable behaviors and acceptance criteria.
-- **Breakdown reader** (model: `sonnet`) — fetches the tech breakdown via `mcp__bitwarden-atlassian__get_confluence_page` (searching first with `search_confluence`/`search_confluence_cql` when given only a name), then mines Part 2's scope checklist for the surfaces touched, the relevant Part 4 spec child pages for interfaces, and Part 5's open questions for untestable-requirement risk. Returns testable behaviors per platform plus the breakdown's status.
-- **PR diff analyzer** (model: `sonnet`) — `gh pr diff` / `gh pr view` to extract the change surface, public API touched, and tests already present.
-- **CSV parser** (model: `haiku`) — reads the export and buckets existing cases by apparent layer and automation status.
-- **Per-repo coverage scout** (model: `sonnet`) — for each affected platform repo, surveys existing static/unit/integration conventions and where comparable behavior is tested today. For E2E, scout the dedicated `test` repo if it is checked out; otherwise note it as unverified.
-
-Give each subagent a single source and a tight output contract. Skip any branch whose input was not supplied.
-
-**Set each subagent's model explicitly to control cost.** This fan-out is the bulk of the plugin's token spend, and the work is evidence gathering — read a source, extract, return a compact digest — not the strategic reasoning you reserve for yourself. Spawn each `Task` on the cheapest model that fits: **`haiku`** for pure mechanical parsing (the CSV parser), **`sonnet`** for everything that reads code, a diff, or a ticket and summarizes it (the default for these subagents). Do **not** let a subagent inherit your Opus model — a digest-returning agent never needs it. Reserve Opus for your own context, where the synthesis and adversarial reasoning happen (see Model selection below).
-
-### 3. Recommend
-
-Invoke `Skill(analyzing-test-stack)` with the gathered digests. It maps each testable behavior to the cheapest sufficient trophy layer per platform, names concrete tooling, surfaces coverage gaps, and writes a **self-contained HTML report** (inline CSS, no external dependencies) to the current working directory as `test-stack-report-<slug>-<date>.html`. Pass today's date to the skill — skills cannot read the clock themselves.
-
-### 4. Adversary (automatic)
-
-Immediately invoke `Skill(challenging-test-stack-recommendations)` on the report and the underlying evidence. It red-teams the recommendation against known failure modes — ice-cream-cone (too E2E-heavy), unit-tests-masquerading-as-integration, over-testing trivial code, untestable/ambiguous requirements, a missing platform layer, flaky-E2E candidates, and coverage claimed without evidence — and returns a critique with a verdict: **endorse**, **revise**, or **reject-with-reasons**.
-
-This pass is not optional. If the user explicitly asks to skip it, comply but state plainly in your summary that the recommendation was not adversarially reviewed.
-
-### 5. Consolidate
-
-Merge the critique into the report as a clearly labeled "Adversarial Review" section, so a single HTML file carries both the recommendation and its challenge. In chat, give a short summary: the recommended shape per platform, the adversary's verdict, and the top open risks the user should resolve before committing to the plan.
-
-## Principles
-
-- **Evidence over assertion.** Every recommended layer ties back to a specific behavior, requirement, diff hunk, or existing test. Flag anything you could not ground.
-- **Cheapest sufficient layer.** Push confidence down the trophy — prefer integration over E2E, unit over integration — unless a behavior genuinely requires the higher layer.
-- **Degrade gracefully.** A missing input (no Jira MCP, no PR, no CSV, no `test` repo checkout) narrows the analysis; it never blocks it. State what you could not see.
-- **Read the repo's CLAUDE.md** when the analysis touches a specific checked-out codebase — honor its test conventions over generic defaults.
-
-## Model selection
-
-Model spend is governed here in the plugin, not left to the session default. The split:
-
-- **You (the orchestrator) run on Opus.** Your context is where the genuinely hard work happens: classifying intake, then running `analyzing-test-stack` (mapping behaviors to the cheapest sufficient layer across multiple platforms) and `challenging-test-stack-recommendations` (red-teaming that recommendation) — both execute in _your_ context, so your model sets their quality. This is cross-repo strategic reasoning where a wrong recommendation is expensive to act on; it justifies Opus.
-- **Subagents run on Sonnet or Haiku.** Everything you fan out is evidence gathering that returns a compact digest. Sonnet handles anything that reads a diff, ticket, or repo; Haiku handles pure parsing. Assign the model explicitly on every `Task` (see step 2) rather than letting it inherit Opus.
-
-Rule of thumb: push the cheap, high-volume gathering down to Sonnet/Haiku; keep only the irreducible reasoning on Opus.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/input-sources.md b/plugins/bitwarden-test-engineer/references/input-sources.md
similarity index 51%
rename from plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/input-sources.md
rename to plugins/bitwarden-test-engineer/references/input-sources.md
index b6b2d46..9724f09 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/input-sources.md
+++ b/plugins/bitwarden-test-engineer/references/input-sources.md
@@ -20,13 +20,70 @@ Otherwise use the MCP tools directly:
 Extract: discrete **testable behaviors**, **acceptance criteria**, and the **platforms/
 components** named. If the MCP is unavailable, ask the user to paste the requirements.
 
+Also capture **severity** — for a bug/defect ticket, read the severity assigned on the issue
+(the severity field, or the QA/reporter's stated severity in the description/comments) and
+carry it with the behaviors; for a feature/story without a defect, leave it to the analyst to
+assess each behavior's risk severity. Severity is the impact dial the `analyzing-test-stack`
+skill uses to risk-weight coverage — see that skill's `references/severity-risk.md`, mirrored
+from the Defect Severity Classification Guide (Confluence page `2759229512`).
+
+### Epic intake
+
+A Jira key may resolve to an Epic (or, in next-gen projects, a Feature) rather than a single
+story. The epic body itself rarely lists testable behaviors — those live on its children
+and on the PRs the children produce. If you analyze only the epic, you will under-scope the
+trophy. So when the `issuetype` on the `get_issue` response is `Epic` or `Feature`, expand
+before extracting:
+
+1. **Discover children.** Read the `subtasks` field first. If empty (common in next-gen
+   projects, which use `parent` relationships rather than the legacy `subtasks` field), fall
+   back to `mcp__bitwarden-atlassian__search_issues` with JQL `parent = <EPIC-KEY>`. On
+   classic projects, also try `"Epic Link" = <EPIC-KEY>`. Together these cover both schemas.
+2. **Bound the fan-out.** If the epic has more than ~10 children, fetch the first 10 in full
+   and summarize the rest as a one-line list (key, status, summary) from the search results.
+   This matches the depth-control discipline in
+   `bitwarden-atlassian-tools:researching-jira-issues` (Steps 2–3) — re-use that recipe; do
+   not re-derive it.
+3. **Per child, gather behaviors and PRs.**
+   - `mcp__bitwarden-atlassian__get_issue` for the child's description and acceptance
+     criteria — these are the testable behaviors for the trophy.
+   - `mcp__bitwarden-atlassian__get_issue_remote_links` for PRs (grouped under "GitHub").
+     Each PR URL becomes an input to the **GitHub PR** branch below: hand it off to
+     `gh pr view` / `gh pr diff` so the actual change surface and any tests-in-PR feed the
+     recommendation. **These merged/linked PRs are the reliable backbone for existing
+     coverage** — the tests they contain are what shipped with this work, and the PR head SHA
+     makes each one permalink-ready (see the `assessing-test-coverage` skill's
+     `references/finding-coverage.md` → _Finding existing coverage_).
+     If `gh` cannot reach a PR (private fork, not authenticated, repo not accessible), record
+     the URL as evidence-not-inspected in the report rather than silently dropping it.
+4. **Track epic and child status.** The epic's own status (`In Planning`, `In Progress`,
+   `Done`) frames how far along the work is overall; each child's status tells you what has
+   shipped: children in `Done` with merged PRs likely already have tests-in-PR you can audit
+   for shape; children still `To Do` are scope-only, so the recommendation for them is
+   necessarily prospective. Surface this distinction in the Evidence section of the report.
+5. **Preferred path when available.** If `bitwarden-atlassian-tools` is installed, invoke
+   `Skill(bitwarden-atlassian-tools:researching-jira-issues)` on the epic key — its Step 2
+   already does the hierarchical-link discovery and Step 3 the depth-controlled traversal,
+   and returns the children + linked Confluence pages + remote links in one synthesized read.
+   Use the direct MCP calls above only when that skill is unavailable.
+
 ## GitHub PR
 
-- `gh pr view <pr>` — title, body, linked issues, files changed, checks.
+- `gh pr view <pr> --json url,headRefOid,baseRefName,title,body,files,state` — title,
+  body (where linked issues are referenced), base branch, state, files changed, **and the
+  head SHA + PR URL (→ `owner/repo`)** needed for permalink production downstream.
 - `gh pr diff <pr>` — the actual change surface.
 
-Extract: the public API / behavior touched, the diff paths (→ which repos/platforms), and
-**any tests already included in the PR** (so you assess incremental, not absolute, gaps).
+Extract: the public API / behavior touched, the diff paths (→ which repos/platforms),
+**any tests already included in the PR** (so you assess incremental, not absolute,
+gaps), and the captured **`headRefOid`** + **`owner/repo`** (parsed from the PR URL).
+The SHA and `owner/repo` are required — they are what makes every test cited as
+existing coverage clickable in the report. Tests observed in the PR diff are primary
+coverage evidence; for _pre-existing_ tests not in the diff, do a targeted lookup scoped
+to the changed paths/symbols rather than a repo-wide sweep. See the
+`assessing-test-coverage` skill's `references/finding-coverage.md` → _Finding existing
+coverage_ and _Citing tests as GitHub permalinks_ for the link form and the fallback when
+ingredients are missing.
 
 ## Technical breakdown document
 
@@ -48,7 +105,10 @@ Locate and fetch it:
 Map its structure to testable evidence (the canonical template is page `2920349776`):
 
 - **Part 1 — Problem overview**: the feature framing and linked Jira epic. Use it for scope and
-  to cross-link any Jira/PR inputs, not as a behavior source on its own.
+  to cross-link any Jira/PR inputs, not as a behavior source on its own. **When Part 1 names an
+  Epic**, treat it the same as an Epic-key intake — drill into its children and their PR remote
+  links per the _Epic intake_ recipe above. A breakdown plus its epic together usually surface
+  more testable behavior than either alone.
 - **Part 2 — Breakdown scope checklist**: the core of the mining. Each answered item names a
   surface the change touches and therefore a place tests are needed — **Database changes**
   (migration/backwards-compat behaviors, EDD phasing), **API changes** (endpoint contracts,
@@ -66,7 +126,8 @@ Map its structure to testable evidence (the canonical template is page `29203497
   can't be reliably tested until its question is answered. Surface them in the report's gaps.
 
 Extract: discrete **testable behaviors** per platform, the **surfaces** each touches (→ repos via
-`monorepo-layout.md`), and the team's **stated testing intent** (to evaluate, not echo). Where the
+the `analyzing-test-stack` skill's `references/monorepo-layout.md`), and the team's **stated testing
+intent** (to evaluate, not echo). Where the
 breakdown's scope checklist disagrees with a diff or ticket you were also given, treat the
 divergence as a finding rather than silently picking one.
 
@@ -82,7 +143,7 @@ settings — **do not hardcode them**. Read the header row, then map by meaning:
 - A **steps / expected-result** column, often in Given–When–Then form — the behavior.
 - Optional **team / area / tags / preconditions** columns — scope and grouping.
 
-Map rows to behaviors and bucket each by apparent layer using `testing-trophy.md`:
+Map rows to behaviors and bucket each by apparent layer using the `analyzing-test-stack` skill's `references/testing-trophy.md`:
 
 - A case that drives the full UI through a complete journey → likely **E2E** (target the
   dedicated `test` repo).
diff --git a/plugins/bitwarden-test-engineer/references/report-style-tokens.md b/plugins/bitwarden-test-engineer/references/report-style-tokens.md
new file mode 100644
index 0000000..4957f9f
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/references/report-style-tokens.md
@@ -0,0 +1,496 @@
+# Report style tokens — data-report visual system for HTML reports
+
+This file is the **single source of styling truth** for every self-contained HTML report the
+`bitwarden-test-engineer` plugin emits — the `analyzing-test-stack` test-stack report and the
+`assessing-test-coverage` coverage report alike. The HTML output requirements (single file,
+inline CSS, no external/CDN assets, no web fonts, no JS) mean a report cannot `<link>` to a
+design system at runtime — instead, **inline the stylesheet block at the bottom of this file
+verbatim** into the report's `<style>` element.
+
+The look is deliberately **not** a brand skin. It is a quiet, ink-on-paper _data report_
+— the aesthetic of a statistical notebook or a coverage readout, where the data is the
+hero and nothing decorates. Every report ships the same system so two reports read as the
+same instrument. Do not re-pick colors, fonts, or layer tokens per report.
+
+## Design intent (why these choices)
+
+- **Flat paper, no chrome.** White page, hairline rules, no cards, no shadows, no
+  rounded panels. Sections are separated by a single rule and whitespace. Simple and
+  low-key by construction.
+- **Monospace is a structural role, not just for code.** Section numbers, eyebrows,
+  table headers, layer/badge chips, axis labels, counts, and SHAs are all set in
+  the system monospace stack. Prose is set in the system sans stack. The split makes
+  "data" and "argument" visually distinct and gives the report its notebook character
+  without any web font.
+- **The layer ramp is sequential, because the layers are ordered.** unit → integration
+  → e2e is a cost/depth sequence (cheapest/shallowest → most expensive/deepest). A
+  single-hue light→dark ramp encodes that order honestly; a thin dark sliver therefore
+  reads as "expensive, used sparingly." Do not swap it for unrelated categorical hues.
+- **State colors are categorical and muted.** The assumption/warn/ok badges each carry
+  exactly one meaning. Muted traffic colors, not saturated brand colors.
+
+## Token → meaning mapping (binding)
+
+These mappings are **normative**. Do not re-pick colors per report.
+
+### Layer tokens (used wherever a Testing Trophy layer is rendered — chips, distribution bars, table cells)
+
+| Layer       | Token           | HEX       | Role in the ramp                 |
+| ----------- | --------------- | --------- | -------------------------------- |
+| unit        | `--unit`        | `#8FB3D1` | lightest — cheapest / shallowest |
+| integration | `--integration` | `#3F7196` | mid — the trophy's bulge         |
+| e2e         | `--e2e`         | `#1D3A54` | deepest — most expensive, thin   |
+
+`unit` is light, so layer chips and bar segments at the unit layer use **dark** text
+(`--on-unit`); integration and e2e use **white** text (`--on-deep`).
+
+### Badge / state tokens
+
+| Badge      | Token    | Use                                             |
+| ---------- | -------- | ----------------------------------------------- |
+| assumption | `--warn` | Anything inferred without direct evidence       |
+| warn       | `--bad`  | Risks, missing-input flags, unverifiable claims |
+| ok         | `--ok`   | Confirmed coverage, grounded calls              |
+
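+All badge chips use white (`--on-state`) text on these muted fills — the one contrast tradeoff in the
+system, kept legible by bold mono chip text at small sizes. Mind the name offset: the `warn` badge fills with `--bad`, not `--warn` (which belongs to `assumption`).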
+
+### Surface, ink, and structural tokens
+
+| Token         | HEX       | Use                                           |
+| ------------- | --------- | --------------------------------------------- |
+| `--paper`     | `#FFFFFF` | Page background (flat — no cards)             |
+| `--panel`     | `#F4F6F8` | Inline code, chart track, table row hover     |
+| `--ink`       | `#16191D` | Primary text                                  |
+| `--ink-soft`  | `#585F68` | Secondary text, captions, table cells of note |
+| `--ink-faint` | `#818892` | Eyebrows, section numbers, axis labels        |
+| `--rule`      | `#E4E7EA` | Hairlines, dividers, table row borders        |
+| `--link`      | `#2F6E9E` | Links                                         |
+
+## Typography
+
+System fonts only — **no web fonts, no `@font-face`, no CDN imports**. Two roles, mapped
+to two stacks via `--sans` (prose) and `--mono` (data, labels, chrome):
+
+```
+--sans: system-ui, -apple-system, "Segoe UI", Roboto, Helvetica, Arial, sans-serif
+--mono: ui-monospace, "SF Mono", SFMono-Regular, Menlo, Consolas, "Liberation Mono", monospace
+```
+
+## Graphics — the layer-distribution chart
+
+The one graphic the report needs is the **recommended layer distribution per platform**,
+rendered as a normalized horizontal **stacked bar** (a `<figure>` captioned `Fig 1`):
+
+- One `.dist-row` per platform: a right-aligned `.dist-label` (the platform) and a
+  `.bar` track holding one `.seg` per layer present.
+- **Segment width is proportional to the recommended test count at that layer** — set it
+  with an inline `style="flex: <count>"`. The flex values are the raw counts; the browser
+  normalizes them to fill the track. Do not hand-compute percentages or pixel widths.
+- Each segment shows its **count** as a monospace label inside it; the shared `.legend`
+  above maps color → layer. A `figcaption` names the figure. The unit segment carries
+  **dark** text (`--on-unit`) like the unit chip; integration and e2e segments carry
+  white (`--on-deep`).
+
+This replaces any arbitrary fixed-width bar. The chart is the report's signature: keep
+everything around it quiet so it reads.
+
+## Paste-ready stylesheet
+
+Paste the entire block below — unchanged — into the report's `<style>` element, as a single
+contiguous block. **Both report templates inline this identically** — the coverage report
+(`assessing-test-coverage`'s `coverage-report-template.md`) and the test-stack report
+(`analyzing-test-stack`'s `html-report-template.md`). Do not prune unused selectors, do not
+reorder, and do not let one report carry a trimmed copy; that is exactly how two reports that
+claim the same system drift apart. Component classes (`.layer.*`, `.badge.*`,
+`.dist`/`.seg.*`, `.shapes`, etc.) are part of the binding contract — both templates reference
+them by name.
+
+```css
+:root {
+  /* Surfaces & ink — flat paper, no cards or shadows */
+  --paper: #ffffff;
+  --panel: #f4f6f8;
+  --ink: #16191d;
+  --ink-soft: #585f68;
+  --ink-faint: #818892;
+  --rule: #e4e7ea;
+
+  /* Layer ramp — SEQUENTIAL: ordered cheap/shallow -> costly/deep */
+  --unit: #8fb3d1;
+  --integration: #3f7196;
+  --e2e: #1d3a54;
+  --on-unit: #16191d; /* --unit is light: use dark text */
+  --on-deep: #ffffff; /* white text on integration/e2e */
+
+  /* Verdict & state — muted categorical */
+  --ok: #43875a;
+  --warn: #b07d2f;
+  --bad: #bf564a;
+  --on-state: #ffffff;
+
+  --link: #2f6e9e;
+
+  --sans:
+    system-ui, -apple-system, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
+  --mono:
+    ui-monospace, "SF Mono", SFMono-Regular, Menlo, Consolas, "Liberation Mono",
+    monospace;
+}
+
+* {
+  box-sizing: border-box;
+}
+html {
+  -webkit-text-size-adjust: 100%;
+}
+
+body {
+  margin: 0;
+  background: var(--paper);
+  color: var(--ink);
+  font: 15px/1.6 var(--sans);
+  font-feature-settings: "tnum" 1; /* tabular figures where supported */
+}
+
+a {
+  color: var(--link);
+  text-decoration: underline;
+  text-underline-offset: 2px;
+  text-decoration-thickness: 1px;
+}
+a:focus-visible,
+summary:focus-visible {
+  outline: 2px solid var(--link);
+  outline-offset: 2px;
+}
+
+/* Masthead */
+header {
+  max-width: 60rem;
+  margin: 0 auto;
+  padding: 56px 32px 28px;
+}
+header .eyebrow {
+  margin: 0 0 14px;
+  font: 600 11px/1 var(--mono);
+  letter-spacing: 0.18em;
+  text-transform: uppercase;
+  color: var(--ink-faint);
+}
+header h1 {
+  margin: 0 0 12px;
+  font-size: 28px;
+  line-height: 1.2;
+  font-weight: 650;
+  letter-spacing: -0.01em;
+}
+header .meta {
+  font: 12px/1.6 var(--mono);
+  color: var(--ink-soft);
+}
+header .meta a {
+  color: var(--ink-soft);
+}
+
+/* Sections — flat, hairline-separated, auto-numbered */
+main {
+  max-width: 60rem;
+  margin: 0 auto;
+  padding: 0 32px 96px;
+  counter-reset: sec;
+}
+section {
+  counter-increment: sec;
+  padding: 36px 0;
+  border-top: 1px solid var(--rule);
+}
+section:first-of-type {
+  border-top: 0;
+}
+section > h2 {
+  margin: 0 0 18px;
+  font-size: 19px;
+  font-weight: 650;
+  letter-spacing: -0.01em;
+}
+section > h2::before {
+  content: counter(sec, decimal-leading-zero);
+  display: inline-block;
+  margin-right: 12px;
+  font: 600 12px/1 var(--mono);
+  letter-spacing: 0.1em;
+  color: var(--ink-faint);
+  vertical-align: 2px;
+}
+section h3 {
+  margin: 28px 0 10px;
+  font: 600 11px/1.3 var(--mono);
+  letter-spacing: 0.12em;
+  text-transform: uppercase;
+  color: var(--ink-soft);
+}
+
+/* Prose */
+p {
+  margin: 0 0 14px;
+  max-width: 72ch;
+}
+.lead {
+  font-size: 16px;
+}
+.small {
+  font-size: 12.5px;
+  color: var(--ink-soft);
+}
+ul.tight {
+  margin: 8px 0 16px;
+  padding-left: 20px;
+}
+ul.tight li {
+  margin: 0 0 6px;
+}
+ol {
+  padding-left: 22px;
+}
+ol li {
+  margin: 0 0 10px;
+}
+code {
+  font: 0.86em var(--mono);
+  background: var(--panel);
+  padding: 1px 5px;
+  border-radius: 3px;
+}
+
+/* Tables — heavy header rule, hairline rows */
+.scroll {
+  overflow-x: auto;
+}
+table {
+  width: 100%;
+  border-collapse: collapse;
+  margin: 4px 0 18px;
+  font-size: 13.5px;
+}
+thead th {
+  text-align: left;
+  vertical-align: bottom;
+  padding: 0 12px 8px;
+  font: 600 10.5px/1.3 var(--mono);
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: var(--ink-faint);
+  border-bottom: 1px solid var(--ink);
+}
+tbody td {
+  vertical-align: top;
+  padding: 10px 12px;
+  border-bottom: 1px solid var(--rule);
+}
+tbody tr:hover {
+  background: var(--panel);
+}
+th:first-child,
+td:first-child {
+  padding-left: 0;
+}
+th:last-child,
+td:last-child {
+  padding-right: 0;
+}
+
+/* Layer chip */
+.layer {
+  display: inline-block;
+  font: 600 10.5px/1.6 var(--mono);
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+  padding: 2px 8px;
+  border-radius: 2px;
+  white-space: nowrap;
+}
+.layer.unit {
+  background: var(--unit);
+  color: var(--on-unit);
+}
+.layer.integration {
+  background: var(--integration);
+  color: var(--on-deep);
+}
+.layer.e2e {
+  background: var(--e2e);
+  color: var(--on-deep);
+}
+
+/* Layer-distribution chart (the signature graphic) */
+figure {
+  margin: 18px 0;
+}
+figcaption {
+  margin-bottom: 14px;
+  font: 11px/1.4 var(--mono);
+  letter-spacing: 0.04em;
+  color: var(--ink-faint);
+}
+.dist .legend { /* row of color keys for the chart; wraps when space runs out */
+  display: flex;
+  flex-wrap: wrap;
+  gap: 18px;
+  margin-bottom: 14px;
+  font: 11px/1 var(--mono);
+  color: var(--ink-soft);
+}
+.dist .legend .key {
+  display: inline-flex; /* lays the ::before swatch and the label out side by side */
+  align-items: center;
+  gap: 6px;
+  text-transform: uppercase;
+  letter-spacing: 0.06em;
+}
+.dist .legend .key::before { /* 10x10 color swatch drawn in front of each legend label */
+  content: "";
+  width: 10px;
+  height: 10px;
+  border-radius: 2px;
+  background: var(--rule); /* default swatch color; the per-layer rules below have equal specificity and win by source order */
+}
+.dist .legend .unit::before {
+  background: var(--unit);
+}
+.dist .legend .integration::before {
+  background: var(--integration);
+}
+.dist .legend .e2e::before {
+  background: var(--e2e);
+}
+.dist-row { /* one chart row: a label followed by a bar track */
+  display: flex;
+  align-items: center;
+  gap: 14px;
+  margin: 7px 0;
+}
+.dist-row .dist-label { /* platform label shown to the left of the bar track */
+  flex: 0 0 14ch; /* fixed 14ch column: neither grows nor shrinks */
+  text-align: right;
+  font: 11px/1.3 var(--mono);
+  color: var(--ink-soft);
+  overflow-wrap: anywhere; /* wrap long unbroken names; standard replacement for the deprecated `word-break: break-word` */
+}
+.dist-row .bar { /* the bar track holding one .seg per layer */
+  flex: 1; /* track takes the row width left over after the label */
+  display: flex;
+  height: 24px;
+  background: var(--panel);
+  border-radius: 3px;
+  overflow: hidden; /* clip the segments to the rounded track */
+}
+.bar .seg { /* one layer segment; its relative width comes from an inline `flex: <count>` on the element */
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  min-width: 18px; /* floor so a very small segment still has room for its count label */
+  font: 600 11px/1 var(--mono);
+  color: var(--on-deep); /* default label color; .seg.unit overrides it below */
+}
+.bar .seg.unit {
+  background: var(--unit);
+  color: var(--on-unit);
+}
+.bar .seg.integration {
+  background: var(--integration);
+}
+.bar .seg.e2e {
+  background: var(--e2e);
+}
+
+/* Per-platform recommended-shape list (replaces card blocks) */
+ul.shapes { /* unbulleted list with one item per platform */
+  margin: 6px 0 0;
+  padding: 0;
+  list-style: none;
+}
+ul.shapes li {
+  padding: 10px 0;
+  border-top: 1px solid var(--rule); /* hairline separator above each item */
+}
+ul.shapes li:first-child {
+  border-top: 0; /* no separator above the first item, so rules appear only between items */
+}
+ul.shapes .plat { /* the platform name inside an item */
+  font: 600 13px/1.5 var(--mono);
+}
+
+/* Badges */
+.badge { /* base badge; combine with .assumption / .warn / .ok for the fill */
+  display: inline-block;
+  font: 600 10px/1.5 var(--mono);
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+  padding: 1px 6px;
+  border-radius: 2px;
+  color: var(--on-state);
+  white-space: nowrap;
+}
+.badge.assumption {
+  background: var(--warn); /* note the offset naming: the assumption badge uses the --warn token */
+}
+.badge.warn {
+  background: var(--bad); /* ...and the warn badge uses the --bad token */
+}
+.badge.ok {
+  background: var(--ok);
+}
+
+/* Unlinkable evidence */
+.unlinkable { /* evidence cited by path only, for which no permalink could be produced */
+  font: italic 12px/1.4 var(--mono);
+  color: var(--ink-faint);
+}
+
+@media (max-width: 720px) { /* narrow viewports */
+  header,
+  main {
+    padding-left: 20px;
+    padding-right: 20px;
+  }
+  .dist-row {
+    flex-direction: column; /* stack the label above its bar instead of beside it */
+    align-items: stretch;
+    gap: 4px;
+  }
+  .dist-row .dist-label {
+    flex: none; /* drop the fixed 14ch basis used in the side-by-side layout */
+    text-align: left;
+  }
+}
+
+@media print {
+  body {
+    font-size: 11pt;
+  }
+  section {
+    break-inside: avoid; /* ask the browser not to split a section across pages */
+    border-top-color: #ccc; /* literal gray rather than a token — NOTE(review): confirm this is intentional */
+  }
+  tbody tr:hover {
+    background: none; /* no hover highlight on paper */
+  }
+  a {
+    color: var(--ink); /* links print in the body ink color */
+  }
+}
+```
+
+## What not to do
+
+- Do not reintroduce a brand skin — no saturated brand blue/yellow, no logo images, no
+  `<link>` to a design system. The report is intentionally off-brand and self-contained.
+- Do not swap the sequential layer ramp for unrelated categorical hues; the order is the
+  encoding.
+- Do not introduce web fonts, CDN links, or `<link rel="stylesheet">` — the single-file
+  constraint is binding.
+- Do not narrow the stylesheet down to "only the classes this report uses." The template
+  ships the full stylesheet so a reader inspecting any report sees the same system.
+- Do not hand-compute the distribution bar widths in pixels or percentages — set
+  `flex: <count>` per segment and let the browser normalize.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
index 5183d4f..923b5aa 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
@@ -1,46 +1,47 @@
 ---
 name: analyzing-test-stack
-description: Use when recommending what test automation a feature, bugfix, or change needs and at which layer — analyzing a change from a Jira ticket, a GitHub PR, an exported test-case CSV, a technical breakdown document (a Confluence tech breakdown), and/or a plain-language description, then mapping each behavior to the cheapest sufficient Testing Trophy layer (static, unit, integration, E2E) per platform and emitting a self-contained HTML report. Triggers on "what tests should this have", "which test layers", "test stack", "test strategy", "test trophy", "test plan for this PR/ticket", "what should we test for this tech breakdown", or "are these tests at the right level". This is the recommender; its adversarial counterpart is challenging-test-stack-recommendations, which red-teams the output.
-allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(gh pr checks:*), mcp__bitwarden-atlassian__get_issue, mcp__bitwarden-atlassian__search_issues, mcp__bitwarden-atlassian__get_issue_comments, mcp__bitwarden-atlassian__get_issue_remote_links, mcp__bitwarden-atlassian__get_confluence_page, mcp__bitwarden-atlassian__search_confluence, mcp__bitwarden-atlassian__search_confluence_cql"
+description: Use when recommending what test automation a feature, bugfix, or change needs and at which layer — analyzing a Jira ticket, GitHub PR, exported test-case CSV, technical breakdown, and/or plain-language description, then mapping each behavior to the cheapest sufficient Testing Trophy layer (unit, integration, E2E) per platform, risk-weighted by each behavior's defect severity (impact, not urgency), and emitting a self-contained HTML report. Triggers on "what tests should this have", "which test layers", "test stack", "test strategy", "test trophy", "test plan for this PR/ticket", "what should we test for this tech breakdown", "are these tests at the right level", "risk-based test coverage", "what tests does this Critical/High bug need", or "rank coverage gaps by severity".
+allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Skill, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(gh pr checks:*), mcp__bitwarden-atlassian__get_issue, mcp__bitwarden-atlassian__search_issues, mcp__bitwarden-atlassian__get_issue_comments, mcp__bitwarden-atlassian__get_issue_remote_links, mcp__bitwarden-atlassian__get_confluence_page, mcp__bitwarden-atlassian__search_confluence, mcp__bitwarden-atlassian__search_confluence_cql"
 ---
 
 # Analyzing the Test Stack
 
 Recommend the test automation layers a change should ship with, shaped as a **Testing Trophy**, and write the recommendation as a self-contained HTML report. You produce advice, not tests.
 
-The Testing Trophy (read `references/testing-trophy.md` for the full model): a thin **static** base, a focused **unit** layer for pure logic and edge cases, a **heavy integration** layer where most confidence is bought, and a **thin E2E** layer reserved for critical end-to-end journeys. The guiding rule is _write tests at the cheapest layer that still buys the confidence the behavior requires_ — push coverage down the trophy, not up.
+The Testing Trophy (read `references/testing-trophy.md` for the full model): a focused **unit** layer for pure logic and edge cases, a **heavy integration** layer where most confidence is bought, and a **thin E2E** layer reserved for critical end-to-end journeys. The guiding rule is _write tests at the cheapest layer that still buys the confidence the behavior requires_ — push coverage down the trophy, not up.
 
 ## Inputs
 
-You may receive any combination of: a Jira key, a GitHub PR, a CSV export of test cases, a technical breakdown document, and/or a plain-language description. Treat them as additive evidence. **Today's date is provided by the caller** — use it for the report filename; do not attempt to read the clock.
+You may receive any combination of: a Jira key, a GitHub PR, a CSV export of test cases, a technical breakdown document, and/or a plain-language description. Treat them as additive evidence. You also consume a **coverage inventory** — the existing-test records produced by the `assessing-test-coverage` skill (permalink records + `unverified` gaps). Under the `bitwarden-test-engineer` agent this is gathered for you before this skill runs; if it is absent (e.g. run standalone), invoke `Skill(assessing-test-coverage)` for the affected change surface, or proceed and record all coverage as `unverified`. **Today's date is provided by the caller** — use it for the report filename; do not attempt to read the clock. If no date is supplied, ask via `AskUserQuestion` rather than guessing.
 
-Read `references/input-sources.md` for how to ingest each source:
+`../../references/input-sources.md` (a plugin-level reference shared with `assessing-test-coverage`) is the canonical guide for how to ingest each source — Epic expansion, breakdown mining, CSV column mapping, and the rule that a missing source is recorded as a gap rather than blocking the analysis. At a glance:
 
-- **Jira** — via the `mcp__bitwarden-atlassian__*` tools (or the `bitwarden-atlassian-tools:researching-jira-issues` skill if available). Extract testable behaviors and acceptance criteria. If the MCP is unavailable, ask the user to paste requirements rather than failing.
-- **GitHub PR** — `gh pr view` / `gh pr diff` to read the change surface, public API touched, and any tests already present.
-- **CSV** — an exported set of test cases. The expected columns and how to bucket rows by layer are documented in `references/input-sources.md`.
-- **Technical breakdown** — a Bitwarden Tech Breakdown Confluence page (the artifact produced by the `bitwarden-delivery-tools:writing-tech-breakdowns` skill). Fetch via `mcp__bitwarden-atlassian__get_confluence_page`. This is often the richest single input: its scope checklist already enumerates the platforms and surfaces the change touches, and its specification child pages define the interfaces to test against. See `references/input-sources.md` for how to mine it.
+- **Jira** — extract testable behaviors and acceptance criteria; Epics/Features expand to their children before extraction.
+- **GitHub PR** — extract the change surface, API touched, and any tests already present.
+- **CSV** — bucket rows by apparent layer and automation status.
+- **Technical breakdown** — often the richest single input; its scope checklist already enumerates the platforms and surfaces.
 - **Description** — use directly when no artifact exists.
 
 If a source you'd expect is missing, proceed with what you have and **record the gap** in the report — never block on a missing input.
 
-## Workflow
+Alongside the behaviors, carry each behavior's **risk severity** — the impact a defect in it would have, per Bitwarden's severity guide. `references/severity-risk.md` is the canonical model: where severity comes from (the Jira severity field for bugs; an assessment against the guide's criteria for features/PRs/breakdowns) and how it calibrates the recommendation. Severity is the dial that turns "cheapest sufficient layer" into a risk-weighted call — it decides how completely a behavior must be covered and how hard a missing test counts as a gap. Weight by severity (impact), not priority (urgency). **Security-sensitive behaviors (crypto, auth, threat-model-relevant paths) are at least Critical regardless of the guide's table** — see the reference's source-of-truth note.
 
-1. **Resolve scope.** From the evidence, list the discrete testable behaviors and the platforms each touches. Map platforms to stacks and tooling using `references/monorepo-layout.md`. Note that **E2E tests live in a separate, private `test` repo** — never inside the platform repos — so E2E recommendations target that repo and existing E2E coverage may be unverifiable if it isn't checked out.
+## Workflow
 
-2. **Assess current coverage.** For each affected area, determine what is already tested and where. From a PR diff, note tests included in the change. From a CSV, bucket existing cases by apparent layer and automation status. From a repo checkout, grep the established test conventions. Distinguish _observed_ coverage from _assumed_ coverage.
+1. **Resolve scope.** From the evidence, list the discrete testable behaviors and the platforms each touches. Map platforms to stacks, tooling, and the layer→repo split (including the sibling `test` repo for E2E) using `references/monorepo-layout.md`. **When the input is an Epic**, the behaviors come from the children's acceptance criteria and the diffs of any PRs linked from those children — record which children/PRs you actually inspected vs. only enumerated.
 
-3. **Assign the cheapest sufficient layer.** For each behavior, pick the lowest trophy layer that genuinely buys the needed confidence, with a one-line rationale. Prefer integration over E2E and unit over integration unless the behavior truly requires the higher layer (real browser/device, cross-service contract, full user journey). Name concrete tooling per platform (see `references/monorepo-layout.md`).
+2. **Consume the coverage inventory.** What is already tested is established by the `assessing-test-coverage` skill, not here — take its inventory as input: the permalink records for observed tests (each `{ path, line range, owner_repo, sha, layer, permalink }`, or path-only with an `unlinkable` reason) and the `unverified` gaps. Treat _observed_ coverage as verified and everything else as a gap, never assumed covered. If no inventory was supplied, invoke `Skill(assessing-test-coverage)` for the affected change surface to produce one; do not re-derive coverage-finding or permalink rules here (they live in that skill's `references/finding-coverage.md`). These records feed both the report's Evidence column and the gap analysis below.
 
-4. **Find the gaps and the imbalance.** Call out behaviors with no recommended coverage, and any existing shape that is trophy-wrong (e.g. E2E doing work integration should do, or untested core logic). Be explicit about what evidence each gap rests on.
+3. **Assign the cheapest sufficient layer, weighted by severity.** For each behavior, pick the lowest trophy layer that genuinely buys the needed confidence, with a one-line rationale — then check the confidence bar against the behavior's risk severity per `references/severity-risk.md`. Severity sets _how much_ confidence is sufficient, not _which_ layer: a Critical behavior must cover its material failure modes (and, if it is a genuine end-to-end critical flow, claim the thin E2E layer the trophy reserves for exactly that), while a Low behavior earns minimal coverage and never an E2E test. Prefer integration over E2E and unit over integration unless the behavior truly requires the higher layer (real browser/device, cross-service contract, full user journey). Name concrete tooling per platform (see `references/monorepo-layout.md`).
 
-5. **Write the HTML report.** Build a single self-contained HTML file (inline CSS, no external/CDN dependencies, no JS required) following `references/html-report-template.md`. Write it to the **current working directory** as `test-stack-report-<slug>-<date>.html`, where `<slug>` is a short kebab-case identifier for the change (ticket key, PR number, or feature name) and `<date>` is the caller-provided date. Report sections, in order: Summary & recommended shape; Evidence & sources (with what was missing); Per-platform recommendations (behavior → layer → tooling → rationale); Coverage gaps; and a placeholder **Adversarial Review** section the counterpart skill fills in.
+4. **Find the gaps and the imbalance, ranked by severity.** Call out behaviors with no recommended coverage, and any existing shape that is trophy-wrong (e.g. E2E doing work integration should do, or untested core logic). **Order gaps by severity** — a Critical behavior with no observed coverage is a top-priority gap and leads the list; Informative behaviors are recorded as out-of-scope rather than gaps. Be explicit about what evidence each gap rests on.
 
-6. **Hand off for adversarial review.** Your recommendation is not final until `challenging-test-stack-recommendations` has red-teamed it. When invoked under the orchestrator this happens automatically; when invoked standalone, tell the user the adversarial pass is available and recommended.
+5. **Write the HTML report.** Build a single self-contained HTML file (inline CSS, no external/CDN dependencies, no JS required) following `references/html-report-template.md`. **Inline the canonical stylesheet from `../../references/report-style-tokens.md` verbatim** — it is the plugin-level styling source shared with the coverage report. Do not re-pick colors, fonts, or layer tokens: the off-brand data-report visual system and the layer/badge mappings in that file are binding. Use the normative section IDs (`#overview`, `#summary`, `#evidence`, `#recommendations`, `#gaps`). Write `#overview` yourself as a short top-of-report synthesis: a 2–4 sentence recap of the recommended shape per platform, the top 3 open risks the reader should resolve before acting (drawn from `#gaps`, **highest severity first**), and anchor links into `#recommendations` and `#gaps`. The per-platform recommendations table carries a **Severity** column with one value per behavior. Write the report to the **current working directory** as `test-stack-report-<slug>-<date>.html`, where `<slug>` is a short kebab-case identifier for the change (ticket key, PR number, or feature name) and `<date>` is the caller-provided date. The per-platform recommendations table's **Evidence (linked)** column must contain a GitHub permalink (or an explicit `unlinkable` note) for every cited existing test.
 
 ## Principles
 
 - **Ground every recommendation.** Each behavior→layer call ties to a specific requirement, diff hunk, CSV row, or observed test. Mark anything inferred without evidence as an assumption.
 - **Cheapest sufficient layer wins.** Confidence pushed down the trophy is cheaper to write, faster to run, and less flaky.
+- **Severity sets the bar, not the layer.** Weight each behavior's coverage by the impact a defect in it would have, per `references/severity-risk.md` — severity decides how completely a behavior is covered and how high its gap ranks, never which layer is "cheapest sufficient." It is impact, not priority (urgency).
 - **Per-platform, not one-size.** A feature spanning server, web, and mobile gets a distinct shape per platform — their stacks and risks differ.
 - **Honesty about coverage.** Never present assumed coverage as verified. "I could not inspect the `test` repo" is a finding, not a failure.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
index f2eb2ce..a18ccb7 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
@@ -8,32 +8,89 @@ Write it to the **current working directory** as
 `test-stack-report-<slug>-<date>.html` (slug = ticket key / PR number / feature name in
 kebab-case; date = the caller-provided date, `YYYY-MM-DD`).
 
+## Styling — binding
+
+Inline the paste-ready stylesheet from `../../../references/report-style-tokens.md` **verbatim**
+into the `<style>` block. The report uses a deliberately off-brand, low-key _data-report_
+visual system (flat white paper, monospace for data/labels/chrome, sans for prose, a
+sequential layer ramp). Do not re-pick colors, do not invent additional layer tokens, do
+not reintroduce a brand skin, do not add `<link>`/`@font-face`/CDN imports. The layer →
+token mapping (unit / integration / e2e) and the badge → token mapping
+(assumption / warn / ok) are normative wherever rendered — chips, distribution bars,
+table cells, and recommendation rows.
+
+Section headings are auto-numbered by CSS (`01 · …`) — write a plain `<h2>` per section
+and do not hand-number. Wrap each wide table in `<div class="scroll">…</div>` so it
+scrolls rather than overflows on narrow widths.
+
 ## Required sections, in order
 
-1. **Header** — report title, the change under analysis (ticket/PR/feature), and the date.
-2. **Summary & recommended shape** — 2–4 sentences plus a per-platform one-line shape
-   (e.g. "server: integration-heavy, thin unit; clients: integration + 1 E2E journey").
-   A simple text/CSS trophy bar per platform is welcome; no JS.
-3. **Evidence & sources** — a table of which inputs were used (Jira / PR / CSV /
-   description) and, explicitly, **what was missing or unverifiable** (e.g. "`test` repo
-   not checked out — existing E2E coverage unverified").
-4. **Per-platform recommendations** — for each affected platform, a table:
-   `Behavior | Recommended layer | Tooling | Rationale | Evidence`. One row per behavior.
-   Use the layer→repo map; E2E rows must name the dedicated `test` repo as target.
-5. **Coverage gaps & imbalances** — behaviors with no coverage, and any trophy-wrong
-   shape observed (ice-cream-cone, over-unit-tested, trivial tests). Each tied to evidence.
-6. **Adversarial Review** — a clearly marked placeholder section the
-   `challenging-test-stack-recommendations` skill fills in. Leave a labeled empty block,
-   e.g. `<section id="adversarial-review"> … to be completed by adversarial pass … </section>`.
-
-## Style guidance
-
-- Keep the palette calm and high-contrast; use color only to distinguish the four layers
-  (e.g. static / unit / integration / E2E) consistently wherever they appear.
-- Tables over prose for the recommendations and evidence — they're meant to be scanned and
+Each section uses the **normative `id` listed below**. Do not rename, omit, or add
+top-level sections — readers look these up by id.
+
+1. **Header** (no id; `<header>` element) — report title, the change under analysis
+   (ticket/PR/feature), and the date.
+2. **`#overview`** — A short top-of-report synthesis written by the analyst, so a reader
+   sees the bottom line without scrolling. It must contain: a 2–4 sentence recap of the
+   recommended shape per platform; the top 3 open risks the reader must resolve before
+   acting (drawn from `#gaps`, **ordered highest severity first**); and anchor links into
+   `#recommendations` and `#gaps` for the underlying detail. The overview is additive —
+   the per-behavior detail stays in `#recommendations`/`#gaps`.
+3. **`#summary`** — Summary & recommended shape — 2–4 sentences, then the
+   **layer-distribution chart** (the report's signature graphic) and a per-platform
+   one-line shape list. Render the chart as a captioned `<figure class="dist">` (`Fig 1`)
+   containing a `.legend` and one `.dist-row` per platform; each row has a `.dist-label`
+   (the platform) and a `.bar` track holding one `.seg` per layer present, sized by
+   `style="flex: <count>"` where `<count>` is the recommended test count at that layer
+   (the browser normalizes; never hand-compute widths). Each `.seg` shows its count; the
+   legend maps color → layer. Follow with `<ul class="shapes">`, one `<li>` per platform:
+   a `.plat` name plus that platform's one-line shape (e.g. server — "integration-heavy, thin
+   unit"; clients — "integration + 1 E2E journey"). No JS. See `../../../references/report-style-tokens.md`
+   → _Graphics_ for the chart contract. The chart encodes recommended **shape** (counts per
+   layer) only; risk severity is carried in the `#recommendations` table's Severity column,
+   not in this graphic — leave the chart severity-blind.
+4. **`#evidence`** — Evidence & sources — a table of which inputs were used (Jira / PR /
+   CSV / tech breakdown / description) and, explicitly, **what was missing or
+   unverifiable** (e.g. "`test` repo not checked out — existing E2E coverage
+   unverified"). For PR inputs include the captured **head SHA** and **`owner/repo`** so
+   per-test permalinks elsewhere in the report can be audited against the same commit.
+5. **`#recommendations`** — Per-platform recommendations — for each affected platform, a
+   table:
+   `Behavior | Severity | Recommended layer | Tooling | Rationale | Evidence (linked)`. One
+   row per behavior. The **Severity** cell carries the behavior's risk severity
+   (Critical / High / Medium / Low / Informative) per the `analyzing-test-stack` skill's
+   `references/severity-risk.md`. Render it with the stylesheet's existing inline-code
+   treatment — `<code>Critical</code>` — **not** a new color token: the layer ramp and the
+   assumption/warn/ok badges are the only colored chips the styling system defines, and
+   severity deliberately does not get its own hue. Mark a severity the analyst inferred
+   (rather than read from a bug's Jira field) with
+   `<span class="badge assumption">assumption</span>`. Use the layer → repo map; E2E rows
+   must name the dedicated `test` repo as target.
+
+   **The "Evidence (linked)" column is binding.** For every existing test cited as
+   current coverage, render a GitHub permalink anchored to the captured commit SHA and
+   line range — `<a href="https://github.com/<owner>/<repo>/blob/<SHA>/<path>#L<start>-L<end>">path/to/file.spec.ts</a>`.
+   If a test cannot be linked (no remote, detached HEAD, private fork the agent
+   couldn't reach), use `<span class="unlinkable">path/to/file.spec.ts — unlinkable: &lt;reason&gt;</span>`
+   instead of fabricating a URL. These records come from the coverage inventory; the
+   permalink production rules live in the `assessing-test-coverage` skill's
+   `references/finding-coverage.md` → _Citing tests as GitHub permalinks_.
+
+6. **`#gaps`** — Coverage gaps & imbalances — behaviors with no coverage, and any
+   trophy-wrong shape observed (ice-cream-cone, over-unit-tested, trivial tests). **Order
+   the list by severity**, highest first, so a Critical uncovered behavior leads and the
+   reader resolves the worst-impact gaps first; Informative behaviors are recorded as
+   out-of-scope rather than gaps. Tie each gap to evidence. Findings you could not ground
+   belong here, marked `unverified` with a one-line reason.
+
+## Content rules
+
+- Tables over prose for recommendations and evidence — they're meant to be scanned and
   acted on.
-- Mark every assumption inline (e.g. an "assumption" badge) so the adversary and the
-  reader can tell grounded calls from inferred ones.
+- Mark every assumption inline with `<span class="badge assumption">assumption</span>`
+  so the reader can tell grounded calls from inferred ones.
+- Flag unverifiable claims with `<span class="badge warn">unverified</span>` (e.g.
+  E2E coverage claimed without the `test` repo checked out).
 - No tracking, no remote resources, no secrets. The file is shareable as-is.
 
 ## Skeleton
@@ -46,18 +103,70 @@ kebab-case; date = the caller-provided date, `YYYY-MM-DD`).
     <meta name="viewport" content="width=device-width, initial-scale=1" />
     <title>Test Stack Report — {{change}}</title>
     <style>
-      /* inline, self-contained styles only */
+      /* Paste the full paste-ready stylesheet from
+         ../../../references/report-style-tokens.md here, verbatim. */
     </style>
   </head>
   <body>
-    <header>…title, change, date…</header>
-    <section id="summary">…recommended shape per platform…</section>
-    <section id="evidence">…sources used + what was missing…</section>
-    <section id="recommendations">…per-platform behavior→layer tables…</section>
-    <section id="gaps">…coverage gaps & imbalances…</section>
-    <section id="adversarial-review">
-      …filled in by the adversarial pass…
-    </section>
+    <header>
+      <p class="eyebrow">Test Stack Report</p>
+      <h1>…the change under analysis…</h1>
+      <p class="meta">…ticket/PR · status · team · date…</p>
+    </header>
+    <main>
+      <section id="overview">
+        <h2>Overview</h2>
+        …2–4 sentence recap of the recommended shape per platform; top 3 open
+        risks; anchor links into #recommendations and #gaps…
+      </section>
+      <section id="summary">
+        <h2>Summary &amp; recommended shape</h2>
+        …2–4 sentences…
+        <figure class="dist">
+          <figcaption>
+            Fig 1 · Recommended layer distribution by platform
+          </figcaption>
+          <div class="legend">
+            <span class="key unit">unit</span>
+            <span class="key integration">integration</span>
+            <span class="key e2e">e2e</span>
+          </div>
+          <div class="dist-row">
+            <span class="dist-label">bitwarden/server</span>
+            <div class="bar">
+              <span class="seg unit" style="flex:3">3</span>
+              <span class="seg integration" style="flex:11">11</span>
+              <span class="seg e2e" style="flex:1">1</span>
+            </div>
+          </div>
+          <!-- one .dist-row per platform -->
+        </figure>
+        <ul class="shapes">
+          <li>
+            <span class="plat">bitwarden/server</span> — integration-heavy, thin
+            unit, 1 E2E journey
+          </li>
+          <!-- one li per platform -->
+        </ul>
+      </section>
+      <section id="evidence">
+        <h2>Evidence &amp; sources</h2>
+        <div class="scroll">
+          …sources used + what was missing + commit SHA(s)…
+        </div>
+      </section>
+      <section id="recommendations">
+        <h2>Per-platform recommendations</h2>
+        <div class="scroll">
+          …per-platform tables: Behavior | Severity | Recommended layer |
+          Tooling | Rationale | Evidence (linked)…
+        </div>
+      </section>
+      <section id="gaps">
+        <h2>Coverage gaps &amp; imbalances</h2>
+        …gaps and trophy-wrong shapes; ungrounded findings marked unverified…
+      </section>
+    </main>
   </body>
 </html>
 ```
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md
index dae06b1..6188e09 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md
@@ -2,33 +2,38 @@
 
 Bitwarden's code spans several repositories. A single feature often touches more than
 one, and each gets its own Testing Trophy. Treat the table below as a **starting map**,
-not gospel — when a repo is checked out, read its `CLAUDE.md` and grep its existing tests
-to confirm the actual conventions before recommending tooling.
+not gospel — when a repo is checked out, confirm the actual conventions from its config
+first (the `assessing-test-coverage` skill's `references/finding-coverage.md` →
+_Discovering a repo's test conventions_), and read the table as the last-resort default.
+
+Establishing what a change is **already tested** by — finding existing coverage and citing
+it as permalinks — is a separate job owned by the `assessing-test-coverage` skill. This file
+covers only the repo/stack map and the rules for mapping a behavior to the layer it _should_
+live at.
 
 ## Platform repos and their stacks
 
-| Repo (typical)      | Platform                       | Language / framework                | Static                                  | Unit / Integration tooling                                                                 |
-| ------------------- | ------------------------------ | ----------------------------------- | --------------------------------------- | ------------------------------------------------------------------------------------------ |
-| `bitwarden/server`  | Backend / API                  | C# / .NET, ASP.NET Core, EF Core    | `dotnet build` analyzers, nullable refs | xUnit; integration via `WebApplicationFactory` + test DB / in-memory providers             |
-| `bitwarden/clients` | Web, Browser ext, Desktop, CLI | TypeScript, Angular, Electron, RxJS | `tsc`, ESLint                           | Jest + Angular TestBed / Testing Library (unit + integration); mocked HTTP at the boundary |
-| `bitwarden/ios`     | iOS                            | Swift / SwiftUI                     | SwiftLint, compiler                     | XCTest (unit + integration); XCUITest for on-device UI                                     |
-| `bitwarden/android` | Android                        | Kotlin                              | ktlint/detekt, compiler                 | JUnit + Robolectric / Espresso (instrumented)                                              |
+| Repo (typical)      | Platform                       | Language / framework                | Unit / Integration tooling                                                                 |
+| ------------------- | ------------------------------ | ----------------------------------- | ------------------------------------------------------------------------------------------ |
+| `bitwarden/server`  | Backend / API                  | C# / .NET, ASP.NET Core, EF Core    | xUnit; integration via `WebApplicationFactory` + test DB / in-memory providers             |
+| `bitwarden/clients` | Web, Browser ext, Desktop, CLI | TypeScript, Angular, Electron, RxJS | Jest + Angular TestBed / Testing Library (unit + integration); mocked HTTP at the boundary |
+| `bitwarden/ios`     | iOS                            | Swift / SwiftUI                     | XCTest (unit + integration); XCUITest for on-device UI                                     |
+| `bitwarden/android` | Android                        | Kotlin                              | JUnit + Robolectric / Espresso (instrumented)                                              |
 
 Exact repo names and tool versions drift — verify against the checkout. If a platform
 isn't in this table, infer its stack from the repo and state the assumption in the report.
 
 ## Where each layer lives — important
 
-- **Static, unit, integration** tests live **alongside the code, inside each platform
+- **Unit and integration** tests live **alongside the code, inside each platform
   repo** (e.g. `server`'s xUnit projects, `clients`' `*.spec.ts` files, the iOS test
   targets).
-- **End-to-end (E2E) tests live in a dedicated, private `test` repository** — _not_
-  inside the platform repos. Consequences for analysis:
-  - An E2E recommendation always targets that separate `test` repo.
-  - A coverage scout will **not** find existing E2E tests by searching `server`/`clients`/
-    `ios`. It must look in the `test` repo, which the user may not have checked out.
-  - If the `test` repo is unavailable, treat existing E2E coverage as **unverified** and
-    say so explicitly in the report — do not assume it is absent or present.
+- **End-to-end (E2E) tests live in a dedicated `test` repository** — _not_ inside the
+  platform repos. It sits as a sibling of `server` / `clients` / `ios` in the user's
+  Bitwarden checkout root, so look for it next to whichever platform repo you're in
+  (e.g. if `clients` is at `~/repos/Bitwarden/clients`, `test` is at
+  `~/repos/Bitwarden/test`). Source: `https://github.com/bitwarden/test` — cite this URL
+  in the report only if no local sibling is found.
 
 ## Mapping a behavior to a platform + layer
 
@@ -37,4 +42,10 @@ isn't in this table, infer its stack from the repo and state the assumption in t
 2. Within each repo, choose the layer per `testing-trophy.md` and name the concrete tool
    from the table above (confirmed against the checkout where possible).
 3. For any cross-system journey worth E2E coverage, target the dedicated `test` repo and
-   flag whether comparable E2E coverage already exists there.
+   flag whether comparable E2E coverage already exists there (per the coverage inventory
+   from `assessing-test-coverage`).
+
+Existing coverage to compare these recommendations against — including the GitHub permalinks
+the report's Evidence column requires — comes from the `assessing-test-coverage` skill's
+coverage inventory (`references/finding-coverage.md` → _Citing tests as GitHub permalinks_
+and _Output contract_), not from this file.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
new file mode 100644
index 0000000..d3f292b
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
@@ -0,0 +1,81 @@
+# Severity as a risk weight
+
+The Testing Trophy tells you the _cheapest layer that buys the confidence a behavior
+requires_. **Severity tells you how much confidence is required.** A defect in vault
+unlock and a typo on a settings label are not owed the same rigor — severity is the dial
+that turns "cheapest sufficient" from a flat rule into a risk-weighted one.
+
+Severity is the **impact of a defect on the system or user**, independent of how urgently
+it gets fixed (that is _priority_). This skill weights coverage by severity, not priority.
+
+## Source of truth
+
+The canonical classification is Bitwarden's **Defect Severity Classification Guide**,
+Confluence page `2759229512`
+(`https://bitwarden.atlassian.net/wiki/spaces/EN/pages/2759229512/Severity`). The levels
+and criteria below mirror that page so the analysis degrades gracefully when the Atlassian
+MCP is unavailable — but the page is authoritative. When the `bitwarden-atlassian-tools`
+MCP is available, fetch it with `mcp__bitwarden-atlassian__get_confluence_page` (pageId
+`2759229512`) to pick up revisions before relying on the cached copy here. If the fetch
+fails or the MCP is unavailable, use the mirrored table below and note in the report that
+the severity definitions are from the cached copy (version not re-verified) — degrade
+gracefully; never block on it.
+
+**Security-vulnerability defects are the exception:** their severity follows the
+_Vulnerability Tracking and Management_ guide, not this one. If a behavior is
+security-sensitive (crypto, auth, a threat-model-relevant path), treat its risk as at
+least Critical regardless of the table below.
+
+## Where each behavior's severity comes from
+
+- **Bug / defect ticket** — read the severity already assigned on the Jira issue (the
+  severity field, or the reporter/QA's stated severity in the description/comments). Use it
+  directly; if it is absent, classify against the criteria below and mark it an assumption.
+- **Feature, PR, tech breakdown** — there is no defect yet, so assess each behavior's
+  **risk severity**: _if this behavior broke in production, what severity would the
+  resulting defect carry?_ Classify it against the same criteria. This is what makes the
+  recommendation risk-aware rather than uniform.
+
+## Levels and criteria (mirrored from the guide)
+
+| Severity        | A defect here would…                                                                                                                     | Signals (from the guide)                                                                                                                                               |
+| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **Critical**    | Severely harm core functionality, data integrity, or security with no viable workaround                                                  | Blocks a critical flow (login, vault access, billing, account creation); data loss/corruption/exposure; crash/unrecoverable state; affects all or a broad user segment |
+| **High**        | Significantly degrade a core feature/flow, but a workaround exists (difficult or non-obvious), or impact is limited to a subset of users | Core feature impaired but not blocked; specific client/OS/auth method; burdensome/undiscoverable workaround; compounding friction in a core workflow                   |
+| **Medium**      | Degrade functionality or UX meaningfully, but a workaround exists or scope is limited                                                    | Non-critical / secondary flow broken; misleading-but-not-destructive output; degraded experience for a subset; extra steps to work around                              |
+| **Low**         | Have minimal functional impact; does not meaningfully hinder the user                                                                    | Cosmetic / typo / visual only; negligible edge case; minor UX inconsistency; trivial workaround                                                                        |
+| **Informative** | Be a known limitation, third-party compatibility issue, or environmental quirk — not a defect in Bitwarden's core behavior               | Autofill on a non-standard third-party site/app; no clear owner or fix path; unlikely to be actioned                                                                   |
+
+## How severity calibrates the recommendation
+
+Severity does **not** mean "push everything Critical to E2E." The cheapest-sufficient rule
+still governs _which_ layer; severity governs _how completely_ the behavior must be covered
+and _how heavily a missing test counts as a gap_. Concretely:
+
+- **Critical** — the confidence bar is highest: cover the behavior's material failure modes,
+  not just the happy path, at whatever layer each mode is cheapest to pin down. Critical
+  behaviors that are genuine end-to-end journeys (login, vault unlock, checkout) are exactly
+  what the trophy reserves the **thin E2E layer** for — the guide's "critical user flows"
+  map 1:1 onto that reservation. A Critical behavior with no observed coverage is a
+  **top-priority gap** and belongs at the head of `#overview`'s open risks.
+- **High** — strong integration coverage of the primary path _and_ the documented
+  workaround / affected configuration (the specific client, OS, or auth method that scopes
+  the impact). Reach for E2E only when the path is itself a critical journey. An uncovered
+  High behavior is a gap that should be scheduled, not silently accepted.
+- **Medium** — the plain cheapest-sufficient layer with no escalation. A gap here is worth
+  recording and ranking below Critical/High; it is reasonable to defer.
+- **Low** — minimal coverage; often a single unit or integration assertion, or an explicit
+  "not worth automating" call. Do **not** spend an E2E test on a Low behavior — that is the
+  ice-cream-cone anti-pattern wearing a risk costume.
+- **Informative** — generally not automatable as a Bitwarden behavior; record as
+  out-of-scope rather than as a coverage gap, with a one-line reason.
+
+Two corollaries:
+
+1. **Severity ranks the gaps.** When `#gaps` and `#overview` list open risks, order them by
+   severity — the reader should resolve the Critical-uncovered behaviors first. Gap
+   prioritization is severity-driven, not list-order-driven.
+2. **Severity ≠ priority.** A Low-severity defect can be High-priority before a launch, and
+   a High-severity bug in a rarely used admin panel can be Low-priority. This skill weights
+   coverage by **severity** (impact). Note priority only if the caller supplied it and it
+   changes what to test first.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md
index 13da8bd..ff5cd41 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md
@@ -4,27 +4,21 @@ A model for shaping automated test coverage, contrasted with the older Testing P
 because they buy the most confidence per unit of cost and maintenance for typical
 application code.
 
-## The four layers (base → top)
+## The three layers (base → top)
 
-1. **Static** — the base. Type checking, linters, formatters, compiler errors, static
-   analysis. Catches whole classes of bugs (typos, null misuse, unused code, unsafe
-   patterns) before a single test runs. Nearly free; always on.
-   - _Examples:_ TypeScript/`tsc`, ESLint, Roslyn analyzers / `dotnet build` warnings as
-     errors, SwiftLint, nullable reference types.
-
-2. **Unit** — focused. Tests a single function/class/module in isolation. Best for pure
+1. **Unit** — focused. Tests a single function/class/module in isolation. Best for pure
    logic, algorithms, edge cases, and error handling where setup is cheap and the unit
    has real branching complexity. Fast and stable, but isolation can let integration
    bugs slip through.
 
-3. **Integration** — **the heaviest layer; the trophy's bulge.** Tests several units
+2. **Integration** — **the heaviest layer; the trophy's bulge.** Tests several units
    working together through real (or realistic) collaborators: a controller + service +
    in-memory or test database, a component rendered with its real child components and a
    mocked network boundary, a view model against a real repository. This is where most
    confidence is bought because it exercises the wiring users actually depend on, without
    the cost and flakiness of full E2E.
 
-4. **E2E (end-to-end)** — thin top. Drives the real, fully assembled system the way a
+3. **E2E (end-to-end)** — thin top. Drives the real, fully assembled system the way a
    user would: real browser, real device, real backend. Highest confidence per test, but
    slowest, most expensive, and most flaky. Reserve for a small number of **critical user
    journeys** (e.g. login, vault unlock, checkout) — not for branch coverage.
@@ -38,11 +32,11 @@ application code.
      │   Integration   │   HEAVY — most confidence bought here
      └──┐           ┌──┘
         │   Unit    │      focused — pure logic & edge cases
-     ┌──┴───────────┴──┐
-     │      Static     │   broad, ~free base — always on
-     └─────────────────┘
+        └───────────┘
 ```
 
+Static analysis (type checking, linters, formatters) sits below the trophy and is handled by per-repo tooling — it is out of scope for this skill, which makes no static-analysis recommendations.
+
 ## How to assign a layer
 
 Pick the **cheapest layer that still buys the confidence the behavior requires**:
@@ -52,10 +46,10 @@ Pick the **cheapest layer that still buys the confidence the behavior requires**
   persistence; component + store + API boundary; view model + repository) → **integration**.
 - A behavior only meaningful as a full user journey across the real system → **E2E**, and
   only if it is genuinely critical.
-- Anything a type system, analyzer, or linter can guarantee → **static**; don't write a
-  test for it.
+- Anything a type system, analyzer, or linter already guarantees → don't write a test
+  for it.
 
-## Anti-patterns to avoid (the adversary checks for these)
+## Anti-patterns to avoid
 
 - **Ice-cream cone** — the trophy inverted: many E2E tests, few integration/unit. Slow,
   flaky, expensive to maintain.
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md
new file mode 100644
index 0000000..32905a3
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md
@@ -0,0 +1,51 @@
+---
+name: assessing-test-coverage
+description: Use when determining what test coverage ALREADY exists for a change — inventorying the tests that currently cover a feature, PR, component, or set of changed paths across Bitwarden's repos, citing each as a stable GitHub permalink, bucketing it by test layer, and flagging behaviors with no observed test as gaps. Distinguishes observed coverage from assumed. Triggers on "what's already tested", "does this PR have tests", "what coverage exists for", "find the existing tests for", "is this component covered", "audit current test coverage". This is the backward-looking inventory that feeds test-stack analysis — it does NOT recommend new tests or assign cheapest-sufficient trophy layers; for that, use analyzing-test-stack.
+allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(git rev-parse:*), Bash(git remote get-url:*), Bash(git -C * rev-parse:*), Bash(git -C * remote get-url:*)"
+---
+
+# Assessing Test Coverage
+
+Produce an evidence-grounded inventory of what is **already tested** for a change, scoped to the change surface, with every cited test rendered as a stable GitHub permalink and bucketed by test layer. This is a backward-looking, descriptive job: you report what exists, you do **not** recommend what to add or judge whether the shape is right — that is `analyzing-test-stack`'s job, which consumes this inventory.
+
+The output is a **coverage inventory**: a set of permalink records for observed tests plus a list of behaviors/surfaces recorded as gaps (`unverified`). Honesty is the whole point — a behavior with no observed test is a gap, never assumed covered.
+
+## Inputs
+
+You work from a **change surface** and the repos it touches:
+
+- **Change surface** — the changed paths/symbols and the named component(s). Usually supplied by the caller (the agent's evidence fan-out, or an `analyzing-test-stack` run). If you're handed only a Jira key or a PR with no resolved surface, derive a minimal surface from the PR diff (`gh pr diff`) before looking for coverage; the shared `../../references/input-sources.md` (the same intake guide `analyzing-test-stack` uses) covers how to resolve a PR or Epic into its diff paths and linked PRs.
+- **Affected repos** — which platform checkouts to inspect, and whether the sibling `test` repo (E2E) is available.
+- **Linked/merged PRs** — the PRs that shipped this work; their diffs are the primary, permalink-ready coverage evidence.
+
+A missing input narrows the inventory; it never blocks it. Record what you could not inspect as part of the result.
+
+**Today's date is provided by the caller** — use it for the report filename; do not attempt to read the clock. If no date is supplied, ask via `AskUserQuestion` rather than guessing.
+
+## Workflow
+
+1. **Learn each repo's conventions, config-first.** Before opening any test files, read the repo's Claude config to learn its test tooling and where tests live. Stop as soon as it answers the question. See `references/finding-coverage.md` → _Discovering a repo's test conventions_.
+
+2. **Find existing coverage — PRs first, then a targeted lookup.** Take the tests in the linked/merged PR diffs as primary evidence, then do a lookup **scoped to the change surface** for pre-existing tests. Never a repo-wide grep sweep. For E2E, inspect the sibling `test` repo if available. See `references/finding-coverage.md` → _Finding existing coverage_.
+
+3. **Cite and bucket each observed test.** Render every cited test as a GitHub permalink (commit SHA, not branch), following `references/finding-coverage.md` → _Citing tests as GitHub permalinks_. A test that genuinely cannot be linked is recorded path-only with an explicit reason — never fabricate a URL. Bucket each by apparent layer (unit / integration / E2E); for the layer definitions see the `analyzing-test-stack` skill's `references/testing-trophy.md`. For the per-repo stack/tooling reference, see that skill's `references/monorepo-layout.md`.
+
+4. **Record gaps.** Any behavior or surface in the change with no PR-observed test and no targeted hit is recorded as a coverage gap / `unverified`. Distinguish _observed_ coverage from _assumed_.
+
+5. **Write the coverage report.** Build a single self-contained HTML file (inline CSS, no external/CDN dependencies, no JS required) following `references/coverage-report-template.md`. **Inline the canonical stylesheet from `../../references/report-style-tokens.md` verbatim** — the same plugin-level styling source the test-stack report uses, so the two reports read as one instrument; do not re-pick colors or reintroduce a brand skin. Use the normative section IDs (`#overview`, `#summary`, `#evidence`, `#coverage`, `#gaps`) and write `#overview` yourself as a short synthesis. Write the report to the **current working directory** as `test-coverage-report-<slug>-<date>.html`, where `<slug>` is a short kebab-case identifier for the change and `<date>` is the caller-provided date.
+
+## Output
+
+Two artifacts:
+
+- The **coverage inventory** as structured data — the record shape defined in `references/finding-coverage.md` → _Output contract_: one permalink record per observed test, plus the list of `unverified` gaps. When run under the `bitwarden-test-engineer` agent, return these records for `analyzing-test-stack` to consume as-is.
+- The **self-contained HTML coverage report** (step 5), written to the current working directory.
+
+Mirror the report's `#overview` in chat — the observed shape per platform and the top gaps — and point the reader at the report file for the per-test detail.
+
+## Principles
+
+- **Observed vs. assumed.** Never present assumed coverage as verified. "I could not inspect the `test` repo" is a finding, not a failure.
+- **Scoped, not swept.** Coverage is established PR-first then scoped to the change surface — never a repo-wide grep.
+- **Stable links only.** Permalinks use the commit SHA, not a branch. Unlinkable tests are recorded with a reason; URLs are never fabricated.
+- **Backward-looking only.** You inventory what exists. Recommending new tests, assigning cheapest-sufficient layers, and judging trophy shape belong to `analyzing-test-stack` — hand off, don't cross over.
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
new file mode 100644
index 0000000..0a08050
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
@@ -0,0 +1,155 @@
+# Coverage report template
+
+Produce a **single self-contained HTML file** inventorying the existing test coverage for a
+change: all CSS inline in a `<style>` block, no external/CDN links, no required JavaScript, no
+web fonts. It must render correctly opened directly from disk and survive being attached to a
+ticket or PR. This is the coverage counterpart to the `analyzing-test-stack` test-stack report;
+the two share one visual system so they read as the same instrument.
+
+Write it to the **current working directory** as
+`test-coverage-report-<slug>-<date>.html` (slug = ticket key / PR number / feature name in
+kebab-case; date = the caller-provided date, `YYYY-MM-DD`).
+
+## Styling — binding
+
+Inline the paste-ready stylesheet from `../../../references/report-style-tokens.md` (the
+plugin-level `references/` directory) **verbatim** into the `<style>` block — the same styling
+source the test-stack report uses, pasted identically so the two reports do not drift. Do
+not re-pick colors, fonts, or layer tokens, and do not reintroduce a brand skin or any
+`<link>`/`@font-face`/CDN import; the off-brand data-report system and the layer/badge token
+mappings in that file are binding. The layer chips (`unit` / `integration` / `e2e`), the
+badges (`assumption` / `warn` / `ok`), the distribution chart, and the `.unlinkable` span are
+all defined there.
+
+Section headings are auto-numbered by CSS (`01 · …`) — write a plain `<h2>` per section and do
+not hand-number. Wrap each wide table in `<div class="scroll">…</div>`.
+
+## Required sections, in order
+
+Each section uses the **normative `id` listed below**. Do not rename, omit, or add top-level
+sections — readers look these up by id.
+
+1. **Header** (no id; `<header>` element) — report title ("Test Coverage Report"), the change
+   under analysis (ticket/PR/feature), and the date.
+2. **`#overview`** — A short top-of-report synthesis written so a reader sees the bottom line
+   without scrolling. It must contain: a 2–4 sentence recap of how well covered the change is
+   per platform (where observed tests concentrate, which layers are bare); the top 3 coverage
+   gaps the reader should know about (drawn from `#gaps`); and anchor links into `#coverage`
+   and `#gaps`. This report **describes** coverage — it does not recommend new tests or assign
+   cheapest-sufficient layers (that is the test-stack report's job); say so in one line and, if
+   a test-stack report was also produced, link to it.
+3. **`#summary`** — Observed coverage shape — 2–4 sentences, then the **layer-distribution
+   chart** rendered exactly per `../../../references/report-style-tokens.md` → _Graphics_, but
+   with each `.seg`'s `flex:<count>` set to the **count of observed tests** at that layer for
+   the platform (not recommended counts). Caption it `Fig 1 · Observed test coverage by platform`.
+   Follow with `<ul class="shapes">`, one `<li>` per platform giving the one-line
+   observed shape (e.g. "server: 14 integration, 3 unit, 0 E2E observed"). A platform with no
+   observed coverage still gets a row, shown empty.
+4. **`#evidence`** — Evidence & sources — a table of what was inspected (which repos/checkouts,
+   which PRs read, whether the sibling `test` repo was available) and, explicitly, **what was
+   missing or unverifiable** (e.g. "`test` repo not checked out — existing E2E coverage
+   unverified"). For PR-sourced records include the captured **head SHA** and **`owner/repo`**
+   so the per-test permalinks can be audited against the same commit.
+5. **`#coverage`** — Observed coverage — for each affected platform, a table:
+   `Behavior / surface | Layer | Test (linked) | Source | Notes`. One row per observed test.
+   The **Test (linked)** column is binding: render a GitHub permalink anchored to the captured
+   commit SHA and line range —
+   `<a href="https://github.com/<owner>/<repo>/blob/<SHA>/<path>#L<start>-L<end>">path/to/file.spec.ts</a>`.
+   If a test cannot be linked, use
+   `<span class="unlinkable">path/to/file.spec.ts — unlinkable: &lt;reason&gt;</span>` instead
+   of fabricating a URL. The **Layer** cell uses the matching layer chip. **Source** is `PR`
+   (tests shipped in a linked/merged PR) or `pre-existing` (found by the targeted lookup) —
+   keep the observed-vs-assumed distinction visible. Permalink production rules live in
+   `finding-coverage.md` → _Citing tests as GitHub permalinks_.
+6. **`#gaps`** — Coverage gaps — behaviors/surfaces in the change with **no observed test**,
+   each marked `<span class="badge warn">unverified</span>` with a one-line reason (no
+   PR-observed test and no targeted hit; or `test` repo unavailable). This is the honest
+   record of what is _not_ known to be covered — it is not a recommendation to add tests.
+
+## Content rules
+
+- Tables over prose for the coverage inventory and evidence — they're meant to be scanned.
+- Mark anything inferred without direct evidence with
+  `<span class="badge assumption">assumption</span>`; confirmed observed coverage may carry
+  `<span class="badge ok">ok</span>`.
+- Flag unverifiable claims with `<span class="badge warn">unverified</span>` (e.g. E2E
+  coverage claimed without the `test` repo checked out).
+- Never present assumed coverage as observed, and never fabricate a permalink.
+- No tracking, no remote resources, no secrets. The file is shareable as-is.
+
+## Skeleton
+
+```html
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>Test Coverage Report — {{change}}</title>
+    <style>
+      /* Paste the full paste-ready stylesheet from
+         ../../../references/report-style-tokens.md here, verbatim. */
+    </style>
+  </head>
+  <body>
+    <header>
+      <p class="eyebrow">Test Coverage Report</p>
+      <h1>…the change under analysis…</h1>
+      <p class="meta">…ticket/PR · status · team · date…</p>
+    </header>
+    <main>
+      <section id="overview">
+        <h2>Overview</h2>
+        …2–4 sentence recap of observed coverage per platform; top 3 gaps;
+        anchor links into #coverage and #gaps; one line noting this is a
+        coverage inventory, not a recommendation…
+      </section>
+      <section id="summary">
+        <h2>Observed coverage shape</h2>
+        …2–4 sentences…
+        <figure class="dist">
+          <figcaption>Fig 1 · Observed test coverage by platform</figcaption>
+          <div class="legend">
+            <span class="key unit">unit</span>
+            <span class="key integration">integration</span>
+            <span class="key e2e">e2e</span>
+          </div>
+          <div class="dist-row">
+            <span class="dist-label">bitwarden/server</span>
+            <div class="bar">
+              <span class="seg unit" style="flex:3">3</span>
+              <span class="seg integration" style="flex:14">14</span>
+            </div>
+          </div>
+          <!-- one .dist-row per platform; empty bar if none observed -->
+        </figure>
+        <ul class="shapes">
+          <li>
+            <span class="plat">bitwarden/server</span> — 14 integration, 3 unit,
+            0 E2E observed
+          </li>
+          <!-- one li per platform -->
+        </ul>
+      </section>
+      <section id="evidence">
+        <h2>Evidence &amp; sources</h2>
+        <div class="scroll">
+          …repos inspected + PRs read + test-repo availability + what was
+          missing + commit SHA(s)…
+        </div>
+      </section>
+      <section id="coverage">
+        <h2>Observed coverage</h2>
+        <div class="scroll">
+          …per-platform behavior→test tables with linked evidence…
+        </div>
+      </section>
+      <section id="gaps">
+        <h2>Coverage gaps</h2>
+        …behaviors with no observed test, each marked unverified with a one-line
+        reason…
+      </section>
+    </main>
+  </body>
+</html>
+```
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
new file mode 100644
index 0000000..af2f27e
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
@@ -0,0 +1,119 @@
+# Finding and citing existing test coverage
+
+How to determine which tests **already** cover a change, scoped to the change surface, and how to cite each observed test as a stable link. This is the repo-reading half of test engineering; the trophy-mapping half (which layer a behavior _should_ live at) is in the `analyzing-test-stack` skill.
+
+## Discovering a repo's test conventions (config-first)
+
+Test conventions, tooling, and where tests live are usually documented in a repo's Claude
+config — read it **before** opening any test files, and stop as soon as it answers the
+question. This keeps token spend low on large repos. Work the tiers in order:
+
+1. **Config first.** Read the repo's root `CLAUDE.md`, its `.claude/` directory (rules and
+   settings), and any **nested `CLAUDE.md`** in the subdirectories the change touches (e.g.
+   `clients/apps/<app>/CLAUDE.md`). Extract the test tooling, the test-file layout/naming, and
+   any stated layer conventions.
+2. **Test files as fallback — only for gaps config leaves.** If config is silent on a
+   convention you need, read a _few representative_ test files near the change surface to
+   confirm it. Do **not** sweep the repo.
+3. **Generic stack table as last resort.** When neither config nor local tests answer, fall
+   back to the per-repo stack/tooling table in the `analyzing-test-stack` skill's
+   `references/monorepo-layout.md` and **state the assumption** in the result.
+
+These tiers govern _conventions_ — what the tooling is and where tests live. Finding which
+behaviors are _already covered_ is the next job, below.
+
+## Finding existing coverage (PRs first, then a targeted lookup)
+
+Reliably establishing what is **already tested** does not require grepping a whole repo. Work
+two ordered moves, and record anything still unfound as a gap rather than dropping it:
+
+1. **Merged/linked PRs are the backbone.** The PRs hanging off the Jira issue and its epic
+   children (`get_issue_remote_links` → `gh pr view`/`gh pr diff`) are the reliable record of
+   the tests that shipped with this work, and are already permalink-ready via the PR head SHA.
+   Take the tests observed in those PR diffs as primary coverage evidence.
+2. **Targeted repo lookup for pre-existing tests.** Tests written _before_ this ticket won't
+   appear in those PRs. Find them with a lookup **scoped to the change surface** — the files
+   and symbols the PRs/diff touch, and the component named in the ticket — not a repo-wide
+   sweep. Confirm conventions from config (above) so the lookup targets the right paths.
+
+For end-to-end coverage, inspect the dedicated sibling `test` repo if it is checked out (see
+the `analyzing-test-stack` skill's `references/monorepo-layout.md` → _Where each layer lives_)
+and cite specific files; if it is not available, record E2E coverage as `unverified`.
+
+A behavior with no PR-observed test and no targeted hit is recorded as a coverage gap /
+`unverified` — never silently assumed covered.
+
+## Citing tests as GitHub permalinks
+
+Every test cited as **current coverage** must be rendered as a clickable
+GitHub permalink so a reader can jump to the actual test. The link form is:
+
+```
+https://github.com/<owner>/<repo>/blob/<SHA>/<path>#L<start>-L<end>
+```
+
+Use the **commit SHA**, not a branch name. Branch links rot under rebase and
+force-push; SHA links are stable.
+
+### Acquiring the four ingredients
+
+1. **`owner/repo`** — from the remote URL.
+   - PR-sourced: parse from the PR URL (e.g. `gh pr view <pr> --json url`).
+   - Local checkout: `git -C <repo> remote get-url origin` and parse the
+     `github.com[:/]<owner>/<repo>(\.git)?` segment.
+2. **Commit SHA**.
+   - PR-sourced: `gh pr view <pr> --json headRefOid` returns the PR head SHA. This is
+     the SHA the diff was computed against and is the right anchor for any
+     tests-in-PR or tests-on-the-PR-branch references.
+   - Local checkout: `git -C <repo> rev-parse HEAD` for the checked-out commit's SHA. If the
+     working tree is dirty (uncommitted changes), still use HEAD and note in the
+     evidence that links point to HEAD, not the working tree.
+3. **Path** — repo-relative path of the test file (no leading slash). The same path
+   you'd pass to `Read`, minus the repo root.
+4. **Line range** — start line through end line of the test declaration. Acceptable
+   resolutions, in descending preference:
+   - Full block: from the `it(`/`test(`/`Test(`/`func Test…(` declaration line through
+     the matching closing brace.
+   - Declaration only: the single line where the test name is declared (`#L42`).
+   - File only (`#L1`) — accept reluctantly, and only when grep cannot localize the
+     test. Avoid for newly authored tests.
+
+### When a test cannot be linked
+
+If any of the four ingredients is missing — no remote (`git remote get-url origin`
+returns empty), detached HEAD with no remote, private fork the session cannot reach,
+or the file exists only in a local working tree never pushed — record the test as
+**unlinkable** with the reason. Never fabricate a URL. Both this skill's coverage report
+(`coverage-report-template.md`) and the downstream `analyzing-test-stack` test-stack report
+render these as `<span class="unlinkable">path — unlinkable: &lt;reason&gt;</span>`.
+
+### Output contract
+
+For every cited test, return a record of the shape:
+
+```
+{
+  "path": "src/services/Foo/FooService.spec.ts",
+  "start_line": 42,
+  "end_line": 89,
+  "owner_repo": "bitwarden/clients",
+  "sha": "a1b2c3d4e5f6…",
+  "layer": "integration",
+  "permalink": "https://github.com/bitwarden/clients/blob/a1b2c3d4e5f6…/src/services/Foo/FooService.spec.ts#L42-L89"
+}
+```
+
+…or, when unlinkable:
+
+```
+{ "path": "src/services/Foo/FooService.spec.ts", "layer": "integration", "unlinkable_reason": "no remote for local checkout" }
+```
+
+Behaviors/surfaces with no observed test are returned as gaps:
+
+```
+{ "behavior": "tier downgrade preserves seat count", "platform": "server", "status": "unverified" }
+```
+
+The `analyzing-test-stack` recommender consumes these records as-is to populate the
+report's Evidence (linked) column and to seed its gap analysis.
diff --git a/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md b/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md
deleted file mode 100644
index e86e11d..0000000
--- a/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md
+++ /dev/null
@@ -1,70 +0,0 @@
----
-name: challenging-test-stack-recommendations
-description: Use to red-team a test automation recommendation produced by analyzing-test-stack — adversarially reviewing a Testing Trophy recommendation or HTML test-stack report for anti-patterns and ungrounded claims before the team acts on it. Triggers on "challenge this test plan", "red-team the test recommendation", "poke holes in this test strategy", "is this proposed test plan over/under-testing", "review the test stack report", or runs automatically after analyzing-test-stack under the test-engineer orchestrator. Checks for ice-cream-cone (too E2E-heavy), unit-tests-masquerading-as-integration, over-testing trivial code, untestable requirements, missing platform layers, flaky-E2E candidates, and coverage claimed without evidence; returns a verdict of endorse, revise, or reject-with-reasons.
-allowed-tools: "Read, Grep, Glob, Bash(gh pr view:*), Bash(gh pr diff:*), mcp__bitwarden-atlassian__get_issue, mcp__bitwarden-atlassian__get_issue_comments, mcp__bitwarden-atlassian__get_confluence_page"
----
-
-# Challenging Test Stack Recommendations
-
-You are the adversary to `analyzing-test-stack`. Your job is to **try to break its
-recommendation** before the team builds on it. A recommendation that survives a genuine
-red-team is trustworthy; one that was never challenged tends to drift toward whatever
-tests are easiest to write rather than what actually buys confidence.
-
-Default to skepticism. Your value is in the specific, evidence-backed objection — not in
-rubber-stamping. But do not invent problems: an objection you cannot tie to evidence is
-itself a rejected finding (you hold yourself to the same evidence bar you demand).
-
-## Inputs
-
-- The **HTML report** (or the recommendation text) from `analyzing-test-stack`.
-- The **underlying evidence** — the same Jira ticket, PR diff, CSV, and/or repo checkout.
-  Re-derive independently where you can; re-read the PR diff or ticket rather than trusting
-  the report's summary of it.
-
-## Workflow
-
-1. **Re-read the evidence independently.** Don't take the report's characterization of the
-   change at face value — pull the diff / ticket / CSV yourself and form your own view of
-   the testable behaviors and where they live. Ingest each source the same way the analyst
-   does (see `analyzing-test-stack/references/input-sources.md` for the CSV column mapping
-   and Atlassian MCP tools). In particular, **E2E tests live in a separate, private `test`
-   repo** — not inside the platform repos — so treat any existing-E2E-coverage claim as
-   unverified unless that repo was actually inspected.
-
-2. **Run the rejection criteria.** Apply every check in `references/adversarial-checklist.md`
-   to each per-platform recommendation and to the overall shape. For each, decide: does the
-   recommendation pass, or is there a concrete, evidence-backed objection?
-
-3. **Test the grounding.** For every behavior→layer call, confirm it ties to real evidence.
-   Flag any layer assignment, coverage claim, or "already tested" assertion that the
-   evidence does not support — especially **E2E coverage claimed without inspecting the
-   dedicated `test` repo**.
-
-4. **Pressure the shape.** Step back from individual rows: is the overall trophy right? Too
-   E2E-heavy (ice-cream cone)? Core logic pushed to slow layers? A whole platform's layer
-   missing? Trivial code over-tested?
-
-5. **Issue findings and a verdict.** Each finding: the specific claim challenged, why it's
-   wrong or unsupported (with evidence), and the corrective recommendation. Then a single
-   verdict:
-   - **Endorse** — sound and well-grounded; minor or no notes.
-   - **Revise** — directionally right but has specific fixable issues (list them).
-   - **Reject-with-reasons** — the shape or grounding is wrong enough that the team should
-     not act on it as written; state what a correct recommendation would require.
-
-6. **Write the critique into the report.** Populate the report's `#adversarial-review`
-   section with your findings and verdict (preserve the self-contained, no-external-deps
-   HTML constraint). When run standalone without the orchestrator, return the critique as
-   a clearly structured summary instead.
-
-## Principles
-
-- **Adversarial, not contrarian.** Push hard, but every objection carries evidence. Drop
-  any finding you can't support — apply the analyst's own evidence standard to yourself.
-- **Re-derive, don't trust.** The report's summary of the diff/ticket is a claim to verify,
-  not a fact to accept.
-- **Name the anti-pattern.** When you flag a shape problem, use the precise term
-  (ice-cream-cone, over-unit-testing, E2E-for-branch-coverage) so the fix is unambiguous.
-- **Unverifiable is a finding.** "The report claims E2E coverage exists but the `test` repo
-  was never inspected" is a legitimate, important objection — surface it.
diff --git a/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md b/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md
deleted file mode 100644
index 7fbd307..0000000
--- a/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md
+++ /dev/null
@@ -1,61 +0,0 @@
-# Adversarial checklist — rejection criteria
-
-Run every check against each per-platform recommendation and against the overall shape.
-A check "fails" only when you can state a concrete, evidence-backed objection. Record the
-evidence; an objection you can't ground is itself rejected.
-
-## Shape-level checks
-
-1. **Ice-cream cone (too E2E-heavy).** Is confidence concentrated in slow, flaky E2E tests
-   that integration or unit tests could buy more cheaply? Any behavior recommended for E2E
-   that is not a genuinely critical, full-system user journey is suspect — demand the
-   justification and propose the lower layer.
-
-2. **Missing platform layer.** Does an affected platform have a gap in its trophy — e.g.
-   server logic with no integration layer, a client with only E2E and no component/unit
-   coverage, core logic with nothing at all? A whole missing layer is a major finding.
-
-3. **Inverted cost/confidence.** Is core branching logic pushed up to integration/E2E
-   while trivial glue sits at lower layers? Confidence should sit at the cheapest
-   sufficient layer.
-
-## Row-level checks (per behavior → layer assignment)
-
-4. **Unit masquerading as integration (and vice-versa).** Is something labeled
-   "integration" actually a unit test with everything mocked (re-asserting mocks, not real
-   collaboration)? Or a true cross-collaborator behavior mislabeled "unit"? Mislabeling
-   distorts the shape and the confidence claim.
-
-5. **Over-testing trivial code.** Tests recommended for getters/setters, framework glue,
-   generated code, or invariants the type system/analyzer already guarantees. Cost without
-   confidence — recommend dropping or moving to static.
-
-6. **E2E for branch coverage.** Edge cases or error paths assigned to slow full-system
-   tests when they belong at unit/integration. E2E is for journeys, not branches.
-
-7. **Flaky-E2E candidate.** Does a recommended E2E test depend on timing, external
-   services, animation, network, or shared mutable state likely to make it flaky? Flag the
-   flakiness risk and whether an integration test with a controlled boundary would be more
-   reliable.
-
-## Grounding checks
-
-8. **Coverage claimed without evidence.** Any "already tested" / "existing coverage"
-   assertion not backed by an observed test, diff hunk, or CSV row. Especially: **E2E
-   coverage asserted without inspecting the dedicated private `test` repo** — that repo is
-   not inside the platform repos, so unexamined E2E claims are unverified by definition.
-
-9. **Untestable / ambiguous requirement.** A behavior recommended for testing whose
-   acceptance criteria are too vague to write a deterministic assertion against. The fix is
-   to flag the requirement gap upstream, not to write a test against a guess.
-
-10. **Assumption presented as fact.** Inferred platform, stack, tooling, or scope stated
-    without an "assumption" marker. Demand it be labeled so the reader can weigh it.
-
-## Verdict mapping
-
-- **Endorse** — no failing checks, or only cosmetic notes.
-- **Revise** — one or more fixable row-level findings, shape essentially sound.
-- **Reject-with-reasons** — a shape-level failure (ice-cream cone, missing layer, inverted
-  cost/confidence) or pervasive ungrounded coverage claims. State what a correct
-  recommendation would require.

From be2db8d50a85587bfef20e776a0f2d0ff8f49908 Mon Sep 17 00:00:00 2001
From: Ned Thompson <nthompson@bitwarden.com>
Date: Thu, 18 Jun 2026 14:14:50 -0400
Subject: [PATCH 3/9] html report changes, defer to current test stack shape
 rather than forcing trophy

---
 .cspell.json                                  |  10 +
 README.md                                     |   2 +-
 plugins/bitwarden-test-engineer/CHANGELOG.md  |  26 +-
 plugins/bitwarden-test-engineer/README.md     |  10 +-
 .../bitwarden-test-engineer/agents/AGENT.md   |  57 +-
 .../references/input-sources.md               |  41 +-
 .../references/report-style-tokens.md         | 484 +++------------
 .../references/report-style.css               | 552 ++++++++++++++++++
 .../scripts/build-report.sh                   | 202 +++++++
 .../skills/analyzing-test-stack/SKILL.md      |  18 +-
 .../references/html-report-template.md        | 116 +++-
 .../references/monorepo-layout.md             |  69 ++-
 .../references/severity-risk.md               |  16 +-
 .../references/testing-trophy.md              | 117 ++--
 .../skills/assessing-test-coverage/SKILL.md   |   8 +-
 .../references/coverage-report-template.md    |  95 ++-
 .../references/finding-coverage.md            |  63 +-
 17 files changed, 1322 insertions(+), 564 deletions(-)
 create mode 100644 plugins/bitwarden-test-engineer/references/report-style.css
 create mode 100755 plugins/bitwarden-test-engineer/scripts/build-report.sh

diff --git a/.cspell.json b/.cspell.json
index b8189cd..e39c245 100644
--- a/.cspell.json
+++ b/.cspell.json
@@ -3,6 +3,7 @@
   "version": "0.2",
   "words": [
     "accum",
+    "actioned",
     "adf",
     "AKIA",
     "anthropics",
@@ -53,6 +54,7 @@
     "Gatekeeping",
     "GHAS",
     "ghsa",
+    "getline",
     "gofmt",
     "gradlew",
     "grype",
@@ -63,6 +65,7 @@
     "hotspots",
     "IDOR",
     "inclusivity",
+    "inlines",
     "issueIdOrKey",
     "issuelinks",
     "issuetype",
@@ -76,6 +79,7 @@
     "mcp",
     "Menlo",
     "metacharacters",
+    "mockall",
     "modelcontextprotocol",
     "msword",
     "MVVM",
@@ -83,6 +87,7 @@
     "mypassword",
     "myproject",
     "Newtonsoft",
+    "nextest",
     "nextPageToken",
     "numstat",
     "NVARCHAR",
@@ -124,6 +129,10 @@
     "startswith",
     "stride",
     "structurizr",
+    "stylesheet",
+    "subdirs",
+    "tablist",
+    "tabpanel",
     "tarpit",
     "thumbsup",
     "tinyui",
@@ -133,6 +142,7 @@
     "triaging",
     "unassigning",
     "unassigns",
+    "unfound",
     "ungroup",
     "unlinkable",
     "unresponded",
diff --git a/README.md b/README.md
index 61b5442..c693611 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ A curated collection of plugins for AI-assisted development at Bitwarden. Enable
 | [bitwarden-product-analyst](plugins/bitwarden-product-analyst/)     | 0.1.5   | Product analyst agent for creating comprehensive Bitwarden requirements documents from multiple sources                                                     |
 | [bitwarden-security-engineer](plugins/bitwarden-security-engineer/) | 1.2.0   | Application security engineering: vulnerability triage, threat modeling, and secure code analysis                                                           |
 | [bitwarden-software-engineer](plugins/bitwarden-software-engineer/) | 1.0.0   | Software engineer agent for a Bitwarden product team. Implements stories, tasks, and bugs with code quality, performance, security, and team comms in mind. |
-| [bitwarden-test-engineer](plugins/bitwarden-test-engineer/)         | 1.0.0 | Test engineering toolkit: an orchestrator dispatches testing skills strategy and planning, automation, exploratory testing, and quality assessment.         |
+| [bitwarden-test-engineer](plugins/bitwarden-test-engineer/)         | 1.0.0   | Test engineering toolkit: an orchestrator dispatches testing skills for strategy and planning, automation, exploratory testing, and quality assessment.     |
 | [claude-config-validator](plugins/claude-config-validator/)         | 1.1.1   | Validates Claude Code configuration files for security, structure, and quality                                                                              |
 | [claude-retrospective](plugins/claude-retrospective/)               | 1.1.1   | Analyze Claude Code sessions to identify successful patterns and improvement opportunities                                                                  |
 
diff --git a/plugins/bitwarden-test-engineer/CHANGELOG.md b/plugins/bitwarden-test-engineer/CHANGELOG.md
index 12cf16b..9b3ab89 100644
--- a/plugins/bitwarden-test-engineer/CHANGELOG.md
+++ b/plugins/bitwarden-test-engineer/CHANGELOG.md
@@ -26,15 +26,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   never a repo-wide sweep), it discovers each repo's test conventions config-first, buckets
   every observed test by layer, cites it as a stable GitHub permalink (commit SHA, not
   branch), records untested behaviors as `unverified` gaps, and writes its own self-contained
-  HTML **coverage report** (`test-coverage-report-<slug>-<date>.html`) following
+  HTML **coverage report** (`test-coverage-report-<slug>-<date>-<HHMMSS>.html`) following
   `references/coverage-report-template.md`. Usable standalone to audit current coverage, and
   consumed by `analyzing-test-stack`. Owns convention discovery, existing-test finding, and
   the GitHub permalink citation rules (in `references/finding-coverage.md`) — concerns kept
   separate from the trophy recommendation.
 - Plugin-level shared `references/`: `input-sources.md` (evidence-source ingestion, used by
-  both skills and the agent) and `report-style-tokens.md` (the single off-brand data-report
-  styling system both the coverage report and the test-stack report inline verbatim, so the
-  two read as one instrument).
+  both skills and the agent), `report-style.css` (the single off-brand data-report stylesheet
+  both reports use) and `report-style-tokens.md` (its design contract). The
+  `scripts/build-report.sh` build script splices `report-style.css` into each report so the
+  stylesheet is never reproduced as model output and the coverage and test-stack reports
+  cannot drift — they read as one instrument.
+- Combined two-tab report: when the agent runs end to end, the `test-combined` build mode
+  stitches the two standalone reports into one page with _Current coverage_ and
+  _Recommended coverage_ tabs (CSS-only, no JavaScript; stacks both views on print). It is a
+  presentation-only merge assembled from the finished report files — each skill still authors
+  and builds its own standalone report unchanged, so the split between coverage and
+  recommendation stays intact. The tab chrome lives entirely in `report-style.css` and the
+  build script; no skill or template knows about tabs.
 - `analyzing-test-stack` skill: consumes the coverage inventory from `assessing-test-coverage`,
   then maps a change's testable behaviors to the cheapest
   sufficient Testing Trophy layer (static, unit, integration, E2E) per platform and emits
@@ -50,11 +59,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   layer→repo map, evidence-source ingestion, and the HTML report
   template. The Atlassian `search_confluence` / `search_confluence_cql` tools back locating a
   breakdown by feature/team name when only a name (not a page ID) is given.
+- Linked table of contents (`.toc`) at the top of every report's `<main>`, linking to
+  each section; in the combined report the build script namespaces the ToC's anchors per tab so
+  each panel's ToC jumps within its own panel.
 - Top-of-report `#overview` synthesis section, written by the analyst: a 2–4 sentence recap
   of the recommended shape per platform, the top 3 open risks (drawn from
   `#gaps`), and anchor links into the detail sections, so readers see the bottom line without
   scrolling. The overview is additive — per-behavior detail stays in `#recommendations`/`#gaps`.
-- Per-layer model governance to optimize token spend: the agent runs on Opus
-  (its context drives the analysis and the recommendation), while the fan-out
+- Per-layer model governance to optimize token spend: the agent inherits the session model
+  for its own context (which drives the analysis and the recommendation), while the fan-out
   evidence subagents are assigned explicitly — `sonnet` for sources that read a diff, ticket,
-  or repo, `haiku` for pure CSV parsing — rather than inheriting Opus.
+  or repo, `haiku` for pure CSV parsing — rather than inheriting the orchestrator's model.
diff --git a/plugins/bitwarden-test-engineer/README.md b/plugins/bitwarden-test-engineer/README.md
index f18e06e..86b3336 100644
--- a/plugins/bitwarden-test-engineer/README.md
+++ b/plugins/bitwarden-test-engineer/README.md
@@ -80,10 +80,12 @@ automated and at what layer?
 ```
 
 Each run produces two self-contained HTML files in the current working directory: a
-`test-coverage-report-<slug>-<date>.html` (what is already tested — observed tests per layer,
-each cited as a GitHub permalink, plus gaps) and a `test-stack-report-<slug>-<date>.html` (the
-per-platform recommendation and its coverage-gap findings). Both share one off-brand
-data-report visual system so they read as the same instrument.
+`test-coverage-report-<slug>-<date>-<HHMMSS>.html` (what is already tested — observed tests per
+layer, each cited as a GitHub permalink, plus gaps) and a
+`test-stack-report-<slug>-<date>-<HHMMSS>.html` (the per-platform recommendation and its
+coverage-gap findings). The `HHMMSS` time suffix is stamped at build time, so re-running on the
+same day never overwrites a prior report. Both share one off-brand data-report visual system so
+they read as the same instrument.
 
 ## References
 
diff --git a/plugins/bitwarden-test-engineer/agents/AGENT.md b/plugins/bitwarden-test-engineer/agents/AGENT.md
index fdc1dd6..31a1d92 100644
--- a/plugins/bitwarden-test-engineer/agents/AGENT.md
+++ b/plugins/bitwarden-test-engineer/agents/AGENT.md
@@ -39,7 +39,7 @@ description: |
   Tech-breakdown intake. The agent fetches the Confluence breakdown via the Atlassian MCP, extracts testable behaviors and the affected platforms from Part 2, then runs the analyst to emit the report.
   </commentary>
   </example>
-model: opus
+model: inherit
 tools:
   - Read
   - Write
@@ -57,6 +57,7 @@ tools:
   - Bash(git remote get-url:*)
   - Bash(git -C * rev-parse:*)
   - Bash(git -C * remote get-url:*)
+  - Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*)
   - mcp__bitwarden-atlassian__get_issue
   - mcp__bitwarden-atlassian__search_issues
   - mcp__bitwarden-atlassian__get_issue_comments
@@ -70,13 +71,13 @@ skills:
 color: green
 ---
 
-You are a test automation strategist for Bitwarden. Your job is to take a change — a feature, a bugfix, a refactor, or a migration — and tell the team **what to test, at which layer, and why**, shaped as a Testing Trophy: a unit layer for pure logic, a heavy integration layer where most confidence is bought, and a thin E2E layer reserved for critical user journeys.
+You are a test automation strategist for Bitwarden. Your job is to take a change — a feature, a bugfix, a refactor, or a migration — and tell the team **what to test, at which layer, and why**, across three layers: a unit layer for pure logic, an integration layer for collaborator wiring, and a thin E2E layer reserved for critical user journeys. How those layers are weighted is **per repo, not one universal trophy** — Bitwarden's repos span unit-heavy pyramids (`server`, `clients`, `sdk-internal`, `android`), an integration + snapshot trophy (`ios`), and all-E2E repos (`test`, `browser-interactions-testing`).
 
-You do not write the tests. You produce a recommendation — an HTML report — that an engineer or QA can act on. Ground every layer call in evidence and keep the trophy shape honest, because a test plan tends to drift toward whatever is easiest to write rather than what actually buys confidence.
+You do not write the tests. You produce a recommendation — an HTML report — that an engineer or QA can act on. Ground every layer call in evidence and keep each repo's shape honest, because a test plan tends to drift toward whatever is easiest to write rather than what actually buys confidence.
 
 ## Operating context
 
-Bitwarden's code is split across several repositories, each with its own platform, stack, and test tooling. Assume the user works in a multi-repo layout such as `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`, and similar. A single feature frequently spans more than one of these (e.g. a server endpoint plus a web client plus a mobile screen), and each platform's trophy is shaped independently.
+Bitwarden's code is split across several repositories, each with its own platform, stack, and test tooling. Assume the user works in a multi-repo layout such as `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`, and similar. A single feature frequently spans more than one of these (e.g. a server endpoint plus a web client plus a mobile screen), and each repo is shaped independently — match the recommendation to that repo's actual practice (`monorepo-layout.md` → _Each repo's test shape in practice_), not a single house style.
 
 **Where each layer lives:** unit and integration live alongside the code in each platform repo; **E2E lives in the dedicated `test` repo** (sibling of the platform repos). See `${CLAUDE_PLUGIN_ROOT}/skills/analyzing-test-stack/references/monorepo-layout.md` for the per-platform stack, tooling, and the layer→repo map.
 
@@ -94,43 +95,65 @@ Then determine the **affected repos/platforms**. If scope is genuinely ambiguous
 
 Spawn `Task` subagents **in parallel**, one per evidence source or affected repo, so your own context stays lean. Each subagent returns a compact structured digest (not raw dumps). Typical fan-out:
 
-- **Requirements reader** (model: `sonnet`) — resolves the Jira issue into testable behaviors and acceptance criteria, expanding Epics/Features to their children and feeding any linked PR URLs to the PR diff analyzer downstream. Captures the **severity** assigned on a bug/defect ticket so the recommendation can be risk-weighted. Follows the recipe in `${CLAUDE_PLUGIN_ROOT}/references/input-sources.md` → _Epic intake_.
+- **Requirements reader** (model: `sonnet`) — resolves the Jira issue into testable behaviors and acceptance criteria, expanding Epics/Features to their children and feeding any linked PR URLs to the PR diff analyzer downstream. Captures the **severity** assigned on a bug/defect ticket so the recommendation can be risk-weighted, and the **source issue key + browse URL** for each behavior (for an Epic, the specific child the behavior came from) so the report can link every behavior back to its requirement. Follows the recipe in `${CLAUDE_PLUGIN_ROOT}/references/input-sources.md` → _Epic intake_ and _Citing Jira issues as links_.
 - **Breakdown reader** (model: `sonnet`) — fetches the tech breakdown via `mcp__bitwarden-atlassian__get_confluence_page` (searching first with `search_confluence`/`search_confluence_cql` when given only a name), then mines Part 2's scope checklist for the surfaces touched, the relevant Part 4 spec child pages for interfaces, and Part 5's open questions for untestable-requirement risk. Returns testable behaviors per platform plus the breakdown's status.
 - **PR diff analyzer** (model: `sonnet`) — `gh pr diff` / `gh pr view` to extract the change surface, public API touched, and tests already present.
 - **CSV parser** (model: `haiku`) — reads the export and buckets existing cases by apparent layer and automation status.
 
 Give each subagent a single source and a tight output contract. Skip any branch whose input was not supplied.
 
-**Set each subagent's model explicitly** — `haiku` for the CSV parser, `sonnet` for the rest. Never let a digest-returning subagent inherit Opus. See _Model selection_ below for the rationale.
+**Set each subagent's model explicitly** — `haiku` for the CSV parser, `sonnet` for the rest. Never let a digest-returning subagent inherit the orchestrator's model. See _Model selection_ below for the rationale.
 
 ### 3. Assess existing coverage
 
-Once the change surface is known (the diff paths/symbols and named components from step 2), determine what is **already tested** before recommending anything new. Fan out a **per-repo coverage scout** (model: `sonnet`) for each affected platform repo, each applying the `assessing-test-coverage` skill: read the repo's Claude config for conventions, establish coverage **PR-first then via a targeted lookup scoped to the change surface** (never a repo-wide sweep), inspect the sibling `test` repo for E2E, and return a **permalink record per cited test** (`{ path, start_line, end_line, owner_repo, sha, layer, permalink }`, or `{ path, unlinkable_reason }` when an ingredient is missing) plus `unverified` gaps. The output contract, the PR-first/targeted-lookup discipline, and the SHA/`owner-repo` permalink recipe all live in `${CLAUDE_PLUGIN_ROOT}/skills/assessing-test-coverage/references/finding-coverage.md` — the scouts follow it; don't restate it here. Merge the scouts' records into a single coverage inventory.
+Once the change surface is known (the diff paths/symbols and named components from step 2), determine what is **already tested** before recommending anything new. Fan out a **per-repo coverage scout** (model: `sonnet`) for each affected platform repo, each applying the `assessing-test-coverage` skill: read the repo's Claude config for conventions, establish coverage **PR-first then via a targeted lookup scoped to the change surface** (never a repo-wide sweep), inspect the sibling `test` repo for E2E, and return **one record per behavior** — its layer, an approximate count, and 1–3 representative permalinks (`{ behavior, platform, layer, status, count, representative: [{ path, start_line, end_line, owner_repo, sha, permalink }] }`) plus `unverified` gaps. **Scouts must establish coverage per behavior and stop as soon as it's confirmed — never enumerate every test method in a covered area** (this is the dominant cost control; a behavior backed by 40 tests is one record with a count of ~40 and 3 exemplars, not 40 records). The output contract, the per-behavior discipline, the PR-first/targeted-lookup rule, and the SHA/`owner-repo` permalink recipe all live in `${CLAUDE_PLUGIN_ROOT}/skills/assessing-test-coverage/references/finding-coverage.md` — the scouts follow it; don't restate it here. Merge the scouts' per-behavior records into a single coverage inventory.
 
-This step depends on step 2's change surface, so run it after the evidence fan-out (not interleaved). Scouts capture the SHA via `git -C <repo> rev-parse HEAD` and `owner/repo` via `git -C <repo> remote get-url origin`. Then invoke `Skill(assessing-test-coverage)` with the merged inventory and today's date: it writes a **self-contained HTML coverage report** to the current working directory as `test-coverage-report-<slug>-<date>.html` (the backward-looking inventory — observed tests per layer with permalinks, plus `unverified` gaps) and returns the inventory records for step 4. The scouts do the gathering; the skill assembles the report. Pass today's date — skills cannot read the clock.
+This step depends on step 2's change surface, so run it after the evidence fan-out (not interleaved). Scouts capture the SHA via `git -C <repo> rev-parse HEAD` and `owner/repo` via `git -C <repo> remote get-url origin`. Then invoke `Skill(assessing-test-coverage)` with the merged inventory and today's date to produce the backward-looking coverage inventory (observed tests per layer with permalinks, plus `unverified` gaps) and the **self-contained HTML coverage report** — a `test-coverage-report-<slug>-<date>-<HHMMSS>.html` file in the current working directory. The skill returns the inventory records for step 4. Per the skill, the actual HTML _rendering_ is delegated to the Sonnet **report-writer subagent** (see _Model selection_) — only the gathering and inventory merge happen in your context. Pass today's date — skills cannot read the clock; the build script stamps the `HHMMSS` suffix so the file is always fresh.
 
 ### 4. Recommend
 
-Invoke `Skill(analyzing-test-stack)` with the gathered digests **and the coverage inventory from step 3**. It maps each testable behavior to the cheapest sufficient trophy layer per platform, **risk-weighted by each behavior's severity** (the impact a defect would carry — read from a bug's Jira severity field or assessed against Bitwarden's severity guide; see the skill's `references/severity-risk.md`), names concrete tooling, surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, mislabeled layers, ungrounded coverage claims) ordered by severity, and writes a **self-contained HTML report** (inline CSS, no external dependencies) to the current working directory as `test-stack-report-<slug>-<date>.html`. The analyst writes the report's `#overview` itself. Pass today's date to the skill — skills cannot read the clock themselves.
+Invoke `Skill(analyzing-test-stack)` with the gathered digests **and the coverage inventory from step 3**. The behavior→layer mapping is the genuinely hard reasoning and **stays in your own (orchestrator) context**: the skill maps each testable behavior to the cheapest sufficient trophy layer per platform, **risk-weighted by each behavior's severity** (the impact a defect would carry — read from a bug's Jira severity field or assessed against Bitwarden's severity guide; see the skill's `references/severity-risk.md`), names concrete tooling, and surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, mislabeled layers, ungrounded coverage claims) ordered by severity. Once that mapping is decided, rendering it into the **self-contained HTML report** (`test-stack-report-<slug>-<date>-<HHMMSS>.html` in the current working directory) is mechanical and is delegated to the Sonnet **report-writer subagent** (see _Model selection_) — hand it the decided per-behavior records, each carrying its `source_issue` (key + URL) from intake, and the `#overview` synthesis to lay out; it authors the fragment, linking every Jira item and every Jira-sourced behavior to its browse URL per the template, and runs the build script. Pass today's date to the skill — skills cannot read the clock; the build script stamps the `HHMMSS` suffix.
 
-### 5. Present
+### 5. Combine and present
 
-The run produces **two self-contained HTML files** in the current working directory: the `test-coverage-report-*.html` (what is already tested, from step 3) and the `test-stack-report-*.html` (the recommendation, from step 4). Mirror the test-stack report's `#overview` in chat: the recommended shape per platform, the top open risks the user should resolve before committing to the plan, and any coverage the analyst could not verify. Point the user at both files — the coverage report for the existing-test detail, the test-stack report for the per-behavior recommendation.
+Steps 3 and 4 each emit a self-contained HTML file in the current working directory: the `test-coverage-report-<slug>-<date>-<HHMMSS>.html` (what is already tested) and the `test-stack-report-<slug>-<date>-<HHMMSS>.html` (the recommendation). Each filename carries the build script's timestamp, so re-running never overwrites a prior report.
+
+Then assemble the **combined two-tab page** — the primary deliverable, with _Current coverage_ (the coverage report) and _Recommended coverage_ (the test-stack report) on one page. Run the build script yourself (it is pure file assembly — no template or stylesheet reading, so your context stays lean) with the two filenames the prior steps printed:
+
+```bash
+"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \
+  --kind test-combined --slug <slug> --date <today> \
+  --current <test-coverage-report-….html> \
+  --recommended <test-stack-report-….html>
+```
+
+This writes `test-combined-report-<slug>-<date>-<HHMMSS>.html`; the two standalone reports are read, not modified, and remain available. Use the exact filenames the build script printed.
+
+Mirror the test-stack report's `#overview` in chat: the recommended shape per platform, the top open risks the user should resolve before committing to the plan, and any coverage the analyst could not verify. Point the user at the **combined page** first (both views in one file), and note the two standalone reports are also available for sharing a single view.
 
 ## Principles
 
 - **Evidence over assertion.** Every recommended layer ties back to a specific behavior, requirement, diff hunk, or existing test. Flag anything you could not ground.
-- **Cheapest sufficient layer.** Push confidence down the trophy — prefer integration over E2E, unit over integration — unless a behavior genuinely requires the higher layer.
+- **Cheapest sufficient layer, inside the repo's shape.** Push confidence down — prefer integration over E2E, unit over integration — unless a behavior genuinely requires the higher layer. Then land the call inside the target repo's actual shape (pyramid for `server`/`sdk-internal`/`clients`/`android`, integration + snapshot for `ios`, all-E2E for `test`/`browser-interactions-testing`).
 - **Risk-weighted by severity.** Coverage rigor scales with the impact a defect would carry, not with how urgently it ships. Critical behaviors (core flows, data integrity, security) owe their failure modes full coverage and lead the gap list; Low behaviors earn minimal coverage and never an E2E test. Severity (impact) ≠ priority (urgency).
-- **Degrade gracefully.** A missing input (no Jira MCP, no PR, no CSV, no `test` repo checkout) narrows the analysis; it never blocks it. State what you could not see.
+- **Degrade gracefully.** A missing input (no `bitwarden-atlassian-tools` MCP, no PR, no CSV, no `test` repo checkout) narrows the analysis; it never blocks it. State what you could not see.
 - **Read repo config first.** When the analysis touches a checked-out codebase, the coverage scouts read its Claude config (root `CLAUDE.md`, `.claude/`, and nested `CLAUDE.md` for the touched subdirs) before opening test files, and honor its test conventions over generic defaults. Explore test files only as a fallback for conventions the config doesn't cover. See `${CLAUDE_PLUGIN_ROOT}/skills/assessing-test-coverage/references/finding-coverage.md` → _Discovering a repo's test conventions_.
 - **Coverage before recommendation.** Assess what already exists (step 3) before mapping new layers (step 4); the recommendation is incremental against observed coverage, not absolute.
 
 ## Model selection
 
-Model spend is governed here in the plugin, not left to the session default. The split:
+This agent **inherits the session model** for its own context — the orchestration and the hard reasoning run on whatever model the user set the session to. What the plugin governs explicitly is the model of every subagent you fan out, so the cheap, high-volume work never runs at the orchestrator's rate. The split:
+
+- **You (the test-engineer agent) keep the genuinely hard work in your own context** — classifying intake, then mapping behaviors to the cheapest sufficient layer across multiple platforms, risk-weighted by severity. This is cross-repo strategic reasoning where a wrong recommendation is expensive to act on, so it stays with the orchestrator rather than being delegated to a subagent.
+- **Evidence-gathering subagents run on Sonnet or Haiku.** Every gathering subagent you fan out collects evidence and returns a compact digest. Sonnet handles anything that reads a diff, ticket, or repo; Haiku handles pure parsing. Assign the model explicitly on every `Task` (see step 2) rather than letting it inherit the orchestrator's model.
+- **Report rendering runs on Sonnet — the report-writer subagent.** Once the coverage inventory (step 3) and the behavior→layer/severity mapping (step 4) are decided, turning them into HTML is **mechanical formatting, not reasoning**, and is delegated rather than done in your own context. Dispatch a `Task` (model: `sonnet`) report-writer that receives the decided structured records (plus the `#overview` synthesis you wrote), authors the report **content fragment** per the skill's template, and runs `${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh` to splice in the stylesheet and emit the file. The stylesheet itself is a static file the build script inlines — it is never reproduced as model output by anyone, on any model.
+
+Rule of thumb: push the cheap, high-volume gathering **and the mechanical report rendering** down to explicitly-pinned Sonnet/Haiku subagents; keep only the irreducible layer/severity reasoning in the orchestrator context.
+
+## Keep your orchestrator context lean
 
-- **You (the test-engineer agent) run on Opus.** Your context is where the genuinely hard work happens: classifying intake, then running `analyzing-test-stack` — mapping behaviors to the cheapest sufficient layer across multiple platforms — all in _your_ context, so your model sets its quality. This is cross-repo strategic reasoning where a wrong recommendation is expensive to act on; it justifies Opus.
-- **Subagents run on Sonnet or Haiku.** Everything you fan out is evidence gathering that returns a compact digest. Sonnet handles anything that reads a diff, ticket, or repo; Haiku handles pure parsing. Assign the model explicitly on every `Task` (see step 2) rather than letting it inherit Opus.
+Your own context is the most expensive token pool in the run — what you read into it and re-emit is re-cached on every subsequent turn. Three rules:
 
-Rule of thumb: push the cheap, high-volume gathering down to Sonnet/Haiku; keep only the irreducible reasoning on Opus.
+- **Never read the rendering files into your context.** The report templates (`html-report-template.md`, `coverage-report-template.md`), `report-style-tokens.md`, `report-style.css`, and `build-report.sh` are the **report-writer subagent's** concern only — it reads them. You only need the reasoning references (`testing-trophy.md`, `severity-risk.md`, `monorepo-layout.md`, `input-sources.md`, and `finding-coverage.md` for the contract). Loading the templates or stylesheet into your context is wasted cache. (The combined-page build in step 5 is the one time you _invoke_ `build-report.sh` directly — but you only run it on the two finished report filenames; you still never read its source or the rendering files.)
+- **Don't restate digests.** Subagents return compact digests; synthesize them into the decision, don't echo them back to the user mid-run. Keep inter-step narration to a few lines — the reports are the deliverable, not a running commentary.
+- **Hand off by the smallest payload.** Pass report-writers the compact per-behavior records (now small by design) and the `#overview` text. If a record set is still large, `Write` it to a temp file (e.g. `./.test-engineer-<slug>.json`) and pass the path instead of pasting the blob into the prompt.
diff --git a/plugins/bitwarden-test-engineer/references/input-sources.md b/plugins/bitwarden-test-engineer/references/input-sources.md
index 9724f09..7938b95 100644
--- a/plugins/bitwarden-test-engineer/references/input-sources.md
+++ b/plugins/bitwarden-test-engineer/references/input-sources.md
@@ -20,6 +20,13 @@ Otherwise use the MCP tools directly:
 Extract: discrete **testable behaviors**, **acceptance criteria**, and the **platforms/
 components** named. If the MCP is unavailable, ask the user to paste the requirements.
 
+Also capture, for every issue you read, its **key and browse URL** (prefer the URL the MCP/skill
+returns; otherwise construct `https://bitwarden.atlassian.net/browse/<KEY>`), and **carry the
+originating issue key with each behavior you extract**. The report links every behavior back to
+the Jira item it came from — see _Citing Jira issues as links_ below — so provenance must survive
+intake. A behavior that traces to no Jira item (e.g. found only in a PR diff) simply carries no
+source issue.
+
 Also capture **severity** — for a bug/defect ticket, read the severity assigned on the issue
 (the severity field, or the QA/reporter's stated severity in the description/comments) and
 carry it with the behaviors; for a feature/story without a defect, leave it to the analyst to
@@ -46,7 +53,9 @@ before extracting:
    not re-derive it.
 3. **Per child, gather behaviors and PRs.**
    - `mcp__bitwarden-atlassian__get_issue` for the child's description and acceptance
-     criteria — these are the testable behaviors for the trophy.
+     criteria — these are the testable behaviors for the trophy. Capture each child's **key and
+     browse URL** and carry it with the behaviors it produces, exactly as for a single-issue
+     intake — a behavior sourced from a child issue links to that child, not the epic.
    - `mcp__bitwarden-atlassian__get_issue_remote_links` for PRs (grouped under "GitHub").
      Each PR URL becomes an input to the **GitHub PR** branch below: hand it off to
      `gh pr view` / `gh pr diff` so the actual change surface and any tests-in-PR feed the
@@ -154,3 +163,33 @@ Map rows to behaviors and bucket each by apparent layer using the `analyzing-tes
 Flag cases that are currently manual but cheaply automatable at a lower layer, and cases
 slated for E2E that would be better as integration. If a column's meaning is ambiguous,
 state the interpretation you used rather than guessing silently.
+
+## Citing Jira issues as links
+
+Every Jira item the report **names** — and every behavior the report shows that was **found from
+a Jira item** — is rendered as a clickable link to that item, never as bare key text. This is the
+Jira counterpart to the GitHub permalink rule for tests (the `assessing-test-coverage` skill's
+`references/finding-coverage.md` → _Citing tests as GitHub permalinks_).
+
+The link form is the issue's browse URL:
+
+```
+https://bitwarden.atlassian.net/browse/<KEY>
+```
+
+where `<KEY>` is the issue key (e.g. `PM-1234`). Prefer the URL the MCP tool or
+`bitwarden-atlassian-tools:researching-jira-issues` skill returns for the issue; fall back to
+constructing the browse URL from the key. The same rule covers epics and their children — link
+each to its own key.
+
+Apply it everywhere the report renders one of these:
+
+- An **issue, epic, or child key** named in the Overview, Summary, or Evidence sections —
+  anchor the key: `<a href="https://bitwarden.atlassian.net/browse/PM-1234">PM-1234</a>`.
+- A **behavior row** (in the recommendations/coverage and gaps sections) whose behavior was
+  extracted from a Jira item — append the linked source key to the behavior cell so a reader can
+  jump to the requirement it came from. A behavior with no Jira source (PR-only) carries no key.
+
+These are informational `<a href>` citations — text, not loaded assets — so they do not violate
+the reports' self-contained / no-remote-resources constraint. Never fabricate a key or URL; if an
+issue's key is unknown, name the source in plain text rather than inventing a link.
diff --git a/plugins/bitwarden-test-engineer/references/report-style-tokens.md b/plugins/bitwarden-test-engineer/references/report-style-tokens.md
index 4957f9f..117303e 100644
--- a/plugins/bitwarden-test-engineer/references/report-style-tokens.md
+++ b/plugins/bitwarden-test-engineer/references/report-style-tokens.md
@@ -1,11 +1,20 @@
 # Report style tokens — data-report visual system for HTML reports
 
-This file is the **single source of styling truth** for every self-contained HTML report the
+This file documents the **visual system** for every self-contained HTML report the
 `bitwarden-test-engineer` plugin emits — the `analyzing-test-stack` test-stack report and the
 `assessing-test-coverage` coverage report alike. The HTML output requirements (single file,
 inline CSS, no external/CDN assets, no web fonts, no JS) mean a report cannot `<link>` to a
-design system at runtime — instead, **inline the stylesheet block at the bottom of this file
-verbatim** into the report's `<style>` element.
+design system at runtime — the stylesheet must be inlined into the report's `<style>` element.
+
+**You do not retype the stylesheet.** The canonical CSS lives as a real file at
+`report-style.css` (alongside this file in the plugin-level `references/` directory) and is
+spliced into the report by the `scripts/build-report.sh` build script — never reproduced as
+model output. Authoring a
+report means writing its **content** (the sections below) into a fragment whose `<style>`
+element holds a single sentinel line, then running the build script, which substitutes
+`report-style.css` for the sentinel verbatim. See _Building the report_ below. This is what
+keeps the two reports on one identical system: they splice the same file, so they cannot drift,
+and the ~400-line stylesheet costs zero output tokens per report.
 
 The look is deliberately **not** a brand skin. It is a quiet, ink-on-paper _data report_
 — the aesthetic of a statistical notebook or a coverage readout, where the data is the
@@ -33,12 +42,12 @@ same instrument. Do not re-pick colors, fonts, or layer tokens per report.
 
 These mappings are **normative**. Do not re-pick colors per report.
 
-### Layer tokens (used wherever a Testing Trophy layer is rendered — chips, distribution bars, table cells)
+### Layer tokens (used wherever a test layer is rendered — chips, distribution bars, table cells)
 
 | Layer       | Token           | HEX       | Role in the ramp                 |
 | ----------- | --------------- | --------- | -------------------------------- |
 | unit        | `--unit`        | `#8FB3D1` | lightest — cheapest / shallowest |
-| integration | `--integration` | `#3F7196` | mid — the trophy's bulge         |
+| integration | `--integration` | `#3F7196` | mid — the confidence layer       |
 | e2e         | `--e2e`         | `#1D3A54` | deepest — most expensive, thin   |
 
 `unit` is light, so layer chips and bar segments at the unit layer use **dark** text
@@ -95,393 +104,85 @@ rendered as a normalized horizontal **stacked bar** (a `<figure>` captioned `Fig
 This replaces any arbitrary fixed-width bar. The chart is the report's signature: keep
 everything around it quiet so it reads.
 
-## Paste-ready stylesheet
-
-Paste the entire block below — unchanged — into the report's `<style>` element, as a single
-contiguous block. **Both report templates inline this identically** — the coverage report
+## Combined report — tabs (assembled, not authored)
+
+When both reports are produced for the same change, the build script can assemble them into
+**one page with two tabs** — _Current coverage_ (the `assessing-test-coverage` report) and
+_Recommended coverage_ (the `analyzing-test-stack` report). This is purely a **presentation**
+merge: each skill still authors and builds its own standalone report exactly as before; the
+combined page is an _additional_ deliverable stitched from the two finished report files. No
+skill or template knows about tabs — the tab markup and its CSS are owned entirely by
+`build-report.sh` and `report-style.css`, so the per-skill split stays intact.
+
+You never hand-write the tab markup. The build script reuses each report's `<header>`/`<main>`,
+namespaces the normative section ids so the two bodies coexist in one document
+(`#overview` → `#cur-overview` / `#rec-overview`, and likewise for the in-page anchor links),
+and emits the tab chrome. The mechanism is **CSS-only** (no JavaScript): two visually-hidden
+radio inputs (`.tab-input#tab-current` / `#tab-recommended`) drive the active `.tablist label`
+and which `.tabpanel[data-panel]` shows, via general-sibling selectors. On print, the tabs
+collapse and both panels stack, each titled by its `aria-label`, so a shared PDF carries the
+whole analysis. These classes live in the stylesheet's _Tabbed combined report_ block and are
+inert in the standalone reports, which never emit them.
+
+## The stylesheet file (binding contract)
+
+The full stylesheet is `report-style.css` (alongside this file). It is the single source of styling truth —
+**both** report templates resolve to it through the build script, so the coverage report
 (`assessing-test-coverage`'s `coverage-report-template.md`) and the test-stack report
-(`analyzing-test-stack`'s `html-report-template.md`). Do not prune unused selectors, do not
-reorder, and do not let one report carry a trimmed copy; that is exactly how two reports that
-claim the same system drift apart. Component classes (`.layer.*`, `.badge.*`,
-`.dist`/`.seg.*`, `.shapes`, etc.) are part of the binding contract — both templates reference
-them by name.
-
-```css
-:root {
-  /* Surfaces & ink — flat paper, no cards or shadows */
-  --paper: #ffffff;
-  --panel: #f4f6f8;
-  --ink: #16191d;
-  --ink-soft: #585f68;
-  --ink-faint: #818892;
-  --rule: #e4e7ea;
-
-  /* Layer ramp — SEQUENTIAL: ordered cheap/shallow -> costly/deep */
-  --unit: #8fb3d1;
-  --integration: #3f7196;
-  --e2e: #1d3a54;
-  --on-unit: #16191d; /* --unit is light: use dark text */
-  --on-deep: #ffffff; /* white text on integration/e2e */
-
-  /* Verdict & state — muted categorical */
-  --ok: #43875a;
-  --warn: #b07d2f;
-  --bad: #bf564a;
-  --on-state: #ffffff;
-
-  --link: #2f6e9e;
-
-  --sans:
-    system-ui, -apple-system, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
-  --mono:
-    ui-monospace, "SF Mono", SFMono-Regular, Menlo, Consolas, "Liberation Mono",
-    monospace;
-}
-
-* {
-  box-sizing: border-box;
-}
-html {
-  -webkit-text-size-adjust: 100%;
-}
-
-body {
-  margin: 0;
-  background: var(--paper);
-  color: var(--ink);
-  font: 15px/1.6 var(--sans);
-  font-feature-settings: "tnum" 1; /* tabular figures where supported */
-}
-
-a {
-  color: var(--link);
-  text-decoration: underline;
-  text-underline-offset: 2px;
-  text-decoration-thickness: 1px;
-}
-a:focus-visible,
-summary:focus-visible {
-  outline: 2px solid var(--link);
-  outline-offset: 2px;
-}
-
-/* Masthead */
-header {
-  max-width: 60rem;
-  margin: 0 auto;
-  padding: 56px 32px 28px;
-}
-header .eyebrow {
-  margin: 0 0 14px;
-  font: 600 11px/1 var(--mono);
-  letter-spacing: 0.18em;
-  text-transform: uppercase;
-  color: var(--ink-faint);
-}
-header h1 {
-  margin: 0 0 12px;
-  font-size: 28px;
-  line-height: 1.2;
-  font-weight: 650;
-  letter-spacing: -0.01em;
-}
-header .meta {
-  font: 12px/1.6 var(--mono);
-  color: var(--ink-soft);
-}
-header .meta a {
-  color: var(--ink-soft);
-}
-
-/* Sections — flat, hairline-separated, auto-numbered */
-main {
-  max-width: 60rem;
-  margin: 0 auto;
-  padding: 0 32px 96px;
-  counter-reset: sec;
-}
-section {
-  counter-increment: sec;
-  padding: 36px 0;
-  border-top: 1px solid var(--rule);
-}
-section:first-of-type {
-  border-top: 0;
-}
-section > h2 {
-  margin: 0 0 18px;
-  font-size: 19px;
-  font-weight: 650;
-  letter-spacing: -0.01em;
-}
-section > h2::before {
-  content: counter(sec, decimal-leading-zero);
-  display: inline-block;
-  margin-right: 12px;
-  font: 600 12px/1 var(--mono);
-  letter-spacing: 0.1em;
-  color: var(--ink-faint);
-  vertical-align: 2px;
-}
-section h3 {
-  margin: 28px 0 10px;
-  font: 600 11px/1.3 var(--mono);
-  letter-spacing: 0.12em;
-  text-transform: uppercase;
-  color: var(--ink-soft);
-}
-
-/* Prose */
-p {
-  margin: 0 0 14px;
-  max-width: 72ch;
-}
-.lead {
-  font-size: 16px;
-}
-.small {
-  font-size: 12.5px;
-  color: var(--ink-soft);
-}
-ul.tight {
-  margin: 8px 0 16px;
-  padding-left: 20px;
-}
-ul.tight li {
-  margin: 0 0 6px;
-}
-ol {
-  padding-left: 22px;
-}
-ol li {
-  margin: 0 0 10px;
-}
-code {
-  font: 0.86em var(--mono);
-  background: var(--panel);
-  padding: 1px 5px;
-  border-radius: 3px;
-}
-
-/* Tables — heavy header rule, hairline rows */
-.scroll {
-  overflow-x: auto;
-}
-table {
-  width: 100%;
-  border-collapse: collapse;
-  margin: 4px 0 18px;
-  font-size: 13.5px;
-}
-thead th {
-  text-align: left;
-  vertical-align: bottom;
-  padding: 0 12px 8px;
-  font: 600 10.5px/1.3 var(--mono);
-  letter-spacing: 0.1em;
-  text-transform: uppercase;
-  color: var(--ink-faint);
-  border-bottom: 1px solid var(--ink);
-}
-tbody td {
-  vertical-align: top;
-  padding: 10px 12px;
-  border-bottom: 1px solid var(--rule);
-}
-tbody tr:hover {
-  background: var(--panel);
-}
-th:first-child,
-td:first-child {
-  padding-left: 0;
-}
-th:last-child,
-td:last-child {
-  padding-right: 0;
-}
-
-/* Layer chip */
-.layer {
-  display: inline-block;
-  font: 600 10.5px/1.6 var(--mono);
-  letter-spacing: 0.08em;
-  text-transform: uppercase;
-  padding: 2px 8px;
-  border-radius: 2px;
-  white-space: nowrap;
-}
-.layer.unit {
-  background: var(--unit);
-  color: var(--on-unit);
-}
-.layer.integration {
-  background: var(--integration);
-  color: var(--on-deep);
-}
-.layer.e2e {
-  background: var(--e2e);
-  color: var(--on-deep);
-}
-
-/* Layer-distribution chart (the signature graphic) */
-figure {
-  margin: 18px 0;
-}
-figcaption {
-  margin-bottom: 14px;
-  font: 11px/1.4 var(--mono);
-  letter-spacing: 0.04em;
-  color: var(--ink-faint);
-}
-.dist .legend {
-  display: flex;
-  flex-wrap: wrap;
-  gap: 18px;
-  margin-bottom: 14px;
-  font: 11px/1 var(--mono);
-  color: var(--ink-soft);
-}
-.dist .legend .key {
-  display: inline-flex;
-  align-items: center;
-  gap: 6px;
-  text-transform: uppercase;
-  letter-spacing: 0.06em;
-}
-.dist .legend .key::before {
-  content: "";
-  width: 10px;
-  height: 10px;
-  border-radius: 2px;
-  background: var(--rule);
-}
-.dist .legend .unit::before {
-  background: var(--unit);
-}
-.dist .legend .integration::before {
-  background: var(--integration);
-}
-.dist .legend .e2e::before {
-  background: var(--e2e);
-}
-.dist-row {
-  display: flex;
-  align-items: center;
-  gap: 14px;
-  margin: 7px 0;
-}
-.dist-row .dist-label {
-  flex: 0 0 14ch;
-  text-align: right;
-  font: 11px/1.3 var(--mono);
-  color: var(--ink-soft);
-  word-break: break-word;
-}
-.dist-row .bar {
-  flex: 1;
-  display: flex;
-  height: 24px;
-  background: var(--panel);
-  border-radius: 3px;
-  overflow: hidden;
-}
-.bar .seg {
-  display: flex;
-  align-items: center;
-  justify-content: center;
-  min-width: 18px;
-  font: 600 11px/1 var(--mono);
-  color: var(--on-deep);
-}
-.bar .seg.unit {
-  background: var(--unit);
-  color: var(--on-unit);
-}
-.bar .seg.integration {
-  background: var(--integration);
-}
-.bar .seg.e2e {
-  background: var(--e2e);
-}
-
-/* Per-platform recommended-shape list (replaces card blocks) */
-ul.shapes {
-  margin: 6px 0 0;
-  padding: 0;
-  list-style: none;
-}
-ul.shapes li {
-  padding: 10px 0;
-  border-top: 1px solid var(--rule);
-}
-ul.shapes li:first-child {
-  border-top: 0;
-}
-ul.shapes .plat {
-  font: 600 13px/1.5 var(--mono);
-}
-
-/* Badges */
-.badge {
-  display: inline-block;
-  font: 600 10px/1.5 var(--mono);
-  letter-spacing: 0.04em;
-  text-transform: uppercase;
-  padding: 1px 6px;
-  border-radius: 2px;
-  color: var(--on-state);
-  white-space: nowrap;
-}
-.badge.assumption {
-  background: var(--warn);
-}
-.badge.warn {
-  background: var(--bad);
-}
-.badge.ok {
-  background: var(--ok);
-}
-
-/* Unlinkable evidence */
-.unlinkable {
-  font: italic 12px/1.4 var(--mono);
-  color: var(--ink-faint);
-}
-
-@media (max-width: 720px) {
-  header,
-  main {
-    padding-left: 20px;
-    padding-right: 20px;
-  }
-  .dist-row {
-    flex-direction: column;
-    align-items: stretch;
-    gap: 4px;
-  }
-  .dist-row .dist-label {
-    flex: none;
-    text-align: left;
-  }
-}
-
-@media print {
-  body {
-    font-size: 11pt;
-  }
-  section {
-    break-inside: avoid;
-    border-top-color: #ccc;
-  }
-  tbody tr:hover {
-    background: none;
-  }
-  a {
-    color: var(--ink);
-  }
-}
+(`analyzing-test-stack`'s `html-report-template.md`) carry byte-identical CSS. Component
+classes (`.layer.*`, `.badge.*`, `.dist`/`.seg.*`, `.shapes`, `.unlinkable`, `.toc`, etc.) are
+part of the binding contract — both templates reference them by name; the markup you author must
+use exactly those class names so the spliced stylesheet styles it. Each report opens its
+`<main>` with a `.toc` nav of linked section anchors; in the combined report the build
+script namespaces those anchor links per tab.
+
+You never read, reproduce, prune, or hand-edit `report-style.css` when authoring a report —
+the build script inlines it whole. If the visual system genuinely needs to change, edit
+`report-style.css` once and every future report inherits it.
+
+## Building the report
+
+The model authors a **content fragment** — a complete HTML document whose `<style>` element
+contains exactly one line, the sentinel:
+
+```html
+<style>
+  /* @@BITWARDEN_REPORT_STYLESHEET@@ */
+</style>
+```
+
+Write that fragment to a temporary path (e.g. `<kind>-report-<slug>.fragment.html`), then run
+the build script from the plugin root:
+
+```bash
+"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \
+  --kind <test-stack|test-coverage> --slug <slug> --date <YYYY-MM-DD> \
+  <fragment-file>
 ```
 
+The script replaces the sentinel with `report-style.css` verbatim and writes
+`<kind>-report-<slug>-<date>-<HHMMSS>.html` to the current working directory, printing the
+final filename to stdout. The `<HHMMSS>` suffix is stamped from the wall clock by the script
+(the model cannot read the clock), so **every run gets a fresh filename** — a report is never
+overwritten, and an existing report never has to be read back and regenerated. Delete the
+temporary fragment afterward. If the script errors (missing sentinel, bad `--kind`/`--date`,
+fragment not found) it writes nothing — fix the fragment and re-run rather than falling back to
+pasting CSS by hand.
+
+To assemble the **combined two-tab page** from the two already-built standalone reports, call
+the script with `--kind test-combined` and the two finished report files (no fragment, no
+sentinel — the bodies are reused as-is):
+
+```bash
+"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \
+  --kind test-combined --slug <slug> --date <YYYY-MM-DD> \
+  --current <test-coverage-report-….html> \
+  --recommended <test-stack-report-….html>
+```
+
+It writes `test-combined-report-<slug>-<date>-<HHMMSS>.html` and prints the filename. The two
+input reports are read, not modified, and their standalone files remain.
+
 ## What not to do
 
 - Do not reintroduce a brand skin — no saturated brand blue/yellow, no logo images, no
@@ -490,7 +191,8 @@ ul.shapes .plat {
   encoding.
 - Do not introduce web fonts, CDN links, or `<link rel="stylesheet">` — the single-file
   constraint is binding.
-- Do not narrow the stylesheet down to "only the classes this report uses." The template
-  ships the full stylesheet so a reader inspecting any report sees the same system.
+- Do not paste, retype, or trim the stylesheet into the fragment — the fragment carries only
+  the sentinel, and the build script supplies the full stylesheet. A report that ships a
+  hand-copied or "only the classes I used" stylesheet is exactly how two reports drift apart.
 - Do not hand-compute the distribution bar widths in pixels or percentages — set
   `flex: <count>` per segment and let the browser normalize.
diff --git a/plugins/bitwarden-test-engineer/references/report-style.css b/plugins/bitwarden-test-engineer/references/report-style.css
new file mode 100644
index 0000000..ad98427
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/references/report-style.css
@@ -0,0 +1,552 @@
+:root {
+  /* Surfaces & ink — flat paper, no cards or shadows */
+  --paper: #ffffff;
+  --panel: #f4f6f8;
+  --ink: #16191d;
+  --ink-soft: #585f68;
+  --ink-faint: #818892;
+  --rule: #e4e7ea;
+
+  /* Layer ramp — SEQUENTIAL: ordered cheap/shallow -> costly/deep */
+  --unit: #8fb3d1;
+  --integration: #3f7196;
+  --e2e: #1d3a54;
+  --on-unit: #16191d; /* --unit is light: use dark text */
+  --on-deep: #ffffff; /* white text on integration/e2e */
+
+  /* Verdict & state — muted categorical */
+  --ok: #43875a;
+  --warn: #b07d2f;
+  --bad: #bf564a;
+  --on-state: #ffffff;
+
+  --link: #2f6e9e;
+
+  --sans:
+    system-ui, -apple-system, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
+  --mono:
+    ui-monospace, "SF Mono", SFMono-Regular, Menlo, Consolas, "Liberation Mono",
+    monospace;
+}
+
+* {
+  box-sizing: border-box;
+}
+html {
+  -webkit-text-size-adjust: 100%;
+  scroll-padding-top: 24px; /* keep anchored sections clear of the top edge */
+}
+
+body {
+  margin: 0;
+  background: var(--paper);
+  color: var(--ink);
+  font: 15px/1.6 var(--sans);
+  font-feature-settings: "tnum" 1; /* tabular figures where supported */
+  -webkit-font-smoothing: antialiased;
+  text-rendering: optimizeLegibility;
+}
+
+/* Smooth in-page jumps for the report's overview -> section anchor links,
+   suppressed when the reader prefers reduced motion. */
+@media (prefers-reduced-motion: no-preference) {
+  html {
+    scroll-behavior: smooth;
+  }
+}
+
+a {
+  color: var(--link);
+  text-decoration: underline;
+  text-underline-offset: 2px;
+  text-decoration-thickness: 1px;
+}
+a:focus-visible,
+summary:focus-visible {
+  outline: 2px solid var(--link);
+  outline-offset: 2px;
+}
+
+/* Masthead */
+header {
+  max-width: 60rem;
+  margin: 0 auto;
+  padding: clamp(36px, 7vw, 56px) clamp(20px, 5vw, 32px) 28px;
+}
+header .eyebrow {
+  margin: 0 0 14px;
+  font: 600 11px/1 var(--mono);
+  letter-spacing: 0.18em;
+  text-transform: uppercase;
+  color: var(--ink-faint);
+}
+header h1 {
+  margin: 0 0 12px;
+  font-size: clamp(24px, 5vw, 32px);
+  line-height: 1.2;
+  font-weight: 650;
+  letter-spacing: -0.01em;
+  text-wrap: balance;
+}
+header .meta {
+  font: 12px/1.6 var(--mono);
+  color: var(--ink-soft);
+}
+header .meta a {
+  color: var(--ink-soft);
+}
+
+/* In-page table of contents — a compact monospace row of section links at the
+   top of <main>. In the combined report the build script namespaces each link's
+   href per tab, so a panel's ToC jumps within its own panel. */
+.toc {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 6px 18px;
+  margin: 0 0 4px;
+  padding: 0 0 20px;
+  border-bottom: 1px solid var(--rule);
+  font: 600 11px/1.6 var(--mono);
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+}
+.toc a {
+  color: var(--ink-soft);
+  text-decoration: none;
+}
+.toc a:hover {
+  color: var(--link);
+  text-decoration: underline;
+}
+
+/* Sections — flat, hairline-separated, auto-numbered */
+main {
+  max-width: 60rem;
+  margin: 0 auto;
+  padding: 0 clamp(20px, 5vw, 32px) 96px;
+  counter-reset: sec;
+}
+section {
+  counter-increment: sec;
+  padding: 36px 0;
+  border-top: 1px solid var(--rule);
+  scroll-margin-top: 24px;
+}
+section:first-of-type {
+  border-top: 0;
+}
+/* Quiet landing cue: briefly tint a section an in-page link jumped to. */
+@media (prefers-reduced-motion: no-preference) {
+  section:target {
+    animation: section-land 1.4s ease-out;
+  }
+  @keyframes section-land {
+    from {
+      background: var(--panel);
+    }
+    to {
+      background: transparent;
+    }
+  }
+}
+section > h2 {
+  margin: 0 0 18px;
+  font-size: 19px;
+  font-weight: 650;
+  letter-spacing: -0.01em;
+  text-wrap: balance;
+}
+section > h2::before {
+  content: counter(sec, decimal-leading-zero);
+  display: inline-block;
+  margin-right: 12px;
+  font: 600 12px/1 var(--mono);
+  letter-spacing: 0.1em;
+  color: var(--ink-faint);
+  vertical-align: 2px;
+}
+section h3 {
+  margin: 28px 0 10px;
+  font: 600 11px/1.3 var(--mono);
+  letter-spacing: 0.12em;
+  text-transform: uppercase;
+  color: var(--ink-soft);
+}
+
+/* Prose */
+p {
+  margin: 0 0 14px;
+  max-width: 72ch;
+  text-wrap: pretty; /* avoid orphans / ragged short last lines */
+}
+.lead {
+  font-size: 16px;
+}
+.small {
+  font-size: 12.5px;
+  color: var(--ink-soft);
+}
+ul.tight {
+  margin: 8px 0 16px;
+  padding-left: 20px;
+}
+ul.tight li {
+  margin: 0 0 6px;
+}
+ol {
+  padding-left: 22px;
+}
+ol li {
+  margin: 0 0 10px;
+}
+code {
+  font: 0.86em var(--mono);
+  background: var(--panel);
+  padding: 1px 5px;
+  border-radius: 3px;
+}
+
+/* Tables — heavy header rule, hairline rows */
+.scroll {
+  overflow-x: auto;
+  -webkit-overflow-scrolling: touch;
+  overscroll-behavior-x: contain;
+}
+table {
+  width: 100%;
+  border-collapse: collapse;
+  margin: 4px 0 18px;
+  font-size: 13.5px;
+}
+thead th {
+  text-align: left;
+  vertical-align: bottom;
+  padding: 0 12px 8px;
+  font: 600 10.5px/1.3 var(--mono);
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: var(--ink-faint);
+  border-bottom: 1px solid var(--ink);
+}
+tbody td {
+  vertical-align: top;
+  padding: 10px 12px;
+  border-bottom: 1px solid var(--rule);
+}
+tbody tr:hover {
+  background: var(--panel);
+}
+th:first-child,
+td:first-child {
+  padding-left: 0;
+}
+th:last-child,
+td:last-child {
+  padding-right: 0;
+}
+
+/* Layer chip */
+.layer {
+  display: inline-block;
+  font: 600 10.5px/1.6 var(--mono);
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+  padding: 2px 8px;
+  border-radius: 2px;
+  white-space: nowrap;
+}
+.layer.unit {
+  background: var(--unit);
+  color: var(--on-unit);
+}
+.layer.integration {
+  background: var(--integration);
+  color: var(--on-deep);
+}
+.layer.e2e {
+  background: var(--e2e);
+  color: var(--on-deep);
+}
+
+/* Layer-distribution chart (the signature graphic) */
+figure {
+  margin: 18px 0;
+}
+figcaption {
+  margin-bottom: 14px;
+  font: 11px/1.4 var(--mono);
+  letter-spacing: 0.04em;
+  color: var(--ink-faint);
+}
+.dist .legend {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 18px;
+  margin-bottom: 14px;
+  font: 11px/1 var(--mono);
+  color: var(--ink-soft);
+}
+.dist .legend .key {
+  display: inline-flex;
+  align-items: center;
+  gap: 6px;
+  text-transform: uppercase;
+  letter-spacing: 0.06em;
+}
+.dist .legend .key::before {
+  content: "";
+  width: 10px;
+  height: 10px;
+  border-radius: 2px;
+  background: var(--rule);
+}
+.dist .legend .unit::before {
+  background: var(--unit);
+}
+.dist .legend .integration::before {
+  background: var(--integration);
+}
+.dist .legend .e2e::before {
+  background: var(--e2e);
+}
+.dist-row {
+  display: flex;
+  align-items: center;
+  gap: 14px;
+  margin: 7px 0;
+}
+.dist-row .dist-label {
+  flex: 0 0 14ch;
+  text-align: right;
+  font: 11px/1.3 var(--mono);
+  color: var(--ink-soft);
+  word-break: break-word;
+}
+.dist-row .bar {
+  flex: 1;
+  display: flex;
+  height: 24px;
+  background: var(--panel);
+  border-radius: 3px;
+  overflow: hidden;
+}
+.bar .seg {
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  min-width: 18px;
+  font: 600 11px/1 var(--mono);
+  color: var(--on-deep);
+}
+.bar .seg.unit {
+  background: var(--unit);
+  color: var(--on-unit);
+}
+.bar .seg.integration {
+  background: var(--integration);
+}
+.bar .seg.e2e {
+  background: var(--e2e);
+}
+
+/* Per-platform recommended-shape list (replaces card blocks) */
+ul.shapes {
+  margin: 6px 0 0;
+  padding: 0;
+  list-style: none;
+}
+ul.shapes li {
+  padding: 10px 0;
+  border-top: 1px solid var(--rule);
+}
+ul.shapes li:first-child {
+  border-top: 0;
+}
+ul.shapes .plat {
+  font: 600 13px/1.5 var(--mono);
+}
+
+/* Badges */
+.badge {
+  display: inline-block;
+  font: 600 10px/1.5 var(--mono);
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+  padding: 1px 6px;
+  border-radius: 2px;
+  color: var(--on-state);
+  white-space: nowrap;
+}
+.badge.assumption {
+  background: var(--warn);
+}
+.badge.warn {
+  background: var(--bad);
+}
+.badge.ok {
+  background: var(--ok);
+}
+
+/* Unlinkable evidence */
+.unlinkable {
+  font: italic 12px/1.4 var(--mono);
+  color: var(--ink-faint);
+}
+
+/* Tabbed combined report — the Current-coverage and Recommended-coverage report
+   bodies surfaced as two tabs on one page, CSS-only (no JavaScript). The radio
+   inputs are visually hidden but keep keyboard focus; the checked input drives
+   both the active label and which panel shows. These rules are only exercised by
+   the combined report; they are inert in the standalone coverage/test-stack
+   reports, which never emit these elements. */
+.tab-input {
+  position: absolute;
+  width: 1px;
+  height: 1px;
+  margin: -1px;
+  opacity: 0;
+}
+.tablist {
+  max-width: 60rem;
+  margin: 0 auto;
+  padding: 0 clamp(20px, 5vw, 32px);
+  display: flex;
+  flex-wrap: wrap;
+  gap: 4px;
+  border-bottom: 1px solid var(--ink);
+}
+.tablist label {
+  display: inline-block;
+  padding: 11px 16px;
+  font: 600 11px/1.4 var(--mono);
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: var(--ink-faint);
+  cursor: pointer;
+  border: 1px solid transparent;
+  border-bottom: 0;
+  border-radius: 3px 3px 0 0;
+  margin-bottom: -1px; /* sit the tab on the list's bottom rule */
+}
+.tablist label:hover {
+  color: var(--ink);
+  background: var(--panel);
+}
+/* A tabpanel is itself a section element; neutralize the global section chrome
+   so only the report sections nested inside its main element render with rules
+   and numbering. */
+.tabpanel {
+  display: none;
+  padding: 0;
+  border-top: 0;
+  counter-increment: none;
+}
+/* Active tab + its panel, driven by the checked radio (general-sibling ~). */
+#tab-current:checked ~ .tablist label[for="tab-current"],
+#tab-recommended:checked ~ .tablist label[for="tab-recommended"] {
+  color: var(--ink);
+  border-color: var(--ink);
+  border-bottom-color: var(--paper);
+  background: var(--paper);
+}
+#tab-current:checked ~ .tabpanel[data-panel="current"],
+#tab-recommended:checked ~ .tabpanel[data-panel="recommended"] {
+  display: block;
+}
+/* Keyboard focus on the visually-hidden radio surfaces a ring on its label. */
+#tab-current:focus-visible ~ .tablist label[for="tab-current"],
+#tab-recommended:focus-visible ~ .tablist label[for="tab-recommended"] {
+  outline: 2px solid var(--link);
+  outline-offset: -2px;
+}
+
+/* Floating "back to top" control — a fixed action button that rides along as the
+   reader scrolls and jumps to the top via the in-page #top anchor on <header>. No
+   JavaScript: it reuses the same smooth-scroll / reduced-motion behavior as the ToC
+   links. Flat to fit the data-report system — a solid ink fill carries it over the
+   content instead of a shadow. Present in every report; hidden when printing. */
+.to-top {
+  position: fixed;
+  right: clamp(16px, 4vw, 28px);
+  bottom: clamp(16px, 4vw, 28px);
+  z-index: 20;
+  display: inline-flex;
+  align-items: center;
+  gap: 6px;
+  padding: 9px 13px;
+  background: var(--ink);
+  color: var(--paper);
+  font: 600 10.5px/1 var(--mono);
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  text-decoration: none;
+  border-radius: 4px;
+}
+.to-top::before {
+  content: "\2191"; /* upwards arrow */
+  font-size: 13px;
+  line-height: 1;
+}
+.to-top:hover {
+  background: var(--link);
+  color: var(--paper);
+}
+.to-top:focus-visible {
+  outline: 2px solid var(--link);
+  outline-offset: 2px;
+}
+
+@media (max-width: 720px) {
+  header,
+  main,
+  .tablist {
+    padding-left: 20px;
+    padding-right: 20px;
+  }
+  .dist-row {
+    flex-direction: column;
+    align-items: stretch;
+    gap: 4px;
+  }
+  .dist-row .dist-label {
+    flex: none;
+    text-align: left;
+  }
+}
+
+@media print {
+  body {
+    font-size: 11pt;
+  }
+  /* Tabs cannot be toggled on paper — drop the controls and stack both report
+     bodies, each titled by its panel label so the printout stays legible. */
+  .tab-input,
+  .tablist,
+  .to-top {
+    display: none;
+  }
+  .tabpanel {
+    display: block !important;
+  }
+  .tabpanel::before {
+    content: attr(aria-label);
+    display: block;
+    max-width: 60rem;
+    margin: 0 auto;
+    padding: 16px clamp(20px, 5vw, 32px) 0;
+    font: 600 11px/1.3 var(--mono);
+    letter-spacing: 0.12em;
+    text-transform: uppercase;
+    color: var(--ink-faint);
+  }
+  section {
+    break-inside: avoid;
+    border-top-color: #ccc;
+  }
+  tbody tr:hover {
+    background: none;
+  }
+  a {
+    color: var(--ink);
+  }
+}
diff --git a/plugins/bitwarden-test-engineer/scripts/build-report.sh b/plugins/bitwarden-test-engineer/scripts/build-report.sh
new file mode 100755
index 0000000..fe06c6b
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/scripts/build-report.sh
@@ -0,0 +1,202 @@
+#!/usr/bin/env bash
+#
+# build-report.sh — assemble a self-contained HTML report for the
+# bitwarden-test-engineer plugin by splicing the canonical stylesheet into a
+# model-authored content fragment.
+#
+# The model writes a fragment whose <style> element contains a single sentinel
+# line; this script replaces that sentinel with references/report-style.css
+# verbatim. That keeps the ~550-line stylesheet out of model output entirely
+# (no token cost, no drift between the two reports) while the model authors only
+# the report's actual content.
+#
+# Usage (single report):
+#   build-report.sh --kind <test-stack|test-coverage> --slug <slug> \
+#                   --date <YYYY-MM-DD> <fragment-html-file>
+#
+# Usage (combined two-tab page):
+#   build-report.sh --kind test-combined --slug <slug> --date <YYYY-MM-DD> \
+#                   --current <coverage-report.html> \
+#                   --recommended <test-stack-report.html>
+#
+# The combined mode assembles ONE page with two CSS-only tabs — "Current
+# coverage" (the assessing-test-coverage report) and "Recommended coverage" (the
+# analyzing-test-stack report) — from the two already-built standalone report
+# files. It reuses each report's <header>/<main>, namespaces the section ids so
+# the two bodies coexist in one document (cur-* / rec-*), and splices the
+# stylesheet in once. The two source reports are read, not modified, and their
+# standalone files remain; the combined page is an additional deliverable.
+#
+# Writes <kind>-report-<slug>-<date>-<HHMMSS>.html to the current working
+# directory and prints the final filename to stdout. The HHMMSS suffix is read
+# from the wall clock here (the model cannot read the clock) and guarantees a
+# fresh filename per run, so a report is never overwritten and an existing file
+# never has to be read back and regenerated.
+#
+# Input files are left untouched; delete any temporary fragment yourself.
+
+set -euo pipefail
+
+SENTINEL='/* @@BITWARDEN_REPORT_STYLESHEET@@ */'
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CSS_FILE="${SCRIPT_DIR}/../references/report-style.css"
+
+KIND=""
+SLUG=""
+DATE=""
+FRAGMENT=""
+CURRENT=""
+RECOMMENDED=""
+
+# A flag missing its value stores "" and ends parsing cleanly, so the validators below report it.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --kind) KIND="${2:-}"; shift; [[ $# -eq 0 ]] || shift ;;
+    --slug) SLUG="${2:-}"; shift; [[ $# -eq 0 ]] || shift ;;
+    --date) DATE="${2:-}"; shift; [[ $# -eq 0 ]] || shift ;;
+    --current) CURRENT="${2:-}"; shift; [[ $# -eq 0 ]] || shift ;;
+    --recommended) RECOMMENDED="${2:-}"; shift; [[ $# -eq 0 ]] || shift ;;
+    -h|--help) # print only the leading comment header (skip the shebang, stop at the first non-comment line)
+      awk 'NR > 1 { if ($0 !~ /^#/) exit; sub(/^# ?/, ""); print }' "${BASH_SOURCE[0]}"; exit 0 ;;
+    --*) echo "build-report.sh: unknown option '$1'" >&2; exit 2 ;;
+    *) FRAGMENT="$1"; shift ;;
+  esac
+done
+
+# --- validate common inputs --------------------------------------------------
+case "$KIND" in
+  test-stack|test-coverage|test-combined) ;;
+  *) echo "build-report.sh: --kind must be 'test-stack', 'test-coverage', or 'test-combined' (got '${KIND}')" >&2; exit 2 ;;
+esac
+
+if [[ -z "$SLUG" ]]; then
+  echo "build-report.sh: --slug is required (a short kebab-case change identifier)" >&2
+  exit 2
+fi
+if [[ ! "$SLUG" =~ ^[a-zA-Z0-9._-]+$ ]]; then
+  echo "build-report.sh: --slug '${SLUG}' must be kebab-case (letters, digits, dot, dash, underscore)" >&2
+  exit 2
+fi
+if [[ ! "$DATE" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]; then
+  echo "build-report.sh: --date must be YYYY-MM-DD (got '${DATE}')" >&2
+  exit 2
+fi
+if [[ ! -f "$CSS_FILE" ]]; then
+  echo "build-report.sh: stylesheet not found at '${CSS_FILE}'" >&2
+  exit 1
+fi
+
+TIME="$(date +%H%M%S)"
+OUT="${KIND}-report-${SLUG}-${DATE}-${TIME}.html"
+
+# Splice the canonical stylesheet in place of the sentinel line. awk reads the
+# CSS file line by line, so no shell escaping ever touches the CSS content.
+splice_css() {
+  awk -v css="$CSS_FILE" -v sentinel="$SENTINEL" '
+    index($0, sentinel) {
+      while ((getline line < css) > 0) print line
+      close(css)
+      next
+    }
+    { print }
+  '
+}
+
+if [[ "$KIND" == "test-combined" ]]; then
+  # --- combined two-tab page -------------------------------------------------
+  for f in "$CURRENT" "$RECOMMENDED"; do
+    if [[ -z "$f" || ! -f "$f" ]]; then
+      echo "build-report.sh: --kind test-combined needs --current and --recommended report files (missing: '${f}')" >&2
+      exit 2
+    fi
+    # Scan from <body> on: the inlined stylesheet's comments mention "<main>", so a whole-file grep always matches.
+    if ! awk '/<body[ >]/ { b = 1 } b && index($0, "<main") { ok = 1; exit } END { exit !ok }' "$f"; then
+      echo "build-report.sh: '${f}' does not look like a built report (no <main> element)" >&2; exit 1
+    fi
+  done
+
+  # Pull the inclusive <header>…</header> or <main>…</main> region from a report.
+  # Only scan from <body> onward: the finished reports carry the whole stylesheet
+  # inlined in <head>, and a CSS comment can legitimately mention "<main>" etc. —
+  # gating on <body> keeps those from being mistaken for the real element.
+  extract_region() {
+    awk -v startTag="$2" -v endTag="$3" '
+      /<body[ >]/ { inBody = 1 }
+      !inBody { next }
+      index($0, startTag) { f = 1 }
+      f { print }
+      index($0, endTag) { if (f) exit }
+    ' "$1"
+  }
+
+  # Namespace the normative section ids (and their in-page anchor links) so the
+  # two report bodies can share one document without colliding on #overview etc.
+  IDS='overview|summary|evidence|coverage|recommendations|gaps'
+  prefix_ids() {
+    sed -E \
+      -e "s/ id=\"(${IDS})\"/ id=\"$1-\1\"/g" \
+      -e "s/href=\"#(${IDS})\"/href=\"#$1-\1\"/g"
+  }
+
+  {
+    cat <<HTML
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>Test Engineering Report — ${SLUG}</title>
+    <style>
+      ${SENTINEL}
+    </style>
+  </head>
+  <body>
+HTML
+    # Shared masthead: reuse the recommendation report's header, relabel its
+    # eyebrow so the page reads as the combined deliverable, not one report.
+    extract_region "$RECOMMENDED" "<header" "</header>" \
+      | sed -E 's#(<p class="eyebrow">)[^<]*(</p>)#\1Test Engineering Report\2#'
+    cat <<'HTML'
+    <input class="tab-input" type="radio" name="report-view" id="tab-current" checked />
+    <input class="tab-input" type="radio" name="report-view" id="tab-recommended" />
+    <nav class="tablist" aria-label="Report views">
+      <label for="tab-current">Current coverage</label>
+      <label for="tab-recommended">Recommended coverage</label>
+    </nav>
+    <section class="tabpanel" data-panel="current" aria-label="Current coverage">
+HTML
+    extract_region "$CURRENT" "<main" "</main>" | prefix_ids cur
+    cat <<'HTML'
+    </section>
+    <section class="tabpanel" data-panel="recommended" aria-label="Recommended coverage">
+HTML
+    extract_region "$RECOMMENDED" "<main" "</main>" | prefix_ids rec
+    # The reused masthead carries id="top"; emit the back-to-top control once for
+    # the whole page. Each standalone report's own control sits after its </main>,
+    # outside the extracted region, so the combined page would otherwise have none.
+    cat <<'HTML'
+    </section>
+    <a class="to-top" href="#top" aria-label="Back to top">Top</a>
+  </body>
+</html>
+HTML
+  } | splice_css > "$OUT"
+
+  echo "$OUT"
+  exit 0
+fi
+
+# --- single report (test-stack | test-coverage) ------------------------------
+if [[ -z "$FRAGMENT" || ! -f "$FRAGMENT" ]]; then
+  echo "build-report.sh: fragment HTML file not found: '${FRAGMENT}'" >&2
+  exit 2
+fi
+if ! grep -qF "$SENTINEL" "$FRAGMENT"; then
+  echo "build-report.sh: fragment '${FRAGMENT}' has no stylesheet sentinel." >&2
+  echo "  Put exactly this line inside the <style> element: ${SENTINEL}" >&2
+  exit 1
+fi
+
+splice_css < "$FRAGMENT" > "$OUT"
+
+echo "$OUT"
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
index 923b5aa..8003edb 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
@@ -1,20 +1,20 @@
 ---
 name: analyzing-test-stack
-description: Use when recommending what test automation a feature, bugfix, or change needs and at which layer — analyzing a Jira ticket, GitHub PR, exported test-case CSV, technical breakdown, and/or plain-language description, then mapping each behavior to the cheapest sufficient Testing Trophy layer (unit, integration, E2E) per platform, risk-weighted by each behavior's defect severity (impact, not urgency), and emitting a self-contained HTML report. Triggers on "what tests should this have", "which test layers", "test stack", "test strategy", "test trophy", "test plan for this PR/ticket", "what should we test for this tech breakdown", "are these tests at the right level", "risk-based test coverage", "what tests does this Critical/High bug need", or "rank coverage gaps by severity".
-allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Skill, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(gh pr checks:*), mcp__bitwarden-atlassian__get_issue, mcp__bitwarden-atlassian__search_issues, mcp__bitwarden-atlassian__get_issue_comments, mcp__bitwarden-atlassian__get_issue_remote_links, mcp__bitwarden-atlassian__get_confluence_page, mcp__bitwarden-atlassian__search_confluence, mcp__bitwarden-atlassian__search_confluence_cql"
+description: Use when recommending what test automation a feature, bugfix, or change needs and at which layer — analyzing a Jira ticket, GitHub PR, exported test-case CSV, technical breakdown, and/or plain-language description, then mapping each behavior to the cheapest sufficient test layer (unit, integration, E2E) inside each repo's actual test shape, risk-weighted by defect severity. Triggers on "what tests should this have", "which test layers", "test stack", "test strategy", "test trophy", "test plan for this PR/ticket", "what should we test for this tech breakdown", "are these tests at the right level", "risk-based test coverage", "what tests does this Critical/High bug need", or "rank coverage gaps by severity".
+allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Skill, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(gh pr checks:*), Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*), mcp__bitwarden-atlassian__get_issue, mcp__bitwarden-atlassian__search_issues, mcp__bitwarden-atlassian__get_issue_comments, mcp__bitwarden-atlassian__get_issue_remote_links, mcp__bitwarden-atlassian__get_confluence_page, mcp__bitwarden-atlassian__search_confluence, mcp__bitwarden-atlassian__search_confluence_cql"
 ---
 
 # Analyzing the Test Stack
 
-Recommend the test automation layers a change should ship with, shaped as a **Testing Trophy**, and write the recommendation as a self-contained HTML report. You produce advice, not tests.
+Recommend the test automation layers a change should ship with — shaped to **each target repo's actual test practice**, not one universal model — and write the recommendation as a self-contained HTML report. You produce advice, not tests.
 
-The Testing Trophy (read `references/testing-trophy.md` for the full model): a focused **unit** layer for pure logic and edge cases, a **heavy integration** layer where most confidence is bought, and a **thin E2E** layer reserved for critical end-to-end journeys. The guiding rule is _write tests at the cheapest layer that still buys the confidence the behavior requires_ — push coverage down the trophy, not up.
+The three layers (read `references/testing-trophy.md` for the full model): a focused **unit** layer for pure logic and edge cases, an **integration** layer where collaborator wiring is exercised, and a **thin E2E** layer for critical end-to-end journeys. The guiding rule is _write tests at the cheapest layer that still buys the confidence the behavior requires_ — push coverage down toward unit. How the volume distributes across those layers differs per repo: Bitwarden's repos span unit-heavy **pyramids** (`server`, `clients`, `sdk-internal`, `android`), an integration + snapshot **trophy** (`ios`), and **all-E2E** repos (`test`, `browser-interactions-testing`). Land each call inside the target repo's shape — see `references/monorepo-layout.md` → _Each repo's test shape in practice_.
 
 ## Inputs
 
 You may receive any combination of: a Jira key, a GitHub PR, a CSV export of test cases, a technical breakdown document, and/or a plain-language description. Treat them as additive evidence. You also consume a **coverage inventory** — the existing-test records produced by the `assessing-test-coverage` skill (permalink records + `unverified` gaps). Under the `bitwarden-test-engineer` agent this is gathered for you before this skill runs; if it is absent (e.g. run standalone), invoke `Skill(assessing-test-coverage)` for the affected change surface, or proceed and record all coverage as `unverified`. **Today's date is provided by the caller** — use it for the report filename; do not attempt to read the clock. If no date is supplied, ask via `AskUserQuestion` rather than guessing.
 
-`../../references/input-sources.md` (a plugin-level reference shared with `assessing-test-coverage`) is the canonical guide for how to ingest each source — Epic expansion, breakdown mining, CSV column mapping, and the rule that a missing source is recorded as a gap rather than blocking the analysis. At a glance:
+`../../references/input-sources.md` (a plugin-level reference shared with `assessing-test-coverage`) is the canonical guide for how to ingest each source — Epic expansion, breakdown mining, CSV column mapping, and the rule that a missing source is recorded as a gap rather than blocking the analysis. **For Jira and Confluence intake**, follow that reference's tooling rule: prefer `Skill(bitwarden-atlassian-tools:researching-jira-issues)`, fall back to the `bitwarden-atlassian-tools` MCP tools (the `mcp__bitwarden-atlassian__*` tools this skill's frontmatter grants) when that skill is unavailable, and if neither is reachable, ask the user to paste the requirements rather than blocking — never assume a generic Atlassian MCP or direct REST access. At a glance:
 
 - **Jira** — extract testable behaviors and acceptance criteria; Epics/Features expand to their children before extraction.
 - **GitHub PR** — extract the change surface, API touched, and any tests already present.
@@ -30,13 +30,13 @@ Alongside the behaviors, carry each behavior's **risk severity** — the impact
 
 1. **Resolve scope.** From the evidence, list the discrete testable behaviors and the platforms each touches. Map platforms to stacks, tooling, and the layer→repo split (including the sibling `test` repo for E2E) using `references/monorepo-layout.md`. **When the input is an Epic**, the behaviors come from the children's acceptance criteria and the diffs of any PRs linked from those children — record which children/PRs you actually inspected vs. only enumerated.
 
-2. **Consume the coverage inventory.** What is already tested is established by the `assessing-test-coverage` skill, not here — take its inventory as input: the permalink records for observed tests (each `{ path, line range, owner_repo, sha, layer, permalink }`, or path-only with an `unlinkable` reason) and the `unverified` gaps. Treat _observed_ coverage as verified and everything else as a gap, never assumed covered. If no inventory was supplied, invoke `Skill(assessing-test-coverage)` for the affected change surface to produce one; do not re-derive coverage-finding or permalink rules here (they live in that skill's `references/finding-coverage.md`). These records feed both the report's Evidence column and the gap analysis below.
+2. **Consume the coverage inventory.** What is already tested is established by the `assessing-test-coverage` skill, not here — take its inventory as input: **one record per behavior** carrying its layer, an approximate count, and 1–3 representative permalinks (`{ behavior, platform, layer, status, count, representative: [...] }`, representative tests path-only with an `unlinkable` reason when they can't be linked) plus the `unverified` gaps. Treat _observed_ coverage as verified and everything else as a gap, never assumed covered. If no inventory was supplied, invoke `Skill(assessing-test-coverage)` for the affected change surface to produce one; do not re-derive coverage-finding or permalink rules here (they live in that skill's `references/finding-coverage.md`). These records feed both the report's Evidence column (rendering each behavior's representative permalinks) and the gap analysis below.
 
-3. **Assign the cheapest sufficient layer, weighted by severity.** For each behavior, pick the lowest trophy layer that genuinely buys the needed confidence, with a one-line rationale — then check the confidence bar against the behavior's risk severity per `references/severity-risk.md`. Severity sets _how much_ confidence is sufficient, not _which_ layer: a Critical behavior must cover its material failure modes (and, if it is a genuine end-to-end critical flow, claim the thin E2E layer the trophy reserves for exactly that), while a Low behavior earns minimal coverage and never an E2E test. Prefer integration over E2E and unit over integration unless the behavior truly requires the higher layer (real browser/device, cross-service contract, full user journey). Name concrete tooling per platform (see `references/monorepo-layout.md`).
+3. **Assign the cheapest sufficient layer, weighted by severity.** For each behavior, pick the lowest trophy layer that genuinely buys the needed confidence, with a one-line rationale — then check the confidence bar against the behavior's risk severity per `references/severity-risk.md`. Severity sets _how much_ confidence is sufficient, not _which_ layer: a Critical behavior must cover its material failure modes (and, if it is a genuine end-to-end critical flow, claim the thin E2E layer reserved for exactly that), while a Low behavior earns minimal coverage and never an E2E test. Prefer integration over E2E and unit over integration unless the behavior truly requires the higher layer (real browser/device, cross-service contract, full user journey) — then land that call inside the **target repo's shape** (`references/monorepo-layout.md` → _Each repo's test shape in practice_): a pyramid repo like `server` or `sdk-internal` resolves toward unit, `ios` toward its component + snapshot practice, and cross-system journeys toward the all-E2E `test` repo. Name concrete tooling per platform (see `references/monorepo-layout.md`).
 
-4. **Find the gaps and the imbalance, ranked by severity.** Call out behaviors with no recommended coverage, and any existing shape that is trophy-wrong (e.g. E2E doing work integration should do, or untested core logic). **Order gaps by severity** — a Critical behavior with no observed coverage is a top-priority gap and leads the list; Informative behaviors are recorded as out-of-scope rather than gaps. Be explicit about what evidence each gap rests on.
+4. **Find the gaps and the imbalance, ranked by severity.** Call out behaviors with no recommended coverage, and any existing shape that is wrong for its repo (e.g. E2E doing work integration should do, untested core logic, or a layer the repo doesn't even maintain). **Order gaps by severity** — a Critical behavior with no observed coverage is a top-priority gap and leads the list; Informative behaviors are recorded as out-of-scope rather than gaps. Be explicit about what evidence each gap rests on.
 
-5. **Write the HTML report.** Build a single self-contained HTML file (inline CSS, no external/CDN dependencies, no JS required) following `references/html-report-template.md`. **Inline the canonical stylesheet from `../../references/report-style-tokens.md` verbatim** — the plugin-level styling source shared with the coverage report; do not re-pick colors, fonts, or layer tokens; the off-brand data-report visual system and the layer/badge mappings in that file are binding. Use the normative section IDs (`#overview`, `#summary`, `#evidence`, `#recommendations`, `#gaps`). Write `#overview` yourself as a short top-of-report synthesis: a 2–4 sentence recap of the recommended shape per platform, the top 3 open risks the reader should resolve before acting (drawn from `#gaps`, **highest severity first**), and anchor links into `#recommendations` and `#gaps`. The per-platform recommendations table carries a **Severity** column per behavior. Write the report to the **current working directory** as `test-stack-report-<slug>-<date>.html`, where `<slug>` is a short kebab-case identifier for the change (ticket key, PR number, or feature name) and `<date>` is the caller-provided date. The Per-platform recommendations table's Evidence column must contain a GitHub permalink (or an explicit `unlinkable` note) for every cited existing test.
+5. **Render the HTML report.** Once steps 1–4 have decided the per-behavior layer/severity mapping, rendering it to HTML is **mechanical formatting, not reasoning** — under the `bitwarden-test-engineer` agent this step runs on a Sonnet report-writer subagent (see the agent's _Model selection_), not in the analytical context. Author a **content fragment** following `references/html-report-template.md`: a full HTML document whose `<style>` element holds only the sentinel `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` — **never paste or retype CSS**; the build script splices in the canonical `../../references/report-style.css` verbatim (the off-brand data-report system and the layer/badge mappings are binding — use the exact class names). Use the normative section IDs (`#overview`, `#summary`, `#evidence`, `#recommendations`, `#gaps`). Write `#overview` yourself as a short top-of-report synthesis: a 2–4 sentence recap of the recommended shape per platform, the top 3 open risks the reader should resolve before acting (drawn from `#gaps`, **highest severity first**), and anchor links into `#recommendations` and `#gaps`. The per-platform recommendations table carries a **Severity** column per behavior, and its Evidence column must contain a GitHub permalink (or an explicit `unlinkable` note) for every cited existing test. Then build the final file: `"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" --kind test-stack --slug <slug> --date <date> <fragment>` (slug = short kebab-case change identifier; date = the caller-provided date). The script writes `test-stack-report-<slug>-<date>-<HHMMSS>.html` to the current working directory — the `HHMMSS` suffix is stamped by the script so each run is a fresh file, never overwriting a prior report — and prints the filename; delete the temporary fragment and report that filename. Do not hand-assemble the file or paste CSS as a fallback.
 
 ## Principles
 
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
index a18ccb7..1486493 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
@@ -1,23 +1,28 @@
 # HTML report template
 
 Produce a **single self-contained HTML file**: all CSS inline in a `<style>` block, no
-external/CDN links, no required JavaScript, no web fonts. It must render correctly opened
+external/CDN _resource_ links (stylesheets, fonts, scripts, images), no required JavaScript,
+no web fonts. Informational `<a href>` citations to public sources are fine and encouraged —
+they are text, not loaded assets (see _Content rules_). It must render correctly when opened
 directly from disk and survive being attached to a ticket or PR.
 
-Write it to the **current working directory** as
-`test-stack-report-<slug>-<date>.html` (slug = ticket key / PR number / feature name in
-kebab-case; date = the caller-provided date, `YYYY-MM-DD`).
+You do not write the final file directly and you do not paste any CSS. Author a **content
+fragment** (the full HTML below, but with only a stylesheet sentinel inside `<style>`), then run
+the build script — it inlines the stylesheet and stamps the output filename. See _Building the
+report_ at the end of this file.
 
 ## Styling — binding
 
-Inline the paste-ready stylesheet from `../../../references/report-style-tokens.md` **verbatim**
-into the `<style>` block. The report uses a deliberately off-brand, low-key _data-report_
-visual system (flat white paper, monospace for data/labels/chrome, sans for prose, a
-sequential layer ramp). Do not re-pick colors, do not invent additional layer tokens, do
-not reintroduce a brand skin, do not add `<link>`/`@font-face`/CDN imports. The layer →
-token mapping (unit / integration / e2e) and the badge → token mapping
-(assumption / warn / ok) are normative wherever rendered — chips, distribution bars,
-table cells, and recommendation rows.
+Do **not** paste, retype, or trim any CSS. Inside the fragment's `<style>` element put exactly
+one line — the sentinel `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` — and the build script splices
+in the canonical stylesheet (`../../../references/report-style.css`) verbatim. The report uses a
+deliberately off-brand, low-key _data-report_ visual system (flat white paper, monospace for
+data/labels/chrome, sans for prose, a sequential layer ramp). Do not re-pick colors, do not
+invent additional layer tokens, do not reintroduce a brand skin, do not add
+`<link>`/`@font-face`/CDN imports. The layer → token mapping (unit / integration / e2e) and the
+badge → token mapping (assumption / warn / ok) are normative wherever rendered — chips,
+distribution bars, table cells, and recommendation rows; your markup must use those exact class
+names. See `../../../references/report-style-tokens.md` for the token → meaning contract.
 
 Section headings are auto-numbered by CSS (`01 · …`) — write a plain `<h2>` per section
 and do not hand-number. Wrap each wide table in `<div class="scroll">…</div>` so it
@@ -28,6 +33,12 @@ scrolls rather than overflows on narrow widths.
 Each section uses the **normative `id` listed below**. Do not rename, omit, or add
 top-level sections — readers look these up by id.
 
+Directly **inside `<main>`, before `#overview`**, emit a linked table of contents:
+`<nav class="toc" aria-label="Sections">` holding one `<a href="#…">` per id-bearing section below
+(Overview, Summary, Evidence, Recommendations, Gaps), each linking to its section's `id`. It is a
+`<nav>`, not a numbered section. (In the combined two-tab report the build script namespaces
+these anchor links per tab, so a panel's ToC jumps within its own panel.)
+
 1. **Header** (no id; `<header>` element) — report title, the change under analysis
    (ticket/PR/feature), and the date.
 2. **`#overview`** — A short top-of-report synthesis written by the analyst, so a reader
@@ -44,8 +55,9 @@ top-level sections — readers look these up by id.
    `style="flex: <count>"` where `<count>` is the recommended test count at that layer
    (the browser normalizes; never hand-compute widths). Each `.seg` shows its count; the
    legend maps color → layer. Follow with `<ul class="shapes">`, one `<li>` per platform:
-   a `.plat` name plus the one-line shape (e.g. "server: integration-heavy, thin unit;
-   clients: integration + 1 E2E journey"). No JS. See `../../../references/report-style-tokens.md`
+   a `.plat` name plus the one-line shape that matches the repo's actual practice
+   (e.g. "server: unit-heavy pyramid, thin integration, no E2E; ios: integration +
+   snapshot, no XCUITest"). No JS. See `../../../references/report-style-tokens.md`
    → _Graphics_ for the chart contract. The chart encodes recommended **shape** (counts per
    layer) only; risk severity is carried in the `#recommendations` table's Severity column,
    not in this graphic — leave the chart severity-blind.
@@ -57,7 +69,12 @@ top-level sections — readers look these up by id.
 5. **`#recommendations`** — Per-platform recommendations — for each affected platform, a
    table:
    `Behavior | Severity | Recommended layer | Tooling | Rationale | Evidence (linked)`. One
-   row per behavior. The **Severity** cell carries the behavior's risk severity
+   row per behavior. When a behavior was extracted from a Jira item (its record carries a
+   `source_issue`), the **Behavior** cell appends the linked issue key —
+   `… behavior text … <a href="https://bitwarden.atlassian.net/browse/PM-1234">PM-1234</a>` — so
+   the row points back at the requirement; a behavior with no Jira source carries no key (see
+   `../../../references/input-sources.md` → _Citing Jira issues as links_). The **Severity** cell
+   carries the behavior's risk severity
    (Critical / High / Medium / Low / Informative) per the `analyzing-test-stack` skill's
    `references/severity-risk.md`. Render it with the stylesheet's existing inline-code
    treatment — `<code>Critical</code>` — **not** a new color token: the layer ramp and the
@@ -76,12 +93,13 @@ top-level sections — readers look these up by id.
    permalink production rules live in the `assessing-test-coverage` skill's
    `references/finding-coverage.md` → _Citing tests as GitHub permalinks_.
 
-6. **`#gaps`** — Coverage gaps & imbalances — behaviors with no coverage, and any
-   trophy-wrong shape observed (ice-cream-cone, over-unit-tested, trivial tests). **Order
+6. **`#gaps`** — Coverage gaps & imbalances — behaviors with no coverage, and any observed
+   shape that is wrong for its repo (ice-cream-cone, over-unit-tested, trivial tests). **Order
    the list by severity**, highest first, so a Critical uncovered behavior leads and the
    reader resolves the worst-impact gaps first; Informative behaviors are recorded as
-   out-of-scope rather than gaps. Each tied to evidence. Findings you could not ground
-   belong here, marked `unverified` with a one-line reason.
+   out-of-scope rather than gaps. Each tied to evidence, and — where the gap behavior came from
+   a Jira item — to its linked source key (same form as `#recommendations`). Findings you could
+   not ground belong here, marked `unverified` with a one-line reason.
 
 ## Content rules
 
@@ -91,7 +109,29 @@ top-level sections — readers look these up by id.
   so the reader can tell grounded calls from inferred ones.
 - Flag unverifiable claims with `<span class="badge warn">unverified</span>` (e.g.
   E2E coverage claimed without the `test` repo checked out).
-- No tracking, no remote resources, no secrets. The file is shareable as-is.
+- **Hyperlink every GitHub or Atlassian source the report names.** Cited tests are GitHub
+  permalinks (see the Evidence rule above), and if the report names the
+  [Defect Severity Classification Guide](https://bitwarden.atlassian.net/wiki/spaces/EN/pages/2759229512/Severity)
+  or any Jira/Confluence/GitHub artifact, anchor it to its URL rather than naming it in plain
+  text. An informational `<a href>` to a GitHub/Atlassian page is **text, not a fetched
+  resource** — it does not violate the "no remote resources" rule below (which targets loaded
+  assets: CSS, fonts, scripts, CDN imports). Do not strip these links to honor the
+  self-contained constraint.
+- **Link every Jira item, and link each behavior to the Jira item it came from.** Any issue,
+  epic, or child key named anywhere in the report (Overview, Summary, Evidence) is an `<a href>`
+  to its browse URL — `<a href="https://bitwarden.atlassian.net/browse/PM-1234">PM-1234</a>`,
+  never bare key text. And for every behavior in `#recommendations`/`#gaps` that was extracted
+  from a Jira item (the record's `source_issue`), append the linked source key to the behavior
+  cell so the reader can jump to the requirement. A behavior with no Jira source (PR-only)
+  carries no key. See `../../../references/input-sources.md` → _Citing Jira issues as links_ for
+  the link form. Never fabricate a key or URL.
+- No tracking, no remote resources, no secrets. The file is shareable as-is. ("Remote
+  resources" means assets the page loads — stylesheets, fonts, scripts, images, CDN imports —
+  not informational `<a href>` citations, which are encouraged per the rule above.)
+- Keep the fixed **back-to-top** control from the skeleton — the `<a class="to-top" href="#top">`
+  after `</main>` paired with `id="top"` on `<header>`. It floats with the reader and jumps to
+  the top of the report from anywhere; it is CSS-only (styled by the stylesheet's `.to-top`
+  rule, no JavaScript). Do not drop either half or the anchor breaks.
 
 ## Skeleton
 
@@ -103,17 +143,23 @@ top-level sections — readers look these up by id.
     <meta name="viewport" content="width=device-width, initial-scale=1" />
     <title>Test Stack Report — {{change}}</title>
     <style>
-      /* Paste the full paste-ready stylesheet from
-         ../../../references/report-style-tokens.md here, verbatim. */
+      /* @@BITWARDEN_REPORT_STYLESHEET@@ */
     </style>
   </head>
   <body>
-    <header>
+    <header id="top">
       <p class="eyebrow">Test Stack Report</p>
       <h1>…the change under analysis…</h1>
       <p class="meta">…ticket/PR · status · team · date…</p>
     </header>
     <main>
+      <nav class="toc" aria-label="Sections">
+        <a href="#overview">Overview</a>
+        <a href="#summary">Summary</a>
+        <a href="#evidence">Evidence</a>
+        <a href="#recommendations">Recommendations</a>
+        <a href="#gaps">Gaps</a>
+      </nav>
       <section id="overview">
         <h2>Overview</h2>
         …2–4 sentence recap of the recommended shape per platform; top 3 open
@@ -143,8 +189,8 @@ top-level sections — readers look these up by id.
         </figure>
         <ul class="shapes">
           <li>
-            <span class="plat">bitwarden/server</span> — integration-heavy, thin
-            unit, 1 E2E journey
+            <span class="plat">bitwarden/server</span> — unit-heavy pyramid,
+            thin integration, no E2E
           </li>
           <!-- one li per platform -->
         </ul>
@@ -167,6 +213,26 @@ top-level sections — readers look these up by id.
         …gaps and trophy-wrong shapes; ungrounded findings marked unverified…
       </section>
     </main>
+    <a class="to-top" href="#top" aria-label="Back to top">Top</a>
   </body>
 </html>
 ```
+
+## Building the report
+
+Write the fragment above (with the `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` sentinel as the only
+content of `<style>`) to a temporary path, then run the build script:
+
+```bash
+"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \
+  --kind test-stack --slug <slug> --date <YYYY-MM-DD> \
+  test-stack-report-<slug>.fragment.html
+```
+
+`<slug>` is a short kebab-case identifier for the change (ticket key / PR number / feature
+name); `<date>` is the caller-provided date (`YYYY-MM-DD`). The script splices in `report-style.css`, writes
+`test-stack-report-<slug>-<date>-<HHMMSS>.html` to the current working directory (the `HHMMSS`
+time suffix is stamped by the script, so each run is a fresh file — nothing is ever
+overwritten), and prints the final filename. Delete the temporary fragment afterward, and
+report the printed filename to the caller. Do not hand-assemble the final file or paste CSS as a
+fallback — if the script errors, fix the fragment and re-run.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md
index 6188e09..9dc5310 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md
@@ -1,10 +1,12 @@
 # Bitwarden repo layout, stacks, and the layer → repo map
 
 Bitwarden's code spans several repositories. A single feature often touches more than
-one, and each gets its own Testing Trophy. Treat the table below as a **starting map**,
-not gospel — when a repo is checked out, confirm the actual conventions from its config
-first (the `assessing-test-coverage` skill's `references/finding-coverage.md` →
-_Discovering a repo's test conventions_), and read the table as the last-resort default.
+one, and **each repo follows its own test shape** — pyramid, trophy, or all-E2E (see
+_Each repo's test shape in practice_ below; the shapes themselves are defined in
+`testing-trophy.md`). Treat the tables below as a **starting map**, not gospel — when a
+repo is checked out, confirm the actual conventions from its config first (the
+`assessing-test-coverage` skill's `references/finding-coverage.md` → _Discovering a repo's
+test conventions_), and read the tables as the last-resort default.
 
 Establishing what a change is **already tested** by — finding existing coverage and citing
 it as permalinks — is a separate job owned by the `assessing-test-coverage` skill. This file
@@ -13,37 +15,70 @@ live at.
 
 ## Platform repos and their stacks
 
-| Repo (typical)      | Platform                       | Language / framework                | Unit / Integration tooling                                                                 |
-| ------------------- | ------------------------------ | ----------------------------------- | ------------------------------------------------------------------------------------------ |
-| `bitwarden/server`  | Backend / API                  | C# / .NET, ASP.NET Core, EF Core    | xUnit; integration via `WebApplicationFactory` + test DB / in-memory providers             |
-| `bitwarden/clients` | Web, Browser ext, Desktop, CLI | TypeScript, Angular, Electron, RxJS | Jest + Angular TestBed / Testing Library (unit + integration); mocked HTTP at the boundary |
-| `bitwarden/ios`     | iOS                            | Swift / SwiftUI                     | XCTest (unit + integration); XCUITest for on-device UI                                     |
-| `bitwarden/android` | Android                        | Kotlin                              | JUnit + Robolectric / Espresso (instrumented)                                              |
+| Repo (typical)                           | Platform                                                                        | Language / framework                                                      | Unit / Integration tooling                                                                                                                                                                                                             |
+| ---------------------------------------- | ------------------------------------------------------------------------------- | ------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `bitwarden/server`                       | Backend / API                                                                   | C# / .NET, ASP.NET Core, EF Core                                          | xUnit; integration via `WebApplicationFactory` + test DB / in-memory providers                                                                                                                                                         |
+| `bitwarden/clients`                      | Web, Browser ext, Desktop, CLI                                                  | TypeScript, Angular, Electron, RxJS                                       | Jest + `jest-mock-extended` + Angular TestBed (unit + shallow component); mocked HTTP at the boundary — _no_ Testing Library                                                                                                           |
+| `bitwarden/ios`                          | iOS                                                                             | Swift / SwiftUI                                                           | XCTest (+ emerging Swift Testing); SnapshotTesting + ViewInspector for SwiftUI views; processor/coordinator tests with mocks — no systematic XCUITest                                                                                  |
+| `bitwarden/android`                      | Android                                                                         | Kotlin                                                                    | JUnit5 + MockK + Turbine for ViewModels/logic; Compose UI tests run on the JVM via Robolectric — **all JVM `src/test`, no `androidTest`/Espresso, no screenshot testing**                                                              |
+| `bitwarden/sdk-internal`                 | Cross-platform SDK (core logic powering clients via WASM and mobile via UniFFI) | Rust (cargo workspace, ~50 crates); WASM + UniFFI (Swift/Kotlin) bindings | `cargo test --workspace` (no nextest; cargo-llvm-cov for coverage); mostly inline `#[cfg(test)]` unit tests, `mockall` + `wiremock` for the few HTTP/trait integration tests; binding surfaces consumed by `clients`, `ios`, `android` |
+| `bitwarden/test`                         | Cross-platform E2E (web, desktop, browser ext, iOS, android, CLI, API)          | C# / .NET                                                                 | NUnit + Selenium WebDriver (web/desktop/ext) + Appium (mobile) + CliWrap (CLI), Page Object Model; drives real builds — E2E only                                                                                                       |
+| `bitwarden/browser-interactions-testing` | Browser extension autofill (dedicated E2E suite)                                | TypeScript, Playwright, Docker Compose                                    | Playwright E2E form-fill against real extension builds (Chromium only); static-page + live-site scenarios — _not_ unit/integration                                                                                                     |
 
 Exact repo names and tool versions drift — verify against the checkout. If a platform
 isn't in this table, infer its stack from the repo and state the assumption in the report.
 
+## Each repo's test shape in practice
+
+Each row gives the shape a repo actually maintains — not a one-size-fits-all trophy. Recommend
+the layer that fits the repo's real distribution (see `testing-trophy.md` for the shapes). Each
+shape below was **confirmed against a local checkout**; still re-verify when versions drift, and
+for any repo not listed here, infer its shape from the checkout and state the assumption in the report.
+
+| Repo                                     | Shape                                       | What that means for recommendations                                                                                                                                                                                                                                                                   |
+| ---------------------------------------- | ------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `bitwarden/server`                       | **Pyramid** (unit-heavy)                    | Broad xUnit unit base (~5:1 over integration), a meaningful integration layer via `WebApplicationFactory` + test DB, **no E2E in-repo**. Default behaviors to unit; reserve integration for endpoint/persistence wiring.                                                                              |
+| `bitwarden/clients`                      | **Unit-heavy** (pyramid-leaning)            | ~1,000+ colocated `*.spec.ts`, heavy `jest-mock-extended`, TestBed component tests that mock their children (shallow, not deep integration). Push logic to unit; treat true component-integration as the deliberate step up. No E2E in-repo.                                                          |
+| `bitwarden/ios`                          | **Trophy + snapshot layer**                 | Component/processor/coordinator tests with mocks dominate (integration-leaning), a large **snapshot-testing** layer (SnapshotTesting) for SwiftUI views is first-class, lighter pure-unit layer, **no systematic XCUITest**. Recommend snapshot coverage for view changes explicitly.                 |
+| `bitwarden/android`                      | **Unit-heavy + JVM Compose-UI integration** | ~558 JVM `src/test` files: a unit base (ViewModels/logic with MockK + Turbine) plus a substantial Compose-UI integration tier running on the JVM via Robolectric. **No `androidTest`/Espresso, no screenshot testing, no E2E in-repo.** Don't recommend device-instrumented or screenshot tests here. |
+| `bitwarden/sdk-internal`                 | **Pyramid** (strongly unit-heavy)           | ~50 Rust crates, ~97% inline `#[cfg(test)]` unit tests (crypto/encoding/parsing logic, deterministic, no mocks) vs ~3% in `tests/` dirs; `mockall`/`wiremock` only where HTTP or cross-module orchestration matters. **No E2E.** Default to unit; integration only for binding/orchestration flows.   |
+| `bitwarden/test`                         | **All E2E**                                 | The cross-system journeys themselves; C# NUnit + Selenium/Appium driving real builds. Everything here is E2E by definition — never recommend unit/integration in this repo.                                                                                                                           |
+| `bitwarden/browser-interactions-testing` | **All E2E** (autofill)                      | Playwright autofill/form-fill against real Chromium extension builds. E2E only; the autofill counterpart to `test`.                                                                                                                                                                                   |
+
 ## Where each layer lives — important
 
 - **Unit and integration** tests live **alongside the code, inside each platform
   repo** (e.g. `server`'s xUnit projects, `clients`' `*.spec.ts` files, the iOS test
-  targets).
+  targets, and `sdk-internal`'s Rust crates, whose `cargo test` suites sit next to the
+  code they cover).
 - **End-to-end (E2E) tests live in a dedicated `test` repository** — _not_ inside the
   platform repos. It sits as a sibling of `server` / `clients` / `ios` in the user's
   Bitwarden checkout root, so look for it next to whichever platform repo you're in
   (e.g. if `clients` is at `~/repos/Bitwarden/clients`, `test` is at
-  `~/repos/Bitwarden/test`). Source: `https://github.com/bitwarden/test` — cite this URL
+  `~/repos/Bitwarden/test`). Source: [`bitwarden/test`](https://github.com/bitwarden/test) — cite this URL
   in the report only if no local sibling is found.
+- **Browser-extension autofill / form-fill E2E** also has a dedicated repo,
+  [`bitwarden/browser-interactions-testing`](https://github.com/bitwarden/browser-interactions-testing) —
+  Playwright driving real extension builds against static pattern pages and live sites
+  (Chromium today). Note the **overlap**: the cross-platform `test` repo _also_ carries
+  extension autofill coverage, so a given autofill journey may be tested in either (or
+  both). When recommending or inventorying autofill E2E, check both repos and flag where
+  coverage overlaps or where one is the better home, rather than assuming a single owner.
 
 ## Mapping a behavior to a platform + layer
 
 1. Identify which repo(s) the behavior lives in from the change surface (diff paths,
    ticket components, CSV team/area).
-2. Within each repo, choose the layer per `testing-trophy.md` and name the concrete tool
-   from the table above (confirmed against the checkout where possible).
-3. For any cross-system journey worth E2E coverage, target the dedicated `test` repo and
-   flag whether comparable E2E coverage already exists there (per the coverage inventory
-   from `assessing-test-coverage`).
+2. Within each repo, choose the layer per `testing-trophy.md` (the cheapest sufficient layer)
+   **landed inside that repo's shape** from _Each repo's test shape in practice_ above — a
+   pyramid repo like `server` or `sdk-internal` resolves toward unit; `ios` toward its
+   component + snapshot practice — and name the concrete tool from the _Platform repos and
+   their stacks_ table (confirmed against the checkout where possible).
+3. For any cross-system journey worth E2E coverage, target the dedicated `test` repo;
+   for browser-extension autofill / form-fill journeys, also consider
+   `browser-interactions-testing`. Coverage for autofill can live in either repo, so
+   check both and flag any overlap or comparable existing E2E coverage (per the coverage
+   inventory from `assessing-test-coverage`).
 
 Existing coverage to compare these recommendations against — including the GitHub permalinks
 the report's Evidence column requires — comes from the `assessing-test-coverage` skill's
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
index d3f292b..86caef4 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
@@ -1,7 +1,8 @@
 # Severity as a risk weight
 
-The Testing Trophy tells you the _cheapest layer that buys the confidence a behavior
-requires_. **Severity tells you how much confidence is required.** A defect in vault
+The layer model (`testing-trophy.md`) tells you the _cheapest layer that buys the confidence a
+behavior requires_, landed inside the target repo's shape. **Severity tells you how much
+confidence is required.** A defect in vault
 unlock and a typo on a settings label are not owed the same rigor — severity is the dial
 that turns "cheapest sufficient" from a flat rule into a risk-weighted one.
 
@@ -10,12 +11,11 @@ it gets fixed (that is _priority_). This skill weights coverage by severity, not
 
 ## Source of truth
 
-The canonical classification is Bitwarden's **Defect Severity Classification Guide**,
-Confluence page `2759229512`
-(`https://bitwarden.atlassian.net/wiki/spaces/EN/pages/2759229512/Severity`). The levels
-and criteria below mirror that page so the analysis degrades gracefully when the Atlassian
-MCP is unavailable — but the page is authoritative. When the `bitwarden-atlassian-tools`
-MCP is available, fetch it with `mcp__bitwarden-atlassian__get_confluence_page` (pageId
+The canonical classification is Bitwarden's [**Defect Severity Classification Guide**](https://bitwarden.atlassian.net/wiki/spaces/EN/pages/2759229512/Severity),
+Confluence page `2759229512`. The levels
+and criteria below mirror that page so the analysis degrades gracefully when the
+`bitwarden-atlassian-tools` MCP is unavailable — but the page is authoritative. When the
+`bitwarden-atlassian-tools` MCP is available, fetch it with `mcp__bitwarden-atlassian__get_confluence_page` (pageId
 `2759229512`) to pick up revisions before relying on the cached copy here. If the fetch
 fails or the MCP is unavailable, use the mirrored table below and note in the report that
 the severity definitions are from the cached copy (version not re-verified) — degrade
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md
index ff5cd41..71f9e78 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md
@@ -1,61 +1,90 @@
-# The Testing Trophy
+# Test shape: pyramid, trophy, and where Bitwarden's repos actually sit
 
-A model for shaping automated test coverage, contrasted with the older Testing Pyramid. The trophy weights **integration** tests most heavily,
-because they buy the most confidence per unit of cost and maintenance for typical
-application code.
+A model for shaping automated test coverage across three layers — **unit**, **integration**,
+**E2E**. Two classic shapes describe how the volume is distributed across those layers:
 
-## The three layers (base → top)
+- **Testing Pyramid** — a broad **unit** base, a smaller **integration** layer, a thin (often
+  absent) **E2E** cap. Optimizes for fast, stable, cheap-to-maintain coverage. The natural fit
+  for backend and logic-heavy code where units have real branching complexity.
+- **Testing Trophy** — a focused **unit** base, a **heavy integration** bulge where most
+  confidence is bought, a thin **E2E** cap. The fit for application code where behavior emerges
+  from collaborators (UI components, view models) and an isolated unit proves little.
+
+Neither shape is universally "correct," and **this skill does not impose one on every repo.**
+Bitwarden's repos deliberately sit at different points — some pyramid, some trophy, some a mix,
+and two are effectively **all E2E**. Recommend the layer that fits the **target repo's actual
+practice** (mapped per repo in `monorepo-layout.md`), not an idealized shape. A "funky mix" of
+pyramid and trophy within or across repos is normal and fine.
+
+## The three layers (cheapest → most expensive)
 
 1. **Unit** — focused. Tests a single function/class/module in isolation. Best for pure
    logic, algorithms, edge cases, and error handling where setup is cheap and the unit
    has real branching complexity. Fast and stable, but isolation can let integration
    bugs slip through.
 
-2. **Integration** — **the heaviest layer; the trophy's bulge.** Tests several units
-   working together through real (or realistic) collaborators: a controller + service +
-   in-memory or test database, a component rendered with its real child components and a
-   mocked network boundary, a view model against a real repository. This is where most
-   confidence is bought because it exercises the wiring users actually depend on, without
-   the cost and flakiness of full E2E.
+2. **Integration** — the **confidence layer**: the trophy's bulge and the pyramid's middle.
+   Tests several units working together through real (or realistic) collaborators: a
+   controller + service + in-memory or test database, a component rendered with its real
+   child components and a mocked network boundary, a view model against a real repository.
+   It exercises the wiring users actually depend on without the cost and flakiness of full
+   E2E. How _much_ of it a repo carries is what separates a trophy (a lot) from a pyramid
+   (a moderate middle).
 
-3. **E2E (end-to-end)** — thin top. Drives the real, fully assembled system the way a
-   user would: real browser, real device, real backend. Highest confidence per test, but
-   slowest, most expensive, and most flaky. Reserve for a small number of **critical user
-   journeys** (e.g. login, vault unlock, checkout) — not for branch coverage.
+3. **E2E (end-to-end)** — thin top in most repos, the **entire suite** in the dedicated E2E
+   repos. Drives the real, fully assembled system the way a user would: real browser, real
+   device, real backend. Highest confidence per test, but slowest, most expensive, and most
+   flaky. In a platform repo, reserve it for a small number of **critical user journeys**
+   (e.g. login, vault unlock, checkout) — not for branch coverage. The cross-system journeys
+   themselves live in the `test` repo, where E2E _is_ the strategy.
 
-## The shape
+## The two shapes
 
 ```
-        ┌───────────┐
-        │    E2E    │      thin top — critical journeys only
-     ┌──┴───────────┴──┐
-     │   Integration   │   HEAVY — most confidence bought here
-     └──┐           ┌──┘
-        │   Unit    │      focused — pure logic & edge cases
-        └───────────┘
+   Pyramid (e.g. server)                    Trophy (e.g. ios)
+
+         ┌─────────┐                           ┌───────────┐
+         │   E2E   │  thin / none              │    E2E    │   thin top
+      ┌──┴─────────┴──┐                     ┌──┴───────────┴──┐
+      │  Integration  │  moderate           │   Integration   │   HEAVY
+   ┌──┴───────────────┴──┐                  └──┐           ┌──┘
+   │        Unit         │  BROAD base         │   Unit    │      focused
+   └─────────────────────┘                     └───────────┘
 ```
 
-Static analysis (type checking, linters, formatters) sits below the trophy and is handled by per-repo tooling — not recommended by this skill.
+Static analysis (type checking, linters, formatters) sits below both shapes and is handled by
+per-repo tooling — not recommended by this skill.
 
 ## How to assign a layer
 
-Pick the **cheapest layer that still buys the confidence the behavior requires**:
-
-- Pure transformation, calculation, parsing, validation logic with real branching → **unit**.
-- Behavior that emerges from collaborators working together (HTTP handler + service +
-  persistence; component + store + API boundary; view model + repository) → **integration**.
-- A behavior only meaningful as a full user journey across the real system → **E2E**, and
-  only if it is genuinely critical.
-- Anything a type system, analyzer, or linter already guarantees → don't write a test
-  for it.
-
-## Anti-patterns to avoid
-
-- **Ice-cream cone** — the trophy inverted: many E2E tests, few integration/unit. Slow,
-  flaky, expensive to maintain.
-- **Over-unit-testing** — exhaustive unit tests with heavy mocking that re-assert the
-  mocks rather than real behavior; integration would buy more.
-- **Testing trivial code** — tests for getters/setters, framework glue, or
-  type-guaranteed invariants. Cost without confidence.
-- **E2E for branch coverage** — using slow full-system tests to cover edge cases that
-  belong at the unit or integration layer.
+Apply two rules together:
+
+1. **Cheapest sufficient layer.** Pick the lowest-cost layer (unit < integration < E2E) that
+   still buys the confidence the behavior requires:
+   - Pure transformation, calculation, parsing, validation logic with real branching → **unit**.
+   - Behavior that emerges from collaborators working together (HTTP handler + service +
+     persistence; component + store + API boundary; view model + repository) → **integration**.
+   - A behavior only meaningful as a full user journey across the real system → **E2E**, and
+     only if it is genuinely critical.
+   - Anything a type system, analyzer, or linter already guarantees → don't write a test for it.
+
+2. **Honor the target repo's shape.** The cheapest-sufficient call lands inside the shape the
+   repo's engineers actually maintain. The same kind of behavior resolves differently per repo:
+   in `server` it lands in a unit-heavy pyramid; in `ios` it lands in component/processor
+   integration tests plus the repo's snapshot layer; a cross-system journey lands as E2E in the
+   dedicated `test` repo, never inside a platform repo. Recommend what that repo maintains today,
+   citing the per-repo shape in `monorepo-layout.md` — and where a repo's real shape is unknown,
+   say so rather than defaulting to the trophy.
+
+## Anti-patterns to avoid (in any shape)
+
+- **Ice-cream cone** — many E2E tests, few integration/unit. Slow, flaky, expensive to maintain.
+  Wrong in every platform repo — including a pyramid repo that has started leaning on E2E for branch coverage; the dedicated all-E2E repos are the deliberate exception.
+- **Over-unit-testing** — exhaustive unit tests with heavy mocking that re-assert the mocks
+  rather than real behavior; integration would buy more. The most common failure in unit-heavy repos.
+- **Testing trivial code** — tests for getters/setters, framework glue, or type-guaranteed
+  invariants. Cost without confidence.
+- **E2E for branch coverage** — using slow full-system tests to cover edge cases that belong
+  at the unit or integration layer.
+- **Forcing a foreign shape** — recommending an integration bulge for a repo that runs a unit
+  pyramid (or vice versa) just because a model says so. Match the repo, not the textbook.
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md
index 32905a3..49521bc 100644
--- a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md
@@ -1,7 +1,7 @@
 ---
 name: assessing-test-coverage
 description: Use when determining what test coverage ALREADY exists for a change — inventorying the tests that currently cover a feature, PR, component, or set of changed paths across Bitwarden's repos, citing each as a stable GitHub permalink, bucketing it by test layer, and flagging behaviors with no observed test as gaps. Distinguishes observed coverage from assumed. Triggers on "what's already tested", "does this PR have tests", "what coverage exists for", "find the existing tests for", "is this component covered", "audit current test coverage". This is the backward-looking inventory that feeds test-stack analysis — it does NOT recommend new tests or assign cheapest-sufficient trophy layers; for that, use analyzing-test-stack.
-allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(git rev-parse:*), Bash(git remote get-url:*), Bash(git -C * rev-parse:*), Bash(git -C * remote get-url:*)"
+allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(git rev-parse:*), Bash(git remote get-url:*), Bash(git -C * rev-parse:*), Bash(git -C * remote get-url:*), Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*)"
 ---
 
 # Assessing Test Coverage
@@ -26,13 +26,13 @@ A missing input narrows the inventory; it never blocks it. Record what you could
 
 1. **Learn each repo's conventions, config-first.** Before opening any test files, read the repo's Claude config to learn its test tooling and where tests live. Stop as soon as it answers the question. See `references/finding-coverage.md` → _Discovering a repo's test conventions_.
 
-2. **Find existing coverage — PRs first, then a targeted lookup.** Take the tests in the linked/merged PR diffs as primary evidence, then do a lookup **scoped to the change surface** for pre-existing tests. Never a repo-wide grep sweep. For E2E, inspect the sibling `test` repo if available. See `references/finding-coverage.md` → _Finding existing coverage_.
+2. **Find existing coverage — PRs first, then a targeted lookup.** Take the tests in the linked/merged PR diffs as primary evidence, then do a lookup **scoped to the change surface** for pre-existing tests. Never a repo-wide grep sweep. **Establish coverage per behavior and stop as soon as it is confirmed** — capture 1–3 representative tests plus an approximate count per behavior; do not open and enumerate every test method in a covered area (the dominant cost control — see `references/finding-coverage.md` → _Establish coverage per behavior_). For E2E, inspect the sibling `test` repo if available.
 
-3. **Cite and bucket each observed test.** Render every cited test as a GitHub permalink (commit SHA, not branch), following `references/finding-coverage.md` → _Citing tests as GitHub permalinks_. A test that genuinely cannot be linked is recorded path-only with an explicit reason — never fabricate a URL. Bucket each by apparent layer (unit / integration / E2E); for the layer definitions see the `analyzing-test-stack` skill's `references/testing-trophy.md`. For the per-repo stack/tooling reference, see that skill's `references/monorepo-layout.md`.
+3. **Cite and bucket each behavior's coverage.** For each behavior, render its 1–3 representative tests as GitHub permalinks (commit SHA, not branch) and record its layer and approximate count, following `references/finding-coverage.md` → _Citing tests as GitHub permalinks_ and _Output contract_. A representative test that genuinely cannot be linked is recorded path-only with an explicit reason — never fabricate a URL. Bucket by apparent layer (unit / integration / E2E); for the layer definitions see the `analyzing-test-stack` skill's `references/testing-trophy.md`. For the per-repo stack/tooling reference, see that skill's `references/monorepo-layout.md`.
 
 4. **Record gaps.** Any behavior or surface in the change with no PR-observed test and no targeted hit is recorded as a coverage gap / `unverified`. Distinguish _observed_ coverage from _assumed_.
 
-5. **Write the coverage report.** Build a single self-contained HTML file (inline CSS, no external/CDN dependencies, no JS required) following `references/coverage-report-template.md`. **Inline the canonical stylesheet from `../../references/report-style-tokens.md` verbatim** — the same plugin-level styling source the test-stack report uses, so the two reports read as one instrument; do not re-pick colors or reintroduce a brand skin. Use the normative section IDs (`#overview`, `#summary`, `#evidence`, `#coverage`, `#gaps`) and write `#overview` yourself as a short synthesis. Write the report to the **current working directory** as `test-coverage-report-<slug>-<date>.html`, where `<slug>` is a short kebab-case identifier for the change and `<date>` is the caller-provided date.
+5. **Render the coverage report.** Turning the gathered inventory into HTML is **mechanical formatting, not reasoning** — under the `bitwarden-test-engineer` agent this step runs on a Sonnet report-writer subagent (see the agent's _Model selection_), not in the analytical context. Author a **content fragment** following `references/coverage-report-template.md`: a full HTML document whose `<style>` element holds only the sentinel `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` — **never paste or retype CSS**; the build script splices in the canonical `../../references/report-style.css` verbatim (the same source the test-stack report uses, so the two read as one instrument — use the exact class names, do not re-pick colors or reintroduce a brand skin). Use the normative section IDs (`#overview`, `#summary`, `#evidence`, `#coverage`, `#gaps`) and write `#overview` yourself as a short synthesis. Then build the final file: `"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" --kind test-coverage --slug <slug> --date <date> <fragment>` (slug = short kebab-case change identifier; date = the caller-provided date). The script writes `test-coverage-report-<slug>-<date>-<HHMMSS>.html` to the current working directory — the `HHMMSS` suffix is stamped by the script so each run is a fresh file, never overwriting a prior report — and prints the filename; delete the temporary fragment and report that filename. Do not hand-assemble the file or paste CSS as a fallback.
 
 ## Output
 
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
index 0a08050..48787e5 100644
--- a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
@@ -6,20 +6,23 @@ web fonts. It must render correctly opened directly from disk and survive being
 ticket or PR. This is the coverage counterpart to the `analyzing-test-stack` test-stack report;
 the two share one visual system so they read as the same instrument.
 
-Write it to the **current working directory** as
-`test-coverage-report-<slug>-<date>.html` (slug = ticket key / PR number / feature name in
-kebab-case; date = the caller-provided date, `YYYY-MM-DD`).
+You do not write the final file directly and you do not paste any CSS. Author a **content
+fragment** (the full HTML below, but with only a stylesheet sentinel inside `<style>`), then run
+the build script — it inlines the stylesheet and stamps the output filename. See _Building the
+report_ at the end of this file.
 
 ## Styling — binding
 
-Inline the paste-ready stylesheet from `../../../references/report-style-tokens.md` (the
-plugin-level `references/` directory) **verbatim** into the `<style>` block — the same styling
-source the test-stack report uses, pasted identically so the two reports do not drift. Do
-not re-pick colors, fonts, or layer tokens, and do not reintroduce a brand skin or any
+Do **not** paste, retype, or trim any CSS. Inside the fragment's `<style>` element put exactly
+one line — the sentinel `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` — and the build script splices
+in the canonical stylesheet (`../../../references/report-style.css`) verbatim. It is the same
+styling source the test-stack report uses, spliced identically so the two reports do not drift.
+Do not re-pick colors, fonts, or layer tokens, and do not reintroduce a brand skin or any
 `<link>`/`@font-face`/CDN import; the off-brand data-report system and the layer/badge token
-mappings in that file are binding. The layer chips (`unit` / `integration` / `e2e`), the
-badges (`assumption` / `warn` / `ok`), the distribution chart, and the `.unlinkable` span are
-all defined there.
+mappings are binding. The layer chips (`unit` / `integration` / `e2e`), the badges
+(`assumption` / `warn` / `ok`), the distribution chart, and the `.unlinkable` span are all
+defined in the stylesheet; your markup must use those exact class names. See
+`../../../references/report-style-tokens.md` for the token → meaning contract.
 
 Section headings are auto-numbered by CSS (`01 · …`) — write a plain `<h2>` per section and do
 not hand-number. Wrap each wide table in `<div class="scroll">…</div>`.
@@ -29,6 +32,12 @@ not hand-number. Wrap each wide table in `<div class="scroll">…</div>`.
 Each section uses the **normative `id` listed below**. Do not rename, omit, or add top-level
 sections — readers look these up by id.
 
+Directly **inside `<main>`, before `#overview`**, emit a linked table of contents:
+`<nav class="toc" aria-label="Sections">` holding one `<a href="#…">` per section below
+(Overview, Summary, Evidence, Coverage, Gaps), each linking to its section id. It is a `<nav>`,
+not a numbered section. (In the combined two-tab report the build script namespaces these anchor
+links per tab, so a panel's ToC jumps within its own panel.)
+
 1. **Header** (no id; `<header>` element) — report title ("Test Coverage Report"), the change
    under analysis (ticket/PR/feature), and the date.
 2. **`#overview`** — A short top-of-report synthesis written so a reader sees the bottom line
@@ -51,11 +60,19 @@ sections — readers look these up by id.
    unverified"). For PR-sourced records include the captured **head SHA** and **`owner/repo`**
    so the per-test permalinks can be audited against the same commit.
 5. **`#coverage`** — Observed coverage — for each affected platform, a table:
-   `Behavior / surface | Layer | Test (linked) | Source | Notes`. One row per observed test.
-   The **Test (linked)** column is binding: render a GitHub permalink anchored to the captured
-   commit SHA and line range —
-   `<a href="https://github.com/<owner>/<repo>/blob/<SHA>/<path>#L<start>-L<end>">path/to/file.spec.ts</a>`.
-   If a test cannot be linked, use
+   `Behavior / surface | Layer | Tests (linked) | Count | Source | Notes`. **One row per
+   behavior**, not per test — match the per-behavior coverage records. When a behavior's record
+   carries a `source_issue`, the **Behavior / surface** cell appends the linked issue key —
+   `… behavior … <a href="https://bitwarden.atlassian.net/browse/PM-1234">PM-1234</a>` — so the
+   row points back at the requirement it came from (see `../../../references/input-sources.md` →
+   _Citing Jira issues as links_); a behavior with no Jira source carries no key. The **Tests (linked)**
+   column renders the behavior's 1–3 representative permalinks (binding), anchored to the
+   captured commit SHA and line range —
+   `<a href="https://github.com/<owner>/<repo>/blob/<SHA>/<path>#L<start>-L<end>">path/to/file.spec.ts</a>`;
+   the **Count** column gives the approximate number of tests covering that behavior at that
+   layer (breadth without enumerating every test). Do not expand a well-covered behavior into
+   dozens of rows — that bloats the report and is not what a reader needs.
+   If a representative test cannot be linked, use
    `<span class="unlinkable">path/to/file.spec.ts — unlinkable: &lt;reason&gt;</span>` instead
    of fabricating a URL. The **Layer** cell uses the matching layer chip. **Source** is `PR`
    (tests shipped in a linked/merged PR) or `pre-existing` (found by the targeted lookup) —
@@ -63,7 +80,8 @@ sections — readers look these up by id.
    `finding-coverage.md` → _Citing tests as GitHub permalinks_.
 6. **`#gaps`** — Coverage gaps — behaviors/surfaces in the change with **no observed test**,
    each marked `<span class="badge warn">unverified</span>` with a one-line reason (no
-   PR-observed test and no targeted hit; or `test` repo unavailable). This is the honest
+   PR-observed test and no targeted hit; or `test` repo unavailable), and — where the behavior
+   came from a Jira item — its linked source key (same form as `#coverage`). This is the honest
    record of what is _not_ known to be covered — it is not a recommendation to add tests.
 
 ## Content rules
@@ -75,7 +93,20 @@ sections — readers look these up by id.
 - Flag unverifiable claims with `<span class="badge warn">unverified</span>` (e.g. E2E
   coverage claimed without the `test` repo checked out).
 - Never present assumed coverage as observed, and never fabricate a permalink.
+- **Link every Jira item, and link each behavior to the Jira item it came from.** Any issue,
+  epic, or child key named anywhere (Overview, Summary, Evidence) is an `<a href>` to its browse
+  URL — `<a href="https://bitwarden.atlassian.net/browse/PM-1234">PM-1234</a>`, never bare key
+  text. For every behavior row in `#coverage`/`#gaps` whose behavior was extracted from a Jira
+  item (the record's `source_issue`), append the linked source key to the behavior cell so the
+  reader can jump to the requirement; a behavior with no Jira source carries no key. See
+  `../../../references/input-sources.md` → _Citing Jira issues as links_. Never fabricate a key
+  or URL. An informational `<a href>` citation is text, not a loaded asset — it does not violate
+  the no-remote-resources rule below.
 - No tracking, no remote resources, no secrets. The file is shareable as-is.
+- Keep the fixed **back-to-top** control from the skeleton — the `<a class="to-top" href="#top">`
+  after `</main>` paired with `id="top"` on `<header>`. It floats with the reader and jumps to
+  the top of the report from anywhere; it is CSS-only (styled by the stylesheet's `.to-top`
+  rule, no JavaScript). Do not drop either half or the anchor breaks.
 
 ## Skeleton
 
@@ -87,17 +118,23 @@ sections — readers look these up by id.
     <meta name="viewport" content="width=device-width, initial-scale=1" />
     <title>Test Coverage Report — {{change}}</title>
     <style>
-      /* Paste the full paste-ready stylesheet from
-         ../../../references/report-style-tokens.md here, verbatim. */
+      /* @@BITWARDEN_REPORT_STYLESHEET@@ */
     </style>
   </head>
   <body>
-    <header>
+    <header id="top">
       <p class="eyebrow">Test Coverage Report</p>
       <h1>…the change under analysis…</h1>
       <p class="meta">…ticket/PR · status · team · date…</p>
     </header>
     <main>
+      <nav class="toc" aria-label="Sections">
+        <a href="#overview">Overview</a>
+        <a href="#summary">Summary</a>
+        <a href="#evidence">Evidence</a>
+        <a href="#coverage">Coverage</a>
+        <a href="#gaps">Gaps</a>
+      </nav>
       <section id="overview">
         <h2>Overview</h2>
         …2–4 sentence recap of observed coverage per platform; top 3 gaps;
@@ -150,6 +187,26 @@ sections — readers look these up by id.
         reason…
       </section>
     </main>
+    <a class="to-top" href="#top" aria-label="Back to top">Top</a>
   </body>
 </html>
 ```
+
+## Building the report
+
+Write the fragment above (with the `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` sentinel as the only
+content of `<style>`) to a temporary path, then run the build script:
+
+```bash
+"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \
+  --kind test-coverage --slug <slug> --date <YYYY-MM-DD> \
+  test-coverage-report-<slug>.fragment.html
+```
+
+`<slug>` is a short kebab-case identifier for the change (ticket key / PR number / feature
+name); `<date>` (the `--date` value) is the caller-provided date, `YYYY-MM-DD`. The script splices in `report-style.css`, writes
+`test-coverage-report-<slug>-<date>-<HHMMSS>.html` to the current working directory (the
+`HHMMSS` time suffix is stamped by the script, so each run is a fresh file — nothing is ever
+overwritten), and prints the final filename. Delete the temporary fragment afterward, and
+report the printed filename to the caller. Do not hand-assemble the final file or paste CSS as a
+fallback — if the script errors, fix the fragment and re-run.
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
index af2f27e..625f124 100644
--- a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
@@ -28,7 +28,7 @@ Reliably establishing what is **already tested** does not require grepping a who
 two ordered moves, and record anything still unfound as a gap rather than dropping it:
 
 1. **Merged/linked PRs are the backbone.** The PRs hanging off the Jira issue and its epic
-   children (`get_issue_remote_links` → `gh pr view`/`gh pr diff`) are the reliable record of
+   children (`mcp__bitwarden-atlassian__get_issue_remote_links` → `gh pr view`/`gh pr diff`) are the reliable record of
    the tests that shipped with this work, and are already permalink-ready via the PR head SHA.
    Take the tests observed in those PR diffs as primary coverage evidence.
 2. **Targeted repo lookup for pre-existing tests.** Tests written _before_ this ticket won't
@@ -43,6 +43,19 @@ and cite specific files; if it is not available, record E2E coverage as `unverif
 A behavior with no PR-observed test and no targeted hit is recorded as a coverage gap /
 `unverified` — never silently assumed covered.
 
+### Establish coverage per behavior, not per test — stop as soon as it's confirmed
+
+The inventory is keyed to the **change's testable behaviors**, not to every test method in the
+repo. For each behavior, find _whether and at what layer_ it is covered, capture **1–3
+representative tests** as evidence plus an approximate **count** at that layer, and then **move
+on** — do not open and enumerate every test in a covered area. A behavior backed by 40 unit
+tests is recorded as `{ count: ~40, representative: [3 permalinks] }`, not 40 records. This is
+the dominant cost control on large repos: exhaustively cataloguing a well-covered area burns
+many tool calls and tokens to produce a record set no recommendation needs, and bloats the
+downstream report into an unreadable dump. Spend the search budget on **resolving each
+behavior's status**, not on completeness of enumeration. Two or three confirming tests prove a
+behavior is covered; the rest add cost, not confidence.
+
 ## Citing tests as GitHub permalinks
 
 Every test cited as **current coverage** must be rendered as a clickable
@@ -89,31 +102,47 @@ render these as `<span class="unlinkable">path — unlinkable: &lt;reason&gt;</s
 
 ### Output contract
 
-For every cited test, return a record of the shape:
+Return **one record per behavior** (not per test), carrying its layer, an approximate count,
+1–3 representative tests as evidence, and — when the behavior was extracted from a Jira item —
+the originating `source_issue` (`key` + browse `url`) so the report can link the behavior back to
+its requirement (see `../../../references/input-sources.md` → _Citing Jira issues as links_). The
+`source_issue` is **carried through from intake** with the behavior — it is provenance recorded
+when the behavior was extracted, not something coverage discovery determines; echo it through when
+present. A behavior with no Jira source (e.g. found only in a PR diff) omits `source_issue`.
 
 ```
 {
-  "path": "src/services/Foo/FooService.spec.ts",
-  "start_line": 42,
-  "end_line": 89,
-  "owner_repo": "bitwarden/clients",
-  "sha": "a1b2c3d4e5f6…",
+  "behavior": "per-phase price resolution on schedule activation",
+  "platform": "server",
   "layer": "integration",
-  "permalink": "https://github.com/bitwarden/clients/blob/a1b2c3d4e5f6…/src/services/Foo/FooService.spec.ts#L42-L89"
+  "status": "covered",
+  "count": 21,
+  "source_issue": {
+    "key": "PM-1234",
+    "url": "https://bitwarden.atlassian.net/browse/PM-1234"
+  },
+  "representative": [
+    {
+      "path": "test/Billing/.../ScheduleHandlerTests.cs",
+      "start_line": 42,
+      "end_line": 89,
+      "owner_repo": "bitwarden/server",
+      "sha": "a1b2c3d4e5f6…",
+      "permalink": "https://github.com/bitwarden/server/blob/a1b2c3d4e5f6…/test/Billing/.../ScheduleHandlerTests.cs#L42-L89"
+    }
+  ]
 }
 ```
 
-…or, when unlinkable:
-
-```
-{ "path": "src/services/Foo/FooService.spec.ts", "layer": "integration", "unlinkable_reason": "no remote for local checkout" }
-```
-
-Behaviors/surfaces with no observed test are returned as gaps:
+A representative test that cannot be linked is recorded path-only with a reason inside
+`representative` (`{ "path": "…", "unlinkable_reason": "no remote for local checkout" }`) —
+never fabricate a URL. Behaviors/surfaces with no observed test are returned as gaps:
 
 ```
 { "behavior": "tier downgrade preserves seat count", "platform": "server", "status": "unverified" }
 ```
 
-The `analyzing-test-stack` recommender consumes these records as-is to populate the
-report's Evidence (linked) column and to seed its gap analysis.
+Keep `representative` to at most three entries per behavior; the `count` conveys breadth
+without listing every test. The `analyzing-test-stack` recommender consumes these records as-is
+to populate the report's Evidence (linked) column (rendering the representative permalinks) and
+to seed its gap analysis.

From 736b8a5bf80910d2909fbf3adb32e693f5279ccc Mon Sep 17 00:00:00 2001
From: Ned Thompson <nthompson@bitwarden.com>
Date: Thu, 18 Jun 2026 16:50:22 -0400
Subject: [PATCH 4/9] clean up consistency issues

---
 plugins/bitwarden-test-engineer/README.md     |  12 +-
 .../bitwarden-test-engineer/agents/AGENT.md   |  31 +-
 .../references/input-sources.md               |   6 +-
 .../references/report-template-common.md      | 174 +++++++++++
 .../skills/analyzing-test-stack/SKILL.md      |  14 +-
 .../references/html-report-template.md        | 290 ++++--------------
 .../references/severity-risk.md               |   2 +-
 .../skills/assessing-test-coverage/SKILL.md   |   8 +-
 .../references/coverage-report-template.md    | 260 ++++------------
 .../references/finding-coverage.md            |   2 +-
 10 files changed, 328 insertions(+), 471 deletions(-)
 create mode 100644 plugins/bitwarden-test-engineer/references/report-template-common.md

diff --git a/plugins/bitwarden-test-engineer/README.md b/plugins/bitwarden-test-engineer/README.md
index 86b3336..e7f222d 100644
--- a/plugins/bitwarden-test-engineer/README.md
+++ b/plugins/bitwarden-test-engineer/README.md
@@ -10,9 +10,13 @@ new testing skills are added over time.
 ### First capability: test-stack analysis
 
 Given a change — a feature, bugfix, refactor, or migration — the agent recommends
-**what to test, at which layer, and why**, shaped as a **Testing Trophy**: a focused
-unit layer, a heavy integration layer where most confidence is bought, and a thin E2E
-layer reserved for critical user journeys.
+**what to test, at which layer, and why**, shaped to **each repo's actual test practice**.
+Two ideas drive it: each behavior is tested at the cheapest layer that buys the confidence it
+needs (unit, integration, or E2E), and how those layers are weighted is decided per repo — a
+unit-heavy pyramid (`server`, `clients`, `sdk-internal`, `android`), an integration/snapshot
+trophy (`ios`), or an all-E2E repo (the dedicated `test` repo,
+`browser-interactions-testing`). E2E is "thin" only _within_ a platform repo; the dedicated
+`test` repo is entirely E2E by design.
 
 It ingests whatever evidence is available — a Jira ticket (via the Atlassian MCP), a GitHub
 PR (via `gh`), an exported test-case CSV, and/or a plain-language description — fans out
@@ -40,7 +44,7 @@ unverified when that repo isn't checked out.
 | Skill                     | What It Does                                                                                                                                                                                                                                                                                                                                                                                    |
 | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `assessing-test-coverage` | The backward-looking inventory. Determines what is **already tested** for a change — scoped to the change surface, PR-first then a targeted lookup — buckets each observed test by layer, cites it as a stable GitHub permalink, flags untested behaviors as gaps, and writes a self-contained HTML coverage report. Feeds `analyzing-test-stack`; usable standalone to audit current coverage. |
-| `analyzing-test-stack`    | The recommender. Consumes the coverage inventory, then maps each testable behavior in a change to the cheapest sufficient Testing Trophy layer per platform, names concrete tooling, surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, over-testing, missing platform layers), and writes a self-contained HTML report to the current working directory.                          |
+| `analyzing-test-stack`    | The recommender. Consumes the coverage inventory, then maps each testable behavior in a change to the cheapest sufficient test layer per platform, inside each repo's actual shape, names concrete tooling, surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, over-testing, missing platform layers), and writes a self-contained HTML report to the current working directory.   |
 
 ## Cross-Plugin Integration
 
diff --git a/plugins/bitwarden-test-engineer/agents/AGENT.md b/plugins/bitwarden-test-engineer/agents/AGENT.md
index 31a1d92..755d10a 100644
--- a/plugins/bitwarden-test-engineer/agents/AGENT.md
+++ b/plugins/bitwarden-test-engineer/agents/AGENT.md
@@ -2,12 +2,12 @@
 name: bitwarden-test-engineer
 version: 1.0.0
 description: |
-  Test automation strategist for Bitwarden. Takes a feature, bugfix, or arbitrary change — described in plain language, in a Jira ticket, in a GitHub PR, in a technical breakdown document (a Confluence tech breakdown), and/or in an exported test-case CSV — and produces an evidence-driven recommendation for the right test automation layers (unit, integration, E2E) shaped as a Testing Trophy and risk-weighted by each behavior's defect severity (impact, not urgency), across Bitwarden's server, client, and mobile codebases. Gathers the evidence by fanning out subagents, assesses what is already tested (the `assessing-test-coverage` skill), then runs the analyst skill (`analyzing-test-stack`), which emits a self-contained HTML report. Use when the user asks what test coverage a change needs, which automation layers to add, how to shape a test plan, whether existing tests are over- or under-weighted, how to prioritize test coverage by risk, what tests a Critical/High bug needs, or asks for a "test stack" / "test strategy" / "test trophy" / "risk-based coverage" analysis for a ticket, PR, tech breakdown, or set of test cases.
+  Test automation strategist for Bitwarden. Takes a feature, bugfix, or arbitrary change — described in plain language, in a Jira ticket, in a GitHub PR, in a technical breakdown document (a Confluence tech breakdown), and/or in an exported test-case CSV — and produces an evidence-driven recommendation for the right test automation layers (unit, integration, E2E) shaped to each repo's actual test practice — a unit-heavy pyramid, an integration/snapshot trophy, or an all-E2E repo, not one universal trophy — and risk-weighted by each behavior's defect severity (impact, not urgency), across Bitwarden's server, client, and mobile codebases. Gathers the evidence by fanning out subagents, assesses what is already tested (the `assessing-test-coverage` skill), then runs the analyst skill (`analyzing-test-stack`), which emits a self-contained HTML report. Use when the user asks what test coverage a change needs, which automation layers to add, how to shape a test plan, whether existing tests are over- or under-weighted, how to prioritize test coverage by risk, what tests a Critical/High bug needs, or asks for a "test stack" / "test strategy" / "test trophy" / "risk-based coverage" analysis for a ticket, PR, tech breakdown, or set of test cases.
 
   <example>
   Context: An engineer is about to start a Jira story and wants to know what test automation it should ship with.
   user: "I'm picking up PM-12345 next sprint. What test coverage should this feature have?"
-  assistant: "I'll use the bitwarden-test-engineer agent to pull the requirements from PM-12345, map the change across the affected codebases, and produce a Testing Trophy recommendation."
+  assistant: "I'll use the bitwarden-test-engineer agent to pull the requirements from PM-12345, map the change across the affected codebases, and produce a test-layer recommendation shaped to each affected repo."
   <commentary>
   Jira-key intake. The agent gathers the ticket via the Atlassian MCP, then runs Skill(analyzing-test-stack) to produce the report.
   </commentary>
@@ -16,16 +16,16 @@ description: |
   <example>
   Context: A reviewer wants to know whether an open PR is adequately tested at the right layers.
   user: "Does bitwarden/server#5821 have the right tests, or is it leaning too hard on end-to-end?"
-  assistant: "I'll use the bitwarden-test-engineer agent to read the PR diff and its tests, assess the trophy shape, and check specifically for an ice-cream-cone (too E2E-heavy) anti-pattern."
+  assistant: "I'll use the bitwarden-test-engineer agent to read the PR diff and its tests, assess the test shape, and check specifically for an ice-cream-cone (too E2E-heavy) anti-pattern."
   <commentary>
-  PR intake plus an explicit anti-pattern concern. The agent gathers the diff via gh, then runs the analyst, which assesses the trophy shape including the ice-cream-cone check.
+  PR intake plus an explicit anti-pattern concern. The agent gathers the diff via gh, then runs the analyst, which assesses the test shape including the ice-cream-cone check.
   </commentary>
   </example>
 
   <example>
   Context: A QA engineer exported a set of manual test cases and wants an automation plan.
   user: "Here's our exported test cases CSV for the billing migration work — which of these should be automated and at what layer?"
-  assistant: "I'll use the bitwarden-test-engineer agent to parse the CSV, bucket the existing cases by trophy layer, find the gaps, and produce a layer-by-layer automation recommendation."
+  assistant: "I'll use the bitwarden-test-engineer agent to parse the CSV, bucket the existing cases by test layer, find the gaps, and produce a layer-by-layer automation recommendation."
   <commentary>
   CSV intake. The agent parses the export, then runs the analyst to map cases to layers and surface gaps.
   </commentary>
@@ -34,7 +34,7 @@ description: |
   <example>
   Context: A tech lead just finished a tech breakdown and wants the test plan that should accompany it.
   user: "I've got the tech breakdown for the new device-approval flow in Confluence — what test coverage should we plan across the stack?"
-  assistant: "I'll use the bitwarden-test-engineer agent to read the breakdown, mine its scope checklist and spec child pages for the surfaces and behaviors it touches, and produce a per-platform Testing Trophy recommendation."
+  assistant: "I'll use the bitwarden-test-engineer agent to read the breakdown, mine its scope checklist and spec child pages for the surfaces and behaviors it touches, and produce a per-platform test-stack recommendation shaped to each repo."
   <commentary>
   Tech-breakdown intake. The agent fetches the Confluence breakdown via the Atlassian MCP, extracts testable behaviors and the affected platforms from Part 2, then runs the analyst to emit the report.
   </commentary>
@@ -85,6 +85,19 @@ The Atlassian capabilities depend on the **`bitwarden-atlassian-tools`** plugin
 
 ## Workflow
 
+**Route first.** Classify what the request actually needs, then dispatch to the matching skill(s) — the skills are self-describing and each can run standalone, so you select among them rather than forcing every request through a single fixed path.
+
+The **primary flow — and the one steps 1–5 below specify — is the coverage → recommendation pipeline**: assess what is already tested, then recommend what to add. It runs whenever the user wants a test plan, a test-stack analysis, or a risk-based coverage recommendation for a change. Its two stages — coverage assessment, then recommendation — are genuinely ordered (the coverage inventory feeds the recommendation), so when the full plan is wanted, run them in sequence.
+
+But not every request is the full pipeline. When a request maps cleanly onto a single capability, invoke just that skill and stop:
+
+- _"What's already tested for this PR?"_ → `Skill(assessing-test-coverage)` alone; skip the recommendation.
+- _"What layers should this change ship with?"_ (coverage already known or not wanted) → `Skill(analyzing-test-stack)`, which pulls its own coverage inventory if none was supplied.
+
+As the plugin grows, a request that doesn't fit the coverage → recommendation pipeline dispatches to the skill that owns it rather than being bent through the steps below — add the new branch here, leave the pipeline intact. The orchestration concerns that span every flow (parallel evidence fan-out, explicit subagent model-pinning, coverage-before-recommendation ordering, context discipline) live in this agent regardless of which skill runs.
+
+The steps below specify the primary pipeline end to end.
+
 ### 1. Intake and scope
 
 Classify every input the user supplied — Jira key, GitHub PR, Confluence tech breakdown (page ID/URL or feature/team name to search), CSV path, plain-language description. Inputs are additive; handle any combination. Per-source ingestion (Epic expansion, breakdown mining, CSV column mapping) is specified in `${CLAUDE_PLUGIN_ROOT}/references/input-sources.md` — don't re-derive it here.
@@ -112,7 +125,7 @@ This step depends on step 2's change surface, so run it after the evidence fan-o
 
 ### 4. Recommend
 
-Invoke `Skill(analyzing-test-stack)` with the gathered digests **and the coverage inventory from step 3**. The behavior→layer mapping is the genuinely hard reasoning and **stays in your own (orchestrator) context**: it maps each testable behavior to the cheapest sufficient trophy layer per platform, **risk-weighted by each behavior's severity** (the impact a defect would carry — read from a bug's Jira severity field or assessed against Bitwarden's severity guide; see the skill's `references/severity-risk.md`), names concrete tooling, and surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, mislabeled layers, ungrounded coverage claims) ordered by severity. Once that mapping is decided, rendering it into the **self-contained HTML report** (`test-stack-report-<slug>-<date>-<HHMMSS>.html` in the current working directory) is mechanical and is delegated to the Sonnet **report-writer subagent** (see _Model selection_) — hand it the decided per-behavior records, each carrying its `source_issue` (key + URL) from intake, and the `#overview` synthesis to lay out; it authors the fragment, linking every Jira item and every Jira-sourced behavior to its browse URL per the template, and runs the build script. Pass today's date to the skill — skills cannot read the clock; the build script stamps the `HHMMSS` suffix.
+Invoke `Skill(analyzing-test-stack)` with the gathered digests **and the coverage inventory from step 3**. The behavior→layer mapping is the genuinely hard reasoning and **stays in your own (orchestrator) context**: it maps each testable behavior to the cheapest sufficient test layer per platform, **risk-weighted by each behavior's severity** (the impact a defect would carry — read from a bug's Jira severity field or assessed against Bitwarden's severity guide; see the skill's `references/severity-risk.md`), names concrete tooling, and surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, mislabeled layers, ungrounded coverage claims) ordered by severity. Once that mapping is decided, rendering it into the **self-contained HTML report** (`test-stack-report-<slug>-<date>-<HHMMSS>.html` in the current working directory) is mechanical and is delegated to the Sonnet **report-writer subagent** (see _Model selection_) — hand it the decided per-behavior records, each carrying its `source_issue` (key + URL) from intake, and the `#overview` synthesis to lay out; it authors the fragment, linking every Jira item and every Jira-sourced behavior to its browse URL per the template, and runs the build script. Pass today's date to the skill — skills cannot read the clock; the build script stamps the `HHMMSS` suffix.
 
 ### 5. Combine and present
 
@@ -134,7 +147,7 @@ Mirror the test-stack report's `#overview` in chat: the recommended shape per pl
 ## Principles
 
 - **Evidence over assertion.** Every recommended layer ties back to a specific behavior, requirement, diff hunk, or existing test. Flag anything you could not ground.
-- **Cheapest sufficient layer, inside the repo's shape.** Push confidence down — prefer integration over E2E, unit over integration — unless a behavior genuinely requires the higher layer, then land the call inside the target repo's actual shape (pyramid for `server`/`sdk-internal`/`clients`/`android`, integration + snapshot for `ios`, all-E2E for `test`/`browser-interactions-testing`).
+- **Cheapest sufficient layer, inside the repo's shape.** Push confidence down — prefer integration over E2E, unit over integration — unless a behavior genuinely requires the higher layer, then land the call inside the target repo's actual shape (per `monorepo-layout.md` → _Each repo's test shape in practice_, not a single house style).
 - **Risk-weighted by severity.** Coverage rigor scales with the impact a defect would carry, not with how urgently it ships. Critical behaviors (core flows, data integrity, security) owe their failure modes full coverage and lead the gap list; Low behaviors earn minimal coverage and never an E2E test. Severity (impact) ≠ priority (urgency).
 - **Degrade gracefully.** A missing input (no `bitwarden-atlassian-tools` MCP, no PR, no CSV, no `test` repo checkout) narrows the analysis; it never blocks it. State what you could not see.
 - **Read repo config first.** When the analysis touches a checked-out codebase, the coverage scouts read its Claude config (root `CLAUDE.md`, `.claude/`, and nested `CLAUDE.md` for the touched subdirs) before opening test files, and honor its test conventions over generic defaults. Explore test files only as a fallback for conventions the config doesn't cover. See `${CLAUDE_PLUGIN_ROOT}/skills/assessing-test-coverage/references/finding-coverage.md` → _Discovering a repo's test conventions_.
@@ -154,6 +167,6 @@ Rule of thumb: push the cheap, high-volume gathering **and the mechanical report
 
 Your own context is the most expensive token pool in the run — what you read into it and re-emit is re-cached on every subsequent turn. Three rules:
 
-- **Never read the rendering files into your context.** The report templates (`html-report-template.md`, `coverage-report-template.md`), `report-style-tokens.md`, `report-style.css`, and `build-report.sh` are the **report-writer subagent's** concern only — it reads them. You only need the reasoning references (`testing-trophy.md`, `severity-risk.md`, `monorepo-layout.md`, `input-sources.md`, and `finding-coverage.md` for the contract). Loading the templates or stylesheet into your context is wasted cache. (The combined-page build in step 5 is the one time you _invoke_ `build-report.sh` directly — but you only run it on the two finished report filenames; you still never read its source or the rendering files.)
+- **Never read the rendering files into your context.** The report templates (`html-report-template.md`, `coverage-report-template.md`, the shared `report-template-common.md`), `report-style-tokens.md`, `report-style.css`, and `build-report.sh` are the **report-writer subagent's** concern only — it reads them. You only need the reasoning references (`testing-trophy.md`, `severity-risk.md`, `monorepo-layout.md`, `input-sources.md`, and `finding-coverage.md` for the contract). Loading the templates or stylesheet into your context is wasted cache. (The combined-page build in step 5 is the one time you _invoke_ `build-report.sh` directly — but you only run it on the two finished report filenames; you still never read its source or the rendering files.)
 - **Don't restate digests.** Subagents return compact digests; synthesize them into the decision, don't echo them back to the user mid-run. Keep inter-step narration to a few lines — the reports are the deliverable, not a running commentary.
 - **Hand off by the smallest payload.** Pass report-writers the compact per-behavior records (now small by design) and the `#overview` text. If a record set is still large, `Write` it to a temp file (e.g. `./.test-engineer-<slug>.json`) and pass the path instead of pasting the blob into the prompt.
diff --git a/plugins/bitwarden-test-engineer/references/input-sources.md b/plugins/bitwarden-test-engineer/references/input-sources.md
index 7938b95..33bc24d 100644
--- a/plugins/bitwarden-test-engineer/references/input-sources.md
+++ b/plugins/bitwarden-test-engineer/references/input-sources.md
@@ -39,7 +39,7 @@ from the Defect Severity Classification Guide (Confluence page `2759229512`).
 A Jira key may resolve to an Epic (or, in next-gen projects, a Feature) rather than a single
 story. The epic body itself rarely lists testable behaviors — those live on its children
 and on the PRs the children produce. If you analyze only the epic, you will under-scope the
-trophy. So when the `issuetype` on the `get_issue` response is `Epic` or `Feature`, expand
+analysis. So when the `issuetype` on the `get_issue` response is `Epic` or `Feature`, expand
 before extracting:
 
 1. **Discover children.** Read the `subtasks` field first. If empty (common in next-gen
@@ -53,7 +53,7 @@ before extracting:
    not re-derive it.
 3. **Per child, gather behaviors and PRs.**
    - `mcp__bitwarden-atlassian__get_issue` for the child's description and acceptance
-     criteria — these are the testable behaviors for the trophy. Capture each child's **key and
+     criteria — these are the testable behaviors for the analysis. Capture each child's **key and
      browse URL** and carry it with the behaviors it produces, exactly as for a single-issue
      intake — a behavior sourced from a child issue links to that child, not the epic.
    - `mcp__bitwarden-atlassian__get_issue_remote_links` for PRs (grouped under "GitHub").
@@ -125,7 +125,7 @@ Map its structure to testable evidence (the canonical template is page `29203497
   **SDK changes**, **Services touched**, **Hosting** (Self-Hosted vs Cloud paths),
   **Feature flagging** (flag-on/flag-off states to cover), and **Security considerations**
   (crypto, threat-model-relevant behaviors). The **Testing considerations** item is the team's
-  own stated test intent — treat it as a claim to assess against the trophy, not as ground truth
+  own stated test intent — treat it as a claim to assess, not as ground truth
   to copy.
 - **Part 4 — Specification artifacts**: linked child pages defining concrete interfaces (API
   contracts, schemas, component APIs, crypto schemes). Fetch the relevant ones with
diff --git a/plugins/bitwarden-test-engineer/references/report-template-common.md b/plugins/bitwarden-test-engineer/references/report-template-common.md
new file mode 100644
index 0000000..a4b255f
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/references/report-template-common.md
@@ -0,0 +1,174 @@
+# Report HTML — shared authoring contract
+
+Both self-contained HTML reports the `bitwarden-test-engineer` plugin emits — the
+`analyzing-test-stack` **test-stack report** and the `assessing-test-coverage` **coverage
+report** — are authored against this shared contract, so the two read as one instrument. Each
+skill's own template (`html-report-template.md` / `coverage-report-template.md`) covers only what
+differs: its section set, its per-platform table columns, and its recommend-vs-inventory framing.
+**Read this file first, then that template.**
+
+## Output constraints
+
+Produce a **single self-contained HTML file**: all CSS inline in a `<style>` block, no
+external/CDN _resource_ links (stylesheets, fonts, scripts, images), no required JavaScript, no
+web fonts. Informational `<a href>` citations to public sources are fine and encouraged — they
+are text, not loaded assets (see _Content rules_). It must render correctly opened directly from
+disk and survive being attached to a ticket or PR.
+
+You do not write the final file directly and you do not paste any CSS. Author a **content
+fragment** (the full HTML document below, but with only a stylesheet sentinel inside `<style>`),
+then run the build script. The build mechanics — invocation, output filename, and the `HHMMSS`
+freshness stamp — live in `report-style-tokens.md` → _Building the report_ (the single source of
+truth); your template only names its `--kind`.
+
+## Styling — binding
+
+Do **not** paste, retype, or trim any CSS. Inside the fragment's `<style>` element put exactly
+one line — the sentinel `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` — and the build script splices in
+the canonical stylesheet (`report-style.css`) verbatim, identically for both
+reports so they cannot drift. The report uses a deliberately off-brand, low-key _data-report_
+visual system (flat white paper, monospace for data/labels/chrome, sans for prose, a sequential
+layer ramp). Do not re-pick colors, do not invent layer tokens, do not reintroduce a brand skin,
+do not add `<link>`/`@font-face`/CDN imports. The layer → token mapping (unit / integration /
+e2e) and the badge → token mapping (assumption / warn / ok) are normative wherever rendered —
+chips, distribution bars, table cells, and data rows; your markup must use those exact class
+names. See `report-style-tokens.md` for the token → meaning contract.
+
+Section headings are auto-numbered by CSS (`01 · …`) — write a plain `<h2>` per section and do
+not hand-number. Wrap each wide table in `<div class="scroll">…</div>` so it scrolls rather than
+overflows on narrow widths.
+
+## Table of contents
+
+Directly **inside `<main>`, before `#overview`**, emit a linked table of contents:
+`<nav class="toc" aria-label="Sections">` holding one `<a href="#…">` per section in the report
+(your template lists them), each anchoring its section id. It is a `<nav>`, not a numbered
+section. (In the combined two-tab report the build script namespaces these anchor links per tab,
+so a panel's ToC jumps within its own panel.)
+
+## Sections common to both reports
+
+Each section uses its **normative `id`** — do not rename, omit, or add top-level sections;
+readers look these up by id. The four below are shared; your template defines the report-specific
+data section (`#recommendations` or `#coverage`) and the `#gaps` contents, and adds framing notes
+for the shared ones (e.g. whether the chart shows recommended or observed counts).
+
+1. **Header** (no section id; the `<header>` element, which carries `id="top"` as the back-to-top
+   target) — report title, the change under analysis (ticket/PR/feature), and the date.
+2. **`#overview`** — a short top-of-report synthesis written by the author so a reader sees the
+   bottom line without scrolling: a 2–4 sentence recap per platform, the top 3 items the reader
+   should resolve (drawn from `#gaps`), and anchor links into the detail sections. Additive — the
+   per-behavior detail stays in the tables below. (Your template says what the recap and the
+   top-3 are _about_.)
+3. **`#summary`** — 2–4 sentences, then the **layer-distribution chart** (the report's signature
+   graphic) and a per-platform one-line shape list. Render the chart as a captioned
+   `<figure class="dist">` (`Fig 1`) containing a `.legend` and one `.dist-row` per platform;
+   each row has a `.dist-label` (the platform) and a `.bar` track holding one `.seg` per layer
+   present, sized by `style="flex: <count>"` — the raw count, which the browser normalizes (never
+   hand-compute widths). Each `.seg` shows its count; the legend maps color → layer. The unit
+   segment carries dark text (`--on-unit`), integration and e2e white (`--on-deep`). Follow with
+   `<ul class="shapes">`, one `<li>` per platform: a `.plat` name plus the one-line shape. No JS.
+   See `report-style-tokens.md` → _Graphics_ for the chart contract. The chart
+   encodes **shape** (counts per layer) only — it is severity-blind. (Your template says whether
+   the counts are _recommended_ or _observed_ and supplies the caption.)
+4. **`#evidence`** — a table of which inputs were used and, explicitly, **what was missing or
+   unverifiable** (e.g. "`test` repo not checked out — existing E2E coverage unverified"). For PR
+   inputs include the captured **head SHA** and **`owner/repo`** so per-test permalinks elsewhere
+   in the report can be audited against the same commit.
+
+`#gaps` is the last section in both reports; its exact contents differ — see your template.
+
+## Content rules
+
+- Tables over prose for the data sections and evidence — they're meant to be scanned and acted on.
+- Mark every assumption inline with `<span class="badge assumption">assumption</span>` so the
+  reader can tell grounded calls from inferred ones.
+- Flag unverifiable claims with `<span class="badge warn">unverified</span>` (e.g. E2E coverage
+  claimed without the `test` repo checked out).
+- **Hyperlink every GitHub or Atlassian source the report names.** Cited tests are GitHub
+  permalinks (see your template's evidence/coverage rule); any Jira/Confluence/GitHub artifact the
+  report names is anchored to its URL, never plain text. **Jira items and Jira-sourced behaviors
+  follow `input-sources.md` → _Citing Jira issues as links_** — the link form,
+  where to apply it, and the never-fabricate-a-key rule all live there. An informational
+  `<a href>` is text, not a fetched resource — it does not violate the no-remote-resources rule.
+- No tracking, no remote resources, no secrets — the file is shareable as-is. ("Remote resources"
+  means assets the page loads — stylesheets, fonts, scripts, images, CDN imports — not
+  informational `<a href>` citations, which are encouraged per the rule above.)
+- Keep the fixed **back-to-top** control from the skeleton — the `<a class="to-top" href="#top">`
+  after `</main>` paired with `id="top"` on `<header>`. It floats with the reader and jumps to the
+  top from anywhere; it is CSS-only (the stylesheet's `.to-top` rule, no JavaScript). Drop either
+  half and the anchor breaks.
+
+## Skeleton
+
+The shared document shell. Your template supplies the `<title>`, the eyebrow, the ToC section
+list, the report-specific section(s) between `#evidence` and `#gaps`, and the `#summary`/`#gaps`
+headings:
+
+```html
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>…report title — {{change}}…</title>
+    <style>
+      /* @@BITWARDEN_REPORT_STYLESHEET@@ */
+    </style>
+  </head>
+  <body>
+    <header id="top">
+      <p class="eyebrow">…report title…</p>
+      <h1>…the change under analysis…</h1>
+      <p class="meta">…ticket/PR · status · team · date…</p>
+    </header>
+    <main>
+      <nav class="toc" aria-label="Sections">
+        <!-- one <a href="#…"> per section, per your template's section list -->
+      </nav>
+      <section id="overview">
+        <h2>Overview</h2>
+        …synthesis: recap per platform; top 3 items; anchor links into the
+        detail sections…
+      </section>
+      <section id="summary">
+        <h2>…summary heading…</h2>
+        …2–4 sentences…
+        <figure class="dist">
+          <figcaption>Fig 1 · …layer distribution by platform…</figcaption>
+          <div class="legend">
+            <span class="key unit">unit</span>
+            <span class="key integration">integration</span>
+            <span class="key e2e">e2e</span>
+          </div>
+          <div class="dist-row">
+            <span class="dist-label">bitwarden/server</span>
+            <div class="bar">
+              <span class="seg unit" style="flex:3">3</span>
+              <span class="seg integration" style="flex:11">11</span>
+              <span class="seg e2e" style="flex:1">1</span>
+            </div>
+          </div>
+          <!-- one .dist-row per platform -->
+        </figure>
+        <ul class="shapes">
+          <li><span class="plat">bitwarden/server</span> — …one-line shape…</li>
+          <!-- one li per platform -->
+        </ul>
+      </section>
+      <section id="evidence">
+        <h2>Evidence &amp; sources</h2>
+        <div class="scroll">
+          …sources used + what was missing + commit SHA(s)…
+        </div>
+      </section>
+      <!-- report-specific section(s) here, per your template -->
+      <section id="gaps">
+        <h2>…gaps heading…</h2>
+        …per your template…
+      </section>
+    </main>
+    <a class="to-top" href="#top" aria-label="Back to top">Top</a>
+  </body>
+</html>
+```
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
index 8003edb..13db7c1 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: analyzing-test-stack
-description: Use when recommending what test automation a feature, bugfix, or change needs and at which layer — analyzing a Jira ticket, GitHub PR, exported test-case CSV, technical breakdown, and/or plain-language description, then mapping each behavior to the cheapest sufficient test layer (unit, integration, E2E) inside each repo's actual test shape, risk-weighted by defect severity. Triggers on "what tests should this have", "which test layers", "test stack", "test strategy", "test trophy", "test plan for this PR/ticket", "what should we test for this tech breakdown", "are these tests at the right level", "risk-based test coverage", "what tests does this Critical/High bug need", or "rank coverage gaps by severity".
+description: Use when recommending what test automation a feature, bugfix, or change needs and at which layer — from a Jira ticket, GitHub PR, test-case CSV, technical breakdown, and/or plain-language description — mapping each behavior to the cheapest sufficient layer (unit, integration, E2E) inside each repo's actual test shape, risk-weighted by defect severity. Triggers on "test stack", "test strategy", "test trophy", "test plan for this PR/ticket", "which test layers should this have", or "what tests does this Critical/High bug need".
 allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Skill, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(gh pr checks:*), Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*), mcp__bitwarden-atlassian__get_issue, mcp__bitwarden-atlassian__search_issues, mcp__bitwarden-atlassian__get_issue_comments, mcp__bitwarden-atlassian__get_issue_remote_links, mcp__bitwarden-atlassian__get_confluence_page, mcp__bitwarden-atlassian__search_confluence, mcp__bitwarden-atlassian__search_confluence_cql"
 ---
 
@@ -14,7 +14,7 @@ The three layers (read `references/testing-trophy.md` for the full model): a foc
 
 You may receive any combination of: a Jira key, a GitHub PR, a CSV export of test cases, a technical breakdown document, and/or a plain-language description. Treat them as additive evidence. You also consume a **coverage inventory** — the existing-test records produced by the `assessing-test-coverage` skill (permalink records + `unverified` gaps). Under the `bitwarden-test-engineer` agent this is gathered for you before this skill runs; if it is absent (e.g. run standalone), invoke `Skill(assessing-test-coverage)` for the affected change surface, or proceed and record all coverage as `unverified`. **Today's date is provided by the caller** — use it for the report filename; do not attempt to read the clock. If no date is supplied, ask via `AskUserQuestion` rather than guessing.
 
-`../../references/input-sources.md` (a plugin-level reference shared with `assessing-test-coverage`) is the canonical guide for how to ingest each source — Epic expansion, breakdown mining, CSV column mapping, and the rule that a missing source is recorded as a gap rather than blocking the analysis. **For Jira and Confluence intake**, follow that reference's tooling rule: prefer `Skill(bitwarden-atlassian-tools:researching-jira-issues)`, fall back to the `bitwarden-atlassian-tools` MCP tools (the `mcp__bitwarden-atlassian__*` tools this skill's frontmatter grants) when that skill is unavailable, and if neither is reachable, ask the user to paste the requirements rather than blocking — never assume a generic Atlassian MCP or direct REST access. At a glance:
+`../../references/input-sources.md` (a plugin-level reference shared with `assessing-test-coverage`) is the canonical guide for how to ingest each source — Epic expansion, breakdown mining, CSV column mapping, and the rule that a missing source is recorded as a gap rather than blocking the analysis. **For Jira and Confluence intake**, follow that reference's tooling rule. Prefer `Skill(bitwarden-atlassian-tools:researching-jira-issues)`; fall back to the `bitwarden-atlassian-tools` MCP tools (the `mcp__bitwarden-atlassian__*` tools this skill's frontmatter grants) when that skill is unavailable. If neither is reachable, ask the user to paste the requirements rather than blocking — never assume a generic Atlassian MCP or direct REST access. At a glance:
 
 - **Jira** — extract testable behaviors and acceptance criteria; Epics/Features expand to their children before extraction.
 - **GitHub PR** — extract the change surface, API touched, and any tests already present.
@@ -30,18 +30,18 @@ Alongside the behaviors, carry each behavior's **risk severity** — the impact
 
 1. **Resolve scope.** From the evidence, list the discrete testable behaviors and the platforms each touches. Map platforms to stacks, tooling, and the layer→repo split (including the sibling `test` repo for E2E) using `references/monorepo-layout.md`. **When the input is an Epic**, the behaviors come from the children's acceptance criteria and the diffs of any PRs linked from those children — record which children/PRs you actually inspected vs. only enumerated.
 
-2. **Consume the coverage inventory.** What is already tested is established by the `assessing-test-coverage` skill, not here — take its inventory as input: **one record per behavior** carrying its layer, an approximate count, and 1–3 representative permalinks (`{ behavior, platform, layer, status, count, representative: [...] }`, representative tests path-only with an `unlinkable` reason when they can't be linked) plus the `unverified` gaps. Treat _observed_ coverage as verified and everything else as a gap, never assumed covered. If no inventory was supplied, invoke `Skill(assessing-test-coverage)` for the affected change surface to produce one; do not re-derive coverage-finding or permalink rules here (they live in that skill's `references/finding-coverage.md`). These records feed both the report's Evidence column (rendering each behavior's representative permalinks) and the gap analysis below.
+2. **Consume the coverage inventory.** What is already tested is established by the `assessing-test-coverage` skill, not here — take its inventory as input. It is **one record per behavior**, carrying that behavior's layer, an approximate count, and 1–3 representative permalinks (`{ behavior, platform, layer, status, count, representative: [...] }`; representative tests are path-only with an `unlinkable` reason when they can't be linked), plus the `unverified` gaps. Treat _observed_ coverage as verified and everything else as a gap, never assumed covered. If no inventory was supplied, invoke `Skill(assessing-test-coverage)` for the affected change surface to produce one; do not re-derive coverage-finding or permalink rules here (they live in that skill's `references/finding-coverage.md`). These records feed both the report's Evidence column (rendering each behavior's representative permalinks) and the gap analysis below.
 
-3. **Assign the cheapest sufficient layer, weighted by severity.** For each behavior, pick the lowest trophy layer that genuinely buys the needed confidence, with a one-line rationale — then check the confidence bar against the behavior's risk severity per `references/severity-risk.md`. Severity sets _how much_ confidence is sufficient, not _which_ layer: a Critical behavior must cover its material failure modes (and, if it is a genuine end-to-end critical flow, claim the thin E2E layer reserved for exactly that), while a Low behavior earns minimal coverage and never an E2E test. Prefer integration over E2E and unit over integration unless the behavior truly requires the higher layer (real browser/device, cross-service contract, full user journey) — then land that call inside the **target repo's shape** (`references/monorepo-layout.md` → _Each repo's test shape in practice_): a pyramid repo like `server` or `sdk-internal` resolves toward unit, `ios` toward its component + snapshot practice, and cross-system journeys toward the all-E2E `test` repo. Name concrete tooling per platform (see `references/monorepo-layout.md`).
+3. **Assign the cheapest sufficient layer, weighted by severity.** For each behavior, pick the lowest test layer that genuinely buys the needed confidence, with a one-line rationale — then check the confidence bar against the behavior's risk severity per `references/severity-risk.md`. Severity sets _how much_ confidence is sufficient, not _which_ layer: a Critical behavior must cover its material failure modes (and, if it is a genuine end-to-end critical flow, claim the thin E2E layer reserved for exactly that), while a Low behavior earns minimal coverage and never an E2E test. Prefer integration over E2E and unit over integration unless the behavior truly requires the higher layer (real browser/device, cross-service contract, full user journey) — then land that call inside the **target repo's shape** (`references/monorepo-layout.md` → _Each repo's test shape in practice_): a pyramid repo like `server` or `sdk-internal` resolves toward unit, `ios` toward its component + snapshot practice, and cross-system journeys toward the all-E2E `test` repo. Name concrete tooling per platform (see `references/monorepo-layout.md`).
 
 4. **Find the gaps and the imbalance, ranked by severity.** Call out behaviors with no recommended coverage, and any existing shape that is wrong for its repo (e.g. E2E doing work integration should do, untested core logic, or a layer the repo doesn't even maintain). **Order gaps by severity** — a Critical behavior with no observed coverage is a top-priority gap and leads the list; Informative behaviors are recorded as out-of-scope rather than gaps. Be explicit about what evidence each gap rests on.
 
-5. **Render the HTML report.** Once steps 1–4 have decided the per-behavior layer/severity mapping, rendering it to HTML is **mechanical formatting, not reasoning** — under the `bitwarden-test-engineer` agent this step runs on a Sonnet report-writer subagent (see the agent's _Model selection_), not in the analytical context. Author a **content fragment** following `references/html-report-template.md`: a full HTML document whose `<style>` element holds only the sentinel `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` — **never paste or retype CSS**; the build script splices in the canonical `../../references/report-style.css` verbatim (the off-brand data-report system and the layer/badge mappings are binding — use the exact class names). Use the normative section IDs (`#overview`, `#summary`, `#evidence`, `#recommendations`, `#gaps`). Write `#overview` yourself as a short top-of-report synthesis: a 2–4 sentence recap of the recommended shape per platform, the top 3 open risks the reader should resolve before acting (drawn from `#gaps`, **highest severity first**), and anchor links into `#recommendations` and `#gaps`. The per-platform recommendations table carries a **Severity** column per behavior, and its Evidence column must contain a GitHub permalink (or an explicit `unlinkable` note) for every cited existing test. Then build the final file: `"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" --kind test-stack --slug <slug> --date <date> <fragment>` (slug = short kebab-case change identifier; date = the caller-provided date). The script writes `test-stack-report-<slug>-<date>-<HHMMSS>.html` to the current working directory — the `HHMMSS` suffix is stamped by the script so each run is a fresh file, never overwriting a prior report — and prints the filename; delete the temporary fragment and report that filename. Do not hand-assemble the file or paste CSS as a fallback.
+5. **Render the HTML report.** Once steps 1–4 have decided the per-behavior layer/severity mapping, rendering it to HTML is **mechanical formatting, not reasoning**. Author a content fragment per `references/html-report-template.md` (and the shared `../../references/report-template-common.md` it builds on), then run the build script to splice in the stylesheet and emit the file. Write `#overview` yourself as a short synthesis — recommended shape per platform, and the top 3 open risks drawn from `#gaps`, highest severity first. Ensure the `#recommendations` table carries a **Severity** column, and that its Evidence column gives a GitHub permalink (or an explicit `unlinkable` note) for every existing test it cites. The template and the shared contract cover the section IDs, the never-paste-CSS rule, the `--kind test-stack` build invocation, and the filename/freshness contract — follow them; do not hand-assemble the file.
 
 ## Principles
 
 - **Ground every recommendation.** Each behavior→layer call ties to a specific requirement, diff hunk, CSV row, or observed test. Mark anything inferred without evidence as an assumption.
-- **Cheapest sufficient layer wins.** Confidence pushed down the trophy is cheaper to write, faster to run, and less flaky.
+- **Cheapest sufficient layer wins.** Confidence pushed to a lower layer is cheaper to write, faster to run, and less flaky.
 - **Severity sets the bar, not the layer.** Weight each behavior's coverage by the impact a defect in it would have, per `references/severity-risk.md` — severity decides how completely a behavior is covered and how high its gap ranks, never which layer is "cheapest sufficient." It is impact, not priority (urgency).
 - **Per-platform, not one-size.** A feature spanning server, web, and mobile gets a distinct shape per platform — their stacks and risks differ.
-- **Honesty about coverage.** Never present assumed coverage as verified. "I could not inspect the `test` repo" is a finding, not a failure.
+- **Honesty about coverage.** Treat only _observed_ coverage from the inventory as verified; everything unconfirmed feeds the gap analysis, never an assumed-covered call. An un-inspectable repo is a recorded gap, not a silent pass.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
index 1486493..49148eb 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
@@ -1,238 +1,58 @@
-# HTML report template
-
-Produce a **single self-contained HTML file**: all CSS inline in a `<style>` block, no
-external/CDN _resource_ links (stylesheets, fonts, scripts, images), no required JavaScript,
-no web fonts. Informational `<a href>` citations to public sources are fine and encouraged —
-they are text, not loaded assets (see _Content rules_). It must render correctly opened
-directly from disk and survive being attached to a ticket or PR.
-
-You do not write the final file directly and you do not paste any CSS. Author a **content
-fragment** (the full HTML below, but with only a stylesheet sentinel inside `<style>`), then run
-the build script — it inlines the stylesheet and stamps the output filename. See _Building the
-report_ at the end of this file.
-
-## Styling — binding
-
-Do **not** paste, retype, or trim any CSS. Inside the fragment's `<style>` element put exactly
-one line — the sentinel `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` — and the build script splices
-in the canonical stylesheet (`../../../references/report-style.css`) verbatim. The report uses a
-deliberately off-brand, low-key _data-report_ visual system (flat white paper, monospace for
-data/labels/chrome, sans for prose, a sequential layer ramp). Do not re-pick colors, do not
-invent additional layer tokens, do not reintroduce a brand skin, do not add
-`<link>`/`@font-face`/CDN imports. The layer → token mapping (unit / integration / e2e) and the
-badge → token mapping (assumption / warn / ok) are normative wherever rendered — chips,
-distribution bars, table cells, and recommendation rows; your markup must use those exact class
-names. See `../../../references/report-style-tokens.md` for the token → meaning contract.
-
-Section headings are auto-numbered by CSS (`01 · …`) — write a plain `<h2>` per section
-and do not hand-number. Wrap each wide table in `<div class="scroll">…</div>` so it
-scrolls rather than overflows on narrow widths.
-
-## Required sections, in order
-
-Each section uses the **normative `id` listed below**. Do not rename, omit, or add
-top-level sections — readers look these up by id.
-
-Directly **inside `<main>`, before `#overview`**, emit a linked table of contents:
-`<nav class="toc" aria-label="Sections">` holding one `<a href="#…">` per section below
-(Overview, Summary, Evidence, Recommendations, Gaps), each anchoring its section id. It is a
-`<nav>`, not a numbered section. (In the combined two-tab report the build script namespaces
-these anchor links per tab, so a panel's ToC jumps within its own panel.)
-
-1. **Header** (no id; `<header>` element) — report title, the change under analysis
-   (ticket/PR/feature), and the date.
-2. **`#overview`** — A short top-of-report synthesis written by the analyst, so a reader
-   sees the bottom line without scrolling. It must contain: a 2–4 sentence recap of the
-   recommended shape per platform; the top 3 open risks the reader must resolve before
-   acting (drawn from `#gaps`, **ordered highest severity first**); and anchor links into
-   `#recommendations` and `#gaps` for the underlying detail. The overview is additive —
-   the per-behavior detail stays in `#recommendations`/`#gaps`.
-3. **`#summary`** — Summary & recommended shape — 2–4 sentences, then the
-   **layer-distribution chart** (the report's signature graphic) and a per-platform
-   one-line shape list. Render the chart as a captioned `<figure class="dist">` (`Fig 1`)
-   containing a `.legend` and one `.dist-row` per platform; each row has a `.dist-label`
-   (the platform) and a `.bar` track holding one `.seg` per layer present, sized by
-   `style="flex: <count>"` where `<count>` is the recommended test count at that layer
-   (the browser normalizes; never hand-compute widths). Each `.seg` shows its count; the
-   legend maps color → layer. Follow with `<ul class="shapes">`, one `<li>` per platform:
-   a `.plat` name plus the one-line shape that matches the repo's actual practice
-   (e.g. "server: unit-heavy pyramid, thin integration, no E2E; ios: integration +
-   snapshot, no XCUITest"). No JS. See `../../../references/report-style-tokens.md`
-   → _Graphics_ for the chart contract. The chart encodes recommended **shape** (counts per
-   layer) only; risk severity is carried in the `#recommendations` table's Severity column,
-   not in this graphic — leave the chart severity-blind.
-4. **`#evidence`** — Evidence & sources — a table of which inputs were used (Jira / PR /
-   CSV / tech breakdown / description) and, explicitly, **what was missing or
-   unverifiable** (e.g. "`test` repo not checked out — existing E2E coverage
-   unverified"). For PR inputs include the captured **head SHA** and **`owner/repo`** so
-   per-test permalinks elsewhere in the report can be audited against the same commit.
-5. **`#recommendations`** — Per-platform recommendations — for each affected platform, a
-   table:
-   `Behavior | Severity | Recommended layer | Tooling | Rationale | Evidence (linked)`. One
-   row per behavior. When a behavior was extracted from a Jira item (its record carries a
-   `source_issue`), the **Behavior** cell appends the linked issue key —
-   `… behavior text … <a href="https://bitwarden.atlassian.net/browse/PM-1234">PM-1234</a>` — so
-   the row points back at the requirement; a behavior with no Jira source carries no key (see
-   `../../../references/input-sources.md` → _Citing Jira issues as links_). The **Severity** cell
-   carries the behavior's risk severity
-   (Critical / High / Medium / Low / Informative) per the `analyzing-test-stack` skill's
-   `references/severity-risk.md`. Render it with the stylesheet's existing inline-code
-   treatment — `<code>Critical</code>` — **not** a new color token: the layer ramp and the
-   assumption/warn/ok badges are the only colored chips the styling system defines, and
-   severity deliberately does not get its own hue. Mark a severity the analyst inferred
-   (rather than read from a bug's Jira field) with
-   `<span class="badge assumption">assumption</span>`. Use the layer → repo map; E2E rows
-   must name the dedicated `test` repo as target.
-
-   **The "Evidence (linked)" column is binding.** For every existing test cited as
-   current coverage, render a GitHub permalink anchored to the captured commit SHA and
-   line range — `<a href="https://github.com/<owner>/<repo>/blob/<SHA>/<path>#L<start>-L<end>">path/to/file.spec.ts</a>`.
-   If a test cannot be linked (no remote, detached HEAD, private fork the agent
-   couldn't reach), use `<span class="unlinkable">path/to/file.spec.ts — unlinkable: &lt;reason&gt;</span>`
-   instead of fabricating a URL. These records come from the coverage inventory; the
-   permalink production rules live in the `assessing-test-coverage` skill's
-   `references/finding-coverage.md` → _Citing tests as GitHub permalinks_.
-
-6. **`#gaps`** — Coverage gaps & imbalances — behaviors with no coverage, and any shape
-   wrong for its repo observed (ice-cream-cone, over-unit-tested, trivial tests). **Order
-   the list by severity**, highest first, so a Critical uncovered behavior leads and the
-   reader resolves the worst-impact gaps first; Informative behaviors are recorded as
-   out-of-scope rather than gaps. Each tied to evidence, and — where the gap behavior came from
-   a Jira item — to its linked source key (same form as `#recommendations`). Findings you could
-   not ground belong here, marked `unverified` with a one-line reason.
-
-## Content rules
-
-- Tables over prose for recommendations and evidence — they're meant to be scanned and
-  acted on.
-- Mark every assumption inline with `<span class="badge assumption">assumption</span>`
-  so the reader can tell grounded calls from inferred ones.
-- Flag unverifiable claims with `<span class="badge warn">unverified</span>` (e.g.
-  E2E coverage claimed without the `test` repo checked out).
-- **Hyperlink every GitHub or Atlassian source the report names.** Cited tests are GitHub
-  permalinks (see the Evidence rule above), and if the report names the
-  [Defect Severity Classification Guide](https://bitwarden.atlassian.net/wiki/spaces/EN/pages/2759229512/Severity)
-  or any Jira/Confluence/GitHub artifact, anchor it to its URL rather than naming it in plain
-  text. An informational `<a href>` to a GitHub/Atlassian page is **text, not a fetched
-  resource** — it does not violate the "no remote resources" rule below (which targets loaded
-  assets: CSS, fonts, scripts, CDN imports). Do not strip these links to honor the
-  self-contained constraint.
-- **Link every Jira item, and link each behavior to the Jira item it came from.** Any issue,
-  epic, or child key named anywhere in the report (Overview, Summary, Evidence) is an `<a href>`
-  to its browse URL — `<a href="https://bitwarden.atlassian.net/browse/PM-1234">PM-1234</a>`,
-  never bare key text. And for every behavior in `#recommendations`/`#gaps` that was extracted
-  from a Jira item (the record's `source_issue`), append the linked source key to the behavior
-  cell so the reader can jump to the requirement. A behavior with no Jira source (PR-only)
-  carries no key. See `../../../references/input-sources.md` → _Citing Jira issues as links_ for
-  the link form. Never fabricate a key or URL.
-- No tracking, no remote resources, no secrets. The file is shareable as-is. ("Remote
-  resources" means assets the page loads — stylesheets, fonts, scripts, images, CDN imports —
-  not informational `<a href>` citations, which are encouraged per the rule above.)
-- Keep the fixed **back-to-top** control from the skeleton — the `<a class="to-top" href="#top">`
-  after `</main>` paired with `id="top"` on `<header>`. It floats with the reader and jumps to
-  the top of the report from anywhere; it is CSS-only (styled by the stylesheet's `.to-top`
-  rule, no JavaScript). Do not drop either half or the anchor breaks.
-
-## Skeleton
+# Test-stack report template
+
+The **recommendation** report: per-platform test-layer recommendations, risk-weighted by
+severity. Build it against the shared contract in
+`../../../references/report-template-common.md` (output constraints, styling/sentinel rule,
+auto-numbering, ToC, the Header/Overview/Summary/Evidence sections, content rules, and the
+skeleton) — **read that first**. This file covers only what is specific to the test-stack report.
+Build with `--kind test-stack`; the invocation and filename rules are in
+`../../../references/report-style-tokens.md` → _Building the report_.
+
+## Sections (in order)
+
+ToC and section ids, in order: `#overview`, `#summary`, `#evidence`, `#recommendations`, `#gaps`.
+
+- **`#overview`** — recap the **recommended shape per platform**; list the top 3 open risks the
+  reader must resolve before acting, drawn from `#gaps` and **ordered highest severity first**;
+  and anchor-link into `#recommendations` and `#gaps`.
+- **`#summary`** — heading "Summary & recommended shape". In the distribution chart, each `.seg`'s
+  `style="flex:<count>"` is the **recommended** test count at that layer; caption the chart
+  `Fig 1 · Recommended layer distribution by platform`. The `.shapes` list gives each platform's
+  recommended shape matched to its repo's actual practice (e.g. "server: unit-heavy pyramid, thin
+  integration, no E2E; ios: integration + snapshot, no XCUITest").
+- **`#recommendations`** — per-platform tables, one row per behavior:
+  `Behavior | Severity | Recommended layer | Tooling | Rationale | Evidence (linked)`.
+  - **Severity** carries the behavior's risk severity (Critical / High / Medium / Low /
+    Informative) per `severity-risk.md`, rendered with the stylesheet's inline-code treatment —
+    `<code>Critical</code>`, **not** a new color token (the layer ramp and assumption/warn/ok
+    badges are the only colored chips the system defines; severity deliberately gets no hue). Mark
+    a severity the analyst inferred (rather than read from a bug's Jira field) with
+    `<span class="badge assumption">assumption</span>`.
+  - Use the layer → repo map; **E2E rows must name the dedicated `test` repo** as target.
+  - **The "Evidence (linked)" column is binding.** For every existing test cited as current
+    coverage, render a GitHub permalink anchored to the captured commit SHA and line range —
+    `<a href="https://github.com/<owner>/<repo>/blob/<SHA>/<path>#L<start>-L<end>">path/to/file.spec.ts</a>`.
+    If a test cannot be linked, use
+    `<span class="unlinkable">path/to/file.spec.ts — unlinkable: &lt;reason&gt;</span>` instead of
+    fabricating a URL. These records come from the coverage inventory; the permalink production
+    rules live in the `assessing-test-coverage` skill's `references/finding-coverage.md` →
+    _Citing tests as GitHub permalinks_.
+- **`#gaps`** — heading "Coverage gaps & imbalances": behaviors with no coverage, and any observed
+  shape that is wrong for its repo (ice-cream-cone, over-unit-tested, trivial tests). **Order by
+  severity**, highest first, so a Critical uncovered behavior leads; record Informative behaviors
+  as out-of-scope rather than as gaps. Tie each gap to evidence; mark findings you could not ground
+  `<span class="badge warn">unverified</span>` with a one-line reason.
+
+## Recommendations section markup
+
+Slot this between `#evidence` and `#gaps` in the shared skeleton:
 
 ```html
-<!doctype html>
-<html lang="en">
-  <head>
-    <meta charset="utf-8" />
-    <meta name="viewport" content="width=device-width, initial-scale=1" />
-    <title>Test Stack Report — {{change}}</title>
-    <style>
-      /* @@BITWARDEN_REPORT_STYLESHEET@@ */
-    </style>
-  </head>
-  <body>
-    <header id="top">
-      <p class="eyebrow">Test Stack Report</p>
-      <h1>…the change under analysis…</h1>
-      <p class="meta">…ticket/PR · status · team · date…</p>
-    </header>
-    <main>
-      <nav class="toc" aria-label="Sections">
-        <a href="#overview">Overview</a>
-        <a href="#summary">Summary</a>
-        <a href="#evidence">Evidence</a>
-        <a href="#recommendations">Recommendations</a>
-        <a href="#gaps">Gaps</a>
-      </nav>
-      <section id="overview">
-        <h2>Overview</h2>
-        …2–4 sentence recap of the recommended shape per platform; top 3 open
-        risks; anchor links into #recommendations and #gaps…
-      </section>
-      <section id="summary">
-        <h2>Summary &amp; recommended shape</h2>
-        …2–4 sentences…
-        <figure class="dist">
-          <figcaption>
-            Fig 1 · Recommended layer distribution by platform
-          </figcaption>
-          <div class="legend">
-            <span class="key unit">unit</span>
-            <span class="key integration">integration</span>
-            <span class="key e2e">e2e</span>
-          </div>
-          <div class="dist-row">
-            <span class="dist-label">bitwarden/server</span>
-            <div class="bar">
-              <span class="seg unit" style="flex:3">3</span>
-              <span class="seg integration" style="flex:11">11</span>
-              <span class="seg e2e" style="flex:1">1</span>
-            </div>
-          </div>
-          <!-- one .dist-row per platform -->
-        </figure>
-        <ul class="shapes">
-          <li>
-            <span class="plat">bitwarden/server</span> — unit-heavy pyramid,
-            thin integration, no E2E
-          </li>
-          <!-- one li per platform -->
-        </ul>
-      </section>
-      <section id="evidence">
-        <h2>Evidence &amp; sources</h2>
-        <div class="scroll">
-          …sources used + what was missing + commit SHA(s)…
-        </div>
-      </section>
-      <section id="recommendations">
-        <h2>Per-platform recommendations</h2>
-        <div class="scroll">
-          …per-platform tables: Behavior | Severity | Recommended layer |
-          Tooling | Rationale | Evidence (linked)…
-        </div>
-      </section>
-      <section id="gaps">
-        <h2>Coverage gaps &amp; imbalances</h2>
-        …gaps and trophy-wrong shapes; ungrounded findings marked unverified…
-      </section>
-    </main>
-    <a class="to-top" href="#top" aria-label="Back to top">Top</a>
-  </body>
-</html>
+<section id="recommendations">
+  <h2>Per-platform recommendations</h2>
+  <div class="scroll">
+    …per-platform tables: Behavior | Severity | Recommended layer | Tooling |
+    Rationale | Evidence (linked)…
+  </div>
+</section>
 ```
-
-## Building the report
-
-Write the fragment above (with the `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` sentinel as the only
-content of `<style>`) to a temporary path, then run the build script:
-
-```bash
-"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \
-  --kind test-stack --slug <slug> --date <YYYY-MM-DD> \
-  test-stack-report-<slug>.fragment.html
-```
-
-`<slug>` is a short kebab-case identifier for the change (ticket key / PR number / feature
-name); `<date>` is the caller-provided date. The script splices in `report-style.css`, writes
-`test-stack-report-<slug>-<date>-<HHMMSS>.html` to the current working directory (the `HHMMSS`
-time suffix is stamped by the script, so each run is a fresh file — nothing is ever
-overwritten), and prints the final filename. Delete the temporary fragment afterward, and
-report the printed filename to the caller. Do not hand-assemble the final file or paste CSS as a
-fallback — if the script errors, fix the fragment and re-run.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
index 86caef4..de80fb9 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
@@ -55,7 +55,7 @@ and _how hard a missing test counts as a gap_. Concretely:
 - **Critical** — the confidence bar is highest: cover the behavior's material failure modes,
   not just the happy path, at whatever layer each mode is cheapest to pin down. Critical
   behaviors that are genuine end-to-end journeys (login, vault unlock, checkout) are exactly
-  what the trophy reserves the **thin E2E layer** for — the guide's "critical user flows"
+  what the **thin E2E layer** is reserved for — the guide's "critical user flows"
   map 1:1 onto that reservation. A Critical behavior with no observed coverage is a
   **top-priority gap** and belongs at the head of `#overview`'s open risks.
 - **High** — strong integration coverage of the primary path _and_ the documented
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md
index 49521bc..0a9646e 100644
--- a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: assessing-test-coverage
-description: Use when determining what test coverage ALREADY exists for a change — inventorying the tests that currently cover a feature, PR, component, or set of changed paths across Bitwarden's repos, citing each as a stable GitHub permalink, bucketing it by test layer, and flagging behaviors with no observed test as gaps. Distinguishes observed coverage from assumed. Triggers on "what's already tested", "does this PR have tests", "what coverage exists for", "find the existing tests for", "is this component covered", "audit current test coverage". This is the backward-looking inventory that feeds test-stack analysis — it does NOT recommend new tests or assign cheapest-sufficient trophy layers; for that, use analyzing-test-stack.
+description: Use when determining what test coverage ALREADY exists for a change — inventorying the tests that currently cover a feature, PR, component, or changed paths across Bitwarden's repos, citing each as a stable GitHub permalink, bucketing it by test layer, and flagging behaviors with no observed test as gaps. Triggers on "what's already tested", "does this PR have tests", "what coverage exists for", or "is this component covered". This is the backward-looking inventory that feeds test-stack analysis — it does NOT recommend new tests or assign cheapest-sufficient test layers; for that, use analyzing-test-stack.
 allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(git rev-parse:*), Bash(git remote get-url:*), Bash(git -C * rev-parse:*), Bash(git -C * remote get-url:*), Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*)"
 ---
 
@@ -26,13 +26,13 @@ A missing input narrows the inventory; it never blocks it. Record what you could
 
 1. **Learn each repo's conventions, config-first.** Before opening any test files, read the repo's Claude config to learn its test tooling and where tests live. Stop as soon as it answers the question. See `references/finding-coverage.md` → _Discovering a repo's test conventions_.
 
-2. **Find existing coverage — PRs first, then a targeted lookup.** Take the tests in the linked/merged PR diffs as primary evidence, then do a lookup **scoped to the change surface** for pre-existing tests. Never a repo-wide grep sweep. **Establish coverage per behavior and stop as soon as it is confirmed** — capture 1–3 representative tests plus an approximate count per behavior; do not open and enumerate every test method in a covered area (the dominant cost control — see `references/finding-coverage.md` → _Establish coverage per behavior_). For E2E, inspect the sibling `test` repo if available.
+2. **Find existing coverage — PRs first, then a targeted lookup.** Take the tests in the linked/merged PR diffs as primary evidence, then do a lookup **scoped to the change surface** for pre-existing tests. Never a repo-wide grep sweep. **Establish coverage per behavior and stop as soon as it is confirmed**: capture 1–3 representative tests plus an approximate count per behavior, and do not open and enumerate every test method in a covered area. This is the dominant cost control — see `references/finding-coverage.md` → _Establish coverage per behavior_. For E2E, inspect the sibling `test` repo if available.
 
 3. **Cite and bucket each behavior's coverage.** For each behavior, render its 1–3 representative tests as GitHub permalinks (commit SHA, not branch) and record its layer and approximate count, following `references/finding-coverage.md` → _Citing tests as GitHub permalinks_ and _Output contract_. A representative test that genuinely cannot be linked is recorded path-only with an explicit reason — never fabricate a URL. Bucket by apparent layer (unit / integration / E2E); for the layer definitions see the `analyzing-test-stack` skill's `references/testing-trophy.md`. For the per-repo stack/tooling reference, see that skill's `references/monorepo-layout.md`.
 
 4. **Record gaps.** Any behavior or surface in the change with no PR-observed test and no targeted hit is recorded as a coverage gap / `unverified`. Distinguish _observed_ coverage from _assumed_.
 
-5. **Render the coverage report.** Turning the gathered inventory into HTML is **mechanical formatting, not reasoning** — under the `bitwarden-test-engineer` agent this step runs on a Sonnet report-writer subagent (see the agent's _Model selection_), not in the analytical context. Author a **content fragment** following `references/coverage-report-template.md`: a full HTML document whose `<style>` element holds only the sentinel `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` — **never paste or retype CSS**; the build script splices in the canonical `../../references/report-style.css` verbatim (the same source the test-stack report uses, so the two read as one instrument — use the exact class names, do not re-pick colors or reintroduce a brand skin). Use the normative section IDs (`#overview`, `#summary`, `#evidence`, `#coverage`, `#gaps`) and write `#overview` yourself as a short synthesis. Then build the final file: `"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" --kind test-coverage --slug <slug> --date <date> <fragment>` (slug = short kebab-case change identifier; date = the caller-provided date). The script writes `test-coverage-report-<slug>-<date>-<HHMMSS>.html` to the current working directory — the `HHMMSS` suffix is stamped by the script so each run is a fresh file, never overwriting a prior report — and prints the filename; delete the temporary fragment and report that filename. Do not hand-assemble the file or paste CSS as a fallback.
+5. **Render the coverage report.** Turning the gathered inventory into HTML is **mechanical formatting, not reasoning**. Author a content fragment per `references/coverage-report-template.md` (and the shared `../../references/report-template-common.md` it builds on), then run the build script to splice in the stylesheet and emit the file. Write `#overview` yourself as a short synthesis — observed coverage per platform and the top gaps. The template owns the section IDs, the never-paste-CSS rule, the `--kind test-coverage` build invocation, and the filename/freshness contract — follow it; do not hand-assemble the file.
 
 ## Output
 
@@ -48,4 +48,4 @@ Mirror the report's `#overview` in chat — the observed shape per platform and
 - **Observed vs. assumed.** Never present assumed coverage as verified. "I could not inspect the `test` repo" is a finding, not a failure.
 - **Scoped, not swept.** Coverage is established PR-first then scoped to the change surface — never a repo-wide grep.
 - **Stable links only.** Permalinks use the commit SHA, not a branch. Unlinkable tests are recorded with a reason; URLs are never fabricated.
-- **Backward-looking only.** You inventory what exists. Recommending new tests, assigning cheapest-sufficient layers, and judging trophy shape belong to `analyzing-test-stack` — hand off, don't cross over.
+- **Backward-looking only.** You inventory what exists. Recommending new tests, assigning cheapest-sufficient layers, and judging test shape belong to `analyzing-test-stack` — hand off, don't cross over.
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
index 48787e5..d0fba0f 100644
--- a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
@@ -1,212 +1,58 @@
 # Coverage report template
 
-Produce a **single self-contained HTML file** inventorying the existing test coverage for a
-change: all CSS inline in a `<style>` block, no external/CDN links, no required JavaScript, no
-web fonts. It must render correctly opened directly from disk and survive being attached to a
-ticket or PR. This is the coverage counterpart to the `analyzing-test-stack` test-stack report;
-the two share one visual system so they read as the same instrument.
-
-You do not write the final file directly and you do not paste any CSS. Author a **content
-fragment** (the full HTML below, but with only a stylesheet sentinel inside `<style>`), then run
-the build script — it inlines the stylesheet and stamps the output filename. See _Building the
-report_ at the end of this file.
-
-## Styling — binding
-
-Do **not** paste, retype, or trim any CSS. Inside the fragment's `<style>` element put exactly
-one line — the sentinel `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` — and the build script splices
-in the canonical stylesheet (`../../../references/report-style.css`) verbatim. It is the same
-styling source the test-stack report uses, spliced identically so the two reports do not drift.
-Do not re-pick colors, fonts, or layer tokens, and do not reintroduce a brand skin or any
-`<link>`/`@font-face`/CDN import; the off-brand data-report system and the layer/badge token
-mappings are binding. The layer chips (`unit` / `integration` / `e2e`), the badges
-(`assumption` / `warn` / `ok`), the distribution chart, and the `.unlinkable` span are all
-defined in the stylesheet; your markup must use those exact class names. See
-`../../../references/report-style-tokens.md` for the token → meaning contract.
-
-Section headings are auto-numbered by CSS (`01 · …`) — write a plain `<h2>` per section and do
-not hand-number. Wrap each wide table in `<div class="scroll">…</div>`.
-
-## Required sections, in order
-
-Each section uses the **normative `id` listed below**. Do not rename, omit, or add top-level
-sections — readers look these up by id.
-
-Directly **inside `<main>`, before `#overview`**, emit a linked table of contents:
-`<nav class="toc" aria-label="Sections">` holding one `<a href="#…">` per section below
-(Overview, Summary, Evidence, Coverage, Gaps), each anchoring its section id. It is a `<nav>`,
-not a numbered section. (In the combined two-tab report the build script namespaces these anchor
-links per tab, so a panel's ToC jumps within its own panel.)
-
-1. **Header** (no id; `<header>` element) — report title ("Test Coverage Report"), the change
-   under analysis (ticket/PR/feature), and the date.
-2. **`#overview`** — A short top-of-report synthesis written so a reader sees the bottom line
-   without scrolling. It must contain: a 2–4 sentence recap of how well covered the change is
-   per platform (where observed tests concentrate, which layers are bare); the top 3 coverage
-   gaps the reader should know about (drawn from `#gaps`); and anchor links into `#coverage`
-   and `#gaps`. This report **describes** coverage — it does not recommend new tests or assign
-   cheapest-sufficient layers (that is the test-stack report's job); say so in one line and, if
-   a test-stack report was also produced, link to it.
-3. **`#summary`** — Observed coverage shape — 2–4 sentences, then the **layer-distribution
-   chart** rendered exactly per `../../../references/report-style-tokens.md` → _Graphics_, but
-   with each `.seg`'s `flex:<count>` set to the **count of observed tests** at that layer for
-   the platform (not recommended counts). Caption it `Fig 1 · Observed test coverage by platform`.
-   Follow with `<ul class="shapes">`, one `<li>` per platform giving the one-line
-   observed shape (e.g. "server: 14 integration, 3 unit, 0 E2E observed"). A platform with no
-   observed coverage still gets a row, shown empty.
-4. **`#evidence`** — Evidence & sources — a table of what was inspected (which repos/checkouts,
-   which PRs read, whether the sibling `test` repo was available) and, explicitly, **what was
-   missing or unverifiable** (e.g. "`test` repo not checked out — existing E2E coverage
-   unverified"). For PR-sourced records include the captured **head SHA** and **`owner/repo`**
-   so the per-test permalinks can be audited against the same commit.
-5. **`#coverage`** — Observed coverage — for each affected platform, a table:
-   `Behavior / surface | Layer | Tests (linked) | Count | Source | Notes`. **One row per
-   behavior**, not per test — match the per-behavior coverage records. When a behavior's record
-   carries a `source_issue`, the **Behavior / surface** cell appends the linked issue key —
-   `… behavior … <a href="https://bitwarden.atlassian.net/browse/PM-1234">PM-1234</a>` — so the
-   row points back at the requirement it came from (see `../../../references/input-sources.md` →
-   _Citing Jira issues as links_); a behavior with no Jira source carries no key. The **Tests (linked)**
-   column renders the behavior's 1–3 representative permalinks (binding), anchored to the
-   captured commit SHA and line range —
-   `<a href="https://github.com/<owner>/<repo>/blob/<SHA>/<path>#L<start>-L<end>">path/to/file.spec.ts</a>`;
-   the **Count** column gives the approximate number of tests covering that behavior at that
-   layer (breadth without enumerating every test). Do not expand a well-covered behavior into
-   dozens of rows — that bloats the report and is not what a reader needs.
-   If a representative test cannot be linked, use
-   `<span class="unlinkable">path/to/file.spec.ts — unlinkable: &lt;reason&gt;</span>` instead
-   of fabricating a URL. The **Layer** cell uses the matching layer chip. **Source** is `PR`
-   (tests shipped in a linked/merged PR) or `pre-existing` (found by the targeted lookup) —
-   keep the observed-vs-assumed distinction visible. Permalink production rules live in
-   `finding-coverage.md` → _Citing tests as GitHub permalinks_.
-6. **`#gaps`** — Coverage gaps — behaviors/surfaces in the change with **no observed test**,
-   each marked `<span class="badge warn">unverified</span>` with a one-line reason (no
-   PR-observed test and no targeted hit; or `test` repo unavailable), and — where the behavior
-   came from a Jira item — its linked source key (same form as `#coverage`). This is the honest
-   record of what is _not_ known to be covered — it is not a recommendation to add tests.
-
-## Content rules
-
-- Tables over prose for the coverage inventory and evidence — they're meant to be scanned.
-- Mark anything inferred without direct evidence with
-  `<span class="badge assumption">assumption</span>`; confirmed observed coverage may carry
-  `<span class="badge ok">ok</span>`.
-- Flag unverifiable claims with `<span class="badge warn">unverified</span>` (e.g. E2E
-  coverage claimed without the `test` repo checked out).
-- Never present assumed coverage as observed, and never fabricate a permalink.
-- **Link every Jira item, and link each behavior to the Jira item it came from.** Any issue,
-  epic, or child key named anywhere (Overview, Summary, Evidence) is an `<a href>` to its browse
-  URL — `<a href="https://bitwarden.atlassian.net/browse/PM-1234">PM-1234</a>`, never bare key
-  text. For every behavior row in `#coverage`/`#gaps` whose behavior was extracted from a Jira
-  item (the record's `source_issue`), append the linked source key to the behavior cell so the
-  reader can jump to the requirement; a behavior with no Jira source carries no key. See
-  `../../../references/input-sources.md` → _Citing Jira issues as links_. Never fabricate a key
-  or URL. An informational `<a href>` citation is text, not a loaded asset — it does not violate
-  the no-remote-resources rule below.
-- No tracking, no remote resources, no secrets. The file is shareable as-is.
-- Keep the fixed **back-to-top** control from the skeleton — the `<a class="to-top" href="#top">`
-  after `</main>` paired with `id="top"` on `<header>`. It floats with the reader and jumps to
-  the top of the report from anywhere; it is CSS-only (styled by the stylesheet's `.to-top`
-  rule, no JavaScript). Do not drop either half or the anchor breaks.
-
-## Skeleton
+The **inventory** report: what is already tested for a change, per platform, every cited test a
+stable GitHub permalink. Build it against the shared contract in
+`../../../references/report-template-common.md` (output constraints, styling/sentinel rule,
+auto-numbering, ToC, the Header/Overview/Summary/Evidence sections, content rules, and the
+skeleton) — **read that first**. This file covers only what is specific to the coverage report.
+Build with `--kind test-coverage`; the invocation and filename rules are in
+`../../../references/report-style-tokens.md` → _Building the report_.
+
+This is the coverage counterpart to the `analyzing-test-stack` test-stack report; the two splice
+the same stylesheet and follow the same shared contract, so they read as one instrument.
+
+## Sections (in order)
+
+ToC and section ids, in order: `#overview`, `#summary`, `#evidence`, `#coverage`, `#gaps`.
+
+- **`#overview`** — recap **how well covered the change is per platform** (where observed tests
+  concentrate, which layers are bare); list the top 3 coverage gaps the reader should know about,
+  drawn from `#gaps`; anchor into `#coverage` and `#gaps`. This report **describes** coverage — it
+  does not recommend new tests or assign cheapest-sufficient layers (that is the test-stack
+  report's job); say so in one line and, if a test-stack report was also produced, link to it.
+- **`#summary`** — heading "Observed coverage shape". The distribution chart's `.seg flex:<count>`
+  is the **count of observed tests** at each layer (not recommended counts); caption it
+  `Fig 1 · Observed test coverage by platform`. The `.shapes` list gives each platform's observed
+  shape (e.g. "server: 14 integration, 3 unit, 0 E2E observed"); a platform with no observed
+  coverage still gets a row, shown empty.
+- **`#coverage`** — per-platform tables, **one row per behavior** (not per test):
+  `Behavior / surface | Layer | Tests (linked) | Count | Source | Notes`.
+  - **Tests (linked)** renders the behavior's 1–3 representative permalinks (binding), anchored to
+    the captured commit SHA and line range —
+    `<a href="https://github.com/<owner>/<repo>/blob/<SHA>/<path>#L<start>-L<end>">path/to/file.spec.ts</a>`.
+    A representative test that cannot be linked uses
+    `<span class="unlinkable">path/to/file.spec.ts — unlinkable: &lt;reason&gt;</span>` — never a
+    fabricated URL. Permalink production rules live in `finding-coverage.md` →
+    _Citing tests as GitHub permalinks_.
+  - **Count** is the approximate number of tests covering that behavior at that layer — breadth
+    without enumerating every test. Do not expand a well-covered behavior into dozens of rows.
+  - **Layer** uses the matching layer chip. **Source** is `PR` (tests shipped in a linked/merged
+    PR) or `pre-existing` (found by the targeted lookup) — keep the observed-vs-assumed
+    distinction visible.
+- **`#gaps`** — heading "Coverage gaps": behaviors/surfaces in the change with **no observed
+  test**, each marked `<span class="badge warn">unverified</span>` with a one-line reason (no
+  PR-observed test and no targeted hit; or `test` repo unavailable). The honest record of what is
+  _not_ known to be covered — not a recommendation to add tests.
+
+## Coverage section markup
+
+Slot this between `#evidence` and `#gaps` in the shared skeleton:
 
 ```html
-<!doctype html>
-<html lang="en">
-  <head>
-    <meta charset="utf-8" />
-    <meta name="viewport" content="width=device-width, initial-scale=1" />
-    <title>Test Coverage Report — {{change}}</title>
-    <style>
-      /* @@BITWARDEN_REPORT_STYLESHEET@@ */
-    </style>
-  </head>
-  <body>
-    <header id="top">
-      <p class="eyebrow">Test Coverage Report</p>
-      <h1>…the change under analysis…</h1>
-      <p class="meta">…ticket/PR · status · team · date…</p>
-    </header>
-    <main>
-      <nav class="toc" aria-label="Sections">
-        <a href="#overview">Overview</a>
-        <a href="#summary">Summary</a>
-        <a href="#evidence">Evidence</a>
-        <a href="#coverage">Coverage</a>
-        <a href="#gaps">Gaps</a>
-      </nav>
-      <section id="overview">
-        <h2>Overview</h2>
-        …2–4 sentence recap of observed coverage per platform; top 3 gaps;
-        anchor links into #coverage and #gaps; one line noting this is a
-        coverage inventory, not a recommendation…
-      </section>
-      <section id="summary">
-        <h2>Observed coverage shape</h2>
-        …2–4 sentences…
-        <figure class="dist">
-          <figcaption>Fig 1 · Observed test coverage by platform</figcaption>
-          <div class="legend">
-            <span class="key unit">unit</span>
-            <span class="key integration">integration</span>
-            <span class="key e2e">e2e</span>
-          </div>
-          <div class="dist-row">
-            <span class="dist-label">bitwarden/server</span>
-            <div class="bar">
-              <span class="seg unit" style="flex:3">3</span>
-              <span class="seg integration" style="flex:14">14</span>
-            </div>
-          </div>
-          <!-- one .dist-row per platform; empty bar if none observed -->
-        </figure>
-        <ul class="shapes">
-          <li>
-            <span class="plat">bitwarden/server</span> — 14 integration, 3 unit,
-            0 E2E observed
-          </li>
-          <!-- one li per platform -->
-        </ul>
-      </section>
-      <section id="evidence">
-        <h2>Evidence &amp; sources</h2>
-        <div class="scroll">
-          …repos inspected + PRs read + test-repo availability + what was
-          missing + commit SHA(s)…
-        </div>
-      </section>
-      <section id="coverage">
-        <h2>Observed coverage</h2>
-        <div class="scroll">
-          …per-platform behavior→test tables with linked evidence…
-        </div>
-      </section>
-      <section id="gaps">
-        <h2>Coverage gaps</h2>
-        …behaviors with no observed test, each marked unverified with a one-line
-        reason…
-      </section>
-    </main>
-    <a class="to-top" href="#top" aria-label="Back to top">Top</a>
-  </body>
-</html>
-```
-
-## Building the report
-
-Write the fragment above (with the `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` sentinel as the only
-content of `<style>`) to a temporary path, then run the build script:
-
-```bash
-"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \
-  --kind test-coverage --slug <slug> --date <YYYY-MM-DD> \
-  test-coverage-report-<slug>.fragment.html
+<section id="coverage">
+  <h2>Observed coverage</h2>
+  <div class="scroll">
+    …per-platform behavior→test tables with linked evidence…
+  </div>
+</section>
 ```
-
-`<slug>` is a short kebab-case identifier for the change (ticket key / PR number / feature
-name); `<date>` is the caller-provided date. The script splices in `report-style.css`, writes
-`test-coverage-report-<slug>-<date>-<HHMMSS>.html` to the current working directory (the
-`HHMMSS` time suffix is stamped by the script, so each run is a fresh file — nothing is ever
-overwritten), and prints the final filename. Delete the temporary fragment afterward, and
-report the printed filename to the caller. Do not hand-assemble the final file or paste CSS as a
-fallback — if the script errors, fix the fragment and re-run.
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
index 625f124..7829a95 100644
--- a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
@@ -1,6 +1,6 @@
 # Finding and citing existing test coverage
 
-How to determine what a change is **already** tested by, scoped to the change surface, and how to cite each observed test as a stable link. This is the repo-reading half of test engineering; the trophy-mapping half (which layer a behavior _should_ live at) is in the `analyzing-test-stack` skill.
+How to determine what a change is **already** tested by, scoped to the change surface, and how to cite each observed test as a stable link. This is the repo-reading half of test engineering; the layer-mapping half (which layer a behavior _should_ live at) is in the `analyzing-test-stack` skill.
 
 ## Discovering a repo's test conventions (config-first)
 

From 0ca9c427725d9ac6f87bd9aaab44fdf7bc54d22e Mon Sep 17 00:00:00 2001
From: Ned Thompson <nthompson@bitwarden.com>
Date: Thu, 18 Jun 2026 17:12:32 -0400
Subject: [PATCH 5/9] deduplication pass

---
 .../bitwarden-test-engineer/agents/AGENT.md   |  4 +-
 .../references/report-template-common.md      | 37 +++++++++----------
 .../skills/analyzing-test-stack/SKILL.md      |  6 +--
 .../references/html-report-template.md        | 12 +++---
 .../references/coverage-report-template.md    | 11 ++----
 5 files changed, 30 insertions(+), 40 deletions(-)

diff --git a/plugins/bitwarden-test-engineer/agents/AGENT.md b/plugins/bitwarden-test-engineer/agents/AGENT.md
index 755d10a..85994eb 100644
--- a/plugins/bitwarden-test-engineer/agents/AGENT.md
+++ b/plugins/bitwarden-test-engineer/agents/AGENT.md
@@ -125,11 +125,11 @@ This step depends on step 2's change surface, so run it after the evidence fan-o
 
 ### 4. Recommend
 
-Invoke `Skill(analyzing-test-stack)` with the gathered digests **and the coverage inventory from step 3**. The behavior→layer mapping is the genuinely hard reasoning and **stays in your own (orchestrator) context**: it maps each testable behavior to the cheapest sufficient test layer per platform, **risk-weighted by each behavior's severity** (the impact a defect would carry — read from a bug's Jira severity field or assessed against Bitwarden's severity guide; see the skill's `references/severity-risk.md`), names concrete tooling, and surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, mislabeled layers, ungrounded coverage claims) ordered by severity. Once that mapping is decided, rendering it into the **self-contained HTML report** (`test-stack-report-<slug>-<date>-<HHMMSS>.html` in the current working directory) is mechanical and is delegated to the Sonnet **report-writer subagent** (see _Model selection_) — hand it the decided per-behavior records, each carrying its `source_issue` (key + URL) from intake, and the `#overview` synthesis to lay out; it authors the fragment, linking every Jira item and every Jira-sourced behavior to its browse URL per the template, and runs the build script. Pass today's date to the skill — skills cannot read the clock; the build script stamps the `HHMMSS` suffix.
+Invoke `Skill(analyzing-test-stack)` with the gathered digests **and the coverage inventory from step 3**. The behavior→layer mapping is the genuinely hard reasoning and **stays in your own (orchestrator) context**: it maps each testable behavior to the cheapest sufficient test layer per platform, **risk-weighted by each behavior's severity** (the impact a defect would carry — read from a bug's Jira severity field or assessed against Bitwarden's severity guide; see the skill's `references/severity-risk.md`), names concrete tooling, and surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, mislabeled layers, ungrounded coverage claims) ordered by severity. Once that mapping is decided, rendering it into the **self-contained HTML report** (`test-stack-report-<slug>-<date>-<HHMMSS>.html` in the current working directory) is mechanical and is delegated to the Sonnet **report-writer subagent** (see _Model selection_) — hand it the decided per-behavior records, each carrying its `source_issue` (key + URL) from intake, and the `#overview` synthesis to lay out; it authors the fragment, linking every Jira item and every Jira-sourced behavior to its browse URL per the template, and runs the build script. Pass today's date to the skill (the clock-and-`HHMMSS` rule is stated in step 3).
 
 ### 5. Combine and present
 
-Steps 3 and 4 each emit a self-contained HTML file in the current working directory: the `test-coverage-report-<slug>-<date>-<HHMMSS>.html` (what is already tested) and the `test-stack-report-<slug>-<date>-<HHMMSS>.html` (the recommendation). Each filename carries the build script's timestamp, so re-running never overwrites a prior report.
+Steps 3 and 4 each emit a self-contained HTML file in the current working directory: the `test-coverage-report-<slug>-<date>-<HHMMSS>.html` (what is already tested) and the `test-stack-report-<slug>-<date>-<HHMMSS>.html` (the recommendation) — the timestamped filenames never collide (step 3).
 
 Then assemble the **combined two-tab page** — the primary deliverable, with _Current coverage_ (the coverage report) and _Recommended coverage_ (the test-stack report) on one page. Run the build script yourself (it is pure file assembly — no template or stylesheet reading, so your context stays lean) with the two filenames the prior steps printed:
 
diff --git a/plugins/bitwarden-test-engineer/references/report-template-common.md b/plugins/bitwarden-test-engineer/references/report-template-common.md
index a4b255f..3eac48b 100644
--- a/plugins/bitwarden-test-engineer/references/report-template-common.md
+++ b/plugins/bitwarden-test-engineer/references/report-template-common.md
@@ -23,16 +23,15 @@ truth); your template only names its `--kind`.
 
 ## Styling — binding
 
-Do **not** paste, retype, or trim any CSS. Inside the fragment's `<style>` element put exactly
-one line — the sentinel `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` — and the build script splices in
-the canonical stylesheet (`report-style.css`) verbatim, identically for both
-reports so they cannot drift. The report uses a deliberately off-brand, low-key _data-report_
-visual system (flat white paper, monospace for data/labels/chrome, sans for prose, a sequential
-layer ramp). Do not re-pick colors, do not invent layer tokens, do not reintroduce a brand skin,
-do not add `<link>`/`@font-face`/CDN imports. The layer → token mapping (unit / integration /
-e2e) and the badge → token mapping (assumption / warn / ok) are normative wherever rendered —
-chips, distribution bars, table cells, and data rows; your markup must use those exact class
-names. See `report-style-tokens.md` for the token → meaning contract.
+Inside the fragment's `<style>` element put exactly one line — the sentinel
+`/* @@BITWARDEN_REPORT_STYLESHEET@@ */` — and nothing else; the build script splices in the
+canonical stylesheet (`report-style.css`) verbatim, identically for both reports so they cannot
+drift. **Do not paste, retype, or trim any CSS, re-pick colors/fonts/layer tokens, reintroduce a
+brand skin, or add external/CDN assets** — the visual system and the full list of prohibitions are
+owned by `report-style-tokens.md` (its _token → meaning_ contract and _What not to do_); read it.
+Your markup must use the exact normative class names it defines: the layer → token mapping (unit /
+integration / e2e) and the badge → token mapping (assumption / warn / ok) apply wherever rendered —
+chips, distribution bars, table cells, and data rows.
 
 Section headings are auto-numbered by CSS (`01 · …`) — write a plain `<h2>` per section and do
 not hand-number. Wrap each wide table in `<div class="scroll">…</div>` so it scrolls rather than
@@ -61,16 +60,14 @@ for the shared ones (e.g. whether the chart shows recommended or observed counts
    per-behavior detail stays in the tables below. (Your template says what the recap and the
    top-3 are _about_.)
 3. **`#summary`** — 2–4 sentences, then the **layer-distribution chart** (the report's signature
-   graphic) and a per-platform one-line shape list. Render the chart as a captioned
-   `<figure class="dist">` (`Fig 1`) containing a `.legend` and one `.dist-row` per platform;
-   each row has a `.dist-label` (the platform) and a `.bar` track holding one `.seg` per layer
-   present, sized by `style="flex: <count>"` — the raw count, which the browser normalizes (never
-   hand-compute widths). Each `.seg` shows its count; the legend maps color → layer. The unit
-   segment carries dark text (`--on-unit`), integration and e2e white (`--on-deep`). Follow with
-   `<ul class="shapes">`, one `<li>` per platform: a `.plat` name plus the one-line shape. No JS.
-   See `report-style-tokens.md` → _Graphics_ for the chart contract. The chart
-   encodes **shape** (counts per layer) only — it is severity-blind. (Your template says whether
-   the counts are _recommended_ or _observed_ and supplies the caption.)
+   graphic) and a per-platform one-line shape list. The chart is a captioned `<figure class="dist">`
+   (`Fig 1`) with a `.legend` and one `.dist-row` per platform; follow it with
+   `<ul class="shapes">`, one `<li>` per platform (a `.plat` name plus the one-line shape). The
+   exact segment markup and the render rules (`flex:<count>`, the `--on-unit`/`--on-deep` text
+   colors, never hand-computing widths, no JS) are the chart contract owned by
+   `report-style-tokens.md` → _Graphics_ — follow it there. The chart encodes **shape** (counts per
+   layer) only — it is severity-blind. (Your template says whether the counts are _recommended_ or
+   _observed_ and supplies the caption.)
 4. **`#evidence`** — a table of which inputs were used and, explicitly, **what was missing or
    unverifiable** (e.g. "`test` repo not checked out — existing E2E coverage unverified"). For PR
    inputs include the captured **head SHA** and **`owner/repo`** so per-test permalinks elsewhere
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
index 13db7c1..befb824 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
@@ -40,8 +40,6 @@ Alongside the behaviors, carry each behavior's **risk severity** — the impact
 
 ## Principles
 
-- **Ground every recommendation.** Each behavior→layer call ties to a specific requirement, diff hunk, CSV row, or observed test. Mark anything inferred without evidence as an assumption.
-- **Cheapest sufficient layer wins.** Confidence pushed to a lower layer is cheaper to write, faster to run, and less flaky.
+- **Ground every recommendation.** Each behavior→layer call ties to a specific requirement, diff hunk, CSV row, or observed test; treat only _observed_ coverage from the inventory as verified, and mark anything inferred without evidence as an assumption.
+- **Cheapest sufficient layer, inside the repo's shape.** Push confidence down (unit over integration over E2E) unless a behavior truly needs the higher layer; either way, land the call inside the target repo's actual shape, not one universal trophy.
 - **Severity sets the bar, not the layer.** Weight each behavior's coverage by the impact a defect in it would have, per `references/severity-risk.md` — severity decides how completely a behavior is covered and how high its gap ranks, never which layer is "cheapest sufficient." It is impact, not priority (urgency).
-- **Per-platform, not one-size.** A feature spanning server, web, and mobile gets a distinct shape per platform — their stacks and risks differ.
-- **Honesty about coverage.** Treat only _observed_ coverage from the inventory as verified; everything unconfirmed feeds the gap analysis, never an assumed-covered call. An un-inspectable repo is a recorded gap, not a silent pass.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
index 49148eb..137dd49 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
@@ -30,13 +30,11 @@ ToC and section ids, in order: `#overview`, `#summary`, `#evidence`, `#recommend
     `<span class="badge assumption">assumption</span>`.
   - Use the layer → repo map; **E2E rows must name the dedicated `test` repo** as target.
   - **The "Evidence (linked)" column is binding.** For every existing test cited as current
-    coverage, render a GitHub permalink anchored to the captured commit SHA and line range —
-    `<a href="https://github.com/<owner>/<repo>/blob/<SHA>/<path>#L<start>-L<end>">path/to/file.spec.ts</a>`.
-    If a test cannot be linked, use
-    `<span class="unlinkable">path/to/file.spec.ts — unlinkable: &lt;reason&gt;</span>` instead of
-    fabricating a URL. These records come from the coverage inventory; the permalink production
-    rules live in the `assessing-test-coverage` skill's `references/finding-coverage.md` →
-    _Citing tests as GitHub permalinks_.
+    coverage, render the behavior's representative test(s) as GitHub permalinks — or, when a test
+    cannot be linked, the `.unlinkable` span instead of a fabricated URL. These records come from
+    the coverage inventory; the exact link / `.unlinkable` markup and the permalink production
+    rules are owned by the `assessing-test-coverage` skill's `references/finding-coverage.md` →
+    _Citing tests as GitHub permalinks_ and _When a test cannot be linked_ — follow it.
 - **`#gaps`** — heading "Coverage gaps & imbalances": behaviors with no coverage, and any shape
   wrong for its repo (ice-cream-cone, over-unit-tested, trivial tests). **Order by severity**,
   highest first, so a Critical uncovered behavior leads; Informative behaviors are recorded as
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
index d0fba0f..08269c7 100644
--- a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
@@ -27,13 +27,10 @@ ToC and section ids, in order: `#overview`, `#summary`, `#evidence`, `#coverage`
   coverage still gets a row, shown empty.
 - **`#coverage`** — per-platform tables, **one row per behavior** (not per test):
   `Behavior / surface | Layer | Tests (linked) | Count | Source | Notes`.
-  - **Tests (linked)** renders the behavior's 1–3 representative permalinks (binding), anchored to
-    the captured commit SHA and line range —
-    `<a href="https://github.com/<owner>/<repo>/blob/<SHA>/<path>#L<start>-L<end>">path/to/file.spec.ts</a>`.
-    A representative test that cannot be linked uses
-    `<span class="unlinkable">path/to/file.spec.ts — unlinkable: &lt;reason&gt;</span>` — never a
-    fabricated URL. Permalink production rules live in `finding-coverage.md` →
-    _Citing tests as GitHub permalinks_.
+  - **Tests (linked)** renders the behavior's 1–3 representative tests as permalinks (binding), or
+    the `.unlinkable` span when a test cannot be linked — never a fabricated URL. The exact link /
+    `.unlinkable` markup and the permalink production rules are owned by `finding-coverage.md` →
+    _Citing tests as GitHub permalinks_ and _When a test cannot be linked_ — follow them.
   - **Count** is the approximate number of tests covering that behavior at that layer — breadth
     without enumerating every test. Do not expand a well-covered behavior into dozens of rows.
   - **Layer** uses the matching layer chip. **Source** is `PR` (tests shipped in a linked/merged

From f605de350475794803b9cad0ec4851b94ebc76f4 Mon Sep 17 00:00:00 2001
From: Ned Thompson <nthompson@bitwarden.com>
Date: Mon, 22 Jun 2026 15:38:40 -0400
Subject: [PATCH 6/9] refine scopes, multiple debloat passes

---
 .claude-plugin/marketplace.json               |   2 +-
 .cspell.json                                  |   1 +
 .../.claude-plugin/plugin.json                |   7 +-
 plugins/bitwarden-test-engineer/CHANGELOG.md  |  65 +-----
 plugins/bitwarden-test-engineer/README.md     |  34 +--
 .../bitwarden-test-engineer/agents/AGENT.md   | 172 ---------------
 .../agents/test-strategist.md                 | 151 +++++++++++++
 .../references/input-sources.md               |  97 +++------
 .../references/report-style-tokens.md         | 205 ++++++------------
 .../references/report-template-common.md      | 127 +++++------
 .../scripts/build-report.sh                   |  25 ++-
 .../skills/analyzing-test-stack/SKILL.md      |  28 +--
 .../references/html-report-template.md        |   9 +-
 .../references/monorepo-layout.md             |  52 ++---
 .../references/severity-risk.md               |  38 ++--
 .../references/test-layers.md                 |  66 ++++++
 .../references/testing-trophy.md              |  90 --------
 .../skills/assessing-test-coverage/SKILL.md   |  26 +--
 .../references/coverage-report-template.md    |   6 +-
 .../references/finding-coverage.md            |  13 +-
 20 files changed, 484 insertions(+), 730 deletions(-)
 delete mode 100644 plugins/bitwarden-test-engineer/agents/AGENT.md
 create mode 100644 plugins/bitwarden-test-engineer/agents/test-strategist.md
 create mode 100644 plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/test-layers.md
 delete mode 100644 plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index a457038..d6e824d 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -97,7 +97,7 @@
       "name": "bitwarden-test-engineer",
       "source": "./plugins/bitwarden-test-engineer",
       "version": "1.0.0",
-      "description": "Test engineering toolkit for Bitwarden. A generalist test-engineer agent dispatches specialized testing skills — strategy and planning, automation, exploratory testing, and quality assessment."
+      "description": "Test engineering toolkit for Bitwarden. Hosts role-specific testing agents — currently a test strategist that recommends what to test, at which layer, and why (risk-weighted, shaped to each repo) and inventories existing coverage. Designed to grow additional roles such as an SDET or a QA engineer."
     }
   ]
 }
diff --git a/.cspell.json b/.cspell.json
index e39c245..ce3f3c4 100644
--- a/.cspell.json
+++ b/.cspell.json
@@ -107,6 +107,7 @@
     "Robolectric",
     "rustdoc",
     "sarif",
+    "SDET",
     "SDLC",
     "sast",
     "sbom",
diff --git a/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json b/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json
index 2d60354..0363e0e 100644
--- a/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json
+++ b/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json
@@ -1,7 +1,7 @@
 {
   "name": "bitwarden-test-engineer",
   "version": "1.0.0",
-  "description": "Test engineering toolkit for Bitwarden. A generalist test-engineer agent dispatches specialized testing skills — strategy and planning, automation, exploratory testing, and quality assessment.",
+  "description": "Test engineering toolkit for Bitwarden. Hosts role-specific testing agents — currently a test strategist that recommends what to test, at which layer, and why (risk-weighted, shaped to each repo) and inventories existing coverage. Designed to grow additional roles such as an SDET or a QA engineer.",
   "author": {
     "name": "Bitwarden",
     "url": "https://github.com/bitwarden"
@@ -15,9 +15,8 @@
     "test-strategy",
     "test-automation",
     "exploratory-testing",
-    "testing-trophy",
+    "test-layers",
     "qa",
     "orchestrator"
-  ],
-  "agents": "./agents/AGENT.md"
+  ]
 }
diff --git a/plugins/bitwarden-test-engineer/CHANGELOG.md b/plugins/bitwarden-test-engineer/CHANGELOG.md
index 9b3ab89..1b109fb 100644
--- a/plugins/bitwarden-test-engineer/CHANGELOG.md
+++ b/plugins/bitwarden-test-engineer/CHANGELOG.md
@@ -9,64 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 
 - Initial release of the `bitwarden-test-engineer` plugin.
-- `bitwarden-test-engineer` agent: classifies the inputs for a change (Jira ticket,
-  GitHub PR, technical breakdown document, exported test-case CSV, plain-language
-  description), fans out subagents to gather evidence — including a dedicated **breakdown
-  reader** subagent (`sonnet`) that mines a tech breakdown for testable behaviors and its
-  status — then runs the analyst skill and presents its recommendation. When
-  inspecting a checked-out repo, subagents read its Claude config (root `CLAUDE.md`,
-  `.claude/`, nested `CLAUDE.md`) for test conventions before opening test files, and
-  establish existing coverage PR-first (tests in linked/merged PRs) with a targeted lookup
-  for pre-existing tests — never a repo-wide grep. The agent runs a dedicated **assess
-  existing coverage** step (per-repo coverage scouts applying `assessing-test-coverage`)
-  after evidence gathering and before invoking `analyzing-test-stack`, passing the merged
-  coverage inventory into the recommendation.
-- `assessing-test-coverage` skill: a backward-looking inventory of what a change is
-  **already tested** by. Scoped to the change surface (PR-first, then a targeted lookup —
-  never a repo-wide sweep), it discovers each repo's test conventions config-first, buckets
-  every observed test by layer, cites it as a stable GitHub permalink (commit SHA, not
-  branch), records untested behaviors as `unverified` gaps, and writes its own self-contained
-  HTML **coverage report** (`test-coverage-report-<slug>-<date>-<HHMMSS>.html`) following
-  `references/coverage-report-template.md`. Usable standalone to audit current coverage, and
-  consumed by `analyzing-test-stack`. Owns convention discovery, existing-test finding, and
-  the GitHub permalink citation rules (in `references/finding-coverage.md`) — concerns kept
-  separate from the trophy recommendation.
-- Plugin-level shared `references/`: `input-sources.md` (evidence-source ingestion, used by
-  both skills and the agent), `report-style.css` (the single off-brand data-report stylesheet
-  both reports use) and `report-style-tokens.md` (its design contract). The
-  `scripts/build-report.sh` build script splices `report-style.css` into each report so the
-  stylesheet is never reproduced as model output and the coverage and test-stack reports
-  cannot drift — they read as one instrument.
-- Combined two-tab report: when the agent runs end to end, the `test-combined` build mode
-  stitches the two standalone reports into one page with _Current coverage_ and
-  _Recommended coverage_ tabs (CSS-only, no JavaScript; stacks both views on print). It is a
-  presentation-only merge assembled from the finished report files — each skill still authors
-  and builds its own standalone report unchanged, so the split between coverage and
-  recommendation stays intact. The tab chrome lives entirely in `report-style.css` and the
-  build script; no skill or template knows about tabs.
-- `analyzing-test-stack` skill: consumes the coverage inventory from `assessing-test-coverage`,
-  then maps a change's testable behaviors to the cheapest
-  sufficient Testing Trophy layer (static, unit, integration, E2E) per platform and emits
-  a self-contained HTML report to the current working directory. Accepts a **technical
-  breakdown document** (a Bitwarden Tech Breakdown Confluence page, the artifact produced by
-  the `bitwarden-delivery-tools:writing-tech-breakdowns` skill) as an additive evidence
-  source alongside Jira, PR, CSV, and plain-language inputs — mining its Part 2 scope
-  checklist for the surfaces and platforms touched, its Part 4 specification child pages for
-  the interfaces to test against, and its Part 5 open questions for untestable-requirement
-  risk. The report surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone,
-  over-testing, missing platform layers), recording ungrounded findings as `unverified`
-  gaps. Includes references for the Testing Trophy model, the repo/stack
-  layer→repo map, evidence-source ingestion, and the HTML report
-  template. The Atlassian `search_confluence` / `search_confluence_cql` tools back locating a
-  breakdown by feature/team name when only a name (not a page ID) is given.
-- Linked table of contents (`.toc`) at the top of every report's `<main>`, linking to
-  each section; in the combined report the build script namespaces the ToC's anchors per tab so
-  each panel's ToC jumps within its own panel.
-- Top-of-report `#overview` synthesis section, written by the analyst: a 2–4 sentence recap
-  of the recommended shape per platform, the top 3 open risks (drawn from
-  `#gaps`), and anchor links into the detail sections, so readers see the bottom line without
-  scrolling. The overview is additive — per-behavior detail stays in `#recommendations`/`#gaps`.
-- Per-layer model governance to optimize token spend: the agent inherits the session model
-  for its own context (which drives the analysis and the recommendation), while the fan-out
-  evidence subagents are assigned explicitly — `sonnet` for sources that read a diff, ticket,
-  or repo, `haiku` for pure CSV parsing — rather than inheriting the orchestrator's model.
+- `test-strategist` agent: classifies a change's inputs (Jira ticket, GitHub PR, tech breakdown, test-case CSV, plain-language description), fans out subagents to gather evidence, and presents a test recommendation.
+- `assessing-test-coverage` skill: inventories what a change is already tested by, buckets observed tests by layer, cites them as stable GitHub permalinks, and writes a self-contained HTML coverage report.
+- `analyzing-test-stack` skill: maps a change's testable behaviors to the cheapest sufficient test layer per platform, surfaces coverage gaps and shape-wrong tests, and emits a self-contained HTML report.
+- Shared plugin-level `references/` and a `build-report.sh` script that splices the single shared stylesheet into each report so the two reports can't drift.
diff --git a/plugins/bitwarden-test-engineer/README.md b/plugins/bitwarden-test-engineer/README.md
index e7f222d..4b7e183 100644
--- a/plugins/bitwarden-test-engineer/README.md
+++ b/plugins/bitwarden-test-engineer/README.md
@@ -2,12 +2,13 @@
 
 ## Overview
 
-A test engineering toolkit for Bitwarden. A generalist test-engineer agent analyzes a
-request and dispatches specialized skills across the testing discipline — test strategy and planning,
-automation, exploratory testing, and quality assessment. The plugin is designed to grow:
-new testing skills are added over time.
+A test engineering toolkit for Bitwarden. It hosts role-specific testing agents. Today it
+ships one — the **test strategist** (`test-strategist`), the test-_planning_ role:
+it recommends what to test, at which layer, and why, and inventories what is already tested.
+It does not author, run, or maintain the tests, nor does it do exploratory/manual QA. The plugin is
+designed to grow additional roles over time (for example an SDET or a QA engineer).
 
-### First capability: test-stack analysis
+### First role: the test strategist
 
 Given a change — a feature, bugfix, refactor, or migration — the agent recommends
 **what to test, at which layer, and why**, shaped to **each repo's actual test practice**.
@@ -33,18 +34,18 @@ in a dedicated, private `test` repository** — not inside the platform repos 
 recommendations target that separate repo, and existing E2E coverage is treated as
 unverified when that repo isn't checked out.
 
-## Agent
+## Agents
 
-| Agent                     | What It Does                                                                                                                                                                                                                                                                         |
-| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `bitwarden-test-engineer` | Classifies the inputs for a change (Jira, PR, CSV, description), fans out subagents to gather evidence, assesses existing coverage (`assessing-test-coverage`), then runs `analyzing-test-stack` — emitting a self-contained coverage report and a self-contained test-stack report. |
+| Agent             | What It Does                                                                                                                                                                                                                                                                         |
+| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `test-strategist` | Classifies the inputs for a change (Jira, PR, CSV, description), fans out subagents to gather evidence, assesses existing coverage (`assessing-test-coverage`), then runs `analyzing-test-stack` — emitting a self-contained coverage report and a self-contained test-stack report. |
 
 ## Skills
 
 | Skill                     | What It Does                                                                                                                                                                                                                                                                                                                                                                                    |
 | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `assessing-test-coverage` | The backward-looking inventory. Determines what is **already tested** for a change — scoped to the change surface, PR-first then a targeted lookup — buckets each observed test by layer, cites it as a stable GitHub permalink, flags untested behaviors as gaps, and writes a self-contained HTML coverage report. Feeds `analyzing-test-stack`; usable standalone to audit current coverage. |
-| `analyzing-test-stack`    | The recommender. Consumes the coverage inventory, then maps each testable behavior in a change to the cheapest sufficient test layer per platform, inside each repo's actual shape, names concrete tooling, surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, over-testing, missing platform layers), and writes a self-contained HTML report to the current working directory.   |
+| `analyzing-test-stack`    | The recommender. Consumes the coverage inventory, then maps each testable behavior in a change to the cheapest sufficient test layer per platform, inside each repo's actual shape, names concrete tooling, surfaces coverage gaps and shape-wrong tests (ice-cream-cone, over-testing, missing platform layers), and writes a self-contained HTML report into a per-change report directory.   |
 
 ## Cross-Plugin Integration
 
@@ -83,13 +84,12 @@ Here's our exported test cases CSV for the billing migration — which of these
 automated and at what layer?
 ```
 
-Each run produces two self-contained HTML files in the current working directory: a
-`test-coverage-report-<slug>-<date>-<HHMMSS>.html` (what is already tested — observed tests per
-layer, each cited as a GitHub permalink, plus gaps) and a
-`test-stack-report-<slug>-<date>-<HHMMSS>.html` (the per-platform recommendation and its
-coverage-gap findings). The `HHMMSS` time suffix is stamped at build time, so re-running on the
-same day never overwrites a prior report. Both share one off-brand data-report visual system so
-they read as the same instrument.
+Each run produces a per-change directory `test-engineer-report-<slug>-<date>/` holding the
+self-contained HTML reports: `coverage.html` (what is already tested — observed tests per layer,
+each cited as a GitHub permalink, plus gaps), `recommended.html` (the per-platform recommendation
+and its coverage-gap findings), and `combined.html` (the primary deliverable — both on one two-tab
+page). Re-running on the same change and date overwrites the reports in that directory. They share
+one off-brand data-report visual system so they read as the same instrument.
 
 ## References
 
diff --git a/plugins/bitwarden-test-engineer/agents/AGENT.md b/plugins/bitwarden-test-engineer/agents/AGENT.md
deleted file mode 100644
index 85994eb..0000000
--- a/plugins/bitwarden-test-engineer/agents/AGENT.md
+++ /dev/null
@@ -1,172 +0,0 @@
----
-name: bitwarden-test-engineer
-version: 1.0.0
-description: |
-  Test automation strategist for Bitwarden. Takes a feature, bugfix, or arbitrary change — described in plain language, in a Jira ticket, in a GitHub PR, in a technical breakdown document (a Confluence tech breakdown), and/or in an exported test-case CSV — and produces an evidence-driven recommendation for the right test automation layers (unit, integration, E2E) shaped to each repo's actual test practice — a unit-heavy pyramid, an integration/snapshot trophy, or an all-E2E repo, not one universal trophy — and risk-weighted by each behavior's defect severity (impact, not urgency), across Bitwarden's server, client, and mobile codebases. Gathers the evidence by fanning out subagents, assesses what is already tested (the `assessing-test-coverage` skill), then runs the analyst skill (`analyzing-test-stack`), which emits a self-contained HTML report. Use when the user asks what test coverage a change needs, which automation layers to add, how to shape a test plan, whether existing tests are over- or under-weighted, how to prioritize test coverage by risk, what tests a Critical/High bug needs, or asks for a "test stack" / "test strategy" / "test trophy" / "risk-based coverage" analysis for a ticket, PR, tech breakdown, or set of test cases.
-
-  <example>
-  Context: An engineer is about to start a Jira story and wants to know what test automation it should ship with.
-  user: "I'm picking up PM-12345 next sprint. What test coverage should this feature have?"
-  assistant: "I'll use the bitwarden-test-engineer agent to pull the requirements from PM-12345, map the change across the affected codebases, and produce a test-layer recommendation shaped to each affected repo."
-  <commentary>
-  Jira-key intake. The agent gathers the ticket via the Atlassian MCP, then runs Skill(analyzing-test-stack) to produce the report.
-  </commentary>
-  </example>
-
-  <example>
-  Context: A reviewer wants to know whether an open PR is adequately tested at the right layers.
-  user: "Does bitwarden/server#5821 have the right tests, or is it leaning too hard on end-to-end?"
-  assistant: "I'll use the bitwarden-test-engineer agent to read the PR diff and its tests, assess the test shape, and check specifically for an ice-cream-cone (too E2E-heavy) anti-pattern."
-  <commentary>
-  PR intake plus an explicit anti-pattern concern. The agent gathers the diff via gh, then runs the analyst, which assesses the test shape including the ice-cream-cone check.
-  </commentary>
-  </example>
-
-  <example>
-  Context: A QA engineer exported a set of manual test cases and wants an automation plan.
-  user: "Here's our exported test cases CSV for the billing migration work — which of these should be automated and at what layer?"
-  assistant: "I'll use the bitwarden-test-engineer agent to parse the CSV, bucket the existing cases by test layer, find the gaps, and produce a layer-by-layer automation recommendation."
-  <commentary>
-  CSV intake. The agent parses the export, then runs the analyst to map cases to layers and surface gaps.
-  </commentary>
-  </example>
-
-  <example>
-  Context: A tech lead just finished a tech breakdown and wants the test plan that should accompany it.
-  user: "I've got the tech breakdown for the new device-approval flow in Confluence — what test coverage should we plan across the stack?"
-  assistant: "I'll use the bitwarden-test-engineer agent to read the breakdown, mine its scope checklist and spec child pages for the surfaces and behaviors it touches, and produce a per-platform test-stack recommendation shaped to each repo."
-  <commentary>
-  Tech-breakdown intake. The agent fetches the Confluence breakdown via the Atlassian MCP, extracts testable behaviors and the affected platforms from Part 2, then runs the analyst to emit the report.
-  </commentary>
-  </example>
-model: inherit
-tools:
-  - Read
-  - Write
-  - Glob
-  - Grep
-  - Skill
-  - Task
-  - AskUserQuestion
-  - Bash(gh pr view:*)
-  - Bash(gh pr diff:*)
-  - Bash(gh pr checks:*)
-  - Bash(git diff:*)
-  - Bash(git log:*)
-  - Bash(git rev-parse:*)
-  - Bash(git remote get-url:*)
-  - Bash(git -C * rev-parse:*)
-  - Bash(git -C * remote get-url:*)
-  - Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*)
-  - mcp__bitwarden-atlassian__get_issue
-  - mcp__bitwarden-atlassian__search_issues
-  - mcp__bitwarden-atlassian__get_issue_comments
-  - mcp__bitwarden-atlassian__get_issue_remote_links
-  - mcp__bitwarden-atlassian__get_confluence_page
-  - mcp__bitwarden-atlassian__search_confluence
-  - mcp__bitwarden-atlassian__search_confluence_cql
-skills:
-  - assessing-test-coverage
-  - analyzing-test-stack
-color: green
----
-
-You are a test automation strategist for Bitwarden. Your job is to take a change — a feature, a bugfix, a refactor, or a migration — and tell the team **what to test, at which layer, and why**, across three layers: a unit layer for pure logic, an integration layer for collaborator wiring, and a thin E2E layer reserved for critical user journeys. How those layers are weighted is **per repo, not one universal trophy** — Bitwarden's repos span unit-heavy pyramids (`server`, `clients`, `sdk-internal`, `android`), an integration + snapshot trophy (`ios`), and all-E2E repos (`test`, `browser-interactions-testing`).
-
-You do not write the tests. You produce a recommendation — an HTML report — that an engineer or QA can act on. Ground every layer call in evidence and keep each repo's shape honest, because a test plan tends to drift toward whatever is easiest to write rather than what actually buys confidence.
-
-## Operating context
-
-Bitwarden's code is split across several repositories, each with its own platform, stack, and test tooling. Assume the user works in a multi-repo layout such as `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`, and similar. A single feature frequently spans more than one of these (e.g. a server endpoint plus a web client plus a mobile screen), and each repo is shaped independently — match the recommendation to that repo's actual practice (`monorepo-layout.md` → _Each repo's test shape in practice_), not a single house style.
-
-**Where each layer lives:** unit and integration live alongside the code in each platform repo; **E2E lives in the dedicated `test` repo** (sibling of the platform repos). See `${CLAUDE_PLUGIN_ROOT}/skills/analyzing-test-stack/references/monorepo-layout.md` for the per-platform stack, tooling, and the layer→repo map.
-
-The Atlassian capabilities depend on the **`bitwarden-atlassian-tools`** plugin (the `mcp__bitwarden-atlassian__*` server). If it is not installed and the user references a Jira issue or a Confluence tech breakdown, do not fail — tell the user the MCP is unavailable and ask them to paste the requirements or the breakdown contents, or proceed from the PR / CSV / description they provided.
-
-## Workflow
-
-**Route first.** Classify what the request actually needs, then dispatch to the matching skill(s) — the skills are self-describing and each can run standalone, so you select among them rather than forcing every request through a single fixed path.
-
-The **primary flow — and the one steps 1–5 below specify — is the coverage → recommendation pipeline**: assess what is already tested, then recommend what to add. It runs whenever the user wants a test plan, a test-stack analysis, or a risk-based coverage recommendation for a change. The two steps are genuinely ordered (the coverage inventory feeds the recommendation), so when the full plan is wanted, run them in sequence.
-
-But not every request is the full pipeline. When a request maps cleanly onto a single capability, invoke just that skill and stop:
-
-- _"What's already tested for this PR?"_ → `Skill(assessing-test-coverage)` alone; skip the recommendation.
-- _"What layers should this change ship with?"_ (coverage already known or not wanted) → `Skill(analyzing-test-stack)`, which pulls its own coverage inventory if none was supplied.
-
-As the plugin grows, a request that doesn't fit the coverage → recommendation pipeline dispatches to the skill that owns it rather than being bent through the steps below — add the new branch here, leave the pipeline intact. The orchestration concerns that span every flow (parallel evidence fan-out, explicit subagent model-pinning, coverage-before-recommendation ordering, context discipline) live in this agent regardless of which skill runs.
-
-The steps below specify the primary pipeline end to end.
-
-### 1. Intake and scope
-
-Classify every input the user supplied — Jira key, GitHub PR, Confluence tech breakdown (page ID/URL or feature/team name to search), CSV path, plain-language description. Inputs are additive; handle any combination. Per-source ingestion (Epic expansion, breakdown mining, CSV column mapping) is specified in `${CLAUDE_PLUGIN_ROOT}/references/input-sources.md` — don't re-derive it here.
-
-Then determine the **affected repos/platforms**. If scope is genuinely ambiguous and it changes the recommendation, use `AskUserQuestion` — otherwise infer and state your assumption.
-
-### 2. Fan out to gather evidence
-
-Spawn `Task` subagents **in parallel**, one per evidence source or affected repo, so your own context stays lean. Each subagent returns a compact structured digest (not raw dumps). Typical fan-out:
-
-- **Requirements reader** (model: `sonnet`) — resolves the Jira issue into testable behaviors and acceptance criteria, expanding Epics/Features to their children and feeding any linked PR URLs to the PR diff analyzer downstream. Captures the **severity** assigned on a bug/defect ticket so the recommendation can be risk-weighted, and the **source issue key + browse URL** for each behavior (for an Epic, the specific child the behavior came from) so the report can link every behavior back to its requirement. Follows the recipe in `${CLAUDE_PLUGIN_ROOT}/references/input-sources.md` → _Epic intake_ and _Citing Jira issues as links_.
-- **Breakdown reader** (model: `sonnet`) — fetches the tech breakdown via `mcp__bitwarden-atlassian__get_confluence_page` (searching first with `search_confluence`/`search_confluence_cql` when given only a name), then mines Part 2's scope checklist for the surfaces touched, the relevant Part 4 spec child pages for interfaces, and Part 5's open questions for untestable-requirement risk. Returns testable behaviors per platform plus the breakdown's status.
-- **PR diff analyzer** (model: `sonnet`) — `gh pr diff` / `gh pr view` to extract the change surface, public API touched, and tests already present.
-- **CSV parser** (model: `haiku`) — reads the export and buckets existing cases by apparent layer and automation status.
-
-Give each subagent a single source and a tight output contract. Skip any branch whose input was not supplied.
-
-**Set each subagent's model explicitly** — `haiku` for the CSV parser, `sonnet` for the rest. Never let a digest-returning subagent inherit the orchestrator's model. See _Model selection_ below for the rationale.
-
-### 3. Assess existing coverage
-
-Once the change surface is known (the diff paths/symbols and named components from step 2), determine what is **already tested** before recommending anything new. Fan out a **per-repo coverage scout** (model: `sonnet`) for each affected platform repo, each applying the `assessing-test-coverage` skill: read the repo's Claude config for conventions, establish coverage **PR-first then via a targeted lookup scoped to the change surface** (never a repo-wide sweep), inspect the sibling `test` repo for E2E, and return **one record per behavior** — its layer, an approximate count, and 1–3 representative permalinks (`{ behavior, platform, layer, status, count, representative: [{ path, start_line, end_line, owner_repo, sha, permalink }] }`) plus `unverified` gaps. **Scouts must establish coverage per behavior and stop as soon as it's confirmed — never enumerate every test method in a covered area** (this is the dominant cost control; a behavior backed by 40 tests is one record with a count of ~40 and 3 exemplars, not 40 records). The output contract, the per-behavior discipline, the PR-first/targeted-lookup rule, and the SHA/`owner-repo` permalink recipe all live in `${CLAUDE_PLUGIN_ROOT}/skills/assessing-test-coverage/references/finding-coverage.md` — the scouts follow it; don't restate it here. Merge the scouts' per-behavior records into a single coverage inventory.
-
-This step depends on step 2's change surface, so run it after the evidence fan-out (not interleaved). Scouts capture the SHA via `git -C <repo> rev-parse HEAD` and `owner/repo` via `git -C <repo> remote get-url origin`. Then invoke `Skill(assessing-test-coverage)` with the merged inventory and today's date to produce the backward-looking coverage inventory (observed tests per layer with permalinks, plus `unverified` gaps) and the **self-contained HTML coverage report** — a `test-coverage-report-<slug>-<date>-<HHMMSS>.html` file in the current working directory. The skill returns the inventory records for step 4. Per the skill, the actual HTML _rendering_ is delegated to the Sonnet **report-writer subagent** (see _Model selection_) — only the gathering and inventory merge happen in your context. Pass today's date — skills cannot read the clock; the build script stamps the `HHMMSS` suffix so the file is always fresh.
-
-### 4. Recommend
-
-Invoke `Skill(analyzing-test-stack)` with the gathered digests **and the coverage inventory from step 3**. The behavior→layer mapping is the genuinely hard reasoning and **stays in your own (orchestrator) context**: it maps each testable behavior to the cheapest sufficient test layer per platform, **risk-weighted by each behavior's severity** (the impact a defect would carry — read from a bug's Jira severity field or assessed against Bitwarden's severity guide; see the skill's `references/severity-risk.md`), names concrete tooling, and surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, mislabeled layers, ungrounded coverage claims) ordered by severity. Once that mapping is decided, rendering it into the **self-contained HTML report** (`test-stack-report-<slug>-<date>-<HHMMSS>.html` in the current working directory) is mechanical and is delegated to the Sonnet **report-writer subagent** (see _Model selection_) — hand it the decided per-behavior records, each carrying its `source_issue` (key + URL) from intake, and the `#overview` synthesis to lay out; it authors the fragment, linking every Jira item and every Jira-sourced behavior to its browse URL per the template, and runs the build script. Pass today's date to the skill (the clock-and-`HHMMSS` rule is stated in step 3).
-
-### 5. Combine and present
-
-Steps 3 and 4 each emit a self-contained HTML file in the current working directory: the `test-coverage-report-<slug>-<date>-<HHMMSS>.html` (what is already tested) and the `test-stack-report-<slug>-<date>-<HHMMSS>.html` (the recommendation) — the timestamped filenames never collide (step 3).
-
-Then assemble the **combined two-tab page** — the primary deliverable, with _Current coverage_ (the coverage report) and _Recommended coverage_ (the test-stack report) on one page. Run the build script yourself (it is pure file assembly — no template or stylesheet reading, so your context stays lean) with the two filenames the prior steps printed:
-
-```bash
-"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \
-  --kind test-combined --slug <slug> --date <today> \
-  --current <test-coverage-report-…​.html> \
-  --recommended <test-stack-report-…​.html>
-```
-
-This writes `test-combined-report-<slug>-<date>-<HHMMSS>.html`; the two standalone reports are read, not modified, and remain available. Use the exact filenames the build script printed.
-
-Mirror the test-stack report's `#overview` in chat: the recommended shape per platform, the top open risks the user should resolve before committing to the plan, and any coverage the analyst could not verify. Point the user at the **combined page** first (both views in one file), and note the two standalone reports are also available for sharing a single view.
-
-## Principles
-
-- **Evidence over assertion.** Every recommended layer ties back to a specific behavior, requirement, diff hunk, or existing test. Flag anything you could not ground.
-- **Cheapest sufficient layer, inside the repo's shape.** Push confidence down — prefer integration over E2E, unit over integration — unless a behavior genuinely requires the higher layer, then land the call inside the target repo's actual shape (per `monorepo-layout.md` → _Each repo's test shape in practice_, not a single house style).
-- **Risk-weighted by severity.** Coverage rigor scales with the impact a defect would carry, not with how urgently it ships. Critical behaviors (core flows, data integrity, security) owe their failure modes full coverage and lead the gap list; Low behaviors earn minimal coverage and never an E2E test. Severity (impact) ≠ priority (urgency).
-- **Degrade gracefully.** A missing input (no `bitwarden-atlassian-tools` MCP, no PR, no CSV, no `test` repo checkout) narrows the analysis; it never blocks it. State what you could not see.
-- **Read repo config first.** When the analysis touches a checked-out codebase, the coverage scouts read its Claude config (root `CLAUDE.md`, `.claude/`, and nested `CLAUDE.md` for the touched subdirs) before opening test files, and honor its test conventions over generic defaults. Explore test files only as a fallback for conventions the config doesn't cover. See `${CLAUDE_PLUGIN_ROOT}/skills/assessing-test-coverage/references/finding-coverage.md` → _Discovering a repo's test conventions_.
-- **Coverage before recommendation.** Assess what already exists (step 3) before mapping new layers (step 4); the recommendation is incremental against observed coverage, not absolute.
-
-## Model selection
-
-This agent **inherits the session model** for its own context — the orchestration and the hard reasoning run on whatever model the user set the session to. What the plugin governs explicitly is the model of every subagent you fan out, so the cheap, high-volume work never runs at the orchestrator's rate. The split:
-
-- **You (the test-engineer agent) keep the genuinely hard work in your own context** — classifying intake, then mapping behaviors to the cheapest sufficient layer across multiple platforms, risk-weighted by severity. This is cross-repo strategic reasoning where a wrong recommendation is expensive to act on, so it stays with the orchestrator rather than being delegated to a subagent.
-- **Evidence-gathering subagents run on Sonnet or Haiku.** Everything you fan out to gather is evidence that returns a compact digest. Sonnet handles anything that reads a diff, ticket, or repo; Haiku handles pure parsing. Assign the model explicitly on every `Task` (see step 2) rather than letting it inherit the orchestrator's model.
-- **Report rendering runs on Sonnet — the report-writer subagent.** Once the coverage inventory (step 3) and the behavior→layer/severity mapping (step 4) are decided, turning them into HTML is **mechanical formatting, not reasoning**, and is delegated rather than done in your own context. Dispatch a `Task` (model: `sonnet`) report-writer that receives the decided structured records (plus the `#overview` synthesis you wrote), authors the report **content fragment** per the skill's template, and runs `${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh` to splice in the stylesheet and emit the file. The stylesheet itself is a static file the build script inlines — it is never reproduced as model output by anyone, on any model.
-
-Rule of thumb: push the cheap, high-volume gathering **and the mechanical report rendering** down to explicitly-pinned Sonnet/Haiku subagents; keep only the irreducible layer/severity reasoning in the orchestrator context.
-
-## Keep your orchestrator context lean
-
-Your own context is the most expensive token pool in the run — what you read into it and re-emit is re-cached on every subsequent turn. Three rules:
-
-- **Never read the rendering files into your context.** The report templates (`html-report-template.md`, `coverage-report-template.md`, the shared `report-template-common.md`), `report-style-tokens.md`, `report-style.css`, and `build-report.sh` are the **report-writer subagent's** concern only — it reads them. You only need the reasoning references (`testing-trophy.md`, `severity-risk.md`, `monorepo-layout.md`, `input-sources.md`, and `finding-coverage.md` for the contract). Loading the templates or stylesheet into your context is wasted cache. (The combined-page build in step 5 is the one time you _invoke_ `build-report.sh` directly — but you only run it on the two finished report filenames; you still never read its source or the rendering files.)
-- **Don't restate digests.** Subagents return compact digests; synthesize them into the decision, don't echo them back to the user mid-run. Keep inter-step narration to a few lines — the reports are the deliverable, not a running commentary.
-- **Hand off by the smallest payload.** Pass report-writers the compact per-behavior records (now small by design) and the `#overview` text. If a record set is still large, `Write` it to a temp file (e.g. `./.test-engineer-<slug>.json`) and pass the path instead of pasting the blob into the prompt.
diff --git a/plugins/bitwarden-test-engineer/agents/test-strategist.md b/plugins/bitwarden-test-engineer/agents/test-strategist.md
new file mode 100644
index 0000000..c2ad1ed
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/agents/test-strategist.md
@@ -0,0 +1,151 @@
+---
+name: test-strategist
+version: 1.0.0
+description: |
+  Test strategist for Bitwarden — the test-planning role, scoped to exactly the two skills it owns: (1) analyzing-test-stack, which recommends what test automation a change needs and at which layer, and (2) assessing-test-coverage, which inventories what is already tested. It produces a risk-weighted plan and a coverage inventory — it does NOT author, run, or maintain test code (a future SDET role), and does NOT perform exploratory or manual QA (a future QA-engineer role); do not delegate those to it. Takes a change — a feature, bugfix, refactor, or migration — described in plain language or carried in a Jira ticket, a GitHub PR, a Confluence tech breakdown, and/or an exported test-case CSV, and produces an evidence-driven recommendation for the right test automation layers (unit, integration, E2E), shaped to each repo's actual test practice rather than one universal shape, and risk-weighted by each behavior's defect severity (impact, not urgency), across Bitwarden's server, client, and mobile codebases. Use when the user asks what test coverage a change needs, which automation layers to add, how to shape a test plan, whether existing tests are over- or under-weighted, how to prioritize test coverage by risk, what tests a Critical/High bug needs, or what is already tested for a change — or asks for a "test stack" / "test strategy" / "risk-based coverage" / "coverage inventory" analysis for a ticket, PR, tech breakdown, or set of test cases.
+
+  <example>
+  Context: An engineer is about to start a Jira story and wants to know what test automation it should ship with.
+  user: "I'm picking up PM-12345 next sprint. What test coverage should this feature have?"
+  assistant: "I'll use the test-strategist agent to pull the requirements from PM-12345, map the change across the affected codebases, and produce a test-layer recommendation shaped to each affected repo."
+  <commentary>
+  Jira-key intake. The agent gathers the ticket via the Atlassian MCP, then runs Skill(analyzing-test-stack) to produce the report.
+  </commentary>
+  </example>
+
+  <example>
+  Context: A reviewer wants to know whether an open PR is adequately tested at the right layers.
+  user: "Does bitwarden/server#5821 have the right tests, or is it leaning too hard on end-to-end?"
+  assistant: "I'll use the test-strategist agent to read the PR diff and its tests, assess the test shape, and check specifically for an ice-cream-cone (too E2E-heavy) anti-pattern."
+  <commentary>
+  PR intake plus an explicit anti-pattern concern. The agent gathers the diff via gh, then runs Skill(analyzing-test-stack), which assesses the test shape, including the ice-cream-cone check.
+  </commentary>
+  </example>
+
+  <example>
+  Context: A QA engineer exported a set of manual test cases and wants an automation plan.
+  user: "Here's our exported test cases CSV for the billing migration work — which of these should be automated and at what layer?"
+  assistant: "I'll use the test-strategist agent to parse the CSV, bucket the existing cases by test layer, find the gaps, and produce a layer-by-layer automation recommendation."
+  <commentary>
+  CSV intake. The agent parses the export, then runs Skill(analyzing-test-stack) to map cases to layers and surface gaps.
+  </commentary>
+  </example>
+
+  <example>
+  Context: A tech lead just finished a tech breakdown and wants the test plan that should accompany it.
+  user: "I've got the tech breakdown for the new device-approval flow in Confluence — what test coverage should we plan across the stack?"
+  assistant: "I'll use the test-strategist agent to read the breakdown, mine its scope checklist and spec child pages for the surfaces and behaviors it touches, and produce a per-platform test-stack recommendation shaped to each repo."
+  <commentary>
+  Tech-breakdown intake. The agent fetches the Confluence breakdown via the Atlassian MCP, extracts testable behaviors and the affected platforms from Part 2, then runs Skill(analyzing-test-stack) to emit the report.
+  </commentary>
+  </example>
+model: inherit
+tools:
+  - Read
+  - Write
+  - Glob
+  - Grep
+  - Skill
+  - Task
+  - AskUserQuestion
+  - Bash(gh pr view:*)
+  - Bash(gh pr diff:*)
+  - Bash(gh pr checks:*)
+  - Bash(git diff:*)
+  - Bash(git log:*)
+  - Bash(git rev-parse:*)
+  - Bash(git remote get-url:*)
+  - Bash(git -C * rev-parse:*)
+  - Bash(git -C * remote get-url:*)
+  - Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*)
+  - mcp__bitwarden-atlassian__get_issue
+  - mcp__bitwarden-atlassian__search_issues
+  - mcp__bitwarden-atlassian__get_issue_comments
+  - mcp__bitwarden-atlassian__get_issue_remote_links
+  - mcp__bitwarden-atlassian__get_confluence_page
+  - mcp__bitwarden-atlassian__search_confluence
+  - mcp__bitwarden-atlassian__search_confluence_cql
+skills:
+  - assessing-test-coverage
+  - analyzing-test-stack
+color: green
+---
+
+You are the **test strategist** for Bitwarden — the test-planning role. Your job: take a change — a feature, bugfix, refactor, or migration — and say **what to test, at which layer, and why**. You recommend the plan and inventory existing coverage; you do not author, run, or maintain the tests, nor run exploratory/manual QA — those are separate roles this plugin may grow into later.
+
+You produce a recommendation — an HTML report — not the tests themselves. Ground every layer call in evidence; a test plan drifts toward whatever is easiest to write rather than what buys confidence, so keep each repo's shape honest.
+
+## Operating context
+
+A single feature frequently spans several repos (a server endpoint + a web client + a mobile screen), each shaped independently — match the recommendation to each repo's actual practice, not a house style. **Unit and integration live alongside the code in each platform repo; E2E lives in the dedicated `test` repo** (a sibling of the platform repos). The per-platform stack and the layer→repo map are in `${CLAUDE_PLUGIN_ROOT}/skills/analyzing-test-stack/references/monorepo-layout.md`.
+
+Atlassian capabilities depend on the **`bitwarden-atlassian-tools`** plugin (the `mcp__bitwarden-atlassian__*` server). If it is absent and the user references a Jira issue or Confluence breakdown, don't fail — say the MCP is unavailable and ask the user to paste the requirements, or proceed from the PR / CSV / description provided.
+
+## Workflow
+
+Classify what the request needs and dispatch to the matching skill(s) — each skill runs standalone:
+
+- _"What's already tested for this PR?"_ → `Skill(assessing-test-coverage)` alone.
+- _"What layers should this change ship with?"_ → `Skill(analyzing-test-stack)` (it pulls its own coverage inventory if none is supplied).
+- A full test plan / test-stack analysis → the **coverage → recommendation pipeline** below, run in sequence (the coverage inventory feeds the recommendation).
+
+The steps below specify that pipeline end to end.
+
+### 1. Intake and scope
+
+Classify every input supplied — Jira key, GitHub PR, Confluence tech breakdown (page ID/URL or feature/team name), CSV path, plain-language description. Inputs are additive; handle any combination. Per-source ingestion (Epic expansion, breakdown mining, CSV column mapping) lives in `${CLAUDE_PLUGIN_ROOT}/references/input-sources.md` — don't re-derive it. Then determine the **affected repos/platforms**: if scope is genuinely ambiguous and it changes the recommendation, use `AskUserQuestion`; otherwise infer and state your assumption.
+
+### 2. Fan out to gather evidence
+
+Spawn `Task` subagents **in parallel**, one per evidence source or affected repo, so your context stays lean. Each returns a compact structured digest, not raw dumps:
+
+- **Requirements reader** (`sonnet`) — resolves the Jira issue into testable behaviors and acceptance criteria, expanding Epics/Features to their children, feeding linked PR URLs to the PR analyzer, and capturing the bug **severity** and each behavior's **source issue key + browse URL**. Follows `${CLAUDE_PLUGIN_ROOT}/references/input-sources.md` → _Epic intake_ and _Citing Jira issues as links_.
+- **Breakdown reader** (`sonnet`) — fetches the tech breakdown, mines Part 2's scope checklist for surfaces, Part 4 spec pages for interfaces, and Part 5 open questions for untestable-requirement risk. Returns testable behaviors per platform plus the breakdown's status.
+- **PR diff analyzer** (`sonnet`) — `gh pr diff` / `gh pr view` for the change surface, public API touched, and tests already present.
+- **CSV parser** (`haiku`) — buckets existing cases by apparent layer and automation status.
+
+Give each subagent one source and a tight output contract; skip any branch whose input wasn't supplied. **Set each subagent's model explicitly** (see _Model selection and context discipline_) — never let a digest-returning subagent inherit your model.
+
+### 3. Assess existing coverage
+
+Once the change surface is known (step 2), determine what is **already tested** before recommending anything. Fan out a **per-repo coverage scout** (`sonnet`) per affected repo, each applying the `assessing-test-coverage` skill — the record shape, discovery rules, per-behavior discipline, and permalink recipe live in `${CLAUDE_PLUGIN_ROOT}/skills/assessing-test-coverage/references/finding-coverage.md`; scouts follow it. Each returns one record per behavior plus `unverified` gaps. Merge the scouts' records into one inventory.
+
+Then invoke `Skill(assessing-test-coverage)` with the merged inventory and today's date to produce the coverage inventory and the **self-contained HTML coverage report**. Per the skill, the HTML _rendering_ is delegated to the Sonnet **report-writer subagent** — only the gathering and merge happen in your context. Skills can't read the clock; pass today's date, and the build script writes the report into the per-change `test-engineer-report-<slug>-<date>/` directory.
+
+### 4. Recommend
+
+Invoke `Skill(analyzing-test-stack)` with the digests **and the coverage inventory from step 3**. The behavior→layer mapping is the genuinely hard reasoning and **stays in your context** — map each behavior to the cheapest sufficient layer per platform, risk-weighted by severity, and surface gaps and shape-wrong tests (ice-cream-cone, mislabeled layers, ungrounded coverage claims) ordered by severity; the skill and its `references/` own how. Once the mapping is decided, rendering it to the **self-contained HTML report** is mechanical and is delegated to the Sonnet **report-writer subagent** — hand it the decided per-behavior records (each carrying its `source_issue` from intake) and your `#overview` synthesis.
+
+### 5. Combine and present
+
+Steps 3 and 4 each write their report into the per-change directory `test-engineer-report-<slug>-<date>/` — `coverage.html` and `recommended.html`. Assemble the **combined two-tab page** — the primary deliverable, _Current coverage_ + _Recommended coverage_ on one page — yourself with the build script (pure file assembly, no template or stylesheet reading, so your context stays lean):
+
+```bash
+"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \
+  --kind test-combined --slug <slug> --date <today> \
+  --current test-engineer-report-<slug>-<date>/coverage.html \
+  --recommended test-engineer-report-<slug>-<date>/recommended.html
+```
+
+The paths are deterministic under the per-change directory (and the prior steps print them); the two standalone reports are read, not modified, and `combined.html` lands beside them. Then mirror the test-stack report's `#overview` in chat — recommended shape per platform, the top open risks to resolve before committing to the plan, and any coverage that couldn't be verified — and point the user at `test-engineer-report-<slug>-<date>/combined.html` first (both standalone reports remain available for sharing a single view).
+
+## Principles
+
+These govern the orchestration; the per-skill principles live in the two skills.
+
+- **Coverage before recommendation.** Assess what exists (step 3) before mapping new layers (step 4); the recommendation is incremental against observed coverage, not absolute.
+- **Degrade gracefully.** A missing input (no MCP, no PR, no CSV, no `test` checkout) narrows the analysis; it never blocks it. State what you couldn't see.
+
+## Model selection and context discipline
+
+You **inherit the session model** for your own context — the orchestration and the hard behavior→layer/severity reasoning, where a wrong call is expensive to act on, stay with you. Everything you fan out is evidence-gathering or mechanical rendering and runs on an **explicitly pinned** cheaper model — never inherit:
+
+- **Evidence subagents** (step 2) — `sonnet` for anything reading a diff, ticket, or repo; `haiku` for pure CSV parsing.
+- **Coverage scouts** (step 3) — `sonnet`.
+- **Report-writer** — `sonnet`. Once the inventory (step 3) and the mapping (step 4) are decided, rendering to HTML is mechanical: the report-writer authors the content fragment per the skill's template and runs `build-report.sh` to splice in the stylesheet.
+
+Keep your own context lean — it is the most expensive token pool and is re-cached every turn:
+
+- **Never read the rendering files** (`html-report-template.md`, `coverage-report-template.md`, `report-template-common.md`, `report-style-tokens.md`, `report-style.css`, `build-report.sh`) — they are the report-writer's concern. You need only the reasoning references (`test-layers.md`, `severity-risk.md`, `monorepo-layout.md`, `input-sources.md`, and `finding-coverage.md` for the contract). The step-5 combined build is the one time you _invoke_ `build-report.sh` — on the two finished filenames; you still never read its source.
+- **Don't echo digests.** Synthesize subagent digests into the decision; keep inter-step narration to a few lines. The reports are the deliverable.
+- **Hand off by the smallest payload.** Pass report-writers the compact per-behavior records and the `#overview` text; if a record set is large, `Write` it to a temp file (e.g. `./.test-engineer-<slug>.json`) and pass the path.
diff --git a/plugins/bitwarden-test-engineer/references/input-sources.md b/plugins/bitwarden-test-engineer/references/input-sources.md
index 33bc24d..224525d 100644
--- a/plugins/bitwarden-test-engineer/references/input-sources.md
+++ b/plugins/bitwarden-test-engineer/references/input-sources.md
@@ -20,19 +20,13 @@ Otherwise use the MCP tools directly:
 Extract: discrete **testable behaviors**, **acceptance criteria**, and the **platforms/
 components** named. If the MCP is unavailable, ask the user to paste the requirements.
 
-Also capture, for every issue you read, its **key and browse URL** (prefer the URL the MCP/skill
-returns; otherwise construct `https://bitwarden.atlassian.net/browse/<KEY>`), and **carry the
-originating issue key with each behavior you extract**. The report links every behavior back to
-the Jira item it came from — see _Citing Jira issues as links_ below — so provenance must survive
-intake. A behavior that traces to no Jira item (e.g. found only in a PR diff) simply carries no
-source issue.
-
-Also capture **severity** — for a bug/defect ticket, read the severity assigned on the issue
-(the severity field, or the QA/reporter's stated severity in the description/comments) and
-carry it with the behaviors; for a feature/story without a defect, leave it to the analyst to
-assess each behavior's risk severity. Severity is the impact dial the `analyzing-test-stack`
-skill uses to risk-weight coverage — see that skill's `references/severity-risk.md`, mirrored
-from the Defect Severity Classification Guide (Confluence page `2759229512`).
+For every issue, also capture its **key and browse URL** and **carry the originating key with each
+behavior you extract**, so the report can link every behavior back to its source — link form and the
+no-Jira-source case are in _Citing Jira issues as links_ below.
+
+Also capture each behavior's **severity** and carry it through with the behavior. Where it comes
+from (a bug's Jira severity field vs. assessed risk for a feature) and how it weights coverage are
+owned by `analyzing-test-stack`'s `references/severity-risk.md`.
 
 ### Epic intake
 
@@ -52,29 +46,21 @@ before extracting:
    `bitwarden-atlassian-tools:researching-jira-issues` (Steps 2–3) — re-use that recipe; do
    not re-derive it.
 3. **Per child, gather behaviors and PRs.**
-   - `mcp__bitwarden-atlassian__get_issue` for the child's description and acceptance
-     criteria — these are the testable behaviors for the analysis. Capture each child's **key and
-     browse URL** and carry it with the behaviors it produces, exactly as for a single-issue
-     intake — a behavior sourced from a child issue links to that child, not the epic.
-   - `mcp__bitwarden-atlassian__get_issue_remote_links` for PRs (grouped under "GitHub").
-     Each PR URL becomes an input to the **GitHub PR** branch below: hand it off to
-     `gh pr view` / `gh pr diff` so the actual change surface and any tests-in-PR feed the
-     recommendation. **These merged/linked PRs are the reliable backbone for existing
-     coverage** — the tests they contain are what shipped with this work, and the PR head SHA
-     makes each one permalink-ready (see the `assessing-test-coverage` skill's
-     `references/finding-coverage.md` → _Finding existing coverage_).
-     If `gh` cannot reach a PR (private fork, not authenticated, repo not accessible), record
-     the URL as evidence-not-inspected in the report rather than silently dropping it.
-4. **Track epic status.** The epic's own status (`In Planning`, `In Progress`, `Done`) tells
-   you how much of the work is shipped: children in `Done` with merged PRs likely already
-   have tests-in-PR you can audit for shape; children still `To Do` are scope-only and your
-   recommendation is necessarily prospective. Surface this distinction in the Evidence
-   section of the report.
-5. **Preferred path when available.** If `bitwarden-atlassian-tools` is installed, invoke
-   `Skill(bitwarden-atlassian-tools:researching-jira-issues)` on the epic key — its Step 2
-   already does the hierarchical-link discovery and Step 3 the depth-controlled traversal,
-   and returns the children + linked Confluence pages + remote links in one synthesized read.
-   Use the direct MCP calls above only when that skill is unavailable.
+   - `mcp__bitwarden-atlassian__get_issue` for the child's description and acceptance criteria —
+     these are the testable behaviors. Carry each child's **key and browse URL** with the behaviors
+     it produces — a behavior sourced from a child links to that child, not the epic.
+   - `mcp__bitwarden-atlassian__get_issue_remote_links` for PRs (grouped under "GitHub"). Each PR URL
+     feeds the **GitHub PR** branch below (`gh pr view` / `gh pr diff`). **These merged/linked PRs
+     are the reliable backbone for existing coverage** — they carry the tests that shipped and the
+     PR head SHA makes each permalink-ready (see `assessing-test-coverage`'s `finding-coverage.md`
+     → _Finding existing coverage_). If `gh` cannot reach a PR (private fork, not authenticated, repo
+     inaccessible), record the URL as evidence-not-inspected rather than dropping it.
+4. **Track epic status.** The epic's status (`In Planning`/`In Progress`/`Done`) tells you how much
+   is shipped: `Done` children with merged PRs likely have tests-in-PR to audit; `To Do` children
+   are scope-only and the recommendation is prospective. Surface this in the report's Evidence.
+5. **Preferred path.** The `researching-jira-issues` skill (preferred at the top of this file) does
+   this hierarchical discovery and depth-controlled traversal in one synthesized read — run it on the
+   epic key; the direct MCP calls above are the fallback.
 
 ## GitHub PR
 
@@ -152,7 +138,7 @@ settings — **do not hardcode them**. Read the header row, then map by meaning:
 - A **steps / expected-result** column, often in Given–When–Then form — the behavior.
 - Optional **team / area / tags / preconditions** columns — scope and grouping.
 
-Map rows to behaviors and bucket each by apparent layer using the `analyzing-test-stack` skill's `references/testing-trophy.md`:
+Map rows to behaviors and bucket each by apparent layer using the `analyzing-test-stack` skill's `references/test-layers.md`:
 
 - A case that drives the full UI through a complete journey → likely **E2E** (target the
   dedicated `test` repo).
@@ -166,30 +152,19 @@ state the interpretation you used rather than guessing silently.
 
 ## Citing Jira issues as links
 
-Every Jira item the report **names** — and every behavior the report shows that was **found from
-a Jira item** — is rendered as a clickable link to that item, never as bare key text. This is the
-Jira counterpart to the GitHub permalink rule for tests (the `assessing-test-coverage` skill's
-`references/finding-coverage.md` → _Citing tests as GitHub permalinks_).
-
-The link form is the issue's browse URL:
-
-```
-https://bitwarden.atlassian.net/browse/<KEY>
-```
-
-where `<KEY>` is the issue key (e.g. `PM-1234`). Prefer the URL the MCP tool or
-`bitwarden-atlassian-tools:researching-jira-issues` skill returns for the issue; fall back to
-constructing the browse URL from the key. The same rule covers epics and their children — link
-each to its own key.
+Every Jira item the report **names**, and every behavior **found from a Jira item**, is rendered as
+a clickable link — never bare key text. This is the Jira counterpart to the GitHub permalink rule
+for tests (`assessing-test-coverage`'s `finding-coverage.md` → _Citing tests as GitHub permalinks_).
 
-Apply it everywhere the report renders one of these:
+The link form is the issue's browse URL `https://bitwarden.atlassian.net/browse/<KEY>` (`<KEY>` is
+the issue key, e.g. `PM-1234`). Prefer the URL the MCP tool or `researching-jira-issues` skill
+returns; else construct it from the key. Epics and their children likewise — link each to its own key. Apply it:
 
-- An **issue, epic, or child key** named in the Overview, Summary, or Evidence sections —
-  anchor the key: `<a href="https://bitwarden.atlassian.net/browse/PM-1234">PM-1234</a>`.
-- A **behavior row** (in the recommendations/coverage and gaps sections) whose behavior was
-  extracted from a Jira item — append the linked source key to the behavior cell so a reader can
-  jump to the requirement it came from. A behavior with no Jira source (PR-only) carries no key.
+- An **issue, epic, or child key** named in Overview/Summary/Evidence — anchor the key:
+  `<a href="https://bitwarden.atlassian.net/browse/PM-1234">PM-1234</a>`.
+- A **behavior row** (recommendations/coverage/gaps) extracted from a Jira item — append the linked
+  source key to the behavior cell. A behavior with no Jira source (PR-only) carries none.
 
-These are informational `<a href>` citations — text, not loaded assets — so they do not violate
-the reports' self-contained / no-remote-resources constraint. Never fabricate a key or URL; if an
-issue's key is unknown, name the source in plain text rather than inventing a link.
+These are informational `<a href>` citations (text, not loaded assets), so they don't violate the
+self-contained constraint. Never fabricate a key or URL — if a key is unknown, name the source in
+plain text rather than inventing a link.
diff --git a/plugins/bitwarden-test-engineer/references/report-style-tokens.md b/plugins/bitwarden-test-engineer/references/report-style-tokens.md
index 117303e..013c469 100644
--- a/plugins/bitwarden-test-engineer/references/report-style-tokens.md
+++ b/plugins/bitwarden-test-engineer/references/report-style-tokens.md
@@ -1,48 +1,39 @@
 # Report style tokens — data-report visual system for HTML reports
 
-This file documents the **visual system** for every self-contained HTML report the
-`bitwarden-test-engineer` plugin emits — the `analyzing-test-stack` test-stack report and the
-`assessing-test-coverage` coverage report alike. The HTML output requirements (single file,
-inline CSS, no external/CDN assets, no web fonts, no JS) mean a report cannot `<link>` to a
-design system at runtime — the stylesheet must be inlined into the report's `<style>` element.
-
-**You do not retype the stylesheet.** The canonical CSS lives as a real file at
-`report-style.css` (alongside this file in the plugin-level `references/` directory) and is
-spliced into the report by the `scripts/build-report.sh` build script — never reproduced as
-model output. Authoring a
-report means writing its **content** (the sections below) into a fragment whose `<style>`
-element holds a single sentinel line, then running the build script, which substitutes
-`report-style.css` for the sentinel verbatim. See _Building the report_ below. This is what
-keeps the two reports on one identical system: they splice the same file, so they cannot drift,
-and the ~400-line stylesheet costs zero output tokens per report.
-
-The look is deliberately **not** a brand skin. It is a quiet, ink-on-paper _data report_
-— the aesthetic of a statistical notebook or a coverage readout, where the data is the
-hero and nothing decorates. Every report ships the same system so two reports read as the
-same instrument. Do not re-pick colors, fonts, or layer tokens per report.
+The **visual system** for every self-contained HTML report the `bitwarden-test-engineer` plugin
+emits — the `analyzing-test-stack` test-stack report and the `assessing-test-coverage` coverage
+report alike. Because the output is a single file with no external assets, the stylesheet is
+inlined; both reports splice the **same** canonical CSS so they read as one instrument and cannot
+drift.
+
+**You never retype, prune, or hand-edit the stylesheet.** It lives as a real file at
+`report-style.css` (alongside this file) and is spliced into the report by `scripts/build-report.sh`
+— never reproduced as model output. Authoring a report means writing its **content** (the sections
+below) into a fragment whose `<style>` holds a single sentinel line, then running the build script
+(see _Building the report_). If the visual system genuinely needs to change, edit `report-style.css`
+once and every future report inherits it.
+
+The look is deliberately **not** a brand skin — a quiet, ink-on-paper _data report_ where the data
+is the hero and nothing decorates: flat white page, hairline rules, no cards/shadows/rounded panels.
 
 ## Design intent (why these choices)
 
-- **Flat paper, no chrome.** White page, hairline rules, no cards, no shadows, no
-  rounded panels. Sections are separated by a single rule and whitespace. Simple and
-  low-key by construction.
-- **Monospace is a structural role, not just for code.** Section numbers, eyebrows,
-  table headers, layer/badge chips, axis labels, counts, and SHAs are all set in
-  the system monospace stack. Prose is set in the system sans stack. The split makes
-  "data" and "argument" visually distinct and gives the report its notebook character
-  without any web font.
-- **The layer ramp is sequential, because the layers are ordered.** unit → integration
-  → e2e is a cost/depth sequence (cheapest/shallowest → most expensive/deepest). A
-  single-hue light→dark ramp encodes that order honestly; a thin dark sliver therefore
-  reads as "expensive, used sparingly." Do not swap it for unrelated categorical hues.
-- **State colors are categorical and muted.** The assumption/warn/ok badges each carry
-  exactly one meaning. Muted traffic colors, not saturated brand colors.
+- **Monospace is a structural role.** Section numbers, eyebrows, table headers, layer/badge chips,
+  axis labels, counts, and SHAs are set in the system mono stack; prose in the system sans stack.
+  The split makes "data" and "argument" visually distinct without any web font.
+- **The layer ramp is sequential, because the layers are ordered.** unit → integration → e2e is a
+  cost/depth sequence (cheapest/shallowest → most expensive/deepest); a single-hue light→dark ramp
+  encodes that order, so a thin dark sliver reads as "expensive, used sparingly." Do not swap it for
+  unrelated categorical hues.
+- **State colors are categorical and muted.** The assumption/warn/ok badges each carry exactly one
+  meaning — muted traffic colors, not saturated brand colors.
 
 ## Token → meaning mapping (binding)
 
-These mappings are **normative**. Do not re-pick colors per report.
+These mappings are **normative**. Do not re-pick colors per report. Your markup must use exactly
+these class names; the spliced stylesheet styles them.
 
-### Layer tokens (used wherever a test layer is rendered — chips, distribution bars, table cells)
+### Layer tokens (chips, distribution bars, table cells)
 
 | Layer       | Token           | HEX       | Role in the ramp                 |
 | ----------- | --------------- | --------- | -------------------------------- |
@@ -50,8 +41,8 @@ These mappings are **normative**. Do not re-pick colors per report.
 | integration | `--integration` | `#3F7196` | mid — the confidence layer       |
 | e2e         | `--e2e`         | `#1D3A54` | deepest — most expensive, thin   |
 
-`unit` is light, so layer chips and bar segments at the unit layer use **dark** text
-(`--on-unit`); integration and e2e use **white** text (`--on-deep`).
+`unit` is light, so its chips and bar segments use **dark** text (`--on-unit`); integration and e2e
+use **white** text (`--on-deep`).
 
 ### Badge / state tokens
 
@@ -61,8 +52,7 @@ These mappings are **normative**. Do not re-pick colors per report.
 | warn       | `--bad`  | Risks, missing-input flags, unverifiable claims |
 | ok         | `--ok`   | Confirmed coverage, grounded calls              |
 
-All badge chips use white (`--on-state`) text on these muted fills — the one
-contrast tradeoff in the system, kept legible by bold mono chip text at small sizes.
+All badge chips use white (`--on-state`) text on these muted fills.
 
 ### Surface, ink, and structural tokens
 
@@ -76,73 +66,27 @@ contrast tradeoff in the system, kept legible by bold mono chip text at small si
 | `--rule`      | `#E4E7EA` | Hairlines, dividers, table row borders        |
 | `--link`      | `#2F6E9E` | Links                                         |
 
-## Typography
-
-System fonts only — **no web fonts, no `@font-face`, no CDN imports**. Two roles, mapped
-to two stacks via `--sans` (prose) and `--mono` (data, labels, chrome):
-
-```
---sans: system-ui, -apple-system, "Segoe UI", Roboto, Helvetica, Arial, sans-serif
---mono: ui-monospace, "SF Mono", SFMono-Regular, Menlo, Consolas, "Liberation Mono", monospace
-```
+Typography is system fonts only — **no web fonts, no `@font-face`, no CDN imports** — split across
+`--sans` (prose) and `--mono` (data, labels, chrome).
 
 ## Graphics — the layer-distribution chart
 
-The one graphic the report needs is the **recommended layer distribution per platform**,
-rendered as a normalized horizontal **stacked bar** (a `<figure>` captioned `Fig 1`):
-
-- One `.dist-row` per platform: a right-aligned `.dist-label` (the platform) and a
-  `.bar` track holding one `.seg` per layer present.
-- **Segment width is proportional to the recommended test count at that layer** — set it
-  with an inline `style="flex: <count>"`. The flex values are the raw counts; the browser
-  normalizes them to fill the track. Do not hand-compute percentages or pixel widths.
-- Each segment shows its **count** as a monospace label inside it; the shared `.legend`
-  above maps color → layer. A `figcaption` names the figure. The unit segment carries
-  **dark** text (`--on-unit`) like the unit chip; integration and e2e segments carry
-  white (`--on-deep`).
-
-This replaces any arbitrary fixed-width bar. The chart is the report's signature: keep
-everything around it quiet so it reads.
-
-## Combined report — tabs (assembled, not authored)
-
-When both reports are produced for the same change, the build script can assemble them into
-**one page with two tabs** — _Current coverage_ (the `assessing-test-coverage` report) and
-_Recommended coverage_ (the `analyzing-test-stack` report). This is purely a **presentation**
-merge: each skill still authors and builds its own standalone report exactly as before; the
-combined page is an _additional_ deliverable stitched from the two finished report files. No
-skill or template knows about tabs — the tab markup and its CSS are owned entirely by
-`build-report.sh` and `report-style.css`, so the per-skill split stays intact.
-
-You never hand-write the tab markup. The build script reuses each report's `<header>`/`<main>`,
-namespaces the normative section ids so the two bodies coexist in one document
-(`#overview` → `#cur-overview` / `#rec-overview`, and likewise for the in-page anchor links),
-and emits the tab chrome. The mechanism is **CSS-only** (no JavaScript): two visually-hidden
-radio inputs (`.tab-input#tab-current` / `#tab-recommended`) drive the active `.tablist label`
-and which `.tabpanel[data-panel]` shows, via general-sibling selectors. On print, the tabs
-collapse and both panels stack, each titled by its `aria-label`, so a shared PDF carries the
-whole analysis. These classes live in the stylesheet's _Tabbed combined report_ block and are
-inert in the standalone reports, which never emit them.
-
-## The stylesheet file (binding contract)
-
-The full stylesheet is `report-style.css` (alongside this file). It is the single source of styling truth —
-**both** report templates resolve to it through the build script, so the coverage report
-(`assessing-test-coverage`'s `coverage-report-template.md`) and the test-stack report
-(`analyzing-test-stack`'s `html-report-template.md`) carry byte-identical CSS. Component
-classes (`.layer.*`, `.badge.*`, `.dist`/`.seg.*`, `.shapes`, `.unlinkable`, `.toc`, etc.) are
-part of the binding contract — both templates reference them by name; the markup you author must
-use exactly those class names so the spliced stylesheet styles it. Each report opens its
-`<main>` with a `.toc` nav of linked section anchors; in the combined report the build
-script namespaces those anchor links per tab.
-
-You never read, reproduce, prune, or hand-edit `report-style.css` when authoring a report —
-the build script inlines it whole. If the visual system genuinely needs to change, edit
-`report-style.css` once and every future report inherits it.
+The report's signature graphic: the layer distribution per platform, rendered as a normalized
+horizontal **stacked bar** (a `<figure>` captioned `Fig 1`).
+
+- One `.dist-row` per platform: a right-aligned `.dist-label` and a `.bar` track holding one `.seg`
+  per layer present.
+- **Segment width is proportional to the test count at that layer** — set it with inline
+  `style="flex: <count>"` (raw counts; the browser normalizes them). Never hand-compute percentages
+  or pixel widths.
+- Each segment shows its **count** as a monospace label; the shared `.legend` above maps color →
+  layer; a `figcaption` names the figure. The unit segment carries dark text (`--on-unit`),
+  integration and e2e white (`--on-deep`).
 
 ## Building the report
 
-The model authors a **content fragment** — a complete HTML document whose `<style>` element
+This section is the **single source of truth** for the build invocation; the templates only name
+their `--kind`. The model authors a **content fragment** — a complete HTML document whose `<style>`
 contains exactly one line, the sentinel:
 
 ```html
@@ -151,8 +95,8 @@ contains exactly one line, the sentinel:
 </style>
 ```
 
-Write that fragment to a temporary path (e.g. `<kind>-report-<slug>.fragment.html`), then run
-the build script from the plugin root:
+Write that fragment to a temporary path (e.g. `<kind>-report-<slug>.fragment.html`), then run the
+build script from the plugin root:
 
 ```bash
 "${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \
@@ -160,39 +104,28 @@ the build script from the plugin root:
   <fragment-file>
 ```
 
-The script replaces the sentinel with `report-style.css` verbatim and writes
-`<kind>-report-<slug>-<date>-<HHMMSS>.html` to the current working directory, printing the
-final filename to stdout. The `<HHMMSS>` suffix is stamped from the wall clock by the script
-(the model cannot read the clock), so **every run gets a fresh filename** — a report is never
-overwritten, and an existing report never has to be read back and regenerated. Delete the
-temporary fragment afterward. If the script errors (missing sentinel, bad `--kind`/`--date`,
-fragment not found) it writes nothing — fix the fragment and re-run rather than falling back to
-pasting CSS by hand.
-
-To assemble the **combined two-tab page** from the two already-built standalone reports, call
-the script with `--kind test-combined` and the two finished report files (no fragment, no
-sentinel — the bodies are reused as-is):
-
-```bash
-"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \
-  --kind test-combined --slug <slug> --date <YYYY-MM-DD> \
-  --current <test-coverage-report-…​.html> \
-  --recommended <test-stack-report-…​.html>
-```
-
-It writes `test-combined-report-<slug>-<date>-<HHMMSS>.html` and prints the filename. The two
-input reports are read, not modified, and their standalone files remain.
+It replaces the sentinel with `report-style.css` verbatim and writes the report into a per-change
+directory `test-engineer-report-<slug>-<date>/` (created if needed) — the coverage report as
+`coverage.html`, the test-stack report as `recommended.html` — then prints the final path. The
+directory name derives only from `--slug`/`--date`, so a run's reports share one folder;
+**re-running the same change on the same date overwrites the prior report in place**. Delete the temporary
+fragment afterward. If the script errors (missing sentinel, bad `--kind`/`--date`, fragment not
+found) it writes nothing — fix the fragment and re-run rather than pasting CSS by hand.
+
+**Combined two-tab page (assembled, not authored).** When both reports exist for one change, the
+build script can stitch them into one page with two CSS-only tabs — _Current coverage_ and
+_Recommended coverage_. This is a presentation-only merge from the two finished report files: no
+skill or template knows about tabs, and the agent (not the report author) runs it with
+`--kind test-combined --current test-engineer-report-<slug>-<date>/coverage.html --recommended test-engineer-report-<slug>-<date>/recommended.html`,
+which writes `combined.html` into that same directory. The tab chrome lives entirely in the build
+script and `report-style.css`.
 
 ## What not to do
 
-- Do not reintroduce a brand skin — no saturated brand blue/yellow, no logo images, no
-  `<link>` to a design system. The report is intentionally off-brand and self-contained.
-- Do not swap the sequential layer ramp for unrelated categorical hues; the order is the
-  encoding.
-- Do not introduce web fonts, CDN links, or `<link rel="stylesheet">` — the single-file
-  constraint is binding.
-- Do not paste, retype, or trim the stylesheet into the fragment — the fragment carries only
-  the sentinel, and the build script supplies the full stylesheet. A report that ships a
-  hand-copied or "only the classes I used" stylesheet is exactly how two reports drift apart.
-- Do not hand-compute the distribution bar widths in pixels or percentages — set
-  `flex: <count>` per segment and let the browser normalize.
+- Do not reintroduce a brand skin — no saturated brand colors, no logo images, no `<link>` to a
+  design system. The report is intentionally off-brand and self-contained.
+- Do not swap the sequential layer ramp for unrelated categorical hues; the order is the encoding.
+- Do not paste, retype, or trim the stylesheet into the fragment — the fragment carries only the
+  sentinel. A report that ships a hand-copied or "only the classes I used" stylesheet is exactly how
+  two reports drift apart.
+- Do not hand-compute distribution bar widths — set `flex: <count>` per segment.
diff --git a/plugins/bitwarden-test-engineer/references/report-template-common.md b/plugins/bitwarden-test-engineer/references/report-template-common.md
index 3eac48b..05877a9 100644
--- a/plugins/bitwarden-test-engineer/references/report-template-common.md
+++ b/plugins/bitwarden-test-engineer/references/report-template-common.md
@@ -1,106 +1,87 @@
 # Report HTML — shared authoring contract
 
 Both self-contained HTML reports the `bitwarden-test-engineer` plugin emits — the
-`analyzing-test-stack` **test-stack report** and the `assessing-test-coverage` **coverage
-report** — are authored against this shared contract, so the two read as one instrument. Each
-skill's own template (`html-report-template.md` / `coverage-report-template.md`) covers only what
-differs: its section set, its per-platform table columns, and its recommend-vs-inventory framing.
-**Read this file first, then that template.**
+`analyzing-test-stack` **test-stack report** and the `assessing-test-coverage` **coverage report** —
+are authored against this shared contract, so the two read as one instrument. Each skill's own
+template (`html-report-template.md` / `coverage-report-template.md`) covers only what differs: its
+section set, its per-platform table columns, and its recommend-vs-inventory framing. **Read this
+file first, then that template.**
 
 ## Output constraints
 
-Produce a **single self-contained HTML file**: all CSS inline in a `<style>` block, no
-external/CDN _resource_ links (stylesheets, fonts, scripts, images), no required JavaScript, no
-web fonts. Informational `<a href>` citations to public sources are fine and encouraged — they
-are text, not loaded assets (see _Content rules_). It must render correctly opened directly from
-disk and survive being attached to a ticket or PR.
-
-You do not write the final file directly and you do not paste any CSS. Author a **content
-fragment** (the full HTML document below, but with only a stylesheet sentinel inside `<style>`),
-then run the build script. The build mechanics — invocation, output filename, and the `HHMMSS`
-freshness stamp — live in `report-style-tokens.md` → _Building the report_ (the single source of
-truth); your template only names its `--kind`.
-
-## Styling — binding
-
-Inside the fragment's `<style>` element put exactly one line — the sentinel
-`/* @@BITWARDEN_REPORT_STYLESHEET@@ */` — and nothing else; the build script splices in the
-canonical stylesheet (`report-style.css`) verbatim, identically for both reports so they cannot
-drift. **Do not paste, retype, or trim any CSS, re-pick colors/fonts/layer tokens, reintroduce a
-brand skin, or add external/CDN assets** — the visual system and the full list of prohibitions are
-owned by `report-style-tokens.md` (its _token → meaning_ contract and _What not to do_); read it.
-Your markup must use the exact normative class names it defines: the layer → token mapping (unit /
-integration / e2e) and the badge → token mapping (assumption / warn / ok) apply wherever rendered —
-chips, distribution bars, table cells, and data rows.
-
-Section headings are auto-numbered by CSS (`01 · …`) — write a plain `<h2>` per section and do
-not hand-number. Wrap each wide table in `<div class="scroll">…</div>` so it scrolls rather than
+Produce a **single self-contained HTML file**: all CSS inline in `<style>`, no external/CDN
+_resource_ links (stylesheets, fonts, scripts, images), no required JavaScript, no web fonts. It
+must render correctly opened directly from disk and survive being attached to a ticket or PR.
+Informational `<a href>` citations to public sources are text, not loaded assets — they are fine and
+encouraged (see _Content rules_).
+
+You do not write the final file or paste any CSS: author a **content fragment** (the skeleton below,
+with only the stylesheet sentinel inside `<style>`), then run the build script. The fragment/sentinel
+mechanics, the build invocation, the normative class names (the layer and assumption/warn/ok tokens
+your markup must use), and the visual system are all owned by `report-style-tokens.md` — **read it.**
+Your template only names its `--kind`.
+
+Section headings are auto-numbered by CSS (`01 · …`) — write a plain `<h2>` per section, do not
+hand-number. Wrap each wide table in `<div class="scroll">…</div>` so it scrolls rather than
 overflows on narrow widths.
 
 ## Table of contents
 
-Directly **inside `<main>`, before `#overview`**, emit a linked table of contents:
-`<nav class="toc" aria-label="Sections">` holding one `<a href="#…">` per section in the report
-(your template lists them), each anchoring its section id. It is a `<nav>`, not a numbered
-section. (In the combined two-tab report the build script namespaces these anchor links per tab,
-so a panel's ToC jumps within its own panel.)
+Directly **inside `<main>`, before `#overview`**, emit `<nav class="toc" aria-label="Sections">`
+holding one `<a href="#…">` per section in the report (your template lists them). It is a `<nav>`,
+not a numbered section. (In the combined two-tab report the build script namespaces these anchors per
+tab so a panel's ToC jumps within its own panel.)
 
 ## Sections common to both reports
 
-Each section uses its **normative `id`** — do not rename, omit, or add top-level sections;
-readers look these up by id. The four below are shared; your template defines the report-specific
-data section (`#recommendations` or `#coverage`) and the `#gaps` contents, and adds framing notes
-for the shared ones (e.g. whether the chart shows recommended or observed counts).
+Each section uses its **normative `id`** — do not rename, omit, or add top-level sections; readers
+look these up by id. The four below are shared; your template defines the report-specific data
+section (`#recommendations` or `#coverage`) and the `#gaps` contents, and adds framing notes for the
+shared ones (e.g. whether the chart shows recommended or observed counts).
 
-1. **Header** (no id; `<header>` element) — report title, the change under analysis
-   (ticket/PR/feature), and the date.
+1. **Header** (no id; `<header>` element) — report title, the change under analysis
+   (ticket/PR/feature), and the date.
 2. **`#overview`** — a short top-of-report synthesis written by the author so a reader sees the
    bottom line without scrolling: a 2–4 sentence recap per platform, the top 3 items the reader
    should resolve (drawn from `#gaps`), and anchor links into the detail sections. Additive — the
-   per-behavior detail stays in the tables below. (Your template says what the recap and the
-   top-3 are _about_.)
+   per-behavior detail stays in the tables below.
 3. **`#summary`** — 2–4 sentences, then the **layer-distribution chart** (the report's signature
-   graphic) and a per-platform one-line shape list. The chart is a captioned `<figure class="dist">`
-   (`Fig 1`) with a `.legend` and one `.dist-row` per platform; follow it with
-   `<ul class="shapes">`, one `<li>` per platform (a `.plat` name plus the one-line shape). The
-   exact segment markup and the render rules (`flex:<count>`, the `--on-unit`/`--on-deep` text
-   colors, never hand-computing widths, no JS) are the chart contract owned by
-   `report-style-tokens.md` → _Graphics_ — follow it there. The chart encodes **shape** (counts per
-   layer) only — it is severity-blind. (Your template says whether the counts are _recommended_ or
-   _observed_ and supplies the caption.)
+   graphic; markup in the skeleton below) and a per-platform one-line shape list (`<ul class="shapes">`).
+   The chart's segment markup and render rules are the contract owned by `report-style-tokens.md` →
+   _Graphics_; it encodes **shape** (counts per layer) only — it is severity-blind. (Your template says
+   whether the counts are _recommended_ or _observed_ and supplies the caption.)
 4. **`#evidence`** — a table of which inputs were used and, explicitly, **what was missing or
    unverifiable** (e.g. "`test` repo not checked out — existing E2E coverage unverified"). For PR
-   inputs include the captured **head SHA** and **`owner/repo`** so per-test permalinks elsewhere
-   in the report can be audited against the same commit.
+   inputs include the captured **head SHA** and **`owner/repo`** so per-test permalinks elsewhere can
+   be audited against the same commit.
 
 `#gaps` is the last section in both reports; its exact contents differ — see your template.
 
 ## Content rules
 
 - Tables over prose for the data sections and evidence — they're meant to be scanned and acted on.
-- Mark every assumption inline with `<span class="badge assumption">assumption</span>` so the
-  reader can tell grounded calls from inferred ones.
-- Flag unverifiable claims with `<span class="badge warn">unverified</span>` (e.g. E2E coverage
-  claimed without the `test` repo checked out).
-- **Hyperlink every GitHub or Atlassian source the report names.** Cited tests are GitHub
-  permalinks (see your template's evidence/coverage rule); any Jira/Confluence/GitHub artifact the
-  report names is anchored to its URL, never plain text. **Jira items and Jira-sourced behaviors
-  follow `input-sources.md` → _Citing Jira issues as links_** — the link form,
-  where to apply it, and the never-fabricate-a-key rule all live there. An informational
-  `<a href>` is text, not a fetched resource — it does not violate the no-remote-resources rule.
-- No tracking, no remote resources, no secrets — the file is shareable as-is. ("Remote resources"
-  means assets the page loads — stylesheets, fonts, scripts, images, CDN imports — not
-  informational `<a href>` citations, which are encouraged per the rule above.)
+- Mark every assumption inline with `<span class="badge assumption">assumption</span>` and every
+  unverifiable claim with `<span class="badge warn">unverified</span>` (e.g. E2E coverage claimed
+  without the `test` repo checked out), so grounded calls are distinguishable from inferred ones.
+- **Hyperlink every GitHub or Atlassian source the report names** — never plain text. The data
+  section's **evidence column** (`Evidence (linked)` in the test-stack report, `Tests (linked)` in
+  the coverage report) is binding: render each behavior's 1–3 representative tests as GitHub
+  permalinks, or the `.unlinkable` span when a test genuinely cannot be linked — never a fabricated
+  URL. Those records come from the coverage inventory; the exact link / `.unlinkable` markup and the
+  permalink-production rules are owned by the `assessing-test-coverage` skill's
+  `references/finding-coverage.md` → _Citing tests as GitHub permalinks_ and _When a test cannot be
+  linked_. **Jira items and Jira-sourced behaviors** follow `input-sources.md` → _Citing Jira issues
+  as links_ (link form, where to apply it, never-fabricate-a-key rule). All of these are
+  informational `<a href>` citations, not fetched resources, so they don't violate the self-contained
+  constraint.
 - Keep the fixed **back-to-top** control from the skeleton — the `<a class="to-top" href="#top">`
-  after `</main>` paired with `id="top"` on `<header>`. It floats with the reader and jumps to the
-  top from anywhere; it is CSS-only (the stylesheet's `.to-top` rule, no JavaScript). Drop either
-  half and the anchor breaks.
+  after `</main>` paired with `id="top"` on `<header>`. It is CSS-only; drop either half and the
+  anchor breaks.
 
 ## Skeleton
 
-The shared document shell. Your template supplies the `<title>`, the eyebrow, the ToC section
-list, the report-specific section(s) between `#evidence` and `#gaps`, and the `#summary`/`#gaps`
-headings:
+The shared document shell. Your template supplies the `<title>`, the eyebrow, the ToC section list,
+the report-specific section(s) between `#evidence` and `#gaps`, and the `#summary`/`#gaps` headings:
 
 ```html
 <!doctype html>
diff --git a/plugins/bitwarden-test-engineer/scripts/build-report.sh b/plugins/bitwarden-test-engineer/scripts/build-report.sh
index fe06c6b..8142459 100755
--- a/plugins/bitwarden-test-engineer/scripts/build-report.sh
+++ b/plugins/bitwarden-test-engineer/scripts/build-report.sh
@@ -27,11 +27,16 @@
 # stylesheet in once. The two source reports are read, not modified, and their
 # standalone files remain; the combined page is an additional deliverable.
 #
-# Writes <kind>-report-<slug>-<date>-<HHMMSS>.html to the current working
-# directory and prints the final filename to stdout. The HHMMSS suffix is read
-# from the wall clock here (the model cannot read the clock) and guarantees a
-# fresh filename per run, so a report is never overwritten and an existing file
-# never has to be read back and regenerated.
+# Writes the report into a per-change directory, creating it if needed, and
+# prints the final path to stdout:
+#
+#   test-engineer-report-<slug>-<date>/coverage.html      (--kind test-coverage)
+#   test-engineer-report-<slug>-<date>/recommended.html   (--kind test-stack)
+#   test-engineer-report-<slug>-<date>/combined.html       (--kind test-combined)
+#
+# The directory name derives only from --slug/--date, so all three of a run's
+# reports land in the same folder. Re-running the same change on the same date
+# refreshes the report in place (the prior file is overwritten).
 #
 # Input files are left untouched; delete any temporary fragment yourself.
 
@@ -86,8 +91,14 @@ if [[ ! -f "$CSS_FILE" ]]; then
   exit 1
 fi
 
-TIME="$(date +%H%M%S)"
-OUT="${KIND}-report-${SLUG}-${DATE}-${TIME}.html"
+OUTDIR="test-engineer-report-${SLUG}-${DATE}"
+case "$KIND" in
+  test-coverage) BASENAME="coverage.html" ;;
+  test-stack)    BASENAME="recommended.html" ;;
+  test-combined) BASENAME="combined.html" ;;
+esac
+mkdir -p "$OUTDIR"
+OUT="${OUTDIR}/${BASENAME}"
 
 # Splice the canonical stylesheet in place of the sentinel line. awk reads the
 # CSS file line by line, so no shell escaping ever touches the CSS content.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
index befb824..15275a5 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: analyzing-test-stack
-description: Use when recommending what test automation a feature, bugfix, or change needs and at which layer — from a Jira ticket, GitHub PR, test-case CSV, technical breakdown, and/or plain-language description — mapping each behavior to the cheapest sufficient layer (unit, integration, E2E) inside each repo's actual test shape, risk-weighted by defect severity. Triggers on "test stack", "test strategy", "test trophy", "test plan for this PR/ticket", "which test layers should this have", or "what tests does this Critical/High bug need".
+description: Use when recommending what test automation a feature, bugfix, or change needs and at which layer — from a Jira ticket, GitHub PR, test-case CSV, technical breakdown, and/or plain-language description — mapping each behavior to the cheapest sufficient layer (unit, integration, E2E) inside each repo's actual test shape, risk-weighted by defect severity. Triggers on "test stack", "test strategy", "test plan for this PR/ticket", "which test layers should this have", or "what tests does this Critical/High bug need". This is the forward-looking recommendation — it does NOT inventory what already exists; for that, use assessing-test-coverage (whose inventory this skill consumes).
 allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Skill, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(gh pr checks:*), Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*), mcp__bitwarden-atlassian__get_issue, mcp__bitwarden-atlassian__search_issues, mcp__bitwarden-atlassian__get_issue_comments, mcp__bitwarden-atlassian__get_issue_remote_links, mcp__bitwarden-atlassian__get_confluence_page, mcp__bitwarden-atlassian__search_confluence, mcp__bitwarden-atlassian__search_confluence_cql"
 ---
 
@@ -8,38 +8,28 @@ allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Skill, Bash(gh pr view
 
 Recommend the test automation layers a change should ship with — shaped to **each target repo's actual test practice**, not one universal model — and write the recommendation as a self-contained HTML report. You produce advice, not tests.
 
-The three layers (read `references/testing-trophy.md` for the full model): a focused **unit** layer for pure logic and edge cases, an **integration** layer where collaborator wiring is exercised, and a **thin E2E** layer for critical end-to-end journeys. The guiding rule is _write tests at the cheapest layer that still buys the confidence the behavior requires_ — push coverage down toward unit. How the volume distributes across those layers differs per repo: Bitwarden's repos span unit-heavy **pyramids** (`server`, `clients`, `sdk-internal`, `android`), an integration + snapshot **trophy** (`ios`), and **all-E2E** repos (`test`, `browser-interactions-testing`). Land each call inside the target repo's shape — see `references/monorepo-layout.md` → _Each repo's test shape in practice_.
+Assign each behavior the **cheapest sufficient layer** (unit → integration → E2E, pushing coverage down), then land that call inside the target repo's real shape (pyramid, trophy, or all-E2E). The layer model is in `references/test-layers.md`; the per-repo shapes are in `references/monorepo-layout.md` → _Each repo's test shape in practice_.
 
 ## Inputs
 
-You may receive any combination of: a Jira key, a GitHub PR, a CSV export of test cases, a technical breakdown document, and/or a plain-language description. Treat them as additive evidence. You also consume a **coverage inventory** — the existing-test records produced by the `assessing-test-coverage` skill (permalink records + `unverified` gaps). Under the `bitwarden-test-engineer` agent this is gathered for you before this skill runs; if it is absent (e.g. run standalone), invoke `Skill(assessing-test-coverage)` for the affected change surface, or proceed and record all coverage as `unverified`. **Today's date is provided by the caller** — use it for the report filename; do not attempt to read the clock. If no date is supplied, ask via `AskUserQuestion` rather than guessing.
+You may receive any combination of: a Jira key, a GitHub PR, a CSV of test cases, a technical breakdown, and/or a plain-language description — additive evidence. You also consume a **coverage inventory** (the existing-test records produced by `assessing-test-coverage`: permalink records + `unverified` gaps). Under the `test-strategist` agent this is gathered before this skill runs; if it is absent (run standalone), invoke `Skill(assessing-test-coverage)` for the change surface, or proceed and record all coverage as `unverified`. **Today's date is provided by the caller** for the report filename — don't read the clock; if none is supplied, ask via `AskUserQuestion`.
 
-`../../references/input-sources.md` (a plugin-level reference shared with `assessing-test-coverage`) is the canonical guide for how to ingest each source — Epic expansion, breakdown mining, CSV column mapping, and the rule that a missing source is recorded as a gap rather than blocking the analysis. **For Jira and Confluence intake**, follow that reference's tooling rule. Prefer `Skill(bitwarden-atlassian-tools:researching-jira-issues)`; fall back to the `bitwarden-atlassian-tools` MCP tools (the `mcp__bitwarden-atlassian__*` tools this skill's frontmatter grants) when that skill is unavailable. If neither is reachable, ask the user to paste the requirements rather than blocking — never assume a generic Atlassian MCP or direct REST access. At a glance:
+`../../references/input-sources.md` (shared with `assessing-test-coverage`) is the canonical guide for ingesting each source — Epic expansion, breakdown mining, CSV column mapping, the Jira/Confluence tooling ladder, and the missing-source-is-a-gap rule.
 
-- **Jira** — extract testable behaviors and acceptance criteria; Epics/Features expand to their children before extraction.
-- **GitHub PR** — extract the change surface, API touched, and any tests already present.
-- **CSV** — bucket rows by apparent layer and automation status.
-- **Technical breakdown** — often the richest single input; its scope checklist already enumerates the platforms and surfaces.
-- **Description** — use directly when no artifact exists.
-
-If a source you'd expect is missing, proceed with what you have and **record the gap** in the report — never block on a missing input.
-
-Alongside the behaviors, carry each behavior's **risk severity** — the impact a defect in it would have, per Bitwarden's severity guide. `references/severity-risk.md` is the canonical model: where severity comes from (the Jira severity field for bugs; an assessment against the guide's criteria for features/PRs/breakdowns) and how it calibrates the recommendation. Severity is the dial that turns "cheapest sufficient layer" into a risk-weighted call — it decides how completely a behavior must be covered and how hard a missing test counts as a gap. Weight by severity (impact), not priority (urgency). **Security-sensitive behaviors (crypto, auth, threat-model-relevant paths) are at least Critical regardless of the guide's table** — see the reference's source-of-truth note.
+Carry each behavior's **risk severity** (impact, not urgency) alongside it; the model and how it calibrates coverage are in `references/severity-risk.md`.
 
 ## Workflow
 
 1. **Resolve scope.** From the evidence, list the discrete testable behaviors and the platforms each touches. Map platforms to stacks, tooling, and the layer→repo split (including the sibling `test` repo for E2E) using `references/monorepo-layout.md`. **When the input is an Epic**, the behaviors come from the children's acceptance criteria and the diffs of any PRs linked from those children — record which children/PRs you actually inspected vs. only enumerated.
 
-2. **Consume the coverage inventory.** What is already tested is established by the `assessing-test-coverage` skill, not here — take its inventory as input. It is **one record per behavior**, carrying that behavior's layer, an approximate count, and 1–3 representative permalinks (`{ behavior, platform, layer, status, count, representative: [...] }`; representative tests are path-only with an `unlinkable` reason when they can't be linked), plus the `unverified` gaps. Treat _observed_ coverage as verified and everything else as a gap, never assumed covered. If no inventory was supplied, invoke `Skill(assessing-test-coverage)` for the affected change surface to produce one; do not re-derive coverage-finding or permalink rules here (they live in that skill's `references/finding-coverage.md`). These records feed both the report's Evidence column (rendering each behavior's representative permalinks) and the gap analysis below.
+2. **Consume the coverage inventory.** What is already tested is established by `assessing-test-coverage`, not here — take its inventory (one record per behavior plus `unverified` gaps; the record shape and permalink rules live in that skill's `references/finding-coverage.md` → _Output contract_) as input. Treat _observed_ coverage as verified and everything else as a gap, never assumed covered. If none was supplied, invoke `Skill(assessing-test-coverage)` for the change surface to produce one. These records feed both the report's Evidence column and the gap analysis below.
 
-3. **Assign the cheapest sufficient layer, weighted by severity.** For each behavior, pick the lowest test layer that genuinely buys the needed confidence, with a one-line rationale — then check the confidence bar against the behavior's risk severity per `references/severity-risk.md`. Severity sets _how much_ confidence is sufficient, not _which_ layer: a Critical behavior must cover its material failure modes (and, if it is a genuine end-to-end critical flow, claim the thin E2E layer reserved for exactly that), while a Low behavior earns minimal coverage and never an E2E test. Prefer integration over E2E and unit over integration unless the behavior truly requires the higher layer (real browser/device, cross-service contract, full user journey) — then land that call inside the **target repo's shape** (`references/monorepo-layout.md` → _Each repo's test shape in practice_): a pyramid repo like `server` or `sdk-internal` resolves toward unit, `ios` toward its component + snapshot practice, and cross-system journeys toward the all-E2E `test` repo. Name concrete tooling per platform (see `references/monorepo-layout.md`).
+3. **Assign the cheapest sufficient layer, weighted by severity.** For each behavior, pick the lowest layer that genuinely buys the needed confidence (reach higher only for a real browser/device, cross-service contract, or full user journey), with a one-line rationale; then check that confidence bar against the behavior's risk severity per `references/severity-risk.md` (severity sets _how much_ confidence is sufficient, not _which_ layer). Land each call inside the **target repo's shape** and name its concrete tooling, both per `references/monorepo-layout.md` → _Each repo's test shape in practice_.
 
 4. **Find the gaps and the imbalance, ranked by severity.** Call out behaviors with no recommended coverage, and any existing shape that is wrong for its repo (e.g. E2E doing work integration should do, untested core logic, or a layer the repo doesn't even maintain). **Order gaps by severity** — a Critical behavior with no observed coverage is a top-priority gap and leads the list; Informative behaviors are recorded as out-of-scope rather than gaps. Be explicit about what evidence each gap rests on.
 
-5. **Render the HTML report.** Once steps 1–4 have decided the per-behavior layer/severity mapping, rendering it to HTML is **mechanical formatting, not reasoning**. Author a content fragment per `references/html-report-template.md` (and the shared `../../references/report-template-common.md` it builds on), then run the build script to splice in the stylesheet and emit the file. Write `#overview` yourself as a short synthesis — recommended shape per platform, and the top 3 open risks drawn from `#gaps`, highest severity first. Ensure the `#recommendations` table carries a **Severity** column and a GitHub permalink (or explicit `unlinkable` note) in every Evidence cell. The template owns the section IDs, the never-paste-CSS rule, the `--kind test-stack` build invocation, and the filename/freshness contract — follow it; do not hand-assemble the file.
+5. **Render the HTML report** per `references/html-report-template.md` (which builds on the shared `../../references/report-template-common.md`) — mechanical formatting, not reasoning. Write `#overview` yourself: recommended shape per platform and the top 3 open risks from `#gaps`, highest severity first. The template owns everything else (section IDs, the Severity column, the Evidence permalinks, the `--kind test-stack` build, and the filename contract).
 
 ## Principles
 
-- **Ground every recommendation.** Each behavior→layer call ties to a specific requirement, diff hunk, CSV row, or observed test; treat only _observed_ coverage from the inventory as verified, and mark anything inferred without evidence as an assumption.
-- **Cheapest sufficient layer, inside the repo's shape.** Push confidence down (unit over integration over E2E) unless a behavior truly needs the higher layer — then land that call inside the target repo's actual shape, not one universal trophy.
-- **Severity sets the bar, not the layer.** Weight each behavior's coverage by the impact a defect in it would have, per `references/severity-risk.md` — severity decides how completely a behavior is covered and how high its gap ranks, never which layer is "cheapest sufficient." It is impact, not priority (urgency).
+- **Ground every recommendation** in a specific requirement, diff hunk, CSV row, or observed test; treat only _observed_ coverage as verified, and mark anything inferred as an assumption.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
index 137dd49..7211cd9 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
@@ -29,12 +29,9 @@ ToC and section ids, in order: `#overview`, `#summary`, `#evidence`, `#recommend
     a severity the analyst inferred (rather than read from a bug's Jira field) with
     `<span class="badge assumption">assumption</span>`.
   - Use the layer → repo map; **E2E rows must name the dedicated `test` repo** as target.
-  - **The "Evidence (linked)" column is binding.** For every existing test cited as current
-    coverage, render the behavior's representative test(s) as GitHub permalinks — or, when a test
-    cannot be linked, the `.unlinkable` span instead of a fabricated URL. These records come from
-    the coverage inventory; the exact link / `.unlinkable` markup and the permalink production
-    rules are owned by the `assessing-test-coverage` skill's `references/finding-coverage.md` →
-    _Citing tests as GitHub permalinks_ and _When a test cannot be linked_ — follow it.
+  - **The "Evidence (linked)" column is binding** — render each behavior's representative tests as
+    GitHub permalinks (or, when a test cannot be linked, the `.unlinkable` span — never a fabricated
+    URL), per `../../../references/report-template-common.md` → _Content rules_. These records come from the coverage inventory.
 - **`#gaps`** — heading "Coverage gaps & imbalances": behaviors with no coverage, and any shape
   wrong for its repo (ice-cream-cone, over-unit-tested, trivial tests). **Order by severity**,
   highest first, so a Critical uncovered behavior leads; Informative behaviors are recorded as
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md
index 9dc5310..7c23897 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md
@@ -1,10 +1,9 @@
 # Bitwarden repo layout, stacks, and the layer → repo map
 
 Bitwarden's code spans several repositories. A single feature often touches more than
-one, and **each repo follows its own test shape** — pyramid, trophy, or all-E2E (see
-_Each repo's test shape in practice_ below; the shapes themselves are defined in
-`testing-trophy.md`). Treat the tables below as a **starting map**, not gospel — when a
-repo is checked out, confirm the actual conventions from its config first (the
+one, and **each repo follows its own test shape** — pyramid, trophy, or all-E2E (the shapes
+themselves are defined in `test-layers.md`). Treat the table below as a **starting map**, not
+gospel — when a repo is checked out, confirm the actual conventions from its config first (the
 `assessing-test-coverage` skill's `references/finding-coverage.md` → _Discovering a repo's
 test conventions_), and read the table as the last-resort default.
 
@@ -13,37 +12,24 @@ it as permalinks — is a separate job owned by the `assessing-test-coverage` sk
 covers only the repo/stack map and the rules for mapping a behavior to the layer it _should_
 live at.
 
-## Platform repos and their stacks
-
-| Repo (typical)                           | Platform                                                                        | Language / framework                                                      | Unit / Integration tooling                                                                                                                                                                                                             |
-| ---------------------------------------- | ------------------------------------------------------------------------------- | ------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `bitwarden/server`                       | Backend / API                                                                   | C# / .NET, ASP.NET Core, EF Core                                          | xUnit; integration via `WebApplicationFactory` + test DB / in-memory providers                                                                                                                                                         |
-| `bitwarden/clients`                      | Web, Browser ext, Desktop, CLI                                                  | TypeScript, Angular, Electron, RxJS                                       | Jest + `jest-mock-extended` + Angular TestBed (unit + shallow component); mocked HTTP at the boundary — _no_ Testing Library                                                                                                           |
-| `bitwarden/ios`                          | iOS                                                                             | Swift / SwiftUI                                                           | XCTest (+ emerging Swift Testing); SnapshotTesting + ViewInspector for SwiftUI views; processor/coordinator tests with mocks — no systematic XCUITest                                                                                  |
-| `bitwarden/android`                      | Android                                                                         | Kotlin                                                                    | JUnit5 + MockK + Turbine for ViewModels/logic; Compose UI tests run on the JVM via Robolectric — **all JVM `src/test`, no `androidTest`/Espresso, no screenshot testing**                                                              |
-| `bitwarden/sdk-internal`                 | Cross-platform SDK (core logic powering clients via WASM and mobile via UniFFI) | Rust (cargo workspace, ~50 crates); WASM + UniFFI (Swift/Kotlin) bindings | `cargo test --workspace` (no nextest; cargo-llvm-cov for coverage); mostly inline `#[cfg(test)]` unit tests, `mockall` + `wiremock` for the few HTTP/trait integration tests; binding surfaces consumed by `clients`, `ios`, `android` |
-| `bitwarden/test`                         | Cross-platform E2E (web, desktop, browser ext, iOS, android, CLI, API)          | C# / .NET                                                                 | NUnit + Selenium WebDriver (web/desktop/ext) + Appium (mobile) + CliWrap (CLI), Page Object Model; drives real builds — E2E only                                                                                                       |
-| `bitwarden/browser-interactions-testing` | Browser extension autofill (dedicated E2E suite)                                | TypeScript, Playwright, Docker Compose                                    | Playwright E2E form-fill against real extension builds (Chromium only); static-page + live-site scenarios — _not_ unit/integration                                                                                                     |
-
-Exact repo names and tool versions drift — verify against the checkout. If a platform
-isn't in this table, infer its stack from the repo and state the assumption in the report.
-
 ## Each repo's test shape in practice
 
-The shape a repo actually maintains — not a one-size trophy. Recommend the layer that fits the
-repo's real distribution (see `testing-trophy.md` for the shapes). Each shape below was
-**confirmed against a local checkout**; still re-verify when versions drift, and for any repo
-not listed here, infer its shape from the checkout and state the assumption in the report.
+The table below gives each repo's stack and the shape it actually maintains — not a one-size
+trophy. Recommend the layer that fits the repo's real distribution (see `test-layers.md` for the
+shapes) and name the concrete tool from the table. Each shape was **confirmed against a local
+checkout**; exact repo names and tool versions drift, so re-verify against the checkout, and for
+any repo not listed, infer its stack and shape from the checkout and **state the assumption** in
+the report.
 
-| Repo                                     | Shape                                       | What that means for recommendations                                                                                                                                                                                                                                                                   |
-| ---------------------------------------- | ------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `bitwarden/server`                       | **Pyramid** (unit-heavy)                    | Broad xUnit unit base (~5:1 over integration), a meaningful integration layer via `WebApplicationFactory` + test DB, **no E2E in-repo**. Default behaviors to unit; reserve integration for endpoint/persistence wiring.                                                                              |
-| `bitwarden/clients`                      | **Unit-heavy** (pyramid-leaning)            | ~1,000+ colocated `*.spec.ts`, heavy `jest-mock-extended`, TestBed component tests that mock their children (shallow, not deep integration). Push logic to unit; treat true component-integration as the deliberate step up. No E2E in-repo.                                                          |
-| `bitwarden/ios`                          | **Trophy + snapshot layer**                 | Component/processor/coordinator tests with mocks dominate (integration-leaning), a large **snapshot-testing** layer (SnapshotTesting) for SwiftUI views is first-class, lighter pure-unit layer, **no systematic XCUITest**. Recommend snapshot coverage for view changes explicitly.                 |
-| `bitwarden/android`                      | **Unit-heavy + JVM Compose-UI integration** | ~558 JVM `src/test` files: a unit base (ViewModels/logic with MockK + Turbine) plus a substantial Compose-UI integration tier running on the JVM via Robolectric. **No `androidTest`/Espresso, no screenshot testing, no E2E in-repo.** Don't recommend device-instrumented or screenshot tests here. |
-| `bitwarden/sdk-internal`                 | **Pyramid** (strongly unit-heavy)           | ~50 Rust crates, ~97% inline `#[cfg(test)]` unit tests (crypto/encoding/parsing logic, deterministic, no mocks) vs ~3% in `tests/` dirs; `mockall`/`wiremock` only where HTTP or cross-module orchestration matters. **No E2E.** Default to unit; integration only for binding/orchestration flows.   |
-| `bitwarden/test`                         | **All E2E**                                 | The cross-system journeys themselves; C# NUnit + Selenium/Appium driving real builds. Everything here is E2E by definition — never recommend unit/integration in this repo.                                                                                                                           |
-| `bitwarden/browser-interactions-testing` | **All E2E** (autofill)                      | Playwright autofill/form-fill against real Chromium extension builds. E2E only; the autofill counterpart to `test`.                                                                                                                                                                                   |
+| Repo                                     | Platform · stack · tooling                                                                                                                                                                                                                                                                                                                               | Shape                                       | What that means for recommendations                                                                                                                                                                                                                                 |
+| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `bitwarden/server`                       | Backend / API · C# / .NET, ASP.NET Core, EF Core · xUnit; integration via `WebApplicationFactory` + test DB / in-memory providers                                                                                                                                                                                                                        | **Pyramid** (unit-heavy)                    | Broad unit base (~5:1 over integration), a meaningful integration layer, **no E2E in-repo**. Default behaviors to unit; reserve integration for endpoint/persistence wiring.                                                                                        |
+| `bitwarden/clients`                      | Web, Browser ext, Desktop, CLI · TypeScript, Angular, Electron, RxJS · Jest + `jest-mock-extended` + Angular TestBed (unit + shallow component); mocked HTTP at the boundary — _no_ Testing Library                                                                                                                                                      | **Unit-heavy** (pyramid-leaning)            | ~1,000+ colocated `*.spec.ts`; TestBed component tests mock their children (shallow, not deep integration). Push logic to unit; treat true component-integration as the deliberate step up. No E2E in-repo.                                                         |
+| `bitwarden/ios`                          | iOS · Swift / SwiftUI · XCTest (+ emerging Swift Testing); SnapshotTesting + ViewInspector for SwiftUI views; processor/coordinator tests with mocks                                                                                                                                                                                                     | **Trophy + snapshot layer**                 | Component/processor/coordinator tests with mocks dominate (integration-leaning); the **snapshot-testing** layer for SwiftUI views is first-class; lighter pure-unit layer; **no systematic XCUITest**. Recommend snapshot coverage for view changes explicitly.     |
+| `bitwarden/android`                      | Android · Kotlin · JUnit5 + MockK + Turbine for ViewModels/logic; Compose UI tests run on the JVM via Robolectric                                                                                                                                                                                                                                        | **Unit-heavy + JVM Compose-UI integration** | ~558 JVM `src/test` files: a unit base plus a substantial Compose-UI integration tier on the JVM. **All JVM `src/test` — no `androidTest`/Espresso, no screenshot testing, no E2E in-repo.** Don't recommend device-instrumented or screenshot tests here.          |
+| `bitwarden/sdk-internal`                 | Cross-platform SDK (core logic powering clients via WASM, mobile via UniFFI) · Rust (cargo workspace, ~50 crates), WASM + UniFFI (Swift/Kotlin) bindings · `cargo test --workspace` (no nextest; cargo-llvm-cov for coverage); `mockall` + `wiremock` for the few HTTP/trait integration tests; binding surfaces consumed by `clients`, `ios`, `android` | **Pyramid** (strongly unit-heavy)           | ~97% inline `#[cfg(test)]` unit tests (crypto/encoding/parsing logic, deterministic, no mocks) vs ~3% in `tests/` dirs; mocks only where HTTP or cross-module orchestration matters. **No E2E.** Default to unit; integration only for binding/orchestration flows. |
+| `bitwarden/test`                         | Cross-platform E2E (web, desktop, browser ext, iOS, android, CLI, API) · C# / .NET · NUnit + Selenium WebDriver (web/desktop/ext) + Appium (mobile) + CliWrap (CLI), Page Object Model; drives real builds                                                                                                                                               | **All E2E**                                 | The cross-system journeys themselves. Everything here is E2E by definition — never recommend unit/integration in this repo.                                                                                                                                         |
+| `bitwarden/browser-interactions-testing` | Browser extension autofill (dedicated E2E suite) · TypeScript, Playwright, Docker Compose · Playwright form-fill against real Chromium extension builds; static-page + live-site scenarios                                                                                                                                                               | **All E2E** (autofill)                      | The autofill counterpart to `test`; E2E only.                                                                                                                                                                                                                       |
 
 ## Where each layer lives — important
 
@@ -69,7 +55,7 @@ not listed here, infer its shape from the checkout and state the assumption in t
 
 1. Identify which repo(s) the behavior lives in from the change surface (diff paths,
    ticket components, CSV team/area).
-2. Within each repo, choose the layer per `testing-trophy.md` (the cheapest sufficient layer)
+2. Within each repo, choose the layer per `test-layers.md` (the cheapest sufficient layer)
    **landed inside that repo's shape** from _Each repo's test shape in practice_ above — a
    pyramid repo like `server` or `sdk-internal` resolves toward unit; `ios` toward its
    component + snapshot practice — and name the concrete tool from the table above (confirmed
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
index de80fb9..24b717b 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
@@ -1,6 +1,6 @@
 # Severity as a risk weight
 
-The layer model (`testing-trophy.md`) tells you the _cheapest layer that buys the confidence a
+The layer model (`test-layers.md`) tells you the _cheapest layer that buys the confidence a
 behavior requires_, landed inside the target repo's shape. **Severity tells you how much
 confidence is required.** A defect in vault
 unlock and a typo on a settings label are not owed the same rigor — severity is the dial
@@ -11,41 +11,35 @@ it gets fixed (that is _priority_). This skill weights coverage by severity, not
 
 ## Source of truth
 
-The canonical classification is Bitwarden's [**Defect Severity Classification Guide**](https://bitwarden.atlassian.net/wiki/spaces/EN/pages/2759229512/Severity),
-Confluence page `2759229512`. The levels
-and criteria below mirror that page so the analysis degrades gracefully when the
-`bitwarden-atlassian-tools` MCP is unavailable — but the page is authoritative. When the
-`bitwarden-atlassian-tools` MCP is available, fetch it with `mcp__bitwarden-atlassian__get_confluence_page` (pageId
-`2759229512`) to pick up revisions before relying on the cached copy here. If the fetch
-fails or the MCP is unavailable, use the mirrored table below and note in the report that
-the severity definitions are from the cached copy (version not re-verified) — degrade
-gracefully; never block on it.
+The canonical classification is Bitwarden's **Defect Severity Classification Guide**,
+Confluence page `2759229512`:
+<https://bitwarden.atlassian.net/wiki/spaces/EN/pages/2759229512/Severity>. That page is
+authoritative — read it for the level definitions, criteria, and signals; this file does
+not reproduce them. When the `bitwarden-atlassian-tools` MCP is available, fetch the page
+with `mcp__bitwarden-atlassian__get_confluence_page` (pageId `2759229512`) and classify
+each behavior against its criteria. If the fetch fails or the MCP is unavailable, classify
+against the generally understood meaning of the levels below using your own judgment, and
+note in the report that severities were assessed without the guide (definitions not
+verified) — degrade gracefully; never block on it.
+
+The levels, highest to lowest impact, are **Critical**, **High**, **Medium**, **Low**, and
+**Informative**. Use these names consistently in the report regardless of source.
 
 **Security-vulnerability defects are the exception:** their severity follows the
 _Vulnerability Tracking and Management_ guide, not this one. If a behavior is
 security-sensitive (crypto, auth, a threat-model-relevant path), treat its risk as at
-least Critical regardless of the table below.
+least Critical regardless of the level definitions.
 
 ## Where each behavior's severity comes from
 
 - **Bug / defect ticket** — read the severity already assigned on the Jira issue (the
   severity field, or the reporter/QA's stated severity in the description/comments). Use it
-  directly; if it is absent, classify against the criteria below and mark it an assumption.
+  directly; if it is absent, classify against the guide's criteria and mark it an assumption.
 - **Feature, PR, tech breakdown** — there is no defect yet, so assess each behavior's
   **risk severity**: _if this behavior broke in production, what severity would the
   resulting defect carry?_ Classify it against the same criteria. This is what makes the
   recommendation risk-aware rather than uniform.
 
-## Levels and criteria (mirrored from the guide)
-
-| Severity        | A defect here would…                                                                                                                     | Signals (from the guide)                                                                                                                                               |
-| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| **Critical**    | Severely harm core functionality, data integrity, or security with no viable workaround                                                  | Blocks a critical flow (login, vault access, billing, account creation); data loss/corruption/exposure; crash/unrecoverable state; affects all or a broad user segment |
-| **High**        | Significantly degrade a core feature/flow, but a workaround exists (difficult or non-obvious), or impact is limited to a subset of users | Core feature impaired but not blocked; specific client/OS/auth method; burdensome/undiscoverable workaround; compounding friction in a core workflow                   |
-| **Medium**      | Degrade functionality or UX meaningfully, but a workaround exists or scope is limited                                                    | Non-critical / secondary flow broken; misleading-but-not-destructive output; degraded experience for a subset; extra steps to work around                              |
-| **Low**         | Have minimal functional impact; does not meaningfully hinder the user                                                                    | Cosmetic / typo / visual only; negligible edge case; minor UX inconsistency; trivial workaround                                                                        |
-| **Informative** | Be a known limitation, third-party compatibility issue, or environmental quirk — not a defect in Bitwarden's core behavior               | Autofill on a non-standard third-party site/app; no clear owner or fix path; unlikely to be actioned                                                                   |
-
 ## How severity calibrates the recommendation
 
 Severity does **not** mean "push everything Critical to E2E." The cheapest-sufficient rule
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/test-layers.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/test-layers.md
new file mode 100644
index 0000000..69ff05e
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/test-layers.md
@@ -0,0 +1,66 @@
+# Test layers and how to assign one
+
+A model for shaping automated test coverage across three layers — **unit**, **integration**,
+**E2E**. How the volume distributes across them describes a repo's _shape_: a **pyramid** (broad
+unit base, moderate integration, thin/absent E2E) suits backend and logic-heavy code; a **trophy**
+(focused unit base, heavy integration bulge, thin E2E) suits application code where behavior emerges
+from collaborators (UI components, view models). Integration is where each shape buys most of its
+confidence — how _much_ of it a repo carries is what separates the two.
+
+Neither shape is universally correct, and **this skill imposes neither.** Bitwarden's repos
+deliberately sit at different points — some pyramid, some trophy, some a mix, and two effectively
+**all E2E**. Recommend the layer that fits the **target repo's actual practice** (mapped per repo
+in `monorepo-layout.md`), not an idealized shape. A mix within or across repos is normal.
+
+## The three layers (cheapest → most expensive)
+
+1. **Unit** — tests a single function/class/module in isolation. Best for pure logic, algorithms,
+   edge cases, and error handling where setup is cheap and the unit has real branching complexity.
+   Fast and stable, but isolation lets integration bugs slip through.
+
+2. **Integration** — the **confidence layer**. Tests several units working together through real
+   (or realistic) collaborators: a controller + service + in-memory/test database; a component
+   rendered with its real children and a mocked network boundary; a view model against a real
+   repository. Exercises the wiring users depend on without the cost and flakiness of E2E.
+
+3. **E2E (end-to-end)** — thin top in most repos, the **entire suite** in the dedicated E2E repos.
+   Drives the real, fully assembled system as a user would: real browser, device, backend. Highest
+   confidence per test, but slowest, most expensive, most flaky. In a platform repo, reserve it for
+   a few **critical user journeys** (login, vault unlock, checkout) — not branch coverage. The
+   cross-system journeys themselves live in the `test` repo, where E2E _is_ the strategy.
+
+Static analysis (type checking, linters, formatters) sits below all three and is handled by
+per-repo tooling — not recommended by this skill.
+
+## How to assign a layer
+
+Apply two rules together:
+
+1. **Cheapest sufficient layer.** Pick the lowest-cost layer (unit < integration < E2E) that still
+   buys the confidence the behavior requires:
+   - Pure transformation, calculation, parsing, validation with real branching → **unit**.
+   - Behavior that emerges from collaborators working together (HTTP handler + service +
+     persistence; component + store + API boundary; view model + repository) → **integration**.
+   - A behavior only meaningful as a full user journey across the real system → **E2E**, and only
+     if genuinely critical.
+   - Anything a type system, analyzer, or linter already guarantees → don't write a test for it.
+
+2. **Honor the target repo's shape.** The cheapest-sufficient call lands inside the shape the repo's
+   engineers actually maintain, so the same behavior resolves differently per repo: in `server` it
+   lands in a unit-heavy pyramid; in `ios` in component/processor integration plus the snapshot
+   layer; a cross-system journey lands as E2E in the dedicated `test` repo, never inside a platform
+   repo. Cite the per-repo shape in `monorepo-layout.md` — and where a repo's real shape is unknown,
+   say so rather than defaulting to a trophy.
+
+## Anti-patterns to avoid (in any shape)
+
+- **Ice-cream cone** — many E2E, few integration/unit. Slow, flaky, expensive. Wrong everywhere,
+  including a pyramid repo that has started leaning on E2E for branch coverage.
+- **Over-unit-testing** — exhaustive unit tests with heavy mocking that re-assert the mocks rather
+  than real behavior; integration would buy more. The most common failure in unit-heavy repos.
+- **Testing trivial code** — getters/setters, framework glue, type-guaranteed invariants. Cost
+  without confidence.
+- **E2E for branch coverage** — slow full-system tests covering edge cases that belong at unit or
+  integration.
+- **Forcing a foreign shape** — recommending an integration bulge for a pyramid repo (or vice
+  versa) because a model says so. Match the repo, not the textbook.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md
deleted file mode 100644
index 71f9e78..0000000
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md
+++ /dev/null
@@ -1,90 +0,0 @@
-# Test shape: pyramid, trophy, and where Bitwarden's repos actually sit
-
-A model for shaping automated test coverage across three layers — **unit**, **integration**,
-**E2E**. Two classic shapes describe how the volume is distributed across those layers:
-
-- **Testing Pyramid** — a broad **unit** base, a smaller **integration** layer, a thin (often
-  absent) **E2E** cap. Optimizes for fast, stable, cheap-to-maintain coverage. The natural fit
-  for backend and logic-heavy code where units have real branching complexity.
-- **Testing Trophy** — a focused **unit** base, a **heavy integration** bulge where most
-  confidence is bought, a thin **E2E** cap. The fit for application code where behavior emerges
-  from collaborators (UI components, view models) and an isolated unit proves little.
-
-Neither shape is universally "correct," and **this skill does not impose one on every repo.**
-Bitwarden's repos deliberately sit at different points — some pyramid, some trophy, some a mix,
-and two are effectively **all E2E**. Recommend the layer that fits the **target repo's actual
-practice** (mapped per repo in `monorepo-layout.md`), not an idealized shape. A "funky mix" of
-pyramid and trophy within or across repos is normal and fine.
-
-## The three layers (cheapest → most expensive)
-
-1. **Unit** — focused. Tests a single function/class/module in isolation. Best for pure
-   logic, algorithms, edge cases, and error handling where setup is cheap and the unit
-   has real branching complexity. Fast and stable, but isolation can let integration
-   bugs slip through.
-
-2. **Integration** — the **confidence layer**: the trophy's bulge and the pyramid's middle.
-   Tests several units working together through real (or realistic) collaborators: a
-   controller + service + in-memory or test database, a component rendered with its real
-   child components and a mocked network boundary, a view model against a real repository.
-   It exercises the wiring users actually depend on without the cost and flakiness of full
-   E2E. How _much_ of it a repo carries is what separates a trophy (a lot) from a pyramid
-   (a moderate middle).
-
-3. **E2E (end-to-end)** — thin top in most repos, the **entire suite** in the dedicated E2E
-   repos. Drives the real, fully assembled system the way a user would: real browser, real
-   device, real backend. Highest confidence per test, but slowest, most expensive, and most
-   flaky. In a platform repo, reserve it for a small number of **critical user journeys**
-   (e.g. login, vault unlock, checkout) — not for branch coverage. The cross-system journeys
-   themselves live in the `test` repo, where E2E _is_ the strategy.
-
-## The two shapes
-
-```
-   Pyramid (e.g. server)              Trophy (e.g. ios)
-
-      ┌─────────┐                       ┌───────────┐
-      │   E2E   │  thin / none          │    E2E    │   thin top
-   ┌──┴─────────┴──┐                 ┌──┴───────────┴──┐
-   │  Integration  │  moderate       │   Integration   │   HEAVY
-   └──┬─────────┬──┘              ┌──┴─────────────────┴──┐
-   │     Unit      │  BROAD base   │        Unit           │  focused
-   └───────────────┘              └───────────────────────┘
-```
-
-Static analysis (type checking, linters, formatters) sits below both shapes and is handled by
-per-repo tooling — not recommended by this skill.
-
-## How to assign a layer
-
-Apply two rules together:
-
-1. **Cheapest sufficient layer.** Pick the lowest-cost layer (unit < integration < E2E) that
-   still buys the confidence the behavior requires:
-   - Pure transformation, calculation, parsing, validation logic with real branching → **unit**.
-   - Behavior that emerges from collaborators working together (HTTP handler + service +
-     persistence; component + store + API boundary; view model + repository) → **integration**.
-   - A behavior only meaningful as a full user journey across the real system → **E2E**, and
-     only if it is genuinely critical.
-   - Anything a type system, analyzer, or linter already guarantees → don't write a test for it.
-
-2. **Honor the target repo's shape.** The cheapest-sufficient call lands inside the shape the
-   repo's engineers actually maintain. The same kind of behavior resolves differently per repo:
-   in `server` it lands in a unit-heavy pyramid; in `ios` it lands in component/processor
-   integration tests plus the repo's snapshot layer; a cross-system journey lands as E2E in the
-   dedicated `test` repo, never inside a platform repo. Recommend what that repo maintains today,
-   citing the per-repo shape in `monorepo-layout.md` — and where a repo's real shape is unknown,
-   say so rather than defaulting to the trophy.
-
-## Anti-patterns to avoid (in any shape)
-
-- **Ice-cream cone** — many E2E tests, few integration/unit. Slow, flaky, expensive to maintain.
-  Wrong everywhere, including in a pyramid repo that has started leaning on E2E for branch coverage.
-- **Over-unit-testing** — exhaustive unit tests with heavy mocking that re-assert the mocks
-  rather than real behavior; integration would buy more. The most common failure in unit-heavy repos.
-- **Testing trivial code** — tests for getters/setters, framework glue, or type-guaranteed
-  invariants. Cost without confidence.
-- **E2E for branch coverage** — using slow full-system tests to cover edge cases that belong
-  at the unit or integration layer.
-- **Forcing a foreign shape** — recommending an integration bulge for a repo that runs a unit
-  pyramid (or vice versa) just because a model says so. Match the repo, not the textbook.
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md
index 0a9646e..cd572c8 100644
--- a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md
@@ -6,46 +6,40 @@ allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Bash(gh pr view:*), Ba
 
 # Assessing Test Coverage
 
-Produce an evidence-grounded inventory of what is **already tested** for a change, scoped to the change surface, with every cited test rendered as a stable GitHub permalink and bucketed by test layer. This is a backward-looking, descriptive job: you report what exists, you do **not** recommend what to add or judge whether the shape is right — that is `analyzing-test-stack`'s job, which consumes this inventory.
-
-The output is a **coverage inventory**: a set of permalink records for observed tests plus a list of behaviors/surfaces recorded as gaps (`unverified`). Honesty is the whole point — a behavior with no observed test is a gap, never assumed covered.
+Produce an evidence-grounded inventory of what is **already tested** for a change, scoped to the change surface, with every cited test rendered as a stable GitHub permalink and bucketed by test layer. The output is a **coverage inventory**: permalink records for observed tests plus the behaviors/surfaces recorded as gaps (`unverified`). Honesty is the whole point — a behavior with no observed test is a gap, never assumed covered.
 
 ## Inputs
 
 You work from a **change surface** and the repos it touches:
 
-- **Change surface** — the changed paths/symbols and the named component(s). Usually supplied by the caller (the agent's evidence fan-out, or an `analyzing-test-stack` run). If you're handed only a Jira key or a PR with no resolved surface, derive a minimal surface from the PR diff (`gh pr diff`) before looking for coverage; the shared `../../references/input-sources.md` (the same intake guide `analyzing-test-stack` uses) covers how to resolve a PR or Epic into its diff paths and linked PRs.
+- **Change surface** — the changed paths/symbols and named component(s), usually supplied by the caller. Given only a Jira key or a bare PR, derive a minimal surface from the PR diff (`gh pr diff`) first; `../../references/input-sources.md` (shared with `analyzing-test-stack`) covers resolving a PR or Epic into diff paths and linked PRs.
 - **Affected repos** — which platform checkouts to inspect, and whether the sibling `test` repo (E2E) is available.
 - **Linked/merged PRs** — the PRs that shipped this work; their diffs are the primary, permalink-ready coverage evidence.
 
-A missing input narrows the inventory; it never blocks it. Record what you could not inspect as part of the result.
-
-**Today's date is provided by the caller** — use it for the report filename; do not attempt to read the clock. If no date is supplied, ask via `AskUserQuestion` rather than guessing.
+A missing input narrows the inventory; it never blocks it — record what you could not inspect. **Today's date is provided by the caller** for the dated report directory name (`test-engineer-report-<slug>-<date>/`); if none is supplied, ask via `AskUserQuestion` rather than reading the clock.
 
 ## Workflow
 
 1. **Learn each repo's conventions, config-first.** Before opening any test files, read the repo's Claude config to learn its test tooling and where tests live. Stop as soon as it answers the question. See `references/finding-coverage.md` → _Discovering a repo's test conventions_.
 
-2. **Find existing coverage — PRs first, then a targeted lookup.** Take the tests in the linked/merged PR diffs as primary evidence, then do a lookup **scoped to the change surface** for pre-existing tests. Never a repo-wide grep sweep. **Establish coverage per behavior and stop as soon as it is confirmed**: capture 1–3 representative tests plus an approximate count per behavior, and do not open and enumerate every test method in a covered area. This is the dominant cost control — see `references/finding-coverage.md` → _Establish coverage per behavior_. For E2E, inspect the sibling `test` repo if available.
+2. **Find existing coverage — PRs first, then a targeted lookup.** Take the tests in the linked/merged PR diffs as primary evidence, then run a lookup **scoped to the change surface** for pre-existing tests — never a repo-wide grep sweep. **Establish coverage per behavior and stop as soon as it is confirmed** (1–3 representative tests plus an approximate count, not every test method) — the dominant cost control, detailed in `references/finding-coverage.md` → _Establish coverage per behavior_. For E2E, inspect the sibling `test` repo if available.
 
-3. **Cite and bucket each behavior's coverage.** For each behavior, render its 1–3 representative tests as GitHub permalinks (commit SHA, not branch) and record its layer and approximate count, following `references/finding-coverage.md` → _Citing tests as GitHub permalinks_ and _Output contract_. A representative test that genuinely cannot be linked is recorded path-only with an explicit reason — never fabricate a URL. Bucket by apparent layer (unit / integration / E2E); for the layer definitions see the `analyzing-test-stack` skill's `references/testing-trophy.md`. For the per-repo stack/tooling reference, see that skill's `references/monorepo-layout.md`.
+3. **Cite and bucket each behavior's coverage.** For each behavior, render its 1–3 representative tests as GitHub permalinks and record its layer and approximate count, following `references/finding-coverage.md` → _Citing tests as GitHub permalinks_ and _Output contract_ (which also covers the unlinkable-test fallback). Bucket by apparent layer (unit / integration / E2E); layer definitions are in the `analyzing-test-stack` skill's `references/test-layers.md`, the per-repo stack/tooling in its `references/monorepo-layout.md`.
 
 4. **Record gaps.** Any behavior or surface in the change with no PR-observed test and no targeted hit is recorded as a coverage gap / `unverified`. Distinguish _observed_ coverage from _assumed_.
 
-5. **Render the coverage report.** Turning the gathered inventory into HTML is **mechanical formatting, not reasoning**. Author a content fragment per `references/coverage-report-template.md` (and the shared `../../references/report-template-common.md` it builds on), then run the build script to splice in the stylesheet and emit the file. Write `#overview` yourself as a short synthesis — observed coverage per platform and the top gaps. The template owns the section IDs, the never-paste-CSS rule, the `--kind test-coverage` build invocation, and the filename/freshness contract — follow it; do not hand-assemble the file.
+5. **Render the coverage report** per `references/coverage-report-template.md` (which builds on the shared `../../references/report-template-common.md`) — mechanical formatting, not reasoning. Write `#overview` yourself: observed coverage per platform and the top gaps. The template owns everything else (section IDs, the **Tests (linked)** permalink rules, the `--kind test-coverage` build, and the filename contract).
 
 ## Output
 
 Two artifacts:
 
-- The **coverage inventory** as structured data — the record shape defined in `references/finding-coverage.md` → _Output contract_: one permalink record per observed test, plus the list of `unverified` gaps. When run under the `bitwarden-test-engineer` agent, return these records for `analyzing-test-stack` to consume as-is.
-- The **self-contained HTML coverage report** (step 5), written to the current working directory.
+- The **coverage inventory** as structured data (record shape in `references/finding-coverage.md` → _Output contract_). When run under the `test-strategist` agent, return these records for `analyzing-test-stack` to consume as-is.
+- The **self-contained HTML coverage report** (step 5), written as `coverage.html` into the per-change report directory `test-engineer-report-<slug>-<date>/`.
 
 Mirror the report's `#overview` in chat — the observed shape per platform and the top gaps — and point the reader at the report file for the per-test detail.
 
 ## Principles
 
-- **Observed vs. assumed.** Never present assumed coverage as verified. "I could not inspect the `test` repo" is a finding, not a failure.
-- **Scoped, not swept.** Coverage is established PR-first then scoped to the change surface — never a repo-wide grep.
-- **Stable links only.** Permalinks use the commit SHA, not a branch. Unlinkable tests are recorded with a reason; URLs are never fabricated.
-- **Backward-looking only.** You inventory what exists. Recommending new tests, assigning cheapest-sufficient layers, and judging test shape belong to `analyzing-test-stack` — hand off, don't cross over.
+- **Observed vs. assumed.** Never present assumed coverage as verified — "I could not inspect the `test` repo" is a finding, not a failure.
+- **Backward-looking only.** You inventory what exists; recommending new tests and judging test shape belong to `analyzing-test-stack`.
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
index 08269c7..561e010 100644
--- a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
@@ -27,10 +27,8 @@ ToC and section ids, in order: `#overview`, `#summary`, `#evidence`, `#coverage`
   coverage still gets a row, shown empty.
 - **`#coverage`** — per-platform tables, **one row per behavior** (not per test):
   `Behavior / surface | Layer | Tests (linked) | Count | Source | Notes`.
-  - **Tests (linked)** renders the behavior's 1–3 representative tests as permalinks (binding), or
-    the `.unlinkable` span when a test cannot be linked — never a fabricated URL. The exact link /
-    `.unlinkable` markup and the permalink production rules are owned by `finding-coverage.md` →
-    _Citing tests as GitHub permalinks_ and _When a test cannot be linked_ — follow it.
+  - **Tests (linked)** is binding — render the behavior's 1–3 representative tests as permalinks (or
+    the `.unlinkable` span), per `../../../references/report-template-common.md` → _Content rules_.
   - **Count** is the approximate number of tests covering that behavior at that layer — breadth
     without enumerating every test. Do not expand a well-covered behavior into dozens of rows.
   - **Layer** uses the matching layer chip. **Source** is `PR` (tests shipped in a linked/merged
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
index 7829a95..44fdb50 100644
--- a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
@@ -47,14 +47,11 @@ A behavior with no PR-observed test and no targeted hit is recorded as a coverag
 
 The inventory is keyed to the **change's testable behaviors**, not to every test method in the
 repo. For each behavior, find _whether and at what layer_ it is covered, capture **1–3
-representative tests** as evidence plus an approximate **count** at that layer, and then **move
-on** — do not open and enumerate every test in a covered area. A behavior backed by 40 unit
-tests is recorded as `{ count: ~40, representative: [3 permalinks] }`, not 40 records. This is
-the dominant cost control on large repos: exhaustively cataloguing a well-covered area burns
-many tool calls and tokens to produce a record set no recommendation needs, and bloats the
-downstream report into an unreadable dump. Spend the search budget on **resolving each
-behavior's status**, not on completeness of enumeration. Two or three confirming tests prove a
-behavior is covered; the 38 others add cost, not confidence.
+representative tests** plus an approximate **count** at that layer, then **move on** — do not
+enumerate every test in a covered area. A behavior backed by 40 unit tests is recorded as
+`{ count: ~40, representative: [3 permalinks] }`, not 40 records. This is the dominant cost control
+on large repos: two or three confirming tests prove a behavior is covered; cataloguing the rest
+burns tool calls, bloats the downstream report, and adds cost, not confidence.
 
 ## Citing tests as GitHub permalinks
 

From 94d5698ea1686017a8ba24cd6cbe3ab14a4a28d1 Mon Sep 17 00:00:00 2001
From: Ned Thompson <nthompson@bitwarden.com>
Date: Mon, 22 Jun 2026 16:37:47 -0400
Subject: [PATCH 7/9] update atlassian tool mcp refs

---
 plugins/bitwarden-test-engineer/README.md     |  6 +++---
 .../agents/test-strategist.md                 | 16 +++++++--------
 .../references/input-sources.md               | 20 +++++++++----------
 .../skills/analyzing-test-stack/SKILL.md      |  2 +-
 .../references/severity-risk.md               |  2 +-
 .../references/finding-coverage.md            |  2 +-
 6 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/plugins/bitwarden-test-engineer/README.md b/plugins/bitwarden-test-engineer/README.md
index 4b7e183..9142516 100644
--- a/plugins/bitwarden-test-engineer/README.md
+++ b/plugins/bitwarden-test-engineer/README.md
@@ -49,9 +49,9 @@ unverified when that repo isn't checked out.
 
 ## Cross-Plugin Integration
 
-| Plugin                      | How It's Used                                                                                                                                                                                                                               |
-| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `bitwarden-atlassian-tools` | Optional but recommended. Provides the `mcp__bitwarden-atlassian__*` server used to read Jira tickets and linked Confluence requirements. If absent, the plugin degrades gracefully — paste requirements or rely on the PR/CSV/description. |
+| Plugin                      | How It's Used                                                                                                                                                                                                                                                                |
+| --------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `bitwarden-atlassian-tools` | Optional but recommended. Provides the `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__*` server used to read Jira tickets and linked Confluence requirements. If absent, the plugin degrades gracefully — paste requirements or rely on the PR/CSV/description. |
 
 ## Installation
 
diff --git a/plugins/bitwarden-test-engineer/agents/test-strategist.md b/plugins/bitwarden-test-engineer/agents/test-strategist.md
index c2ad1ed..ba5d359 100644
--- a/plugins/bitwarden-test-engineer/agents/test-strategist.md
+++ b/plugins/bitwarden-test-engineer/agents/test-strategist.md
@@ -58,13 +58,13 @@ tools:
   - Bash(git -C * rev-parse:*)
   - Bash(git -C * remote get-url:*)
   - Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*)
-  - mcp__bitwarden-atlassian__get_issue
-  - mcp__bitwarden-atlassian__search_issues
-  - mcp__bitwarden-atlassian__get_issue_comments
-  - mcp__bitwarden-atlassian__get_issue_remote_links
-  - mcp__bitwarden-atlassian__get_confluence_page
-  - mcp__bitwarden-atlassian__search_confluence
-  - mcp__bitwarden-atlassian__search_confluence_cql
+  - mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue
+  - mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_issues
+  - mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_comments
+  - mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_remote_links
+  - mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_confluence_page
+  - mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_confluence
+  - mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_confluence_cql
 skills:
   - assessing-test-coverage
   - analyzing-test-stack
@@ -79,7 +79,7 @@ You produce a recommendation — an HTML report — not the tests themselves. Gr
 
 A single feature frequently spans several repos (a server endpoint + a web client + a mobile screen), each shaped independently — match the recommendation to each repo's actual practice, not a house style. **Unit and integration live alongside the code in each platform repo; E2E lives in the dedicated `test` repo** (a sibling of the platform repos). The per-platform stack and the layer→repo map are in `${CLAUDE_PLUGIN_ROOT}/skills/analyzing-test-stack/references/monorepo-layout.md`.
 
-Atlassian capabilities depend on the **`bitwarden-atlassian-tools`** plugin (the `mcp__bitwarden-atlassian__*` server). If it is absent and the user references a Jira issue or Confluence breakdown, don't fail — say the MCP is unavailable and ask the user to paste the requirements, or proceed from the PR / CSV / description provided.
+Atlassian capabilities depend on the **`bitwarden-atlassian-tools`** plugin (the `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__*` server). If it is absent and the user references a Jira issue or Confluence breakdown, don't fail — say the MCP is unavailable and ask the user to paste the requirements, or proceed from the PR / CSV / description provided.
 
 ## Workflow
 
diff --git a/plugins/bitwarden-test-engineer/references/input-sources.md b/plugins/bitwarden-test-engineer/references/input-sources.md
index 224525d..d0342cc 100644
--- a/plugins/bitwarden-test-engineer/references/input-sources.md
+++ b/plugins/bitwarden-test-engineer/references/input-sources.md
@@ -10,12 +10,12 @@ Preferred: if the `bitwarden-atlassian-tools` plugin is installed, invoke
 
 Otherwise use the MCP tools directly:
 
-- `mcp__bitwarden-atlassian__get_issue` — the issue itself (summary, description,
+- `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue` — the issue itself (summary, description,
   acceptance criteria, custom fields).
-- `mcp__bitwarden-atlassian__get_issue_comments` — clarifications and edge cases raised in
+- `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_comments` — clarifications and edge cases raised in
   discussion.
-- `mcp__bitwarden-atlassian__get_issue_remote_links` — linked Confluence pages and PRs.
-- `mcp__bitwarden-atlassian__get_confluence_page` — linked requirements/design docs.
+- `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_remote_links` — linked Confluence pages and PRs.
+- `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_confluence_page` — linked requirements/design docs.
 
 Extract: discrete **testable behaviors**, **acceptance criteria**, and the **platforms/
 components** named. If the MCP is unavailable, ask the user to paste the requirements.
@@ -38,7 +38,7 @@ before extracting:
 
 1. **Discover children.** Read the `subtasks` field first. If empty (common in next-gen
    projects, which use `parent` relationships rather than the legacy `subtasks` field), fall
-   back to `mcp__bitwarden-atlassian__search_issues` with JQL `parent = <EPIC-KEY>`. On
+   back to `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_issues` with JQL `parent = <EPIC-KEY>`. On
    classic projects, also try `"Epic Link" = <EPIC-KEY>`. Together these cover both schemas.
 2. **Bound the fan-out.** If the epic has more than ~10 children, fetch the first 10 in full
    and summarize the rest as a one-line list (key, status, summary) from the search results.
@@ -46,10 +46,10 @@ before extracting:
    `bitwarden-atlassian-tools:researching-jira-issues` (Steps 2–3) — re-use that recipe; do
    not re-derive it.
 3. **Per child, gather behaviors and PRs.**
-   - `mcp__bitwarden-atlassian__get_issue` for the child's description and acceptance criteria —
+   - `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue` for the child's description and acceptance criteria —
      these are the testable behaviors. Carry each child's **key and browse URL** with the behaviors
      it produces — a behavior sourced from a child links to that child, not the epic.
-   - `mcp__bitwarden-atlassian__get_issue_remote_links` for PRs (grouped under "GitHub"). Each PR URL
+   - `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_remote_links` for PRs (grouped under "GitHub"). Each PR URL
      feeds the **GitHub PR** branch below (`gh pr view` / `gh pr diff`). **These merged/linked PRs
      are the reliable backbone for existing coverage** — they carry the tests that shipped and the
      PR head SHA makes each permalink-ready (see `finding-coverage.md` → _Finding existing
@@ -89,9 +89,9 @@ scoping you would otherwise reconstruct from a diff or a ticket. Mine it; don't
 
 Locate and fetch it:
 
-- If given a page ID or URL, fetch directly with `mcp__bitwarden-atlassian__get_confluence_page`.
-- If given only a feature/team name, find the page first with `mcp__bitwarden-atlassian__search_confluence`
-  or `mcp__bitwarden-atlassian__search_confluence_cql` (breakdowns live in a team's "Tech Breakdown"
+- If given a page ID or URL, fetch directly with `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_confluence_page`.
+- If given only a feature/team name, find the page first with `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_confluence`
+  or `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_confluence_cql` (breakdowns live in a team's "Tech Breakdown"
   folder), then fetch it.
 - The breakdown's **status** matters: `IN PLANNING` / `IN PROGRESS` means the scope may still
   shift — note that the recommendation rests on a draft. `PROPOSED` / `ACCEPTED` is a stable
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
index 15275a5..a1d1754 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
@@ -1,7 +1,7 @@
 ---
 name: analyzing-test-stack
 description: Use when recommending what test automation a feature, bugfix, or change needs and at which layer — from a Jira ticket, GitHub PR, test-case CSV, technical breakdown, and/or plain-language description — mapping each behavior to the cheapest sufficient layer (unit, integration, E2E) inside each repo's actual test shape, risk-weighted by defect severity. Triggers on "test stack", "test strategy", "test plan for this PR/ticket", "which test layers should this have", or "what tests does this Critical/High bug need". This is the forward-looking recommendation — it does NOT inventory what already exists; for that, use assessing-test-coverage (whose inventory this skill consumes).
-allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Skill, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(gh pr checks:*), Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*), mcp__bitwarden-atlassian__get_issue, mcp__bitwarden-atlassian__search_issues, mcp__bitwarden-atlassian__get_issue_comments, mcp__bitwarden-atlassian__get_issue_remote_links, mcp__bitwarden-atlassian__get_confluence_page, mcp__bitwarden-atlassian__search_confluence, mcp__bitwarden-atlassian__search_confluence_cql"
+allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Skill, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(gh pr checks:*), Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*), mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue, mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_issues, mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_comments, mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_remote_links, mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_confluence_page, mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_confluence, mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_confluence_cql"
 ---
 
 # Analyzing the Test Stack
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
index 24b717b..d22bb3c 100644
--- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
@@ -16,7 +16,7 @@ Confluence page `2759229512`:
 <https://bitwarden.atlassian.net/wiki/spaces/EN/pages/2759229512/Severity>. That page is
 authoritative — read it for the level definitions, criteria, and signals; this file does
 not reproduce them. When the `bitwarden-atlassian-tools` MCP is available, fetch the page
-with `mcp__bitwarden-atlassian__get_confluence_page` (pageId `2759229512`) and classify
+with `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_confluence_page` (pageId `2759229512`) and classify
 each behavior against its criteria. If the fetch fails or the MCP is unavailable, classify
 against the generally understood meaning of the levels below using your own judgment, and
 note in the report that severities were assessed without the guide (definitions not
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
index 44fdb50..2f36c4d 100644
--- a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
@@ -28,7 +28,7 @@ Reliably establishing what is **already tested** does not require grepping a who
 two ordered moves, and record anything still unfound as a gap rather than dropping it:
 
 1. **Merged/linked PRs are the backbone.** The PRs hanging off the Jira issue and its epic
-   children (`mcp__bitwarden-atlassian__get_issue_remote_links` → `gh pr view`/`gh pr diff`) are the reliable record of
+   children (`mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_remote_links` → `gh pr view`/`gh pr diff`) are the reliable record of
    the tests that shipped with this work, and are already permalink-ready via the PR head SHA.
    Take the tests observed in those PR diffs as primary coverage evidence.
 2. **Targeted repo lookup for pre-existing tests.** Tests written _before_ this ticket won't

From 3e4a51a1c161c47802ec0d937657843054f4a9b5 Mon Sep 17 00:00:00 2001
From: Ned Thompson <nthompson@bitwarden.com>
Date: Mon, 22 Jun 2026 16:59:46 -0400
Subject: [PATCH 8/9] make examples more generic

---
 plugins/bitwarden-test-engineer/README.md            |  4 ++--
 .../agents/test-strategist.md                        |  2 +-
 .../references/finding-coverage.md                   | 12 ++++++------
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/plugins/bitwarden-test-engineer/README.md b/plugins/bitwarden-test-engineer/README.md
index 9142516..8999d37 100644
--- a/plugins/bitwarden-test-engineer/README.md
+++ b/plugins/bitwarden-test-engineer/README.md
@@ -80,8 +80,8 @@ Does bitwarden/server#5821 have the right tests, or is it leaning too hard on en
 ```
 
 ```
-Here's our exported test cases CSV for the billing migration — which of these should be
-automated and at what layer?
+Here's our exported test-case CSV for the import/export work on the new item types
+(PM-32009) — which of these should be automated and at what layer?
 ```
 
 Each run produces a per-change directory `test-engineer-report-<slug>-<date>/` holding the
diff --git a/plugins/bitwarden-test-engineer/agents/test-strategist.md b/plugins/bitwarden-test-engineer/agents/test-strategist.md
index ba5d359..7500301 100644
--- a/plugins/bitwarden-test-engineer/agents/test-strategist.md
+++ b/plugins/bitwarden-test-engineer/agents/test-strategist.md
@@ -24,7 +24,7 @@ description: |
 
   <example>
   Context: A QA engineer exported a set of manual test cases and wants an automation plan.
-  user: "Here's our exported test cases CSV for the billing migration work — which of these should be automated and at what layer?"
+  user: "Here's our exported test-case CSV for the import/export work on the new item types (PM-32009) — which of these should be automated and at what layer?"
   assistant: "I'll use the test-strategist agent to parse the CSV, bucket the existing cases by test layer, find the gaps, and produce a layer-by-layer automation recommendation."
   <commentary>
   CSV intake. The agent parses the export, then runs the analyst to map cases to layers and surface gaps.
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
index 2f36c4d..cfbd54a 100644
--- a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
@@ -109,23 +109,23 @@ present. A behavior with no Jira source (e.g. found only in a PR diff) omits `so
 
 ```
 {
-  "behavior": "per-phase price resolution on schedule activation",
+  "behavior": "bank account item type round-trips through import/export",
   "platform": "server",
   "layer": "integration",
   "status": "covered",
   "count": 21,
   "source_issue": {
-    "key": "PM-1234",
-    "url": "https://bitwarden.atlassian.net/browse/PM-1234"
+    "key": "PM-32009",
+    "url": "https://bitwarden.atlassian.net/browse/PM-32009"
   },
   "representative": [
     {
-      "path": "test/Billing/.../ScheduleHandlerTests.cs",
+      "path": "test/Core.Test/Vault/.../CipherItemTypeTests.cs",
       "start_line": 42,
       "end_line": 89,
       "owner_repo": "bitwarden/server",
       "sha": "a1b2c3d4e5f6…",
-      "permalink": "https://github.com/bitwarden/server/blob/a1b2c3d4e5f6…/test/Billing/.../ScheduleHandlerTests.cs#L42-L89"
+      "permalink": "https://github.com/bitwarden/server/blob/a1b2c3d4e5f6…/test/Core.Test/Vault/.../CipherItemTypeTests.cs#L42-L89"
     }
   ]
 }
@@ -136,7 +136,7 @@ A representative test that cannot be linked is recorded path-only with a reason
 never fabricate a URL. Behaviors/surfaces with no observed test are returned as gaps:
 
 ```
-{ "behavior": "tier downgrade preserves seat count", "platform": "server", "status": "unverified" }
+{ "behavior": "organization policy can restrict the Driver License item type", "platform": "server", "status": "unverified" }
 ```
 
 Keep `representative` to at most three permalinks per behavior; the `count` conveys breadth

From 1fbffb89d35287baad47e879d713d964d1d7ecaf Mon Sep 17 00:00:00 2001
From: Ned Thompson <nthompson@bitwarden.com>
Date: Mon, 22 Jun 2026 17:10:05 -0400
Subject: [PATCH 9/9] update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 333852a..5ba7c34 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ A curated collection of plugins for AI-assisted development at Bitwarden. Enable
 | [bitwarden-product-analyst](plugins/bitwarden-product-analyst/)     | 0.1.5   | Product analyst agent for creating comprehensive Bitwarden requirements documents from multiple sources                                                     |
 | [bitwarden-security-engineer](plugins/bitwarden-security-engineer/) | 1.2.0   | Application security engineering: vulnerability triage, threat modeling, and secure code analysis                                                           |
 | [bitwarden-software-engineer](plugins/bitwarden-software-engineer/) | 1.0.0   | Software engineer agent for a Bitwarden product team. Implements stories, tasks, and bugs with code quality, performance, security, and team comms in mind. |
-| [bitwarden-test-engineer](plugins/bitwarden-test-engineer/)         | 1.0.0   | Test engineering toolkit: an orchestrator dispatches testing skills strategy and planning, automation, exploratory testing, and quality assessment.         |
+| [bitwarden-test-engineer](plugins/bitwarden-test-engineer/)         | 1.0.0   | Test engineering toolkit: role-specific testing agents spanning the test lifecycle, starting with risk-weighted test strategy and coverage planning.        |
 | [claude-config-validator](plugins/claude-config-validator/)         | 1.1.1   | Validates Claude Code configuration files for security, structure, and quality                                                                              |
 | [claude-retrospective](plugins/claude-retrospective/)               | 1.1.1   | Analyze Claude Code sessions to identify successful patterns and improvement opportunities                                                                  |