diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 6ce73d0..dbfa641 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -92,6 +92,12 @@
       "source": "./plugins/bitwarden-design-tools",
       "version": "0.1.0",
       "description": "Design toolkit for Bitwarden — non-persona skills for the design lifecycle. Content style guide reference, Figma Dev Mode MCP usage, Bitwarden brand application, design-to-engineering handoff prep, Design System governance, and the Product and Design Jira workflow. Composed by the bitwarden-designer agent and usable standalone."
+    },
+    {
+      "name": "bitwarden-test-engineer",
+      "source": "./plugins/bitwarden-test-engineer",
+      "version": "1.0.0",
+      "description": "Test engineering toolkit for Bitwarden. Hosts role-specific testing agents — currently a test strategist that recommends what to test, at which layer, and why (risk-weighted, shaped to each repo) and inventories existing coverage. Designed to grow additional roles such as an SDET or a QA engineer."
     }
   ]
 }
diff --git a/.cspell.json b/.cspell.json
index 7f702a6..ce3f3c4 100644
--- a/.cspell.json
+++ b/.cspell.json
@@ -3,6 +3,7 @@
   "version": "0.2",
   "words": [
     "accum",
+    "actioned",
     "adf",
     "AKIA",
     "anthropics",
@@ -12,6 +13,7 @@
     "askable",
     "ASVS",
     "atlassian",
+    "automatable",
     "Bitwarden",
     "blocklist",
     "blogposts",
@@ -25,11 +27,13 @@
     "codeBlock",
     "CODEOWNERS",
     "Confluence",
+    "Consolas",
     "CQL",
     "customfield",
     "cvss",
     "Dashlane",
     "dast",
+    "detekt",
     "docstrings",
     "dread",
     "duedate",
@@ -50,6 +54,7 @@
     "Gatekeeping",
+    "getline",
     "GHAS",
     "ghsa",
     "gofmt",
     "gradlew",
     "grype",
@@ -60,17 +65,21 @@
     "hotspots",
     "IDOR",
     "inclusivity",
+    "inlines",
     "issueIdOrKey",
     "issuelinks",
     "issuetype",
     "Jira",
     "JQL",
     "keyserver",
+    "ktlint",
     "lockdown",
     "lockfiles",
     "maxResults",
     "mcp",
+    "Menlo",
     "metacharacters",
+    "mockall",
     "modelcontextprotocol",
     "msword",
     "MVVM",
@@ -78,6 +87,7 @@
     "mypassword",
     "myproject",
     "Newtonsoft",
+    "nextest",
     "nextPageToken",
     "numstat",
     "NVARCHAR",
@@ -94,11 +104,14 @@
     "remotelink",
     "Rescope",
     "resolutiondate",
+    "Robolectric",
     "rustdoc",
     "sarif",
+    "SDET",
     "SDLC",
     "sast",
     "sbom",
+    "Segoe",
     "semver",
     "shellcheck",
     "shortlog",
@@ -117,15 +130,22 @@
     "startswith",
     "stride",
     "structurizr",
+    "stylesheet",
+    "subdirs",
+    "tablist",
+    "tabpanel",
     "tarpit",
     "thumbsup",
     "tinyui",
+    "tnum",
     "touchpoint",
     "touchpoints",
     "triaging",
     "unassigning",
     "unassigns",
+    "unfound",
     "ungroup",
+    "unlinkable",
     "unresponded",
     "unsanitized",
     "userflow",
@@ -139,6 +159,7 @@
     "wordprocessingml",
     "worktree",
     "worktrees",
+    "XCUI",
     "xoxb",
     "Zeroize",
     "zeroization",
diff --git a/README.md b/README.md
index bfc8c8f..5ba7c34 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ A curated collection of plugins for AI-assisted development at Bitwarden. Enable
 | [bitwarden-product-analyst](plugins/bitwarden-product-analyst/)     | 0.1.5   | Product analyst agent for creating comprehensive Bitwarden requirements documents from multiple sources                                                     |
 | [bitwarden-security-engineer](plugins/bitwarden-security-engineer/) | 1.2.0   | Application security engineering: vulnerability triage, threat modeling, and secure code analysis                                                           |
 | [bitwarden-software-engineer](plugins/bitwarden-software-engineer/) | 1.0.0   | Software engineer agent for a Bitwarden product team. Implements stories, tasks, and bugs with code quality, performance, security, and team comms in mind. |
+| [bitwarden-test-engineer](plugins/bitwarden-test-engineer/)         | 1.0.0   | Test engineering toolkit: role-specific testing agents spanning the test lifecycle, starting with risk-weighted test strategy and coverage planning.        |
 | [claude-config-validator](plugins/claude-config-validator/)         | 1.1.1   | Validates Claude Code configuration files for security, structure, and quality                                                                              |
 | [claude-retrospective](plugins/claude-retrospective/)               | 1.1.1   | Analyze Claude Code sessions to identify successful patterns and improvement opportunities                                                                  |
 
diff --git a/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json b/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json
new file mode 100644
index 0000000..0363e0e
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json
@@ -0,0 +1,22 @@
+{
+  "name": "bitwarden-test-engineer",
+  "version": "1.0.0",
+  "description": "Test engineering toolkit for Bitwarden. Hosts role-specific testing agents — currently a test strategist that recommends what to test, at which layer, and why (risk-weighted, shaped to each repo) and inventories existing coverage. Designed to grow additional roles such as an SDET or a QA engineer.",
+  "author": {
+    "name": "Bitwarden",
+    "url": "https://github.com/bitwarden"
+  },
+  "homepage": "https://github.com/bitwarden/ai-plugins/tree/main/plugins/bitwarden-test-engineer",
+  "repository": "https://github.com/bitwarden/ai-plugins",
+  "keywords": [
+    "testing",
+    "test-engineering",
+    "quality-engineering",
+    "test-strategy",
+    "test-automation",
+    "exploratory-testing",
+    "test-layers",
+    "qa",
+    "orchestrator"
+  ]
+}
diff --git a/plugins/bitwarden-test-engineer/CHANGELOG.md b/plugins/bitwarden-test-engineer/CHANGELOG.md
new file mode 100644
index 0000000..1b109fb
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/CHANGELOG.md
@@ -0,0 +1,15 @@
+# Changelog
+
+All notable changes to the Bitwarden Test Engineer Plugin will be documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [1.0.0] - 2026-06-15
+
+### Added
+
+- Initial release of the `bitwarden-test-engineer` plugin.
+- `test-strategist` agent: classifies a change's inputs (Jira ticket, GitHub PR, tech breakdown, test-case CSV, plain-language description), fans out subagents to gather evidence, and presents a test recommendation.
+- `assessing-test-coverage` skill: inventories what a change is already tested by, buckets observed tests by layer, cites them as stable GitHub permalinks, and writes a self-contained HTML coverage report.
+- `analyzing-test-stack` skill: maps a change's testable behaviors to the cheapest sufficient test layer per platform, surfaces coverage gaps and shape-wrong tests, and emits a self-contained HTML report.
+- Shared plugin-level `references/` and a `build-report.sh` script that splices the single shared stylesheet into each report so the two reports can't drift.
diff --git a/plugins/bitwarden-test-engineer/README.md b/plugins/bitwarden-test-engineer/README.md
new file mode 100644
index 0000000..8999d37
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/README.md
@@ -0,0 +1,99 @@
+# Bitwarden Test Engineer Plugin
+
+## Overview
+
+A test engineering toolkit for Bitwarden. It hosts role-specific testing agents. Today it
+ships one — the **test strategist** (`test-strategist`), the test-_planning_ role:
+it recommends what to test, at which layer, and why, and inventories what is already tested.
+It does not author, run, or maintain the tests, nor does it perform exploratory/manual QA. The plugin is
+designed to grow additional roles over time (for example an SDET or a QA engineer).
+
+### First role: the test strategist
+
+Given a change — a feature, bugfix, refactor, or migration — the agent recommends
+**what to test, at which layer, and why**, shaped to **each repo's actual test practice**.
+Two ideas drive it: each behavior is tested at the cheapest layer that buys the confidence it
+needs (unit, integration, or E2E), and how those layers are weighted is decided per repo — a
+unit-heavy pyramid (`server`, `clients`, `sdk-internal`, `android`), an integration/snapshot
+trophy (`ios`), or an all-E2E repo (the dedicated `test` repo,
+`browser-interactions-testing`). E2E is "thin" only _within_ a platform repo; the dedicated
+`test` repo is entirely E2E by design.
+
+It ingests whatever evidence is available — a Jira ticket or Confluence tech breakdown (via the Atlassian
+MCP), a GitHub PR (via `gh`), an exported test-case CSV, and/or a plain-language description — fans out
+subagents to gather it, assesses what is **already tested** (the `assessing-test-coverage`
+skill, which inventories existing tests, cites each as a GitHub permalink, and writes a
+coverage report), then runs the analyst skill (`analyzing-test-stack`), which produces the
+test-stack recommendation. Both skills emit a self-contained HTML report.
+
+## Where each layer lives
+
+Unit and integration tests live alongside the code inside each platform repo
+(e.g. `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`). **End-to-end tests live
+in a dedicated, private `test` repository** — not inside the platform repos — so E2E
+recommendations target that separate repo, and existing E2E coverage is treated as
+unverified when that repo isn't checked out.
+
+## Agents
+
+| Agent             | What It Does                                                                                                                                                                                                                                                                         |
+| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `test-strategist` | Classifies the inputs for a change (Jira, PR, CSV, description), fans out subagents to gather evidence, assesses existing coverage (`assessing-test-coverage`), then runs `analyzing-test-stack` — emitting a self-contained coverage report and a self-contained test-stack report. |
+
+## Skills
+
+| Skill                     | What It Does                                                                                                                                                                                                                                                                                                                                                                                    |
+| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `assessing-test-coverage` | The backward-looking inventory. Determines what is **already tested** for a change — scoped to the change surface, PR-first then a targeted lookup — buckets each observed test by layer, cites it as a stable GitHub permalink, flags untested behaviors as gaps, and writes a self-contained HTML coverage report. Feeds `analyzing-test-stack`; usable standalone to audit current coverage. |
+| `analyzing-test-stack`    | The recommender. Consumes the coverage inventory, then maps each testable behavior in a change to the cheapest sufficient test layer per platform, inside each repo's actual shape, names concrete tooling, surfaces coverage gaps and shape-wrong tests (ice-cream-cone, over-testing, missing platform layers), and writes a self-contained HTML report into a per-change report directory.   |
+
+## Cross-Plugin Integration
+
+| Plugin                      | How It's Used                                                                                                                                                                                                                                                                |
+| --------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `bitwarden-atlassian-tools` | Optional but recommended. Provides the `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__*` server used to read Jira tickets and linked Confluence requirements. If absent, the plugin degrades gracefully — paste requirements or rely on the PR/CSV/description. |
+
+## Installation
+
+```bash
+/plugin install bitwarden-test-engineer@bitwarden-marketplace
+```
+
+For Jira-backed analysis, install the Atlassian tools alongside it:
+
+```bash
+/plugin install bitwarden-atlassian-tools@bitwarden-marketplace
+```
+
+## Usage
+
+The agent activates when you ask what test coverage a change needs, which
+automation layers to add, how to shape a test plan, or whether existing tests are at the
+right level:
+
+```
+I'm picking up PM-12345 next sprint. What test coverage should this feature have?
+```
+
+```
+Does bitwarden/server#5821 have the right tests, or is it leaning too hard on end-to-end?
+```
+
+```
+Here's our exported test cases CSV for the new item types import/export work (PM-32009) —
+which of these should be automated and at what layer?
+```
+
+Each run produces a per-change directory `test-engineer-report-<slug>-<date>/` holding the
+self-contained HTML reports: `coverage.html` (what is already tested — observed tests per layer,
+each cited as a GitHub permalink, plus gaps), `recommended.html` (the per-platform recommendation
+and its coverage-gap findings), and `combined.html` (the primary deliverable — both on one two-tab
+page). Re-running on the same change and date refreshes the reports in that directory. They share
+one off-brand data-report visual system so they read as the same instrument.
+
+## References
+
+- [Claude Code Agents](https://code.claude.com/docs/en/agents)
+- [Claude Code Skills](https://code.claude.com/docs/en/skills)
+- [The Testing Trophy](https://kentcdodds.com/blog/the-testing-trophy-and-testing-classifications)
+- [Bitwarden Contributing Guidelines](https://contributing.bitwarden.com/contributing/)
diff --git a/plugins/bitwarden-test-engineer/agents/test-strategist.md b/plugins/bitwarden-test-engineer/agents/test-strategist.md
new file mode 100644
index 0000000..7500301
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/agents/test-strategist.md
@@ -0,0 +1,151 @@
+---
+name: test-strategist
+version: 1.0.0
+description: |
+  Test strategist for Bitwarden — the test-planning role, scoped to exactly the two skills it owns: (1) analyzing-test-stack, which recommends what test automation a change needs and at which layer, and (2) assessing-test-coverage, which inventories what is already tested. It produces a risk-weighted plan and a coverage inventory — it does NOT author, run, or maintain test code (a future SDET role), and does NOT perform exploratory or manual QA (a future QA-engineer role); do not delegate those to it. Takes a change — a feature, bugfix, refactor, or migration — described in plain language or carried in a Jira ticket, a GitHub PR, a Confluence tech breakdown, and/or an exported test-case CSV, and produces an evidence-driven recommendation for the right test automation layers (unit, integration, E2E), shaped to each repo's actual test practice rather than one universal shape, and risk-weighted by each behavior's defect severity (impact, not urgency), across Bitwarden's server, client, and mobile codebases. Use when the user asks what test coverage a change needs, which automation layers to add, how to shape a test plan, whether existing tests are over- or under-weighted, how to prioritize test coverage by risk, what tests a Critical/High bug needs, or what is already tested for a change — or asks for a "test stack" / "test strategy" / "risk-based coverage" / "coverage inventory" analysis for a ticket, PR, tech breakdown, or set of test cases.
+
+  <example>
+  Context: An engineer is about to start a Jira story and wants to know what test automation it should ship with.
+  user: "I'm picking up PM-12345 next sprint. What test coverage should this feature have?"
+  assistant: "I'll use the test-strategist agent to pull the requirements from PM-12345, map the change across the affected codebases, and produce a test-layer recommendation shaped to each affected repo."
+  <commentary>
+  Jira-key intake. The agent gathers the ticket via the Atlassian MCP, then runs Skill(analyzing-test-stack) to produce the report.
+  </commentary>
+  </example>
+
+  <example>
+  Context: A reviewer wants to know whether an open PR is adequately tested at the right layers.
+  user: "Does bitwarden/server#5821 have the right tests, or is it leaning too hard on end-to-end?"
+  assistant: "I'll use the test-strategist agent to read the PR diff and its tests, assess the test shape, and check specifically for an ice-cream-cone (too E2E-heavy) anti-pattern."
+  <commentary>
+  PR intake plus an explicit anti-pattern concern. The agent gathers the diff via gh, then runs the analyst, which assesses the test shape including the ice-cream-cone check.
+  </commentary>
+  </example>
+
+  <example>
+  Context: A QA engineer exported a set of manual test cases and wants an automation plan.
+  user: "Here's our exported test cases CSV for the new item types import/export work (PM-32009) — which of these should be automated and at what layer?"
+  assistant: "I'll use the test-strategist agent to parse the CSV, bucket the existing cases by test layer, find the gaps, and produce a layer-by-layer automation recommendation."
+  <commentary>
+  CSV intake. The agent parses the export, then runs the analyst to map cases to layers and surface gaps.
+  </commentary>
+  </example>
+
+  <example>
+  Context: A tech lead just finished a tech breakdown and wants the test plan that should accompany it.
+  user: "I've got the tech breakdown for the new device-approval flow in Confluence — what test coverage should we plan across the stack?"
+  assistant: "I'll use the test-strategist agent to read the breakdown, mine its scope checklist and spec child pages for the surfaces and behaviors it touches, and produce a per-platform test-stack recommendation shaped to each repo."
+  <commentary>
+  Tech-breakdown intake. The agent fetches the Confluence breakdown via the Atlassian MCP, extracts testable behaviors and the affected platforms from Part 2, then runs the analyst to emit the report.
+  </commentary>
+  </example>
+model: inherit
+tools:
+  - Read
+  - Write
+  - Glob
+  - Grep
+  - Skill
+  - Task
+  - AskUserQuestion
+  - Bash(gh pr view:*)
+  - Bash(gh pr diff:*)
+  - Bash(gh pr checks:*)
+  - Bash(git diff:*)
+  - Bash(git log:*)
+  - Bash(git rev-parse:*)
+  - Bash(git remote get-url:*)
+  - Bash(git -C * rev-parse:*)
+  - Bash(git -C * remote get-url:*)
+  - Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*)
+  - mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue
+  - mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_issues
+  - mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_comments
+  - mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_remote_links
+  - mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_confluence_page
+  - mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_confluence
+  - mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_confluence_cql
+skills:
+  - assessing-test-coverage
+  - analyzing-test-stack
+color: green
+---
+
+You are the **test strategist** for Bitwarden — the test-planning role. Your job: take a change — a feature, bugfix, refactor, or migration — and say **what to test, at which layer, and why**. You recommend the plan and inventory existing coverage; you do not author, run, or maintain the tests, nor run exploratory/manual QA — those are separate roles this plugin may grow into later.
+
+You produce a recommendation — an HTML report — not the tests themselves. Ground every layer call in evidence; a test plan drifts toward whatever is easiest to write rather than what buys confidence, so keep each repo's shape honest.
+
+## Operating context
+
+A single feature frequently spans several repos (a server endpoint + a web client + a mobile screen), each shaped independently — match the recommendation to each repo's actual practice, not a house style. **Unit and integration live alongside the code in each platform repo; E2E lives in the dedicated `test` repo** (a sibling of the platform repos). The per-platform stack and the layer→repo map are in `${CLAUDE_PLUGIN_ROOT}/skills/analyzing-test-stack/references/monorepo-layout.md`.
+
+Atlassian capabilities depend on the **`bitwarden-atlassian-tools`** plugin (the `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__*` server). If it is absent and the user references a Jira issue or Confluence breakdown, don't fail — say the MCP is unavailable and ask the user to paste the requirements, or proceed from the PR / CSV / description provided.
+
+## Workflow
+
+Classify what the request needs and dispatch to the matching skill(s) — each skill runs standalone:
+
+- _"What's already tested for this PR?"_ → `Skill(assessing-test-coverage)` alone.
+- _"What layers should this change ship with?"_ → `Skill(analyzing-test-stack)` (it pulls its own coverage inventory if none is supplied).
+- A full test plan / test-stack analysis → the **coverage → recommendation pipeline** below, run in sequence (the coverage inventory feeds the recommendation).
+
+The steps below specify that pipeline end to end.
+
+### 1. Intake and scope
+
+Classify every input supplied — Jira key, GitHub PR, Confluence tech breakdown (page ID/URL or feature/team name), CSV path, plain-language description. Inputs are additive; handle any combination. Per-source ingestion (Epic expansion, breakdown mining, CSV column mapping) lives in `${CLAUDE_PLUGIN_ROOT}/references/input-sources.md` — don't re-derive it. Then determine the **affected repos/platforms**: if scope is genuinely ambiguous and it changes the recommendation, use `AskUserQuestion`; otherwise infer and state your assumption.
+
+### 2. Fan out to gather evidence
+
+Spawn `Task` subagents **in parallel**, one per evidence source or affected repo, so your context stays lean. Each returns a compact structured digest, not raw dumps:
+
+- **Requirements reader** (`sonnet`) — resolves the Jira issue into testable behaviors and acceptance criteria, expanding Epics/Features to their children, feeding linked PR URLs to the PR analyzer, and capturing the bug **severity** and each behavior's **source issue key + browse URL**. Follows `${CLAUDE_PLUGIN_ROOT}/references/input-sources.md` → _Epic intake_ and _Citing Jira issues as links_.
+- **Breakdown reader** (`sonnet`) — fetches the tech breakdown, mines Part 2's scope checklist for surfaces, Part 4 spec pages for interfaces, and Part 5 open questions for untestable-requirement risk. Returns testable behaviors per platform plus the breakdown's status.
+- **PR diff analyzer** (`sonnet`) — `gh pr diff` / `gh pr view` for the change surface, public API touched, and tests already present.
+- **CSV parser** (`haiku`) — buckets existing cases by apparent layer and automation status.
+
+Give each subagent one source and a tight output contract; skip any branch whose input wasn't supplied. **Set each subagent's model explicitly** (see _Model selection and context discipline_) — never let a digest-returning subagent inherit your model.
+
+### 3. Assess existing coverage
+
+Once the change surface is known (step 2), determine what is **already tested** before recommending anything. Fan out a **per-repo coverage scout** (`sonnet`) per affected repo, each applying the `assessing-test-coverage` skill — the record shape, discovery rules, per-behavior discipline, and permalink recipe live in `${CLAUDE_PLUGIN_ROOT}/skills/assessing-test-coverage/references/finding-coverage.md`; scouts follow it. Each returns one record per behavior plus `unverified` gaps. Merge the scouts' records into one inventory.
+
+Then invoke `Skill(assessing-test-coverage)` with the merged inventory and today's date to produce the coverage inventory and the **self-contained HTML coverage report**. Per the skill, the HTML _rendering_ is delegated to the Sonnet **report-writer subagent** — only the gathering and merge happen in your context. Skills can't read the clock; pass today's date, and the build script writes the report into the per-change `test-engineer-report-<slug>-<date>/` directory.
+
+### 4. Recommend
+
+Invoke `Skill(analyzing-test-stack)` with the digests **and the coverage inventory from step 3**. The behavior→layer mapping is the genuinely hard reasoning and **stays in your context** — map each behavior to the cheapest sufficient layer per platform, risk-weighted by severity, and surface gaps and shape-wrong tests (ice-cream-cone, mislabeled layers, ungrounded coverage claims) ordered by severity; the skill and its `references/` own how. Once the mapping is decided, rendering it to the **self-contained HTML report** is mechanical and is delegated to the Sonnet **report-writer subagent** — hand it the decided per-behavior records (each carrying its `source_issue` from intake) and your `#overview` synthesis.
+
+### 5. Combine and present
+
+Steps 3 and 4 each write their report into the per-change directory `test-engineer-report-<slug>-<date>/` — `coverage.html` and `recommended.html`. Assemble the **combined two-tab page** — the primary deliverable, _Current coverage_ + _Recommended coverage_ on one page — yourself with the build script (pure file assembly, no template or stylesheet reading, so your context stays lean):
+
+```bash
+"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \
+  --kind test-combined --slug <slug> --date <today> \
+  --current test-engineer-report-<slug>-<date>/coverage.html \
+  --recommended test-engineer-report-<slug>-<date>/recommended.html
+```
+
+The paths are deterministic under the per-change directory (and the prior steps print them); the two standalone reports are read, not modified, and `combined.html` lands beside them. Then mirror the test-stack report's `#overview` in chat — recommended shape per platform, the top open risks to resolve before committing to the plan, and any coverage the analyst couldn't verify — and point the user at `test-engineer-report-<slug>-<date>/combined.html` first (both standalone reports remain available for sharing a single view).
+
+## Principles
+
+These govern the orchestration; the per-skill principles live in the two skills.
+
+- **Coverage before recommendation.** Assess what exists (step 3) before mapping new layers (step 4); the recommendation is incremental against observed coverage, not absolute.
+- **Degrade gracefully.** A missing input (no MCP, no PR, no CSV, no `test` checkout) narrows the analysis; it never blocks it. State what you couldn't see.
+
+## Model selection and context discipline
+
+You **inherit the session model** for your own context — the orchestration and the hard behavior→layer/severity reasoning, where a wrong call is expensive to act on, stay with you. Everything you fan out is evidence-gathering or mechanical rendering and runs on an **explicitly pinned** cheaper model — never inherit:
+
+- **Evidence subagents** (step 2) — `sonnet` for anything reading a diff, ticket, or repo; `haiku` for pure CSV parsing.
+- **Coverage scouts** (step 3) — `sonnet`.
+- **Report-writer** — `sonnet`. Once the inventory (step 3) and the mapping (step 4) are decided, rendering to HTML is mechanical: the report-writer authors the content fragment per the skill's template and runs `build-report.sh` to splice in the stylesheet.
+
+Keep your own context lean — it is the most expensive token pool and is re-cached every turn:
+
+- **Never read the rendering files** (`html-report-template.md`, `coverage-report-template.md`, `report-template-common.md`, `report-style-tokens.md`, `report-style.css`, `build-report.sh`) — they are the report-writer's concern. You need only the reasoning references (`test-layers.md`, `severity-risk.md`, `monorepo-layout.md`, `input-sources.md`, and `finding-coverage.md` for the contract). The step-5 combined build is the one time you _invoke_ `build-report.sh` — on the two finished filenames; you still never read its source.
+- **Don't echo digests.** Synthesize subagent digests into the decision; keep inter-step narration to a few lines. The reports are the deliverable.
+- **Hand off by the smallest payload.** Pass report-writers the compact per-behavior records and the `#overview` text; if a record set is large, `Write` it to a temp file (e.g. `./.test-engineer-<slug>.json`) and pass the path.
diff --git a/plugins/bitwarden-test-engineer/references/input-sources.md b/plugins/bitwarden-test-engineer/references/input-sources.md
new file mode 100644
index 0000000..d0342cc
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/references/input-sources.md
@@ -0,0 +1,170 @@
+# Ingesting evidence sources
+
+Inputs are additive — handle any combination, and record in the report which sources were
+present and which were missing. Never block on a missing source.
+
+## Jira ticket
+
+Preferred: if the `bitwarden-atlassian-tools` plugin is installed, invoke
+`Skill(bitwarden-atlassian-tools:researching-jira-issues)` for a deep, link-following read.
+
+Otherwise use the MCP tools directly:
+
+- `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue` — the issue itself (summary, description,
+  acceptance criteria, custom fields).
+- `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_comments` — clarifications and edge cases raised in
+  discussion.
+- `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_remote_links` — linked Confluence pages and PRs.
+- `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_confluence_page` — linked requirements/design docs.
+
+Extract: discrete **testable behaviors**, **acceptance criteria**, and the **platforms/
+components** named. If the MCP is unavailable, ask the user to paste the requirements.
+
+For every issue, also capture its **key and browse URL** and **carry the originating key with each
+behavior you extract**, so the report can link every behavior back to its source — link form and the
+no-Jira-source case are in _Citing Jira issues as links_ below.
+
+Also capture each behavior's **severity** and carry it through with the behavior. Where it comes
+from (a bug's Jira severity field vs. assessed risk for a feature) and how it weights coverage are
+owned by `analyzing-test-stack`'s `references/severity-risk.md`.
+
+### Epic intake
+
+A Jira key may resolve to an Epic (or, in next-gen projects, a Feature) rather than a single
+story. The epic body itself rarely lists testable behaviors — those live on its children
+and on the PRs the children produce. If you analyze only the epic, you will under-scope the
+analysis. So when the `issuetype` on the `get_issue` response is `Epic` or `Feature`, expand
+before extracting (prefer the skill path in step 5; steps 1–4 are the direct-MCP fallback):
+
+1. **Discover children.** Read the `subtasks` field first. If empty (common in next-gen
+   projects, which use `parent` relationships rather than the legacy `subtasks` field), fall
+   back to `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_issues` with JQL `parent = <EPIC-KEY>`. On
+   classic projects, also try `"Epic Link" = <EPIC-KEY>`. Together these cover both schemas.
+2. **Bound the fan-out.** If the epic has more than ~10 children, fetch the first 10 in full
+   and summarize the rest as a one-line list (key, status, summary) from the search results.
+   This matches the depth-control discipline in
+   `bitwarden-atlassian-tools:researching-jira-issues` (Steps 2–3) — re-use that recipe; do
+   not re-derive it.
+3. **Per child, gather behaviors and PRs.**
+   - `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue` for the child's description and acceptance criteria —
+     these are the testable behaviors. Carry each child's **key and browse URL** with the behaviors
+     it produces — a behavior sourced from a child links to that child, not the epic.
+   - `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_remote_links` for PRs (grouped under "GitHub"). Each PR URL
+     feeds the **GitHub PR** branch below (`gh pr view` / `gh pr diff`). **These merged/linked PRs
+     are the reliable backbone for existing coverage** — they carry the tests that shipped and the
+     PR head SHA makes each permalink-ready (see `finding-coverage.md` → _Finding existing
+     coverage_). If `gh` cannot reach a PR (private fork, not authenticated, repo inaccessible),
+     record the URL as evidence-not-inspected rather than dropping it.
+4. **Track epic and child status.** The epic's status (`In Planning`/`In Progress`/`Done`) plus each child's
+   status tells you how much is shipped: `Done` children with merged PRs likely have tests-in-PR to audit; `To Do`
+   children are scope-only, so their recommendation is prospective. Surface this in the report's Evidence.
+5. **Preferred path.** The `researching-jira-issues` skill (preferred at the top of this file) does
+   this hierarchical discovery and depth-controlled traversal in one synthesized read — run it on the
+   epic key; the direct MCP calls above are the fallback.
+
+## GitHub PR
+
+- `gh pr view <pr> --json url,headRefOid,baseRefName,title,body,files,state` — title,
+  body (with any issue references it mentions), files changed, state, **and the head SHA +
+  PR URL (from which `owner/repo` is parsed)** needed for permalink production downstream.
+- `gh pr diff <pr>` — the actual change surface.
+
+Extract: the public API / behavior touched, the diff paths (→ which repos/platforms),
+**any tests already included in the PR** (so you assess incremental, not absolute,
+gaps), and the captured **`headRefOid`** + **`owner/repo`** (parsed from the PR URL).
+The SHA and `owner/repo` are required — they are what makes every test cited as
+existing coverage clickable in the report. Tests observed in the PR diff are primary
+coverage evidence; for _pre-existing_ tests not in the diff, do a targeted lookup scoped
+to the changed paths/symbols rather than a repo-wide sweep. See the
+`assessing-test-coverage` skill's `references/finding-coverage.md` → _Finding existing
+coverage_ and _Citing tests as GitHub permalinks_ for the link form and the fallback when
+ingredients are missing.
+
+## Technical breakdown document
+
+A Bitwarden **Tech Breakdown** — the Confluence artifact a team produces before implementation,
+authored with the `bitwarden-delivery-tools:writing-tech-breakdowns` skill. It is the richest
+single input for this analysis, because a good breakdown has already done the cross-platform
+scoping you would otherwise reconstruct from a diff or a ticket. Mine it; don't re-derive it.
+
+Locate and fetch it:
+
+- If given a page ID or URL, fetch directly with `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_confluence_page`.
+- If given only a feature/team name, find the page first with `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_confluence`
+  or `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_confluence_cql` (breakdowns live in a team's "Tech Breakdown"
+  folder), then fetch it.
+- The breakdown's **status** matters: `IN PLANNING` / `IN PROGRESS` means the scope may still
+  shift — note that the recommendation rests on a draft. `PROPOSED` / `ACCEPTED` is a stable
+  basis. Record the status as part of the evidence.
+
+Map its structure to testable evidence (the canonical template is page `2920349776`):
+
+- **Part 1 — Problem overview**: the feature framing and linked Jira epic. Use it for scope and
+  to cross-link any Jira/PR inputs, not as a behavior source on its own. **When Part 1 names an
+  Epic**, treat it the same as an Epic-key intake — drill into its children and their PR remote
+  links per the _Epic intake_ recipe above. A breakdown plus its epic together usually surface
+  more testable behavior than either alone.
+- **Part 2 — Breakdown scope checklist**: the core of the mining. Each answered item names a
+  surface the change touches and therefore a place tests are needed — **Database changes**
+  (migration/backwards-compat behaviors, EDD phasing), **API changes** (endpoint contracts,
+  V±2 compatibility, any unauthenticated endpoint), **UI components** (shared/base components),
+  **SDK changes**, **Services touched**, **Hosting** (Self-Hosted vs Cloud paths),
+  **Feature flagging** (flag-on/flag-off states to cover), and **Security considerations**
+  (crypto, threat-model-relevant behaviors). The **Testing considerations** item is the team's
+  own stated test intent — treat it as a claim to assess, not as ground truth
+  to copy.
+- **Part 4 — Specification artifacts**: linked child pages defining concrete interfaces (API
+  contracts, schemas, component APIs, crypto schemes). Fetch the relevant ones with
+  `get_confluence_page`; their public interfaces and edge cases are exactly what integration and
+  unit tests pin down.
+- **Part 5 — Open questions**: unresolved questions are untestable-requirement risk — a behavior
+  can't be reliably tested until its question is answered. Surface them in the report's gaps.
+
+Extract: discrete **testable behaviors** per platform, the **surfaces** each touches (→ repos via
+the `analyzing-test-stack` skill's `references/monorepo-layout.md`), and the team's **stated testing
+intent** (to evaluate, not echo). Where the
+breakdown's scope checklist disagrees with a diff or ticket you were also given, treat the
+divergence as a finding rather than silently picking one.
+
+## Test-case CSV export
+
+A CSV export of existing or planned test cases. Column headers vary by tool and export
+settings — **do not hardcode them**. Read the header row, then map by meaning:
+
+- A **title / case** column — the scenario name.
+- A **type** column (e.g. "Regression", "Smoke", "Functional") — hints at intended layer.
+- An **automation status** column (e.g. "Ready to Automate", "Automated", "Manual") —
+  what already exists vs. what's planned.
+- A **steps / expected-result** column, often in Given–When–Then form — the behavior.
+- Optional **team / area / tags / preconditions** columns — scope and grouping.
+
+Map rows to behaviors and bucket each by apparent layer using the `analyzing-test-stack` skill's `references/test-layers.md`:
+
+- A case that drives the full UI through a complete journey → likely **E2E** (target the
+  dedicated `test` repo).
+- A case asserting one service/component's behavior through its collaborators →
+  **integration**.
+- A case pinning a single function's logic or an edge case → **unit**.
+
+Flag cases that are currently manual but cheaply automatable at a lower layer, and cases
+slated for E2E that would be better as integration. If a column's meaning is ambiguous,
+state the interpretation you used rather than guessing silently.
+
+## Citing Jira issues as links
+
+Every Jira item the report **names**, and every behavior **found from a Jira item**, is rendered as
+a clickable link — never bare key text. This is the Jira counterpart to the GitHub permalink rule
+for tests (`finding-coverage.md` → _Citing tests as GitHub permalinks_).
+
+The link form is the issue's browse URL `https://bitwarden.atlassian.net/browse/<KEY>`, where `<KEY>`
+is the issue key (e.g. `PM-1234`). Prefer the URL the MCP tool or `researching-jira-issues` skill returns;
+else construct it from the key. The same rule covers epics and their children — link each to its own key. Apply it:
+
+- An **issue, epic, or child key** named in Overview/Summary/Evidence — anchor the key:
+  `<a href="https://bitwarden.atlassian.net/browse/PM-1234">PM-1234</a>`.
+- A **behavior row** (recommendations/coverage/gaps) extracted from a Jira item — append the linked
+  source key to the behavior cell. A behavior with no Jira source (PR-only) carries none.
+
+These are informational `<a href>` citations (text, not loaded assets), so they don't violate the
+self-contained constraint. Never fabricate a key or URL — if a key is unknown, name the source in
+plain text rather than inventing a link.
diff --git a/plugins/bitwarden-test-engineer/references/report-style-tokens.md b/plugins/bitwarden-test-engineer/references/report-style-tokens.md
new file mode 100644
index 0000000..013c469
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/references/report-style-tokens.md
@@ -0,0 +1,131 @@
+# Report style tokens — data-report visual system for HTML reports
+
+The **visual system** for every self-contained HTML report the `bitwarden-test-engineer` plugin
+emits — the `analyzing-test-stack` test-stack report and the `assessing-test-coverage` coverage
+report alike. Because the output is a single file with no external assets, the stylesheet is
+inlined; both reports splice the **same** canonical CSS so they read as one instrument and cannot
+drift.
+
+**You never retype, prune, or hand-edit the stylesheet.** It lives as a real file at
+`report-style.css` (alongside this file) and is spliced into the report by `scripts/build-report.sh`
+— never reproduced as model output. Authoring a report means writing its **content** (the sections
+below) into a fragment whose `<style>` holds a single sentinel line, then running the build script
+(see _Building the report_). If the visual system genuinely needs to change, edit `report-style.css`
+once and every future report inherits it.
+
+The look is deliberately **not** a brand skin — a quiet, ink-on-paper _data report_ where the data
+is the hero and nothing decorates: flat white page, hairline rules, no cards/shadows/rounded panels.
+
+## Design intent (why these choices)
+
+- **Monospace is a structural role.** Section numbers, eyebrows, table headers, layer/badge chips,
+  axis labels, counts, and SHAs are set in the system mono stack; prose in the system sans stack.
+  The split makes "data" and "argument" visually distinct without any web font.
+- **The layer ramp is sequential, because the layers are ordered.** unit → integration → e2e is a
+  cost/depth sequence (cheapest/shallowest → most expensive/deepest); a single-hue light→dark ramp
+  encodes that order, so a thin dark sliver reads as "expensive, used sparingly." Do not swap it for
+  unrelated categorical hues.
+- **State colors are categorical and muted.** The assumption/warn/ok badges each carry exactly one
+  meaning — muted traffic colors, not saturated brand colors.
+
+## Token → meaning mapping (binding)
+
+These mappings are **normative**. Do not re-pick colors per report. Your markup must use exactly
+these class names; the spliced stylesheet styles them.
+
+### Layer tokens (chips, distribution bars, table cells)
+
+| Layer       | Token           | HEX       | Role in the ramp                 |
+| ----------- | --------------- | --------- | -------------------------------- |
+| unit        | `--unit`        | `#8FB3D1` | lightest — cheapest / shallowest |
+| integration | `--integration` | `#3F7196` | mid — the confidence layer       |
+| e2e         | `--e2e`         | `#1D3A54` | deepest — most expensive, thin   |
+
+`unit` is light, so its chips and bar segments use **dark** text (`--on-unit`); integration and e2e
+use **white** text (`--on-deep`).
+
+### Badge / state tokens
+
+| Badge      | Token    | Use                                             |
+| ---------- | -------- | ----------------------------------------------- |
+| assumption | `--warn` | Anything inferred without direct evidence       |
+| warn       | `--bad`  | Risks, missing-input flags, unverifiable claims |
+| ok         | `--ok`   | Confirmed coverage, grounded calls              |
+
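+All badge chips use white (`--on-state`) text on these muted fills. Mind the offset names: the `assumption` badge is filled with `--warn` and the `warn` badge with `--bad` — class name and token name do not line up, so always style through the class.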
+
+### Surface, ink, and structural tokens
+
+| Token         | HEX       | Use                                           |
+| ------------- | --------- | --------------------------------------------- |
+| `--paper`     | `#FFFFFF` | Page background (flat — no cards)             |
+| `--panel`     | `#F4F6F8` | Inline code, chart track, table row hover     |
+| `--ink`       | `#16191D` | Primary text                                  |
+| `--ink-soft`  | `#585F68` | Secondary text, captions, table cells of note |
+| `--ink-faint` | `#818892` | Eyebrows, section numbers, axis labels        |
+| `--rule`      | `#E4E7EA` | Hairlines, dividers, table row borders        |
+| `--link`      | `#2F6E9E` | Links                                         |
+
+Typography is system fonts only — **no web fonts, no `@font-face`, no CDN imports** — split across
+`--sans` (prose) and `--mono` (data, labels, chrome).
+
+## Graphics — the layer-distribution chart
+
+The report's signature graphic: the layer distribution per platform, rendered as a normalized
+horizontal **stacked bar** (a `<figure>` captioned `Fig 1`).
+
+- One `.dist-row` per platform: a right-aligned `.dist-label` and a `.bar` track holding one `.seg`
+  per layer present.
+- **Segment width is proportional to the test count at that layer** — set it with inline
+  `style="flex: <count>"` (raw counts; the browser normalizes them). Never hand-compute percentages
+  or pixel widths.
+- Each segment shows its **count** as a monospace label; the shared `.legend` above maps color →
+  layer; a `figcaption` names the figure. The unit segment carries dark text (`--on-unit`),
+  integration and e2e white (`--on-deep`).
+
+## Building the report
+
+This section is the **single source of truth** for the build invocation; the templates only name
+their `--kind`. The model authors a **content fragment** — a complete HTML document whose `<style>`
+contains exactly one line, the sentinel:
+
+```html
+<style>
+  /* @@BITWARDEN_REPORT_STYLESHEET@@ */
+</style>
+```
+
+Write that fragment to a temporary path (e.g. `<kind>-report-<slug>.fragment.html`), then run the
+build script, addressing it through the plugin root (`CLAUDE_PLUGIN_ROOT`):
+
+```bash
+"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \
+  --kind <test-stack|test-coverage> --slug <slug> --date <YYYY-MM-DD> \
+  <fragment-file>
+```
+
+It replaces the sentinel with `report-style.css` verbatim and writes the report into a per-change
+directory `test-engineer-report-<slug>-<date>/` (created if needed) — the coverage report as
+`coverage.html`, the test-stack report as `recommended.html` — then prints the final path. The
+directory name derives only from `--slug`/`--date`, so a run's reports share one folder;
+**re-running the same change on the same date refreshes the report in place**. Delete the temporary
+fragment afterward. If the script errors (missing sentinel, bad `--kind`/`--date`, fragment not
+found) it writes nothing — fix the fragment and re-run rather than pasting CSS by hand.
+
+**Combined two-tab page (assembled, not authored).** When both reports exist for one change, the
+build script can stitch them into one page with two CSS-only tabs — _Current coverage_ and
+_Recommended coverage_. This is a presentation-only merge from the two finished report files: no
+skill or template knows about tabs, and the agent (not the report author) runs it with
+`--kind test-combined --current test-engineer-report-<slug>-<date>/coverage.html --recommended test-engineer-report-<slug>-<date>/recommended.html`,
+which writes `combined.html` into that same directory. The tab chrome lives entirely in the build
+script and `report-style.css`.
+
+## What not to do
+
+- Do not reintroduce a brand skin — no saturated brand colors, no logo images, no `<link>` to a
+  design system. The report is intentionally off-brand and self-contained.
+- Do not swap the sequential layer ramp for unrelated categorical hues; the order is the encoding.
+- Do not paste, retype, or trim the stylesheet into the fragment — the fragment carries only the
+  sentinel. A report that ships a hand-copied or "only the classes I used" stylesheet is exactly how
+  two reports drift apart.
+- Do not hand-compute distribution bar widths — set `flex: <count>` per segment.
diff --git a/plugins/bitwarden-test-engineer/references/report-style.css b/plugins/bitwarden-test-engineer/references/report-style.css
new file mode 100644
index 0000000..ad98427
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/references/report-style.css
@@ -0,0 +1,552 @@
+:root {
+  /* Surfaces & ink — flat paper, no cards or shadows */
+  --paper: #ffffff;
+  --panel: #f4f6f8;
+  --ink: #16191d;
+  --ink-soft: #585f68;
+  --ink-faint: #818892;
+  --rule: #e4e7ea;
+
+  /* Layer ramp — SEQUENTIAL: ordered cheap/shallow -> costly/deep */
+  --unit: #8fb3d1;
+  --integration: #3f7196;
+  --e2e: #1d3a54;
+  --on-unit: #16191d; /* --unit is light: use dark text */
+  --on-deep: #ffffff; /* white text on integration/e2e */
+
+  /* Verdict & state — muted categorical */
+  --ok: #43875a;
+  --warn: #b07d2f;
+  --bad: #bf564a;
+  --on-state: #ffffff;
+
+  --link: #2f6e9e;
+
+  --sans:
+    system-ui, -apple-system, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
+  --mono:
+    ui-monospace, "SF Mono", SFMono-Regular, Menlo, Consolas, "Liberation Mono",
+    monospace;
+}
+
+* {
+  box-sizing: border-box;
+}
+html {
+  -webkit-text-size-adjust: 100%;
+  scroll-padding-top: 24px; /* keep anchored sections clear of the top edge */
+}
+
+body {
+  margin: 0;
+  background: var(--paper);
+  color: var(--ink);
+  font: 15px/1.6 var(--sans);
+  font-feature-settings: "tnum" 1; /* tabular figures where supported */
+  -webkit-font-smoothing: antialiased;
+  text-rendering: optimizeLegibility;
+}
+
+/* Smooth in-page jumps for the report's overview -> section anchor links,
+   suppressed when the reader prefers reduced motion. */
+@media (prefers-reduced-motion: no-preference) {
+  html {
+    scroll-behavior: smooth;
+  }
+}
+
+a {
+  color: var(--link);
+  text-decoration: underline;
+  text-underline-offset: 2px;
+  text-decoration-thickness: 1px;
+}
+a:focus-visible,
+summary:focus-visible {
+  outline: 2px solid var(--link);
+  outline-offset: 2px;
+}
+
+/* Masthead */
+header {
+  max-width: 60rem;
+  margin: 0 auto;
+  padding: clamp(36px, 7vw, 56px) clamp(20px, 5vw, 32px) 28px;
+}
+header .eyebrow {
+  margin: 0 0 14px;
+  font: 600 11px/1 var(--mono);
+  letter-spacing: 0.18em;
+  text-transform: uppercase;
+  color: var(--ink-faint);
+}
+header h1 {
+  margin: 0 0 12px;
+  font-size: clamp(24px, 5vw, 32px);
+  line-height: 1.2;
+  font-weight: 650;
+  letter-spacing: -0.01em;
+  text-wrap: balance;
+}
+header .meta {
+  font: 12px/1.6 var(--mono);
+  color: var(--ink-soft);
+}
+header .meta a {
+  color: var(--ink-soft);
+}
+
+/* In-page table of contents — a compact monospace row of section links at the
+   top of <main>. In the combined report the build script namespaces each link's
+   href per tab, so a panel's ToC jumps within its own panel. */
+.toc {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 6px 18px;
+  margin: 0 0 4px;
+  padding: 0 0 20px;
+  border-bottom: 1px solid var(--rule);
+  font: 600 11px/1.6 var(--mono);
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+}
+.toc a {
+  color: var(--ink-soft);
+  text-decoration: none;
+}
+.toc a:hover {
+  color: var(--link);
+  text-decoration: underline;
+}
+
+/* Sections — flat, hairline-separated, auto-numbered */
+main {
+  max-width: 60rem;
+  margin: 0 auto;
+  padding: 0 clamp(20px, 5vw, 32px) 96px;
+  counter-reset: sec;
+}
+section {
+  counter-increment: sec; /* feeds the h2::before number; the counter is reset on main */
+  padding: 36px 0;
+  border-top: 1px solid var(--rule); /* hairline between consecutive sections */
+  scroll-margin-top: 24px; /* NOTE(review): stacks with html's scroll-padding-top (24px) — confirm the combined offset is intended */
+}
+section:first-of-type {
+  border-top: 0;
+}
+/* Quiet landing cue: briefly tint a section an in-page link jumped to. */
+@media (prefers-reduced-motion: no-preference) {
+  section:target {
+    animation: section-land 1.4s ease-out;
+  }
+  @keyframes section-land {
+    from {
+      background: var(--panel);
+    }
+    to {
+      background: transparent;
+    }
+  }
+}
+section > h2 {
+  margin: 0 0 18px;
+  font-size: 19px;
+  font-weight: 650;
+  letter-spacing: -0.01em;
+  text-wrap: balance;
+}
+section > h2::before {
+  content: counter(sec, decimal-leading-zero);
+  display: inline-block;
+  margin-right: 12px;
+  font: 600 12px/1 var(--mono);
+  letter-spacing: 0.1em;
+  color: var(--ink-faint);
+  vertical-align: 2px;
+}
+section h3 {
+  margin: 28px 0 10px;
+  font: 600 11px/1.3 var(--mono);
+  letter-spacing: 0.12em;
+  text-transform: uppercase;
+  color: var(--ink-soft);
+}
+
+/* Prose */
+p {
+  margin: 0 0 14px;
+  max-width: 72ch;
+  text-wrap: pretty; /* avoid orphans / ragged short last lines */
+}
+.lead {
+  font-size: 16px;
+}
+.small {
+  font-size: 12.5px;
+  color: var(--ink-soft);
+}
+ul.tight {
+  margin: 8px 0 16px;
+  padding-left: 20px;
+}
+ul.tight li {
+  margin: 0 0 6px;
+}
+ol {
+  padding-left: 22px;
+}
+ol li {
+  margin: 0 0 10px;
+}
+code {
+  font: 0.86em var(--mono);
+  background: var(--panel);
+  padding: 1px 5px;
+  border-radius: 3px;
+}
+
+/* Tables — heavy header rule, hairline rows */
+.scroll {
+  overflow-x: auto; /* a wide table scrolls inside this wrapper instead of widening the page */
+  -webkit-overflow-scrolling: touch; /* legacy, non-standard iOS momentum-scroll hint; ignored elsewhere */
+  overscroll-behavior-x: contain; /* hitting the table's edge does not chain the swipe to the page */
+}
+table {
+  width: 100%;
+  border-collapse: collapse;
+  margin: 4px 0 18px;
+  font-size: 13.5px;
+}
+thead th {
+  text-align: left;
+  vertical-align: bottom;
+  padding: 0 12px 8px;
+  font: 600 10.5px/1.3 var(--mono);
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: var(--ink-faint);
+  border-bottom: 1px solid var(--ink);
+}
+tbody td {
+  vertical-align: top;
+  padding: 10px 12px;
+  border-bottom: 1px solid var(--rule);
+}
+tbody tr:hover {
+  background: var(--panel);
+}
+th:first-child,
+td:first-child {
+  padding-left: 0;
+}
+th:last-child,
+td:last-child {
+  padding-right: 0;
+}
+
+/* Layer chip */
+.layer {
+  display: inline-block;
+  font: 600 10.5px/1.6 var(--mono);
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+  padding: 2px 8px;
+  border-radius: 2px;
+  white-space: nowrap;
+}
+.layer.unit {
+  background: var(--unit);
+  color: var(--on-unit);
+}
+.layer.integration {
+  background: var(--integration);
+  color: var(--on-deep);
+}
+.layer.e2e {
+  background: var(--e2e);
+  color: var(--on-deep);
+}
+
+/* Layer-distribution chart (the signature graphic) */
+figure {
+  margin: 18px 0;
+}
+figcaption {
+  margin-bottom: 14px;
+  font: 11px/1.4 var(--mono);
+  letter-spacing: 0.04em;
+  color: var(--ink-faint);
+}
+.dist .legend {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 18px;
+  margin-bottom: 14px;
+  font: 11px/1 var(--mono);
+  color: var(--ink-soft);
+}
+.dist .legend .key {
+  display: inline-flex;
+  align-items: center;
+  gap: 6px;
+  text-transform: uppercase;
+  letter-spacing: 0.06em;
+}
+.dist .legend .key::before {
+  content: "";
+  width: 10px;
+  height: 10px;
+  border-radius: 2px;
+  background: var(--rule);
+}
+.dist .legend .unit::before {
+  background: var(--unit);
+}
+.dist .legend .integration::before {
+  background: var(--integration);
+}
+.dist .legend .e2e::before {
+  background: var(--e2e);
+}
+.dist-row {
+  display: flex;
+  align-items: center;
+  gap: 14px;
+  margin: 7px 0;
+}
+.dist-row .dist-label {
+  flex: 0 0 14ch; /* fixed-width label column so every platform's bar starts at the same x */
+  text-align: right;
+  font: 11px/1.3 var(--mono);
+  color: var(--ink-soft);
+  overflow-wrap: anywhere; /* standard replacement for the deprecated word-break: break-word */
+}
+.dist-row .bar {
+  flex: 1 1 auto; /* fills the row; auto basis (not 0%) keeps height: 24px when the row stacks at <=720px */
+  display: flex; /* segments are flex items sized by their inline flex: <count> */
+  height: 24px;
+  background: var(--panel); /* empty track shown behind/around the segments */
+  border-radius: 3px;
+  overflow: hidden; /* clips segment fills to the rounded track */
+}
+.bar .seg {
+  display: flex; /* centers the count label inside the segment */
+  align-items: center;
+  justify-content: center;
+  min-width: 18px; /* presumably a floor so a very small share still fits its count — confirm */
+  font: 600 11px/1 var(--mono);
+  color: var(--on-deep); /* default for integration/e2e; .seg.unit overrides with dark text */
+}
+.bar .seg.unit {
+  background: var(--unit);
+  color: var(--on-unit);
+}
+.bar .seg.integration {
+  background: var(--integration);
+}
+.bar .seg.e2e {
+  background: var(--e2e);
+}
+
+/* Per-platform recommended-shape list (replaces card blocks) */
+ul.shapes {
+  margin: 6px 0 0;
+  padding: 0;
+  list-style: none;
+}
+ul.shapes li {
+  padding: 10px 0;
+  border-top: 1px solid var(--rule);
+}
+ul.shapes li:first-child {
+  border-top: 0;
+}
+ul.shapes .plat {
+  font: 600 13px/1.5 var(--mono);
+}
+
+/* Badges */
+.badge {
+  display: inline-block;
+  font: 600 10px/1.5 var(--mono);
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+  padding: 1px 6px;
+  border-radius: 2px;
+  color: var(--on-state);
+  white-space: nowrap;
+}
+.badge.assumption {
+  background: var(--warn);
+}
+.badge.warn {
+  background: var(--bad);
+}
+.badge.ok {
+  background: var(--ok);
+}
+
+/* Unlinkable evidence */
+.unlinkable {
+  font: italic 12px/1.4 var(--mono);
+  color: var(--ink-faint);
+}
+
+/* Tabbed combined report — the Current-coverage and Recommended-coverage report
+   bodies surfaced as two tabs on one page, CSS-only (no JavaScript). The radio
+   inputs are visually hidden but keep keyboard focus; the checked input drives
+   both the active label and which panel shows. These rules are only exercised by
+   the combined report; they are inert in the standalone coverage/test-stack
+   reports, which never emit these elements. */
+.tab-input {
+  position: absolute;
+  width: 1px;
+  height: 1px;
+  margin: -1px;
+  opacity: 0;
+}
+.tablist {
+  max-width: 60rem;
+  margin: 0 auto;
+  padding: 0 clamp(20px, 5vw, 32px);
+  display: flex;
+  flex-wrap: wrap;
+  gap: 4px;
+  border-bottom: 1px solid var(--ink);
+}
+.tablist label {
+  display: inline-block;
+  padding: 11px 16px;
+  font: 600 11px/1.4 var(--mono);
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  color: var(--ink-faint);
+  cursor: pointer;
+  border: 1px solid transparent;
+  border-bottom: 0;
+  border-radius: 3px 3px 0 0;
+  margin-bottom: -1px; /* sit the tab on the list's bottom rule */
+}
+.tablist label:hover {
+  color: var(--ink);
+  background: var(--panel);
+}
+/* A tabpanel is itself a section element; neutralize the global section chrome
+   so only the report sections nested inside its main element render with rules
+   and numbering. */
+.tabpanel {
+  display: none;
+  padding: 0;
+  border-top: 0;
+  counter-increment: none;
+}
+/* Active tab + its panel, driven by the checked radio (general-sibling ~). */
+#tab-current:checked ~ .tablist label[for="tab-current"],
+#tab-recommended:checked ~ .tablist label[for="tab-recommended"] {
+  color: var(--ink);
+  border-color: var(--ink);
+  border-bottom-color: var(--paper);
+  background: var(--paper);
+}
+#tab-current:checked ~ .tabpanel[data-panel="current"],
+#tab-recommended:checked ~ .tabpanel[data-panel="recommended"] {
+  display: block;
+}
+/* Keyboard focus on the visually-hidden radio surfaces a ring on its label. */
+#tab-current:focus-visible ~ .tablist label[for="tab-current"],
+#tab-recommended:focus-visible ~ .tablist label[for="tab-recommended"] {
+  outline: 2px solid var(--link);
+  outline-offset: -2px;
+}
+
+/* Floating "back to top" control — a fixed action button that rides along as the
+   reader scrolls and jumps to the top via the in-page #top anchor on <header>. No
+   JavaScript: it reuses the same smooth-scroll / reduced-motion behavior as the ToC
+   links. Flat to fit the data-report system — a solid ink fill carries it over the
+   content instead of a shadow. Present in every report; hidden when printing. */
+.to-top {
+  position: fixed;
+  right: clamp(16px, 4vw, 28px);
+  bottom: clamp(16px, 4vw, 28px);
+  z-index: 20;
+  display: inline-flex;
+  align-items: center;
+  gap: 6px;
+  padding: 9px 13px;
+  background: var(--ink);
+  color: var(--paper);
+  font: 600 10.5px/1 var(--mono);
+  letter-spacing: 0.1em;
+  text-transform: uppercase;
+  text-decoration: none;
+  border-radius: 4px;
+}
+.to-top::before {
+  content: "\2191"; /* upwards arrow */
+  font-size: 13px;
+  line-height: 1;
+}
+.to-top:hover {
+  background: var(--link);
+  color: var(--paper);
+}
+.to-top:focus-visible {
+  outline: 2px solid var(--link);
+  outline-offset: 2px;
+}
+
+@media (max-width: 720px) {
+  header,
+  main,
+  .tablist {
+    padding-left: 20px;
+    padding-right: 20px;
+  }
+  .dist-row {
+    flex-direction: column;
+    align-items: stretch;
+    gap: 4px;
+  }
+  .dist-row .dist-label {
+    flex: none;
+    text-align: left;
+  }
+}
+
+@media print {
+  body {
+    font-size: 11pt;
+    -webkit-print-color-adjust: exact; /* prefixed spelling for older Chromium/Safari */
+    print-color-adjust: exact; /* inherited: print chip/badge/bar fills so their white labels stay legible */
+  }
+  .tab-input,
+  .tablist,
+  .to-top {
+    display: none; /* tabs cannot be toggled on paper — drop the controls */
+  }
+  .tabpanel {
+    display: block !important; /* stack both report bodies regardless of the checked tab */
+  }
+  .tabpanel::before {
+    content: attr(aria-label); /* title each stacked body by its panel label */
+    display: block;
+    max-width: 60rem;
+    margin: 0 auto;
+    padding: 16px clamp(20px, 5vw, 32px) 0;
+    font: 600 11px/1.3 var(--mono);
+    letter-spacing: 0.12em;
+    text-transform: uppercase;
+    color: var(--ink-faint);
+  }
+  section {
+    break-inside: avoid;
+    border-top-color: #ccc;
+  }
+  tbody tr:hover {
+    background: none;
+  }
+  a {
+    color: var(--ink);
+  }
+}
diff --git a/plugins/bitwarden-test-engineer/references/report-template-common.md b/plugins/bitwarden-test-engineer/references/report-template-common.md
new file mode 100644
index 0000000..05877a9
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/references/report-template-common.md
@@ -0,0 +1,152 @@
+# Report HTML — shared authoring contract
+
+Both self-contained HTML reports the `bitwarden-test-engineer` plugin emits — the
+`analyzing-test-stack` **test-stack report** and the `assessing-test-coverage` **coverage report** —
+are authored against this shared contract, so the two read as one instrument. Each skill's own
+template (`html-report-template.md` / `coverage-report-template.md`) covers only what differs: its
+section set, its per-platform table columns, and its recommend-vs-inventory framing. **Read this
+file first, then that template.**
+
+## Output constraints
+
+Produce a **single self-contained HTML file**: all CSS inline in `<style>`, no external/CDN
+_resource_ links (stylesheets, fonts, scripts, images), no required JavaScript, no web fonts. It
+must render correctly opened directly from disk and survive being attached to a ticket or PR.
+Informational `<a href>` citations to public sources are text, not loaded assets — they are fine and
+encouraged (see _Content rules_).
+
+You do not write the final file or paste any CSS: author a **content fragment** (the skeleton below,
+with only the stylesheet sentinel inside `<style>`), then run the build script. The fragment/sentinel
+mechanics, the build invocation, the normative class names (the layer and assumption/warn/ok tokens
+your markup must use), and the visual system are all owned by `report-style-tokens.md` — **read it.**
+Your template only names its `--kind`.
+
+Section headings are auto-numbered by CSS (`01 · …`) — write a plain `<h2>` per section, do not
+hand-number. Wrap each wide table in `<div class="scroll">…</div>` so it scrolls rather than
+overflows on narrow widths.
+
+## Table of contents
+
+Directly **inside `<main>`, before `#overview`**, emit `<nav class="toc" aria-label="Sections">`
+holding one `<a href="#…">` per section in the report (your template lists them). It is a `<nav>`,
+not a numbered section. (In the combined two-tab report the build script namespaces these anchors per
+tab so a panel's ToC jumps within its own panel.)
+
+## Sections common to both reports
+
+Each section uses its **normative `id`** — do not rename, omit, or add top-level sections; readers
+look these up by id. The four below are shared; your template defines the report-specific data
+section (`#recommendations` or `#coverage`) and the `#gaps` contents, and adds framing notes for the
+shared ones (e.g. whether the chart shows recommended or observed counts).
+
+1. **Header** (no id; `<header>` element) — report title, the change under analysis (ticket/PR/
+   feature), and the date.
+2. **`#overview`** — a short top-of-report synthesis written by the author so a reader sees the
+   bottom line without scrolling: a 2–4 sentence recap per platform, the top 3 items the reader
+   should resolve (drawn from `#gaps`), and anchor links into the detail sections. Additive — the
+   per-behavior detail stays in the tables below.
+3. **`#summary`** — 2–4 sentences, then the **layer-distribution chart** (the report's signature
+   graphic; markup in the skeleton below) and a per-platform one-line shape list (`<ul class="shapes">`).
+   The chart's segment markup and render rules are the contract owned by `report-style-tokens.md` →
+   _Graphics_; it encodes **shape** (counts per layer) only — it is severity-blind. (Your template says
+   whether the counts are _recommended_ or _observed_ and supplies the caption.)
+4. **`#evidence`** — a table of which inputs were used and, explicitly, **what was missing or
+   unverifiable** (e.g. "`test` repo not checked out — existing E2E coverage unverified"). For PR
+   inputs include the captured **head SHA** and **`owner/repo`** so per-test permalinks elsewhere can
+   be audited against the same commit.
+
+`#gaps` is the last section in both reports; its exact contents differ — see your template.
+
+## Content rules
+
+- Tables over prose for the data sections and evidence — they're meant to be scanned and acted on.
+- Mark every assumption inline with `<span class="badge assumption">assumption</span>` and every
+  unverifiable claim with `<span class="badge warn">unverified</span>` (e.g. E2E coverage claimed
+  without the `test` repo checked out), so grounded calls are distinguishable from inferred ones.
+- **Hyperlink every GitHub or Atlassian source the report names** — never plain text. The data
+  section's **evidence column** (`Evidence (linked)` in the test-stack report, `Tests (linked)` in
+  the coverage report) is binding: render each behavior's 1–3 representative tests as GitHub
+  permalinks, or the `.unlinkable` span when a test genuinely cannot be linked — never a fabricated
+  URL. Those records come from the coverage inventory; the exact link / `.unlinkable` markup and the
+  permalink-production rules are owned by the `assessing-test-coverage` skill's
+  `references/finding-coverage.md` → _Citing tests as GitHub permalinks_ and _When a test cannot be
+  linked_. **Jira items and Jira-sourced behaviors** follow `input-sources.md` → _Citing Jira issues
+  as links_ (link form, where to apply it, never-fabricate-a-key rule). All of these are
+  informational `<a href>` citations, not fetched resources, so they don't violate the self-contained
+  constraint.
+- Keep the fixed **back-to-top** control from the skeleton — the `<a class="to-top" href="#top">`
+  after `</main>` paired with `id="top"` on `<header>`. It is CSS-only; drop either half and the
+  anchor breaks.
+
+## Skeleton
+
+The shared document shell. Your template supplies the `<title>`, the eyebrow, the ToC section list,
+the report-specific section(s) between `#evidence` and `#gaps`, and the `#summary`/`#gaps` headings:
+
+```html
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>…report title — {{change}}…</title>
+    <style>
+      /* @@BITWARDEN_REPORT_STYLESHEET@@ */
+    </style>
+  </head>
+  <body>
+    <header id="top">
+      <p class="eyebrow">…report title…</p>
+      <h1>…the change under analysis…</h1>
+      <p class="meta">…ticket/PR · status · team · date…</p>
+    </header>
+    <main>
+      <nav class="toc" aria-label="Sections">
+        <!-- one <a href="#…"> per section, per your template's section list -->
+      </nav>
+      <section id="overview">
+        <h2>Overview</h2>
+        …synthesis: recap per platform; top 3 items; anchor links into the
+        detail sections…
+      </section>
+      <section id="summary">
+        <h2>…summary heading…</h2>
+        …2–4 sentences…
+        <figure class="dist">
+          <figcaption>Fig 1 · …layer distribution by platform…</figcaption>
+          <div class="legend">
+            <span class="key unit">unit</span>
+            <span class="key integration">integration</span>
+            <span class="key e2e">e2e</span>
+          </div>
+          <div class="dist-row">
+            <span class="dist-label">bitwarden/server</span>
+            <div class="bar">
+              <span class="seg unit" style="flex:3">3</span>
+              <span class="seg integration" style="flex:11">11</span>
+              <span class="seg e2e" style="flex:1">1</span>
+            </div>
+          </div>
+          <!-- one .dist-row per platform -->
+        </figure>
+        <ul class="shapes">
+          <li><span class="plat">bitwarden/server</span> — …one-line shape…</li>
+          <!-- one li per platform -->
+        </ul>
+      </section>
+      <section id="evidence">
+        <h2>Evidence &amp; sources</h2>
+        <div class="scroll">
+          …sources used + what was missing + commit SHA(s)…
+        </div>
+      </section>
+      <!-- report-specific section(s) here, per your template -->
+      <section id="gaps">
+        <h2>…gaps heading…</h2>
+        …per your template…
+      </section>
+    </main>
+    <a class="to-top" href="#top" aria-label="Back to top">Top</a>
+  </body>
+</html>
+```
diff --git a/plugins/bitwarden-test-engineer/scripts/build-report.sh b/plugins/bitwarden-test-engineer/scripts/build-report.sh
new file mode 100755
index 0000000..8142459
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/scripts/build-report.sh
@@ -0,0 +1,213 @@
+#!/usr/bin/env bash
+#
+# build-report.sh — assemble a self-contained HTML report for the
+# bitwarden-test-engineer plugin by splicing the canonical stylesheet into a
+# model-authored content fragment.
+#
+# The model writes a fragment whose <style> element contains a single sentinel
+# line; this script replaces that sentinel with references/report-style.css
+# verbatim. That keeps the ~400-line stylesheet out of model output entirely
+# (no token cost, no drift between the two reports) while the model authors only
+# the report's actual content.
+#
+# Usage (single report):
+#   build-report.sh --kind <test-stack|test-coverage> --slug <slug> \
+#                   --date <YYYY-MM-DD> <fragment-html-file>
+#
+# Usage (combined two-tab page):
+#   build-report.sh --kind test-combined --slug <slug> --date <YYYY-MM-DD> \
+#                   --current <coverage-report.html> \
+#                   --recommended <test-stack-report.html>
+#
+# The combined mode assembles ONE page with two CSS-only tabs — "Current
+# coverage" (the assessing-test-coverage report) and "Recommended coverage" (the
+# analyzing-test-stack report) — from the two already-built standalone report
+# files. It reuses each report's <header>/<main>, namespaces the section ids so
+# the two bodies coexist in one document (cur-* / rec-*), and splices the
+# stylesheet in once. The two source reports are read, not modified, and their
+# standalone files remain; the combined page is an additional deliverable.
+#
+# Writes the report into a per-change directory, creating it if needed, and
+# prints the final path to stdout:
+#
+#   test-engineer-report-<slug>-<date>/coverage.html      (--kind test-coverage)
+#   test-engineer-report-<slug>-<date>/recommended.html   (--kind test-stack)
+#   test-engineer-report-<slug>-<date>/combined.html      (--kind test-combined)
+#
+# The directory name derives only from --slug/--date, so all three of a run's
+# reports land in the same folder. Re-running the same change on the same date
+# refreshes the report in place (the prior file is overwritten).
+#
+# Input files are left untouched; delete any temporary fragment yourself.
+
+set -euo pipefail
+
+SENTINEL='/* @@BITWARDEN_REPORT_STYLESHEET@@ */'
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CSS_FILE="${SCRIPT_DIR}/../references/report-style.css"
+
+KIND=""
+SLUG=""
+DATE=""
+FRAGMENT=""
+CURRENT=""
+RECOMMENDED=""
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --kind) KIND="${2:-}"; shift 2 ;;
+    --slug) SLUG="${2:-}"; shift 2 ;;
+    --date) DATE="${2:-}"; shift 2 ;;
+    --current) CURRENT="${2:-}"; shift 2 ;;
+    --recommended) RECOMMENDED="${2:-}"; shift 2 ;;
+    -h|--help)
+      grep '^#' "${BASH_SOURCE[0]}" | sed 's/^# \{0,1\}//'
+      exit 0 ;;
+    --*) echo "build-report.sh: unknown option '$1'" >&2; exit 2 ;;
+    *) FRAGMENT="$1"; shift ;;
+  esac
+done
+
+# --- validate common inputs --------------------------------------------------
+case "$KIND" in
+  test-stack|test-coverage|test-combined) ;;
+  *) echo "build-report.sh: --kind must be 'test-stack', 'test-coverage', or 'test-combined' (got '${KIND}')" >&2; exit 2 ;;
+esac
+
+if [[ -z "$SLUG" ]]; then
+  echo "build-report.sh: --slug is required (a short kebab-case change identifier)" >&2
+  exit 2
+fi
+if [[ ! "$SLUG" =~ ^[a-zA-Z0-9._-]+$ ]]; then
+  echo "build-report.sh: --slug '${SLUG}' must be kebab-case (letters, digits, dot, dash, underscore)" >&2
+  exit 2
+fi
+if [[ ! "$DATE" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]; then
+  echo "build-report.sh: --date must be YYYY-MM-DD (got '${DATE}')" >&2
+  exit 2
+fi
+if [[ ! -f "$CSS_FILE" ]]; then
+  echo "build-report.sh: stylesheet not found at '${CSS_FILE}'" >&2
+  exit 1
+fi
+
+OUTDIR="test-engineer-report-${SLUG}-${DATE}"
+case "$KIND" in
+  test-coverage) BASENAME="coverage.html" ;;
+  test-stack)    BASENAME="recommended.html" ;;
+  test-combined) BASENAME="combined.html" ;;
+esac
+mkdir -p "$OUTDIR"
+OUT="${OUTDIR}/${BASENAME}"
+
+# splice_css: copy stdin to stdout, replacing every line that contains the
+# sentinel with the full contents of the stylesheet. The CSS is read by awk
+# itself, one line at a time, so its text never passes through shell quoting.
+splice_css() {
+  awk -v sentinel="$SENTINEL" -v css="$CSS_FILE" '
+    index($0, sentinel) == 0 { print; next }
+    {
+      while ((getline cssLine < css) > 0) {
+        print cssLine
+      }
+      # Close the file so a later sentinel line would read it from the start.
+      close(css)
+    }
+  '
+}
+
+if [[ "$KIND" == "test-combined" ]]; then
+  # --- combined two-tab page -------------------------------------------------
+  # Both inputs must be existing files that contain a <main> element.
+  for f in "$CURRENT" "$RECOMMENDED"; do
+    if [[ -z "$f" || ! -f "$f" ]]; then
+      echo "build-report.sh: --kind test-combined needs --current and --recommended report files (missing: '${f}')" >&2
+      exit 2
+    fi
+    if ! grep -q '<main' "$f"; then
+      echo "build-report.sh: '${f}' does not look like a built report (no <main> element)" >&2
+      exit 1
+    fi
+  done
+  # The shared masthead is taken from the recommended report's <header>; fail
+  # here rather than write a combined page that silently has no masthead.
+  if ! grep -q '<header' "$RECOMMENDED"; then
+    echo "build-report.sh: '${RECOMMENDED}' has no <header> element to reuse as the combined masthead" >&2
+    exit 1
+  fi
+
+  # extract_region <report-file> <start-tag> <end-tag>
+  # Print the lines of a report from the first one containing <start-tag>
+  # through the first line at or after it containing <end-tag>, both inclusive
+  # (used for <header>…</header> and <main>…</main>). Lines before <body> are
+  # never considered: a finished report carries the whole stylesheet in <head>,
+  # and a CSS comment there may mention "<main>" without being the real element.
+  extract_region() {
+    local report="$1"
+    awk -v openTag="$2" -v closeTag="$3" '
+      !inBody && /<body[ >]/ { inBody = 1 }
+      !inBody { next }
+      !copying && index($0, openTag) { copying = 1 }
+      !copying { next }
+      { print }
+      index($0, closeTag) { exit }
+    ' "$report"
+  }
+
+  # prefix_ids <prefix>
+  # Filter stdin to stdout, rewriting id="<name>" to id="<prefix>-<name>" and
+  # href="#<name>" to href="#<prefix>-<name>" for the section names in IDS, so
+  # the two report bodies can sit in one document without both claiming
+  # #overview and the rest. Other ids and links pass through unchanged.
+  IDS='overview|summary|evidence|coverage|recommendations|gaps'
+  prefix_ids() {
+    local prefix="$1"
+    sed -E "s/ id=\"(${IDS})\"/ id=\"${prefix}-\1\"/g; s/href=\"#(${IDS})\"/href=\"#${prefix}-\1\"/g"
+  }
+
+  {
+    cat <<HTML
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>Test Engineering Report — ${SLUG}</title>
+    <style>
+      ${SENTINEL}
+    </style>
+  </head>
+  <body>
+HTML
+    # Shared masthead: reuse the recommendation report's header, relabel its
+    # eyebrow so the page reads as the combined deliverable, not one report.
+    extract_region "$RECOMMENDED" "<header" "</header>" \
+      | sed -E 's#(<p class="eyebrow">)[^<]*(</p>)#\1Test Engineering Report\2#'
+    cat <<'HTML'
+    <input class="tab-input" type="radio" name="report-view" id="tab-current" checked />
+    <input class="tab-input" type="radio" name="report-view" id="tab-recommended" />
+    <nav class="tablist" aria-label="Report views">
+      <label for="tab-current">Current coverage</label>
+      <label for="tab-recommended">Recommended coverage</label>
+    </nav>
+    <section class="tabpanel" data-panel="current" aria-label="Current coverage">
+HTML
+    extract_region "$CURRENT" "<main" "</main>" | prefix_ids cur
+    cat <<'HTML'
+    </section>
+    <section class="tabpanel" data-panel="recommended" aria-label="Recommended coverage">
+HTML
+    extract_region "$RECOMMENDED" "<main" "</main>" | prefix_ids rec
+    # The reused masthead carries id="top"; emit the back-to-top control once for
+    # the whole page. Each standalone report's own control sits after its </main>,
+    # outside the extracted region, so the combined page would otherwise have none.
+    cat <<'HTML'
+    </section>
+    <a class="to-top" href="#top" aria-label="Back to top">Top</a>
+  </body>
+</html>
+HTML
+  } | splice_css > "$OUT"
+
+  echo "$OUT"
+  exit 0
+fi
+
+# --- single report (test-stack | test-coverage) ------------------------------
+# The positional argument must name an existing fragment file (exit 2 if not)
+# and that file must carry the stylesheet sentinel (exit 1 if not).
+[[ -n "$FRAGMENT" && -f "$FRAGMENT" ]] || {
+  echo "build-report.sh: fragment HTML file not found: '${FRAGMENT}'" >&2
+  exit 2
+}
+grep -qF "$SENTINEL" "$FRAGMENT" || {
+  echo "build-report.sh: fragment '${FRAGMENT}' has no stylesheet sentinel." >&2
+  echo "  Put exactly this line inside the <style> element: ${SENTINEL}" >&2
+  exit 1
+}
+
+# Fragment in, finished report out; the fragment itself is only read.
+splice_css < "$FRAGMENT" > "$OUT"
+
+# Print the output path on stdout for the caller.
+echo "$OUT"
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
new file mode 100644
index 0000000..a1d1754
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md
@@ -0,0 +1,35 @@
+---
+name: analyzing-test-stack
+description: Use when recommending what test automation a feature, bugfix, or change needs and at which layer — from a Jira ticket, GitHub PR, test-case CSV, technical breakdown, and/or plain-language description — mapping each behavior to the cheapest sufficient layer (unit, integration, E2E) inside each repo's actual test shape, risk-weighted by defect severity. Triggers on "test stack", "test strategy", "test plan for this PR/ticket", "which test layers should this have", or "what tests does this Critical/High bug need". This is the forward-looking recommendation — it does NOT inventory what already exists; for that, use assessing-test-coverage (whose inventory this skill consumes).
+allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Skill, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(gh pr checks:*), Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*), mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue, mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_issues, mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_comments, mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_remote_links, mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_confluence_page, mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_confluence, mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__search_confluence_cql"
+---
+
+# Analyzing the Test Stack
+
+Recommend the test automation layers a change should ship with — shaped to **each target repo's actual test practice**, not one universal model — and write the recommendation as a self-contained HTML report. You produce advice, not tests.
+
+Assign each behavior the **cheapest sufficient layer** (unit → integration → E2E, pushing coverage down) landed inside each repo's real shape (pyramid, trophy, or all-E2E). The layer model is in `references/test-layers.md`; the per-repo shapes in `references/monorepo-layout.md` → _Each repo's test shape in practice_.
+
+## Inputs
+
+You may receive any combination of: a Jira key, a GitHub PR, a CSV of test cases, a technical breakdown, and/or a plain-language description — additive evidence. You also consume a **coverage inventory** (the existing-test records produced by `assessing-test-coverage`: permalink records + `unverified` gaps). Under the `test-strategist` agent this is gathered before this skill runs; if it is absent (run standalone), invoke `Skill(assessing-test-coverage)` for the change surface, or proceed and record all coverage as `unverified`. **Today's date is provided by the caller** for the report's output directory name (the build script's `--date`) — don't read the clock; if none is supplied, ask via `AskUserQuestion`.
+
+`../../references/input-sources.md` (shared with `assessing-test-coverage`) is the canonical guide for ingesting each source — Epic expansion, breakdown mining, CSV column mapping, the Jira/Confluence tooling ladder, and the missing-source-is-a-gap rule.
+
+Carry each behavior's **risk severity** (impact, not urgency) alongside it; the model and how it calibrates coverage are in `references/severity-risk.md`.
+
+## Workflow
+
+1. **Resolve scope.** From the evidence, list the discrete testable behaviors and the platforms each touches. Map platforms to stacks, tooling, and the layer→repo split (including the sibling `test` repo for E2E) using `references/monorepo-layout.md`. **When the input is an Epic**, the behaviors come from the children's acceptance criteria and the diffs of any PRs linked from those children — record which children/PRs you actually inspected vs. only enumerated.
+
+2. **Consume the coverage inventory.** What is already tested is established by `assessing-test-coverage`, not here — take its inventory (one record per behavior plus `unverified` gaps; the record shape and permalink rules live in that skill's `references/finding-coverage.md` → _Output contract_) as input. Treat _observed_ coverage as verified and everything else as a gap, never assumed covered. If none was supplied, invoke `Skill(assessing-test-coverage)` for the change surface to produce one. These records feed both the report's Evidence column and the gap analysis below.
+
+3. **Assign the cheapest sufficient layer, weighted by severity.** For each behavior, pick the lowest layer that genuinely buys the needed confidence (reach higher only for a real browser/device, cross-service contract, or full user journey), with a one-line rationale; then check that confidence bar against the behavior's risk severity per `references/severity-risk.md` (severity sets _how much_ confidence is sufficient, not _which_ layer). Land each call inside the **target repo's shape** and name its concrete tooling, both per `references/monorepo-layout.md` → _Each repo's test shape in practice_.
+
+4. **Find the gaps and the imbalance, ranked by severity.** Call out behaviors with no recommended coverage, and any existing shape that is wrong for its repo (e.g. E2E doing work integration should do, untested core logic, or a layer the repo doesn't even maintain). **Order gaps by severity** — a Critical behavior with no observed coverage is a top-priority gap and leads the list; Informative behaviors are recorded as out-of-scope rather than gaps. Be explicit about what evidence each gap rests on.
+
+5. **Render the HTML report** per `references/html-report-template.md` (which builds on the shared `../../references/report-template-common.md`) — mechanical formatting, not reasoning. Write `#overview` yourself: recommended shape per platform and the top 3 open risks from `#gaps`, highest severity first. The template owns everything else (section IDs, the Severity column, the Evidence permalinks, the `--kind test-stack` build, and the filename contract).
+
+## Principles
+
+- **Ground every recommendation** in a specific requirement, diff hunk, CSV row, or observed test; treat only _observed_ coverage as verified, and mark anything inferred as an assumption.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
new file mode 100644
index 0000000..7211cd9
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md
@@ -0,0 +1,53 @@
+# Test-stack report template
+
+The **recommendation** report: per-platform test-layer recommendations, risk-weighted by
+severity. Build it against the shared contract in
+`../../../references/report-template-common.md` (output constraints, styling/sentinel rule,
+auto-numbering, ToC, the Header/Overview/Summary/Evidence sections, content rules, and the
+skeleton) — **read that first**. This file covers only what is specific to the test-stack report.
+Build with `--kind test-stack`; the invocation and filename rules are in
+`../../../references/report-style-tokens.md` → _Building the report_.
+
+## Sections (in order)
+
+ToC and section ids, in order: `#overview`, `#summary`, `#evidence`, `#recommendations`, `#gaps`.
+
+- **`#overview`** — recap the **recommended shape per platform**; list the top 3 open risks the
+  reader must resolve before acting, drawn from `#gaps` and **ordered highest severity first**;
+  and anchor into `#recommendations` and `#gaps`.
+- **`#summary`** — heading "Summary & recommended shape". The distribution chart's
+  `.seg flex:<count>` is the **recommended** test count at each layer; caption it
+  `Fig 1 · Recommended layer distribution by platform`. The `.shapes` list gives each platform's
+  recommended shape matched to its repo's actual practice (e.g. "server: unit-heavy pyramid, thin
+  integration, no E2E; ios: integration + snapshot, no XCUITest").
+- **`#recommendations`** — per-platform tables, one row per behavior:
+  `Behavior | Severity | Recommended layer | Tooling | Rationale | Evidence (linked)`.
+  - **Severity** carries the behavior's risk severity (Critical / High / Medium / Low /
+    Informative) per `severity-risk.md`, rendered with the stylesheet's inline-code treatment —
+    `<code>Critical</code>`, **not** a new color token (the layer ramp and assumption/warn/ok
+    badges are the only colored chips the system defines; severity deliberately gets no hue). Mark
+    a severity the analyst inferred (rather than read from a bug's Jira field) with
+    `<span class="badge assumption">assumption</span>`.
+  - Use the layer → repo map; **E2E rows must name the dedicated `test` repo** as target.
+  - **The "Evidence (linked)" column is binding** — render each behavior's representative tests as
+    GitHub permalinks (or the `.unlinkable` span), per `../../../references/report-template-common.md`
+    → _Content rules_. These records come from the coverage inventory.
+- **`#gaps`** — heading "Coverage gaps & imbalances": behaviors with no coverage, and any shape
+  wrong for its repo (ice-cream-cone, over-unit-tested, trivial tests). **Order by severity**,
+  highest first, so a Critical uncovered behavior leads; Informative behaviors are recorded as
+  out-of-scope rather than gaps. Tie each gap to its evidence; mark findings you could not ground
+  with `<span class="badge warn">unverified</span>` and a one-line reason.
+
+## Recommendations section markup
+
+Slot this between `#evidence` and `#gaps` in the shared skeleton:
+
+```html
+<section id="recommendations">
+  <h2>Per-platform recommendations</h2>
+  <div class="scroll">
+    …per-platform tables: Behavior | Severity | Recommended layer | Tooling |
+    Rationale | Evidence (linked)…
+  </div>
+</section>
+```
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md
new file mode 100644
index 0000000..7c23897
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md
@@ -0,0 +1,72 @@
+# Bitwarden repo layout, stacks, and the layer → repo map
+
+Bitwarden's code spans several repositories. A single feature often touches more than
+one, and **each repo follows its own test shape** — pyramid, trophy, or all-E2E (the shapes
+themselves are defined in `test-layers.md`). Treat the table below as a **starting map**, not
+gospel — when a repo is checked out, confirm the actual conventions from its config first (the
+`assessing-test-coverage` skill's `references/finding-coverage.md` → _Discovering a repo's
+test conventions_), and read the table as the last-resort default.
+
+Establishing what a change is **already tested** by — finding existing coverage and citing
+it as permalinks — is a separate job owned by the `assessing-test-coverage` skill. This file
+covers only the repo/stack map and the rules for mapping a behavior to the layer it _should_
+live at.
+
+## Each repo's test shape in practice
+
+Each repo's stack and the shape it actually maintains — not a one-size trophy. Recommend the
+layer that fits the repo's real distribution (see `test-layers.md` for the shapes), landed inside
+that shape and named with the concrete tool below. Each shape was **confirmed against a local
+checkout**; exact repo names and tool versions drift, so re-verify against the checkout, and for
+any repo not listed, infer its stack and shape from the checkout and **state the assumption** in
+the report.
+
+| Repo                                     | Platform · stack · tooling                                                                                                                                                                                                                                                                                                                               | Shape                                       | What that means for recommendations                                                                                                                                                                                                                                 |
+| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `bitwarden/server`                       | Backend / API · C# / .NET, ASP.NET Core, EF Core · xUnit; integration via `WebApplicationFactory` + test DB / in-memory providers                                                                                                                                                                                                                        | **Pyramid** (unit-heavy)                    | Broad unit base (~5:1 over integration), a meaningful integration layer, **no E2E in-repo**. Default behaviors to unit; reserve integration for endpoint/persistence wiring.                                                                                        |
+| `bitwarden/clients`                      | Web, Browser ext, Desktop, CLI · TypeScript, Angular, Electron, RxJS · Jest + `jest-mock-extended` + Angular TestBed (unit + shallow component); mocked HTTP at the boundary — _no_ Testing Library                                                                                                                                                      | **Unit-heavy** (pyramid-leaning)            | ~1,000+ colocated `*.spec.ts`; TestBed component tests mock their children (shallow, not deep integration). Push logic to unit; treat true component-integration as the deliberate step up. No E2E in-repo.                                                         |
+| `bitwarden/ios`                          | iOS · Swift / SwiftUI · XCTest (+ emerging Swift Testing); SnapshotTesting + ViewInspector for SwiftUI views; processor/coordinator tests with mocks                                                                                                                                                                                                     | **Trophy + snapshot layer**                 | Component/processor/coordinator tests with mocks dominate (integration-leaning); the **snapshot-testing** layer for SwiftUI views is first-class; lighter pure-unit layer; **no systematic XCUITest**. Recommend snapshot coverage for view changes explicitly.     |
+| `bitwarden/android`                      | Android · Kotlin · JUnit5 + MockK + Turbine for ViewModels/logic; Compose UI tests run on the JVM via Robolectric                                                                                                                                                                                                                                        | **Unit-heavy + JVM Compose-UI integration** | ~558 JVM `src/test` files: a unit base plus a substantial Compose-UI integration tier on the JVM. **All JVM `src/test` — no `androidTest`/Espresso, no screenshot testing, no E2E in-repo.** Don't recommend device-instrumented or screenshot tests here.          |
+| `bitwarden/sdk-internal`                 | Cross-platform SDK (core logic powering clients via WASM, mobile via UniFFI) · Rust (cargo workspace, ~50 crates), WASM + UniFFI (Swift/Kotlin) bindings · `cargo test --workspace` (no nextest; cargo-llvm-cov for coverage); `mockall` + `wiremock` for the few HTTP/trait integration tests; binding surfaces consumed by `clients`, `ios`, `android` | **Pyramid** (strongly unit-heavy)           | ~97% inline `#[cfg(test)]` unit tests (crypto/encoding/parsing logic, deterministic, no mocks) vs ~3% in `tests/` dirs; mocks only where HTTP or cross-module orchestration matters. **No E2E.** Default to unit; integration only for binding/orchestration flows. |
+| `bitwarden/test`                         | Cross-platform E2E (web, desktop, browser ext, iOS, android, CLI, API) · C# / .NET · NUnit + Selenium WebDriver (web/desktop/ext) + Appium (mobile) + CliWrap (CLI), Page Object Model; drives real builds                                                                                                                                               | **All E2E**                                 | The cross-system journeys themselves. Everything here is E2E by definition — never recommend unit/integration in this repo.                                                                                                                                         |
+| `bitwarden/browser-interactions-testing` | Browser extension autofill (dedicated E2E suite) · TypeScript, Playwright, Docker Compose · Playwright form-fill against real Chromium extension builds; static-page + live-site scenarios                                                                                                                                                               | **All E2E** (autofill)                      | The autofill counterpart to `test`; E2E only.                                                                                                                                                                                                                       |
+
+## Where each layer lives — important
+
+- **Unit and integration** tests live **alongside the code, inside each platform
+  repo** (e.g. `server`'s xUnit projects, `clients`' `*.spec.ts` files, the iOS test
+  targets, and `sdk-internal`'s Rust crates, whose `cargo test` suites sit next to the
+  code they cover).
+- **End-to-end (E2E) tests live in a dedicated `test` repository** — _not_ inside the
+  platform repos. It sits as a sibling of `server` / `clients` / `ios` in the user's
+  Bitwarden checkout root, so look for it next to whichever platform repo you're in
+  (e.g. if `clients` is at `~/repos/Bitwarden/clients`, `test` is at
+  `~/repos/Bitwarden/test`). Source: [`bitwarden/test`](https://github.com/bitwarden/test) — cite this URL
+  in the report only if no local sibling is found.
+- **Browser-extension autofill / form-fill E2E** also has a dedicated repo,
+  [`bitwarden/browser-interactions-testing`](https://github.com/bitwarden/browser-interactions-testing) —
+  Playwright driving real extension builds against static pattern pages and live sites
+  (Chromium today). Note the **overlap**: the cross-platform `test` repo _also_ carries
+  extension autofill coverage, so a given autofill journey may be tested in either (or
+  both). When recommending or inventorying autofill E2E, check both repos and flag where
+  coverage overlaps or where one is the better home, rather than assuming a single owner.
+
+## Mapping a behavior to a platform + layer
+
+1. Identify which repo(s) the behavior lives in from the change surface (diff paths,
+   ticket components, CSV team/area).
+2. Within each repo, choose the layer per `test-layers.md` (the cheapest sufficient layer)
+   **landed inside that repo's shape** from _Each repo's test shape in practice_ above — a
+   pyramid repo like `server` or `sdk-internal` resolves toward unit; `ios` toward its
+   component + snapshot practice — and name the concrete tool from the table above (confirmed
+   against the checkout where possible).
+3. For any cross-system journey worth E2E coverage, target the dedicated `test` repo;
+   for browser-extension autofill / form-fill journeys, also consider
+   `browser-interactions-testing`. Coverage for autofill can live in either repo, so
+   check both and flag any overlap or comparable existing E2E coverage (per the coverage
+   inventory from `assessing-test-coverage`).
+
+Existing coverage to compare these recommendations against — including the GitHub permalinks
+the report's Evidence column requires — comes from the `assessing-test-coverage` skill's
+coverage inventory (`references/finding-coverage.md` → _Citing tests as GitHub permalinks_
+and _Output contract_), not from this file.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
new file mode 100644
index 0000000..d22bb3c
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md
@@ -0,0 +1,75 @@
+# Severity as a risk weight
+
+The layer model (`test-layers.md`) tells you the _cheapest layer that buys the confidence a
+behavior requires_, landed inside the target repo's shape. **Severity tells you how much
+confidence is required.** A defect in vault
+unlock and a typo on a settings label are not owed the same rigor — severity is the dial
+that turns "cheapest sufficient" from a flat rule into a risk-weighted one.
+
+Severity is the **impact of a defect on the system or user**, independent of how urgently
+it gets fixed (that is _priority_). This skill weights coverage by severity, not priority.
+
+## Source of truth
+
+The canonical classification is Bitwarden's **Defect Severity Classification Guide**,
+Confluence page `2759229512`:
+<https://bitwarden.atlassian.net/wiki/spaces/EN/pages/2759229512/Severity>. That page is
+authoritative — read it for the level definitions, criteria, and signals; this file does
+not reproduce them. When the `bitwarden-atlassian-tools` MCP is available, fetch the page
+with `mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_confluence_page` (pageId `2759229512`) and classify
+each behavior against its criteria. If the fetch fails or the MCP is unavailable, classify
+against the generally understood meaning of the levels below using your own judgment, and
+note in the report that severities were assessed without the guide (definitions not
+verified) — degrade gracefully; never block on it.
+
+The levels, highest to lowest impact, are **Critical**, **High**, **Medium**, **Low**, and
+**Informative**. Use these names consistently in the report regardless of source.
+
+**Security-vulnerability defects are the exception:** their severity follows the
+_Vulnerability Tracking and Management_ guide, not the Defect Severity Classification
+Guide. If a behavior is security-sensitive (crypto, auth, a threat-model-relevant path),
+treat its risk as Critical — the highest level — regardless of the level definitions.
+
+## Where each behavior's severity comes from
+
+- **Bug / defect ticket** — read the severity already assigned on the Jira issue (the
+  severity field, or the reporter/QA's stated severity in the description/comments). Use it
+  directly; if it is absent, classify against the guide's criteria and mark it an assumption.
+- **Feature, PR, tech breakdown** — there is no defect yet, so assess each behavior's
+  **risk severity**: _if this behavior broke in production, what severity would the
+  resulting defect carry?_ Classify it against the same criteria. This is what makes the
+  recommendation risk-aware rather than uniform.
+
+## How severity calibrates the recommendation
+
+Severity does **not** mean "push everything Critical to E2E." The cheapest-sufficient rule
+still governs _which_ layer; severity governs _how completely_ the behavior must be covered
+and _how hard a missing test counts as a gap_. Concretely:
+
+- **Critical** — the confidence bar is highest: cover the behavior's material failure modes,
+  not just the happy path, at whatever layer each mode is cheapest to pin down. Critical
+  behaviors that are genuine end-to-end journeys (login, vault unlock, checkout) are exactly
+  what the **thin E2E layer** is reserved for — the guide's "critical user flows"
+  map 1:1 onto that reservation. A Critical behavior with no observed coverage is a
+  **top-priority gap** and belongs at the head of `#overview`'s open risks.
+- **High** — strong integration coverage of the primary path _and_ the documented
+  workaround / affected configuration (the specific client, OS, or auth method that scopes
+  the impact). Reach for E2E only when the path is itself a critical journey. An uncovered
+  High behavior is a gap that should be scheduled, not silently accepted.
+- **Medium** — the plain cheapest-sufficient layer with no escalation. A gap here is worth
+  recording and ranking below Critical/High; it is reasonable to defer.
+- **Low** — minimal coverage; often a single unit or integration assertion, or an explicit
+  "not worth automating" call. Do **not** spend an E2E test on a Low behavior — that is the
+  ice-cream-cone anti-pattern wearing a risk costume.
+- **Informative** — generally not automatable as a Bitwarden behavior; record as
+  out-of-scope rather than as a coverage gap, with a one-line reason.
+
+Two corollaries:
+
+1. **Severity ranks the gaps.** When `#gaps` and `#overview` list open risks, order them by
+   severity — the reader should resolve the Critical-uncovered behaviors first. Gap
+   prioritization is severity-driven, not list-order-driven.
+2. **Severity ≠ priority.** A Low-severity defect can be High-priority before a launch, and
+   a High-severity bug in a rarely used admin panel can be Low-priority. This skill weights
+   coverage by **severity** (impact). Note priority only if the caller supplied it and it
+   changes what to test first.
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/test-layers.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/test-layers.md
new file mode 100644
index 0000000..69ff05e
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/test-layers.md
@@ -0,0 +1,66 @@
+# Test layers and how to assign one
+
+A model for shaping automated test coverage across three layers — **unit**, **integration**,
+**E2E**. How the volume distributes across them describes a repo's _shape_: a **pyramid** (broad
+unit base, moderate integration, thin/absent E2E) suits backend and logic-heavy code; a **trophy**
+(focused unit base, heavy integration bulge, thin E2E) suits application code where behavior emerges
+from collaborators (UI components, view models). Integration is where each shape buys most of its
+confidence — how _much_ of it a repo carries is what separates the two.
+
+Neither shape is universally correct, and **this skill imposes neither.** Bitwarden's repos
+deliberately sit at different points — some pyramid, some trophy, some a mix, two effectively
+**all E2E**. Recommend the layer that fits the **target repo's actual practice** (mapped per repo
+in `monorepo-layout.md`), not an idealized shape. A mix within or across repos is normal.
+
+## The three layers (cheapest → most expensive)
+
+1. **Unit** — tests a single function/class/module in isolation. Best for pure logic, algorithms,
+   edge cases, and error handling where setup is cheap and the unit has real branching complexity.
+   Fast and stable, but isolation lets integration bugs slip through.
+
+2. **Integration** — the **confidence layer**. Tests several units working together through real
+   (or realistic) collaborators: a controller + service + in-memory/test database; a component
+   rendered with its real children and a mocked network boundary; a view model against a real
+   repository. Exercises the wiring users depend on without the cost and flakiness of E2E.
+
+3. **E2E (end-to-end)** — thin top in most repos, the **entire suite** in the dedicated E2E repos.
+   Drives the real, fully assembled system as a user would: real browser, device, backend. Highest
+   confidence per test, but slowest, most expensive, most flaky. In a platform repo, reserve it for
+   a few **critical user journeys** (login, vault unlock, checkout) — not branch coverage. The
+   cross-system journeys themselves live in the `test` repo, where E2E _is_ the strategy.
+
+Static analysis (type checking, linters, formatters) sits below all three and is handled by
+per-repo tooling — this skill makes no recommendations about it.
+
+## How to assign a layer
+
+Apply two rules together:
+
+1. **Cheapest sufficient layer.** Pick the lowest-cost layer (unit < integration < E2E) that still
+   buys the confidence the behavior requires:
+   - Pure transformation, calculation, parsing, validation with real branching → **unit**.
+   - Behavior that emerges from collaborators working together (HTTP handler + service +
+     persistence; component + store + API boundary; view model + repository) → **integration**.
+   - A behavior only meaningful as a full user journey across the real system → **E2E**, and only
+     if genuinely critical.
+   - Anything a type system, analyzer, or linter already guarantees → don't write a test for it.
+
+2. **Honor the target repo's shape.** The cheapest-sufficient call lands inside the shape the repo's
+   engineers actually maintain, so the same behavior resolves differently per repo: in `server` it
+   lands in a unit-heavy pyramid; in `ios` in component/processor integration plus the snapshot
+   layer; a cross-system journey lands as E2E in the dedicated `test` repo, never inside a platform
+   repo. Cite the per-repo shape in `monorepo-layout.md` — and where a repo's real shape is unknown,
+   say so rather than defaulting to a trophy.
+
+## Anti-patterns to avoid (in any shape)
+
+- **Ice-cream cone** — many E2E, few integration/unit. Slow, flaky, expensive. Wrong in any platform
+  repo, including a pyramid repo that has started leaning on E2E for branch coverage.
+- **Over-unit-testing** — exhaustive unit tests with heavy mocking that re-assert the mocks rather
+  than real behavior; integration would buy more. The most common failure in unit-heavy repos.
+- **Testing trivial code** — getters/setters, framework glue, type-guaranteed invariants. Cost
+  without confidence.
+- **E2E for branch coverage** — slow full-system tests covering edge cases that belong at unit or
+  integration.
+- **Forcing a foreign shape** — recommending an integration bulge for a pyramid repo (or vice
+  versa) because a model says so. Match the repo, not the textbook.
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md
new file mode 100644
index 0000000..cd572c8
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md
@@ -0,0 +1,45 @@
+---
+name: assessing-test-coverage
+description: Use when determining what test coverage ALREADY exists for a change — inventorying the tests that currently cover a feature, PR, component, or changed paths across Bitwarden's repos, citing each as a stable GitHub permalink bucketed by test layer, and flagging behaviors with no observed test as gaps. Triggers on "what's already tested", "does this PR have tests", "what coverage exists for", or "is this component covered". This is the backward-looking inventory that feeds test-stack analysis — it does NOT recommend new tests or assign cheapest-sufficient test layers; for that, use analyzing-test-stack.
+allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(git rev-parse:*), Bash(git remote get-url:*), Bash(git -C * rev-parse:*), Bash(git -C * remote get-url:*), Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*)"
+---
+
+# Assessing Test Coverage
+
+Produce an evidence-grounded inventory of what is **already tested** for a change, scoped to the change surface, with every cited test rendered as a stable GitHub permalink and bucketed by test layer. The output is a **coverage inventory**: permalink records for observed tests plus the behaviors/surfaces recorded as gaps (`unverified`). Honesty is the whole point — a behavior with no observed test is a gap, never assumed covered.
+
+## Inputs
+
+You work from a **change surface** and the repos it touches:
+
+- **Change surface** — the changed paths/symbols and named component(s), usually supplied by the caller. Given only a Jira key or a bare PR, derive a minimal surface from the PR diff (`gh pr diff`) first; `../../references/input-sources.md` (shared with `analyzing-test-stack`) covers resolving a PR or Epic into diff paths and linked PRs.
+- **Affected repos** — which platform checkouts to inspect, and whether the sibling `test` repo (E2E) is available.
+- **Linked/merged PRs** — the PRs that shipped this work; their diffs are the primary, permalink-ready coverage evidence.
+
+A missing input narrows the inventory; it never blocks it — record what you could not inspect. **Today's date is provided by the caller** for the report filename; if none is supplied, ask via `AskUserQuestion` rather than reading the clock.
+
+## Workflow
+
+1. **Learn each repo's conventions, config-first.** Before opening any test files, read the repo's Claude config to learn its test tooling and where tests live. Stop as soon as it answers the question. See `references/finding-coverage.md` → _Discovering a repo's test conventions_.
+
+2. **Find existing coverage — PRs first, then a targeted lookup.** Take the tests in the linked/merged PR diffs as primary evidence, then run a lookup **scoped to the change surface** for pre-existing tests — never a repo-wide grep sweep. **Establish coverage per behavior and stop as soon as it is confirmed** (1–3 representative tests plus an approximate count, not every test method) — the dominant cost control, detailed in `references/finding-coverage.md` → _Establish coverage per behavior_. For E2E, inspect the sibling `test` repo if available.
+
+3. **Cite and bucket each behavior's coverage.** For each behavior, render its 1–3 representative tests as GitHub permalinks and record its layer and approximate count, following `references/finding-coverage.md` → _Citing tests as GitHub permalinks_ and _Output contract_ (which also covers the unlinkable-test fallback). Bucket by apparent layer (unit / integration / E2E); layer definitions are in the `analyzing-test-stack` skill's `references/test-layers.md`, the per-repo stack/tooling in its `references/monorepo-layout.md`.
+
+4. **Record gaps.** Any behavior or surface in the change with no PR-observed test and no targeted hit is recorded as a coverage gap / `unverified`. Distinguish _observed_ coverage from _assumed_.
+
+5. **Render the coverage report** per `references/coverage-report-template.md` (which builds on the shared `../../references/report-template-common.md`) — mechanical formatting, not reasoning. Write `#overview` yourself: observed coverage per platform and the top gaps. The template owns everything else (section IDs, the Tests-linked permalinks, the `--kind test-coverage` build, and the filename contract).
+
+## Output
+
+Two artifacts:
+
+- The **coverage inventory** as structured data (record shape in `references/finding-coverage.md` → _Output contract_). When run under the `test-strategist` agent, return these records for `analyzing-test-stack` to consume as-is.
+- The **self-contained HTML coverage report** (step 5), written as `coverage.html` into the per-change report directory `test-engineer-report-<slug>-<date>/`.
+
+Mirror the report's `#overview` in chat — the observed shape per platform and the top gaps — and point the reader at the report file for the per-test detail.
+
+## Principles
+
+- **Observed vs. assumed.** Never present assumed coverage as verified — "I could not inspect the `test` repo" is a finding, not a failure.
+- **Backward-looking only.** You inventory what exists; recommending new tests and judging test shape belong to `analyzing-test-stack`.
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
new file mode 100644
index 0000000..561e010
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md
@@ -0,0 +1,53 @@
+# Coverage report template
+
+The **inventory** report: what is already tested for a change, per platform, every cited test a
+stable GitHub permalink. Build it against the shared contract in
+`../../../references/report-template-common.md` (output constraints, styling/sentinel rule,
+auto-numbering, ToC, the Header/Overview/Summary/Evidence sections, content rules, and the
+skeleton) — **read that first**. This file covers only what is specific to the coverage report.
+Build with `--kind test-coverage`; the invocation and filename rules are in
+`../../../references/report-style-tokens.md` → _Building the report_.
+
+This is the coverage counterpart to the `analyzing-test-stack` test-stack report; the two splice
+the same stylesheet and follow the same shared contract, so they read as one instrument.
+
+## Sections (in order)
+
+ToC and section ids, in order: `#overview`, `#summary`, `#evidence`, `#coverage`, `#gaps`.
+
+- **`#overview`** — recap **how well covered the change is per platform** (where observed tests
+  concentrate, which layers are bare); list the top 3 coverage gaps the reader should know about,
+  drawn from `#gaps`; anchor into `#coverage` and `#gaps`. This report **describes** coverage — it
+  does not recommend new tests or assign cheapest-sufficient layers (that is the test-stack
+  report's job); say so in one line and, if a test-stack report was also produced, link to it.
+- **`#summary`** — heading "Observed coverage shape". The distribution chart's `.seg flex:<count>`
+  is the **count of observed tests** at each layer (not recommended counts); caption it
+  `Fig 1 · Observed test coverage by platform`. The `.shapes` list gives each platform's observed
+  shape (e.g. "server: 14 integration, 3 unit, 0 E2E observed"); a platform with no observed
+  coverage still gets a row, shown empty.
+- **`#coverage`** — per-platform tables, **one row per behavior** (not per test):
+  `Behavior / surface | Layer | Tests (linked) | Count | Source | Notes`.
+  - **Tests (linked)** is binding — render the behavior's 1–3 representative tests as permalinks (or
+    the `.unlinkable` span), per `../../../references/report-template-common.md` → _Content rules_.
+  - **Count** is the approximate number of tests covering that behavior at that layer — breadth
+    without enumerating every test. Do not expand a well-covered behavior into dozens of rows.
+  - **Layer** uses the matching layer chip. **Source** is `PR` (tests shipped in a linked/merged
+    PR) or `pre-existing` (found by the targeted lookup) — keep the observed-vs-assumed
+    distinction visible.
+- **`#gaps`** — heading "Coverage gaps": behaviors/surfaces in the change with **no observed
+  test**, each marked `<span class="badge warn">unverified</span>` with a one-line reason (no
+  PR-observed test and no targeted hit; or `test` repo unavailable). The honest record of what is
+  _not_ known to be covered — not a recommendation to add tests.
+
+## Coverage section markup
+
+Slot this between `#evidence` and `#gaps` in the shared skeleton:
+
+```html
+<section id="coverage">
+  <h2>Observed coverage</h2>
+  <div class="scroll">
+    …per-platform behavior→test tables with linked evidence…
+  </div>
+</section>
+```
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
new file mode 100644
index 0000000..cfbd54a
--- /dev/null
+++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md
@@ -0,0 +1,145 @@
+# Finding and citing existing test coverage
+
+How to determine what a change is **already** tested by, scoped to the change surface, and how to cite each observed test as a stable link. This is the repo-reading half of test engineering; the layer-mapping half (which layer a behavior _should_ live at) is in the `analyzing-test-stack` skill.
+
+## Discovering a repo's test conventions (config-first)
+
+Test conventions, tooling, and where tests live are usually documented in a repo's Claude
+config — read it **before** opening any test files, and stop as soon as it answers the
+question. This keeps token spend low on large repos. Work the tiers in order:
+
+1. **Config first.** Read the repo's root `CLAUDE.md`, its `.claude/` directory (rules and
+   settings), and any **nested `CLAUDE.md`** in the subdirectories the change touches (e.g.
+   `clients/apps/<app>/CLAUDE.md`). Extract the test tooling, the test-file layout/naming, and
+   any stated layer conventions.
+2. **Test files as fallback — only for gaps config leaves.** If config is silent on a
+   convention you need, read a _few representative_ test files near the change surface to
+   confirm it. Do **not** sweep the repo.
+3. **Generic stack table as last resort.** When neither config nor local tests answer, fall
+   back to the per-repo stack/tooling table in the `analyzing-test-stack` skill's
+   `references/monorepo-layout.md` and **state the assumption** in the result.
+
+These tiers govern _conventions_ — what the tooling is and where tests live. Finding which
+behaviors are _already covered_ is the next job, below.
+
+## Finding existing coverage (PRs first, then a targeted lookup)
+
+Reliably establishing what is **already tested** does not require grepping a whole repo. Work
+through two ordered steps, and record anything still unfound as a gap rather than dropping it:
+
+1. **Merged/linked PRs are the backbone.** The PRs hanging off the Jira issue and its epic
+   children (`mcp__plugin_bitwarden-atlassian-tools_bitwarden-atlassian__get_issue_remote_links` → `gh pr view`/`gh pr diff`) are the reliable record of
+   the tests that shipped with this work, and are already permalink-ready via the PR head SHA.
+   Take the tests observed in those PR diffs as primary coverage evidence.
+2. **Targeted repo lookup for pre-existing tests.** Tests written _before_ this ticket won't
+   appear in those PRs. Find them with a lookup **scoped to the change surface** — the files
+   and symbols the PRs/diff touch, and the component named in the ticket — not a repo-wide
+   sweep. Confirm conventions from config (above) so the lookup targets the right paths.
+
+For end-to-end coverage, inspect the dedicated sibling `test` repo if it is checked out (see
+the `analyzing-test-stack` skill's `references/monorepo-layout.md` → _Where each layer lives_)
+and cite specific files; if it is not available, record E2E coverage as `unverified`.
+
+A behavior with no PR-observed test and no targeted hit is recorded as a coverage gap /
+`unverified` — never silently assumed covered.
+
+### Establish coverage per behavior, not per test — stop as soon as it's confirmed
+
+The inventory is keyed to the **change's testable behaviors**, not to every test method in the
+repo. For each behavior, find _whether and at what layer_ it is covered, capture **1–3
+representative tests** plus an approximate **count** at that layer, then **move on** — do not
+enumerate every test in a covered area. A behavior backed by 40 unit tests is recorded as
+`{ count: ~40, representative: [3 permalinks] }`, not 40 records. This is the dominant cost control
+on large repos: two or three confirming tests prove a behavior is covered; cataloguing the rest
+burns tool calls, bloats the downstream report, and adds cost, not confidence.
+
+## Citing tests as GitHub permalinks
+
+Every test cited as **current coverage** must be rendered as a clickable
+GitHub permalink so a reader can jump to the actual test. The link form is:
+
+```text
+https://github.com/<owner>/<repo>/blob/<SHA>/<path>#L<start>-L<end>
+```
+
+Use the **commit SHA**, not a branch name. Branch links rot under rebase and
+force-push; SHA links are stable.
+
+### Acquiring the four ingredients
+
+1. **`owner/repo`** — from the remote URL.
+   - PR-sourced: parse from the PR URL (e.g. `gh pr view <pr> --json url`).
+   - Local checkout: `git -C <repo> remote get-url origin` and parse the
+     `github.com[:/]<owner>/<repo>(\.git)?` segment.
+2. **Commit SHA**.
+   - PR-sourced: `gh pr view <pr> --json headRefOid` returns the PR head SHA. This is
+     the SHA the diff was computed against and is the right anchor for any
+     tests-in-PR or tests-on-the-PR-branch references.
+   - Local checkout: `git -C <repo> rev-parse HEAD` for the checked-out commit's SHA. If
+     the working tree is dirty (uncommitted changes), still use HEAD and note in the
+     evidence that links point to HEAD, not the working tree.
+3. **Path** — repo-relative path of the test file (no leading slash). The same path
+   you'd pass to `Read`, minus the repo root.
+4. **Line range** — start line through end line of the test declaration. Acceptable
+   resolutions, in descending preference:
+   - Full block: from the `it(`/`test(`/`Test(`/`func Test…(` declaration line through
+     the matching closing brace.
+   - Declaration only: the single line where the test name is declared (`#L42`).
+   - File only (`#L1`) — accept reluctantly, and only when grep cannot localize the
+     test. Avoid for newly authored tests.
+
+### When a test cannot be linked
+
+If any of the four ingredients is missing — no remote (`git remote get-url origin`
+errors or prints nothing), detached HEAD with no remote, a private fork the session cannot
+reach, or a file that exists only in an unpushed local working tree — record the test as
+**unlinkable** with the reason. Never fabricate a URL. Both this skill's coverage report
+(`coverage-report-template.md`) and the downstream `analyzing-test-stack` test-stack report
+render these as `<span class="unlinkable">path — unlinkable: &lt;reason&gt;</span>`.
+
+### Output contract
+
+Return **one record per behavior** (not per test), carrying its platform, layer, status, an
+approximate count, 1–3 representative tests as evidence, and — when extracted from a Jira item —
+the originating `source_issue` (`key` + browse `url`) so the report can link the behavior back to
+its requirement (see `../../../references/input-sources.md` → _Citing Jira issues as links_). The
+`source_issue` is **carried through from intake** with the behavior — it is provenance recorded
+when the behavior was extracted, not something coverage discovery determines; echo it through when
+present. A behavior with no Jira source (e.g. found only in a PR diff) omits `source_issue`.
+
+```json
+{
+  "behavior": "bank account item type round-trips through import/export",
+  "platform": "server",
+  "layer": "integration",
+  "status": "covered",
+  "count": 21,
+  "source_issue": {
+    "key": "PM-32009",
+    "url": "https://bitwarden.atlassian.net/browse/PM-32009"
+  },
+  "representative": [
+    {
+      "path": "test/Core.Test/Vault/.../CipherItemTypeTests.cs",
+      "start_line": 42,
+      "end_line": 89,
+      "owner_repo": "bitwarden/server",
+      "sha": "a1b2c3d4e5f6…",
+      "permalink": "https://github.com/bitwarden/server/blob/a1b2c3d4e5f6…/test/Core.Test/Vault/.../CipherItemTypeTests.cs#L42-L89"
+    }
+  ]
+}
+```
+
+A representative test that cannot be linked is recorded path-only with a reason inside
+`representative` (`{ "path": "…", "unlinkable_reason": "no remote for local checkout" }`) —
+never fabricate a URL. Behaviors/surfaces with no observed test are returned as gaps:
+
+```json
+{ "behavior": "organization policy can restrict the Driver License item type", "platform": "server", "status": "unverified" }
+```
+
+Keep `representative` to at most three tests per behavior; the `count` conveys breadth
+without listing every test. The `analyzing-test-stack` recommender consumes these records as-is
+to populate the report's Evidence (linked) column (rendering the representative permalinks) and
+to seed its gap analysis.