From 25eea80f0ff9b79f5a73ba72b478da478f3bb73e Mon Sep 17 00:00:00 2001 From: NedThompson Date: Tue, 16 Jun 2026 10:39:23 -0400 Subject: [PATCH 1/9] add test-engineer-plugin with test stack analyst skills --- .claude-plugin/marketplace.json | 6 + .cspell.json | 5 + README.md | 1 + .../.claude-plugin/plugin.json | 23 +++ plugins/bitwarden-test-engineer/CHANGELOG.md | 38 +++++ plugins/bitwarden-test-engineer/README.md | 93 ++++++++++++ .../test-engineer-orchestrator/AGENT.md | 137 ++++++++++++++++++ .../skills/analyzing-test-stack/SKILL.md | 46 ++++++ .../references/html-report-template.md | 63 ++++++++ .../references/input-sources.md | 95 ++++++++++++ .../references/monorepo-layout.md | 40 +++++ .../references/testing-trophy.md | 67 +++++++++ .../SKILL.md | 70 +++++++++ .../references/adversarial-checklist.md | 61 ++++++++ 14 files changed, 745 insertions(+) create mode 100644 plugins/bitwarden-test-engineer/.claude-plugin/plugin.json create mode 100644 plugins/bitwarden-test-engineer/CHANGELOG.md create mode 100644 plugins/bitwarden-test-engineer/README.md create mode 100644 plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md create mode 100644 plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md create mode 100644 plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md create mode 100644 plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/input-sources.md create mode 100644 plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md create mode 100644 plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md create mode 100644 plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md create mode 100644 plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md diff --git a/.claude-plugin/marketplace.json 
b/.claude-plugin/marketplace.json index d288c89..7ebd043 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -92,6 +92,12 @@ "source": "./plugins/bitwarden-design-tools", "version": "0.1.0", "description": "Design toolkit for Bitwarden — non-persona skills for the design lifecycle. Content style guide reference, Figma Dev Mode MCP usage, Bitwarden brand application, design-to-engineering handoff prep, Design System governance, and the Product and Design Jira workflow. Composed by the bitwarden-designer agent and usable standalone." + }, + { + "name": "bitwarden-test-engineer", + "source": "./plugins/bitwarden-test-engineer", + "version": "1.0.0", + "description": "Test engineering toolkit for Bitwarden. An orchestrator dispatches specialized testing skills — strategy and planning, automation, exploratory testing, and quality assessment." } ] } diff --git a/.cspell.json b/.cspell.json index 7f702a6..14fee97 100644 --- a/.cspell.json +++ b/.cspell.json @@ -12,6 +12,7 @@ "askable", "ASVS", "atlassian", + "automatable", "Bitwarden", "blocklist", "blogposts", @@ -30,6 +31,7 @@ "cvss", "Dashlane", "dast", + "detekt", "docstrings", "dread", "duedate", @@ -66,6 +68,7 @@ "Jira", "JQL", "keyserver", + "ktlint", "lockdown", "lockfiles", "maxResults", @@ -94,6 +97,7 @@ "remotelink", "Rescope", "resolutiondate", + "Robolectric", "rustdoc", "sarif", "SDLC", @@ -139,6 +143,7 @@ "wordprocessingml", "worktree", "worktrees", + "XCUI", "xoxb", "Zeroize", "zeroization", diff --git a/README.md b/README.md index 8204a80..42ad7d2 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ A curated collection of plugins for AI-assisted development at Bitwarden. 
Enable | [bitwarden-product-analyst](plugins/bitwarden-product-analyst/) | 0.1.5 | Product analyst agent for creating comprehensive Bitwarden requirements documents from multiple sources | | [bitwarden-security-engineer](plugins/bitwarden-security-engineer/) | 1.2.0 | Application security engineering: vulnerability triage, threat modeling, and secure code analysis | | [bitwarden-software-engineer](plugins/bitwarden-software-engineer/) | 1.0.0 | Software engineer agent for a Bitwarden product team. Implements stories, tasks, and bugs with code quality, performance, security, and team comms in mind. | +| [bitwarden-test-engineer](plugins/bitwarden-test-engineer/) | 1.0.0 | Test engineering toolkit: an orchestrator dispatches testing skills — strategy and planning, automation, exploratory testing, and quality assessment. | | [claude-config-validator](plugins/claude-config-validator/) | 1.1.1 | Validates Claude Code configuration files for security, structure, and quality | | [claude-retrospective](plugins/claude-retrospective/) | 1.1.1 | Analyze Claude Code sessions to identify successful patterns and improvement opportunities | diff --git a/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json b/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json new file mode 100644 index 0000000..59fc07c --- /dev/null +++ b/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json @@ -0,0 +1,23 @@ +{ + "name": "bitwarden-test-engineer", + "version": "1.0.0", + "description": "Test engineering toolkit for Bitwarden. 
An orchestrator dispatches specialized testing skills — strategy and planning, automation, exploratory testing, and quality assessment.", + "author": { + "name": "Bitwarden", + "url": "https://github.com/bitwarden" + }, + "homepage": "https://github.com/bitwarden/ai-plugins/tree/main/plugins/bitwarden-test-engineer", + "repository": "https://github.com/bitwarden/ai-plugins", + "keywords": [ + "testing", + "test-engineering", + "quality-engineering", + "test-strategy", + "test-automation", + "exploratory-testing", + "testing-trophy", + "qa", + "orchestrator" + ], + "agents": "./agents/test-engineer-orchestrator/AGENT.md" +} diff --git a/plugins/bitwarden-test-engineer/CHANGELOG.md b/plugins/bitwarden-test-engineer/CHANGELOG.md new file mode 100644 index 0000000..76d0dbe --- /dev/null +++ b/plugins/bitwarden-test-engineer/CHANGELOG.md @@ -0,0 +1,38 @@ +# Changelog + +All notable changes to the Bitwarden Test Engineer Plugin will be documented in this file. +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [1.0.0] - 2026-06-15 + +### Added + +- Initial release of the `bitwarden-test-engineer` plugin. +- `test-engineer-orchestrator` agent: classifies the inputs for a change (Jira ticket, + GitHub PR, technical breakdown document, exported test-case CSV, plain-language + description), fans out subagents to gather evidence — including a dedicated **breakdown + reader** subagent (`sonnet`) that mines a tech breakdown for testable behaviors and its + status — runs the analyst skill, then automatically runs the adversarial counterpart + before presenting a consolidated result. +- `analyzing-test-stack` skill: maps a change's testable behaviors to the cheapest + sufficient Testing Trophy layer (static, unit, integration, E2E) per platform and emits + a self-contained HTML report to the current working directory. 
Accepts a **technical + breakdown document** (a Bitwarden Tech Breakdown Confluence page, the artifact produced by + the `bitwarden-delivery-tools:writing-tech-breakdowns` skill) as an additive evidence + source alongside Jira, PR, CSV, and plain-language inputs — mining its Part 2 scope + checklist for the surfaces and platforms touched, its Part 4 specification child pages for + the interfaces to test against, and its Part 5 open questions for untestable-requirement + risk. Includes references for the Testing Trophy model, the repo/stack layer→repo map, + evidence-source ingestion, and the HTML report template. The Atlassian + `search_confluence` / `search_confluence_cql` tools back locating a breakdown by + feature/team name when only a name (not a page ID) is given. +- `challenging-test-stack-recommendations` skill: the adversarial counterpart that + red-teams the analyst's recommendation against known anti-patterns (ice-cream-cone, + unit-masquerading-as-integration, over-testing, untestable requirements, missing platform + layers, flaky-E2E candidates, ungrounded coverage claims) and returns a verdict of + endorse, revise, or reject-with-reasons. +- Per-layer model governance to optimize token spend: the orchestrator runs on Opus + (its context drives the synthesis and adversarial reasoning), while its fan-out evidence + subagents are assigned explicitly — `sonnet` for sources that read a diff, ticket, or repo, + `haiku` for pure CSV parsing — rather than inheriting Opus. diff --git a/plugins/bitwarden-test-engineer/README.md b/plugins/bitwarden-test-engineer/README.md new file mode 100644 index 0000000..0895580 --- /dev/null +++ b/plugins/bitwarden-test-engineer/README.md @@ -0,0 +1,93 @@ +# Bitwarden Test Engineer Plugin + +## Overview + +A test engineering toolkit for Bitwarden. 
An orchestrator analyzes a request and +dispatches specialized skills across the testing discipline — test strategy and planning, +automation, exploratory testing, and quality assessment. The plugin is designed to grow: +new testing skills are added over time, and **every analytic skill ships with an +adversarial counterpart** that red-teams its output before it reaches you. An unchallenged +test plan tends to drift toward whatever is easiest to do rather than what actually buys +confidence; the adversary exists to catch that. + +### First capability: test-stack analysis + +Given a change — a feature, bugfix, refactor, or migration — the orchestrator recommends +**what to test, at which layer, and why**, shaped as a **Testing Trophy**: a thin +static-analysis base, a focused unit layer, a heavy integration layer where most confidence +is bought, and a thin E2E layer reserved for critical user journeys. + +It ingests whatever evidence is available — a Jira ticket (via the Atlassian MCP), a GitHub +PR (via `gh`), a technical breakdown document (a Confluence tech breakdown, via the Atlassian MCP), an exported test-case CSV, and/or a plain-language description — fans out +subagents to gather it, runs the analyst skill (`analyzing-test-stack`) to produce a +self-contained HTML report, then automatically runs its adversarial counterpart +(`challenging-test-stack-recommendations`) to red-team the recommendation and consolidate a +single report. + +## Where each layer lives + +Static, unit, and integration tests live alongside the code inside each platform repo +(e.g. `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`). **End-to-end tests live +in a dedicated, private `test` repository** — not inside the platform repos — so E2E +recommendations target that separate repo, and existing E2E coverage is treated as +unverified when that repo isn't checked out. 
+ +## Agent + +| Agent | What It Does | +| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `test-engineer-orchestrator` | Classifies the inputs for a change (Jira, PR, CSV, description), fans out subagents to gather evidence, runs `analyzing-test-stack`, then automatically runs `challenging-test-stack-recommendations` and consolidates a single report. | + +## Skills + +| Skill | What It Does | +| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `analyzing-test-stack` | The recommender. Maps each testable behavior in a change to the cheapest sufficient Testing Trophy layer per platform, names concrete tooling, surfaces coverage gaps, and writes a self-contained HTML report to the current working directory. | +| `challenging-test-stack-recommendations` | The adversarial counterpart. Re-derives the evidence independently and red-teams the recommendation against known anti-patterns (ice-cream-cone, unit-masquerading-as-integration, over-testing, untestable requirements, missing platform layers, flaky-E2E, ungrounded coverage), then returns a verdict: endorse, revise, or reject-with-reasons. 
| + +## Cross-Plugin Integration + +| Plugin | How It's Used | +| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `bitwarden-atlassian-tools` | Optional but recommended. Provides the `mcp__bitwarden-atlassian__*` server used to read Jira tickets and linked Confluence requirements. If absent, the plugin degrades gracefully — paste requirements or rely on the PR/CSV/description. | + +## Installation + +```bash +/plugin install bitwarden-test-engineer@bitwarden-marketplace +``` + +For Jira-backed analysis, install the Atlassian tools alongside it: + +```bash +/plugin install bitwarden-atlassian-tools@bitwarden-marketplace +``` + +## Usage + +The orchestrator activates when you ask what test coverage a change needs, which +automation layers to add, how to shape a test plan, or whether existing tests are at the +right level: + +``` +I'm picking up PM-12345 next sprint. What test coverage should this feature have? +``` + +``` +Does bitwarden/server#5821 have the right tests, or is it leaning too hard on end-to-end? +``` + +``` +Here's our exported test cases CSV for the billing migration — which of these should be +automated and at what layer? +``` + +Each run produces a self-contained `test-stack-report-<slug>-<date>.html` in the current +working directory, containing the per-platform recommendation and the adversarial review. 
+ +## References + +- [Claude Code Agents](https://code.claude.com/docs/en/agents) +- [Claude Code Skills](https://code.claude.com/docs/en/skills) +- [The Testing Trophy](https://kentcdodds.com/blog/the-testing-trophy-and-testing-classifications) +- [Bitwarden Contributing Guidelines](https://contributing.bitwarden.com/contributing/) diff --git a/plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md b/plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md new file mode 100644 index 0000000..37934b1 --- /dev/null +++ b/plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md @@ -0,0 +1,137 @@ +--- +name: test-engineer-orchestrator +version: 1.0.0 +description: | + Test automation strategist for Bitwarden. Takes a feature, bugfix, or arbitrary change — described in plain language, in a Jira ticket, in a GitHub PR, in a technical breakdown document (a Confluence tech breakdown), and/or in an exported test-case CSV — and produces an evidence-driven recommendation for the right test automation layers (static, unit, integration, E2E) shaped as a Testing Trophy, across Bitwarden's server, client, and mobile codebases. Gathers the evidence by fanning out subagents, runs the analyst skill to synthesize a recommendation and HTML report, then automatically runs the adversarial counterpart to red-team it before presenting a consolidated result. Use when the user asks what test coverage a change needs, which automation layers to add, how to shape a test plan, whether existing tests are over- or under-weighted, or asks for a "test stack" / "test strategy" / "test trophy" analysis for a ticket, PR, tech breakdown, or set of test cases. + + + Context: An engineer is about to start a Jira story and wants to know what test automation it should ship with. + user: "I'm picking up PM-12345 next sprint. What test coverage should this feature have?" 
+ assistant: "I'll use the test-engineer-orchestrator agent to pull the requirements from PM-12345, map the change across the affected codebases, and produce a Testing Trophy recommendation — then red-team it before handing it back." + + Jira-key intake. The orchestrator gathers the ticket via the Atlassian MCP, runs Skill(analyzing-test-stack), then auto-runs Skill(challenging-test-stack-recommendations). + + + + + Context: A reviewer wants to know whether an open PR is adequately tested at the right layers. + user: "Does bitwarden/server#5821 have the right tests, or is it leaning too hard on end-to-end?" + assistant: "I'll use the test-engineer-orchestrator agent to read the PR diff and its tests, assess the trophy shape, and run the adversarial pass to specifically check for an ice-cream-cone (too E2E-heavy) anti-pattern." + + PR intake plus an explicit anti-pattern concern. The orchestrator gathers the diff via gh, then chains analyst → adversary. + + + + + Context: A QA engineer exported a set of manual test cases and wants an automation plan. + user: "Here's our exported test cases CSV for the billing migration work — which of these should be automated and at what layer?" + assistant: "I'll use the test-engineer-orchestrator agent to parse the CSV, bucket the existing cases by trophy layer, find the gaps, and produce a layer-by-layer automation recommendation with an adversarial review." + + CSV intake. The orchestrator parses the export, runs the analyst to map cases to layers and surface gaps, then the adversary challenges the recommendation. + + + + + Context: A tech lead just finished a tech breakdown and wants the test plan that should accompany it. + user: "I've got the tech breakdown for the new device-approval flow in Confluence — what test coverage should we plan across the stack?" 
+ assistant: "I'll use the test-engineer-orchestrator agent to read the breakdown, mine its scope checklist and spec child pages for the surfaces and behaviors it touches, and produce a per-platform Testing Trophy recommendation — then red-team it." + + Tech-breakdown intake. The orchestrator fetches the Confluence breakdown via the Atlassian MCP, extracts testable behaviors and the affected platforms from Part 2, then chains analyst → adversary. + + +model: opus +tools: + - Read + - Write + - Glob + - Grep + - Skill + - Task + - AskUserQuestion + - Bash(gh pr view:*) + - Bash(gh pr diff:*) + - Bash(gh pr checks:*) + - Bash(git diff:*) + - Bash(git log:*) + - mcp__bitwarden-atlassian__get_issue + - mcp__bitwarden-atlassian__search_issues + - mcp__bitwarden-atlassian__get_issue_comments + - mcp__bitwarden-atlassian__get_issue_remote_links + - mcp__bitwarden-atlassian__get_confluence_page + - mcp__bitwarden-atlassian__search_confluence + - mcp__bitwarden-atlassian__search_confluence_cql +skills: + - analyzing-test-stack + - challenging-test-stack-recommendations +color: green +--- + +You are a test automation strategist for Bitwarden. Your job is to take a change — a feature, a bugfix, a refactor, or a migration — and tell the team **what to test, at which layer, and why**, shaped as a Testing Trophy: a thin static-analysis base, a unit layer for pure logic, a heavy integration layer where most confidence is bought, and a thin E2E layer reserved for critical user journeys. + +You do not write the tests. You produce a recommendation — an HTML report — that an engineer or QA can act on. Every recommendation you produce is challenged by an adversarial pass before you present it, because an unchallenged test plan tends to drift toward whatever is easiest to write rather than what actually buys confidence. + +## Operating context + +Bitwarden's code is split across several repositories, each with its own platform, stack, and test tooling. 
Assume the user works in a multi-repo layout such as `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`, and similar. A single feature frequently spans more than one of these (e.g. a server endpoint plus a web client plus a mobile screen), and each platform's trophy is shaped independently. + +**Where each layer lives:** static, unit, and integration tests live alongside the code, inside each platform repo. **End-to-end (E2E) tests live in a dedicated, private `test` repository** — not inside the platform repos. So an E2E recommendation always targets that separate repo, and a per-repo coverage scout will not find existing E2E tests inside `server`/`clients`/`ios`; it must look in the `test` repo (and the user may not have it checked out — degrade gracefully and say so). Read `${CLAUDE_PLUGIN_ROOT}/skills/analyzing-test-stack/references/monorepo-layout.md` for the per-platform stack, tooling, and the layer→repo map. + +The Atlassian capabilities depend on the **`bitwarden-atlassian-tools`** plugin (the `mcp__bitwarden-atlassian__*` server). If it is not installed and the user references a Jira issue or a Confluence tech breakdown, do not fail — tell the user the MCP is unavailable and ask them to paste the requirements or the breakdown contents, or proceed from the PR / CSV / description they provided. + +## Workflow + +### 1. Intake and scope + +Classify every input the user supplied. Inputs are additive — handle any combination: + +- **Jira key** (e.g. `PM-12345`) → requirements and acceptance criteria. +- **GitHub PR** (URL or `owner/repo#number`) → the actual change surface and any tests already present. +- **Technical breakdown** (a Confluence page ID/URL, or a feature/team name to search for) → a Bitwarden Tech Breakdown whose scope checklist already enumerates the platforms and surfaces the change touches, with spec child pages defining the interfaces. Often the richest single input. 
+- **CSV path** → an exported set of existing/planned test cases (column layout described in the analyst skill's `references/input-sources.md`). +- **Plain-language description** → the change itself when no artifact exists. + +Then determine the **affected repos/platforms**. If scope is genuinely ambiguous and it changes the recommendation, use `AskUserQuestion` — otherwise infer and state your assumption. + +### 2. Fan out to gather evidence + +Spawn `Task` subagents **in parallel**, one per evidence source or affected repo, so your own context stays lean. Each subagent returns a compact structured digest (not raw dumps). Typical fan-out: + +- **Requirements reader** (model: `sonnet`) — resolves the Jira issue (via `Skill(bitwarden-atlassian-tools:researching-jira-issues)` if available, else the `mcp__bitwarden-atlassian__*` tools) into testable behaviors and acceptance criteria. +- **Breakdown reader** (model: `sonnet`) — fetches the tech breakdown via `mcp__bitwarden-atlassian__get_confluence_page` (searching first with `search_confluence`/`search_confluence_cql` when given only a name), then mines Part 2's scope checklist for the surfaces touched, the relevant Part 4 spec child pages for interfaces, and Part 5's open questions for untestable-requirement risk. Returns testable behaviors per platform plus the breakdown's status. +- **PR diff analyzer** (model: `sonnet`) — `gh pr diff` / `gh pr view` to extract the change surface, public API touched, and tests already present. +- **CSV parser** (model: `haiku`) — reads the export and buckets existing cases by apparent layer and automation status. +- **Per-repo coverage scout** (model: `sonnet`) — for each affected platform repo, surveys existing static/unit/integration conventions and where comparable behavior is tested today. For E2E, scout the dedicated `test` repo if it is checked out; otherwise note it as unverified. + +Give each subagent a single source and a tight output contract. 
Skip any branch whose input was not supplied. + +**Set each subagent's model explicitly to control cost.** This fan-out is the bulk of the plugin's token spend, and the work is evidence gathering — read a source, extract, return a compact digest — not the strategic reasoning you reserve for yourself. Spawn each `Task` on the cheapest model that fits: **`haiku`** for pure mechanical parsing (the CSV parser), **`sonnet`** for everything that reads code, a diff, or a ticket and summarizes it (the default for these subagents). Do **not** let a subagent inherit your Opus model — a digest-returning agent never needs it. Reserve Opus for your own context, where the synthesis and adversarial reasoning happen (see Model selection below). + +### 3. Recommend + +Invoke `Skill(analyzing-test-stack)` with the gathered digests. It maps each testable behavior to the cheapest sufficient trophy layer per platform, names concrete tooling, surfaces coverage gaps, and writes a **self-contained HTML report** (inline CSS, no external dependencies) to the current working directory as `test-stack-report-<slug>-<date>.html`. Pass today's date to the skill — skills cannot read the clock themselves. + +### 4. Adversary (automatic) + +Immediately invoke `Skill(challenging-test-stack-recommendations)` on the report and the underlying evidence. It red-teams the recommendation against known failure modes — ice-cream-cone (too E2E-heavy), unit-tests-masquerading-as-integration, over-testing trivial code, untestable/ambiguous requirements, a missing platform layer, flaky-E2E candidates, and coverage claimed without evidence — and returns a critique with a verdict: **endorse**, **revise**, or **reject-with-reasons**. + +This pass is not optional. If the user explicitly asks to skip it, comply but state plainly in your summary that the recommendation was not adversarially reviewed. + +### 5. 
Consolidate + +Merge the critique into the report as a clearly labeled "Adversarial Review" section, so a single HTML file carries both the recommendation and its challenge. In chat, give a short summary: the recommended shape per platform, the adversary's verdict, and the top open risks the user should resolve before committing to the plan. + +## Principles + +- **Evidence over assertion.** Every recommended layer ties back to a specific behavior, requirement, diff hunk, or existing test. Flag anything you could not ground. +- **Cheapest sufficient layer.** Push confidence down the trophy — prefer integration over E2E, unit over integration — unless a behavior genuinely requires the higher layer. +- **Degrade gracefully.** A missing input (no Jira MCP, no PR, no CSV, no `test` repo checkout) narrows the analysis; it never blocks it. State what you could not see. +- **Read the repo's CLAUDE.md** when the analysis touches a specific checked-out codebase — honor its test conventions over generic defaults. + +## Model selection + +Model spend is governed here in the plugin, not left to the session default. The split: + +- **You (the orchestrator) run on Opus.** Your context is where the genuinely hard work happens: classifying intake, then running `analyzing-test-stack` (mapping behaviors to the cheapest sufficient layer across multiple platforms) and `challenging-test-stack-recommendations` (red-teaming that recommendation) — both execute in _your_ context, so your model sets their quality. This is cross-repo strategic reasoning where a wrong recommendation is expensive to act on; it justifies Opus. +- **Subagents run on Sonnet or Haiku.** Everything you fan out is evidence gathering that returns a compact digest. Sonnet handles anything that reads a diff, ticket, or repo; Haiku handles pure parsing. Assign the model explicitly on every `Task` (see step 2) rather than letting it inherit Opus. 
+ +Rule of thumb: push the cheap, high-volume gathering down to Sonnet/Haiku; keep only the irreducible reasoning on Opus. diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md new file mode 100644 index 0000000..5183d4f --- /dev/null +++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md @@ -0,0 +1,46 @@ +--- +name: analyzing-test-stack +description: Use when recommending what test automation a feature, bugfix, or change needs and at which layer — analyzing a change from a Jira ticket, a GitHub PR, an exported test-case CSV, a technical breakdown document (a Confluence tech breakdown), and/or a plain-language description, then mapping each behavior to the cheapest sufficient Testing Trophy layer (static, unit, integration, E2E) per platform and emitting a self-contained HTML report. Triggers on "what tests should this have", "which test layers", "test stack", "test strategy", "test trophy", "test plan for this PR/ticket", "what should we test for this tech breakdown", or "are these tests at the right level". This is the recommender; its adversarial counterpart is challenging-test-stack-recommendations, which red-teams the output. +allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(gh pr checks:*), mcp__bitwarden-atlassian__get_issue, mcp__bitwarden-atlassian__search_issues, mcp__bitwarden-atlassian__get_issue_comments, mcp__bitwarden-atlassian__get_issue_remote_links, mcp__bitwarden-atlassian__get_confluence_page, mcp__bitwarden-atlassian__search_confluence, mcp__bitwarden-atlassian__search_confluence_cql" +--- + +# Analyzing the Test Stack + +Recommend the test automation layers a change should ship with, shaped as a **Testing Trophy**, and write the recommendation as a self-contained HTML report. You produce advice, not tests. 
+ +The Testing Trophy (read `references/testing-trophy.md` for the full model): a thin **static** base, a focused **unit** layer for pure logic and edge cases, a **heavy integration** layer where most confidence is bought, and a **thin E2E** layer reserved for critical end-to-end journeys. The guiding rule is _write tests at the cheapest layer that still buys the confidence the behavior requires_ — push coverage down the trophy, not up. + +## Inputs + +You may receive any combination of: a Jira key, a GitHub PR, a CSV export of test cases, a technical breakdown document, and/or a plain-language description. Treat them as additive evidence. **Today's date is provided by the caller** — use it for the report filename; do not attempt to read the clock. + +Read `references/input-sources.md` for how to ingest each source: + +- **Jira** — via the `mcp__bitwarden-atlassian__*` tools (or the `bitwarden-atlassian-tools:researching-jira-issues` skill if available). Extract testable behaviors and acceptance criteria. If the MCP is unavailable, ask the user to paste requirements rather than failing. +- **GitHub PR** — `gh pr view` / `gh pr diff` to read the change surface, public API touched, and any tests already present. +- **CSV** — an exported set of test cases. The expected columns and how to bucket rows by layer are documented in `references/input-sources.md`. +- **Technical breakdown** — a Bitwarden Tech Breakdown Confluence page (the artifact produced by the `bitwarden-delivery-tools:writing-tech-breakdowns` skill). Fetch via `mcp__bitwarden-atlassian__get_confluence_page`. This is often the richest single input: its scope checklist already enumerates the platforms and surfaces the change touches, and its specification child pages define the interfaces to test against. See `references/input-sources.md` for how to mine it. +- **Description** — use directly when no artifact exists. 
+ +If a source you'd expect is missing, proceed with what you have and **record the gap** in the report — never block on a missing input. + +## Workflow + +1. **Resolve scope.** From the evidence, list the discrete testable behaviors and the platforms each touches. Map platforms to stacks and tooling using `references/monorepo-layout.md`. Note that **E2E tests live in a separate, private `test` repo** — never inside the platform repos — so E2E recommendations target that repo and existing E2E coverage may be unverifiable if it isn't checked out. + +2. **Assess current coverage.** For each affected area, determine what is already tested and where. From a PR diff, note tests included in the change. From a CSV, bucket existing cases by apparent layer and automation status. From a repo checkout, grep the established test conventions. Distinguish _observed_ coverage from _assumed_ coverage. + +3. **Assign the cheapest sufficient layer.** For each behavior, pick the lowest trophy layer that genuinely buys the needed confidence, with a one-line rationale. Prefer integration over E2E and unit over integration unless the behavior truly requires the higher layer (real browser/device, cross-service contract, full user journey). Name concrete tooling per platform (see `references/monorepo-layout.md`). + +4. **Find the gaps and the imbalance.** Call out behaviors with no recommended coverage, and any existing shape that is trophy-wrong (e.g. E2E doing work integration should do, or untested core logic). Be explicit about what evidence each gap rests on. + +5. **Write the HTML report.** Build a single self-contained HTML file (inline CSS, no external/CDN dependencies, no JS required) following `references/html-report-template.md`. Write it to the **current working directory** as `test-stack-report-<slug>-<date>.html`, where `<slug>` is a short kebab-case identifier for the change (ticket key, PR number, or feature name) and `<date>` is the caller-provided date. 
Report sections, in order: Summary & recommended shape; Evidence & sources (with what was missing); Per-platform recommendations (behavior → layer → tooling → rationale); Coverage gaps; and a placeholder **Adversarial Review** section the counterpart skill fills in. + +6. **Hand off for adversarial review.** Your recommendation is not final until `challenging-test-stack-recommendations` has red-teamed it. When invoked under the orchestrator this happens automatically; when invoked standalone, tell the user the adversarial pass is available and recommended. + +## Principles + +- **Ground every recommendation.** Each behavior→layer call ties to a specific requirement, diff hunk, CSV row, or observed test. Mark anything inferred without evidence as an assumption. +- **Cheapest sufficient layer wins.** Confidence pushed down the trophy is cheaper to write, faster to run, and less flaky. +- **Per-platform, not one-size.** A feature spanning server, web, and mobile gets a distinct shape per platform — their stacks and risks differ. +- **Honesty about coverage.** Never present assumed coverage as verified. "I could not inspect the `test` repo" is a finding, not a failure. diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md new file mode 100644 index 0000000..f2eb2ce --- /dev/null +++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/html-report-template.md @@ -0,0 +1,63 @@ +# HTML report template + +Produce a **single self-contained HTML file**: all CSS inline in a ` + + +
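<header>…title, change, date…</header>
+<section id="summary">…recommended shape per platform…</section>
+<section id="evidence">…sources used + what was missing…</section>
+<section id="recommendations">…per-platform behavior→layer tables…</section>
+<section id="gaps">…coverage gaps &amp; imbalances…</section>
+<section id="adversarial-review">
+  …filled in by the adversarial pass…
+</section>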
+ + +``` diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/input-sources.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/input-sources.md new file mode 100644 index 0000000..b6b2d46 --- /dev/null +++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/input-sources.md @@ -0,0 +1,95 @@ +# Ingesting evidence sources + +Inputs are additive — handle any combination, and record in the report which sources were +present and which were missing. Never block on a missing source. + +## Jira ticket + +Preferred: if the `bitwarden-atlassian-tools` plugin is installed, invoke +`Skill(bitwarden-atlassian-tools:researching-jira-issues)` for a deep, link-following read. + +Otherwise use the MCP tools directly: + +- `mcp__bitwarden-atlassian__get_issue` — the issue itself (summary, description, + acceptance criteria, custom fields). +- `mcp__bitwarden-atlassian__get_issue_comments` — clarifications and edge cases raised in + discussion. +- `mcp__bitwarden-atlassian__get_issue_remote_links` — linked Confluence pages and PRs. +- `mcp__bitwarden-atlassian__get_confluence_page` — linked requirements/design docs. + +Extract: discrete **testable behaviors**, **acceptance criteria**, and the **platforms/ +components** named. If the MCP is unavailable, ask the user to paste the requirements. + +## GitHub PR + +- `gh pr view <number>` — title, body, linked issues, files changed, checks. +- `gh pr diff <number>` — the actual change surface. + +Extract: the public API / behavior touched, the diff paths (→ which repos/platforms), and +**any tests already included in the PR** (so you assess incremental, not absolute, gaps). + +## Technical breakdown document + +A Bitwarden **Tech Breakdown** — the Confluence artifact a team produces before implementation, +authored with the `bitwarden-delivery-tools:writing-tech-breakdowns` skill. 
It is the richest +single input for this analysis, because a good breakdown has already done the cross-platform +scoping you would otherwise reconstruct from a diff or a ticket. Mine it; don't re-derive it. + +Locate and fetch it: + +- If given a page ID or URL, fetch directly with `mcp__bitwarden-atlassian__get_confluence_page`. +- If given only a feature/team name, find the page first with `mcp__bitwarden-atlassian__search_confluence` + or `mcp__bitwarden-atlassian__search_confluence_cql` (breakdowns live in a team's "Tech Breakdown" + folder), then fetch it. +- The breakdown's **status** matters: `IN PLANNING` / `IN PROGRESS` means the scope may still + shift — note that the recommendation rests on a draft. `PROPOSED` / `ACCEPTED` is a stable + basis. Record the status as part of the evidence. + +Map its structure to testable evidence (the canonical template is page `2920349776`): + +- **Part 1 — Problem overview**: the feature framing and linked Jira epic. Use it for scope and + to cross-link any Jira/PR inputs, not as a behavior source on its own. +- **Part 2 — Breakdown scope checklist**: the core of the mining. Each answered item names a + surface the change touches and therefore a place tests are needed — **Database changes** + (migration/backwards-compat behaviors, EDD phasing), **API changes** (endpoint contracts, + V±2 compatibility, any unauthenticated endpoint), **UI components** (shared/base components), + **SDK changes**, **Services touched**, **Hosting** (Self-Hosted vs Cloud paths), + **Feature flagging** (flag-on/flag-off states to cover), and **Security considerations** + (crypto, threat-model-relevant behaviors). The **Testing considerations** item is the team's + own stated test intent — treat it as a claim to assess against the trophy, not as ground truth + to copy. +- **Part 4 — Specification artifacts**: linked child pages defining concrete interfaces (API + contracts, schemas, component APIs, crypto schemes). 
Fetch the relevant ones with + `get_confluence_page`; their public interfaces and edge cases are exactly what integration and + unit tests pin down. +- **Part 5 — Open questions**: unresolved questions are untestable-requirement risk — a behavior + can't be reliably tested until its question is answered. Surface them in the report's gaps. + +Extract: discrete **testable behaviors** per platform, the **surfaces** each touches (→ repos via +`monorepo-layout.md`), and the team's **stated testing intent** (to evaluate, not echo). Where the +breakdown's scope checklist disagrees with a diff or ticket you were also given, treat the +divergence as a finding rather than silently picking one. + +## Test-case CSV export + +A CSV export of existing or planned test cases. Column headers vary by tool and export +settings — **do not hardcode them**. Read the header row, then map by meaning: + +- A **title / case** column — the scenario name. +- A **type** column (e.g. "Regression", "Smoke", "Functional") — hints at intended layer. +- An **automation status** column (e.g. "Ready to Automate", "Automated", "Manual") — + what already exists vs. what's planned. +- A **steps / expected-result** column, often in Given–When–Then form — the behavior. +- Optional **team / area / tags / preconditions** columns — scope and grouping. + +Map rows to behaviors and bucket each by apparent layer using `testing-trophy.md`: + +- A case that drives the full UI through a complete journey → likely **E2E** (target the + dedicated `test` repo). +- A case asserting one service/component's behavior through its collaborators → + **integration**. +- A case pinning a single function's logic or an edge case → **unit**. + +Flag cases that are currently manual but cheaply automatable at a lower layer, and cases +slated for E2E that would be better as integration. If a column's meaning is ambiguous, +state the interpretation you used rather than guessing silently. 
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md new file mode 100644 index 0000000..dae06b1 --- /dev/null +++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md @@ -0,0 +1,40 @@ +# Bitwarden repo layout, stacks, and the layer → repo map + +Bitwarden's code spans several repositories. A single feature often touches more than +one, and each gets its own Testing Trophy. Treat the table below as a **starting map**, +not gospel — when a repo is checked out, read its `CLAUDE.md` and grep its existing tests +to confirm the actual conventions before recommending tooling. + +## Platform repos and their stacks + +| Repo (typical) | Platform | Language / framework | Static | Unit / Integration tooling | +| ------------------- | ------------------------------ | ----------------------------------- | --------------------------------------- | ------------------------------------------------------------------------------------------ | +| `bitwarden/server` | Backend / API | C# / .NET, ASP.NET Core, EF Core | `dotnet build` analyzers, nullable refs | xUnit; integration via `WebApplicationFactory` + test DB / in-memory providers | +| `bitwarden/clients` | Web, Browser ext, Desktop, CLI | TypeScript, Angular, Electron, RxJS | `tsc`, ESLint | Jest + Angular TestBed / Testing Library (unit + integration); mocked HTTP at the boundary | +| `bitwarden/ios` | iOS | Swift / SwiftUI | SwiftLint, compiler | XCTest (unit + integration); XCUITest for on-device UI | +| `bitwarden/android` | Android | Kotlin | ktlint/detekt, compiler | JUnit + Robolectric / Espresso (instrumented) | + +Exact repo names and tool versions drift — verify against the checkout. If a platform +isn't in this table, infer its stack from the repo and state the assumption in the report. 
+ +## Where each layer lives — important + +- **Static, unit, integration** tests live **alongside the code, inside each platform + repo** (e.g. `server`'s xUnit projects, `clients`' `*.spec.ts` files, the iOS test + targets). +- **End-to-end (E2E) tests live in a dedicated, private `test` repository** — _not_ + inside the platform repos. Consequences for analysis: + - An E2E recommendation always targets that separate `test` repo. + - A coverage scout will **not** find existing E2E tests by searching `server`/`clients`/ + `ios`. It must look in the `test` repo, which the user may not have checked out. + - If the `test` repo is unavailable, treat existing E2E coverage as **unverified** and + say so explicitly in the report — do not assume it is absent or present. + +## Mapping a behavior to a platform + layer + +1. Identify which repo(s) the behavior lives in from the change surface (diff paths, + ticket components, CSV team/area). +2. Within each repo, choose the layer per `testing-trophy.md` and name the concrete tool + from the table above (confirmed against the checkout where possible). +3. For any cross-system journey worth E2E coverage, target the dedicated `test` repo and + flag whether comparable E2E coverage already exists there. diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md new file mode 100644 index 0000000..13da8bd --- /dev/null +++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md @@ -0,0 +1,67 @@ +# The Testing Trophy + +A model for shaping automated test coverage, contrasted with the older Testing Pyramid. The trophy weights **integration** tests most heavily, +because they buy the most confidence per unit of cost and maintenance for typical +application code. + +## The four layers (base → top) + +1. **Static** — the base. 
Type checking, linters, formatters, compiler errors, static + analysis. Catches whole classes of bugs (typos, null misuse, unused code, unsafe + patterns) before a single test runs. Nearly free; always on. + - _Examples:_ TypeScript/`tsc`, ESLint, Roslyn analyzers / `dotnet build` warnings as + errors, SwiftLint, nullable reference types. + +2. **Unit** — focused. Tests a single function/class/module in isolation. Best for pure + logic, algorithms, edge cases, and error handling where setup is cheap and the unit + has real branching complexity. Fast and stable, but isolation can let integration + bugs slip through. + +3. **Integration** — **the heaviest layer; the trophy's bulge.** Tests several units + working together through real (or realistic) collaborators: a controller + service + + in-memory or test database, a component rendered with its real child components and a + mocked network boundary, a view model against a real repository. This is where most + confidence is bought because it exercises the wiring users actually depend on, without + the cost and flakiness of full E2E. + +4. **E2E (end-to-end)** — thin top. Drives the real, fully assembled system the way a + user would: real browser, real device, real backend. Highest confidence per test, but + slowest, most expensive, and most flaky. Reserve for a small number of **critical user + journeys** (e.g. login, vault unlock, checkout) — not for branch coverage. + +## The shape + +``` + ┌───────────┐ + │ E2E │ thin top — critical journeys only + ┌──┴───────────┴──┐ + │ Integration │ HEAVY — most confidence bought here + └──┐ ┌──┘ + │ Unit │ focused — pure logic & edge cases + ┌──┴───────────┴──┐ + │ Static │ broad, ~free base — always on + └─────────────────┘ +``` + +## How to assign a layer + +Pick the **cheapest layer that still buys the confidence the behavior requires**: + +- Pure transformation, calculation, parsing, validation logic with real branching → **unit**. 
+- Behavior that emerges from collaborators working together (HTTP handler + service + + persistence; component + store + API boundary; view model + repository) → **integration**. +- A behavior only meaningful as a full user journey across the real system → **E2E**, and + only if it is genuinely critical. +- Anything a type system, analyzer, or linter can guarantee → **static**; don't write a + test for it. + +## Anti-patterns to avoid (the adversary checks for these) + +- **Ice-cream cone** — the trophy inverted: many E2E tests, few integration/unit. Slow, + flaky, expensive to maintain. +- **Over-unit-testing** — exhaustive unit tests with heavy mocking that re-assert the + mocks rather than real behavior; integration would buy more. +- **Testing trivial code** — tests for getters/setters, framework glue, or + type-guaranteed invariants. Cost without confidence. +- **E2E for branch coverage** — using slow full-system tests to cover edge cases that + belong at the unit or integration layer. diff --git a/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md b/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md new file mode 100644 index 0000000..e86e11d --- /dev/null +++ b/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md @@ -0,0 +1,70 @@ +--- +name: challenging-test-stack-recommendations +description: Use to red-team a test automation recommendation produced by analyzing-test-stack — adversarially reviewing a Testing Trophy recommendation or HTML test-stack report for anti-patterns and ungrounded claims before the team acts on it. Triggers on "challenge this test plan", "red-team the test recommendation", "poke holes in this test strategy", "is this proposed test plan over/under-testing", "review the test stack report", or runs automatically after analyzing-test-stack under the test-engineer orchestrator. 
Checks for ice-cream-cone (too E2E-heavy), unit-tests-masquerading-as-integration, over-testing trivial code, untestable requirements, missing platform layers, flaky-E2E candidates, and coverage claimed without evidence; returns a verdict of endorse, revise, or reject-with-reasons. +allowed-tools: "Read, Grep, Glob, Bash(gh pr view:*), Bash(gh pr diff:*), mcp__bitwarden-atlassian__get_issue, mcp__bitwarden-atlassian__get_issue_comments, mcp__bitwarden-atlassian__get_confluence_page" +--- + +# Challenging Test Stack Recommendations + +You are the adversary to `analyzing-test-stack`. Your job is to **try to break its +recommendation** before the team builds on it. A recommendation that survives a genuine +red-team is trustworthy; one that was never challenged tends to drift toward whatever +tests are easiest to write rather than what actually buys confidence. + +Default to skepticism. Your value is in the specific, evidence-backed objection — not in +rubber-stamping. But do not invent problems: an objection you cannot tie to evidence is +itself a rejected finding (you hold yourself to the same evidence bar you demand). + +## Inputs + +- The **HTML report** (or the recommendation text) from `analyzing-test-stack`. +- The **underlying evidence** — the same Jira ticket, PR diff, CSV, and/or repo checkout. + Re-derive independently where you can; re-read the PR diff or ticket rather than trusting + the report's summary of it. + +## Workflow + +1. **Re-read the evidence independently.** Don't take the report's characterization of the + change at face value — pull the diff / ticket / CSV yourself and form your own view of + the testable behaviors and where they live. Ingest each source the same way the analyst + does (see `analyzing-test-stack/references/input-sources.md` for the CSV column mapping + and Atlassian MCP tools). 
In particular, **E2E tests live in a separate, private `test` + repo** — not inside the platform repos — so treat any existing-E2E-coverage claim as + unverified unless that repo was actually inspected. + +2. **Run the rejection criteria.** Apply every check in `references/adversarial-checklist.md` + to each per-platform recommendation and to the overall shape. For each, decide: does the + recommendation pass, or is there a concrete, evidence-backed objection? + +3. **Test the grounding.** For every behavior→layer call, confirm it ties to real evidence. + Flag any layer assignment, coverage claim, or "already tested" assertion that the + evidence does not support — especially **E2E coverage claimed without inspecting the + dedicated `test` repo**. + +4. **Pressure the shape.** Step back from individual rows: is the overall trophy right? Too + E2E-heavy (ice-cream cone)? Core logic pushed to slow layers? A whole platform's layer + missing? Trivial code over-tested? + +5. **Issue findings and a verdict.** Each finding: the specific claim challenged, why it's + wrong or unsupported (with evidence), and the corrective recommendation. Then a single + verdict: + - **Endorse** — sound and well-grounded; minor or no notes. + - **Revise** — directionally right but has specific fixable issues (list them). + - **Reject-with-reasons** — the shape or grounding is wrong enough that the team should + not act on it as written; state what a correct recommendation would require. + +6. **Write the critique into the report.** Populate the report's `#adversarial-review` + section with your findings and verdict (preserve the self-contained, no-external-deps + HTML constraint). When run standalone without the orchestrator, return the critique as + a clearly structured summary instead. + +## Principles + +- **Adversarial, not contrarian.** Push hard, but every objection carries evidence. Drop + any finding you can't support — apply the analyst's own evidence standard to yourself. 
+- **Re-derive, don't trust.** The report's summary of the diff/ticket is a claim to verify, + not a fact to accept. +- **Name the anti-pattern.** When you flag a shape problem, use the precise term + (ice-cream-cone, over-unit-testing, E2E-for-branch-coverage) so the fix is unambiguous. +- **Unverifiable is a finding.** "The report claims E2E coverage exists but the `test` repo + was never inspected" is a legitimate, important objection — surface it. diff --git a/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md b/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md new file mode 100644 index 0000000..7fbd307 --- /dev/null +++ b/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md @@ -0,0 +1,61 @@ +# Adversarial checklist — rejection criteria + +Run every check against each per-platform recommendation and against the overall shape. +A check "fails" only when you can state a concrete, evidence-backed objection. Record the +evidence; an objection you can't ground is itself rejected. + +## Shape-level checks + +1. **Ice-cream cone (too E2E-heavy).** Is confidence concentrated in slow, flaky E2E tests + that integration or unit tests could buy more cheaply? Any behavior recommended for E2E + that is not a genuinely critical, full-system user journey is suspect — demand the + justification and propose the lower layer. + +2. **Missing platform layer.** Does an affected platform have a gap in its trophy — e.g. + server logic with no integration layer, a client with only E2E and no component/unit + coverage, core logic with nothing at all? A whole missing layer is a major finding. + +3. **Inverted cost/confidence.** Is core branching logic pushed up to integration/E2E + while trivial glue sits at lower layers? Confidence should sit at the cheapest + sufficient layer. 
+ +## Row-level checks (per behavior → layer assignment) + +4. **Unit masquerading as integration (and vice-versa).** Is something labeled + "integration" actually a unit test with everything mocked (re-asserting mocks, not real + collaboration)? Or a true cross-collaborator behavior mislabeled "unit"? Mislabeling + distorts the shape and the confidence claim. + +5. **Over-testing trivial code.** Tests recommended for getters/setters, framework glue, + generated code, or invariants the type system/analyzer already guarantees. Cost without + confidence — recommend dropping or moving to static. + +6. **E2E for branch coverage.** Edge cases or error paths assigned to slow full-system + tests when they belong at unit/integration. E2E is for journeys, not branches. + +7. **Flaky-E2E candidate.** Does a recommended E2E test depend on timing, external + services, animation, network, or shared mutable state likely to make it flaky? Flag the + flakiness risk and whether an integration test with a controlled boundary would be more + reliable. + +## Grounding checks + +8. **Coverage claimed without evidence.** Any "already tested" / "existing coverage" + assertion not backed by an observed test, diff hunk, or CSV row. Especially: **E2E + coverage asserted without inspecting the dedicated private `test` repo** — that repo is + not inside the platform repos, so unexamined E2E claims are unverified by definition. + +9. **Untestable / ambiguous requirement.** A behavior recommended for testing whose + acceptance criteria are too vague to write a deterministic assertion against. The fix is + to flag the requirement gap upstream, not to write a test against a guess. + +10. **Assumption presented as fact.** Inferred platform, stack, tooling, or scope stated + without an "assumption" marker. Demand it be labeled so the reader can weigh it. + +## Verdict mapping + +- **Endorse** — no failing checks, or only cosmetic notes. 
+- **Revise** — one or more fixable row-level findings, shape essentially sound. +- **Reject-with-reasons** — a shape-level failure (ice-cream cone, missing layer, inverted + cost/confidence) or pervasive ungrounded coverage claims. State what a correct + recommendation would require. From 471fd43e835754042f72913710a2b75a477ce1e0 Mon Sep 17 00:00:00 2001 From: Ned Thompson Date: Wed, 17 Jun 2026 12:36:29 -0400 Subject: [PATCH 2/9] split stack analysis from coverage report --- .claude-plugin/marketplace.json | 2 +- .cspell.json | 5 + README.md | 2 +- .../.claude-plugin/plugin.json | 4 +- plugins/bitwarden-test-engineer/CHANGELOG.md | 56 +- plugins/bitwarden-test-engineer/README.md | 50 +- .../bitwarden-test-engineer/agents/AGENT.md | 136 +++++ .../test-engineer-orchestrator/AGENT.md | 137 ----- .../references/input-sources.md | 73 ++- .../references/report-style-tokens.md | 496 ++++++++++++++++++ .../skills/analyzing-test-stack/SKILL.md | 33 +- .../references/html-report-template.md | 173 ++++-- .../references/monorepo-layout.md | 45 +- .../references/severity-risk.md | 81 +++ .../references/testing-trophy.md | 26 +- .../skills/assessing-test-coverage/SKILL.md | 51 ++ .../references/coverage-report-template.md | 155 ++++++ .../references/finding-coverage.md | 119 +++++ .../SKILL.md | 70 --- .../references/adversarial-checklist.md | 61 --- 20 files changed, 1374 insertions(+), 401 deletions(-) create mode 100644 plugins/bitwarden-test-engineer/agents/AGENT.md delete mode 100644 plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md rename plugins/bitwarden-test-engineer/{skills/analyzing-test-stack => }/references/input-sources.md (51%) create mode 100644 plugins/bitwarden-test-engineer/references/report-style-tokens.md create mode 100644 plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md create mode 100644 plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md create mode 100644 
plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md create mode 100644 plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md delete mode 100644 plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md delete mode 100644 plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index c02023f..a457038 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -97,7 +97,7 @@ "name": "bitwarden-test-engineer", "source": "./plugins/bitwarden-test-engineer", "version": "1.0.0", - "description": "Test engineering toolkit for Bitwarden. An orchestrator dispatches specialized testing skills — strategy and planning, automation, exploratory testing, and quality assessment." + "description": "Test engineering toolkit for Bitwarden. A generalist test-engineer agent dispatches specialized testing skills — strategy and planning, automation, exploratory testing, and quality assessment." } ] } diff --git a/.cspell.json b/.cspell.json index 14fee97..b8189cd 100644 --- a/.cspell.json +++ b/.cspell.json @@ -26,6 +26,7 @@ "codeBlock", "CODEOWNERS", "Confluence", + "Consolas", "CQL", "customfield", "cvss", @@ -73,6 +74,7 @@ "lockfiles", "maxResults", "mcp", + "Menlo", "metacharacters", "modelcontextprotocol", "msword", @@ -103,6 +105,7 @@ "SDLC", "sast", "sbom", + "Segoe", "semver", "shellcheck", "shortlog", @@ -124,12 +127,14 @@ "tarpit", "thumbsup", "tinyui", + "tnum", "touchpoint", "touchpoints", "triaging", "unassigning", "unassigns", "ungroup", + "unlinkable", "unresponded", "unsanitized", "userflow", diff --git a/README.md b/README.md index c693611..61b5442 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ A curated collection of plugins for AI-assisted development at Bitwarden. 
Enable | [bitwarden-product-analyst](plugins/bitwarden-product-analyst/) | 0.1.5 | Product analyst agent for creating comprehensive Bitwarden requirements documents from multiple sources | | [bitwarden-security-engineer](plugins/bitwarden-security-engineer/) | 1.2.0 | Application security engineering: vulnerability triage, threat modeling, and secure code analysis | | [bitwarden-software-engineer](plugins/bitwarden-software-engineer/) | 1.0.0 | Software engineer agent for a Bitwarden product team. Implements stories, tasks, and bugs with code quality, performance, security, and team comms in mind. | -| [bitwarden-test-engineer](plugins/bitwarden-test-engineer/) | 1.0.0 | Test engineering toolkit: an orchestrator dispatches testing skills strategy and planning, automation, exploratory testing, and quality assessment. | +| [bitwarden-test-engineer](plugins/bitwarden-test-engineer/) | 1.0.0 | Test engineering toolkit: an orchestrator dispatches testing skills strategy and planning, automation, exploratory testing, and quality assessment. | | [claude-config-validator](plugins/claude-config-validator/) | 1.1.1 | Validates Claude Code configuration files for security, structure, and quality | | [claude-retrospective](plugins/claude-retrospective/) | 1.1.1 | Analyze Claude Code sessions to identify successful patterns and improvement opportunities | diff --git a/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json b/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json index 59fc07c..2d60354 100644 --- a/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json +++ b/plugins/bitwarden-test-engineer/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "bitwarden-test-engineer", "version": "1.0.0", - "description": "Test engineering toolkit for Bitwarden. An orchestrator dispatches specialized testing skills — strategy and planning, automation, exploratory testing, and quality assessment.", + "description": "Test engineering toolkit for Bitwarden. 
A generalist test-engineer agent dispatches specialized testing skills — strategy and planning, automation, exploratory testing, and quality assessment.", "author": { "name": "Bitwarden", "url": "https://github.com/bitwarden" @@ -19,5 +19,5 @@ "qa", "orchestrator" ], - "agents": "./agents/test-engineer-orchestrator/AGENT.md" + "agents": "./agents/AGENT.md" } diff --git a/plugins/bitwarden-test-engineer/CHANGELOG.md b/plugins/bitwarden-test-engineer/CHANGELOG.md index 76d0dbe..12cf16b 100644 --- a/plugins/bitwarden-test-engineer/CHANGELOG.md +++ b/plugins/bitwarden-test-engineer/CHANGELOG.md @@ -9,13 +9,34 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Initial release of the `bitwarden-test-engineer` plugin. -- `test-engineer-orchestrator` agent: classifies the inputs for a change (Jira ticket, +- `bitwarden-test-engineer` agent: classifies the inputs for a change (Jira ticket, GitHub PR, technical breakdown document, exported test-case CSV, plain-language description), fans out subagents to gather evidence — including a dedicated **breakdown reader** subagent (`sonnet`) that mines a tech breakdown for testable behaviors and its - status — runs the analyst skill, then automatically runs the adversarial counterpart - before presenting a consolidated result. -- `analyzing-test-stack` skill: maps a change's testable behaviors to the cheapest + status — then runs the analyst skill and presents its recommendation. When + inspecting a checked-out repo, subagents read its Claude config (root `CLAUDE.md`, + `.claude/`, nested `CLAUDE.md`) for test conventions before opening test files, and + establish existing coverage PR-first (tests in linked/merged PRs) with a targeted lookup + for pre-existing tests — never a repo-wide grep. 
The agent runs a dedicated **assess + existing coverage** step (per-repo coverage scouts applying `assessing-test-coverage`) + after evidence gathering and before invoking `analyzing-test-stack`, passing the merged + coverage inventory into the recommendation. +- `assessing-test-coverage` skill: a backward-looking inventory of what a change is + **already tested** by. Scoped to the change surface (PR-first, then a targeted lookup — + never a repo-wide sweep), it discovers each repo's test conventions config-first, buckets + every observed test by layer, cites it as a stable GitHub permalink (commit SHA, not + branch), records untested behaviors as `unverified` gaps, and writes its own self-contained + HTML **coverage report** (`test-coverage-report-<slug>-<date>.html`) following + `references/coverage-report-template.md`. Usable standalone to audit current coverage, and + consumed by `analyzing-test-stack`. Owns convention discovery, existing-test finding, and + the GitHub permalink citation rules (in `references/finding-coverage.md`) — concerns kept + separate from the trophy recommendation. +- Plugin-level shared `references/`: `input-sources.md` (evidence-source ingestion, used by + both skills and the agent) and `report-style-tokens.md` (the single off-brand data-report + styling system both the coverage report and the test-stack report inline verbatim, so the + two read as one instrument). +- `analyzing-test-stack` skill: consumes the coverage inventory from `assessing-test-coverage`, + then maps a change's testable behaviors to the cheapest + sufficient Testing Trophy layer (static, unit, integration, E2E) per platform and emits a self-contained HTML report to the current working directory. 
Accepts a **technical breakdown document** (a Bitwarden Tech Breakdown Confluence page, the artifact produced by @@ -23,16 +44,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 source alongside Jira, PR, CSV, and plain-language inputs — mining its Part 2 scope checklist for the surfaces and platforms touched, its Part 4 specification child pages for the interfaces to test against, and its Part 5 open questions for untestable-requirement - risk. Includes references for the Testing Trophy model, the repo/stack layer→repo map, - evidence-source ingestion, and the HTML report template. The Atlassian - `search_confluence` / `search_confluence_cql` tools back locating a breakdown by - feature/team name when only a name (not a page ID) is given. -- `challenging-test-stack-recommendations` skill: the adversarial counterpart that - red-teams the analyst's recommendation against known anti-patterns (ice-cream-cone, - unit-masquerading-as-integration, over-testing, untestable requirements, missing platform - layers, flaky-E2E candidates, ungrounded coverage claims) and returns a verdict of - endorse, revise, or reject-with-reasons. -- Per-layer model governance to optimize token spend: the orchestrator runs on Opus - (its context drives the synthesis and adversarial reasoning), while its fan-out evidence - subagents are assigned explicitly — `sonnet` for sources that read a diff, ticket, or repo, - `haiku` for pure CSV parsing — rather than inheriting Opus. + risk. The report surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, + over-testing, missing platform layers), recording ungrounded findings as `unverified` + gaps. Includes references for the Testing Trophy model, the repo/stack + layer→repo map, evidence-source ingestion, and the HTML report + template. The Atlassian `search_confluence` / `search_confluence_cql` tools back locating a + breakdown by feature/team name when only a name (not a page ID) is given. 
+- Top-of-report `#overview` synthesis section, written by the analyst: a 2–4 sentence recap + of the recommended shape per platform, the top 3 open risks (drawn from + `#gaps`), and anchor links into the detail sections, so readers see the bottom line without + scrolling. The overview is additive — per-behavior detail stays in `#recommendations`/`#gaps`. +- Per-layer model governance to optimize token spend: the agent runs on Opus + (its context drives the analysis and the recommendation), while the fan-out + evidence subagents are assigned explicitly — `sonnet` for sources that read a diff, ticket, + or repo, `haiku` for pure CSV parsing — rather than inheriting Opus. diff --git a/plugins/bitwarden-test-engineer/README.md b/plugins/bitwarden-test-engineer/README.md index 0895580..f18e06e 100644 --- a/plugins/bitwarden-test-engineer/README.md +++ b/plugins/bitwarden-test-engineer/README.md @@ -2,31 +2,28 @@ ## Overview -A test engineering toolkit for Bitwarden. An orchestrator analyzes a request and -dispatches specialized skills across the testing discipline — test strategy and planning, +A test engineering toolkit for Bitwarden. A generalist test-engineer agent analyzes a +request and dispatches specialized skills across the testing discipline — test strategy and planning, automation, exploratory testing, and quality assessment. The plugin is designed to grow: -new testing skills are added over time, and **every analytic skill ships with an -adversarial counterpart** that red-teams its output before it reaches you. An unchallenged -test plan tends to drift toward whatever is easiest to do rather than what actually buys -confidence; the adversary exists to catch that. +new testing skills are added over time. 
### First capability: test-stack analysis -Given a change — a feature, bugfix, refactor, or migration — the orchestrator recommends -**what to test, at which layer, and why**, shaped as a **Testing Trophy**: a thin -static-analysis base, a focused unit layer, a heavy integration layer where most confidence -is bought, and a thin E2E layer reserved for critical user journeys. +Given a change — a feature, bugfix, refactor, or migration — the agent recommends +**what to test, at which layer, and why**, shaped as a **Testing Trophy**: a focused +unit layer, a heavy integration layer where most confidence is bought, and a thin E2E +layer reserved for critical user journeys. It ingests whatever evidence is available — a Jira ticket (via the Atlassian MCP), a GitHub PR (via `gh`), an exported test-case CSV, and/or a plain-language description — fans out -subagents to gather it, runs the analyst skill (`analyzing-test-stack`) to produce a -self-contained HTML report, then automatically runs its adversarial counterpart -(`challenging-test-stack-recommendations`) to red-team the recommendation and consolidate a -single report. +subagents to gather it, assesses what is **already tested** (the `assessing-test-coverage` +skill, which inventories existing tests, cites each as a GitHub permalink, and writes a +coverage report), then runs the analyst skill (`analyzing-test-stack`), which produces the +test-stack recommendation. Both skills emit a self-contained HTML report. ## Where each layer lives -Static, unit, and integration tests live alongside the code inside each platform repo +Unit and integration tests live alongside the code inside each platform repo (e.g. `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`). **End-to-end tests live in a dedicated, private `test` repository** — not inside the platform repos — so E2E recommendations target that separate repo, and existing E2E coverage is treated as @@ -34,16 +31,16 @@ unverified when that repo isn't checked out. 
## Agent -| Agent | What It Does | -| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `test-engineer-orchestrator` | Classifies the inputs for a change (Jira, PR, CSV, description), fans out subagents to gather evidence, runs `analyzing-test-stack`, then automatically runs `challenging-test-stack-recommendations` and consolidates a single report. | +| Agent | What It Does | +| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `bitwarden-test-engineer` | Classifies the inputs for a change (Jira, PR, CSV, description), fans out subagents to gather evidence, assesses existing coverage (`assessing-test-coverage`), then runs `analyzing-test-stack` — emitting a self-contained coverage report and a self-contained test-stack report. | ## Skills -| Skill | What It Does | -| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `analyzing-test-stack` | The recommender. Maps each testable behavior in a change to the cheapest sufficient Testing Trophy layer per platform, names concrete tooling, surfaces coverage gaps, and writes a self-contained HTML report to the current working directory. | -| `challenging-test-stack-recommendations` | The adversarial counterpart. 
Re-derives the evidence independently and red-teams the recommendation against known anti-patterns (ice-cream-cone, unit-masquerading-as-integration, over-testing, untestable requirements, missing platform layers, flaky-E2E, ungrounded coverage), then returns a verdict: endorse, revise, or reject-with-reasons. | +| Skill | What It Does | +| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `assessing-test-coverage` | The backward-looking inventory. Determines what is **already tested** for a change — scoped to the change surface, PR-first then a targeted lookup — buckets each observed test by layer, cites it as a stable GitHub permalink, flags untested behaviors as gaps, and writes a self-contained HTML coverage report. Feeds `analyzing-test-stack`; usable standalone to audit current coverage. | +| `analyzing-test-stack` | The recommender. Consumes the coverage inventory, then maps each testable behavior in a change to the cheapest sufficient Testing Trophy layer per platform, names concrete tooling, surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, over-testing, missing platform layers), and writes a self-contained HTML report to the current working directory. 
| ## Cross-Plugin Integration @@ -65,7 +62,7 @@ For Jira-backed analysis, install the Atlassian tools alongside it: ## Usage -The orchestrator activates when you ask what test coverage a change needs, which +The agent activates when you ask what test coverage a change needs, which automation layers to add, how to shape a test plan, or whether existing tests are at the right level: @@ -82,8 +79,11 @@ Here's our exported test cases CSV for the billing migration — which of these automated and at what layer? ``` -Each run produces a self-contained `test-stack-report--.html` in the current -working directory, containing the per-platform recommendation and the adversarial review. +Each run produces two self-contained HTML files in the current working directory: a +`test-coverage-report--.html` (what is already tested — observed tests per layer, +each cited as a GitHub permalink, plus gaps) and a `test-stack-report--.html` (the +per-platform recommendation and its coverage-gap findings). Both share one off-brand +data-report visual system so they read as the same instrument. ## References diff --git a/plugins/bitwarden-test-engineer/agents/AGENT.md b/plugins/bitwarden-test-engineer/agents/AGENT.md new file mode 100644 index 0000000..fdc1dd6 --- /dev/null +++ b/plugins/bitwarden-test-engineer/agents/AGENT.md @@ -0,0 +1,136 @@ +--- +name: bitwarden-test-engineer +version: 1.0.0 +description: | + Test automation strategist for Bitwarden. Takes a feature, bugfix, or arbitrary change — described in plain language, in a Jira ticket, in a GitHub PR, in a technical breakdown document (a Confluence tech breakdown), and/or in an exported test-case CSV — and produces an evidence-driven recommendation for the right test automation layers (unit, integration, E2E) shaped as a Testing Trophy and risk-weighted by each behavior's defect severity (impact, not urgency), across Bitwarden's server, client, and mobile codebases. 
Gathers the evidence by fanning out subagents, assesses what is already tested (the `assessing-test-coverage` skill), then runs the analyst skill (`analyzing-test-stack`), which emits a self-contained HTML report. Use when the user asks what test coverage a change needs, which automation layers to add, how to shape a test plan, whether existing tests are over- or under-weighted, how to prioritize test coverage by risk, what tests a Critical/High bug needs, or asks for a "test stack" / "test strategy" / "test trophy" / "risk-based coverage" analysis for a ticket, PR, tech breakdown, or set of test cases. + + + Context: An engineer is about to start a Jira story and wants to know what test automation it should ship with. + user: "I'm picking up PM-12345 next sprint. What test coverage should this feature have?" + assistant: "I'll use the bitwarden-test-engineer agent to pull the requirements from PM-12345, map the change across the affected codebases, and produce a Testing Trophy recommendation." + + Jira-key intake. The agent gathers the ticket via the Atlassian MCP, then runs Skill(analyzing-test-stack) to produce the report. + + + + + Context: A reviewer wants to know whether an open PR is adequately tested at the right layers. + user: "Does bitwarden/server#5821 have the right tests, or is it leaning too hard on end-to-end?" + assistant: "I'll use the bitwarden-test-engineer agent to read the PR diff and its tests, assess the trophy shape, and check specifically for an ice-cream-cone (too E2E-heavy) anti-pattern." + + PR intake plus an explicit anti-pattern concern. The agent gathers the diff via gh, then runs the analyst, which assesses the trophy shape including the ice-cream-cone check. + + + + + Context: A QA engineer exported a set of manual test cases and wants an automation plan. + user: "Here's our exported test cases CSV for the billing migration work — which of these should be automated and at what layer?" 
+ assistant: "I'll use the bitwarden-test-engineer agent to parse the CSV, bucket the existing cases by trophy layer, find the gaps, and produce a layer-by-layer automation recommendation." + + CSV intake. The agent parses the export, then runs the analyst to map cases to layers and surface gaps. + + + + + Context: A tech lead just finished a tech breakdown and wants the test plan that should accompany it. + user: "I've got the tech breakdown for the new device-approval flow in Confluence — what test coverage should we plan across the stack?" + assistant: "I'll use the bitwarden-test-engineer agent to read the breakdown, mine its scope checklist and spec child pages for the surfaces and behaviors it touches, and produce a per-platform Testing Trophy recommendation." + + Tech-breakdown intake. The agent fetches the Confluence breakdown via the Atlassian MCP, extracts testable behaviors and the affected platforms from Part 2, then runs the analyst to emit the report. + + +model: opus +tools: + - Read + - Write + - Glob + - Grep + - Skill + - Task + - AskUserQuestion + - Bash(gh pr view:*) + - Bash(gh pr diff:*) + - Bash(gh pr checks:*) + - Bash(git diff:*) + - Bash(git log:*) + - Bash(git rev-parse:*) + - Bash(git remote get-url:*) + - Bash(git -C * rev-parse:*) + - Bash(git -C * remote get-url:*) + - mcp__bitwarden-atlassian__get_issue + - mcp__bitwarden-atlassian__search_issues + - mcp__bitwarden-atlassian__get_issue_comments + - mcp__bitwarden-atlassian__get_issue_remote_links + - mcp__bitwarden-atlassian__get_confluence_page + - mcp__bitwarden-atlassian__search_confluence + - mcp__bitwarden-atlassian__search_confluence_cql +skills: + - assessing-test-coverage + - analyzing-test-stack +color: green +--- + +You are a test automation strategist for Bitwarden. 
Your job is to take a change — a feature, a bugfix, a refactor, or a migration — and tell the team **what to test, at which layer, and why**, shaped as a Testing Trophy: a unit layer for pure logic, a heavy integration layer where most confidence is bought, and a thin E2E layer reserved for critical user journeys. + +You do not write the tests. You produce a recommendation — an HTML report — that an engineer or QA can act on. Ground every layer call in evidence and keep the trophy shape honest, because a test plan tends to drift toward whatever is easiest to write rather than what actually buys confidence. + +## Operating context + +Bitwarden's code is split across several repositories, each with its own platform, stack, and test tooling. Assume the user works in a multi-repo layout such as `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`, and similar. A single feature frequently spans more than one of these (e.g. a server endpoint plus a web client plus a mobile screen), and each platform's trophy is shaped independently. + +**Where each layer lives:** unit and integration live alongside the code in each platform repo; **E2E lives in the dedicated `test` repo** (sibling of the platform repos). See `${CLAUDE_PLUGIN_ROOT}/skills/analyzing-test-stack/references/monorepo-layout.md` for the per-platform stack, tooling, and the layer→repo map. + +The Atlassian capabilities depend on the **`bitwarden-atlassian-tools`** plugin (the `mcp__bitwarden-atlassian__*` server). If it is not installed and the user references a Jira issue or a Confluence tech breakdown, do not fail — tell the user the MCP is unavailable and ask them to paste the requirements or the breakdown contents, or proceed from the PR / CSV / description they provided. + +## Workflow + +### 1. Intake and scope + +Classify every input the user supplied — Jira key, GitHub PR, Confluence tech breakdown (page ID/URL or feature/team name to search), CSV path, plain-language description. 
Inputs are additive; handle any combination. Per-source ingestion (Epic expansion, breakdown mining, CSV column mapping) is specified in `${CLAUDE_PLUGIN_ROOT}/references/input-sources.md` — don't re-derive it here. + +Then determine the **affected repos/platforms**. If scope is genuinely ambiguous and it changes the recommendation, use `AskUserQuestion` — otherwise infer and state your assumption. + +### 2. Fan out to gather evidence + +Spawn `Task` subagents **in parallel**, one per evidence source or affected repo, so your own context stays lean. Each subagent returns a compact structured digest (not raw dumps). Typical fan-out: + +- **Requirements reader** (model: `sonnet`) — resolves the Jira issue into testable behaviors and acceptance criteria, expanding Epics/Features to their children and feeding any linked PR URLs to the PR diff analyzer downstream. Captures the **severity** assigned on a bug/defect ticket so the recommendation can be risk-weighted. Follows the recipe in `${CLAUDE_PLUGIN_ROOT}/references/input-sources.md` → _Epic intake_. +- **Breakdown reader** (model: `sonnet`) — fetches the tech breakdown via `mcp__bitwarden-atlassian__get_confluence_page` (searching first with `search_confluence`/`search_confluence_cql` when given only a name), then mines Part 2's scope checklist for the surfaces touched, the relevant Part 4 spec child pages for interfaces, and Part 5's open questions for untestable-requirement risk. Returns testable behaviors per platform plus the breakdown's status. +- **PR diff analyzer** (model: `sonnet`) — `gh pr diff` / `gh pr view` to extract the change surface, public API touched, and tests already present. +- **CSV parser** (model: `haiku`) — reads the export and buckets existing cases by apparent layer and automation status. + +Give each subagent a single source and a tight output contract. Skip any branch whose input was not supplied. 
+ +**Set each subagent's model explicitly** — `haiku` for the CSV parser, `sonnet` for the rest. Never let a digest-returning subagent inherit Opus. See _Model selection_ below for the rationale. + +### 3. Assess existing coverage + +Once the change surface is known (the diff paths/symbols and named components from step 2), determine what is **already tested** before recommending anything new. Fan out a **per-repo coverage scout** (model: `sonnet`) for each affected platform repo, each applying the `assessing-test-coverage` skill: read the repo's Claude config for conventions, establish coverage **PR-first then via a targeted lookup scoped to the change surface** (never a repo-wide sweep), inspect the sibling `test` repo for E2E, and return a **permalink record per cited test** (`{ path, start_line, end_line, owner_repo, sha, layer, permalink }`, or `{ path, unlinkable_reason }` when an ingredient is missing) plus `unverified` gaps. The output contract, the PR-first/targeted-lookup discipline, and the SHA/`owner-repo` permalink recipe all live in `${CLAUDE_PLUGIN_ROOT}/skills/assessing-test-coverage/references/finding-coverage.md` — the scouts follow it; don't restate it here. Merge the scouts' records into a single coverage inventory. + +This step depends on step 2's change surface, so run it after the evidence fan-out (not interleaved). Scouts capture the SHA via `git -C <repo> rev-parse HEAD` and `owner/repo` via `git -C <repo> remote get-url origin`. Then invoke `Skill(assessing-test-coverage)` with the merged inventory and today's date: it writes a **self-contained HTML coverage report** to the current working directory as `test-coverage-report-<slug>-<date>.html` (the backward-looking inventory — observed tests per layer with permalinks, plus `unverified` gaps) and returns the inventory records for step 4. The scouts do the gathering; the skill assembles the report. Pass today's date — skills cannot read the clock. + +### 4.
Recommend + +Invoke `Skill(analyzing-test-stack)` with the gathered digests **and the coverage inventory from step 3**. It maps each testable behavior to the cheapest sufficient trophy layer per platform, **risk-weighted by each behavior's severity** (the impact a defect would carry — read from a bug's Jira severity field or assessed against Bitwarden's severity guide; see the skill's `references/severity-risk.md`), names concrete tooling, surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, mislabeled layers, ungrounded coverage claims) ordered by severity, and writes a **self-contained HTML report** (inline CSS, no external dependencies) to the current working directory as `test-stack-report-<slug>-<date>.html`. The analyst writes the report's `#overview` itself. Pass today's date to the skill — skills cannot read the clock themselves. + +### 5. Present + +The run produces **two self-contained HTML files** in the current working directory: the `test-coverage-report-*.html` (what is already tested, from step 3) and the `test-stack-report-*.html` (the recommendation, from step 4). Mirror the test-stack report's `#overview` in chat: the recommended shape per platform, the top open risks the user should resolve before committing to the plan, and any coverage the analyst could not verify. Point the user at both files — the coverage report for the existing-test detail, the test-stack report for the per-behavior recommendation. + +## Principles + +- **Evidence over assertion.** Every recommended layer ties back to a specific behavior, requirement, diff hunk, or existing test. Flag anything you could not ground. +- **Cheapest sufficient layer.** Push confidence down the trophy — prefer integration over E2E, unit over integration — unless a behavior genuinely requires the higher layer. +- **Risk-weighted by severity.** Coverage rigor scales with the impact a defect would carry, not with how urgently it ships.
Critical behaviors (core flows, data integrity, security) owe their failure modes full coverage and lead the gap list; Low behaviors earn minimal coverage and never an E2E test. Severity (impact) ≠ priority (urgency). +- **Degrade gracefully.** A missing input (no Jira MCP, no PR, no CSV, no `test` repo checkout) narrows the analysis; it never blocks it. State what you could not see. +- **Read repo config first.** When the analysis touches a checked-out codebase, the coverage scouts read its Claude config (root `CLAUDE.md`, `.claude/`, and nested `CLAUDE.md` for the touched subdirs) before opening test files, and honor its test conventions over generic defaults. Explore test files only as a fallback for conventions the config doesn't cover. See `${CLAUDE_PLUGIN_ROOT}/skills/assessing-test-coverage/references/finding-coverage.md` → _Discovering a repo's test conventions_. +- **Coverage before recommendation.** Assess what already exists (step 3) before mapping new layers (step 4); the recommendation is incremental against observed coverage, not absolute. + +## Model selection + +Model spend is governed here in the plugin, not left to the session default. The split: + +- **You (the test-engineer agent) run on Opus.** Your context is where the genuinely hard work happens: classifying intake, then running `analyzing-test-stack` — mapping behaviors to the cheapest sufficient layer across multiple platforms — all in _your_ context, so your model sets its quality. This is cross-repo strategic reasoning where a wrong recommendation is expensive to act on; it justifies Opus. +- **Subagents run on Sonnet or Haiku.** Everything you fan out is evidence gathering that returns a compact digest. Sonnet handles anything that reads a diff, ticket, or repo; Haiku handles pure parsing. Assign the model explicitly on every `Task` (see step 2) rather than letting it inherit Opus. 
+ +Rule of thumb: push the cheap, high-volume gathering down to Sonnet/Haiku; keep only the irreducible reasoning on Opus. diff --git a/plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md b/plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md deleted file mode 100644 index 37934b1..0000000 --- a/plugins/bitwarden-test-engineer/agents/test-engineer-orchestrator/AGENT.md +++ /dev/null @@ -1,137 +0,0 @@ ---- -name: test-engineer-orchestrator -version: 1.0.0 -description: | - Test automation strategist for Bitwarden. Takes a feature, bugfix, or arbitrary change — described in plain language, in a Jira ticket, in a GitHub PR, in a technical breakdown document (a Confluence tech breakdown), and/or in an exported test-case CSV — and produces an evidence-driven recommendation for the right test automation layers (static, unit, integration, E2E) shaped as a Testing Trophy, across Bitwarden's server, client, and mobile codebases. Gathers the evidence by fanning out subagents, runs the analyst skill to synthesize a recommendation and HTML report, then automatically runs the adversarial counterpart to red-team it before presenting a consolidated result. Use when the user asks what test coverage a change needs, which automation layers to add, how to shape a test plan, whether existing tests are over- or under-weighted, or asks for a "test stack" / "test strategy" / "test trophy" analysis for a ticket, PR, tech breakdown, or set of test cases. - - - Context: An engineer is about to start a Jira story and wants to know what test automation it should ship with. - user: "I'm picking up PM-12345 next sprint. What test coverage should this feature have?" - assistant: "I'll use the test-engineer-orchestrator agent to pull the requirements from PM-12345, map the change across the affected codebases, and produce a Testing Trophy recommendation — then red-team it before handing it back." - - Jira-key intake. 
The orchestrator gathers the ticket via the Atlassian MCP, runs Skill(analyzing-test-stack), then auto-runs Skill(challenging-test-stack-recommendations). - - - - - Context: A reviewer wants to know whether an open PR is adequately tested at the right layers. - user: "Does bitwarden/server#5821 have the right tests, or is it leaning too hard on end-to-end?" - assistant: "I'll use the test-engineer-orchestrator agent to read the PR diff and its tests, assess the trophy shape, and run the adversarial pass to specifically check for an ice-cream-cone (too E2E-heavy) anti-pattern." - - PR intake plus an explicit anti-pattern concern. The orchestrator gathers the diff via gh, then chains analyst → adversary. - - - - - Context: A QA engineer exported a set of manual test cases and wants an automation plan. - user: "Here's our exported test cases CSV for the billing migration work — which of these should be automated and at what layer?" - assistant: "I'll use the test-engineer-orchestrator agent to parse the CSV, bucket the existing cases by trophy layer, find the gaps, and produce a layer-by-layer automation recommendation with an adversarial review." - - CSV intake. The orchestrator parses the export, runs the analyst to map cases to layers and surface gaps, then the adversary challenges the recommendation. - - - - - Context: A tech lead just finished a tech breakdown and wants the test plan that should accompany it. - user: "I've got the tech breakdown for the new device-approval flow in Confluence — what test coverage should we plan across the stack?" - assistant: "I'll use the test-engineer-orchestrator agent to read the breakdown, mine its scope checklist and spec child pages for the surfaces and behaviors it touches, and produce a per-platform Testing Trophy recommendation — then red-team it." - - Tech-breakdown intake. 
The orchestrator fetches the Confluence breakdown via the Atlassian MCP, extracts testable behaviors and the affected platforms from Part 2, then chains analyst → adversary. - - -model: opus -tools: - - Read - - Write - - Glob - - Grep - - Skill - - Task - - AskUserQuestion - - Bash(gh pr view:*) - - Bash(gh pr diff:*) - - Bash(gh pr checks:*) - - Bash(git diff:*) - - Bash(git log:*) - - mcp__bitwarden-atlassian__get_issue - - mcp__bitwarden-atlassian__search_issues - - mcp__bitwarden-atlassian__get_issue_comments - - mcp__bitwarden-atlassian__get_issue_remote_links - - mcp__bitwarden-atlassian__get_confluence_page - - mcp__bitwarden-atlassian__search_confluence - - mcp__bitwarden-atlassian__search_confluence_cql -skills: - - analyzing-test-stack - - challenging-test-stack-recommendations -color: green ---- - -You are a test automation strategist for Bitwarden. Your job is to take a change — a feature, a bugfix, a refactor, or a migration — and tell the team **what to test, at which layer, and why**, shaped as a Testing Trophy: a thin static-analysis base, a unit layer for pure logic, a heavy integration layer where most confidence is bought, and a thin E2E layer reserved for critical user journeys. - -You do not write the tests. You produce a recommendation — an HTML report — that an engineer or QA can act on. Every recommendation you produce is challenged by an adversarial pass before you present it, because an unchallenged test plan tends to drift toward whatever is easiest to write rather than what actually buys confidence. - -## Operating context - -Bitwarden's code is split across several repositories, each with its own platform, stack, and test tooling. Assume the user works in a multi-repo layout such as `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`, and similar. A single feature frequently spans more than one of these (e.g. a server endpoint plus a web client plus a mobile screen), and each platform's trophy is shaped independently. 
- -**Where each layer lives:** static, unit, and integration tests live alongside the code, inside each platform repo. **End-to-end (E2E) tests live in a dedicated, private `test` repository** — not inside the platform repos. So an E2E recommendation always targets that separate repo, and a per-repo coverage scout will not find existing E2E tests inside `server`/`clients`/`ios`; it must look in the `test` repo (and the user may not have it checked out — degrade gracefully and say so). Read `${CLAUDE_PLUGIN_ROOT}/skills/analyzing-test-stack/references/monorepo-layout.md` for the per-platform stack, tooling, and the layer→repo map. - -The Atlassian capabilities depend on the **`bitwarden-atlassian-tools`** plugin (the `mcp__bitwarden-atlassian__*` server). If it is not installed and the user references a Jira issue or a Confluence tech breakdown, do not fail — tell the user the MCP is unavailable and ask them to paste the requirements or the breakdown contents, or proceed from the PR / CSV / description they provided. - -## Workflow - -### 1. Intake and scope - -Classify every input the user supplied. Inputs are additive — handle any combination: - -- **Jira key** (e.g. `PM-12345`) → requirements and acceptance criteria. -- **GitHub PR** (URL or `owner/repo#number`) → the actual change surface and any tests already present. -- **Technical breakdown** (a Confluence page ID/URL, or a feature/team name to search for) → a Bitwarden Tech Breakdown whose scope checklist already enumerates the platforms and surfaces the change touches, with spec child pages defining the interfaces. Often the richest single input. -- **CSV path** → an exported set of existing/planned test cases (column layout described in the analyst skill's `references/input-sources.md`). -- **Plain-language description** → the change itself when no artifact exists. - -Then determine the **affected repos/platforms**. 
If scope is genuinely ambiguous and it changes the recommendation, use `AskUserQuestion` — otherwise infer and state your assumption. - -### 2. Fan out to gather evidence - -Spawn `Task` subagents **in parallel**, one per evidence source or affected repo, so your own context stays lean. Each subagent returns a compact structured digest (not raw dumps). Typical fan-out: - -- **Requirements reader** (model: `sonnet`) — resolves the Jira issue (via `Skill(bitwarden-atlassian-tools:researching-jira-issues)` if available, else the `mcp__bitwarden-atlassian__*` tools) into testable behaviors and acceptance criteria. -- **Breakdown reader** (model: `sonnet`) — fetches the tech breakdown via `mcp__bitwarden-atlassian__get_confluence_page` (searching first with `search_confluence`/`search_confluence_cql` when given only a name), then mines Part 2's scope checklist for the surfaces touched, the relevant Part 4 spec child pages for interfaces, and Part 5's open questions for untestable-requirement risk. Returns testable behaviors per platform plus the breakdown's status. -- **PR diff analyzer** (model: `sonnet`) — `gh pr diff` / `gh pr view` to extract the change surface, public API touched, and tests already present. -- **CSV parser** (model: `haiku`) — reads the export and buckets existing cases by apparent layer and automation status. -- **Per-repo coverage scout** (model: `sonnet`) — for each affected platform repo, surveys existing static/unit/integration conventions and where comparable behavior is tested today. For E2E, scout the dedicated `test` repo if it is checked out; otherwise note it as unverified. - -Give each subagent a single source and a tight output contract. Skip any branch whose input was not supplied. - -**Set each subagent's model explicitly to control cost.** This fan-out is the bulk of the plugin's token spend, and the work is evidence gathering — read a source, extract, return a compact digest — not the strategic reasoning you reserve for yourself. 
Spawn each `Task` on the cheapest model that fits: **`haiku`** for pure mechanical parsing (the CSV parser), **`sonnet`** for everything that reads code, a diff, or a ticket and summarizes it (the default for these subagents). Do **not** let a subagent inherit your Opus model — a digest-returning agent never needs it. Reserve Opus for your own context, where the synthesis and adversarial reasoning happen (see Model selection below). - -### 3. Recommend - -Invoke `Skill(analyzing-test-stack)` with the gathered digests. It maps each testable behavior to the cheapest sufficient trophy layer per platform, names concrete tooling, surfaces coverage gaps, and writes a **self-contained HTML report** (inline CSS, no external dependencies) to the current working directory as `test-stack-report-<slug>-<date>.html`. Pass today's date to the skill — skills cannot read the clock themselves. - -### 4. Adversary (automatic) - -Immediately invoke `Skill(challenging-test-stack-recommendations)` on the report and the underlying evidence. It red-teams the recommendation against known failure modes — ice-cream-cone (too E2E-heavy), unit-tests-masquerading-as-integration, over-testing trivial code, untestable/ambiguous requirements, a missing platform layer, flaky-E2E candidates, and coverage claimed without evidence — and returns a critique with a verdict: **endorse**, **revise**, or **reject-with-reasons**. - -This pass is not optional. If the user explicitly asks to skip it, comply but state plainly in your summary that the recommendation was not adversarially reviewed. - -### 5. Consolidate - -Merge the critique into the report as a clearly labeled "Adversarial Review" section, so a single HTML file carries both the recommendation and its challenge. In chat, give a short summary: the recommended shape per platform, the adversary's verdict, and the top open risks the user should resolve before committing to the plan. 
- -## Principles - -- **Evidence over assertion.** Every recommended layer ties back to a specific behavior, requirement, diff hunk, or existing test. Flag anything you could not ground. -- **Cheapest sufficient layer.** Push confidence down the trophy — prefer integration over E2E, unit over integration — unless a behavior genuinely requires the higher layer. -- **Degrade gracefully.** A missing input (no Jira MCP, no PR, no CSV, no `test` repo checkout) narrows the analysis; it never blocks it. State what you could not see. -- **Read the repo's CLAUDE.md** when the analysis touches a specific checked-out codebase — honor its test conventions over generic defaults. - -## Model selection - -Model spend is governed here in the plugin, not left to the session default. The split: - -- **You (the orchestrator) run on Opus.** Your context is where the genuinely hard work happens: classifying intake, then running `analyzing-test-stack` (mapping behaviors to the cheapest sufficient layer across multiple platforms) and `challenging-test-stack-recommendations` (red-teaming that recommendation) — both execute in _your_ context, so your model sets their quality. This is cross-repo strategic reasoning where a wrong recommendation is expensive to act on; it justifies Opus. -- **Subagents run on Sonnet or Haiku.** Everything you fan out is evidence gathering that returns a compact digest. Sonnet handles anything that reads a diff, ticket, or repo; Haiku handles pure parsing. Assign the model explicitly on every `Task` (see step 2) rather than letting it inherit Opus. - -Rule of thumb: push the cheap, high-volume gathering down to Sonnet/Haiku; keep only the irreducible reasoning on Opus. 
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/input-sources.md b/plugins/bitwarden-test-engineer/references/input-sources.md similarity index 51% rename from plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/input-sources.md rename to plugins/bitwarden-test-engineer/references/input-sources.md index b6b2d46..9724f09 100644 --- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/input-sources.md +++ b/plugins/bitwarden-test-engineer/references/input-sources.md @@ -20,13 +20,70 @@ Otherwise use the MCP tools directly: Extract: discrete **testable behaviors**, **acceptance criteria**, and the **platforms/ components** named. If the MCP is unavailable, ask the user to paste the requirements. +Also capture **severity** — for a bug/defect ticket, read the severity assigned on the issue +(the severity field, or the QA/reporter's stated severity in the description/comments) and +carry it with the behaviors; for a feature/story without a defect, leave it to the analyst to +assess each behavior's risk severity. Severity is the impact dial the `analyzing-test-stack` +skill uses to risk-weight coverage — see that skill's `references/severity-risk.md`, mirrored +from the Defect Severity Classification Guide (Confluence page `2759229512`). + +### Epic intake + +A Jira key may resolve to an Epic (or, in next-gen projects, a Feature) rather than a single +story. The epic body itself rarely lists testable behaviors — those live on its children +and on the PRs the children produce. If you analyze only the epic, you will under-scope the +trophy. So when the `issuetype` on the `get_issue` response is `Epic` or `Feature`, expand +before extracting: + +1. **Discover children.** Read the `subtasks` field first. If empty (common in next-gen + projects, which use `parent` relationships rather than the legacy `subtasks` field), fall + back to `mcp__bitwarden-atlassian__search_issues` with JQL `parent = <EPIC-KEY>`. 
On + classic projects, also try `"Epic Link" = <EPIC-KEY>`. Together these cover both schemas. +2. **Bound the fan-out.** If the epic has more than ~10 children, fetch the first 10 in full + and summarize the rest as a one-line list (key, status, summary) from the search results. + This matches the depth-control discipline in + `bitwarden-atlassian-tools:researching-jira-issues` (Steps 2–3) — re-use that recipe; do + not re-derive it. +3. **Per child, gather behaviors and PRs.** + - `mcp__bitwarden-atlassian__get_issue` for the child's description and acceptance + criteria — these are the testable behaviors for the trophy. + - `mcp__bitwarden-atlassian__get_issue_remote_links` for PRs (grouped under "GitHub"). + Each PR URL becomes an input to the **GitHub PR** branch below: hand it off to + `gh pr view` / `gh pr diff` so the actual change surface and any tests-in-PR feed the + recommendation. **These merged/linked PRs are the reliable backbone for existing + coverage** — the tests they contain are what shipped with this work, and the PR head SHA + makes each one permalink-ready (see the `assessing-test-coverage` skill's + `references/finding-coverage.md` → _Finding existing coverage_). + If `gh` cannot reach a PR (private fork, not authenticated, repo not accessible), record + the URL as evidence-not-inspected in the report rather than silently dropping it. +4. **Track epic status.** The epic's own status (`In Planning`, `In Progress`, `Done`) tells + you how much of the work is shipped: children in `Done` with merged PRs likely already + have tests-in-PR you can audit for shape; children still `To Do` are scope-only and your + recommendation is necessarily prospective. Surface this distinction in the Evidence + section of the report. +5. 
**Preferred path when available.** If `bitwarden-atlassian-tools` is installed, invoke + `Skill(bitwarden-atlassian-tools:researching-jira-issues)` on the epic key — its Step 2 + already does the hierarchical-link discovery and Step 3 the depth-controlled traversal, + and returns the children + linked Confluence pages + remote links in one synthesized read. + Use the direct MCP calls above only when that skill is unavailable. + ## GitHub PR -- `gh pr view <pr>` — title, body, linked issues, files changed, checks. +- `gh pr view <pr> --json url,headRefOid,baseRefName,title,body,files,state` — title, + body, linked issues, files changed, **and the head SHA + `owner/repo`** needed for + permalink production downstream. - `gh pr diff <pr>` — the actual change surface. -Extract: the public API / behavior touched, the diff paths (→ which repos/platforms), and -**any tests already included in the PR** (so you assess incremental, not absolute, gaps). +Extract: the public API / behavior touched, the diff paths (→ which repos/platforms), +**any tests already included in the PR** (so you assess incremental, not absolute, +gaps), and the captured **`headRefOid`** + **`owner/repo`** (parsed from the PR URL). +The SHA and `owner/repo` are required — they are what makes every test cited as +existing coverage clickable in the report. Tests observed in the PR diff are primary +coverage evidence; for _pre-existing_ tests not in the diff, do a targeted lookup scoped +to the changed paths/symbols rather than a repo-wide sweep. See the +`assessing-test-coverage` skill's `references/finding-coverage.md` → _Finding existing +coverage_ and _Citing tests as GitHub permalinks_ for the link form and the fallback when +ingredients are missing. ## Technical breakdown document @@ -48,7 +105,10 @@ Locate and fetch it: Map its structure to testable evidence (the canonical template is page `2920349776`): - **Part 1 — Problem overview**: the feature framing and linked Jira epic. 
Use it for scope and - to cross-link any Jira/PR inputs, not as a behavior source on its own. + to cross-link any Jira/PR inputs, not as a behavior source on its own. **When Part 1 names an + Epic**, treat it the same as an Epic-key intake — drill into its children and their PR remote + links per the _Epic intake_ recipe above. A breakdown plus its epic together usually surface + more testable behavior than either alone. - **Part 2 — Breakdown scope checklist**: the core of the mining. Each answered item names a surface the change touches and therefore a place tests are needed — **Database changes** (migration/backwards-compat behaviors, EDD phasing), **API changes** (endpoint contracts, @@ -66,7 +126,8 @@ Map its structure to testable evidence (the canonical template is page `29203497 can't be reliably tested until its question is answered. Surface them in the report's gaps. Extract: discrete **testable behaviors** per platform, the **surfaces** each touches (→ repos via -`monorepo-layout.md`), and the team's **stated testing intent** (to evaluate, not echo). Where the +the `analyzing-test-stack` skill's `references/monorepo-layout.md`), and the team's **stated testing +intent** (to evaluate, not echo). Where the breakdown's scope checklist disagrees with a diff or ticket you were also given, treat the divergence as a finding rather than silently picking one. @@ -82,7 +143,7 @@ settings — **do not hardcode them**. Read the header row, then map by meaning: - A **steps / expected-result** column, often in Given–When–Then form — the behavior. - Optional **team / area / tags / preconditions** columns — scope and grouping. -Map rows to behaviors and bucket each by apparent layer using `testing-trophy.md`: +Map rows to behaviors and bucket each by apparent layer using the `analyzing-test-stack` skill's `references/testing-trophy.md`: - A case that drives the full UI through a complete journey → likely **E2E** (target the dedicated `test` repo). 
diff --git a/plugins/bitwarden-test-engineer/references/report-style-tokens.md b/plugins/bitwarden-test-engineer/references/report-style-tokens.md new file mode 100644 index 0000000..4957f9f --- /dev/null +++ b/plugins/bitwarden-test-engineer/references/report-style-tokens.md @@ -0,0 +1,496 @@ +# Report style tokens — data-report visual system for HTML reports + +This file is the **single source of styling truth** for every self-contained HTML report the +`bitwarden-test-engineer` plugin emits — the `analyzing-test-stack` test-stack report and the +`assessing-test-coverage` coverage report alike. The HTML output requirements (single file, +inline CSS, no external/CDN assets, no web fonts, no JS) mean a report cannot `` to a +design system at runtime — instead, **inline the stylesheet block at the bottom of this file +verbatim** into the report's ` -
…title, change, date…
-
…recommended shape per platform…
-
…sources used + what was missing…
-
…per-platform behavior→layer tables…
-
…coverage gaps & imbalances…
-
- …filled in by the adversarial pass… -
+
+

Test Stack Report

+

…the change under analysis…

+

…ticket/PR · status · team · date…

+
+
+
+

Overview

+ …2–4 sentence recap of the recommended shape per platform; top 3 open + risks; anchor links into #recommendations and #gaps… +
+
+

Summary & recommended shape

+ …2–4 sentences… +
+
+ Fig 1 · Recommended layer distribution by platform +
+
+ unit + integration + e2e +
+
+ bitwarden/server +
+ 3 + 11 + 1 +
+
+ +
+
    +
  • + bitwarden/server — integration-heavy, thin + unit, 1 E2E journey +
  • + +
+
+
+

Evidence & sources

+
+ …sources used + what was missing + commit SHA(s)… +
+
+
+

Per-platform recommendations

+
+ …per-platform tables: Behavior | Severity | Recommended layer | + Tooling | Rationale | Evidence (linked)… +
+
+
+

Coverage gaps & imbalances

+ …gaps and trophy-wrong shapes; ungrounded findings marked unverified… +
+
``` diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md index dae06b1..6188e09 100644 --- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md +++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/monorepo-layout.md @@ -2,33 +2,38 @@ Bitwarden's code spans several repositories. A single feature often touches more than one, and each gets its own Testing Trophy. Treat the table below as a **starting map**, -not gospel — when a repo is checked out, read its `CLAUDE.md` and grep its existing tests -to confirm the actual conventions before recommending tooling. +not gospel — when a repo is checked out, confirm the actual conventions from its config +first (the `assessing-test-coverage` skill's `references/finding-coverage.md` → +_Discovering a repo's test conventions_), and read the table as the last-resort default. + +Establishing what a change is **already tested** by — finding existing coverage and citing +it as permalinks — is a separate job owned by the `assessing-test-coverage` skill. This file +covers only the repo/stack map and the rules for mapping a behavior to the layer it _should_ +live at. 
## Platform repos and their stacks -| Repo (typical) | Platform | Language / framework | Static | Unit / Integration tooling | -| ------------------- | ------------------------------ | ----------------------------------- | --------------------------------------- | ------------------------------------------------------------------------------------------ | -| `bitwarden/server` | Backend / API | C# / .NET, ASP.NET Core, EF Core | `dotnet build` analyzers, nullable refs | xUnit; integration via `WebApplicationFactory` + test DB / in-memory providers | -| `bitwarden/clients` | Web, Browser ext, Desktop, CLI | TypeScript, Angular, Electron, RxJS | `tsc`, ESLint | Jest + Angular TestBed / Testing Library (unit + integration); mocked HTTP at the boundary | -| `bitwarden/ios` | iOS | Swift / SwiftUI | SwiftLint, compiler | XCTest (unit + integration); XCUITest for on-device UI | -| `bitwarden/android` | Android | Kotlin | ktlint/detekt, compiler | JUnit + Robolectric / Espresso (instrumented) | +| Repo (typical) | Platform | Language / framework | Unit / Integration tooling | +| ------------------- | ------------------------------ | ----------------------------------- | ------------------------------------------------------------------------------------------ | +| `bitwarden/server` | Backend / API | C# / .NET, ASP.NET Core, EF Core | xUnit; integration via `WebApplicationFactory` + test DB / in-memory providers | +| `bitwarden/clients` | Web, Browser ext, Desktop, CLI | TypeScript, Angular, Electron, RxJS | Jest + Angular TestBed / Testing Library (unit + integration); mocked HTTP at the boundary | +| `bitwarden/ios` | iOS | Swift / SwiftUI | XCTest (unit + integration); XCUITest for on-device UI | +| `bitwarden/android` | Android | Kotlin | JUnit + Robolectric / Espresso (instrumented) | Exact repo names and tool versions drift — verify against the checkout. If a platform isn't in this table, infer its stack from the repo and state the assumption in the report. 
## Where each layer lives — important -- **Static, unit, integration** tests live **alongside the code, inside each platform +- **Unit and integration** tests live **alongside the code, inside each platform repo** (e.g. `server`'s xUnit projects, `clients`' `*.spec.ts` files, the iOS test targets). -- **End-to-end (E2E) tests live in a dedicated, private `test` repository** — _not_ - inside the platform repos. Consequences for analysis: - - An E2E recommendation always targets that separate `test` repo. - - A coverage scout will **not** find existing E2E tests by searching `server`/`clients`/ - `ios`. It must look in the `test` repo, which the user may not have checked out. - - If the `test` repo is unavailable, treat existing E2E coverage as **unverified** and - say so explicitly in the report — do not assume it is absent or present. +- **End-to-end (E2E) tests live in a dedicated `test` repository** — _not_ inside the + platform repos. It sits as a sibling of `server` / `clients` / `ios` in the user's + Bitwarden checkout root, so look for it next to whichever platform repo you're in + (e.g. if `clients` is at `~/repos/Bitwarden/clients`, `test` is at + `~/repos/Bitwarden/test`). Source: `https://github.com/bitwarden/test` — cite this URL + in the report only if no local sibling is found. ## Mapping a behavior to a platform + layer @@ -37,4 +42,10 @@ isn't in this table, infer its stack from the repo and state the assumption in t 2. Within each repo, choose the layer per `testing-trophy.md` and name the concrete tool from the table above (confirmed against the checkout where possible). 3. For any cross-system journey worth E2E coverage, target the dedicated `test` repo and - flag whether comparable E2E coverage already exists there. + flag whether comparable E2E coverage already exists there (per the coverage inventory + from `assessing-test-coverage`). 
+ +Existing coverage to compare these recommendations against — including the GitHub permalinks +the report's Evidence column requires — comes from the `assessing-test-coverage` skill's +coverage inventory (`references/finding-coverage.md` → _Citing tests as GitHub permalinks_ +and _Output contract_), not from this file. diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md new file mode 100644 index 0000000..d3f292b --- /dev/null +++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/severity-risk.md @@ -0,0 +1,81 @@ +# Severity as a risk weight + +The Testing Trophy tells you the _cheapest layer that buys the confidence a behavior +requires_. **Severity tells you how much confidence is required.** A defect in vault +unlock and a typo on a settings label are not owed the same rigor — severity is the dial +that turns "cheapest sufficient" from a flat rule into a risk-weighted one. + +Severity is the **impact of a defect on the system or user**, independent of how urgently +it gets fixed (that is _priority_). This skill weights coverage by severity, not priority. + +## Source of truth + +The canonical classification is Bitwarden's **Defect Severity Classification Guide**, +Confluence page `2759229512` +(`https://bitwarden.atlassian.net/wiki/spaces/EN/pages/2759229512/Severity`). The levels +and criteria below mirror that page so the analysis degrades gracefully when the Atlassian +MCP is unavailable — but the page is authoritative. When the `bitwarden-atlassian-tools` +MCP is available, fetch it with `mcp__bitwarden-atlassian__get_confluence_page` (pageId +`2759229512`) to pick up revisions before relying on the cached copy here. 
If the fetch +fails or the MCP is unavailable, use the mirrored table below and note in the report that +the severity definitions are from the cached copy (version not re-verified) — degrade +gracefully; never block on it. + +**Security-vulnerability defects are the exception:** their severity follows the +_Vulnerability Tracking and Management_ guide, not this one. If a behavior is +security-sensitive (crypto, auth, a threat-model-relevant path), treat its risk as at +least Critical regardless of the table below. + +## Where each behavior's severity comes from + +- **Bug / defect ticket** — read the severity already assigned on the Jira issue (the + severity field, or the reporter/QA's stated severity in the description/comments). Use it + directly; if it is absent, classify against the criteria below and mark it an assumption. +- **Feature, PR, tech breakdown** — there is no defect yet, so assess each behavior's + **risk severity**: _if this behavior broke in production, what severity would the + resulting defect carry?_ Classify it against the same criteria. This is what makes the + recommendation risk-aware rather than uniform. 
+ +## Levels and criteria (mirrored from the guide) + +| Severity | A defect here would… | Signals (from the guide) | +| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **Critical** | Severely harm core functionality, data integrity, or security with no viable workaround | Blocks a critical flow (login, vault access, billing, account creation); data loss/corruption/exposure; crash/unrecoverable state; affects all or a broad user segment | +| **High** | Significantly degrade a core feature/flow, but a workaround exists (difficult or non-obvious), or impact is limited to a subset of users | Core feature impaired but not blocked; specific client/OS/auth method; burdensome/undiscoverable workaround; compounding friction in a core workflow | +| **Medium** | Degrade functionality or UX meaningfully, but a workaround exists or scope is limited | Non-critical / secondary flow broken; misleading-but-not-destructive output; degraded experience for a subset; extra steps to work around | +| **Low** | Have minimal functional impact; does not meaningfully hinder the user | Cosmetic / typo / visual only; negligible edge case; minor UX inconsistency; trivial workaround | +| **Informative** | Be a known limitation, third-party compatibility issue, or environmental quirk — not a defect in Bitwarden's core behavior | Autofill on a non-standard third-party site/app; no clear owner or fix path; unlikely to be actioned | + +## How severity calibrates the recommendation + +Severity does **not** mean "push everything Critical to E2E." The cheapest-sufficient rule +still governs _which_ layer; severity governs _how completely_ the behavior must be covered +and _how hard a missing test counts as a gap_. 
Concretely: + +- **Critical** — the confidence bar is highest: cover the behavior's material failure modes, + not just the happy path, at whatever layer each mode is cheapest to pin down. Critical + behaviors that are genuine end-to-end journeys (login, vault unlock, checkout) are exactly + what the trophy reserves the **thin E2E layer** for — the guide's "critical user flows" + map 1:1 onto that reservation. A Critical behavior with no observed coverage is a + **top-priority gap** and belongs at the head of `#overview`'s open risks. +- **High** — strong integration coverage of the primary path _and_ the documented + workaround / affected configuration (the specific client, OS, or auth method that scopes + the impact). Reach for E2E only when the path is itself a critical journey. An uncovered + High behavior is a gap that should be scheduled, not silently accepted. +- **Medium** — the plain cheapest-sufficient layer with no escalation. A gap here is worth + recording and ranking below Critical/High; it is reasonable to defer. +- **Low** — minimal coverage; often a single unit or integration assertion, or an explicit + "not worth automating" call. Do **not** spend an E2E test on a Low behavior — that is the + ice-cream-cone anti-pattern wearing a risk costume. +- **Informative** — generally not automatable as a Bitwarden behavior; record as + out-of-scope rather than as a coverage gap, with a one-line reason. + +Two corollaries: + +1. **Severity ranks the gaps.** When `#gaps` and `#overview` list open risks, order them by + severity — the reader should resolve the Critical-uncovered behaviors first. Gap + prioritization is severity-driven, not list-order-driven. +2. **Severity ≠ priority.** A Low-severity defect can be High-priority before a launch, and + a High-severity bug in a rarely used admin panel can be Low-priority. This skill weights + coverage by **severity** (impact). Note priority only if the caller supplied it and it + changes what to test first. 
diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md index 13da8bd..ff5cd41 100644 --- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md +++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/references/testing-trophy.md @@ -4,27 +4,21 @@ A model for shaping automated test coverage, contrasted with the older Testing P because they buy the most confidence per unit of cost and maintenance for typical application code. -## The four layers (base → top) +## The three layers (base → top) -1. **Static** — the base. Type checking, linters, formatters, compiler errors, static - analysis. Catches whole classes of bugs (typos, null misuse, unused code, unsafe - patterns) before a single test runs. Nearly free; always on. - - _Examples:_ TypeScript/`tsc`, ESLint, Roslyn analyzers / `dotnet build` warnings as - errors, SwiftLint, nullable reference types. - -2. **Unit** — focused. Tests a single function/class/module in isolation. Best for pure +1. **Unit** — focused. Tests a single function/class/module in isolation. Best for pure logic, algorithms, edge cases, and error handling where setup is cheap and the unit has real branching complexity. Fast and stable, but isolation can let integration bugs slip through. -3. **Integration** — **the heaviest layer; the trophy's bulge.** Tests several units +2. **Integration** — **the heaviest layer; the trophy's bulge.** Tests several units working together through real (or realistic) collaborators: a controller + service + in-memory or test database, a component rendered with its real child components and a mocked network boundary, a view model against a real repository. This is where most confidence is bought because it exercises the wiring users actually depend on, without the cost and flakiness of full E2E. -4. **E2E (end-to-end)** — thin top. 
Drives the real, fully assembled system the way a +3. **E2E (end-to-end)** — thin top. Drives the real, fully assembled system the way a user would: real browser, real device, real backend. Highest confidence per test, but slowest, most expensive, and most flaky. Reserve for a small number of **critical user journeys** (e.g. login, vault unlock, checkout) — not for branch coverage. @@ -38,11 +32,11 @@ application code. │ Integration │ HEAVY — most confidence bought here └──┐ ┌──┘ │ Unit │ focused — pure logic & edge cases - ┌──┴───────────┴──┐ - │ Static │ broad, ~free base — always on - └─────────────────┘ + └───────────┘ ``` +Static analysis (type checking, linters, formatters) sits below the trophy and is handled by per-repo tooling — not recommended by this skill. + ## How to assign a layer Pick the **cheapest layer that still buys the confidence the behavior requires**: @@ -52,10 +46,10 @@ Pick the **cheapest layer that still buys the confidence the behavior requires** persistence; component + store + API boundary; view model + repository) → **integration**. - A behavior only meaningful as a full user journey across the real system → **E2E**, and only if it is genuinely critical. -- Anything a type system, analyzer, or linter can guarantee → **static**; don't write a - test for it. +- Anything a type system, analyzer, or linter already guarantees → don't write a test + for it. -## Anti-patterns to avoid (the adversary checks for these) +## Anti-patterns to avoid - **Ice-cream cone** — the trophy inverted: many E2E tests, few integration/unit. Slow, flaky, expensive to maintain. 
diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md new file mode 100644 index 0000000..32905a3 --- /dev/null +++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/SKILL.md @@ -0,0 +1,51 @@ +--- +name: assessing-test-coverage +description: Use when determining what test coverage ALREADY exists for a change — inventorying the tests that currently cover a feature, PR, component, or set of changed paths across Bitwarden's repos, citing each as a stable GitHub permalink, bucketing it by test layer, and flagging behaviors with no observed test as gaps. Distinguishes observed coverage from assumed. Triggers on "what's already tested", "does this PR have tests", "what coverage exists for", "find the existing tests for", "is this component covered", "audit current test coverage". This is the backward-looking inventory that feeds test-stack analysis — it does NOT recommend new tests or assign cheapest-sufficient trophy layers; for that, use analyzing-test-stack. +allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(git rev-parse:*), Bash(git remote get-url:*), Bash(git -C * rev-parse:*), Bash(git -C * remote get-url:*)" +--- + +# Assessing Test Coverage + +Produce an evidence-grounded inventory of what is **already tested** for a change, scoped to the change surface, with every cited test rendered as a stable GitHub permalink and bucketed by test layer. This is a backward-looking, descriptive job: you report what exists, you do **not** recommend what to add or judge whether the shape is right — that is `analyzing-test-stack`'s job, which consumes this inventory. + +The output is a **coverage inventory**: a set of permalink records for observed tests plus a list of behaviors/surfaces recorded as gaps (`unverified`). 
Honesty is the whole point — a behavior with no observed test is a gap, never assumed covered. + +## Inputs + +You work from a **change surface** and the repos it touches: + +- **Change surface** — the changed paths/symbols and the named component(s). Usually supplied by the caller (the agent's evidence fan-out, or an `analyzing-test-stack` run). If you're handed only a Jira key or a PR with no resolved surface, derive a minimal surface from the PR diff (`gh pr diff`) before looking for coverage; the shared `../../references/input-sources.md` (the same intake guide `analyzing-test-stack` uses) covers how to resolve a PR or Epic into its diff paths and linked PRs. +- **Affected repos** — which platform checkouts to inspect, and whether the sibling `test` repo (E2E) is available. +- **Linked/merged PRs** — the PRs that shipped this work; their diffs are the primary, permalink-ready coverage evidence. + +A missing input narrows the inventory; it never blocks it. Record what you could not inspect as part of the result. + +**Today's date is provided by the caller** — use it for the report filename; do not attempt to read the clock. If no date is supplied, ask via `AskUserQuestion` rather than guessing. + +## Workflow + +1. **Learn each repo's conventions, config-first.** Before opening any test files, read the repo's Claude config to learn its test tooling and where tests live. Stop as soon as it answers the question. See `references/finding-coverage.md` → _Discovering a repo's test conventions_. + +2. **Find existing coverage — PRs first, then a targeted lookup.** Take the tests in the linked/merged PR diffs as primary evidence, then do a lookup **scoped to the change surface** for pre-existing tests. Never a repo-wide grep sweep. For E2E, inspect the sibling `test` repo if available. See `references/finding-coverage.md` → _Finding existing coverage_. + +3. 
**Cite and bucket each observed test.** Render every cited test as a GitHub permalink (commit SHA, not branch), following `references/finding-coverage.md` → _Citing tests as GitHub permalinks_. A test that genuinely cannot be linked is recorded path-only with an explicit reason — never fabricate a URL. Bucket each by apparent layer (unit / integration / E2E); for the layer definitions see the `analyzing-test-stack` skill's `references/testing-trophy.md`. For the per-repo stack/tooling reference, see that skill's `references/monorepo-layout.md`. + +4. **Record gaps.** Any behavior or surface in the change with no PR-observed test and no targeted hit is recorded as a coverage gap / `unverified`. Distinguish _observed_ coverage from _assumed_. + +5. **Write the coverage report.** Build a single self-contained HTML file (inline CSS, no external/CDN dependencies, no JS required) following `references/coverage-report-template.md`. **Inline the canonical stylesheet from `../../references/report-style-tokens.md` verbatim** — the same plugin-level styling source the test-stack report uses, so the two reports read as one instrument; do not re-pick colors or reintroduce a brand skin. Use the normative section IDs (`#overview`, `#summary`, `#evidence`, `#coverage`, `#gaps`) and write `#overview` yourself as a short synthesis. Write the report to the **current working directory** as `test-coverage-report-<slug>-<date>.html`, where `<slug>` is a short kebab-case identifier for the change and `<date>` is the caller-provided date. + +## Output + +Two artifacts: + +- The **coverage inventory** as structured data — the record shape defined in `references/finding-coverage.md` → _Output contract_: one permalink record per observed test, plus the list of `unverified` gaps. When run under the `bitwarden-test-engineer` agent, return these records for `analyzing-test-stack` to consume as-is. +- The **self-contained HTML coverage report** (step 5), written to the current working directory. 
+ +Mirror the report's `#overview` in chat — the observed shape per platform and the top gaps — and point the reader at the report file for the per-test detail. + +## Principles + +- **Observed vs. assumed.** Never present assumed coverage as verified. "I could not inspect the `test` repo" is a finding, not a failure. +- **Scoped, not swept.** Coverage is established PR-first then scoped to the change surface — never a repo-wide grep. +- **Stable links only.** Permalinks use the commit SHA, not a branch. Unlinkable tests are recorded with a reason; URLs are never fabricated. +- **Backward-looking only.** You inventory what exists. Recommending new tests, assigning cheapest-sufficient layers, and judging trophy shape belong to `analyzing-test-stack` — hand off, don't cross over. diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md new file mode 100644 index 0000000..0a08050 --- /dev/null +++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/coverage-report-template.md @@ -0,0 +1,155 @@ +# Coverage report template + +Produce a **single self-contained HTML file** inventorying the existing test coverage for a +change: all CSS inline in a ` + + +
+

Test Coverage Report

+

…the change under analysis…

+

…ticket/PR · status · team · date…

+
+
+
+

Overview

+ …2–4 sentence recap of observed coverage per platform; top 3 gaps; + anchor links into #coverage and #gaps; one line noting this is a + coverage inventory, not a recommendation… +
+
+

Observed coverage shape

+ …2–4 sentences… +
+
Fig 1 · Observed test coverage by platform
+
+ unit + integration + e2e +
+
+ bitwarden/server +
+ 3 + 14 +
+
+ +
+
    +
  • + bitwarden/server — 14 integration, 3 unit, + 0 E2E observed +
  • + +
+
+
+

Evidence & sources

+
+ …repos inspected + PRs read + test-repo availability + what was + missing + commit SHA(s)… +
+
+
+

Observed coverage

+
+ …per-platform behavior→test tables with linked evidence… +
+
+
+

Coverage gaps

+ …behaviors with no observed test, each marked unverified with a one-line + reason… +
+
+ + +``` diff --git a/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md new file mode 100644 index 0000000..af2f27e --- /dev/null +++ b/plugins/bitwarden-test-engineer/skills/assessing-test-coverage/references/finding-coverage.md @@ -0,0 +1,119 @@ +# Finding and citing existing test coverage + +How to determine what a change is **already** tested by, scoped to the change surface, and how to cite each observed test as a stable link. This is the repo-reading half of test engineering; the trophy-mapping half (which layer a behavior _should_ live at) is in the `analyzing-test-stack` skill. + +## Discovering a repo's test conventions (config-first) + +Test conventions, tooling, and where tests live are usually documented in a repo's Claude +config — read it **before** opening any test files, and stop as soon as it answers the +question. This keeps token spend low on large repos. Work the tiers in order: + +1. **Config first.** Read the repo's root `CLAUDE.md`, its `.claude/` directory (rules and + settings), and any **nested `CLAUDE.md`** in the subdirectories the change touches (e.g. + `clients/apps/<app>/CLAUDE.md`). Extract the test tooling, the test-file layout/naming, and + any stated layer conventions. +2. **Test files as fallback — only for gaps config leaves.** If config is silent on a + convention you need, read a _few representative_ test files near the change surface to + confirm it. Do **not** sweep the repo. +3. **Generic stack table as last resort.** When neither config nor local tests answer, fall + back to the per-repo stack/tooling table in the `analyzing-test-stack` skill's + `references/monorepo-layout.md` and **state the assumption** in the result. + +This tier governs _conventions_ — what the tooling is and where tests live. Finding which +behaviors are _already covered_ is the next job, below. 
+ +## Finding existing coverage (PRs first, then a targeted lookup) + +Reliably establishing what is **already tested** does not require grepping a whole repo. Work +two ordered moves, and record anything still unfound as a gap rather than dropping it: + +1. **Merged/linked PRs are the backbone.** The PRs hanging off the Jira issue and its epic + children (`get_issue_remote_links` → `gh pr view`/`gh pr diff`) are the reliable record of + the tests that shipped with this work, and are already permalink-ready via the PR head SHA. + Take the tests observed in those PR diffs as primary coverage evidence. +2. **Targeted repo lookup for pre-existing tests.** Tests written _before_ this ticket won't + appear in those PRs. Find them with a lookup **scoped to the change surface** — the files + and symbols the PRs/diff touch, and the component named in the ticket — not a repo-wide + sweep. Confirm conventions from config (above) so the lookup targets the right paths. + +For end-to-end coverage, inspect the dedicated sibling `test` repo if it is checked out (see +the `analyzing-test-stack` skill's `references/monorepo-layout.md` → _Where each layer lives_) +and cite specific files; if it is not available, record E2E coverage as `unverified`. + +A behavior with no PR-observed test and no targeted hit is recorded as a coverage gap / +`unverified` — never silently assumed covered. + +## Citing tests as GitHub permalinks + +Every test cited as **current coverage** must be rendered as a clickable +GitHub permalink so a reader can jump to the actual test. The link form is: + +``` +https://github.com/<owner>/<repo>/blob/<sha>/<path>#L<start>-L<end> +``` + +Use the **commit SHA**, not a branch name. Branch links rot under rebase and +force-push; SHA links are stable. + +### Acquiring the four ingredients + +1. **`owner/repo`** — from the remote URL. + - PR-sourced: parse from the PR URL (e.g. `gh pr view --json url`). + - Local checkout: `git -C <repo-dir> remote get-url origin` and parse the + `github.com[:/]<owner>/<repo>(\.git)?` segment. 
+2. **Commit SHA**. + - PR-sourced: `gh pr view --json headRefOid` returns the PR head SHA. This is + the SHA the diff was computed against and is the right anchor for any + tests-in-PR or tests-on-the-PR-branch references. + - Local checkout: `git -C <repo-dir> rev-parse HEAD` for the working-tree SHA. If the + working tree is dirty (uncommitted changes), still use HEAD and note in the + evidence that links point to HEAD, not the working tree. +3. **Path** — repo-relative path of the test file (no leading slash). The same path + you'd pass to `Read`, minus the repo root. +4. **Line range** — start line through end line of the test declaration. Acceptable + resolutions, in descending preference: + - Full block: from the `it(`/`test(`/`Test(`/`func Test…(` declaration line through + the matching closing brace. + - Declaration only: the single line where the test name is declared (`#L42`). + - File only (`#L1`) — accept reluctantly, and only when grep cannot localize the + test. Avoid for newly authored tests. + +### When a test cannot be linked + +If any of the four ingredients is missing — no remote (`git remote get-url origin` +returns empty), detached HEAD with no remote, private fork the session cannot reach, +or the file exists only in a local working tree never pushed — record the test as +**unlinkable** with the reason. Never fabricate a URL. Both this skill's coverage report +(`coverage-report-template.md`) and the downstream `analyzing-test-stack` test-stack report +render these as `path — unlinkable: <reason>`. 
+ +### Output contract + +For every cited test, return a record of the shape: + +``` +{ + "path": "src/services/Foo/FooService.spec.ts", + "start_line": 42, + "end_line": 89, + "owner_repo": "bitwarden/clients", + "sha": "a1b2c3d4e5f6…", + "layer": "integration", + "permalink": "https://github.com/bitwarden/clients/blob/a1b2c3d4e5f6…/src/services/Foo/FooService.spec.ts#L42-L89" +} +``` + +…or, when unlinkable: + +``` +{ "path": "src/services/Foo/FooService.spec.ts", "layer": "integration", "unlinkable_reason": "no remote for local checkout" } +``` + +Behaviors/surfaces with no observed test are returned as gaps: + +``` +{ "behavior": "tier downgrade preserves seat count", "platform": "server", "status": "unverified" } +``` + +The `analyzing-test-stack` recommender consumes these records as-is to populate the +report's Evidence (linked) column and to seed its gap analysis. diff --git a/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md b/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md deleted file mode 100644 index e86e11d..0000000 --- a/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/SKILL.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -name: challenging-test-stack-recommendations -description: Use to red-team a test automation recommendation produced by analyzing-test-stack — adversarially reviewing a Testing Trophy recommendation or HTML test-stack report for anti-patterns and ungrounded claims before the team acts on it. Triggers on "challenge this test plan", "red-team the test recommendation", "poke holes in this test strategy", "is this proposed test plan over/under-testing", "review the test stack report", or runs automatically after analyzing-test-stack under the test-engineer orchestrator. 
Checks for ice-cream-cone (too E2E-heavy), unit-tests-masquerading-as-integration, over-testing trivial code, untestable requirements, missing platform layers, flaky-E2E candidates, and coverage claimed without evidence; returns a verdict of endorse, revise, or reject-with-reasons. -allowed-tools: "Read, Grep, Glob, Bash(gh pr view:*), Bash(gh pr diff:*), mcp__bitwarden-atlassian__get_issue, mcp__bitwarden-atlassian__get_issue_comments, mcp__bitwarden-atlassian__get_confluence_page" ---- - -# Challenging Test Stack Recommendations - -You are the adversary to `analyzing-test-stack`. Your job is to **try to break its -recommendation** before the team builds on it. A recommendation that survives a genuine -red-team is trustworthy; one that was never challenged tends to drift toward whatever -tests are easiest to write rather than what actually buys confidence. - -Default to skepticism. Your value is in the specific, evidence-backed objection — not in -rubber-stamping. But do not invent problems: an objection you cannot tie to evidence is -itself a rejected finding (you hold yourself to the same evidence bar you demand). - -## Inputs - -- The **HTML report** (or the recommendation text) from `analyzing-test-stack`. -- The **underlying evidence** — the same Jira ticket, PR diff, CSV, and/or repo checkout. - Re-derive independently where you can; re-read the PR diff or ticket rather than trusting - the report's summary of it. - -## Workflow - -1. **Re-read the evidence independently.** Don't take the report's characterization of the - change at face value — pull the diff / ticket / CSV yourself and form your own view of - the testable behaviors and where they live. Ingest each source the same way the analyst - does (see `analyzing-test-stack/references/input-sources.md` for the CSV column mapping - and Atlassian MCP tools). 
In particular, **E2E tests live in a separate, private `test` - repo** — not inside the platform repos — so treat any existing-E2E-coverage claim as - unverified unless that repo was actually inspected. - -2. **Run the rejection criteria.** Apply every check in `references/adversarial-checklist.md` - to each per-platform recommendation and to the overall shape. For each, decide: does the - recommendation pass, or is there a concrete, evidence-backed objection? - -3. **Test the grounding.** For every behavior→layer call, confirm it ties to real evidence. - Flag any layer assignment, coverage claim, or "already tested" assertion that the - evidence does not support — especially **E2E coverage claimed without inspecting the - dedicated `test` repo**. - -4. **Pressure the shape.** Step back from individual rows: is the overall trophy right? Too - E2E-heavy (ice-cream cone)? Core logic pushed to slow layers? A whole platform's layer - missing? Trivial code over-tested? - -5. **Issue findings and a verdict.** Each finding: the specific claim challenged, why it's - wrong or unsupported (with evidence), and the corrective recommendation. Then a single - verdict: - - **Endorse** — sound and well-grounded; minor or no notes. - - **Revise** — directionally right but has specific fixable issues (list them). - - **Reject-with-reasons** — the shape or grounding is wrong enough that the team should - not act on it as written; state what a correct recommendation would require. - -6. **Write the critique into the report.** Populate the report's `#adversarial-review` - section with your findings and verdict (preserve the self-contained, no-external-deps - HTML constraint). When run standalone without the orchestrator, return the critique as - a clearly structured summary instead. - -## Principles - -- **Adversarial, not contrarian.** Push hard, but every objection carries evidence. Drop - any finding you can't support — apply the analyst's own evidence standard to yourself. 
-- **Re-derive, don't trust.** The report's summary of the diff/ticket is a claim to verify, - not a fact to accept. -- **Name the anti-pattern.** When you flag a shape problem, use the precise term - (ice-cream-cone, over-unit-testing, E2E-for-branch-coverage) so the fix is unambiguous. -- **Unverifiable is a finding.** "The report claims E2E coverage exists but the `test` repo - was never inspected" is a legitimate, important objection — surface it. diff --git a/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md b/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md deleted file mode 100644 index 7fbd307..0000000 --- a/plugins/bitwarden-test-engineer/skills/challenging-test-stack-recommendations/references/adversarial-checklist.md +++ /dev/null @@ -1,61 +0,0 @@ -# Adversarial checklist — rejection criteria - -Run every check against each per-platform recommendation and against the overall shape. -A check "fails" only when you can state a concrete, evidence-backed objection. Record the -evidence; an objection you can't ground is itself rejected. - -## Shape-level checks - -1. **Ice-cream cone (too E2E-heavy).** Is confidence concentrated in slow, flaky E2E tests - that integration or unit tests could buy more cheaply? Any behavior recommended for E2E - that is not a genuinely critical, full-system user journey is suspect — demand the - justification and propose the lower layer. - -2. **Missing platform layer.** Does an affected platform have a gap in its trophy — e.g. - server logic with no integration layer, a client with only E2E and no component/unit - coverage, core logic with nothing at all? A whole missing layer is a major finding. - -3. **Inverted cost/confidence.** Is core branching logic pushed up to integration/E2E - while trivial glue sits at lower layers? Confidence should sit at the cheapest - sufficient layer. 
- -## Row-level checks (per behavior → layer assignment) - -4. **Unit masquerading as integration (and vice-versa).** Is something labeled - "integration" actually a unit test with everything mocked (re-asserting mocks, not real - collaboration)? Or a true cross-collaborator behavior mislabeled "unit"? Mislabeling - distorts the shape and the confidence claim. - -5. **Over-testing trivial code.** Tests recommended for getters/setters, framework glue, - generated code, or invariants the type system/analyzer already guarantees. Cost without - confidence — recommend dropping or moving to static. - -6. **E2E for branch coverage.** Edge cases or error paths assigned to slow full-system - tests when they belong at unit/integration. E2E is for journeys, not branches. - -7. **Flaky-E2E candidate.** Does a recommended E2E test depend on timing, external - services, animation, network, or shared mutable state likely to make it flaky? Flag the - flakiness risk and whether an integration test with a controlled boundary would be more - reliable. - -## Grounding checks - -8. **Coverage claimed without evidence.** Any "already tested" / "existing coverage" - assertion not backed by an observed test, diff hunk, or CSV row. Especially: **E2E - coverage asserted without inspecting the dedicated private `test` repo** — that repo is - not inside the platform repos, so unexamined E2E claims are unverified by definition. - -9. **Untestable / ambiguous requirement.** A behavior recommended for testing whose - acceptance criteria are too vague to write a deterministic assertion against. The fix is - to flag the requirement gap upstream, not to write a test against a guess. - -10. **Assumption presented as fact.** Inferred platform, stack, tooling, or scope stated - without an "assumption" marker. Demand it be labeled so the reader can weigh it. - -## Verdict mapping - -- **Endorse** — no failing checks, or only cosmetic notes. 
-- **Revise** — one or more fixable row-level findings, shape essentially sound. -- **Reject-with-reasons** — a shape-level failure (ice-cream cone, missing layer, inverted - cost/confidence) or pervasive ungrounded coverage claims. State what a correct - recommendation would require. From be2db8d50a85587bfef20e776a0f2d0ff8f49908 Mon Sep 17 00:00:00 2001 From: Ned Thompson Date: Thu, 18 Jun 2026 14:14:50 -0400 Subject: [PATCH 3/9] html report changes, defer to current test stack shape rather than forcing trophy --- .cspell.json | 10 + README.md | 2 +- plugins/bitwarden-test-engineer/CHANGELOG.md | 26 +- plugins/bitwarden-test-engineer/README.md | 10 +- .../bitwarden-test-engineer/agents/AGENT.md | 57 +- .../references/input-sources.md | 41 +- .../references/report-style-tokens.md | 484 +++------------ .../references/report-style.css | 552 ++++++++++++++++++ .../scripts/build-report.sh | 202 +++++++ .../skills/analyzing-test-stack/SKILL.md | 18 +- .../references/html-report-template.md | 116 +++- .../references/monorepo-layout.md | 69 ++- .../references/severity-risk.md | 16 +- .../references/testing-trophy.md | 117 ++-- .../skills/assessing-test-coverage/SKILL.md | 8 +- .../references/coverage-report-template.md | 95 ++- .../references/finding-coverage.md | 63 +- 17 files changed, 1322 insertions(+), 564 deletions(-) create mode 100644 plugins/bitwarden-test-engineer/references/report-style.css create mode 100755 plugins/bitwarden-test-engineer/scripts/build-report.sh diff --git a/.cspell.json b/.cspell.json index b8189cd..e39c245 100644 --- a/.cspell.json +++ b/.cspell.json @@ -3,6 +3,7 @@ "version": "0.2", "words": [ "accum", + "actioned", "adf", "AKIA", "anthropics", @@ -53,6 +54,7 @@ "Gatekeeping", "GHAS", "ghsa", + "getline", "gofmt", "gradlew", "grype", @@ -63,6 +65,7 @@ "hotspots", "IDOR", "inclusivity", + "inlines", "issueIdOrKey", "issuelinks", "issuetype", @@ -76,6 +79,7 @@ "mcp", "Menlo", "metacharacters", + "mockall", "modelcontextprotocol", "msword", 
"MVVM", @@ -83,6 +87,7 @@ "mypassword", "myproject", "Newtonsoft", + "nextest", "nextPageToken", "numstat", "NVARCHAR", @@ -124,6 +129,10 @@ "startswith", "stride", "structurizr", + "stylesheet", + "subdirs", + "tablist", + "tabpanel", "tarpit", "thumbsup", "tinyui", @@ -133,6 +142,7 @@ "triaging", "unassigning", "unassigns", + "unfound", "ungroup", "unlinkable", "unresponded", diff --git a/README.md b/README.md index 61b5442..c693611 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ A curated collection of plugins for AI-assisted development at Bitwarden. Enable | [bitwarden-product-analyst](plugins/bitwarden-product-analyst/) | 0.1.5 | Product analyst agent for creating comprehensive Bitwarden requirements documents from multiple sources | | [bitwarden-security-engineer](plugins/bitwarden-security-engineer/) | 1.2.0 | Application security engineering: vulnerability triage, threat modeling, and secure code analysis | | [bitwarden-software-engineer](plugins/bitwarden-software-engineer/) | 1.0.0 | Software engineer agent for a Bitwarden product team. Implements stories, tasks, and bugs with code quality, performance, security, and team comms in mind. | -| [bitwarden-test-engineer](plugins/bitwarden-test-engineer/) | 1.0.0 | Test engineering toolkit: an orchestrator dispatches testing skills strategy and planning, automation, exploratory testing, and quality assessment. | +| [bitwarden-test-engineer](plugins/bitwarden-test-engineer/) | 1.0.0 | Test engineering toolkit: an orchestrator dispatches testing skills strategy and planning, automation, exploratory testing, and quality assessment. 
| | [claude-config-validator](plugins/claude-config-validator/) | 1.1.1 | Validates Claude Code configuration files for security, structure, and quality | | [claude-retrospective](plugins/claude-retrospective/) | 1.1.1 | Analyze Claude Code sessions to identify successful patterns and improvement opportunities | diff --git a/plugins/bitwarden-test-engineer/CHANGELOG.md b/plugins/bitwarden-test-engineer/CHANGELOG.md index 12cf16b..9b3ab89 100644 --- a/plugins/bitwarden-test-engineer/CHANGELOG.md +++ b/plugins/bitwarden-test-engineer/CHANGELOG.md @@ -26,15 +26,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 never a repo-wide sweep), it discovers each repo's test conventions config-first, buckets every observed test by layer, cites it as a stable GitHub permalink (commit SHA, not branch), records untested behaviors as `unverified` gaps, and writes its own self-contained - HTML **coverage report** (`test-coverage-report--.html`) following + HTML **coverage report** (`test-coverage-report---.html`) following `references/coverage-report-template.md`. Usable standalone to audit current coverage, and consumed by `analyzing-test-stack`. Owns convention discovery, existing-test finding, and the GitHub permalink citation rules (in `references/finding-coverage.md`) — concerns kept separate from the trophy recommendation. - Plugin-level shared `references/`: `input-sources.md` (evidence-source ingestion, used by - both skills and the agent) and `report-style-tokens.md` (the single off-brand data-report - styling system both the coverage report and the test-stack report inline verbatim, so the - two read as one instrument). + both skills and the agent), `report-style.css` (the single off-brand data-report stylesheet + both reports use) and `report-style-tokens.md` (its design contract). 
The + `scripts/build-report.sh` build script splices `report-style.css` into each report so the + stylesheet is never reproduced as model output and the coverage and test-stack reports + cannot drift — they read as one instrument. +- Combined two-tab report: when the agent runs end to end, the `test-combined` build mode + stitches the two standalone reports into one page with _Current coverage_ and + _Recommended coverage_ tabs (CSS-only, no JavaScript; stacks both views on print). It is a + presentation-only merge assembled from the finished report files — each skill still authors + and builds its own standalone report unchanged, so the split between coverage and + recommendation stays intact. The tab chrome lives entirely in `report-style.css` and the + build script; no skill or template knows about tabs. - `analyzing-test-stack` skill: consumes the coverage inventory from `assessing-test-coverage`, then maps a change's testable behaviors to the cheapest sufficient Testing Trophy layer (static, unit, integration, E2E) per platform and emits @@ -50,11 +59,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 layer→repo map, evidence-source ingestion, and the HTML report template. The Atlassian `search_confluence` / `search_confluence_cql` tools back locating a breakdown by feature/team name when only a name (not a page ID) is given. +- Linked table of contents (`.toc`) at the top of every report's `
`, linking to + each section; in the combined report the build script namespaces the ToC's anchors per tab so + each panel's ToC jumps within its own panel. - Top-of-report `#overview` synthesis section, written by the analyst: a 2–4 sentence recap of the recommended shape per platform, the top 3 open risks (drawn from `#gaps`), and anchor links into the detail sections, so readers see the bottom line without scrolling. The overview is additive — per-behavior detail stays in `#recommendations`/`#gaps`. -- Per-layer model governance to optimize token spend: the agent runs on Opus - (its context drives the analysis and the recommendation), while the fan-out +- Per-layer model governance to optimize token spend: the agent inherits the session model + for its own context (which drives the analysis and the recommendation), while the fan-out evidence subagents are assigned explicitly — `sonnet` for sources that read a diff, ticket, - or repo, `haiku` for pure CSV parsing — rather than inheriting Opus. + or repo, `haiku` for pure CSV parsing — rather than inheriting the orchestrator's model. diff --git a/plugins/bitwarden-test-engineer/README.md b/plugins/bitwarden-test-engineer/README.md index f18e06e..86b3336 100644 --- a/plugins/bitwarden-test-engineer/README.md +++ b/plugins/bitwarden-test-engineer/README.md @@ -80,10 +80,12 @@ automated and at what layer? ``` Each run produces two self-contained HTML files in the current working directory: a -`test-coverage-report--.html` (what is already tested — observed tests per layer, -each cited as a GitHub permalink, plus gaps) and a `test-stack-report--.html` (the -per-platform recommendation and its coverage-gap findings). Both share one off-brand -data-report visual system so they read as the same instrument. 
+`test-coverage-report-<slug>-<date>-<HHMMSS>.html` (what is already tested — observed tests per +layer, each cited as a GitHub permalink, plus gaps) and a +`test-stack-report-<slug>-<date>-<HHMMSS>.html` (the per-platform recommendation and its +coverage-gap findings). The `HHMMSS` time suffix is stamped at build time, so re-running on the +same day never overwrites a prior report. Both share one off-brand data-report visual system so +they read as the same instrument. ## References diff --git a/plugins/bitwarden-test-engineer/agents/AGENT.md b/plugins/bitwarden-test-engineer/agents/AGENT.md index fdc1dd6..31a1d92 100644 --- a/plugins/bitwarden-test-engineer/agents/AGENT.md +++ b/plugins/bitwarden-test-engineer/agents/AGENT.md @@ -39,7 +39,7 @@ description: | Tech-breakdown intake. The agent fetches the Confluence breakdown via the Atlassian MCP, extracts testable behaviors and the affected platforms from Part 2, then runs the analyst to emit the report. -model: opus +model: inherit tools: - Read - Write @@ -57,6 +57,7 @@ tools: - Bash(git remote get-url:*) - Bash(git -C * rev-parse:*) - Bash(git -C * remote get-url:*) + - Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*) - mcp__bitwarden-atlassian__get_issue - mcp__bitwarden-atlassian__search_issues - mcp__bitwarden-atlassian__get_issue_comments @@ -70,13 +71,13 @@ skills: color: green --- -You are a test automation strategist for Bitwarden. 
Your job is to take a change — a feature, a bugfix, a refactor, or a migration — and tell the team **what to test, at which layer, and why**, across three layers: a unit layer for pure logic, an integration layer for collaborator wiring, and a thin E2E layer reserved for critical user journeys. How those layers are weighted is **per repo, not one universal trophy** — Bitwarden's repos span unit-heavy pyramids (`server`, `clients`, `sdk-internal`, `android`), an integration + snapshot trophy (`ios`), and all-E2E repos (`test`, `browser-interactions-testing`). -You do not write the tests. You produce a recommendation — an HTML report — that an engineer or QA can act on. Ground every layer call in evidence and keep the trophy shape honest, because a test plan tends to drift toward whatever is easiest to write rather than what actually buys confidence. +You do not write the tests. You produce a recommendation — an HTML report — that an engineer or QA can act on. Ground every layer call in evidence and keep each repo's shape honest, because a test plan tends to drift toward whatever is easiest to write rather than what actually buys confidence. ## Operating context -Bitwarden's code is split across several repositories, each with its own platform, stack, and test tooling. Assume the user works in a multi-repo layout such as `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`, and similar. A single feature frequently spans more than one of these (e.g. a server endpoint plus a web client plus a mobile screen), and each platform's trophy is shaped independently. +Bitwarden's code is split across several repositories, each with its own platform, stack, and test tooling. Assume the user works in a multi-repo layout such as `bitwarden/server`, `bitwarden/clients`, `bitwarden/ios`, and similar. A single feature frequently spans more than one of these (e.g. 
a server endpoint plus a web client plus a mobile screen), and each repo is shaped independently — match the recommendation to that repo's actual practice (`monorepo-layout.md` → _Each repo's test shape in practice_), not a single house style. **Where each layer lives:** unit and integration live alongside the code in each platform repo; **E2E lives in the dedicated `test` repo** (sibling of the platform repos). See `${CLAUDE_PLUGIN_ROOT}/skills/analyzing-test-stack/references/monorepo-layout.md` for the per-platform stack, tooling, and the layer→repo map. @@ -94,43 +95,65 @@ Then determine the **affected repos/platforms**. If scope is genuinely ambiguous Spawn `Task` subagents **in parallel**, one per evidence source or affected repo, so your own context stays lean. Each subagent returns a compact structured digest (not raw dumps). Typical fan-out: -- **Requirements reader** (model: `sonnet`) — resolves the Jira issue into testable behaviors and acceptance criteria, expanding Epics/Features to their children and feeding any linked PR URLs to the PR diff analyzer downstream. Captures the **severity** assigned on a bug/defect ticket so the recommendation can be risk-weighted. Follows the recipe in `${CLAUDE_PLUGIN_ROOT}/references/input-sources.md` → _Epic intake_. +- **Requirements reader** (model: `sonnet`) — resolves the Jira issue into testable behaviors and acceptance criteria, expanding Epics/Features to their children and feeding any linked PR URLs to the PR diff analyzer downstream. Captures the **severity** assigned on a bug/defect ticket so the recommendation can be risk-weighted, and the **source issue key + browse URL** for each behavior (for an Epic, the specific child the behavior came from) so the report can link every behavior back to its requirement. Follows the recipe in `${CLAUDE_PLUGIN_ROOT}/references/input-sources.md` → _Epic intake_ and _Citing Jira issues as links_. 
- **Breakdown reader** (model: `sonnet`) — fetches the tech breakdown via `mcp__bitwarden-atlassian__get_confluence_page` (searching first with `search_confluence`/`search_confluence_cql` when given only a name), then mines Part 2's scope checklist for the surfaces touched, the relevant Part 4 spec child pages for interfaces, and Part 5's open questions for untestable-requirement risk. Returns testable behaviors per platform plus the breakdown's status. - **PR diff analyzer** (model: `sonnet`) — `gh pr diff` / `gh pr view` to extract the change surface, public API touched, and tests already present. - **CSV parser** (model: `haiku`) — reads the export and buckets existing cases by apparent layer and automation status. Give each subagent a single source and a tight output contract. Skip any branch whose input was not supplied. -**Set each subagent's model explicitly** — `haiku` for the CSV parser, `sonnet` for the rest. Never let a digest-returning subagent inherit Opus. See _Model selection_ below for the rationale. +**Set each subagent's model explicitly** — `haiku` for the CSV parser, `sonnet` for the rest. Never let a digest-returning subagent inherit the orchestrator's model. See _Model selection_ below for the rationale. ### 3. Assess existing coverage -Once the change surface is known (the diff paths/symbols and named components from step 2), determine what is **already tested** before recommending anything new. Fan out a **per-repo coverage scout** (model: `sonnet`) for each affected platform repo, each applying the `assessing-test-coverage` skill: read the repo's Claude config for conventions, establish coverage **PR-first then via a targeted lookup scoped to the change surface** (never a repo-wide sweep), inspect the sibling `test` repo for E2E, and return a **permalink record per cited test** (`{ path, start_line, end_line, owner_repo, sha, layer, permalink }`, or `{ path, unlinkable_reason }` when an ingredient is missing) plus `unverified` gaps. 
The output contract, the PR-first/targeted-lookup discipline, and the SHA/`owner-repo` permalink recipe all live in `${CLAUDE_PLUGIN_ROOT}/skills/assessing-test-coverage/references/finding-coverage.md` — the scouts follow it; don't restate it here. Merge the scouts' records into a single coverage inventory. +Once the change surface is known (the diff paths/symbols and named components from step 2), determine what is **already tested** before recommending anything new. Fan out a **per-repo coverage scout** (model: `sonnet`) for each affected platform repo, each applying the `assessing-test-coverage` skill: read the repo's Claude config for conventions, establish coverage **PR-first then via a targeted lookup scoped to the change surface** (never a repo-wide sweep), inspect the sibling `test` repo for E2E, and return **one record per behavior** — its layer, an approximate count, and 1–3 representative permalinks (`{ behavior, platform, layer, status, count, representative: [{ path, start_line, end_line, owner_repo, sha, permalink }] }`) plus `unverified` gaps. **Scouts must establish coverage per behavior and stop as soon as it's confirmed — never enumerate every test method in a covered area** (this is the dominant cost control; a behavior backed by 40 tests is one record with a count of ~40 and 3 exemplars, not 40 records). The output contract, the per-behavior discipline, the PR-first/targeted-lookup rule, and the SHA/`owner-repo` permalink recipe all live in `${CLAUDE_PLUGIN_ROOT}/skills/assessing-test-coverage/references/finding-coverage.md` — the scouts follow it; don't restate it here. Merge the scouts' per-behavior records into a single coverage inventory. -This step depends on step 2's change surface, so run it after the evidence fan-out (not interleaved). Scouts capture the SHA via `git -C rev-parse HEAD` and `owner/repo` via `git -C remote get-url origin`. 
Then invoke `Skill(assessing-test-coverage)` with the merged inventory and today's date: it writes a **self-contained HTML coverage report** to the current working directory as `test-coverage-report--.html` (the backward-looking inventory — observed tests per layer with permalinks, plus `unverified` gaps) and returns the inventory records for step 4. The scouts do the gathering; the skill assembles the report. Pass today's date — skills cannot read the clock. +This step depends on step 2's change surface, so run it after the evidence fan-out (not interleaved). Scouts capture the SHA via `git -C rev-parse HEAD` and `owner/repo` via `git -C remote get-url origin`. Then invoke `Skill(assessing-test-coverage)` with the merged inventory and today's date to produce the backward-looking coverage inventory (observed tests per layer with permalinks, plus `unverified` gaps) and the **self-contained HTML coverage report** — a `test-coverage-report---.html` file in the current working directory. The skill returns the inventory records for step 4. Per the skill, the actual HTML _rendering_ is delegated to the Sonnet **report-writer subagent** (see _Model selection_) — only the gathering and inventory merge happen in your context. Pass today's date — skills cannot read the clock; the build script stamps the `HHMMSS` suffix so the file is always fresh. ### 4. Recommend -Invoke `Skill(analyzing-test-stack)` with the gathered digests **and the coverage inventory from step 3**. 
It maps each testable behavior to the cheapest sufficient trophy layer per platform, **risk-weighted by each behavior's severity** (the impact a defect would carry — read from a bug's Jira severity field or assessed against Bitwarden's severity guide; see the skill's `references/severity-risk.md`), names concrete tooling, surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, mislabeled layers, ungrounded coverage claims) ordered by severity, and writes a **self-contained HTML report** (inline CSS, no external dependencies) to the current working directory as `test-stack-report--.html`. The analyst writes the report's `#overview` itself. Pass today's date to the skill — skills cannot read the clock themselves. +Invoke `Skill(analyzing-test-stack)` with the gathered digests **and the coverage inventory from step 3**. The behavior→layer mapping is the genuinely hard reasoning and **stays in your own (orchestrator) context**: it maps each testable behavior to the cheapest sufficient trophy layer per platform, **risk-weighted by each behavior's severity** (the impact a defect would carry — read from a bug's Jira severity field or assessed against Bitwarden's severity guide; see the skill's `references/severity-risk.md`), names concrete tooling, and surfaces coverage gaps and trophy-wrong shapes (ice-cream-cone, mislabeled layers, ungrounded coverage claims) ordered by severity. Once that mapping is decided, rendering it into the **self-contained HTML report** (`test-stack-report---.html` in the current working directory) is mechanical and is delegated to the Sonnet **report-writer subagent** (see _Model selection_) — hand it the decided per-behavior records, each carrying its `source_issue` (key + URL) from intake, and the `#overview` synthesis to lay out; it authors the fragment, linking every Jira item and every Jira-sourced behavior to its browse URL per the template, and runs the build script. 
Pass today's date to the skill — skills cannot read the clock; the build script stamps the `HHMMSS` suffix. -### 5. Present +### 5. Combine and present -The run produces **two self-contained HTML files** in the current working directory: the `test-coverage-report-*.html` (what is already tested, from step 3) and the `test-stack-report-*.html` (the recommendation, from step 4). Mirror the test-stack report's `#overview` in chat: the recommended shape per platform, the top open risks the user should resolve before committing to the plan, and any coverage the analyst could not verify. Point the user at both files — the coverage report for the existing-test detail, the test-stack report for the per-behavior recommendation. +Steps 3 and 4 each emit a self-contained HTML file in the current working directory: the `test-coverage-report-<slug>-<date>-<HHMMSS>.html` (what is already tested) and the `test-stack-report-<slug>-<date>-<HHMMSS>.html` (the recommendation). Each filename carries the build script's timestamp, so re-running never overwrites a prior report. + +Then assemble the **combined two-tab page** — the primary deliverable, with _Current coverage_ (the coverage report) and _Recommended coverage_ (the test-stack report) on one page. Run the build script yourself (it is pure file assembly — no template or stylesheet reading, so your context stays lean) with the two filenames the prior steps printed: + +```bash +"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \ + --kind test-combined --slug <slug> --date <date> \ + --current <test-coverage-report-file> \ + --recommended <test-stack-report-file> +``` + +This writes `test-combined-report-<slug>-<date>-<HHMMSS>.html`; the two standalone reports are read, not modified, and remain available. Use the exact filenames the build script printed. + +Mirror the test-stack report's `#overview` in chat: the recommended shape per platform, the top open risks the user should resolve before committing to the plan, and any coverage the analyst could not verify.
Point the user at the **combined page** first (both views in one file), and note the two standalone reports are also available for sharing a single view. ## Principles - **Evidence over assertion.** Every recommended layer ties back to a specific behavior, requirement, diff hunk, or existing test. Flag anything you could not ground. -- **Cheapest sufficient layer.** Push confidence down the trophy — prefer integration over E2E, unit over integration — unless a behavior genuinely requires the higher layer. +- **Cheapest sufficient layer, inside the repo's shape.** Push confidence down — prefer integration over E2E, unit over integration — unless a behavior genuinely requires the higher layer, then land the call inside the target repo's actual shape (pyramid for `server`/`sdk-internal`/`clients`/`android`, integration + snapshot for `ios`, all-E2E for `test`/`browser-interactions-testing`). - **Risk-weighted by severity.** Coverage rigor scales with the impact a defect would carry, not with how urgently it ships. Critical behaviors (core flows, data integrity, security) owe their failure modes full coverage and lead the gap list; Low behaviors earn minimal coverage and never an E2E test. Severity (impact) ≠ priority (urgency). -- **Degrade gracefully.** A missing input (no Jira MCP, no PR, no CSV, no `test` repo checkout) narrows the analysis; it never blocks it. State what you could not see. +- **Degrade gracefully.** A missing input (no `bitwarden-atlassian-tools` MCP, no PR, no CSV, no `test` repo checkout) narrows the analysis; it never blocks it. State what you could not see. - **Read repo config first.** When the analysis touches a checked-out codebase, the coverage scouts read its Claude config (root `CLAUDE.md`, `.claude/`, and nested `CLAUDE.md` for the touched subdirs) before opening test files, and honor its test conventions over generic defaults. Explore test files only as a fallback for conventions the config doesn't cover. 
See `${CLAUDE_PLUGIN_ROOT}/skills/assessing-test-coverage/references/finding-coverage.md` → _Discovering a repo's test conventions_. - **Coverage before recommendation.** Assess what already exists (step 3) before mapping new layers (step 4); the recommendation is incremental against observed coverage, not absolute. ## Model selection -Model spend is governed here in the plugin, not left to the session default. The split: +This agent **inherits the session model** for its own context — the orchestration and the hard reasoning run on whatever model the user set the session to. What the plugin governs explicitly is the model of every subagent you fan out, so the cheap, high-volume work never runs at the orchestrator's rate. The split: + +- **You (the test-engineer agent) keep the genuinely hard work in your own context** — classifying intake, then mapping behaviors to the cheapest sufficient layer across multiple platforms, risk-weighted by severity. This is cross-repo strategic reasoning where a wrong recommendation is expensive to act on, so it stays with the orchestrator rather than being delegated to a subagent. +- **Evidence-gathering subagents run on Sonnet or Haiku.** Everything you fan out for evidence gathering returns a compact digest. Sonnet handles anything that reads a diff, ticket, or repo; Haiku handles pure parsing. Assign the model explicitly on every `Task` (see step 2) rather than letting it inherit the orchestrator's model. +- **Report rendering runs on Sonnet — the report-writer subagent.** Once the coverage inventory (step 3) and the behavior→layer/severity mapping (step 4) are decided, turning them into HTML is **mechanical formatting, not reasoning**, and is delegated rather than done in your own context.
Dispatch a `Task` (model: `sonnet`) report-writer that receives the decided structured records (plus the `#overview` synthesis you wrote), authors the report **content fragment** per the skill's template, and runs `${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh` to splice in the stylesheet and emit the file. The stylesheet itself is a static file the build script inlines — it is never reproduced as model output by anyone, on any model. + +Rule of thumb: push the cheap, high-volume gathering **and the mechanical report rendering** down to explicitly-pinned Sonnet/Haiku subagents; keep only the irreducible layer/severity reasoning in the orchestrator context. + +## Keep your orchestrator context lean -- **You (the test-engineer agent) run on Opus.** Your context is where the genuinely hard work happens: classifying intake, then running `analyzing-test-stack` — mapping behaviors to the cheapest sufficient layer across multiple platforms — all in _your_ context, so your model sets its quality. This is cross-repo strategic reasoning where a wrong recommendation is expensive to act on; it justifies Opus. -- **Subagents run on Sonnet or Haiku.** Everything you fan out is evidence gathering that returns a compact digest. Sonnet handles anything that reads a diff, ticket, or repo; Haiku handles pure parsing. Assign the model explicitly on every `Task` (see step 2) rather than letting it inherit Opus. +Your own context is the most expensive token pool in the run — what you read into it and re-emit is re-cached on every subsequent turn. Three rules: -Rule of thumb: push the cheap, high-volume gathering down to Sonnet/Haiku; keep only the irreducible reasoning on Opus. +- **Never read the rendering files into your context.** The report templates (`html-report-template.md`, `coverage-report-template.md`), `report-style-tokens.md`, `report-style.css`, and `build-report.sh` are the **report-writer subagent's** concern only — it reads them. 
You only need the reasoning references (`testing-trophy.md`, `severity-risk.md`, `monorepo-layout.md`, `input-sources.md`, and `finding-coverage.md` for the contract). Loading the templates or stylesheet into your context is wasted cache. (The combined-page build in step 5 is the one time you _invoke_ `build-report.sh` directly — but you only run it on the two finished report filenames; you still never read its source or the rendering files.) +- **Don't restate digests.** Subagents return compact digests; synthesize them into the decision, don't echo them back to the user mid-run. Keep inter-step narration to a few lines — the reports are the deliverable, not a running commentary. +- **Hand off by the smallest payload.** Pass report-writers the compact per-behavior records (now small by design) and the `#overview` text. If a record set is still large, `Write` it to a temp file (e.g. `./.test-engineer-.json`) and pass the path instead of pasting the blob into the prompt. diff --git a/plugins/bitwarden-test-engineer/references/input-sources.md b/plugins/bitwarden-test-engineer/references/input-sources.md index 9724f09..7938b95 100644 --- a/plugins/bitwarden-test-engineer/references/input-sources.md +++ b/plugins/bitwarden-test-engineer/references/input-sources.md @@ -20,6 +20,13 @@ Otherwise use the MCP tools directly: Extract: discrete **testable behaviors**, **acceptance criteria**, and the **platforms/ components** named. If the MCP is unavailable, ask the user to paste the requirements. +Also capture, for every issue you read, its **key and browse URL** (prefer the URL the MCP/skill +returns; otherwise construct `https://bitwarden.atlassian.net/browse/`), and **carry the +originating issue key with each behavior you extract**. The report links every behavior back to +the Jira item it came from — see _Citing Jira issues as links_ below — so provenance must survive +intake. A behavior that traces to no Jira item (e.g. 
found only in a PR diff) simply carries no +source issue. + Also capture **severity** — for a bug/defect ticket, read the severity assigned on the issue (the severity field, or the QA/reporter's stated severity in the description/comments) and carry it with the behaviors; for a feature/story without a defect, leave it to the analyst to @@ -46,7 +53,9 @@ before extracting: not re-derive it. 3. **Per child, gather behaviors and PRs.** - `mcp__bitwarden-atlassian__get_issue` for the child's description and acceptance - criteria — these are the testable behaviors for the trophy. + criteria — these are the testable behaviors for the trophy. Capture each child's **key and + browse URL** and carry it with the behaviors it produces, exactly as for a single-issue + intake — a behavior sourced from a child issue links to that child, not the epic. - `mcp__bitwarden-atlassian__get_issue_remote_links` for PRs (grouped under "GitHub"). Each PR URL becomes an input to the **GitHub PR** branch below: hand it off to `gh pr view` / `gh pr diff` so the actual change surface and any tests-in-PR feed the @@ -154,3 +163,33 @@ Map rows to behaviors and bucket each by apparent layer using the `analyzing-tes Flag cases that are currently manual but cheaply automatable at a lower layer, and cases slated for E2E that would be better as integration. If a column's meaning is ambiguous, state the interpretation you used rather than guessing silently. + +## Citing Jira issues as links + +Every Jira item the report **names** — and every behavior the report shows that was **found from +a Jira item** — is rendered as a clickable link to that item, never as bare key text. This is the +Jira counterpart to the GitHub permalink rule for tests (the `assessing-test-coverage` skill's +`references/finding-coverage.md` → _Citing tests as GitHub permalinks_). + +The link form is the issue's browse URL: + +``` +https://bitwarden.atlassian.net/browse/<KEY> +``` + +where `<KEY>` is the issue key (e.g. `PM-1234`).
Prefer the URL the MCP tool or +`bitwarden-atlassian-tools:researching-jira-issues` skill returns for the issue; fall back to +constructing the browse URL from the key. The same rule covers epics and their children — link +each to its own key. + +Apply it everywhere the report renders one of these: + +- An **issue, epic, or child key** named in the Overview, Summary, or Evidence sections — + anchor the key: `<a href="https://bitwarden.atlassian.net/browse/PM-1234">PM-1234</a>`. +- A **behavior row** (in the recommendations/coverage and gaps sections) whose behavior was + extracted from a Jira item — append the linked source key to the behavior cell so a reader can + jump to the requirement it came from. A behavior with no Jira source (PR-only) carries no key. + +These are informational `<a href>` citations — text, not loaded assets — so they do not violate +the reports' self-contained / no-remote-resources constraint. Never fabricate a key or URL; if an +issue's key is unknown, name the source in plain text rather than inventing a link. diff --git a/plugins/bitwarden-test-engineer/references/report-style-tokens.md b/plugins/bitwarden-test-engineer/references/report-style-tokens.md index 4957f9f..117303e 100644 --- a/plugins/bitwarden-test-engineer/references/report-style-tokens.md +++ b/plugins/bitwarden-test-engineer/references/report-style-tokens.md @@ -1,11 +1,20 @@ # Report style tokens — data-report visual system for HTML reports -This file is the **single source of styling truth** for every self-contained HTML report the +This file documents the **visual system** for every self-contained HTML report the `bitwarden-test-engineer` plugin emits — the `analyzing-test-stack` test-stack report and the `assessing-test-coverage` coverage report alike.
The HTML output requirements (single file, inline CSS, no external/CDN assets, no web fonts, no JS) mean a report cannot `` to a -design system at runtime — instead, **inline the stylesheet block at the bottom of this file -verbatim** into the report's ` +``` + +Write that fragment to a temporary path (e.g. `-report-.fragment.html`), then run +the build script from the plugin root: + +```bash +"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \ + --kind --slug --date \ + ``` +The script replaces the sentinel with `report-style.css` verbatim and writes +`-report---.html` to the current working directory, printing the +final filename to stdout. The `` suffix is stamped from the wall clock by the script +(the model cannot read the clock), so **every run gets a fresh filename** — a report is never +overwritten, and an existing report never has to be read back and regenerated. Delete the +temporary fragment afterward. If the script errors (missing sentinel, bad `--kind`/`--date`, +fragment not found) it writes nothing — fix the fragment and re-run rather than falling back to +pasting CSS by hand. + +To assemble the **combined two-tab page** from the two already-built standalone reports, call +the script with `--kind test-combined` and the two finished report files (no fragment, no +sentinel — the bodies are reused as-is): + +```bash +"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \ + --kind test-combined --slug --date \ + --current \ + --recommended +``` + +It writes `test-combined-report---.html` and prints the filename. The two +input reports are read, not modified, and their standalone files remain. + ## What not to do - Do not reintroduce a brand skin — no saturated brand blue/yellow, no logo images, no @@ -490,7 +191,8 @@ ul.shapes .plat { encoding. - Do not introduce web fonts, CDN links, or `` — the single-file constraint is binding. -- Do not narrow the stylesheet down to "only the classes this report uses." 
The template - ships the full stylesheet so a reader inspecting any report sees the same system. +- Do not paste, retype, or trim the stylesheet into the fragment — the fragment carries only + the sentinel, and the build script supplies the full stylesheet. A report that ships a + hand-copied or "only the classes I used" stylesheet is exactly how two reports drift apart. - Do not hand-compute the distribution bar widths in pixels or percentages — set `flex: ` per segment and let the browser normalize. diff --git a/plugins/bitwarden-test-engineer/references/report-style.css b/plugins/bitwarden-test-engineer/references/report-style.css new file mode 100644 index 0000000..ad98427 --- /dev/null +++ b/plugins/bitwarden-test-engineer/references/report-style.css @@ -0,0 +1,552 @@ +:root { + /* Surfaces & ink — flat paper, no cards or shadows */ + --paper: #ffffff; + --panel: #f4f6f8; + --ink: #16191d; + --ink-soft: #585f68; + --ink-faint: #818892; + --rule: #e4e7ea; + + /* Layer ramp — SEQUENTIAL: ordered cheap/shallow -> costly/deep */ + --unit: #8fb3d1; + --integration: #3f7196; + --e2e: #1d3a54; + --on-unit: #16191d; /* --unit is light: use dark text */ + --on-deep: #ffffff; /* white text on integration/e2e */ + + /* Verdict & state — muted categorical */ + --ok: #43875a; + --warn: #b07d2f; + --bad: #bf564a; + --on-state: #ffffff; + + --link: #2f6e9e; + + --sans: + system-ui, -apple-system, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; + --mono: + ui-monospace, "SF Mono", SFMono-Regular, Menlo, Consolas, "Liberation Mono", + monospace; +} + +* { + box-sizing: border-box; +} +html { + -webkit-text-size-adjust: 100%; + scroll-padding-top: 24px; /* keep anchored sections clear of the top edge */ +} + +body { + margin: 0; + background: var(--paper); + color: var(--ink); + font: 15px/1.6 var(--sans); + font-feature-settings: "tnum" 1; /* tabular figures where supported */ + -webkit-font-smoothing: antialiased; + text-rendering: optimizeLegibility; +} + +/* Smooth 
in-page jumps for the report's overview -> section anchor links, + suppressed when the reader prefers reduced motion. */ +@media (prefers-reduced-motion: no-preference) { + html { + scroll-behavior: smooth; + } +} + +a { + color: var(--link); + text-decoration: underline; + text-underline-offset: 2px; + text-decoration-thickness: 1px; +} +a:focus-visible, +summary:focus-visible { + outline: 2px solid var(--link); + outline-offset: 2px; +} + +/* Masthead */ +header { + max-width: 60rem; + margin: 0 auto; + padding: clamp(36px, 7vw, 56px) clamp(20px, 5vw, 32px) 28px; +} +header .eyebrow { + margin: 0 0 14px; + font: 600 11px/1 var(--mono); + letter-spacing: 0.18em; + text-transform: uppercase; + color: var(--ink-faint); +} +header h1 { + margin: 0 0 12px; + font-size: clamp(24px, 5vw, 32px); + line-height: 1.2; + font-weight: 650; + letter-spacing: -0.01em; + text-wrap: balance; +} +header .meta { + font: 12px/1.6 var(--mono); + color: var(--ink-soft); +} +header .meta a { + color: var(--ink-soft); +} + +/* In-page table of contents — a compact monospace row of section links at the + top of
. In the combined report the build script namespaces each link's + href per tab, so a panel's ToC jumps within its own panel. */ +.toc { + display: flex; + flex-wrap: wrap; + gap: 6px 18px; + margin: 0 0 4px; + padding: 0 0 20px; + border-bottom: 1px solid var(--rule); + font: 600 11px/1.6 var(--mono); + letter-spacing: 0.08em; + text-transform: uppercase; +} +.toc a { + color: var(--ink-soft); + text-decoration: none; +} +.toc a:hover { + color: var(--link); + text-decoration: underline; +} + +/* Sections — flat, hairline-separated, auto-numbered */ +main { + max-width: 60rem; + margin: 0 auto; + padding: 0 clamp(20px, 5vw, 32px) 96px; + counter-reset: sec; +} +section { + counter-increment: sec; + padding: 36px 0; + border-top: 1px solid var(--rule); + scroll-margin-top: 24px; +} +section:first-of-type { + border-top: 0; +} +/* Quiet landing cue: briefly tint a section an in-page link jumped to. */ +@media (prefers-reduced-motion: no-preference) { + section:target { + animation: section-land 1.4s ease-out; + } + @keyframes section-land { + from { + background: var(--panel); + } + to { + background: transparent; + } + } +} +section > h2 { + margin: 0 0 18px; + font-size: 19px; + font-weight: 650; + letter-spacing: -0.01em; + text-wrap: balance; +} +section > h2::before { + content: counter(sec, decimal-leading-zero); + display: inline-block; + margin-right: 12px; + font: 600 12px/1 var(--mono); + letter-spacing: 0.1em; + color: var(--ink-faint); + vertical-align: 2px; +} +section h3 { + margin: 28px 0 10px; + font: 600 11px/1.3 var(--mono); + letter-spacing: 0.12em; + text-transform: uppercase; + color: var(--ink-soft); +} + +/* Prose */ +p { + margin: 0 0 14px; + max-width: 72ch; + text-wrap: pretty; /* avoid orphans / ragged short last lines */ +} +.lead { + font-size: 16px; +} +.small { + font-size: 12.5px; + color: var(--ink-soft); +} +ul.tight { + margin: 8px 0 16px; + padding-left: 20px; +} +ul.tight li { + margin: 0 0 6px; +} +ol { + padding-left: 22px; +} 
+ol li { + margin: 0 0 10px; +} +code { + font: 0.86em var(--mono); + background: var(--panel); + padding: 1px 5px; + border-radius: 3px; +} + +/* Tables — heavy header rule, hairline rows */ +.scroll { + overflow-x: auto; + -webkit-overflow-scrolling: touch; + overscroll-behavior-x: contain; +} +table { + width: 100%; + border-collapse: collapse; + margin: 4px 0 18px; + font-size: 13.5px; +} +thead th { + text-align: left; + vertical-align: bottom; + padding: 0 12px 8px; + font: 600 10.5px/1.3 var(--mono); + letter-spacing: 0.1em; + text-transform: uppercase; + color: var(--ink-faint); + border-bottom: 1px solid var(--ink); +} +tbody td { + vertical-align: top; + padding: 10px 12px; + border-bottom: 1px solid var(--rule); +} +tbody tr:hover { + background: var(--panel); +} +th:first-child, +td:first-child { + padding-left: 0; +} +th:last-child, +td:last-child { + padding-right: 0; +} + +/* Layer chip */ +.layer { + display: inline-block; + font: 600 10.5px/1.6 var(--mono); + letter-spacing: 0.08em; + text-transform: uppercase; + padding: 2px 8px; + border-radius: 2px; + white-space: nowrap; +} +.layer.unit { + background: var(--unit); + color: var(--on-unit); +} +.layer.integration { + background: var(--integration); + color: var(--on-deep); +} +.layer.e2e { + background: var(--e2e); + color: var(--on-deep); +} + +/* Layer-distribution chart (the signature graphic) */ +figure { + margin: 18px 0; +} +figcaption { + margin-bottom: 14px; + font: 11px/1.4 var(--mono); + letter-spacing: 0.04em; + color: var(--ink-faint); +} +.dist .legend { + display: flex; + flex-wrap: wrap; + gap: 18px; + margin-bottom: 14px; + font: 11px/1 var(--mono); + color: var(--ink-soft); +} +.dist .legend .key { + display: inline-flex; + align-items: center; + gap: 6px; + text-transform: uppercase; + letter-spacing: 0.06em; +} +.dist .legend .key::before { + content: ""; + width: 10px; + height: 10px; + border-radius: 2px; + background: var(--rule); +} +.dist .legend .unit::before { + 
background: var(--unit); +} +.dist .legend .integration::before { + background: var(--integration); +} +.dist .legend .e2e::before { + background: var(--e2e); +} +.dist-row { + display: flex; + align-items: center; + gap: 14px; + margin: 7px 0; +} +.dist-row .dist-label { + flex: 0 0 14ch; + text-align: right; + font: 11px/1.3 var(--mono); + color: var(--ink-soft); + word-break: break-word; +} +.dist-row .bar { + flex: 1; + display: flex; + height: 24px; + background: var(--panel); + border-radius: 3px; + overflow: hidden; +} +.bar .seg { + display: flex; + align-items: center; + justify-content: center; + min-width: 18px; + font: 600 11px/1 var(--mono); + color: var(--on-deep); +} +.bar .seg.unit { + background: var(--unit); + color: var(--on-unit); +} +.bar .seg.integration { + background: var(--integration); +} +.bar .seg.e2e { + background: var(--e2e); +} + +/* Per-platform recommended-shape list (replaces card blocks) */ +ul.shapes { + margin: 6px 0 0; + padding: 0; + list-style: none; +} +ul.shapes li { + padding: 10px 0; + border-top: 1px solid var(--rule); +} +ul.shapes li:first-child { + border-top: 0; +} +ul.shapes .plat { + font: 600 13px/1.5 var(--mono); +} + +/* Badges */ +.badge { + display: inline-block; + font: 600 10px/1.5 var(--mono); + letter-spacing: 0.04em; + text-transform: uppercase; + padding: 1px 6px; + border-radius: 2px; + color: var(--on-state); + white-space: nowrap; +} +.badge.assumption { + background: var(--warn); +} +.badge.warn { + background: var(--bad); +} +.badge.ok { + background: var(--ok); +} + +/* Unlinkable evidence */ +.unlinkable { + font: italic 12px/1.4 var(--mono); + color: var(--ink-faint); +} + +/* Tabbed combined report — the Current-coverage and Recommended-coverage report + bodies surfaced as two tabs on one page, CSS-only (no JavaScript). The radio + inputs are visually hidden but keep keyboard focus; the checked input drives + both the active label and which panel shows. 
These rules are only exercised by + the combined report; they are inert in the standalone coverage/test-stack + reports, which never emit these elements. */ +.tab-input { + position: absolute; + width: 1px; + height: 1px; + margin: -1px; + opacity: 0; +} +.tablist { + max-width: 60rem; + margin: 0 auto; + padding: 0 clamp(20px, 5vw, 32px); + display: flex; + flex-wrap: wrap; + gap: 4px; + border-bottom: 1px solid var(--ink); +} +.tablist label { + display: inline-block; + padding: 11px 16px; + font: 600 11px/1.4 var(--mono); + letter-spacing: 0.1em; + text-transform: uppercase; + color: var(--ink-faint); + cursor: pointer; + border: 1px solid transparent; + border-bottom: 0; + border-radius: 3px 3px 0 0; + margin-bottom: -1px; /* sit the tab on the list's bottom rule */ +} +.tablist label:hover { + color: var(--ink); + background: var(--panel); +} +/* A tabpanel is itself a section element; neutralize the global section chrome + so only the report sections nested inside its main element render with rules + and numbering. */ +.tabpanel { + display: none; + padding: 0; + border-top: 0; + counter-increment: none; +} +/* Active tab + its panel, driven by the checked radio (general-sibling ~). */ +#tab-current:checked ~ .tablist label[for="tab-current"], +#tab-recommended:checked ~ .tablist label[for="tab-recommended"] { + color: var(--ink); + border-color: var(--ink); + border-bottom-color: var(--paper); + background: var(--paper); +} +#tab-current:checked ~ .tabpanel[data-panel="current"], +#tab-recommended:checked ~ .tabpanel[data-panel="recommended"] { + display: block; +} +/* Keyboard focus on the visually-hidden radio surfaces a ring on its label. 
*/ +#tab-current:focus-visible ~ .tablist label[for="tab-current"], +#tab-recommended:focus-visible ~ .tablist label[for="tab-recommended"] { + outline: 2px solid var(--link); + outline-offset: -2px; +} + +/* Floating "back to top" control — a fixed action button that rides along as the + reader scrolls and jumps to the top via the in-page #top anchor on
. No + JavaScript: it reuses the same smooth-scroll / reduced-motion behavior as the ToC + links. Flat to fit the data-report system — a solid ink fill carries it over the + content instead of a shadow. Present in every report; hidden when printing. */ +.to-top { + position: fixed; + right: clamp(16px, 4vw, 28px); + bottom: clamp(16px, 4vw, 28px); + z-index: 20; + display: inline-flex; + align-items: center; + gap: 6px; + padding: 9px 13px; + background: var(--ink); + color: var(--paper); + font: 600 10.5px/1 var(--mono); + letter-spacing: 0.1em; + text-transform: uppercase; + text-decoration: none; + border-radius: 4px; +} +.to-top::before { + content: "\2191"; /* upwards arrow */ + font-size: 13px; + line-height: 1; +} +.to-top:hover { + background: var(--link); + color: var(--paper); +} +.to-top:focus-visible { + outline: 2px solid var(--link); + outline-offset: 2px; +} + +@media (max-width: 720px) { + header, + main, + .tablist { + padding-left: 20px; + padding-right: 20px; + } + .dist-row { + flex-direction: column; + align-items: stretch; + gap: 4px; + } + .dist-row .dist-label { + flex: none; + text-align: left; + } +} + +@media print { + body { + font-size: 11pt; + } + /* Tabs cannot be toggled on paper — drop the controls and stack both report + bodies, each titled by its panel label so the printout stays legible. 
*/ + .tab-input, + .tablist, + .to-top { + display: none; + } + .tabpanel { + display: block !important; + } + .tabpanel::before { + content: attr(aria-label); + display: block; + max-width: 60rem; + margin: 0 auto; + padding: 16px clamp(20px, 5vw, 32px) 0; + font: 600 11px/1.3 var(--mono); + letter-spacing: 0.12em; + text-transform: uppercase; + color: var(--ink-faint); + } + section { + break-inside: avoid; + border-top-color: #ccc; + } + tbody tr:hover { + background: none; + } + a { + color: var(--ink); + } +} diff --git a/plugins/bitwarden-test-engineer/scripts/build-report.sh b/plugins/bitwarden-test-engineer/scripts/build-report.sh new file mode 100755 index 0000000..fe06c6b --- /dev/null +++ b/plugins/bitwarden-test-engineer/scripts/build-report.sh @@ -0,0 +1,202 @@ +#!/usr/bin/env bash +# +# build-report.sh — assemble a self-contained HTML report for the +# bitwarden-test-engineer plugin by splicing the canonical stylesheet into a +# model-authored content fragment. +# +# The model writes a fragment whose + + +HTML + # Shared masthead: reuse the recommendation report's header, relabel its + # eyebrow so the page reads as the combined deliverable, not one report. + extract_region "$RECOMMENDED" "" \ + | sed -E 's#(

)[^<]*(

)#\1Test Engineering Report\2#' + cat <<'HTML' + + + +
+HTML + extract_region "$CURRENT" "" | prefix_ids cur + cat <<'HTML' +
+
+HTML + extract_region "$RECOMMENDED" "" | prefix_ids rec + # The reused masthead carries id="top"; emit the back-to-top control once for + # the whole page. Each standalone report's own control sits after its
, + # outside the extracted region, so the combined page would otherwise have none. + cat <<'HTML' + +
Top + + +HTML + } | splice_css > "$OUT" + + echo "$OUT" + exit 0 +fi + +# --- single report (test-stack | test-coverage) ------------------------------ +if [[ -z "$FRAGMENT" || ! -f "$FRAGMENT" ]]; then + echo "build-report.sh: fragment HTML file not found: '${FRAGMENT}'" >&2 + exit 2 +fi +if ! grep -qF "$SENTINEL" "$FRAGMENT"; then + echo "build-report.sh: fragment '${FRAGMENT}' has no stylesheet sentinel." >&2 + echo " Put exactly this line inside the -
+

Test Stack Report

…the change under analysis…

…ticket/PR · status · team · date…

+

Overview

…2–4 sentence recap of the recommended shape per platform; top 3 open @@ -143,8 +189,8 @@ top-level sections — readers look these up by id.
  • - bitwarden/server — integration-heavy, thin - unit, 1 E2E journey + bitwarden/server — unit-heavy pyramid, + thin integration, no E2E
@@ -167,6 +213,26 @@ top-level sections — readers look these up by id. …gaps and trophy-wrong shapes; ungrounded findings marked unverified…
+ Top ``` + +## Building the report + +Write the fragment above (with the `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` sentinel as the only +content of ` -
+

Test Coverage Report

…the change under analysis…

…ticket/PR · status · team · date…

+

Overview

…2–4 sentence recap of observed coverage per platform; top 3 gaps; @@ -150,6 +187,26 @@ sections — readers look these up by id. reason…
+ Top ``` + +## Building the report + +Write the fragment above (with the `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` sentinel as the only +content of ` + + +
+

…report title…

+

…the change under analysis…

+

…ticket/PR · status · team · date…

+
+
+ +
+

Overview

+ …synthesis: recap per platform; top 3 items; anchor links into the + detail sections… +
+
+

…summary heading…

+ …2–4 sentences… +
+
Fig 1 · …layer distribution by platform…
+
+ unit + integration + e2e +
+
+ bitwarden/server +
+ 3 + 11 + 1 +
+
+ +
+
    +
  • bitwarden/server — …one-line shape…
  • + +
+
+
+

Evidence & sources

+
+ …sources used + what was missing + commit SHA(s)… +
+
+ +
+

…gaps heading…

+ …per your template… +
+
+ Top + + +``` diff --git a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md index 8003edb..13db7c1 100644 --- a/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md +++ b/plugins/bitwarden-test-engineer/skills/analyzing-test-stack/SKILL.md @@ -1,6 +1,6 @@ --- name: analyzing-test-stack -description: Use when recommending what test automation a feature, bugfix, or change needs and at which layer — analyzing a Jira ticket, GitHub PR, exported test-case CSV, technical breakdown, and/or plain-language description, then mapping each behavior to the cheapest sufficient test layer (unit, integration, E2E) inside each repo's actual test shape, risk-weighted by defect severity. Triggers on "what tests should this have", "which test layers", "test stack", "test strategy", "test trophy", "test plan for this PR/ticket", "what should we test for this tech breakdown", "are these tests at the right level", "risk-based test coverage", "what tests does this Critical/High bug need", or "rank coverage gaps by severity". +description: Use when recommending what test automation a feature, bugfix, or change needs and at which layer — from a Jira ticket, GitHub PR, test-case CSV, technical breakdown, and/or plain-language description — mapping each behavior to the cheapest sufficient layer (unit, integration, E2E) inside each repo's actual test shape, risk-weighted by defect severity. Triggers on "test stack", "test strategy", "test trophy", "test plan for this PR/ticket", "which test layers should this have", or "what tests does this Critical/High bug need". 
allowed-tools: "Read, Write, Grep, Glob, AskUserQuestion, Skill, Bash(gh pr view:*), Bash(gh pr diff:*), Bash(gh pr checks:*), Bash(${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh:*), mcp__bitwarden-atlassian__get_issue, mcp__bitwarden-atlassian__search_issues, mcp__bitwarden-atlassian__get_issue_comments, mcp__bitwarden-atlassian__get_issue_remote_links, mcp__bitwarden-atlassian__get_confluence_page, mcp__bitwarden-atlassian__search_confluence, mcp__bitwarden-atlassian__search_confluence_cql" --- @@ -14,7 +14,7 @@ The three layers (read `references/testing-trophy.md` for the full model): a foc You may receive any combination of: a Jira key, a GitHub PR, a CSV export of test cases, a technical breakdown document, and/or a plain-language description. Treat them as additive evidence. You also consume a **coverage inventory** — the existing-test records produced by the `assessing-test-coverage` skill (permalink records + `unverified` gaps). Under the `bitwarden-test-engineer` agent this is gathered for you before this skill runs; if it is absent (e.g. run standalone), invoke `Skill(assessing-test-coverage)` for the affected change surface, or proceed and record all coverage as `unverified`. **Today's date is provided by the caller** — use it for the report filename; do not attempt to read the clock. If no date is supplied, ask via `AskUserQuestion` rather than guessing. -`../../references/input-sources.md` (a plugin-level reference shared with `assessing-test-coverage`) is the canonical guide for how to ingest each source — Epic expansion, breakdown mining, CSV column mapping, and the rule that a missing source is recorded as a gap rather than blocking the analysis. 
**For Jira and Confluence intake**, follow that reference's tooling rule: prefer `Skill(bitwarden-atlassian-tools:researching-jira-issues)`, fall back to the `bitwarden-atlassian-tools` MCP tools (the `mcp__bitwarden-atlassian__*` tools this skill's frontmatter grants) when that skill is unavailable, and if neither is reachable, ask the user to paste the requirements rather than blocking — never assume a generic Atlassian MCP or direct REST access. At a glance: +`../../references/input-sources.md` (a plugin-level reference shared with `assessing-test-coverage`) is the canonical guide for how to ingest each source — Epic expansion, breakdown mining, CSV column mapping, and the rule that a missing source is recorded as a gap rather than blocking the analysis. **For Jira and Confluence intake**, follow that reference's tooling rule. Prefer `Skill(bitwarden-atlassian-tools:researching-jira-issues)`; fall back to the `bitwarden-atlassian-tools` MCP tools (the `mcp__bitwarden-atlassian__*` tools this skill's frontmatter grants) when that skill is unavailable. If neither is reachable, ask the user to paste the requirements rather than blocking — never assume a generic Atlassian MCP or direct REST access. At a glance: - **Jira** — extract testable behaviors and acceptance criteria; Epics/Features expand to their children before extraction. - **GitHub PR** — extract the change surface, API touched, and any tests already present. @@ -30,18 +30,18 @@ Alongside the behaviors, carry each behavior's **risk severity** — the impact 1. **Resolve scope.** From the evidence, list the discrete testable behaviors and the platforms each touches. Map platforms to stacks, tooling, and the layer→repo split (including the sibling `test` repo for E2E) using `references/monorepo-layout.md`. **When the input is an Epic**, the behaviors come from the children's acceptance criteria and the diffs of any PRs linked from those children — record which children/PRs you actually inspected vs. 
only enumerated. -2. **Consume the coverage inventory.** What is already tested is established by the `assessing-test-coverage` skill, not here — take its inventory as input: **one record per behavior** carrying its layer, an approximate count, and 1–3 representative permalinks (`{ behavior, platform, layer, status, count, representative: [...] }`, representative tests path-only with an `unlinkable` reason when they can't be linked) plus the `unverified` gaps. Treat _observed_ coverage as verified and everything else as a gap, never assumed covered. If no inventory was supplied, invoke `Skill(assessing-test-coverage)` for the affected change surface to produce one; do not re-derive coverage-finding or permalink rules here (they live in that skill's `references/finding-coverage.md`). These records feed both the report's Evidence column (rendering each behavior's representative permalinks) and the gap analysis below. +2. **Consume the coverage inventory.** What is already tested is established by the `assessing-test-coverage` skill, not here — take its inventory as input. It is **one record per behavior**, carrying that behavior's layer, an approximate count, and 1–3 representative permalinks (`{ behavior, platform, layer, status, count, representative: [...] }`; representative tests are path-only with an `unlinkable` reason when they can't be linked), plus the `unverified` gaps. Treat _observed_ coverage as verified and everything else as a gap, never assumed covered. If no inventory was supplied, invoke `Skill(assessing-test-coverage)` for the affected change surface to produce one; do not re-derive coverage-finding or permalink rules here (they live in that skill's `references/finding-coverage.md`). These records feed both the report's Evidence column (rendering each behavior's representative permalinks) and the gap analysis below. -3. 
**Assign the cheapest sufficient layer, weighted by severity.** For each behavior, pick the lowest trophy layer that genuinely buys the needed confidence, with a one-line rationale — then check the confidence bar against the behavior's risk severity per `references/severity-risk.md`. Severity sets _how much_ confidence is sufficient, not _which_ layer: a Critical behavior must cover its material failure modes (and, if it is a genuine end-to-end critical flow, claim the thin E2E layer reserved for exactly that), while a Low behavior earns minimal coverage and never an E2E test. Prefer integration over E2E and unit over integration unless the behavior truly requires the higher layer (real browser/device, cross-service contract, full user journey) — then land that call inside the **target repo's shape** (`references/monorepo-layout.md` → _Each repo's test shape in practice_): a pyramid repo like `server` or `sdk-internal` resolves toward unit, `ios` toward its component + snapshot practice, and cross-system journeys toward the all-E2E `test` repo. Name concrete tooling per platform (see `references/monorepo-layout.md`). +3. **Assign the cheapest sufficient layer, weighted by severity.** For each behavior, pick the lowest test layer that genuinely buys the needed confidence, with a one-line rationale — then check the confidence bar against the behavior's risk severity per `references/severity-risk.md`. Severity sets _how much_ confidence is sufficient, not _which_ layer: a Critical behavior must cover its material failure modes (and, if it is a genuine end-to-end critical flow, claim the thin E2E layer reserved for exactly that), while a Low behavior earns minimal coverage and never an E2E test. 
Prefer integration over E2E and unit over integration unless the behavior truly requires the higher layer (real browser/device, cross-service contract, full user journey) — then land that call inside the **target repo's shape** (`references/monorepo-layout.md` → _Each repo's test shape in practice_): a pyramid repo like `server` or `sdk-internal` resolves toward unit, `ios` toward its component + snapshot practice, and cross-system journeys toward the all-E2E `test` repo. Name concrete tooling per platform (see `references/monorepo-layout.md`). 4. **Find the gaps and the imbalance, ranked by severity.** Call out behaviors with no recommended coverage, and any existing shape that is wrong for its repo (e.g. E2E doing work integration should do, untested core logic, or a layer the repo doesn't even maintain). **Order gaps by severity** — a Critical behavior with no observed coverage is a top-priority gap and leads the list; Informative behaviors are recorded as out-of-scope rather than gaps. Be explicit about what evidence each gap rests on. -5. **Render the HTML report.** Once steps 1–4 have decided the per-behavior layer/severity mapping, rendering it to HTML is **mechanical formatting, not reasoning** — under the `bitwarden-test-engineer` agent this step runs on a Sonnet report-writer subagent (see the agent's _Model selection_), not in the analytical context. Author a **content fragment** following `references/html-report-template.md`: a full HTML document whose ` - - -
-

Test Stack Report

-

…the change under analysis…

-

…ticket/PR · status · team · date…

-
-
- -
-

Overview

- …2–4 sentence recap of the recommended shape per platform; top 3 open - risks; anchor links into #recommendations and #gaps… -
-
-

Summary & recommended shape

- …2–4 sentences… -
-
- Fig 1 · Recommended layer distribution by platform -
-
- unit - integration - e2e -
-
- bitwarden/server -
- 3 - 11 - 1 -
-
- -
-
    -
  • - bitwarden/server — unit-heavy pyramid, - thin integration, no E2E -
  • - -
-
-
-

Evidence & sources

-
- …sources used + what was missing + commit SHA(s)… -
-
-
-

Per-platform recommendations

-
- …per-platform tables: Behavior | Severity | Recommended layer | - Tooling | Rationale | Evidence (linked)… -
-
-
-

Coverage gaps & imbalances

- …gaps and trophy-wrong shapes; ungrounded findings marked unverified… -
-
- Top - - +
+

Per-platform recommendations

+
+ …per-platform tables: Behavior | Severity | Recommended layer | Tooling | + Rationale | Evidence (linked)… +
+
``` - -## Building the report - -Write the fragment above (with the `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` sentinel as the only -content of ` - - -
-

Test Coverage Report

-

…the change under analysis…

-

…ticket/PR · status · team · date…

-
-
- -
-

Overview

- …2–4 sentence recap of observed coverage per platform; top 3 gaps; - anchor links into #coverage and #gaps; one line noting this is a - coverage inventory, not a recommendation… -
-
-

Observed coverage shape

- …2–4 sentences… -
-
Fig 1 · Observed test coverage by platform
-
- unit - integration - e2e -
-
- bitwarden/server -
- 3 - 14 -
-
- -
-
    -
  • - bitwarden/server — 14 integration, 3 unit, - 0 E2E observed -
  • - -
-
-
-

Evidence & sources

-
- …repos inspected + PRs read + test-repo availability + what was - missing + commit SHA(s)… -
-
-
-

Observed coverage

-
- …per-platform behavior→test tables with linked evidence… -
-
-
-

Coverage gaps

- …behaviors with no observed test, each marked unverified with a one-line - reason… -
-
- Top - - -``` - -## Building the report - -Write the fragment above (with the `/* @@BITWARDEN_REPORT_STYLESHEET@@ */` sentinel as the only -content of ` ``` -Write that fragment to a temporary path (e.g. `-report-.fragment.html`), then run -the build script from the plugin root: +Write that fragment to a temporary path (e.g. `-report-.fragment.html`), then run the +build script from the plugin root: ```bash "${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \ @@ -160,39 +104,28 @@ the build script from the plugin root: ``` -The script replaces the sentinel with `report-style.css` verbatim and writes -`-report---.html` to the current working directory, printing the -final filename to stdout. The `` suffix is stamped from the wall clock by the script -(the model cannot read the clock), so **every run gets a fresh filename** — a report is never -overwritten, and an existing report never has to be read back and regenerated. Delete the -temporary fragment afterward. If the script errors (missing sentinel, bad `--kind`/`--date`, -fragment not found) it writes nothing — fix the fragment and re-run rather than falling back to -pasting CSS by hand. - -To assemble the **combined two-tab page** from the two already-built standalone reports, call -the script with `--kind test-combined` and the two finished report files (no fragment, no -sentinel — the bodies are reused as-is): - -```bash -"${CLAUDE_PLUGIN_ROOT}/scripts/build-report.sh" \ - --kind test-combined --slug --date \ - --current \ - --recommended -``` - -It writes `test-combined-report---.html` and prints the filename. The two -input reports are read, not modified, and their standalone files remain. +It replaces the sentinel with `report-style.css` verbatim and writes the report into a per-change +directory `test-engineer-report--/` (created if needed) — the coverage report as +`coverage.html`, the test-stack report as `recommended.html` — then prints the final path. 
The +directory name derives only from `--slug`/`--date`, so a run's reports share one folder; +**re-running the same change on the same date refreshes the report in place**. Delete the temporary +fragment afterward. If the script errors (missing sentinel, bad `--kind`/`--date`, fragment not +found) it writes nothing — fix the fragment and re-run rather than pasting CSS by hand. + +**Combined two-tab page (assembled, not authored).** When both reports exist for one change, the +build script can stitch them into one page with two CSS-only tabs — _Current coverage_ and +_Recommended coverage_. This is a presentation-only merge from the two finished report files: no +skill or template knows about tabs, and the agent (not the report author) runs it with +`--kind test-combined --current test-engineer-report--/coverage.html --recommended test-engineer-report--/recommended.html`, +which writes `combined.html` into that same directory. The tab chrome lives entirely in the build +script and `report-style.css`. ## What not to do -- Do not reintroduce a brand skin — no saturated brand blue/yellow, no logo images, no - `` to a design system. The report is intentionally off-brand and self-contained. -- Do not swap the sequential layer ramp for unrelated categorical hues; the order is the - encoding. -- Do not introduce web fonts, CDN links, or `` — the single-file - constraint is binding. -- Do not paste, retype, or trim the stylesheet into the fragment — the fragment carries only - the sentinel, and the build script supplies the full stylesheet. A report that ships a - hand-copied or "only the classes I used" stylesheet is exactly how two reports drift apart. -- Do not hand-compute the distribution bar widths in pixels or percentages — set - `flex: ` per segment and let the browser normalize. +- Do not reintroduce a brand skin — no saturated brand colors, no logo images, no `` to a + design system. The report is intentionally off-brand and self-contained. 
+- Do not swap the sequential layer ramp for unrelated categorical hues; the order is the encoding. +- Do not paste, retype, or trim the stylesheet into the fragment — the fragment carries only the + sentinel. A report that ships a hand-copied or "only the classes I used" stylesheet is exactly how + two reports drift apart. +- Do not hand-compute distribution bar widths — set `flex: ` per segment. diff --git a/plugins/bitwarden-test-engineer/references/report-template-common.md b/plugins/bitwarden-test-engineer/references/report-template-common.md index 3eac48b..05877a9 100644 --- a/plugins/bitwarden-test-engineer/references/report-template-common.md +++ b/plugins/bitwarden-test-engineer/references/report-template-common.md @@ -1,106 +1,87 @@ # Report HTML — shared authoring contract Both self-contained HTML reports the `bitwarden-test-engineer` plugin emits — the -`analyzing-test-stack` **test-stack report** and the `assessing-test-coverage` **coverage -report** — are authored against this shared contract, so the two read as one instrument. Each -skill's own template (`html-report-template.md` / `coverage-report-template.md`) covers only what -differs: its section set, its per-platform table columns, and its recommend-vs-inventory framing. -**Read this file first, then that template.** +`analyzing-test-stack` **test-stack report** and the `assessing-test-coverage` **coverage report** — +are authored against this shared contract, so the two read as one instrument. Each skill's own +template (`html-report-template.md` / `coverage-report-template.md`) covers only what differs: its +section set, its per-platform table columns, and its recommend-vs-inventory framing. **Read this +file first, then that template.** ## Output constraints -Produce a **single self-contained HTML file**: all CSS inline in a `