From aefb12ab24d3faf3b861c7abec89767c2b2661c2 Mon Sep 17 00:00:00 2001 From: Naitik Soni <91239827+naitik-mixpanel@users.noreply.github.com> Date: Wed, 10 Jun 2026 16:01:47 +0530 Subject: [PATCH] new skill: monitor-metrics --- .../.fuse_hidden0000000700000001 | 459 +++++++++++++++++ .../.fuse_hidden0000000d00000006 | 459 +++++++++++++++++ .../.fuse_hidden0000000e00000007 | 459 +++++++++++++++++ .../.fuse_hidden0000000f00000008 | 459 +++++++++++++++++ .../.fuse_hidden0000001000000009 | 459 +++++++++++++++++ .../skills/monitor-metrics/SKILL.md | 462 +++++++++++++++++ .../commands/metric-anomaly.md | 236 +++++++++ .../monitor-metrics/commands/metric-drift.md | 319 ++++++++++++ .../monitor-metrics/commands/metric-rca.md | 484 ++++++++++++++++++ .../.fuse_hidden0000000700000001 | 459 +++++++++++++++++ .../.fuse_hidden0000000800000002 | 459 +++++++++++++++++ .../.fuse_hidden0000000900000003 | 459 +++++++++++++++++ .../.fuse_hidden0000000a00000004 | 459 +++++++++++++++++ .../skills/monitor-metrics/SKILL.md | 462 +++++++++++++++++ .../commands/metric-anomaly.md | 236 +++++++++ .../monitor-metrics/commands/metric-drift.md | 319 ++++++++++++ .../monitor-metrics/commands/metric-rca.md | 484 ++++++++++++++++++ .../.fuse_hidden0000000700000001 | 459 +++++++++++++++++ .../.fuse_hidden0000000700000002 | 459 +++++++++++++++++ .../.fuse_hidden0000000800000003 | 459 +++++++++++++++++ .../.fuse_hidden0000000900000004 | 459 +++++++++++++++++ .../.fuse_hidden0000000a00000005 | 459 +++++++++++++++++ .../skills/monitor-metrics/SKILL.md | 462 +++++++++++++++++ .../commands/metric-anomaly.md | 242 +++++++++ .../monitor-metrics/commands/metric-drift.md | 319 ++++++++++++ .../monitor-metrics/commands/metric-rca.md | 484 ++++++++++++++++++ 26 files changed, 10935 insertions(+) create mode 100644 plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000700000001 create mode 100644 plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000d00000006 create mode 100644 
plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000e00000007 create mode 100644 plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000f00000008 create mode 100644 plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000001000000009 create mode 100644 plugins/mixpanel-mcp-eu/skills/monitor-metrics/SKILL.md create mode 100644 plugins/mixpanel-mcp-eu/skills/monitor-metrics/commands/metric-anomaly.md create mode 100644 plugins/mixpanel-mcp-eu/skills/monitor-metrics/commands/metric-drift.md create mode 100644 plugins/mixpanel-mcp-eu/skills/monitor-metrics/commands/metric-rca.md create mode 100644 plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000700000001 create mode 100644 plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000800000002 create mode 100644 plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000900000003 create mode 100644 plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000a00000004 create mode 100644 plugins/mixpanel-mcp-in/skills/monitor-metrics/SKILL.md create mode 100644 plugins/mixpanel-mcp-in/skills/monitor-metrics/commands/metric-anomaly.md create mode 100644 plugins/mixpanel-mcp-in/skills/monitor-metrics/commands/metric-drift.md create mode 100644 plugins/mixpanel-mcp-in/skills/monitor-metrics/commands/metric-rca.md create mode 100644 plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000700000001 create mode 100644 plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000700000002 create mode 100644 plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000800000003 create mode 100644 plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000900000004 create mode 100644 plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000a00000005 create mode 100644 plugins/mixpanel-mcp/skills/monitor-metrics/SKILL.md create mode 100644 plugins/mixpanel-mcp/skills/monitor-metrics/commands/metric-anomaly.md create mode 100644 
plugins/mixpanel-mcp/skills/monitor-metrics/commands/metric-drift.md create mode 100644 plugins/mixpanel-mcp/skills/monitor-metrics/commands/metric-rca.md diff --git a/plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000700000001 b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000700000001 new file mode 100644 index 0000000..11a1684 --- /dev/null +++ b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000700000001 @@ -0,0 +1,459 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires Mixpanel MCP. +--- + +# Monitor Metrics + +A focused diagnostic skill for a single metric at a time. Works for any +project the user has access to. Requires the Mixpanel MCP. Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. 
**Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. + +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. 
+Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. +- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. + +### Step 0a — Resolve org/project context first + +Before validating the project, call `Mixpanel MCP:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. 
This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `Mixpanel MCP:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `<project_name>` (id: `<project_id>`) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `Mixpanel MCP:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. | + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `Mixpanel MCP:List-Metrics` with `project_id` and `query=<metric name>`. If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1.
| +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `Mixpanel MCP:Get-Metric` with `project_id` and `metric_id`. +2. The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. +4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. 
Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report//` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). | +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. | + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. 
Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. +This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `Get-Properties` with + `property_names=[]` and `resource_type=`. + If it doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. 
If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once, scoped to the events used by `query_template` +(`event_name=<event>` for each), with `since_date` set to the earliest +date the diagnosis will look at (16 weeks back for drift, whose weekly view is its longest window; 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in.
+- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. + +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. 
+ +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags: `<h1>`, `<h2>`, `<h3>`, + `<strong>`, `<ul>`, `<li>`, `<ol>`, etc. — no newlines, each element + is a new line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `<label>, <window>` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title=<metric_name> + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim.
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + ` — RCA: ` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass. 
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000d00000006 b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000d00000006 new file mode 100644 index 0000000..11a1684 --- /dev/null +++ b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000d00000006 @@ -0,0 +1,459 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires Mixpanel MCP. +--- + +# Monitor Metrics + +A focused diagnostic skill for a single metric at a time. 
Works for any +project the user has access to. Requires the Mixpanel MCP. Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. **Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. 
+ +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. +Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. +- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. 
+ +### Step 0a — Resolve org/project context first + +Before validating the project, call `Mixpanel MCP:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `Mixpanel MCP:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `<project_name>` (id: `<project_id>`) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `Mixpanel MCP:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. | + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `Mixpanel MCP:List-Metrics` with `project_id` and `query=<metric name>`.
If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1. | +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `Mixpanel MCP:Get-Metric` with `project_id` and `metric_id`. +2. The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. 
+4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report//` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). | +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. 
| + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. +This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `Get-Properties` with + `property_names=[]` and `resource_type=`. 
+ If it doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once per event used by `query_template` +(`event_name=<event>`), with `since_date` set to the earliest +date the diagnosis will look at (16 weeks back for drift, 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. 
The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in. +- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. + +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. 
If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. + +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags: `<h2>`, `<h3>`, `<p>`, + `<strong>`, `<ul>`, `<li>`, `<br>
      `, etc. — do not put newline characters in the HTML; each element + already renders on its own line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `<metric_name>, <query label>` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title=<metric_name> + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim. 
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + `<metric_name> — RCA: <finding>` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass. 
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000e00000007 b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000e00000007 new file mode 100644 index 0000000..aaa9bc7 --- /dev/null +++ b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000e00000007 @@ -0,0 +1,459 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires Mixpanel MCP. +--- + +# Monitor Metrics + +A focused diagnostic skill for a single metric at a time. 
Works for any +project the user has access to. Requires the Mixpanel MCP. Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. **Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. 
+ +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. +Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. +- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. 
+ +### Step 0a — Resolve org/project context first + +Before validating the project, call `mixpanel-mcp-eu:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `mixpanel-mcp-eu:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `` (id: ``) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `mixpanel-mcp-eu:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. | + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `mixpanel-mcp-eu:List-Metrics` with `project_id` and `query=`. 
If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1. | +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `mixpanel-mcp-eu:Get-Metric` with `project_id` and `metric_id`. +2. The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. 
+4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report//` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). | +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. 
| + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. +This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `Get-Properties` with + `property_names=[]` and `resource_type=`. 
+ If it doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once, scoped to the events used by `query_template` +(`event_name=` for each), with `since_date` set to the earliest +date the diagnosis will look at (60 days back for drift, 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. 
The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in. +- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. + +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. 
If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. + +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags: `<h2>`, `<h3>`, `<p>`, + `<strong>`, `<ul>`, `<li>`, `<br>
        `, etc. — no newlines, each element + is a new line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `, ` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title= + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim. 
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + `<metric_name> — RCA: <finding>` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass.
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000f00000008 b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000f00000008 new file mode 100644 index 0000000..75bf536 --- /dev/null +++ b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000000f00000008 @@ -0,0 +1,459 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires Mixpanel MCP. +--- + +# Monitor Metrics + +A focused diagnostic skill for a single metric at a time. 
Works for any +project the user has access to. Requires the `mixpanel-mcp-eu` connector (Mixpanel EU). Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. **Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. 
+ +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. +Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. +- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. 
+ +### Step 0a — Resolve org/project context first + +Before validating the project, call `mixpanel-mcp-eu:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `mixpanel-mcp-eu:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `` (id: ``) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `mixpanel-mcp-eu:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. | + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `mixpanel-mcp-eu:List-Metrics` with `project_id` and `query=`. 
If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1. | +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `mixpanel-mcp-eu:Get-Metric` with `project_id` and `metric_id`. +2. The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. 
+4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report//` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). | +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. 
| + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. +This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `Get-Properties` with + `property_names=[]` and `resource_type=`. 
+ If it doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once, scoped to the events used by `query_template` +(`event_name=` for each), with `since_date` set to the earliest +date the diagnosis will look at (60 days back for drift, 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. 
The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in. +- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. + +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. 
If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. + +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags: `<h1>`, `<h2>`, `<h3>`, + `<strong>`, `<ul>`, `<li>`, `<p>`, etc. — no newline characters; each element + renders on its own line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `<metric_name>, <label>` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title=<metric_name> + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim.
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + `<metric_name> — RCA: <finding>` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass.
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000001000000009 b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000001000000009 new file mode 100644 index 0000000..cf21368 --- /dev/null +++ b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/.fuse_hidden0000001000000009 @@ -0,0 +1,459 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires the `mixpanel-mcp-eu` connector (Mixpanel EU). 
+--- + +# Monitor Metrics + +A focused diagnostic skill for a single metric at a time. Works for any +project the user has access to. Requires the `mixpanel-mcp-eu` connector (Mixpanel EU). Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. **Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. 
+ +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. +Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. +- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. 
+ +### Step 0a — Resolve org/project context first + +Before validating the project, call `mixpanel-mcp-eu:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `mixpanel-mcp-eu:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `` (id: ``) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `mixpanel-mcp-eu:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. | + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `mixpanel-mcp-eu:List-Metrics` with `project_id` and `query=`. 
If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1. | +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `mixpanel-mcp-eu:Get-Metric` with `project_id` and `metric_id`. +2. The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. 
+4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report//` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). | +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. 
| + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. +This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `Get-Properties` with + `property_names=[]` and `resource_type=`. 
+ If it doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once, scoped to the events used by `query_template` +(`event_name=` for each), with `since_date` set to the earliest +date the diagnosis will look at (60 days back for drift, 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. 
The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in. +- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. + +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. 
If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. + +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (the anomaly run was skipped in favor of drift, so there is nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using only the tags `Create-Dashboard` allows in text cells — heading, paragraph, bold, and list tags such as `<h2>`, `<p>`, + `<strong>`, `<ul>`, and `<li>`; check the tool schema for the full list. Do not put raw newlines in the HTML — each block element + already renders on its own line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `<label>, <window>` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title=<metric_name> + diagnosis (YYYY-MM-DD)`, the rows above, and the user's `project_id`. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim.
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + ` — RCA: ` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass. 
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp-eu/skills/monitor-metrics/SKILL.md b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/SKILL.md new file mode 100644 index 0000000..acf4362 --- /dev/null +++ b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/SKILL.md @@ -0,0 +1,462 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires the `mixpanel-mcp-eu` connector (Mixpanel EU). 
+--- + +# Monitor Metrics + +> **Connector:** This skill operates exclusively against the `mixpanel-mcp-eu` connector (Mixpanel EU region). Every Mixpanel MCP tool call in this SKILL.md and in every file under `commands/` must be routed through `mixpanel-mcp-eu` — never any other Mixpanel connector. + +A focused diagnostic skill for a single metric at a time. Works for any +project the user has access to. Requires the `mixpanel-mcp-eu` connector (Mixpanel EU). Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. **Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. 
Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. + +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. +Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. 
+- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. + +### Step 0a — Resolve org/project context first + +Before validating the project, call `mixpanel-mcp-eu:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `mixpanel-mcp-eu:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `<project_name>` (id: `<project_id>`) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `mixpanel-mcp-eu:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations.
| + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `mixpanel-mcp-eu:List-Metrics` with `project_id` and `query=`. If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1. | +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `mixpanel-mcp-eu:Get-Metric` with `project_id` and `metric_id`. +2. 
The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. +4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report//` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). 
| +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. | + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. 
+This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `List-Properties` with + `names=[<property_name>]` and `resource_type=<resource_type>` (pass + `events=[<event_name>]` to scope to a specific event's properties). If it + doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once per event used by `query_template` +(`event_name=<event_name>`), with `since_date` set to the earliest +date the diagnosis will look at (16 weeks back for drift, whose weekly view reaches further than its 60-day daily view; 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g.
exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in. +- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. 
+ +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. + +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags — e.g. `<h2>`, `<p>`, + `<strong>`, `<ul>`, `<li>`; confirm the exact set in the tool description — with no newline characters in the HTML, since each element already renders + on its own line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `<metric_name> — <window>, <granularity>` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title=<metric_name> + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim. 
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + ` — RCA: ` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass. 
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp-eu/skills/monitor-metrics/commands/metric-anomaly.md b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/commands/metric-anomaly.md new file mode 100644 index 0000000..25530aa --- /dev/null +++ b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/commands/metric-anomaly.md @@ -0,0 +1,236 @@ +# Command: metric-anomaly + +Detect point-in-time anomalies in a single metric — recent spikes, drops, and +clusters. Produces a verdict on *whether* something unusual happened at a +specific moment. Does **not** test for trend-level drift (run `metric-drift` +for that). + +--- + +## Prerequisites + +Before this command runs, Steps 0, 1, and 1.5 from `SKILL.md` must have +completed — input validation, normalized metric series object, and project +profile resolution. If any of those haven't happened, do them first. + +If the user's input is a saved report but the metric is a **funnel** or +**retention** report, see the "Special cases" section at the bottom. + +### Prerequisite — classify `metric_type` + +Before firing any queries, classify the metric into one of: +`count`, `unique_count`, `ratio`, `funnel`, `retention`, `unknown`. + +| Detected | Classification | +|---|---| +| Report type `funnels` | `funnel` | +| Report type `retention` | `retention` | +| Query template has A/B form or `% of total` (conversion rate, session rate, etc.) 
| `ratio` | +| Single-series count (event count, event count distinct users) | `count` | +| Single-series unique count | `unique_count` | +| Formula metric / custom SQL / anything else | `unknown` | + +Store as `metric_type` on the metric series object. Used in the verdict card +and in special-case routing (funnel, retention). + +> _Keep this classification table in sync with the identical block in +> `metric-drift.md` — edits to one must be mirrored in the other._ + +--- + +## Phase 1 — Fetch series (2 queries, parallel) + +Fire both `Run-Query` calls simultaneously: + +| Query | Window | Granularity | Purpose | +|---|---|---|---| +| Q1-hourly | Last 7 days | `hour` | Recent-blip detection | +| Q1-daily | Last 30 days | `day` | Recent-day detection against a fuller baseline | + +Use the `query_template` from the metric object; override only `dateRange` +and `unit` (granularity). Do not re-apply filters — they're already baked in. + +Build the `Run-Query` body from `query_template` with only `dateRange` and +`unit` (granularity) overridden. Use `timeComparison` when a single call can +cover both windows. + +--- + +## Phase 2 — Outlier tests (Z-score + IQR, time-bucketed) + +For each series independently, compute the expected range at every timestamp. +Run **both** tests; flag a point if **either** test flags it. Report which +test(s) caught each flag. + +### Test 1 — Z-score against time-bucketed mean + +- For the **hourly** series: group all points by hour-of-day (0–23) and day-of-week (7 × 24 = 168 buckets). Compute mean (μ) and stddev (σ) per bucket across the 7-day window. Flag any point where `|value - μ| / σ > 2.5`. +- For the **daily** series: group by day-of-week (7 buckets). Compute μ and σ across the 30-day window. Flag any point where `|value - μ| / σ > 2.5`. +- Handle low-variance buckets: if σ is <5% of μ, skip the Z-score for that bucket and fall back to IQR only (division by tiny σ creates false alarms). 
+ +### Test 2 — IQR against time-bucketed median + +- Same bucketing scheme as Test 1. +- For each bucket, compute Q1, median, Q3, and IQR = Q3 − Q1. +- Flag any point where `value < Q1 − 1.5 × IQR` or `value > Q3 + 1.5 × IQR`. + +### Deviation magnitude + +For every flagged point, report `(value − median) / median` as a signed +percentage. This is what the CSA actually cares about, not the Z-score itself. + +### Classify each flagged timestamp + +- **Isolated spike/drop** — one point flagged, neighbors normal. Most likely a real anomaly (outage, release, data gap). +- **Cluster** — 2+ consecutive points flagged in the same direction. Could be a short incident *or* the leading edge of drift. Flag as ambiguous and note that `metric-drift` may be a better follow-up. +- **Edge-of-window cluster** — flagged points are the most recent N points. Strongly suggestive of drift, not anomaly. Recommend running `metric-drift` before treating as an anomaly incident. + +--- + +## Phase 3 — Summarise + charts + handoff + +Produces **three things**, in order: + +1. **A single visualizer widget with two charts stacked vertically** +2. **A compact verdict card** +3. **A diagnosis payload** handed back to the skill-level flow (Step 2 in + `SKILL.md`) for the board prompt and `metric-rca` caching + +### The charts — always rendered + +Both charts render regardless of whether anything was flagged. A stable chart +is the visual proof of stability and saves the CSA from second-guessing. + +**Top chart: 7-day hourly view** (Q1-hourly series) +- Line for the hourly series. +- Dots for every flagged hourly point — red for drops, amber for spikes. Omit entirely if no flags. +- Label the most recent flagged point inline with timestamp and deviation %. +- Title: ` — last 7 days, hourly`. + +**Bottom chart: 30-day daily view** (Q1-daily series) +- Line for the daily series. +- Dots for every flagged daily point — red for drops, amber for spikes. Omit entirely if no flags. 
+- Label the most recent flagged point inline with timestamp and deviation %. +- Title: ` — last 30 days, daily`. + +Both charts share x-axis type (date/time) but not range — render as two +separate plots in one widget, stacked, with consistent y-axis formatting. + +Before generating, read `visualize:read_me` with `modules: ["chart"]` once if +not already loaded this session. Do not narrate the read_me call to the user. + +If chart generation fails, fall back to card-only output with the note +"Chart unavailable — card below." Do not block on the chart. + +### The compact verdict card + +``` +METRIC: +DEFINITION: + +━━ ANOMALY VERDICT ━━ +Hourly series (7d): +Daily series (30d): + +━━ TOP FLAGS ━━ + [isolated | cluster | edge] (z-score | IQR | both) + [isolated | cluster | edge] (z-score | IQR | both) +... (cap 5; omit section entirely if no flags) + +━━ HEADLINE ━━ + + +━━ CONFIDENCE ━━ + + +━━ NEXT STEP ━━ + + +━━ WHAT THIS ISN'T ━━ +This is point-in-time anomaly detection only. Trend-level drift is not +tested here — run `metric-drift` for that. +``` + +#### Headline phrasing discipline + +- No flags: "Metric is stable at the point-in-time level — no anomalies in the last 7 or 30 days." +- Isolated flag(s): "Metric had a [spike/drop] of X% on [date]. Baseline otherwise stable." +- Cluster or edge cluster: "Metric has [N] anomalies concentrated in the last [window] — likely the leading edge of drift. Recommend running `metric-drift` next." + +Never lead with a confidence hedge. State the finding, then qualify it. + +If >10 flags total across both series, cap the TOP FLAGS list at 5 entries +sorted by deviation magnitude descending and add a note to the headline: +"18 anomalies flagged in the last 7 days — the metric is either undergoing a +regime shift or the baseline model is wrong. Run `metric-drift` before +treating any single point as actionable." 
+ +### The diagnosis payload + +After rendering the charts and verdict card, assemble the payload defined +in `SKILL.md` Step 2 and hand it back to the skill-level flow: + +``` +{ + command: "metric-anomaly", + project_id, project_name, metric_id, + metric_name, metric_definition, metric_type, + queries: [ + { label: "Q1-hourly", window: "last 7 days", granularity: "hour", + run_query_body: , result: }, + { label: "Q1-daily", window: "last 30 days", granularity: "day", + run_query_body: , result: } + ], + verdict_card: , + headline: , + flags: { + hourly: [ { timestamp, value, deviation_pct, classification, test } , ... ], + daily: [ { timestamp, value, deviation_pct, classification, test } , ... ] + } +} +``` + +The skill-level flow (Step 2 in `SKILL.md`) then asks the user about the +board and caches the payload for `metric-rca`. Do **not** ask the board +question from inside this command — that lives at the skill level so a +user running anomaly → drift back-to-back gets asked once at the end, +not twice. + +--- + +## Special cases + +**Funnel metrics:** The hourly view is usually too noisy for a multi-step +funnel at low volume. Drop Q1-hourly and run Q1-daily only (last 14 days +instead of 30 to stay lightweight). Note in output: "Hourly anomaly detection +skipped — funnel volume too low at hourly granularity." + +**Retention metrics:** Retention is a rolling cohort metric — point-in-time +anomaly detection mostly doesn't apply. Tell the user directly and recommend +`metric-drift` instead, which has a cohort-over-cohort fallback for retention. + +**Very low-volume metrics (<100 events/day):** Skip Q1-hourly and run +Q1-daily only — the Poisson noise floor dominates at hourly granularity. +State this in the output. + +--- + +## Error handling + +| Situation | Response | +|---|---| +| Either query fails | Retry once. If still failing, mark that series partial, continue the other, note in output. | +| Both queries fail | Stop. 
Report the failure and ask the user to verify project access. | +| Project requires a filter the user didn't provide | Ask once, then proceed. Don't guess. | +| Metric returns zero events in window | Stop. The metric is either broken or the filter excludes everything. Report as a possible data quality issue; do not proceed to Phase 2. | + +--- + +## What this command deliberately doesn't do + +- **Does not test for trend-level drift.** That's `metric-drift`. +- **Does not attribute cause.** Root-cause investigation is out of scope for this command — run `metric-rca` after detection. +- **Does not produce recommendations beyond "run drift" / "run RCA".** The verdict is the product. + +Keep the surface narrow. A clean anomaly verdict in under 30 seconds is more +useful than a sprawling analysis that tries to do everything. diff --git a/plugins/mixpanel-mcp-eu/skills/monitor-metrics/commands/metric-drift.md b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/commands/metric-drift.md new file mode 100644 index 0000000..12e9456 --- /dev/null +++ b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/commands/metric-drift.md @@ -0,0 +1,319 @@ +# Command: metric-drift + +Detect trend-level drift in a single metric — whether the baseline itself has +shifted over recent weeks. Produces a verdict on *whether* the metric is in a +new regime. Does **not** test for point-in-time anomalies (run `metric-anomaly` +for that). + +--- + +## Prerequisites + +Before this command runs, Steps 0, 1, and 1.5 from `SKILL.md` must have +completed — input validation, normalized metric series object, and project +profile resolution. If any of those haven't happened, do them first. + +If the user's input is a saved report but the metric is a **funnel** or +**retention** report, see the "Special cases" section at the bottom. + +### Prerequisite — classify `metric_type` + +Before firing any queries, classify the metric into one of: +`count`, `unique_count`, `ratio`, `funnel`, `retention`, `unknown`. 
+ +| Detected | Classification | +|---|---| +| Report type `funnels` | `funnel` | +| Report type `retention` | `retention` | +| Query template has A/B form or `% of total` (conversion rate, session rate, etc.) | `ratio` | +| Single-series count (event count, event count distinct users) | `count` | +| Single-series unique count | `unique_count` | +| Formula metric / custom SQL / anything else | `unknown` | + +Store as `metric_type` on the metric series object. Used in the verdict card +and in special-case routing (funnel, retention). + +> _Keep this classification table in sync with the identical block in +> `metric-anomaly.md` — edits to one must be mirrored in the other._ + +### Prerequisite — name the drift and baseline windows + +The naming convention used throughout this command's output: + +- **`drift_window`** — the **recent** 30 days (most recent 30 days ending today). +- **`baseline_window`** — the **prior** 30 days (30 days ending 30 days before today). + +Both windows are computed from Q1-daily. The weekly test uses 8 vs 8 weeks — +those windows are reported alongside but are secondary to the daily windows +for headline purposes. + +--- + +## Phase 1 — Fetch series (2 queries, parallel) + +Fire both `Run-Query` calls simultaneously: + +| Query | Window | Granularity | Comparison | +|---|---|---|---| +| Q1-daily | Last 60 days | `day` | Last 30 days vs. prior 30 days | +| Q1-weekly | Last 16 weeks | `week` | Last 8 weeks vs. prior 8 weeks | + +The 60-day daily view catches medium-term drift. The 16-week weekly view +catches slow drift that the daily window would miss because daily noise +drowns the signal. Running both is cheap and they answer different questions. + +Use the `query_template` from the metric object; override only `dateRange` +and `unit` (granularity). Do not re-apply filters — they're already baked in. 
+ +--- + +## Phase 2 — Drift tests (mean shift + variance ratio) + +### Window split & contamination check + +For each series, split into `recent` and `prior` halves (no overlap). + +**Lightweight anomaly contamination check** (important because this command +can run standalone without `metric-anomaly` having run first): + +Scan the `recent` window for obvious outliers using a simple rule — any point +more than 3σ from the window mean. If ≥20% of points in the `recent` window +qualify → flag **"drift test potentially contaminated by outliers in the +recent window"** and mark all drift findings as low-confidence. Recommend the +user run `metric-anomaly` first. + +If fewer than 20% of points qualify, proceed normally but note the count in the +verdict card's contamination section. + +This is deliberately lighter than `metric-anomaly`'s full time-bucketed +test — its job here is only to flag contamination risk, not to produce a +publishable anomaly verdict. + +### Test 1 — Mean shift (level drift) + +``` +mean_recent = mean(recent_window) +mean_prior = mean(prior_window) +level_delta = (mean_recent − mean_prior) / mean_prior # signed % +``` + +Flag thresholds: +- `|level_delta| < 5%` → no meaningful shift +- `5% ≤ |level_delta| < 15%` → moderate drift +- `|level_delta| ≥ 15%` → significant drift + +Additionally compute a Welch's t-test on the two windows. If p < 0.05 and +`|level_delta| ≥ 5%` (in either direction), drift is statistically supported. If p ≥ 0.05, note the +shift is observational but not statistically distinguishable from noise.
+ +### Test 2 — Variance ratio (volatility drift) + +``` +var_ratio = variance(recent_window) / variance(prior_window) +``` + +Flag thresholds: +- `0.67 ≤ var_ratio ≤ 1.5` → variance stable +- `var_ratio > 1.5` → metric got noisier (investigate instrumentation, cohort mix) +- `var_ratio < 0.67` → metric got smoother (often a sign of flatlining or saturation) + +Variance drift without level drift is an under-appreciated signal — the +headline number looks fine but something structural changed. Always surface +it separately. + +Distribution-shape tests (KS, PSI) are intentionally **not** part of this +battery. They require per-user or per-segment values, which Mixpanel's MCP +surface does not return at practical cost. + +### Combine into a per-series verdict + +| Verdict | When | +|---|---| +| **No drift** | Level stable AND variance stable | +| **Level drift** | Level shifted ≥5%, variance stable | +| **Variance drift** | Level stable, variance ratio outside 0.67–1.5 | +| **Compound drift** | Level shifted ≥5% AND variance ratio outside 0.67–1.5 | + +Also report **direction** (up / down) and **magnitude** (% for level, ratio +for variance). + +### Reconcile the two series + +The 60-day-daily and 16-week-weekly views should agree on direction. Compare +their verdicts and reconcile as follows: + +- **Weekly says drift, daily says none** → slow drift that daily noise hides. Trust the weekly. +- **Daily says drift, weekly says none** → recent movement that hasn't accumulated into the weekly window yet. Could be the leading edge of real drift, or a contained incident. Trust the daily but note the weekly hasn't confirmed. +- **Both agree** → high confidence, state it.
+ +### Classify drift shape + +If drift is flagged, classify its shape using the daily series for use in +the verdict card: + +| Condition | `verdict_shape` value | +|---|---| +| Single-day change point where mean shift before vs after explains ≥60% of variance, and before/after segments are each <20% within-segment variance | `step` (record the change-point date) | +| Linear regression fit to the full 60-day series has R² ≥ 0.5 and non-zero slope | `slope` | +| 7-day autocorrelation on residuals ≥ 0.5, and periodicity strength differs between drift and baseline windows | `oscillating` | +| None of the above fit cleanly | `unclassified` | + +**Shape precedence**: if multiple shapes fit, use this priority: +`step` > `slope` > `oscillating` > `unclassified`. (Step changes are the +most actionable; surface them first when ambiguous.) + +If no drift was flagged, skip shape classification entirely. + +--- + +## Phase 3 — Summarise + charts + handoff + +Produces **three things**, in order: + +1. **A single visualizer widget with two charts stacked vertically** +2. **A compact verdict card** +3. **A diagnosis payload** handed back to the skill-level flow (Step 2 in + `SKILL.md`) for the board prompt and `metric-rca` caching + +### The charts — always rendered + +Both charts render regardless of whether drift was detected. A stable chart +is the visual proof of stability. + +**Top chart: 60-day daily view** (Q1-daily series) +- Line for the daily series. +- **Shaded band** for the prior 30-day baseline window (subtle grey fill). +- **Shaded band** for the recent 30-day drift window — red-tinted fill if drift is `down`, green-tinted if `up`, amber-tinted if `mixed`, grey if no drift. +- Horizontal line for `mean_prior` (dashed grey). +- Horizontal line for `mean_recent` (dashed, colored to match drift direction). +- If `verdict_shape = step`, annotate the change-point date with a vertical dashed line. +- Title: ` — last 60 days, daily`. 
+ +**Bottom chart: 16-week weekly view** (Q1-weekly series) +- Line for the weekly series. +- **Shaded band** for the prior 8-week baseline window (subtle grey fill). +- **Shaded band** for the recent 8-week drift window — same direction-based coloring as above. +- Horizontal lines for `mean_prior_weekly` (dashed grey) and `mean_recent_weekly` (dashed, colored). +- Title: `<metric_name> — last 16 weeks, weekly`. + +Both charts share x-axis type (date) and consistent y-axis formatting. +Render as two separate plots in one widget, stacked. + +Before generating, read `visualize:read_me` with `modules: ["chart"]` once if +not already loaded this session. Do not narrate the read_me call to the user. + +If chart generation fails, fall back to card-only output with the note +"Chart unavailable — card below." Do not block on the chart. + +### The compact verdict card + +``` +METRIC: <metric_name> +DEFINITION: <metric_definition> + +━━ DRIFT VERDICT ━━ +60-day / daily view: <verdict, direction, level_delta %, var_ratio> (t-test p = <t_test_p>) +16-week / weekly view: <verdict, direction, level_delta %, var_ratio> +Reconciled verdict: <reconciled verdict> +Shape: <step (with change-point date) | slope | oscillating | unclassified; omit if no drift> + +━━ CONTAMINATION ━━ +<outlier count in the recent window; whether the drift test is flagged as contaminated> + +━━ HEADLINE ━━ +<one-line headline — see phrasing discipline below> + +━━ CONFIDENCE ━━ +<confidence level (e.g. high / low) and the one-line reason> + +━━ NEXT STEP ━━ +<recommended next step, e.g. run `metric-anomaly` first / run `metric-rca`> + +━━ WHAT THIS ISN'T ━━ +This is trend-level drift detection only. Point-in-time anomalies are not +tested here — run `metric-anomaly` for that. +``` + +#### Headline phrasing discipline + +- No drift: "Metric is stable — trend has not shifted in the last 30 days or 8 weeks." +- Level drift: "Metric has drifted [up/down] by X% over the last 30 days. [Weekly view confirms / Weekly view hasn't confirmed yet]." +- Variance drift only: "Metric level is stable but volatility has [increased/decreased] — variance ratio [X.XX]. Something structural changed without moving the headline." +- Compound drift: "Metric has drifted [up/down] by X% AND volatility changed. Compound drift — investigate both level and structure." +- Contamination flag: append "Drift confidence is low — recent window has N outlier points. Run `metric-anomaly` first to clean up before attributing." + +Never lead with a confidence hedge. State the finding, then qualify it. 
+ +### The diagnosis payload + +After rendering the charts and verdict card, assemble the payload defined +in `SKILL.md` Step 2 and hand it back to the skill-level flow: + +``` +{ + command: "metric-drift", + project_id, project_name, metric_id, + metric_name, metric_definition, metric_type, + queries: [ + { label: "Q1-daily", window: "last 60 days", granularity: "day", + run_query_body: , result: }, + { label: "Q1-weekly", window: "last 16 weeks", granularity: "week", + run_query_body: , result: } + ], + verdict_card: , + headline: , + flags: { + daily: { verdict, direction, level_delta, var_ratio, t_test_p, shape, change_point_date }, + weekly: { verdict, direction, level_delta, var_ratio }, + reconciled: , + contamination: { outlier_count, contaminated: bool } + } +} +``` + +The skill-level flow (Step 2 in `SKILL.md`) then asks the user about the +board and caches the payload for `metric-rca`. Do **not** ask the board +question from inside this command — that lives at the skill level so a +user running anomaly → drift back-to-back gets asked once at the end, +not twice. + +--- + +## Special cases + +**Funnel metrics:** Phase 1 and Phase 2 work as-is for multi-step funnels +— the overall conversion series is what drifts. No special handling needed. + +**Retention metrics:** Retention is a rolling cohort metric — "drift" on a +retention curve means cohort-over-cohort degradation. Replace the 60-day +daily and 16-week weekly splits with a cohort-over-cohort comparison: last +8 cohorts vs. prior 8 cohorts on the same retention day (D1, D7, D30). Flag +which retention day shifted. Note in the verdict card: "Retention +cohort-over-cohort comparison used in place of daily/weekly split." + +**Very low-volume metrics (<100 events/day):** The tests still apply but +statistical confidence drops sharply. Downgrade confidence to `low` regardless +of `level_delta` magnitude and note: "Low-volume metric — drift signal may be +Poisson noise." 
+ +--- + +## Error handling + +| Situation | Response | +|---|---| +| Either query fails | Retry once. If still failing, mark that series partial, continue the other, note in output. | +| Both queries fail | Stop. Report the failure and ask the user to verify project access. | +| Project requires a filter the user didn't provide | Ask once, then proceed. Don't guess. | +| Metric returns zero events in window | Stop. The metric is either broken or the filter excludes everything. Report as a possible data quality issue; do not proceed to Phase 2. | + +--- + +## What this command deliberately doesn't do + +- **Does not detect point-in-time anomalies.** That's `metric-anomaly`. +- **Does not attribute cause.** Root-cause investigation is handled by `metric-rca` after detection. +- **Does not produce recommendations beyond "run anomaly first" / "run RCA".** The verdict is the product. + +Keep the surface narrow. A clean drift verdict in under 60 seconds is more +useful than a sprawling analysis that tries to do everything. diff --git a/plugins/mixpanel-mcp-eu/skills/monitor-metrics/commands/metric-rca.md b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/commands/metric-rca.md new file mode 100644 index 0000000..ac2bf98 --- /dev/null +++ b/plugins/mixpanel-mcp-eu/skills/monitor-metrics/commands/metric-rca.md @@ -0,0 +1,484 @@ +# Command: metric-rca + +Root-cause investigation for a flagged metric. Takes the diagnosis payload +from a prior `metric-anomaly` or `metric-drift` run and fans out across a +set of segmentation branches to localise *where* the movement concentrated. +Produces a ranked list of findings and appends them to the diagnosis board +the user already created. + +This command does **not** re-run anomaly or drift detection. It assumes the +movement has already been established — its job is attribution, not +detection. 
+ +--- + +## Prerequisites + +Before this command runs, the session must hold a **diagnosis payload** in +conversation memory from an earlier `metric-anomaly` or `metric-drift` run +(see `SKILL.md` Step 2). The payload carries the project, metric, metric +type, date ranges, flagged points or drift windows, and the query bodies +used. + +If no payload exists, do **not** attempt to run RCA from a cold start. Tell +the user: *"RCA runs on top of an existing anomaly or drift diagnosis. Run +`metric-anomaly` or `metric-drift` first, then come back here."* Stop. + +### Board state + +If the user persisted the diagnosis as a Mixpanel board (Step 2 in +`SKILL.md`), the payload will include `diagnosis_board_id`. This command +**appends** to that board — it does not create a new one. If no board was +created, skip the append step at the end and just return the findings +inline; do not silently create a new board. + +### Ask once — business / market context + +Before firing Branch 5, ask the user exactly once: + +> *"What business or market is this metric tied to? (e.g., Indian +> e-commerce, Indian OTT streaming, SEA fintech.) I'll use this to check +> whether the flagged dates line up with festivals, launches, or +> category-specific events."* + +Hold the answer as `business_context`. If the user skips or says "not +relevant", skip Branch 5 entirely — do not guess the market from project +name or memory. + +--- + +## Phase 1 — Branch selection + parallel fan-out + +Read the payload and decide which branches to run. Every branch runs +against the **same date ranges** the source command used: + +- `metric-anomaly` payload → use 7-day hourly + 30-day daily windows. +- `metric-drift` payload → use 60-day daily + 16-week weekly windows, with + recent vs prior window comparison preserved. 
+ +If both payloads exist in the session (user ran anomaly then drift), +prefer the drift payload's date ranges — RCA over a longer window is more +useful — and annotate findings with the anomaly payload's flagged +timestamps for cross-reference. + +### Branch selection matrix + +| Branch | Purpose | Runs when | +|---|---|---| +| **Branch 1 — Component decomposition** | Break ratio/funnel/retention into its component events + metric-definition filters | `metric_type ∈ {ratio, funnel, retention}` | +| **Branch 2 — Default-property breakdowns** | Source → geography → client-specific split | Always | +| **Branch 3 — Distinct-ID outliers** | Find whether a small set of users drove the movement | Anomaly payload only. Skip if in-window distinct user count >10k | +| **Branch 4 — Cohort comparison** | Run the metric filtered to the cohorts the user names to find concentration in named user segments | The user named one or more cohorts (or referenced a cohort in their ask) | +| **Branch 5 — Calendar context** | Check whether flagged dates line up with festivals, launches, category events in `business_context` | `business_context` provided | + +Run all selected branches **in parallel** via concurrent `Run-Query` calls. +Each branch can issue multiple queries; batch within a branch sequentially +if one query's result informs the next (Branch 2's second level depends +on the first). + +--- + +## Branch 1 — Component decomposition + +Only runs for `ratio`, `funnel`, and `retention` metrics. The question: +*is the movement in the numerator, the denominator, or a specific step?* + +**If the metric came from a saved Mixpanel Metric** (`metric_id` is set on +the payload), read the component events, formula, and filters straight from +the `Get-Metric` definition rather than re-deriving them — the definition is +authoritative and avoids guessing the numerator/denominator. Fall back to +the derivation below only when no saved-Metric definition is available. + +### For `ratio` +1. 
Pull numerator event as a standalone count series (same window, + granularity, and filters from the metric definition). +2. Pull denominator event as a standalone count series (same window, + granularity, and filters). +3. Compare each component's deviation % against the ratio's overall + deviation %. Flag which component moved. +4. If both components moved in the same direction by similar magnitude → + the ratio is stable but volumes shifted. Note as a volume story, not a + conversion story. +5. If only one moved, or they moved opposite directions → the ratio + shift is concentration-driven. Identify which. + +### For `funnel` +1. Run the **same funnel definition** twice as `report_type=funnels` via + `Run-Query`: once for the recent (drift/anomaly) window, once for the + baseline window. The native funnels response returns step conversion + rates and absolute counts per step. +2. For each step pair, compute the conversion-rate delta between recent + and baseline. +3. Flag the **specific step pair** with the largest absolute conversion + drop. One step usually owns the drop; surface that pair as the + headline finding. +4. If the funnel has step-level filters (e.g. property filters on + individual steps), do not decompose into standalone event counts — + the filters change the meaning. The native funnels query is the only + faithful comparison. + +This replaces the prior "pull each funnel step as a standalone event +count" approach. Standalone event counts ignore step ordering and +step-level filters; the native funnels report does not. + +### For `retention` +1. Pull the cohort-defining event as a standalone count series. +2. Pull the return event as a standalone count series. +3. Check whether cohort size changed, return count changed, or both. +4. A drop in retention with stable return count + larger cohort is a mix + effect; a drop in return count with stable cohort is real attrition. 
+ +### Event × metric-definition filter combinations + +For every component event above, re-run it with **each filter from the +metric definition applied independently** (i.e. one filter at a time, not +all combinations — combinatorial blowup is not useful here). This shows +whether a specific filter value concentrates the movement. + +Example: if the metric definition has `user_type = premium` baked in, +and the numerator event is `video_play`, run: +- `video_play` with no filter +- `video_play` with `user_type = premium` (the baked filter) — this + should match the metric's numerator +- `video_play` broken down **by** `user_type` (all values) — exposes + whether the movement is specific to `premium` or shared across the + population. + +Cap at 5 filter values per property breakdown; drop the long tail. + +--- + +## Branch 2 — Default-property breakdowns + +Two-level cascade. Always runs. + +### Level 1 — Source segmentation + +Break down the metric by the SDK / ingestion source. Two properties +together: + +- Event property `mp_lib` (string) — SDK name (e.g. `web`, `android`, + `iphone`, `swift`, `python`, `ruby`, `java`). +- Event property `$import` (boolean) — true for events ingested via the + Import API, false for Track API. + +Output: a matrix of `mp_lib × $import` with deviation % per cell. The +goal here is to isolate whether the movement is concentrated in +client-side vs server-side vs Import API ingestion. + +### Level 2 — Conditional breakdowns + +The Level 2 slice depends on what Level 1 surfaced. Run the slice whose +dominant source owns the movement; skip the others. 
+ +**For client-side sources (`web`, `android`, `iphone`, `swift`, etc.):** +Common first slice — geography in a step function: +- Event property `$os` +- Event property `platform` (or the project's equivalent; check the + metric definition or fall back to `mp_lib` if not present) +- Event property `mp_country_code` +- Event property `$region` +- Event property `$city` + +Run these as a **step function**, not a cross-product: start with +`mp_country_code`. If one country owns >50% of the movement, break that +country down by `$region`. If one region owns >50%, break by `$city`. +Stop when the concentration flattens. + +**For `web` specifically:** +- Event property `$device` +- Event property `utm_source` +- Event property `$browser` + +**For `android` / `iphone` / `swift` / `ios`:** +- Event property `$app_version_string` +- Event property `$model` + +Run these as single-property breakdowns, not two-level (avoids the +high-cardinality two-level truncation risk that bites large projects). + +### Cardinality discipline + +- Any breakdown returning exactly 1,000 / 3,000 / 10,000 rows is + potentially truncated — flag in findings, do not treat the result as + exhaustive. +- If a two-level breakdown (`mp_lib × $import`) is used, keep the + first-level cardinality bounded: if `mp_lib` returns >20 distinct + values, filter to the top 10 by volume before running the second + level. + +--- + +## Branch 3 — Distinct-ID outliers + +Only runs for anomaly payloads. Goal: is a small set of users +responsible for the flagged point(s)? + +### Cardinality gate + +Before running, check in-window distinct user count against the metric's +base query. If >10,000 distinct users contributed to the metric in the +flagged window, skip this branch and note "Branch 3 skipped — user +cardinality too high for outlier detection via MCP." A top-N breakdown +on 100k users returns noise. + +### If within cardinality + +1. 
Break the metric down by `distinct_id` for the flagged window only + (not the whole series — this keeps the query tractable). +2. Rank users by their contribution to the metric in the flagged window. +3. Flag outliers: users whose contribution in the flagged window is + >5σ above the median user's contribution, OR users who appear in + the flagged window but not in the baseline window. +4. Cap output at the top 20 distinct_ids by deviation. + +If the top 5 users account for >30% of the movement → strong user-driven +outlier signal. Surface this prominently. Could be bots, internal test +traffic, or a single high-volume customer. + +### Optional follow-up — session replay context + +If the top 3 distinct_ids each account for ≥10% of the movement individually, +offer the user a follow-up: *"Top user(s) `<distinct_ids>` drove [X]% of the +flagged window. Want me to pull their session replays from that window so +you can see what they did?"* + +If the user says yes, call `Get-User-Replays-Data` for each flagged +distinct_id with `from_date` and `to_date` set to the flagged window. Cap at +3 distinct_ids and 5 replays per user. Surface the replay URLs + timestamps +in the findings card under the Branch 3 section. + +This is **opt-in only** — do not pull replays automatically. Replays add +value when the customer wants the "what did they actually do" answer, but +they're noisy if Session Replay isn't widely enabled in the project. Ask +once, run if confirmed, skip if declined. + +--- + +## Branch 4 — Cohort comparison + +Goal: is the movement concentrated in a specific user cohort the customer +already cares about? Cohorts are typically the most CSA-actionable RCA +signal — "your churn-risk cohort dropped 40%" is a far better headline than +"users on iOS 17.4 dropped 40%."
+ +### Step 1 — Identify candidate cohorts + +The Mixpanel MCP surface has **no cohort-listing tool** — `Search-Entities` +does not support a `cohort` entity type (its types are insights, funnels, +flows, retention, dashboard, launch-analysis, experiments, feature-flags, +metric-trees, playlists, heat-maps). Branch 4 therefore cannot auto-discover +cohorts; source them from the user instead: + +1. If the user named cohorts in their original ask (e.g. "is this happening + in our power users?"), use those. +2. Otherwise, ask once: *"Want me to compare against any saved cohorts? If + so, name them (or share their cohort IDs) and I'll filter the metric to + each."* + +If the user names no cohorts (or declines) → record *"Branch 4 skipped — no +cohorts named; cohort auto-discovery isn't available on the MCP surface."* +and continue. + +### Step 2 — Resolve the named cohorts + +Cap at the **top 5 cohorts** the user named. For each, resolve its +`cohort_id` — the user may give a name or an id; if only a name is given, +confirm it back before filtering. If the user named more than 5, ask which +five matter most. + +Surface the cohort names in the findings — the customer recognizes their +own cohort names and that's part of the value. + +### Step 3 — Run the metric filtered by each cohort + +For each selected cohort, run the same `query_template` as the headline +metric, with one cohort-membership filter added. The exact filter shape +comes from `Get-Query-Schema` — Mixpanel's query schema accepts cohort +membership as a filter on `distinct_id` referencing the cohort_id. + +Run all cohort queries in parallel via concurrent `Run-Query` calls. Each +query covers the same date window the source command used (drift window +or anomaly window). + +### Step 4 — Score and rank + +For each cohort, compute the same concentration + deviation scores used +in the Phase 2 ranking step (cohort_delta_abs / total_delta_abs and the +cohort's own deviation %). 
Treat cohorts as candidate findings the same +way property breakdowns are treated. + +A cohort is **important** if either: +- It explains ≥30% of the headline movement (lower threshold than the + default 40% — cohorts are smaller slices than top-level properties, + and 30% concentration in a named cohort is a strong signal), OR +- Its individual deviation is ≥1.5× the headline metric's deviation. + +### Error handling + +| Situation | Response | +|---|---| +| User names no cohorts | Skip branch, record reason. | +| A cohort filter fails in `Run-Query` (cohort schema mismatch) | Retry once. If still failing, skip that cohort, continue others, note in branch coverage. | +| All cohort queries fail | Skip branch, note "Branch 4 skipped — cohort filtering failed across all cohorts." | + +--- + +## Branch 5 — Calendar context + +Only runs if the user provided `business_context`. + +1. Identify the key dates in the flagged window. For anomaly payloads, + use the timestamps from `payload.flags.hourly` and `payload.flags.daily`. + For drift payloads, use the change-point date if `shape = step`, or + the start of the drift window otherwise. +2. Run a `web_search` with a query built from `business_context` + the + relevant date(s). Example: if `business_context = "Indian e-commerce"` + and the change-point is `2026-03-08`, search `"Indian e-commerce + events March 8 2026 festival sale"`. If `web_search` isn't available in + this runtime, skip Branch 5 and record *"Branch 5 skipped — web search + unavailable in this runtime"* (mirrors the no-`business_context` skip); + the other four branches still run. +3. Look for matches: religious festivals, cricket fixtures, sale events + (BBD, EOSS, GOSF), product launches, regulatory dates (e.g. RBI policy + announcements). +4. If a plausible match surfaces, include it in findings with a + confidence label: `strong` (exact date match, major event), `moderate` + (same week, category-aligned), `weak` (same month, tangential). +5. 
If nothing surfaces, record: *"No calendar events found for + `<business_context>` on the flagged dates."* + +This branch is **context**, not **evidence**. Phrase findings as "the +flagged date falls on [event]" — never as "the [event] caused the +movement." Correlation only; causation belongs to the customer. + +--- + +## Phase 2 — Synthesise, rank, visualise + +### Rank findings + +For every branch, each sub-segment (a `mp_lib` value, a country, a funnel +step, a distinct_id, etc.) is a candidate finding. Score each: + +- **Concentration score** — share of the total movement this segment + explains. `segment_delta_abs / total_delta_abs`. A segment with 70% + concentration is worth surfacing; 5% is not. +- **Deviation score** — this segment's deviation % compared to its own + baseline. A segment that individually deviated 40% is a stronger signal + than one that deviated 5%. + +Flag a finding as **"important"** if **either** of these is true: +- Concentration score ≥ 0.4 (one segment owns ≥40% of the movement), OR +- Segment deviation ≥ 1.5× the headline metric's deviation (the movement + concentrates here). + +Cap total important findings at 6. If more than 6 qualify, keep the top 6 +by concentration × deviation combined rank. + +### Visualise important findings + +Render a single visualizer widget containing one chart per important +finding, stacked vertically.
Chart type by branch: + +| Branch | Chart | +|---|---| +| Branch 1 (component) | Two-line overlay: headline metric vs component metric, same window, same granularity | +| Branch 2 (property breakdown) | Horizontal bar chart, one bar per segment, bar length = deviation %, color-coded by direction | +| Branch 3 (distinct_id) | Horizontal bar chart, top-N users by contribution % in flagged window | +| Branch 4 (cohort) | Horizontal bar chart, one bar per important cohort, bar length = deviation %, color-coded by direction | +| Branch 5 (calendar) | No chart — rendered as an annotation in the written findings block | + +Before generating, read `visualize:read_me` with `modules: ["chart"]` +once if not already loaded this session. Do not narrate the read_me call. + +### The findings card + +``` +METRIC: +DIAGNOSIS SOURCE: +WINDOW: + +━━ HEADLINE ━━ + + +━━ IMPORTANT FINDINGS (ranked) ━━ +1. [Branch N] of movement, + vs baseline. . +2. ... +(cap 6; omit section if no important findings) + +━━ BRANCH COVERAGE ━━ +Branch 1 (component): +Branch 2 (default props): +Branch 3 (distinct_id): +Branch 4 (cohort): +Branch 5 (calendar): + +━━ WHAT THIS ISN'T ━━ +This is attribution by segmentation, not causal analysis. Findings show +where the movement concentrated; they do not prove what caused it. +Calendar matches are correlation only. +``` + +### The RCA payload (passed back to SKILL.md) + +After rendering the findings card + charts, hand back to the skill-level +flow: + +``` +{ + command: "metric-rca", + project_id, project_name, metric_id, + metric_name, metric_definition, metric_type, + source_payload_command: "metric-anomaly" | "metric-drift", + business_context: , + rca_queries: [ + { branch: int, label: str, run_query_body: dict, result: dict }, ... + ], + important_findings: [ + { branch: int, segment: str, concentration_pct: float, + deviation_pct: float, interpretation: str, + chart_spec: dict }, + ... 
(cap 6) + ], + findings_card: , + headline: , + diagnosis_board_id: +} +``` + +The skill-level flow (Step 3 in `SKILL.md`, added with this command) +handles the board append. + +--- + +## Error handling + +| Situation | Response | +|---|---| +| No diagnosis payload in session | Stop. Tell user to run `metric-anomaly` or `metric-drift` first. | +| A branch query fails | Retry once. If still failing, mark that branch partial, continue others, note in branch coverage. | +| All branches fail | Stop. Report failure and ask the user to verify project access. | +| Branch 2 Level 1 returns only one `mp_lib × $import` cell with meaningful volume | Skip Branch 2 Level 2 conditional logic; run the fallback geography step function directly. | +| User declines to provide `business_context` | Skip Branch 5 entirely, proceed with others. | +| `web_search` unavailable in this runtime | Skip Branch 5, record "Branch 5 skipped — web search unavailable." Other branches continue. | +| No important findings after ranking (all segments <40% concentration and <1.5× deviation) | Surface that finding: "Movement is distributed across segments — no single dimension concentrates it." This is a valid, useful result. | + +--- + +## What this command deliberately doesn't do + +- **Does not re-run anomaly or drift detection.** It consumes the payload. +- **Does not claim causation.** Correlation by segmentation is the ceiling. +- **Does not cross-join properties combinatorially.** Branch 2 is a + step-function cascade, not a cross-product, because high-cardinality + two-level breakdowns truncate silently. +- **Does not source calendar dates from memory.** Always `web_search` + with the user-provided `business_context` (skips gracefully if web search + is unavailable). +- **Does not create a new board.** Appends to the existing diagnosis + board via the skill-level flow. + +Keep the surface narrow. 
A ranked list of 3-6 concentrated segments with +charts beats a 40-branch exhaustive report every time. diff --git a/plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000700000001 b/plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000700000001 new file mode 100644 index 0000000..11a1684 --- /dev/null +++ b/plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000700000001 @@ -0,0 +1,459 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires Mixpanel MCP. +--- + +# Monitor Metrics + +A focused diagnostic skill for a single metric at a time. Works for any +project the user has access to. Requires the Mixpanel MCP. Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. 
**Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. + +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. 
+Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. +- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. + +### Step 0a — Resolve org/project context first + +Before validating the project, call `Mixpanel MCP:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. 
This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `Mixpanel MCP:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `<project_name>` (id: `<project_id>`) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `Mixpanel MCP:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. | + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `Mixpanel MCP:List-Metrics` with `project_id` and `query=<metric name>`. If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1.
| +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `Mixpanel MCP:Get-Metric` with `project_id` and `metric_id`. +2. The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. +4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. 
Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report//` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). | +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. | + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. 
Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. +This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `Get-Properties` with + `property_names=[]` and `resource_type=`. + If it doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. 
If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once, scoped to the events used by `query_template` +(`event_name=<event>` for each), with `since_date` set to the earliest +date the diagnosis will look at (16 weeks back for drift — its weekly window reaches further back than its 60-day daily one — and 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in.
+- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. + +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. 
+ +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags: `

<h1>`, `<h2>`, `<h3>`, + `<strong>`, `<ul>`, `<li>`, `<p>
                `, etc. — do not insert newline characters; each block element + renders on its own line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `<metric_name>, <label>` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title=<metric_name> + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim. 
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + `<metric_name> — RCA: <finding>` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass. 
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000800000002 b/plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000800000002 new file mode 100644 index 0000000..954c4fe --- /dev/null +++ b/plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000800000002 @@ -0,0 +1,459 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires Mixpanel MCP. +--- + +# Monitor Metrics + +A focused diagnostic skill for a single metric at a time. 
Works for any +project the user has access to. Requires the Mixpanel MCP. Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. **Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. 
+ +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. +Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. +- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. 
+ +### Step 0a — Resolve org/project context first + +Before validating the project, call `mixpanel-mcp-in:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `mixpanel-mcp-in:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `<project_name>` (id: `<project_id>`) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `mixpanel-mcp-in:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. | + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `mixpanel-mcp-in:List-Metrics` with `project_id` and `query=<metric name>`. 
If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1. | +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `mixpanel-mcp-in:Get-Metric` with `project_id` and `metric_id`. +2. The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. 
+4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report//` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). | +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. 
| + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. +This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `Get-Properties` with + `property_names=[]` and `resource_type=`. 
+ If it doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once per event used by `query_template` +(`event_name=<event>`), with `since_date` set to the earliest +date the diagnosis will look at (16 weeks back for drift, to cover its weekly window; 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. 
The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in. +- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. + +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. 
If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. + +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags: `

<h1>`, `<h2>`, `<h3>`, + `<strong>`, `<ul>`, `<li>`, `<p>
                  `, etc. — do not insert newline characters; each block element + renders on its own line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `<metric_name>, <label>` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title=<metric_name> + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim. 
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + `<metric_name> — RCA: <finding>` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass. 
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000900000003 b/plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000900000003 new file mode 100644 index 0000000..5b317fe --- /dev/null +++ b/plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000900000003 @@ -0,0 +1,459 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires Mixpanel MCP. +--- + +# Monitor Metrics + +A focused diagnostic skill for a single metric at a time. 
Works for any +project the user has access to. Requires the `mixpanel-mcp-in` connector (Mixpanel India). Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. **Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. 
+ +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. +Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. +- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. 
+ +### Step 0a — Resolve org/project context first + +Before validating the project, call `mixpanel-mcp-in:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `mixpanel-mcp-in:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `` (id: ``) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `mixpanel-mcp-in:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. | + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `mixpanel-mcp-in:List-Metrics` with `project_id` and `query=`. 
If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1. | +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `mixpanel-mcp-in:Get-Metric` with `project_id` and `metric_id`. +2. The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. 
+4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report/<bookmark_id>/` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). | +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=<name>`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. 
| + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. +This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `Get-Properties` with + `property_names=[]` and `resource_type=`. 
+ If it doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once, scoped to the events used by `query_template` +(`event_name=` for each), with `since_date` set to the earliest +date the diagnosis will look at (60 days back for drift, 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. 
The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in. +- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. + +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. 
If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. + +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags: `
<h2>`, `<h3>`, `<p>`, + `<strong>`, `<ul>`, `<li>`, `<br>
                    `, etc. — do not insert literal newline characters; each element + already renders on its own line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `<metric_name>, <query label>` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title=<metric_name> + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim. 
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + `<metric_name> — RCA: <finding label>` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass. 
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000a00000004 b/plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000a00000004 new file mode 100644 index 0000000..831ac65 --- /dev/null +++ b/plugins/mixpanel-mcp-in/skills/monitor-metrics/.fuse_hidden0000000a00000004 @@ -0,0 +1,459 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires the `mixpanel-mcp-in` connector (Mixpanel India). 
+--- + +# Monitor Metrics + +A focused diagnostic skill for a single metric at a time. Works for any +project the user has access to. Requires the `mixpanel-mcp-in` connector (Mixpanel India). Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. **Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. 
+ +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. +Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. +- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. 
+ +### Step 0a — Resolve org/project context first + +Before validating the project, call `mixpanel-mcp-in:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `mixpanel-mcp-in:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `<project_name>` (id: `<project_id>`) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `mixpanel-mcp-in:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. | + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `mixpanel-mcp-in:List-Metrics` with `project_id` and `query=<metric name>`. 
If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1. | +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `mixpanel-mcp-in:Get-Metric` with `project_id` and `metric_id`. +2. The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. 
+4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report/<bookmark_id>/` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). | +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=<name>`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. 
| + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. +This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `Get-Properties` with + `property_names=[]` and `resource_type=`. 
+ If it doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once, scoped to the events used by `query_template` +(`event_name=` for each), with `since_date` set to the earliest +date the diagnosis will look at (60 days back for drift, 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. 
The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in. +- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. + +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. 
If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. + +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags: `<h1>`, `<h2>`, `<p>`, + `<strong>`, `<ul>`, `<li>`, `<br>`, etc. — do not insert newline characters; each element + renders on its own line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `<metric_name>, <window>` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title=<metric_name> + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim. 
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + ` — RCA: ` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass. 
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp-in/skills/monitor-metrics/SKILL.md b/plugins/mixpanel-mcp-in/skills/monitor-metrics/SKILL.md new file mode 100644 index 0000000..a2f3849 --- /dev/null +++ b/plugins/mixpanel-mcp-in/skills/monitor-metrics/SKILL.md @@ -0,0 +1,462 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires the `mixpanel-mcp-in` connector (Mixpanel India). 
+--- + +# Monitor Metrics + +> **Connector:** This skill operates exclusively against the `mixpanel-mcp-in` connector (Mixpanel India region). Every Mixpanel MCP tool call in this SKILL.md and in every file under `commands/` must be routed through `mixpanel-mcp-in` — never any other Mixpanel connector. + +A focused diagnostic skill for a single metric at a time. Works for any +project the user has access to. Requires the `mixpanel-mcp-in` connector (Mixpanel India). Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. **Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. 
Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. + +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. +Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. 
+- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. + +### Step 0a — Resolve org/project context first + +Before validating the project, call `mixpanel-mcp-in:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `mixpanel-mcp-in:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `<project_name>` (id: `<project_id>`) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `mixpanel-mcp-in:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. 
| + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `mixpanel-mcp-in:List-Metrics` with `project_id` and `query=`. If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1. | +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `mixpanel-mcp-in:Get-Metric` with `project_id` and `metric_id`. +2. 
The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. +4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report//` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). 
| +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. | + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. 
+This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `List-Properties` with + `names=[]` and `resource_type=` (pass + `events=[]` to scope to a specific event's properties). If it + doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once, scoped to the events used by `query_template` +(`event_name=` for each), with `since_date` set to the earliest +date the diagnosis will look at (60 days back for drift, 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. 
exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in. +- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. 
+ +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. + +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags: `<h1>`, `<h2>`, `<p>`, + `<strong>`, `<ul>`, `<li>`, `<br>`, etc. — do not insert newline characters; each element + renders on its own line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `<metric_name>, <window>` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title=<metric_name> + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim. 
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + ` — RCA: ` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass. 
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp-in/skills/monitor-metrics/commands/metric-anomaly.md b/plugins/mixpanel-mcp-in/skills/monitor-metrics/commands/metric-anomaly.md new file mode 100644 index 0000000..25530aa --- /dev/null +++ b/plugins/mixpanel-mcp-in/skills/monitor-metrics/commands/metric-anomaly.md @@ -0,0 +1,236 @@ +# Command: metric-anomaly + +Detect point-in-time anomalies in a single metric — recent spikes, drops, and +clusters. Produces a verdict on *whether* something unusual happened at a +specific moment. Does **not** test for trend-level drift (run `metric-drift` +for that). + +--- + +## Prerequisites + +Before this command runs, Steps 0, 1, and 1.5 from `SKILL.md` must have +completed — input validation, normalized metric series object, and project +profile resolution. If any of those haven't happened, do them first. + +If the user's input is a saved report but the metric is a **funnel** or +**retention** report, see the "Special cases" section at the bottom. + +### Prerequisite — classify `metric_type` + +Before firing any queries, classify the metric into one of: +`count`, `unique_count`, `ratio`, `funnel`, `retention`, `unknown`. + +| Detected | Classification | +|---|---| +| Report type `funnels` | `funnel` | +| Report type `retention` | `retention` | +| Query template has A/B form or `% of total` (conversion rate, session rate, etc.) 
| `ratio` | +| Single-series count (event count, event count distinct users) | `count` | +| Single-series unique count | `unique_count` | +| Formula metric / custom SQL / anything else | `unknown` | + +Store as `metric_type` on the metric series object. Used in the verdict card +and in special-case routing (funnel, retention). + +> _Keep this classification table in sync with the identical block in +> `metric-drift.md` — edits to one must be mirrored in the other._ + +--- + +## Phase 1 — Fetch series (2 queries, parallel) + +Fire both `Run-Query` calls simultaneously: + +| Query | Window | Granularity | Purpose | +|---|---|---|---| +| Q1-hourly | Last 7 days | `hour` | Recent-blip detection | +| Q1-daily | Last 30 days | `day` | Recent-day detection against a fuller baseline | + +Use the `query_template` from the metric object; override only `dateRange` +and `unit` (granularity). Do not re-apply filters — they're already baked in. + +Build the `Run-Query` body from `query_template` with only `dateRange` and +`unit` (granularity) overridden. Use `timeComparison` when a single call can +cover both windows. + +--- + +## Phase 2 — Outlier tests (Z-score + IQR, time-bucketed) + +For each series independently, compute the expected range at every timestamp. +Run **both** tests; flag a point if **either** test flags it. Report which +test(s) caught each flag. + +### Test 1 — Z-score against time-bucketed mean + +- For the **hourly** series: group all points by hour-of-day (0–23; 24 buckets, ~7 points each). Do not also split by day-of-week — a 7-day window holds only one point per hour-of-day × day-of-week cell, which leaves σ undefined. Compute mean (μ) and stddev (σ) per bucket across the 7-day window. Flag any point where `|value - μ| / σ > 2.5`. +- For the **daily** series: group by day-of-week (7 buckets). Compute μ and σ across the 30-day window. Flag any point where `|value - μ| / σ > 2.5`. +- Handle low-variance buckets: if σ is <5% of μ, skip the Z-score for that bucket and fall back to IQR only (division by tiny σ creates false alarms). 
+ +### Test 2 — IQR against time-bucketed median + +- Same bucketing scheme as Test 1. +- For each bucket, compute Q1, median, Q3, and IQR = Q3 − Q1. +- Flag any point where `value < Q1 − 1.5 × IQR` or `value > Q3 + 1.5 × IQR`. + +### Deviation magnitude + +For every flagged point, report `(value − median) / median` as a signed +percentage. This is what the CSA actually cares about, not the Z-score itself. + +### Classify each flagged timestamp + +- **Isolated spike/drop** — one point flagged, neighbors normal. Most likely a real anomaly (outage, release, data gap). +- **Cluster** — 2+ consecutive points flagged in the same direction. Could be a short incident *or* the leading edge of drift. Flag as ambiguous and note that `metric-drift` may be a better follow-up. +- **Edge-of-window cluster** — flagged points are the most recent N points. Strongly suggestive of drift, not anomaly. Recommend running `metric-drift` before treating as an anomaly incident. + +--- + +## Phase 3 — Summarise + charts + handoff + +Produces **three things**, in order: + +1. **A single visualizer widget with two charts stacked vertically** +2. **A compact verdict card** +3. **A diagnosis payload** handed back to the skill-level flow (Step 2 in + `SKILL.md`) for the board prompt and `metric-rca` caching + +### The charts — always rendered + +Both charts render regardless of whether anything was flagged. A stable chart +is the visual proof of stability and saves the CSA from second-guessing. + +**Top chart: 7-day hourly view** (Q1-hourly series) +- Line for the hourly series. +- Dots for every flagged hourly point — red for drops, amber for spikes. Omit entirely if no flags. +- Label the most recent flagged point inline with timestamp and deviation %. +- Title: ` — last 7 days, hourly`. + +**Bottom chart: 30-day daily view** (Q1-daily series) +- Line for the daily series. +- Dots for every flagged daily point — red for drops, amber for spikes. Omit entirely if no flags. 
+- Label the most recent flagged point inline with timestamp and deviation %. +- Title: ` — last 30 days, daily`. + +Both charts share x-axis type (date/time) but not range — render as two +separate plots in one widget, stacked, with consistent y-axis formatting. + +Before generating, read `visualize:read_me` with `modules: ["chart"]` once if +not already loaded this session. Do not narrate the read_me call to the user. + +If chart generation fails, fall back to card-only output with the note +"Chart unavailable — card below." Do not block on the chart. + +### The compact verdict card + +``` +METRIC: +DEFINITION: + +━━ ANOMALY VERDICT ━━ +Hourly series (7d): +Daily series (30d): + +━━ TOP FLAGS ━━ + [isolated | cluster | edge] (z-score | IQR | both) + [isolated | cluster | edge] (z-score | IQR | both) +... (cap 5; omit section entirely if no flags) + +━━ HEADLINE ━━ + + +━━ CONFIDENCE ━━ + + +━━ NEXT STEP ━━ + + +━━ WHAT THIS ISN'T ━━ +This is point-in-time anomaly detection only. Trend-level drift is not +tested here — run `metric-drift` for that. +``` + +#### Headline phrasing discipline + +- No flags: "Metric is stable at the point-in-time level — no anomalies in the last 7 or 30 days." +- Isolated flag(s): "Metric had a [spike/drop] of X% on [date]. Baseline otherwise stable." +- Cluster or edge cluster: "Metric has [N] anomalies concentrated in the last [window] — likely the leading edge of drift. Recommend running `metric-drift` next." + +Never lead with a confidence hedge. State the finding, then qualify it. + +If >10 flags total across both series, cap the TOP FLAGS list at 5 entries +sorted by deviation magnitude descending and add a note to the headline, filling in the actual count and window: +"[N] anomalies flagged in the last [window] — the metric is either undergoing a +regime shift or the baseline model is wrong. Run `metric-drift` before +treating any single point as actionable." 
+ +### The diagnosis payload + +After rendering the charts and verdict card, assemble the payload defined +in `SKILL.md` Step 2 and hand it back to the skill-level flow: + +``` +{ + command: "metric-anomaly", + project_id, project_name, metric_id, + metric_name, metric_definition, metric_type, + queries: [ + { label: "Q1-hourly", window: "last 7 days", granularity: "hour", + run_query_body: , result: }, + { label: "Q1-daily", window: "last 30 days", granularity: "day", + run_query_body: , result: } + ], + verdict_card: , + headline: , + flags: { + hourly: [ { timestamp, value, deviation_pct, classification, test } , ... ], + daily: [ { timestamp, value, deviation_pct, classification, test } , ... ] + } +} +``` + +The skill-level flow (Step 2 in `SKILL.md`) then asks the user about the +board and caches the payload for `metric-rca`. Do **not** ask the board +question from inside this command — that lives at the skill level so a +user running anomaly → drift back-to-back gets asked once at the end, +not twice. + +--- + +## Special cases + +**Funnel metrics:** The hourly view is usually too noisy for a multi-step +funnel at low volume. Drop Q1-hourly and run Q1-daily only (last 14 days +instead of 30 to stay lightweight). Note in output: "Hourly anomaly detection +skipped — funnel volume too low at hourly granularity." + +**Retention metrics:** Retention is a rolling cohort metric — point-in-time +anomaly detection mostly doesn't apply. Tell the user directly and recommend +`metric-drift` instead, which has a cohort-over-cohort fallback for retention. + +**Very low-volume metrics (<100 events/day):** Skip Q1-hourly and run +Q1-daily only — the Poisson noise floor dominates at hourly granularity. +State this in the output. + +--- + +## Error handling + +| Situation | Response | +|---|---| +| Either query fails | Retry once. If still failing, mark that series partial, continue the other, note in output. | +| Both queries fail | Stop. 
Report the failure and ask the user to verify project access. | +| Project requires a filter the user didn't provide | Ask once, then proceed. Don't guess. | +| Metric returns zero events in window | Stop. The metric is either broken or the filter excludes everything. Report as a possible data quality issue; do not proceed to Phase 2. | + +--- + +## What this command deliberately doesn't do + +- **Does not test for trend-level drift.** That's `metric-drift`. +- **Does not attribute cause.** Root-cause investigation is out of scope for this command — run `metric-rca` after detection. +- **Does not produce recommendations beyond "run drift" / "run RCA".** The verdict is the product. + +Keep the surface narrow. A clean anomaly verdict in under 30 seconds is more +useful than a sprawling analysis that tries to do everything. diff --git a/plugins/mixpanel-mcp-in/skills/monitor-metrics/commands/metric-drift.md b/plugins/mixpanel-mcp-in/skills/monitor-metrics/commands/metric-drift.md new file mode 100644 index 0000000..12e9456 --- /dev/null +++ b/plugins/mixpanel-mcp-in/skills/monitor-metrics/commands/metric-drift.md @@ -0,0 +1,319 @@ +# Command: metric-drift + +Detect trend-level drift in a single metric — whether the baseline itself has +shifted over recent weeks. Produces a verdict on *whether* the metric is in a +new regime. Does **not** test for point-in-time anomalies (run `metric-anomaly` +for that). + +--- + +## Prerequisites + +Before this command runs, Steps 0, 1, and 1.5 from `SKILL.md` must have +completed — input validation, normalized metric series object, and project +profile resolution. If any of those haven't happened, do them first. + +If the user's input is a saved report but the metric is a **funnel** or +**retention** report, see the "Special cases" section at the bottom. + +### Prerequisite — classify `metric_type` + +Before firing any queries, classify the metric into one of: +`count`, `unique_count`, `ratio`, `funnel`, `retention`, `unknown`. 
+ +| Detected | Classification | +|---|---| +| Report type `funnels` | `funnel` | +| Report type `retention` | `retention` | +| Query template has A/B form or `% of total` (conversion rate, session rate, etc.) | `ratio` | +| Single-series count (event count, event count distinct users) | `count` | +| Single-series unique count | `unique_count` | +| Formula metric / custom SQL / anything else | `unknown` | + +Store as `metric_type` on the metric series object. Used in the verdict card +and in special-case routing (funnel, retention). + +> _Keep this classification table in sync with the identical block in +> `metric-anomaly.md` — edits to one must be mirrored in the other._ + +### Prerequisite — name the drift and baseline windows + +The naming convention used throughout this command's output: + +- **`drift_window`** — the **recent** 30 days (most recent 30 days ending today). +- **`baseline_window`** — the **prior** 30 days (30 days ending 30 days before today). + +Both windows are computed from Q1-daily. The weekly test uses 8 vs 8 weeks — +those windows are reported alongside but are secondary to the daily windows +for headline purposes. + +--- + +## Phase 1 — Fetch series (2 queries, parallel) + +Fire both `Run-Query` calls simultaneously: + +| Query | Window | Granularity | Comparison | +|---|---|---|---| +| Q1-daily | Last 60 days | `day` | Last 30 days vs. prior 30 days | +| Q1-weekly | Last 16 weeks | `week` | Last 8 weeks vs. prior 8 weeks | + +The 60-day daily view catches medium-term drift. The 16-week weekly view +catches slow drift that the daily window would miss because daily noise +drowns the signal. Running both is cheap and they answer different questions. + +Use the `query_template` from the metric object; override only `dateRange` +and `unit` (granularity). Do not re-apply filters — they're already baked in. 
+ +--- + +## Phase 2 — Drift tests (mean shift + variance ratio) + +### Window split & contamination check + +For each series, split into `recent` and `prior` halves (no overlap). + +**Lightweight anomaly contamination check** (important because this command +can run standalone without `metric-anomaly` having run first): + +Scan the `recent` window for obvious outliers using a simple robust rule — any point +more than 3 × (1.4826 × MAD) from the window median. (A mean ± 3σ rule cannot work here: by Chebyshev's inequality at most ~11% of points can lie more than 3σ from their own mean, so the 20% threshold would never fire.) If ≥20% of points in the `recent` window +qualify → flag **"drift test potentially contaminated by outliers in the +recent window"** and mark all drift findings as low-confidence. Recommend the +user run `metric-anomaly` first. + +If fewer than 20% of points qualify, proceed normally but note the count in the +verdict card's contamination section. + +This is deliberately lighter than `metric-anomaly`'s full time-bucketed +test — its job here is only to flag contamination risk, not to produce a +publishable anomaly verdict. + +### Test 1 — Mean shift (level drift) + +``` +mean_recent = mean(recent_window) +mean_prior = mean(prior_window) +level_delta = (mean_recent − mean_prior) / mean_prior # signed % +``` + +Flag thresholds: +- `|level_delta| < 5%` → no meaningful shift +- `5% ≤ |level_delta| < 15%` → moderate drift +- `|level_delta| ≥ 15%` → significant drift + +Additionally compute a Welch's t-test on the two windows. If p < 0.05 and +`|level_delta| ≥ 5%`, drift is statistically supported. If p ≥ 0.05, note the +shift is observational but not statistically distinguishable from noise. 
+ +### Test 2 — Variance ratio (volatility drift) + +``` +var_ratio = variance(recent_window) / variance(prior_window) +``` + +Flag thresholds: +- `0.67 ≤ var_ratio ≤ 1.5` → variance stable +- `var_ratio > 1.5` → metric got noisier (investigate instrumentation, cohort mix) +- `var_ratio < 0.67` → metric got smoother (often a sign of flatlining or saturation) + +Variance drift without level drift is an under-appreciated signal — the +headline number looks fine but something structural changed. Always surface +it separately. + +Distribution-shape tests (KS, PSI) are intentionally **not** part of this +battery. They require per-user or per-segment values, which Mixpanel's MCP +surface does not return at practical cost. + +### Combine into a per-series verdict + +| Verdict | When | +|---|---| +| **No drift** | Level stable AND variance stable | +| **Level drift** | Level shifted ≥5%, variance stable | +| **Variance drift** | Level stable, variance ratio outside 0.67–1.5 | +| **Compound drift** | Both | + +Also report **direction** (up / down) and **magnitude** (% for level, ratio +for variance). + +### Reconcile the two series + +The 60-day-daily and 16-week-weekly views should agree on direction. If they +disagree: + +- **Weekly says drift, daily says none** → slow drift that daily noise hides. Trust the weekly. +- **Daily says drift, weekly says none** → recent movement that hasn't accumulated into the weekly window yet. Could be the leading edge of real drift, or a contained incident. Trust the daily but note the weekly hasn't confirmed. +- **Both agree** → high confidence, state it. 
+ +### Classify drift shape + +If drift is flagged, classify its shape using the daily series for use in +the verdict card: + +| Condition | `verdict_shape` value | +|---|---| +| Single-day change point where mean shift before vs after explains ≥60% of variance, and before/after segments are each <20% within-segment variance | `step` (record the change-point date) | +| Linear regression fit to the full 60-day series has R² ≥ 0.5 and non-zero slope | `slope` | +| 7-day autocorrelation on residuals ≥ 0.5, and periodicity strength differs between drift and baseline windows | `oscillating` | +| None of the above fit cleanly | `unclassified` | + +**Shape precedence**: if multiple shapes fit, use this priority: +`step` > `slope` > `oscillating` > `unclassified`. (Step changes are the +most actionable; surface them first when ambiguous.) + +If no drift was flagged, skip shape classification entirely. + +--- + +## Phase 3 — Summarise + charts + handoff + +Produces **three things**, in order: + +1. **A single visualizer widget with two charts stacked vertically** +2. **A compact verdict card** +3. **A diagnosis payload** handed back to the skill-level flow (Step 2 in + `SKILL.md`) for the board prompt and `metric-rca` caching + +### The charts — always rendered + +Both charts render regardless of whether drift was detected. A stable chart +is the visual proof of stability. + +**Top chart: 60-day daily view** (Q1-daily series) +- Line for the daily series. +- **Shaded band** for the prior 30-day baseline window (subtle grey fill). +- **Shaded band** for the recent 30-day drift window — red-tinted fill if drift is `down`, green-tinted if `up`, amber-tinted if `mixed`, grey if no drift. +- Horizontal line for `mean_prior` (dashed grey). +- Horizontal line for `mean_recent` (dashed, colored to match drift direction). +- If `verdict_shape = step`, annotate the change-point date with a vertical dashed line. +- Title: ` — last 60 days, daily`. 
+ +**Bottom chart: 16-week weekly view** (Q1-weekly series) +- Line for the weekly series. +- **Shaded band** for the prior 8-week baseline window (subtle grey fill). +- **Shaded band** for the recent 8-week drift window — same direction-based coloring as above. +- Horizontal lines for `mean_prior_weekly` (dashed grey) and `mean_recent_weekly` (dashed, colored). +- Title: ` — last 16 weeks, weekly`. + +Both charts share x-axis type (date) and consistent y-axis formatting. +Render as two separate plots in one widget, stacked. + +Before generating, read `visualize:read_me` with `modules: ["chart"]` once if +not already loaded this session. Do not narrate the read_me call to the user. + +If chart generation fails, fall back to card-only output with the note +"Chart unavailable — card below." Do not block on the chart. + +### The compact verdict card + +``` +METRIC: +DEFINITION: + +━━ DRIFT VERDICT ━━ +60-day / daily view: (t-test p =

                        ) +16-week / weekly view: +Reconciled verdict: +Shape: + +━━ CONTAMINATION ━━ + + +━━ HEADLINE ━━ + + +━━ CONFIDENCE ━━ + + +━━ NEXT STEP ━━ + + +━━ WHAT THIS ISN'T ━━ +This is trend-level drift detection only. Point-in-time anomalies are not +tested here — run `metric-anomaly` for that. +``` + +#### Headline phrasing discipline + +- No drift: "Metric is stable — trend has not shifted in the last 30 days or 8 weeks." +- Level drift: "Metric has drifted [up/down] by X% over the last 30 days. [Weekly view confirms / Weekly view hasn't confirmed yet]." +- Variance drift only: "Metric level is stable but volatility has [increased/decreased] — variance ratio [X.XX]. Something structural changed without moving the headline." +- Compound drift: "Metric has drifted [up/down] by X% AND volatility changed. Compound drift — investigate both level and structure." +- Contamination flag: append "Drift confidence is low — recent window has N outlier points. Run `metric-anomaly` first to clean up before attributing." + +Never lead with a confidence hedge. State the finding, then qualify it. 
+ +### The diagnosis payload + +After rendering the charts and verdict card, assemble the payload defined +in `SKILL.md` Step 2 and hand it back to the skill-level flow: + +``` +{ + command: "metric-drift", + project_id, project_name, metric_id, + metric_name, metric_definition, metric_type, + queries: [ + { label: "Q1-daily", window: "last 60 days", granularity: "day", + run_query_body: , result: }, + { label: "Q1-weekly", window: "last 16 weeks", granularity: "week", + run_query_body: , result: } + ], + verdict_card: , + headline: , + flags: { + daily: { verdict, direction, level_delta, var_ratio, t_test_p, shape, change_point_date }, + weekly: { verdict, direction, level_delta, var_ratio }, + reconciled: , + contamination: { outlier_count, contaminated: bool } + } +} +``` + +The skill-level flow (Step 2 in `SKILL.md`) then asks the user about the +board and caches the payload for `metric-rca`. Do **not** ask the board +question from inside this command — that lives at the skill level so a +user running anomaly → drift back-to-back gets asked once at the end, +not twice. + +--- + +## Special cases + +**Funnel metrics:** Phase 1 and Phase 2 work as-is for multi-step funnels +— the overall conversion series is what drifts. No special handling needed. + +**Retention metrics:** Retention is a rolling cohort metric — "drift" on a +retention curve means cohort-over-cohort degradation. Replace the 60-day +daily and 16-week weekly splits with a cohort-over-cohort comparison: last +8 cohorts vs. prior 8 cohorts on the same retention day (D1, D7, D30). Flag +which retention day shifted. Note in the verdict card: "Retention +cohort-over-cohort comparison used in place of daily/weekly split." + +**Very low-volume metrics (<100 events/day):** The tests still apply but +statistical confidence drops sharply. Downgrade confidence to `low` regardless +of `level_delta` magnitude and note: "Low-volume metric — drift signal may be +Poisson noise." 
+ +--- + +## Error handling + +| Situation | Response | +|---|---| +| Either query fails | Retry once. If still failing, mark that series partial, continue the other, note in output. | +| Both queries fail | Stop. Report the failure and ask the user to verify project access. | +| Project requires a filter the user didn't provide | Ask once, then proceed. Don't guess. | +| Metric returns zero events in window | Stop. The metric is either broken or the filter excludes everything. Report as a possible data quality issue; do not proceed to Phase 2. | + +--- + +## What this command deliberately doesn't do + +- **Does not detect point-in-time anomalies.** That's `metric-anomaly`. +- **Does not attribute cause.** Root-cause investigation is handled by `metric-rca` after detection. +- **Does not produce recommendations beyond "run anomaly first" / "run RCA".** The verdict is the product. + +Keep the surface narrow. A clean drift verdict in under 60 seconds is more +useful than a sprawling analysis that tries to do everything. diff --git a/plugins/mixpanel-mcp-in/skills/monitor-metrics/commands/metric-rca.md b/plugins/mixpanel-mcp-in/skills/monitor-metrics/commands/metric-rca.md new file mode 100644 index 0000000..ac2bf98 --- /dev/null +++ b/plugins/mixpanel-mcp-in/skills/monitor-metrics/commands/metric-rca.md @@ -0,0 +1,484 @@ +# Command: metric-rca + +Root-cause investigation for a flagged metric. Takes the diagnosis payload +from a prior `metric-anomaly` or `metric-drift` run and fans out across a +set of segmentation branches to localise *where* the movement concentrated. +Produces a ranked list of findings and appends them to the diagnosis board +the user already created. + +This command does **not** re-run anomaly or drift detection. It assumes the +movement has already been established — its job is attribution, not +detection. 
+ +--- + +## Prerequisites + +Before this command runs, the session must hold a **diagnosis payload** in +conversation memory from an earlier `metric-anomaly` or `metric-drift` run +(see `SKILL.md` Step 2). The payload carries the project, metric, metric +type, date ranges, flagged points or drift windows, and the query bodies +used. + +If no payload exists, do **not** attempt to run RCA from a cold start. Tell +the user: *"RCA runs on top of an existing anomaly or drift diagnosis. Run +`metric-anomaly` or `metric-drift` first, then come back here."* Stop. + +### Board state + +If the user persisted the diagnosis as a Mixpanel board (Step 2 in +`SKILL.md`), the payload will include `diagnosis_board_id`. This command +**appends** to that board — it does not create a new one. If no board was +created, skip the append step at the end and just return the findings +inline; do not silently create a new board. + +### Ask once — business / market context + +Before firing Branch 5, ask the user exactly once: + +> *"What business or market is this metric tied to? (e.g., Indian +> e-commerce, Indian OTT streaming, SEA fintech.) I'll use this to check +> whether the flagged dates line up with festivals, launches, or +> category-specific events."* + +Hold the answer as `business_context`. If the user skips or says "not +relevant", skip Branch 5 entirely — do not guess the market from project +name or memory. + +--- + +## Phase 1 — Branch selection + parallel fan-out + +Read the payload and decide which branches to run. Every branch runs +against the **same date ranges** the source command used: + +- `metric-anomaly` payload → use 7-day hourly + 30-day daily windows. +- `metric-drift` payload → use 60-day daily + 16-week weekly windows, with + recent vs prior window comparison preserved. 
+ +If both payloads exist in the session (user ran anomaly then drift), +prefer the drift payload's date ranges — RCA over a longer window is more +useful — and annotate findings with the anomaly payload's flagged +timestamps for cross-reference. + +### Branch selection matrix + +| Branch | Purpose | Runs when | +|---|---|---| +| **Branch 1 — Component decomposition** | Break ratio/funnel/retention into its component events + metric-definition filters | `metric_type ∈ {ratio, funnel, retention}` | +| **Branch 2 — Default-property breakdowns** | Source → geography → client-specific split | Always | +| **Branch 3 — Distinct-ID outliers** | Find whether a small set of users drove the movement | Anomaly payload only. Skip if in-window distinct user count >10k | +| **Branch 4 — Cohort comparison** | Run the metric filtered to the cohorts the user names to find concentration in named user segments | The user named one or more cohorts (or referenced a cohort in their ask) | +| **Branch 5 — Calendar context** | Check whether flagged dates line up with festivals, launches, category events in `business_context` | `business_context` provided | + +Run all selected branches **in parallel** via concurrent `Run-Query` calls. +Each branch can issue multiple queries; batch within a branch sequentially +if one query's result informs the next (Branch 2's second level depends +on the first). + +--- + +## Branch 1 — Component decomposition + +Only runs for `ratio`, `funnel`, and `retention` metrics. The question: +*is the movement in the numerator, the denominator, or a specific step?* + +**If the metric came from a saved Mixpanel Metric** (`metric_id` is set on +the payload), read the component events, formula, and filters straight from +the `Get-Metric` definition rather than re-deriving them — the definition is +authoritative and avoids guessing the numerator/denominator. Fall back to +the derivation below only when no saved-Metric definition is available. + +### For `ratio` +1. 
Pull numerator event as a standalone count series (same window, + granularity, and filters from the metric definition). +2. Pull denominator event as a standalone count series (same window, + granularity, and filters). +3. Compare each component's deviation % against the ratio's overall + deviation %. Flag which component moved. +4. If both components moved in the same direction by similar magnitude → + the ratio is stable but volumes shifted. Note as a volume story, not a + conversion story. +5. If only one moved, or they moved opposite directions → the ratio + shift is concentration-driven. Identify which. + +### For `funnel` +1. Run the **same funnel definition** twice as `report_type=funnels` via + `Run-Query`: once for the recent (drift/anomaly) window, once for the + baseline window. The native funnels response returns step conversion + rates and absolute counts per step. +2. For each step pair, compute the conversion-rate delta between recent + and baseline. +3. Flag the **specific step pair** with the largest absolute conversion + drop. One step usually owns the drop; surface that pair as the + headline finding. +4. If the funnel has step-level filters (e.g. property filters on + individual steps), do not decompose into standalone event counts — + the filters change the meaning. The native funnels query is the only + faithful comparison. + +This replaces the prior "pull each funnel step as a standalone event +count" approach. Standalone event counts ignore step ordering and +step-level filters; the native funnels report does not. + +### For `retention` +1. Pull the cohort-defining event as a standalone count series. +2. Pull the return event as a standalone count series. +3. Check whether cohort size changed, return count changed, or both. +4. A drop in retention with stable return count + larger cohort is a mix + effect; a drop in return count with stable cohort is real attrition. 
+ +### Event × metric-definition filter combinations + +For every component event above, re-run it with **each filter from the +metric definition applied independently** (i.e. one filter at a time, not +all combinations — combinatorial blowup is not useful here). This shows +whether a specific filter value concentrates the movement. + +Example: if the metric definition has `user_type = premium` baked in, +and the numerator event is `video_play`, run: +- `video_play` with no filter +- `video_play` with `user_type = premium` (the baked filter) — this + should match the metric's numerator +- `video_play` broken down **by** `user_type` (all values) — exposes + whether the movement is specific to `premium` or shared across the + population. + +Cap at 5 filter values per property breakdown; drop the long tail. + +--- + +## Branch 2 — Default-property breakdowns + +Two-level cascade. Always runs. + +### Level 1 — Source segmentation + +Break down the metric by the SDK / ingestion source. Two properties +together: + +- Event property `mp_lib` (string) — SDK name (e.g. `web`, `android`, + `iphone`, `swift`, `python`, `ruby`, `java`). +- Event property `$import` (boolean) — true for events ingested via the + Import API, false for Track API. + +Output: a matrix of `mp_lib × $import` with deviation % per cell. The +goal here is to isolate whether the movement is concentrated in +client-side vs server-side vs Import API ingestion. + +### Level 2 — Conditional breakdowns + +The Level 2 slice depends on what Level 1 surfaced. Run the slice whose +dominant source owns the movement; skip the others. 
+ +**For client-side sources (`web`, `android`, `iphone`, `swift`, etc.):** +Common first slice — geography in a step function: +- Event property `$os` +- Event property `platform` (or the project's equivalent; check the + metric definition or fall back to `mp_lib` if not present) +- Event property `mp_country_code` +- Event property `$region` +- Event property `$city` + +Run these as a **step function**, not a cross-product: start with +`mp_country_code`. If one country owns >50% of the movement, break that +country down by `$region`. If one region owns >50%, break by `$city`. +Stop when the concentration flattens. + +**For `web` specifically:** +- Event property `$device` +- Event property `utm_source` +- Event property `$browser` + +**For `android` / `iphone` / `swift` / `ios`:** +- Event property `$app_version_string` +- Event property `$model` + +Run these as single-property breakdowns, not two-level (avoids the +high-cardinality two-level truncation risk that bites large projects). + +### Cardinality discipline + +- Any breakdown returning exactly 1,000 / 3,000 / 10,000 rows is + potentially truncated — flag in findings, do not treat the result as + exhaustive. +- If a two-level breakdown (`mp_lib × $import`) is used, keep the + first-level cardinality bounded: if `mp_lib` returns >20 distinct + values, filter to the top 10 by volume before running the second + level. + +--- + +## Branch 3 — Distinct-ID outliers + +Only runs for anomaly payloads. Goal: is a small set of users +responsible for the flagged point(s)? + +### Cardinality gate + +Before running, check in-window distinct user count against the metric's +base query. If >10,000 distinct users contributed to the metric in the +flagged window, skip this branch and note "Branch 3 skipped — user +cardinality too high for outlier detection via MCP." A top-N breakdown +on 100k users returns noise. + +### If within cardinality + +1. 
Break the metric down by `distinct_id` for the flagged window only + (not the whole series — this keeps the query tractable). +2. Rank users by their contribution to the metric in the flagged window. +3. Flag outliers: users whose contribution in the flagged window is + >5σ above the median user's contribution, OR users who appear in + the flagged window but not in the baseline window. +4. Cap output at the top 20 distinct_ids by deviation. + +If the top 5 users account for >30% of the movement → strong user-driven +outlier signal. Surface this prominently. Could be bots, internal test +traffic, or a single high-volume customer. + +### Optional follow-up — session replay context + +If the top 3 distinct_ids each account for ≥10% of the movement individually, +offer the user a follow-up: *"Top user(s) `` drove [X]% of the +flagged window. Want me to pull their session replays from that window so +you can see what they did?"* + +If the user says yes, call `Get-User-Replays-Data` for each flagged +distinct_id with `from_date` and `to_date` set to the flagged window. Cap at +3 distinct_ids and 5 replays per user. Surface the replay URLs + timestamps +in the findings card under the Branch 3 section. + +This is **opt-in only** — do not pull replays automatically. Replays add +value when the customer wants the "what did they actually do" answer, but +they're noisy if Session Replay isn't widely enabled in the project. Ask +once, run if confirmed, skip if declined. + +--- + +## Branch 4 — Cohort comparison + +Goal: is the movement concentrated in a specific user cohort the customer +already cares about? Cohorts are typically the most CSA-actionable RCA +signal — "your churn-risk cohort dropped 40%" is a far better headline than +"users on iOS 17.4 dropped 40%." 
+ +### Step 1 — Identify candidate cohorts + +The Mixpanel MCP surface has **no cohort-listing tool** — `Search-Entities` +does not support a `cohort` entity type (its types are insights, funnels, +flows, retention, dashboard, launch-analysis, experiments, feature-flags, +metric-trees, playlists, heat-maps). Branch 4 therefore cannot auto-discover +cohorts; source them from the user instead: + +1. If the user named cohorts in their original ask (e.g. "is this happening + in our power users?"), use those. +2. Otherwise, ask once: *"Want me to compare against any saved cohorts? If + so, name them (or share their cohort IDs) and I'll filter the metric to + each."* + +If the user names no cohorts (or declines) → record *"Branch 4 skipped — no +cohorts named; cohort auto-discovery isn't available on the MCP surface."* +and continue. + +### Step 2 — Resolve the named cohorts + +Cap at the **top 5 cohorts** the user named. For each, resolve its +`cohort_id` — the user may give a name or an id; if only a name is given, +confirm it back before filtering. If the user named more than 5, ask which +five matter most. + +Surface the cohort names in the findings — the customer recognizes their +own cohort names and that's part of the value. + +### Step 3 — Run the metric filtered by each cohort + +For each selected cohort, run the same `query_template` as the headline +metric, with one cohort-membership filter added. The exact filter shape +comes from `Get-Query-Schema` — Mixpanel's query schema accepts cohort +membership as a filter on `distinct_id` referencing the cohort_id. + +Run all cohort queries in parallel via concurrent `Run-Query` calls. Each +query covers the same date window the source command used (drift window +or anomaly window). + +### Step 4 — Score and rank + +For each cohort, compute the same concentration + deviation scores used +in the Phase 2 ranking step (cohort_delta_abs / total_delta_abs and the +cohort's own deviation %). 
Treat cohorts as candidate findings the same +way property breakdowns are treated. + +A cohort is **important** if either: +- It explains ≥30% of the headline movement (lower threshold than the + default 40% — cohorts are smaller slices than top-level properties, + and 30% concentration in a named cohort is a strong signal), OR +- Its individual deviation is ≥1.5× the headline metric's deviation. + +### Error handling + +| Situation | Response | +|---|---| +| User names no cohorts | Skip branch, record reason. | +| A cohort filter fails in `Run-Query` (cohort schema mismatch) | Retry once. If still failing, skip that cohort, continue others, note in branch coverage. | +| All cohort queries fail | Skip branch, note "Branch 4 skipped — cohort filtering failed across all cohorts." | + +--- + +## Branch 5 — Calendar context + +Only runs if the user provided `business_context`. + +1. Identify the key dates in the flagged window. For anomaly payloads, + use the timestamps from `payload.flags.hourly` and `payload.flags.daily`. + For drift payloads, use the change-point date if `shape = step`, or + the start of the drift window otherwise. +2. Run a `web_search` with a query built from `business_context` + the + relevant date(s). Example: if `business_context = "Indian e-commerce"` + and the change-point is `2026-03-08`, search `"Indian e-commerce + events March 8 2026 festival sale"`. If `web_search` isn't available in + this runtime, skip Branch 5 and record *"Branch 5 skipped — web search + unavailable in this runtime"* (mirrors the no-`business_context` skip); + the other four branches still run. +3. Look for matches: religious festivals, cricket fixtures, sale events + (BBD, EOSS, GOSF), product launches, regulatory dates (e.g. RBI policy + announcements). +4. If a plausible match surfaces, include it in findings with a + confidence label: `strong` (exact date match, major event), `moderate` + (same week, category-aligned), `weak` (same month, tangential). +5. 
If nothing surfaces, record: *"No calendar events found for + `<business_context>` on the flagged dates."* + +This branch is **context**, not **evidence**. Phrase findings as "the +flagged date falls on [event]" — never as "the [event] caused the +movement." Correlation only; causation belongs to the customer. + +--- + +## Phase 2 — Synthesise, rank, visualise + +### Rank findings + +For every branch, each sub-segment (a `mp_lib` value, a country, a funnel +step, a distinct_id, etc.) is a candidate finding. Score each: + +- **Concentration score** — share of the total movement this segment + explains. `segment_delta_abs / total_delta_abs`. A segment with 70% + concentration is worth surfacing; 5% is not. +- **Deviation score** — this segment's deviation % compared to its own + baseline. A segment that individually deviated 40% is a stronger signal + than one that deviated 5%. + +Flag a finding as **"important"** if **either** of these is true: +- Concentration score ≥ 0.4 (one segment owns ≥40% of the movement), OR +- Segment deviation ≥ 1.5× the headline metric's deviation (the movement + concentrates here). + +Cap total important findings at 6. If more than 6 qualify, keep the top 6 +by concentration × deviation combined rank. + +### Visualise important findings + +Render a single visualizer widget containing one chart per important +finding, stacked vertically. 
Chart type by branch: + +| Branch | Chart | +|---|---| +| Branch 1 (component) | Two-line overlay: headline metric vs component metric, same window, same granularity | +| Branch 2 (property breakdown) | Horizontal bar chart, one bar per segment, bar length = deviation %, color-coded by direction | +| Branch 3 (distinct_id) | Horizontal bar chart, top-N users by contribution % in flagged window | +| Branch 4 (cohort) | Horizontal bar chart, one bar per important cohort, bar length = deviation %, color-coded by direction | +| Branch 5 (calendar) | No chart — rendered as an annotation in the written findings block | + +Before generating, read `visualize:read_me` with `modules: ["chart"]` +once if not already loaded this session. Do not narrate the read_me call. + +### The findings card + +``` +METRIC: +DIAGNOSIS SOURCE: +WINDOW: + +━━ HEADLINE ━━ + + +━━ IMPORTANT FINDINGS (ranked) ━━ +1. [Branch N] of movement, + vs baseline. . +2. ... +(cap 6; omit section if no important findings) + +━━ BRANCH COVERAGE ━━ +Branch 1 (component): +Branch 2 (default props): +Branch 3 (distinct_id): +Branch 4 (cohort): +Branch 5 (calendar): + +━━ WHAT THIS ISN'T ━━ +This is attribution by segmentation, not causal analysis. Findings show +where the movement concentrated; they do not prove what caused it. +Calendar matches are correlation only. +``` + +### The RCA payload (passed back to SKILL.md) + +After rendering the findings card + charts, hand back to the skill-level +flow: + +``` +{ + command: "metric-rca", + project_id, project_name, metric_id, + metric_name, metric_definition, metric_type, + source_payload_command: "metric-anomaly" | "metric-drift", + business_context: , + rca_queries: [ + { branch: int, label: str, run_query_body: dict, result: dict }, ... + ], + important_findings: [ + { branch: int, segment: str, concentration_pct: float, + deviation_pct: float, interpretation: str, + chart_spec: dict }, + ... 
(cap 6) + ], + findings_card: , + headline: , + diagnosis_board_id: +} +``` + +The skill-level flow (Step 3 in `SKILL.md`, added with this command) +handles the board append. + +--- + +## Error handling + +| Situation | Response | +|---|---| +| No diagnosis payload in session | Stop. Tell user to run `metric-anomaly` or `metric-drift` first. | +| A branch query fails | Retry once. If still failing, mark that branch partial, continue others, note in branch coverage. | +| All branches fail | Stop. Report failure and ask the user to verify project access. | +| Branch 2 Level 1 returns only one `mp_lib × $import` cell with meaningful volume | Skip Branch 2 Level 2 conditional logic; run the fallback geography step function directly. | +| User declines to provide `business_context` | Skip Branch 5 entirely, proceed with others. | +| `web_search` unavailable in this runtime | Skip Branch 5, record "Branch 5 skipped — web search unavailable." Other branches continue. | +| No important findings after ranking (all segments <40% concentration and <1.5× deviation) | Surface that finding: "Movement is distributed across segments — no single dimension concentrates it." This is a valid, useful result. | + +--- + +## What this command deliberately doesn't do + +- **Does not re-run anomaly or drift detection.** It consumes the payload. +- **Does not claim causation.** Correlation by segmentation is the ceiling. +- **Does not cross-join properties combinatorially.** Branch 2 is a + step-function cascade, not a cross-product, because high-cardinality + two-level breakdowns truncate silently. +- **Does not source calendar dates from memory.** Always `web_search` + with the user-provided `business_context` (skips gracefully if web search + is unavailable). +- **Does not create a new board.** Appends to the existing diagnosis + board via the skill-level flow. + +Keep the surface narrow. 
A ranked list of 3-6 concentrated segments with +charts beats a 40-branch exhaustive report every time. diff --git a/plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000700000001 b/plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000700000001 new file mode 100644 index 0000000..11a1684 --- /dev/null +++ b/plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000700000001 @@ -0,0 +1,459 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires Mixpanel MCP. +--- + +# Monitor Metrics + +A focused diagnostic skill for a single metric at a time. Works for any +project the user has access to. Requires the Mixpanel MCP. Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. 
**Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. + +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. 
+Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. +- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. + +### Step 0a — Resolve org/project context first + +Before validating the project, call `Mixpanel MCP:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. 
This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `Mixpanel MCP:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `<project_name>` (id: `<project_id>`) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `Mixpanel MCP:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. | + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `Mixpanel MCP:List-Metrics` with `project_id` and `query=<metric name>`. If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1. 
| +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `Mixpanel MCP:Get-Metric` with `project_id` and `metric_id`. +2. The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. +4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. 
Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report//` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). | +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. | + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. 
Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. +This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `Get-Properties` with + `property_names=[]` and `resource_type=`. + If it doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. 
If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once, scoped to the events used by `query_template` +(`event_name=<event>` for each), with `since_date` set to the earliest +date the diagnosis will look at (16 weeks back for drift — its weekly view +reaches further back than its 60-day daily view; 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in. 
+- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. + +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. 
+ +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags: `<h1>`, `<h2>`, `<h3>`, + `<strong>`, `<ul>`, `<li>`, `<p>`, etc. — no newlines, each element + is a new line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `<metric_name>, <label>` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title=<metric_name> + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim. 
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use the finding's + `chart_spec` (from the RCA payload's `important_findings`) together with + the matching `run_query_body` from `rca_queries`. Name each + `<metric_name> — RCA: <segment>` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass. 
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000700000002 b/plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000700000002 new file mode 100644 index 0000000..11a1684 --- /dev/null +++ b/plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000700000002 @@ -0,0 +1,459 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires Mixpanel MCP. +--- + +# Monitor Metrics + +A focused diagnostic skill for a single metric at a time. 
Works for any +project the user has access to. Requires the Mixpanel MCP. Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. **Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. 
+ +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. +Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. +- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. 
+ +### Step 0a — Resolve org/project context first + +Before validating the project, call `Mixpanel MCP:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `Mixpanel MCP:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `<project_name>` (id: `<project_id>`) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `Mixpanel MCP:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. | + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `Mixpanel MCP:List-Metrics` with `project_id` and `query=<metric name>`.
If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1. | +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `Mixpanel MCP:Get-Metric` with `project_id` and `metric_id`. +2. The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. 
+4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report//` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). | +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. 
| + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. +This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `Get-Properties` with + `property_names=[]` and `resource_type=`. 
+ If it doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once, scoped to the events used by `query_template` +(`event_name=` for each), with `since_date` set to the earliest +date the diagnosis will look at (60 days back for drift, 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. 
The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in. +- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. + +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. 
If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. + +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags: `<h1>`, `<h2>`, `<h3>`, + `<strong>`, `<ul>`, `<li>`, `<p>`, etc. — no raw newline characters; each element + renders on its own line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `<metric_name>, <label>` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title=<metric_name> + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim.
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + ` — RCA: ` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass. 
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000800000003 b/plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000800000003 new file mode 100644 index 0000000..c6ec4b8 --- /dev/null +++ b/plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000800000003 @@ -0,0 +1,459 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires Mixpanel MCP. +--- + +# Monitor Metrics + +A focused diagnostic skill for a single metric at a time. 
Works for any +project the user has access to. Requires the Mixpanel MCP. Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. **Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. 
+ +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. +Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. +- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. 
+ +### Step 0a — Resolve org/project context first + +Before validating the project, call `mixpanel-mcp:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `mixpanel-mcp:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `<project_name>` (id: `<project_id>`) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `mixpanel-mcp:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. | + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `mixpanel-mcp:List-Metrics` with `project_id` and `query=<metric name>`.
If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1. | +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `mixpanel-mcp:Get-Metric` with `project_id` and `metric_id`. +2. The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. 
+4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report//` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). | +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. 
| + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. +This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `Get-Properties` with + `property_names=[]` and `resource_type=`. 
+ If it doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once, scoped to the events used by `query_template` +(`event_name=` for each), with `since_date` set to the earliest +date the diagnosis will look at (60 days back for drift, 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. 
The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in. +- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. + +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. 
If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. + +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags: `

<h2>`, `<h3>`, `<p>`, + `<strong>`, `<ul>`, `<li>`, `<ol>
                              `, etc. — no newlines, each element + is a new line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `, ` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title= + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim. 
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + `<metric_name> — RCA: <finding>` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass. 
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000900000004 b/plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000900000004 new file mode 100644 index 0000000..a14f131 --- /dev/null +++ b/plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000900000004 @@ -0,0 +1,459 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires Mixpanel MCP. +--- + +# Monitor Metrics + +A focused diagnostic skill for a single metric at a time. 
Works for any +project the user has access to. Requires the `mixpanel-mcp` connector (Mixpanel US). Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. **Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. 
+ +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. +Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. +- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. 
+ +### Step 0a — Resolve org/project context first + +Before validating the project, call `mixpanel-mcp:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `mixpanel-mcp:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `` (id: ``) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `mixpanel-mcp:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. | + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `mixpanel-mcp:List-Metrics` with `project_id` and `query=`. 
If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1. | +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `mixpanel-mcp:Get-Metric` with `project_id` and `metric_id`. +2. The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. 
+4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report//` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). | +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. 
| + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. +This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `Get-Properties` with + `property_names=[]` and `resource_type=`. 
+ If it doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once, scoped to the events used by `query_template` +(`event_name=<event>` for each), with `since_date` set to the earliest +date the diagnosis will look at (60 days back for drift, 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. 
The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in. +- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. + +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. 
If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. + +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags: `

<h2>`, `<h3>`, `<p>`, + `<strong>`, `<ul>`, `<li>`, `<ol>
                                `, etc. — no newlines, each element + is a new line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `, ` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title= + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim. 
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + `<metric_name> — RCA: <finding>` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass. 
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000a00000005 b/plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000a00000005 new file mode 100644 index 0000000..a79b6dd --- /dev/null +++ b/plugins/mixpanel-mcp/skills/monitor-metrics/.fuse_hidden0000000a00000005 @@ -0,0 +1,459 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires the `mixpanel-mcp` connector (Mixpanel US). +--- + +# Monitor Metrics + +A focused diagnostic skill for a single metric at a time. 
Works for any +project the user has access to. Requires the `mixpanel-mcp` connector (Mixpanel US). Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. **Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. 
+ +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. +Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. +- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. + +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. 
+ +### Step 0a — Resolve org/project context first + +Before validating the project, call `mixpanel-mcp:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `mixpanel-mcp:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `<project_name>` (id: `<project_id>`) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `mixpanel-mcp:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. | + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. **Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `mixpanel-mcp:List-Metrics` with `project_id` and `query=<metric_name>`. 
If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1. | +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `mixpanel-mcp:Get-Metric` with `project_id` and `metric_id`. +2. The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. 
+4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report//` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). | +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. 
| + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. +This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `Get-Properties` with + `property_names=[]` and `resource_type=`. 
+ If it doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once, scoped to the events used by `query_template` +(`event_name=` for each), with `since_date` set to the earliest +date the diagnosis will look at (60 days back for drift, 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. 
The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in. +- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. + +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. 
If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. + +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags: `

                                `, `

                                `, `

                                `, + ``, `

                                  `, `
                                • `, `
                                  `, etc. — no newlines, each element + is a new line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `, ` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title= + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim. 
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + ` — RCA: ` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass. 
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp/skills/monitor-metrics/SKILL.md b/plugins/mixpanel-mcp/skills/monitor-metrics/SKILL.md new file mode 100644 index 0000000..357d2f7 --- /dev/null +++ b/plugins/mixpanel-mcp/skills/monitor-metrics/SKILL.md @@ -0,0 +1,462 @@ +--- +name: monitor-metrics +description: > + Monitor and diagnose a Mixpanel metric for anomalies, drift, and root + cause. Use whenever the user asks to investigate, debug, monitor, or + explain a change in a Mixpanel metric — a saved Metric, KPI, conversion + rate, retention, event count, funnel step, or anything tracked in a saved + report or dashboard. Trigger phrases: "monitor [metric]", "what's going on + with [metric]", "why did [metric] drop/spike", "diagnose this metric", + "check for anomalies", "has [metric] drifted", "is this metric stable", + "something looks off", "did [metric] change last month", "what's driving + the drop", "where is the movement coming from", "run RCA on this metric". + Also trigger when the user shares a Mixpanel report/dashboard/metric link + and asks what's happening, or describes a metric in prose and wants to know + if the movement is real. Do NOT trigger for portfolio health checks (use + `weekly-pulse`) or adoption reports (use `gtm-customer-intelligence`). + Requires the `mixpanel-mcp` connector (Mixpanel US). +--- + +# Monitor Metrics + +> **Connector:** This skill operates exclusively against the `mixpanel-mcp` connector (Mixpanel US region). 
Every Mixpanel MCP tool call in this SKILL.md and in every file under `commands/` must be routed through `mixpanel-mcp` — never any other Mixpanel connector. + +A focused diagnostic skill for a single metric at a time. Works for any +project the user has access to. Requires the `mixpanel-mcp` connector (Mixpanel US). Answers three +questions cleanly: + +1. **Is a recent point weird?** (anomaly detection — `metric-anomaly`) +2. **Has the baseline itself shifted?** (drift detection — `metric-drift`) +3. **Where is the movement coming from?** (root-cause attribution — + `metric-rca`) + +Separation matters because the customer conversation is different for each: +an anomaly is an incident, drift is a trend, and RCA is the segmentation +story that makes either of the first two actionable. + +`metric-rca` runs on top of an existing anomaly or drift diagnosis — it +consumes the diagnosis payload, fans out across segmentation branches, and +appends its findings to the diagnosis board. It does not perform detection +itself. + +--- + +## Commands + +This skill has three commands. Route to the right one based on the user's +ask. + +### `metric-anomaly` +Detect point-in-time anomalies — recent spikes, drops, and clusters in a +single metric. Uses time-bucketed Z-score + IQR tests against 7-day hourly +and 30-day daily series. Produces flagged timestamps, classification +(isolated / cluster / edge), and a verdict. **Does not** test for +trend-level drift. + +Trigger when the user wants to know *whether a specific point looks weird* — +"is this spike real?", "did something happen yesterday?", "is this a blip?". + +→ See `commands/metric-anomaly.md` + +### `metric-drift` +Detect trend-level drift — whether the baseline has shifted. Runs mean-shift +and variance-ratio tests on 60-day daily (last 30 vs prior 30) and 16-week +weekly (last 8 vs prior 8) windows. Includes a lightweight outlier +contamination check so it can run standalone without `metric-anomaly` +first. 
Produces direction, magnitude, shape (step/slope/oscillating), and +a verdict. **Does not** flag individual point anomalies. + +Trigger when the user wants to know *whether the trend has changed* — +"has this drifted?", "is the baseline different now?", "what's happened over +the last month?". + +→ See `commands/metric-drift.md` + +### `metric-rca` +Root-cause attribution on top of an existing anomaly or drift diagnosis. +Fans out across five branches — component decomposition, default-property +breakdowns, distinct-id outliers, cohort comparison, and calendar/market +context — over the same date windows the source command used. Ranks findings +by concentration and deviation, renders charts for the important ones, and +appends results to the diagnosis board. + +Trigger when the user wants to know *where the movement came from* — +"what's driving this drop?", "where is the spike concentrated?", "break +this down", "run RCA", "is it a specific segment?". Requires a prior +`metric-anomaly` or `metric-drift` run in the same session. + +→ See `commands/metric-rca.md` + +--- + +## Choosing between the commands + +- **Ambiguous or exploratory ask** ("something looks off") → default to + `metric-anomaly` first. Anomaly is cheaper (2 queries) and catches + point-in-time issues that would contaminate a drift test. +- **"Has this changed over the last month?"** → `metric-drift` directly. +- **Both detection questions matter** → run `metric-anomaly` first, then + `metric-drift`. Drift will pick up any anomaly context if present and + downgrade confidence accordingly. +- **User asks "why" or "where" after seeing a verdict** → `metric-rca`. +- **User opens with "why did X drop"** → run `metric-anomaly` or + `metric-drift` first (whichever fits their framing better), then flow + into `metric-rca`. Do not run RCA cold — it needs the detection payload. 
+ +--- + +## Step 0 — Input validation (both commands) + +**Do not skip this step.** Before touching Step 1 or anything downstream, +confirm the user has given both a project and a metric. If either is +missing, ask once and wait. + +### Step 0a — Resolve org/project context first + +Before validating the project, call `mixpanel-mcp:Get-Business-Context` +**once per session**. Pass `project_id` if the user already gave one; +otherwise call without it. This returns: + +- Org-specific vocabulary (project nicknames, internal acronyms, product + terms) that may resolve the user's request without needing `Get-Projects`. +- Project-specific guidance on how that customer queries their data + (relevant for any project with established conventions). + +If business context resolves the project name → proceed directly to the +metric validation step. If not → fall through to `Get-Projects`. + +Skip this call only if the user's input is unambiguous (a numeric +`project_id` plus a clearly-named saved metric/report, with no project name +to interpret). + +### Validate the project + +| Situation | Action | +|---|---| +| User gave a `project_id` (int) | Call `mixpanel-mcp:Get-Projects`, find the matching entry, and confirm the project **name** back to the user in one line: *"Running on project `<project_name>` (id: `<project_id>`) — confirm?"*. Wait for confirmation. | +| User gave a project **name** only | Call `mixpanel-mcp:Get-Projects`, find the match. If one match, resolve the id and confirm back. If multiple matches or no match, list the candidates and ask the user to pick. | +| Neither given | Ask: *"Which Mixpanel project should I run this on? Share the project id, name, or a report/metric URL."* Do not guess from memory or past conversations. | + +Store the resolved `project_id` and `project_name` on the metric series object. + +### Validate the metric + +Resolve in this priority order. 
**Saved Mixpanel Metrics are the preferred +input** — they carry a complete, machine-readable definition (see Step 1). + +| Situation | Action | +|---|---| +| User named a metric, or said "metric" generically | Call `mixpanel-mcp:List-Metrics` with `project_id` and `query=<metric_name>`. If one saved Metric matches, confirm the resolved name back to the user. If several match, list and ask. If none match, fall through to the other shapes below (saved report / prose). | +| User gave a metric **id** | Treat as a saved Metric. Confirm via `Get-Metric` in Step 1. | +| User gave a report URL, `bookmark_id`, or dashboard URL | Resolve via the Step 1 input-shape table. Confirm the resolved metric name and one-sentence definition back to the user before firing queries. | +| User described the metric in prose | Still call `List-Metrics` once to check whether a saved Metric already captures it — reuse beats rebuild. If no match, confirm the prose definition back to the user in one sentence before firing queries. | +| Nothing given | Ask: *"Which metric are we diagnosing? Share a saved Metric name, a report URL, a bookmark id, or describe it in one line."* Do not assume from context. | + +Only proceed once both project and metric are confirmed. + +--- + +## Step 1 — Metric ingestion (both commands) + +Resolve the metric into a single canonical form: a normalized **metric +series** object whose `query_template` is the `report` body each command +will replay at its own date windows. + +There are two ways `query_template` gets built. **Prefer the first.** + +### Path A — Saved Mixpanel Metric (preferred) + +A saved Metric is the only input shape that returns its **full definition** +programmatically. Use it whenever Step 0 resolved a saved Metric. + +1. Call `mixpanel-mcp:Get-Metric` with `project_id` and `metric_id`. +2. The response carries the complete metric structure — events, formulas, + filters, and aggregation. Lift this directly into `query_template`. 
You + do **not** need to reconstruct it from prose, and you do **not** need + `Get-Query-Schema` for a saved Metric — the definition is authoritative. +3. Confirm the resolved metric **name** and a one-line plain-English summary + of what it measures back to the user before firing any time-series query. +4. Record `metric_id` on the series object so a board or RCA run can + reference the source Metric. + +### Path B — Saved report, dashboard tile, or prose (rebuild) + +Used when there is no saved Metric. Here `query_template` must be **built +fresh** and confirmed with the user, because these shapes do not expose a +replayable query body. + +> **Important:** `Get-Report` returns report metadata + results at the +> report's native granularity but **does not** return the underlying query +> definition. Saved reports are only a starting point for confirming the +> metric definition — every downstream `Run-Query` is built fresh from the +> confirmed prose definition using `Get-Query-Schema`. (This is the key +> difference from Path A: `Get-Metric` *does* return a replayable +> definition; `Get-Report` does not.) + +#### Input shape resolution (Path B) + +| Input shape | How to recognize | How to resolve | +|---|---|---| +| **Saved report (with ID)** | A `bookmark_id` + `project_id`, or a report URL containing `/report//` | Call `Get-Report` with `skip_results=false`. From the metadata + native-granularity results, draft a one-sentence prose definition (event(s), measurement type, obvious filters). Confirm with the user. | +| **Dashboard tile (with URL or ID)** | A dashboard URL containing `/dashboards/` | Call `Get-Dashboard` with `include_layout=true`, find the matching report cell, then treat as saved report (above). 
| +| **Report/dashboard referenced by name only** | "the conversion tile on the funnel board" with no URL | Call `Search-Entities` with appropriate `entity_types` (`["dashboard"]` for boards; `["insights","funnels","retention","flows"]` for reports) and `query=`. One match → resolve. Multiple → list and ask. None → ask for the URL. | +| **Natural language** | User describes the metric in prose | Confirmation already done in Step 0. Proceed to query construction. | + +#### Build the query body (Path B) + +Once the metric definition is confirmed in prose: + +1. Determine `report_type` (`insights`, `funnels`, `retention`, or `flows`). +2. Call `Get-Query-Schema` for that report type. +3. Construct the `report` body — events, measurement, filters, breakdowns — + matching the prose definition. Do **not** copy from a saved report's raw + response; build from the schema. + +### Normalize to a "metric series" object internally + +``` +{ + project_id: int, + project_name: str, # resolved and confirmed in Step 0 + metric_id: int | null, # set when source is a saved Metric (Path A) + metric_name: str, # human-readable label + metric_definition: str, # one-sentence what-it-measures (confirmed) + report_type: str, # insights | funnels | retention | flows + query_template: dict, # `report` body (from Get-Metric or Get-Query-Schema) + default_filters: list, # filters baked into query_template, for RCA reference +} +``` + +Every downstream step operates on this object. Each command's Phase 1 +overrides only `dateRange` and `unit` (granularity) on `query_template`. + +**Funnel and retention classification** is owned by each command's own +pre-flight (top of `commands/metric-anomaly.md` and `commands/metric-drift.md`), +not by Step 1. Step 1 is deliberately narrow: resolve the metric into a +normalized series object. Nothing more. + +--- + +## Step 1.5 — Project profile resolution + +Before writing any time-series query, resolve a minimal project profile. 
+This step is cheap (metadata calls only) and catches filter/instrumentation +problems before they contaminate the diagnosis. + +### Filter resolution (cheap metadata calls, not probe queries) + +For every filter referenced in `query_template` (billing/account filters, +exclusions, user-property filters, segment scopes): + +1. **Confirm the property exists.** Call `List-Properties` with + `names=[<property_name>]` and `resource_type=<resource_type>` (pass + `events=[<event_name>]` to scope to a specific event's properties). If it + doesn't resolve, stop and tell the user — the filter references a + property that doesn't exist in this project. +2. **Confirm the filter value is real.** Call `Get-Property-Values` with + the property name and (for event properties) the relevant event. If the + filter value isn't in the returned distinct values, stop and tell the + user — the filter excludes everything because the value never appears. + +Skip this for filters that came from a saved Metric definition (Path A) and +are already known-good — but still validate any filter the *user* added on +top of the saved Metric. + +### Instrumentation health check + +Call `Get-Issues` once, scoped to the events used by `query_template` +(`event_name=<event_name>` for each), with `since_date` set to the earliest +date the diagnosis will look at (60 days back for drift, 30 days back for +anomaly). If issues exist (type drift, null spikes, schema changes) in +that window: + +- Capture issue summaries. +- Do **not** abort the diagnosis. Carry these forward to the verdict card + under contamination — a separate signal from the statistical + contamination check. The customer needs to know if instrumentation + changed during the window even if the metric itself looks stable. + +### Two-level breakdown truncation note + +Two-level breakdowns can return truncated result sets on high-cardinality +dimensions. Treat any result that looks suspiciously round (e.g. 
exactly +1,000 / 3,000 / 10,000 rows and no tail) as potentially truncated and +confirm before relying on it. Mainly an RCA Branch 2 concern but applies +anywhere a two-level breakdown is run. + +Store as `project_profile` for downstream use: +``` +{ + filters_validated: list, # filters confirmed to resolve + instrumentation_issues: list, # issues from Get-Issues, may be empty + truncation_warnings: list, # populated by downstream branches +} +``` + +--- + +## Output contract + +Both commands produce a structured verdict, not a data dump. The commands +define their own output formats; common principles: + +- **Default to compact.** A CSA scanning between calls needs a verdict in under 60 seconds. Full detail is opt-in. +- **Always chart the trend.** Both commands always render inline charts — whether anomalies/drift were detected or not. A stable metric gets the same charts; the visual confirmation of stability is just as valuable as flagging a problem. Annotation overlays (anomaly dots, drift window shading, change-point markers) only appear when something was flagged. +- **Fixed section order.** Headline → confidence → next step. Never lead with a hedge. +- **Explicit scope limits.** Every output names what it did *not* do ("this does not test for drift — run `metric-drift`"; "this does not flag individual anomalies — run `metric-anomaly`"). + +Never output a wall of tables or raw query results. The CSA is the audience, +and the goal is a verdict they can act on. + +--- + +## Step 2 — Post-diagnosis handoff (both commands) + +At the end of Phase 3, each command hands back a structured **diagnosis +payload** to the skill-level flow. The skill then offers the user a board, +and caches the payload in conversation memory for a future `metric-rca` +command. 
+ +### The diagnosis payload + +Both commands return the same shape: + +``` +{ + command: "metric-anomaly" | "metric-drift", + project_id: int, + project_name: str, + metric_id: int | null, + metric_name: str, + metric_definition: str, + metric_type: str, + queries: [ + { label: str, window: str, granularity: str, run_query_body: dict, result: dict }, + ... + ], + verdict_card: str, # the full rendered card from Phase 3 + headline: str, # one-line summary from the card + flags: dict # command-specific (flagged points for anomaly; level_delta / var_ratio / shape for drift) +} +``` + +This payload is held in conversation memory only — do not write to disk. +It survives for the session and is what `metric-rca` consumes when +invoked. If the user later creates a board (below), the resulting +`board_id` is attached to the payload as `diagnosis_board_id` so +`metric-rca` knows where to append. + +### The board prompt + +After rendering the Phase 3 charts + verdict card, ask the user **exactly +once**: + +> *"Want me to save this as a board in Mixpanel?"* + +Do not offer the prompt if either of these is true: +- The command aborted in error handling (no usable verdict). +- The metric is `retention` and the command was `metric-anomaly` (was skipped to drift — nothing to board). + +### If the user says yes + +Create a dashboard in the same `project_id`. Use `Create-Dashboard` directly +— this case (one board, N reports, one text card) is simple enough that +delegating to a dashboard-manager skill adds unnecessary indirection. + +Build the rows as follows: + +1. **Run each query in `queries[]` first** with `skip_results=true` to + register them and get their `query_id`s back. Do this in parallel. +2. **Assemble the dashboard rows:** + - Row 1: a single text cell containing `verdict_card` (HTML-formatted + using `Create-Dashboard`'s allowed tags: `<h2>`, `<h3>`, `<p>`, + `<strong>`, `<ul>`, `<li>`, `<ol>`, etc. — no literal newlines; each element + starts a new line). + - Row 2 onwards: one report cell per query in `queries[]`, named + `<metric_name> — <window>, <granularity>` (matching the chart titles + from Phase 3). +3. **Call `Create-Dashboard`** with `title=<metric_name> + diagnosis (YYYY-MM-DD)`, the rows above, and the user's project_id. + +Return the board URL to the user when done, and **store the resulting +`board_id` back onto the diagnosis payload as `diagnosis_board_id`** so a +subsequent `metric-rca` run can append to it. + +For the **append** path at Step 3 (adding RCA findings to an existing +board), use `Get-Dashboard` (with `include_layout=true`) → `Update-Dashboard` +to add cells without disturbing the existing layout. + +### If the user says no + +Do nothing. The payload is already in conversation memory; `metric-rca` +will pick it up when invoked later in the session. + +--- + +## Step 3 — Post-RCA board append + +Runs after `metric-rca` returns its payload (see `commands/metric-rca.md` +Phase 2). The RCA payload carries `important_findings`, `findings_card`, +and `rca_queries` — Step 3's job is to append these to the existing +diagnosis board without creating a new one. + +### Append target + +Read `diagnosis_board_id` from the source payload (the anomaly/drift +payload that RCA consumed). + +- **If present** → append to that board. This is the default path. +- **If null** (the user declined the board earlier) → do not create a + board silently. Return the findings card + charts inline and tell the + user: *"No diagnosis board was created earlier, so I'm not appending + anywhere. Want me to create a board now with the diagnosis + RCA + findings together?"* If they say yes, follow Step 2's board-creation + path first, then run Step 3 against the new board. + +### What to append + +Use `Get-Dashboard` (`include_layout=true`) → `Update-Dashboard` to append. +The content to add, in order: + +1. **One text card** containing `findings_card` verbatim.
Place it + beneath the existing Phase 3 verdict card (visual continuity: diagnosis + first, then attribution). +2. **One saved report per important finding** — use `chart_spec` + + `run_query_body` from the RCA payload's `rca_queries`. Name each + ` — RCA: ` so the board reads as a + story: headline → verdict → findings → per-segment charts. + +Cap appended reports at 6 (matches the RCA findings cap). If there are +zero important findings, append only the text card — the "no single +segment concentrates the movement" result is still worth boarding. + +### Do not offer a second prompt + +RCA's append to an existing board is automatic — do not ask *"should I +append?"*. The user already opted into the board at Step 2. The only ask +at Step 3 is the fallback above, when no board exists yet. + +Return the updated board URL when done. + +--- + +## When not to use this skill + +- **Portfolio-wide sweeps** → use `weekly-pulse`. +- **Full adoption story / QBR prep** → use `gtm-customer-intelligence`. +- **Lexicon / instrumentation health** → use `manage-lexicon`. +- **Metric definition help** ("how should I measure X?") → answer directly, no skill needed. +- **Root-cause investigation from scratch, without a prior diagnosis** → + run `metric-anomaly` or `metric-drift` first, then `metric-rca`. RCA + does not run cold. + +This skill is deliberately narrow: one metric, one diagnosis, one +attribution pass. 
+ +--- + +## Files + +- `commands/metric-anomaly.md` — point-in-time anomaly detection (Z-score + IQR, time-bucketed; 2 queries; 7-day hourly + 30-day daily views) +- `commands/metric-drift.md` — trend-level drift detection (mean shift + variance ratio; 2 queries; 60-day daily + 16-week weekly views; owns shape classification) +- `commands/metric-rca.md` — root-cause attribution (5-branch segmentation fan-out on same windows as source command; ranks findings by concentration × deviation; appends to the diagnosis board) diff --git a/plugins/mixpanel-mcp/skills/monitor-metrics/commands/metric-anomaly.md b/plugins/mixpanel-mcp/skills/monitor-metrics/commands/metric-anomaly.md new file mode 100644 index 0000000..6c2bafe --- /dev/null +++ b/plugins/mixpanel-mcp/skills/monitor-metrics/commands/metric-anomaly.md @@ -0,0 +1,242 @@ +# Command: metric-anomaly + +Detect point-in-time anomalies in a single metric — recent spikes, drops, and +clusters. Produces a verdict on *whether* something unusual happened at a +specific moment. Does **not** test for trend-level drift (run `metric-drift` +for that). + +--- + +## Prerequisites + +Before this command runs, Steps 0, 1, and 1.5 from `SKILL.md` must have +completed — input validation, normalized metric series object, and project +profile resolution. If any of those haven't happened, do them first. + +If the user's input is a saved report but the metric is a **funnel** or +**retention** report, see the "Special cases" section at the bottom. + +### Prerequisite — classify `metric_type` + +Before firing any queries, classify the metric into one of: +`count`, `unique_count`, `ratio`, `funnel`, `retention`, `unknown`. + +| Detected | Classification | +|---|---| +| Report type `funnels` | `funnel` | +| Report type `retention` | `retention` | +| Query template has A/B form or `% of total` (conversion rate, session rate, etc.) 
 | `ratio` | +| Single-series count (event count, event count distinct users) | `count` | +| Single-series unique count | `unique_count` | +| Formula metric / custom SQL / anything else | `unknown` | + +Store as `metric_type` on the metric series object. Used in the verdict card +and in special-case routing (funnel, retention). + +> _Keep this classification table in sync with the identical block in +> `metric-drift.md` — edits to one must be mirrored in the other._ + +--- + +## Phase 1 — Fetch series (2 queries, parallel) + +Fire both `Run-Query` calls simultaneously: + +| Query | Window | Granularity | Purpose | +|---|---|---|---| +| Q1-hourly | Last 7 days | `hour` | Recent-blip detection | +| Q1-daily | Last 30 days | `day` | Recent-day detection against a fuller baseline | + +Use the `query_template` from the metric object; override only `dateRange` +and `unit` (granularity). Do not re-apply filters — they're already baked in. + +Build the `Run-Query` body from `query_template` with only `dateRange` and +`unit` (granularity) overridden. Use `timeComparison` when a single call can +cover both windows. + +--- + +## Phase 2 — Outlier tests (Z-score + IQR, time-bucketed) + +For each series independently, compute the expected range at every timestamp. +Run **both** tests; flag a point if **either** test flags it. Report which +test(s) caught each flag. + +### Test 1 — Z-score against time-bucketed mean + +- For the **hourly** series: group all points by hour-of-day (0–23; 24 buckets of ~7 points each). Do not also split by day-of-week — a 7-day window holds only one point per hour-of-day × day-of-week slot (7 × 24 = 168), which would leave σ undefined. Compute mean (μ) and stddev (σ) per bucket across the 7-day window. Flag any point where `|value - μ| / σ > 2.5`. +- For the **daily** series: group by day-of-week (7 buckets).
Compute μ and σ across the 30-day window. Flag any point where `|value - μ| / σ > 2.5`. +- Handle low-variance buckets: if σ is <5% of μ, skip the Z-score for that bucket and fall back to IQR only (division by tiny σ creates false alarms). + +### Test 2 — IQR against time-bucketed median + +- Same bucketing scheme as Test 1. +- For each bucket, compute Q1, median, Q3, and IQR = Q3 − Q1. +- Flag any point where `value < Q1 − 1.5 × IQR` or `value > Q3 + 1.5 × IQR`. + +### Deviation magnitude + +For every flagged point, report `(value − median) / median` as a signed +percentage. This is what the CSA actually cares about, not the Z-score itself. + +### Classify each flagged timestamp + +- **Isolated spike/drop** — one point flagged, neighbors normal. Most likely a real anomaly (outage, release, data gap). +- **Cluster** — 2+ consecutive points flagged in the same direction. Could be a short incident *or* the leading edge of drift. Flag as ambiguous and note that `metric-drift` may be a better follow-up. +- **Edge-of-window cluster** — flagged points are the most recent N points. Strongly suggestive of drift, not anomaly. Recommend running `metric-drift` before treating as an anomaly incident. + +--- + +## Phase 3 — Summarise + charts + handoff + +Produces **three things**, in order: + +1. **A single visualizer widget with two charts stacked vertically** +2. **A compact verdict card** +3. **A diagnosis payload** handed back to the skill-level flow (Step 2 in + `SKILL.md`) for the board prompt and `metric-rca` caching + +### The charts — always rendered + +Both charts render regardless of whether anything was flagged. A stable chart +is the visual proof of stability and saves the CSA from second-guessing. + +**Top chart: 7-day hourly view** (Q1-hourly series) +- Line for the hourly series. +- Dots for every flagged hourly point — red for drops, amber for spikes. Omit entirely if no flags. +- Label the most recent flagged point inline with timestamp and deviation %. 
+- Title: `<metric_name> — last 7 days, hourly`. + +**Bottom chart: 30-day daily view** (Q1-daily series) +- Line for the daily series. +- Dots for every flagged daily point — red for drops, amber for spikes. Omit entirely if no flags. +- Label the most recent flagged point inline with timestamp and deviation %. +- Title: `<metric_name> — last 30 days, daily`. + +Both charts share x-axis type (date/time) but not range — render as two +separate plots in one widget, stacked, with consistent y-axis formatting. + +Before generating, read `visualize:read_me` with `modules: ["chart"]` once if +not already loaded this session. Do not narrate the read_me call to the user. + +If chart generation fails, fall back to card-only output with the note +"Chart unavailable — card below." Do not block on the chart. + +### The compact verdict card + +``` +METRIC: <metric_name> +DEFINITION: <metric_definition> + +━━ ANOMALY VERDICT ━━ +Hourly series (7d): <flag count, or "no anomalies"> +Daily series (30d): <flag count, or "no anomalies"> + +━━ TOP FLAGS ━━ +<timestamp> <deviation %> [isolated | cluster | edge] (z-score | IQR | both) +<timestamp> <deviation %> [isolated | cluster | edge] (z-score | IQR | both) +... (cap 5; omit section entirely if no flags) + +━━ HEADLINE ━━ +<one-line headline> + +━━ CONFIDENCE ━━ +<confidence level and the reason for it> + +━━ NEXT STEP ━━ +<single recommended next step> + +━━ WHAT THIS ISN'T ━━ +This is point-in-time anomaly detection only. Trend-level drift is not +tested here — run `metric-drift` for that. +``` + +#### Headline phrasing discipline + +- No flags: "Metric is stable at the point-in-time level — no anomalies in the last 7 or 30 days." +- Isolated flag(s): "Metric had a [spike/drop] of X% on [date]. Baseline otherwise stable." +- Cluster or edge cluster: "Metric has [N] anomalies concentrated in the last [window] — likely the leading edge of drift. Recommend running `metric-drift` next." + +Never lead with a confidence hedge. State the finding, then qualify it.
+ +If >10 flags total across both series, cap the TOP FLAGS list at 5 entries +sorted by deviation magnitude descending and add a note to the headline: +"18 anomalies flagged in the last 7 days — the metric is either undergoing a +regime shift or the baseline model is wrong. Run `metric-drift` before +treating any single point as actionable." + +### The diagnosis payload + +After rendering the charts and verdict card, assemble the payload defined +in `SKILL.md` Step 2 and hand it back to the skill-level flow: + +``` +{ + command: "metric-anomaly", + project_id, project_name, metric_id, + metric_name, metric_definition, metric_type, + queries: [ + { label: "Q1-hourly", window: "last 7 days", granularity: "hour", + run_query_body: , result: }, + { label: "Q1-daily", window: "last 30 days", granularity: "day", + run_query_body: , result: } + ], + verdict_card: , + headline: , + flags: { + hourly: [ { timestamp, value, deviation_pct, classification, test } , ... ], + daily: [ { timestamp, value, deviation_pct, classification, test } , ... ] + } +} +``` + +The skill-level flow (Step 2 in `SKILL.md`) then asks the user about the +board and caches the payload for `metric-rca`. Do **not** ask the board +question from inside this command — that lives at the skill level so a +user running anomaly → drift back-to-back gets asked once at the end, +not twice. + +--- + +## Special cases + +**Funnel metrics:** The hourly view is usually too noisy for a multi-step +funnel at low volume. Drop Q1-hourly and run Q1-daily only (last 14 days +instead of 30 to stay lightweight). Note in output: "Hourly anomaly detection +skipped — funnel volume too low at hourly granularity." + +**Retention metrics:** Retention is a rolling cohort metric — point-in-time +anomaly detection mostly doesn't apply. Tell the user directly and recommend +`metric-drift` instead, which has a cohort-over-cohort fallback for retention. 
+ +**Very low-volume metrics (<100 events/day):** Skip Q1-hourly and run +Q1-daily only — the Poisson noise floor dominates at hourly granularity. +State this in the output. + +--- + +## Error handling + +| Situation | Response | +|---|---| +| Either query fails | Retry once. If still failing, mark that series partial, continue the other, note in output. | +| Both queries fail | Stop. Report the failure and ask the user to verify project access. | +| Project requires a filter the user didn't provide | Ask once, then proceed. Don't guess. | +| Metric returns zero events in window | Stop. The metric is either broken or the filter excludes everything. Report as a possible data quality issue; do not proceed to Phase 2. | + +--- + +## What this command deliberately doesn't do + +- **Does not test for trend-level drift.** That's `metric-drift`. +- **Does not attribute cause.** Root-cause investigation is out of scope for this command — run `metric-rca` after detection. +- **Does not produce recommendations beyond "run drift" / "run RCA".** The verdict is the product. + +Keep the surface narrow. A clean anomaly verdict in under 30 seconds is more +useful than a sprawling analysis that tries to do everything. diff --git a/plugins/mixpanel-mcp/skills/monitor-metrics/commands/metric-drift.md b/plugins/mixpanel-mcp/skills/monitor-metrics/commands/metric-drift.md new file mode 100644 index 0000000..12e9456 --- /dev/null +++ b/plugins/mixpanel-mcp/skills/monitor-metrics/commands/metric-drift.md @@ -0,0 +1,319 @@ +# Command: metric-drift + +Detect trend-level drift in a single metric — whether the baseline itself has +shifted over recent weeks. Produces a verdict on *whether* the metric is in a +new regime. Does **not** test for point-in-time anomalies (run `metric-anomaly` +for that). 
+ +--- + +## Prerequisites + +Before this command runs, Steps 0, 1, and 1.5 from `SKILL.md` must have +completed — input validation, normalized metric series object, and project +profile resolution. If any of those haven't happened, do them first. + +If the user's input is a saved report but the metric is a **funnel** or +**retention** report, see the "Special cases" section at the bottom. + +### Prerequisite — classify `metric_type` + +Before firing any queries, classify the metric into one of: +`count`, `unique_count`, `ratio`, `funnel`, `retention`, `unknown`. + +| Detected | Classification | +|---|---| +| Report type `funnels` | `funnel` | +| Report type `retention` | `retention` | +| Query template has A/B form or `% of total` (conversion rate, session rate, etc.) | `ratio` | +| Single-series count (event count, event count distinct users) | `count` | +| Single-series unique count | `unique_count` | +| Formula metric / custom SQL / anything else | `unknown` | + +Store as `metric_type` on the metric series object. Used in the verdict card +and in special-case routing (funnel, retention). + +> _Keep this classification table in sync with the identical block in +> `metric-anomaly.md` — edits to one must be mirrored in the other._ + +### Prerequisite — name the drift and baseline windows + +The naming convention used throughout this command's output: + +- **`drift_window`** — the **recent** 30 days (most recent 30 days ending today). +- **`baseline_window`** — the **prior** 30 days (30 days ending 30 days before today). + +Both windows are computed from Q1-daily. The weekly test uses 8 vs 8 weeks — +those windows are reported alongside but are secondary to the daily windows +for headline purposes. + +--- + +## Phase 1 — Fetch series (2 queries, parallel) + +Fire both `Run-Query` calls simultaneously: + +| Query | Window | Granularity | Comparison | +|---|---|---|---| +| Q1-daily | Last 60 days | `day` | Last 30 days vs. 
 prior 30 days | +| Q1-weekly | Last 16 weeks | `week` | Last 8 weeks vs. prior 8 weeks | + +The 60-day daily view catches medium-term drift. The 16-week weekly view +catches slow drift that the daily window would miss because daily noise +drowns the signal. Running both is cheap and they answer different questions. + +Use the `query_template` from the metric object; override only `dateRange` +and `unit` (granularity). Do not re-apply filters — they're already baked in. + +--- + +## Phase 2 — Drift tests (mean shift + variance ratio) + +### Window split & contamination check + +For each series, split into `recent` and `prior` halves (no overlap). + +**Lightweight anomaly contamination check** (important because this command +can run standalone without `metric-anomaly` having run first): + +Scan the `recent` window for obvious outliers using a simple robust rule — any point +more than 3 scaled MADs from the window median (`|value − median| > 3 × 1.4826 × MAD`). +A mean/σ rule is not usable here: at most ~11% of points can ever sit more than 3σ +from their own window mean, so the 20% threshold below would never trigger. If ≥20% of points in the `recent` window +qualify → flag **"drift test potentially contaminated by outliers in the +recent window"** and mark all drift findings as low-confidence. Recommend the +user run `metric-anomaly` first. + +If fewer than 20% of points qualify, proceed normally but note the count in the +verdict card's contamination section. + +This is deliberately lighter than `metric-anomaly`'s full time-bucketed +test — its job here is only to flag contamination risk, not to produce a +publishable anomaly verdict. + +### Test 1 — Mean shift (level drift) + +``` +mean_recent = mean(recent_window) +mean_prior = mean(prior_window) +level_delta = (mean_recent − mean_prior) / mean_prior # signed % +``` + +Flag thresholds: +- `|level_delta| < 5%` → no meaningful shift +- `5% ≤ |level_delta| < 15%` → moderate drift +- `|level_delta| ≥ 15%` → significant drift + +Additionally compute a Welch's t-test on the two windows. If p < 0.05 and +`|level_delta| ≥ 5%`, drift is statistically supported.
If p ≥ 0.05, note the +shift is observational but not statistically distinguishable from noise. + +### Test 2 — Variance ratio (volatility drift) + +``` +var_ratio = variance(recent_window) / variance(prior_window) +``` + +Flag thresholds: +- `0.67 ≤ var_ratio ≤ 1.5` → variance stable +- `var_ratio > 1.5` → metric got noisier (investigate instrumentation, cohort mix) +- `var_ratio < 0.67` → metric got smoother (often a sign of flatlining or saturation) + +Variance drift without level drift is an under-appreciated signal — the +headline number looks fine but something structural changed. Always surface +it separately. + +Distribution-shape tests (KS, PSI) are intentionally **not** part of this +battery. They require per-user or per-segment values, which Mixpanel's MCP +surface does not return at practical cost. + +### Combine into a per-series verdict + +| Verdict | When | +|---|---| +| **No drift** | Level stable AND variance stable | +| **Level drift** | Level shifted ≥5%, variance stable | +| **Variance drift** | Level stable, variance ratio outside 0.67–1.5 | +| **Compound drift** | Both | + +Also report **direction** (up / down) and **magnitude** (% for level, ratio +for variance). + +### Reconcile the two series + +The 60-day-daily and 16-week-weekly views should agree on direction. If they +disagree: + +- **Weekly says drift, daily says none** → slow drift that daily noise hides. Trust the weekly. +- **Daily says drift, weekly says none** → recent movement that hasn't accumulated into the weekly window yet. Could be the leading edge of real drift, or a contained incident. Trust the daily but note the weekly hasn't confirmed. +- **Both agree** → high confidence, state it. 
+ +### Classify drift shape + +If drift is flagged, classify its shape using the daily series for use in +the verdict card: + +| Condition | `verdict_shape` value | +|---|---| +| Single-day change point where mean shift before vs after explains ≥60% of variance, and before/after segments are each <20% within-segment variance | `step` (record the change-point date) | +| Linear regression fit to the full 60-day series has R² ≥ 0.5 and non-zero slope | `slope` | +| 7-day autocorrelation on residuals ≥ 0.5, and periodicity strength differs between drift and baseline windows | `oscillating` | +| None of the above fit cleanly | `unclassified` | + +**Shape precedence**: if multiple shapes fit, use this priority: +`step` > `slope` > `oscillating` > `unclassified`. (Step changes are the +most actionable; surface them first when ambiguous.) + +If no drift was flagged, skip shape classification entirely. + +--- + +## Phase 3 — Summarise + charts + handoff + +Produces **three things**, in order: + +1. **A single visualizer widget with two charts stacked vertically** +2. **A compact verdict card** +3. **A diagnosis payload** handed back to the skill-level flow (Step 2 in + `SKILL.md`) for the board prompt and `metric-rca` caching + +### The charts — always rendered + +Both charts render regardless of whether drift was detected. A stable chart +is the visual proof of stability. + +**Top chart: 60-day daily view** (Q1-daily series) +- Line for the daily series. +- **Shaded band** for the prior 30-day baseline window (subtle grey fill). +- **Shaded band** for the recent 30-day drift window — red-tinted fill if drift is `down`, green-tinted if `up`, amber-tinted if `mixed`, grey if no drift. +- Horizontal line for `mean_prior` (dashed grey). +- Horizontal line for `mean_recent` (dashed, colored to match drift direction). +- If `verdict_shape = step`, annotate the change-point date with a vertical dashed line. +- Title: ` — last 60 days, daily`. 
+ +**Bottom chart: 16-week weekly view** (Q1-weekly series) +- Line for the weekly series. +- **Shaded band** for the prior 8-week baseline window (subtle grey fill). +- **Shaded band** for the recent 8-week drift window — same direction-based coloring as above. +- Horizontal lines for `mean_prior_weekly` (dashed grey) and `mean_recent_weekly` (dashed, colored). +- Title: `<metric_name> — last 16 weeks, weekly`. + +Both charts share x-axis type (date) and consistent y-axis formatting. +Render as two separate plots in one widget, stacked. + +Before generating, read `visualize:read_me` with `modules: ["chart"]` once if +not already loaded this session. Do not narrate the read_me call to the user. + +If chart generation fails, fall back to card-only output with the note +"Chart unavailable — card below." Do not block on the chart. + +### The compact verdict card + +``` +METRIC: <metric_name> +DEFINITION: <metric_definition> + +━━ DRIFT VERDICT ━━ +60-day / daily view: <verdict, direction, level_delta, var_ratio> (t-test p = <p_value>) +16-week / weekly view: <verdict, direction, level_delta, var_ratio> +Reconciled verdict: <reconciled verdict> +Shape: <step (with change-point date) | slope | oscillating | unclassified> + +━━ CONTAMINATION ━━ +<outlier count in the recent window; any instrumentation issues carried forward from Get-Issues> + +━━ HEADLINE ━━ +<one-line headline> + +━━ CONFIDENCE ━━ +<confidence level and the reason for it> + +━━ NEXT STEP ━━ +<single recommended next step> + +━━ WHAT THIS ISN'T ━━ +This is trend-level drift detection only. Point-in-time anomalies are not +tested here — run `metric-anomaly` for that. +``` + +#### Headline phrasing discipline + +- No drift: "Metric is stable — trend has not shifted in the last 30 days or 8 weeks." +- Level drift: "Metric has drifted [up/down] by X% over the last 30 days. [Weekly view confirms / Weekly view hasn't confirmed yet]." +- Variance drift only: "Metric level is stable but volatility has [increased/decreased] — variance ratio [X.XX]. Something structural changed without moving the headline." +- Compound drift: "Metric has drifted [up/down] by X% AND volatility changed. Compound drift — investigate both level and structure." +- Contamination flag: append "Drift confidence is low — recent window has N outlier points. Run `metric-anomaly` first to clean up before attributing." + +Never lead with a confidence hedge. State the finding, then qualify it.
+ +### The diagnosis payload + +After rendering the charts and verdict card, assemble the payload defined +in `SKILL.md` Step 2 and hand it back to the skill-level flow: + +``` +{ + command: "metric-drift", + project_id, project_name, metric_id, + metric_name, metric_definition, metric_type, + queries: [ + { label: "Q1-daily", window: "last 60 days", granularity: "day", + run_query_body: , result: }, + { label: "Q1-weekly", window: "last 16 weeks", granularity: "week", + run_query_body: , result: } + ], + verdict_card: , + headline: , + flags: { + daily: { verdict, direction, level_delta, var_ratio, t_test_p, shape, change_point_date }, + weekly: { verdict, direction, level_delta, var_ratio }, + reconciled: , + contamination: { outlier_count, contaminated: bool } + } +} +``` + +The skill-level flow (Step 2 in `SKILL.md`) then asks the user about the +board and caches the payload for `metric-rca`. Do **not** ask the board +question from inside this command — that lives at the skill level so a +user running anomaly → drift back-to-back gets asked once at the end, +not twice. + +--- + +## Special cases + +**Funnel metrics:** Phase 1 and Phase 2 work as-is for multi-step funnels +— the overall conversion series is what drifts. No special handling needed. + +**Retention metrics:** Retention is a rolling cohort metric — "drift" on a +retention curve means cohort-over-cohort degradation. Replace the 60-day +daily and 16-week weekly splits with a cohort-over-cohort comparison: last +8 cohorts vs. prior 8 cohorts on the same retention day (D1, D7, D30). Flag +which retention day shifted. Note in the verdict card: "Retention +cohort-over-cohort comparison used in place of daily/weekly split." + +**Very low-volume metrics (<100 events/day):** The tests still apply but +statistical confidence drops sharply. Downgrade confidence to `low` regardless +of `level_delta` magnitude and note: "Low-volume metric — drift signal may be +Poisson noise." 
+ +--- + +## Error handling + +| Situation | Response | +|---|---| +| Either query fails | Retry once. If still failing, mark that series partial, continue the other, note in output. | +| Both queries fail | Stop. Report the failure and ask the user to verify project access. | +| Project requires a filter the user didn't provide | Ask once, then proceed. Don't guess. | +| Metric returns zero events in window | Stop. The metric is either broken or the filter excludes everything. Report as a possible data quality issue; do not proceed to Phase 2. | + +--- + +## What this command deliberately doesn't do + +- **Does not detect point-in-time anomalies.** That's `metric-anomaly`. +- **Does not attribute cause.** Root-cause investigation is handled by `metric-rca` after detection. +- **Does not produce recommendations beyond "run anomaly first" / "run RCA".** The verdict is the product. + +Keep the surface narrow. A clean drift verdict in under 60 seconds is more +useful than a sprawling analysis that tries to do everything. diff --git a/plugins/mixpanel-mcp/skills/monitor-metrics/commands/metric-rca.md b/plugins/mixpanel-mcp/skills/monitor-metrics/commands/metric-rca.md new file mode 100644 index 0000000..ac2bf98 --- /dev/null +++ b/plugins/mixpanel-mcp/skills/monitor-metrics/commands/metric-rca.md @@ -0,0 +1,484 @@ +# Command: metric-rca + +Root-cause investigation for a flagged metric. Takes the diagnosis payload +from a prior `metric-anomaly` or `metric-drift` run and fans out across a +set of segmentation branches to localise *where* the movement concentrated. +Produces a ranked list of findings and appends them to the diagnosis board +the user already created. + +This command does **not** re-run anomaly or drift detection. It assumes the +movement has already been established — its job is attribution, not +detection. 
+ +--- + +## Prerequisites + +Before this command runs, the session must hold a **diagnosis payload** in +conversation memory from an earlier `metric-anomaly` or `metric-drift` run +(see `SKILL.md` Step 2). The payload carries the project, metric, metric +type, date ranges, flagged points or drift windows, and the query bodies +used. + +If no payload exists, do **not** attempt to run RCA from a cold start. Tell +the user: *"RCA runs on top of an existing anomaly or drift diagnosis. Run +`metric-anomaly` or `metric-drift` first, then come back here."* Stop. + +### Board state + +If the user persisted the diagnosis as a Mixpanel board (Step 2 in +`SKILL.md`), the payload will include `diagnosis_board_id`. This command +**appends** to that board — it does not create a new one. If no board was +created, skip the append step at the end and just return the findings +inline; do not silently create a new board. + +### Ask once — business / market context + +Before firing Branch 5, ask the user exactly once: + +> *"What business or market is this metric tied to? (e.g., Indian +> e-commerce, Indian OTT streaming, SEA fintech.) I'll use this to check +> whether the flagged dates line up with festivals, launches, or +> category-specific events."* + +Hold the answer as `business_context`. If the user skips or says "not +relevant", skip Branch 5 entirely — do not guess the market from project +name or memory. + +--- + +## Phase 1 — Branch selection + parallel fan-out + +Read the payload and decide which branches to run. Every branch runs +against the **same date ranges** the source command used: + +- `metric-anomaly` payload → use 7-day hourly + 30-day daily windows. +- `metric-drift` payload → use 60-day daily + 16-week weekly windows, with + recent vs prior window comparison preserved. 
+ +If both payloads exist in the session (user ran anomaly then drift), +prefer the drift payload's date ranges — RCA over a longer window is more +useful — and annotate findings with the anomaly payload's flagged +timestamps for cross-reference. + +### Branch selection matrix + +| Branch | Purpose | Runs when | +|---|---|---| +| **Branch 1 — Component decomposition** | Break ratio/funnel/retention into its component events + metric-definition filters | `metric_type ∈ {ratio, funnel, retention}` | +| **Branch 2 — Default-property breakdowns** | Source → geography → client-specific split | Always | +| **Branch 3 — Distinct-ID outliers** | Find whether a small set of users drove the movement | Anomaly payload only. Skip if in-window distinct user count >10k | +| **Branch 4 — Cohort comparison** | Run the metric filtered to the cohorts the user names to find concentration in named user segments | The user named one or more cohorts (or referenced a cohort in their ask) | +| **Branch 5 — Calendar context** | Check whether flagged dates line up with festivals, launches, category events in `business_context` | `business_context` provided | + +Run all selected branches **in parallel** via concurrent `Run-Query` calls. +Each branch can issue multiple queries; batch within a branch sequentially +if one query's result informs the next (Branch 2's second level depends +on the first). + +--- + +## Branch 1 — Component decomposition + +Only runs for `ratio`, `funnel`, and `retention` metrics. The question: +*is the movement in the numerator, the denominator, or a specific step?* + +**If the metric came from a saved Mixpanel Metric** (`metric_id` is set on +the payload), read the component events, formula, and filters straight from +the `Get-Metric` definition rather than re-deriving them — the definition is +authoritative and avoids guessing the numerator/denominator. Fall back to +the derivation below only when no saved-Metric definition is available. + +### For `ratio` +1. 
Pull numerator event as a standalone count series (same window, + granularity, and filters from the metric definition). +2. Pull denominator event as a standalone count series (same window, + granularity, and filters). +3. Compare each component's deviation % against the ratio's overall + deviation %. Flag which component moved. +4. If both components moved in the same direction by similar magnitude → + the ratio is stable but volumes shifted. Note as a volume story, not a + conversion story. +5. If only one moved, or they moved opposite directions → the ratio + shift is concentration-driven. Identify which. + +### For `funnel` +1. Run the **same funnel definition** twice as `report_type=funnels` via + `Run-Query`: once for the recent (drift/anomaly) window, once for the + baseline window. The native funnels response returns step conversion + rates and absolute counts per step. +2. For each step pair, compute the conversion-rate delta between recent + and baseline. +3. Flag the **specific step pair** with the largest absolute conversion + drop. One step usually owns the drop; surface that pair as the + headline finding. +4. If the funnel has step-level filters (e.g. property filters on + individual steps), do not decompose into standalone event counts — + the filters change the meaning. The native funnels query is the only + faithful comparison. + +This replaces the prior "pull each funnel step as a standalone event +count" approach. Standalone event counts ignore step ordering and +step-level filters; the native funnels report does not. + +### For `retention` +1. Pull the cohort-defining event as a standalone count series. +2. Pull the return event as a standalone count series. +3. Check whether cohort size changed, return count changed, or both. +4. A drop in retention with stable return count + larger cohort is a mix + effect; a drop in return count with stable cohort is real attrition. 
+ +### Event × metric-definition filter combinations + +For every component event above, re-run it with **each filter from the +metric definition applied independently** (i.e. one filter at a time, not +all combinations — combinatorial blowup is not useful here). This shows +whether a specific filter value concentrates the movement. + +Example: if the metric definition has `user_type = premium` baked in, +and the numerator event is `video_play`, run: +- `video_play` with no filter +- `video_play` with `user_type = premium` (the baked filter) — this + should match the metric's numerator +- `video_play` broken down **by** `user_type` (all values) — exposes + whether the movement is specific to `premium` or shared across the + population. + +Cap at 5 filter values per property breakdown; drop the long tail. + +--- + +## Branch 2 — Default-property breakdowns + +Two-level cascade. Always runs. + +### Level 1 — Source segmentation + +Break down the metric by the SDK / ingestion source. Two properties +together: + +- Event property `mp_lib` (string) — SDK name (e.g. `web`, `android`, + `iphone`, `swift`, `python`, `ruby`, `java`). +- Event property `$import` (boolean) — true for events ingested via the + Import API, false for Track API. + +Output: a matrix of `mp_lib × $import` with deviation % per cell. The +goal here is to isolate whether the movement is concentrated in +client-side vs server-side vs Import API ingestion. + +### Level 2 — Conditional breakdowns + +The Level 2 slice depends on what Level 1 surfaced. Run the slice whose +dominant source owns the movement; skip the others. 
+ +**For client-side sources (`web`, `android`, `iphone`, `swift`, etc.):** +Common first slice — geography in a step function: +- Event property `$os` +- Event property `platform` (or the project's equivalent; check the + metric definition or fall back to `mp_lib` if not present) +- Event property `mp_country_code` +- Event property `$region` +- Event property `$city` + +Run these as a **step function**, not a cross-product: start with +`mp_country_code`. If one country owns >50% of the movement, break that +country down by `$region`. If one region owns >50%, break by `$city`. +Stop when the concentration flattens. + +**For `web` specifically:** +- Event property `$device` +- Event property `utm_source` +- Event property `$browser` + +**For `android` / `iphone` / `swift` / `ios`:** +- Event property `$app_version_string` +- Event property `$model` + +Run these as single-property breakdowns, not two-level (avoids the +high-cardinality two-level truncation risk that bites large projects). + +### Cardinality discipline + +- Any breakdown returning exactly 1,000 / 3,000 / 10,000 rows is + potentially truncated — flag in findings, do not treat the result as + exhaustive. +- If a two-level breakdown (`mp_lib × $import`) is used, keep the + first-level cardinality bounded: if `mp_lib` returns >20 distinct + values, filter to the top 10 by volume before running the second + level. + +--- + +## Branch 3 — Distinct-ID outliers + +Only runs for anomaly payloads. Goal: is a small set of users +responsible for the flagged point(s)? + +### Cardinality gate + +Before running, check in-window distinct user count against the metric's +base query. If >10,000 distinct users contributed to the metric in the +flagged window, skip this branch and note "Branch 3 skipped — user +cardinality too high for outlier detection via MCP." A top-N breakdown +on 100k users returns noise. + +### If within cardinality + +1. 
Break the metric down by `distinct_id` for the flagged window only + (not the whole series — this keeps the query tractable). +2. Rank users by their contribution to the metric in the flagged window. +3. Flag outliers: users whose contribution in the flagged window is + >5σ above the median user's contribution, OR users who appear in + the flagged window but not in the baseline window. +4. Cap output at the top 20 distinct_ids by deviation. + +If the top 5 users account for >30% of the movement → strong user-driven +outlier signal. Surface this prominently. Could be bots, internal test +traffic, or a single high-volume customer. + +### Optional follow-up — session replay context + +If the top 3 distinct_ids each account for ≥10% of the movement individually, +offer the user a follow-up: *"Top user(s) `[distinct_ids]` drove [X]% of the +flagged window. Want me to pull their session replays from that window so +you can see what they did?"* + +If the user says yes, call `Get-User-Replays-Data` for each flagged +distinct_id with `from_date` and `to_date` set to the flagged window. Cap at +3 distinct_ids and 5 replays per user. Surface the replay URLs + timestamps +in the findings card under the Branch 3 section. + +This is **opt-in only** — do not pull replays automatically. Replays add +value when the customer wants the "what did they actually do" answer, but +they're noisy if Session Replay isn't widely enabled in the project. Ask +once, run if confirmed, skip if declined. + +--- + +## Branch 4 — Cohort comparison + +Goal: is the movement concentrated in a specific user cohort the customer +already cares about? Cohorts are typically the most CSA-actionable RCA +signal — "your churn-risk cohort dropped 40%" is a far better headline than +"users on iOS 17.4 dropped 40%." 
+ +### Step 1 — Identify candidate cohorts + +The Mixpanel MCP surface has **no cohort-listing tool** — `Search-Entities` +does not support a `cohort` entity type (its types are insights, funnels, +flows, retention, dashboard, launch-analysis, experiments, feature-flags, +metric-trees, playlists, heat-maps). Branch 4 therefore cannot auto-discover +cohorts; source them from the user instead: + +1. If the user named cohorts in their original ask (e.g. "is this happening + in our power users?"), use those. +2. Otherwise, ask once: *"Want me to compare against any saved cohorts? If + so, name them (or share their cohort IDs) and I'll filter the metric to + each."* + +If the user names no cohorts (or declines) → record *"Branch 4 skipped — no +cohorts named; cohort auto-discovery isn't available on the MCP surface."* +and continue. + +### Step 2 — Resolve the named cohorts + +Cap at the **top 5 cohorts** the user named. For each, resolve its +`cohort_id` — the user may give a name or an id; if only a name is given, +confirm it back before filtering. If the user named more than 5, ask which +five matter most. + +Surface the cohort names in the findings — the customer recognizes their +own cohort names and that's part of the value. + +### Step 3 — Run the metric filtered by each cohort + +For each selected cohort, run the same `query_template` as the headline +metric, with one cohort-membership filter added. The exact filter shape +comes from `Get-Query-Schema` — Mixpanel's query schema accepts cohort +membership as a filter on `distinct_id` referencing the cohort_id. + +Run all cohort queries in parallel via concurrent `Run-Query` calls. Each +query covers the same date window the source command used (drift window +or anomaly window). + +### Step 4 — Score and rank + +For each cohort, compute the same concentration + deviation scores used +in the Phase 2 ranking step (cohort_delta_abs / total_delta_abs and the +cohort's own deviation %). 
Treat cohorts as candidate findings the same +way property breakdowns are treated. + +A cohort is **important** if either: +- It explains ≥30% of the headline movement (lower threshold than the + default 40% — cohorts are smaller slices than top-level properties, + and 30% concentration in a named cohort is a strong signal), OR +- Its individual deviation is ≥1.5× the headline metric's deviation. + +### Error handling + +| Situation | Response | +|---|---| +| User names no cohorts | Skip branch, record reason. | +| A cohort filter fails in `Run-Query` (cohort schema mismatch) | Retry once. If still failing, skip that cohort, continue others, note in branch coverage. | +| All cohort queries fail | Skip branch, note "Branch 4 skipped — cohort filtering failed across all cohorts." | + +--- + +## Branch 5 — Calendar context + +Only runs if the user provided `business_context`. + +1. Identify the key dates in the flagged window. For anomaly payloads, + use the timestamps from `payload.flags.hourly` and `payload.flags.daily`. + For drift payloads, use the change-point date if `shape = step`, or + the start of the drift window otherwise. +2. Run a `web_search` with a query built from `business_context` + the + relevant date(s). Example: if `business_context = "Indian e-commerce"` + and the change-point is `2026-03-08`, search `"Indian e-commerce + events March 8 2026 festival sale"`. If `web_search` isn't available in + this runtime, skip Branch 5 and record *"Branch 5 skipped — web search + unavailable in this runtime"* (mirrors the no-`business_context` skip); + the other four branches still run. +3. Look for matches: religious festivals, cricket fixtures, sale events + (BBD, EOSS, GOSF), product launches, regulatory dates (e.g. RBI policy + announcements). +4. If a plausible match surfaces, include it in findings with a + confidence label: `strong` (exact date match, major event), `moderate` + (same week, category-aligned), `weak` (same month, tangential). +5. 
If nothing surfaces, record: *"No calendar events found for + `[business_context]` on the flagged dates."* + +This branch is **context**, not **evidence**. Phrase findings as "the +flagged date falls on [event]" — never as "the [event] caused the +movement." Correlation only; causation belongs to the customer. + +--- + +## Phase 2 — Synthesise, rank, visualise + +### Rank findings + +For every branch, each sub-segment (a `mp_lib` value, a country, a funnel +step, a distinct_id, etc.) is a candidate finding. Score each: + +- **Concentration score** — share of the total movement this segment + explains. `segment_delta_abs / total_delta_abs`. A segment with 70% + concentration is worth surfacing; 5% is not. +- **Deviation score** — this segment's deviation % compared to its own + baseline. A segment that individually deviated 40% is stronger signal + than one that deviated 5%. + +Flag a finding as **"important"** if **either** of these is true: +- Concentration score ≥ 0.4 (one segment owns ≥40% of the movement), OR +- Segment deviation ≥ 1.5× the headline metric's deviation (the movement + concentrates here). + +Cap total important findings at 6. If more than 6 qualify, keep the top 6 +by concentration × deviation combined rank. + +### Visualise important findings + +Render a single visualizer widget containing one chart per important +finding, stacked vertically. 
Chart type by branch: + +| Branch | Chart | +|---|---| +| Branch 1 (component) | Two-line overlay: headline metric vs component metric, same window, same granularity | +| Branch 2 (property breakdown) | Horizontal bar chart, one bar per segment, bar length = deviation %, color-coded by direction | +| Branch 3 (distinct_id) | Horizontal bar chart, top-N users by contribution % in flagged window | +| Branch 4 (cohort) | Horizontal bar chart, one bar per important cohort, bar length = deviation %, color-coded by direction | +| Branch 5 (calendar) | No chart — rendered as an annotation in the written findings block | + +Before generating, read `visualize:read_me` with `modules: ["chart"]` +once if not already loaded this session. Do not narrate the read_me call. + +### The findings card + +``` +METRIC: [metric_name] +DIAGNOSIS SOURCE: [metric-anomaly | metric-drift] +WINDOW: [date range(s) inherited from the source payload] + +━━ HEADLINE ━━ +[one-sentence headline] + +━━ IMPORTANT FINDINGS (ranked) ━━ +1. [Branch N] [segment]: [concentration_pct]% of movement, + [deviation_pct]% vs baseline. [interpretation]. +2. ... +(cap 6; omit section if no important findings) + +━━ BRANCH COVERAGE ━━ +Branch 1 (component): [ran | partial | skipped + reason] +Branch 2 (default props): [ran | partial | skipped + reason] +Branch 3 (distinct_id): [ran | partial | skipped + reason] +Branch 4 (cohort): [ran | partial | skipped + reason] +Branch 5 (calendar): [ran | partial | skipped + reason] + +━━ WHAT THIS ISN'T ━━ +This is attribution by segmentation, not causal analysis. Findings show +where the movement concentrated; they do not prove what caused it. +Calendar matches are correlation only. +``` + +### The RCA payload (passed back to SKILL.md) + +After rendering the findings card + charts, hand back to the skill-level +flow: + +``` +{ + command: "metric-rca", + project_id, project_name, metric_id, + metric_name, metric_definition, metric_type, + source_payload_command: "metric-anomaly" | "metric-drift", + business_context: str | null, + rca_queries: [ + { branch: int, label: str, run_query_body: dict, result: dict }, ... + ], + important_findings: [ + { branch: int, segment: str, concentration_pct: float, + deviation_pct: float, interpretation: str, + chart_spec: dict }, + ... 
(cap 6) + ], + findings_card: str, + headline: str, + diagnosis_board_id: str | null +} +``` + +The skill-level flow (Step 3 in `SKILL.md`, added with this command) +handles the board append. + +--- + +## Error handling + +| Situation | Response | +|---|---| +| No diagnosis payload in session | Stop. Tell user to run `metric-anomaly` or `metric-drift` first. | +| A branch query fails | Retry once. If still failing, mark that branch partial, continue others, note in branch coverage. | +| All branches fail | Stop. Report failure and ask the user to verify project access. | +| Branch 2 Level 1 returns only one `mp_lib × $import` cell with meaningful volume | Skip Branch 2 Level 2 conditional logic; run the fallback geography step function directly. | +| User declines to provide `business_context` | Skip Branch 5 entirely, proceed with others. | +| `web_search` unavailable in this runtime | Skip Branch 5, record "Branch 5 skipped — web search unavailable in this runtime." Other branches continue. | +| No important findings after ranking (all segments <40% concentration and <1.5× deviation) | Surface that finding: "Movement is distributed across segments — no single dimension concentrates it." This is a valid, useful result. | + +--- + +## What this command deliberately doesn't do + +- **Does not re-run anomaly or drift detection.** It consumes the payload. +- **Does not claim causation.** Correlation by segmentation is the ceiling. +- **Does not cross-join properties combinatorially.** Branch 2 is a + step-function cascade, not a cross-product, because high-cardinality + two-level breakdowns truncate silently. +- **Does not source calendar dates from memory.** Always `web_search` + with the user-provided `business_context` (skips gracefully if web search + is unavailable). +- **Does not create a new board.** Appends to the existing diagnosis + board via the skill-level flow. + +Keep the surface narrow. 
A ranked list of 3-6 concentrated segments with +charts beats a 40-branch exhaustive report every time.