From 940f8ccce866a9265d7f85ed8d377dd80475087d Mon Sep 17 00:00:00 2001
From: root <root@vmi3142307.contaboserver.net>
Date: Thu, 11 Jun 2026 00:03:25 +0200
Subject: [PATCH 1/9] feat: add privacy-safe Hermes monitor

---
 docs/guides/hermes-monitor.md             |  61 +++++
 scripts/hermes_contextpilot_monitor.py    | 299 ++++++++++++++++++++++
 tests/test_hermes_contextpilot_monitor.py | 114 +++++++++
 3 files changed, 474 insertions(+)
 create mode 100644 docs/guides/hermes-monitor.md
 create mode 100644 scripts/hermes_contextpilot_monitor.py
 create mode 100644 tests/test_hermes_contextpilot_monitor.py
diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md
new file mode 100644
index 0000000..a83c0b0
--- /dev/null
+++ b/docs/guides/hermes-monitor.md
@@ -0,0 +1,61 @@
+# ContextPilot Hermes Monitor
+
+This is an opt-in, metadata-only monitor for testing ContextPilot inside Hermes Agent over a one-week window.
+
+## What it reads
+
+- `~/.hermes/state.db:sessions` metadata only: token counts, tool/API call counts, source, estimated cost, timestamps.
+- `~/.hermes/logs/gateway.log` lines containing ContextPilot savings summaries.
+
+It intentionally does **not** read:
+
+- `messages.content`
+- `sessions.system_prompt`
+- reasoning fields
+- raw tool call payloads
+- raw user/assistant text
+
+Session ids appear in reports only as salted SHA-256 hashes truncated to 16 hex characters.
+
+## Daily run
+
+```bash
+python scripts/hermes_contextpilot_monitor.py \
+  --out-dir ~/contextpilot/reports \
+  --since-hours 24
+```
+
+Outputs:
+
+- `~/contextpilot/reports/daily_YYYY-MM-DD.json`
+- `~/contextpilot/reports/daily_YYYY-MM-DD.md`
+
+## Suggested Hermes cron job
+
+Use this as a read-only watchdog. It produces reports; it does not apply config/code changes.
+
+```python
+cronjob(
+    action="create",
+    name="contextpilot-hermes-monitor-7d",
+    schedule="0 4 * * *",
+    repeat=7,
+    deliver="origin",
+    enabled_toolsets=["terminal", "file"],
+    prompt="""
+Run /root/work/ContextPilot/scripts/hermes_contextpilot_monitor.py with --out-dir /root/contextpilot/reports --since-hours 24.
+Then read the generated Markdown report for today and send a short Chinese summary: token savings, session count, whether ContextPilot log events were observed, and any blocker. Do not read raw conversation content. Do not modify source/config.
+""",
+)
+```
+
+## Accuracy gate
+
+This monitor only measures token/cost savings and operational signals. Before shipping ContextPilot changes, run a fixed golden eval set and require:
+
+- no task-success regression,
+- no drop in context recall beyond the chosen threshold,
+- no unsafe raw-content leakage in reports,
+- no increase in failed tool calls.
+
+If any gate fails, hold proposals and require human review.
diff --git a/scripts/hermes_contextpilot_monitor.py b/scripts/hermes_contextpilot_monitor.py
new file mode 100644
index 0000000..c4ceb23
--- /dev/null
+++ b/scripts/hermes_contextpilot_monitor.py
@@ -0,0 +1,299 @@
+#!/usr/bin/env python3
+"""Privacy-safe ContextPilot monitor for Hermes Agent.
+
+Reads Hermes metadata (sessions table) and ContextPilot savings log lines, then
+writes daily JSON/Markdown reports. It deliberately never reads message bodies,
+system prompts, reasoning text, or tool payload content.
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as dt
+import hashlib
+import json
+import re
+import sqlite3
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Iterable
+
+SAVINGS_RE = re.compile(
+    r"\[ContextPilot\].*?saved\s+(?P<chars>\d+)\s+chars\s+\(~(?P<tokens>\d+)\s+tokens\)"
+)
+SESSION_RE = re.compile(
+    r"\[ContextPilot\]\s+Session\s+(?P<session>[^:]+):\s+(?P<turns>\d+)\s+turns,\s+"
+    r"(?P<chars>\d+)\s+chars\s+saved\s+\(~(?P<tokens>\d+)\s+tokens\)"
+)
+
+FORBIDDEN_COLUMNS = {
+    "content",
+    "system_prompt",
+    "reasoning",
+    "reasoning_content",
+    "reasoning_details",
+    "tool_calls",
+    "codex_reasoning_items",
+    "codex_message_items",
+}
+
+
+@dataclass
+class SessionMetric:
+    session_hash: str
+    source: str | None
+    started_at: float
+    ended_at: float | None
+    message_count: int
+    tool_call_count: int
+    api_call_count: int
+    input_tokens: int
+    output_tokens: int
+    cache_read_tokens: int
+    cache_write_tokens: int
+    reasoning_tokens: int
+    estimated_cost_usd: float | None
+
+
+@dataclass
+class DailyReport:
+    date: str
+    since_hours: int
+    session_count: int
+    total_messages: int
+    total_tool_calls: int
+    total_api_calls: int
+    total_input_tokens: int
+    total_output_tokens: int
+    total_cache_read_tokens: int
+    total_cache_write_tokens: int
+    total_reasoning_tokens: int
+    estimated_cost_usd: float
+    contextpilot_log_events: int
+    contextpilot_chars_saved: int
+    contextpilot_tokens_saved: int
+    estimated_input_token_reduction_pct: float
+    top_sources: dict[str, int]
+    top_token_sessions: list[SessionMetric]
+    notes: list[str]
+
+
+def _hash_session(session_id: str, salt: str) -> str:
+    return hashlib.sha256(f"{salt}:{session_id}".encode()).hexdigest()[:16]
+
+
+def _connect_readonly(path: Path) -> sqlite3.Connection:  # open the DB read-only via an SQLite URI
+    uri = f"{path.resolve().as_uri()}?mode=ro"  # as_uri() percent-encodes '?', '#', '%' so the path cannot alter the URI
+    return sqlite3.connect(uri, uri=True)
+
+
+def _assert_schema_safe(conn: sqlite3.Connection) -> None:
+    # Fail fast when the sessions table is missing. PRAGMA table_info returns
+    # column metadata only, so probing `messages` never reads message bodies.
+    session_cols = {row[1] for row in conn.execute("PRAGMA table_info(sessions)")}
+    message_cols = {row[1] for row in conn.execute("PRAGMA table_info(messages)")}
+    if not session_cols:
+        raise RuntimeError("Hermes sessions table not found")
+    if "content" in message_cols:
+        # Intentional no-op: a `content` column may exist; nothing is enforced here.
+        pass
+
+
+def load_session_metrics(db_path: Path, *, since_hours: int, salt: str) -> list[SessionMetric]:
+    cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600
+    conn = _connect_readonly(db_path)
+    try:
+        _assert_schema_safe(conn)
+        query_columns = [
+            "id",
+            "source",
+            "started_at",
+            "ended_at",
+            "message_count",
+            "tool_call_count",
+            "input_tokens",
+            "output_tokens",
+            "cache_read_tokens",
+            "cache_write_tokens",
+            "reasoning_tokens",
+            "estimated_cost_usd",
+            "api_call_count",
+        ]
+        if FORBIDDEN_COLUMNS.intersection(query_columns):
+            raise RuntimeError("Internal error: forbidden raw-content column requested")
+        sql = f"""
+            SELECT {', '.join(query_columns)}
+            FROM sessions
+            WHERE started_at >= ? AND archived = 0
+            ORDER BY input_tokens DESC
+        """
+        rows = conn.execute(sql, (cutoff,)).fetchall()
+    finally:
+        conn.close()
+
+    metrics: list[SessionMetric] = []
+    for row in rows:
+        (
+            sid,
+            source,
+            started_at,
+            ended_at,
+            message_count,
+            tool_call_count,
+            input_tokens,
+            output_tokens,
+            cache_read_tokens,
+            cache_write_tokens,
+            reasoning_tokens,
+            estimated_cost_usd,
+            api_call_count,
+        ) = row
+        metrics.append(
+            SessionMetric(
+                session_hash=_hash_session(str(sid), salt),
+                source=source,
+                started_at=float(started_at),
+                ended_at=float(ended_at) if ended_at is not None else None,
+                message_count=int(message_count or 0),
+                tool_call_count=int(tool_call_count or 0),
+                api_call_count=int(api_call_count or 0),
+                input_tokens=int(input_tokens or 0),
+                output_tokens=int(output_tokens or 0),
+                cache_read_tokens=int(cache_read_tokens or 0),
+                cache_write_tokens=int(cache_write_tokens or 0),
+                reasoning_tokens=int(reasoning_tokens or 0),
+                estimated_cost_usd=float(estimated_cost_usd or 0.0),
+            )
+        )
+    return metrics
+
+
+def parse_contextpilot_savings(log_path: Path, *, since_hours: int) -> tuple[int, int, int]:
+    if not log_path.exists():
+        return 0, 0, 0
+    # Gateway logs can be large. Tail a bounded byte window; cron should run daily.
+    max_bytes = 8 * 1024 * 1024
+    with log_path.open("rb") as f:
+        f.seek(0, 2)
+        size = f.tell()
+        f.seek(max(0, size - max_bytes))
+        text = f.read().decode("utf-8", errors="replace")
+
+    events = 0
+    chars = 0
+    tokens = 0
+    for line in text.splitlines():
+        # NOTE: no timestamp filtering happens here; `since_hours` is unused and
+        # every SAVINGS_RE match inside the tailed byte window is counted.
+        m = SAVINGS_RE.search(line)
+        if not m:
+            continue
+        events += 1
+        chars += int(m.group("chars"))
+        tokens += int(m.group("tokens"))
+    return events, chars, tokens
+
+
+def build_report(metrics: Iterable[SessionMetric], *, date: str, since_hours: int, log_stats: tuple[int, int, int]) -> DailyReport:
+    rows = list(metrics)
+    source_counts: dict[str, int] = {}
+    for row in rows:
+        source_counts[row.source or "unknown"] = source_counts.get(row.source or "unknown", 0) + 1
+
+    total_input = sum(r.input_tokens for r in rows)
+    events, saved_chars, saved_tokens = log_stats
+    denominator = total_input + saved_tokens
+    reduction = (saved_tokens / denominator * 100.0) if denominator else 0.0
+
+    notes: list[str] = [
+        "metadata-only: did not read messages.content, sessions.system_prompt, reasoning, or tool payloads",
+        "accuracy gate is observational here; apply code/config changes only after separate golden-eval pass",
+    ]
+    if not rows:
+        notes.append("no sessions observed in the selected window")
+    if events == 0:
+        notes.append("no ContextPilot savings log lines observed; gateway may need restart after enabling plugin")
+
+    return DailyReport(
+        date=date,
+        since_hours=since_hours,
+        session_count=len(rows),
+        total_messages=sum(r.message_count for r in rows),
+        total_tool_calls=sum(r.tool_call_count for r in rows),
+        total_api_calls=sum(r.api_call_count for r in rows),
+        total_input_tokens=total_input,
+        total_output_tokens=sum(r.output_tokens for r in rows),
+        total_cache_read_tokens=sum(r.cache_read_tokens for r in rows),
+        total_cache_write_tokens=sum(r.cache_write_tokens for r in rows),
+        total_reasoning_tokens=sum(r.reasoning_tokens for r in rows),
+        estimated_cost_usd=sum(r.estimated_cost_usd or 0.0 for r in rows),
+        contextpilot_log_events=events,
+        contextpilot_chars_saved=saved_chars,
+        contextpilot_tokens_saved=saved_tokens,
+        estimated_input_token_reduction_pct=round(reduction, 2),
+        top_sources=dict(sorted(source_counts.items(), key=lambda kv: kv[1], reverse=True)[:10]),
+        top_token_sessions=rows[:10],
+        notes=notes,
+    )
+
+
+def write_report(report: DailyReport, out_dir: Path) -> tuple[Path, Path]:
+    out_dir.mkdir(parents=True, exist_ok=True)
+    json_path = out_dir / f"daily_{report.date}.json"
+    md_path = out_dir / f"daily_{report.date}.md"
+    data = asdict(report)
+    json_path.write_text(json.dumps(data, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
+
+    md = [
+        f"# ContextPilot Hermes monitor — {report.date}",
+        "",
+        f"Window: last {report.since_hours}h",
+        "",
+        "## Summary",
+        f"- Sessions: {report.session_count}",
+        f"- Input tokens: {report.total_input_tokens}",
+        f"- Output tokens: {report.total_output_tokens}",
+        f"- Tool calls: {report.total_tool_calls}",
+        f"- ContextPilot saved: ~{report.contextpilot_tokens_saved} tokens ({report.contextpilot_chars_saved} chars)",
+        f"- Estimated input-token reduction: {report.estimated_input_token_reduction_pct}%",
+        f"- Estimated cost: ${report.estimated_cost_usd:.4f}",
+        "",
+        "## Top sources",
+    ]
+    for source, count in report.top_sources.items():
+        md.append(f"- {source}: {count}")
+    md.extend(["", "## Top token sessions (hashed)"])
+    for row in report.top_token_sessions:
+        md.append(
+            f"- `{row.session_hash}` source={row.source} input={row.input_tokens} "
+            f"output={row.output_tokens} tools={row.tool_call_count} apis={row.api_call_count}"
+        )
+    md.extend(["", "## Notes"])
+    for note in report.notes:
+        md.append(f"- {note}")
+    md_path.write_text("\n".join(md) + "\n", encoding="utf-8")
+    return json_path, md_path
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--state-db", type=Path, default=Path.home() / ".hermes" / "state.db")
+    parser.add_argument("--gateway-log", type=Path, default=Path.home() / ".hermes" / "logs" / "gateway.log")
+    parser.add_argument("--out-dir", type=Path, default=Path.home() / "contextpilot" / "reports")
+    parser.add_argument("--since-hours", type=int, default=24)
+    parser.add_argument("--salt", default="contextpilot-hermes-monitor-v1", help="salt for stable per-install session hashes")
+    parser.add_argument("--date", default=dt.date.today().isoformat())
+    args = parser.parse_args()
+
+    if not args.state_db.exists():
+        raise SystemExit(f"Hermes state DB not found: {args.state_db}")
+
+    metrics = load_session_metrics(args.state_db, since_hours=args.since_hours, salt=args.salt)
+    log_stats = parse_contextpilot_savings(args.gateway_log, since_hours=args.since_hours)
+    report = build_report(metrics, date=args.date, since_hours=args.since_hours, log_stats=log_stats)
+    json_path, md_path = write_report(report, args.out_dir)
+    print(json.dumps({"ok": True, "json": str(json_path), "markdown": str(md_path)}, ensure_ascii=False))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/test_hermes_contextpilot_monitor.py b/tests/test_hermes_contextpilot_monitor.py
new file mode 100644
index 0000000..d0f0218
--- /dev/null
+++ b/tests/test_hermes_contextpilot_monitor.py
@@ -0,0 +1,114 @@
+import importlib.util
+import json
+import sqlite3
+import sys
+from pathlib import Path
+
+
+MODULE_PATH = Path(__file__).resolve().parents[1] / "scripts" / "hermes_contextpilot_monitor.py"
+spec = importlib.util.spec_from_file_location("hermes_contextpilot_monitor", MODULE_PATH)
+monitor = importlib.util.module_from_spec(spec)
+sys.modules[spec.name] = monitor
+spec.loader.exec_module(monitor)
+
+
+def _make_db(path: Path):
+    conn = sqlite3.connect(path)
+    conn.execute(
+        """
+        CREATE TABLE sessions (
+            id TEXT PRIMARY KEY,
+            source TEXT,
+            started_at REAL NOT NULL,
+            ended_at REAL,
+            message_count INTEGER DEFAULT 0,
+            tool_call_count INTEGER DEFAULT 0,
+            input_tokens INTEGER DEFAULT 0,
+            output_tokens INTEGER DEFAULT 0,
+            cache_read_tokens INTEGER DEFAULT 0,
+            cache_write_tokens INTEGER DEFAULT 0,
+            reasoning_tokens INTEGER DEFAULT 0,
+            estimated_cost_usd REAL,
+            api_call_count INTEGER DEFAULT 0,
+            archived INTEGER NOT NULL DEFAULT 0,
+            system_prompt TEXT
+        )
+        """
+    )
+    conn.execute(
+        """
+        CREATE TABLE messages (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            session_id TEXT NOT NULL,
+            role TEXT NOT NULL,
+            content TEXT,
+            reasoning TEXT,
+            timestamp REAL NOT NULL
+        )
+        """
+    )
+    conn.execute(
+        """
+        INSERT INTO sessions (
+            id, source, started_at, message_count, tool_call_count,
+            input_tokens, output_tokens, cache_read_tokens, cache_write_tokens,
+            reasoning_tokens, estimated_cost_usd, api_call_count, archived,
+            system_prompt
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        """,
+        (
+            "raw-session-id",
+            "discord",
+            4102444800.0,  # 2100-01-01, always inside test window
+            4,
+            2,
+            1000,
+            200,
+            50,
+            10,
+            25,
+            0.0123,
+            3,
+            0,
+            "SECRET SYSTEM PROMPT",
+        ),
+    )
+    conn.execute(
+        """
+        INSERT INTO messages (session_id, role, content, reasoning, timestamp)
+        VALUES (?, ?, ?, ?, ?)
+        """,
+        ("raw-session-id", "user", "DO NOT READ ME", "PRIVATE", 4102444800.0),
+    )
+    conn.commit()
+    conn.close()
+
+
+def test_monitor_reads_metadata_only_and_hashes_session_ids(tmp_path):
+    db = tmp_path / "state.db"
+    _make_db(db)
+    log = tmp_path / "gateway.log"
+    log.write_text(
+        "2026-01-01 INFO [ContextPilot] Turn 2: saved 400 chars (~100 tokens) | cumulative: 400 chars (~100 tokens)\n",
+        encoding="utf-8",
+    )
+    out_dir = tmp_path / "reports"
+
+    metrics = monitor.load_session_metrics(db, since_hours=24 * 365 * 100, salt="test")
+    report = monitor.build_report(
+        metrics,
+        date="2100-01-01",
+        since_hours=24,
+        log_stats=monitor.parse_contextpilot_savings(log, since_hours=24),
+    )
+    json_path, md_path = monitor.write_report(report, out_dir)
+
+    data = json.loads(json_path.read_text(encoding="utf-8"))
+    md = md_path.read_text(encoding="utf-8")
+    assert data["session_count"] == 1
+    assert data["contextpilot_tokens_saved"] == 100
+    assert data["estimated_input_token_reduction_pct"] > 0
+    assert "raw-session-id" not in md
+    assert "DO NOT READ ME" not in md
+    assert "SECRET SYSTEM PROMPT" not in md
+    assert data["top_token_sessions"][0]["session_hash"] != "raw-session-id"

From 39fa0593dfe97ebde5d1084060491410888ed6af Mon Sep 17 00:00:00 2001
From: root <root@vmi3142307.contaboserver.net>
Date: Thu, 11 Jun 2026 02:31:29 +0200
Subject: [PATCH 2/9] fix: add safe ContextPilot telemetry and dedup
 regressions

---
 __init__.py                               |  57 ++++++++++
 docs/guides/hermes-monitor.md             |   8 +-
 scripts/hermes_contextpilot_monitor.py    |  92 +++++++++++++++--
 tests/test_block_dedup_regression.py      | 120 ++++++++++++++++++++++
 tests/test_hermes_contextpilot_monitor.py |  69 +++++++++++++
 tests/test_hermes_plugin_patch.py         | 100 ++++++++++++++++++
 6 files changed, 438 insertions(+), 8 deletions(-)
 create mode 100644 tests/test_block_dedup_regression.py

diff --git a/__init__.py b/__init__.py
index 44b4214..eac1f08 100644
--- a/__init__.py
+++ b/__init__.py
@@ -13,6 +13,7 @@
 import os
 import subprocess
 import sys
+import time
 from pathlib import Path
 from typing import Any, Dict, List, Tuple
 
@@ -184,6 +185,38 @@ def _hash_text(text: str) -> str:
     return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()[:16]
 
 
+def _telemetry_path() -> "Path | None":
+    """Resolve the metadata-only telemetry file, or None if disabled.
+
+    Lets the monitor read ContextPilot savings without depending on gateway log
+    lines. Override with CONTEXTPILOT_TELEMETRY_FILE (a leading ``~`` is
+    expanded); disable with CONTEXTPILOT_DISABLE_TELEMETRY=1.
+    """
+    if os.environ.get("CONTEXTPILOT_DISABLE_TELEMETRY") == "1":
+        return None
+    override = os.environ.get("CONTEXTPILOT_TELEMETRY_FILE")
+    if override:
+        return Path(override).expanduser()
+    return Path.home() / ".hermes" / "contextpilot" / "telemetry.jsonl"
+
+
+def _write_telemetry(record: Dict[str, Any]) -> None:
+    """Append one metadata-only JSON line. Never raises; best-effort only.
+
+    Privacy contract: callers must pass numeric counters / timestamps / session
+    / turn metadata only — never message bodies, prompts, or tool payloads.
+    """
+    try:
+        path = _telemetry_path()
+        if path is None:
+            return
+        path.parent.mkdir(parents=True, exist_ok=True)
+        with path.open("a", encoding="utf-8") as f:
+            f.write(json.dumps(record, separators=(",", ":")) + "\n")
+    except Exception as e:  # noqa: BLE001 - telemetry must never break optimization
+        logger.debug("[ContextPilot] telemetry write skipped: %s", e)
+
+
 def _reorder_docs(docs: List[str], alpha: float = 0.001) -> List[str]:
     global _intercept_index
     if len(docs) < 2:
@@ -286,6 +319,7 @@ def __init__(self):
         self._total_reordered = 0
         self._total_docs_deduped = 0
         self._optimize_count = 0
+        self._session_id = None
         self.threshold_percent = 0.75
 
     @staticmethod
@@ -624,6 +658,28 @@ def _tool_chars(msgs):
                 self._total_chars_saved,
                 self._total_chars_saved // 4,
             )
+            # Metadata-only telemetry so the monitor does not depend solely on
+            # gateway log lines. No content, prompts, or tool payloads here.
+            _write_telemetry(
+                {
+                    "ts": time.time(),
+                    "type": "turn",
+                    "session_hash": (
+                        _hash_text(str(self._session_id))
+                        if self._session_id is not None else None
+                    ),
+                    "turn": self._optimize_count,
+                    "chars_saved": turn_chars_saved,
+                    "tokens_saved": turn_chars_saved // 4,
+                    "doc_chars_saved": doc_chars_saved,
+                    "block_chars_saved": dedup_result.chars_saved,
+                    "blocks_deduped": dedup_result.blocks_deduped,
+                    "blocks_total": dedup_result.blocks_total,
+                    "docs_deduped": self._total_docs_deduped,
+                    "system_blocks_matched": dedup_result.system_blocks_matched,
+                    "cumulative_chars_saved": self._total_chars_saved,
+                }
+            )
 
         return api_messages, {
             "chars_saved": turn_chars_saved,
@@ -648,6 +704,7 @@ def on_context_compressed(self, old_count: int, new_count: int) -> None:
 
     def on_session_start(self, session_id: str, **kwargs) -> None:
         _patch_hermes_sanitizer()
+        self._session_id = session_id
         self._model = kwargs.get("model", "")
         self._base_url = ""
         self._api_key = ""
diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md
index a83c0b0..e63bfd7 100644
--- a/docs/guides/hermes-monitor.md
+++ b/docs/guides/hermes-monitor.md
@@ -5,7 +5,8 @@ This is an opt-in, metadata-only monitor for testing ContextPilot inside Hermes
 ## What it reads
 
 - `~/.hermes/state.db:sessions` metadata only: token counts, tool/API call counts, source, estimated cost, timestamps.
-- `~/.hermes/logs/gateway.log` lines containing ContextPilot savings summaries.
+- `~/.hermes/contextpilot/telemetry.jsonl` metadata-only ContextPilot savings records (preferred source).
+- `~/.hermes/logs/gateway.log` lines containing ContextPilot savings summaries (fallback source).
 
 It intentionally does **not** read:
 
@@ -22,9 +23,12 @@ Session ids are salted SHA-256 hashes in reports.
 ```bash
 python scripts/hermes_contextpilot_monitor.py \
   --out-dir ~/contextpilot/reports \
-  --since-hours 24
+  --since-hours 24 \
+  --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl
 ```
 
+The telemetry file is written by the ContextPilot Hermes plugin when savings occur. Set `CONTEXTPILOT_DISABLE_TELEMETRY=1` to disable writes, or `CONTEXTPILOT_TELEMETRY_FILE=/path/to/file.jsonl` to override the location.
+
 Outputs:
 
 - `~/contextpilot/reports/daily_YYYY-MM-DD.json`
diff --git a/scripts/hermes_contextpilot_monitor.py b/scripts/hermes_contextpilot_monitor.py
index c4ceb23..3bd9483 100644
--- a/scripts/hermes_contextpilot_monitor.py
+++ b/scripts/hermes_contextpilot_monitor.py
@@ -69,6 +69,8 @@ class DailyReport:
     total_reasoning_tokens: int
     estimated_cost_usd: float
     contextpilot_log_events: int
+    contextpilot_telemetry_events: int
+    contextpilot_savings_source: str
     contextpilot_chars_saved: int
     contextpilot_tokens_saved: int
     estimated_input_token_reduction_pct: float
@@ -193,25 +195,86 @@ def parse_contextpilot_savings(log_path: Path, *, since_hours: int) -> tuple[int
     return events, chars, tokens
 
 
-def build_report(metrics: Iterable[SessionMetric], *, date: str, since_hours: int, log_stats: tuple[int, int, int]) -> DailyReport:
+def parse_contextpilot_telemetry(telemetry_path: Path, *, since_hours: int) -> tuple[int, int, int]:
+    """Aggregate the plugin's metadata-only telemetry file.
+
+    Returns (events, chars_saved, tokens_saved). The file is JSON-lines, one
+    numeric record per saved turn; only numeric counters are read. Records
+    with non-finite counters (NaN/Infinity) are skipped instead of crashing.
+    """
+    if not telemetry_path or not telemetry_path.exists():
+        return 0, 0, 0
+    cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600
+
+    events = chars = tokens = 0
+    with telemetry_path.open("r", encoding="utf-8", errors="replace") as f:
+        for line in f:
+            try:  # blank or malformed lines raise ValueError and are skipped
+                record = json.loads(line)
+            except (ValueError, TypeError):
+                continue
+            if not isinstance(record, dict):
+                continue
+            ts = record.get("ts")
+            if isinstance(ts, (int, float)) and ts < cutoff:
+                continue
+            cs = record.get("chars_saved")
+            if not isinstance(cs, (int, float)):
+                continue
+            saved_tokens = record.get("tokens_saved")
+            try:  # json.loads accepts NaN/Infinity, which int() rejects
+                turn_chars = int(cs)
+                turn_tokens = int(saved_tokens) if isinstance(saved_tokens, (int, float)) else turn_chars // 4
+            except (ValueError, OverflowError):
+                continue
+            events += 1
+            chars += turn_chars
+            tokens += turn_tokens
+    return events, chars, tokens
+
+
+def build_report(
+    metrics: Iterable[SessionMetric],
+    *,
+    date: str,
+    since_hours: int,
+    log_stats: tuple[int, int, int],
+    telemetry_stats: tuple[int, int, int] = (0, 0, 0),
+) -> DailyReport:
     rows = list(metrics)
     source_counts: dict[str, int] = {}
     for row in rows:
         source_counts[row.source or "unknown"] = source_counts.get(row.source or "unknown", 0) + 1
 
     total_input = sum(r.input_tokens for r in rows)
-    events, saved_chars, saved_tokens = log_stats
+    log_events, log_chars, log_tokens = log_stats
+    tel_events, tel_chars, tel_tokens = telemetry_stats
+
+    # Prefer the local telemetry file when present: it is the authoritative,
+    # log-independent source. Logs are a fallback and are NOT summed on top
+    # (both record the same turns, so summing would double-count).
+    if tel_events > 0:
+        events, saved_chars, saved_tokens = tel_events, tel_chars, tel_tokens
+        savings_source = "telemetry"
+    else:
+        events, saved_chars, saved_tokens = log_events, log_chars, log_tokens
+        savings_source = "gateway-log"
+
     denominator = total_input + saved_tokens
     reduction = (saved_tokens / denominator * 100.0) if denominator else 0.0
 
     notes: list[str] = [
         "metadata-only: did not read messages.content, sessions.system_prompt, reasoning, or tool payloads",
         "accuracy gate is observational here; apply code/config changes only after separate golden-eval pass",
+        f"contextpilot savings source: {savings_source} (telemetry={tel_events} events, log={log_events} events)",
     ]
     if not rows:
         notes.append("no sessions observed in the selected window")
-    if events == 0:
-        notes.append("no ContextPilot savings log lines observed; gateway may need restart after enabling plugin")
+    if tel_events == 0 and log_events == 0:
+        notes.append(
+            "no ContextPilot savings observed via telemetry or logs; "
+            "gateway may need restart after enabling plugin"
+        )
 
     return DailyReport(
         date=date,
@@ -226,7 +289,9 @@ def build_report(metrics: Iterable[SessionMetric], *, date: str, since_hours: in
         total_cache_write_tokens=sum(r.cache_write_tokens for r in rows),
         total_reasoning_tokens=sum(r.reasoning_tokens for r in rows),
         estimated_cost_usd=sum(r.estimated_cost_usd or 0.0 for r in rows),
-        contextpilot_log_events=events,
+        contextpilot_log_events=log_events,
+        contextpilot_telemetry_events=tel_events,
+        contextpilot_savings_source=savings_source,
         contextpilot_chars_saved=saved_chars,
         contextpilot_tokens_saved=saved_tokens,
         estimated_input_token_reduction_pct=round(reduction, 2),
@@ -254,6 +319,8 @@ def write_report(report: DailyReport, out_dir: Path) -> tuple[Path, Path]:
         f"- Output tokens: {report.total_output_tokens}",
         f"- Tool calls: {report.total_tool_calls}",
         f"- ContextPilot saved: ~{report.contextpilot_tokens_saved} tokens ({report.contextpilot_chars_saved} chars)",
+        f"- ContextPilot savings source: {report.contextpilot_savings_source} "
+        f"(telemetry events={report.contextpilot_telemetry_events}, log events={report.contextpilot_log_events})",
         f"- Estimated input-token reduction: {report.estimated_input_token_reduction_pct}%",
         f"- Estimated cost: ${report.estimated_cost_usd:.4f}",
         "",
@@ -278,6 +345,12 @@ def main() -> int:
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("--state-db", type=Path, default=Path.home() / ".hermes" / "state.db")
     parser.add_argument("--gateway-log", type=Path, default=Path.home() / ".hermes" / "logs" / "gateway.log")
+    parser.add_argument(
+        "--telemetry-file",
+        type=Path,
+        default=Path.home() / ".hermes" / "contextpilot" / "telemetry.jsonl",
+        help="metadata-only ContextPilot telemetry file (preferred over gateway log)",
+    )
     parser.add_argument("--out-dir", type=Path, default=Path.home() / "contextpilot" / "reports")
     parser.add_argument("--since-hours", type=int, default=24)
     parser.add_argument("--salt", default="contextpilot-hermes-monitor-v1", help="salt for stable per-install session hashes")
@@ -289,7 +362,14 @@ def main() -> int:
 
     metrics = load_session_metrics(args.state_db, since_hours=args.since_hours, salt=args.salt)
     log_stats = parse_contextpilot_savings(args.gateway_log, since_hours=args.since_hours)
-    report = build_report(metrics, date=args.date, since_hours=args.since_hours, log_stats=log_stats)
+    telemetry_stats = parse_contextpilot_telemetry(args.telemetry_file, since_hours=args.since_hours)
+    report = build_report(
+        metrics,
+        date=args.date,
+        since_hours=args.since_hours,
+        log_stats=log_stats,
+        telemetry_stats=telemetry_stats,
+    )
     json_path, md_path = write_report(report, args.out_dir)
     print(json.dumps({"ok": True, "json": str(json_path), "markdown": str(md_path)}, ensure_ascii=False))
     return 0
diff --git a/tests/test_block_dedup_regression.py b/tests/test_block_dedup_regression.py
new file mode 100644
index 0000000..57e66b3
--- /dev/null
+++ b/tests/test_block_dedup_regression.py
@@ -0,0 +1,120 @@
+"""Regression coverage for the block-dedup behavioral contract.
+
+These tests lock in the guarantees the Hermes integration depends on:
+
+1. Exact-identical tool-result chunks are replaced by short references.
+2. Edited / near-duplicate content keeps the changed (delta) text verbatim and
+   is NOT collapsed wholesale into an "identical" reference.
+3. Genuinely different content (e.g. a different file in the same repo) is never
+   claimed identical — nothing is deduped and the payload is left untouched.
+
+The hashing is exact-content based; these tests guard against any future change
+that weakens it (e.g. fuzzy matching that would hide new, unique content behind
+a reference).
+"""
+import importlib.util
+from pathlib import Path
+
+MODULE_PATH = Path(__file__).resolve().parents[1] / "contextpilot" / "dedup" / "block_dedup.py"
+_spec = importlib.util.spec_from_file_location("contextpilot_block_dedup_test", MODULE_PATH)
+block_dedup = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(block_dedup)
+
+dedup_chat_completions = block_dedup.dedup_chat_completions
+
+
+def _file_content(prefix: str = "compute_value", n: int = 80) -> str:
+    """Deterministic multi-line tool-result body that chunks into many blocks."""
+    return "\n".join(
+        f"{i:3d}| def function_number_{i}(): return {prefix}({i}) + base_offset_value"
+        for i in range(n)
+    )
+
+
+def _two_tool_results(content_a: str, content_b: str) -> dict:
+    """Two Read tool results in a single chat-completions body."""
+    return {
+        "messages": [
+            {"role": "user", "content": "read the file"},
+            {"role": "assistant", "tool_calls": [{"id": "c1", "function": {"name": "Read"}}]},
+            {"role": "tool", "tool_call_id": "c1", "content": content_a},
+            {"role": "assistant", "tool_calls": [{"id": "c2", "function": {"name": "Read"}}]},
+            {"role": "tool", "tool_call_id": "c2", "content": content_b},
+        ]
+    }
+
+
+def test_exact_duplicate_tool_result_is_replaced_by_reference():
+    content = _file_content()
+    body = _two_tool_results(content, content)
+
+    result = dedup_chat_completions(body)
+
+    first = body["messages"][2]["content"]
+    second = body["messages"][4]["content"]
+
+    # Savings actually happened and were attributed to deduped blocks.
+    assert result.chars_saved > 0
+    assert result.blocks_deduped > 0
+
+    # The first occurrence is untouched; the second is shortened and points back.
+    assert first == content
+    assert len(second) < len(content)
+    assert "identical to earlier" in second
+
+
+def test_edited_near_duplicate_preserves_delta_verbatim():
+    content = _file_content()
+    lines = content.split("\n")
+    # Edit a single line in the middle — a realistic same-file edit between turns.
+    delta = "40| TOTALLY_UNIQUE_EDITED_LINE_MARKER xyzzy brand new content not seen before"
+    lines[40] = delta
+    edited = "\n".join(lines)
+
+    body = _two_tool_results(content, edited)
+    result = dedup_chat_completions(body)
+
+    second = body["messages"][4]["content"]
+
+    # The unique edited text MUST survive verbatim — it must never be hidden
+    # behind an "identical" reference.
+    assert "TOTALLY_UNIQUE_EDITED_LINE_MARKER" in second
+    assert delta in second
+
+    # Identical surrounding blocks are still deduped (so this is not a no-op),
+    # but the result is not collapsed into a single wholesale "identical" marker.
+    assert result.blocks_deduped > 0
+    assert result.blocks_deduped < result.blocks_total
+
+
+def test_different_file_same_repo_is_not_claimed_identical():
+    content = _file_content("compute_value")
+    other = "\n".join(
+        f"{i:3d}| class Widget_{i}: pass  # unrelated module, distinct content line"
+        for i in range(80)
+    )
+    body = _two_tool_results(content, other)
+
+    result = dedup_chat_completions(body)
+
+    # No shared blocks -> nothing deduped and both payloads left byte-for-byte intact.
+    assert result.chars_saved == 0
+    assert result.blocks_deduped == 0
+    assert body["messages"][2]["content"] == content
+    assert body["messages"][4]["content"] == other
+
+
+def test_single_changed_char_breaks_block_match():
+    """A one-character change must produce a different hash (no fuzzy collapse)."""
+    content = _file_content()
+    # Flip exactly one character deep inside the body.
+    idx = len(content) // 2
+    mutated = content[:idx] + ("Z" if content[idx] != "Z" else "Q") + content[idx + 1:]
+
+    body = _two_tool_results(content, mutated)
+    dedup_chat_completions(body)
+
+    second = body["messages"][4]["content"]
+    # The block containing the mutation is preserved verbatim (not referenced away).
+    mutated_line = mutated.split("\n")[content[:idx].count("\n")]
+    assert mutated_line in second
diff --git a/tests/test_hermes_contextpilot_monitor.py b/tests/test_hermes_contextpilot_monitor.py
index d0f0218..ba56bb2 100644
--- a/tests/test_hermes_contextpilot_monitor.py
+++ b/tests/test_hermes_contextpilot_monitor.py
@@ -112,3 +112,72 @@ def test_monitor_reads_metadata_only_and_hashes_session_ids(tmp_path):
     assert "DO NOT READ ME" not in md
     assert "SECRET SYSTEM PROMPT" not in md
     assert data["top_token_sessions"][0]["session_hash"] != "raw-session-id"
+
+
+def _write_telemetry(path, records):
+    path.write_text(
+        "\n".join(json.dumps(r) for r in records) + "\n", encoding="utf-8"
+    )
+
+
+def test_parse_telemetry_aggregates_recent_records(tmp_path):
+    tel = tmp_path / "telemetry.jsonl"
+    far_future = 4102444800.0  # 2100-01-01
+    _write_telemetry(
+        tel,
+        [
+            {"ts": far_future, "type": "turn", "session": "s1", "turn": 1,
+             "chars_saved": 400, "tokens_saved": 100},
+            {"ts": far_future, "type": "turn", "session": "s1", "turn": 2,
+             "chars_saved": 200, "tokens_saved": 50},
+            # Stale record far in the past must be excluded by the window.
+            {"ts": 1000.0, "type": "turn", "session": "s0", "turn": 1,
+             "chars_saved": 999999, "tokens_saved": 999999},
+            "this is not json",
+        ],
+    )
+
+    events, chars, tokens = monitor.parse_contextpilot_telemetry(tel, since_hours=24)
+    assert events == 2
+    assert chars == 600
+    assert tokens == 150
+
+
+def test_parse_telemetry_missing_file_is_safe(tmp_path):
+    assert monitor.parse_contextpilot_telemetry(tmp_path / "nope.jsonl", since_hours=24) == (0, 0, 0)
+
+
+def test_build_report_prefers_telemetry_over_logs(tmp_path):
+    db = tmp_path / "state.db"
+    _make_db(db)
+    metrics = monitor.load_session_metrics(db, since_hours=24 * 365 * 100, salt="test")
+
+    report = monitor.build_report(
+        metrics,
+        date="2100-01-01",
+        since_hours=24,
+        log_stats=(5, 4000, 1000),
+        telemetry_stats=(2, 600, 150),
+    )
+    # Telemetry is authoritative when present; logs are not summed on top.
+    assert report.contextpilot_tokens_saved == 150
+    assert report.contextpilot_chars_saved == 600
+    assert report.contextpilot_telemetry_events == 2
+    assert report.contextpilot_log_events == 5
+    assert report.contextpilot_savings_source == "telemetry"
+
+
+def test_build_report_falls_back_to_logs_without_telemetry(tmp_path):
+    db = tmp_path / "state.db"
+    _make_db(db)
+    metrics = monitor.load_session_metrics(db, since_hours=24 * 365 * 100, salt="test")
+
+    report = monitor.build_report(
+        metrics,
+        date="2100-01-01",
+        since_hours=24,
+        log_stats=(5, 4000, 1000),
+        telemetry_stats=(0, 0, 0),
+    )
+    assert report.contextpilot_tokens_saved == 1000
+    assert report.contextpilot_savings_source == "gateway-log"
diff --git a/tests/test_hermes_plugin_patch.py b/tests/test_hermes_plugin_patch.py
index 9a372d8..74bcd19 100644
--- a/tests/test_hermes_plugin_patch.py
+++ b/tests/test_hermes_plugin_patch.py
@@ -184,3 +184,103 @@ def dedup(body, **kwargs):
     assert second_out[1]["content"] == "DEDUPED TOOL RESULT"
     assert second_out[2]["content"] == "now summarize it"
     assert calls[-1][1]["content"] == "DEDUPED TOOL RESULT"
+
+
+def _saving_dedup(body, **kwargs):
+    saved = 0
+    for msg in body["messages"]:
+        if msg.get("role") == "tool" and msg.get("content") == "FULL TOOL RESULT":
+            msg["content"] = "REF"
+            saved += len("FULL TOOL RESULT") - len("REF")
+    return SimpleNamespace(
+        chars_saved=saved,
+        blocks_deduped=1 if saved else 0,
+        blocks_total=1,
+        system_blocks_matched=0,
+    )
+
+
+def test_optimize_writes_metadata_only_telemetry_line(monkeypatch, tmp_path):
+    import json
+
+    module, _ = _load_plugin_module(monkeypatch)
+    monkeypatch.setattr(module, "_check_reorder", lambda: False)
+    monkeypatch.setattr(module, "_CONTEXTPILOT_AVAILABLE", False)
+    monkeypatch.setattr(module, "dedup_chat_completions", _saving_dedup)
+
+    telemetry = tmp_path / "nested" / "telemetry.jsonl"
+    monkeypatch.setenv("CONTEXTPILOT_TELEMETRY_FILE", str(telemetry))
+
+    engine = module.ContextPilotEngine()
+    engine.on_session_start("session-XYZ", model="test-model")
+
+    secret = "SUPER SECRET USER PROMPT — must never be written to telemetry"
+    messages = [
+        {"role": "user", "content": secret},
+        {"role": "tool", "tool_call_id": "call_1", "content": "FULL TOOL RESULT"},
+    ]
+    engine.optimize_api_messages(messages)
+
+    assert telemetry.exists()
+    lines = [l for l in telemetry.read_text(encoding="utf-8").splitlines() if l.strip()]
+    assert len(lines) == 1
+    record = json.loads(lines[0])
+
+    # Numeric/metadata only — savings recorded.
+    assert record["chars_saved"] > 0
+    assert record["tokens_saved"] == record["chars_saved"] // 4
+    assert record["turn"] == 1
+    assert record["session_hash"] == module._hash_text("session-XYZ")
+    assert "session" not in record
+    assert isinstance(record["ts"], (int, float))
+
+    # Privacy: no message/prompt/tool-payload content may appear anywhere.
+    raw = telemetry.read_text(encoding="utf-8")
+    assert secret not in raw
+    assert "FULL TOOL RESULT" not in raw
+    forbidden = {"content", "messages", "prompt", "system_prompt", "text", "tool_calls"}
+    assert forbidden.isdisjoint(record.keys())
+
+
+def test_optimize_telemetry_skipped_when_nothing_saved(monkeypatch, tmp_path):
+    module, _ = _load_plugin_module(monkeypatch)
+    monkeypatch.setattr(module, "_check_reorder", lambda: False)
+    monkeypatch.setattr(module, "_CONTEXTPILOT_AVAILABLE", False)
+    monkeypatch.setattr(
+        module,
+        "dedup_chat_completions",
+        lambda body, **kw: SimpleNamespace(
+            chars_saved=0, blocks_deduped=0, blocks_total=0, system_blocks_matched=0
+        ),
+    )
+
+    telemetry = tmp_path / "telemetry.jsonl"
+    monkeypatch.setenv("CONTEXTPILOT_TELEMETRY_FILE", str(telemetry))
+
+    engine = module.ContextPilotEngine()
+    engine.optimize_api_messages([{"role": "user", "content": "hello"}])
+
+    # No save -> no telemetry noise.
+    assert not telemetry.exists()
+
+
+def test_optimize_survives_unwritable_telemetry_path(monkeypatch, tmp_path):
+    module, _ = _load_plugin_module(monkeypatch)
+    monkeypatch.setattr(module, "_check_reorder", lambda: False)
+    monkeypatch.setattr(module, "_CONTEXTPILOT_AVAILABLE", False)
+    monkeypatch.setattr(module, "dedup_chat_completions", _saving_dedup)
+
+    # Point telemetry at a path whose parent is an existing *file*, so mkdir fails.
+    blocker = tmp_path / "iam_a_file"
+    blocker.write_text("x", encoding="utf-8")
+    monkeypatch.setenv("CONTEXTPILOT_TELEMETRY_FILE", str(blocker / "telemetry.jsonl"))
+
+    engine = module.ContextPilotEngine()
+    messages = [
+        {"role": "user", "content": "read file"},
+        {"role": "tool", "tool_call_id": "call_1", "content": "FULL TOOL RESULT"},
+    ]
+    # Must not raise despite the unwritable telemetry destination.
+    out, stats = engine.optimize_api_messages(messages)
+    assert out[1]["content"] == "REF"
+    assert stats["chars_saved"] > 0

From 1c93f1dc7fce1af677a1eec2dd276996a87f1dcf Mon Sep 17 00:00:00 2001
From: root <root@vmi3142307.contaboserver.net>
Date: Thu, 11 Jun 2026 03:09:03 +0200
Subject: [PATCH 3/9] feat: add Hermes context opportunity scanner

---
 docs/guides/hermes-monitor.md                 |  35 +
 .../analyze_hermes_context_opportunities.py   | 680 ++++++++++++++++++
 ...est_hermes_context_opportunity_analyzer.py | 222 ++++++
 3 files changed, 937 insertions(+)
 create mode 100644 scripts/analyze_hermes_context_opportunities.py
 create mode 100644 tests/test_hermes_context_opportunity_analyzer.py

diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md
index e63bfd7..4d20d43 100644
--- a/docs/guides/hermes-monitor.md
+++ b/docs/guides/hermes-monitor.md
@@ -53,6 +53,41 @@ Then read the generated Markdown report for today and send a short Chinese summa
 )
 ```
 
+## Opportunity scanning
+
+`scripts/analyze_hermes_context_opportunities.py` is a companion scanner meant
+for a continuous cron job. Where the monitor stays metadata-only, this analyzer
+*does* read message content and tool outputs — but only in-memory, to compute
+salted SHA-256 fingerprints and aggregate counters. Reports never contain raw
+message/tool text, system prompts, reasoning, or raw session ids.
+
+It surfaces concrete token-reduction opportunities:
+
+- exact duplicate tool outputs (identical payloads re-sent across turns),
+- repeated line/block fingerprints (shared boilerplate across outputs),
+- large tool outputs grouped by `tool_name`,
+- heavy sessions by input-token / tool-call / message counts (hashed ids),
+- ContextPilot telemetry coverage and savings ratios.
+
+```bash
+python scripts/analyze_hermes_context_opportunities.py \
+  --state-db ~/.hermes/state.db \
+  --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl \
+  --out-dir ~/contextpilot/opportunities \
+  --since-hours 24
+```
+
+Outputs:
+
+- `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.json`
+- `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.md`
+
+Each estimated "wasted tokens" figure is a heuristic (chars / 4); treat the
+report as a prioritized list of candidates and validate against the accuracy
+gate below before changing ContextPilot config or code. As a defensive guard,
+`write_report` raises instead of emitting a report that contains a forbidden
+raw-content key (e.g. `content`), so unattended cron runs fail closed on such keys.
+
 ## Accuracy gate
 
 This monitor only measures token/cost savings and operational signals. Before shipping ContextPilot changes, run a fixed golden eval set and require:
diff --git a/scripts/analyze_hermes_context_opportunities.py b/scripts/analyze_hermes_context_opportunities.py
new file mode 100644
index 0000000..780315a
--- /dev/null
+++ b/scripts/analyze_hermes_context_opportunities.py
@@ -0,0 +1,680 @@
+#!/usr/bin/env python3
+"""Privacy-safe Hermes context opportunity analyzer for ContextPilot.
+
+Unlike ``hermes_contextpilot_monitor.py`` (which never reads message bodies),
+this analyzer *does* inspect message content and tool outputs in order to find
+concrete token-reduction opportunities: exact duplicate tool outputs, repeated
+line/block fingerprints, oversized tool outputs per tool, heavy sessions, and
+ContextPilot telemetry coverage.
+
+It reads content only in-memory to compute salted hashes and aggregate
+counters. Reports never contain raw message/tool text, system prompts, or raw
+session ids -- only salted SHA-256 fingerprints and numeric aggregates. This
+makes it safe to run continuously from a cron job and ship the reports.
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as dt
+import hashlib
+import json
+import sqlite3
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Iterable
+
+# Columns we are explicitly forbidden from EMITTING in any report. We may read
+# message content in-memory for hashing, but it must never reach an output file.
+FORBIDDEN_OUTPUT_KEYS = {
+    "content",
+    "system_prompt",
+    "reasoning",
+    "reasoning_content",
+    "reasoning_details",
+    "tool_calls",
+    "codex_reasoning_items",
+    "codex_message_items",
+}
+
+# Tunables (overridable via CLI).
+DEFAULT_MIN_BLOCK_CHARS = 40       # ignore trivial lines when fingerprinting
+DEFAULT_MIN_BLOCK_REPEAT = 3       # a block must recur this often to be a "repeat"
+DEFAULT_LARGE_OUTPUT_CHARS = 8000  # tool outputs at/above this are "large"
+DEFAULT_TOP_N = 20
+EST_CHARS_PER_TOKEN = 4
+
+
+def _est_tokens(chars: int) -> int:
+    return chars // EST_CHARS_PER_TOKEN
+
+
+def _salted_hash(text: str, salt: str, *, length: int = 16) -> str:
+    return hashlib.sha256(f"{salt}:{text}".encode("utf-8", "replace")).hexdigest()[:length]
+
+
+def _salt_fingerprint(salt: str) -> str:
+    # Confirms a salt was applied without revealing it.
+    return hashlib.sha256(f"fingerprint:{salt}".encode()).hexdigest()[:12]
+
+
+def _connect_readonly(path: Path) -> sqlite3.Connection:
+    uri = f"file:{path}?mode=ro"
+    return sqlite3.connect(uri, uri=True)
+
+
+# ---------------------------------------------------------------------------
+# Data structures (all privacy-safe: hashes + counters only)
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class DuplicateToolOutput:
+    content_hash: str
+    tool_name: str | None
+    occurrences: int
+    char_length: int
+    est_tokens: int
+    est_wasted_tokens: int  # tokens spent re-sending identical output: (n-1) * est_tokens
+
+
+@dataclass
+class RepeatedBlock:
+    block_hash: str
+    occurrences: int
+    char_length: int
+    est_tokens: int
+    est_wasted_tokens: int  # (n-1) * est_tokens
+
+
+@dataclass
+class ToolSizeStat:
+    tool_name: str
+    output_count: int
+    total_chars: int
+    max_chars: int
+    avg_chars: int
+    total_est_tokens: int
+    large_output_count: int  # outputs >= large_output_chars threshold
+
+
+@dataclass
+class HeavySession:
+    session_hash: str
+    source: str | None
+    input_tokens: int
+    output_tokens: int
+    message_count: int
+    tool_call_count: int
+    api_call_count: int
+
+
+@dataclass
+class TelemetryCoverage:
+    events: int
+    chars_saved: int
+    tokens_saved: int
+    avg_tokens_saved_per_event: float
+    coverage_ratio_pct: float           # tokens_saved / (tokens_saved + total_input_tokens)
+    malformed_records_skipped: int
+
+
+@dataclass
+class OpportunityReport:
+    date: str
+    since_hours: int
+    salt_fingerprint: str
+    tool_message_count: int
+    total_tool_output_chars: int
+    total_tool_output_est_tokens: int
+    exact_duplicate_groups: list[DuplicateToolOutput]
+    duplicate_tool_output_groups: int
+    duplicate_tool_output_wasted_tokens: int
+    repeated_block_count: int
+    repeated_block_wasted_tokens: int
+    repeated_blocks: list[RepeatedBlock]
+    large_tool_outputs_by_tool: list[ToolSizeStat]
+    heavy_sessions: list[HeavySession]
+    telemetry: TelemetryCoverage
+    notes: list[str] = field(default_factory=list)
+
+
+# ---------------------------------------------------------------------------
+# Loading
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class _ToolMessage:
+    tool_name: str | None
+    content: str
+
+
+def load_tool_messages(
+    db_path: Path, *, since_hours: int
+) -> list[_ToolMessage]:
+    """Load tool-output messages within the window.
+
+    Content is returned for in-memory hashing only; callers must not emit it.
+    A message is treated as tool output when ``role='tool'`` or ``tool_name``
+    is set.
+    """
+    cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600
+    conn = _connect_readonly(db_path)
+    try:
+        cols = {row[1] for row in conn.execute("PRAGMA table_info(messages)")}
+        if "content" not in cols:
+            return []
+        has_tool_name = "tool_name" in cols
+        has_ts = "timestamp" in cols
+        select_tool = "tool_name" if has_tool_name else "NULL AS tool_name"
+        where = []
+        params: list[object] = []
+        if has_ts:
+            where.append("timestamp >= ?")
+            params.append(cutoff)
+        tool_pred = "role = 'tool'"
+        if has_tool_name:
+            tool_pred = "(role = 'tool' OR tool_name IS NOT NULL)"
+        where.append(tool_pred)
+        sql = (
+            f"SELECT {select_tool}, content FROM messages "
+            f"WHERE {' AND '.join(where)}"
+        )
+        rows = conn.execute(sql, params).fetchall()
+    finally:
+        conn.close()
+
+    out: list[_ToolMessage] = []
+    for tool_name, content in rows:
+        if content is None:
+            continue
+        out.append(_ToolMessage(tool_name=tool_name, content=str(content)))
+    return out
+
+
+def load_heavy_sessions(
+    db_path: Path, *, since_hours: int, salt: str, top_n: int
+) -> list[HeavySession]:
+    cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600
+    conn = _connect_readonly(db_path)
+    try:
+        cols = {row[1] for row in conn.execute("PRAGMA table_info(sessions)")}
+        if "id" not in cols:
+            return []
+        wanted = [
+            "id",
+            "source",
+            "input_tokens",
+            "output_tokens",
+            "message_count",
+            "tool_call_count",
+            "api_call_count",
+        ]
+        select_cols = [c if c in cols else f"NULL AS {c}" for c in wanted]
+        where = []
+        params: list[object] = []
+        if "started_at" in cols:
+            where.append("started_at >= ?")
+            params.append(cutoff)
+        if "archived" in cols:
+            where.append("archived = 0")
+        sql = f"SELECT {', '.join(select_cols)} FROM sessions"
+        if where:
+            sql += " WHERE " + " AND ".join(where)
+        sql += " ORDER BY input_tokens DESC"
+        rows = conn.execute(sql, params).fetchall()
+    finally:
+        conn.close()
+
+    sessions: list[HeavySession] = []
+    for sid, source, inp, out_tok, msgs, tools, apis in rows:
+        sessions.append(
+            HeavySession(
+                session_hash=_salted_hash(str(sid), salt),
+                source=source,
+                input_tokens=int(inp or 0),
+                output_tokens=int(out_tok or 0),
+                message_count=int(msgs or 0),
+                tool_call_count=int(tools or 0),
+                api_call_count=int(apis or 0),
+            )
+        )
+    sessions.sort(key=lambda s: (s.input_tokens, s.tool_call_count), reverse=True)
+    return sessions[:top_n]
+
+
+def total_input_tokens(db_path: Path, *, since_hours: int) -> int:
+    """Sum input tokens across ALL in-window sessions (not just the top-N)."""
+    cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600
+    conn = _connect_readonly(db_path)
+    try:
+        cols = {row[1] for row in conn.execute("PRAGMA table_info(sessions)")}
+        if "input_tokens" not in cols:
+            return 0
+        where = []
+        params: list[object] = []
+        if "started_at" in cols:
+            where.append("started_at >= ?")
+            params.append(cutoff)
+        if "archived" in cols:
+            where.append("archived = 0")
+        sql = "SELECT COALESCE(SUM(input_tokens), 0) FROM sessions"
+        if where:
+            sql += " WHERE " + " AND ".join(where)
+        (total,) = conn.execute(sql, params).fetchone()
+    finally:
+        conn.close()
+    return int(total or 0)
+
+
+def parse_telemetry(
+    telemetry_path: Path, *, since_hours: int, total_input_tokens: int
+) -> TelemetryCoverage:
+    """Aggregate the metadata-only ContextPilot telemetry file.
+
+    Tolerates malformed lines (non-JSON, non-dict, missing counters) by
+    skipping and counting them. Never reads message content.
+    """
+    events = 0
+    chars = 0
+    tokens = 0
+    malformed = 0
+    if telemetry_path and telemetry_path.exists():
+        cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600
+        with telemetry_path.open("r", encoding="utf-8", errors="replace") as f:
+            for raw in f:
+                line = raw.strip()
+                if not line:
+                    continue
+                try:
+                    record = json.loads(line)
+                except (ValueError, TypeError):
+                    malformed += 1
+                    continue
+                if not isinstance(record, dict):
+                    malformed += 1
+                    continue
+                ts = record.get("ts")
+                if isinstance(ts, (int, float)) and ts < cutoff:
+                    continue
+                cs = record.get("chars_saved")
+                if not isinstance(cs, (int, float)):
+                    malformed += 1
+                    continue
+                saved_tokens = record.get("tokens_saved")
+                events += 1
+                chars += int(cs)
+                tokens += (
+                    int(saved_tokens)
+                    if isinstance(saved_tokens, (int, float))
+                    else int(cs) // EST_CHARS_PER_TOKEN
+                )
+
+    denom = tokens + total_input_tokens
+    coverage = (tokens / denom * 100.0) if denom else 0.0
+    avg = (tokens / events) if events else 0.0
+    return TelemetryCoverage(
+        events=events,
+        chars_saved=chars,
+        tokens_saved=tokens,
+        avg_tokens_saved_per_event=round(avg, 2),
+        coverage_ratio_pct=round(coverage, 2),
+        malformed_records_skipped=malformed,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Detection
+# ---------------------------------------------------------------------------
+
+
+def detect_exact_duplicate_tool_outputs(
+    messages: Iterable[_ToolMessage], *, salt: str, top_n: int
+) -> list[DuplicateToolOutput]:
+    groups: dict[str, dict] = {}
+    for msg in messages:
+        content = msg.content
+        if not content:
+            continue
+        h = _salted_hash(content, salt)
+        g = groups.get(h)
+        if g is None:
+            groups[h] = {
+                "tool_name": msg.tool_name,
+                "occurrences": 1,
+                "char_length": len(content),
+            }
+        else:
+            g["occurrences"] += 1
+            if g["tool_name"] != msg.tool_name:
+                g["tool_name"] = None  # mixed tools produced identical output
+
+    dups: list[DuplicateToolOutput] = []
+    for h, g in groups.items():
+        if g["occurrences"] < 2:
+            continue
+        est = _est_tokens(g["char_length"])
+        dups.append(
+            DuplicateToolOutput(
+                content_hash=h,
+                tool_name=g["tool_name"],
+                occurrences=g["occurrences"],
+                char_length=g["char_length"],
+                est_tokens=est,
+                est_wasted_tokens=est * (g["occurrences"] - 1),
+            )
+        )
+    dups.sort(key=lambda d: d.est_wasted_tokens, reverse=True)
+    return dups[:top_n]
+
+
+def detect_repeated_blocks(
+    messages: Iterable[_ToolMessage],
+    *,
+    salt: str,
+    min_block_chars: int,
+    min_repeat: int,
+    top_n: int,
+) -> list[RepeatedBlock]:
+    counts: dict[str, dict] = {}
+    for msg in messages:
+        seen_in_msg: set[str] = set()
+        for line in msg.content.splitlines():
+            block = line.strip()
+            if len(block) < min_block_chars:
+                continue
+            h = _salted_hash(block, salt)
+            # Count cross-message recurrence; collapse repeats within one
+            # message so a single noisy output cannot dominate.
+            if h in seen_in_msg:
+                continue
+            seen_in_msg.add(h)
+            c = counts.get(h)
+            if c is None:
+                counts[h] = {"occurrences": 1, "char_length": len(block)}
+            else:
+                c["occurrences"] += 1
+
+    blocks: list[RepeatedBlock] = []
+    for h, c in counts.items():
+        if c["occurrences"] < min_repeat:
+            continue
+        est = _est_tokens(c["char_length"])
+        blocks.append(
+            RepeatedBlock(
+                block_hash=h,
+                occurrences=c["occurrences"],
+                char_length=c["char_length"],
+                est_tokens=est,
+                est_wasted_tokens=est * (c["occurrences"] - 1),
+            )
+        )
+    blocks.sort(key=lambda b: b.est_wasted_tokens, reverse=True)
+    return blocks[:top_n]
+
+
+def summarize_tool_sizes(
+    messages: Iterable[_ToolMessage], *, large_output_chars: int, top_n: int
+) -> list[ToolSizeStat]:
+    agg: dict[str, dict] = {}
+    for msg in messages:
+        name = msg.tool_name or "(unknown)"
+        length = len(msg.content)
+        a = agg.get(name)
+        if a is None:
+            agg[name] = {
+                "output_count": 1,
+                "total_chars": length,
+                "max_chars": length,
+                "large_output_count": 1 if length >= large_output_chars else 0,
+            }
+        else:
+            a["output_count"] += 1
+            a["total_chars"] += length
+            a["max_chars"] = max(a["max_chars"], length)
+            if length >= large_output_chars:
+                a["large_output_count"] += 1
+
+    stats: list[ToolSizeStat] = []
+    for name, a in agg.items():
+        stats.append(
+            ToolSizeStat(
+                tool_name=name,
+                output_count=a["output_count"],
+                total_chars=a["total_chars"],
+                max_chars=a["max_chars"],
+                avg_chars=a["total_chars"] // a["output_count"],
+                total_est_tokens=_est_tokens(a["total_chars"]),
+                large_output_count=a["large_output_count"],
+            )
+        )
+    stats.sort(key=lambda s: s.total_chars, reverse=True)
+    return stats[:top_n]
+
+
+# ---------------------------------------------------------------------------
+# Build + write
+# ---------------------------------------------------------------------------
+
+
+def build_report(
+    *,
+    date: str,
+    since_hours: int,
+    salt: str,
+    tool_messages: list[_ToolMessage],
+    heavy_sessions: list[HeavySession],
+    telemetry: TelemetryCoverage,
+    min_block_chars: int = DEFAULT_MIN_BLOCK_CHARS,
+    min_block_repeat: int = DEFAULT_MIN_BLOCK_REPEAT,
+    large_output_chars: int = DEFAULT_LARGE_OUTPUT_CHARS,
+    top_n: int = DEFAULT_TOP_N,
+) -> OpportunityReport:
+    """Run the three tool-output detectors and bundle their results into a report."""
+    groups = detect_exact_duplicate_tool_outputs(
+        tool_messages, salt=salt, top_n=top_n
+    )
+    repeats = detect_repeated_blocks(
+        tool_messages,
+        salt=salt,
+        min_block_chars=min_block_chars,
+        min_repeat=min_block_repeat,
+        top_n=top_n,
+    )
+    size_stats = summarize_tool_sizes(
+        tool_messages, large_output_chars=large_output_chars, top_n=top_n
+    )
+    output_chars = sum(len(msg.content) for msg in tool_messages)
+
+    report_notes = [
+        "content-aware analysis: message/tool text was hashed in-memory only and never written to reports",
+        "all identifiers are salted SHA-256 fingerprints; counters are aggregates",
+        "wasted-token figures are heuristic estimates (chars/4); validate before acting",
+        "session 'source' and 'tool_name' are emitted verbatim as low-cardinality enums, not raw text",
+    ]
+    if not tool_messages:
+        report_notes.append("no tool-output messages observed in the selected window")
+
+    return OpportunityReport(
+        date=date,
+        since_hours=since_hours,
+        salt_fingerprint=_salt_fingerprint(salt),
+        tool_message_count=len(tool_messages),
+        total_tool_output_chars=output_chars,
+        total_tool_output_est_tokens=_est_tokens(output_chars),
+        exact_duplicate_groups=groups,
+        duplicate_tool_output_groups=len(groups),
+        duplicate_tool_output_wasted_tokens=sum(g.est_wasted_tokens for g in groups),
+        repeated_block_count=len(repeats),
+        repeated_block_wasted_tokens=sum(r.est_wasted_tokens for r in repeats),
+        repeated_blocks=repeats,
+        large_tool_outputs_by_tool=size_stats,
+        heavy_sessions=heavy_sessions,
+        telemetry=telemetry,
+        notes=report_notes,
+    )
+
+
+def _assert_no_forbidden_keys(data: dict) -> None:
+    """Defensive guard: raise if a forbidden raw-content key reached the output."""
+    # asdict() keeps tuples as tuples and json emits them, so walk them as well.
+    def walk(obj):
+        if isinstance(obj, dict):
+            for k, v in obj.items():
+                if k in FORBIDDEN_OUTPUT_KEYS:
+                    raise RuntimeError(f"refusing to emit forbidden key: {k}")
+                walk(v)
+        elif isinstance(obj, (list, tuple)):
+            for item in obj:
+                walk(item)
+
+    walk(data)
+
+
+def write_report(report: OpportunityReport, out_dir: Path) -> tuple[Path, Path]:
+    """Render *report* as JSON and Markdown under *out_dir*.
+
+    Returns ``(json_path, md_path)``. The forbidden-key guard runs before any write.
+    """
+    out_dir.mkdir(parents=True, exist_ok=True)
+    payload = asdict(report)
+    _assert_no_forbidden_keys(payload)
+
+    json_path = out_dir / f"opportunities_{report.date}.json"
+    md_path = out_dir / f"opportunities_{report.date}.md"
+    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
+    json_path.write_text(serialized + "\n", encoding="utf-8")
+
+    tel = report.telemetry
+    lines = [
+        f"# ContextPilot Hermes opportunity scan — {report.date}",
+        "",
+        f"Window: last {report.since_hours}h",
+        f"Salt fingerprint: `{report.salt_fingerprint}`",
+        "",
+        "## Summary",
+        f"- Tool-output messages: {report.tool_message_count}",
+        f"- Total tool-output tokens (est): {report.total_tool_output_est_tokens}",
+        f"- Exact duplicate groups: {report.duplicate_tool_output_groups} "
+        f"(~{report.duplicate_tool_output_wasted_tokens} wasted tokens)",
+        f"- Repeated blocks: {report.repeated_block_count} "
+        f"(~{report.repeated_block_wasted_tokens} wasted tokens)",
+        f"- Telemetry: {tel.events} events, ~{tel.tokens_saved} tokens saved, "
+        f"coverage {tel.coverage_ratio_pct}%",
+        "",
+        "## Top exact-duplicate tool outputs",
+    ]
+    # One bullet per exact-duplicate group (hash, tool, counters).
+    lines.extend(
+        f"- `{d.content_hash}` tool={d.tool_name} x{d.occurrences} "
+        f"chars={d.char_length} ~wasted={d.est_wasted_tokens} tokens"
+        for d in report.exact_duplicate_groups
+    )
+    # One bullet per repeated block.
+    lines += ["", "## Top repeated blocks"]
+    lines.extend(
+        f"- `{b.block_hash}` x{b.occurrences} chars={b.char_length} "
+        f"~wasted={b.est_wasted_tokens} tokens"
+        for b in report.repeated_blocks
+    )
+    # One bullet per tool's size statistics.
+    lines += ["", "## Large tool outputs by tool"]
+    lines.extend(
+        f"- {s.tool_name}: count={s.output_count} total_chars={s.total_chars} "
+        f"max={s.max_chars} avg={s.avg_chars} large(>=thresh)={s.large_output_count}"
+        for s in report.large_tool_outputs_by_tool
+    )
+    # One bullet per heavy session, identified by its hash.
+    lines += ["", "## Heavy sessions (hashed)"]
+    lines.extend(
+        f"- `{h.session_hash}` source={h.source} input={h.input_tokens} "
+        f"output={h.output_tokens} msgs={h.message_count} tools={h.tool_call_count} "
+        f"apis={h.api_call_count}"
+        for h in report.heavy_sessions
+    )
+    # Telemetry counters, then the free-form notes.
+    lines += ["", "## Telemetry coverage"]
+    lines += [
+        f"- Events: {tel.events}",
+        f"- Tokens saved: {tel.tokens_saved} (chars {tel.chars_saved})",
+        f"- Avg tokens saved / event: {tel.avg_tokens_saved_per_event}",
+        f"- Coverage ratio: {tel.coverage_ratio_pct}%",
+        f"- Malformed records skipped: {tel.malformed_records_skipped}",
+    ]
+    lines += ["", "## Notes"]
+    lines.extend(f"- {note}" for note in report.notes)
+    md_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+    return json_path, md_path
+
+
+def main() -> int:
+    """CLI entry point: load inputs, build the report, write it; return exit code."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    # Default to the invoking user's Hermes home instead of a hard-coded /root path.
+    parser.add_argument(
+        "--state-db", type=Path, default=Path.home() / ".hermes" / "state.db"
+    )
+    parser.add_argument(
+        "--telemetry-file",
+        type=Path,
+        default=Path.home() / ".hermes" / "contextpilot" / "telemetry.jsonl",
+        help="metadata-only ContextPilot telemetry file",
+    )
+    parser.add_argument(
+        "--out-dir", type=Path, default=Path.home() / "contextpilot" / "opportunities"
+    )
+    parser.add_argument("--since-hours", type=int, default=24)
+    parser.add_argument(
+        "--salt",
+        default="contextpilot-hermes-opportunity-v1",
+        help="salt for stable per-install content/session fingerprints",
+    )
+    parser.add_argument("--date", default=dt.date.today().isoformat())
+    parser.add_argument("--min-block-chars", type=int, default=DEFAULT_MIN_BLOCK_CHARS)
+    parser.add_argument("--min-block-repeat", type=int, default=DEFAULT_MIN_BLOCK_REPEAT)
+    parser.add_argument(
+        "--large-output-chars", type=int, default=DEFAULT_LARGE_OUTPUT_CHARS
+    )
+    parser.add_argument("--top-n", type=int, default=DEFAULT_TOP_N)
+    args = parser.parse_args()
+
+    if not args.state_db.exists():
+        raise SystemExit(f"Hermes state DB not found: {args.state_db}")
+
+    # Harden for unattended cron use: never dump a traceback (which would echo
+    # the DB path / SQL); emit only the exception class name and a non-zero code.
+    try:
+        tool_messages = load_tool_messages(args.state_db, since_hours=args.since_hours)
+        heavy_sessions = load_heavy_sessions(
+            args.state_db, since_hours=args.since_hours, salt=args.salt, top_n=args.top_n
+        )
+        total_input = total_input_tokens(args.state_db, since_hours=args.since_hours)
+        telemetry = parse_telemetry(
+            args.telemetry_file,
+            since_hours=args.since_hours,
+            total_input_tokens=total_input,
+        )
+        report = build_report(
+            date=args.date,
+            since_hours=args.since_hours,
+            salt=args.salt,
+            tool_messages=tool_messages,
+            heavy_sessions=heavy_sessions,
+            telemetry=telemetry,
+            min_block_chars=args.min_block_chars,
+            min_block_repeat=args.min_block_repeat,
+            large_output_chars=args.large_output_chars,
+            top_n=args.top_n,
+        )
+        json_path, md_path = write_report(report, args.out_dir)
+    except Exception as exc:  # noqa: BLE001 - cron-safe: report class only, no payload
+        print(json.dumps({"ok": False, "error": type(exc).__name__}))
+        return 1
+
+    result = {"ok": True, "json": str(json_path), "markdown": str(md_path)}
+    print(json.dumps(result, ensure_ascii=False))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/test_hermes_context_opportunity_analyzer.py b/tests/test_hermes_context_opportunity_analyzer.py
new file mode 100644
index 0000000..0c18e63
--- /dev/null
+++ b/tests/test_hermes_context_opportunity_analyzer.py
@@ -0,0 +1,222 @@
+import importlib.util
+import json
+import sqlite3
+import sys
+from pathlib import Path
+
+
+MODULE_PATH = (
+    Path(__file__).resolve().parents[1]
+    / "scripts"
+    / "analyze_hermes_context_opportunities.py"
+)
+spec = importlib.util.spec_from_file_location(
+    "analyze_hermes_context_opportunities", MODULE_PATH
+)
+analyzer = importlib.util.module_from_spec(spec)
+sys.modules[spec.name] = analyzer
+spec.loader.exec_module(analyzer)
+
+
+FAR_FUTURE = 4102444800.0  # 2100-01-01, always inside a generous test window
+WIDE_WINDOW = 24 * 365 * 100
+
+
+def _make_db(path: Path, messages, *, sessions=None):
+    conn = sqlite3.connect(path)
+    conn.execute(
+        """
+        CREATE TABLE sessions (
+            id TEXT PRIMARY KEY,
+            source TEXT,
+            started_at REAL NOT NULL,
+            ended_at REAL,
+            message_count INTEGER DEFAULT 0,
+            tool_call_count INTEGER DEFAULT 0,
+            input_tokens INTEGER DEFAULT 0,
+            output_tokens INTEGER DEFAULT 0,
+            api_call_count INTEGER DEFAULT 0,
+            archived INTEGER NOT NULL DEFAULT 0,
+            system_prompt TEXT
+        )
+        """
+    )
+    conn.execute(
+        """
+        CREATE TABLE messages (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            session_id TEXT NOT NULL,
+            role TEXT NOT NULL,
+            content TEXT,
+            tool_name TEXT,
+            reasoning TEXT,
+            timestamp REAL NOT NULL
+        )
+        """
+    )
+    # Session tuples: (id, source, <unused>, tool_call_count, message_count, input_tokens,
+    # output_tokens, api_call_count, system_prompt); started_at is always FAR_FUTURE.
+    for s in sessions or [
+        ("raw-session-id", "discord", None, 4, 6, 1000, 200, 3, "SECRET SYSTEM PROMPT")
+    ]:
+        conn.execute(
+            """
+            INSERT INTO sessions (
+                id, source, started_at, tool_call_count, message_count,
+                input_tokens, output_tokens, api_call_count, archived, system_prompt
+            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, 0, ?)
+            """,
+            (s[0], s[1], FAR_FUTURE, s[3], s[4], s[5], s[6], s[7], s[8]),
+        )
+    for role, content, tool_name in messages:
+        conn.execute(
+            "INSERT INTO messages (session_id, role, content, tool_name, reasoning, timestamp)"
+            " VALUES (?, ?, ?, ?, ?, ?)",
+            ("raw-session-id", role, content, tool_name, "PRIVATE REASONING", FAR_FUTURE),
+        )
+    conn.commit()
+    conn.close()
+
+
+def _analyze(db, tmp_path, telemetry=None, salt="test-salt"):
+    """Load everything from *db* over WIDE_WINDOW and return the built report."""
+    messages = analyzer.load_tool_messages(db, since_hours=WIDE_WINDOW)
+    sessions = analyzer.load_heavy_sessions(
+        db, since_hours=WIDE_WINDOW, salt=salt, top_n=20
+    )
+    telemetry_path = tmp_path / "none.jsonl" if telemetry is None else telemetry
+    coverage = analyzer.parse_telemetry(
+        telemetry_path,
+        since_hours=WIDE_WINDOW,
+        total_input_tokens=sum(s.input_tokens for s in sessions),
+    )
+    return analyzer.build_report(
+        date="2100-01-01",
+        since_hours=24,
+        salt=salt,
+        tool_messages=messages,
+        heavy_sessions=sessions,
+        telemetry=coverage,
+        min_block_repeat=2,
+    )
+
+
+def test_no_raw_content_leaks_in_reports(tmp_path):
+    """Neither output file may contain raw text, prompts, reasoning or raw ids."""
+    db_path = tmp_path / "state.db"
+    payload = "TOP-SECRET-TOOL-OUTPUT-PAYLOAD-DO-NOT-LEAK " * 10
+    rows = [
+        ("tool", payload, "Bash"),
+        ("tool", payload, "Bash"),
+        ("user", "DO NOT READ ME USER TEXT", None),
+    ]
+    _make_db(db_path, rows)
+    report = _analyze(db_path, tmp_path)
+    written = analyzer.write_report(report, tmp_path / "out")
+    emitted = "".join(p.read_text(encoding="utf-8") for p in written)
+    forbidden = (
+        "TOP-SECRET-TOOL-OUTPUT-PAYLOAD",
+        "DO NOT READ ME",
+        "SECRET SYSTEM PROMPT",
+        "PRIVATE REASONING",
+        "raw-session-id",
+    )
+    for marker in forbidden:
+        assert marker not in emitted
+    assert report.duplicate_tool_output_groups == 1
+    assert report.heavy_sessions[0].session_hash != "raw-session-id"
+
+
+def test_exact_duplicate_tool_outputs_counted(tmp_path):
+    """Three identical outputs form one group with two wasted copies."""
+    db_path = tmp_path / "state.db"
+    repeated = "identical output line one\nidentical output line two\n" * 3
+    rows = [
+        ("tool", repeated, "Read"),
+        ("tool", repeated, "Read"),
+        ("tool", repeated, "Read"),
+    ]
+    _make_db(db_path, rows)
+    report = _analyze(db_path, tmp_path)
+
+    assert report.duplicate_tool_output_groups == 1
+    first = report.exact_duplicate_groups[0]
+    assert first.occurrences == 3
+    assert first.tool_name == "Read"
+    # Only the first send is needed; the other two are counted as waste.
+    assert first.est_wasted_tokens == first.est_tokens * 2
+    assert report.duplicate_tool_output_wasted_tokens == first.est_wasted_tokens
+
+
+def test_near_or_different_content_not_exact_duplicate(tmp_path):
+    """Outputs differing by even one character must not be grouped."""
+    db_path = tmp_path / "state.db"
+    base = "the quick brown fox jumps over the lazy dog " * 5
+
+    # A near-duplicate and an unrelated output alongside the base text.
+    variants = [
+        base,
+        base + "X",  # one char different -> different hash
+        "completely unrelated tool output content here " * 5,
+    ]
+    _make_db(db_path, [("tool", text, "Bash") for text in variants])
+    report = _analyze(db_path, tmp_path)
+
+    # No two outputs are byte-identical -> zero exact-duplicate groups.
+    assert report.duplicate_tool_output_groups == 0
+    assert report.duplicate_tool_output_wasted_tokens == 0
+
+
+def test_malformed_telemetry_tolerated(tmp_path):
+    """Bad telemetry lines are counted and skipped; good ones still aggregate."""
+    db_path = tmp_path / "state.db"
+    _make_db(db_path, [("tool", "some output", "Bash")])
+    records = [
+        json.dumps({"ts": FAR_FUTURE, "chars_saved": 400, "tokens_saved": 100}),
+        json.dumps({"ts": FAR_FUTURE, "chars_saved": 200}),  # missing tokens_saved
+        "this is not json at all",
+        json.dumps([1, 2, 3]),  # not a dict
+        json.dumps({"ts": FAR_FUTURE, "note": "no counters here"}),
+        "",
+    ]
+
+    telemetry_file = tmp_path / "telemetry.jsonl"
+    telemetry_file.write_text(
+        "\n".join(records) + "\n", encoding="utf-8"
+    )
+
+    coverage = _analyze(db_path, tmp_path, telemetry=telemetry_file).telemetry
+    # Two valid records aggregated; second infers tokens from chars (200//4=50).
+    assert coverage.events == 2
+    assert coverage.chars_saved == 600
+    assert coverage.tokens_saved == 150
+
+    # Non-json, non-dict, and missing-counter lines are skipped, not fatal.
+    assert coverage.malformed_records_skipped == 3
+    assert coverage.coverage_ratio_pct > 0
+
+
+def test_repeated_blocks_and_large_outputs(tmp_path):
+    """Shared lines show up as repeated blocks; the 9000-char output counts as large."""
+    db_path = tmp_path / "state.db"
+    shared = "this shared boilerplate block is long enough to be fingerprinted"
+    rows = [
+        ("tool", shared + "\nunique tail alpha that is also sufficiently long here", "Bash"),
+        ("tool", shared + "\nunique tail beta that is also sufficiently long here", "Bash"),
+        ("tool", "x" * 9000, "Read"),
+    ]
+    _make_db(db_path, rows)
+    report = _analyze(db_path, tmp_path)
+
+    # The first line is common to both Bash outputs; only Read has a large output.
+    assert any(block.occurrences >= 2 for block in report.repeated_blocks)
+    read_row = next(s for s in report.large_tool_outputs_by_tool if s.tool_name == "Read")
+    assert read_row.large_output_count == 1
+
+
+def test_missing_telemetry_file_is_safe(tmp_path):
+    db_path = tmp_path / "state.db"
+    _make_db(db_path, [("tool", "out", "Bash")])
+    coverage = _analyze(db_path, tmp_path, telemetry=tmp_path / "nope.jsonl").telemetry
+    assert coverage.events == 0
+    assert coverage.malformed_records_skipped == 0

From 992cd1ad8635f408c5f3848ee8d7fe6d39047039 Mon Sep 17 00:00:00 2001
From: root <root@vmi3142307.contaboserver.net>
Date: Thu, 11 Jun 2026 11:51:06 +0200
Subject: [PATCH 4/9] feat: analyze LLM-bound context redundancy

---
 docs/guides/hermes-monitor.md                 |  36 ++
 .../analyze_hermes_context_opportunities.py   | 409 +++++++++++++++++-
 ...est_hermes_context_opportunity_analyzer.py | 236 +++++++++-
 3 files changed, 658 insertions(+), 23 deletions(-)

diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md
index 4d20d43..8798885 100644
--- a/docs/guides/hermes-monitor.md
+++ b/docs/guides/hermes-monitor.md
@@ -69,12 +69,48 @@ It surfaces concrete token-reduction opportunities:
 - heavy sessions by input-token / tool-call / message counts (hashed ids),
 - ContextPilot telemetry coverage and savings ratios.
 
+### LLM-bound block redundancy
+
+The analyzer also performs an **LLM-bound block scan** that looks *only* at
+content Hermes would actually send to a model, and reports where the same block
+is paid for more than once:
+
+- `sessions.system_prompt`, classified heuristically as `system_prompt` or
+  `skill_prompt` (skill frontmatter / "use this skill" style cues),
+- active `messages.content` for roles `system` / `user` / `assistant` / `tool`,
+  bucketed as `system_prompt`, `user_prompt`, `assistant_context`, or `tool_result`,
+- tool-result messages (`role='tool'` or `tool_name` set) as `tool_result`.
+
+Inactive messages are skipped when an `active` column exists, and archived
+sessions (and their messages) are skipped when an `archived` column exists. Each
+block is split line-wise, fingerprinted with a salted SHA-256 hash, and
+aggregated. The report then shows:
+
+- **redundancy by block type** — per-type block / unique / repeated counts and
+  estimated redundant tokens,
+- **cross-type repeated blocks** — the headline signal: a single fingerprint
+  observed in 2+ block types (e.g. the same chunk shipped from a skill/system
+  prompt *and* a tool result *and* a user prompt). Reported only as a hash plus
+  per-type counters — never the raw text.
+
+Use `--all-sessions` to ignore the `--since-hours` window and scan **all**
+non-archived sessions and active messages (useful for a one-shot, whole-history
+audit rather than a rolling daily window):
+
 ```bash
+# rolling daily window
 python scripts/analyze_hermes_context_opportunities.py \
   --state-db /root/.hermes/state.db \
   --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl \
   --out-dir ~/contextpilot/opportunities \
   --since-hours 24
+
+# whole-history audit across every session and LLM-bound block
+python scripts/analyze_hermes_context_opportunities.py \
+  --state-db /root/.hermes/state.db \
+  --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl \
+  --out-dir ~/contextpilot/opportunities \
+  --all-sessions
 ```
 
 Outputs:
diff --git a/scripts/analyze_hermes_context_opportunities.py b/scripts/analyze_hermes_context_opportunities.py
index 780315a..5bcd3b1 100644
--- a/scripts/analyze_hermes_context_opportunities.py
+++ b/scripts/analyze_hermes_context_opportunities.py
@@ -86,6 +86,54 @@ class RepeatedBlock:
     est_wasted_tokens: int  # (n-1) * est_tokens
 
 
+# Recognized LLM-bound block types. These are low-cardinality enums, safe to
+# emit verbatim (they describe the *origin* of a block, never its text).
+BLOCK_TYPES = (
+    "system_prompt",
+    "skill_prompt",
+    "user_prompt",
+    "assistant_context",
+    "tool_result",
+    "unknown",
+)
+
+
+@dataclass
+class TypeCount:
+    block_type: str  # block-type label this count belongs to
+    count: int  # occurrences recorded for that block type
+
+
+@dataclass
+class BlockTypeStat:
+    """Aggregate redundancy within a single LLM-bound block type."""
+
+    block_type: str
+    item_count: int            # source items (prompts/messages) of this type
+    block_count: int           # total fingerprintable block instances
+    unique_block_count: int    # distinct fingerprints
+    repeated_block_count: int  # fingerprints recurring >= min_repeat within type
+    est_redundant_tokens: int  # sum over repeats of (occ-1) * est_tokens
+
+
+@dataclass
+class CrossTypeBlockGroup:
+    """A single block fingerprint observed in 2+ distinct block types.
+
+    This is the headline signal: the same chunk of text is being shipped to the
+    LLM from, e.g., a skill/system prompt *and* a tool result, so it is paying
+    for the same tokens twice from different sources.
+    """
+
+    block_hash: str
+    block_types: list[str]               # sorted distinct types this block spans
+    type_occurrences: list[TypeCount]    # per-type occurrence counts
+    occurrences: int                     # total occurrences across all types
+    char_length: int
+    est_tokens: int
+    est_wasted_tokens: int               # (occurrences - 1) * est_tokens
+
+
 @dataclass
 class ToolSizeStat:
     tool_name: str
@@ -122,6 +170,7 @@ class TelemetryCoverage:
 class OpportunityReport:
     date: str
     since_hours: int
+    all_sessions: bool
     salt_fingerprint: str
     tool_message_count: int
     total_tool_output_chars: int
@@ -135,6 +184,11 @@ class OpportunityReport:
     large_tool_outputs_by_tool: list[ToolSizeStat]
     heavy_sessions: list[HeavySession]
     telemetry: TelemetryCoverage
+    # LLM-bound block analysis (system/skill prompts, prompts, tool results).
+    llm_bound_item_count: int
+    llm_block_types: list[BlockTypeStat]
+    cross_type_block_groups: list[CrossTypeBlockGroup]
+    cross_type_wasted_tokens: int
     notes: list[str] = field(default_factory=list)
 
 
@@ -149,16 +203,75 @@ class _ToolMessage:
     content: str
 
 
+@dataclass
+class _LLMContent:
+    """A chunk of content that Hermes would actually send to the LLM.
+
+    Held in-memory only for hashing; ``content`` must never be emitted.
+    """
+
+    block_type: str  # origin label assigned by the loader, e.g. "tool_result"
+    content: str  # raw text; hashed in memory, never written out
+
+
+def _window_cutoff(since_hours: int, all_sessions: bool) -> float | None:
+    """Compute the oldest epoch timestamp (UTC seconds) still inside the window.
+
+    Returns ``None`` when ``all_sessions`` is set, which callers treat as
+    "no time filter"; otherwise the current time minus ``since_hours`` hours.
+    """
+    if not all_sessions:
+        return dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600
+    return None
+
+
+def _classify_system_prompt(text: str) -> str:
+    """Return ``"skill_prompt"`` or ``"system_prompt"`` for a session system prompt.
+
+    Purely heuristic and case-insensitive; only the enum label leaves this
+    function, never any part of ``text``.
+    """
+    lowered = text.lower()
+    # Skill-style frontmatter block (e.g. "---\nname: ...\ndescription: ...").
+    has_frontmatter = lowered.lstrip().startswith("---") and "name:" in lowered[:300]
+
+    # Substrings treated as a sign of skill material anywhere in the prompt.
+    skill_cues = (
+        "use this skill",
+        "available skills",
+        "when to use",
+        "invoke it via skill",
+        "<skill",
+        "# skill",
+        "skill tool",
+    )
+    if has_frontmatter or any(cue in lowered for cue in skill_cues):
+        return "skill_prompt"
+    return "system_prompt"
+
+
+def _message_block_type(role: str | None, tool_name: str | None) -> str:
+    """Map a message's role/tool_name to its block-type label (tool results win)."""
+    if role == "tool" or tool_name is not None:
+        return "tool_result"
+    by_role = {
+        "user": "user_prompt",
+        "assistant": "assistant_context",
+        "system": "system_prompt",
+    }
+    return by_role.get(role, "unknown")
+
+
 def load_tool_messages(
-    db_path: Path, *, since_hours: int
+    db_path: Path, *, since_hours: int, all_sessions: bool = False
 ) -> list[_ToolMessage]:
     """Load tool-output messages within the window.
 
     Content is returned for in-memory hashing only; callers must not emit it.
     A message is treated as tool output when ``role='tool'`` or ``tool_name``
-    is set.
+    is set. With ``all_sessions=True`` the time window is ignored.
     """
-    cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600
+    cutoff = _window_cutoff(since_hours, all_sessions)
     conn = _connect_readonly(db_path)
     try:
         cols = {row[1] for row in conn.execute("PRAGMA table_info(messages)")}
@@ -169,9 +282,11 @@ def load_tool_messages(
         select_tool = "tool_name" if has_tool_name else "NULL AS tool_name"
         where = []
         params: list[object] = []
-        if has_ts:
+        if has_ts and cutoff is not None:
             where.append("timestamp >= ?")
             params.append(cutoff)
+        if "active" in cols:
+            where.append("active = 1")
         tool_pred = "role = 'tool'"
         if has_tool_name:
             tool_pred = "(role = 'tool' OR tool_name IS NOT NULL)"
@@ -192,10 +307,92 @@ def load_tool_messages(
     return out
 
 
+def load_llm_bound_content(
+    db_path: Path, *, since_hours: int, all_sessions: bool = False
+) -> list[_LLMContent]:
+    """Collect the text Hermes would send to an LLM, tagged by block type.
+
+    Text is kept in memory for hashing only and must never be emitted:
+      * ``sessions.system_prompt`` -> ``system_prompt`` or ``skill_prompt``
+        (windowed on ``started_at``, archived sessions excluded),
+      * ``messages.content`` with role ``system``/``user``/``assistant``/``tool``
+        -> label from ``_message_block_type`` (windowed on ``timestamp``).
+
+    Each filter is applied only when the columns it needs exist (``role``,
+    ``active``, ``archived``, ``started_at``, ``timestamp``).
+    ``all_sessions=True`` drops the time window.
+    """
+    cutoff = _window_cutoff(since_hours, all_sessions)
+    conn = _connect_readonly(db_path)
+    out: list[_LLMContent] = []
+    try:
+        scols = {row[1] for row in conn.execute("PRAGMA table_info(sessions)")}
+        mcols = {row[1] for row in conn.execute("PRAGMA table_info(messages)")}
+
+        # --- session-level prompts, classified as system or skill material ---
+        if "system_prompt" in scols:
+            where = ["system_prompt IS NOT NULL"]
+            params: list[object] = []
+            if cutoff is not None and "started_at" in scols:
+                where.append("started_at >= ?")
+                params.append(cutoff)
+            if "archived" in scols:
+                where.append("archived = 0")
+            sql = f"SELECT system_prompt FROM sessions WHERE {' AND '.join(where)}"
+            for (sp,) in conn.execute(sql, params):
+                if sp is None:  # defensive; the WHERE clause already drops NULLs
+                    continue
+                text = str(sp)
+                out.append(
+                    _LLMContent(block_type=_classify_system_prompt(text), content=text)
+                )
+
+        # --- messages, optionally joined to sessions to drop archived ones ---
+        if "content" in mcols:
+            has_role = "role" in mcols
+            has_tool_name = "tool_name" in mcols
+            select = [
+                "messages.role" if has_role else "NULL AS role",
+                "messages.content",
+                "messages.tool_name" if has_tool_name else "NULL AS tool_name",
+            ]
+            where = ["messages.content IS NOT NULL"]
+            params = []
+            if has_role:
+                where.append(
+                    "messages.role IN ('system', 'user', 'assistant', 'tool')"
+                )
+            if cutoff is not None and "timestamp" in mcols:
+                where.append("messages.timestamp >= ?")
+                params.append(cutoff)
+            if "active" in mcols:
+                where.append("messages.active = 1")
+            join = ""
+            if "archived" in scols and "session_id" in mcols and "id" in scols:
+                join = " JOIN sessions ON sessions.id = messages.session_id"
+                where.append("sessions.archived = 0")
+            sql = (
+                f"SELECT {', '.join(select)} FROM messages{join} "
+                f"WHERE {' AND '.join(where)}"
+            )
+            for role, content, tool_name in conn.execute(sql, params):
+                if content is None:
+                    continue
+                out.append(
+                    _LLMContent(
+                        block_type=_message_block_type(role, tool_name),
+                        content=str(content),
+                    )
+                )
+    finally:
+        conn.close()
+    return out
+
+
 def load_heavy_sessions(
-    db_path: Path, *, since_hours: int, salt: str, top_n: int
+    db_path: Path, *, since_hours: int, salt: str, top_n: int, all_sessions: bool = False
 ) -> list[HeavySession]:
-    cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600
+    cutoff = _window_cutoff(since_hours, all_sessions)
     conn = _connect_readonly(db_path)
     try:
         cols = {row[1] for row in conn.execute("PRAGMA table_info(sessions)")}
@@ -213,7 +410,7 @@ def load_heavy_sessions(
         select_cols = [c if c in cols else f"NULL AS {c}" for c in wanted]
         where = []
         params: list[object] = []
-        if "started_at" in cols:
+        if cutoff is not None and "started_at" in cols:
             where.append("started_at >= ?")
             params.append(cutoff)
         if "archived" in cols:
@@ -243,9 +440,11 @@ def load_heavy_sessions(
     return sessions[:top_n]
 
 
-def total_input_tokens(db_path: Path, *, since_hours: int) -> int:
+def total_input_tokens(
+    db_path: Path, *, since_hours: int, all_sessions: bool = False
+) -> int:
     """Sum input tokens across ALL in-window sessions (not just the top-N)."""
-    cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600
+    cutoff = _window_cutoff(since_hours, all_sessions)
     conn = _connect_readonly(db_path)
     try:
         cols = {row[1] for row in conn.execute("PRAGMA table_info(sessions)")}
@@ -253,7 +452,7 @@ def total_input_tokens(db_path: Path, *, since_hours: int) -> int:
             return 0
         where = []
         params: list[object] = []
-        if "started_at" in cols:
+        if cutoff is not None and "started_at" in cols:
             where.append("started_at >= ?")
             params.append(cutoff)
         if "archived" in cols:
@@ -268,19 +467,24 @@ def total_input_tokens(db_path: Path, *, since_hours: int) -> int:
 
 
 def parse_telemetry(
-    telemetry_path: Path, *, since_hours: int, total_input_tokens: int
+    telemetry_path: Path,
+    *,
+    since_hours: int,
+    total_input_tokens: int,
+    all_sessions: bool = False,
 ) -> TelemetryCoverage:
     """Aggregate the metadata-only ContextPilot telemetry file.
 
     Tolerates malformed lines (non-JSON, non-dict, missing counters) by
-    skipping and counting them. Never reads message content.
+    skipping and counting them. Never reads message content. With
+    ``all_sessions=True`` the time window is ignored.
     """
     events = 0
     chars = 0
     tokens = 0
     malformed = 0
     if telemetry_path and telemetry_path.exists():
-        cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600
+        cutoff = _window_cutoff(since_hours, all_sessions)
         with telemetry_path.open("r", encoding="utf-8", errors="replace") as f:
             for raw in f:
                 line = raw.strip()
@@ -295,7 +499,7 @@ def parse_telemetry(
                     malformed += 1
                     continue
                 ts = record.get("ts")
-                if isinstance(ts, (int, float)) and ts < cutoff:
+                if cutoff is not None and isinstance(ts, (int, float)) and ts < cutoff:
                     continue
                 cs = record.get("chars_saved")
                 if not isinstance(cs, (int, float)):
@@ -452,6 +656,110 @@ def summarize_tool_sizes(
     return stats[:top_n]
 
 
+def _iter_blocks(content: str, min_block_chars: int) -> Iterable[str]:
+    """Yield one item's distinct stripped lines of at least min_block_chars chars."""
+    seen: set[str] = set()
+    for line in content.splitlines():
+        block = line.strip()
+        if len(block) < min_block_chars:
+            continue
+        if block in seen:
+            continue
+        seen.add(block)
+        yield block
+
+
+def analyze_llm_bound_blocks(
+    contents: Iterable[_LLMContent],
+    *,
+    salt: str,
+    min_block_chars: int,
+    min_repeat: int,
+    top_n: int,
+) -> tuple[list[BlockTypeStat], list[CrossTypeBlockGroup]]:
+    """Fingerprint LLM-bound blocks and report redundancy.
+
+    Returns (per-type stats, cross-type repeated block groups). All output is
+    salted hashes / counters / block-type enums -- no raw text.
+    """
+    # block_hash -> {char_length, types: {block_type: occ}}
+    agg: dict[str, dict] = {}
+    # block_type -> source item count
+    item_counts: dict[str, int] = {}
+
+    for item in contents:
+        bt = item.block_type
+        item_counts[bt] = item_counts.get(bt, 0) + 1
+        for block in _iter_blocks(item.content, min_block_chars):
+            h = _salted_hash(block, salt)
+            entry = agg.get(h)
+            if entry is None:
+                agg[h] = {"char_length": len(block), "types": {bt: 1}}
+            else:
+                entry["types"][bt] = entry["types"].get(bt, 0) + 1
+
+    # --- per block-type aggregate redundancy ------------------------------
+    per_type: dict[str, dict] = {}
+    for entry in agg.values():
+        est = _est_tokens(entry["char_length"])
+        for bt, occ in entry["types"].items():
+            t = per_type.setdefault(
+                bt,
+                {
+                    "block_count": 0,
+                    "unique": 0,
+                    "repeated": 0,
+                    "redundant_tokens": 0,
+                },
+            )
+            t["block_count"] += occ
+            t["unique"] += 1
+            if occ >= min_repeat:
+                t["repeated"] += 1
+                t["redundant_tokens"] += est * (occ - 1)
+
+    block_type_stats: list[BlockTypeStat] = []
+    for bt in sorted(set(per_type) | set(item_counts)):
+        t = per_type.get(
+            bt, {"block_count": 0, "unique": 0, "repeated": 0, "redundant_tokens": 0}
+        )
+        block_type_stats.append(
+            BlockTypeStat(
+                block_type=bt,
+                item_count=item_counts.get(bt, 0),
+                block_count=t["block_count"],
+                unique_block_count=t["unique"],
+                repeated_block_count=t["repeated"],
+                est_redundant_tokens=t["redundant_tokens"],
+            )
+        )
+
+    # --- cross-type repeated blocks ---------------------------------------
+    cross: list[CrossTypeBlockGroup] = []
+    for h, entry in agg.items():
+        types = entry["types"]
+        if len(types) < 2:
+            continue
+        total_occ = sum(types.values())
+        est = _est_tokens(entry["char_length"])
+        cross.append(
+            CrossTypeBlockGroup(
+                block_hash=h,
+                block_types=sorted(types.keys()),
+                type_occurrences=[
+                    TypeCount(block_type=bt, count=occ)
+                    for bt, occ in sorted(types.items())
+                ],
+                occurrences=total_occ,
+                char_length=entry["char_length"],
+                est_tokens=est,
+                est_wasted_tokens=est * (total_occ - 1),
+            )
+        )
+    cross.sort(key=lambda g: g.est_wasted_tokens, reverse=True)
+    return block_type_stats, cross[:top_n]
+
+
 # ---------------------------------------------------------------------------
 # Build + write
 # ---------------------------------------------------------------------------
@@ -465,6 +773,8 @@ def build_report(
     tool_messages: list[_ToolMessage],
     heavy_sessions: list[HeavySession],
     telemetry: TelemetryCoverage,
+    llm_contents: list[_LLMContent] | None = None,
+    all_sessions: bool = False,
     min_block_chars: int = DEFAULT_MIN_BLOCK_CHARS,
     min_block_repeat: int = DEFAULT_MIN_BLOCK_REPEAT,
     large_output_chars: int = DEFAULT_LARGE_OUTPUT_CHARS,
@@ -482,22 +792,38 @@ def build_report(
         tool_messages, large_output_chars=large_output_chars, top_n=top_n
     )
 
+    llm_contents = llm_contents or []
+    block_type_stats, cross_groups = analyze_llm_bound_blocks(
+        llm_contents,
+        salt=salt,
+        min_block_chars=min_block_chars,
+        min_repeat=min_block_repeat,
+        top_n=top_n,
+    )
+
     total_chars = sum(len(m.content) for m in tool_messages)
     dup_wasted = sum(d.est_wasted_tokens for d in dups)
     block_wasted = sum(b.est_wasted_tokens for b in blocks)
+    cross_wasted = sum(g.est_wasted_tokens for g in cross_groups)
 
     notes = [
         "content-aware analysis: message/tool text was hashed in-memory only and never written to reports",
         "all identifiers are salted SHA-256 fingerprints; counters are aggregates",
         "wasted-token figures are heuristic estimates (chars/4); validate before acting",
-        "session 'source' and 'tool_name' are emitted verbatim as low-cardinality enums, not raw text",
+        "session 'source', 'tool_name', and block_type are emitted verbatim as low-cardinality enums, not raw text",
+        "llm-bound scan covers only content sent to the LLM: system/skill prompts, active user/assistant/tool messages",
     ]
+    if all_sessions:
+        notes.append("all-sessions mode: time window ignored; scanned all non-archived sessions/active messages")
     if not tool_messages:
         notes.append("no tool-output messages observed in the selected window")
+    if not llm_contents:
+        notes.append("no llm-bound content observed in the selected window")
 
     return OpportunityReport(
         date=date,
         since_hours=since_hours,
+        all_sessions=all_sessions,
         salt_fingerprint=_salt_fingerprint(salt),
         tool_message_count=len(tool_messages),
         total_tool_output_chars=total_chars,
@@ -511,6 +837,10 @@ def build_report(
         large_tool_outputs_by_tool=sizes,
         heavy_sessions=heavy_sessions,
         telemetry=telemetry,
+        llm_bound_item_count=len(llm_contents),
+        llm_block_types=block_type_stats,
+        cross_type_block_groups=cross_groups,
+        cross_type_wasted_tokens=cross_wasted,
         notes=notes,
     )
 
@@ -543,10 +873,11 @@ def write_report(report: OpportunityReport, out_dir: Path) -> tuple[Path, Path]:
     )
 
     t = report.telemetry
+    window = "all sessions (no time window)" if report.all_sessions else f"last {report.since_hours}h"
     md = [
         f"# ContextPilot Hermes opportunity scan — {report.date}",
         "",
-        f"Window: last {report.since_hours}h",
+        f"Window: {window}",
         f"Salt fingerprint: `{report.salt_fingerprint}`",
         "",
         "## Summary",
@@ -556,11 +887,30 @@ def write_report(report: OpportunityReport, out_dir: Path) -> tuple[Path, Path]:
         f"(~{report.duplicate_tool_output_wasted_tokens} wasted tokens)",
         f"- Repeated blocks: {report.repeated_block_count} "
         f"(~{report.repeated_block_wasted_tokens} wasted tokens)",
+        f"- LLM-bound items scanned: {report.llm_bound_item_count}",
+        f"- Cross-type repeated blocks: {len(report.cross_type_block_groups)} "
+        f"(~{report.cross_type_wasted_tokens} wasted tokens)",
         f"- Telemetry: {t.events} events, ~{t.tokens_saved} tokens saved, "
         f"coverage {t.coverage_ratio_pct}%",
         "",
-        "## Top exact-duplicate tool outputs",
+        "## LLM-bound redundancy by block type",
     ]
+    for bt in report.llm_block_types:
+        md.append(
+            f"- {bt.block_type}: items={bt.item_count} blocks={bt.block_count} "
+            f"unique={bt.unique_block_count} repeated={bt.repeated_block_count} "
+            f"~redundant={bt.est_redundant_tokens} tokens"
+        )
+    md.append("")
+    md.append("## Cross-type repeated blocks (same block, multiple sources)")
+    for g in report.cross_type_block_groups:
+        spread = ", ".join(f"{tc.block_type}x{tc.count}" for tc in g.type_occurrences)
+        md.append(
+            f"- `{g.block_hash}` types=[{', '.join(g.block_types)}] ({spread}) "
+            f"chars={g.char_length} ~wasted={g.est_wasted_tokens} tokens"
+        )
+    md.append("")
+    md.append("## Top exact-duplicate tool outputs")
     for d in report.exact_duplicate_groups:
         md.append(
             f"- `{d.content_hash}` tool={d.tool_name} x{d.occurrences} "
@@ -620,6 +970,11 @@ def main() -> int:
         "--out-dir", type=Path, default=Path.home() / "contextpilot" / "opportunities"
     )
     parser.add_argument("--since-hours", type=int, default=24)
+    parser.add_argument(
+        "--all-sessions",
+        action="store_true",
+        help="ignore --since-hours; scan all non-archived sessions and active messages",
+    )
     parser.add_argument(
         "--salt",
         default="contextpilot-hermes-opportunity-v1",
@@ -640,15 +995,27 @@ def main() -> int:
     # Harden for unattended cron use: never dump a traceback (which would echo
     # the DB path / SQL); emit only the exception class name and a non-zero code.
     try:
-        tool_messages = load_tool_messages(args.state_db, since_hours=args.since_hours)
+        tool_messages = load_tool_messages(
+            args.state_db, since_hours=args.since_hours, all_sessions=args.all_sessions
+        )
+        llm_contents = load_llm_bound_content(
+            args.state_db, since_hours=args.since_hours, all_sessions=args.all_sessions
+        )
         heavy_sessions = load_heavy_sessions(
-            args.state_db, since_hours=args.since_hours, salt=args.salt, top_n=args.top_n
+            args.state_db,
+            since_hours=args.since_hours,
+            salt=args.salt,
+            top_n=args.top_n,
+            all_sessions=args.all_sessions,
+        )
+        total_input = total_input_tokens(
+            args.state_db, since_hours=args.since_hours, all_sessions=args.all_sessions
         )
-        total_input = total_input_tokens(args.state_db, since_hours=args.since_hours)
         telemetry = parse_telemetry(
             args.telemetry_file,
             since_hours=args.since_hours,
             total_input_tokens=total_input,
+            all_sessions=args.all_sessions,
         )
         report = build_report(
             date=args.date,
@@ -657,6 +1024,8 @@ def main() -> int:
             tool_messages=tool_messages,
             heavy_sessions=heavy_sessions,
             telemetry=telemetry,
+            llm_contents=llm_contents,
+            all_sessions=args.all_sessions,
             min_block_chars=args.min_block_chars,
             min_block_repeat=args.min_block_repeat,
             large_output_chars=args.large_output_chars,
diff --git a/tests/test_hermes_context_opportunity_analyzer.py b/tests/test_hermes_context_opportunity_analyzer.py
index 0c18e63..649288a 100644
--- a/tests/test_hermes_context_opportunity_analyzer.py
+++ b/tests/test_hermes_context_opportunity_analyzer.py
@@ -78,16 +78,22 @@ def _make_db(path: Path, messages, *, sessions=None):
     conn.close()
 
 
-def _analyze(db, tmp_path, telemetry=None, salt="test-salt"):
-    tool_messages = analyzer.load_tool_messages(db, since_hours=WIDE_WINDOW)
+def _analyze(db, tmp_path, telemetry=None, salt="test-salt", all_sessions=False):
+    tool_messages = analyzer.load_tool_messages(
+        db, since_hours=WIDE_WINDOW, all_sessions=all_sessions
+    )
+    llm_contents = analyzer.load_llm_bound_content(
+        db, since_hours=WIDE_WINDOW, all_sessions=all_sessions
+    )
     heavy = analyzer.load_heavy_sessions(
-        db, since_hours=WIDE_WINDOW, salt=salt, top_n=20
+        db, since_hours=WIDE_WINDOW, salt=salt, top_n=20, all_sessions=all_sessions
     )
     total_input = sum(h.input_tokens for h in heavy)
     tel = analyzer.parse_telemetry(
         telemetry if telemetry is not None else tmp_path / "none.jsonl",
         since_hours=WIDE_WINDOW,
         total_input_tokens=total_input,
+        all_sessions=all_sessions,
     )
     report = analyzer.build_report(
         date="2100-01-01",
@@ -96,6 +102,8 @@ def _analyze(db, tmp_path, telemetry=None, salt="test-salt"):
         tool_messages=tool_messages,
         heavy_sessions=heavy,
         telemetry=tel,
+        llm_contents=llm_contents,
+        all_sessions=all_sessions,
         min_block_repeat=2,
     )
     return report
@@ -220,3 +228,225 @@ def test_missing_telemetry_file_is_safe(tmp_path):
     report = _analyze(db, tmp_path, telemetry=tmp_path / "nope.jsonl")
     assert report.telemetry.events == 0
     assert report.telemetry.malformed_records_skipped == 0
+
+
+# ---------------------------------------------------------------------------
+# LLM-bound block analysis + all-sessions tests
+# ---------------------------------------------------------------------------
+
+OLD_TS = 1_000_000_000.0  # 2001 — far outside any normal recent window
+
+
+def _make_db_ex(path, *, sessions, messages, message_active_col=False):
+    """Flexible builder: custom timestamps, optional messages.active column."""
+    conn = sqlite3.connect(path)
+    conn.execute(
+        """
+        CREATE TABLE sessions (
+            id TEXT PRIMARY KEY,
+            source TEXT,
+            started_at REAL NOT NULL,
+            input_tokens INTEGER DEFAULT 0,
+            output_tokens INTEGER DEFAULT 0,
+            message_count INTEGER DEFAULT 0,
+            tool_call_count INTEGER DEFAULT 0,
+            api_call_count INTEGER DEFAULT 0,
+            archived INTEGER NOT NULL DEFAULT 0,
+            system_prompt TEXT
+        )
+        """
+    )
+    active_col = ", active INTEGER NOT NULL DEFAULT 1" if message_active_col else ""
+    conn.execute(
+        f"""
+        CREATE TABLE messages (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            session_id TEXT NOT NULL,
+            role TEXT NOT NULL,
+            content TEXT,
+            tool_name TEXT,
+            reasoning TEXT,
+            timestamp REAL NOT NULL{active_col}
+        )
+        """
+    )
+    for s in sessions:
+        conn.execute(
+            "INSERT INTO sessions (id, source, started_at, input_tokens, archived,"
+            " system_prompt) VALUES (?, ?, ?, ?, ?, ?)",
+            (
+                s["id"],
+                s.get("source"),
+                s["started_at"],
+                s.get("input_tokens", 0),
+                s.get("archived", 0),
+                s.get("system_prompt"),
+            ),
+        )
+    for m in messages:
+        if message_active_col:
+            conn.execute(
+                "INSERT INTO messages (session_id, role, content, tool_name, reasoning,"
+                " timestamp, active) VALUES (?, ?, ?, ?, ?, ?, ?)",
+                (
+                    m.get("session_id", "s1"),
+                    m["role"],
+                    m.get("content"),
+                    m.get("tool_name"),
+                    "PRIVATE REASONING",
+                    m.get("timestamp", FAR_FUTURE),
+                    m.get("active", 1),
+                ),
+            )
+        else:
+            conn.execute(
+                "INSERT INTO messages (session_id, role, content, tool_name, reasoning,"
+                " timestamp) VALUES (?, ?, ?, ?, ?, ?)",
+                (
+                    m.get("session_id", "s1"),
+                    m["role"],
+                    m.get("content"),
+                    m.get("tool_name"),
+                    "PRIVATE REASONING",
+                    m.get("timestamp", FAR_FUTURE),
+                ),
+            )
+    conn.commit()
+    conn.close()
+
+
+def test_all_sessions_includes_old_out_of_window_data(tmp_path):
+    db = tmp_path / "state.db"
+    _make_db_ex(
+        db,
+        sessions=[
+            {
+                "id": "old-sess",
+                "source": "discord",
+                "started_at": OLD_TS,
+                "input_tokens": 500,
+                "system_prompt": "old system prompt material that is plenty long here",
+            }
+        ],
+        messages=[
+            {
+                "session_id": "old-sess",
+                "role": "tool",
+                "content": "old tool output block sufficiently long to be scanned",
+                "tool_name": "Bash",
+                "timestamp": OLD_TS,
+            },
+            {
+                "session_id": "old-sess",
+                "role": "user",
+                "content": "old user prompt text that is also long enough to scan",
+                "timestamp": OLD_TS,
+            },
+        ],
+    )
+    # A normal recent window excludes the old data entirely.
+    assert analyzer.load_tool_messages(db, since_hours=24) == []
+    assert analyzer.load_llm_bound_content(db, since_hours=24) == []
+    assert analyzer.load_heavy_sessions(db, since_hours=24, salt="s", top_n=5) == []
+
+    # all_sessions ignores the window and picks the old data back up.
+    assert len(analyzer.load_tool_messages(db, since_hours=24, all_sessions=True)) == 1
+    llm = analyzer.load_llm_bound_content(db, since_hours=24, all_sessions=True)
+    assert len(llm) == 3  # system_prompt + tool_result + user_prompt
+    assert (
+        len(
+            analyzer.load_heavy_sessions(
+                db, since_hours=24, salt="s", top_n=5, all_sessions=True
+            )
+        )
+        == 1
+    )
+
+
+def test_inactive_messages_skipped(tmp_path):
+    db = tmp_path / "state.db"
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE}],
+        messages=[
+            {
+                "role": "tool",
+                "content": "active tool output that is sufficiently long to fingerprint",
+                "tool_name": "Bash",
+                "active": 1,
+            },
+            {
+                "role": "tool",
+                "content": "inactive tool output that should be skipped entirely here",
+                "tool_name": "Bash",
+                "active": 0,
+            },
+            {
+                "role": "user",
+                "content": "inactive user prompt that must also be skipped here",
+                "active": 0,
+            },
+        ],
+        message_active_col=True,
+    )
+    # Inactive rows are filtered out of both loaders.
+    assert len(analyzer.load_tool_messages(db, since_hours=WIDE_WINDOW)) == 1
+    llm = analyzer.load_llm_bound_content(db, since_hours=WIDE_WINDOW)
+    assert sorted(c.block_type for c in llm) == ["tool_result"]
+
+
+def test_skill_prompt_classification(tmp_path):
+    db = tmp_path / "state.db"
+    skill_sys = (
+        "---\nname: deep-research\ndescription: research harness\n---\n"
+        "Use this skill when researching a topic."
+    )
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE, "system_prompt": skill_sys}],
+        messages=[],
+    )
+    llm = analyzer.load_llm_bound_content(db, since_hours=WIDE_WINDOW)
+    assert len(llm) == 1
+    assert llm[0].block_type == "skill_prompt"
+
+
+def test_cross_type_redundancy_reported_via_hashes_only(tmp_path):
+    db = tmp_path / "state.db"
+    shared = "This is a shared instruction block long enough to fingerprint cleanly."
+    sys_prompt = "You are a helpful system.\n" + shared + "\nEnd of system prompt."
+    tool_out = "tool produced this output line\n" + shared + "\nand more tool lines"
+    user_msg = "user asks the assistant something specific here\n" + shared
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE, "system_prompt": sys_prompt}],
+        messages=[
+            {"role": "tool", "content": tool_out, "tool_name": "Bash"},
+            {"role": "user", "content": user_msg},
+        ],
+    )
+    report = _analyze(db, tmp_path)
+
+    # The shared block spans system_prompt, tool_result, and user_prompt.
+    assert len(report.cross_type_block_groups) >= 1
+    grp = report.cross_type_block_groups[0]
+    assert "tool_result" in grp.block_types
+    assert any(bt in grp.block_types for bt in ("system_prompt", "skill_prompt"))
+    assert "user_prompt" in grp.block_types
+    assert grp.occurrences == 3
+    # Reported only via salted hash + counters — never the raw block text.
+    assert shared not in grp.block_hash
+    assert report.cross_type_wasted_tokens > 0
+
+    # Per-type block stats are populated for the LLM-bound types.
+    types_seen = {b.block_type for b in report.llm_block_types}
+    assert {"tool_result", "user_prompt"} <= types_seen
+
+    # The written report leaks no raw prompt/tool/system text.
+    json_path, md_path = analyzer.write_report(report, tmp_path / "out")
+    blob = json_path.read_text(encoding="utf-8") + md_path.read_text(encoding="utf-8")
+    assert shared not in blob
+    assert "shared instruction block" not in blob
+    assert "You are a helpful system" not in blob
+    assert "user asks the assistant" not in blob
+    assert "PRIVATE REASONING" not in blob

From 9a1e1833d6d82df0ecf4713fdaeebb8d9a1956d4 Mon Sep 17 00:00:00 2001
From: root <root@vmi3142307.contaboserver.net>
Date: Fri, 12 Jun 2026 00:48:51 +0200
Subject: [PATCH 5/9] feat: dedup exact cross-role payload blocks

---
 contextpilot/dedup/block_dedup.py      | 303 +++++++++++++++----------
 tests/test_payload_cross_role_dedup.py | 215 ++++++++++++++++++
 2 files changed, 393 insertions(+), 125 deletions(-)
 create mode 100644 tests/test_payload_cross_role_dedup.py

diff --git a/contextpilot/dedup/block_dedup.py b/contextpilot/dedup/block_dedup.py
index 073ee37..d9e0082 100644
--- a/contextpilot/dedup/block_dedup.py
+++ b/contextpilot/dedup/block_dedup.py
@@ -232,155 +232,208 @@ def _rebuild_json_content(original: str, key: str, new_text: str) -> str:
         return original
 
 
-def dedup_chat_completions(
-    body: dict,
-    min_block_chars: int = MIN_BLOCK_CHARS,
-    min_content_chars: int = MIN_CONTENT_CHARS,
-    chunk_modulus: int = CHUNK_MODULUS,
-    system_content: Optional[str] = None,
-) -> DedupResult:
-    messages = body.get("messages")
-    if not isinstance(messages, list) or not messages:
-        return DedupResult()
+def _account(result: DedupResult, original_len: int, new_len: int) -> None:
+    """Roll a single field's before/after lengths into the aggregate result."""
+    result.chars_before += original_len
+    result.chars_after += new_len
+    result.chars_saved += original_len - new_len
 
-    tool_names = _build_tool_name_map_openai(messages)
-    seen_blocks: Dict[str, Tuple[int, str, int]] = {}
-    pre_seen = _prescan_system_blocks(system_content, min_block_chars, chunk_modulus)
-    result = DedupResult()
 
-    for idx, msg in enumerate(messages):
-        if not isinstance(msg, dict) or msg.get("role") != "tool":
-            continue
+def _register_blocks_only(
+    text: Optional[str],
+    seen_blocks: Dict[str, Tuple[int, str, int]],
+    msg_idx: int,
+    label: str,
+    result: DedupResult,
+    min_block_chars: int,
+    chunk_modulus: int,
+) -> None:
+    """Register a message's blocks as dedup *sources* without modifying it.
 
-        content = msg.get("content", "")
-        if not isinstance(content, str) or len(content) < min_content_chars:
+    Used for the canonical first copy — e.g. the system / skill prompt — which
+    may seed references for later duplicates but must itself stay verbatim.
+    """
+    if not isinstance(text, str) or not text.strip():
+        return
+    for block_idx, block in enumerate(_content_defined_chunking(text, chunk_modulus)):
+        if len(block.strip()) < min_block_chars:
             continue
+        result.blocks_total += 1
+        h = _hash_block(block)
+        if h not in seen_blocks:
+            seen_blocks[h] = (msg_idx, label, block_idx)
 
-        tc_id = msg.get("tool_call_id", "")
-        fn_name = tool_names.get(tc_id, msg.get("name", "")) or "tool"
 
-        # Extract text from JSON-wrapped tool results for proper chunking
-        extracted_text, json_key = _extract_text_for_dedup(content)
-        dedup_target = extracted_text if extracted_text else content
+def _dedup_string_field(
+    text: str,
+    seen_blocks: Dict[str, Tuple[int, str, int]],
+    msg_idx: int,
+    label: str,
+    result: DedupResult,
+    min_block_chars: int,
+    chunk_modulus: int,
+) -> Optional[str]:
+    """Dedup one plain-text field against earlier blocks in the same payload."""
+    new_text = _dedup_text(
+        text, seen_blocks, msg_idx, label, result, min_block_chars, chunk_modulus
+    )
+    if new_text is not None:
+        _account(result, len(text), len(new_text))
+        logger.info(
+            "Block dedup: msg[%d] %s — saved %d chars",
+            msg_idx,
+            label,
+            len(text) - len(new_text),
+        )
+    return new_text
 
-        new_content = _dedup_text(
-            dedup_target,
-            seen_blocks,
-            idx,
-            fn_name,
-            result,
-            min_block_chars,
-            chunk_modulus,
-            pre_seen=pre_seen,
+
+def _dedup_assistant_message(
+    msg: dict,
+    idx: int,
+    seen_blocks: Dict[str, Tuple[int, str, int]],
+    result: DedupResult,
+    min_block_chars: int,
+    min_content_chars: int,
+    chunk_modulus: int,
+) -> None:
+    """Dedup assistant content (string or list-of-text-blocks) against earlier blocks."""
+    raw = msg.get("content", "")
+    if isinstance(raw, str):
+        if len(raw) < min_content_chars:
+            return
+        new_content = _dedup_string_field(
+            raw, seen_blocks, idx, "assistant message", result,
+            min_block_chars, chunk_modulus,
         )
         if new_content is not None:
-            if json_key and extracted_text:
-                # Rebuild the JSON with shortened content field
-                original_len = len(content)
-                msg["content"] = _rebuild_json_content(content, json_key, new_content)
-                new_len = len(msg["content"])
-            else:
-                original_len = len(content)
-                msg["content"] = new_content
-                new_len = len(new_content)
-            result.chars_before += original_len
-            result.chars_after += new_len
-            result.chars_saved += original_len - new_len
-            logger.info(
-                f"Block dedup: msg[{idx}] {fn_name} — "
-                f"saved {original_len - new_len:,} chars"
+            msg["content"] = new_content
+    elif isinstance(raw, list):
+        # OpenClaw sends [{type: "text", text: "..."}, ...]
+        for block in raw:
+            if not isinstance(block, dict) or block.get("type") != "text":
+                continue
+            t = block.get("text", "")
+            if not isinstance(t, str) or len(t) < min_content_chars:
+                continue
+            new_text = _dedup_string_field(
+                t, seen_blocks, idx, "assistant message", result,
+                min_block_chars, chunk_modulus,
             )
+            if new_text is not None:
+                block["text"] = new_text
 
-    _dedup_assistant_code_blocks(
-        messages,
-        seen_blocks,
-        result,
-        min_block_chars,
-        min_content_chars,
-        chunk_modulus,
-        pre_seen=pre_seen,
-    )
-
-    return result
-
-
-_CODE_BLOCK_RE = re.compile(r"(```[\w]*\n)(.*?)(```)", re.DOTALL)
 
-
-def _dedup_assistant_code_blocks(
-    messages: list,
+def _dedup_tool_message(
+    msg: dict,
+    idx: int,
+    tool_names: Dict[str, str],
     seen_blocks: Dict[str, Tuple[int, str, int]],
     result: DedupResult,
     min_block_chars: int,
     min_content_chars: int,
     chunk_modulus: int,
-    pre_seen: Optional[Dict[str, Tuple[int, str, int]]] = None,
 ) -> None:
-    for idx, msg in enumerate(messages):
-        if not isinstance(msg, dict) or msg.get("role") != "assistant":
-            continue
-        raw_content = msg.get("content", "")
-
-        # Handle both string and list (content blocks) formats
-        is_list_content = False
-        text_block_idx = -1
-        if isinstance(raw_content, str):
-            content = raw_content
-        elif isinstance(raw_content, list):
-            # OpenClaw sends [{type: "text", text: "..."}, ...]
-            # Find the text block that contains code
-            content = ""
-            for bi, block in enumerate(raw_content):
-                if isinstance(block, dict) and block.get("type") == "text":
-                    t = block.get("text", "")
-                    if "```" in t and len(t) > len(content):
-                        content = t
-                        text_block_idx = bi
-                        is_list_content = True
-            if not content:
-                continue
-        else:
-            continue
+    """Dedup a tool result (JSON-aware) against earlier blocks in the payload."""
+    content = msg.get("content", "")
+    if not isinstance(content, str) or len(content) < min_content_chars:
+        return
 
-        if len(content) < min_content_chars:
-            continue
+    tc_id = msg.get("tool_call_id", "")
+    fn_name = tool_names.get(tc_id, msg.get("name", "")) or "tool"
 
-        code_blocks = list(_CODE_BLOCK_RE.finditer(content))
-        if not code_blocks:
-            continue
+    # Extract text from JSON-wrapped tool results for proper chunking.
+    extracted_text, json_key = _extract_text_for_dedup(content)
+    dedup_target = extracted_text if extracted_text else content
 
-        modified = False
-        new_content = content
+    new_content = _dedup_text(
+        dedup_target, seen_blocks, idx, fn_name, result, min_block_chars, chunk_modulus
+    )
+    if new_content is None:
+        return
+
+    original_len = len(content)
+    if json_key and extracted_text:
+        msg["content"] = _rebuild_json_content(content, json_key, new_content)
+    else:
+        msg["content"] = new_content
+    new_len = len(msg["content"])
+    _account(result, original_len, new_len)
+    logger.info(
+        "Block dedup: msg[%d] %s — saved %d chars", idx, fn_name, original_len - new_len
+    )
 
-        for match in reversed(code_blocks):
-            code = match.group(2)
-            if len(code.strip()) < min_block_chars:
-                continue
 
-            new_code = _dedup_text(
-                code,
-                seen_blocks,
-                idx,
-                "assistant",
-                result,
-                min_block_chars,
-                chunk_modulus,
-                pre_seen=pre_seen,
+def dedup_chat_completions(
+    body: dict,
+    min_block_chars: int = MIN_BLOCK_CHARS,
+    min_content_chars: int = MIN_CONTENT_CHARS,
+    chunk_modulus: int = CHUNK_MODULUS,
+    system_content: Optional[str] = None,
+) -> DedupResult:
+    """Exact-block dedup across ALL roles within a single chat payload.
+
+    Walks messages in document order with a shared block table. The first
+    (earliest) occurrence of any block — across system/skill prompt, user,
+    assistant, and tool messages — keeps its full text; later EXACT occurrences
+    anywhere in the *same* payload are replaced by a short reference pointing to
+    the earlier copy ("see above"). Only exact hash matches are ever replaced;
+    references only ever point backward, to a block in this same payload.
+
+    The system / skill prompt is treated as the canonical source: its blocks are
+    registered but it is never itself shortened.
+    """
+    messages = body.get("messages")
+    if not isinstance(messages, list) or not messages:
+        return DedupResult()
+
+    tool_names = _build_tool_name_map_openai(messages)
+    seen_blocks: Dict[str, Tuple[int, str, int]] = {}
+    result = DedupResult()
+
+    # Seed an externally-supplied system / skill prompt (e.g. one not present as
+    # a message in `messages`) as the canonical first copy. Registered at -1 so
+    # later matches are attributed as system-block hits.
+    pre_seen = _prescan_system_blocks(system_content, min_block_chars, chunk_modulus)
+    for h, origin in pre_seen.items():
+        seen_blocks.setdefault(h, origin)
+
+    for idx, msg in enumerate(messages):
+        if not isinstance(msg, dict):
+            continue
+        role = msg.get("role")
+
+        if role == "system":
+            # Canonical source — register but never shorten. Use -1 so downstream
+            # matches count as system-block hits, consistent with `pre_seen`.
+            _register_blocks_only(
+                msg.get("content", ""), seen_blocks, -1, "system prompt",
+                result, min_block_chars, chunk_modulus,
             )
-            if new_code is not None:
-                start, end = match.start(2), match.end(2)
-                original_len = end - start
-                new_content = new_content[:start] + new_code + new_content[end:]
-                result.chars_before += original_len
-                result.chars_after += len(new_code)
-                result.chars_saved += original_len - len(new_code)
-                modified = True
-
-        if modified:
-            if is_list_content and text_block_idx >= 0:
-                msg["content"][text_block_idx]["text"] = new_content
-            else:
-                msg["content"] = new_content
+
+        elif role == "user":
+            content = msg.get("content", "")
+            if isinstance(content, str) and len(content) >= min_content_chars:
+                new_content = _dedup_string_field(
+                    content, seen_blocks, idx, "user message", result,
+                    min_block_chars, chunk_modulus,
+                )
+                if new_content is not None:
+                    msg["content"] = new_content
+
+        elif role == "assistant":
+            _dedup_assistant_message(
+                msg, idx, seen_blocks, result,
+                min_block_chars, min_content_chars, chunk_modulus,
+            )
+
+        elif role == "tool":
+            _dedup_tool_message(
+                msg, idx, tool_names, seen_blocks, result,
+                min_block_chars, min_content_chars, chunk_modulus,
+            )
+
+    return result
 
 
 def dedup_responses_api(
diff --git a/tests/test_payload_cross_role_dedup.py b/tests/test_payload_cross_role_dedup.py
new file mode 100644
index 0000000..a03e4bf
--- /dev/null
+++ b/tests/test_payload_cross_role_dedup.py
@@ -0,0 +1,215 @@
+"""Cross-role exact-block dedup within a single LLM-bound payload.
+
+This is the safest concrete ContextPilot optimization: inside ONE OpenAI chat
+payload, an exact repeated block that already appears in an earlier message
+(system / skill prompt, user, assistant, or tool result) is replaced — in the
+*later* message only — by a short reference back to the earlier copy. The LLM
+has already seen one full copy in the same request, so no information is lost.
+
+Hard safety contract these tests lock in:
+
+1. Exact repeated blocks ACROSS DIFFERENT ROLES are deduped. The first
+   (earliest, in document order) occurrence keeps its full text; later
+   occurrences are shortened to a reference pointing "above".
+2. References point to an EARLIER block in the SAME payload (never forward);
+   the first occurrence itself is never replaced by a reference.
+3. A one-character-different / near-duplicate block is NEVER collapsed — its
+   unique text survives verbatim.
+4. Genuinely different content is left byte-for-byte intact.
+5. No raw block/message content is ever written to telemetry.
+"""
+import importlib.util
+import json
+from pathlib import Path
+
+import pytest
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+MODULE_PATH = REPO_ROOT / "contextpilot" / "dedup" / "block_dedup.py"
+_spec = importlib.util.spec_from_file_location("contextpilot_block_dedup_xrole", MODULE_PATH)
+block_dedup = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(block_dedup)
+
+dedup_chat_completions = block_dedup.dedup_chat_completions
+
+REFERENCE_MARKER = "identical to earlier"
+
+
+def _instruction_block(prefix: str = "always follow safety rule", n: int = 30) -> str:
+    """A deterministic multi-line instruction/skill block that chunks cleanly."""
+    return "\n".join(
+        f"INSTRUCTION LINE {i}: {prefix} number {i} carefully and verbatim every time"
+        for i in range(n)
+    )
+
+
+def _cross_role_payload(sys_block, user_block, tool_block, asst_block) -> dict:
+    """A single chat payload where the same instruction block can recur by role."""
+    return {
+        "messages": [
+            {"role": "system", "content": "You are a coding agent.\n" + sys_block + "\nEnd system."},
+            {"role": "user", "content": "Please remember the rules:\n" + user_block + "\nThanks."},
+            {"role": "assistant", "tool_calls": [{"id": "c1", "function": {"name": "Read"}}]},
+            {"role": "tool", "tool_call_id": "c1", "content": "File header\n" + tool_block + "\nFooter."},
+            {"role": "assistant", "content": "Acknowledging the rules:\n" + asst_block + "\nDone."},
+        ]
+    }
+
+
+def test_repeated_block_across_roles_is_deduped_first_copy_kept():
+    block = _instruction_block()
+    body = _cross_role_payload(block, block, block, block)
+    original_system, original_user = (m["content"] for m in body["messages"][:2])
+
+    result = dedup_chat_completions(body)
+
+    system_after = body["messages"][0]["content"]
+    user_after = body["messages"][1]["content"]
+    tool_after = body["messages"][3]["content"]
+    asst_after = body["messages"][4]["content"]
+
+    # Real savings, attributed to deduped blocks.
+    assert result.chars_saved > 0
+    assert result.blocks_deduped > 0
+
+    # First (earliest) occurrence — the system prompt — is left fully intact.
+    assert system_after == original_system
+
+    # Later copies must shrink vs. their PRE-dedup text and reference "above".
+    assert len(user_after) < len(original_user), "user-role duplicate must shrink"
+    assert REFERENCE_MARKER in user_after, "user-role duplicate must be deduped"
+    assert REFERENCE_MARKER in tool_after, "tool-role duplicate must be deduped"
+    assert REFERENCE_MARKER in asst_after, "assistant-role duplicate must be deduped"
+
+
+def test_reference_points_backward_only_never_first_occurrence():
+    block = _instruction_block()
+    body = _cross_role_payload(block, block, block, block)
+    dedup_chat_completions(body)
+
+    # The system message is first; it must never become a reference to itself or
+    # to anything later in the payload.
+    assert REFERENCE_MARKER not in body["messages"][0]["content"]
+
+
+def test_near_duplicate_block_survives_verbatim():
+    block = _instruction_block()
+    # One unique line differs in the user copy — a one-line delta.
+    lines = block.split("\n")
+    lines[15] = "INSTRUCTION LINE 15: UNIQUE_DELTA_MARKER_qwerty brand new never-seen directive"
+    edited = "\n".join(lines)
+
+    body = _cross_role_payload(block, edited, block, block)
+    dedup_chat_completions(body)
+
+    user_after = body["messages"][1]["content"]
+    # The changed line MUST survive verbatim — never hidden behind a reference.
+    assert "UNIQUE_DELTA_MARKER_qwerty" in user_after
+
+
+def test_single_char_difference_is_not_collapsed():
+    block = _instruction_block()
+    idx = len(block) // 2
+    mutated = block[:idx] + ("Z" if block[idx] != "Z" else "Q") + block[idx + 1:]
+
+    body = _cross_role_payload(block, mutated, block, block)
+    dedup_chat_completions(body)
+
+    user_after = body["messages"][1]["content"]
+    mutated_line = mutated.split("\n")[block[:idx].count("\n")]
+    assert mutated_line in user_after
+
+
+def test_genuinely_different_content_left_intact():
+    block = _instruction_block()
+    other = "\n".join(
+        f"UNRELATED ROW {i}: a completely different paragraph about widgets and gears {i}"
+        for i in range(30)
+    )
+    body = _cross_role_payload(block, other, other, other)
+    user_before = body["messages"][1]["content"]
+    tool_before = body["messages"][3]["content"]
+
+    result = dedup_chat_completions(body)
+
+    assert result.chars_saved == 0
+    assert result.blocks_deduped == 0
+    assert body["messages"][1]["content"] == user_before
+    assert body["messages"][3]["content"] == tool_before
+
+
+def test_no_raw_block_content_in_plugin_telemetry(monkeypatch, tmp_path):
+    """End-to-end through the Hermes engine: telemetry stays metadata-only."""
+    import sys
+    import types
+
+    # Minimal fake Hermes surface so __init__.py imports cleanly.
+    agent_pkg = types.ModuleType("agent")
+    context_engine_mod = types.ModuleType("agent.context_engine")
+    context_compressor_mod = types.ModuleType("agent.context_compressor")
+
+    class FakeContextEngine:
+        threshold_percent = 0.75
+
+        def get_status(self):
+            return {}
+
+    class FakeContextCompressor(FakeContextEngine):
+        def __init__(self, **kwargs):
+            self.threshold_tokens = 0
+            self.context_length = 0
+            self.protect_first_n = 3
+            self.protect_last_n = 6
+            self.compression_count = 0
+
+        def on_session_start(self, session_id, **kwargs):
+            return None
+
+    context_engine_mod.ContextEngine = FakeContextEngine
+    context_compressor_mod.ContextCompressor = FakeContextCompressor
+    agent_pkg.context_engine = context_engine_mod
+    agent_pkg.context_compressor = context_compressor_mod
+    monkeypatch.setitem(sys.modules, "agent", agent_pkg)
+    monkeypatch.setitem(sys.modules, "agent.context_engine", context_engine_mod)
+    monkeypatch.setitem(sys.modules, "agent.context_compressor", context_compressor_mod)
+
+    run_agent_mod = types.ModuleType("run_agent")
+
+    class FakeAIAgent:
+        @staticmethod
+        def _sanitize_api_messages(messages):
+            return messages
+
+    run_agent_mod.AIAgent = FakeAIAgent
+    monkeypatch.setitem(sys.modules, "run_agent", run_agent_mod)
+
+    module_path = REPO_ROOT / "__init__.py"
+    spec = importlib.util.spec_from_file_location("contextpilot_plugin_xrole", module_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+
+    monkeypatch.setattr(module, "_check_reorder", lambda: False)
+    monkeypatch.setattr(module, "_CONTEXTPILOT_AVAILABLE", False)
+
+    telemetry = tmp_path / "telemetry.jsonl"
+    monkeypatch.setenv("CONTEXTPILOT_TELEMETRY_FILE", str(telemetry))
+
+    block = _instruction_block()
+    secret_line = "INSTRUCTION LINE 0: always follow safety rule number 0 carefully and verbatim every time"
+    assert secret_line in block
+
+    engine = module.ContextPilotEngine()
+    engine.on_session_start("session-XR", model="test-model")
+    body = _cross_role_payload(block, block, block, block)
+    _out, stats = engine.optimize_api_messages(body["messages"])
+
+    assert stats["chars_saved"] > 0
+
+    assert telemetry.exists()
+    raw = telemetry.read_text(encoding="utf-8")
+    # No raw block/message content may ever leak into telemetry.
+    assert secret_line not in raw
+    assert "INSTRUCTION LINE" not in raw
+    forbidden = {"content", "messages", "prompt", "system_prompt", "text", "tool_calls"}
+    for record in (json.loads(line) for line in raw.splitlines() if line.strip()):
+        assert forbidden.isdisjoint(record.keys())

From ba48354c294c7ca8a255e23aff36518e540b07c8 Mon Sep 17 00:00:00 2001
From: root <root@vmi3142307.contaboserver.net>
Date: Fri, 12 Jun 2026 02:14:36 +0200
Subject: [PATCH 6/9] feat: add worker routing shadow telemetry

---
 docs/guides/hermes-monitor.md                 |  25 +-
 .../analyze_hermes_context_opportunities.py   | 437 ++++++++++++++++++
 ...est_hermes_context_opportunity_analyzer.py | 254 ++++++++++
 3 files changed, 715 insertions(+), 1 deletion(-)

diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md
index 8798885..0f5e91a 100644
--- a/docs/guides/hermes-monitor.md
+++ b/docs/guides/hermes-monitor.md
@@ -67,7 +67,8 @@ It surfaces concrete token-reduction opportunities:
 - repeated line/block fingerprints (shared boilerplate across outputs),
 - large tool outputs grouped by `tool_name`,
 - heavy sessions by input-token / tool-call / message counts (hashed ids),
-- ContextPilot telemetry coverage and savings ratios.
+- ContextPilot telemetry coverage and savings ratios,
+- **Worker Context Routing shadow labels** for future router training/eval.
 
 ### LLM-bound block redundancy
 
@@ -93,6 +94,28 @@ aggregated. The report then shows:
   prompt *and* a tool result *and* a user prompt). Reported only as a hash plus
   per-type counters — never the raw text.
 
+### Worker Context Routing shadow mode
+
+The analyzer now includes a **Worker Context Routing — shadow mode** section by
+default. This is P0 data collection only: it never drops, summarizes, or mutates
+context. It fingerprints each LLM-bound block and emits only low-cardinality
+labels/counters such as:
+
+- `policy_must_keep` for user/system/skill prompts and explicit safety /
+  acceptance constraints,
+- `direct_task_hint` for short actionable task/error hints,
+- `likely_relevant` for conservative default-keep blocks,
+- `summarizable_candidate` / `likely_drop_candidate` for large or repeated
+  tool-like blocks that a future router might route away. Large diagnostic logs
+  containing `error:` / `failed` / `traceback` cues are still only advisory
+  summarization candidates, not must-drop decisions.
+
+The report includes estimated advisory candidate tokens and salted candidate
+block hashes. These are **not realized savings** and must be treated as training
+/ evaluation data for a future high-recall router. Use
+`--disable-worker-routing-shadow` only when you want to omit this section from a
+scan.
+
 Use `--all-sessions` to ignore the `--since-hours` window and scan **all**
 non-archived sessions and active messages (useful for a one-shot, whole-history
 audit rather than a rolling daily window):
diff --git a/scripts/analyze_hermes_context_opportunities.py b/scripts/analyze_hermes_context_opportunities.py
index 5bcd3b1..780f0f1 100644
--- a/scripts/analyze_hermes_context_opportunities.py
+++ b/scripts/analyze_hermes_context_opportunities.py
@@ -134,6 +134,131 @@ class CrossTypeBlockGroup:
     est_wasted_tokens: int               # (occurrences - 1) * est_tokens
 
 
+# ---------------------------------------------------------------------------
+# Worker Context Routing — SHADOW MODE (P0 data collection only)
+# ---------------------------------------------------------------------------
+# Low-cardinality router labels. These are the *training/eval* labels a future
+# small worker-context router would predict. P0 is data-collection only: nothing
+# here ever drops, summarizes, or mutates context — it only classifies blocks
+# and emits aggregate counters + salted hashes so the labels can be evaluated
+# offline before any online pruning is built.
+ROUTER_LABELS = (
+    "policy_must_keep",        # never droppable (user/system/skill/safety constraints)
+    "direct_task_hint",        # short actionable task signal — keep
+    "likely_relevant",         # default keep; not obviously prunable
+    "summarizable_candidate",  # large single block that *might* be summarized later
+    "likely_drop_candidate",   # large/repeated tool-like block, candidate to route away
+)
+
+# Labels whose blocks a future router might safely route away. Used only to
+# tally *advisory* candidate tokens; P0 never acts on them.
+_ROUTABLE_LABELS = ("summarizable_candidate", "likely_drop_candidate")
+
+# Block-type priority when one fingerprint spans multiple origins: the most
+# "must-keep" origin wins, so cross-origin blocks are classified conservatively.
+_TYPE_KEEP_PRIORITY = {
+    "user_prompt": 5,
+    "system_prompt": 4,
+    "skill_prompt": 4,
+    "assistant_context": 2,
+    "tool_result": 1,
+    "unknown": 0,
+}
+
+# Cues marking content that must NEVER be dropped even from a tool/assistant
+# block: explicit safety / acceptance / hard-constraint language. Matching here
+# is intentionally generous — over-keeping is the safe direction for P0.
+_SAFETY_CONSTRAINT_CUES = (
+    "must not",
+    "must never",
+    "never drop",
+    "do not delete",
+    "do not remove",
+    "do not modify",
+    "acceptance criteria",
+    "acceptance test",
+    "safety",
+    "must keep",
+    "you must",
+    "required:",
+    "constraint",
+    "forbidden",
+    "policy",
+)
+
+# Cues marking a short, actionable task hint worth keeping verbatim.
+_TASK_HINT_CUES = (
+    "todo",
+    "next step",
+    "error:",
+    "traceback",
+    "failed",
+    "fixme",
+    "task:",
+    "goal:",
+    "implement",
+    "reproduce",
+)
+
+
+@dataclass
+class RouterLabelCount:
+    """Aggregate over all blocks assigned one router label."""
+
+    route_label: str
+    block_count: int            # distinct fingerprints with this label
+    occurrence_count: int       # total occurrences across the window
+    total_est_tokens: int       # est tokens these blocks occupy (occ * est)
+    est_candidate_tokens: int   # ADVISORY routable tokens (0 unless routable)
+
+
+@dataclass
+class RouterReasonCount:
+    """Aggregate keyed by (block_type, route_label, reason_code)."""
+
+    block_type: str
+    route_label: str
+    reason_code: str
+    block_count: int
+    occurrence_count: int
+    total_est_tokens: int
+    est_candidate_tokens: int
+
+
+@dataclass
+class RouterCandidateBlock:
+    """A single routable-candidate fingerprint (salted hash + counters only)."""
+
+    block_hash: str
+    block_type: str
+    route_label: str
+    reason_code: str
+    occurrences: int
+    char_length: int
+    est_tokens: int
+    est_candidate_tokens: int   # ADVISORY upper bound only
+
+
+@dataclass
+class WorkerRoutingShadow:
+    """Shadow-mode worker-context routing report (P0: data collection only)."""
+
+    enabled: bool
+    item_count: int                 # LLM-bound items classified
+    classified_block_count: int     # distinct fingerprints classified
+    total_occurrences: int
+    must_keep_block_count: int
+    must_keep_occurrence_count: int
+    est_must_keep_tokens: int
+    est_candidate_tokens_total: int          # ADVISORY routable ceiling
+    est_drop_candidate_tokens: int           # ADVISORY
+    est_summarizable_candidate_tokens: int   # ADVISORY
+    label_counts: list[RouterLabelCount]
+    reason_counts: list[RouterReasonCount]
+    top_candidate_blocks: list[RouterCandidateBlock]
+    notes: list[str] = field(default_factory=list)
+
+
 @dataclass
 class ToolSizeStat:
     tool_name: str
@@ -189,6 +314,8 @@ class OpportunityReport:
     llm_block_types: list[BlockTypeStat]
     cross_type_block_groups: list[CrossTypeBlockGroup]
     cross_type_wasted_tokens: int
+    # Worker Context Routing shadow mode (P0 data collection; never prunes).
+    worker_routing: WorkerRoutingShadow
     notes: list[str] = field(default_factory=list)
 
 
@@ -760,6 +887,247 @@ def analyze_llm_bound_blocks(
     return block_type_stats, cross[:top_n]
 
 
+def classify_router_label(
+    block_type: str,
+    content: str,
+    *,
+    occurrences: int,
+    large_output_chars: int,
+    min_repeat: int,
+) -> tuple[str, str]:
+    """Heuristically assign a worker-routing label + reason code to a block.
+
+    Pure P0 heuristic: no ML, no network, no mutation. Operates on in-memory
+    text only and returns two low-cardinality enums (``route_label``,
+    ``reason_code``) -- never the text. The bias is deliberately conservative:
+    when in doubt, keep. Anything that is a user prompt, a system/skill prompt,
+    or carries explicit safety/acceptance-constraint language is pinned to
+    ``policy_must_keep`` and can never become a routable candidate.
+    """
+    low = content.lower()
+
+    # 1. Never-drop by origin: prompts the user/system/skills authored.
+    if block_type == "user_prompt":
+        return "policy_must_keep", "user_prompt_never_drop"
+    if block_type in ("system_prompt", "skill_prompt"):
+        return "policy_must_keep", "system_or_skill_constraint_never_drop"
+
+    # 2. Never-drop by content: explicit safety / acceptance / hard constraints,
+    #    even inside an assistant or tool block.
+    if any(cue in low for cue in _SAFETY_CONSTRAINT_CUES):
+        return "policy_must_keep", "safety_or_acceptance_constraint"
+
+    char_len = len(content)
+    has_task_hint = any(cue in low for cue in _TASK_HINT_CUES)
+
+    # 3. Short actionable task hints -> keep verbatim. Very large diagnostic
+    #    logs often contain "error:"/"failed"/"traceback"; keep collecting
+    #    them as summarization candidates instead of pinning the whole log.
+    if has_task_hint and char_len < large_output_chars:
+        return "direct_task_hint", "actionable_task_signal"
+
+    # 4. Bulky / repeated tool-like material -> routable candidates (advisory).
+    if block_type in ("tool_result", "assistant_context", "unknown"):
+        if has_task_hint and char_len >= large_output_chars:
+            return "summarizable_candidate", "large_actionable_tool_block"
+        is_large = char_len >= large_output_chars
+        is_repeated = occurrences >= min_repeat
+        if is_large and is_repeated:
+            return "likely_drop_candidate", "large_repeated_tool_block"
+        if is_repeated:
+            return "likely_drop_candidate", "repeated_tool_block"
+        if is_large:
+            return "summarizable_candidate", "large_single_tool_block"
+
+    # 5. Everything else: keep by default.
+    return "likely_relevant", "default_keep"
+
+
+def analyze_worker_routing_shadow(
+    contents: Iterable[_LLMContent],
+    *,
+    salt: str,
+    large_output_chars: int,
+    min_repeat: int,
+    top_n: int,
+    enabled: bool = True,
+) -> WorkerRoutingShadow:
+    """Shadow-mode worker-context routing classifier (P0: data collection only).
+
+    Fingerprints each LLM-bound item, assigns a conservative router label, and
+    returns aggregate counters + salted hashes for routable candidates. Emits
+    NO raw text and never mutates/drops context. ``est_candidate_tokens`` is an
+    advisory upper bound on what a *future* router might route away -- not a
+    realized saving.
+    """
+    if not enabled:
+        return WorkerRoutingShadow(
+            enabled=False,
+            item_count=0,
+            classified_block_count=0,
+            total_occurrences=0,
+            must_keep_block_count=0,
+            must_keep_occurrence_count=0,
+            est_must_keep_tokens=0,
+            est_candidate_tokens_total=0,
+            est_drop_candidate_tokens=0,
+            est_summarizable_candidate_tokens=0,
+            label_counts=[],
+            reason_counts=[],
+            top_candidate_blocks=[],
+            notes=["worker-routing shadow analysis disabled via flag"],
+        )
+
+    # Aggregate occurrences per fingerprint, picking the most must-keep origin
+    # when one block spans several block types.
+    agg: dict[str, dict] = {}
+    item_count = 0
+    for item in contents:
+        content = item.content
+        if not content:
+            continue
+        item_count += 1
+        h = _salted_hash(content, salt)
+        bt = item.block_type
+        entry = agg.get(h)
+        if entry is None:
+            agg[h] = {
+                "block_type": bt,
+                "char_length": len(content),
+                "occurrences": 1,
+                "content": content,
+            }
+        else:
+            entry["occurrences"] += 1
+            cur = entry["block_type"]
+            bt_pri = _TYPE_KEEP_PRIORITY.get(bt, 0)
+            cur_pri = _TYPE_KEEP_PRIORITY.get(cur, 0)
+            if bt_pri > cur_pri or (bt_pri == cur_pri and bt < cur):
+                entry["block_type"] = bt
+
+    # Classify each unique fingerprint and roll up counters.
+    label_agg: dict[str, dict] = {}
+    reason_agg: dict[tuple[str, str, str], dict] = {}
+    candidates: list[RouterCandidateBlock] = []
+    must_keep_blocks = 0
+    must_keep_occ = 0
+    est_must_keep_tokens = 0
+    drop_tokens = 0
+    summ_tokens = 0
+
+    for h, entry in agg.items():
+        bt = entry["block_type"]
+        occ = entry["occurrences"]
+        char_len = entry["char_length"]
+        est = _est_tokens(char_len)
+        total_est = est * occ
+        label, reason = classify_router_label(
+            bt,
+            entry["content"],
+            occurrences=occ,
+            large_output_chars=large_output_chars,
+            min_repeat=min_repeat,
+        )
+        candidate_tokens = total_est if label in _ROUTABLE_LABELS else 0
+
+        la = label_agg.setdefault(
+            label,
+            {"block_count": 0, "occ": 0, "total_est": 0, "candidate": 0},
+        )
+        la["block_count"] += 1
+        la["occ"] += occ
+        la["total_est"] += total_est
+        la["candidate"] += candidate_tokens
+
+        ra = reason_agg.setdefault(
+            (bt, label, reason),
+            {"block_count": 0, "occ": 0, "total_est": 0, "candidate": 0},
+        )
+        ra["block_count"] += 1
+        ra["occ"] += occ
+        ra["total_est"] += total_est
+        ra["candidate"] += candidate_tokens
+
+        if label == "policy_must_keep":
+            must_keep_blocks += 1
+            must_keep_occ += occ
+            est_must_keep_tokens += total_est
+        if label == "likely_drop_candidate":
+            drop_tokens += candidate_tokens
+        elif label == "summarizable_candidate":
+            summ_tokens += candidate_tokens
+
+        if candidate_tokens > 0:
+            candidates.append(
+                RouterCandidateBlock(
+                    block_hash=h,
+                    block_type=bt,
+                    route_label=label,
+                    reason_code=reason,
+                    occurrences=occ,
+                    char_length=char_len,
+                    est_tokens=est,
+                    est_candidate_tokens=candidate_tokens,
+                )
+            )
+
+    # Deterministic ordering: label_counts follow the canonical label order;
+    # reason_counts and candidates sort by a stable key.
+    label_counts = [
+        RouterLabelCount(
+            route_label=lbl,
+            block_count=label_agg[lbl]["block_count"],
+            occurrence_count=label_agg[lbl]["occ"],
+            total_est_tokens=label_agg[lbl]["total_est"],
+            est_candidate_tokens=label_agg[lbl]["candidate"],
+        )
+        for lbl in ROUTER_LABELS
+        if lbl in label_agg
+    ]
+    reason_counts = [
+        RouterReasonCount(
+            block_type=bt,
+            route_label=lbl,
+            reason_code=reason,
+            block_count=v["block_count"],
+            occurrence_count=v["occ"],
+            total_est_tokens=v["total_est"],
+            est_candidate_tokens=v["candidate"],
+        )
+        for (bt, lbl, reason), v in sorted(reason_agg.items())
+    ]
+    candidates.sort(
+        key=lambda c: (c.est_candidate_tokens, c.occurrences, c.block_hash),
+        reverse=True,
+    )
+
+    total_occ = sum(e["occurrences"] for e in agg.values())
+    notes = [
+        "SHADOW MODE P0: classification only -- no context was dropped, summarized, or mutated",
+        "route_label/reason_code/block_type are low-cardinality enums; block_hash is a salted SHA-256 fingerprint",
+        "est_candidate_tokens is ADVISORY (an upper bound for a FUTURE router), not a realized saving",
+        "user/system/skill prompts and safety/acceptance constraints are pinned to policy_must_keep and never routable",
+        "classification is conservative: when uncertain, blocks are kept (likely_relevant)",
+    ]
+
+    return WorkerRoutingShadow(
+        enabled=True,
+        item_count=item_count,
+        classified_block_count=len(agg),
+        total_occurrences=total_occ,
+        must_keep_block_count=must_keep_blocks,
+        must_keep_occurrence_count=must_keep_occ,
+        est_must_keep_tokens=est_must_keep_tokens,
+        est_candidate_tokens_total=drop_tokens + summ_tokens,
+        est_drop_candidate_tokens=drop_tokens,
+        est_summarizable_candidate_tokens=summ_tokens,
+        label_counts=label_counts,
+        reason_counts=reason_counts,
+        top_candidate_blocks=candidates[:top_n],
+        notes=notes,
+    )
+
+
 # ---------------------------------------------------------------------------
 # Build + write
 # ---------------------------------------------------------------------------
@@ -779,6 +1147,7 @@ def build_report(
     min_block_repeat: int = DEFAULT_MIN_BLOCK_REPEAT,
     large_output_chars: int = DEFAULT_LARGE_OUTPUT_CHARS,
     top_n: int = DEFAULT_TOP_N,
+    worker_routing_shadow: bool = True,
 ) -> OpportunityReport:
     dups = detect_exact_duplicate_tool_outputs(tool_messages, salt=salt, top_n=top_n)
     blocks = detect_repeated_blocks(
@@ -801,6 +1170,15 @@ def build_report(
         top_n=top_n,
     )
 
+    worker_routing = analyze_worker_routing_shadow(
+        llm_contents,
+        salt=salt,
+        large_output_chars=large_output_chars,
+        min_repeat=min_block_repeat,
+        top_n=top_n,
+        enabled=worker_routing_shadow,
+    )
+
     total_chars = sum(len(m.content) for m in tool_messages)
     dup_wasted = sum(d.est_wasted_tokens for d in dups)
     block_wasted = sum(b.est_wasted_tokens for b in blocks)
@@ -812,6 +1190,7 @@ def build_report(
         "wasted-token figures are heuristic estimates (chars/4); validate before acting",
         "session 'source', 'tool_name', and block_type are emitted verbatim as low-cardinality enums, not raw text",
         "llm-bound scan covers only content sent to the LLM: system/skill prompts, active user/assistant/tool messages",
+        "worker-routing section is SHADOW MODE P0: it labels blocks for a future router but never drops/summarizes context",
     ]
     if all_sessions:
         notes.append("all-sessions mode: time window ignored; scanned all non-archived sessions/active messages")
@@ -841,6 +1220,7 @@ def build_report(
         llm_block_types=block_type_stats,
         cross_type_block_groups=cross_groups,
         cross_type_wasted_tokens=cross_wasted,
+        worker_routing=worker_routing,
         notes=notes,
     )
 
@@ -892,6 +1272,9 @@ def write_report(report: OpportunityReport, out_dir: Path) -> tuple[Path, Path]:
         f"(~{report.cross_type_wasted_tokens} wasted tokens)",
         f"- Telemetry: {t.events} events, ~{t.tokens_saved} tokens saved, "
         f"coverage {t.coverage_ratio_pct}%",
+        f"- Worker routing (shadow): {report.worker_routing.classified_block_count} blocks "
+        f"classified, {report.worker_routing.must_keep_block_count} must-keep, "
+        f"~{report.worker_routing.est_candidate_tokens_total} advisory candidate tokens",
         "",
         "## LLM-bound redundancy by block type",
     ]
@@ -950,6 +1333,51 @@ def write_report(report: OpportunityReport, out_dir: Path) -> tuple[Path, Path]:
         ]
     )
     md.append("")
+    wr = report.worker_routing
+    md.append("## Worker Context Routing — shadow mode (P0, advisory only)")
+    if not wr.enabled:
+        md.append("- disabled")
+    else:
+        md.append(
+            f"- Items classified: {wr.item_count} "
+            f"(distinct fingerprints: {wr.classified_block_count}, "
+            f"occurrences: {wr.total_occurrences})"
+        )
+        md.append(
+            f"- Must-keep: {wr.must_keep_block_count} blocks / "
+            f"{wr.must_keep_occurrence_count} occurrences "
+            f"(~{wr.est_must_keep_tokens} tokens, never routable)"
+        )
+        md.append(
+            f"- Advisory candidate tokens: ~{wr.est_candidate_tokens_total} "
+            f"(drop ~{wr.est_drop_candidate_tokens}, "
+            f"summarize ~{wr.est_summarizable_candidate_tokens}) — NOT a realized saving"
+        )
+        md.append("")
+        md.append("### Router labels")
+        for lc in wr.label_counts:
+            md.append(
+                f"- {lc.route_label}: blocks={lc.block_count} "
+                f"occ={lc.occurrence_count} tokens={lc.total_est_tokens} "
+                f"~candidate={lc.est_candidate_tokens}"
+            )
+        md.append("")
+        md.append("### Reason codes (block_type / label / reason)")
+        for rc in wr.reason_counts:
+            md.append(
+                f"- {rc.block_type} / {rc.route_label} / {rc.reason_code}: "
+                f"blocks={rc.block_count} occ={rc.occurrence_count} "
+                f"tokens={rc.total_est_tokens} ~candidate={rc.est_candidate_tokens}"
+            )
+        md.append("")
+        md.append("### Top routable-candidate blocks (hashed)")
+        for cb in wr.top_candidate_blocks:
+            md.append(
+                f"- `{cb.block_hash}` type={cb.block_type} "
+                f"label={cb.route_label} reason={cb.reason_code} "
+                f"x{cb.occurrences} chars={cb.char_length} ~candidate={cb.est_candidate_tokens}"
+            )
+    md.append("")
     md.append("## Notes")
     for note in report.notes:
         md.append(f"- {note}")
@@ -987,6 +1415,14 @@ def main() -> int:
         "--large-output-chars", type=int, default=DEFAULT_LARGE_OUTPUT_CHARS
     )
     parser.add_argument("--top-n", type=int, default=DEFAULT_TOP_N)
+    parser.add_argument(
+        "--disable-worker-routing-shadow",
+        action="store_true",
+        help=(
+            "skip the shadow-mode Worker Context Routing classification "
+            "(P0 data collection; enabled by default, never prunes context)"
+        ),
+    )
     args = parser.parse_args()
 
     if not args.state_db.exists():
@@ -1030,6 +1466,7 @@ def main() -> int:
             min_block_repeat=args.min_block_repeat,
             large_output_chars=args.large_output_chars,
             top_n=args.top_n,
+            worker_routing_shadow=not args.disable_worker_routing_shadow,
         )
         json_path, md_path = write_report(report, args.out_dir)
     except Exception as exc:  # noqa: BLE001 - cron-safe: report class only, no payload
diff --git a/tests/test_hermes_context_opportunity_analyzer.py b/tests/test_hermes_context_opportunity_analyzer.py
index 649288a..7eb04d2 100644
--- a/tests/test_hermes_context_opportunity_analyzer.py
+++ b/tests/test_hermes_context_opportunity_analyzer.py
@@ -1,3 +1,4 @@
+import dataclasses
 import importlib.util
 import json
 import sqlite3
@@ -450,3 +451,256 @@ def test_cross_type_redundancy_reported_via_hashes_only(tmp_path):
     assert "You are a helpful system" not in blob
     assert "user asks the assistant" not in blob
     assert "PRIVATE REASONING" not in blob
+
+
+# ---------------------------------------------------------------------------
+# Worker Context Routing — SHADOW MODE (P0 data-collection) tests
+# ---------------------------------------------------------------------------
+
+LARGE = 8000  # default large-output threshold
+
+
+def _route_map(report):
+    """Index ``report.worker_routing.label_counts`` by ``route_label`` for assertions."""
+    return {lc.route_label: lc for lc in report.worker_routing.label_counts}
+
+
+def _labels_for(contents, **kw):
+    """Run only ``analyze_worker_routing_shadow`` on ``contents``; ``kw`` is forwarded."""
+    return analyzer.analyze_worker_routing_shadow(
+        contents, salt="test-salt", large_output_chars=LARGE, min_repeat=2, top_n=20, **kw
+    )
+
+
+def test_user_and_system_constraints_are_must_keep(tmp_path):
+    db = tmp_path / "state.db"
+    sys_prompt = "You are an agent. Follow the rules below for the whole session here."
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE, "system_prompt": sys_prompt}],
+        messages=[
+            {"role": "user", "content": "Please implement the worker routing layer for me now"},
+            {
+                "role": "tool",
+                "content": "ACCEPTANCE CRITERIA: the targeted pytest suite must pass before merge",
+                "tool_name": "Bash",
+            },
+        ],
+    )
+    report = _analyze(db, tmp_path)
+    wr = report.worker_routing
+
+    # Expectation: the user prompt, the system prompt, and the acceptance-constraint
+    # tool block are all policy_must_keep, so no routable tokens are reported.
+    rm = _route_map(report)
+    assert "policy_must_keep" in rm
+    assert rm["policy_must_keep"].est_candidate_tokens == 0
+    assert wr.est_candidate_tokens_total == 0
+
+    reasons = {(r.block_type, r.route_label, r.reason_code) for r in wr.reason_counts}
+    assert ("user_prompt", "policy_must_keep", "user_prompt_never_drop") in reasons
+    assert (
+        "system_prompt",
+        "policy_must_keep",
+        "system_or_skill_constraint_never_drop",
+    ) in reasons
+    # Even when it arrives as a tool_result, the acceptance constraint is must-keep.
+    assert any(
+        r.route_label == "policy_must_keep"
+        and r.reason_code == "safety_or_acceptance_constraint"
+        and r.block_type == "tool_result"
+        for r in wr.reason_counts
+    )
+
+
+def test_large_repeated_tool_blocks_become_drop_candidates(tmp_path):
+    db = tmp_path / "state.db"
+    big_unrelated = "row of unrelated build log output number 7 with filler text " * 200
+    assert len(big_unrelated) >= LARGE  # fixture sanity: reaches the large-output threshold
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE}],
+        messages=[
+            {"role": "tool", "content": big_unrelated, "tool_name": "Bash"},
+            {"role": "tool", "content": big_unrelated, "tool_name": "Bash"},
+            {"role": "tool", "content": big_unrelated, "tool_name": "Bash"},
+        ],
+    )
+    report = _analyze(db, tmp_path)
+    wr = report.worker_routing
+    rm = _route_map(report)
+
+    # The repeated, large, unrelated tool output is a drop candidate.
+    assert "likely_drop_candidate" in rm
+    assert wr.est_drop_candidate_tokens > 0
+    assert wr.est_candidate_tokens_total >= wr.est_drop_candidate_tokens
+    top = wr.top_candidate_blocks[0]
+    assert top.route_label == "likely_drop_candidate"
+    assert top.reason_code == "large_repeated_tool_block"
+    assert top.occurrences == 3  # one per identical tool message inserted above
+
+
+def test_large_single_tool_block_is_summarizable(tmp_path):
+    db = tmp_path / "state.db"
+    big_once = "one-off diagnostic dump segment with assorted detail text " * 200
+    assert len(big_once) >= LARGE  # fixture sanity: reaches the large-output threshold
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE}],
+        messages=[{"role": "tool", "content": big_once, "tool_name": "Read"}],
+    )
+    report = _analyze(db, tmp_path)
+    wr = report.worker_routing
+    assert wr.est_summarizable_candidate_tokens > 0  # the single large block is counted
+    assert any(
+        c.route_label == "summarizable_candidate"
+        and c.reason_code == "large_single_tool_block"
+        for c in wr.top_candidate_blocks
+    )
+
+
+def test_large_actionable_diagnostic_log_is_summarizable_not_pinned(tmp_path):
+    db = tmp_path / "state.db"
+    big_error_log = "ERROR: integration test failed with stack frame details " * 220
+    assert len(big_error_log) >= LARGE  # fixture sanity: reaches the large-output threshold
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE}],
+        messages=[{"role": "tool", "content": big_error_log, "tool_name": "Bash"}],
+    )
+    report = _analyze(db, tmp_path)
+    wr = report.worker_routing
+
+    assert wr.est_summarizable_candidate_tokens > 0
+    assert any(
+        c.route_label == "summarizable_candidate"
+        and c.reason_code == "large_actionable_tool_block"
+        for c in wr.top_candidate_blocks
+    )
+    assert not any(  # the large log must not be reported as a kept direct_task_hint
+        r.route_label == "direct_task_hint"
+        and r.reason_code == "actionable_task_signal"
+        and r.block_type == "tool_result"
+        for r in wr.reason_counts
+    )
+
+
+def test_short_actionable_hint_and_default_blocks_are_kept(tmp_path):
+    contents = [
+        analyzer._LLMContent(block_type="assistant_context", content="Next step: run pytest"),
+        analyzer._LLMContent(block_type="assistant_context", content="plain medium context without special cues"),
+    ]
+    wr = _labels_for(contents)  # classifier only; no DB fixture is built here
+    reasons = {(r.block_type, r.route_label, r.reason_code) for r in wr.reason_counts}
+
+    assert ("assistant_context", "direct_task_hint", "actionable_task_signal") in reasons
+    assert ("assistant_context", "likely_relevant", "default_keep") in reasons
+    assert wr.est_candidate_tokens_total == 0  # neither block is expected to be routable
+
+
+def test_equal_priority_block_type_tiebreak_is_deterministic():
+    same = "identical prompt material"  # same text submitted under two block types
+    forward = _labels_for(
+        [
+            analyzer._LLMContent(block_type="system_prompt", content=same),
+            analyzer._LLMContent(block_type="skill_prompt", content=same),
+        ]
+    )
+    reverse = _labels_for(  # same two blocks, opposite input order
+        [
+            analyzer._LLMContent(block_type="skill_prompt", content=same),
+            analyzer._LLMContent(block_type="system_prompt", content=same),
+        ]
+    )
+
+    assert [dataclasses.asdict(r) for r in forward.reason_counts] == [
+        dataclasses.asdict(r) for r in reverse.reason_counts
+    ]
+
+
+def test_shadow_mode_never_emits_raw_content(tmp_path):
+    db = tmp_path / "state.db"
+    secret = "SHADOW-SECRET-PAYLOAD-DO-NOT-LEAK detail line that is quite long here " * 200
+    user_secret = "USER-SECRET-PROMPT-DO-NOT-LEAK implement the thing for me"
+    sys_secret = "SYSTEM-SECRET-CONSTRAINT you must never reveal internal keys here"
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE, "system_prompt": sys_secret}],
+        messages=[
+            {"role": "tool", "content": secret, "tool_name": "Bash"},
+            {"role": "tool", "content": secret, "tool_name": "Bash"},
+            {"role": "user", "content": user_secret},
+        ],
+    )
+    report = _analyze(db, tmp_path)
+    json_path, md_path = analyzer.write_report(report, tmp_path / "out")
+    blob = json_path.read_text(encoding="utf-8") + md_path.read_text(encoding="utf-8")
+
+    # None of the marker strings seeded above may appear in the JSON or Markdown output.
+    assert "SHADOW-SECRET-PAYLOAD" not in blob
+    assert "USER-SECRET-PROMPT" not in blob
+    assert "SYSTEM-SECRET-CONSTRAINT" not in blob
+    assert "PRIVATE REASONING" not in blob  # presumably seeded by _make_db_ex -- verify
+    # The classification still happened (drop candidate detected via hash).
+    assert report.worker_routing.est_drop_candidate_tokens > 0
+
+
+def test_shadow_schema_is_deterministic_and_privacy_safe(tmp_path):
+    db = tmp_path / "state.db"
+    big = "deterministic repeated tool payload chunk of sufficient size here " * 200
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE, "system_prompt": "be safe please"}],
+        messages=[
+            {"role": "tool", "content": big, "tool_name": "Bash"},
+            {"role": "tool", "content": big, "tool_name": "Bash"},
+            {"role": "user", "content": "implement the routing shadow layer now please"},
+        ],
+    )
+    r1 = _analyze(db, tmp_path)
+    r2 = _analyze(db, tmp_path)  # second run over the same, unchanged DB
+    # Identical inputs -> equal shadow sections (compared as plain dicts via asdict).
+    assert dataclasses.asdict(r1.worker_routing) == dataclasses.asdict(r2.worker_routing)
+
+    wr = r1.worker_routing
+    # All emitted route labels are from the canonical low-cardinality enum.
+    assert set(lc.route_label for lc in wr.label_counts) <= set(analyzer.ROUTER_LABELS)
+    # label_counts follow the canonical order (deterministic).
+    order = {lbl: i for i, lbl in enumerate(analyzer.ROUTER_LABELS)}
+    idxs = [order[lc.route_label] for lc in wr.label_counts]
+    assert idxs == sorted(idxs)
+    # Candidates are checked for a fixed-width hash and a routable enum label only.
+    for cb in wr.top_candidate_blocks:
+        assert len(cb.block_hash) == 16  # presumably a salted SHA-256 prefix -- verify
+        assert cb.route_label in analyzer._ROUTABLE_LABELS
+
+
+def test_shadow_mode_can_be_disabled(tmp_path):
+    db = tmp_path / "state.db"
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE}],
+        messages=[{"role": "tool", "content": "x" * 9000, "tool_name": "Bash"}],
+    )
+    tool_messages = analyzer.load_tool_messages(db, since_hours=WIDE_WINDOW)
+    llm = analyzer.load_llm_bound_content(db, since_hours=WIDE_WINDOW)
+    heavy = analyzer.load_heavy_sessions(db, since_hours=WIDE_WINDOW, salt="s", top_n=5)
+    tel = analyzer.parse_telemetry(
+        tmp_path / "none.jsonl", since_hours=WIDE_WINDOW, total_input_tokens=0  # never created
+    )
+    report = analyzer.build_report(
+        date="2100-01-01",
+        since_hours=24,
+        salt="s",
+        tool_messages=tool_messages,
+        heavy_sessions=heavy,
+        telemetry=tel,
+        llm_contents=llm,
+        min_block_repeat=2,
+        worker_routing_shadow=False,  # the switch under test
+    )
+    assert report.worker_routing.enabled is False
+    assert report.worker_routing.classified_block_count == 0
+    # Disabled section still serializes safely.
+    json_path, md_path = analyzer.write_report(report, tmp_path / "out")
+    assert "disabled" in md_path.read_text(encoding="utf-8")

From 0213650f523a0a94551cbd0ef5c184676fc67a20 Mon Sep 17 00:00:00 2001
From: root <root@vmi3142307.contaboserver.net>
Date: Fri, 12 Jun 2026 10:48:04 +0200
Subject: [PATCH 7/9] feat: add parent aggregation artifact telemetry

---
 docs/guides/hermes-monitor.md                 |  39 +-
 .../analyze_hermes_context_opportunities.py   | 425 ++++++++++++++++++
 ...est_hermes_context_opportunity_analyzer.py | 235 ++++++++++
 3 files changed, 698 insertions(+), 1 deletion(-)

diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md
index 0f5e91a..4aa9128 100644
--- a/docs/guides/hermes-monitor.md
+++ b/docs/guides/hermes-monitor.md
@@ -68,7 +68,9 @@ It surfaces concrete token-reduction opportunities:
 - large tool outputs grouped by `tool_name`,
 - heavy sessions by input-token / tool-call / message counts (hashed ids),
 - ContextPilot telemetry coverage and savings ratios,
-- **Worker Context Routing shadow labels** for future router training/eval.
+- **Worker Context Routing shadow labels** for future router training/eval,
+- **Parent Aggregation Artifact telemetry** (exact duplicate worker/parent
+  artifacts grouped by hash) for future parent-aggregation dedup eval.
 
 ### LLM-bound block redundancy
 
@@ -116,6 +118,41 @@ block hashes. These are **not realized savings** and must be treated as training
 `--disable-worker-routing-shadow` only when you want to omit this section from a
 scan.
 
+### Parent Aggregation Artifacts — shadow mode
+
+The analyzer also includes a **Parent Aggregation Artifacts — shadow mode**
+section by default. This is **P0 telemetry only**: it collects data so a future
+parent-aggregation dedup can be evaluated offline. It never drops, summarizes,
+replaces, or mutates any context.
+
+When a parent/orchestrator aggregates results from several workers, the same
+artifact body (a test log, a diff, a file dump, a review summary, ...) is often
+carried into the parent's LLM context once per worker and again in the parent's
+own roll-up — paying for the same tokens several times. The analyzer groups
+**EXACT** artifact bodies by salted content hash (near-duplicates never group),
+classifies each body with a deterministic heuristic kind, and emits only
+low-cardinality metadata + counters:
+
+- `artifact_kind` — one of `test_log`, `terminal_output`, `file_content`,
+  `diff`, `error_trace`, `review_findings`, `benchmark_result`,
+  `worker_summary`, `unknown_large_block` (deterministic, first-match-wins),
+- per-kind summary — distinct bodies, occurrences, duplicate-group count,
+  estimated tokens, and advisory duplicate tokens,
+- **provenance** — per duplicate group, `source_type_counts` such as
+  `tool_result xN` and `assistant_context xM`, plus a deterministically chosen
+  `canonical_source_type` (the dominant origin, tie-broken alphabetically),
+- top duplicate artifact groups, reported **only** as a salted `content_hash`
+  plus counters.
+
+`est_duplicate_tokens` is computed as `(occurrences - 1) * est_tokens` and is an
+**advisory upper bound** on what a future parent dedup might save — **not a
+realized saving**, and payloads are never changed. No raw artifact / worker /
+tool / system text, and no raw session ids, are ever emitted. Only sizeable
+blocks (at least `--min-artifact-chars` characters, default 400) from parent/worker
+output origins (`assistant_context` and `tool_result`) are candidates, so prompt
+boilerplate and short hints never enter this telemetry. Use
+`--disable-parent-aggregation` to omit this section from a scan.
+
 Use `--all-sessions` to ignore the `--since-hours` window and scan **all**
 non-archived sessions and active messages (useful for a one-shot, whole-history
 audit rather than a rolling daily window):
diff --git a/scripts/analyze_hermes_context_opportunities.py b/scripts/analyze_hermes_context_opportunities.py
index 780f0f1..b6cca65 100644
--- a/scripts/analyze_hermes_context_opportunities.py
+++ b/scripts/analyze_hermes_context_opportunities.py
@@ -316,6 +316,8 @@ class OpportunityReport:
     cross_type_wasted_tokens: int
     # Worker Context Routing shadow mode (P0 data collection; never prunes).
     worker_routing: WorkerRoutingShadow
+    # Parent Aggregation Artifacts shadow mode (P0 telemetry; never dedups).
+    parent_aggregation: ParentAggregationArtifacts
     notes: list[str] = field(default_factory=list)
 
 
@@ -1128,6 +1130,361 @@ def analyze_worker_routing_shadow(
     )
 
 
+# ---------------------------------------------------------------------------
+# Parent Aggregation Artifacts — SHADOW MODE (P0 telemetry only)
+# ---------------------------------------------------------------------------
+# When a parent/orchestrator aggregates results from several workers, the same
+# artifact body (a test log, a diff, a file dump, a review summary, ...) is often
+# carried into the parent's LLM context once per worker and again in the parent's
+# own roll-up -- paying for the same tokens several times. This section collects
+# *telemetry only* so a future parent-aggregation dedup can be evaluated offline:
+# it groups EXACT artifact bodies by salted content hash, classifies each body
+# with a deterministic heuristic kind, and emits low-cardinality metadata +
+# counters. It NEVER drops, summarizes, replaces, or mutates any context, and it
+# NEVER emits raw artifact text, worker text, tool output, session ids, or
+# system prompts.
+
+# Heuristic P0 artifact kinds: low-cardinality enums describing the *shape* of an
+# aggregation artifact, never its text. Tuple order is also the by_kind report order.
+ARTIFACT_KINDS = (
+    "test_log",
+    "terminal_output",
+    "file_content",
+    "diff",
+    "error_trace",
+    "review_findings",
+    "benchmark_result",
+    "worker_summary",
+    "unknown_large_block",
+)
+
+# Conservative floor: only sizeable blocks are treated as candidate aggregation
+# artifacts, so short prompts/hints never enter parent-aggregation telemetry.
+DEFAULT_MIN_ARTIFACT_CHARS = 400
+
+# Parent aggregation P0 focuses on content produced by workers/tools and then
+# carried into the parent context. System/skill/user prompts are analyzed by the
+# LLM-bound redundancy and worker-routing sections, but excluding them here keeps
+# parent artifact telemetry from being polluted by prompt boilerplate.
+PARENT_AGGREGATION_SOURCE_TYPES = ("assistant_context", "tool_result")
+
+
+def classify_artifact_kind(content: str) -> str:
+    """Deterministically classify a candidate aggregation artifact body.
+
+    Pure substring heuristic over in-memory text; returns one of the strings in
+    ``ARTIFACT_KINDS`` and never the text. Rules run in the fixed order below and
+    the first match wins, so an earlier kind shadows every later one.
+    """
+    low = content.lower()  # lowercase copy for the case-insensitive cues
+    stripped = content.lstrip()  # leading whitespace removed for startswith checks
+
+    # 1. Unified diff / patch.
+    if (
+        stripped.startswith("diff --git")
+        or stripped.startswith("--- a/")
+        or stripped.startswith("@@ ")
+        or "\n@@ " in content
+        or ("\n--- " in content and "\n+++ " in content)
+    ):
+        return "diff"
+
+    # 2. Test/pytest log (checked before error_trace: a failing test log may
+    #    embed a traceback but is still fundamentally a test log).
+    if (
+        "pytest" in low
+        or "test session starts" in low
+        or " passed in " in low
+        or " failed in " in low
+        or ("passed" in low and "failed" in low)
+        or "=== " in content  # NOTE(review): also matches "a === b" in source dumps
+    ):
+        return "test_log"
+
+    # 3. Error / exception trace.
+    if (
+        "traceback (most recent call last)" in low
+        or "\n  at " in content  # presumably indented "at ..." stack frames -- confirm
+        or "stack trace" in low
+        or ("exception" in low and "error" in low)
+    ):
+        return "error_trace"
+
+    # 4. Benchmark / perf result.
+    if (
+        "benchmark" in low
+        or "ops/sec" in low
+        or "ops/s" in low
+        or "req/sec" in low
+        or "throughput" in low
+        or "latency" in low
+        or "iterations/sec" in low
+    ):
+        return "benchmark_result"
+
+    # 5. Code-review findings.
+    if (
+        "code review" in low
+        or "review findings" in low
+        or "severity:" in low
+        or "vulnerab" in low
+        or "## findings" in low
+    ):
+        return "review_findings"
+
+    # 6. File content / source dump (cat -n style numbering or code cues).
+    if (
+        "\n     1\t" in content
+        or "\n   1\t" in content
+        or "def " in content
+        or "class " in content
+        or "\nimport " in content
+        or "#include" in content
+        or "function " in content
+    ):
+        return "file_content"
+
+    # 7. Worker / aggregation summary. Checked after source-code cues so files
+    #    mentioning workers are still labeled as file_content.
+    if (
+        "## summary" in low
+        or "in summary" in low
+        or "summary:" in low
+        or "tl;dr" in low
+        or "aggregat" in low
+        or "worker" in low
+    ):
+        return "worker_summary"
+
+    # 8. Terminal / shell session output.
+    if (
+        "\n$ " in content
+        or stripped.startswith("$ ")
+        or "\n# " in content  # NOTE(review): also matches Markdown "# " headings
+        or "user@" in low
+        or "bash-" in low
+        or "exit code" in low
+    ):
+        return "terminal_output"
+
+    # 9. Fallback: a large block we could not confidently classify.
+    return "unknown_large_block"
+
+
+@dataclass
+class ArtifactSourceCount:
+    """Provenance counter: candidate occurrences attributed to one source type."""
+
+    source_type: str  # block_type the occurrences came from
+    count: int        # number of occurrences, per duplicate group or report-wide
+
+
+@dataclass
+class ParentAggregationGroup:
+    """One EXACT artifact body observed 2+ times across parent/worker contexts.
+
+    Salted hash + counters only -- never the body text.
+    """
+
+    content_hash: str                    # salted hash of the body (the grouping key)
+    artifact_kind: str                   # one of ARTIFACT_KINDS
+    canonical_source_type: str           # dominant origin, chosen deterministically
+    occurrences: int                     # times this exact body was seen (>= 2)
+    char_length: int                     # len() of the body
+    est_tokens: int                      # token estimate for one copy of the body
+    est_duplicate_tokens: int            # ADVISORY: (occurrences - 1) * est_tokens
+    source_type_counts: list[ArtifactSourceCount]  # provenance: tool_result xN, ...
+
+
+@dataclass
+class ArtifactKindStat:
+    """Aggregate over all candidate artifact bodies of one kind."""
+
+    artifact_kind: str           # one of ARTIFACT_KINDS
+    group_count: int             # distinct bodies of this kind
+    occurrence_count: int        # total occurrences of those bodies
+    duplicate_group_count: int   # bodies seen >= 2 times
+    est_tokens: int              # sum of est tokens for distinct bodies
+    est_duplicate_tokens: int    # ADVISORY duplicate tokens for this kind
+
+
+@dataclass
+class ParentAggregationArtifacts:
+    """Shadow-mode parent-aggregation artifact report (P0: telemetry only)."""
+
+    enabled: bool                    # False when the section was disabled via flag
+    item_count: int                  # candidate artifact items considered
+    artifact_body_count: int         # distinct bodies (groups)
+    total_occurrences: int           # occurrences summed over all distinct bodies
+    duplicate_group_count: int       # distinct bodies seen >= 2 times
+    est_total_tokens: int            # est tokens for distinct bodies
+    est_duplicate_tokens: int        # ADVISORY duplicate-artifact tokens
+    by_kind: list[ArtifactKindStat]  # per-kind rollup, in ARTIFACT_KINDS order
+    source_type_counts: list[ArtifactSourceCount]   # provenance across candidates
+    top_duplicate_groups: list[ParentAggregationGroup]  # at most top_n duplicate groups
+    notes: list[str] = field(default_factory=list)  # human-readable caveats for the report
+
+
+def analyze_parent_aggregation_artifacts(
+    contents: Iterable[_LLMContent],
+    *,
+    salt: str,
+    min_artifact_chars: int,
+    top_n: int,
+    enabled: bool = True,
+) -> ParentAggregationArtifacts:
+    """Group EXACT aggregation-artifact bodies and emit provenance telemetry.
+
+    P0 telemetry/advisory only: ``contents`` is iterated, never mutated. Only
+    items whose ``block_type`` is in ``PARENT_AGGREGATION_SOURCE_TYPES`` and whose
+    content has >= ``min_artifact_chars`` characters are candidates; they are
+    grouped by ``_salted_hash(content, salt)`` and classified once per body.
+    Returns hashes, enums and counters only (never body text), including the
+    ``top_n`` duplicate groups; ``est_duplicate_tokens`` is advisory, never a
+    realized saving. ``enabled=False`` yields a zeroed report with a note.
+    """
+    if not enabled:
+        return ParentAggregationArtifacts(
+            enabled=False,
+            item_count=0,
+            artifact_body_count=0,
+            total_occurrences=0,
+            duplicate_group_count=0,
+            est_total_tokens=0,
+            est_duplicate_tokens=0,
+            by_kind=[],
+            source_type_counts=[],
+            top_duplicate_groups=[],
+            notes=["parent-aggregation artifact analysis disabled via flag"],
+        )
+
+    # --- group sizeable bodies by EXACT salted content hash ----------------
+    groups: dict[str, dict] = {}
+    item_count = 0
+    source_totals: dict[str, int] = {}
+    for item in contents:
+        content = item.content
+        bt = item.block_type
+        if bt not in PARENT_AGGREGATION_SOURCE_TYPES:  # prompts are excluded here
+            continue
+        if not content or len(content) < min_artifact_chars:  # empty or below the floor
+            continue
+        item_count += 1
+        source_totals[bt] = source_totals.get(bt, 0) + 1
+        h = _salted_hash(content, salt)
+        g = groups.get(h)
+        if g is None:
+            groups[h] = {
+                "char_length": len(content),
+                "occurrences": 1,
+                "sources": {bt: 1},
+                # classify once from in-memory text; never stored/emitted.
+                "kind": classify_artifact_kind(content),
+            }
+        else:
+            g["occurrences"] += 1
+            g["sources"][bt] = g["sources"].get(bt, 0) + 1
+
+    # --- per-kind rollup + per-group records -------------------------------
+    kind_agg: dict[str, dict] = {}
+    group_records: list[ParentAggregationGroup] = []
+    total_occurrences = 0
+    est_total_tokens = 0
+    est_duplicate_tokens = 0
+    duplicate_group_count = 0
+
+    for h, g in groups.items():
+        occ = g["occurrences"]
+        char_len = g["char_length"]
+        est = _est_tokens(char_len)
+        dup_tokens = est * (occ - 1)  # advisory: every copy beyond the first
+        kind = g["kind"]
+        is_dup = occ >= 2
+
+        total_occurrences += occ
+        est_total_tokens += est
+        est_duplicate_tokens += dup_tokens
+        if is_dup:
+            duplicate_group_count += 1
+
+        ka = kind_agg.setdefault(
+            kind,
+            {"groups": 0, "occ": 0, "dups": 0, "est": 0, "dup_tokens": 0},
+        )
+        ka["groups"] += 1
+        ka["occ"] += occ
+        ka["est"] += est
+        ka["dup_tokens"] += dup_tokens
+        if is_dup:
+            ka["dups"] += 1
+
+        if is_dup:
+            # Provenance counts, sorted by source_type for determinism.
+            source_counts = [
+                ArtifactSourceCount(source_type=st, count=c)
+                for st, c in sorted(g["sources"].items())
+            ]
+            # Canonical source: dominant origin, tie-broken alphabetically.
+            canonical = min(
+                g["sources"].items(), key=lambda kv: (-kv[1], kv[0])
+            )[0]
+            group_records.append(
+                ParentAggregationGroup(
+                    content_hash=h,
+                    artifact_kind=kind,
+                    canonical_source_type=canonical,
+                    occurrences=occ,
+                    char_length=char_len,
+                    est_tokens=est,
+                    est_duplicate_tokens=dup_tokens,
+                    source_type_counts=source_counts,
+                )
+            )
+
+    by_kind = [
+        ArtifactKindStat(
+            artifact_kind=kind,
+            group_count=kind_agg[kind]["groups"],
+            occurrence_count=kind_agg[kind]["occ"],
+            duplicate_group_count=kind_agg[kind]["dups"],
+            est_tokens=kind_agg[kind]["est"],
+            est_duplicate_tokens=kind_agg[kind]["dup_tokens"],
+        )
+        for kind in ARTIFACT_KINDS
+        if kind in kind_agg
+    ]
+    source_type_counts = [
+        ArtifactSourceCount(source_type=st, count=c)
+        for st, c in sorted(source_totals.items())
+    ]
+    group_records.sort(
+        key=lambda g: (g.est_duplicate_tokens, g.occurrences, g.content_hash),
+        reverse=True,  # largest advisory duplicate cost first; the hash breaks ties
+    )
+
+    notes = [
+        "SHADOW MODE P0: telemetry only -- no aggregation artifact was deduped, replaced, summarized, or mutated",
+        "artifact_kind/source_type/canonical_source_type are low-cardinality enums; content_hash is a salted SHA-256 fingerprint",
+        "grouping is EXACT (same salted content hash): near-duplicate artifacts never group",
+        "est_duplicate_tokens is ADVISORY ((occurrences-1) * est_tokens), an upper bound for a FUTURE parent dedup -- not a realized saving",
+        "provenance source_type_counts show how many copies came from each parent/worker output origin (assistant_context, tool_result)",
+    ]
+
+    return ParentAggregationArtifacts(
+        enabled=True,
+        item_count=item_count,
+        artifact_body_count=len(groups),  # distinct bodies, duplicated or not
+        total_occurrences=total_occurrences,
+        duplicate_group_count=duplicate_group_count,
+        est_total_tokens=est_total_tokens,
+        est_duplicate_tokens=est_duplicate_tokens,
+        by_kind=by_kind,
+        source_type_counts=source_type_counts,
+        top_duplicate_groups=group_records[:top_n],
+        notes=notes,
+    )
+
+
 # ---------------------------------------------------------------------------
 # Build + write
 # ---------------------------------------------------------------------------
@@ -1148,6 +1505,8 @@ def build_report(
     large_output_chars: int = DEFAULT_LARGE_OUTPUT_CHARS,
     top_n: int = DEFAULT_TOP_N,
     worker_routing_shadow: bool = True,
+    parent_aggregation_shadow: bool = True,
+    min_artifact_chars: int = DEFAULT_MIN_ARTIFACT_CHARS,
 ) -> OpportunityReport:
     dups = detect_exact_duplicate_tool_outputs(tool_messages, salt=salt, top_n=top_n)
     blocks = detect_repeated_blocks(
@@ -1179,6 +1538,14 @@ def build_report(
         enabled=worker_routing_shadow,
     )
 
+    parent_aggregation = analyze_parent_aggregation_artifacts(
+        llm_contents,
+        salt=salt,
+        min_artifact_chars=min_artifact_chars,
+        top_n=top_n,
+        enabled=parent_aggregation_shadow,
+    )
+
     total_chars = sum(len(m.content) for m in tool_messages)
     dup_wasted = sum(d.est_wasted_tokens for d in dups)
     block_wasted = sum(b.est_wasted_tokens for b in blocks)
@@ -1191,6 +1558,7 @@ def build_report(
         "session 'source', 'tool_name', and block_type are emitted verbatim as low-cardinality enums, not raw text",
         "llm-bound scan covers only content sent to the LLM: system/skill prompts, active user/assistant/tool messages",
         "worker-routing section is SHADOW MODE P0: it labels blocks for a future router but never drops/summarizes context",
+        "parent-aggregation section is SHADOW MODE P0 telemetry: it groups exact artifact bodies but never dedups/replaces context",
     ]
     if all_sessions:
         notes.append("all-sessions mode: time window ignored; scanned all non-archived sessions/active messages")
@@ -1221,6 +1589,7 @@ def build_report(
         cross_type_block_groups=cross_groups,
         cross_type_wasted_tokens=cross_wasted,
         worker_routing=worker_routing,
+        parent_aggregation=parent_aggregation,
         notes=notes,
     )
 
@@ -1275,6 +1644,9 @@ def write_report(report: OpportunityReport, out_dir: Path) -> tuple[Path, Path]:
         f"- Worker routing (shadow): {report.worker_routing.classified_block_count} blocks "
         f"classified, {report.worker_routing.must_keep_block_count} must-keep, "
         f"~{report.worker_routing.est_candidate_tokens_total} advisory candidate tokens",
+        f"- Parent aggregation (shadow): {report.parent_aggregation.duplicate_group_count} "
+        f"duplicate artifact groups, "
+        f"~{report.parent_aggregation.est_duplicate_tokens} advisory duplicate tokens",
         "",
         "## LLM-bound redundancy by block type",
     ]
@@ -1378,6 +1750,46 @@ def write_report(report: OpportunityReport, out_dir: Path) -> tuple[Path, Path]:
                 f"x{cb.occurrences} chars={cb.char_length} ~candidate={cb.est_candidate_tokens}"
             )
     md.append("")
+    pa = report.parent_aggregation
+    md.append("## Parent Aggregation Artifacts — shadow mode")
+    if not pa.enabled:
+        md.append("- disabled")
+    else:
+        md.append(
+            f"- Candidate artifact items: {pa.item_count} "
+            f"(distinct bodies: {pa.artifact_body_count}, "
+            f"occurrences: {pa.total_occurrences})"
+        )
+        md.append(
+            f"- Duplicate artifact groups: {pa.duplicate_group_count} "
+            f"(~{pa.est_duplicate_tokens} advisory duplicate tokens of "
+            f"~{pa.est_total_tokens} distinct-body tokens) — NOT a realized saving, "
+            f"payloads are unchanged"
+        )
+        md.append("")
+        md.append("### By artifact kind")
+        for ks in pa.by_kind:
+            md.append(
+                f"- {ks.artifact_kind}: bodies={ks.group_count} "
+                f"occ={ks.occurrence_count} dup_groups={ks.duplicate_group_count} "
+                f"tokens={ks.est_tokens} ~dup={ks.est_duplicate_tokens}"
+            )
+        md.append("")
+        md.append("### Provenance (artifact source types)")
+        for sc in pa.source_type_counts:
+            md.append(f"- {sc.source_type}: {sc.count}")
+        md.append("")
+        md.append("### Top duplicate artifact groups (hashed)")
+        for g in pa.top_duplicate_groups:
+            spread = ", ".join(
+                f"{sc.source_type}x{sc.count}" for sc in g.source_type_counts
+            )
+            md.append(
+                f"- `{g.content_hash}` kind={g.artifact_kind} "
+                f"canonical={g.canonical_source_type} x{g.occurrences} "
+                f"({spread}) chars={g.char_length} ~dup={g.est_duplicate_tokens} tokens"
+            )
+    md.append("")
     md.append("## Notes")
     for note in report.notes:
         md.append(f"- {note}")
@@ -1423,6 +1835,17 @@ def main() -> int:
             "(P0 data collection; enabled by default, never prunes context)"
         ),
     )
+    parser.add_argument(
+        "--disable-parent-aggregation",
+        action="store_true",
+        help=(
+            "skip the shadow-mode Parent Aggregation Artifact telemetry "
+            "(P0 telemetry only; enabled by default, never dedups/replaces context)"
+        ),
+    )
+    parser.add_argument(
+        "--min-artifact-chars", type=int, default=DEFAULT_MIN_ARTIFACT_CHARS
+    )
     args = parser.parse_args()
 
     if not args.state_db.exists():
@@ -1467,6 +1890,8 @@ def main() -> int:
             large_output_chars=args.large_output_chars,
             top_n=args.top_n,
             worker_routing_shadow=not args.disable_worker_routing_shadow,
+            parent_aggregation_shadow=not args.disable_parent_aggregation,
+            min_artifact_chars=args.min_artifact_chars,
         )
         json_path, md_path = write_report(report, args.out_dir)
     except Exception as exc:  # noqa: BLE001 - cron-safe: report class only, no payload
diff --git a/tests/test_hermes_context_opportunity_analyzer.py b/tests/test_hermes_context_opportunity_analyzer.py
index 7eb04d2..fc997fc 100644
--- a/tests/test_hermes_context_opportunity_analyzer.py
+++ b/tests/test_hermes_context_opportunity_analyzer.py
@@ -704,3 +704,238 @@ def test_shadow_mode_can_be_disabled(tmp_path):
     # Disabled section still serializes safely.
     json_path, md_path = analyzer.write_report(report, tmp_path / "out")
     assert "disabled" in md_path.read_text(encoding="utf-8")
+
+
+# ---------------------------------------------------------------------------
+# Parent Aggregation Artifacts — SHADOW MODE (P0 telemetry) tests
+# ---------------------------------------------------------------------------
+
+# An artifact body must be >= DEFAULT_MIN_ARTIFACT_CHARS chars long to be a candidate.
+TEST_LOG_ARTIFACT = "pytest session: tests/test_widget.py::test_alpha PASSED\n" * 20
+
+
+def _pa_kinds(report):
+    """artifact_kind -> ArtifactKindStat for convenient assertions."""
+    return {ks.artifact_kind: ks for ks in report.parent_aggregation.by_kind}
+
+
+def test_exact_duplicate_artifacts_group_and_estimate_duplicate_tokens(tmp_path):
+    db = tmp_path / "state.db"
+    assert len(TEST_LOG_ARTIFACT) >= analyzer.DEFAULT_MIN_ARTIFACT_CHARS
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE}],
+        messages=[
+            {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"},
+            {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"},
+            {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"},
+        ],
+    )
+    report = _analyze(db, tmp_path)
+    pa = report.parent_aggregation
+
+    assert pa.enabled is True
+    assert pa.duplicate_group_count == 1
+    grp = pa.top_duplicate_groups[0]
+    assert grp.occurrences == 3
+    assert grp.artifact_kind == "test_log"
+    # Two of the three copies count toward the advisory duplicate-token estimate.
+    assert grp.est_duplicate_tokens == grp.est_tokens * 2
+    assert pa.est_duplicate_tokens == grp.est_duplicate_tokens
+
+
+def test_near_duplicate_artifacts_do_not_group(tmp_path):
+    db = tmp_path / "state.db"
+    base = "pytest run output line that is sufficiently long to be an artifact body\n" * 8
+    near = base + "X"  # one char different -> different exact hash
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE}],
+        messages=[
+            {"role": "tool", "content": base, "tool_name": "Bash"},
+            {"role": "tool", "content": near, "tool_name": "Bash"},
+        ],
+    )
+    report = _analyze(db, tmp_path)
+    pa = report.parent_aggregation
+    # Two distinct bodies, neither repeated -> zero duplicate groups.
+    assert pa.artifact_body_count == 2
+    assert pa.duplicate_group_count == 0
+    assert pa.est_duplicate_tokens == 0
+    assert pa.top_duplicate_groups == []
+
+
+def test_provenance_source_type_counts_are_emitted(tmp_path):
+    db = tmp_path / "state.db"
+    # Same exact artifact body shipped from a tool result AND assistant context.
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE}],
+        messages=[
+            {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"},
+            {"role": "assistant", "content": TEST_LOG_ARTIFACT},
+        ],
+    )
+    report = _analyze(db, tmp_path)
+    pa = report.parent_aggregation
+    grp = pa.top_duplicate_groups[0]
+
+    spread = {sc.source_type: sc.count for sc in grp.source_type_counts}
+    assert spread == {"assistant_context": 1, "tool_result": 1}
+    # Canonical source chosen deterministically (tie -> alphabetical).
+    assert grp.canonical_source_type == "assistant_context"
+    # Aggregate provenance across all candidates is also emitted.
+    agg = {sc.source_type: sc.count for sc in pa.source_type_counts}
+    assert agg == {"assistant_context": 1, "tool_result": 1}
+
+
+def test_parent_aggregation_excludes_prompt_boilerplate_sources(tmp_path):
+    db = tmp_path / "state.db"
+    prompt_like_artifact = "pytest prompt boilerplate that should not be a parent artifact\n" * 20
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE, "system_prompt": prompt_like_artifact}],
+        messages=[
+            {"role": "user", "content": prompt_like_artifact},
+            {"role": "system", "content": prompt_like_artifact},
+        ],
+    )
+    report = _analyze(db, tmp_path)
+    pa = report.parent_aggregation
+
+    assert pa.item_count == 0
+    assert pa.artifact_body_count == 0
+    assert pa.source_type_counts == []
+    assert pa.top_duplicate_groups == []
+
+
+def test_parent_aggregation_never_emits_raw_content(tmp_path):
+    db = tmp_path / "state.db"
+    secret = "PARENT-AGG-SECRET-ARTIFACT-DO-NOT-LEAK pytest detail line here\n" * 20
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE, "system_prompt": "be safe"}],
+        messages=[
+            {"role": "tool", "content": secret, "tool_name": "Bash"},
+            {"role": "tool", "content": secret, "tool_name": "Bash"},
+        ],
+    )
+    report = _analyze(db, tmp_path)
+    json_path, md_path = analyzer.write_report(report, tmp_path / "out")
+    blob = json_path.read_text(encoding="utf-8") + md_path.read_text(encoding="utf-8")
+
+    assert "PARENT-AGG-SECRET-ARTIFACT" not in blob
+    assert "PRIVATE REASONING" not in blob
+    assert "raw-session-id" not in blob
+    # The duplicate was still detected via salted hashing.
+    assert report.parent_aggregation.duplicate_group_count == 1
+    assert report.parent_aggregation.est_duplicate_tokens > 0
+
+
+def test_parent_aggregation_schema_is_deterministic(tmp_path):
+    db = tmp_path / "state.db"
+    diff_artifact = (
+        "diff --git a/foo.py b/foo.py\n@@ -1,4 +1,4 @@\n"
+        "-old line of code here that is long\n+new line of code here that is long\n" * 8
+    )
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE}],
+        messages=[
+            {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"},
+            {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"},
+            {"role": "assistant", "content": diff_artifact},
+            {"role": "assistant", "content": diff_artifact},
+        ],
+    )
+    r1 = _analyze(db, tmp_path)
+    r2 = _analyze(db, tmp_path)
+    # Identical inputs -> identical section when compared field-by-field via asdict.
+    assert dataclasses.asdict(r1.parent_aggregation) == dataclasses.asdict(
+        r2.parent_aggregation
+    )
+
+    pa = r1.parent_aggregation
+    # by_kind follows the canonical ARTIFACT_KINDS order (deterministic).
+    order = {k: i for i, k in enumerate(analyzer.ARTIFACT_KINDS)}
+    idxs = [order[ks.artifact_kind] for ks in pa.by_kind]
+    assert idxs == sorted(idxs)
+    # Every emitted kind is from the canonical low-cardinality enum.
+    assert set(order) >= {ks.artifact_kind for ks in pa.by_kind}
+    # Candidates carry only hash + enums + counters; hash is a 16-char salted prefix.
+    for g in pa.top_duplicate_groups:
+        assert len(g.content_hash) == 16
+        assert g.artifact_kind in analyzer.ARTIFACT_KINDS
+        assert g.canonical_source_type in analyzer.BLOCK_TYPES
+
+
+def test_classify_artifact_kind_is_deterministic_and_low_cardinality():
+    cases = {
+        "diff --git a/x b/x\n@@ -1 +1 @@\n-a\n+b\n": "diff",
+        "Traceback (most recent call last):\n  File x\nValueError: boom": "error_trace",
+        "## Summary\nThe worker aggregated all results successfully here.": "worker_summary",
+        "def worker_helper():\n    return 'source code should win over worker word'": "file_content",
+        "some entirely opaque blob of bytes with no recognizable structure!!": "unknown_large_block",
+    }
+    for text, expected in cases.items():
+        kind = analyzer.classify_artifact_kind(text)
+        assert kind == expected
+        assert kind in analyzer.ARTIFACT_KINDS
+
+
+def test_parent_aggregation_can_be_disabled(tmp_path):
+    db = tmp_path / "state.db"
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE}],
+        messages=[
+            {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"},
+            {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"},
+        ],
+    )
+    tool_messages = analyzer.load_tool_messages(db, since_hours=WIDE_WINDOW)
+    llm = analyzer.load_llm_bound_content(db, since_hours=WIDE_WINDOW)
+    heavy = analyzer.load_heavy_sessions(db, since_hours=WIDE_WINDOW, salt="s", top_n=5)
+    tel = analyzer.parse_telemetry(
+        tmp_path / "none.jsonl", since_hours=WIDE_WINDOW, total_input_tokens=0
+    )
+    report = analyzer.build_report(
+        date="2100-01-01",
+        since_hours=24,
+        salt="s",
+        tool_messages=tool_messages,
+        heavy_sessions=heavy,
+        telemetry=tel,
+        llm_contents=llm,
+        min_block_repeat=2,
+        parent_aggregation_shadow=False,
+    )
+    assert report.parent_aggregation.enabled is False
+    assert report.parent_aggregation.duplicate_group_count == 0
+    json_path, md_path = analyzer.write_report(report, tmp_path / "out")
+    md = md_path.read_text(encoding="utf-8")
+    assert "Parent Aggregation Artifacts — shadow mode" in md
+    assert "disabled" in md
+
+
+def test_worker_routing_intact_alongside_parent_aggregation(tmp_path):
+    """Adding parent-aggregation telemetry must not disturb worker routing."""
+    db = tmp_path / "state.db"
+    big_repeated = "row of unrelated build log output number 7 with filler text " * 200
+    assert len(big_repeated) >= LARGE
+    _make_db_ex(
+        db,
+        sessions=[{"id": "s1", "started_at": FAR_FUTURE}],
+        messages=[
+            {"role": "tool", "content": big_repeated, "tool_name": "Bash"},
+            {"role": "tool", "content": big_repeated, "tool_name": "Bash"},
+            {"role": "tool", "content": big_repeated, "tool_name": "Bash"},
+        ],
+    )
+    report = _analyze(db, tmp_path)
+    # Worker routing still classifies the large repeated block as a drop candidate.
+    rm = _route_map(report)
+    assert "likely_drop_candidate" in rm
+    assert report.worker_routing.est_drop_candidate_tokens > 0
+    # And parent aggregation independently sees the same body as a duplicate.
+    assert report.parent_aggregation.duplicate_group_count == 1

From 957d04ae097668b1c1b08cdd66df799cbaa3f064 Mon Sep 17 00:00:00 2001
From: root <root@vmi3142307.contaboserver.net>
Date: Fri, 12 Jun 2026 14:47:49 +0200
Subject: [PATCH 8/9] docs: add ContextPilot self-evolve skill

---
 docs/guides/hermes-monitor.md                |  29 ++
 docs/guides/hermes.md                        |   9 +
 skills/contextpilot-self-evolve/SKILL.md     | 274 +++++++++++++++++++
 tests/test_contextpilot_self_evolve_skill.py | 112 ++++++++
 4 files changed, 424 insertions(+)
 create mode 100644 skills/contextpilot-self-evolve/SKILL.md
 create mode 100644 tests/test_contextpilot_self_evolve_skill.py

diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md
index 4aa9128..a851fa3 100644
--- a/docs/guides/hermes-monitor.md
+++ b/docs/guides/hermes-monitor.md
@@ -184,6 +184,35 @@ gate below before changing ContextPilot config or code. A defensive guard in
 `write_report` refuses to emit any forbidden raw-content key, so the reports are
 safe to ship from an unattended cron job.
 
+## Self-evolve skill (Hermes)
+
+The monitor and analyzer above are bundled into a reusable Hermes skill so users
+can run the same safe loop — install/enable ContextPilot, collect
+telemetry/shadow data, analyze realized savings vs advisory candidates, and
+propose improvements under strict safety gates:
+
+- Skill path: `skills/contextpilot-self-evolve/SKILL.md`
+
+The skill is **proposal-only**: it never auto-applies context routing, dropping,
+or summarization, and it enforces the same privacy boundary (reports never emit
+raw conversation/tool/system text, reasoning, or raw session ids).
+
+To use it in Hermes, copy or load the skill into your Hermes skills directory,
+then invoke it by name:
+
+```bash
+# copy into your Hermes skills directory (adjust path to your install)
+mkdir -p ~/.hermes/skills/contextpilot-self-evolve
+cp skills/contextpilot-self-evolve/SKILL.md \
+   ~/.hermes/skills/contextpilot-self-evolve/SKILL.md
+```
+
+Then ask Hermes to run the **contextpilot-self-evolve** skill. It walks through
+install/verify, the metadata-only monitor, the content-aware analyzer for
+`--since-hours 24` and `168`, interpretation of realized vs advisory tokens,
+optional read-only cron jobs, and the branch/tests/privacy/independent-review
+gate required before any code or config change ships.
+
 ## Accuracy gate
 
 This monitor only measures token/cost savings and operational signals. Before shipping ContextPilot changes, run a fixed golden eval set and require:
diff --git a/docs/guides/hermes.md b/docs/guides/hermes.md
index d13f293..7761697 100644
--- a/docs/guides/hermes.md
+++ b/docs/guides/hermes.md
@@ -97,6 +97,15 @@ Hermes ships with `ContextCompressor`, a threshold-based LLM-summarization engin
 
 ContextPilot runs *before* the threshold-based compressor, reducing how often the expensive summarization path is hit.
 
+## Self-evolve skill
+
+For a guided, safety-gated loop that installs ContextPilot, monitors its real
+token savings, scans for context-reduction opportunities, and proposes
+improvements (without auto-applying risky changes), use the bundled Hermes
+skill at `skills/contextpilot-self-evolve/SKILL.md`. See
+[`hermes-monitor.md`](./hermes-monitor.md#self-evolve-skill-hermes) for how to
+copy/load and invoke it.
+
 ## Troubleshooting
 
 **Plugin not discovered after install.** Check `~/.hermes/plugins/ContextPilot/plugin.yaml` exists and contains `type: context_engine`. Run `hermes plugins list` to confirm.
diff --git a/skills/contextpilot-self-evolve/SKILL.md b/skills/contextpilot-self-evolve/SKILL.md
new file mode 100644
index 0000000..d88b81b
--- /dev/null
+++ b/skills/contextpilot-self-evolve/SKILL.md
@@ -0,0 +1,274 @@
+---
+name: contextpilot-self-evolve
+description: Use when a user wants to install/enable ContextPilot inside Hermes Agent and then run a safe, repeatable "self-evolve" loop — collect metadata-only telemetry and content-aware shadow data, analyze realized token savings vs advisory candidate tokens, and propose ContextPilot improvements under strict safety gates. Use it for monitoring token spend, scanning context-redundancy opportunities, setting up read-only daily/weekly cron analysis, and preparing reviewed, branch-gated code/config changes. Do NOT use it to auto-apply routing/drop/summarization changes; this skill only proposes risky changes and requires tests, privacy checks, and independent review before anything ships.
+version: 1.0.0
+author: ContextPilot
+license: MIT
+metadata:
+  hermes:
+    tags: [contextpilot, hermes, telemetry, context-optimization, token-savings, safety-gated]
+    related_skills: []
+    category: observability
+    safety: proposal-only
+---
+
+# ContextPilot Self-Evolve (Hermes)
+
+This skill drives a **safe, repeatable** loop for running ContextPilot inside
+Hermes Agent and continuously improving it from real telemetry — **without**
+auto-applying any risky context change. You measure, you analyze, you *propose*;
+a human (plus tests, privacy checks, and independent review) decides what ships.
+
+> Core safety stance: **observe and propose only.** This skill never enables
+> context routing, dropping, or summarization on its own. Shadow/advisory
+> numbers are training/eval data, **not** realized savings, and must never be
+> treated as something to "just turn on."
+
+## When to use this skill
+
+- A user asks to install or enable ContextPilot in Hermes and watch its impact.
+- A user wants to know how many tokens/cost ContextPilot is actually saving.
+- A user wants to find token-reduction *opportunities* (duplicate tool outputs,
+  cross-role repeated blocks, oversized tool results, routing/dedup candidates).
+- A user wants a daily/weekly read-only cron that reports savings + opportunities.
+- A user wants to propose a ContextPilot config or code change and needs the safe
+  workflow (branch, tests, privacy/no-raw-content checks, independent review).
+
+If the user instead wants the low-level integration mechanics, point them at
+`docs/guides/hermes.md`; for the metadata-only monitor details, see
+`docs/guides/hermes-monitor.md`. This skill orchestrates both into one loop.
+
+## Privacy boundary (read this first)
+
+There are two analysis tools with **different** read scopes:
+
+- `scripts/hermes_contextpilot_monitor.py` — **metadata only**. Never reads
+  `messages.content`, `sessions.system_prompt`, reasoning, or raw tool payloads.
+- `scripts/analyze_hermes_context_opportunities.py` — **content-aware**. It
+  *may* read message/tool/system content **in-memory** to compute salted
+  SHA-256 fingerprints and aggregate counters.
+
+In **both** cases the rule is absolute: **reports must never emit raw
+conversation text, tool-call payloads, system prompts, reasoning, or raw session
+ids.** Session ids appear only as salted hashes. The analyzer has a defensive
+`write_report` guard that refuses to emit forbidden raw-content keys; do not
+weaken or bypass it. If you are ever unsure whether an output is safe to ship,
+treat it as unsafe and stop.
+
+## Workflow
+
+### Step 1 — Install / enable ContextPilot in Hermes
+
+Normal install (do **not** use `--force`):
+
+```bash
+hermes plugins install EfficientContext/ContextPilot --enable
+hermes config set context.engine contextpilot
+```
+
+`--force` is **only** for an intentional update/reinstall over an existing
+install — never as the default:
+
+```bash
+hermes plugins install EfficientContext/ContextPilot --enable --force
+```
+
+If your Hermes version does not support `--enable`, install first and then use the
+plugin menu:
+
+```bash
+hermes plugins            # General Plugins -> toggle "contextpilot" enabled
+```
+
+### Step 2 — Verify the context engine + restart
+
+Confirm Hermes is actually routing through ContextPilot. The active context
+engine must be `contextpilot`:
+
+```yaml
+# ~/.hermes config
+context:
+  engine: contextpilot
+```
+
+```python
+from hermes_cli.plugins import get_plugin_manager
+engine = get_plugin_manager()._context_engine
+print(engine.get_status())   # expect {'engine': 'contextpilot', ...}
+```
+
+Then **restart the Hermes gateway / start a fresh session** so the engine is
+loaded. On startup you should see:
+
+```
+Plugin 'contextpilot' registered context engine: contextpilot
+```
+
+> The context-engine TUI submenu may show "contextpilot (not found)" — that is
+> cosmetic; `get_status()` is the source of truth.
+
+### Step 3 — Run the metadata-only monitor
+
+Use this as the safe baseline. It reports realized savings and operational
+signals from telemetry/metadata only:
+
+```bash
+python scripts/hermes_contextpilot_monitor.py \
+  --out-dir ~/contextpilot/reports \
+  --since-hours 24 \
+  --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl
+```
+
+Reports:
+
+- `~/contextpilot/reports/daily_YYYY-MM-DD.json`
+- `~/contextpilot/reports/daily_YYYY-MM-DD.md`
+
+The telemetry file is written by the ContextPilot Hermes plugin when savings
+occur. `CONTEXTPILOT_DISABLE_TELEMETRY=1` disables writes;
+`CONTEXTPILOT_TELEMETRY_FILE=/path` overrides the location.
+
+### Step 4 — Run the content-aware opportunity analyzer
+
+Run for both a rolling day and a rolling week to separate noise from trend:
+
+```bash
+# last 24h
+python scripts/analyze_hermes_context_opportunities.py \
+  --state-db ~/.hermes/state.db \
+  --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl \
+  --out-dir ~/contextpilot/opportunities \
+  --since-hours 24
+
+# last 7 days (168h)
+python scripts/analyze_hermes_context_opportunities.py \
+  --state-db ~/.hermes/state.db \
+  --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl \
+  --out-dir ~/contextpilot/opportunities \
+  --since-hours 168
+```
+
+For a one-shot whole-history audit, swap the window for `--all-sessions`.
+Reports:
+
+- `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.json`
+- `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.md`
+
+The analyzer surfaces: exact duplicate tool outputs, repeated line/block
+fingerprints, large outputs by `tool_name`, heavy sessions (hashed ids),
+ContextPilot telemetry coverage/ratios, **LLM-bound cross-type repeated
+blocks**, **Worker Context Routing shadow labels**, and **Parent Aggregation
+Artifact** dedup telemetry. The shadow/parent sections are **on by default** and
+collect P0 data only; pass `--disable-worker-routing-shadow` or
+`--disable-parent-aggregation` to omit a section.
+
+### Step 5 — Interpret: realized savings vs advisory candidates
+
+Keep these two numbers in separate mental buckets — never add them together:
+
+- **Realized savings** (telemetry: `chars_saved`, `~tokens`, savings ratio,
+  monitor report) — what ContextPilot *actually* saved via lossless dedup +
+  reorder. This is real and bankable.
+- **Advisory / shadow candidate tokens** (analyzer: routing-shadow
+  `est_advisory_candidate_tokens`, parent-aggregation `est_duplicate_tokens`,
+  cross-type redundant tokens) — an **upper-bound estimate** of what a *future*
+  router/dedup *might* save. **Not realized.** It is training/eval data, and
+  every token estimate is a heuristic (`chars/4`).
+
+When reporting to the user, state realized savings as fact and label every
+advisory number as a candidate that still needs validation. Do not imply that
+advisory tokens are available simply by toggling a flag.
+
+### Step 6 — Optional read-only cron jobs
+
+Schedule the monitor and/or analyzer as **read-only watchdogs**. They produce
+reports; they must not apply config or code changes.
+
+```python
+cronjob(
+    action="create",
+    name="contextpilot-self-evolve-daily",
+    schedule="0 4 * * *",
+    repeat=7,
+    deliver="origin",
+    enabled_toolsets=["terminal", "file"],
+    prompt="""
+Run /root/work/ContextPilot/scripts/hermes_contextpilot_monitor.py with
+--out-dir /root/contextpilot/reports --since-hours 24, then run
+analyze_hermes_context_opportunities.py with --since-hours 24. Read today's
+Markdown reports and send a short summary: realized token savings, session
+count, whether ContextPilot events were observed, and the top advisory
+opportunities (clearly labeled as candidates, not realized). Do NOT read raw
+conversation content. Do NOT modify source/config.
+""",
+)
+```
+
+For a weekly trend, add a second job with `--since-hours 168` on a `0 5 * * 1`
+schedule. Both stay strictly read-only.
+
+### Step 7 — Propose improvements (do NOT auto-apply risky changes)
+
+From the reports, write a prioritized proposal. **Never** auto-enable context
+**routing**, context **dropping**, or **summarization** based on shadow numbers.
+Those are high-recall-sensitive changes that can silently drop needed context;
+they require the accuracy gate plus human sign-off.
+
+Before any ContextPilot change ships, run a fixed golden eval set and require:
+
+- no task-success regression,
+- no drop in context recall beyond the chosen threshold,
+- no unsafe raw-content leakage in reports,
+- no increase in failed tool calls.
+
+If any gate fails, hold the proposal and require human review.
+
+### Step 8 — Safe path for code/config changes
+
+For anything beyond a read-only report, follow this gate every time:
+
+1. **Branch.** Make changes on a dedicated branch; never on `main`. No
+   destructive git operations, no commit/push unless the user explicitly asks.
+2. **Tests.** Add/extend tests and run the relevant suite (see below). A change
+   to analysis or routing logic must ship with coverage.
+3. **Privacy / no-raw-content check.** Re-confirm no report path can emit raw
+   conversation/tool/system text, reasoning, or raw session ids. Keep the
+   `write_report` forbidden-key guard intact.
+4. **Independent review.** Get a second, independent review (human or a separate
+   reviewing agent) focused on correctness, recall safety, and privacy before
+   merge.
+
+### Optional — delegated coding + independent verification
+
+If the user has a coding-agent workflow, you may delegate the *implementation*
+of an approved proposal to a coding agent (e.g. Claude Code) on a branch, and
+then run **independent verification** in Hermes (re-run tests, the privacy
+guard, and the accuracy gate) rather than trusting the author's own check. This
+two-party split (one writes, another verifies) is recommended but generic — any
+"author + independent reviewer" arrangement satisfies the gate. The skill itself
+never merges; a human approves.
+
+## Report locations (quick reference)
+
+| Tool | Scope | Default output |
+|------|-------|----------------|
+| `hermes_contextpilot_monitor.py` | metadata only | `~/contextpilot/reports/daily_YYYY-MM-DD.{json,md}` |
+| `analyze_hermes_context_opportunities.py` | content-aware (hashes only in reports) | `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.{json,md}` |
+
+## Relevant tests
+
+```bash
+python -m pytest tests/test_hermes_contextpilot_monitor.py \
+  tests/test_hermes_context_opportunity_analyzer.py \
+  tests/test_contextpilot_self_evolve_skill.py -q
+```
+
+## Hard rules (never violate)
+
+- Observe and **propose** only — never auto-apply routing/drop/summarization.
+- Reports never contain raw conversation/tool/system text, reasoning, or raw
+  session ids; session ids are salted hashes only.
+- Realized savings and advisory/shadow candidate tokens are reported separately.
+- `--force` install only for an intentional update/reinstall.
+- Code/config changes require: branch, tests, privacy check, independent review.
+- No destructive git operations; no commit/push unless the user asks.
diff --git a/tests/test_contextpilot_self_evolve_skill.py b/tests/test_contextpilot_self_evolve_skill.py
new file mode 100644
index 0000000..5da09c4
--- /dev/null
+++ b/tests/test_contextpilot_self_evolve_skill.py
@@ -0,0 +1,112 @@
+"""Validation tests for the contextpilot-self-evolve Hermes skill.
+
+These guard the SKILL.md packaging contract: valid YAML frontmatter, required
+Hermes fields, size limits, and the presence of the safety/privacy phrases that
+make this skill safe to ship (it must stay proposal-only and never promise to
+auto-apply risky context changes).
+"""
+from pathlib import Path
+
+import pytest
+
+import yaml
+
+
+SKILL_PATH = (
+    Path(__file__).resolve().parents[1]
+    / "skills"
+    / "contextpilot-self-evolve"
+    / "SKILL.md"
+)
+
+MAX_SKILL_CHARS = 100_000
+MAX_DESCRIPTION_CHARS = 1024
+
+
+def _read_skill():
+    text = SKILL_PATH.read_text(encoding="utf-8")
+    assert text.startswith("---\n"), "SKILL.md must start with YAML frontmatter"
+    # Split on the closing frontmatter fence.
+    _, frontmatter, body = text.split("---\n", 2)
+    meta = yaml.safe_load(frontmatter)
+    return text, meta, body
+
+
+def test_skill_file_exists():
+    assert SKILL_PATH.is_file(), f"missing skill file: {SKILL_PATH}"
+
+
+def test_skill_size_under_limit():
+    text = SKILL_PATH.read_text(encoding="utf-8")
+    assert len(text) <= MAX_SKILL_CHARS, (
+        f"SKILL.md is {len(text)} chars, exceeds {MAX_SKILL_CHARS}"
+    )
+
+
+def test_frontmatter_parses_and_has_required_fields():
+    _, meta, _ = _read_skill()
+    assert isinstance(meta, dict), "frontmatter must parse to a mapping"
+    for field in ("name", "description", "version", "author", "license", "metadata"):
+        assert field in meta, f"frontmatter missing required field: {field}"
+
+
+def test_name_matches():
+    _, meta, _ = _read_skill()
+    assert meta["name"] == "contextpilot-self-evolve"
+
+
+def test_description_is_use_when_and_within_limit():
+    _, meta, _ = _read_skill()
+    description = meta["description"]
+    assert isinstance(description, str) and description.strip()
+    assert description.lstrip().lower().startswith("use when"), (
+        "description should start with 'Use when' per Hermes convention"
+    )
+    assert len(description) <= MAX_DESCRIPTION_CHARS, (
+        f"description is {len(description)} chars, exceeds {MAX_DESCRIPTION_CHARS}"
+    )
+
+
+def test_metadata_has_tags():
+    _, meta, _ = _read_skill()
+    metadata = meta["metadata"]
+    assert isinstance(metadata, dict)
+    hermes_meta = metadata.get("hermes")
+    assert isinstance(hermes_meta, dict), "metadata.hermes must be present"
+    assert hermes_meta.get("tags"), "metadata.hermes.tags must be a non-empty list"
+    assert isinstance(hermes_meta["tags"], list)
+
+
+@pytest.mark.parametrize(
+    "phrase",
+    [
+        # proposal-only / no auto-apply of risky changes
+        "propose",
+        "independent review",
+        # privacy boundary
+        "raw",
+        "salted",
+        "session ids",
+        # realized vs advisory separation
+        "advisory",
+        "realized",
+        # safe install convention
+        "--force",
+        # change-gate requirements
+        "branch",
+        "tests",
+    ],
+)
+def test_required_safety_phrases_present(phrase):
+    text, _, _ = _read_skill()
+    assert phrase.lower() in text.lower(), f"SKILL.md missing safety phrase: {phrase!r}"
+
+
+def test_does_not_promise_auto_apply():
+    """The skill must keep its proposal-only stance for risky changes."""
+    text, _, _ = _read_skill()
+    lowered = text.lower()
+    # Must explicitly disclaim auto-applying routing/drop/summarization.
+    assert "never auto-apply" in lowered or "do not auto-apply" in lowered or (
+        "not" in lowered and "auto-enable" in lowered
+    ), "SKILL.md must state it never auto-applies risky context changes"

From 746b7d4d4d31a24c162d8bafb621fdcf439c712a Mon Sep 17 00:00:00 2001
From: root <root@vmi3142307.contaboserver.net>
Date: Fri, 12 Jun 2026 15:09:51 +0200
Subject: [PATCH 9/9] revert: remove user-facing self-evolve skill

---
 docs/guides/hermes-monitor.md                |  29 --
 docs/guides/hermes.md                        |   9 -
 skills/contextpilot-self-evolve/SKILL.md     | 274 -------------------
 tests/test_contextpilot_self_evolve_skill.py | 112 --------
 4 files changed, 424 deletions(-)
 delete mode 100644 skills/contextpilot-self-evolve/SKILL.md
 delete mode 100644 tests/test_contextpilot_self_evolve_skill.py

diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md
index a851fa3..4aa9128 100644
--- a/docs/guides/hermes-monitor.md
+++ b/docs/guides/hermes-monitor.md
@@ -184,35 +184,6 @@ gate below before changing ContextPilot config or code. A defensive guard in
 `write_report` refuses to emit any forbidden raw-content key, so the reports are
 safe to ship from an unattended cron job.
 
-## Self-evolve skill (Hermes)
-
-The monitor and analyzer above are bundled into a reusable Hermes skill so users
-can run the same safe loop — install/enable ContextPilot, collect
-telemetry/shadow data, analyze realized savings vs advisory candidates, and
-propose improvements under strict safety gates:
-
-- Skill path: `skills/contextpilot-self-evolve/SKILL.md`
-
-The skill is **proposal-only**: it never auto-applies context routing, dropping,
-or summarization, and it enforces the same privacy boundary (reports never emit
-raw conversation/tool/system text, reasoning, or raw session ids).
-
-To use it in Hermes, copy or load the skill into your Hermes skills directory,
-then invoke it by name:
-
-```bash
-# copy into your Hermes skills directory (adjust path to your install)
-mkdir -p ~/.hermes/skills/contextpilot-self-evolve
-cp skills/contextpilot-self-evolve/SKILL.md \
-   ~/.hermes/skills/contextpilot-self-evolve/SKILL.md
-```
-
-Then ask Hermes to run the **contextpilot-self-evolve** skill. It walks through
-install/verify, the metadata-only monitor, the content-aware analyzer for
-`--since-hours 24` and `168`, interpretation of realized vs advisory tokens,
-optional read-only cron jobs, and the branch/tests/privacy/independent-review
-gate required before any code or config change ships.
-
 ## Accuracy gate
 
 This monitor only measures token/cost savings and operational signals. Before shipping ContextPilot changes, run a fixed golden eval set and require:
diff --git a/docs/guides/hermes.md b/docs/guides/hermes.md
index 7761697..d13f293 100644
--- a/docs/guides/hermes.md
+++ b/docs/guides/hermes.md
@@ -97,15 +97,6 @@ Hermes ships with `ContextCompressor`, a threshold-based LLM-summarization engin
 
 ContextPilot runs *before* the threshold-based compressor, reducing how often the expensive summarization path is hit.
 
-## Self-evolve skill
-
-For a guided, safety-gated loop that installs ContextPilot, monitors its real
-token savings, scans for context-reduction opportunities, and proposes
-improvements (without auto-applying risky changes), use the bundled Hermes
-skill at `skills/contextpilot-self-evolve/SKILL.md`. See
-[`hermes-monitor.md`](./hermes-monitor.md#self-evolve-skill-hermes) for how to
-copy/load and invoke it.
-
 ## Troubleshooting
 
 **Plugin not discovered after install.** Check `~/.hermes/plugins/ContextPilot/plugin.yaml` exists and contains `type: context_engine`. Run `hermes plugins list` to confirm.
diff --git a/skills/contextpilot-self-evolve/SKILL.md b/skills/contextpilot-self-evolve/SKILL.md
deleted file mode 100644
index d88b81b..0000000
--- a/skills/contextpilot-self-evolve/SKILL.md
+++ /dev/null
@@ -1,274 +0,0 @@
----
-name: contextpilot-self-evolve
-description: Use when a user wants to install/enable ContextPilot inside Hermes Agent and then run a safe, repeatable "self-evolve" loop — collect metadata-only telemetry and content-aware shadow data, analyze realized token savings vs advisory candidate tokens, and propose ContextPilot improvements under strict safety gates. Use it for monitoring token spend, scanning context-redundancy opportunities, setting up read-only daily/weekly cron analysis, and preparing reviewed, branch-gated code/config changes. Do NOT use it to auto-apply routing/drop/summarization changes; this skill only proposes risky changes and requires tests, privacy checks, and independent review before anything ships.
-version: 1.0.0
-author: ContextPilot
-license: MIT
-metadata:
-  hermes:
-    tags: [contextpilot, hermes, telemetry, context-optimization, token-savings, safety-gated]
-    related_skills: []
-    category: observability
-    safety: proposal-only
----
-
-# ContextPilot Self-Evolve (Hermes)
-
-This skill drives a **safe, repeatable** loop for running ContextPilot inside
-Hermes Agent and continuously improving it from real telemetry — **without**
-auto-applying any risky context change. You measure, you analyze, you *propose*;
-a human (plus tests, privacy checks, and independent review) decides what ships.
-
-> Core safety stance: **observe and propose only.** This skill never enables
-> context routing, dropping, or summarization on its own. Shadow/advisory
-> numbers are training/eval data, **not** realized savings, and must never be
-> treated as something to "just turn on."
-
-## When to use this skill
-
-- A user asks to install or enable ContextPilot in Hermes and watch its impact.
-- A user wants to know how many tokens/cost ContextPilot is actually saving.
-- A user wants to find token-reduction *opportunities* (duplicate tool outputs,
-  cross-role repeated blocks, oversized tool results, routing/dedup candidates).
-- A user wants a daily/weekly read-only cron that reports savings + opportunities.
-- A user wants to propose a ContextPilot config or code change and needs the safe
-  workflow (branch, tests, privacy/no-raw-content checks, independent review).
-
-If the user instead wants the low-level integration mechanics, point them at
-`docs/guides/hermes.md`; for the metadata-only monitor details, see
-`docs/guides/hermes-monitor.md`. This skill orchestrates both into one loop.
-
-## Privacy boundary (read this first)
-
-There are two analysis tools with **different** read scopes:
-
-- `scripts/hermes_contextpilot_monitor.py` — **metadata only**. Never reads
-  `messages.content`, `sessions.system_prompt`, reasoning, or raw tool payloads.
-- `scripts/analyze_hermes_context_opportunities.py` — **content-aware**. It
-  *may* read message/tool/system content **in-memory** to compute salted
-  SHA-256 fingerprints and aggregate counters.
-
-In **both** cases the rule is absolute: **reports must never emit raw
-conversation text, tool-call payloads, system prompts, reasoning, or raw session
-ids.** Session ids appear only as salted hashes. The analyzer has a defensive
-`write_report` guard that refuses to emit forbidden raw-content keys; do not
-weaken or bypass it. If you are ever unsure whether an output is safe to ship,
-treat it as unsafe and stop.
-
-## Workflow
-
-### Step 1 — Install / enable ContextPilot in Hermes
-
-Normal install (do **not** use `--force`):
-
-```bash
-hermes plugins install EfficientContext/ContextPilot --enable
-hermes config set context.engine contextpilot
-```
-
-`--force` is **only** for an intentional update/reinstall over an existing
-install — never as the default:
-
-```bash
-hermes plugins install EfficientContext/ContextPilot --enable --force
-```
-
-If your Hermes version does not support `--enable`, install first and then use the
-plugin menu:
-
-```bash
-hermes plugins            # General Plugins -> toggle "contextpilot" enabled
-```
-
-### Step 2 — Verify the context engine + restart
-
-Confirm Hermes is actually routing through ContextPilot. The active context
-engine must be `contextpilot`:
-
-```yaml
-# ~/.hermes config
-context:
-  engine: contextpilot
-```
-
-```python
-from hermes_cli.plugins import get_plugin_manager
-engine = get_plugin_manager()._context_engine
-print(engine.get_status())   # expect {'engine': 'contextpilot', ...}
-```
-
-Then **restart the Hermes gateway / start a fresh session** so the engine is
-loaded. On startup you should see:
-
-```
-Plugin 'contextpilot' registered context engine: contextpilot
-```
-
-> The context-engine TUI submenu may show "contextpilot (not found)" — that is
-> cosmetic; `get_status()` is the source of truth.
-
-### Step 3 — Run the metadata-only monitor
-
-Use this as the safe baseline. It reports realized savings and operational
-signals from telemetry/metadata only:
-
-```bash
-python scripts/hermes_contextpilot_monitor.py \
-  --out-dir ~/contextpilot/reports \
-  --since-hours 24 \
-  --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl
-```
-
-Reports:
-
-- `~/contextpilot/reports/daily_YYYY-MM-DD.json`
-- `~/contextpilot/reports/daily_YYYY-MM-DD.md`
-
-The telemetry file is written by the ContextPilot Hermes plugin when savings
-occur. `CONTEXTPILOT_DISABLE_TELEMETRY=1` disables writes;
-`CONTEXTPILOT_TELEMETRY_FILE=/path` overrides the location.
-
-### Step 4 — Run the content-aware opportunity analyzer
-
-Run for both a rolling day and a rolling week to separate noise from trend:
-
-```bash
-# last 24h
-python scripts/analyze_hermes_context_opportunities.py \
-  --state-db ~/.hermes/state.db \
-  --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl \
-  --out-dir ~/contextpilot/opportunities \
-  --since-hours 24
-
-# last 7 days (168h)
-python scripts/analyze_hermes_context_opportunities.py \
-  --state-db ~/.hermes/state.db \
-  --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl \
-  --out-dir ~/contextpilot/opportunities \
-  --since-hours 168
-```
-
-For a one-shot whole-history audit, swap the window for `--all-sessions`.
-Reports:
-
-- `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.json`
-- `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.md`
-
-The analyzer surfaces: exact duplicate tool outputs, repeated line/block
-fingerprints, large outputs by `tool_name`, heavy sessions (hashed ids),
-ContextPilot telemetry coverage/ratios, **LLM-bound cross-type repeated
-blocks**, **Worker Context Routing shadow labels**, and **Parent Aggregation
-Artifact** dedup telemetry. The shadow/parent sections are **on by default** and
-collect P0 data only; pass `--disable-worker-routing-shadow` or
-`--disable-parent-aggregation` to omit a section.
-
-### Step 5 — Interpret: realized savings vs advisory candidates
-
-Keep these two numbers in separate mental buckets — never add them together:
-
-- **Realized savings** (telemetry: `chars_saved`, `~tokens`, savings ratio,
-  monitor report) — what ContextPilot *actually* saved via lossless dedup +
-  reorder. This is real and bankable.
-- **Advisory / shadow candidate tokens** (analyzer: routing-shadow
-  `est_advisory_candidate_tokens`, parent-aggregation `est_duplicate_tokens`,
-  cross-type redundant tokens) — an **upper-bound estimate** of what a *future*
-  router/dedup *might* save. **Not realized.** It is training/eval data, and
-  every token estimate is a heuristic (`chars/4`).
-
-When reporting to the user, state realized savings as fact and label every
-advisory number as a candidate that still needs validation. Do not imply that
-advisory tokens are available simply by toggling a flag.
-
-### Step 6 — Optional read-only cron jobs
-
-Schedule the monitor and/or analyzer as **read-only watchdogs**. They produce
-reports; they must not apply config or code changes.
-
-```python
-cronjob(
-    action="create",
-    name="contextpilot-self-evolve-daily",
-    schedule="0 4 * * *",
-    repeat=7,
-    deliver="origin",
-    enabled_toolsets=["terminal", "file"],
-    prompt="""
-Run /root/work/ContextPilot/scripts/hermes_contextpilot_monitor.py with
---out-dir /root/contextpilot/reports --since-hours 24, then run
-analyze_hermes_context_opportunities.py with --since-hours 24. Read today's
-Markdown reports and send a short summary: realized token savings, session
-count, whether ContextPilot events were observed, and the top advisory
-opportunities (clearly labeled as candidates, not realized). Do NOT read raw
-conversation content. Do NOT modify source/config.
-""",
-)
-```
-
-For a weekly trend, add a second job with `--since-hours 168` on a `0 5 * * 1`
-schedule. Both stay strictly read-only.
-
-### Step 7 — Propose improvements (do NOT auto-apply risky changes)
-
-From the reports, write a prioritized proposal. **Never** auto-enable context
-**routing**, context **dropping**, or **summarization** based on shadow numbers.
-Those are high-recall-sensitive changes that can silently drop needed context;
-they require the accuracy gate plus human sign-off.
-
-Before any ContextPilot change ships, run a fixed golden eval set and require:
-
-- no task-success regression,
-- no drop in context recall beyond the chosen threshold,
-- no unsafe raw-content leakage in reports,
-- no increase in failed tool calls.
-
-If any gate fails, hold the proposal and require human review.
-
-### Step 8 — Safe path for code/config changes
-
-For anything beyond a read-only report, follow this gate every time:
-
-1. **Branch.** Make changes on a dedicated branch; never on `main`. No
-   destructive git operations, no commit/push unless the user explicitly asks.
-2. **Tests.** Add/extend tests and run the relevant suite (see below). A change
-   to analysis or routing logic must ship with coverage.
-3. **Privacy / no-raw-content check.** Re-confirm no report path can emit raw
-   conversation/tool/system text, reasoning, or raw session ids. Keep the
-   `write_report` forbidden-key guard intact.
-4. **Independent review.** Get a second, independent review (human or a separate
-   reviewing agent) focused on correctness, recall safety, and privacy before
-   merge.
-
-### Optional — delegated coding + independent verification
-
-If the user has a coding-agent workflow, you may delegate the *implementation*
-of an approved proposal to a coding agent (e.g. Claude Code) on a branch, and
-then run **independent verification** in Hermes (re-run tests, the privacy
-guard, and the accuracy gate) rather than trusting the author's own check. This
-two-party split (one writes, another verifies) is recommended but generic — any
-"author + independent reviewer" arrangement satisfies the gate. The skill itself
-never merges; a human approves.
-
-## Report locations (quick reference)
-
-| Tool | Scope | Default output |
-|------|-------|----------------|
-| `hermes_contextpilot_monitor.py` | metadata only | `~/contextpilot/reports/daily_YYYY-MM-DD.{json,md}` |
-| `analyze_hermes_context_opportunities.py` | content-aware (hashes only in reports) | `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.{json,md}` |
-
-## Relevant tests
-
-```bash
-python -m pytest tests/test_hermes_contextpilot_monitor.py \
-  tests/test_hermes_context_opportunity_analyzer.py \
-  tests/test_contextpilot_self_evolve_skill.py -q
-```
-
-## Hard rules (never violate)
-
-- Observe and **propose** only — never auto-apply routing/drop/summarization.
-- Reports never contain raw conversation/tool/system text, reasoning, or raw
-  session ids; session ids are salted hashes only.
-- Realized savings and advisory/shadow candidate tokens are reported separately.
-- `--force` install only for an intentional update/reinstall.
-- Code/config changes require: branch, tests, privacy check, independent review.
-- No destructive git operations; no commit/push unless the user asks.
diff --git a/tests/test_contextpilot_self_evolve_skill.py b/tests/test_contextpilot_self_evolve_skill.py
deleted file mode 100644
index 5da09c4..0000000
--- a/tests/test_contextpilot_self_evolve_skill.py
+++ /dev/null
@@ -1,112 +0,0 @@
-"""Validation tests for the contextpilot-self-evolve Hermes skill.
-
-These guard the SKILL.md packaging contract: valid YAML frontmatter, required
-Hermes fields, size limits, and the presence of the safety/privacy phrases that
-make this skill safe to ship (it must stay proposal-only and never promise to
-auto-apply risky context changes).
-"""
-from pathlib import Path
-
-import pytest
-
-import yaml
-
-
-SKILL_PATH = (
-    Path(__file__).resolve().parents[1]
-    / "skills"
-    / "contextpilot-self-evolve"
-    / "SKILL.md"
-)
-
-MAX_SKILL_CHARS = 100_000
-MAX_DESCRIPTION_CHARS = 1024
-
-
-def _read_skill():
-    text = SKILL_PATH.read_text(encoding="utf-8")
-    assert text.startswith("---\n"), "SKILL.md must start with YAML frontmatter"
-    # Split on the closing frontmatter fence.
-    _, frontmatter, body = text.split("---\n", 2)
-    meta = yaml.safe_load(frontmatter)
-    return text, meta, body
-
-
-def test_skill_file_exists():
-    assert SKILL_PATH.is_file(), f"missing skill file: {SKILL_PATH}"
-
-
-def test_skill_size_under_limit():
-    text = SKILL_PATH.read_text(encoding="utf-8")
-    assert len(text) <= MAX_SKILL_CHARS, (
-        f"SKILL.md is {len(text)} chars, exceeds {MAX_SKILL_CHARS}"
-    )
-
-
-def test_frontmatter_parses_and_has_required_fields():
-    _, meta, _ = _read_skill()
-    assert isinstance(meta, dict), "frontmatter must parse to a mapping"
-    for field in ("name", "description", "version", "author", "license", "metadata"):
-        assert field in meta, f"frontmatter missing required field: {field}"
-
-
-def test_name_matches():
-    _, meta, _ = _read_skill()
-    assert meta["name"] == "contextpilot-self-evolve"
-
-
-def test_description_is_use_when_and_within_limit():
-    _, meta, _ = _read_skill()
-    description = meta["description"]
-    assert isinstance(description, str) and description.strip()
-    assert description.lstrip().lower().startswith("use when"), (
-        "description should start with 'Use when' per Hermes convention"
-    )
-    assert len(description) <= MAX_DESCRIPTION_CHARS, (
-        f"description is {len(description)} chars, exceeds {MAX_DESCRIPTION_CHARS}"
-    )
-
-
-def test_metadata_has_tags():
-    _, meta, _ = _read_skill()
-    metadata = meta["metadata"]
-    assert isinstance(metadata, dict)
-    hermes_meta = metadata.get("hermes")
-    assert isinstance(hermes_meta, dict), "metadata.hermes must be present"
-    assert hermes_meta.get("tags"), "metadata.hermes.tags must be a non-empty list"
-    assert isinstance(hermes_meta["tags"], list)
-
-
-@pytest.mark.parametrize(
-    "phrase",
-    [
-        # proposal-only / no auto-apply of risky changes
-        "propose",
-        "independent review",
-        # privacy boundary
-        "raw",
-        "salted",
-        "session ids",
-        # realized vs advisory separation
-        "advisory",
-        "realized",
-        # safe install convention
-        "--force",
-        # change-gate requirements
-        "branch",
-        "tests",
-    ],
-)
-def test_required_safety_phrases_present(phrase):
-    text, _, _ = _read_skill()
-    assert phrase.lower() in text.lower(), f"SKILL.md missing safety phrase: {phrase!r}"
-
-
-def test_does_not_promise_auto_apply():
-    """The skill must keep its proposal-only stance for risky changes."""
-    text, _, _ = _read_skill()
-    lowered = text.lower()
-    # Must explicitly disclaim auto-applying routing/drop/summarization.
-    assert "never auto-apply" in lowered or "do not auto-apply" in lowered or (
-        "not" in lowered and "auto-enable" in lowered
-    ), "SKILL.md must state it never auto-applies risky context changes"