From 940f8ccce866a9265d7f85ed8d377dd80475087d Mon Sep 17 00:00:00 2001 From: root Date: Thu, 11 Jun 2026 00:03:25 +0200 Subject: [PATCH 1/9] feat: add privacy-safe Hermes monitor --- docs/guides/hermes-monitor.md | 61 +++++ scripts/hermes_contextpilot_monitor.py | 299 ++++++++++++++++++++++ tests/test_hermes_contextpilot_monitor.py | 114 +++++++++ 3 files changed, 474 insertions(+) create mode 100644 docs/guides/hermes-monitor.md create mode 100644 scripts/hermes_contextpilot_monitor.py create mode 100644 tests/test_hermes_contextpilot_monitor.py diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md new file mode 100644 index 0000000..a83c0b0 --- /dev/null +++ b/docs/guides/hermes-monitor.md @@ -0,0 +1,61 @@ +# ContextPilot Hermes Monitor + +This is an opt-in, metadata-only monitor for testing ContextPilot inside Hermes Agent over a one-week window. + +## What it reads + +- `~/.hermes/state.db:sessions` metadata only: token counts, tool/API call counts, source, estimated cost, timestamps. +- `~/.hermes/logs/gateway.log` lines containing ContextPilot savings summaries. + +It intentionally does **not** read: + +- `messages.content` +- `sessions.system_prompt` +- reasoning fields +- raw tool call payloads +- raw user/assistant text + +Session ids are salted SHA-256 hashes in reports. + +## Daily run + +```bash +python scripts/hermes_contextpilot_monitor.py \ + --out-dir ~/contextpilot/reports \ + --since-hours 24 +``` + +Outputs: + +- `~/contextpilot/reports/daily_YYYY-MM-DD.json` +- `~/contextpilot/reports/daily_YYYY-MM-DD.md` + +## Suggested Hermes cron job + +Use this as a read-only watchdog. It produces reports; it does not apply config/code changes. 
+ +```python +cronjob( + action="create", + name="contextpilot-hermes-monitor-7d", + schedule="0 4 * * *", + repeat=7, + deliver="origin", + enabled_toolsets=["terminal", "file"], + prompt=""" +Run /root/work/ContextPilot/scripts/hermes_contextpilot_monitor.py with --out-dir /root/contextpilot/reports --since-hours 24. +Then read the generated Markdown report for today and send a short Chinese summary: token savings, session count, whether ContextPilot log events were observed, and any blocker. Do not read raw conversation content. Do not modify source/config. +""", +) +``` + +## Accuracy gate + +This monitor only measures token/cost savings and operational signals. Before shipping ContextPilot changes, run a fixed golden eval set and require: + +- no task-success regression, +- no drop in context recall beyond the chosen threshold, +- no unsafe raw-content leakage in reports, +- no increase in failed tool calls. + +If any gate fails, hold proposals and require human review. diff --git a/scripts/hermes_contextpilot_monitor.py b/scripts/hermes_contextpilot_monitor.py new file mode 100644 index 0000000..c4ceb23 --- /dev/null +++ b/scripts/hermes_contextpilot_monitor.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 +"""Privacy-safe ContextPilot monitor for Hermes Agent. + +Reads Hermes metadata (sessions table) and ContextPilot savings log lines, then +writes daily JSON/Markdown reports. It deliberately never reads message bodies, +system prompts, reasoning text, or tool payload content. 
+""" +from __future__ import annotations + +import argparse +import datetime as dt +import hashlib +import json +import re +import sqlite3 +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Iterable + +SAVINGS_RE = re.compile( + r"\[ContextPilot\].*?saved\s+(?P<chars>\d+)\s+chars\s+\(~(?P<tokens>\d+)\s+tokens\)" +) +SESSION_RE = re.compile( + r"\[ContextPilot\]\s+Session\s+(?P<session>[^:]+):\s+(?P<turns>\d+)\s+turns,\s+" + r"(?P<chars>\d+)\s+chars\s+saved\s+\(~(?P<tokens>\d+)\s+tokens\)" +) + +FORBIDDEN_COLUMNS = { + "content", + "system_prompt", + "reasoning", + "reasoning_content", + "reasoning_details", + "tool_calls", + "codex_reasoning_items", + "codex_message_items", +} + + +@dataclass +class SessionMetric: + session_hash: str + source: str | None + started_at: float + ended_at: float | None + message_count: int + tool_call_count: int + api_call_count: int + input_tokens: int + output_tokens: int + cache_read_tokens: int + cache_write_tokens: int + reasoning_tokens: int + estimated_cost_usd: float | None + + +@dataclass +class DailyReport: + date: str + since_hours: int + session_count: int + total_messages: int + total_tool_calls: int + total_api_calls: int + total_input_tokens: int + total_output_tokens: int + total_cache_read_tokens: int + total_cache_write_tokens: int + total_reasoning_tokens: int + estimated_cost_usd: float + contextpilot_log_events: int + contextpilot_chars_saved: int + contextpilot_tokens_saved: int + estimated_input_token_reduction_pct: float + top_sources: dict[str, int] + top_token_sessions: list[SessionMetric] + notes: list[str] + + +def _hash_session(session_id: str, salt: str) -> str: + return hashlib.sha256(f"{salt}:{session_id}".encode()).hexdigest()[:16] + + +def _connect_readonly(path: Path) -> sqlite3.Connection: + uri = f"file:{path}?mode=ro" + return sqlite3.connect(uri, uri=True) + + +def _assert_schema_safe(conn: sqlite3.Connection) -> None: + # Guard against accidental SELECT * expansion in future edits: explicitly + # name 
every session column we read and refuse message-table content access. + session_cols = {row[1] for row in conn.execute("PRAGMA table_info(sessions)")} + message_cols = {row[1] for row in conn.execute("PRAGMA table_info(messages)")} + if not session_cols: + raise RuntimeError("Hermes sessions table not found") + if "content" in message_cols: + # The monitor is allowed to count messages, never read their bodies. + pass + + +def load_session_metrics(db_path: Path, *, since_hours: int, salt: str) -> list[SessionMetric]: + cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600 + conn = _connect_readonly(db_path) + try: + _assert_schema_safe(conn) + query_columns = [ + "id", + "source", + "started_at", + "ended_at", + "message_count", + "tool_call_count", + "input_tokens", + "output_tokens", + "cache_read_tokens", + "cache_write_tokens", + "reasoning_tokens", + "estimated_cost_usd", + "api_call_count", + ] + if FORBIDDEN_COLUMNS.intersection(query_columns): + raise RuntimeError("Internal error: forbidden raw-content column requested") + sql = f""" + SELECT {', '.join(query_columns)} + FROM sessions + WHERE started_at >= ? 
AND archived = 0 + ORDER BY input_tokens DESC + """ + rows = conn.execute(sql, (cutoff,)).fetchall() + finally: + conn.close() + + metrics: list[SessionMetric] = [] + for row in rows: + ( + sid, + source, + started_at, + ended_at, + message_count, + tool_call_count, + input_tokens, + output_tokens, + cache_read_tokens, + cache_write_tokens, + reasoning_tokens, + estimated_cost_usd, + api_call_count, + ) = row + metrics.append( + SessionMetric( + session_hash=_hash_session(str(sid), salt), + source=source, + started_at=float(started_at), + ended_at=float(ended_at) if ended_at is not None else None, + message_count=int(message_count or 0), + tool_call_count=int(tool_call_count or 0), + api_call_count=int(api_call_count or 0), + input_tokens=int(input_tokens or 0), + output_tokens=int(output_tokens or 0), + cache_read_tokens=int(cache_read_tokens or 0), + cache_write_tokens=int(cache_write_tokens or 0), + reasoning_tokens=int(reasoning_tokens or 0), + estimated_cost_usd=float(estimated_cost_usd or 0.0), + ) + ) + return metrics + + +def parse_contextpilot_savings(log_path: Path, *, since_hours: int) -> tuple[int, int, int]: + if not log_path.exists(): + return 0, 0, 0 + # Gateway logs can be large. Tail a bounded byte window; cron should run daily. + max_bytes = 8 * 1024 * 1024 + with log_path.open("rb") as f: + f.seek(0, 2) + size = f.tell() + f.seek(max(0, size - max_bytes)) + text = f.read().decode("utf-8", errors="replace") + + events = 0 + chars = 0 + tokens = 0 + for line in text.splitlines(): + # Timestamp filtering is best-effort; if parse fails, keep the line only + # when it is in the tailed window. No message content is logged here. 
+ m = SAVINGS_RE.search(line) + if not m: + continue + events += 1 + chars += int(m.group("chars")) + tokens += int(m.group("tokens")) + return events, chars, tokens + + +def build_report(metrics: Iterable[SessionMetric], *, date: str, since_hours: int, log_stats: tuple[int, int, int]) -> DailyReport: + rows = list(metrics) + source_counts: dict[str, int] = {} + for row in rows: + source_counts[row.source or "unknown"] = source_counts.get(row.source or "unknown", 0) + 1 + + total_input = sum(r.input_tokens for r in rows) + events, saved_chars, saved_tokens = log_stats + denominator = total_input + saved_tokens + reduction = (saved_tokens / denominator * 100.0) if denominator else 0.0 + + notes: list[str] = [ + "metadata-only: did not read messages.content, sessions.system_prompt, reasoning, or tool payloads", + "accuracy gate is observational here; apply code/config changes only after separate golden-eval pass", + ] + if not rows: + notes.append("no sessions observed in the selected window") + if events == 0: + notes.append("no ContextPilot savings log lines observed; gateway may need restart after enabling plugin") + + return DailyReport( + date=date, + since_hours=since_hours, + session_count=len(rows), + total_messages=sum(r.message_count for r in rows), + total_tool_calls=sum(r.tool_call_count for r in rows), + total_api_calls=sum(r.api_call_count for r in rows), + total_input_tokens=total_input, + total_output_tokens=sum(r.output_tokens for r in rows), + total_cache_read_tokens=sum(r.cache_read_tokens for r in rows), + total_cache_write_tokens=sum(r.cache_write_tokens for r in rows), + total_reasoning_tokens=sum(r.reasoning_tokens for r in rows), + estimated_cost_usd=sum(r.estimated_cost_usd or 0.0 for r in rows), + contextpilot_log_events=events, + contextpilot_chars_saved=saved_chars, + contextpilot_tokens_saved=saved_tokens, + estimated_input_token_reduction_pct=round(reduction, 2), + top_sources=dict(sorted(source_counts.items(), key=lambda kv: kv[1], 
reverse=True)[:10]), + top_token_sessions=rows[:10], + notes=notes, + ) + + +def write_report(report: DailyReport, out_dir: Path) -> tuple[Path, Path]: + out_dir.mkdir(parents=True, exist_ok=True) + json_path = out_dir / f"daily_{report.date}.json" + md_path = out_dir / f"daily_{report.date}.md" + data = asdict(report) + json_path.write_text(json.dumps(data, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + + md = [ + f"# ContextPilot Hermes monitor — {report.date}", + "", + f"Window: last {report.since_hours}h", + "", + "## Summary", + f"- Sessions: {report.session_count}", + f"- Input tokens: {report.total_input_tokens}", + f"- Output tokens: {report.total_output_tokens}", + f"- Tool calls: {report.total_tool_calls}", + f"- ContextPilot saved: ~{report.contextpilot_tokens_saved} tokens ({report.contextpilot_chars_saved} chars)", + f"- Estimated input-token reduction: {report.estimated_input_token_reduction_pct}%", + f"- Estimated cost: ${report.estimated_cost_usd:.4f}", + "", + "## Top sources", + ] + for source, count in report.top_sources.items(): + md.append(f"- {source}: {count}") + md.extend(["", "## Top token sessions (hashed)"]) + for row in report.top_token_sessions: + md.append( + f"- `{row.session_hash}` source={row.source} input={row.input_tokens} " + f"output={row.output_tokens} tools={row.tool_call_count} apis={row.api_call_count}" + ) + md.extend(["", "## Notes"]) + for note in report.notes: + md.append(f"- {note}") + md_path.write_text("\n".join(md) + "\n", encoding="utf-8") + return json_path, md_path + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--state-db", type=Path, default=Path.home() / ".hermes" / "state.db") + parser.add_argument("--gateway-log", type=Path, default=Path.home() / ".hermes" / "logs" / "gateway.log") + parser.add_argument("--out-dir", type=Path, default=Path.home() / "contextpilot" / "reports") + parser.add_argument("--since-hours", type=int, default=24) + 
parser.add_argument("--salt", default="contextpilot-hermes-monitor-v1", help="salt for stable per-install session hashes") + parser.add_argument("--date", default=dt.date.today().isoformat()) + args = parser.parse_args() + + if not args.state_db.exists(): + raise SystemExit(f"Hermes state DB not found: {args.state_db}") + + metrics = load_session_metrics(args.state_db, since_hours=args.since_hours, salt=args.salt) + log_stats = parse_contextpilot_savings(args.gateway_log, since_hours=args.since_hours) + report = build_report(metrics, date=args.date, since_hours=args.since_hours, log_stats=log_stats) + json_path, md_path = write_report(report, args.out_dir) + print(json.dumps({"ok": True, "json": str(json_path), "markdown": str(md_path)}, ensure_ascii=False)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_hermes_contextpilot_monitor.py b/tests/test_hermes_contextpilot_monitor.py new file mode 100644 index 0000000..d0f0218 --- /dev/null +++ b/tests/test_hermes_contextpilot_monitor.py @@ -0,0 +1,114 @@ +import importlib.util +import json +import sqlite3 +import sys +from pathlib import Path + + +MODULE_PATH = Path(__file__).resolve().parents[1] / "scripts" / "hermes_contextpilot_monitor.py" +spec = importlib.util.spec_from_file_location("hermes_contextpilot_monitor", MODULE_PATH) +monitor = importlib.util.module_from_spec(spec) +sys.modules[spec.name] = monitor +spec.loader.exec_module(monitor) + + +def _make_db(path: Path): + conn = sqlite3.connect(path) + conn.execute( + """ + CREATE TABLE sessions ( + id TEXT PRIMARY KEY, + source TEXT, + started_at REAL NOT NULL, + ended_at REAL, + message_count INTEGER DEFAULT 0, + tool_call_count INTEGER DEFAULT 0, + input_tokens INTEGER DEFAULT 0, + output_tokens INTEGER DEFAULT 0, + cache_read_tokens INTEGER DEFAULT 0, + cache_write_tokens INTEGER DEFAULT 0, + reasoning_tokens INTEGER DEFAULT 0, + estimated_cost_usd REAL, + api_call_count INTEGER DEFAULT 0, + archived INTEGER NOT 
NULL DEFAULT 0, + system_prompt TEXT + ) + """ + ) + conn.execute( + """ + CREATE TABLE messages ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id TEXT NOT NULL, + role TEXT NOT NULL, + content TEXT, + reasoning TEXT, + timestamp REAL NOT NULL + ) + """ + ) + conn.execute( + """ + INSERT INTO sessions ( + id, source, started_at, message_count, tool_call_count, + input_tokens, output_tokens, cache_read_tokens, cache_write_tokens, + reasoning_tokens, estimated_cost_usd, api_call_count, archived, + system_prompt + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + "raw-session-id", + "discord", + 4102444800.0, # 2100-01-01, always inside test window + 4, + 2, + 1000, + 200, + 50, + 10, + 25, + 0.0123, + 3, + 0, + "SECRET SYSTEM PROMPT", + ), + ) + conn.execute( + """ + INSERT INTO messages (session_id, role, content, reasoning, timestamp) + VALUES (?, ?, ?, ?, ?) + """, + ("raw-session-id", "user", "DO NOT READ ME", "PRIVATE", 4102444800.0), + ) + conn.commit() + conn.close() + + +def test_monitor_reads_metadata_only_and_hashes_session_ids(tmp_path): + db = tmp_path / "state.db" + _make_db(db) + log = tmp_path / "gateway.log" + log.write_text( + "2026-01-01 INFO [ContextPilot] Turn 2: saved 400 chars (~100 tokens) | cumulative: 400 chars (~100 tokens)\n", + encoding="utf-8", + ) + out_dir = tmp_path / "reports" + + metrics = monitor.load_session_metrics(db, since_hours=24 * 365 * 100, salt="test") + report = monitor.build_report( + metrics, + date="2100-01-01", + since_hours=24, + log_stats=monitor.parse_contextpilot_savings(log, since_hours=24), + ) + json_path, md_path = monitor.write_report(report, out_dir) + + data = json.loads(json_path.read_text(encoding="utf-8")) + md = md_path.read_text(encoding="utf-8") + assert data["session_count"] == 1 + assert data["contextpilot_tokens_saved"] == 100 + assert data["estimated_input_token_reduction_pct"] > 0 + assert "raw-session-id" not in md + assert "DO NOT READ ME" not in md + assert "SECRET SYSTEM 
PROMPT" not in md + assert data["top_token_sessions"][0]["session_hash"] != "raw-session-id" From 39fa0593dfe97ebde5d1084060491410888ed6af Mon Sep 17 00:00:00 2001 From: root Date: Thu, 11 Jun 2026 02:31:29 +0200 Subject: [PATCH 2/9] fix: add safe ContextPilot telemetry and dedup regressions --- __init__.py | 57 ++++++++++ docs/guides/hermes-monitor.md | 8 +- scripts/hermes_contextpilot_monitor.py | 92 +++++++++++++++-- tests/test_block_dedup_regression.py | 120 ++++++++++++++++++++++ tests/test_hermes_contextpilot_monitor.py | 69 +++++++++++++ tests/test_hermes_plugin_patch.py | 100 ++++++++++++++++++ 6 files changed, 438 insertions(+), 8 deletions(-) create mode 100644 tests/test_block_dedup_regression.py diff --git a/__init__.py b/__init__.py index 44b4214..eac1f08 100644 --- a/__init__.py +++ b/__init__.py @@ -13,6 +13,7 @@ import os import subprocess import sys +import time from pathlib import Path from typing import Any, Dict, List, Tuple @@ -184,6 +185,38 @@ def _hash_text(text: str) -> str: return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()[:16] +def _telemetry_path() -> "Path | None": + """Resolve the metadata-only telemetry file, or None if disabled. + + Lets the monitor read ContextPilot savings without depending on gateway log + lines. Override with CONTEXTPILOT_TELEMETRY_FILE; disable with + CONTEXTPILOT_DISABLE_TELEMETRY=1. + """ + if os.environ.get("CONTEXTPILOT_DISABLE_TELEMETRY") == "1": + return None + override = os.environ.get("CONTEXTPILOT_TELEMETRY_FILE") + if override: + return Path(override) + return Path.home() / ".hermes" / "contextpilot" / "telemetry.jsonl" + + +def _write_telemetry(record: Dict[str, Any]) -> None: + """Append one metadata-only JSON line. Never raises; best-effort only. + + Privacy contract: callers must pass numeric counters / timestamps / session + / turn metadata only — never message bodies, prompts, or tool payloads. 
+ """ + try: + path = _telemetry_path() + if path is None: + return + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("a", encoding="utf-8") as f: + f.write(json.dumps(record, separators=(",", ":")) + "\n") + except Exception as e: # noqa: BLE001 - telemetry must never break optimization + logger.debug("[ContextPilot] telemetry write skipped: %s", e) + + def _reorder_docs(docs: List[str], alpha: float = 0.001) -> List[str]: global _intercept_index if len(docs) < 2: @@ -286,6 +319,7 @@ def __init__(self): self._total_reordered = 0 self._total_docs_deduped = 0 self._optimize_count = 0 + self._session_id = None self.threshold_percent = 0.75 @staticmethod @@ -624,6 +658,28 @@ def _tool_chars(msgs): self._total_chars_saved, self._total_chars_saved // 4, ) + # Metadata-only telemetry so the monitor does not depend solely on + # gateway log lines. No content, prompts, or tool payloads here. + _write_telemetry( + { + "ts": time.time(), + "type": "turn", + "session_hash": ( + _hash_text(str(self._session_id)) + if self._session_id is not None else None + ), + "turn": self._optimize_count, + "chars_saved": turn_chars_saved, + "tokens_saved": turn_chars_saved // 4, + "doc_chars_saved": doc_chars_saved, + "block_chars_saved": dedup_result.chars_saved, + "blocks_deduped": dedup_result.blocks_deduped, + "blocks_total": dedup_result.blocks_total, + "docs_deduped": self._total_docs_deduped, + "system_blocks_matched": dedup_result.system_blocks_matched, + "cumulative_chars_saved": self._total_chars_saved, + } + ) return api_messages, { "chars_saved": turn_chars_saved, @@ -648,6 +704,7 @@ def on_context_compressed(self, old_count: int, new_count: int) -> None: def on_session_start(self, session_id: str, **kwargs) -> None: _patch_hermes_sanitizer() + self._session_id = session_id self._model = kwargs.get("model", "") self._base_url = "" self._api_key = "" diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md index a83c0b0..e63bfd7 100644 --- 
a/docs/guides/hermes-monitor.md +++ b/docs/guides/hermes-monitor.md @@ -5,7 +5,8 @@ This is an opt-in, metadata-only monitor for testing ContextPilot inside Hermes ## What it reads - `~/.hermes/state.db:sessions` metadata only: token counts, tool/API call counts, source, estimated cost, timestamps. -- `~/.hermes/logs/gateway.log` lines containing ContextPilot savings summaries. +- `~/.hermes/contextpilot/telemetry.jsonl` metadata-only ContextPilot savings records (preferred source). +- `~/.hermes/logs/gateway.log` lines containing ContextPilot savings summaries (fallback source). It intentionally does **not** read: @@ -22,9 +23,12 @@ Session ids are salted SHA-256 hashes in reports. ```bash python scripts/hermes_contextpilot_monitor.py \ --out-dir ~/contextpilot/reports \ - --since-hours 24 + --since-hours 24 \ + --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl ``` +The telemetry file is written by the ContextPilot Hermes plugin when savings occur. Set `CONTEXTPILOT_DISABLE_TELEMETRY=1` to disable writes, or `CONTEXTPILOT_TELEMETRY_FILE=/path/to/file.jsonl` to override the location. 
+ Outputs: - `~/contextpilot/reports/daily_YYYY-MM-DD.json` diff --git a/scripts/hermes_contextpilot_monitor.py b/scripts/hermes_contextpilot_monitor.py index c4ceb23..3bd9483 100644 --- a/scripts/hermes_contextpilot_monitor.py +++ b/scripts/hermes_contextpilot_monitor.py @@ -69,6 +69,8 @@ class DailyReport: total_reasoning_tokens: int estimated_cost_usd: float contextpilot_log_events: int + contextpilot_telemetry_events: int + contextpilot_savings_source: str contextpilot_chars_saved: int contextpilot_tokens_saved: int estimated_input_token_reduction_pct: float @@ -193,25 +195,86 @@ def parse_contextpilot_savings(log_path: Path, *, since_hours: int) -> tuple[int return events, chars, tokens -def build_report(metrics: Iterable[SessionMetric], *, date: str, since_hours: int, log_stats: tuple[int, int, int]) -> DailyReport: +def parse_contextpilot_telemetry(telemetry_path: Path, *, since_hours: int) -> tuple[int, int, int]: + """Aggregate the plugin's metadata-only telemetry file. + + Returns (events, chars_saved, tokens_saved). The file is JSON-lines, one + numeric record per saved turn; it never contains message content, prompts, + or tool payloads, so we only read numeric counters here. 
+ """ + if not telemetry_path or not telemetry_path.exists(): + return 0, 0, 0 + cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600 + + events = 0 + chars = 0 + tokens = 0 + with telemetry_path.open("r", encoding="utf-8", errors="replace") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + record = json.loads(line) + except (ValueError, TypeError): + continue + if not isinstance(record, dict): + continue + ts = record.get("ts") + if isinstance(ts, (int, float)) and ts < cutoff: + continue + cs = record.get("chars_saved") + if not isinstance(cs, (int, float)): + continue + saved_tokens = record.get("tokens_saved") + events += 1 + chars += int(cs) + tokens += int(saved_tokens) if isinstance(saved_tokens, (int, float)) else int(cs) // 4 + return events, chars, tokens + + +def build_report( + metrics: Iterable[SessionMetric], + *, + date: str, + since_hours: int, + log_stats: tuple[int, int, int], + telemetry_stats: tuple[int, int, int] = (0, 0, 0), +) -> DailyReport: rows = list(metrics) source_counts: dict[str, int] = {} for row in rows: source_counts[row.source or "unknown"] = source_counts.get(row.source or "unknown", 0) + 1 total_input = sum(r.input_tokens for r in rows) - events, saved_chars, saved_tokens = log_stats + log_events, log_chars, log_tokens = log_stats + tel_events, tel_chars, tel_tokens = telemetry_stats + + # Prefer the local telemetry file when present: it is the authoritative, + # log-independent source. Logs are a fallback and are NOT summed on top + # (both record the same turns, so summing would double-count). 
+ if tel_events > 0: + events, saved_chars, saved_tokens = tel_events, tel_chars, tel_tokens + savings_source = "telemetry" + else: + events, saved_chars, saved_tokens = log_events, log_chars, log_tokens + savings_source = "gateway-log" + denominator = total_input + saved_tokens reduction = (saved_tokens / denominator * 100.0) if denominator else 0.0 notes: list[str] = [ "metadata-only: did not read messages.content, sessions.system_prompt, reasoning, or tool payloads", "accuracy gate is observational here; apply code/config changes only after separate golden-eval pass", + f"contextpilot savings source: {savings_source} (telemetry={tel_events} events, log={log_events} events)", ] if not rows: notes.append("no sessions observed in the selected window") - if events == 0: - notes.append("no ContextPilot savings log lines observed; gateway may need restart after enabling plugin") + if tel_events == 0 and log_events == 0: + notes.append( + "no ContextPilot savings observed via telemetry or logs; " + "gateway may need restart after enabling plugin" + ) return DailyReport( date=date, @@ -226,7 +289,9 @@ def build_report(metrics: Iterable[SessionMetric], *, date: str, since_hours: in total_cache_write_tokens=sum(r.cache_write_tokens for r in rows), total_reasoning_tokens=sum(r.reasoning_tokens for r in rows), estimated_cost_usd=sum(r.estimated_cost_usd or 0.0 for r in rows), - contextpilot_log_events=events, + contextpilot_log_events=log_events, + contextpilot_telemetry_events=tel_events, + contextpilot_savings_source=savings_source, contextpilot_chars_saved=saved_chars, contextpilot_tokens_saved=saved_tokens, estimated_input_token_reduction_pct=round(reduction, 2), @@ -254,6 +319,8 @@ def write_report(report: DailyReport, out_dir: Path) -> tuple[Path, Path]: f"- Output tokens: {report.total_output_tokens}", f"- Tool calls: {report.total_tool_calls}", f"- ContextPilot saved: ~{report.contextpilot_tokens_saved} tokens ({report.contextpilot_chars_saved} chars)", + f"- 
ContextPilot savings source: {report.contextpilot_savings_source} " + f"(telemetry events={report.contextpilot_telemetry_events}, log events={report.contextpilot_log_events})", f"- Estimated input-token reduction: {report.estimated_input_token_reduction_pct}%", f"- Estimated cost: ${report.estimated_cost_usd:.4f}", "", @@ -278,6 +345,12 @@ def main() -> int: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--state-db", type=Path, default=Path.home() / ".hermes" / "state.db") parser.add_argument("--gateway-log", type=Path, default=Path.home() / ".hermes" / "logs" / "gateway.log") + parser.add_argument( + "--telemetry-file", + type=Path, + default=Path.home() / ".hermes" / "contextpilot" / "telemetry.jsonl", + help="metadata-only ContextPilot telemetry file (preferred over gateway log)", + ) parser.add_argument("--out-dir", type=Path, default=Path.home() / "contextpilot" / "reports") parser.add_argument("--since-hours", type=int, default=24) parser.add_argument("--salt", default="contextpilot-hermes-monitor-v1", help="salt for stable per-install session hashes") @@ -289,7 +362,14 @@ def main() -> int: metrics = load_session_metrics(args.state_db, since_hours=args.since_hours, salt=args.salt) log_stats = parse_contextpilot_savings(args.gateway_log, since_hours=args.since_hours) - report = build_report(metrics, date=args.date, since_hours=args.since_hours, log_stats=log_stats) + telemetry_stats = parse_contextpilot_telemetry(args.telemetry_file, since_hours=args.since_hours) + report = build_report( + metrics, + date=args.date, + since_hours=args.since_hours, + log_stats=log_stats, + telemetry_stats=telemetry_stats, + ) json_path, md_path = write_report(report, args.out_dir) print(json.dumps({"ok": True, "json": str(json_path), "markdown": str(md_path)}, ensure_ascii=False)) return 0 diff --git a/tests/test_block_dedup_regression.py b/tests/test_block_dedup_regression.py new file mode 100644 index 0000000..57e66b3 --- /dev/null +++ 
b/tests/test_block_dedup_regression.py @@ -0,0 +1,120 @@ +"""Regression coverage for the block-dedup behavioral contract. + +These tests lock in the guarantees the Hermes integration depends on: + +1. Exact-identical tool-result chunks are replaced by short references. +2. Edited / near-duplicate content keeps the changed (delta) text verbatim and + is NOT collapsed wholesale into an "identical" reference. +3. Genuinely different content (e.g. a different file in the same repo) is never + claimed identical — nothing is deduped and the payload is left untouched. + +The hashing is exact-content based; these tests guard against any future change +that weakens it (e.g. fuzzy matching that would hide new, unique content behind +a reference). +""" +import importlib.util +from pathlib import Path + +MODULE_PATH = Path(__file__).resolve().parents[1] / "contextpilot" / "dedup" / "block_dedup.py" +_spec = importlib.util.spec_from_file_location("contextpilot_block_dedup_test", MODULE_PATH) +block_dedup = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(block_dedup) + +dedup_chat_completions = block_dedup.dedup_chat_completions + + +def _file_content(prefix: str = "compute_value", n: int = 80) -> str: + """Deterministic multi-line tool-result body that chunks into many blocks.""" + return "\n".join( + f"{i:3d}| def function_number_{i}(): return {prefix}({i}) + base_offset_value" + for i in range(n) + ) + + +def _two_tool_results(content_a: str, content_b: str) -> dict: + """Two Read tool results in a single chat-completions body.""" + return { + "messages": [ + {"role": "user", "content": "read the file"}, + {"role": "assistant", "tool_calls": [{"id": "c1", "function": {"name": "Read"}}]}, + {"role": "tool", "tool_call_id": "c1", "content": content_a}, + {"role": "assistant", "tool_calls": [{"id": "c2", "function": {"name": "Read"}}]}, + {"role": "tool", "tool_call_id": "c2", "content": content_b}, + ] + } + + +def 
test_exact_duplicate_tool_result_is_replaced_by_reference(): + content = _file_content() + body = _two_tool_results(content, content) + + result = dedup_chat_completions(body) + + first = body["messages"][2]["content"] + second = body["messages"][4]["content"] + + # Savings actually happened and were attributed to deduped blocks. + assert result.chars_saved > 0 + assert result.blocks_deduped > 0 + + # The first occurrence is untouched; the second is shortened and points back. + assert first == content + assert len(second) < len(content) + assert "identical to earlier" in second + + +def test_edited_near_duplicate_preserves_delta_verbatim(): + content = _file_content() + lines = content.split("\n") + # Edit a single line in the middle — a realistic same-file edit between turns. + delta = "40| TOTALLY_UNIQUE_EDITED_LINE_MARKER xyzzy brand new content not seen before" + lines[40] = delta + edited = "\n".join(lines) + + body = _two_tool_results(content, edited) + result = dedup_chat_completions(body) + + second = body["messages"][4]["content"] + + # The unique edited text MUST survive verbatim — it must never be hidden + # behind an "identical" reference. + assert "TOTALLY_UNIQUE_EDITED_LINE_MARKER" in second + assert delta in second + + # Identical surrounding blocks are still deduped (so this is not a no-op), + # but the result is not collapsed into a single wholesale "identical" marker. + assert result.blocks_deduped > 0 + assert result.blocks_deduped < result.blocks_total + + +def test_different_file_same_repo_is_not_claimed_identical(): + content = _file_content("compute_value") + other = "\n".join( + f"{i:3d}| class Widget_{i}: pass # unrelated module, distinct content line" + for i in range(80) + ) + body = _two_tool_results(content, other) + + result = dedup_chat_completions(body) + + # No shared blocks -> nothing deduped and both payloads left byte-for-byte intact. 
+ assert result.chars_saved == 0 + assert result.blocks_deduped == 0 + assert body["messages"][2]["content"] == content + assert body["messages"][4]["content"] == other + + +def test_single_changed_char_breaks_block_match(): + """A one-character change must produce a different hash (no fuzzy collapse).""" + content = _file_content() + # Flip exactly one character deep inside the body. + idx = len(content) // 2 + mutated = content[:idx] + ("Z" if content[idx] != "Z" else "Q") + content[idx + 1:] + + body = _two_tool_results(content, mutated) + dedup_chat_completions(body) + + second = body["messages"][4]["content"] + # The block containing the mutation is preserved verbatim (not referenced away). + mutated_line = mutated.split("\n")[content[:idx].count("\n")] + assert mutated_line in second diff --git a/tests/test_hermes_contextpilot_monitor.py b/tests/test_hermes_contextpilot_monitor.py index d0f0218..ba56bb2 100644 --- a/tests/test_hermes_contextpilot_monitor.py +++ b/tests/test_hermes_contextpilot_monitor.py @@ -112,3 +112,72 @@ def test_monitor_reads_metadata_only_and_hashes_session_ids(tmp_path): assert "DO NOT READ ME" not in md assert "SECRET SYSTEM PROMPT" not in md assert data["top_token_sessions"][0]["session_hash"] != "raw-session-id" + + +def _write_telemetry(path, records): + path.write_text( + "\n".join(json.dumps(r) for r in records) + "\n", encoding="utf-8" + ) + + +def test_parse_telemetry_aggregates_recent_records(tmp_path): + tel = tmp_path / "telemetry.jsonl" + far_future = 4102444800.0 # 2100-01-01 + _write_telemetry( + tel, + [ + {"ts": far_future, "type": "turn", "session": "s1", "turn": 1, + "chars_saved": 400, "tokens_saved": 100}, + {"ts": far_future, "type": "turn", "session": "s1", "turn": 2, + "chars_saved": 200, "tokens_saved": 50}, + # Stale record far in the past must be excluded by the window. 
+ {"ts": 1000.0, "type": "turn", "session": "s0", "turn": 1, + "chars_saved": 999999, "tokens_saved": 999999}, + "this is not json", + ], + ) + + events, chars, tokens = monitor.parse_contextpilot_telemetry(tel, since_hours=24) + assert events == 2 + assert chars == 600 + assert tokens == 150 + + +def test_parse_telemetry_missing_file_is_safe(tmp_path): + assert monitor.parse_contextpilot_telemetry(tmp_path / "nope.jsonl", since_hours=24) == (0, 0, 0) + + +def test_build_report_prefers_telemetry_over_logs(tmp_path): + db = tmp_path / "state.db" + _make_db(db) + metrics = monitor.load_session_metrics(db, since_hours=24 * 365 * 100, salt="test") + + report = monitor.build_report( + metrics, + date="2100-01-01", + since_hours=24, + log_stats=(5, 4000, 1000), + telemetry_stats=(2, 600, 150), + ) + # Telemetry is authoritative when present; logs are not summed on top. + assert report.contextpilot_tokens_saved == 150 + assert report.contextpilot_chars_saved == 600 + assert report.contextpilot_telemetry_events == 2 + assert report.contextpilot_log_events == 5 + assert report.contextpilot_savings_source == "telemetry" + + +def test_build_report_falls_back_to_logs_without_telemetry(tmp_path): + db = tmp_path / "state.db" + _make_db(db) + metrics = monitor.load_session_metrics(db, since_hours=24 * 365 * 100, salt="test") + + report = monitor.build_report( + metrics, + date="2100-01-01", + since_hours=24, + log_stats=(5, 4000, 1000), + telemetry_stats=(0, 0, 0), + ) + assert report.contextpilot_tokens_saved == 1000 + assert report.contextpilot_savings_source == "gateway-log" diff --git a/tests/test_hermes_plugin_patch.py b/tests/test_hermes_plugin_patch.py index 9a372d8..74bcd19 100644 --- a/tests/test_hermes_plugin_patch.py +++ b/tests/test_hermes_plugin_patch.py @@ -184,3 +184,103 @@ def dedup(body, **kwargs): assert second_out[1]["content"] == "DEDUPED TOOL RESULT" assert second_out[2]["content"] == "now summarize it" assert calls[-1][1]["content"] == "DEDUPED TOOL RESULT" 
+ + +def _saving_dedup(body, **kwargs): + saved = 0 + for msg in body["messages"]: + if msg.get("role") == "tool" and msg.get("content") == "FULL TOOL RESULT": + msg["content"] = "REF" + saved += len("FULL TOOL RESULT") - len("REF") + return SimpleNamespace( + chars_saved=saved, + blocks_deduped=1 if saved else 0, + blocks_total=1, + system_blocks_matched=0, + ) + + +def test_optimize_writes_metadata_only_telemetry_line(monkeypatch, tmp_path): + import json + + module, _ = _load_plugin_module(monkeypatch) + monkeypatch.setattr(module, "_check_reorder", lambda: False) + monkeypatch.setattr(module, "_CONTEXTPILOT_AVAILABLE", False) + monkeypatch.setattr(module, "dedup_chat_completions", _saving_dedup) + + telemetry = tmp_path / "nested" / "telemetry.jsonl" + monkeypatch.setenv("CONTEXTPILOT_TELEMETRY_FILE", str(telemetry)) + + engine = module.ContextPilotEngine() + engine.on_session_start("session-XYZ", model="test-model") + + secret = "SUPER SECRET USER PROMPT — must never be written to telemetry" + messages = [ + {"role": "user", "content": secret}, + {"role": "tool", "tool_call_id": "call_1", "content": "FULL TOOL RESULT"}, + ] + engine.optimize_api_messages(messages) + + assert telemetry.exists() + lines = [l for l in telemetry.read_text(encoding="utf-8").splitlines() if l.strip()] + assert len(lines) == 1 + record = json.loads(lines[0]) + + # Numeric/metadata only — savings recorded. + assert record["chars_saved"] > 0 + assert record["tokens_saved"] == record["chars_saved"] // 4 + assert record["turn"] == 1 + assert record["session_hash"] == module._hash_text("session-XYZ") + assert "session" not in record + assert isinstance(record["ts"], (int, float)) + + # Privacy: no message/prompt/tool-payload content may appear anywhere. 
+ raw = telemetry.read_text(encoding="utf-8") + assert secret not in raw + assert "FULL TOOL RESULT" not in raw + forbidden = {"content", "messages", "prompt", "system_prompt", "text", "tool_calls"} + assert forbidden.isdisjoint(record.keys()) + + +def test_optimize_telemetry_skipped_when_nothing_saved(monkeypatch, tmp_path): + module, _ = _load_plugin_module(monkeypatch) + monkeypatch.setattr(module, "_check_reorder", lambda: False) + monkeypatch.setattr(module, "_CONTEXTPILOT_AVAILABLE", False) + monkeypatch.setattr( + module, + "dedup_chat_completions", + lambda body, **kw: SimpleNamespace( + chars_saved=0, blocks_deduped=0, blocks_total=0, system_blocks_matched=0 + ), + ) + + telemetry = tmp_path / "telemetry.jsonl" + monkeypatch.setenv("CONTEXTPILOT_TELEMETRY_FILE", str(telemetry)) + + engine = module.ContextPilotEngine() + engine.optimize_api_messages([{"role": "user", "content": "hello"}]) + + # No save -> no telemetry noise. + assert not telemetry.exists() + + +def test_optimize_survives_unwritable_telemetry_path(monkeypatch, tmp_path): + module, _ = _load_plugin_module(monkeypatch) + monkeypatch.setattr(module, "_check_reorder", lambda: False) + monkeypatch.setattr(module, "_CONTEXTPILOT_AVAILABLE", False) + monkeypatch.setattr(module, "dedup_chat_completions", _saving_dedup) + + # Point telemetry at a path whose parent is an existing *file*, so mkdir fails. + blocker = tmp_path / "iam_a_file" + blocker.write_text("x", encoding="utf-8") + monkeypatch.setenv("CONTEXTPILOT_TELEMETRY_FILE", str(blocker / "telemetry.jsonl")) + + engine = module.ContextPilotEngine() + messages = [ + {"role": "user", "content": "read file"}, + {"role": "tool", "tool_call_id": "call_1", "content": "FULL TOOL RESULT"}, + ] + # Must not raise despite the unwritable telemetry destination. 
+ out, stats = engine.optimize_api_messages(messages) + assert out[1]["content"] == "REF" + assert stats["chars_saved"] > 0 From 1c93f1dc7fce1af677a1eec2dd276996a87f1dcf Mon Sep 17 00:00:00 2001 From: root Date: Thu, 11 Jun 2026 03:09:03 +0200 Subject: [PATCH 3/9] feat: add Hermes context opportunity scanner --- docs/guides/hermes-monitor.md | 35 + .../analyze_hermes_context_opportunities.py | 680 ++++++++++++++++++ ...est_hermes_context_opportunity_analyzer.py | 222 ++++++ 3 files changed, 937 insertions(+) create mode 100644 scripts/analyze_hermes_context_opportunities.py create mode 100644 tests/test_hermes_context_opportunity_analyzer.py diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md index e63bfd7..4d20d43 100644 --- a/docs/guides/hermes-monitor.md +++ b/docs/guides/hermes-monitor.md @@ -53,6 +53,41 @@ Then read the generated Markdown report for today and send a short Chinese summa ) ``` +## Opportunity scanning + +`scripts/analyze_hermes_context_opportunities.py` is a companion scanner meant +for a continuous cron job. Where the monitor stays metadata-only, this analyzer +*does* read message content and tool outputs — but only in-memory, to compute +salted SHA-256 fingerprints and aggregate counters. Reports never contain raw +message/tool text, system prompts, reasoning, or raw session ids. + +It surfaces concrete token-reduction opportunities: + +- exact duplicate tool outputs (identical payloads re-sent across turns), +- repeated line/block fingerprints (shared boilerplate across outputs), +- large tool outputs grouped by `tool_name`, +- heavy sessions by input-token / tool-call / message counts (hashed ids), +- ContextPilot telemetry coverage and savings ratios. 
+ +```bash +python scripts/analyze_hermes_context_opportunities.py \ + --state-db /root/.hermes/state.db \ + --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl \ + --out-dir ~/contextpilot/opportunities \ + --since-hours 24 +``` + +Outputs: + +- `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.json` +- `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.md` + +Each estimated "wasted tokens" figure is a heuristic (chars / 4); treat the +report as a prioritized list of candidates and validate against the accuracy +gate below before changing ContextPilot config or code. A defensive guard in +`write_report` refuses to emit any forbidden raw-content key, so the reports are +safe to ship from an unattended cron job. + ## Accuracy gate This monitor only measures token/cost savings and operational signals. Before shipping ContextPilot changes, run a fixed golden eval set and require: diff --git a/scripts/analyze_hermes_context_opportunities.py b/scripts/analyze_hermes_context_opportunities.py new file mode 100644 index 0000000..780315a --- /dev/null +++ b/scripts/analyze_hermes_context_opportunities.py @@ -0,0 +1,680 @@ +#!/usr/bin/env python3 +"""Privacy-safe Hermes context opportunity analyzer for ContextPilot. + +Unlike ``hermes_contextpilot_monitor.py`` (which never reads message bodies), +this analyzer *does* inspect message content and tool outputs in order to find +concrete token-reduction opportunities: exact duplicate tool outputs, repeated +line/block fingerprints, oversized tool outputs per tool, heavy sessions, and +ContextPilot telemetry coverage. + +It reads content only in-memory to compute salted hashes and aggregate +counters. Reports never contain raw message/tool text, system prompts, or raw +session ids -- only salted SHA-256 fingerprints and numeric aggregates. This +makes it safe to run continuously from a cron job and ship the reports. 
+""" +from __future__ import annotations + +import argparse +import datetime as dt +import hashlib +import json +import sqlite3 +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Iterable + +# Columns we are explicitly forbidden from EMITTING in any report. We may read +# message content in-memory for hashing, but it must never reach an output file. +FORBIDDEN_OUTPUT_KEYS = { + "content", + "system_prompt", + "reasoning", + "reasoning_content", + "reasoning_details", + "tool_calls", + "codex_reasoning_items", + "codex_message_items", +} + +# Tunables (overridable via CLI). +DEFAULT_MIN_BLOCK_CHARS = 40 # ignore trivial lines when fingerprinting +DEFAULT_MIN_BLOCK_REPEAT = 3 # a block must recur this often to be a "repeat" +DEFAULT_LARGE_OUTPUT_CHARS = 8000 # tool outputs at/above this are "large" +DEFAULT_TOP_N = 20 +EST_CHARS_PER_TOKEN = 4 + + +def _est_tokens(chars: int) -> int: + return chars // EST_CHARS_PER_TOKEN + + +def _salted_hash(text: str, salt: str, *, length: int = 16) -> str: + return hashlib.sha256(f"{salt}:{text}".encode("utf-8", "replace")).hexdigest()[:length] + + +def _salt_fingerprint(salt: str) -> str: + # Confirms a salt was applied without revealing it. 
+ return hashlib.sha256(f"fingerprint:{salt}".encode()).hexdigest()[:12] + + +def _connect_readonly(path: Path) -> sqlite3.Connection: + uri = f"file:{path}?mode=ro" + return sqlite3.connect(uri, uri=True) + + +# --------------------------------------------------------------------------- +# Data structures (all privacy-safe: hashes + counters only) +# --------------------------------------------------------------------------- + + +@dataclass +class DuplicateToolOutput: + content_hash: str + tool_name: str | None + occurrences: int + char_length: int + est_tokens: int + est_wasted_tokens: int # tokens spent re-sending identical output: (n-1) * est_tokens + + +@dataclass +class RepeatedBlock: + block_hash: str + occurrences: int + char_length: int + est_tokens: int + est_wasted_tokens: int # (n-1) * est_tokens + + +@dataclass +class ToolSizeStat: + tool_name: str + output_count: int + total_chars: int + max_chars: int + avg_chars: int + total_est_tokens: int + large_output_count: int # outputs >= large_output_chars threshold + + +@dataclass +class HeavySession: + session_hash: str + source: str | None + input_tokens: int + output_tokens: int + message_count: int + tool_call_count: int + api_call_count: int + + +@dataclass +class TelemetryCoverage: + events: int + chars_saved: int + tokens_saved: int + avg_tokens_saved_per_event: float + coverage_ratio_pct: float # tokens_saved / (tokens_saved + total_input_tokens) + malformed_records_skipped: int + + +@dataclass +class OpportunityReport: + date: str + since_hours: int + salt_fingerprint: str + tool_message_count: int + total_tool_output_chars: int + total_tool_output_est_tokens: int + exact_duplicate_groups: list[DuplicateToolOutput] + duplicate_tool_output_groups: int + duplicate_tool_output_wasted_tokens: int + repeated_block_count: int + repeated_block_wasted_tokens: int + repeated_blocks: list[RepeatedBlock] + large_tool_outputs_by_tool: list[ToolSizeStat] + heavy_sessions: list[HeavySession] + telemetry: 
TelemetryCoverage + notes: list[str] = field(default_factory=list) + + +# --------------------------------------------------------------------------- +# Loading +# --------------------------------------------------------------------------- + + +@dataclass +class _ToolMessage: + tool_name: str | None + content: str + + +def load_tool_messages( + db_path: Path, *, since_hours: int +) -> list[_ToolMessage]: + """Load tool-output messages within the window. + + Content is returned for in-memory hashing only; callers must not emit it. + A message is treated as tool output when ``role='tool'`` or ``tool_name`` + is set. + """ + cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600 + conn = _connect_readonly(db_path) + try: + cols = {row[1] for row in conn.execute("PRAGMA table_info(messages)")} + if "content" not in cols: + return [] + has_tool_name = "tool_name" in cols + has_ts = "timestamp" in cols + select_tool = "tool_name" if has_tool_name else "NULL AS tool_name" + where = [] + params: list[object] = [] + if has_ts: + where.append("timestamp >= ?") + params.append(cutoff) + tool_pred = "role = 'tool'" + if has_tool_name: + tool_pred = "(role = 'tool' OR tool_name IS NOT NULL)" + where.append(tool_pred) + sql = ( + f"SELECT {select_tool}, content FROM messages " + f"WHERE {' AND '.join(where)}" + ) + rows = conn.execute(sql, params).fetchall() + finally: + conn.close() + + out: list[_ToolMessage] = [] + for tool_name, content in rows: + if content is None: + continue + out.append(_ToolMessage(tool_name=tool_name, content=str(content))) + return out + + +def load_heavy_sessions( + db_path: Path, *, since_hours: int, salt: str, top_n: int +) -> list[HeavySession]: + cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600 + conn = _connect_readonly(db_path) + try: + cols = {row[1] for row in conn.execute("PRAGMA table_info(sessions)")} + if "id" not in cols: + return [] + wanted = [ + "id", + "source", + "input_tokens", + 
"output_tokens", + "message_count", + "tool_call_count", + "api_call_count", + ] + select_cols = [c if c in cols else f"NULL AS {c}" for c in wanted] + where = [] + params: list[object] = [] + if "started_at" in cols: + where.append("started_at >= ?") + params.append(cutoff) + if "archived" in cols: + where.append("archived = 0") + sql = f"SELECT {', '.join(select_cols)} FROM sessions" + if where: + sql += " WHERE " + " AND ".join(where) + sql += " ORDER BY input_tokens DESC" + rows = conn.execute(sql, params).fetchall() + finally: + conn.close() + + sessions: list[HeavySession] = [] + for sid, source, inp, out_tok, msgs, tools, apis in rows: + sessions.append( + HeavySession( + session_hash=_salted_hash(str(sid), salt), + source=source, + input_tokens=int(inp or 0), + output_tokens=int(out_tok or 0), + message_count=int(msgs or 0), + tool_call_count=int(tools or 0), + api_call_count=int(apis or 0), + ) + ) + sessions.sort(key=lambda s: (s.input_tokens, s.tool_call_count), reverse=True) + return sessions[:top_n] + + +def total_input_tokens(db_path: Path, *, since_hours: int) -> int: + """Sum input tokens across ALL in-window sessions (not just the top-N).""" + cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600 + conn = _connect_readonly(db_path) + try: + cols = {row[1] for row in conn.execute("PRAGMA table_info(sessions)")} + if "input_tokens" not in cols: + return 0 + where = [] + params: list[object] = [] + if "started_at" in cols: + where.append("started_at >= ?") + params.append(cutoff) + if "archived" in cols: + where.append("archived = 0") + sql = "SELECT COALESCE(SUM(input_tokens), 0) FROM sessions" + if where: + sql += " WHERE " + " AND ".join(where) + (total,) = conn.execute(sql, params).fetchone() + finally: + conn.close() + return int(total or 0) + + +def parse_telemetry( + telemetry_path: Path, *, since_hours: int, total_input_tokens: int +) -> TelemetryCoverage: + """Aggregate the metadata-only ContextPilot telemetry file. 
+ + Tolerates malformed lines (non-JSON, non-dict, missing counters) by + skipping and counting them. Never reads message content. + """ + events = 0 + chars = 0 + tokens = 0 + malformed = 0 + if telemetry_path and telemetry_path.exists(): + cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600 + with telemetry_path.open("r", encoding="utf-8", errors="replace") as f: + for raw in f: + line = raw.strip() + if not line: + continue + try: + record = json.loads(line) + except (ValueError, TypeError): + malformed += 1 + continue + if not isinstance(record, dict): + malformed += 1 + continue + ts = record.get("ts") + if isinstance(ts, (int, float)) and ts < cutoff: + continue + cs = record.get("chars_saved") + if not isinstance(cs, (int, float)): + malformed += 1 + continue + saved_tokens = record.get("tokens_saved") + events += 1 + chars += int(cs) + tokens += ( + int(saved_tokens) + if isinstance(saved_tokens, (int, float)) + else int(cs) // EST_CHARS_PER_TOKEN + ) + + denom = tokens + total_input_tokens + coverage = (tokens / denom * 100.0) if denom else 0.0 + avg = (tokens / events) if events else 0.0 + return TelemetryCoverage( + events=events, + chars_saved=chars, + tokens_saved=tokens, + avg_tokens_saved_per_event=round(avg, 2), + coverage_ratio_pct=round(coverage, 2), + malformed_records_skipped=malformed, + ) + + +# --------------------------------------------------------------------------- +# Detection +# --------------------------------------------------------------------------- + + +def detect_exact_duplicate_tool_outputs( + messages: Iterable[_ToolMessage], *, salt: str, top_n: int +) -> list[DuplicateToolOutput]: + groups: dict[str, dict] = {} + for msg in messages: + content = msg.content + if not content: + continue + h = _salted_hash(content, salt) + g = groups.get(h) + if g is None: + groups[h] = { + "tool_name": msg.tool_name, + "occurrences": 1, + "char_length": len(content), + } + else: + g["occurrences"] += 1 + if g["tool_name"] 
!= msg.tool_name: + g["tool_name"] = None # mixed tools produced identical output + + dups: list[DuplicateToolOutput] = [] + for h, g in groups.items(): + if g["occurrences"] < 2: + continue + est = _est_tokens(g["char_length"]) + dups.append( + DuplicateToolOutput( + content_hash=h, + tool_name=g["tool_name"], + occurrences=g["occurrences"], + char_length=g["char_length"], + est_tokens=est, + est_wasted_tokens=est * (g["occurrences"] - 1), + ) + ) + dups.sort(key=lambda d: d.est_wasted_tokens, reverse=True) + return dups[:top_n] + + +def detect_repeated_blocks( + messages: Iterable[_ToolMessage], + *, + salt: str, + min_block_chars: int, + min_repeat: int, + top_n: int, +) -> list[RepeatedBlock]: + counts: dict[str, dict] = {} + for msg in messages: + seen_in_msg: set[str] = set() + for line in msg.content.splitlines(): + block = line.strip() + if len(block) < min_block_chars: + continue + h = _salted_hash(block, salt) + # Count cross-message recurrence; collapse repeats within one + # message so a single noisy output cannot dominate. 
+ if h in seen_in_msg: + continue + seen_in_msg.add(h) + c = counts.get(h) + if c is None: + counts[h] = {"occurrences": 1, "char_length": len(block)} + else: + c["occurrences"] += 1 + + blocks: list[RepeatedBlock] = [] + for h, c in counts.items(): + if c["occurrences"] < min_repeat: + continue + est = _est_tokens(c["char_length"]) + blocks.append( + RepeatedBlock( + block_hash=h, + occurrences=c["occurrences"], + char_length=c["char_length"], + est_tokens=est, + est_wasted_tokens=est * (c["occurrences"] - 1), + ) + ) + blocks.sort(key=lambda b: b.est_wasted_tokens, reverse=True) + return blocks[:top_n] + + +def summarize_tool_sizes( + messages: Iterable[_ToolMessage], *, large_output_chars: int, top_n: int +) -> list[ToolSizeStat]: + agg: dict[str, dict] = {} + for msg in messages: + name = msg.tool_name or "(unknown)" + length = len(msg.content) + a = agg.get(name) + if a is None: + agg[name] = { + "output_count": 1, + "total_chars": length, + "max_chars": length, + "large_output_count": 1 if length >= large_output_chars else 0, + } + else: + a["output_count"] += 1 + a["total_chars"] += length + a["max_chars"] = max(a["max_chars"], length) + if length >= large_output_chars: + a["large_output_count"] += 1 + + stats: list[ToolSizeStat] = [] + for name, a in agg.items(): + stats.append( + ToolSizeStat( + tool_name=name, + output_count=a["output_count"], + total_chars=a["total_chars"], + max_chars=a["max_chars"], + avg_chars=a["total_chars"] // a["output_count"], + total_est_tokens=_est_tokens(a["total_chars"]), + large_output_count=a["large_output_count"], + ) + ) + stats.sort(key=lambda s: s.total_chars, reverse=True) + return stats[:top_n] + + +# --------------------------------------------------------------------------- +# Build + write +# --------------------------------------------------------------------------- + + +def build_report( + *, + date: str, + since_hours: int, + salt: str, + tool_messages: list[_ToolMessage], + heavy_sessions: list[HeavySession], + 
telemetry: TelemetryCoverage, + min_block_chars: int = DEFAULT_MIN_BLOCK_CHARS, + min_block_repeat: int = DEFAULT_MIN_BLOCK_REPEAT, + large_output_chars: int = DEFAULT_LARGE_OUTPUT_CHARS, + top_n: int = DEFAULT_TOP_N, +) -> OpportunityReport: + dups = detect_exact_duplicate_tool_outputs(tool_messages, salt=salt, top_n=top_n) + blocks = detect_repeated_blocks( + tool_messages, + salt=salt, + min_block_chars=min_block_chars, + min_repeat=min_block_repeat, + top_n=top_n, + ) + sizes = summarize_tool_sizes( + tool_messages, large_output_chars=large_output_chars, top_n=top_n + ) + + total_chars = sum(len(m.content) for m in tool_messages) + dup_wasted = sum(d.est_wasted_tokens for d in dups) + block_wasted = sum(b.est_wasted_tokens for b in blocks) + + notes = [ + "content-aware analysis: message/tool text was hashed in-memory only and never written to reports", + "all identifiers are salted SHA-256 fingerprints; counters are aggregates", + "wasted-token figures are heuristic estimates (chars/4); validate before acting", + "session 'source' and 'tool_name' are emitted verbatim as low-cardinality enums, not raw text", + ] + if not tool_messages: + notes.append("no tool-output messages observed in the selected window") + + return OpportunityReport( + date=date, + since_hours=since_hours, + salt_fingerprint=_salt_fingerprint(salt), + tool_message_count=len(tool_messages), + total_tool_output_chars=total_chars, + total_tool_output_est_tokens=_est_tokens(total_chars), + exact_duplicate_groups=dups, + duplicate_tool_output_groups=len(dups), + duplicate_tool_output_wasted_tokens=dup_wasted, + repeated_block_count=len(blocks), + repeated_block_wasted_tokens=block_wasted, + repeated_blocks=blocks, + large_tool_outputs_by_tool=sizes, + heavy_sessions=heavy_sessions, + telemetry=telemetry, + notes=notes, + ) + + +def _assert_no_forbidden_keys(data: dict) -> None: + """Defensive guard: ensure no forbidden raw-content key reached the output.""" + + def walk(obj): + if 
isinstance(obj, dict): + for k, v in obj.items(): + if k in FORBIDDEN_OUTPUT_KEYS: + raise RuntimeError(f"refusing to emit forbidden key: {k}") + walk(v) + elif isinstance(obj, list): + for item in obj: + walk(item) + + walk(data) + + +def write_report(report: OpportunityReport, out_dir: Path) -> tuple[Path, Path]: + out_dir.mkdir(parents=True, exist_ok=True) + data = asdict(report) + _assert_no_forbidden_keys(data) + + json_path = out_dir / f"opportunities_{report.date}.json" + md_path = out_dir / f"opportunities_{report.date}.md" + json_path.write_text( + json.dumps(data, ensure_ascii=False, indent=2) + "\n", encoding="utf-8" + ) + + t = report.telemetry + md = [ + f"# ContextPilot Hermes opportunity scan — {report.date}", + "", + f"Window: last {report.since_hours}h", + f"Salt fingerprint: `{report.salt_fingerprint}`", + "", + "## Summary", + f"- Tool-output messages: {report.tool_message_count}", + f"- Total tool-output tokens (est): {report.total_tool_output_est_tokens}", + f"- Exact duplicate groups: {report.duplicate_tool_output_groups} " + f"(~{report.duplicate_tool_output_wasted_tokens} wasted tokens)", + f"- Repeated blocks: {report.repeated_block_count} " + f"(~{report.repeated_block_wasted_tokens} wasted tokens)", + f"- Telemetry: {t.events} events, ~{t.tokens_saved} tokens saved, " + f"coverage {t.coverage_ratio_pct}%", + "", + "## Top exact-duplicate tool outputs", + ] + for d in report.exact_duplicate_groups: + md.append( + f"- `{d.content_hash}` tool={d.tool_name} x{d.occurrences} " + f"chars={d.char_length} ~wasted={d.est_wasted_tokens} tokens" + ) + md.append("") + md.append("## Top repeated blocks") + for b in report.repeated_blocks: + md.append( + f"- `{b.block_hash}` x{b.occurrences} chars={b.char_length} " + f"~wasted={b.est_wasted_tokens} tokens" + ) + md.append("") + md.append("## Large tool outputs by tool") + for s in report.large_tool_outputs_by_tool: + md.append( + f"- {s.tool_name}: count={s.output_count} total_chars={s.total_chars} " + 
f"max={s.max_chars} avg={s.avg_chars} large(>=thresh)={s.large_output_count}" + ) + md.append("") + md.append("## Heavy sessions (hashed)") + for h in report.heavy_sessions: + md.append( + f"- `{h.session_hash}` source={h.source} input={h.input_tokens} " + f"output={h.output_tokens} msgs={h.message_count} tools={h.tool_call_count} " + f"apis={h.api_call_count}" + ) + md.append("") + md.append("## Telemetry coverage") + md.extend( + [ + f"- Events: {t.events}", + f"- Tokens saved: {t.tokens_saved} (chars {t.chars_saved})", + f"- Avg tokens saved / event: {t.avg_tokens_saved_per_event}", + f"- Coverage ratio: {t.coverage_ratio_pct}%", + f"- Malformed records skipped: {t.malformed_records_skipped}", + ] + ) + md.append("") + md.append("## Notes") + for note in report.notes: + md.append(f"- {note}") + md_path.write_text("\n".join(md) + "\n", encoding="utf-8") + return json_path, md_path + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--state-db", type=Path, default=Path("/root/.hermes/state.db")) + parser.add_argument( + "--telemetry-file", + type=Path, + default=Path.home() / ".hermes" / "contextpilot" / "telemetry.jsonl", + help="metadata-only ContextPilot telemetry file", + ) + parser.add_argument( + "--out-dir", type=Path, default=Path.home() / "contextpilot" / "opportunities" + ) + parser.add_argument("--since-hours", type=int, default=24) + parser.add_argument( + "--salt", + default="contextpilot-hermes-opportunity-v1", + help="salt for stable per-install content/session fingerprints", + ) + parser.add_argument("--date", default=dt.date.today().isoformat()) + parser.add_argument("--min-block-chars", type=int, default=DEFAULT_MIN_BLOCK_CHARS) + parser.add_argument("--min-block-repeat", type=int, default=DEFAULT_MIN_BLOCK_REPEAT) + parser.add_argument( + "--large-output-chars", type=int, default=DEFAULT_LARGE_OUTPUT_CHARS + ) + parser.add_argument("--top-n", type=int, default=DEFAULT_TOP_N) + args = 
parser.parse_args() + + if not args.state_db.exists(): + raise SystemExit(f"Hermes state DB not found: {args.state_db}") + + # Harden for unattended cron use: never dump a traceback (which would echo + # the DB path / SQL); emit only the exception class name and a non-zero code. + try: + tool_messages = load_tool_messages(args.state_db, since_hours=args.since_hours) + heavy_sessions = load_heavy_sessions( + args.state_db, since_hours=args.since_hours, salt=args.salt, top_n=args.top_n + ) + total_input = total_input_tokens(args.state_db, since_hours=args.since_hours) + telemetry = parse_telemetry( + args.telemetry_file, + since_hours=args.since_hours, + total_input_tokens=total_input, + ) + report = build_report( + date=args.date, + since_hours=args.since_hours, + salt=args.salt, + tool_messages=tool_messages, + heavy_sessions=heavy_sessions, + telemetry=telemetry, + min_block_chars=args.min_block_chars, + min_block_repeat=args.min_block_repeat, + large_output_chars=args.large_output_chars, + top_n=args.top_n, + ) + json_path, md_path = write_report(report, args.out_dir) + except Exception as exc: # noqa: BLE001 - cron-safe: report class only, no payload + print(json.dumps({"ok": False, "error": type(exc).__name__})) + return 1 + + print( + json.dumps( + {"ok": True, "json": str(json_path), "markdown": str(md_path)}, + ensure_ascii=False, + ) + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_hermes_context_opportunity_analyzer.py b/tests/test_hermes_context_opportunity_analyzer.py new file mode 100644 index 0000000..0c18e63 --- /dev/null +++ b/tests/test_hermes_context_opportunity_analyzer.py @@ -0,0 +1,222 @@ +import importlib.util +import json +import sqlite3 +import sys +from pathlib import Path + + +MODULE_PATH = ( + Path(__file__).resolve().parents[1] + / "scripts" + / "analyze_hermes_context_opportunities.py" +) +spec = importlib.util.spec_from_file_location( + "analyze_hermes_context_opportunities", MODULE_PATH 
+) +analyzer = importlib.util.module_from_spec(spec) +sys.modules[spec.name] = analyzer +spec.loader.exec_module(analyzer) + + +FAR_FUTURE = 4102444800.0 # 2100-01-01, always inside a generous test window +WIDE_WINDOW = 24 * 365 * 100 + + +def _make_db(path: Path, messages, *, sessions=None): + conn = sqlite3.connect(path) + conn.execute( + """ + CREATE TABLE sessions ( + id TEXT PRIMARY KEY, + source TEXT, + started_at REAL NOT NULL, + ended_at REAL, + message_count INTEGER DEFAULT 0, + tool_call_count INTEGER DEFAULT 0, + input_tokens INTEGER DEFAULT 0, + output_tokens INTEGER DEFAULT 0, + api_call_count INTEGER DEFAULT 0, + archived INTEGER NOT NULL DEFAULT 0, + system_prompt TEXT + ) + """ + ) + conn.execute( + """ + CREATE TABLE messages ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id TEXT NOT NULL, + role TEXT NOT NULL, + content TEXT, + tool_name TEXT, + reasoning TEXT, + timestamp REAL NOT NULL + ) + """ + ) + # tuple layout: (id, source, _placeholder, tool_call_count, message_count, + # input_tokens, output_tokens, api_call_count, system_prompt) + for s in sessions or [ + ("raw-session-id", "discord", None, 4, 6, 1000, 200, 3, "SECRET SYSTEM PROMPT") + ]: + conn.execute( + """ + INSERT INTO sessions ( + id, source, started_at, tool_call_count, message_count, + input_tokens, output_tokens, api_call_count, archived, system_prompt + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, 0, ?) 
+ """, + (s[0], s[1], FAR_FUTURE, s[3], s[4], s[5], s[6], s[7], s[8]), + ) + for role, content, tool_name in messages: + conn.execute( + "INSERT INTO messages (session_id, role, content, tool_name, reasoning, timestamp)" + " VALUES (?, ?, ?, ?, ?, ?)", + ("raw-session-id", role, content, tool_name, "PRIVATE REASONING", FAR_FUTURE), + ) + conn.commit() + conn.close() + + +def _analyze(db, tmp_path, telemetry=None, salt="test-salt"): + tool_messages = analyzer.load_tool_messages(db, since_hours=WIDE_WINDOW) + heavy = analyzer.load_heavy_sessions( + db, since_hours=WIDE_WINDOW, salt=salt, top_n=20 + ) + total_input = sum(h.input_tokens for h in heavy) + tel = analyzer.parse_telemetry( + telemetry if telemetry is not None else tmp_path / "none.jsonl", + since_hours=WIDE_WINDOW, + total_input_tokens=total_input, + ) + report = analyzer.build_report( + date="2100-01-01", + since_hours=24, + salt=salt, + tool_messages=tool_messages, + heavy_sessions=heavy, + telemetry=tel, + min_block_repeat=2, + ) + return report + + +def test_no_raw_content_leaks_in_reports(tmp_path): + db = tmp_path / "state.db" + secret = "TOP-SECRET-TOOL-OUTPUT-PAYLOAD-DO-NOT-LEAK " * 10 + _make_db( + db, + [ + ("tool", secret, "Bash"), + ("tool", secret, "Bash"), + ("user", "DO NOT READ ME USER TEXT", None), + ], + ) + report = _analyze(db, tmp_path) + json_path, md_path = analyzer.write_report(report, tmp_path / "out") + + blob = json_path.read_text(encoding="utf-8") + md_path.read_text(encoding="utf-8") + # Raw content, prompts, reasoning, and raw session ids must never appear. + assert "TOP-SECRET-TOOL-OUTPUT-PAYLOAD" not in blob + assert "DO NOT READ ME" not in blob + assert "SECRET SYSTEM PROMPT" not in blob + assert "PRIVATE REASONING" not in blob + assert "raw-session-id" not in blob + # But the duplicate was still detected via hashing. 
+ assert report.duplicate_tool_output_groups == 1 + assert report.heavy_sessions[0].session_hash != "raw-session-id" + + +def test_exact_duplicate_tool_outputs_counted(tmp_path): + db = tmp_path / "state.db" + payload = "identical output line one\nidentical output line two\n" * 3 + _make_db( + db, + [ + ("tool", payload, "Read"), + ("tool", payload, "Read"), + ("tool", payload, "Read"), + ], + ) + report = _analyze(db, tmp_path) + assert report.duplicate_tool_output_groups == 1 + group = report.exact_duplicate_groups[0] + assert group.occurrences == 3 + assert group.tool_name == "Read" + # Two of the three sends are pure waste. + assert group.est_wasted_tokens == group.est_tokens * 2 + assert report.duplicate_tool_output_wasted_tokens == group.est_wasted_tokens + + +def test_near_or_different_content_not_exact_duplicate(tmp_path): + db = tmp_path / "state.db" + base = "the quick brown fox jumps over the lazy dog " * 5 + near = base + "X" # one char different -> different hash + other = "completely unrelated tool output content here " * 5 + _make_db( + db, + [ + ("tool", base, "Bash"), + ("tool", near, "Bash"), + ("tool", other, "Bash"), + ], + ) + report = _analyze(db, tmp_path) + # No two outputs are byte-identical -> zero exact-duplicate groups. 
+ assert report.duplicate_tool_output_groups == 0 + assert report.duplicate_tool_output_wasted_tokens == 0 + + +def test_malformed_telemetry_tolerated(tmp_path): + db = tmp_path / "state.db" + _make_db(db, [("tool", "some output", "Bash")]) + tel = tmp_path / "telemetry.jsonl" + tel.write_text( + "\n".join( + [ + json.dumps({"ts": FAR_FUTURE, "chars_saved": 400, "tokens_saved": 100}), + json.dumps({"ts": FAR_FUTURE, "chars_saved": 200}), # missing tokens_saved + "this is not json at all", + json.dumps([1, 2, 3]), # not a dict + json.dumps({"ts": FAR_FUTURE, "note": "no counters here"}), + "", + ] + ) + + "\n", + encoding="utf-8", + ) + report = _analyze(db, tmp_path, telemetry=tel) + t = report.telemetry + # Two valid records aggregated; second infers tokens from chars (200//4=50). + assert t.events == 2 + assert t.chars_saved == 600 + assert t.tokens_saved == 150 + # Non-json, non-dict, and missing-counter lines are skipped, not fatal. + assert t.malformed_records_skipped == 3 + assert t.coverage_ratio_pct > 0 + + +def test_repeated_blocks_and_large_outputs(tmp_path): + db = tmp_path / "state.db" + shared = "this shared boilerplate block is long enough to be fingerprinted" + big = "x" * 9000 + _make_db( + db, + [ + ("tool", shared + "\nunique tail alpha that is also sufficiently long here", "Bash"), + ("tool", shared + "\nunique tail beta that is also sufficiently long here", "Bash"), + ("tool", big, "Read"), + ], + ) + report = _analyze(db, tmp_path) + assert any(b.occurrences >= 2 for b in report.repeated_blocks) + read_stat = next(s for s in report.large_tool_outputs_by_tool if s.tool_name == "Read") + assert read_stat.large_output_count == 1 + + +def test_missing_telemetry_file_is_safe(tmp_path): + db = tmp_path / "state.db" + _make_db(db, [("tool", "out", "Bash")]) + report = _analyze(db, tmp_path, telemetry=tmp_path / "nope.jsonl") + assert report.telemetry.events == 0 + assert report.telemetry.malformed_records_skipped == 0 From 
992cd1ad8635f408c5f3848ee8d7fe6d39047039 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 11 Jun 2026 11:51:06 +0200 Subject: [PATCH 4/9] feat: analyze LLM-bound context redundancy --- docs/guides/hermes-monitor.md | 36 ++ .../analyze_hermes_context_opportunities.py | 409 +++++++++++++++++- ...est_hermes_context_opportunity_analyzer.py | 236 +++++++++- 3 files changed, 658 insertions(+), 23 deletions(-) diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md index 4d20d43..8798885 100644 --- a/docs/guides/hermes-monitor.md +++ b/docs/guides/hermes-monitor.md @@ -69,12 +69,48 @@ It surfaces concrete token-reduction opportunities: - heavy sessions by input-token / tool-call / message counts (hashed ids), - ContextPilot telemetry coverage and savings ratios. +### LLM-bound block redundancy + +The analyzer also performs an **LLM-bound block scan** that looks *only* at +content Hermes would actually send to a model, and reports where the same block +is paid for more than once: + +- `sessions.system_prompt`, classified heuristically as `system_prompt` or + `skill_prompt` (skill frontmatter / "use this skill" style cues), +- active `messages.content` for roles `system` / `user` / `assistant` / `tool`, + bucketed as `user_prompt`, `assistant_context`, `tool_result`, etc., +- tool-result messages (`role='tool'` or `tool_name` set) as `tool_result`. + +Inactive messages are skipped when an `active` column exists, and archived +sessions (and their messages) are skipped when an `archived` column exists. Each +block is split line-wise, fingerprinted with a salted SHA-256 hash, and +aggregated. The report then shows: + +- **redundancy by block type** — per-type block / unique / repeated counts and + estimated redundant tokens, +- **cross-type repeated blocks** — the headline signal: a single fingerprint + observed in 2+ block types (e.g. the same chunk shipped from a skill/system + prompt *and* a tool result *and* a user prompt). 
Reported only as a hash plus + per-type counters — never the raw text. + +Use `--all-sessions` to ignore the `--since-hours` window and scan **all** +non-archived sessions and active messages (useful for a one-shot, whole-history +audit rather than a rolling daily window): + ```bash +# rolling daily window python scripts/analyze_hermes_context_opportunities.py \ --state-db /root/.hermes/state.db \ --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl \ --out-dir ~/contextpilot/opportunities \ --since-hours 24 + +# whole-history audit across every session and LLM-bound block +python scripts/analyze_hermes_context_opportunities.py \ + --state-db /root/.hermes/state.db \ + --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl \ + --out-dir ~/contextpilot/opportunities \ + --all-sessions ``` Outputs: diff --git a/scripts/analyze_hermes_context_opportunities.py b/scripts/analyze_hermes_context_opportunities.py index 780315a..5bcd3b1 100644 --- a/scripts/analyze_hermes_context_opportunities.py +++ b/scripts/analyze_hermes_context_opportunities.py @@ -86,6 +86,54 @@ class RepeatedBlock: est_wasted_tokens: int # (n-1) * est_tokens +# Recognized LLM-bound block types. These are low-cardinality enums, safe to +# emit verbatim (they describe the *origin* of a block, never its text). 
+BLOCK_TYPES = ( + "system_prompt", + "skill_prompt", + "user_prompt", + "assistant_context", + "tool_result", + "unknown", +) + + +@dataclass +class TypeCount: + block_type: str + count: int + + +@dataclass +class BlockTypeStat: + """Aggregate redundancy within a single LLM-bound block type.""" + + block_type: str + item_count: int # source items (prompts/messages) of this type + block_count: int # total fingerprintable block instances + unique_block_count: int # distinct fingerprints + repeated_block_count: int # fingerprints recurring >= min_repeat within type + est_redundant_tokens: int # sum over repeats of (occ-1) * est_tokens + + +@dataclass +class CrossTypeBlockGroup: + """A single block fingerprint observed in 2+ distinct block types. + + This is the headline signal: the same chunk of text is being shipped to the + LLM from, e.g., a skill/system prompt *and* a tool result, so it is paying + for the same tokens twice from different sources. + """ + + block_hash: str + block_types: list[str] # sorted distinct types this block spans + type_occurrences: list[TypeCount] # per-type occurrence counts + occurrences: int # total occurrences across all types + char_length: int + est_tokens: int + est_wasted_tokens: int # (occurrences - 1) * est_tokens + + @dataclass class ToolSizeStat: tool_name: str @@ -122,6 +170,7 @@ class TelemetryCoverage: class OpportunityReport: date: str since_hours: int + all_sessions: bool salt_fingerprint: str tool_message_count: int total_tool_output_chars: int @@ -135,6 +184,11 @@ class OpportunityReport: large_tool_outputs_by_tool: list[ToolSizeStat] heavy_sessions: list[HeavySession] telemetry: TelemetryCoverage + # LLM-bound block analysis (system/skill prompts, prompts, tool results). 
+ llm_bound_item_count: int + llm_block_types: list[BlockTypeStat] + cross_type_block_groups: list[CrossTypeBlockGroup] + cross_type_wasted_tokens: int notes: list[str] = field(default_factory=list) @@ -149,16 +203,75 @@ class _ToolMessage: content: str +@dataclass +class _LLMContent: + """A chunk of content that Hermes would actually send to the LLM. + + Held in-memory only for hashing; ``content`` must never be emitted. + """ + + block_type: str + content: str + + +def _window_cutoff(since_hours: int, all_sessions: bool) -> float | None: + """Return the epoch cutoff, or ``None`` to scan all history. + + ``all_sessions=True`` disables the time window so old sessions/messages are + included regardless of ``since_hours``. + """ + if all_sessions: + return None + return dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600 + + +def _classify_system_prompt(text: str) -> str: + """Heuristically label a system prompt as skill material or a plain prompt. + + Operates on in-memory text only; returns a low-cardinality enum, never the + text itself. + """ + low = text.lower() + stripped = low.lstrip() + # Skill-style frontmatter block (e.g. "---\nname: ...\ndescription: ..."). + if stripped.startswith("---") and "name:" in low[:300]: + return "skill_prompt" + cues = ( + "use this skill", + "available skills", + "when to use", + "invoke it via skill", + " str: + if role == "tool" or tool_name is not None: + return "tool_result" + if role == "user": + return "user_prompt" + if role == "assistant": + return "assistant_context" + if role == "system": + return "system_prompt" + return "unknown" + + def load_tool_messages( - db_path: Path, *, since_hours: int + db_path: Path, *, since_hours: int, all_sessions: bool = False ) -> list[_ToolMessage]: """Load tool-output messages within the window. Content is returned for in-memory hashing only; callers must not emit it. A message is treated as tool output when ``role='tool'`` or ``tool_name`` - is set. + is set. 
With ``all_sessions=True`` the time window is ignored. """ - cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600 + cutoff = _window_cutoff(since_hours, all_sessions) conn = _connect_readonly(db_path) try: cols = {row[1] for row in conn.execute("PRAGMA table_info(messages)")} @@ -169,9 +282,11 @@ def load_tool_messages( select_tool = "tool_name" if has_tool_name else "NULL AS tool_name" where = [] params: list[object] = [] - if has_ts: + if has_ts and cutoff is not None: where.append("timestamp >= ?") params.append(cutoff) + if "active" in cols: + where.append("active = 1") tool_pred = "role = 'tool'" if has_tool_name: tool_pred = "(role = 'tool' OR tool_name IS NOT NULL)" @@ -192,10 +307,92 @@ def load_tool_messages( return out +def load_llm_bound_content( + db_path: Path, *, since_hours: int, all_sessions: bool = False +) -> list[_LLMContent]: + """Load only content Hermes would actually send to an LLM. + + Sources, all read in-memory for hashing (never emitted): + * ``sessions.system_prompt`` -> ``system_prompt`` or ``skill_prompt``, + * ``messages.content`` for active messages with role in + ``system``/``user``/``assistant``/``tool`` -> per-role block type, + * tool-result messages (role=tool or ``tool_name`` set) -> ``tool_result``. + + Inactive messages are skipped when an ``active`` column exists; archived + sessions (and their messages) are skipped when an ``archived`` column + exists. With ``all_sessions=True`` the time window is ignored. 
+ """ + cutoff = _window_cutoff(since_hours, all_sessions) + conn = _connect_readonly(db_path) + out: list[_LLMContent] = [] + try: + scols = {row[1] for row in conn.execute("PRAGMA table_info(sessions)")} + mcols = {row[1] for row in conn.execute("PRAGMA table_info(messages)")} + + # --- system / skill prompts from sessions ------------------------- + if "system_prompt" in scols: + where = ["system_prompt IS NOT NULL"] + params: list[object] = [] + if cutoff is not None and "started_at" in scols: + where.append("started_at >= ?") + params.append(cutoff) + if "archived" in scols: + where.append("archived = 0") + sql = f"SELECT system_prompt FROM sessions WHERE {' AND '.join(where)}" + for (sp,) in conn.execute(sql, params): + if sp is None: + continue + text = str(sp) + out.append( + _LLMContent(block_type=_classify_system_prompt(text), content=text) + ) + + # --- active messages bound for the LLM ---------------------------- + if "content" in mcols: + has_role = "role" in mcols + has_tool_name = "tool_name" in mcols + select = [ + "messages.role" if has_role else "NULL AS role", + "messages.content", + "messages.tool_name" if has_tool_name else "NULL AS tool_name", + ] + where = ["messages.content IS NOT NULL"] + params = [] + if has_role: + where.append( + "messages.role IN ('system', 'user', 'assistant', 'tool')" + ) + if cutoff is not None and "timestamp" in mcols: + where.append("messages.timestamp >= ?") + params.append(cutoff) + if "active" in mcols: + where.append("messages.active = 1") + join = "" + if "archived" in scols and "session_id" in mcols and "id" in scols: + join = " JOIN sessions ON sessions.id = messages.session_id" + where.append("sessions.archived = 0") + sql = ( + f"SELECT {', '.join(select)} FROM messages{join} " + f"WHERE {' AND '.join(where)}" + ) + for role, content, tool_name in conn.execute(sql, params): + if content is None: + continue + out.append( + _LLMContent( + block_type=_message_block_type(role, tool_name), + 
content=str(content), + ) + ) + finally: + conn.close() + return out + + def load_heavy_sessions( - db_path: Path, *, since_hours: int, salt: str, top_n: int + db_path: Path, *, since_hours: int, salt: str, top_n: int, all_sessions: bool = False ) -> list[HeavySession]: - cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600 + cutoff = _window_cutoff(since_hours, all_sessions) conn = _connect_readonly(db_path) try: cols = {row[1] for row in conn.execute("PRAGMA table_info(sessions)")} @@ -213,7 +410,7 @@ def load_heavy_sessions( select_cols = [c if c in cols else f"NULL AS {c}" for c in wanted] where = [] params: list[object] = [] - if "started_at" in cols: + if cutoff is not None and "started_at" in cols: where.append("started_at >= ?") params.append(cutoff) if "archived" in cols: @@ -243,9 +440,11 @@ def load_heavy_sessions( return sessions[:top_n] -def total_input_tokens(db_path: Path, *, since_hours: int) -> int: +def total_input_tokens( + db_path: Path, *, since_hours: int, all_sessions: bool = False +) -> int: """Sum input tokens across ALL in-window sessions (not just the top-N).""" - cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600 + cutoff = _window_cutoff(since_hours, all_sessions) conn = _connect_readonly(db_path) try: cols = {row[1] for row in conn.execute("PRAGMA table_info(sessions)")} @@ -253,7 +452,7 @@ def total_input_tokens(db_path: Path, *, since_hours: int) -> int: return 0 where = [] params: list[object] = [] - if "started_at" in cols: + if cutoff is not None and "started_at" in cols: where.append("started_at >= ?") params.append(cutoff) if "archived" in cols: @@ -268,19 +467,24 @@ def total_input_tokens(db_path: Path, *, since_hours: int) -> int: def parse_telemetry( - telemetry_path: Path, *, since_hours: int, total_input_tokens: int + telemetry_path: Path, + *, + since_hours: int, + total_input_tokens: int, + all_sessions: bool = False, ) -> TelemetryCoverage: """Aggregate the metadata-only 
ContextPilot telemetry file. Tolerates malformed lines (non-JSON, non-dict, missing counters) by - skipping and counting them. Never reads message content. + skipping and counting them. Never reads message content. With + ``all_sessions=True`` the time window is ignored. """ events = 0 chars = 0 tokens = 0 malformed = 0 if telemetry_path and telemetry_path.exists(): - cutoff = dt.datetime.now(dt.timezone.utc).timestamp() - since_hours * 3600 + cutoff = _window_cutoff(since_hours, all_sessions) with telemetry_path.open("r", encoding="utf-8", errors="replace") as f: for raw in f: line = raw.strip() @@ -295,7 +499,7 @@ def parse_telemetry( malformed += 1 continue ts = record.get("ts") - if isinstance(ts, (int, float)) and ts < cutoff: + if cutoff is not None and isinstance(ts, (int, float)) and ts < cutoff: continue cs = record.get("chars_saved") if not isinstance(cs, (int, float)): @@ -452,6 +656,110 @@ def summarize_tool_sizes( return stats[:top_n] +def _iter_blocks(content: str, min_block_chars: int) -> Iterable[str]: + """Yield the distinct fingerprintable lines of one item (deduped in-item).""" + seen: set[str] = set() + for line in content.splitlines(): + block = line.strip() + if len(block) < min_block_chars: + continue + if block in seen: + continue + seen.add(block) + yield block + + +def analyze_llm_bound_blocks( + contents: Iterable[_LLMContent], + *, + salt: str, + min_block_chars: int, + min_repeat: int, + top_n: int, +) -> tuple[list[BlockTypeStat], list[CrossTypeBlockGroup]]: + """Fingerprint LLM-bound blocks and report redundancy. + + Returns (per-type stats, cross-type repeated block groups). All output is + salted hashes / counters / block-type enums -- no raw text. 
+ """ + # block_hash -> {char_length, types: {block_type: occ}} + agg: dict[str, dict] = {} + # block_type -> source item count + item_counts: dict[str, int] = {} + + for item in contents: + bt = item.block_type + item_counts[bt] = item_counts.get(bt, 0) + 1 + for block in _iter_blocks(item.content, min_block_chars): + h = _salted_hash(block, salt) + entry = agg.get(h) + if entry is None: + agg[h] = {"char_length": len(block), "types": {bt: 1}} + else: + entry["types"][bt] = entry["types"].get(bt, 0) + 1 + + # --- per block-type aggregate redundancy ------------------------------ + per_type: dict[str, dict] = {} + for entry in agg.values(): + est = _est_tokens(entry["char_length"]) + for bt, occ in entry["types"].items(): + t = per_type.setdefault( + bt, + { + "block_count": 0, + "unique": 0, + "repeated": 0, + "redundant_tokens": 0, + }, + ) + t["block_count"] += occ + t["unique"] += 1 + if occ >= min_repeat: + t["repeated"] += 1 + t["redundant_tokens"] += est * (occ - 1) + + block_type_stats: list[BlockTypeStat] = [] + for bt in sorted(set(per_type) | set(item_counts)): + t = per_type.get( + bt, {"block_count": 0, "unique": 0, "repeated": 0, "redundant_tokens": 0} + ) + block_type_stats.append( + BlockTypeStat( + block_type=bt, + item_count=item_counts.get(bt, 0), + block_count=t["block_count"], + unique_block_count=t["unique"], + repeated_block_count=t["repeated"], + est_redundant_tokens=t["redundant_tokens"], + ) + ) + + # --- cross-type repeated blocks --------------------------------------- + cross: list[CrossTypeBlockGroup] = [] + for h, entry in agg.items(): + types = entry["types"] + if len(types) < 2: + continue + total_occ = sum(types.values()) + est = _est_tokens(entry["char_length"]) + cross.append( + CrossTypeBlockGroup( + block_hash=h, + block_types=sorted(types.keys()), + type_occurrences=[ + TypeCount(block_type=bt, count=occ) + for bt, occ in sorted(types.items()) + ], + occurrences=total_occ, + char_length=entry["char_length"], + est_tokens=est, 
+ est_wasted_tokens=est * (total_occ - 1), + ) + ) + cross.sort(key=lambda g: g.est_wasted_tokens, reverse=True) + return block_type_stats, cross[:top_n] + + # --------------------------------------------------------------------------- # Build + write # --------------------------------------------------------------------------- @@ -465,6 +773,8 @@ def build_report( tool_messages: list[_ToolMessage], heavy_sessions: list[HeavySession], telemetry: TelemetryCoverage, + llm_contents: list[_LLMContent] | None = None, + all_sessions: bool = False, min_block_chars: int = DEFAULT_MIN_BLOCK_CHARS, min_block_repeat: int = DEFAULT_MIN_BLOCK_REPEAT, large_output_chars: int = DEFAULT_LARGE_OUTPUT_CHARS, @@ -482,22 +792,38 @@ def build_report( tool_messages, large_output_chars=large_output_chars, top_n=top_n ) + llm_contents = llm_contents or [] + block_type_stats, cross_groups = analyze_llm_bound_blocks( + llm_contents, + salt=salt, + min_block_chars=min_block_chars, + min_repeat=min_block_repeat, + top_n=top_n, + ) + total_chars = sum(len(m.content) for m in tool_messages) dup_wasted = sum(d.est_wasted_tokens for d in dups) block_wasted = sum(b.est_wasted_tokens for b in blocks) + cross_wasted = sum(g.est_wasted_tokens for g in cross_groups) notes = [ "content-aware analysis: message/tool text was hashed in-memory only and never written to reports", "all identifiers are salted SHA-256 fingerprints; counters are aggregates", "wasted-token figures are heuristic estimates (chars/4); validate before acting", - "session 'source' and 'tool_name' are emitted verbatim as low-cardinality enums, not raw text", + "session 'source', 'tool_name', and block_type are emitted verbatim as low-cardinality enums, not raw text", + "llm-bound scan covers only content sent to the LLM: system/skill prompts, active user/assistant/tool messages", ] + if all_sessions: + notes.append("all-sessions mode: time window ignored; scanned all non-archived sessions/active messages") if not tool_messages: 
notes.append("no tool-output messages observed in the selected window") + if not llm_contents: + notes.append("no llm-bound content observed in the selected window") return OpportunityReport( date=date, since_hours=since_hours, + all_sessions=all_sessions, salt_fingerprint=_salt_fingerprint(salt), tool_message_count=len(tool_messages), total_tool_output_chars=total_chars, @@ -511,6 +837,10 @@ def build_report( large_tool_outputs_by_tool=sizes, heavy_sessions=heavy_sessions, telemetry=telemetry, + llm_bound_item_count=len(llm_contents), + llm_block_types=block_type_stats, + cross_type_block_groups=cross_groups, + cross_type_wasted_tokens=cross_wasted, notes=notes, ) @@ -543,10 +873,11 @@ def write_report(report: OpportunityReport, out_dir: Path) -> tuple[Path, Path]: ) t = report.telemetry + window = "all sessions (no time window)" if report.all_sessions else f"last {report.since_hours}h" md = [ f"# ContextPilot Hermes opportunity scan — {report.date}", "", - f"Window: last {report.since_hours}h", + f"Window: {window}", f"Salt fingerprint: `{report.salt_fingerprint}`", "", "## Summary", @@ -556,11 +887,30 @@ def write_report(report: OpportunityReport, out_dir: Path) -> tuple[Path, Path]: f"(~{report.duplicate_tool_output_wasted_tokens} wasted tokens)", f"- Repeated blocks: {report.repeated_block_count} " f"(~{report.repeated_block_wasted_tokens} wasted tokens)", + f"- LLM-bound items scanned: {report.llm_bound_item_count}", + f"- Cross-type repeated blocks: {len(report.cross_type_block_groups)} " + f"(~{report.cross_type_wasted_tokens} wasted tokens)", f"- Telemetry: {t.events} events, ~{t.tokens_saved} tokens saved, " f"coverage {t.coverage_ratio_pct}%", "", - "## Top exact-duplicate tool outputs", + "## LLM-bound redundancy by block type", ] + for bt in report.llm_block_types: + md.append( + f"- {bt.block_type}: items={bt.item_count} blocks={bt.block_count} " + f"unique={bt.unique_block_count} repeated={bt.repeated_block_count} " + 
f"~redundant={bt.est_redundant_tokens} tokens" + ) + md.append("") + md.append("## Cross-type repeated blocks (same block, multiple sources)") + for g in report.cross_type_block_groups: + spread = ", ".join(f"{tc.block_type}x{tc.count}" for tc in g.type_occurrences) + md.append( + f"- `{g.block_hash}` types=[{', '.join(g.block_types)}] ({spread}) " + f"chars={g.char_length} ~wasted={g.est_wasted_tokens} tokens" + ) + md.append("") + md.append("## Top exact-duplicate tool outputs") for d in report.exact_duplicate_groups: md.append( f"- `{d.content_hash}` tool={d.tool_name} x{d.occurrences} " @@ -620,6 +970,11 @@ def main() -> int: "--out-dir", type=Path, default=Path.home() / "contextpilot" / "opportunities" ) parser.add_argument("--since-hours", type=int, default=24) + parser.add_argument( + "--all-sessions", + action="store_true", + help="ignore --since-hours; scan all non-archived sessions and active messages", + ) parser.add_argument( "--salt", default="contextpilot-hermes-opportunity-v1", @@ -640,15 +995,27 @@ def main() -> int: # Harden for unattended cron use: never dump a traceback (which would echo # the DB path / SQL); emit only the exception class name and a non-zero code. 
try: - tool_messages = load_tool_messages(args.state_db, since_hours=args.since_hours) + tool_messages = load_tool_messages( + args.state_db, since_hours=args.since_hours, all_sessions=args.all_sessions + ) + llm_contents = load_llm_bound_content( + args.state_db, since_hours=args.since_hours, all_sessions=args.all_sessions + ) heavy_sessions = load_heavy_sessions( - args.state_db, since_hours=args.since_hours, salt=args.salt, top_n=args.top_n + args.state_db, + since_hours=args.since_hours, + salt=args.salt, + top_n=args.top_n, + all_sessions=args.all_sessions, + ) + total_input = total_input_tokens( + args.state_db, since_hours=args.since_hours, all_sessions=args.all_sessions ) - total_input = total_input_tokens(args.state_db, since_hours=args.since_hours) telemetry = parse_telemetry( args.telemetry_file, since_hours=args.since_hours, total_input_tokens=total_input, + all_sessions=args.all_sessions, ) report = build_report( date=args.date, @@ -657,6 +1024,8 @@ def main() -> int: tool_messages=tool_messages, heavy_sessions=heavy_sessions, telemetry=telemetry, + llm_contents=llm_contents, + all_sessions=args.all_sessions, min_block_chars=args.min_block_chars, min_block_repeat=args.min_block_repeat, large_output_chars=args.large_output_chars, diff --git a/tests/test_hermes_context_opportunity_analyzer.py b/tests/test_hermes_context_opportunity_analyzer.py index 0c18e63..649288a 100644 --- a/tests/test_hermes_context_opportunity_analyzer.py +++ b/tests/test_hermes_context_opportunity_analyzer.py @@ -78,16 +78,22 @@ def _make_db(path: Path, messages, *, sessions=None): conn.close() -def _analyze(db, tmp_path, telemetry=None, salt="test-salt"): - tool_messages = analyzer.load_tool_messages(db, since_hours=WIDE_WINDOW) +def _analyze(db, tmp_path, telemetry=None, salt="test-salt", all_sessions=False): + tool_messages = analyzer.load_tool_messages( + db, since_hours=WIDE_WINDOW, all_sessions=all_sessions + ) + llm_contents = analyzer.load_llm_bound_content( + db, 
since_hours=WIDE_WINDOW, all_sessions=all_sessions + ) heavy = analyzer.load_heavy_sessions( - db, since_hours=WIDE_WINDOW, salt=salt, top_n=20 + db, since_hours=WIDE_WINDOW, salt=salt, top_n=20, all_sessions=all_sessions ) total_input = sum(h.input_tokens for h in heavy) tel = analyzer.parse_telemetry( telemetry if telemetry is not None else tmp_path / "none.jsonl", since_hours=WIDE_WINDOW, total_input_tokens=total_input, + all_sessions=all_sessions, ) report = analyzer.build_report( date="2100-01-01", @@ -96,6 +102,8 @@ def _analyze(db, tmp_path, telemetry=None, salt="test-salt"): tool_messages=tool_messages, heavy_sessions=heavy, telemetry=tel, + llm_contents=llm_contents, + all_sessions=all_sessions, min_block_repeat=2, ) return report @@ -220,3 +228,225 @@ def test_missing_telemetry_file_is_safe(tmp_path): report = _analyze(db, tmp_path, telemetry=tmp_path / "nope.jsonl") assert report.telemetry.events == 0 assert report.telemetry.malformed_records_skipped == 0 + + +# --------------------------------------------------------------------------- +# LLM-bound block analysis + all-sessions tests +# --------------------------------------------------------------------------- + +OLD_TS = 1_000_000_000.0 # 2001 — far outside any normal recent window + + +def _make_db_ex(path, *, sessions, messages, message_active_col=False): + """Flexible builder: custom timestamps, optional messages.active column.""" + conn = sqlite3.connect(path) + conn.execute( + """ + CREATE TABLE sessions ( + id TEXT PRIMARY KEY, + source TEXT, + started_at REAL NOT NULL, + input_tokens INTEGER DEFAULT 0, + output_tokens INTEGER DEFAULT 0, + message_count INTEGER DEFAULT 0, + tool_call_count INTEGER DEFAULT 0, + api_call_count INTEGER DEFAULT 0, + archived INTEGER NOT NULL DEFAULT 0, + system_prompt TEXT + ) + """ + ) + active_col = ", active INTEGER NOT NULL DEFAULT 1" if message_active_col else "" + conn.execute( + f""" + CREATE TABLE messages ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + 
session_id TEXT NOT NULL, + role TEXT NOT NULL, + content TEXT, + tool_name TEXT, + reasoning TEXT, + timestamp REAL NOT NULL{active_col} + ) + """ + ) + for s in sessions: + conn.execute( + "INSERT INTO sessions (id, source, started_at, input_tokens, archived," + " system_prompt) VALUES (?, ?, ?, ?, ?, ?)", + ( + s["id"], + s.get("source"), + s["started_at"], + s.get("input_tokens", 0), + s.get("archived", 0), + s.get("system_prompt"), + ), + ) + for m in messages: + if message_active_col: + conn.execute( + "INSERT INTO messages (session_id, role, content, tool_name, reasoning," + " timestamp, active) VALUES (?, ?, ?, ?, ?, ?, ?)", + ( + m.get("session_id", "s1"), + m["role"], + m.get("content"), + m.get("tool_name"), + "PRIVATE REASONING", + m.get("timestamp", FAR_FUTURE), + m.get("active", 1), + ), + ) + else: + conn.execute( + "INSERT INTO messages (session_id, role, content, tool_name, reasoning," + " timestamp) VALUES (?, ?, ?, ?, ?, ?)", + ( + m.get("session_id", "s1"), + m["role"], + m.get("content"), + m.get("tool_name"), + "PRIVATE REASONING", + m.get("timestamp", FAR_FUTURE), + ), + ) + conn.commit() + conn.close() + + +def test_all_sessions_includes_old_out_of_window_data(tmp_path): + db = tmp_path / "state.db" + _make_db_ex( + db, + sessions=[ + { + "id": "old-sess", + "source": "discord", + "started_at": OLD_TS, + "input_tokens": 500, + "system_prompt": "old system prompt material that is plenty long here", + } + ], + messages=[ + { + "session_id": "old-sess", + "role": "tool", + "content": "old tool output block sufficiently long to be scanned", + "tool_name": "Bash", + "timestamp": OLD_TS, + }, + { + "session_id": "old-sess", + "role": "user", + "content": "old user prompt text that is also long enough to scan", + "timestamp": OLD_TS, + }, + ], + ) + # A normal recent window excludes the old data entirely. 
+ assert analyzer.load_tool_messages(db, since_hours=24) == [] + assert analyzer.load_llm_bound_content(db, since_hours=24) == [] + assert analyzer.load_heavy_sessions(db, since_hours=24, salt="s", top_n=5) == [] + + # all_sessions ignores the window and picks the old data back up. + assert len(analyzer.load_tool_messages(db, since_hours=24, all_sessions=True)) == 1 + llm = analyzer.load_llm_bound_content(db, since_hours=24, all_sessions=True) + assert len(llm) == 3 # system_prompt + tool_result + user_prompt + assert ( + len( + analyzer.load_heavy_sessions( + db, since_hours=24, salt="s", top_n=5, all_sessions=True + ) + ) + == 1 + ) + + +def test_inactive_messages_skipped(tmp_path): + db = tmp_path / "state.db" + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE}], + messages=[ + { + "role": "tool", + "content": "active tool output that is sufficiently long to fingerprint", + "tool_name": "Bash", + "active": 1, + }, + { + "role": "tool", + "content": "inactive tool output that should be skipped entirely here", + "tool_name": "Bash", + "active": 0, + }, + { + "role": "user", + "content": "inactive user prompt that must also be skipped here", + "active": 0, + }, + ], + message_active_col=True, + ) + # Inactive rows are filtered out of both loaders. + assert len(analyzer.load_tool_messages(db, since_hours=WIDE_WINDOW)) == 1 + llm = analyzer.load_llm_bound_content(db, since_hours=WIDE_WINDOW) + assert sorted(c.block_type for c in llm) == ["tool_result"] + + +def test_skill_prompt_classification(tmp_path): + db = tmp_path / "state.db" + skill_sys = ( + "---\nname: deep-research\ndescription: research harness\n---\n" + "Use this skill when researching a topic." 
+ ) + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE, "system_prompt": skill_sys}], + messages=[], + ) + llm = analyzer.load_llm_bound_content(db, since_hours=WIDE_WINDOW) + assert len(llm) == 1 + assert llm[0].block_type == "skill_prompt" + + +def test_cross_type_redundancy_reported_via_hashes_only(tmp_path): + db = tmp_path / "state.db" + shared = "This is a shared instruction block long enough to fingerprint cleanly." + sys_prompt = "You are a helpful system.\n" + shared + "\nEnd of system prompt." + tool_out = "tool produced this output line\n" + shared + "\nand more tool lines" + user_msg = "user asks the assistant something specific here\n" + shared + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE, "system_prompt": sys_prompt}], + messages=[ + {"role": "tool", "content": tool_out, "tool_name": "Bash"}, + {"role": "user", "content": user_msg}, + ], + ) + report = _analyze(db, tmp_path) + + # The shared block spans system_prompt, tool_result, and user_prompt. + assert len(report.cross_type_block_groups) >= 1 + grp = report.cross_type_block_groups[0] + assert "tool_result" in grp.block_types + assert any(bt in grp.block_types for bt in ("system_prompt", "skill_prompt")) + assert "user_prompt" in grp.block_types + assert grp.occurrences == 3 + # Reported only via salted hash + counters — never the raw block text. + assert shared not in grp.block_hash + assert report.cross_type_wasted_tokens > 0 + + # Per-type block stats are populated for the LLM-bound types. + types_seen = {b.block_type for b in report.llm_block_types} + assert {"tool_result", "user_prompt"} <= types_seen + + # The written report leaks no raw prompt/tool/system text. 
+ json_path, md_path = analyzer.write_report(report, tmp_path / "out") + blob = json_path.read_text(encoding="utf-8") + md_path.read_text(encoding="utf-8") + assert shared not in blob + assert "shared instruction block" not in blob + assert "You are a helpful system" not in blob + assert "user asks the assistant" not in blob + assert "PRIVATE REASONING" not in blob From 9a1e1833d6d82df0ecf4713fdaeebb8d9a1956d4 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 12 Jun 2026 00:48:51 +0200 Subject: [PATCH 5/9] feat: dedup exact cross-role payload blocks --- contextpilot/dedup/block_dedup.py | 303 +++++++++++++++---------- tests/test_payload_cross_role_dedup.py | 215 ++++++++++++++++++ 2 files changed, 393 insertions(+), 125 deletions(-) create mode 100644 tests/test_payload_cross_role_dedup.py diff --git a/contextpilot/dedup/block_dedup.py b/contextpilot/dedup/block_dedup.py index 073ee37..d9e0082 100644 --- a/contextpilot/dedup/block_dedup.py +++ b/contextpilot/dedup/block_dedup.py @@ -232,155 +232,208 @@ def _rebuild_json_content(original: str, key: str, new_text: str) -> str: return original -def dedup_chat_completions( - body: dict, - min_block_chars: int = MIN_BLOCK_CHARS, - min_content_chars: int = MIN_CONTENT_CHARS, - chunk_modulus: int = CHUNK_MODULUS, - system_content: Optional[str] = None, -) -> DedupResult: - messages = body.get("messages") - if not isinstance(messages, list) or not messages: - return DedupResult() +def _account(result: DedupResult, original_len: int, new_len: int) -> None: + """Roll a single field's before/after lengths into the aggregate result.""" + result.chars_before += original_len + result.chars_after += new_len + result.chars_saved += original_len - new_len - tool_names = _build_tool_name_map_openai(messages) - seen_blocks: Dict[str, Tuple[int, str, int]] = {} - pre_seen = _prescan_system_blocks(system_content, min_block_chars, chunk_modulus) - result = DedupResult() - for idx, msg in enumerate(messages): - if not isinstance(msg, dict) 
or msg.get("role") != "tool": - continue +def _register_blocks_only( + text: Optional[str], + seen_blocks: Dict[str, Tuple[int, str, int]], + msg_idx: int, + label: str, + result: DedupResult, + min_block_chars: int, + chunk_modulus: int, +) -> None: + """Register a message's blocks as dedup *sources* without modifying it. - content = msg.get("content", "") - if not isinstance(content, str) or len(content) < min_content_chars: + Used for the canonical first copy — e.g. the system / skill prompt — which + may seed references for later duplicates but must itself stay verbatim. + """ + if not isinstance(text, str) or not text.strip(): + return + for block_idx, block in enumerate(_content_defined_chunking(text, chunk_modulus)): + if len(block.strip()) < min_block_chars: continue + result.blocks_total += 1 + h = _hash_block(block) + if h not in seen_blocks: + seen_blocks[h] = (msg_idx, label, block_idx) - tc_id = msg.get("tool_call_id", "") - fn_name = tool_names.get(tc_id, msg.get("name", "")) or "tool" - # Extract text from JSON-wrapped tool results for proper chunking - extracted_text, json_key = _extract_text_for_dedup(content) - dedup_target = extracted_text if extracted_text else content +def _dedup_string_field( + text: str, + seen_blocks: Dict[str, Tuple[int, str, int]], + msg_idx: int, + label: str, + result: DedupResult, + min_block_chars: int, + chunk_modulus: int, +) -> Optional[str]: + """Dedup one plain-text field against earlier blocks in the same payload.""" + new_text = _dedup_text( + text, seen_blocks, msg_idx, label, result, min_block_chars, chunk_modulus + ) + if new_text is not None: + _account(result, len(text), len(new_text)) + logger.info( + "Block dedup: msg[%d] %s — saved %d chars", + msg_idx, + label, + len(text) - len(new_text), + ) + return new_text - new_content = _dedup_text( - dedup_target, - seen_blocks, - idx, - fn_name, - result, - min_block_chars, - chunk_modulus, - pre_seen=pre_seen, + +def _dedup_assistant_message( + msg: dict, + 
idx: int, + seen_blocks: Dict[str, Tuple[int, str, int]], + result: DedupResult, + min_block_chars: int, + min_content_chars: int, + chunk_modulus: int, +) -> None: + """Dedup assistant content (string or list-of-text-blocks) against earlier blocks.""" + raw = msg.get("content", "") + if isinstance(raw, str): + if len(raw) < min_content_chars: + return + new_content = _dedup_string_field( + raw, seen_blocks, idx, "assistant message", result, + min_block_chars, chunk_modulus, ) if new_content is not None: - if json_key and extracted_text: - # Rebuild the JSON with shortened content field - original_len = len(content) - msg["content"] = _rebuild_json_content(content, json_key, new_content) - new_len = len(msg["content"]) - else: - original_len = len(content) - msg["content"] = new_content - new_len = len(new_content) - result.chars_before += original_len - result.chars_after += new_len - result.chars_saved += original_len - new_len - logger.info( - f"Block dedup: msg[{idx}] {fn_name} — " - f"saved {original_len - new_len:,} chars" + msg["content"] = new_content + elif isinstance(raw, list): + # OpenClaw sends [{type: "text", text: "..."}, ...] 
+ for block in raw: + if not isinstance(block, dict) or block.get("type") != "text": + continue + t = block.get("text", "") + if not isinstance(t, str) or len(t) < min_content_chars: + continue + new_text = _dedup_string_field( + t, seen_blocks, idx, "assistant message", result, + min_block_chars, chunk_modulus, ) + if new_text is not None: + block["text"] = new_text - _dedup_assistant_code_blocks( - messages, - seen_blocks, - result, - min_block_chars, - min_content_chars, - chunk_modulus, - pre_seen=pre_seen, - ) - - return result - - -_CODE_BLOCK_RE = re.compile(r"(```[\w]*\n)(.*?)(```)", re.DOTALL) - -def _dedup_assistant_code_blocks( - messages: list, +def _dedup_tool_message( + msg: dict, + idx: int, + tool_names: Dict[str, str], seen_blocks: Dict[str, Tuple[int, str, int]], result: DedupResult, min_block_chars: int, min_content_chars: int, chunk_modulus: int, - pre_seen: Optional[Dict[str, Tuple[int, str, int]]] = None, ) -> None: - for idx, msg in enumerate(messages): - if not isinstance(msg, dict) or msg.get("role") != "assistant": - continue - raw_content = msg.get("content", "") - - # Handle both string and list (content blocks) formats - is_list_content = False - text_block_idx = -1 - if isinstance(raw_content, str): - content = raw_content - elif isinstance(raw_content, list): - # OpenClaw sends [{type: "text", text: "..."}, ...] 
- # Find the text block that contains code - content = "" - for bi, block in enumerate(raw_content): - if isinstance(block, dict) and block.get("type") == "text": - t = block.get("text", "") - if "```" in t and len(t) > len(content): - content = t - text_block_idx = bi - is_list_content = True - if not content: - continue - else: - continue + """Dedup a tool result (JSON-aware) against earlier blocks in the payload.""" + content = msg.get("content", "") + if not isinstance(content, str) or len(content) < min_content_chars: + return - if len(content) < min_content_chars: - continue + tc_id = msg.get("tool_call_id", "") + fn_name = tool_names.get(tc_id, msg.get("name", "")) or "tool" - code_blocks = list(_CODE_BLOCK_RE.finditer(content)) - if not code_blocks: - continue + # Extract text from JSON-wrapped tool results for proper chunking. + extracted_text, json_key = _extract_text_for_dedup(content) + dedup_target = extracted_text if extracted_text else content - modified = False - new_content = content + new_content = _dedup_text( + dedup_target, seen_blocks, idx, fn_name, result, min_block_chars, chunk_modulus + ) + if new_content is None: + return + + original_len = len(content) + if json_key and extracted_text: + msg["content"] = _rebuild_json_content(content, json_key, new_content) + else: + msg["content"] = new_content + new_len = len(msg["content"]) + _account(result, original_len, new_len) + logger.info( + "Block dedup: msg[%d] %s — saved %d chars", idx, fn_name, original_len - new_len + ) - for match in reversed(code_blocks): - code = match.group(2) - if len(code.strip()) < min_block_chars: - continue - new_code = _dedup_text( - code, - seen_blocks, - idx, - "assistant", - result, - min_block_chars, - chunk_modulus, - pre_seen=pre_seen, +def dedup_chat_completions( + body: dict, + min_block_chars: int = MIN_BLOCK_CHARS, + min_content_chars: int = MIN_CONTENT_CHARS, + chunk_modulus: int = CHUNK_MODULUS, + system_content: Optional[str] = None, +) -> 
DedupResult: + """Exact-block dedup across ALL roles within a single chat payload. + + Walks messages in document order with a shared block table. The first + (earliest) occurrence of any block — across system/skill prompt, user, + assistant, and tool messages — keeps its full text; later EXACT occurrences + anywhere in the *same* payload are replaced by a short reference pointing to + the earlier copy ("see above"). Only exact hash matches are ever replaced; + references only ever point backward, to a block in this same payload. + + The system / skill prompt is treated as the canonical source: its blocks are + registered but it is never itself shortened. + """ + messages = body.get("messages") + if not isinstance(messages, list) or not messages: + return DedupResult() + + tool_names = _build_tool_name_map_openai(messages) + seen_blocks: Dict[str, Tuple[int, str, int]] = {} + result = DedupResult() + + # Seed an externally-supplied system / skill prompt (e.g. one not present as + # a message in `messages`) as the canonical first copy. Registered at -1 so + # later matches are attributed as system-block hits. + pre_seen = _prescan_system_blocks(system_content, min_block_chars, chunk_modulus) + for h, origin in pre_seen.items(): + seen_blocks.setdefault(h, origin) + + for idx, msg in enumerate(messages): + if not isinstance(msg, dict): + continue + role = msg.get("role") + + if role == "system": + # Canonical source — register but never shorten. Use -1 so downstream + # matches count as system-block hits, consistent with `pre_seen`. 
+ _register_blocks_only( + msg.get("content", ""), seen_blocks, -1, "system prompt", + result, min_block_chars, chunk_modulus, ) - if new_code is not None: - start, end = match.start(2), match.end(2) - original_len = end - start - new_content = new_content[:start] + new_code + new_content[end:] - result.chars_before += original_len - result.chars_after += len(new_code) - result.chars_saved += original_len - len(new_code) - modified = True - - if modified: - if is_list_content and text_block_idx >= 0: - msg["content"][text_block_idx]["text"] = new_content - else: - msg["content"] = new_content + + elif role == "user": + content = msg.get("content", "") + if isinstance(content, str) and len(content) >= min_content_chars: + new_content = _dedup_string_field( + content, seen_blocks, idx, "user message", result, + min_block_chars, chunk_modulus, + ) + if new_content is not None: + msg["content"] = new_content + + elif role == "assistant": + _dedup_assistant_message( + msg, idx, seen_blocks, result, + min_block_chars, min_content_chars, chunk_modulus, + ) + + elif role == "tool": + _dedup_tool_message( + msg, idx, tool_names, seen_blocks, result, + min_block_chars, min_content_chars, chunk_modulus, + ) + + return result def dedup_responses_api( diff --git a/tests/test_payload_cross_role_dedup.py b/tests/test_payload_cross_role_dedup.py new file mode 100644 index 0000000..a03e4bf --- /dev/null +++ b/tests/test_payload_cross_role_dedup.py @@ -0,0 +1,215 @@ +"""Cross-role exact-block dedup within a single LLM-bound payload. + +This is the safest concrete ContextPilot optimization: inside ONE OpenAI chat +payload, an exact repeated block that already appears in an earlier message +(system / skill prompt, user, assistant, or tool result) is replaced — in the +*later* message only — by a short reference back to the earlier copy. The LLM +has already seen one full copy in the same request, so no information is lost. + +Hard safety contract these tests lock in: + +1. 
Exact repeated blocks ACROSS DIFFERENT ROLES are deduped. The first + (earliest, in document order) occurrence keeps its full text; later + occurrences are shortened to a reference pointing "above". +2. References point to an EARLIER block in the SAME payload (never forward, + never the first occurrence). +3. A one-character-different / near-duplicate block is NEVER collapsed — its + unique text survives verbatim. +4. Genuinely different content is left byte-for-byte intact. +5. No raw block/message content is ever written to telemetry. +""" +import importlib.util +import json +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[1] +MODULE_PATH = REPO_ROOT / "contextpilot" / "dedup" / "block_dedup.py" +_spec = importlib.util.spec_from_file_location("contextpilot_block_dedup_xrole", MODULE_PATH) +block_dedup = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(block_dedup) + +dedup_chat_completions = block_dedup.dedup_chat_completions + +REFERENCE_MARKER = "identical to earlier" + + +def _instruction_block(prefix: str = "always follow safety rule", n: int = 30) -> str: + """A deterministic multi-line instruction/skill block that chunks cleanly.""" + return "\n".join( + f"INSTRUCTION LINE {i}: {prefix} number {i} carefully and verbatim every time" + for i in range(n) + ) + + +def _cross_role_payload(sys_block, user_block, tool_block, asst_block) -> dict: + """A single chat payload where the same instruction block can recur by role.""" + return { + "messages": [ + {"role": "system", "content": "You are a coding agent.\n" + sys_block + "\nEnd system."}, + {"role": "user", "content": "Please remember the rules:\n" + user_block + "\nThanks."}, + {"role": "assistant", "tool_calls": [{"id": "c1", "function": {"name": "Read"}}]}, + {"role": "tool", "tool_call_id": "c1", "content": "File header\n" + tool_block + "\nFooter."}, + {"role": "assistant", "content": "Acknowledging the rules:\n" + asst_block + "\nDone."}, + ] + } 
+ + +def test_repeated_block_across_roles_is_deduped_first_copy_kept(): + block = _instruction_block() + body = _cross_role_payload(block, block, block, block) + original_system = body["messages"][0]["content"] + + result = dedup_chat_completions(body) + + system_after = body["messages"][0]["content"] + user_after = body["messages"][1]["content"] + tool_after = body["messages"][3]["content"] + asst_after = body["messages"][4]["content"] + + # Real savings, attributed to deduped blocks. + assert result.chars_saved > 0 + assert result.blocks_deduped > 0 + + # First (earliest) occurrence — the system prompt — is left fully intact. + assert system_after == original_system + + # Every later role that repeats the block is shortened and references "above". + assert len(user_after) < len(body["messages"][1]["content"]) or REFERENCE_MARKER in user_after + assert REFERENCE_MARKER in user_after, "user-role duplicate must be deduped" + assert REFERENCE_MARKER in tool_after, "tool-role duplicate must be deduped" + assert REFERENCE_MARKER in asst_after, "assistant-role duplicate must be deduped" + + +def test_reference_points_backward_only_never_first_occurrence(): + block = _instruction_block() + body = _cross_role_payload(block, block, block, block) + dedup_chat_completions(body) + + # The system message is first; it must never become a reference to itself or + # to anything later in the payload. + assert REFERENCE_MARKER not in body["messages"][0]["content"] + + +def test_near_duplicate_block_survives_verbatim(): + block = _instruction_block() + # One unique line differs in the user copy — a one-line delta. + lines = block.split("\n") + lines[15] = "INSTRUCTION LINE 15: UNIQUE_DELTA_MARKER_qwerty brand new never-seen directive" + edited = "\n".join(lines) + + body = _cross_role_payload(block, edited, block, block) + dedup_chat_completions(body) + + user_after = body["messages"][1]["content"] + # The changed line MUST survive verbatim — never hidden behind a reference. 
+ assert "UNIQUE_DELTA_MARKER_qwerty" in user_after + + +def test_single_char_difference_is_not_collapsed(): + block = _instruction_block() + idx = len(block) // 2 + mutated = block[:idx] + ("Z" if block[idx] != "Z" else "Q") + block[idx + 1:] + + body = _cross_role_payload(block, mutated, block, block) + dedup_chat_completions(body) + + user_after = body["messages"][1]["content"] + mutated_line = mutated.split("\n")[block[:idx].count("\n")] + assert mutated_line in user_after + + +def test_genuinely_different_content_left_intact(): + block = _instruction_block() + other = "\n".join( + f"UNRELATED ROW {i}: a completely different paragraph about widgets and gears {i}" + for i in range(30) + ) + body = _cross_role_payload(block, other, other, other) + user_before = body["messages"][1]["content"] + tool_before = body["messages"][3]["content"] + + result = dedup_chat_completions(body) + + assert result.chars_saved == 0 + assert result.blocks_deduped == 0 + assert body["messages"][1]["content"] == user_before + assert body["messages"][3]["content"] == tool_before + + +def test_no_raw_block_content_in_plugin_telemetry(monkeypatch, tmp_path): + """End-to-end through the Hermes engine: telemetry stays metadata-only.""" + import sys + import types + + # Minimal fake Hermes surface so __init__.py imports cleanly. 
+ agent_pkg = types.ModuleType("agent") + context_engine_mod = types.ModuleType("agent.context_engine") + context_compressor_mod = types.ModuleType("agent.context_compressor") + + class FakeContextEngine: + threshold_percent = 0.75 + + def get_status(self): + return {} + + class FakeContextCompressor(FakeContextEngine): + def __init__(self, **kwargs): + self.threshold_tokens = 0 + self.context_length = 0 + self.protect_first_n = 3 + self.protect_last_n = 6 + self.compression_count = 0 + + def on_session_start(self, session_id, **kwargs): + return None + + context_engine_mod.ContextEngine = FakeContextEngine + context_compressor_mod.ContextCompressor = FakeContextCompressor + agent_pkg.context_engine = context_engine_mod + agent_pkg.context_compressor = context_compressor_mod + monkeypatch.setitem(sys.modules, "agent", agent_pkg) + monkeypatch.setitem(sys.modules, "agent.context_engine", context_engine_mod) + monkeypatch.setitem(sys.modules, "agent.context_compressor", context_compressor_mod) + + run_agent_mod = types.ModuleType("run_agent") + + class FakeAIAgent: + @staticmethod + def _sanitize_api_messages(messages): + return messages + + run_agent_mod.AIAgent = FakeAIAgent + monkeypatch.setitem(sys.modules, "run_agent", run_agent_mod) + + module_path = REPO_ROOT / "__init__.py" + spec = importlib.util.spec_from_file_location("contextpilot_plugin_xrole", module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + monkeypatch.setattr(module, "_check_reorder", lambda: False) + monkeypatch.setattr(module, "_CONTEXTPILOT_AVAILABLE", False) + + telemetry = tmp_path / "telemetry.jsonl" + monkeypatch.setenv("CONTEXTPILOT_TELEMETRY_FILE", str(telemetry)) + + block = _instruction_block() + secret_line = "INSTRUCTION LINE 0: always follow safety rule number 0 carefully and verbatim every time" + assert secret_line in block + + engine = module.ContextPilotEngine() + engine.on_session_start("session-XR", model="test-model") + body = 
_cross_role_payload(block, block, block, block) + _out, stats = engine.optimize_api_messages(body["messages"]) + + assert stats["chars_saved"] > 0 + + assert telemetry.exists() + raw = telemetry.read_text(encoding="utf-8") + # No raw block/message content may ever leak into telemetry. + assert secret_line not in raw + assert "INSTRUCTION LINE" not in raw + for record in (json.loads(l) for l in raw.splitlines() if l.strip()): + forbidden = {"content", "messages", "prompt", "system_prompt", "text", "tool_calls"} + assert forbidden.isdisjoint(record.keys()) From ba48354c294c7ca8a255e23aff36518e540b07c8 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 12 Jun 2026 02:14:36 +0200 Subject: [PATCH 6/9] feat: add worker routing shadow telemetry --- docs/guides/hermes-monitor.md | 25 +- .../analyze_hermes_context_opportunities.py | 437 ++++++++++++++++++ ...est_hermes_context_opportunity_analyzer.py | 254 ++++++++++ 3 files changed, 715 insertions(+), 1 deletion(-) diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md index 8798885..0f5e91a 100644 --- a/docs/guides/hermes-monitor.md +++ b/docs/guides/hermes-monitor.md @@ -67,7 +67,8 @@ It surfaces concrete token-reduction opportunities: - repeated line/block fingerprints (shared boilerplate across outputs), - large tool outputs grouped by `tool_name`, - heavy sessions by input-token / tool-call / message counts (hashed ids), -- ContextPilot telemetry coverage and savings ratios. +- ContextPilot telemetry coverage and savings ratios, +- **Worker Context Routing shadow labels** for future router training/eval. ### LLM-bound block redundancy @@ -93,6 +94,28 @@ aggregated. The report then shows: prompt *and* a tool result *and* a user prompt). Reported only as a hash plus per-type counters — never the raw text. +### Worker Context Routing shadow mode + +The analyzer now includes a **Worker Context Routing — shadow mode** section by +default. 
This is P0 data collection only: it never drops, summarizes, or mutates +context. It fingerprints each LLM-bound block and emits only low-cardinality +labels/counters such as: + +- `policy_must_keep` for user/system/skill prompts and explicit safety / + acceptance constraints, +- `direct_task_hint` for short actionable task/error hints, +- `likely_relevant` for conservative default-keep blocks, +- `summarizable_candidate` / `likely_drop_candidate` for large or repeated + tool-like blocks that a future router might route away. Large diagnostic logs + containing `error:` / `failed` / `traceback` cues are still only advisory + summarization candidates, not must-drop decisions. + +The report includes estimated advisory candidate tokens and salted candidate +block hashes. These are **not realized savings** and must be treated as training +/ evaluation data for a future high-recall router. Use +`--disable-worker-routing-shadow` only when you want to omit this section from a +scan. + Use `--all-sessions` to ignore the `--since-hours` window and scan **all** non-archived sessions and active messages (useful for a one-shot, whole-history audit rather than a rolling daily window): diff --git a/scripts/analyze_hermes_context_opportunities.py b/scripts/analyze_hermes_context_opportunities.py index 5bcd3b1..780f0f1 100644 --- a/scripts/analyze_hermes_context_opportunities.py +++ b/scripts/analyze_hermes_context_opportunities.py @@ -134,6 +134,131 @@ class CrossTypeBlockGroup: est_wasted_tokens: int # (occurrences - 1) * est_tokens +# --------------------------------------------------------------------------- +# Worker Context Routing — SHADOW MODE (P0 data collection only) +# --------------------------------------------------------------------------- +# Low-cardinality router labels. These are the *training/eval* labels a future +# small worker-context router would predict. 
P0 is data-collection only: nothing +# here ever drops, summarizes, or mutates context — it only classifies blocks +# and emits aggregate counters + salted hashes so the labels can be evaluated +# offline before any online pruning is built. +ROUTER_LABELS = ( + "policy_must_keep", # never droppable (user/system/skill/safety constraints) + "direct_task_hint", # short actionable task signal — keep + "likely_relevant", # default keep; not obviously prunable + "summarizable_candidate", # large single block that *might* be summarized later + "likely_drop_candidate", # large/repeated tool-like block, candidate to route away +) + +# Labels whose blocks a future router might safely route away. Used only to +# tally *advisory* candidate tokens; P0 never acts on them. +_ROUTABLE_LABELS = ("summarizable_candidate", "likely_drop_candidate") + +# Block-type priority when one fingerprint spans multiple origins: the most +# "must-keep" origin wins, so cross-origin blocks are classified conservatively. +_TYPE_KEEP_PRIORITY = { + "user_prompt": 5, + "system_prompt": 4, + "skill_prompt": 4, + "assistant_context": 2, + "tool_result": 1, + "unknown": 0, +} + +# Cues marking content that must NEVER be dropped even from a tool/assistant +# block: explicit safety / acceptance / hard-constraint language. Matching here +# is intentionally generous — over-keeping is the safe direction for P0. +_SAFETY_CONSTRAINT_CUES = ( + "must not", + "must never", + "never drop", + "do not delete", + "do not remove", + "do not modify", + "acceptance criteria", + "acceptance test", + "safety", + "must keep", + "you must", + "required:", + "constraint", + "forbidden", + "policy", +) + +# Cues marking a short, actionable task hint worth keeping verbatim. 
+_TASK_HINT_CUES = ( + "todo", + "next step", + "error:", + "traceback", + "failed", + "fixme", + "task:", + "goal:", + "implement", + "reproduce", +) + + +@dataclass +class RouterLabelCount: + """Aggregate over all blocks assigned one router label.""" + + route_label: str + block_count: int # distinct fingerprints with this label + occurrence_count: int # total occurrences across the window + total_est_tokens: int # est tokens these blocks occupy (occ * est) + est_candidate_tokens: int # ADVISORY routable tokens (0 unless routable) + + +@dataclass +class RouterReasonCount: + """Aggregate keyed by (block_type, route_label, reason_code).""" + + block_type: str + route_label: str + reason_code: str + block_count: int + occurrence_count: int + total_est_tokens: int + est_candidate_tokens: int + + +@dataclass +class RouterCandidateBlock: + """A single routable-candidate fingerprint (salted hash + counters only).""" + + block_hash: str + block_type: str + route_label: str + reason_code: str + occurrences: int + char_length: int + est_tokens: int + est_candidate_tokens: int # ADVISORY upper bound only + + +@dataclass +class WorkerRoutingShadow: + """Shadow-mode worker-context routing report (P0: data collection only).""" + + enabled: bool + item_count: int # LLM-bound items classified + classified_block_count: int # distinct fingerprints classified + total_occurrences: int + must_keep_block_count: int + must_keep_occurrence_count: int + est_must_keep_tokens: int + est_candidate_tokens_total: int # ADVISORY routable ceiling + est_drop_candidate_tokens: int # ADVISORY + est_summarizable_candidate_tokens: int # ADVISORY + label_counts: list[RouterLabelCount] + reason_counts: list[RouterReasonCount] + top_candidate_blocks: list[RouterCandidateBlock] + notes: list[str] = field(default_factory=list) + + @dataclass class ToolSizeStat: tool_name: str @@ -189,6 +314,8 @@ class OpportunityReport: llm_block_types: list[BlockTypeStat] cross_type_block_groups: 
list[CrossTypeBlockGroup] cross_type_wasted_tokens: int + # Worker Context Routing shadow mode (P0 data collection; never prunes). + worker_routing: WorkerRoutingShadow notes: list[str] = field(default_factory=list) @@ -760,6 +887,247 @@ def analyze_llm_bound_blocks( return block_type_stats, cross[:top_n] +def classify_router_label( + block_type: str, + content: str, + *, + occurrences: int, + large_output_chars: int, + min_repeat: int, +) -> tuple[str, str]: + """Heuristically assign a worker-routing label + reason code to a block. + + Pure P0 heuristic: no ML, no network, no mutation. Operates on in-memory + text only and returns two low-cardinality enums (``route_label``, + ``reason_code``) -- never the text. The bias is deliberately conservative: + when in doubt, keep. Anything that is a user prompt, a system/skill prompt, + or carries explicit safety/acceptance-constraint language is pinned to + ``policy_must_keep`` and can never become a routable candidate. + """ + low = content.lower() + + # 1. Never-drop by origin: prompts the user/system/skills authored. + if block_type == "user_prompt": + return "policy_must_keep", "user_prompt_never_drop" + if block_type in ("system_prompt", "skill_prompt"): + return "policy_must_keep", "system_or_skill_constraint_never_drop" + + # 2. Never-drop by content: explicit safety / acceptance / hard constraints, + # even inside an assistant or tool block. + if any(cue in low for cue in _SAFETY_CONSTRAINT_CUES): + return "policy_must_keep", "safety_or_acceptance_constraint" + + char_len = len(content) + has_task_hint = any(cue in low for cue in _TASK_HINT_CUES) + + # 3. Short actionable task hints -> keep verbatim. Very large diagnostic + # logs often contain "error:"/"failed"/"traceback"; keep collecting + # them as summarization candidates instead of pinning the whole log. + if has_task_hint and char_len < large_output_chars: + return "direct_task_hint", "actionable_task_signal" + + # 4. 
Bulky / repeated tool-like material -> routable candidates (advisory). + if block_type in ("tool_result", "assistant_context", "unknown"): + if has_task_hint and char_len >= large_output_chars: + return "summarizable_candidate", "large_actionable_tool_block" + is_large = char_len >= large_output_chars + is_repeated = occurrences >= min_repeat + if is_large and is_repeated: + return "likely_drop_candidate", "large_repeated_tool_block" + if is_repeated: + return "likely_drop_candidate", "repeated_tool_block" + if is_large: + return "summarizable_candidate", "large_single_tool_block" + + # 5. Everything else: keep by default. + return "likely_relevant", "default_keep" + + +def analyze_worker_routing_shadow( + contents: Iterable[_LLMContent], + *, + salt: str, + large_output_chars: int, + min_repeat: int, + top_n: int, + enabled: bool = True, +) -> WorkerRoutingShadow: + """Shadow-mode worker-context routing classifier (P0: data collection only). + + Fingerprints each LLM-bound item, assigns a conservative router label, and + returns aggregate counters + salted hashes for routable candidates. Emits + NO raw text and never mutates/drops context. ``est_candidate_tokens`` is an + advisory upper bound on what a *future* router might route away -- not a + realized saving. + """ + if not enabled: + return WorkerRoutingShadow( + enabled=False, + item_count=0, + classified_block_count=0, + total_occurrences=0, + must_keep_block_count=0, + must_keep_occurrence_count=0, + est_must_keep_tokens=0, + est_candidate_tokens_total=0, + est_drop_candidate_tokens=0, + est_summarizable_candidate_tokens=0, + label_counts=[], + reason_counts=[], + top_candidate_blocks=[], + notes=["worker-routing shadow analysis disabled via flag"], + ) + + # Aggregate occurrences per fingerprint, picking the most must-keep origin + # when one block spans several block types. 
+ agg: dict[str, dict] = {} + item_count = 0 + for item in contents: + content = item.content + if not content: + continue + item_count += 1 + h = _salted_hash(content, salt) + bt = item.block_type + entry = agg.get(h) + if entry is None: + agg[h] = { + "block_type": bt, + "char_length": len(content), + "occurrences": 1, + "content": content, + } + else: + entry["occurrences"] += 1 + cur = entry["block_type"] + bt_pri = _TYPE_KEEP_PRIORITY.get(bt, 0) + cur_pri = _TYPE_KEEP_PRIORITY.get(cur, 0) + if bt_pri > cur_pri or (bt_pri == cur_pri and bt < cur): + entry["block_type"] = bt + + # Classify each unique fingerprint and roll up counters. + label_agg: dict[str, dict] = {} + reason_agg: dict[tuple[str, str, str], dict] = {} + candidates: list[RouterCandidateBlock] = [] + must_keep_blocks = 0 + must_keep_occ = 0 + est_must_keep_tokens = 0 + drop_tokens = 0 + summ_tokens = 0 + + for h, entry in agg.items(): + bt = entry["block_type"] + occ = entry["occurrences"] + char_len = entry["char_length"] + est = _est_tokens(char_len) + total_est = est * occ + label, reason = classify_router_label( + bt, + entry["content"], + occurrences=occ, + large_output_chars=large_output_chars, + min_repeat=min_repeat, + ) + candidate_tokens = total_est if label in _ROUTABLE_LABELS else 0 + + la = label_agg.setdefault( + label, + {"block_count": 0, "occ": 0, "total_est": 0, "candidate": 0}, + ) + la["block_count"] += 1 + la["occ"] += occ + la["total_est"] += total_est + la["candidate"] += candidate_tokens + + ra = reason_agg.setdefault( + (bt, label, reason), + {"block_count": 0, "occ": 0, "total_est": 0, "candidate": 0}, + ) + ra["block_count"] += 1 + ra["occ"] += occ + ra["total_est"] += total_est + ra["candidate"] += candidate_tokens + + if label == "policy_must_keep": + must_keep_blocks += 1 + must_keep_occ += occ + est_must_keep_tokens += total_est + if label == "likely_drop_candidate": + drop_tokens += candidate_tokens + elif label == "summarizable_candidate": + summ_tokens += 
candidate_tokens + + if candidate_tokens > 0: + candidates.append( + RouterCandidateBlock( + block_hash=h, + block_type=bt, + route_label=label, + reason_code=reason, + occurrences=occ, + char_length=char_len, + est_tokens=est, + est_candidate_tokens=candidate_tokens, + ) + ) + + # Deterministic ordering: label_counts follow the canonical label order; + # reason_counts and candidates sort by a stable key. + label_counts = [ + RouterLabelCount( + route_label=lbl, + block_count=label_agg[lbl]["block_count"], + occurrence_count=label_agg[lbl]["occ"], + total_est_tokens=label_agg[lbl]["total_est"], + est_candidate_tokens=label_agg[lbl]["candidate"], + ) + for lbl in ROUTER_LABELS + if lbl in label_agg + ] + reason_counts = [ + RouterReasonCount( + block_type=bt, + route_label=lbl, + reason_code=reason, + block_count=v["block_count"], + occurrence_count=v["occ"], + total_est_tokens=v["total_est"], + est_candidate_tokens=v["candidate"], + ) + for (bt, lbl, reason), v in sorted(reason_agg.items()) + ] + candidates.sort( + key=lambda c: (c.est_candidate_tokens, c.occurrences, c.block_hash), + reverse=True, + ) + + total_occ = sum(e["occurrences"] for e in agg.values()) + notes = [ + "SHADOW MODE P0: classification only -- no context was dropped, summarized, or mutated", + "route_label/reason_code/block_type are low-cardinality enums; block_hash is a salted SHA-256 fingerprint", + "est_candidate_tokens is ADVISORY (an upper bound for a FUTURE router), not a realized saving", + "user/system/skill prompts and safety/acceptance constraints are pinned to policy_must_keep and never routable", + "classification is conservative: when uncertain, blocks are kept (likely_relevant)", + ] + + return WorkerRoutingShadow( + enabled=True, + item_count=item_count, + classified_block_count=len(agg), + total_occurrences=total_occ, + must_keep_block_count=must_keep_blocks, + must_keep_occurrence_count=must_keep_occ, + est_must_keep_tokens=est_must_keep_tokens, + 
est_candidate_tokens_total=drop_tokens + summ_tokens, + est_drop_candidate_tokens=drop_tokens, + est_summarizable_candidate_tokens=summ_tokens, + label_counts=label_counts, + reason_counts=reason_counts, + top_candidate_blocks=candidates[:top_n], + notes=notes, + ) + + # --------------------------------------------------------------------------- # Build + write # --------------------------------------------------------------------------- @@ -779,6 +1147,7 @@ def build_report( min_block_repeat: int = DEFAULT_MIN_BLOCK_REPEAT, large_output_chars: int = DEFAULT_LARGE_OUTPUT_CHARS, top_n: int = DEFAULT_TOP_N, + worker_routing_shadow: bool = True, ) -> OpportunityReport: dups = detect_exact_duplicate_tool_outputs(tool_messages, salt=salt, top_n=top_n) blocks = detect_repeated_blocks( @@ -801,6 +1170,15 @@ def build_report( top_n=top_n, ) + worker_routing = analyze_worker_routing_shadow( + llm_contents, + salt=salt, + large_output_chars=large_output_chars, + min_repeat=min_block_repeat, + top_n=top_n, + enabled=worker_routing_shadow, + ) + total_chars = sum(len(m.content) for m in tool_messages) dup_wasted = sum(d.est_wasted_tokens for d in dups) block_wasted = sum(b.est_wasted_tokens for b in blocks) @@ -812,6 +1190,7 @@ def build_report( "wasted-token figures are heuristic estimates (chars/4); validate before acting", "session 'source', 'tool_name', and block_type are emitted verbatim as low-cardinality enums, not raw text", "llm-bound scan covers only content sent to the LLM: system/skill prompts, active user/assistant/tool messages", + "worker-routing section is SHADOW MODE P0: it labels blocks for a future router but never drops/summarizes context", ] if all_sessions: notes.append("all-sessions mode: time window ignored; scanned all non-archived sessions/active messages") @@ -841,6 +1220,7 @@ def build_report( llm_block_types=block_type_stats, cross_type_block_groups=cross_groups, cross_type_wasted_tokens=cross_wasted, + worker_routing=worker_routing, notes=notes, ) 
@@ -892,6 +1272,9 @@ def write_report(report: OpportunityReport, out_dir: Path) -> tuple[Path, Path]: f"(~{report.cross_type_wasted_tokens} wasted tokens)", f"- Telemetry: {t.events} events, ~{t.tokens_saved} tokens saved, " f"coverage {t.coverage_ratio_pct}%", + f"- Worker routing (shadow): {report.worker_routing.classified_block_count} blocks " + f"classified, {report.worker_routing.must_keep_block_count} must-keep, " + f"~{report.worker_routing.est_candidate_tokens_total} advisory candidate tokens", "", "## LLM-bound redundancy by block type", ] @@ -950,6 +1333,51 @@ def write_report(report: OpportunityReport, out_dir: Path) -> tuple[Path, Path]: ] ) md.append("") + wr = report.worker_routing + md.append("## Worker Context Routing — shadow mode (P0, advisory only)") + if not wr.enabled: + md.append("- disabled") + else: + md.append( + f"- Items classified: {wr.item_count} " + f"(distinct fingerprints: {wr.classified_block_count}, " + f"occurrences: {wr.total_occurrences})" + ) + md.append( + f"- Must-keep: {wr.must_keep_block_count} blocks / " + f"{wr.must_keep_occurrence_count} occurrences " + f"(~{wr.est_must_keep_tokens} tokens, never routable)" + ) + md.append( + f"- Advisory candidate tokens: ~{wr.est_candidate_tokens_total} " + f"(drop ~{wr.est_drop_candidate_tokens}, " + f"summarize ~{wr.est_summarizable_candidate_tokens}) — NOT a realized saving" + ) + md.append("") + md.append("### Router labels") + for lc in wr.label_counts: + md.append( + f"- {lc.route_label}: blocks={lc.block_count} " + f"occ={lc.occurrence_count} tokens={lc.total_est_tokens} " + f"~candidate={lc.est_candidate_tokens}" + ) + md.append("") + md.append("### Reason codes (block_type / label / reason)") + for rc in wr.reason_counts: + md.append( + f"- {rc.block_type} / {rc.route_label} / {rc.reason_code}: " + f"blocks={rc.block_count} occ={rc.occurrence_count} " + f"tokens={rc.total_est_tokens} ~candidate={rc.est_candidate_tokens}" + ) + md.append("") + md.append("### Top 
routable-candidate blocks (hashed)") + for cb in wr.top_candidate_blocks: + md.append( + f"- `{cb.block_hash}` type={cb.block_type} " + f"label={cb.route_label} reason={cb.reason_code} " + f"x{cb.occurrences} chars={cb.char_length} ~candidate={cb.est_candidate_tokens}" + ) + md.append("") md.append("## Notes") for note in report.notes: md.append(f"- {note}") @@ -987,6 +1415,14 @@ def main() -> int: "--large-output-chars", type=int, default=DEFAULT_LARGE_OUTPUT_CHARS ) parser.add_argument("--top-n", type=int, default=DEFAULT_TOP_N) + parser.add_argument( + "--disable-worker-routing-shadow", + action="store_true", + help=( + "skip the shadow-mode Worker Context Routing classification " + "(P0 data collection; enabled by default, never prunes context)" + ), + ) args = parser.parse_args() if not args.state_db.exists(): @@ -1030,6 +1466,7 @@ def main() -> int: min_block_repeat=args.min_block_repeat, large_output_chars=args.large_output_chars, top_n=args.top_n, + worker_routing_shadow=not args.disable_worker_routing_shadow, ) json_path, md_path = write_report(report, args.out_dir) except Exception as exc: # noqa: BLE001 - cron-safe: report class only, no payload diff --git a/tests/test_hermes_context_opportunity_analyzer.py b/tests/test_hermes_context_opportunity_analyzer.py index 649288a..7eb04d2 100644 --- a/tests/test_hermes_context_opportunity_analyzer.py +++ b/tests/test_hermes_context_opportunity_analyzer.py @@ -1,3 +1,4 @@ +import dataclasses import importlib.util import json import sqlite3 @@ -450,3 +451,256 @@ def test_cross_type_redundancy_reported_via_hashes_only(tmp_path): assert "You are a helpful system" not in blob assert "user asks the assistant" not in blob assert "PRIVATE REASONING" not in blob + + +# --------------------------------------------------------------------------- +# Worker Context Routing — SHADOW MODE (P0 data-collection) tests +# --------------------------------------------------------------------------- + +LARGE = 8000 # default 
large-output threshold + + +def _route_map(report): + """label -> RouterLabelCount for convenient assertions.""" + return {lc.route_label: lc for lc in report.worker_routing.label_counts} + + +def _labels_for(contents, **kw): + """Run only the shadow classifier over a list of _LLMContent.""" + return analyzer.analyze_worker_routing_shadow( + contents, salt="test-salt", large_output_chars=LARGE, min_repeat=2, top_n=20, **kw + ) + + +def test_user_and_system_constraints_are_must_keep(tmp_path): + db = tmp_path / "state.db" + sys_prompt = "You are an agent. Follow the rules below for the whole session here." + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE, "system_prompt": sys_prompt}], + messages=[ + {"role": "user", "content": "Please implement the worker routing layer for me now"}, + { + "role": "tool", + "content": "ACCEPTANCE CRITERIA: the targeted pytest suite must pass before merge", + "tool_name": "Bash", + }, + ], + ) + report = _analyze(db, tmp_path) + wr = report.worker_routing + + # User prompt, system prompt, and the acceptance-constraint tool block are + # all pinned to policy_must_keep and contribute zero routable tokens. + rm = _route_map(report) + assert "policy_must_keep" in rm + assert rm["policy_must_keep"].est_candidate_tokens == 0 + assert wr.est_candidate_tokens_total == 0 + + reasons = {(r.block_type, r.route_label, r.reason_code) for r in wr.reason_counts} + assert ("user_prompt", "policy_must_keep", "user_prompt_never_drop") in reasons + assert ( + "system_prompt", + "policy_must_keep", + "system_or_skill_constraint_never_drop", + ) in reasons + # The acceptance constraint inside a TOOL block is still must-keep. 
+ assert any( + r.route_label == "policy_must_keep" + and r.reason_code == "safety_or_acceptance_constraint" + and r.block_type == "tool_result" + for r in wr.reason_counts + ) + + +def test_large_repeated_tool_blocks_become_drop_candidates(tmp_path): + db = tmp_path / "state.db" + big_unrelated = "row of unrelated build log output number 7 with filler text " * 200 + assert len(big_unrelated) >= LARGE + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE}], + messages=[ + {"role": "tool", "content": big_unrelated, "tool_name": "Bash"}, + {"role": "tool", "content": big_unrelated, "tool_name": "Bash"}, + {"role": "tool", "content": big_unrelated, "tool_name": "Bash"}, + ], + ) + report = _analyze(db, tmp_path) + wr = report.worker_routing + rm = _route_map(report) + + # The repeated, large, unrelated tool output is a drop candidate. + assert "likely_drop_candidate" in rm + assert wr.est_drop_candidate_tokens > 0 + assert wr.est_candidate_tokens_total >= wr.est_drop_candidate_tokens + top = wr.top_candidate_blocks[0] + assert top.route_label == "likely_drop_candidate" + assert top.reason_code == "large_repeated_tool_block" + assert top.occurrences == 3 + + +def test_large_single_tool_block_is_summarizable(tmp_path): + db = tmp_path / "state.db" + big_once = "one-off diagnostic dump segment with assorted detail text " * 200 + assert len(big_once) >= LARGE + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE}], + messages=[{"role": "tool", "content": big_once, "tool_name": "Read"}], + ) + report = _analyze(db, tmp_path) + wr = report.worker_routing + assert wr.est_summarizable_candidate_tokens > 0 + assert any( + c.route_label == "summarizable_candidate" + and c.reason_code == "large_single_tool_block" + for c in wr.top_candidate_blocks + ) + + +def test_large_actionable_diagnostic_log_is_summarizable_not_pinned(tmp_path): + db = tmp_path / "state.db" + big_error_log = "ERROR: integration test failed with stack frame details " * 220 + 
assert len(big_error_log) >= LARGE + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE}], + messages=[{"role": "tool", "content": big_error_log, "tool_name": "Bash"}], + ) + report = _analyze(db, tmp_path) + wr = report.worker_routing + + assert wr.est_summarizable_candidate_tokens > 0 + assert any( + c.route_label == "summarizable_candidate" + and c.reason_code == "large_actionable_tool_block" + for c in wr.top_candidate_blocks + ) + assert not any( + r.route_label == "direct_task_hint" + and r.reason_code == "actionable_task_signal" + and r.block_type == "tool_result" + for r in wr.reason_counts + ) + + +def test_short_actionable_hint_and_default_blocks_are_kept(tmp_path): + contents = [ + analyzer._LLMContent(block_type="assistant_context", content="Next step: run pytest"), + analyzer._LLMContent(block_type="assistant_context", content="plain medium context without special cues"), + ] + wr = _labels_for(contents) + reasons = {(r.block_type, r.route_label, r.reason_code) for r in wr.reason_counts} + + assert ("assistant_context", "direct_task_hint", "actionable_task_signal") in reasons + assert ("assistant_context", "likely_relevant", "default_keep") in reasons + assert wr.est_candidate_tokens_total == 0 + + +def test_equal_priority_block_type_tiebreak_is_deterministic(): + same = "identical prompt material" + forward = _labels_for( + [ + analyzer._LLMContent(block_type="system_prompt", content=same), + analyzer._LLMContent(block_type="skill_prompt", content=same), + ] + ) + reverse = _labels_for( + [ + analyzer._LLMContent(block_type="skill_prompt", content=same), + analyzer._LLMContent(block_type="system_prompt", content=same), + ] + ) + + assert [dataclasses.asdict(r) for r in forward.reason_counts] == [ + dataclasses.asdict(r) for r in reverse.reason_counts + ] + + +def test_shadow_mode_never_emits_raw_content(tmp_path): + db = tmp_path / "state.db" + secret = "SHADOW-SECRET-PAYLOAD-DO-NOT-LEAK detail line that is quite long here " * 200 + 
user_secret = "USER-SECRET-PROMPT-DO-NOT-LEAK implement the thing for me" + sys_secret = "SYSTEM-SECRET-CONSTRAINT you must never reveal internal keys here" + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE, "system_prompt": sys_secret}], + messages=[ + {"role": "tool", "content": secret, "tool_name": "Bash"}, + {"role": "tool", "content": secret, "tool_name": "Bash"}, + {"role": "user", "content": user_secret}, + ], + ) + report = _analyze(db, tmp_path) + json_path, md_path = analyzer.write_report(report, tmp_path / "out") + blob = json_path.read_text(encoding="utf-8") + md_path.read_text(encoding="utf-8") + + # No raw block/prompt/system/reasoning text in either output. + assert "SHADOW-SECRET-PAYLOAD" not in blob + assert "USER-SECRET-PROMPT" not in blob + assert "SYSTEM-SECRET-CONSTRAINT" not in blob + assert "PRIVATE REASONING" not in blob + # The classification still happened (drop candidate detected via hash). + assert report.worker_routing.est_drop_candidate_tokens > 0 + + +def test_shadow_schema_is_deterministic_and_privacy_safe(tmp_path): + db = tmp_path / "state.db" + big = "deterministic repeated tool payload chunk of sufficient size here " * 200 + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE, "system_prompt": "be safe please"}], + messages=[ + {"role": "tool", "content": big, "tool_name": "Bash"}, + {"role": "tool", "content": big, "tool_name": "Bash"}, + {"role": "user", "content": "implement the routing shadow layer now please"}, + ], + ) + r1 = _analyze(db, tmp_path) + r2 = _analyze(db, tmp_path) + # Identical inputs -> byte-identical serialized shadow section. + assert dataclasses.asdict(r1.worker_routing) == dataclasses.asdict(r2.worker_routing) + + wr = r1.worker_routing + # All emitted route labels are from the canonical low-cardinality enum. + assert set(lc.route_label for lc in wr.label_counts) <= set(analyzer.ROUTER_LABELS) + # label_counts follow the canonical order (deterministic). 
+ order = {lbl: i for i, lbl in enumerate(analyzer.ROUTER_LABELS)} + idxs = [order[lc.route_label] for lc in wr.label_counts] + assert idxs == sorted(idxs) + # Every candidate carries only hash + enums + counters (no free text fields). + for cb in wr.top_candidate_blocks: + assert len(cb.block_hash) == 16 # salted SHA-256 prefix + assert cb.route_label in analyzer._ROUTABLE_LABELS + + +def test_shadow_mode_can_be_disabled(tmp_path): + db = tmp_path / "state.db" + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE}], + messages=[{"role": "tool", "content": "x" * 9000, "tool_name": "Bash"}], + ) + tool_messages = analyzer.load_tool_messages(db, since_hours=WIDE_WINDOW) + llm = analyzer.load_llm_bound_content(db, since_hours=WIDE_WINDOW) + heavy = analyzer.load_heavy_sessions(db, since_hours=WIDE_WINDOW, salt="s", top_n=5) + tel = analyzer.parse_telemetry( + tmp_path / "none.jsonl", since_hours=WIDE_WINDOW, total_input_tokens=0 + ) + report = analyzer.build_report( + date="2100-01-01", + since_hours=24, + salt="s", + tool_messages=tool_messages, + heavy_sessions=heavy, + telemetry=tel, + llm_contents=llm, + min_block_repeat=2, + worker_routing_shadow=False, + ) + assert report.worker_routing.enabled is False + assert report.worker_routing.classified_block_count == 0 + # Disabled section still serializes safely. 
+ json_path, md_path = analyzer.write_report(report, tmp_path / "out") + assert "disabled" in md_path.read_text(encoding="utf-8") From 0213650f523a0a94551cbd0ef5c184676fc67a20 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 12 Jun 2026 10:48:04 +0200 Subject: [PATCH 7/9] feat: add parent aggregation artifact telemetry --- docs/guides/hermes-monitor.md | 39 +- .../analyze_hermes_context_opportunities.py | 425 ++++++++++++++++++ ...est_hermes_context_opportunity_analyzer.py | 235 ++++++++++ 3 files changed, 698 insertions(+), 1 deletion(-) diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md index 0f5e91a..4aa9128 100644 --- a/docs/guides/hermes-monitor.md +++ b/docs/guides/hermes-monitor.md @@ -68,7 +68,9 @@ It surfaces concrete token-reduction opportunities: - large tool outputs grouped by `tool_name`, - heavy sessions by input-token / tool-call / message counts (hashed ids), - ContextPilot telemetry coverage and savings ratios, -- **Worker Context Routing shadow labels** for future router training/eval. +- **Worker Context Routing shadow labels** for future router training/eval, +- **Parent Aggregation Artifact telemetry** (exact duplicate worker/parent + artifacts grouped by hash) for future parent-aggregation dedup eval. ### LLM-bound block redundancy @@ -116,6 +118,41 @@ block hashes. These are **not realized savings** and must be treated as training `--disable-worker-routing-shadow` only when you want to omit this section from a scan. +### Parent Aggregation Artifacts — shadow mode + +The analyzer also includes a **Parent Aggregation Artifacts — shadow mode** +section by default. This is **P0 telemetry only**: it collects data so a future +parent-aggregation dedup can be evaluated offline. It never drops, summarizes, +replaces, or mutates any context. + +When a parent/orchestrator aggregates results from several workers, the same +artifact body (a test log, a diff, a file dump, a review summary, ...) 
is often +carried into the parent's LLM context once per worker and again in the parent's +own roll-up — paying for the same tokens several times. The analyzer groups +**EXACT** artifact bodies by salted content hash (near-duplicates never group), +classifies each body with a deterministic heuristic kind, and emits only +low-cardinality metadata + counters: + +- `artifact_kind` — one of `test_log`, `terminal_output`, `file_content`, + `diff`, `error_trace`, `review_findings`, `benchmark_result`, + `worker_summary`, `unknown_large_block` (deterministic, first-match-wins), +- per-kind summary — distinct bodies, occurrences, duplicate-group count, + estimated tokens, and advisory duplicate tokens, +- **provenance** — per duplicate group, `source_type_counts` such as + `tool_result xN` and `assistant_context xM`, plus a deterministically chosen + `canonical_source_type` (the dominant origin, tie-broken alphabetically), +- top duplicate artifact groups, reported **only** as a salted `content_hash` + plus counters. + +`est_duplicate_tokens` is computed as `(occurrences - 1) * est_tokens` and is an +**advisory upper bound** on what a future parent dedup might save — **not a +realized saving**, and payloads are never changed. No raw artifact / worker / +tool / system text, and no raw session ids, are ever emitted. Only sizeable +blocks (`>= --min-artifact-chars`, default 400) from parent/worker output origins +(`assistant_context` and `tool_result`) are considered candidates, so prompt +boilerplate and short hints never enter this telemetry. Use +`--disable-parent-aggregation` to omit this section from a scan. 
+ Use `--all-sessions` to ignore the `--since-hours` window and scan **all** non-archived sessions and active messages (useful for a one-shot, whole-history audit rather than a rolling daily window): diff --git a/scripts/analyze_hermes_context_opportunities.py b/scripts/analyze_hermes_context_opportunities.py index 780f0f1..b6cca65 100644 --- a/scripts/analyze_hermes_context_opportunities.py +++ b/scripts/analyze_hermes_context_opportunities.py @@ -316,6 +316,8 @@ class OpportunityReport: cross_type_wasted_tokens: int # Worker Context Routing shadow mode (P0 data collection; never prunes). worker_routing: WorkerRoutingShadow + # Parent Aggregation Artifacts shadow mode (P0 telemetry; never dedups). + parent_aggregation: ParentAggregationArtifacts notes: list[str] = field(default_factory=list) @@ -1128,6 +1130,361 @@ def analyze_worker_routing_shadow( ) +# --------------------------------------------------------------------------- +# Parent Aggregation Artifacts — SHADOW MODE (P0 telemetry only) +# --------------------------------------------------------------------------- +# When a parent/orchestrator aggregates results from several workers, the same +# artifact body (a test log, a diff, a file dump, a review summary, ...) is often +# carried into the parent's LLM context once per worker and again in the parent's +# own roll-up -- paying for the same tokens several times. This section collects +# *telemetry only* so a future parent-aggregation dedup can be evaluated offline: +# it groups EXACT artifact bodies by salted content hash, classifies each body +# with a deterministic heuristic kind, and emits low-cardinality metadata + +# counters. It NEVER drops, summarizes, replaces, or mutates any context, and it +# NEVER emits raw artifact text, worker text, tool output, session ids, or +# system prompts. + +# Heuristic P0 artifact kinds. Low-cardinality enums describing the *shape* of an +# aggregation artifact, never its text. Classification is deterministic. 
+ARTIFACT_KINDS = ( + "test_log", + "terminal_output", + "file_content", + "diff", + "error_trace", + "review_findings", + "benchmark_result", + "worker_summary", + "unknown_large_block", +) + +# Conservative floor: only sizeable blocks are treated as candidate aggregation +# artifacts, so short prompts/hints never enter parent-aggregation telemetry. +DEFAULT_MIN_ARTIFACT_CHARS = 400 + +# Parent aggregation P0 focuses on content produced by workers/tools and then +# carried into the parent context. System/skill/user prompts are analyzed by the +# LLM-bound redundancy and worker-routing sections, but excluding them here keeps +# parent artifact telemetry from being polluted by prompt boilerplate. +PARENT_AGGREGATION_SOURCE_TYPES = ("assistant_context", "tool_result") + + +def classify_artifact_kind(content: str) -> str: + """Deterministically classify a candidate aggregation artifact body. + + Pure P0 heuristic over in-memory text; returns a low-cardinality enum from + ``ARTIFACT_KINDS`` and never the text. The check order is fixed so the same + body always yields the same kind (first match wins). + """ + low = content.lower() + stripped = content.lstrip() + + # 1. Unified diff / patch. + if ( + stripped.startswith("diff --git") + or stripped.startswith("--- a/") + or stripped.startswith("@@ ") + or "\n@@ " in content + or ("\n--- " in content and "\n+++ " in content) + ): + return "diff" + + # 2. Test/pytest log (checked before error_trace: a failing test log may + # embed a traceback but is still fundamentally a test log). + if ( + "pytest" in low + or "test session starts" in low + or " passed in " in low + or " failed in " in low + or ("passed" in low and "failed" in low) + or "=== " in content + ): + return "test_log" + + # 3. Error / exception trace. + if ( + "traceback (most recent call last)" in low + or "\n at " in content + or "stack trace" in low + or ("exception" in low and "error" in low) + ): + return "error_trace" + + # 4. Benchmark / perf result. 
+ if ( + "benchmark" in low + or "ops/sec" in low + or "ops/s" in low + or "req/sec" in low + or "throughput" in low + or "latency" in low + or "iterations/sec" in low + ): + return "benchmark_result" + + # 5. Code-review findings. + if ( + "code review" in low + or "review findings" in low + or "severity:" in low + or "vulnerab" in low + or "## findings" in low + ): + return "review_findings" + + # 6. File content / source dump (cat -n style numbering or code cues). + if ( + "\n 1\t" in content + or "\n 1\t" in content + or "def " in content + or "class " in content + or "\nimport " in content + or "#include" in content + or "function " in content + ): + return "file_content" + + # 7. Worker / aggregation summary. Checked after source-code cues so files + # mentioning workers are still labeled as file_content. + if ( + "## summary" in low + or "in summary" in low + or "summary:" in low + or "tl;dr" in low + or "aggregat" in low + or "worker" in low + ): + return "worker_summary" + + # 8. Terminal / shell session output. + if ( + "\n$ " in content + or stripped.startswith("$ ") + or "\n# " in content + or "user@" in low + or "bash-" in low + or "exit code" in low + ): + return "terminal_output" + + # 9. Fallback: a large block we could not confidently classify. + return "unknown_large_block" + + +@dataclass +class ArtifactSourceCount: + """Provenance counter: occurrences of one artifact body from one source.""" + + source_type: str + count: int + + +@dataclass +class ParentAggregationGroup: + """One EXACT artifact body observed 2+ times across parent/worker contexts. + + Salted hash + counters only -- never the body text. + """ + + content_hash: str + artifact_kind: str + canonical_source_type: str # dominant origin, chosen deterministically + occurrences: int + char_length: int + est_tokens: int + est_duplicate_tokens: int # ADVISORY: (occurrences - 1) * est_tokens + source_type_counts: list[ArtifactSourceCount] # provenance: tool_result xN, ... 
+ + +@dataclass +class ArtifactKindStat: + """Aggregate over all candidate artifact bodies of one kind.""" + + artifact_kind: str + group_count: int # distinct bodies of this kind + occurrence_count: int # total occurrences of those bodies + duplicate_group_count: int # bodies seen >= 2 times + est_tokens: int # sum of est tokens for distinct bodies + est_duplicate_tokens: int # ADVISORY duplicate tokens for this kind + + +@dataclass +class ParentAggregationArtifacts: + """Shadow-mode parent-aggregation artifact report (P0: telemetry only).""" + + enabled: bool + item_count: int # candidate artifact items considered + artifact_body_count: int # distinct bodies (groups) + total_occurrences: int + duplicate_group_count: int + est_total_tokens: int # est tokens for distinct bodies + est_duplicate_tokens: int # ADVISORY duplicate-artifact tokens + by_kind: list[ArtifactKindStat] + source_type_counts: list[ArtifactSourceCount] # provenance across candidates + top_duplicate_groups: list[ParentAggregationGroup] + notes: list[str] = field(default_factory=list) + + +def analyze_parent_aggregation_artifacts( + contents: Iterable[_LLMContent], + *, + salt: str, + min_artifact_chars: int, + top_n: int, + enabled: bool = True, +) -> ParentAggregationArtifacts: + """Group EXACT aggregation-artifact bodies and emit provenance telemetry. + + P0 telemetry/advisory only: no context is dropped, summarized, replaced, or + mutated. Each sizeable LLM-bound block is fingerprinted by EXACT salted + content hash (near-duplicates never group), classified with a deterministic + heuristic kind, and rolled up into low-cardinality metadata + counters. + ``est_duplicate_tokens`` is an advisory upper bound on what a *future* parent + dedup might save -- never a realized saving. No raw artifact/worker/tool/ + system text, and no raw session ids, are ever emitted. 
+ """ + if not enabled: + return ParentAggregationArtifacts( + enabled=False, + item_count=0, + artifact_body_count=0, + total_occurrences=0, + duplicate_group_count=0, + est_total_tokens=0, + est_duplicate_tokens=0, + by_kind=[], + source_type_counts=[], + top_duplicate_groups=[], + notes=["parent-aggregation artifact analysis disabled via flag"], + ) + + # --- group sizeable bodies by EXACT salted content hash ---------------- + groups: dict[str, dict] = {} + item_count = 0 + source_totals: dict[str, int] = {} + for item in contents: + content = item.content + bt = item.block_type + if bt not in PARENT_AGGREGATION_SOURCE_TYPES: + continue + if not content or len(content) < min_artifact_chars: + continue + item_count += 1 + source_totals[bt] = source_totals.get(bt, 0) + 1 + h = _salted_hash(content, salt) + g = groups.get(h) + if g is None: + groups[h] = { + "char_length": len(content), + "occurrences": 1, + "sources": {bt: 1}, + # classify once from in-memory text; never stored/emitted. 
+ "kind": classify_artifact_kind(content), + } + else: + g["occurrences"] += 1 + g["sources"][bt] = g["sources"].get(bt, 0) + 1 + + # --- per-kind rollup + per-group records ------------------------------- + kind_agg: dict[str, dict] = {} + group_records: list[ParentAggregationGroup] = [] + total_occurrences = 0 + est_total_tokens = 0 + est_duplicate_tokens = 0 + duplicate_group_count = 0 + + for h, g in groups.items(): + occ = g["occurrences"] + char_len = g["char_length"] + est = _est_tokens(char_len) + dup_tokens = est * (occ - 1) + kind = g["kind"] + is_dup = occ >= 2 + + total_occurrences += occ + est_total_tokens += est + est_duplicate_tokens += dup_tokens + if is_dup: + duplicate_group_count += 1 + + ka = kind_agg.setdefault( + kind, + {"groups": 0, "occ": 0, "dups": 0, "est": 0, "dup_tokens": 0}, + ) + ka["groups"] += 1 + ka["occ"] += occ + ka["est"] += est + ka["dup_tokens"] += dup_tokens + if is_dup: + ka["dups"] += 1 + + if is_dup: + # Provenance counts, sorted by source_type for determinism. + source_counts = [ + ArtifactSourceCount(source_type=st, count=c) + for st, c in sorted(g["sources"].items()) + ] + # Canonical source: dominant origin, tie-broken alphabetically. 
+ canonical = min( + g["sources"].items(), key=lambda kv: (-kv[1], kv[0]) + )[0] + group_records.append( + ParentAggregationGroup( + content_hash=h, + artifact_kind=kind, + canonical_source_type=canonical, + occurrences=occ, + char_length=char_len, + est_tokens=est, + est_duplicate_tokens=dup_tokens, + source_type_counts=source_counts, + ) + ) + + by_kind = [ + ArtifactKindStat( + artifact_kind=kind, + group_count=kind_agg[kind]["groups"], + occurrence_count=kind_agg[kind]["occ"], + duplicate_group_count=kind_agg[kind]["dups"], + est_tokens=kind_agg[kind]["est"], + est_duplicate_tokens=kind_agg[kind]["dup_tokens"], + ) + for kind in ARTIFACT_KINDS + if kind in kind_agg + ] + source_type_counts = [ + ArtifactSourceCount(source_type=st, count=c) + for st, c in sorted(source_totals.items()) + ] + group_records.sort( + key=lambda g: (g.est_duplicate_tokens, g.occurrences, g.content_hash), + reverse=True, + ) + + notes = [ + "SHADOW MODE P0: telemetry only -- no aggregation artifact was deduped, replaced, summarized, or mutated", + "artifact_kind/source_type/canonical_source_type are low-cardinality enums; content_hash is a salted SHA-256 fingerprint", + "grouping is EXACT (same salted content hash): near-duplicate artifacts never group", + "est_duplicate_tokens is ADVISORY ((occurrences-1) * est_tokens), an upper bound for a FUTURE parent dedup -- not a realized saving", + "provenance source_type_counts show how many copies came from each parent/worker output origin (assistant_context, tool_result)", + ] + + return ParentAggregationArtifacts( + enabled=True, + item_count=item_count, + artifact_body_count=len(groups), + total_occurrences=total_occurrences, + duplicate_group_count=duplicate_group_count, + est_total_tokens=est_total_tokens, + est_duplicate_tokens=est_duplicate_tokens, + by_kind=by_kind, + source_type_counts=source_type_counts, + top_duplicate_groups=group_records[:top_n], + notes=notes, + ) + + # 
--------------------------------------------------------------------------- # Build + write # --------------------------------------------------------------------------- @@ -1148,6 +1505,8 @@ def build_report( large_output_chars: int = DEFAULT_LARGE_OUTPUT_CHARS, top_n: int = DEFAULT_TOP_N, worker_routing_shadow: bool = True, + parent_aggregation_shadow: bool = True, + min_artifact_chars: int = DEFAULT_MIN_ARTIFACT_CHARS, ) -> OpportunityReport: dups = detect_exact_duplicate_tool_outputs(tool_messages, salt=salt, top_n=top_n) blocks = detect_repeated_blocks( @@ -1179,6 +1538,14 @@ def build_report( enabled=worker_routing_shadow, ) + parent_aggregation = analyze_parent_aggregation_artifacts( + llm_contents, + salt=salt, + min_artifact_chars=min_artifact_chars, + top_n=top_n, + enabled=parent_aggregation_shadow, + ) + total_chars = sum(len(m.content) for m in tool_messages) dup_wasted = sum(d.est_wasted_tokens for d in dups) block_wasted = sum(b.est_wasted_tokens for b in blocks) @@ -1191,6 +1558,7 @@ def build_report( "session 'source', 'tool_name', and block_type are emitted verbatim as low-cardinality enums, not raw text", "llm-bound scan covers only content sent to the LLM: system/skill prompts, active user/assistant/tool messages", "worker-routing section is SHADOW MODE P0: it labels blocks for a future router but never drops/summarizes context", + "parent-aggregation section is SHADOW MODE P0 telemetry: it groups exact artifact bodies but never dedups/replaces context", ] if all_sessions: notes.append("all-sessions mode: time window ignored; scanned all non-archived sessions/active messages") @@ -1221,6 +1589,7 @@ def build_report( cross_type_block_groups=cross_groups, cross_type_wasted_tokens=cross_wasted, worker_routing=worker_routing, + parent_aggregation=parent_aggregation, notes=notes, ) @@ -1275,6 +1644,9 @@ def write_report(report: OpportunityReport, out_dir: Path) -> tuple[Path, Path]: f"- Worker routing (shadow): 
{report.worker_routing.classified_block_count} blocks " f"classified, {report.worker_routing.must_keep_block_count} must-keep, " f"~{report.worker_routing.est_candidate_tokens_total} advisory candidate tokens", + f"- Parent aggregation (shadow): {report.parent_aggregation.duplicate_group_count} " + f"duplicate artifact groups, " + f"~{report.parent_aggregation.est_duplicate_tokens} advisory duplicate tokens", "", "## LLM-bound redundancy by block type", ] @@ -1378,6 +1750,46 @@ def write_report(report: OpportunityReport, out_dir: Path) -> tuple[Path, Path]: f"x{cb.occurrences} chars={cb.char_length} ~candidate={cb.est_candidate_tokens}" ) md.append("") + pa = report.parent_aggregation + md.append("## Parent Aggregation Artifacts — shadow mode") + if not pa.enabled: + md.append("- disabled") + else: + md.append( + f"- Candidate artifact items: {pa.item_count} " + f"(distinct bodies: {pa.artifact_body_count}, " + f"occurrences: {pa.total_occurrences})" + ) + md.append( + f"- Duplicate artifact groups: {pa.duplicate_group_count} " + f"(~{pa.est_duplicate_tokens} advisory duplicate tokens of " + f"~{pa.est_total_tokens} distinct-body tokens) — NOT a realized saving, " + f"payloads are unchanged" + ) + md.append("") + md.append("### By artifact kind") + for ks in pa.by_kind: + md.append( + f"- {ks.artifact_kind}: bodies={ks.group_count} " + f"occ={ks.occurrence_count} dup_groups={ks.duplicate_group_count} " + f"tokens={ks.est_tokens} ~dup={ks.est_duplicate_tokens}" + ) + md.append("") + md.append("### Provenance (artifact source types)") + for sc in pa.source_type_counts: + md.append(f"- {sc.source_type}: {sc.count}") + md.append("") + md.append("### Top duplicate artifact groups (hashed)") + for g in pa.top_duplicate_groups: + spread = ", ".join( + f"{sc.source_type}x{sc.count}" for sc in g.source_type_counts + ) + md.append( + f"- `{g.content_hash}` kind={g.artifact_kind} " + f"canonical={g.canonical_source_type} x{g.occurrences} " + f"({spread}) chars={g.char_length} 
~dup={g.est_duplicate_tokens} tokens" + ) + md.append("") md.append("## Notes") for note in report.notes: md.append(f"- {note}") @@ -1423,6 +1835,17 @@ def main() -> int: "(P0 data collection; enabled by default, never prunes context)" ), ) + parser.add_argument( + "--disable-parent-aggregation", + action="store_true", + help=( + "skip the shadow-mode Parent Aggregation Artifact telemetry " + "(P0 telemetry only; enabled by default, never dedups/replaces context)" + ), + ) + parser.add_argument( + "--min-artifact-chars", type=int, default=DEFAULT_MIN_ARTIFACT_CHARS + ) args = parser.parse_args() if not args.state_db.exists(): @@ -1467,6 +1890,8 @@ def main() -> int: large_output_chars=args.large_output_chars, top_n=args.top_n, worker_routing_shadow=not args.disable_worker_routing_shadow, + parent_aggregation_shadow=not args.disable_parent_aggregation, + min_artifact_chars=args.min_artifact_chars, ) json_path, md_path = write_report(report, args.out_dir) except Exception as exc: # noqa: BLE001 - cron-safe: report class only, no payload diff --git a/tests/test_hermes_context_opportunity_analyzer.py b/tests/test_hermes_context_opportunity_analyzer.py index 7eb04d2..fc997fc 100644 --- a/tests/test_hermes_context_opportunity_analyzer.py +++ b/tests/test_hermes_context_opportunity_analyzer.py @@ -704,3 +704,238 @@ def test_shadow_mode_can_be_disabled(tmp_path): # Disabled section still serializes safely. json_path, md_path = analyzer.write_report(report, tmp_path / "out") assert "disabled" in md_path.read_text(encoding="utf-8") + + +# --------------------------------------------------------------------------- +# Parent Aggregation Artifacts — SHADOW MODE (P0 telemetry) tests +# --------------------------------------------------------------------------- + +# An artifact body must be >= DEFAULT_MIN_ARTIFACT_CHARS to be a candidate. 
+TEST_LOG_ARTIFACT = "pytest session: tests/test_widget.py::test_alpha PASSED\n" * 20 + + +def _pa_kinds(report): + """artifact_kind -> ArtifactKindStat for convenient assertions.""" + return {ks.artifact_kind: ks for ks in report.parent_aggregation.by_kind} + + +def test_exact_duplicate_artifacts_group_and_estimate_duplicate_tokens(tmp_path): + db = tmp_path / "state.db" + assert len(TEST_LOG_ARTIFACT) >= analyzer.DEFAULT_MIN_ARTIFACT_CHARS + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE}], + messages=[ + {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"}, + {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"}, + {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"}, + ], + ) + report = _analyze(db, tmp_path) + pa = report.parent_aggregation + + assert pa.enabled is True + assert pa.duplicate_group_count == 1 + grp = pa.top_duplicate_groups[0] + assert grp.occurrences == 3 + assert grp.artifact_kind == "test_log" + # Two of the three copies are advisory duplicate tokens. + assert grp.est_duplicate_tokens == grp.est_tokens * 2 + assert pa.est_duplicate_tokens == grp.est_duplicate_tokens + + +def test_near_duplicate_artifacts_do_not_group(tmp_path): + db = tmp_path / "state.db" + base = "pytest run output line that is sufficiently long to be an artifact body\n" * 8 + near = base + "X" # one char different -> different exact hash + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE}], + messages=[ + {"role": "tool", "content": base, "tool_name": "Bash"}, + {"role": "tool", "content": near, "tool_name": "Bash"}, + ], + ) + report = _analyze(db, tmp_path) + pa = report.parent_aggregation + # Two distinct bodies, neither repeated -> zero duplicate groups. 
+ assert pa.artifact_body_count == 2 + assert pa.duplicate_group_count == 0 + assert pa.est_duplicate_tokens == 0 + assert pa.top_duplicate_groups == [] + + +def test_provenance_source_type_counts_are_emitted(tmp_path): + db = tmp_path / "state.db" + # Same exact artifact body shipped from a tool result AND assistant context. + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE}], + messages=[ + {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"}, + {"role": "assistant", "content": TEST_LOG_ARTIFACT}, + ], + ) + report = _analyze(db, tmp_path) + pa = report.parent_aggregation + grp = pa.top_duplicate_groups[0] + + spread = {sc.source_type: sc.count for sc in grp.source_type_counts} + assert spread == {"assistant_context": 1, "tool_result": 1} + # Canonical source chosen deterministically (tie -> alphabetical). + assert grp.canonical_source_type == "assistant_context" + # Aggregate provenance across all candidates is also emitted. + agg = {sc.source_type: sc.count for sc in pa.source_type_counts} + assert agg == {"assistant_context": 1, "tool_result": 1} + + +def test_parent_aggregation_excludes_prompt_boilerplate_sources(tmp_path): + db = tmp_path / "state.db" + prompt_like_artifact = "pytest prompt boilerplate that should not be a parent artifact\n" * 20 + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE, "system_prompt": prompt_like_artifact}], + messages=[ + {"role": "user", "content": prompt_like_artifact}, + {"role": "system", "content": prompt_like_artifact}, + ], + ) + report = _analyze(db, tmp_path) + pa = report.parent_aggregation + + assert pa.item_count == 0 + assert pa.artifact_body_count == 0 + assert pa.source_type_counts == [] + assert pa.top_duplicate_groups == [] + + +def test_parent_aggregation_never_emits_raw_content(tmp_path): + db = tmp_path / "state.db" + secret = "PARENT-AGG-SECRET-ARTIFACT-DO-NOT-LEAK pytest detail line here\n" * 20 + _make_db_ex( + db, + sessions=[{"id": "s1", 
"started_at": FAR_FUTURE, "system_prompt": "be safe"}], + messages=[ + {"role": "tool", "content": secret, "tool_name": "Bash"}, + {"role": "tool", "content": secret, "tool_name": "Bash"}, + ], + ) + report = _analyze(db, tmp_path) + json_path, md_path = analyzer.write_report(report, tmp_path / "out") + blob = json_path.read_text(encoding="utf-8") + md_path.read_text(encoding="utf-8") + + assert "PARENT-AGG-SECRET-ARTIFACT" not in blob + assert "PRIVATE REASONING" not in blob + assert "raw-session-id" not in blob + # The duplicate was still detected via salted hashing. + assert report.parent_aggregation.duplicate_group_count == 1 + assert report.parent_aggregation.est_duplicate_tokens > 0 + + +def test_parent_aggregation_schema_is_deterministic(tmp_path): + db = tmp_path / "state.db" + diff_artifact = ( + "diff --git a/foo.py b/foo.py\n@@ -1,4 +1,4 @@\n" + "-old line of code here that is long\n+new line of code here that is long\n" * 8 + ) + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE}], + messages=[ + {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"}, + {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"}, + {"role": "assistant", "content": diff_artifact}, + {"role": "assistant", "content": diff_artifact}, + ], + ) + r1 = _analyze(db, tmp_path) + r2 = _analyze(db, tmp_path) + # Identical inputs -> byte-identical serialized section. + assert dataclasses.asdict(r1.parent_aggregation) == dataclasses.asdict( + r2.parent_aggregation + ) + + pa = r1.parent_aggregation + # by_kind follows the canonical ARTIFACT_KINDS order (deterministic). + order = {k: i for i, k in enumerate(analyzer.ARTIFACT_KINDS)} + idxs = [order[ks.artifact_kind] for ks in pa.by_kind] + assert idxs == sorted(idxs) + # Every emitted kind is from the canonical low-cardinality enum. + assert set(order) >= {ks.artifact_kind for ks in pa.by_kind} + # Candidates carry only hash + enums + counters; hash is a salted prefix. 
+ for g in pa.top_duplicate_groups: + assert len(g.content_hash) == 16 + assert g.artifact_kind in analyzer.ARTIFACT_KINDS + assert g.canonical_source_type in analyzer.BLOCK_TYPES + + +def test_classify_artifact_kind_is_deterministic_and_low_cardinality(): + cases = { + "diff --git a/x b/x\n@@ -1 +1 @@\n-a\n+b\n": "diff", + "Traceback (most recent call last):\n File x\nValueError: boom": "error_trace", + "## Summary\nThe worker aggregated all results successfully here.": "worker_summary", + "def worker_helper():\n return 'source code should win over worker word'": "file_content", + "some entirely opaque blob of bytes with no recognizable structure!!": "unknown_large_block", + } + for text, expected in cases.items(): + kind = analyzer.classify_artifact_kind(text) + assert kind == expected + assert kind in analyzer.ARTIFACT_KINDS + + +def test_parent_aggregation_can_be_disabled(tmp_path): + db = tmp_path / "state.db" + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE}], + messages=[ + {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"}, + {"role": "tool", "content": TEST_LOG_ARTIFACT, "tool_name": "Bash"}, + ], + ) + tool_messages = analyzer.load_tool_messages(db, since_hours=WIDE_WINDOW) + llm = analyzer.load_llm_bound_content(db, since_hours=WIDE_WINDOW) + heavy = analyzer.load_heavy_sessions(db, since_hours=WIDE_WINDOW, salt="s", top_n=5) + tel = analyzer.parse_telemetry( + tmp_path / "none.jsonl", since_hours=WIDE_WINDOW, total_input_tokens=0 + ) + report = analyzer.build_report( + date="2100-01-01", + since_hours=24, + salt="s", + tool_messages=tool_messages, + heavy_sessions=heavy, + telemetry=tel, + llm_contents=llm, + min_block_repeat=2, + parent_aggregation_shadow=False, + ) + assert report.parent_aggregation.enabled is False + assert report.parent_aggregation.duplicate_group_count == 0 + json_path, md_path = analyzer.write_report(report, tmp_path / "out") + md = md_path.read_text(encoding="utf-8") + assert "Parent 
Aggregation Artifacts — shadow mode" in md + assert "disabled" in md + + +def test_worker_routing_intact_alongside_parent_aggregation(tmp_path): + """Adding parent-aggregation telemetry must not disturb worker routing.""" + db = tmp_path / "state.db" + big_repeated = "row of unrelated build log output number 7 with filler text " * 200 + assert len(big_repeated) >= LARGE + _make_db_ex( + db, + sessions=[{"id": "s1", "started_at": FAR_FUTURE}], + messages=[ + {"role": "tool", "content": big_repeated, "tool_name": "Bash"}, + {"role": "tool", "content": big_repeated, "tool_name": "Bash"}, + {"role": "tool", "content": big_repeated, "tool_name": "Bash"}, + ], + ) + report = _analyze(db, tmp_path) + # Worker routing still classifies the large repeated block as a drop candidate. + rm = _route_map(report) + assert "likely_drop_candidate" in rm + assert report.worker_routing.est_drop_candidate_tokens > 0 + # And parent aggregation independently sees the same body as a duplicate. + assert report.parent_aggregation.duplicate_group_count == 1 From 957d04ae097668b1c1b08cdd66df799cbaa3f064 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 12 Jun 2026 14:47:49 +0200 Subject: [PATCH 8/9] docs: add ContextPilot self-evolve skill --- docs/guides/hermes-monitor.md | 29 ++ docs/guides/hermes.md | 9 + skills/contextpilot-self-evolve/SKILL.md | 274 +++++++++++++++++++ tests/test_contextpilot_self_evolve_skill.py | 112 ++++++++ 4 files changed, 424 insertions(+) create mode 100644 skills/contextpilot-self-evolve/SKILL.md create mode 100644 tests/test_contextpilot_self_evolve_skill.py diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md index 4aa9128..a851fa3 100644 --- a/docs/guides/hermes-monitor.md +++ b/docs/guides/hermes-monitor.md @@ -184,6 +184,35 @@ gate below before changing ContextPilot config or code. A defensive guard in `write_report` refuses to emit any forbidden raw-content key, so the reports are safe to ship from an unattended cron job. 
+## Self-evolve skill (Hermes) + +The monitor and analyzer above are bundled into a reusable Hermes skill so users +can run the same safe loop — install/enable ContextPilot, collect +telemetry/shadow data, analyze realized savings vs advisory candidates, and +propose improvements under strict safety gates: + +- Skill path: `skills/contextpilot-self-evolve/SKILL.md` + +The skill is **proposal-only**: it never auto-applies context routing, dropping, +or summarization, and it enforces the same privacy boundary (reports never emit +raw conversation/tool/system text, reasoning, or raw session ids). + +To use it in Hermes, copy or load the skill into your Hermes skills directory, +then invoke it by name: + +```bash +# copy into your Hermes skills directory (adjust path to your install) +mkdir -p ~/.hermes/skills/contextpilot-self-evolve +cp skills/contextpilot-self-evolve/SKILL.md \ + ~/.hermes/skills/contextpilot-self-evolve/SKILL.md +``` + +Then ask Hermes to run the **contextpilot-self-evolve** skill. It walks through +install/verify, the metadata-only monitor, the content-aware analyzer for +`--since-hours 24` and `168`, interpretation of realized vs advisory tokens, +optional read-only cron jobs, and the branch/tests/privacy/independent-review +gate required before any code or config change ships. + ## Accuracy gate This monitor only measures token/cost savings and operational signals. Before shipping ContextPilot changes, run a fixed golden eval set and require: diff --git a/docs/guides/hermes.md b/docs/guides/hermes.md index d13f293..7761697 100644 --- a/docs/guides/hermes.md +++ b/docs/guides/hermes.md @@ -97,6 +97,15 @@ Hermes ships with `ContextCompressor`, a threshold-based LLM-summarization engin ContextPilot runs *before* the threshold-based compressor, reducing how often the expensive summarization path is hit. 
+## Self-evolve skill + +For a guided, safety-gated loop that installs ContextPilot, monitors its real +token savings, scans for context-reduction opportunities, and proposes +improvements (without auto-applying risky changes), use the bundled Hermes +skill at `skills/contextpilot-self-evolve/SKILL.md`. See +[`hermes-monitor.md`](./hermes-monitor.md#self-evolve-skill-hermes) for how to +copy/load and invoke it. + ## Troubleshooting **Plugin not discovered after install.** Check `~/.hermes/plugins/ContextPilot/plugin.yaml` exists and contains `type: context_engine`. Run `hermes plugins list` to confirm. diff --git a/skills/contextpilot-self-evolve/SKILL.md b/skills/contextpilot-self-evolve/SKILL.md new file mode 100644 index 0000000..d88b81b --- /dev/null +++ b/skills/contextpilot-self-evolve/SKILL.md @@ -0,0 +1,274 @@ +--- +name: contextpilot-self-evolve +description: Use when a user wants to install/enable ContextPilot inside Hermes Agent and then run a safe, repeatable "self-evolve" loop — collect metadata-only telemetry and content-aware shadow data, analyze realized token savings vs advisory candidate tokens, and propose ContextPilot improvements under strict safety gates. Use it for monitoring token spend, scanning context-redundancy opportunities, setting up read-only daily/weekly cron analysis, and preparing reviewed, branch-gated code/config changes. Do NOT use it to auto-apply routing/drop/summarization changes; this skill only proposes risky changes and requires tests, privacy checks, and independent review before anything ships. 
+version: 1.0.0 +author: ContextPilot +license: MIT +metadata: + hermes: + tags: [contextpilot, hermes, telemetry, context-optimization, token-savings, safety-gated] + related_skills: [] + category: observability + safety: proposal-only +--- + +# ContextPilot Self-Evolve (Hermes) + +This skill drives a **safe, repeatable** loop for running ContextPilot inside +Hermes Agent and continuously improving it from real telemetry — **without** +auto-applying any risky context change. You measure, you analyze, you *propose*; +a human (plus tests, privacy checks, and independent review) decides what ships. + +> Core safety stance: **observe and propose only.** This skill never enables +> context routing, dropping, or summarization on its own. Shadow/advisory +> numbers are training/eval data, **not** realized savings, and must never be +> treated as something to "just turn on." + +## When to use this skill + +- A user asks to install or enable ContextPilot in Hermes and watch its impact. +- A user wants to know how many tokens/cost ContextPilot is actually saving. +- A user wants to find token-reduction *opportunities* (duplicate tool outputs, + cross-role repeated blocks, oversized tool results, routing/dedup candidates). +- A user wants a daily/weekly read-only cron that reports savings + opportunities. +- A user wants to propose a ContextPilot config or code change and needs the safe + workflow (branch, tests, privacy/no-raw-content checks, independent review). + +If the user instead wants the low-level integration mechanics, point them at +`docs/guides/hermes.md`; for the metadata-only monitor details, see +`docs/guides/hermes-monitor.md`. This skill orchestrates both into one loop. + +## Privacy boundary (read this first) + +There are two analysis tools with **different** read scopes: + +- `scripts/hermes_contextpilot_monitor.py` — **metadata only**. Never reads + `messages.content`, `sessions.system_prompt`, reasoning, or raw tool payloads. 
+- `scripts/analyze_hermes_context_opportunities.py` — **content-aware**. It + *may* read message/tool/system content **in-memory** to compute salted + SHA-256 fingerprints and aggregate counters. + +In **both** cases the rule is absolute: **reports must never emit raw +conversation text, tool-call payloads, system prompts, reasoning, or raw session +ids.** Session ids appear only as salted hashes. The analyzer has a defensive +`write_report` guard that refuses to emit forbidden raw-content keys; do not +weaken or bypass it. If you are ever unsure whether an output is safe to ship, +treat it as unsafe and stop. + +## Workflow + +### Step 1 — Install / enable ContextPilot in Hermes + +Normal install (do **not** use `--force`): + +```bash +hermes plugins install EfficientContext/ContextPilot --enable +hermes config set context.engine contextpilot +``` + +`--force` is **only** for an intentional update/reinstall over an existing +install — never as the default: + +```bash +hermes plugins install EfficientContext/ContextPilot --enable --force +``` + +If your Hermes version does not support `--enable`, install first and then use the +plugin menu: + +```bash +hermes plugins # General Plugins -> toggle "contextpilot" enabled +``` + +### Step 2 — Verify the context engine + restart + +Confirm Hermes is actually routing through ContextPilot. The active context +engine must be `contextpilot`: + +```yaml +# ~/.hermes config +context: + engine: contextpilot +``` + +```python +from hermes_cli.plugins import get_plugin_manager +engine = get_plugin_manager()._context_engine +print(engine.get_status()) # expect {'engine': 'contextpilot', ...} +``` + +Then **restart the Hermes gateway / start a fresh session** so the engine is +loaded. On startup you should see: + +``` +Plugin 'contextpilot' registered context engine: contextpilot +``` + +> The context-engine TUI submenu may show "contextpilot (not found)" — that is +> cosmetic; `get_status()` is the source of truth. 
+ +### Step 3 — Run the metadata-only monitor + +Use this as the safe baseline. It reports realized savings and operational +signals from telemetry/metadata only: + +```bash +python scripts/hermes_contextpilot_monitor.py \ + --out-dir ~/contextpilot/reports \ + --since-hours 24 \ + --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl +``` + +Reports: + +- `~/contextpilot/reports/daily_YYYY-MM-DD.json` +- `~/contextpilot/reports/daily_YYYY-MM-DD.md` + +The telemetry file is written by the ContextPilot Hermes plugin when savings +occur. `CONTEXTPILOT_DISABLE_TELEMETRY=1` disables writes; +`CONTEXTPILOT_TELEMETRY_FILE=/path` overrides the location. + +### Step 4 — Run the content-aware opportunity analyzer + +Run for both a rolling day and a rolling week to separate noise from trend: + +```bash +# last 24h +python scripts/analyze_hermes_context_opportunities.py \ + --state-db ~/.hermes/state.db \ + --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl \ + --out-dir ~/contextpilot/opportunities \ + --since-hours 24 + +# last 7 days (168h) +python scripts/analyze_hermes_context_opportunities.py \ + --state-db ~/.hermes/state.db \ + --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl \ + --out-dir ~/contextpilot/opportunities \ + --since-hours 168 +``` + +For a one-shot whole-history audit, swap the window for `--all-sessions`. +Reports: + +- `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.json` +- `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.md` + +The analyzer surfaces: exact duplicate tool outputs, repeated line/block +fingerprints, large outputs by `tool_name`, heavy sessions (hashed ids), +ContextPilot telemetry coverage/ratios, **LLM-bound cross-type repeated +blocks**, **Worker Context Routing shadow labels**, and **Parent Aggregation +Artifact** dedup telemetry. The shadow/parent sections are **on by default** and +collect P0 data only; pass `--disable-worker-routing-shadow` or +`--disable-parent-aggregation` to omit a section. 
+ +### Step 5 — Interpret: realized savings vs advisory candidates + +Keep these two numbers in separate mental buckets — never add them together: + +- **Realized savings** (telemetry: `chars_saved`, `~tokens`, savings ratio, + monitor report) — what ContextPilot *actually* saved via lossless dedup + + reorder. This is real and bankable. +- **Advisory / shadow candidate tokens** (analyzer: routing-shadow + `est_advisory_candidate_tokens`, parent-aggregation `est_duplicate_tokens`, + cross-type redundant tokens) — an **upper-bound estimate** of what a *future* + router/dedup *might* save. **Not realized.** It is training/eval data, and + every token estimate is a heuristic (`chars/4`). + +When reporting to the user, state realized savings as fact and label every +advisory number as a candidate that still needs validation. Do not imply that +advisory tokens are available simply by toggling a flag. + +### Step 6 — Optional read-only cron jobs + +Schedule the monitor and/or analyzer as **read-only watchdogs**. They produce +reports; they must not apply config or code changes. + +```python +cronjob( + action="create", + name="contextpilot-self-evolve-daily", + schedule="0 4 * * *", + repeat=7, + deliver="origin", + enabled_toolsets=["terminal", "file"], + prompt=""" +Run /root/work/ContextPilot/scripts/hermes_contextpilot_monitor.py with +--out-dir /root/contextpilot/reports --since-hours 24, then run +analyze_hermes_context_opportunities.py with --since-hours 24. Read today's +Markdown reports and send a short summary: realized token savings, session +count, whether ContextPilot events were observed, and the top advisory +opportunities (clearly labeled as candidates, not realized). Do NOT read raw +conversation content. Do NOT modify source/config. +""", +) +``` + +For a weekly trend, add a second job with `--since-hours 168` on a `0 5 * * 1` +schedule. Both stay strictly read-only. 
+ +### Step 7 — Propose improvements (do NOT auto-apply risky changes) + +From the reports, write a prioritized proposal. **Never** auto-enable context +**routing**, context **dropping**, or **summarization** based on shadow numbers. +Those are high-recall-sensitive changes that can silently drop needed context; +they require the accuracy gate plus human sign-off. + +Before any ContextPilot change ships, run a fixed golden eval set and require: + +- no task-success regression, +- no drop in context recall beyond the chosen threshold, +- no unsafe raw-content leakage in reports, +- no increase in failed tool calls. + +If any gate fails, hold the proposal and require human review. + +### Step 8 — Safe path for code/config changes + +For anything beyond a read-only report, follow this gate every time: + +1. **Branch.** Make changes on a dedicated branch; never on `main`. No + destructive git operations, no commit/push unless the user explicitly asks. +2. **Tests.** Add/extend tests and run the relevant suite (see below). A change + to analysis or routing logic must ship with coverage. +3. **Privacy / no-raw-content check.** Re-confirm no report path can emit raw + conversation/tool/system text, reasoning, or raw session ids. Keep the + `write_report` forbidden-key guard intact. +4. **Independent review.** Get a second, independent review (human or a separate + reviewing agent) focused on correctness, recall safety, and privacy before + merge. + +### Optional — delegated coding + independent verification + +If the user has a coding-agent workflow, you may delegate the *implementation* +of an approved proposal to a coding agent (e.g. Claude Code) on a branch, and +then run **independent verification** in Hermes (re-run tests, the privacy +guard, and the accuracy gate) rather than trusting the author's own check. This +two-party split (one writes, another verifies) is recommended but generic — any +"author + independent reviewer" arrangement satisfies the gate. 
The skill itself +never merges; a human approves. + +## Report locations (quick reference) + +| Tool | Scope | Default output | +|------|-------|----------------| +| `hermes_contextpilot_monitor.py` | metadata only | `~/contextpilot/reports/daily_YYYY-MM-DD.{json,md}` | +| `analyze_hermes_context_opportunities.py` | content-aware (hashes only in reports) | `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.{json,md}` | + +## Relevant tests + +```bash +python -m pytest tests/test_hermes_contextpilot_monitor.py \ + tests/test_hermes_context_opportunity_analyzer.py \ + tests/test_contextpilot_self_evolve_skill.py -q +``` + +## Hard rules (never violate) + +- Observe and **propose** only — never auto-apply routing/drop/summarization. +- Reports never contain raw conversation/tool/system text, reasoning, or raw + session ids; session ids are salted hashes only. +- Realized savings and advisory/shadow candidate tokens are reported separately. +- `--force` install only for an intentional update/reinstall. +- Code/config changes require: branch, tests, privacy check, independent review. +- No destructive git operations; no commit/push unless the user asks. diff --git a/tests/test_contextpilot_self_evolve_skill.py b/tests/test_contextpilot_self_evolve_skill.py new file mode 100644 index 0000000..5da09c4 --- /dev/null +++ b/tests/test_contextpilot_self_evolve_skill.py @@ -0,0 +1,112 @@ +"""Validation tests for the contextpilot-self-evolve Hermes skill. + +These guard the SKILL.md packaging contract: valid YAML frontmatter, required +Hermes fields, size limits, and the presence of the safety/privacy phrases that +make this skill safe to ship (it must stay proposal-only and never promise to +auto-apply risky context changes). 
+""" +from pathlib import Path + +import pytest + +import yaml + + +SKILL_PATH = ( + Path(__file__).resolve().parents[1] + / "skills" + / "contextpilot-self-evolve" + / "SKILL.md" +) + +MAX_SKILL_CHARS = 100_000 +MAX_DESCRIPTION_CHARS = 1024 + + +def _read_skill(): + text = SKILL_PATH.read_text(encoding="utf-8") + assert text.startswith("---\n"), "SKILL.md must start with YAML frontmatter" + # Split on the closing frontmatter fence. + _, frontmatter, body = text.split("---\n", 2) + meta = yaml.safe_load(frontmatter) + return text, meta, body + + +def test_skill_file_exists(): + assert SKILL_PATH.is_file(), f"missing skill file: {SKILL_PATH}" + + +def test_skill_size_under_limit(): + text = SKILL_PATH.read_text(encoding="utf-8") + assert len(text) <= MAX_SKILL_CHARS, ( + f"SKILL.md is {len(text)} chars, exceeds {MAX_SKILL_CHARS}" + ) + + +def test_frontmatter_parses_and_has_required_fields(): + _, meta, _ = _read_skill() + assert isinstance(meta, dict), "frontmatter must parse to a mapping" + for field in ("name", "description", "version", "author", "license", "metadata"): + assert field in meta, f"frontmatter missing required field: {field}" + + +def test_name_matches(): + _, meta, _ = _read_skill() + assert meta["name"] == "contextpilot-self-evolve" + + +def test_description_is_use_when_and_within_limit(): + _, meta, _ = _read_skill() + description = meta["description"] + assert isinstance(description, str) and description.strip() + assert description.lstrip().lower().startswith("use when"), ( + "description should start with 'Use when' per Hermes convention" + ) + assert len(description) <= MAX_DESCRIPTION_CHARS, ( + f"description is {len(description)} chars, exceeds {MAX_DESCRIPTION_CHARS}" + ) + + +def test_metadata_has_tags(): + _, meta, _ = _read_skill() + metadata = meta["metadata"] + assert isinstance(metadata, dict) + hermes_meta = metadata.get("hermes") + assert isinstance(hermes_meta, dict), "metadata.hermes must be present" + assert 
hermes_meta.get("tags"), "metadata.hermes.tags must be a non-empty list" + assert isinstance(hermes_meta["tags"], list) + + +@pytest.mark.parametrize( + "phrase", + [ + # proposal-only / no auto-apply of risky changes + "propose", + "independent review", + # privacy boundary + "raw", + "salted", + "session ids", + # realized vs advisory separation + "advisory", + "realized", + # safe install convention + "--force", + # change-gate requirements + "branch", + "tests", + ], +) +def test_required_safety_phrases_present(phrase): + text, _, _ = _read_skill() + assert phrase.lower() in text.lower(), f"SKILL.md missing safety phrase: {phrase!r}" + + +def test_does_not_promise_auto_apply(): + """The skill must keep its proposal-only stance for risky changes.""" + text, _, _ = _read_skill() + lowered = text.lower() + # Must explicitly disclaim auto-applying routing/drop/summarization. + assert "never auto-apply" in lowered or "do not auto-apply" in lowered or ( + "not" in lowered and "auto-enable" in lowered + ), "SKILL.md must state it never auto-applies risky context changes" From 746b7d4d4d31a24c162d8bafb621fdcf439c712a Mon Sep 17 00:00:00 2001 From: root Date: Fri, 12 Jun 2026 15:09:51 +0200 Subject: [PATCH 9/9] revert: remove user-facing self-evolve skill --- docs/guides/hermes-monitor.md | 29 -- docs/guides/hermes.md | 9 - skills/contextpilot-self-evolve/SKILL.md | 274 ------------------- tests/test_contextpilot_self_evolve_skill.py | 112 -------- 4 files changed, 424 deletions(-) delete mode 100644 skills/contextpilot-self-evolve/SKILL.md delete mode 100644 tests/test_contextpilot_self_evolve_skill.py diff --git a/docs/guides/hermes-monitor.md b/docs/guides/hermes-monitor.md index a851fa3..4aa9128 100644 --- a/docs/guides/hermes-monitor.md +++ b/docs/guides/hermes-monitor.md @@ -184,35 +184,6 @@ gate below before changing ContextPilot config or code. 
A defensive guard in `write_report` refuses to emit any forbidden raw-content key, so the reports are safe to ship from an unattended cron job. -## Self-evolve skill (Hermes) - -The monitor and analyzer above are bundled into a reusable Hermes skill so users -can run the same safe loop — install/enable ContextPilot, collect -telemetry/shadow data, analyze realized savings vs advisory candidates, and -propose improvements under strict safety gates: - -- Skill path: `skills/contextpilot-self-evolve/SKILL.md` - -The skill is **proposal-only**: it never auto-applies context routing, dropping, -or summarization, and it enforces the same privacy boundary (reports never emit -raw conversation/tool/system text, reasoning, or raw session ids). - -To use it in Hermes, copy or load the skill into your Hermes skills directory, -then invoke it by name: - -```bash -# copy into your Hermes skills directory (adjust path to your install) -mkdir -p ~/.hermes/skills/contextpilot-self-evolve -cp skills/contextpilot-self-evolve/SKILL.md \ - ~/.hermes/skills/contextpilot-self-evolve/SKILL.md -``` - -Then ask Hermes to run the **contextpilot-self-evolve** skill. It walks through -install/verify, the metadata-only monitor, the content-aware analyzer for -`--since-hours 24` and `168`, interpretation of realized vs advisory tokens, -optional read-only cron jobs, and the branch/tests/privacy/independent-review -gate required before any code or config change ships. - ## Accuracy gate This monitor only measures token/cost savings and operational signals. Before shipping ContextPilot changes, run a fixed golden eval set and require: diff --git a/docs/guides/hermes.md b/docs/guides/hermes.md index 7761697..d13f293 100644 --- a/docs/guides/hermes.md +++ b/docs/guides/hermes.md @@ -97,15 +97,6 @@ Hermes ships with `ContextCompressor`, a threshold-based LLM-summarization engin ContextPilot runs *before* the threshold-based compressor, reducing how often the expensive summarization path is hit. 
-## Self-evolve skill - -For a guided, safety-gated loop that installs ContextPilot, monitors its real -token savings, scans for context-reduction opportunities, and proposes -improvements (without auto-applying risky changes), use the bundled Hermes -skill at `skills/contextpilot-self-evolve/SKILL.md`. See -[`hermes-monitor.md`](./hermes-monitor.md#self-evolve-skill-hermes) for how to -copy/load and invoke it. - ## Troubleshooting **Plugin not discovered after install.** Check `~/.hermes/plugins/ContextPilot/plugin.yaml` exists and contains `type: context_engine`. Run `hermes plugins list` to confirm. diff --git a/skills/contextpilot-self-evolve/SKILL.md b/skills/contextpilot-self-evolve/SKILL.md deleted file mode 100644 index d88b81b..0000000 --- a/skills/contextpilot-self-evolve/SKILL.md +++ /dev/null @@ -1,274 +0,0 @@ ---- -name: contextpilot-self-evolve -description: Use when a user wants to install/enable ContextPilot inside Hermes Agent and then run a safe, repeatable "self-evolve" loop — collect metadata-only telemetry and content-aware shadow data, analyze realized token savings vs advisory candidate tokens, and propose ContextPilot improvements under strict safety gates. Use it for monitoring token spend, scanning context-redundancy opportunities, setting up read-only daily/weekly cron analysis, and preparing reviewed, branch-gated code/config changes. Do NOT use it to auto-apply routing/drop/summarization changes; this skill only proposes risky changes and requires tests, privacy checks, and independent review before anything ships. 
-version: 1.0.0 -author: ContextPilot -license: MIT -metadata: - hermes: - tags: [contextpilot, hermes, telemetry, context-optimization, token-savings, safety-gated] - related_skills: [] - category: observability - safety: proposal-only ---- - -# ContextPilot Self-Evolve (Hermes) - -This skill drives a **safe, repeatable** loop for running ContextPilot inside -Hermes Agent and continuously improving it from real telemetry — **without** -auto-applying any risky context change. You measure, you analyze, you *propose*; -a human (plus tests, privacy checks, and independent review) decides what ships. - -> Core safety stance: **observe and propose only.** This skill never enables -> context routing, dropping, or summarization on its own. Shadow/advisory -> numbers are training/eval data, **not** realized savings, and must never be -> treated as something to "just turn on." - -## When to use this skill - -- A user asks to install or enable ContextPilot in Hermes and watch its impact. -- A user wants to know how many tokens/cost ContextPilot is actually saving. -- A user wants to find token-reduction *opportunities* (duplicate tool outputs, - cross-role repeated blocks, oversized tool results, routing/dedup candidates). -- A user wants a daily/weekly read-only cron that reports savings + opportunities. -- A user wants to propose a ContextPilot config or code change and needs the safe - workflow (branch, tests, privacy/no-raw-content checks, independent review). - -If the user instead wants the low-level integration mechanics, point them at -`docs/guides/hermes.md`; for the metadata-only monitor details, see -`docs/guides/hermes-monitor.md`. This skill orchestrates both into one loop. - -## Privacy boundary (read this first) - -There are two analysis tools with **different** read scopes: - -- `scripts/hermes_contextpilot_monitor.py` — **metadata only**. Never reads - `messages.content`, `sessions.system_prompt`, reasoning, or raw tool payloads. 
-- `scripts/analyze_hermes_context_opportunities.py` — **content-aware**. It - *may* read message/tool/system content **in-memory** to compute salted - SHA-256 fingerprints and aggregate counters. - -In **both** cases the rule is absolute: **reports must never emit raw -conversation text, tool-call payloads, system prompts, reasoning, or raw session -ids.** Session ids appear only as salted hashes. The analyzer has a defensive -`write_report` guard that refuses to emit forbidden raw-content keys; do not -weaken or bypass it. If you are ever unsure whether an output is safe to ship, -treat it as unsafe and stop. - -## Workflow - -### Step 1 — Install / enable ContextPilot in Hermes - -Normal install (do **not** use `--force`): - -```bash -hermes plugins install EfficientContext/ContextPilot --enable -hermes config set context.engine contextpilot -``` - -`--force` is **only** for an intentional update/reinstall over an existing -install — never as the default: - -```bash -hermes plugins install EfficientContext/ContextPilot --enable --force -``` - -If your Hermes version does not support `--enable`, install first and then use the -plugin menu: - -```bash -hermes plugins # General Plugins -> toggle "contextpilot" enabled -``` - -### Step 2 — Verify the context engine + restart - -Confirm Hermes is actually routing through ContextPilot. The active context -engine must be `contextpilot`: - -```yaml -# ~/.hermes config -context: - engine: contextpilot -``` - -```python -from hermes_cli.plugins import get_plugin_manager -engine = get_plugin_manager()._context_engine -print(engine.get_status()) # expect {'engine': 'contextpilot', ...} -``` - -Then **restart the Hermes gateway / start a fresh session** so the engine is -loaded. On startup you should see: - -``` -Plugin 'contextpilot' registered context engine: contextpilot -``` - -> The context-engine TUI submenu may show "contextpilot (not found)" — that is -> cosmetic; `get_status()` is the source of truth. 
- -### Step 3 — Run the metadata-only monitor - -Use this as the safe baseline. It reports realized savings and operational -signals from telemetry/metadata only: - -```bash -python scripts/hermes_contextpilot_monitor.py \ - --out-dir ~/contextpilot/reports \ - --since-hours 24 \ - --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl -``` - -Reports: - -- `~/contextpilot/reports/daily_YYYY-MM-DD.json` -- `~/contextpilot/reports/daily_YYYY-MM-DD.md` - -The telemetry file is written by the ContextPilot Hermes plugin when savings -occur. `CONTEXTPILOT_DISABLE_TELEMETRY=1` disables writes; -`CONTEXTPILOT_TELEMETRY_FILE=/path` overrides the location. - -### Step 4 — Run the content-aware opportunity analyzer - -Run for both a rolling day and a rolling week to separate noise from trend: - -```bash -# last 24h -python scripts/analyze_hermes_context_opportunities.py \ - --state-db ~/.hermes/state.db \ - --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl \ - --out-dir ~/contextpilot/opportunities \ - --since-hours 24 - -# last 7 days (168h) -python scripts/analyze_hermes_context_opportunities.py \ - --state-db ~/.hermes/state.db \ - --telemetry-file ~/.hermes/contextpilot/telemetry.jsonl \ - --out-dir ~/contextpilot/opportunities \ - --since-hours 168 -``` - -For a one-shot whole-history audit, swap the window for `--all-sessions`. -Reports: - -- `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.json` -- `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.md` - -The analyzer surfaces: exact duplicate tool outputs, repeated line/block -fingerprints, large outputs by `tool_name`, heavy sessions (hashed ids), -ContextPilot telemetry coverage/ratios, **LLM-bound cross-type repeated -blocks**, **Worker Context Routing shadow labels**, and **Parent Aggregation -Artifact** dedup telemetry. The shadow/parent sections are **on by default** and -collect P0 data only; pass `--disable-worker-routing-shadow` or -`--disable-parent-aggregation` to omit a section. 
- -### Step 5 — Interpret: realized savings vs advisory candidates - -Keep these two numbers in separate mental buckets — never add them together: - -- **Realized savings** (telemetry: `chars_saved`, `~tokens`, savings ratio, - monitor report) — what ContextPilot *actually* saved via lossless dedup + - reorder. This is real and bankable. -- **Advisory / shadow candidate tokens** (analyzer: routing-shadow - `est_advisory_candidate_tokens`, parent-aggregation `est_duplicate_tokens`, - cross-type redundant tokens) — an **upper-bound estimate** of what a *future* - router/dedup *might* save. **Not realized.** It is training/eval data, and - every token estimate is a heuristic (`chars/4`). - -When reporting to the user, state realized savings as fact and label every -advisory number as a candidate that still needs validation. Do not imply that -advisory tokens are available simply by toggling a flag. - -### Step 6 — Optional read-only cron jobs - -Schedule the monitor and/or analyzer as **read-only watchdogs**. They produce -reports; they must not apply config or code changes. - -```python -cronjob( - action="create", - name="contextpilot-self-evolve-daily", - schedule="0 4 * * *", - repeat=7, - deliver="origin", - enabled_toolsets=["terminal", "file"], - prompt=""" -Run /root/work/ContextPilot/scripts/hermes_contextpilot_monitor.py with ---out-dir /root/contextpilot/reports --since-hours 24, then run -analyze_hermes_context_opportunities.py with --since-hours 24. Read today's -Markdown reports and send a short summary: realized token savings, session -count, whether ContextPilot events were observed, and the top advisory -opportunities (clearly labeled as candidates, not realized). Do NOT read raw -conversation content. Do NOT modify source/config. -""", -) -``` - -For a weekly trend, add a second job with `--since-hours 168` on a `0 5 * * 1` -schedule. Both stay strictly read-only. 
- -### Step 7 — Propose improvements (do NOT auto-apply risky changes) - -From the reports, write a prioritized proposal. **Never** auto-enable context -**routing**, context **dropping**, or **summarization** based on shadow numbers. -Those are high-recall-sensitive changes that can silently drop needed context; -they require the accuracy gate plus human sign-off. - -Before any ContextPilot change ships, run a fixed golden eval set and require: - -- no task-success regression, -- no drop in context recall beyond the chosen threshold, -- no unsafe raw-content leakage in reports, -- no increase in failed tool calls. - -If any gate fails, hold the proposal and require human review. - -### Step 8 — Safe path for code/config changes - -For anything beyond a read-only report, follow this gate every time: - -1. **Branch.** Make changes on a dedicated branch; never on `main`. No - destructive git operations, no commit/push unless the user explicitly asks. -2. **Tests.** Add/extend tests and run the relevant suite (see below). A change - to analysis or routing logic must ship with coverage. -3. **Privacy / no-raw-content check.** Re-confirm no report path can emit raw - conversation/tool/system text, reasoning, or raw session ids. Keep the - `write_report` forbidden-key guard intact. -4. **Independent review.** Get a second, independent review (human or a separate - reviewing agent) focused on correctness, recall safety, and privacy before - merge. - -### Optional — delegated coding + independent verification - -If the user has a coding-agent workflow, you may delegate the *implementation* -of an approved proposal to a coding agent (e.g. Claude Code) on a branch, and -then run **independent verification** in Hermes (re-run tests, the privacy -guard, and the accuracy gate) rather than trusting the author's own check. This -two-party split (one writes, another verifies) is recommended but generic — any -"author + independent reviewer" arrangement satisfies the gate. 
The skill itself -never merges; a human approves. - -## Report locations (quick reference) - -| Tool | Scope | Default output | -|------|-------|----------------| -| `hermes_contextpilot_monitor.py` | metadata only | `~/contextpilot/reports/daily_YYYY-MM-DD.{json,md}` | -| `analyze_hermes_context_opportunities.py` | content-aware (hashes only in reports) | `~/contextpilot/opportunities/opportunities_YYYY-MM-DD.{json,md}` | - -## Relevant tests - -```bash -python -m pytest tests/test_hermes_contextpilot_monitor.py \ - tests/test_hermes_context_opportunity_analyzer.py \ - tests/test_contextpilot_self_evolve_skill.py -q -``` - -## Hard rules (never violate) - -- Observe and **propose** only — never auto-apply routing/drop/summarization. -- Reports never contain raw conversation/tool/system text, reasoning, or raw - session ids; session ids are salted hashes only. -- Realized savings and advisory/shadow candidate tokens are reported separately. -- `--force` install only for an intentional update/reinstall. -- Code/config changes require: branch, tests, privacy check, independent review. -- No destructive git operations; no commit/push unless the user asks. diff --git a/tests/test_contextpilot_self_evolve_skill.py b/tests/test_contextpilot_self_evolve_skill.py deleted file mode 100644 index 5da09c4..0000000 --- a/tests/test_contextpilot_self_evolve_skill.py +++ /dev/null @@ -1,112 +0,0 @@ -"""Validation tests for the contextpilot-self-evolve Hermes skill. - -These guard the SKILL.md packaging contract: valid YAML frontmatter, required -Hermes fields, size limits, and the presence of the safety/privacy phrases that -make this skill safe to ship (it must stay proposal-only and never promise to -auto-apply risky context changes). 
-""" -from pathlib import Path - -import pytest - -import yaml - - -SKILL_PATH = ( - Path(__file__).resolve().parents[1] - / "skills" - / "contextpilot-self-evolve" - / "SKILL.md" -) - -MAX_SKILL_CHARS = 100_000 -MAX_DESCRIPTION_CHARS = 1024 - - -def _read_skill(): - text = SKILL_PATH.read_text(encoding="utf-8") - assert text.startswith("---\n"), "SKILL.md must start with YAML frontmatter" - # Split on the closing frontmatter fence. - _, frontmatter, body = text.split("---\n", 2) - meta = yaml.safe_load(frontmatter) - return text, meta, body - - -def test_skill_file_exists(): - assert SKILL_PATH.is_file(), f"missing skill file: {SKILL_PATH}" - - -def test_skill_size_under_limit(): - text = SKILL_PATH.read_text(encoding="utf-8") - assert len(text) <= MAX_SKILL_CHARS, ( - f"SKILL.md is {len(text)} chars, exceeds {MAX_SKILL_CHARS}" - ) - - -def test_frontmatter_parses_and_has_required_fields(): - _, meta, _ = _read_skill() - assert isinstance(meta, dict), "frontmatter must parse to a mapping" - for field in ("name", "description", "version", "author", "license", "metadata"): - assert field in meta, f"frontmatter missing required field: {field}" - - -def test_name_matches(): - _, meta, _ = _read_skill() - assert meta["name"] == "contextpilot-self-evolve" - - -def test_description_is_use_when_and_within_limit(): - _, meta, _ = _read_skill() - description = meta["description"] - assert isinstance(description, str) and description.strip() - assert description.lstrip().lower().startswith("use when"), ( - "description should start with 'Use when' per Hermes convention" - ) - assert len(description) <= MAX_DESCRIPTION_CHARS, ( - f"description is {len(description)} chars, exceeds {MAX_DESCRIPTION_CHARS}" - ) - - -def test_metadata_has_tags(): - _, meta, _ = _read_skill() - metadata = meta["metadata"] - assert isinstance(metadata, dict) - hermes_meta = metadata.get("hermes") - assert isinstance(hermes_meta, dict), "metadata.hermes must be present" - assert 
hermes_meta.get("tags"), "metadata.hermes.tags must be a non-empty list" - assert isinstance(hermes_meta["tags"], list) - - -@pytest.mark.parametrize( - "phrase", - [ - # proposal-only / no auto-apply of risky changes - "propose", - "independent review", - # privacy boundary - "raw", - "salted", - "session ids", - # realized vs advisory separation - "advisory", - "realized", - # safe install convention - "--force", - # change-gate requirements - "branch", - "tests", - ], -) -def test_required_safety_phrases_present(phrase): - text, _, _ = _read_skill() - assert phrase.lower() in text.lower(), f"SKILL.md missing safety phrase: {phrase!r}" - - -def test_does_not_promise_auto_apply(): - """The skill must keep its proposal-only stance for risky changes.""" - text, _, _ = _read_skill() - lowered = text.lower() - # Must explicitly disclaim auto-applying routing/drop/summarization. - assert "never auto-apply" in lowered or "do not auto-apply" in lowered or ( - "not" in lowered and "auto-enable" in lowered - ), "SKILL.md must state it never auto-applies risky context changes"