diff --git a/.gitignore b/.gitignore index e4d3908..5eb2631 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,13 @@ dist/ *.log *.jsonl *.egg-info/ + +# Trace-derived validation sets are local-only / private: generated corpora hold +# raw conversation content and must never be committed. The in-repo convenience +# dir is ignored; raw corpora also match the *.jsonl rule above. +.contextpilot_validation/ +# ...but the committed SYNTHETIC test fixture (no real trace data) is kept. +!tests/fixtures/trace_validation/synthetic_cases.jsonl */.DS_Store *.DS_Store diff --git a/contextpilot/trace_validation/__init__.py b/contextpilot/trace_validation/__init__.py new file mode 100644 index 0000000..b15790d --- /dev/null +++ b/contextpilot/trace_validation/__init__.py @@ -0,0 +1,82 @@ +"""Trace-derived validation-set framework for ContextPilot. + +A fixed, replayable corpus + a gate runner so any future accuracy-affecting or +runtime-payload-changing change can be checked against stable Hermes traces +instead of an ad-hoc "run once and see". + +Two pieces, with a deliberate privacy split: + +* :mod:`.builder` reads a local Hermes SQLite DB (read-only) and exports a fixed + JSONL corpus under a local, gitignored directory. Raw content lives ONLY in + that local artifact; the sidecar manifest is privacy-safe. +* :mod:`.runner` replays the corpus through ContextPilot's optimization in a + baseline (``off``) mode vs a configured candidate mode and checks + accuracy-preservation invariants, emitting a privacy-safe pass/fail report. + +Both reuse the analyzer's read-only DB loaders, salted hashing, and forbidden-key +guard so the privacy primitives stay defined in one place. 
+""" +from __future__ import annotations + +from .builder import ( + DEFAULT_OUT_DIR, + DEFAULT_SALT, + DEFAULT_STATE_DB, + build_manifest, + case_to_json, + load_trace_cases, + write_validation_set, +) +from .builder import main as build_main +from .models import ( + DEFAULT_CASE_LIMIT, + DEFAULT_MIN_INPUT_TOKENS, + MUTABLE_BLOCK_TYPE, + VALIDATION_SET_SCHEMA_VERSION, + TraceCase, + TraceMessage, + ValidationCaseResult, + ValidationReport, +) +from .runner import ( + INVARIANT_NAMES, + assert_report_privacy_safe, + check_invariants, + load_cases, + optimize_case, + render_markdown, + report_to_dict, + run_validation, +) +from .runner import main as run_main + +__all__ = [ + # models / constants + "VALIDATION_SET_SCHEMA_VERSION", + "DEFAULT_CASE_LIMIT", + "DEFAULT_MIN_INPUT_TOKENS", + "MUTABLE_BLOCK_TYPE", + "INVARIANT_NAMES", + "TraceMessage", + "TraceCase", + "ValidationCaseResult", + "ValidationReport", + # builder + "DEFAULT_STATE_DB", + "DEFAULT_OUT_DIR", + "DEFAULT_SALT", + "load_trace_cases", + "case_to_json", + "build_manifest", + "write_validation_set", + "build_main", + # runner + "load_cases", + "optimize_case", + "check_invariants", + "run_validation", + "report_to_dict", + "render_markdown", + "assert_report_privacy_safe", + "run_main", +] diff --git a/contextpilot/trace_validation/builder.py b/contextpilot/trace_validation/builder.py new file mode 100644 index 0000000..1249e38 --- /dev/null +++ b/contextpilot/trace_validation/builder.py @@ -0,0 +1,384 @@ +"""Trace-derived validation-set builder (read-only, local-only output). + +Reads a Hermes local SQLite state DB in read-only mode and exports a *fixed* +JSONL corpus of replayable cases under a local, gitignored directory. The corpus +captures the exact LLM-bound payload (system/skill prompts + ordered messages) +so future accuracy-affecting or runtime-payload-changing changes can be validated +against a stable set instead of an ad-hoc "run once and see". 
+ +Privacy contract: + +* The DB is opened ``mode=ro`` and never written. +* Raw ``content`` is written ONLY into the local JSONL artifact (default under + the user's home, and the in-repo convenience dir is gitignored). It is never + committed and never placed in the privacy-safe manifest. +* Case ids are salted hashes of the session id -- the raw session id is never + emitted. The manifest carries only a salt *fingerprint*, counters, and enums. +* Sampling is conservative by default (small ``--limit``, a time window) so the + artifact stays a representative sample, not a full export. + +This module deliberately reuses the analyzer's read-only DB helpers and salted +hashing rather than re-implementing them, so the privacy primitives stay in one +place. +""" +from __future__ import annotations + +import argparse +import datetime as dt +import json +from dataclasses import asdict +from pathlib import Path + +from contextpilot.hermes_opportunities.db import ( + _classify_system_prompt, + _connect_readonly, + _message_block_type, + _window_cutoff, +) +from contextpilot.hermes_opportunities.privacy import ( + _assert_no_forbidden_keys, + _salt_fingerprint, + _salted_hash, +) + +from .models import ( + DEFAULT_CASE_LIMIT, + DEFAULT_MIN_INPUT_TOKENS, + DEFAULT_MIN_MESSAGES, + DEFAULT_SINCE_HOURS, + VALIDATION_SET_SCHEMA_VERSION, + TraceCase, + TraceMessage, +) + + +# Manifest dictionaries are passed through the shared privacy guard, which +# rejects forbidden key names such as "system_prompt" or "user_prompt". Keep the +# human meaning but avoid those exact raw-content-shaped key names in reports. 
+_BLOCK_TYPE_REPORT_KEYS = { + "system_prompt": "system_ctx", + "skill_prompt": "skill_ctx", + "user_prompt": "user_ctx", + "assistant_context": "assistant_ctx", + "tool_result": "tool_result", +} + + +def _report_block_type_key(block_type: str) -> str: + return _BLOCK_TYPE_REPORT_KEYS.get(block_type, block_type.replace("prompt", "ctx")) + +DEFAULT_STATE_DB = Path("/root/.hermes/state.db") +# Default output lives OUTSIDE the repo, under the user's home, so a generated +# corpus can never be committed by accident. The in-repo ``.contextpilot_validation/`` +# convenience dir is also gitignored for users who prefer to keep it local-to-repo. +DEFAULT_OUT_DIR = Path.home() / "contextpilot" / "validation_sets" +DEFAULT_SALT = "contextpilot-trace-validation-v1" + + +def _order_clause(mcols: set[str]) -> str: + """Pick a deterministic message ordering column, preferring explicit ids.""" + if "id" in mcols: + return "messages.id ASC" + if "timestamp" in mcols: + return "messages.timestamp ASC, messages.rowid ASC" + return "messages.rowid ASC" + + +def load_trace_cases( + db_path: Path, + *, + since_hours: int, + salt: str, + limit: int, + all_sessions: bool = False, + min_input_tokens: int = DEFAULT_MIN_INPUT_TOKENS, + min_messages: int = DEFAULT_MIN_MESSAGES, + include_system_prompt: bool = True, +) -> list[TraceCase]: + """Load up to ``limit`` replayable cases from the Hermes state DB. + + Sessions are filtered by the time window (unless ``all_sessions``), archival + flag, and ``min_input_tokens``, then ordered by ``input_tokens`` descending so + the heaviest (most worth validating) sessions are sampled first. Per session + the optional classified system prompt is emitted first, followed by active + messages in deterministic order. Content is read in-memory only. 
+ """ + cutoff = _window_cutoff(since_hours, all_sessions) + conn = _connect_readonly(db_path) + try: + scols = {row[1] for row in conn.execute("PRAGMA table_info(sessions)")} + mcols = {row[1] for row in conn.execute("PRAGMA table_info(messages)")} + if "id" not in scols: + return [] + + wanted = ["id", "source", "input_tokens"] + select_cols = [c if c in scols else f"NULL AS {c}" for c in wanted] + has_sys = include_system_prompt and "system_prompt" in scols + select_cols.append("system_prompt" if has_sys else "NULL AS system_prompt") + + where: list[str] = [] + params: list[object] = [] + if cutoff is not None and "started_at" in scols: + where.append("started_at >= ?") + params.append(cutoff) + if "archived" in scols: + where.append("archived = 0") + if min_input_tokens > 0 and "input_tokens" in scols: + where.append("input_tokens >= ?") + params.append(min_input_tokens) + sql = f"SELECT {', '.join(select_cols)} FROM sessions" + if where: + sql += " WHERE " + " AND ".join(where) + if "input_tokens" in scols: + sql += " ORDER BY input_tokens DESC" + session_rows = conn.execute(sql, params).fetchall() + + # Pre-resolve the per-session message query shape once. 
+ has_content = "content" in mcols + has_role = "role" in mcols + has_tool = "tool_name" in mcols + has_session_fk = "session_id" in mcols + msg_select = ", ".join( + [ + "messages.role" if has_role else "NULL AS role", + "messages.content", + "messages.tool_name" if has_tool else "NULL AS tool_name", + ] + ) + order_by = _order_clause(mcols) + + cases: list[TraceCase] = [] + for sid, source, input_tokens, system_prompt in session_rows: + if len(cases) >= limit: + break + messages: list[TraceMessage] = [] + + if has_sys and system_prompt is not None: + text = str(system_prompt) + messages.append( + TraceMessage( + role="system", + block_type=_classify_system_prompt(text), + content=text, + ) + ) + + if has_content and has_session_fk: + mwhere = ["messages.content IS NOT NULL", "messages.session_id = ?"] + mparams: list[object] = [sid] + if has_role: + mwhere.append( + "messages.role IN ('system', 'user', 'assistant', 'tool')" + ) + if "active" in mcols: + mwhere.append("messages.active = 1") + msql = ( + f"SELECT {msg_select} FROM messages " + f"WHERE {' AND '.join(mwhere)} ORDER BY {order_by}" + ) + for role, content, tool_name in conn.execute(msql, mparams): + if content is None: + continue + messages.append( + TraceMessage( + role=role, + block_type=_message_block_type(role, tool_name), + content=str(content), + ) + ) + + if len(messages) < min_messages: + continue + cases.append( + TraceCase( + case_id=_salted_hash(str(sid), salt), + source=source, + input_tokens=int(input_tokens or 0), + message_count=len(messages), + messages=messages, + ) + ) + finally: + conn.close() + return cases + + +def case_to_json(case: TraceCase) -> dict: + """Serialize a case for the LOCAL JSONL artifact (includes raw content).""" + return { + "schema_version": VALIDATION_SET_SCHEMA_VERSION, + "case_id": case.case_id, + "source": case.source, + "input_tokens": case.input_tokens, + "message_count": case.message_count, + "messages": [asdict(m) for m in case.messages], + } + + +def 
build_manifest( + cases: list[TraceCase], + *, + date: str, + salt: str, + since_hours: int, + all_sessions: bool, + min_input_tokens: int, + include_system_prompt: bool, + corpus_filename: str, +) -> dict: + """Build the PRIVACY-SAFE manifest: counters, enums, and a salt fingerprint. + + Never contains raw content; passed through the forbidden-key guard before it + is returned so a future regression cannot smuggle content in via this path. + """ + by_source: dict[str, int] = {} + by_block_type: dict[str, int] = {} + total_messages = 0 + for case in cases: + key = case.source or "unknown" + by_source[key] = by_source.get(key, 0) + 1 + total_messages += case.message_count + for m in case.messages: + report_key = _report_block_type_key(m.block_type) + by_block_type[report_key] = by_block_type.get(report_key, 0) + 1 + + manifest = { + "schema_version": VALIDATION_SET_SCHEMA_VERSION, + "generated_date": date, + "salt_fingerprint": _salt_fingerprint(salt), + "window": "all_sessions" if all_sessions else f"last_{since_hours}h", + "since_hours": since_hours, + "all_sessions": all_sessions, + "min_input_tokens": min_input_tokens, + "include_system_prompt": include_system_prompt, + "corpus_file": corpus_filename, + "case_count": len(cases), + "total_messages": total_messages, + "cases_by_source": by_source, + "messages_by_block_type": by_block_type, + "privacy_note": ( + "manifest is metadata-only (salted ids, counters, enums); the corpus " + "JSONL holds raw content and is local-only / gitignored, never committed" + ), + } + _assert_no_forbidden_keys(manifest) + return manifest + + +def write_validation_set( + cases: list[TraceCase], manifest: dict, out_dir: Path, corpus_filename: str +) -> tuple[Path, Path]: + """Write the local JSONL corpus and its privacy-safe manifest sidecar.""" + out_dir.mkdir(parents=True, exist_ok=True) + corpus_path = out_dir / corpus_filename + manifest_path = out_dir / (corpus_filename + ".manifest.json") + with corpus_path.open("w", 
encoding="utf-8") as f: + for case in cases: + f.write(json.dumps(case_to_json(case), ensure_ascii=False) + "\n") + manifest_path.write_text( + json.dumps(manifest, ensure_ascii=False, indent=2) + "\n", encoding="utf-8" + ) + return corpus_path, manifest_path + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description=( + "Build a trace-derived ContextPilot validation set from a local " + "Hermes state DB. Raw content is written ONLY to a local/gitignored " + "JSONL corpus; the manifest is privacy-safe." + ) + ) + parser.add_argument("--state-db", type=Path, default=DEFAULT_STATE_DB) + parser.add_argument("--out", type=Path, default=None, help="output directory") + parser.add_argument("--since-hours", type=int, default=DEFAULT_SINCE_HOURS) + parser.add_argument( + "--all-sessions", + action="store_true", + help="ignore --since-hours; scan all non-archived sessions", + ) + parser.add_argument( + "--limit", + type=int, + default=DEFAULT_CASE_LIMIT, + help=f"max number of cases to export (default {DEFAULT_CASE_LIMIT})", + ) + parser.add_argument( + "--min-input-tokens", + type=int, + default=DEFAULT_MIN_INPUT_TOKENS, + help="only include sessions with at least this many input tokens", + ) + parser.add_argument( + "--min-messages", + type=int, + default=DEFAULT_MIN_MESSAGES, + help="drop cases with fewer than this many LLM-bound messages", + ) + parser.add_argument( + "--include-system-prompt", + dest="include_system_prompt", + action="store_true", + default=True, + help="include the session system/skill prompt as the first message (default)", + ) + parser.add_argument( + "--no-system-prompt", + dest="include_system_prompt", + action="store_false", + help="exclude session system/skill prompts from the corpus", + ) + parser.add_argument("--salt", default=DEFAULT_SALT) + parser.add_argument("--date", default=dt.date.today().isoformat()) + args = parser.parse_args(argv) + + if not args.state_db.exists(): + raise 
SystemExit(f"Hermes state DB not found: {args.state_db}") + + out_dir = args.out if args.out is not None else DEFAULT_OUT_DIR + corpus_filename = f"validation_set_{args.date}.jsonl" + + # Cron-safe: never dump a traceback (which could echo the DB path or SQL); + # emit only the exception class name and a non-zero exit code. + try: + cases = load_trace_cases( + args.state_db, + since_hours=args.since_hours, + salt=args.salt, + limit=args.limit, + all_sessions=args.all_sessions, + min_input_tokens=args.min_input_tokens, + min_messages=args.min_messages, + include_system_prompt=args.include_system_prompt, + ) + manifest = build_manifest( + cases, + date=args.date, + salt=args.salt, + since_hours=args.since_hours, + all_sessions=args.all_sessions, + min_input_tokens=args.min_input_tokens, + include_system_prompt=args.include_system_prompt, + corpus_filename=corpus_filename, + ) + corpus_path, manifest_path = write_validation_set( + cases, manifest, out_dir, corpus_filename + ) + except Exception as exc: # noqa: BLE001 - cron-safe: class name only, no payload + print(json.dumps({"ok": False, "error": type(exc).__name__})) + return 1 + + # Stdout is privacy-safe: paths + counters only (raw content stays in the file). + print( + json.dumps( + { + "ok": True, + "corpus": str(corpus_path), + "manifest": str(manifest_path), + "case_count": manifest["case_count"], + "total_messages": manifest["total_messages"], + }, + ensure_ascii=False, + ) + ) + return 0 diff --git a/contextpilot/trace_validation/models.py b/contextpilot/trace_validation/models.py new file mode 100644 index 0000000..dcaf204 --- /dev/null +++ b/contextpilot/trace_validation/models.py @@ -0,0 +1,112 @@ +"""Data structures for the trace-derived validation-set framework. + +Two layers, with a deliberate privacy split: + +* :class:`TraceCase` / :class:`TraceMessage` are the *local* corpus carriers. 
+ They DO hold raw LLM-bound text (``content``) because the validation set must + be able to replay the exact payload ContextPilot would process. They are only + ever serialized into the gitignored local artifact produced by the builder -- + never into a committed fixture or a runner report. +* :class:`ValidationCaseResult` / :class:`ValidationReport` are the *report* + carriers. They are privacy-safe by construction: salted case ids, integer + counters, low-cardinality enums and pass/fail booleans only -- never raw + prompt/message/tool text. + +The runner's report is additionally passed through the analyzer's +``_assert_no_forbidden_keys`` guard and a raw-substring scan before it is +emitted, so a regression that accidentally threads content into a report fails +loudly instead of leaking. +""" +from __future__ import annotations + +from dataclasses import dataclass, field + +# Bumped when the on-disk JSONL case schema changes in a non-additive way. +VALIDATION_SET_SCHEMA_VERSION = 1 + +# Conservative sampling defaults: the builder is meant to capture a small, +# representative corpus, not exfiltrate a whole history. +DEFAULT_CASE_LIMIT = 25 +DEFAULT_SINCE_HOURS = 24 +DEFAULT_MIN_INPUT_TOKENS = 0 +DEFAULT_MIN_MESSAGES = 1 + +# The only block type the prompt-dedup canary is ever allowed to mutate. Every +# other block type is "protected" and must survive optimization byte-identical. +MUTABLE_BLOCK_TYPE = "skill_prompt" + + +@dataclass +class TraceMessage: + """One ordered LLM-bound message in a replayed case. + + ``content`` is RAW text and is local-only: it appears in the gitignored + corpus artifact, never in a committed fixture or a runner report. + """ + + role: str | None + block_type: str + content: str + + +@dataclass +class TraceCase: + """A single replayable case derived from one Hermes session. + + ``case_id`` is a salted hash of the session id (never the raw id). The + counters are privacy-safe; ``messages`` carries raw content and is local-only. 
+ """ + + case_id: str + source: str | None + input_tokens: int + message_count: int + messages: list[TraceMessage] + + +@dataclass +class ValidationCaseResult: + """Privacy-safe per-case outcome of a validation run. + + Salted id + counters + enums + invariant booleans only. ``chars_saved`` and + ``blocks_replaced`` are REALIZED processed-payload figures (actual before/ + after character delta), not opportunity counts. Token figures are populated + only when an exact tokenizer backend was configured. + """ + + case_id: str + source: str | None + message_count: int + skill_item_count: int + mutated: bool + blocks_replaced: int + chars_saved: int # REALIZED before-after char delta + invariants: dict[str, bool] + passed: bool + failed_invariants: list[str] = field(default_factory=list) + actual_tokens_before: int | None = None + actual_tokens_after: int | None = None + actual_tokens_saved: int | None = None + + +@dataclass +class ValidationReport: + """Privacy-safe summary of a whole validation run (gate + accounting).""" + + schema_version: int + generated_date: str + salt_fingerprint: str + baseline_mode: str + candidate_mode: str + case_count: int + passed: bool # overall gate + passed_cases: int + failed_cases: int + total_blocks_replaced: int + total_chars_saved: int # REALIZED before-after char delta + tokenizer_status: str # "available" | "unavailable" + tokenizer_backend: str | None + total_actual_tokens_saved: int | None + invariant_names: list[str] + cases: list[ValidationCaseResult] + notes: list[str] = field(default_factory=list) diff --git a/contextpilot/trace_validation/runner.py b/contextpilot/trace_validation/runner.py new file mode 100644 index 0000000..20c8582 --- /dev/null +++ b/contextpilot/trace_validation/runner.py @@ -0,0 +1,427 @@ +"""Trace validation runner: gate accuracy-preservation against fixed cases. 
+ +Loads the local JSONL corpus produced by :mod:`.builder` and, for every case, +runs ContextPilot's optimization in two controlled modes: + +* a **baseline** (``off``) pass, which must leave the payload byte-identical, and +* a **candidate** pass, whose mode is taken from the configured environment + (``CONTEXTPILOT_PROMPT_DEDUP_MODE``) or overridden on the command line. + +It then checks accuracy-preservation invariants between the two payloads -- +message count, order, and roles preserved; protected (non-skill) content +byte-identical; mutation confined to the allowed scope and never growing the +payload; and realized-savings accounting consistent. The realized ``chars_saved`` +is the ACTUAL processed-payload before/after character delta, never an +opportunity count. Exact-token figures appear only when a tokenizer backend is +configured; otherwise the status is ``unavailable`` and no token fields are set. + +The emitted report is privacy-safe: salted ids, counters, enums, and pass/fail +only. It is passed through the analyzer's forbidden-key guard and a raw-content +substring scan before it is printed, so a regression cannot leak raw text. The +process exits non-zero on any gate failure. 
+""" +from __future__ import annotations + +import argparse +import datetime as dt +import json +from pathlib import Path +from typing import Callable + +from contextpilot.hermes_opportunities.models import DEFAULT_MIN_BLOCK_CHARS, _LLMContent +from contextpilot.hermes_opportunities.privacy import ( + _assert_no_forbidden_keys, + _salt_fingerprint, +) +from contextpilot.hermes_opportunities.prompt_dedup_canary import ( + PROMPT_DEDUP_CANARY_REFERENCE_TEMPLATE, + PromptDedupCanaryResult, + apply_prompt_dedup_canary, + resolve_prompt_dedup_mode, +) +from contextpilot.hermes_opportunities.tokenizer import resolve_tokenizer + +from .builder import DEFAULT_SALT +from .models import ( + MUTABLE_BLOCK_TYPE, + VALIDATION_SET_SCHEMA_VERSION, + ValidationCaseResult, + ValidationReport, +) + +# Stable invariant identifiers, also used as the report's gate vocabulary. +INVARIANT_NAMES = [ + "message_count_preserved", + "order_and_roles_preserved", + "protected_content_preserved", + "mutation_scope_allowed", + "savings_accounting_consistent", +] + +# The fixed prefix/needle of the canary's reference string. A mutated skill line +# must equal a string of this shape -- anything else means the optimizer emitted +# unexpected (possibly raw) text, which is a gate failure. +_REF_PREFIX = PROMPT_DEDUP_CANARY_REFERENCE_TEMPLATE.split("", 1)[0] # "[...ref=" +_REF_NEEDLE = f"ref={MUTABLE_BLOCK_TYPE}:" + + +def load_cases(corpus_path: Path) -> list[dict]: + """Load the JSONL corpus into a list of case dicts (raw content in-memory). + + Tolerates blank lines; raises on malformed JSON so a corrupt corpus fails + loudly rather than silently validating a partial set. 
+ """ + cases: list[dict] = [] + with corpus_path.open("r", encoding="utf-8") as f: + for raw in f: + line = raw.strip() + if not line: + continue + cases.append(json.loads(line)) + return cases + + +def _messages(case: dict) -> list[dict]: + return [ + { + "role": m.get("role"), + "block_type": m.get("block_type", "unknown"), + "content": m.get("content", ""), + } + for m in case.get("messages", []) + ] + + +def optimize_case( + messages: list[dict], *, mode: str, salt: str, min_block_chars: int +) -> tuple[list[dict], PromptDedupCanaryResult]: + """Run the prompt-dedup canary over a case's messages in the given mode. + + Returns ``(out_messages, result)``. The canary mutates only ``skill_prompt`` + content in place; ``out_messages`` mirrors the input role/block_type/order + with the (possibly) rewritten content so the caller can diff payloads. + """ + contents = [_LLMContent(m["block_type"], m["content"]) for m in messages] + result = apply_prompt_dedup_canary( + contents, salt=salt, min_block_chars=min_block_chars, mode=mode + ) + out = [ + {"role": m["role"], "block_type": m["block_type"], "content": c.content} + for m, c in zip(messages, contents) + ] + return out, result + + +def _is_reference_line(line: str) -> bool: + """True if a line is a canary reference placeholder (no raw content).""" + return line.startswith(_REF_PREFIX) and _REF_NEEDLE in line and line.endswith("]") + + +def _mutation_scope_ok(base: dict, cand: dict) -> bool: + """A single message changed only within the allowed (skill-only) scope.""" + if base["content"] == cand["content"]: + return True + # Only skill_prompt content may ever change. + if base["block_type"] != MUTABLE_BLOCK_TYPE: + return False + # Never grow the payload. + if len(cand["content"]) > len(base["content"]): + return False + base_lines = base["content"].split("\n") + cand_lines = cand["content"].split("\n") + # The canary replaces lines 1:1; a differing line count is out of scope. 
+ if len(base_lines) != len(cand_lines): + return False + for b, c in zip(base_lines, cand_lines): + if b == c: + continue + # A changed line must be a reference placeholder strictly shorter than + # what it replaced -- never new free text and never a growth. + if not (_is_reference_line(c) and len(c) < len(b)): + return False + return True + + +def check_invariants( + baseline: list[dict], candidate: list[dict], result: PromptDedupCanaryResult +) -> tuple[dict[str, bool], int]: + """Check accuracy-preservation invariants between two payloads. + + Returns ``(invariant -> passed, realized_chars_saved)`` where + ``realized_chars_saved`` is the ACTUAL summed before/after character delta of + the processed payload (not an opportunity count). + """ + inv: dict[str, bool] = {} + + inv["message_count_preserved"] = len(baseline) == len(candidate) + + if inv["message_count_preserved"]: + inv["order_and_roles_preserved"] = all( + b["role"] == c["role"] and b["block_type"] == c["block_type"] + for b, c in zip(baseline, candidate) + ) + inv["protected_content_preserved"] = all( + b["content"] == c["content"] + for b, c in zip(baseline, candidate) + if b["block_type"] != MUTABLE_BLOCK_TYPE + ) + inv["mutation_scope_allowed"] = all( + _mutation_scope_ok(b, c) for b, c in zip(baseline, candidate) + ) + realized = sum( + len(b["content"]) - len(c["content"]) + for b, c in zip(baseline, candidate) + ) + else: + # Count mismatch makes positional comparison meaningless; fail the rest. + inv["order_and_roles_preserved"] = False + inv["protected_content_preserved"] = False + inv["mutation_scope_allowed"] = False + realized = 0 + + # Realized savings must equal the optimizer's own realized figure, must be + # non-negative, and a non-zero saving must coincide with a reported mutation. 
+ inv["savings_accounting_consistent"] = ( + realized >= 0 + and realized == result.chars_saved + and (realized > 0) == bool(result.mutated) + and (result.blocks_replaced > 0) == bool(result.mutated) + ) + return inv, realized + + +def _raw_content_strings(cases: list[dict], *, min_len: int = 12) -> list[str]: + """Collect non-trivial raw content lines for the privacy substring scan.""" + out: list[str] = [] + for case in cases: + for m in case.get("messages", []): + text = (m.get("content") or "") + for line in text.split("\n"): + line = line.strip() + if len(line) >= min_len: + out.append(line) + return out + + +def assert_report_privacy_safe(report_dict: dict, raw_texts: list[str]) -> None: + """Guard the report before emission: no forbidden keys, no raw content.""" + _assert_no_forbidden_keys(report_dict) + blob = json.dumps(report_dict, ensure_ascii=False) + for text in raw_texts: + if text and text in blob: + raise RuntimeError("refusing to emit report containing raw case content") + + +def run_validation( + cases: list[dict], + *, + baseline_mode: str = "off", + candidate_mode: str, + salt: str, + min_block_chars: int = DEFAULT_MIN_BLOCK_CHARS, + date: str, + tokenizer_spec: object | None = None, + optimize_fn: Callable[..., tuple[list[dict], PromptDedupCanaryResult]] | None = None, +) -> ValidationReport: + """Validate every case under baseline vs candidate and build the gate report.""" + # Resolve at call time (not as a default arg) so the module-level + # ``optimize_case`` stays monkeypatchable from tests and callers. 
+ optimize_fn = optimize_fn or optimize_case + tokenizer = resolve_tokenizer(tokenizer_spec) + tok_status = "available" if tokenizer is not None else "unavailable" + + case_results: list[ValidationCaseResult] = [] + total_blocks = 0 + total_chars = 0 + total_actual_saved = 0 if tokenizer is not None else None + + for case in cases: + msgs = _messages(case) + baseline_msgs, _ = optimize_fn( + list(msgs), mode=baseline_mode, salt=salt, min_block_chars=min_block_chars + ) + candidate_msgs, result = optimize_fn( + list(msgs), mode=candidate_mode, salt=salt, min_block_chars=min_block_chars + ) + + inv, realized = check_invariants(baseline_msgs, candidate_msgs, result) + failed = [name for name, ok in inv.items() if not ok] + + at_before = at_after = at_saved = None + if tokenizer is not None: + at_before = sum(tokenizer.count(m["content"]) for m in baseline_msgs) + at_after = sum(tokenizer.count(m["content"]) for m in candidate_msgs) + at_saved = at_before - at_after + total_actual_saved += at_saved + + skill_items = sum(1 for m in msgs if m["block_type"] == MUTABLE_BLOCK_TYPE) + total_blocks += result.blocks_replaced if result.mutated else 0 + total_chars += realized + + case_results.append( + ValidationCaseResult( + case_id=str(case.get("case_id", "")), + source=case.get("source"), + message_count=len(msgs), + skill_item_count=skill_items, + mutated=bool(result.mutated), + blocks_replaced=result.blocks_replaced if result.mutated else 0, + chars_saved=realized, + invariants=inv, + passed=not failed, + failed_invariants=failed, + actual_tokens_before=at_before, + actual_tokens_after=at_after, + actual_tokens_saved=at_saved, + ) + ) + + passed_cases = sum(1 for c in case_results if c.passed) + failed_cases = len(case_results) - passed_cases + notes = [ + "baseline runs the optimizer in 'off' mode and must leave the payload " + "byte-identical; the candidate mode is the change under test", + "chars_saved is the REALIZED processed-payload before/after char delta, " + "not 
an opportunity count", + ] + if tokenizer is None: + notes.append( + "actual-token savings unavailable (no exact tokenizer backend configured); " + "no actual-token fields are reported" + ) + + return ValidationReport( + schema_version=VALIDATION_SET_SCHEMA_VERSION, + generated_date=date, + salt_fingerprint=_salt_fingerprint(salt), + baseline_mode=baseline_mode, + candidate_mode=candidate_mode, + case_count=len(case_results), + passed=failed_cases == 0, + passed_cases=passed_cases, + failed_cases=failed_cases, + total_blocks_replaced=total_blocks, + total_chars_saved=total_chars, + tokenizer_status=tok_status, + tokenizer_backend=tokenizer.name if tokenizer is not None else None, + total_actual_tokens_saved=total_actual_saved, + invariant_names=list(INVARIANT_NAMES), + cases=case_results, + notes=notes, + ) + + +def report_to_dict(report: ValidationReport) -> dict: + from dataclasses import asdict + + return asdict(report) + + +def render_markdown(report: ValidationReport) -> str: + gate = "PASS ✅" if report.passed else "FAIL ❌" + lines = [ + f"# ContextPilot trace validation — {report.generated_date}", + "", + f"Gate: **{gate}**", + f"Salt fingerprint: `{report.salt_fingerprint}`", + f"Baseline mode: `{report.baseline_mode}` | Candidate mode: `{report.candidate_mode}`", + "", + "## Summary", + f"- Cases: {report.case_count} ({report.passed_cases} passed, " + f"{report.failed_cases} failed)", + f"- Blocks replaced (realized): {report.total_blocks_replaced}", + f"- Chars saved (realized before/after delta): {report.total_chars_saved}", + ] + if report.tokenizer_status == "available": + lines.append( + f"- Actual tokens saved ({report.tokenizer_backend}): " + f"{report.total_actual_tokens_saved}" + ) + else: + lines.append("- Actual tokens saved: unavailable (no tokenizer backend)") + lines.append("") + lines.append("## Invariants checked") + for name in report.invariant_names: + lines.append(f"- {name}") + if report.failed_cases: + lines.append("") + 
lines.append("## Failures") + for c in report.cases: + if not c.passed: + lines.append( + f"- `{c.case_id}` (source={c.source}): " + f"{', '.join(c.failed_invariants)}" + ) + return "\n".join(lines) + "\n" + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description=( + "Run ContextPilot trace validation: check accuracy-preservation " + "invariants of a candidate optimization against a fixed local corpus. " + "Exits non-zero on any gate failure." + ) + ) + parser.add_argument("corpus", type=Path, help="path to the JSONL validation corpus") + parser.add_argument( + "--candidate-mode", + default=None, + help=( + "prompt-dedup mode to validate (off|shadow|canary). Defaults to the " + "resolved CONTEXTPILOT_PROMPT_DEDUP_MODE environment value." + ), + ) + parser.add_argument( + "--baseline-mode", + default="off", + help="reference mode the candidate is compared against (default: off)", + ) + parser.add_argument("--salt", default=DEFAULT_SALT) + parser.add_argument( + "--min-block-chars", type=int, default=DEFAULT_MIN_BLOCK_CHARS + ) + parser.add_argument( + "--tokenizer", + default=None, + help=( + "opt-in exact tokenizer backend for actual-token accounting, e.g. 
" + "'tiktoken:cl100k_base' (off by default -> tokens reported unavailable)" + ), + ) + parser.add_argument( + "--format", choices=["json", "markdown"], default="json" + ) + parser.add_argument("--date", default=dt.date.today().isoformat()) + args = parser.parse_args(argv) + + if not args.corpus.exists(): + raise SystemExit(f"validation corpus not found: {args.corpus}") + + candidate_mode = ( + args.candidate_mode + if args.candidate_mode is not None + else resolve_prompt_dedup_mode() + ) + + cases = load_cases(args.corpus) + report = run_validation( + cases, + baseline_mode=args.baseline_mode, + candidate_mode=candidate_mode, + salt=args.salt, + min_block_chars=args.min_block_chars, + date=args.date, + tokenizer_spec=args.tokenizer, + ) + + report_dict = report_to_dict(report) + # Hard privacy gate: never emit a report carrying forbidden keys or raw text. + assert_report_privacy_safe(report_dict, _raw_content_strings(cases)) + + if args.format == "markdown": + print(render_markdown(report)) + else: + print(json.dumps(report_dict, ensure_ascii=False, indent=2)) + + return 0 if report.passed else 1 diff --git a/docs/guides/trace-validation.md b/docs/guides/trace-validation.md new file mode 100644 index 0000000..bc76c57 --- /dev/null +++ b/docs/guides/trace-validation.md @@ -0,0 +1,110 @@ +# Trace-derived validation sets for ContextPilot + +ContextPilot changes can reduce token usage only if they preserve task accuracy. +For any future change that mutates the LLM-bound payload, do **not** rely on a +single live run. First build or reuse a fixed validation set derived from local +Hermes traces, then run the validation gate against the candidate mode. + +## Privacy model + +- The builder reads the local Hermes SQLite state DB in read-only mode. +- The generated JSONL corpus contains raw replay content and is **local-only**. + Do not commit it, upload it, or paste it into reviews. +- The default output directory is outside the repo: + `~/contextpilot/validation_sets/`. 
+- The in-repo convenience directory `.contextpilot_validation/` is gitignored. +- Reports and manifests are metadata-only: salted case ids, counters, enums and + pass/fail flags. They must not contain raw conversation, tool output, system + prompt, reasoning, API keys, or session ids. + +## Build a validation set + +Conservative last-24h sample: + +```bash +python scripts/build_trace_validation_set.py +``` + +Heavier all-history sample for accuracy-sensitive changes: + +```bash +python scripts/build_trace_validation_set.py \ + --all-sessions \ + --min-input-tokens 20000 \ + --limit 50 \ + --out ~/contextpilot/validation_sets +``` + +Exclude system/skill prompts if the change does not touch prompt handling: + +```bash +python scripts/build_trace_validation_set.py --no-system-prompt +``` + +The command prints a privacy-safe JSON object with the corpus and manifest paths. +Only the corpus file contains raw replay content. + +## Run the validation gate + +Validate the current prompt-dedup canary candidate: + +```bash +python scripts/run_trace_validation.py \ + ~/contextpilot/validation_sets/validation_set_YYYY-MM-DD.jsonl \ + --candidate-mode canary \ + --format markdown +``` + +Use the environment-configured mode instead: + +```bash +CONTEXTPILOT_PROMPT_DEDUP_MODE=canary \ +python scripts/run_trace_validation.py \ + ~/contextpilot/validation_sets/validation_set_YYYY-MM-DD.jsonl +``` + +Optional exact-token accounting, only when an exact tokenizer backend is +available: + +```bash +python scripts/run_trace_validation.py \ + ~/contextpilot/validation_sets/validation_set_YYYY-MM-DD.jsonl \ + --candidate-mode canary \ + --tokenizer tiktoken:cl100k_base +``` + +If no tokenizer is configured, the report says actual-token savings are +`unavailable`. It does not substitute chars/4 as actual tokens. 
+ +## Gate semantics + +The runner compares a baseline `off` pass with the candidate pass and exits +non-zero on any failed invariant: + +- message count preserved; +- message order and roles preserved; +- protected user/assistant/tool/system content preserved; +- mutation confined to the explicitly allowed scope; +- realized savings accounting matches the actual processed-payload before/after + character delta. + +For the current canary, the only allowed mutation scope is +`same_type_skill_prompt_only`: later exact duplicate `skill_prompt` lines may be +replaced with a deterministic ContextPilot reference if and only if the reference +is shorter and the line is not safety-denylisted. + +## When this is required + +Run this gate before merging or enabling any change that can affect accuracy, +including: + +- prompt/system/skill dedup or replacement; +- context routing, filtering, summarization or dropping; +- parent/child artifact aggregation rewrites; +- changes to runtime optimization order or telemetry accounting that affect the + LLM-bound payload. + +Passing this gate is necessary but not always sufficient. High-risk changes such +as system prompt replacement or context dropping still need shadow telemetry, +offline A/B evidence, golden evals and default-off canary rollout before default +enablement. diff --git a/scripts/build_trace_validation_set.py b/scripts/build_trace_validation_set.py new file mode 100644 index 0000000..bb8fb97 --- /dev/null +++ b/scripts/build_trace_validation_set.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +"""Build a trace-derived ContextPilot validation set from a local Hermes DB. + +Thin wrapper around :mod:`contextpilot.trace_validation.builder`. Reads the +Hermes state DB read-only and writes a FIXED JSONL corpus (raw content, +local-only / gitignored) plus a privacy-safe manifest sidecar. 
+ +Examples:: + + # last 24h, conservative sampling, default local output dir + python scripts/build_trace_validation_set.py + + # heavier sessions only, all history, to a chosen private dir + python scripts/build_trace_validation_set.py --all-sessions \\ + --min-input-tokens 20000 --limit 50 --out .contextpilot_validation + + # exclude system/skill prompts from the corpus + python scripts/build_trace_validation_set.py --no-system-prompt +""" +from __future__ import annotations + +import sys +from pathlib import Path + +_REPO_ROOT = Path(__file__).resolve().parents[1] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + +from contextpilot.trace_validation.builder import main # noqa: E402 + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/run_trace_validation.py b/scripts/run_trace_validation.py new file mode 100644 index 0000000..910d39d --- /dev/null +++ b/scripts/run_trace_validation.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +"""Run ContextPilot trace validation against a fixed local corpus. + +Thin wrapper around :mod:`contextpilot.trace_validation.runner`. Replays the +JSONL corpus through ContextPilot's optimization in a baseline (``off``) mode vs +a configured candidate mode, checks accuracy-preservation invariants, prints a +privacy-safe JSON/Markdown summary, and exits non-zero on any gate failure. 
+ +Examples:: + + # validate the canary candidate against a corpus, JSON gate report + python scripts/run_trace_validation.py \\ + ~/contextpilot/validation_sets/validation_set_2026-06-14.jsonl \\ + --candidate-mode canary + + # use the resolved CONTEXTPILOT_PROMPT_DEDUP_MODE env, Markdown output + CONTEXTPILOT_PROMPT_DEDUP_MODE=canary \\ + python scripts/run_trace_validation.py path/to/corpus.jsonl --format markdown + + # with exact-token accounting + python scripts/run_trace_validation.py path/to/corpus.jsonl \\ + --candidate-mode canary --tokenizer tiktoken:cl100k_base +""" +from __future__ import annotations + +import sys +from pathlib import Path + +_REPO_ROOT = Path(__file__).resolve().parents[1] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + +from contextpilot.trace_validation.runner import main # noqa: E402 + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/fixtures/trace_validation/synthetic_cases.jsonl b/tests/fixtures/trace_validation/synthetic_cases.jsonl new file mode 100644 index 0000000..0cb10a0 --- /dev/null +++ b/tests/fixtures/trace_validation/synthetic_cases.jsonl @@ -0,0 +1,3 @@ +{"schema_version": 1, "case_id": "synth00000000aaaa", "source": "synthetic", "input_tokens": 4096, "message_count": 4, "messages": [{"role": "system", "block_type": "skill_prompt", "content": "Synthetic reusable skill paragraph that explains how the demo helper reformats sample markdown tables into neat aligned columns for readers.\nSynthetic reusable skill paragraph that explains how the demo helper reformats sample markdown tables into neat aligned columns for readers.\nSynthetic reusable skill paragraph that explains how the demo helper reformats sample markdown tables into neat aligned columns for readers."}, {"role": "user", "block_type": "user_prompt", "content": "Please summarize the synthetic quarterly figures attached above for the demo."}, {"role": "assistant", "block_type": "assistant_context", "content": "Here is the synthetic summary of the 
demo quarterly figures as requested."}, {"role": "tool", "block_type": "tool_result", "content": "{\"synthetic_metric\": 42, \"label\": \"demo only, not real data\"}"}]} +{"schema_version": 1, "case_id": "synth00000000bbbb", "source": "synthetic", "input_tokens": 2048, "message_count": 4, "messages": [{"role": "system", "block_type": "system_prompt", "content": "Synthetic system narration paragraph describing the demo assistant persona and the general tone it adopts."}, {"role": "user", "block_type": "user_prompt", "content": "What is the synthetic weather in the demo city today?"}, {"role": "assistant", "block_type": "assistant_context", "content": "The synthetic demo weather report indicates clear skies for the example city."}, {"role": "tool", "block_type": "tool_result", "content": "synthetic_weather=clear; demo_temperature=21C; note=fabricated_for_tests"}]} +{"schema_version": 1, "case_id": "synth00000000cccc", "source": "synthetic", "input_tokens": 1024, "message_count": 2, "messages": [{"role": "system", "block_type": "skill_prompt", "content": "You must double check the synthetic example output before sharing it with the demo readers in this flow.\nYou must double check the synthetic example output before sharing it with the demo readers in this flow.\nYou must double check the synthetic example output before sharing it with the demo readers in this flow."}, {"role": "user", "block_type": "user_prompt", "content": "Run the synthetic demo check now please."}]} diff --git a/tests/test_trace_validation_builder.py b/tests/test_trace_validation_builder.py new file mode 100644 index 0000000..05e3f43 --- /dev/null +++ b/tests/test_trace_validation_builder.py @@ -0,0 +1,220 @@ +"""Tests for the trace-validation-set builder: redaction, privacy, sampling. + +The builder reads a Hermes-shaped SQLite DB read-only and exports a fixed JSONL +corpus (raw content, local-only) plus a privacy-safe manifest. 
These tests pin: +case ids are salted (never the raw session id); the manifest is metadata-only +(no raw content) and passes the forbidden-key guard; conservative sampling +honours --limit / --min-input-tokens / --min-messages; --no-system-prompt drops +system/skill prompts; and the corpus is the only place raw content appears. +""" +import json +import sqlite3 +from pathlib import Path + +from contextpilot.hermes_opportunities.privacy import _salt_fingerprint, _salted_hash +from contextpilot.trace_validation.builder import ( + build_manifest, + load_trace_cases, + main as build_main, + write_validation_set, +) + +SALT = "test-trace-salt" + +SKILL_LINE = ( + "Synthetic reusable skill paragraph that explains how the demo helper " + "reformats sample markdown tables into neat aligned columns for readers." +) +USER_LINE = "Please summarize the synthetic figures for the demo run." +TOOL_LINE = '{"synthetic_metric": 42, "label": "demo only"}' +SYS_LINE = "Synthetic system narration describing the demo persona and tone." + + +def _make_db(path: Path) -> None: + """Create a minimal Hermes-shaped DB with two sessions.""" + conn = sqlite3.connect(path) + conn.execute( + "CREATE TABLE sessions (id TEXT, source TEXT, input_tokens INTEGER, " + "system_prompt TEXT, started_at REAL, archived INTEGER, message_count INTEGER)" + ) + conn.execute( + "CREATE TABLE messages (id INTEGER PRIMARY KEY, session_id TEXT, role TEXT, " + "content TEXT, tool_name TEXT, timestamp REAL, active INTEGER)" + ) + now = 1_900_000_000.0 + conn.execute( + "INSERT INTO sessions VALUES (?,?,?,?,?,?,?)", + ("sess-heavy", "synthetic", 9000, SYS_LINE, now, 0, 3), + ) + conn.execute( + "INSERT INTO sessions VALUES (?,?,?,?,?,?,?)", + ("sess-light", "synthetic", 500, None, now, 0, 2), + ) + # Heavy session messages (ordered by id). 
+ conn.execute( + "INSERT INTO messages VALUES (?,?,?,?,?,?,?)", + (1, "sess-heavy", "user", USER_LINE, None, now, 1), + ) + conn.execute( + "INSERT INTO messages VALUES (?,?,?,?,?,?,?)", + (2, "sess-heavy", "tool", TOOL_LINE, "calc", now, 1), + ) + # An inactive message that must be skipped. + conn.execute( + "INSERT INTO messages VALUES (?,?,?,?,?,?,?)", + (3, "sess-heavy", "user", "INACTIVE should not appear", None, now, 0), + ) + conn.execute( + "INSERT INTO messages VALUES (?,?,?,?,?,?,?)", + (4, "sess-light", "user", "light user message for the demo", None, now, 1), + ) + conn.commit() + conn.close() + + +def test_case_ids_are_salted_not_raw(tmp_path): + db = tmp_path / "state.db" + _make_db(db) + cases = load_trace_cases(db, since_hours=24, salt=SALT, limit=10, all_sessions=True) + ids = {c.case_id for c in cases} + assert "sess-heavy" not in ids and "sess-light" not in ids + assert _salted_hash("sess-heavy", SALT) in ids + + +def test_heavy_session_ordered_first_and_inactive_skipped(tmp_path): + db = tmp_path / "state.db" + _make_db(db) + cases = load_trace_cases(db, since_hours=24, salt=SALT, limit=10, all_sessions=True) + # Ordered by input_tokens desc -> heavy first. + assert cases[0].input_tokens == 9000 + heavy = cases[0] + # system(skill/plain) + user + tool, inactive message dropped. 
+ contents = [m.content for m in heavy.messages] + assert SYS_LINE in contents + assert USER_LINE in contents + assert "INACTIVE should not appear" not in contents + + +def test_min_input_tokens_filters_light_session(tmp_path): + db = tmp_path / "state.db" + _make_db(db) + cases = load_trace_cases( + db, since_hours=24, salt=SALT, limit=10, all_sessions=True, min_input_tokens=1000 + ) + assert len(cases) == 1 + assert cases[0].input_tokens == 9000 + + +def test_limit_caps_number_of_cases(tmp_path): + db = tmp_path / "state.db" + _make_db(db) + cases = load_trace_cases(db, since_hours=24, salt=SALT, limit=1, all_sessions=True) + assert len(cases) == 1 + + +def test_no_system_prompt_excludes_system(tmp_path): + db = tmp_path / "state.db" + _make_db(db) + cases = load_trace_cases( + db, + since_hours=24, + salt=SALT, + limit=10, + all_sessions=True, + include_system_prompt=False, + ) + heavy = next(c for c in cases if c.input_tokens == 9000) + assert all(m.role != "system" for m in heavy.messages) + assert SYS_LINE not in [m.content for m in heavy.messages] + + +def test_manifest_is_privacy_safe_no_raw_content(tmp_path): + db = tmp_path / "state.db" + _make_db(db) + cases = load_trace_cases(db, since_hours=24, salt=SALT, limit=10, all_sessions=True) + manifest = build_manifest( + cases, + date="2026-06-14", + salt=SALT, + since_hours=24, + all_sessions=True, + min_input_tokens=0, + include_system_prompt=True, + corpus_filename="corpus.jsonl", + ) + blob = json.dumps(manifest) + # No raw content anywhere in the manifest. + for needle in (SKILL_LINE, USER_LINE, TOOL_LINE, SYS_LINE): + assert needle not in blob + # Carries a salt fingerprint, never the raw salt. 
+ assert manifest["salt_fingerprint"] == _salt_fingerprint(SALT) + assert SALT not in blob + assert manifest["case_count"] == len(cases) + assert manifest["messages_by_block_type"] # counters present + + +def test_write_validation_set_corpus_has_raw_manifest_does_not(tmp_path): + db = tmp_path / "state.db" + _make_db(db) + cases = load_trace_cases(db, since_hours=24, salt=SALT, limit=10, all_sessions=True) + manifest = build_manifest( + cases, + date="2026-06-14", + salt=SALT, + since_hours=24, + all_sessions=True, + min_input_tokens=0, + include_system_prompt=True, + corpus_filename="corpus.jsonl", + ) + corpus_path, manifest_path = write_validation_set( + cases, manifest, tmp_path / "out", "corpus.jsonl" + ) + corpus_text = corpus_path.read_text() + manifest_text = manifest_path.read_text() + # Raw content lives ONLY in the corpus, never the manifest. + assert USER_LINE in corpus_text + assert USER_LINE not in manifest_text + # Each corpus line is a well-formed case object. + for line in corpus_text.splitlines(): + obj = json.loads(line) + assert obj["schema_version"] == 1 + assert "case_id" in obj and "messages" in obj + + +def test_main_emits_privacy_safe_stdout(tmp_path, capsys): + db = tmp_path / "state.db" + _make_db(db) + out = tmp_path / "out" + rc = build_main( + [ + "--state-db", + str(db), + "--out", + str(out), + "--all-sessions", + "--salt", + SALT, + "--date", + "2026-06-14", + ] + ) + assert rc == 0 + printed = capsys.readouterr().out + payload = json.loads(printed) + assert payload["ok"] is True + assert payload["case_count"] == 2 + # Stdout carries paths + counters only, never raw content. 
+ for needle in (SKILL_LINE, USER_LINE, TOOL_LINE, SYS_LINE): + assert needle not in printed + assert Path(payload["corpus"]).exists() + assert Path(payload["manifest"]).exists() + + +def test_main_missing_db_exits(tmp_path): + try: + build_main(["--state-db", str(tmp_path / "nope.db")]) + except SystemExit as exc: + assert exc.code != 0 + else: # pragma: no cover - should always raise + raise AssertionError("expected SystemExit for missing DB") diff --git a/tests/test_trace_validation_runner.py b/tests/test_trace_validation_runner.py new file mode 100644 index 0000000..4a26005 --- /dev/null +++ b/tests/test_trace_validation_runner.py @@ -0,0 +1,143 @@ +"""Tests for the trace-validation runner gate. + +The committed fixture is fully synthetic. The runner may read raw case content +from the local JSONL corpus, but its emitted report must remain privacy-safe and +must fail when mutations touch protected content or savings accounting lies. +""" + +import json +from pathlib import Path + +import pytest + +from contextpilot.hermes_opportunities.prompt_dedup_canary import PromptDedupCanaryResult +from contextpilot.trace_validation.runner import ( + assert_report_privacy_safe, + load_cases, + render_markdown, + report_to_dict, + run_validation, +) + +FIXTURE = Path("tests/fixtures/trace_validation/synthetic_cases.jsonl") +SALT = "test-trace-salt" + + +def test_canary_validation_passes_on_synthetic_fixture(): + cases = load_cases(FIXTURE) + report = run_validation( + cases, + baseline_mode="off", + candidate_mode="canary", + salt=SALT, + min_block_chars=40, + date="2026-06-14", + ) + assert report.passed is True + assert report.failed_cases == 0 + # The first synthetic skill case has safe duplicate lines; the third is + # denylisted by "must" and should remain unchanged. 
+ assert report.total_blocks_replaced == 2 + assert report.total_chars_saved > 0 + assert any(c.mutated for c in report.cases) + assert report.tokenizer_status == "unavailable" + assert report.total_actual_tokens_saved is None + + +def test_shadow_validation_passes_without_realized_savings(): + cases = load_cases(FIXTURE) + report = run_validation( + cases, + baseline_mode="off", + candidate_mode="shadow", + salt=SALT, + min_block_chars=40, + date="2026-06-14", + ) + assert report.passed is True + assert report.total_blocks_replaced == 0 + assert report.total_chars_saved == 0 + assert all(not c.mutated for c in report.cases) + + +def test_report_is_privacy_safe_and_markdown_contains_no_raw_fixture_text(): + cases = load_cases(FIXTURE) + report = run_validation( + cases, + baseline_mode="off", + candidate_mode="canary", + salt=SALT, + min_block_chars=40, + date="2026-06-14", + ) + report_dict = report_to_dict(report) + raw_needles = [ + "Please summarize the synthetic quarterly figures attached above for the demo.", + "Synthetic reusable skill paragraph that explains how the demo helper", + "synthetic_weather=clear", + ] + assert_report_privacy_safe(report_dict, raw_needles) + blob = json.dumps(report_dict, ensure_ascii=False) + md = render_markdown(report) + for needle in raw_needles: + assert needle not in blob + assert needle not in md + + +def test_runner_fails_when_candidate_mutates_protected_user_content(): + cases = load_cases(FIXTURE) + + def bad_optimizer(messages, *, mode, salt, min_block_chars): + out = [dict(m) for m in messages] + if mode == "bad" and out: + for m in out: + if m["block_type"] == "user_prompt": + m["content"] = "[dropped]" + break + result = PromptDedupCanaryResult( + mode="canary", + prompt_dedup_class="same_type_skill_prompt_only", + mutated=True, + item_count=0, + skill_item_count=0, + candidate_block_count=0, + candidate_chars=0, + blocks_replaced=1, + chars_saved=1, + denylisted_block_count=0, + ) + return out, result + result = 
PromptDedupCanaryResult( + mode="off", + prompt_dedup_class="same_type_skill_prompt_only", + mutated=False, + item_count=0, + skill_item_count=0, + candidate_block_count=0, + candidate_chars=0, + blocks_replaced=0, + chars_saved=0, + denylisted_block_count=0, + ) + return out, result + + report = run_validation( + cases[:1], + baseline_mode="off", + candidate_mode="bad", + salt=SALT, + min_block_chars=40, + date="2026-06-14", + optimize_fn=bad_optimizer, + ) + assert report.passed is False + assert report.failed_cases == 1 + failed = report.cases[0].failed_invariants + assert "protected_content_preserved" in failed + assert "mutation_scope_allowed" in failed + assert "savings_accounting_consistent" in failed + + +def test_privacy_guard_rejects_raw_content_in_report(): + with pytest.raises(RuntimeError): + assert_report_privacy_safe({"ok": True, "note": "raw secret line"}, ["raw secret line"])