diff --git a/SIMPLICIO_INTEGRATION.md b/SIMPLICIO_INTEGRATION.md index 12fa077..7422beb 100644 --- a/SIMPLICIO_INTEGRATION.md +++ b/SIMPLICIO_INTEGRATION.md @@ -350,6 +350,102 @@ text languages (`sample.ts`, `sample.py`, `sample.json`, `sample.md`) plus a binary file (`binary.bin`) used by the refusal test. Producers in downstream repositories can copy these as ready-made parity inputs. +## Context Packs and Hash-Based Cache (issue #115) + +Schemas: `simplicio.context-pack/v1` and `simplicio.context-cache/v1`. +The canonical contracts live in +[`simplicio-runtime#70`](https://github.com/wesleysimplicio/simplicio-runtime/issues/70); +this repository implements the producer half so the mapper, not the LLM, +decides what compact context goes into a prompt. + +### Context pack + +```python +from simplicio_mapper.context_pack import build_context_pack + +pack = build_context_pack( + root=".", + targets=[ + {"path": "simplicio_mapper/cli.py", "ranges": [(900, 950)]}, + {"path": "simplicio_mapper/mapper.py", "ranges": [(160, 200)]}, + ], +) +``` + +The returned envelope: + +```json +{ + "schema": "simplicio.context-pack/v1", + "repo": { + "mapper_schema": "simplicio.mapper-index/v1", + "root_hash": "" + }, + "pack_hash": "", + "files": [ + { + "path": "...", + "language": "python", + "snapshot_hash": "", + "line_count": 1234, + "compact": false, + "ranges": [ + { + "start_line": 900, + "end_line": 950, + "range_hash": "", + "snippet": ["first line", "last line"] + } + ], + "symbols": [{"name": "...", "kind": "...", "line": 0, "hash": "..."}], + "callers": ["a/file.py"], + "imports": ["b/file.py"], + "tests": ["tests/test_file.py"] + } + ], + "dependencies": { "package_manager": "...", "manifest": "..." }, + "recent_changes": [ "..." ], + "needs_broader_context": false, + "needs_broader_context_reason": "" +} +``` + +`build_context_pack` accepts pre-loaded `project_map`, `symbol_index`, and +`call_graph` dicts; otherwise it reads them from `.simplicio/`. 
When any +of the three is absent — or a target is missing / unreadable, or a range +is out-of-bounds — the function still returns a pack but sets +`needs_broader_context=True` and lists the concrete reasons. **The mapper +does not pretend compact context is enough when anchors, hashes, or +symbol coverage are missing.** + +### Context cache + +```python +from simplicio_mapper.context_cache import ContextCache + +cache = ContextCache(".simplicio/context-cache.json") +hit = cache.get(file_or_pack_hash) +if hit is None: + summary = summarize_via_llm(...) # caller-supplied + cache.set(file_or_pack_hash, summary) +``` + +Stored on disk as a single JSON document with shape +`{"schema": "simplicio.context-cache/v1", "entries": {...}}`. Entries are +keyed by any opaque hash string the caller chooses — typically +`snapshot_hash`, `range_hash`, or the overall `pack_hash` — so a change in +the underlying file invalidates the cached summary naturally. Writes are +persisted immediately; multiple processes pick up the latest value on +their next load. + +### Fixtures + +`tests/fixtures/ctx-pack-host/` ships four small multi-language fixtures +(`sample.ts`, `sample.py`, `sample.json`, `sample.md`) used by the test +suite to verify language detection and snippet emission. Large-file +behavior is exercised in a temp-dir generated test (`huge.py` with more +than `COMPACT_LINE_THRESHOLD` lines). + ## Native Runtime Contract (issue #95) The unified native Simplicio runtime — coordinating diff --git a/simplicio_mapper/context_cache.py b/simplicio_mapper/context_cache.py new file mode 100644 index 0000000..f6187a6 --- /dev/null +++ b/simplicio_mapper/context_cache.py @@ -0,0 +1,80 @@ +"""simplicio.context-cache/v1 — file-backed summary cache keyed by hash. + +The mapper context pack is the source of compact context; the context +cache lets downstream LLM planners reuse summaries for unchanged files +without re-summarizing them. 
CONTEXT_CACHE_SCHEMA = "simplicio.context-cache/v1"


class ContextCache:
    """Hash-keyed, JSON-file-backed cache for LLM summaries.

    `key` is any opaque string the caller chose (typically a content hash
    derived by the mapper). `summary` is any JSON-serialisable payload.
    Reads return `None` on miss; every write rewrites the whole document
    at `cache_path` immediately.

    Entries are read from disk once, in `__init__`; an instance does not
    re-read the file afterwards. A write therefore replaces whatever
    another instance stored in the meantime (last writer wins).
    """

    def __init__(self, cache_path: str | os.PathLike) -> None:
        """Remember `cache_path` and load any entries already stored there."""
        self.path = str(cache_path)
        self._entries: dict[str, Any] = self._load()

    def _load(self) -> dict[str, Any]:
        """Return the entries stored on disk, or `{}` when none are usable.

        A missing/unreadable file, invalid JSON, a non-object document, a
        schema mismatch, or a non-object `entries` value all yield `{}`.
        """
        try:
            with open(self.path, encoding="utf-8") as handle:
                payload = json.load(handle)
        except (OSError, ValueError):
            return {}
        if not isinstance(payload, dict):
            return {}
        if payload.get("schema") != CONTEXT_CACHE_SCHEMA:
            return {}
        entries = payload.get("entries", {})
        return dict(entries) if isinstance(entries, dict) else {}

    def get(self, key: str) -> Any | None:
        """Return the summary stored under `key`, or `None` on a miss."""
        return self._entries.get(key)

    def set(self, key: str, summary: Any) -> None:
        """Store `summary` under `key` and persist the cache.

        Raises:
            TypeError / ValueError: `summary` (or `key`) cannot be encoded
                as JSON. The in-memory entry is rolled back and the file on
                disk is left untouched, so one bad payload cannot poison
                later writes or wipe earlier entries.
            OSError: the file could not be written. The entry stays
                available in memory for this instance.
        """
        had_previous = key in self._entries
        previous = self._entries.get(key)
        self._entries[key] = summary
        try:
            self._persist()
        except (TypeError, ValueError, RecursionError):
            # Serialisation failed: restore the previous in-memory state so
            # the instance keeps matching what is on disk.
            if had_previous:
                self._entries[key] = previous
            else:
                del self._entries[key]
            raise

    def clear(self) -> None:
        """Drop every entry and persist the empty cache."""
        self._entries = {}
        self._persist()

    def __contains__(self, key: str) -> bool:
        return key in self._entries

    def __len__(self) -> int:
        return len(self._entries)

    def _persist(self) -> None:
        """Write all entries to `self.path` as one JSON document.

        The document is serialised in memory first, so an unserialisable
        payload raises before the existing file is touched. It is then
        written to a sibling temp file and moved into place with
        `os.replace`, so readers never observe a truncated document.
        """
        payload = {
            "schema": CONTEXT_CACHE_SCHEMA,
            "entries": self._entries,
        }
        document = json.dumps(payload, sort_keys=True, indent=2) + "\n"

        directory = os.path.dirname(self.path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        # Same directory as the target so the final rename stays on one
        # filesystem; the pid keeps concurrent processes from sharing it.
        tmp_path = f"{self.path}.{os.getpid()}.tmp"
        try:
            with open(tmp_path, "w", encoding="utf-8") as handle:
                handle.write(document)
            os.replace(tmp_path, self.path)
        except OSError:
            # Best-effort cleanup of the partial temp file; the original
            # error is the one the caller needs to see.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise
CONTEXT_PACK_SCHEMA = "simplicio.context-pack/v1"
MAPPER_INDEX_SCHEMA = "simplicio.mapper-index/v1"

# Files with more lines than this are emitted in compact mode (no snippets).
COMPACT_LINE_THRESHOLD = 2000
# Maximum number of characters kept from each snippet line.
_SNIPPET_PREFIX_CHARS = 120

_LANGUAGE_BY_EXT = {
    ".ts": "typescript",
    ".tsx": "typescript",
    ".js": "javascript",
    ".jsx": "javascript",
    ".mjs": "javascript",
    ".cjs": "javascript",
    ".py": "python",
    ".md": "markdown",
    ".json": "json",
    ".yaml": "yaml",
    ".yml": "yaml",
    ".toml": "toml",
    ".go": "go",
    ".rs": "rust",
    ".cs": "csharp",
}


def _sha256_text(text: str) -> str:
    """Return the hex SHA-256 digest of `text` encoded as UTF-8."""
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


def _read_safe(path: str) -> str | None:
    """Read `path` as UTF-8 (undecodable bytes replaced); `None` on OSError."""
    try:
        with open(path, encoding="utf-8", errors="replace") as handle:
            return handle.read()
    except OSError:
        return None


def _load_json(path: str) -> dict | None:
    """Load a JSON object from `path`.

    Returns `None` when the file is missing, unreadable, not valid JSON, or
    when its top-level value is not an object: callers use the result with
    `.get(...)`, so a list or scalar is as unusable as a missing file.
    """
    try:
        with open(path, encoding="utf-8") as handle:
            payload = json.load(handle)
    except (OSError, ValueError):
        return None
    return payload if isinstance(payload, dict) else None


def _language_for(path: str) -> str:
    """Map a file path to a language label.

    `Dockerfile` is matched by basename; otherwise the extension is looked
    up in `_LANGUAGE_BY_EXT`, falling back to the bare extension, or to
    `"text"` when there is no extension.
    """
    if os.path.basename(path) == "Dockerfile":
        return "dockerfile"
    ext = os.path.splitext(path)[1].lower()
    return _LANGUAGE_BY_EXT.get(ext, ext[1:] if ext else "text")


def _snippet_from_lines(lines: list[str], start: int, end: int) -> list[str]:
    """Return the stripped, truncated first and last line of a 1-based range.

    Lines that are empty after stripping are dropped; a single-line range
    contributes at most one entry.
    """
    picked: list[str] = []
    if 1 <= start <= len(lines):
        picked.append(lines[start - 1].strip()[:_SNIPPET_PREFIX_CHARS])
    if start != end and 1 <= end <= len(lines):
        picked.append(lines[end - 1].strip()[:_SNIPPET_PREFIX_CHARS])
    return [piece for piece in picked if piece]


def _range_snippet(text: str, start: int, end: int, compact: bool) -> list[str]:
    """Return the snippet for `start`-`end` of `text`; `[]` in compact mode."""
    if compact:
        return []
    return _snippet_from_lines(text.splitlines(), start, end)


def _file_symbols(symbols: list[dict], path: str) -> list[dict]:
    """Return the symbols whose `defined_in` equals `path`.

    Each entry carries `name`, `kind`, `line` and a 16-hex-char hash of
    `"<qualified_name or name>|<path>"`. The result is sorted by line, then
    name. Non-dict items in `symbols` are ignored.
    """
    found: list[dict] = []
    for symbol in symbols or []:
        if not isinstance(symbol, dict) or symbol.get("defined_in") != path:
            continue
        identity = symbol.get("qualified_name") or symbol.get("name") or ""
        found.append({
            "name": symbol.get("name"),
            "kind": symbol.get("kind"),
            "line": symbol.get("line"),
            "hash": hashlib.sha256(f"{identity}|{path}".encode()).hexdigest()[:16],
        })
    return sorted(found, key=lambda entry: (entry.get("line") or 0, entry.get("name") or ""))


def _related_tests(project_files: dict, path: str) -> list[str]:
    """Return the project files with a `test` role whose path contains the
    extension-less basename of `path`, sorted."""
    base = os.path.splitext(os.path.basename(path))[0]
    if not base:
        return []
    matches: set[str] = set()
    for candidate, meta in project_files.items():
        # `roles` may be missing or null in the project map.
        roles = (meta.get("roles") if isinstance(meta, dict) else None) or []
        if "test" in roles and base in candidate:
            matches.add(candidate)
    return sorted(matches)


def _call_graph_edges(call_graph: dict) -> list[dict]:
    """Return a flat list of edges from either `edges`, `imports`, or `calls`.

    Only dict items with truthy `from` and `to` are kept; sections that
    are missing or are not lists/tuples are skipped.
    """
    edges: list[dict] = []
    for key in ("edges", "imports", "calls"):
        section = call_graph.get(key)
        if not isinstance(section, (list, tuple)):
            continue
        edges.extend(
            edge for edge in section
            if isinstance(edge, dict) and edge.get("from") and edge.get("to")
        )
    return edges


def _parse_range(raw: Any) -> tuple[int, int] | None:
    """Return `(start, end)` when `raw` is a pair of ints, else `None`."""
    try:
        start, end = raw
    except (TypeError, ValueError):
        return None
    if not isinstance(start, int) or not isinstance(end, int):
        return None
    return start, end


def build_context_pack(
    root: str,
    targets: Iterable[dict],
    *,
    project_map: dict | None = None,
    symbol_index: dict | None = None,
    call_graph: dict | None = None,
) -> dict[str, Any]:
    """Build a `simplicio.context-pack/v1` envelope.

    Args:
        root: Repository root; relative target paths are resolved against it.
        targets: Iterable of `{"path": str, "ranges": [(start, end), ...]}`
            dicts. `ranges` is optional; bounds are 1-based and inclusive.
        project_map / symbol_index / call_graph: Pre-built mapper payloads.
            When one is `None` it is loaded from `<root>/.simplicio/`
            (`project-map.json`, `symbol-index.json`, `call-graph.json`).

    Returns:
        The envelope dict. `files` is sorted by path and `pack_hash` is
        computed from the root hash plus every emitted snapshot and range
        hash. `needs_broader_context` is `True`, with the reasons joined by
        `"; "` in `needs_broader_context_reason`, when a mapper artifact is
        absent, empty or not a JSON object, a target is missing or
        unreadable, or a range is malformed or out of bounds. Missing and
        unreadable targets are left out of `files`; bad ranges are left out
        of that file's `ranges`.
    """
    abs_root = os.path.abspath(root)
    artifact_dir = os.path.join(abs_root, ".simplicio")
    if project_map is None:
        project_map = _load_json(os.path.join(artifact_dir, "project-map.json"))
    if symbol_index is None:
        symbol_index = _load_json(os.path.join(artifact_dir, "symbol-index.json"))
    if call_graph is None:
        call_graph = _load_json(os.path.join(artifact_dir, "call-graph.json"))

    reasons: list[str] = []
    # Empty and non-dict payloads are reported exactly like missing ones.
    if not project_map or not isinstance(project_map, dict):
        reasons.append("project-map.json absent")
        project_map = {}
    if not symbol_index or not isinstance(symbol_index, dict):
        reasons.append("symbol-index.json absent")
        symbol_index = {}
    if not call_graph or not isinstance(call_graph, dict):
        reasons.append("call-graph.json absent")
        call_graph = {}

    pm_entries = project_map.get("files")
    pm_files = {
        entry["path"]: entry
        for entry in (pm_entries if isinstance(pm_entries, (list, tuple)) else [])
        if isinstance(entry, dict) and "path" in entry
    }
    si_symbols = symbol_index.get("symbols")
    if not isinstance(si_symbols, (list, tuple)):
        si_symbols = []
    cg_edges = _call_graph_edges(call_graph)

    files_out: list[dict] = []
    for target in targets:
        path = target["path"]
        abs_path = path if os.path.isabs(path) else os.path.join(abs_root, path)
        if not os.path.exists(abs_path):
            reasons.append(f"target missing: {path}")
            continue
        text = _read_safe(abs_path)
        if text is None:
            reasons.append(f"unreadable: {path}")
            continue

        # Split once; every range below slices this list.
        lines = text.splitlines()
        line_count = len(lines)
        compact = line_count > COMPACT_LINE_THRESHOLD

        selected_ranges: list[dict] = []
        for raw_range in target.get("ranges") or []:
            bounds = _parse_range(raw_range)
            if bounds is None:
                reasons.append(f"unstable range {raw_range!r} in {path}")
                continue
            start, end = bounds
            if start < 1 or end < start or end > line_count:
                reasons.append(f"unstable range {start}-{end} in {path}")
                continue
            chunk = "\n".join(lines[start - 1 : end])
            selected_ranges.append({
                "start_line": start,
                "end_line": end,
                "range_hash": _sha256_text(chunk),
                "snippet": [] if compact else _snippet_from_lines(lines, start, end),
            })

        # Use the same "/"-separated form for lookups as for the emitted
        # path, so they agree on platforms where os.sep is not "/".
        # NOTE(review): assumes mapper artifacts store "/"-separated paths.
        norm_path = path.replace(os.sep, "/")
        callers = sorted({
            edge["from"] for edge in cg_edges
            if edge.get("to") == norm_path and edge.get("from") != norm_path
        })
        imports = sorted({
            edge["to"] for edge in cg_edges
            if edge.get("from") == norm_path and edge.get("to") != norm_path
        })
        files_out.append({
            "path": norm_path,
            "language": _language_for(abs_path),
            "snapshot_hash": _sha256_text(text),
            "line_count": line_count,
            "compact": compact,
            "ranges": selected_ranges,
            "symbols": _file_symbols(si_symbols, norm_path),
            "callers": callers,
            "imports": imports,
            "tests": _related_tests(pm_files, norm_path),
        })

    files_out.sort(key=lambda entry: entry["path"])

    # root_hash is derived from the absolute root path string, so it (and
    # therefore pack_hash) differs between checkouts at different locations.
    # NOTE(review): confirm this matches the canonical contract.
    root_hash = _sha256_text(abs_root)
    digest = hashlib.sha256()
    digest.update(root_hash.encode("utf-8"))
    for entry in files_out:
        digest.update(entry["snapshot_hash"].encode("utf-8"))
        for selected in entry["ranges"]:
            digest.update(selected["range_hash"].encode("utf-8"))

    return {
        "schema": CONTEXT_PACK_SCHEMA,
        "repo": {
            "mapper_schema": MAPPER_INDEX_SCHEMA,
            "root_hash": root_hash,
        },
        "pack_hash": digest.hexdigest(),
        "files": files_out,
        "dependencies": project_map.get("dependencies", {}),
        "recent_changes": (
            project_map.get("recent_changes")
            or project_map.get("changed_files")
            or []
        ),
        "needs_broader_context": bool(reasons),
        "needs_broader_context_reason": "; ".join(reasons) if reasons else "",
    }
def load_user(user_id: int) -> dict:
    """Return a minimal user record with `id` and a derived `name`."""
    record = {"id": user_id}
    record["name"] = f"user-{user_id}"
    return record
def _pack(root, targets, **kwargs):
    """Call `build_context_pack` with empty mapper artifacts unless overridden."""
    options = {"project_map": {}, "symbol_index": {}, "call_graph": {}, **kwargs}
    return build_context_pack(root, targets, **options)


class ContextPackBasicTest(unittest.TestCase):
    """Envelope shape and language detection on the shipped fixtures."""

    def test_envelope_shape_and_pack_hash(self) -> None:
        result = _pack(str(FIXTURE), [{"path": "sample.py"}])
        files = result["files"]
        self.assertEqual(result["schema"], CONTEXT_PACK_SCHEMA)
        self.assertEqual(result["repo"]["mapper_schema"], "simplicio.mapper-index/v1")
        self.assertEqual(len(result["pack_hash"]), 64)
        self.assertEqual(len(files), 1)
        self.assertEqual(files[0]["language"], "python")

    def test_multi_language_fixtures(self) -> None:
        names = ("sample.ts", "sample.py", "sample.json", "sample.md")
        result = _pack(str(FIXTURE), [{"path": name} for name in names])
        by_path = {item["path"]: item for item in result["files"]}
        self.assertEqual(by_path["sample.ts"]["language"], "typescript")
        self.assertEqual(by_path["sample.py"]["language"], "python")
        self.assertEqual(by_path["sample.json"]["language"], "json")
        self.assertEqual(by_path["sample.md"]["language"], "markdown")
        for item in by_path.values():
            self.assertEqual(len(item["snapshot_hash"]), 64)


class RangeExtractionTest(unittest.TestCase):
    """Selected ranges carry a hash and snippet; bad ranges are flagged."""

    def test_range_hash_emitted_with_snippet(self) -> None:
        target = {"path": "sample.py", "ranges": [(3, 4)]}
        emitted = _pack(str(FIXTURE), [target])["files"][0]["ranges"]
        self.assertEqual(len(emitted), 1)
        only = emitted[0]
        self.assertEqual(len(only["range_hash"]), 64)
        self.assertTrue(only["snippet"])

    def test_unstable_range_marks_needs_broader_context(self) -> None:
        target = {"path": "sample.py", "ranges": [(1, 9999)]}
        result = _pack(str(FIXTURE), [target])
        self.assertTrue(result["needs_broader_context"])
        self.assertIn("unstable range", result["needs_broader_context_reason"])


class CallGraphAndDependencyTest(unittest.TestCase):
    """Callers, imports and related tests come from the supplied artifacts."""

    def test_callers_and_imports_resolved(self) -> None:
        edges = [
            {"from": "sample.py", "to": "shared/util.py"},
            {"from": "caller.py", "to": "sample.py"},
        ]
        result = _pack(
            str(FIXTURE),
            [{"path": "sample.py"}],
            call_graph={"edges": edges},
        )
        item = result["files"][0]
        self.assertEqual(item["imports"], ["shared/util.py"])
        self.assertEqual(item["callers"], ["caller.py"])

    def test_tests_resolved_from_project_map(self) -> None:
        listing = [
            {"path": "sample.py", "roles": ["domain"]},
            {"path": "tests/test_sample.py", "roles": ["test"]},
        ]
        result = _pack(
            str(FIXTURE),
            [{"path": "sample.py"}],
            project_map={"files": listing},
        )
        self.assertIn("tests/test_sample.py", result["files"][0]["tests"])


class NeedsBroaderContextTest(unittest.TestCase):
    """Missing targets or mapper artifacts must raise the flag."""

    def test_missing_target_marked(self) -> None:
        result = _pack(str(FIXTURE), [{"path": "does-not-exist.py"}])
        self.assertTrue(result["needs_broader_context"])
        self.assertIn("target missing", result["needs_broader_context_reason"])

    def test_missing_upstream_artifacts_loaded_implicitly(self) -> None:
        # The fixture directory has no `.simplicio/`, so calling the builder
        # without overrides exercises the "absent" branches.
        result = build_context_pack(str(FIXTURE), [{"path": "sample.py"}])
        self.assertTrue(result["needs_broader_context"])
        self.assertIn("project-map.json absent", result["needs_broader_context_reason"])


class DeterminismTest(unittest.TestCase):
    """Identical inputs must yield an identical pack."""

    def test_same_inputs_same_pack(self) -> None:
        targets = [
            {"path": "sample.py", "ranges": [(3, 4)]},
            {"path": "sample.ts", "ranges": [(1, 4)]},
        ]
        root = str(FIXTURE)
        first = _pack(root, targets)
        second = _pack(root, targets)
        self.assertEqual(first["pack_hash"], second["pack_hash"])
        self.assertEqual(first, second)


class LargeFileCompactModeTest(unittest.TestCase):
    """Files above the line threshold are packed without snippets."""

    def test_compact_mode_omits_snippets_above_threshold(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            contents = "\n".join(
                f"line_{i}" for i in range(COMPACT_LINE_THRESHOLD + 100)
            )
            (Path(tmp) / "huge.py").write_text(contents)
            result = _pack(str(tmp), [{"path": "huge.py", "ranges": [(5, 10)]}])
            item = result["files"][0]
            self.assertTrue(item["compact"])
            self.assertEqual(item["ranges"][0]["snippet"], [])


class ContextCacheTest(unittest.TestCase):
    """Hit, miss, invalidation, clearing and on-disk persistence of the cache."""

    def setUp(self) -> None:
        self._tmp = tempfile.TemporaryDirectory()
        self.cache_path = Path(self._tmp.name) / "ctx-cache.json"

    def tearDown(self) -> None:
        self._tmp.cleanup()

    def test_hit_returns_stored_summary(self) -> None:
        store = ContextCache(self.cache_path)
        store.set("abc123", {"summary": "small file"})
        self.assertEqual(store.get("abc123"), {"summary": "small file"})
        self.assertIn("abc123", store)
        self.assertEqual(len(store), 1)

    def test_miss_returns_none(self) -> None:
        self.assertIsNone(ContextCache(self.cache_path).get("never-stored"))

    def test_invalidation_when_key_changes(self) -> None:
        store = ContextCache(self.cache_path)
        store.set("hash-v1", {"summary": "old"})
        self.assertIsNone(store.get("hash-v2"))

    def test_persistence_across_instances(self) -> None:
        writer = ContextCache(self.cache_path)
        writer.set("abc", {"summary": "persisted"})
        reader = ContextCache(self.cache_path)
        self.assertEqual(reader.get("abc"), {"summary": "persisted"})

    def test_clear_empties_cache(self) -> None:
        store = ContextCache(self.cache_path)
        store.set("abc", {"summary": "x"})
        store.clear()
        self.assertIsNone(store.get("abc"))
        # A fresh instance must observe the cleared state as well.
        self.assertEqual(len(ContextCache(self.cache_path)), 0)

    def test_persisted_payload_uses_schema(self) -> None:
        import json as _json

        store = ContextCache(self.cache_path)
        store.set("abc", {"summary": "x"})
        on_disk = _json.loads(self.cache_path.read_text())
        self.assertEqual(on_disk["schema"], CONTEXT_CACHE_SCHEMA)
        self.assertEqual(on_disk["entries"], {"abc": {"summary": "x"}})