From 4a4d34a4f3510c9f0cceba461572178a6f0234a1 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 13:49:00 +0000 Subject: [PATCH] feat(mapper): add simplicio.mechanical-edit/v1 anchor producer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #110. Adopts the canonical cross-Simplicio mechanical edit contract pinned in simplicio-runtime#69. The mapper is now the deterministic producer of the edit-anchor context an LLM planner needs to plan compact JSON edits without rewriting whole files. Adds `simplicio_mapper.mechanical_edit` exposing: - `snapshot_hash(text)` — sha256 of the full file content. - `range_hash(text, start, end)` — sha256 of the 1-indexed inclusive line slice. Raises ValueError on out-of-bounds. - `is_binary(path)` — heuristic NUL-byte / UTF-8 decode probe over the first 8 KiB. - `extract_file_entry(root, path, ranges)` — builds the per-file dict with language detection (reusing `mapper._language_for`), full snapshot hash, and a `selected_ranges[]` list with `start_line`, `end_line`, `before_hash`, and `must_contain` (first + last line of the range, truncated to 120 chars). Files longer than `COMPACT_LINE_THRESHOLD` (2000 lines) omit `must_contain` so the envelope stays compact while still anchoring via the hash. - `build_context(root, selections)` — groups selections by path, builds per-file entries, and emits the full envelope with `schema="simplicio.mechanical-edit/v1"`, an overall `context_hash` over all snapshot + range hashes, and the canonical `mapper_schema` pointer to `simplicio.mapper-index/v1`. Missing files raise `FileNotFoundError` and binary files raise `ValueError` — the mapper never silently emits an ambiguous anchor. 
Tests (`tests/python/test_mechanical_edit.py`, 12 cases) cover stable range hashing, snapshot drift, anchor drift detection, missing-file refusal, binary-file refusal, large-file compact mode, multi-language fixture parity (TS / Python / JSON / Markdown), and deterministic context-hash agreement on repeated runs. Fixtures live under `tests/fixtures/mech-edit-host/` and ship with the test suite so downstream consumers can copy them. Documents the contract in `SIMPLICIO_INTEGRATION.md` next to the existing runtime contract section. https://claude.ai/code/session_01JdmemqddwFnvbceWyuDE8m --- SIMPLICIO_INTEGRATION.md | 86 +++++++++++ simplicio_mapper/mechanical_edit.py | 169 ++++++++++++++++++++++ tests/fixtures/mech-edit-host/binary.bin | Bin 0 -> 15 bytes tests/fixtures/mech-edit-host/sample.json | 9 ++ tests/fixtures/mech-edit-host/sample.md | 10 ++ tests/fixtures/mech-edit-host/sample.py | 9 ++ tests/fixtures/mech-edit-host/sample.ts | 12 ++ tests/python/test_mechanical_edit.py | 142 ++++++++++++++++++ 8 files changed, 437 insertions(+) create mode 100644 simplicio_mapper/mechanical_edit.py create mode 100644 tests/fixtures/mech-edit-host/binary.bin create mode 100644 tests/fixtures/mech-edit-host/sample.json create mode 100644 tests/fixtures/mech-edit-host/sample.md create mode 100644 tests/fixtures/mech-edit-host/sample.py create mode 100644 tests/fixtures/mech-edit-host/sample.ts create mode 100644 tests/python/test_mechanical_edit.py diff --git a/SIMPLICIO_INTEGRATION.md b/SIMPLICIO_INTEGRATION.md index e4b0fca..12fa077 100644 --- a/SIMPLICIO_INTEGRATION.md +++ b/SIMPLICIO_INTEGRATION.md @@ -264,6 +264,92 @@ and agent instruction files remain the human-readable source for project operation. If `.simplicio/` is absent, consumers should fall back to the current markdown or file-inspection behavior. +## Mechanical Edit Contract (issue #110) + +Schema: `simplicio.mechanical-edit/v1` (envelope) and +`simplicio.mechanical-edit-result/v1` (executor result). 
The canonical +contract lives in +[`simplicio-runtime#69`](https://github.com/wesleysimplicio/simplicio-runtime/issues/69); +this repository implements the **producer half** so an LLM planner can +plan compact JSON edits without rewriting whole files. + +### What the mapper produces + +Use `simplicio_mapper.mechanical_edit.build_context(root, selections)` to +build a context envelope: + +```python +from simplicio_mapper.mechanical_edit import build_context + +envelope = build_context( + root=".", + selections=[ + ("simplicio_mapper/mapper.py", 99, 102), + ("simplicio_mapper/mapper.py", 180, 199), + ("simplicio_mapper/cli.py", 1, 20), + ], +) +``` + +The returned dict matches (only the first file, in path order, is shown): + +```json +{ + "schema": "simplicio.mechanical-edit/v1", + "context": { + "mapper_schema": "simplicio.mapper-index/v1", + "context_hash": "(64-char sha256 hex digest)", + "files": [ + { + "path": "simplicio_mapper/cli.py", + "language": "python", + "snapshot_hash": "(64-char sha256 hex digest)", + "selected_ranges": [ + { + "start_line": 1, + "end_line": 20, + "before_hash": "(64-char sha256 hex digest)", + "must_contain": [ + "\"\"\"Command-line entry point for simplicio-mapper.", + "from .mapper import write_mapping_artifacts" + ] + } + ] + } + ] + } +} +``` + +### Guarantees + +- **Stable.** Identical `(path, start, end)` selections on an unchanged tree + always produce the same `snapshot_hash`, `before_hash`, and overall + `context_hash`. The test suite enforces this across repeated runs. +- **Drift-detecting.** A consumer that captured a `before_hash` will see a + different value if the selected lines change underneath (any change to the file also changes `snapshot_hash`); the executor rejects the + edit when the anchor no longer matches. +- **Refuses unsafe inputs.** Missing files raise `FileNotFoundError`; + binary files (NUL byte in the first 8 KiB or non-UTF-8 decode) raise + `ValueError`. The mapper never silently emits an ambiguous anchor — the + caller must either widen the snapshot or hand off a different file. 
+- **Compact above threshold.** Files longer than + `COMPACT_LINE_THRESHOLD` (2000 lines by default) omit `must_contain` + snippets so the envelope stays small; the `before_hash` is still + emitted, which is enough for the executor to anchor. +- **Language-aware.** `language` follows the same detection used by + `project-map.json` (`typescript`, `python`, `json`, `markdown`, etc.). +- **Canonical schema only.** This module must not introduce a repo-local + variation. The contract is owned by + [`simplicio-runtime#69`](https://github.com/wesleysimplicio/simplicio-runtime/issues/69). + +### Fixtures + +`tests/fixtures/mech-edit-host/` ships small deterministic files in four +text languages (`sample.ts`, `sample.py`, `sample.json`, `sample.md`) plus +a binary file (`binary.bin`) used by the refusal test. Producers in +downstream repositories can copy these as ready-made parity inputs. + ## Native Runtime Contract (issue #95) The unified native Simplicio runtime — coordinating
diff --git a/simplicio_mapper/mechanical_edit.py b/simplicio_mapper/mechanical_edit.py
new file mode 100644
index 0000000..15158de
--- /dev/null
+++ b/simplicio_mapper/mechanical_edit.py
@@ -0,0 +1,209 @@
+"""simplicio.mechanical-edit/v1 — deterministic edit-anchor context (issue #110).
+
+This module provides the mapper-side helpers an LLM planner needs to plan
+compact JSON edits without rewriting whole files. It produces:
+
+- snapshot hashes per selected file (full-file content sha256),
+- range hashes per selected line range (sha256 of the joined chunk),
+- explicit `must_contain` anchors with the first and last line of each
+  range, truncated to a short prefix for stability,
+- an overall `context_hash` digest over all snapshot + range hashes so the
+  consumer can verify the whole context in one comparison.
+
+The schema is the canonical cross-Simplicio contract pinned in
+simplicio-runtime issue #69; this module must not introduce a repo-local
+variation. Builders raise explicitly when a range cannot be made stable
+(out-of-bounds, file missing, file is binary).
+"""
+
+from __future__ import annotations
+
+import codecs
+import hashlib
+import os
+from collections.abc import Iterable
+
+from .mapper import _language_for, _read_safe
+
+MECHANICAL_EDIT_SCHEMA = "simplicio.mechanical-edit/v1"
+MECHANICAL_EDIT_RESULT_SCHEMA = "simplicio.mechanical-edit-result/v1"
+MAPPER_INDEX_SCHEMA = "simplicio.mapper-index/v1"
+
+_NUL_PROBE_BYTES = 8192
+# Files longer than this trigger compact mode: ranges still carry their hash
+# anchors, but `must_contain` is emitted empty so the JSON envelope stays small.
+COMPACT_LINE_THRESHOLD = 2000
+_MUST_CONTAIN_PREFIX_CHARS = 120
+
+
+def _sha256_text(text: str) -> str:
+    """Return the hex sha256 digest of `text` encoded as UTF-8."""
+    return hashlib.sha256(text.encode("utf-8")).hexdigest()
+
+
+def _read_bytes_sample(path: str) -> bytes:
+    """Return up to `_NUL_PROBE_BYTES` leading bytes of the file at `path`.
+
+    Best-effort: an OSError while opening or reading yields `b""`, which
+    `is_binary` reports as not binary.
+    """
+    try:
+        with open(path, "rb") as handle:
+            return handle.read(_NUL_PROBE_BYTES)
+    except OSError:
+        return b""
+
+
+def is_binary(path: str) -> bool:
+    """Return True when the file at `path` looks binary.
+
+    A file with a NUL byte in its first 8 KiB, or whose first 8 KiB do not
+    decode as UTF-8, is treated as binary. The mechanical edit contract
+    refuses binary files because line-anchored edits do not apply to them.
+
+    A multi-byte UTF-8 sequence cut in half by the end of the 8 KiB probe is
+    not counted as a decode failure, so large non-ASCII text files are not
+    misreported. An empty or unreadable file yields an empty probe and
+    returns False.
+    """
+    sample = _read_bytes_sample(path)
+    if not sample:
+        return False
+    if b"\x00" in sample:
+        return True
+    # A probe shorter than the window is the whole file and must decode to
+    # its end; a full-size probe may stop mid-character, so the incremental
+    # decoder is told that more input could follow (final=False).
+    reached_eof = len(sample) < _NUL_PROBE_BYTES
+    try:
+        codecs.getincrementaldecoder("utf-8")().decode(sample, final=reached_eof)
+    except UnicodeDecodeError:
+        return True
+    return False
+
+
+def snapshot_hash(text: str) -> str:
+    """sha256 of the full text content (UTF-8)."""
+    return _sha256_text(text)
+
+
+def range_hash(text: str, start_line: int, end_line: int) -> str:
+    """sha256 of the `start_line..end_line` slice (1-indexed, inclusive).
+
+    Lines are split with `str.splitlines()` and re-joined with LF, so the
+    digest ignores the line-ending style and the terminator of the last
+    selected line.
+
+    Raises ValueError when `start_line` is below 1, `end_line` is below
+    `start_line`, or `end_line` is past the last line of `text`.
+    """
+    if start_line < 1 or end_line < start_line:
+        raise ValueError(f"invalid range {start_line}-{end_line}")
+    lines = text.splitlines()
+    if end_line > len(lines):
+        raise ValueError(
+            f"range {start_line}-{end_line} out of bounds ({len(lines)} lines)"
+        )
+    chunk = "\n".join(lines[start_line - 1 : end_line])
+    return _sha256_text(chunk)
+
+
+def _must_contain(text: str, start_line: int, end_line: int, compact: bool) -> list[str]:
+    """Return the stripped first and last line of the range as anchors.
+
+    Each snippet is cut to `_MUST_CONTAIN_PREFIX_CHARS` characters. Snippets
+    that are empty after stripping are dropped, and a single-line range
+    yields at most one snippet. Compact mode returns an empty list.
+    """
+    if compact:
+        return []
+    lines = text.splitlines()
+    out: list[str] = []
+    if 1 <= start_line <= len(lines):
+        out.append(lines[start_line - 1].strip()[:_MUST_CONTAIN_PREFIX_CHARS])
+    if start_line != end_line and 1 <= end_line <= len(lines):
+        out.append(lines[end_line - 1].strip()[:_MUST_CONTAIN_PREFIX_CHARS])
+    return [piece for piece in out if piece]
+
+
+def _absolute(root: str, path: str) -> str:
+    """Return `path` unchanged when absolute, otherwise joined onto `root`."""
+    return path if os.path.isabs(path) else os.path.join(root, path)
+
+
+def extract_file_entry(root: str, path: str, ranges: list[tuple[int, int]]) -> dict:
+    """Build a single `files[]` entry for `path` and the given line ranges.
+
+    Raises FileNotFoundError when the resolved path does not exist, and
+    ValueError when the file looks binary or a range is invalid or out of
+    bounds.
+    """
+    abs_path = _absolute(root, path)
+    if not os.path.exists(abs_path):
+        raise FileNotFoundError(f"file not found: {path}")
+    if is_binary(abs_path):
+        raise ValueError(f"binary file refused: {path}")
+    text = _read_safe(abs_path)
+    compact = len(text.splitlines()) > COMPACT_LINE_THRESHOLD
+    selected = []
+    for start, end in ranges:
+        selected.append({
+            "start_line": start,
+            "end_line": end,
+            "before_hash": range_hash(text, start, end),
+            "must_contain": _must_contain(text, start, end, compact),
+        })
+    return {
+        "path": path.replace(os.sep, "/"),
+        "language": _language_for(abs_path),
+        "snapshot_hash": snapshot_hash(text),
+        "selected_ranges": selected,
+    }
+
+
+def build_context(
+    root: str,
+    selections: Iterable[tuple[str, int, int]],
+) -> dict:
+    """Build a `simplicio.mechanical-edit/v1` context envelope.
+
+    `selections` is an iterable of `(path, start_line, end_line)` triples.
+    Ranges on the same file are grouped and emitted in input order under
+    that file's `selected_ranges`.
+
+    Files are emitted sorted by path. `context_hash` is a sha256 over each
+    file's `snapshot_hash` followed by its `before_hash` values, in that
+    emitted order.
+    """
+    grouped: dict[str, list[tuple[int, int]]] = {}
+    for path, start, end in selections:
+        grouped.setdefault(path, []).append((start, end))
+    files = []
+    digest = hashlib.sha256()
+    for path in sorted(grouped):
+        entry = extract_file_entry(root, path, grouped[path])
+        files.append(entry)
+        digest.update(entry["snapshot_hash"].encode("utf-8"))
+        for selected_range in entry["selected_ranges"]:
+            digest.update(selected_range["before_hash"].encode("utf-8"))
+    return {
+        "schema": MECHANICAL_EDIT_SCHEMA,
+        "context": {
+            "mapper_schema": MAPPER_INDEX_SCHEMA,
+            "context_hash": digest.hexdigest(),
+            "files": files,
+        },
+    }
+
+
+__all__ = [
+    "COMPACT_LINE_THRESHOLD",
+    "MAPPER_INDEX_SCHEMA",
+    "MECHANICAL_EDIT_RESULT_SCHEMA",
+    "MECHANICAL_EDIT_SCHEMA",
+    "build_context",
+    "extract_file_entry",
+    "is_binary",
+    "range_hash",
+    "snapshot_hash",
+]
diff --git a/tests/fixtures/mech-edit-host/binary.bin b/tests/fixtures/mech-edit-host/binary.bin new file mode 100644 index 0000000000000000000000000000000000000000..bd3d21e664349846a056173244381ee7253942d7 GIT binary patch literal 15 WcmZQzWXed*$;oFZ&o9bJVE_Ofvjji@ literal 0 HcmV?d00001 diff --git a/tests/fixtures/mech-edit-host/sample.json b/tests/fixtures/mech-edit-host/sample.json new file mode 100644 index 0000000..cec2a52 --- /dev/null +++ b/tests/fixtures/mech-edit-host/sample.json @@ -0,0 +1,9 @@ +{ + "name": "mech-edit-host", + "version": "0.0.1", + "audiences": 
[ + "world", + "team", + "everyone" + ] +} diff --git a/tests/fixtures/mech-edit-host/sample.md b/tests/fixtures/mech-edit-host/sample.md new file mode 100644 index 0000000..eca4044 --- /dev/null +++ b/tests/fixtures/mech-edit-host/sample.md @@ -0,0 +1,10 @@ +# Mech-edit fixture + +This Markdown file lives in `tests/fixtures/mech-edit-host/`. It is used by +the `simplicio.mechanical-edit/v1` test suite to verify that Markdown +ranges hash deterministically and that anchor drift is detected when the +file changes underneath. + +## Section + +A small paragraph used as a stable anchor target. diff --git a/tests/fixtures/mech-edit-host/sample.py b/tests/fixtures/mech-edit-host/sample.py new file mode 100644 index 0000000..d80dff9 --- /dev/null +++ b/tests/fixtures/mech-edit-host/sample.py @@ -0,0 +1,9 @@ +"""Tiny Python fixture for mechanical-edit anchor tests.""" + + +def greet(audience: str) -> str: + return f"hello, {audience}" + + +def farewell(audience: str) -> str: + return f"bye, {audience}" diff --git a/tests/fixtures/mech-edit-host/sample.ts b/tests/fixtures/mech-edit-host/sample.ts new file mode 100644 index 0000000..160b4ea --- /dev/null +++ b/tests/fixtures/mech-edit-host/sample.ts @@ -0,0 +1,12 @@ +export interface Greeting { + audience: string; + message: string; +} + +export function greet(audience: string): Greeting { + return { audience, message: `hello, ${audience}` }; +} + +export function farewell(audience: string): Greeting { + return { audience, message: `bye, ${audience}` }; +}
diff --git a/tests/python/test_mechanical_edit.py b/tests/python/test_mechanical_edit.py
new file mode 100644
index 0000000..3311b1e
--- /dev/null
+++ b/tests/python/test_mechanical_edit.py
@@ -0,0 +1,165 @@
+"""Tests for the simplicio.mechanical-edit/v1 helpers (issue #110).
+
+Covers:
+
+- stable range hashing across repeated runs;
+- snapshot hash changes when the file content changes;
+- anchor drift detection (re-hashing a stale range produces a different
+  hash than the captured `before_hash`);
+- missing file raises `FileNotFoundError`;
+- binary file refusal raises `ValueError`;
+- large-file compact mode (no `must_contain` snippets above the threshold);
+- multi-language fixtures (TypeScript, Python, JSON, Markdown).
+"""
+
+from __future__ import annotations
+
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+
+# This file lives in tests/python/, so the repository root is two levels up.
+ROOT = Path(__file__).resolve().parents[2]
+FIXTURE = ROOT / "tests" / "fixtures" / "mech-edit-host"
+# Make `simplicio_mapper` importable when the suite is run from a checkout.
+sys.path.insert(0, str(ROOT))
+
+from simplicio_mapper.mechanical_edit import (  # noqa: E402
+    COMPACT_LINE_THRESHOLD,
+    MECHANICAL_EDIT_SCHEMA,
+    build_context,
+    extract_file_entry,
+    is_binary,
+    range_hash,
+    snapshot_hash,
+)
+
+
+class StableRangeHashingTest(unittest.TestCase):
+    """`range_hash` is deterministic and validates its bounds."""
+
+    def test_same_text_same_range_yields_same_hash(self) -> None:
+        text = "alpha\nbeta\ngamma\ndelta\n"
+        a = range_hash(text, 2, 3)
+        b = range_hash(text, 2, 3)
+        self.assertEqual(a, b)
+
+    def test_different_ranges_yield_different_hashes(self) -> None:
+        text = "alpha\nbeta\ngamma\ndelta\n"
+        self.assertNotEqual(range_hash(text, 1, 2), range_hash(text, 3, 4))
+
+    def test_range_out_of_bounds_raises(self) -> None:
+        text = "one\ntwo\n"
+        with self.assertRaises(ValueError):
+            range_hash(text, 1, 5)  # ends past the last line
+        with self.assertRaises(ValueError):
+            range_hash(text, 0, 1)  # lines are 1-indexed
+        with self.assertRaises(ValueError):
+            range_hash(text, 2, 1)  # end before start
+
+
+class SnapshotHashTest(unittest.TestCase):
+    """`snapshot_hash` reflects a change to the file content."""
+
+    def test_snapshot_hash_changes_when_content_changes(self) -> None:
+        before = snapshot_hash("alpha\nbeta\n")
+        after = snapshot_hash("alpha\nbeta CHANGED\n")
+        self.assertNotEqual(before, after)
+
+
+class AnchorDriftDetectionTest(unittest.TestCase):
+    """A captured `before_hash` no longer matches once the range is edited."""
+
+    def test_drift_changes_before_hash(self) -> None:
+        original = "line a\nline b\nline c\n"
+        captured = range_hash(original, 2, 2)
+        mutated = "line a\nline B different\nline c\n"
+        re_hashed = range_hash(mutated, 2, 2)
+        self.assertNotEqual(captured, re_hashed)
+
+
+class MissingFileAndBinaryRefusalTest(unittest.TestCase):
+    """`extract_file_entry` refuses inputs it cannot anchor."""
+
+    def test_missing_file_raises(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            with self.assertRaises(FileNotFoundError):
+                extract_file_entry(tmp, "does-not-exist.py", [(1, 1)])
+
+    def test_binary_file_refused(self) -> None:
+        self.assertTrue(is_binary(str(FIXTURE / "binary.bin")))
+        with self.assertRaises(ValueError):
+            extract_file_entry(str(FIXTURE), "binary.bin", [(1, 1)])
+
+
+class LargeFileCompactModeTest(unittest.TestCase):
+    """`must_contain` is empty above `COMPACT_LINE_THRESHOLD`, populated below."""
+
+    def test_must_contain_omitted_above_threshold(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            big = root / "huge.py"
+            # 50 lines past the threshold, so compact mode applies; the explicit
+            # encoding keeps the write independent of the platform default.
+            big.write_text(
+                "\n".join(f"line_{i}" for i in range(COMPACT_LINE_THRESHOLD + 50)),
+                encoding="utf-8",
+            )
+            entry = extract_file_entry(str(root), "huge.py", [(10, 12)])
+            self.assertEqual(entry["selected_ranges"][0]["must_contain"], [])
+
+    def test_must_contain_emitted_below_threshold(self) -> None:
+        entry = extract_file_entry(str(FIXTURE), "sample.py", [(4, 5)])
+        self.assertTrue(entry["selected_ranges"][0]["must_contain"])
+
+
+class MultiLanguageFixtureTest(unittest.TestCase):
+    """Each shipped text fixture is reported with its language and hashes."""
+
+    def test_typescript_python_json_markdown_languages(self) -> None:
+        context = build_context(
+            str(FIXTURE),
+            [
+                ("sample.ts", 1, 4),
+                ("sample.py", 4, 5),
+                ("sample.json", 1, 9),
+                ("sample.md", 1, 1),
+            ],
+        )
+        self.assertEqual(context["schema"], MECHANICAL_EDIT_SCHEMA)
+        by_path = {f["path"]: f for f in context["context"]["files"]}
+        self.assertEqual(by_path["sample.ts"]["language"], "typescript")
+        self.assertEqual(by_path["sample.py"]["language"], "python")
+        self.assertEqual(by_path["sample.json"]["language"], "json")
+        self.assertEqual(by_path["sample.md"]["language"], "markdown")
+        for entry in by_path.values():
+            # sha256 hex digests are 64 characters long.
+            self.assertEqual(len(entry["snapshot_hash"]), 64)
+            self.assertEqual(len(entry["selected_ranges"][0]["before_hash"]), 64)
+
+
+class DeterministicContextHashTest(unittest.TestCase):
+    """`build_context` output depends on the selections, not on their order."""
+
+    def test_same_selections_yield_same_context_hash(self) -> None:
+        # Same two selections, passed in opposite order.
+        first = build_context(str(FIXTURE), [("sample.py", 4, 5), ("sample.ts", 1, 4)])
+        second = build_context(str(FIXTURE), [("sample.ts", 1, 4), ("sample.py", 4, 5)])
+        self.assertEqual(
+            first["context"]["context_hash"],
+            second["context"]["context_hash"],
+        )
+        self.assertEqual(first, second)
+
+    def test_changed_selection_changes_context_hash(self) -> None:
+        base = build_context(str(FIXTURE), [("sample.py", 4, 5)])
+        wider = build_context(str(FIXTURE), [("sample.py", 4, 7)])
+        self.assertNotEqual(
+            base["context"]["context_hash"],
+            wider["context"]["context_hash"],
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()