diff --git a/.gitignore b/.gitignore index e2ed4f2..0116b0b 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,7 @@ __pycache__/ *.pt *.pth *.jsonl +eval_manifest.json +pipeline_manifest.json +mid_checkpoint.safetensors +mid_checkpoint.merkle.json diff --git a/requirements.txt b/requirements.txt index 45989b6..5476be4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,4 @@ tqdm==4.67.3 # This default torch wheel is CPU-only (or CUDA, depending on your platform). # For an Intel GPU (Iris Xe / Arc), install the XPU build instead — see README # "Locally on an Intel GPU": -# pip install torch --index-url https://download.pytorch.org/whl/xpu \ No newline at end of file +# pip install torch --index-url https://download.pytorch.org/whl/xpu diff --git a/src/artifacts.py b/src/artifacts.py new file mode 100644 index 0000000..40e6210 --- /dev/null +++ b/src/artifacts.py @@ -0,0 +1,236 @@ +import hashlib +import json +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +if TYPE_CHECKING: + import torch + +MERKLE_CHUNK_SIZE_BYTES = 1024 * 1024 + +CHECKPOINT_STATE_PATH = "mid_checkpoint.pt" +CHECKPOINT_WEIGHTS_PATH = "mid_checkpoint.safetensors" +CHECKPOINT_MERKLE_PATH = "mid_checkpoint.merkle.json" + + +def hash_json(data: Any) -> str: + encoded = json.dumps(data, sort_keys=True).encode() + return hashlib.sha256(encoded).hexdigest() + + +def compute_sha256_bytes( + *, + data: Optional[Union[bytes, bytearray]] = None, + file_path: Optional[Union[str, Path]] = None, +) -> bytes: + if (data is None) == (file_path is None): + raise ValueError("Exactly one of data or file_path must be provided") + + h = hashlib.sha256() + if data is not None: + h.update(data) + return h.digest() + + with Path(file_path).open("rb") as f: + while chunk := f.read(1024 * 1024): + h.update(chunk) + return h.digest() + + +def compute_sha256( + *, + data: Optional[Union[bytes, bytearray]] = None, + file_path: Optional[Union[str, Path]] = None, +) -> str: + return compute_sha256_bytes(data=data, file_path=file_path).hex() + + +def model_parameters_sha256(model: "torch.nn.Module") -> str: + h = hashlib.sha256() + for param in model.parameters(): + h.update(param.detach().cpu().numpy().tobytes()) + return h.hexdigest() + + +def _merkle_parent(left: bytes, right: bytes) -> bytes: + return compute_sha256_bytes(data=left + right) + + +def merkle_root_from_leaf_hashes(leaf_hashes: List[str]) -> str: + if not leaf_hashes: + return compute_sha256(data=b"") + + level = [bytes.fromhex(leaf) for leaf in leaf_hashes] + while len(level) > 1: + next_level = [] + for i in range(0, len(level), 2): + left = level[i] + right = level[i + 1] if i + 1 < len(level) else left + next_level.append(_merkle_parent(left, right)) + level = next_level + return level[0].hex() + + +def build_merkle_manifest( + file_path: Union[str, Path], + *, + chunk_size: int = MERKLE_CHUNK_SIZE_BYTES, +) -> Dict[str, Any]: + if chunk_size <= 0: + raise ValueError("chunk_size must be a positive integer") + + path = Path(file_path) + chunks = [] + offset = 0 + file_hasher = hashlib.sha256() + with path.open("rb") as f: + while chunk := f.read(chunk_size): + file_hasher.update(chunk) + chunks.append( + { + "index": len(chunks), + "offset": offset, + "size": len(chunk), + "sha256": compute_sha256(data=chunk), + } + ) + offset += len(chunk) + + leaf_hashes = [chunk["sha256"] for chunk in chunks] + return { + "artifact": path.name, + "size_bytes": offset, + "sha256": file_hasher.hexdigest(), + "chunk_size_bytes": chunk_size, + "chunk_count": len(chunks), + "merkle_root": merkle_root_from_leaf_hashes(leaf_hashes), + "chunks": chunks, + } + + +def write_merkle_manifest( + file_path: Union[str, Path], + output_path: Union[str, Path], + *, + chunk_size: int = MERKLE_CHUNK_SIZE_BYTES, +) -> Dict[str, Any]: + manifest = build_merkle_manifest(file_path, chunk_size=chunk_size) + output = Path(output_path) + with output.open("w", encoding="utf-8") as f: + json.dump(manifest, f, indent=2) + return manifest + + +def generate_merkle_proof( + file_path: Union[str, Path], + chunk_index: int, + *, + chunk_size: int = MERKLE_CHUNK_SIZE_BYTES, +) -> List[Dict[str, Any]]: + manifest = build_merkle_manifest(file_path, chunk_size=chunk_size) + if manifest["chunk_count"] == 0: + raise ValueError("Cannot generate a Merkle proof for an empty file") + if chunk_index < 0 or chunk_index >= manifest["chunk_count"]: + raise IndexError("chunk_index out of range") + + level = [bytes.fromhex(chunk["sha256"]) for chunk in manifest["chunks"]] + proof = [] + index = chunk_index + while len(level) > 1: + if len(level) % 2 == 1: + level.append(level[-1]) + + sibling_index = index ^ 1 + proof.append( + { + "sibling_sha256": level[sibling_index].hex(), + "sibling_position": "left" if sibling_index < index else "right", + } + ) + + next_level = [] + for i in range(0, len(level), 2): + next_level.append(_merkle_parent(level[i], level[i + 1])) + index //= 2 + level = next_level + return proof + + +def verify_merkle_proof( + chunk_bytes: bytes, + proof: List[Dict[str, Any]], + expected_root: str, +) -> bool: + try: + current = compute_sha256_bytes(data=chunk_bytes) + expected = bytes.fromhex(expected_root) + except (TypeError, ValueError): + return False + + for step in proof: + try: + sibling = bytes.fromhex(step["sibling_sha256"]) + position = step["sibling_position"] + except (KeyError, TypeError, ValueError): + return False + + if len(sibling) != hashlib.sha256().digest_size: + return False + if position == "left": + current = _merkle_parent(sibling, current) + elif position == "right": + current = _merkle_parent(current, sibling) + else: + return False + + return current == expected + + +def _stable_cpu_state_dict(model: "torch.nn.Module") -> Dict[str, "torch.Tensor"]: + state = model.state_dict() + return { + name: tensor.detach().cpu().contiguous() + for name, tensor in sorted(state.items(), key=lambda item: item[0]) + } + + +def save_model_safetensors( + model: "torch.nn.Module", + output_path: Union[str, Path] = CHECKPOINT_WEIGHTS_PATH, + *, + metadata: Optional[Dict[str, str]] = None, +) -> Path: + try: + from safetensors.torch import save_file + except ImportError as exc: + raise RuntimeError( + "safetensors is required to write stable model artifacts. " + "Install dependencies with `pip install -r requirements.txt`." + ) from exc + + output = Path(output_path) + save_file(_stable_cpu_state_dict(model), str(output), metadata=metadata) + return output + + +def load_model_safetensors( + model: "torch.nn.Module", + input_path: Union[str, Path] = CHECKPOINT_WEIGHTS_PATH, + *, + device: Optional["torch.device"] = None, +) -> "torch.nn.Module": + input_file = Path(input_path) + if not input_file.exists(): + raise FileNotFoundError(f"Safetensors artifact not found: {input_file}") + + try: + from safetensors.torch import load_file + except ImportError as exc: + raise RuntimeError( + "safetensors is required to read stable model artifacts. " + "Install dependencies with `pip install -r requirements.txt`." + ) from exc + + state = load_file(str(input_file), device=str(device) if device is not None else "cpu") + model.load_state_dict(state) + return model diff --git a/src/eval.py b/src/eval.py index 260e706..57214a0 100644 --- a/src/eval.py +++ b/src/eval.py @@ -9,18 +9,15 @@ from main import set_seed from config import TRAIN_CONFIG from device import get_device +from artifacts import CHECKPOINT_WEIGHTS_PATH, hash_json, load_model_safetensors, model_parameters_sha256 DEVICE = get_device() def hash_model(model): - h = hashlib.sha256() - for p in model.parameters(): - h.update(p.data.cpu().numpy().tobytes()) - return h.hexdigest() + return model_parameters_sha256(model) def hash_dict(d): - encoded = json.dumps(d, sort_keys=True).encode() - return hashlib.sha256(encoded).hexdigest() + return hash_json(d) if __name__ == "__main__": set_seed(TRAIN_CONFIG["seed"]) @@ -70,9 +67,10 @@ def hash_dict(d): eval_data_hash = hashlib.sha256(dataset.encoded.numpy().tobytes()).hexdigest() - # Build manifest — hash is computed over content, not including itself + # Build manifest: hash is computed over content, not including itself. manifest = { "model_checkpoint_hash": model_hash, + "model_checkpoint_source": checkpoint_source, "eval_dataset": eval_data_hash, "eval_loss": loss.item(), "perplexity": perplexity, @@ -86,4 +84,4 @@ def hash_dict(d): json.dump(manifest, f, indent=2) print(f"\n ~> Manifest saved to {os.path.normpath(manifest_path)}") - print(json.dumps(manifest, indent=2)) \ No newline at end of file + print(json.dumps(manifest, indent=2)) diff --git a/src/global_manifest.py b/src/global_manifest.py index f0674e7..ceae37a 100644 --- a/src/global_manifest.py +++ b/src/global_manifest.py @@ -4,13 +4,21 @@ import sys import platform import os +from pathlib import Path from dataset import TinyDataset from config import TRAIN_CONFIG, get_config_hash +from artifacts import ( + CHECKPOINT_MERKLE_PATH, + CHECKPOINT_STATE_PATH, + CHECKPOINT_WEIGHTS_PATH, + build_merkle_manifest, + compute_sha256, + hash_json, +) def hash_dict(d): # Sort keys to ensure deterministic JSON stringification - encoded = json.dumps(d, sort_keys=True).encode() - return hashlib.sha256(encoded).hexdigest() + return hash_json(d) def generate_global_manifest(): if not os.path.exists("eval_manifest.json"): @@ -54,6 +62,10 @@ def generate_global_manifest(): "2_training_config_hash": config_hash, "3_dataset_hash": dataset_hash, "4_model_checkpoint_hash": model_hash, + "4_model_checkpoint_artifact": model_artifact, + "4_model_checkpoint_merkle_root": model_merkle["merkle_root"], + "4_model_checkpoint_chunk_size_bytes": model_merkle["chunk_size_bytes"], + "4_model_checkpoint_chunk_count": model_merkle["chunk_count"], "5_eval_manifest_hash": eval_hash, } @@ -63,8 +75,8 @@ def generate_global_manifest(): with open("pipeline_manifest.json", "w") as f: json.dump(global_manifest, f, indent=2) - print("\n ༼ つ ◕_◕ ༽つ Global Manifest Sealed:") + print("\n Global Manifest Sealed:") print(json.dumps(global_manifest, indent=2)) if __name__ == "__main__": - generate_global_manifest() \ No newline at end of file + generate_global_manifest() diff --git a/src/gpu_reproducibility_test.py b/src/gpu_reproducibility_test.py index f34fec5..d149227 100644 --- a/src/gpu_reproducibility_test.py +++ b/src/gpu_reproducibility_test.py @@ -3,7 +3,7 @@ Trains the deterministic NanoGPT twice from scratch, with no checkpoint reuse, and asserts that the two runs produce identical loss curves and bitwise-identical parameters. On CPU this reproduces the Phase 1 baseline; on a CUDA GPU it is the -Phase 3 claim — that with a pinned cuBLAS workspace and deterministic cuDNN, the +Phase 3 claim: with a pinned cuBLAS workspace and deterministic cuDNN, the *same* GPU yields the *same bits* run to run. Run from the ``src`` directory: @@ -89,9 +89,9 @@ def main(): ok = losses_match and params_match and (hash1 == hash2) if ok: - print("\n(❁ ´◡`❁) PASSED: same device is bitwise reproducible.") + print("\n[PASS] same device is bitwise reproducible.") else: - print("\n(╯°□°)╯︵ ┻━┻ FAILED: entropy detected on this device.") + print("\n[FAIL] entropy detected on this device.") _write_proof(losses1, losses2, hash1, hash2, ok) return ok diff --git a/src/reproducibility.py b/src/reproducibility.py index 92f4b16..ca86479 100644 --- a/src/reproducibility.py +++ b/src/reproducibility.py @@ -2,8 +2,6 @@ import torch.nn.functional as F import json import math -import platform, sys -import hashlib import random import numpy as np from model import TinyGPT @@ -12,16 +10,21 @@ from telemetry import TelemetryLogger from config import TRAIN_CONFIG from device import get_device, accel_rng_state, restore_accel_rng_state, device_name +from artifacts import ( + CHECKPOINT_MERKLE_PATH, + CHECKPOINT_WEIGHTS_PATH, + MERKLE_CHUNK_SIZE_BYTES, + model_parameters_sha256, + save_model_safetensors, + write_merkle_manifest, +) # CUDA when available, else CPU. The same code path runs on both; only the # floating-point reduction order (the hardware entropy under study) differs. DEVICE = get_device() def hash_model(model): - h = hashlib.sha256() - for p in model.parameters(): - h.update(p.data.cpu().numpy().tobytes()) - return h.hexdigest() + return model_parameters_sha256(model) def run_training_segment(start_step, end_step, checkpoint_path_to_load=None, log_file="audit.jsonl", seed=None, tamper_weights=False): @@ -88,6 +91,20 @@ def run_training_segment(start_step, end_step, checkpoint_path_to_load=None, log if not checkpoint_path_to_load and step == (TRAIN_CONFIG["checkpoint_step"] - 1): current_model_hash = logger.hash_model(model) + weights_path = save_model_safetensors( + model, + CHECKPOINT_WEIGHTS_PATH, + metadata={ + "format": "pt-state-dict", + "tensor_sha256": current_model_hash, + "checkpoint_step": str(TRAIN_CONFIG["checkpoint_step"]), + }, + ) + merkle_manifest = write_merkle_manifest( + weights_path, + CHECKPOINT_MERKLE_PATH, + chunk_size=MERKLE_CHUNK_SIZE_BYTES, + ) torch.save({ 'model': model.state_dict(), @@ -96,9 +113,14 @@ def run_training_segment(start_step, end_step, checkpoint_path_to_load=None, log 'accel_rng_state': accel_rng_state(), # None on CPU; tagged per-backend on GPU 'numpy_rng': np.random.get_state(), 'python_rng': random.getstate(), - 'checkpoint_hash': current_model_hash + 'checkpoint_hash': current_model_hash, + 'safetensors_path': str(weights_path), + 'safetensors_sha256': merkle_manifest["sha256"], + 'safetensors_merkle_root': merkle_manifest["merkle_root"], + 'merkle_chunk_size_bytes': merkle_manifest["chunk_size_bytes"], }, "mid_checkpoint.pt") print(f" ~> Prover saved checkpoint at step {TRAIN_CONFIG['checkpoint_step']}") + print(f" ~> Stable weights: {weights_path} | Merkle root: {merkle_manifest['merkle_root'][:16]}...") return model @@ -246,7 +268,7 @@ def verify(prover_segment, auditor_logs, prover_hash, auditor_hash, label="AUDIT print(f"\n[Verifying: {label}]") if len(prover_segment) != len(auditor_logs): - print(f"Log length mismatch — prover: {len(prover_segment)}, auditor: {len(auditor_logs)}") + print(f"Log length mismatch: prover={len(prover_segment)}, auditor={len(auditor_logs)}") return False match = True @@ -260,7 +282,7 @@ def verify(prover_segment, auditor_logs, prover_hash, auditor_hash, label="AUDIT if not step_ok: match = False delta = abs(p['loss'] - a['loss']) - print(f"Step {p['step']} | Prover: {p['loss']:.8f} | Auditor: {a['loss']:.8f} | Δ {delta:.2e} FAILED") + print(f"Step {p['step']} | Prover: {p['loss']:.8f} | Auditor: {a['loss']:.8f} | delta {delta:.2e} FAILED") else: print(f"Step {p['step']} | Prover: {p['loss']:.8f} | Auditor: {a['loss']:.8f} | PASSED") @@ -269,9 +291,9 @@ def verify(prover_segment, auditor_logs, prover_hash, auditor_hash, label="AUDIT print(f"\n Hash mismatch! Prover hash: {prover_hash[:16]} // Auditor hash: {auditor_hash[:16]} [HASH ERROR]") if match and hash_match: - print(f"\n (❁ ´◡`❁) {label} PASSED: Segment replay is bitwise deterministic.") + print(f"\n [PASS] {label}: Segment replay is bitwise deterministic.") else: - print(f"\n (╯°□°)╯︵ ┻━┻ {label} FAILED: Trajectories diverged.") + print(f"\n [FAIL] {label}: Trajectories diverged.") # Return full verification verdict instead of just telemetry match verification_result = { @@ -292,7 +314,7 @@ def verify(prover_segment, auditor_logs, prover_hash, auditor_hash, label="AUDIT if DEVICE.type == "cuda": print(" Phase 3: strict GPU determinism (cuDNN deterministic + pinned cuBLAS workspace)") elif DEVICE.type == "xpu": - print(" Phase 3: Intel XPU (oneAPI) — deterministic algorithms enabled (best-effort)") + print(" Phase 3: Intel XPU (oneAPI): deterministic algorithms enabled (best-effort)") # Baseline: should pass print("\n Scenario 1: CLEAN AUDIT ") @@ -379,7 +401,7 @@ def verify(prover_segment, auditor_logs, prover_hash, auditor_hash, label="AUDIT prover_segment = prover_logs[5:10] if len(prover_segment) != 5 or len(auditor_logs) != 5: - print(f"Log length mismatch — prover_segment: {len(prover_segment)}, auditor: {len(auditor_logs)}") + print(f"Log length mismatch: prover_segment={len(prover_segment)}, auditor={len(auditor_logs)}") print("Cannot verify. Check for crashes or early exits in training.") else: match = True @@ -396,9 +418,9 @@ def verify(prover_segment, auditor_logs, prover_hash, auditor_hash, label="AUDIT print(f"Step {p['step']} | Prover Loss: {p['loss']:.6f} | Auditor Loss: {a['loss']:.6f} {status}") if match: - print("\n (❁ ´◡`❁) \nAUDIT PASSED: Segment replay is bitwise deterministic.") + print("\n[PASS]\nAUDIT PASSED: Segment replay is bitwise deterministic.") else: - print("\n (╯°□°)╯︵ ┻━┻ \nAUDIT FAILED: Trajectories diverged.") + print("\n[FAIL]\nAUDIT FAILED: Trajectories diverged.") ''' #Reproducibility test for tinyGPT without Segment Verification test @@ -456,4 +478,4 @@ def train_once(): else: print ("\nFailure: Entropy led to non-deterministic behavior") -''' \ No newline at end of file +''' diff --git a/src/telemetry.py b/src/telemetry.py index 64e543e..0e352b5 100644 --- a/src/telemetry.py +++ b/src/telemetry.py @@ -34,7 +34,4 @@ def log_step(self, step, loss, model): return record def hash_model(self, model): - h = hashlib.sha256() - for p in model.parameters(): - h.update(p.data.cpu().numpy().tobytes()) - return h.hexdigest() \ No newline at end of file + return model_parameters_sha256(model) diff --git a/tests/test_artifacts.py b/tests/test_artifacts.py new file mode 100644 index 0000000..ac861e4 --- /dev/null +++ b/tests/test_artifacts.py @@ -0,0 +1,55 @@ +import sys +import tempfile +import unittest +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "src")) + +from artifacts import ( # noqa: E402 + build_merkle_manifest, + compute_sha256, + generate_merkle_proof, + merkle_root_from_leaf_hashes, + verify_merkle_proof, +) + + +class ArtifactMerkleTests(unittest.TestCase): + def test_merkle_manifest_matches_manual_root(self): + with tempfile.TemporaryDirectory() as tmp: + path = Path(tmp) / "artifact.bin" + path.write_bytes(b"abcdefghij") + + manifest = build_merkle_manifest(path, chunk_size=4) + leaves = [ + compute_sha256(data=b"abcd"), + compute_sha256(data=b"efgh"), + compute_sha256(data=b"ij"), + ] + + self.assertEqual(manifest["chunk_count"], 3) + self.assertEqual(manifest["merkle_root"], merkle_root_from_leaf_hashes(leaves)) + self.assertEqual(manifest["sha256"], compute_sha256(file_path=path)) + + def test_merkle_proof_verifies_chunk_and_rejects_tampering(self): + with tempfile.TemporaryDirectory() as tmp: + path = Path(tmp) / "artifact.bin" + path.write_bytes(b"abcdefghij") + manifest = build_merkle_manifest(path, chunk_size=4) + proof = generate_merkle_proof(path, 1, chunk_size=4) + + self.assertTrue(verify_merkle_proof(b"efgh", proof, manifest["merkle_root"])) + self.assertFalse(verify_merkle_proof(b"EFGH", proof, manifest["merkle_root"])) + + def test_empty_file_root_is_empty_sha256(self): + with tempfile.TemporaryDirectory() as tmp: + path = Path(tmp) / "empty.bin" + path.write_bytes(b"") + manifest = build_merkle_manifest(path, chunk_size=4) + + self.assertEqual(manifest["chunk_count"], 0) + self.assertEqual(manifest["merkle_root"], compute_sha256(data=b"")) + + +if __name__ == "__main__": + unittest.main()