From 1f0643d4359a71cbd3d460f357b9c383abde9bf7 Mon Sep 17 00:00:00 2001 From: Rajat Date: Sun, 14 Jun 2026 04:02:45 +0530 Subject: [PATCH 1/2] feature: safetensors and merkle hashing implemented --- .gitignore | 4 + requirements.txt | 3 +- src/artifacts.py | 234 ++++++++++++++++++++++++++++++++ src/eval.py | 25 ++-- src/global_manifest.py | 37 ++++- src/gpu_reproducibility_test.py | 6 +- src/reproducibility.py | 54 +++++--- src/telemetry.py | 11 +- tests/test_artifacts.py | 55 ++++++++ 9 files changed, 383 insertions(+), 46 deletions(-) create mode 100644 src/artifacts.py create mode 100644 tests/test_artifacts.py diff --git a/.gitignore b/.gitignore index e2ed4f2..0116b0b 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,7 @@ __pycache__/ *.pt *.pth *.jsonl +eval_manifest.json +pipeline_manifest.json +mid_checkpoint.safetensors +mid_checkpoint.merkle.json diff --git a/requirements.txt b/requirements.txt index 5525d8c..14edbf1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ torch>=2.10.0,<3.0 numpy>=2.4,<3.0 tqdm>=4.67,<5.0 +safetensors>=0.4.5,<1.0 # This default torch wheel is CPU-only (or CUDA, depending on your platform). # For an Intel GPU (Iris Xe / Arc), install the XPU build instead — see README # "Locally on an Intel GPU": -# pip install torch --index-url https://download.pytorch.org/whl/xpu \ No newline at end of file +# pip install torch --index-url https://download.pytorch.org/whl/xpu diff --git a/src/artifacts.py b/src/artifacts.py new file mode 100644 index 0000000..6105e2b --- /dev/null +++ b/src/artifacts.py @@ -0,0 +1,234 @@ +import hashlib +import json +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +if TYPE_CHECKING: + import torch + +MERKLE_CHUNK_SIZE_BYTES = 1024 * 1024 + +CHECKPOINT_STATE_PATH = "mid_checkpoint.pt" +CHECKPOINT_WEIGHTS_PATH = "mid_checkpoint.safetensors" +CHECKPOINT_MERKLE_PATH = "mid_checkpoint.merkle.json" + + +def hash_json(data: Any) -> str: + encoded = json.dumps(data, sort_keys=True).encode() + return hashlib.sha256(encoded).hexdigest() + + +def compute_sha256_bytes( + *, + data: Optional[Union[bytes, bytearray]] = None, + file_path: Optional[Union[str, Path]] = None, +) -> bytes: + if (data is None) == (file_path is None): + raise ValueError("Exactly one of data or file_path must be provided") + + h = hashlib.sha256() + if data is not None: + h.update(data) + return h.digest() + + with Path(file_path).open("rb") as f: + while chunk := f.read(1024 * 1024): + h.update(chunk) + return h.digest() + + +def compute_sha256( + *, + data: Optional[Union[bytes, bytearray]] = None, + file_path: Optional[Union[str, Path]] = None, +) -> str: + return compute_sha256_bytes(data=data, file_path=file_path).hex() + + +def model_parameters_sha256(model: "torch.nn.Module") -> str: + h = hashlib.sha256() + for param in model.parameters(): + h.update(param.detach().cpu().numpy().tobytes()) + return h.hexdigest() + + +def _merkle_parent(left: bytes, right: bytes) -> bytes: + return compute_sha256_bytes(data=left + right) + + +def merkle_root_from_leaf_hashes(leaf_hashes: List[str]) -> str: + if not leaf_hashes: + return compute_sha256(data=b"") + + level = [bytes.fromhex(leaf) for leaf in leaf_hashes] + while len(level) > 1: + next_level = [] + for i in range(0, len(level), 2): + left = level[i] + right = level[i + 1] if i + 1 < len(level) else left + next_level.append(_merkle_parent(left, right)) + level = next_level + return level[0].hex() + + +def build_merkle_manifest( + file_path: Union[str, Path], + *, + chunk_size: int = MERKLE_CHUNK_SIZE_BYTES, +) -> Dict[str, Any]: + if chunk_size <= 0: + raise ValueError("chunk_size must be a positive integer") + + path = Path(file_path) + chunks = [] + offset = 0 + with path.open("rb") as f: + while chunk := f.read(chunk_size): + chunks.append( + { + "index": len(chunks), + "offset": offset, + "size": len(chunk), + "sha256": compute_sha256(data=chunk), + } + ) + offset += len(chunk) + + leaf_hashes = [chunk["sha256"] for chunk in chunks] + return { + "artifact": path.name, + "size_bytes": path.stat().st_size, + "sha256": compute_sha256(file_path=path), + "chunk_size_bytes": chunk_size, + "chunk_count": len(chunks), + "merkle_root": merkle_root_from_leaf_hashes(leaf_hashes), + "chunks": chunks, + } + + +def write_merkle_manifest( + file_path: Union[str, Path], + output_path: Union[str, Path], + *, + chunk_size: int = MERKLE_CHUNK_SIZE_BYTES, +) -> Dict[str, Any]: + manifest = build_merkle_manifest(file_path, chunk_size=chunk_size) + output = Path(output_path) + with output.open("w", encoding="utf-8") as f: + json.dump(manifest, f, indent=2) + return manifest + + +def generate_merkle_proof( + file_path: Union[str, Path], + chunk_index: int, + *, + chunk_size: int = MERKLE_CHUNK_SIZE_BYTES, +) -> List[Dict[str, Any]]: + manifest = build_merkle_manifest(file_path, chunk_size=chunk_size) + if manifest["chunk_count"] == 0: + raise ValueError("Cannot generate a Merkle proof for an empty file") + if chunk_index < 0 or chunk_index >= manifest["chunk_count"]: + raise IndexError("chunk_index out of range") + + level = [bytes.fromhex(chunk["sha256"]) for chunk in manifest["chunks"]] + proof = [] + index = chunk_index + while len(level) > 1: + if len(level) % 2 == 1: + level.append(level[-1]) + + sibling_index = index ^ 1 + proof.append( + { + "sibling_sha256": level[sibling_index].hex(), + "sibling_position": "left" if sibling_index < index else "right", + } + ) + + next_level = [] + for i in range(0, len(level), 2): + next_level.append(_merkle_parent(level[i], level[i + 1])) + index //= 2 + level = next_level + return proof + + +def verify_merkle_proof( + chunk_bytes: bytes, + proof: List[Dict[str, Any]], + expected_root: str, +) -> bool: + try: + current = compute_sha256_bytes(data=chunk_bytes) + expected = bytes.fromhex(expected_root) + except (TypeError, ValueError): + return False + + for step in proof: + try: + sibling = bytes.fromhex(step["sibling_sha256"]) + position = step["sibling_position"] + except (KeyError, TypeError, ValueError): + return False + + if len(sibling) != hashlib.sha256().digest_size: + return False + if position == "left": + current = _merkle_parent(sibling, current) + elif position == "right": + current = _merkle_parent(current, sibling) + else: + return False + + return current == expected + + +def _stable_cpu_state_dict(model: "torch.nn.Module") -> Dict[str, "torch.Tensor"]: + state = model.state_dict() + return { + name: tensor.detach().cpu().contiguous() + for name, tensor in sorted(state.items(), key=lambda item: item[0]) + } + + +def save_model_safetensors( + model: "torch.nn.Module", + output_path: Union[str, Path] = CHECKPOINT_WEIGHTS_PATH, + *, + metadata: Optional[Dict[str, str]] = None, +) -> Path: + try: + from safetensors.torch import save_file + except ImportError as exc: + raise RuntimeError( + "safetensors is required to write stable model artifacts. " + "Install dependencies with `pip install -r requirements.txt`." + ) from exc + + output = Path(output_path) + save_file(_stable_cpu_state_dict(model), str(output), metadata=metadata) + return output + + +def load_model_safetensors( + model: "torch.nn.Module", + input_path: Union[str, Path] = CHECKPOINT_WEIGHTS_PATH, + *, + device: Optional["torch.device"] = None, +) -> "torch.nn.Module": + input_file = Path(input_path) + if not input_file.exists(): + raise FileNotFoundError(f"Safetensors artifact not found: {input_file}") + + try: + from safetensors.torch import load_file + except ImportError as exc: + raise RuntimeError( + "safetensors is required to read stable model artifacts. " + "Install dependencies with `pip install -r requirements.txt`." + ) from exc + + state = load_file(str(input_file), device=str(device) if device is not None else "cpu") + model.load_state_dict(state) + return model diff --git a/src/eval.py b/src/eval.py index 6e9f2f3..20f5c33 100644 --- a/src/eval.py +++ b/src/eval.py @@ -8,18 +8,15 @@ from main import set_seed from config import TRAIN_CONFIG from device import get_device +from artifacts import CHECKPOINT_WEIGHTS_PATH, hash_json, load_model_safetensors, model_parameters_sha256 DEVICE = get_device() def hash_model(model): - h = hashlib.sha256() - for p in model.parameters(): - h.update(p.data.cpu().numpy().tobytes()) - return h.hexdigest() + return model_parameters_sha256(model) def hash_dict(d): - encoded = json.dumps(d, sort_keys=True).encode() - return hashlib.sha256(encoded).hexdigest() + return hash_json(d) if __name__ == "__main__": set_seed(TRAIN_CONFIG["seed"]) @@ -33,12 +30,17 @@ def hash_dict(d): dropout=TRAIN_CONFIG["dropout"] ).to(DEVICE) - checkpoint = torch.load("mid_checkpoint.pt", weights_only=False, map_location=DEVICE) - model.load_state_dict(checkpoint['model']) + try: + load_model_safetensors(model, CHECKPOINT_WEIGHTS_PATH, device=DEVICE) + checkpoint_source = str(CHECKPOINT_WEIGHTS_PATH) + except FileNotFoundError: + checkpoint = torch.load("mid_checkpoint.pt", weights_only=False, map_location=DEVICE) + model.load_state_dict(checkpoint['model']) + checkpoint_source = "mid_checkpoint.pt" model.eval() # disabling dropout for eval as results must be deterministic model_hash = hash_model(model) - print(f" ~> Model loaded | checkpoint hash: {model_hash[:16]}...") + print(f" ~> Model loaded from {checkpoint_source} | checkpoint hash: {model_hash[:16]}...") # Held-out eval which is never seen during training x, y = dataset.get_batch() @@ -55,9 +57,10 @@ def hash_dict(d): eval_data_hash = hashlib.sha256(dataset.encoded.numpy().tobytes()).hexdigest() - # Build manifest — hash is computed over content, not including itself + # Build manifest: hash is computed over content, not including itself. manifest = { "model_checkpoint_hash": model_hash, + "model_checkpoint_source": checkpoint_source, "eval_dataset": eval_data_hash, "eval_loss": loss.item(), "perplexity": perplexity, @@ -68,4 +71,4 @@ def hash_dict(d): json.dump(manifest, f, indent=2) print(f"\n ~> Manifest saved to eval_manifest.json") - print(json.dumps(manifest, indent=2)) \ No newline at end of file + print(json.dumps(manifest, indent=2)) diff --git a/src/global_manifest.py b/src/global_manifest.py index 4250506..11d1d43 100644 --- a/src/global_manifest.py +++ b/src/global_manifest.py @@ -4,13 +4,21 @@ import sys import platform import os +from pathlib import Path from dataset import TinyDataset from config import TRAIN_CONFIG, get_config_hash +from artifacts import ( + CHECKPOINT_MERKLE_PATH, + CHECKPOINT_STATE_PATH, + CHECKPOINT_WEIGHTS_PATH, + build_merkle_manifest, + compute_sha256, + hash_json, +) def hash_dict(d): # Sort keys to ensure deterministic JSON stringification - encoded = json.dumps(d, sort_keys=True).encode() - return hashlib.sha256(encoded).hexdigest() + return hash_json(d) def generate_global_manifest(): if not os.path.exists("eval_manifest.json"): @@ -33,9 +41,20 @@ def generate_global_manifest(): dataset = TinyDataset() dataset_hash = hashlib.sha256(dataset.encoded.numpy().tobytes()).hexdigest() - # 4. Model Hash - with open("mid_checkpoint.pt", "rb") as f: - model_hash = hashlib.sha256(f.read()).hexdigest() + # 4. Model artifact hash. Prefer safetensors because it is byte-stable; + # keep the .pt fallback for older runs that only have replay checkpoints. + model_artifact_path = Path( + CHECKPOINT_WEIGHTS_PATH + if os.path.exists(CHECKPOINT_WEIGHTS_PATH) + else CHECKPOINT_STATE_PATH + ) + model_artifact = str(model_artifact_path) + model_hash = compute_sha256(file_path=model_artifact_path) + if os.path.exists(CHECKPOINT_MERKLE_PATH): + with open(CHECKPOINT_MERKLE_PATH, "r", encoding="utf-8") as f: + model_merkle = json.load(f) + else: + model_merkle = build_merkle_manifest(model_artifact_path) # 5. Eval Manifest Hash (run eval.py before this script) with open("eval_manifest.json", "r") as f: @@ -48,6 +67,10 @@ def generate_global_manifest(): "2_training_config_hash": config_hash, "3_dataset_hash": dataset_hash, "4_model_checkpoint_hash": model_hash, + "4_model_checkpoint_artifact": model_artifact, + "4_model_checkpoint_merkle_root": model_merkle["merkle_root"], + "4_model_checkpoint_chunk_size_bytes": model_merkle["chunk_size_bytes"], + "4_model_checkpoint_chunk_count": model_merkle["chunk_count"], "5_eval_manifest_hash": eval_hash, } @@ -57,8 +80,8 @@ def generate_global_manifest(): with open("pipeline_manifest.json", "w") as f: json.dump(global_manifest, f, indent=2) - print("\n ༼ つ ◕_◕ ༽つ Global Manifest Sealed:") + print("\n Global Manifest Sealed:") print(json.dumps(global_manifest, indent=2)) if __name__ == "__main__": - generate_global_manifest() \ No newline at end of file + generate_global_manifest() diff --git a/src/gpu_reproducibility_test.py b/src/gpu_reproducibility_test.py index f34fec5..d149227 100644 --- a/src/gpu_reproducibility_test.py +++ b/src/gpu_reproducibility_test.py @@ -3,7 +3,7 @@ Trains the deterministic NanoGPT twice from scratch, with no checkpoint reuse, and asserts that the two runs produce identical loss curves and bitwise-identical parameters. On CPU this reproduces the Phase 1 baseline; on a CUDA GPU it is the -Phase 3 claim — that with a pinned cuBLAS workspace and deterministic cuDNN, the +Phase 3 claim: with a pinned cuBLAS workspace and deterministic cuDNN, the *same* GPU yields the *same bits* run to run. Run from the ``src`` directory: @@ -89,9 +89,9 @@ def main(): ok = losses_match and params_match and (hash1 == hash2) if ok: - print("\n(❁ ´◡`❁) PASSED: same device is bitwise reproducible.") + print("\n[PASS] same device is bitwise reproducible.") else: - print("\n(╯°□°)╯︵ ┻━┻ FAILED: entropy detected on this device.") + print("\n[FAIL] entropy detected on this device.") _write_proof(losses1, losses2, hash1, hash2, ok) return ok diff --git a/src/reproducibility.py b/src/reproducibility.py index f635982..bddd422 100644 --- a/src/reproducibility.py +++ b/src/reproducibility.py @@ -2,8 +2,6 @@ import torch.nn.functional as F import json import math -import platform, sys -import hashlib import random import numpy as np from model import TinyGPT @@ -12,16 +10,21 @@ from telemetry import TelemetryLogger from config import TRAIN_CONFIG from device import get_device, accel_rng_state, restore_accel_rng_state, device_name +from artifacts import ( + CHECKPOINT_MERKLE_PATH, + CHECKPOINT_WEIGHTS_PATH, + MERKLE_CHUNK_SIZE_BYTES, + model_parameters_sha256, + save_model_safetensors, + write_merkle_manifest, +) # CUDA when available, else CPU. The same code path runs on both; only the # floating-point reduction order (the hardware entropy under study) differs. DEVICE = get_device() def hash_model(model): - h = hashlib.sha256() - for p in model.parameters(): - h.update(p.data.cpu().numpy().tobytes()) - return h.hexdigest() + return model_parameters_sha256(model) def run_training_segment(start_step, end_step, checkpoint_path_to_load=None, log_file="audit.jsonl", seed=None, tamper_weights=False): @@ -80,6 +83,20 @@ def run_training_segment(start_step, end_step, checkpoint_path_to_load=None, log if not checkpoint_path_to_load and step == (TRAIN_CONFIG["checkpoint_step"] - 1): current_model_hash = logger.hash_model(model) + weights_path = save_model_safetensors( + model, + CHECKPOINT_WEIGHTS_PATH, + metadata={ + "format": "pt-state-dict", + "tensor_sha256": current_model_hash, + "checkpoint_step": str(TRAIN_CONFIG["checkpoint_step"]), + }, + ) + merkle_manifest = write_merkle_manifest( + weights_path, + CHECKPOINT_MERKLE_PATH, + chunk_size=MERKLE_CHUNK_SIZE_BYTES, + ) torch.save({ 'model': model.state_dict(), @@ -88,9 +105,14 @@ def run_training_segment(start_step, end_step, checkpoint_path_to_load=None, log 'accel_rng_state': accel_rng_state(), # None on CPU; tagged per-backend on GPU 'numpy_rng': np.random.get_state(), 'python_rng': random.getstate(), - 'checkpoint_hash': current_model_hash + 'checkpoint_hash': current_model_hash, + 'safetensors_path': str(weights_path), + 'safetensors_sha256': merkle_manifest["sha256"], + 'safetensors_merkle_root': merkle_manifest["merkle_root"], + 'merkle_chunk_size_bytes': merkle_manifest["chunk_size_bytes"], }, "mid_checkpoint.pt") print(f" ~> Prover saved checkpoint at step {TRAIN_CONFIG['checkpoint_step']}") + print(f" ~> Stable weights: {weights_path} | Merkle root: {merkle_manifest['merkle_root'][:16]}...") return model @@ -217,7 +239,7 @@ def verify(prover_segment, auditor_logs, prover_hash, auditor_hash, label="AUDIT print(f"\n[Verifying: {label}]") if len(prover_segment) != len(auditor_logs): - print(f"Log length mismatch — prover: {len(prover_segment)}, auditor: {len(auditor_logs)}") + print(f"Log length mismatch: prover={len(prover_segment)}, auditor={len(auditor_logs)}") return False match = True @@ -231,7 +253,7 @@ def verify(prover_segment, auditor_logs, prover_hash, auditor_hash, label="AUDIT if not step_ok: match = False delta = abs(p['loss'] - a['loss']) - print(f"Step {p['step']} | Prover: {p['loss']:.8f} | Auditor: {a['loss']:.8f} | Δ {delta:.2e} FAILED") + print(f"Step {p['step']} | Prover: {p['loss']:.8f} | Auditor: {a['loss']:.8f} | delta {delta:.2e} FAILED") else: print(f"Step {p['step']} | Prover: {p['loss']:.8f} | Auditor: {a['loss']:.8f} | PASSED") @@ -240,9 +262,9 @@ def verify(prover_segment, auditor_logs, prover_hash, auditor_hash, label="AUDIT print(f"\n Hash mismatch! Prover hash: {prover_hash[:16]} // Auditor hash: {auditor_hash[:16]} [HASH ERROR]") if match and hash_match: - print(f"\n (❁ ´◡`❁) {label} PASSED: Segment replay is bitwise deterministic.") + print(f"\n [PASS] {label}: Segment replay is bitwise deterministic.") else: - print(f"\n (╯°□°)╯︵ ┻━┻ {label} FAILED: Trajectories diverged.") + print(f"\n [FAIL] {label}: Trajectories diverged.") return match @@ -254,7 +276,7 @@ def verify(prover_segment, auditor_logs, prover_hash, auditor_hash, label="AUDIT if DEVICE.type == "cuda": print(" Phase 3: strict GPU determinism (cuDNN deterministic + pinned cuBLAS workspace)") elif DEVICE.type == "xpu": - print(" Phase 3: Intel XPU (oneAPI) — deterministic algorithms enabled (best-effort)") + print(" Phase 3: Intel XPU (oneAPI): deterministic algorithms enabled (best-effort)") # Baseline: should pass print("\n Scenario 1: CLEAN AUDIT ") @@ -341,7 +363,7 @@ def verify(prover_segment, auditor_logs, prover_hash, auditor_hash, label="AUDIT prover_segment = prover_logs[5:10] if len(prover_segment) != 5 or len(auditor_logs) != 5: - print(f"Log length mismatch — prover_segment: {len(prover_segment)}, auditor: {len(auditor_logs)}") + print(f"Log length mismatch: prover_segment={len(prover_segment)}, auditor={len(auditor_logs)}") print("Cannot verify. Check for crashes or early exits in training.") else: match = True @@ -358,9 +380,9 @@ def verify(prover_segment, auditor_logs, prover_hash, auditor_hash, label="AUDIT print(f"Step {p['step']} | Prover Loss: {p['loss']:.6f} | Auditor Loss: {a['loss']:.6f} {status}") if match: - print("\n (❁ ´◡`❁) \nAUDIT PASSED: Segment replay is bitwise deterministic.") + print("\n[PASS]\nAUDIT PASSED: Segment replay is bitwise deterministic.") else: - print("\n (╯°□°)╯︵ ┻━┻ \nAUDIT FAILED: Trajectories diverged.") + print("\n[FAIL]\nAUDIT FAILED: Trajectories diverged.") ''' #Reproducibility test for tinyGPT without Segment Verification test @@ -418,4 +440,4 @@ def train_once(): else: print ("\nFailure: Entropy led to non-deterministic behavior") -''' \ No newline at end of file +''' diff --git a/src/telemetry.py b/src/telemetry.py index 7770ae9..2ee9542 100644 --- a/src/telemetry.py +++ b/src/telemetry.py @@ -1,8 +1,6 @@ import json -from xml.parsers.expat import model -import torch -import os -import hashlib + +from artifacts import model_parameters_sha256 class TelemetryLogger: def __init__(self, filepath="audit_log.jsonl"): @@ -35,7 +33,4 @@ def log_step(self, step, loss, model): return record def hash_model(self, model): - h = hashlib.sha256() - for p in model.parameters(): - h.update(p.data.cpu().numpy().tobytes()) - return h.hexdigest() \ No newline at end of file + return model_parameters_sha256(model) diff --git a/tests/test_artifacts.py b/tests/test_artifacts.py new file mode 100644 index 0000000..ac861e4 --- /dev/null +++ b/tests/test_artifacts.py @@ -0,0 +1,55 @@ +import sys +import tempfile +import unittest +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "src")) + +from artifacts import ( # noqa: E402 + build_merkle_manifest, + compute_sha256, + generate_merkle_proof, + merkle_root_from_leaf_hashes, + verify_merkle_proof, +) + + +class ArtifactMerkleTests(unittest.TestCase): + def test_merkle_manifest_matches_manual_root(self): + with tempfile.TemporaryDirectory() as tmp: + path = Path(tmp) / "artifact.bin" + path.write_bytes(b"abcdefghij") + + manifest = build_merkle_manifest(path, chunk_size=4) + leaves = [ + compute_sha256(data=b"abcd"), + compute_sha256(data=b"efgh"), + compute_sha256(data=b"ij"), + ] + + self.assertEqual(manifest["chunk_count"], 3) + self.assertEqual(manifest["merkle_root"], merkle_root_from_leaf_hashes(leaves)) + self.assertEqual(manifest["sha256"], compute_sha256(file_path=path)) + + def test_merkle_proof_verifies_chunk_and_rejects_tampering(self): + with tempfile.TemporaryDirectory() as tmp: + path = Path(tmp) / "artifact.bin" + path.write_bytes(b"abcdefghij") + manifest = build_merkle_manifest(path, chunk_size=4) + proof = generate_merkle_proof(path, 1, chunk_size=4) + + self.assertTrue(verify_merkle_proof(b"efgh", proof, manifest["merkle_root"])) + self.assertFalse(verify_merkle_proof(b"EFGH", proof, manifest["merkle_root"])) + + def test_empty_file_root_is_empty_sha256(self): + with tempfile.TemporaryDirectory() as tmp: + path = Path(tmp) / "empty.bin" + path.write_bytes(b"") + manifest = build_merkle_manifest(path, chunk_size=4) + + self.assertEqual(manifest["chunk_count"], 0) + self.assertEqual(manifest["merkle_root"], compute_sha256(data=b"")) + + +if __name__ == "__main__": + unittest.main() From 25e6188f58d456e155fafffcc73d6cccb234ec82 Mon Sep 17 00:00:00 2001 From: Rajat Roy Date: Sun, 14 Jun 2026 04:17:27 +0530 Subject: [PATCH 2/2] Update src/artifacts.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- src/artifacts.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/artifacts.py b/src/artifacts.py index 6105e2b..40e6210 100644 --- a/src/artifacts.py +++ b/src/artifacts.py @@ -82,8 +82,10 @@ def build_merkle_manifest( path = Path(file_path) chunks = [] offset = 0 + file_hasher = hashlib.sha256() with path.open("rb") as f: while chunk := f.read(chunk_size): + file_hasher.update(chunk) chunks.append( { "index": len(chunks), @@ -97,8 +99,8 @@ def build_merkle_manifest( leaf_hashes = [chunk["sha256"] for chunk in chunks] return { "artifact": path.name, - "size_bytes": path.stat().st_size, - "sha256": compute_sha256(file_path=path), + "size_bytes": offset, + "sha256": file_hasher.hexdigest(), "chunk_size_bytes": chunk_size, "chunk_count": len(chunks), "merkle_root": merkle_root_from_leaf_hashes(leaf_hashes),