From d210ee6edd2f0bf703189457ddc8014c53bc3791 Mon Sep 17 00:00:00 2001
From: westkevin12 <lvvlwest@gmail.com>
Date: Fri, 5 Jun 2026 18:46:19 -0500
Subject: [PATCH] feat: integrate tracing hooks into benchmark suite and update
 output format to support comparative analysis between standard and
 trace-enabled modes.

---
 README.md                           |  53 ++++----
 cmd/orchid-daemon/matmul_wrapper.go | 179 ++++++++++++++++++++--------
 docs/ARCHITECTURE.md                |  10 ++
 evidence/reproduced/speedups.json   |  16 ++-
 jit/jit.go                          | 101 +++++++++++++---
 jit/jit_amd64.go                    |  21 +++-
 jit/jit_test.go                     |  58 +++++++++
 7 files changed, 344 insertions(+), 94 deletions(-)

diff --git a/README.md b/README.md
index 6029cd7..f52d2ba 100644
--- a/README.md
+++ b/README.md
@@ -29,42 +29,55 @@ Project **ORCHID** is the low-level micro-architectural execution core of the RA
 
 The absolute base foundation, research primitives, and original codebase layout can be found preserved on the legacy archive branch:
 👉 **[View the Baseline Concept Code (`tree/gatchimuchio-original`)](https://github.com/DigitalServerHost/ORCHID/tree/gatchimuchio-original)**
+
 ---
 
 ## 📊 Reproduced Locality Performance
 
-Under identical, mathematically verified logical execution constraints (512x512 matrix size and double-triplicate verification), the locality-aligned memory mapping sweeps demonstrate exceptionally high performance improvements. Badges below are dynamically parsed from current timing sweeps:
+Under identical, mathematically verified logical execution constraints (512x512 matrix size and double-triplicate verification), ORCHID executes in two timing configurations. Standard Mode prioritizes raw bare-metal machine code throughput, while Trace Mode instruments execution boundaries for out-of-band ZK verification (Project VALKYRIE).
+
+| Metric | Standard Mode (Raw Execution) | Trace Mode (Verification Hook Active) |
+| :--- | :--- | :--- |
+| **Minimum Speedup** | ![Speedup Min](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.standard.min&label=Min%20Speedup&color=blue) | ![Trace Min](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.trace.min&label=Trace%20Min&color=blueviolet) |
+| **Median Speedup** | ![Speedup Median](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.standard.median&label=Median%20Speedup&color=blue) | ![Trace Median](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.trace.median&label=Trace%20Median&color=blueviolet) |
+| **Maximum Speedup** | ![Speedup Max](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.standard.max&label=Max%20Speedup&color=brightgreen) | ![Trace Max](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.trace.max&label=Trace%20Max&color=orange) |
+| **Mean Speedup** | ![Speedup Mean](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.standard.mean&label=Mean%20Speedup&color=brightgreen) | ![Trace Mean](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.trace.mean&label=Trace%20Mean&color=orange) |
+
+### 🔀 Parallel Memory Scheduler (CADENCE Scheduler)
 
-| Metric              | Speedup                                                                                                                                                                                                                                       |
-| :------------------ | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| **Minimum Speedup** | ![Speedup Min](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.min&label=Speedup%20Min&color=blue)                |
-| **Median Speedup**  | ![Speedup Median](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.median&label=Speedup%20Median&color=blueviolet) |
-| **Maximum Speedup** | ![Speedup Max](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.max&label=Speedup%20Max&color=brightgreen)         |
-| **Mean Speedup**    | ![Speedup Mean](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.mean&label=Speedup%20Mean&color=orange)           |
+The parallel role scheduler (`scheduler.go`) partitions memory operations into three distinct logical streams (B-read, C-read, A-write) using a simulated STREAM-Triad memory controller queue. Scheduling these streams onto three independent simulated memory banks reaches the theoretical parallel speedup limit of 3.0x:
 
-> [!NOTE]
-> **Understanding the Speedup Profiles:**
-> - **Physical Cache Locality (C Harness)**: The dynamic badges above measure the hardware execution speedup of cache-blocked locality-aligned loops (matrix multiplication) over flat baselines, yielding **3.0x - 3.4x** actual hardware speedups on warm cache lines.
-> - **Parallel Memory Scheduler (Go Simulator)**: The scheduler unit tests (`TestBankedSchedulerTriad`) run a software-simulated queue model (STREAM-Triad) to measure bank serialization and parallel role routing. Because STREAM-Triad partitions requests into 3 distinct logical data streams (B-read, C-read, A-write), mapping them to 3 independent memory banks achieves a theoretical parallel speedup limit of exactly **3.0x** (which the Go scheduler hits at exactly **3.000x** cycle reduction).
+![Scheduler Speedup](https://img.shields.io/badge/Scheduler%20Speedup-3.000x-brightgreen)
+
+* **Theoretical Maximum:** 3.0x cycle reduction, reached when the three memory roles (B-read, C-read, A-write) never serialize on a shared bank.
+* **Reproduced Efficiency:** The Go scheduling model hits exactly **3.000x** parallel performance speedup.
 
 ---
 
 ## 🖥️ Platform Target Support & JIT Engine
 
 Project ORCHID features a **Heterogeneous Hardware Dispatch Plane** to scale execution guarantees across multiple architectures:
-*   **Static AOT Assembly Emitters (`orchid/assembler.py`)**: Generates target-specific optimized assembly source code:
-    - **`x86_64` (AVX-512)**: 512-bit vector registers with active `prefetcht0` preloading.
-    - **`arm64` (NEON / SVE)**: NEON registers (`v0-v31`) with `prfm pldl1keep` software lookahead prefetching offsets.
-    - **`apple_amx` (Apple Silicon)**: Low-level matrix coprocessor wrapper via `amxinit`/`amxstop` instructions.
-*   **Dynamic JIT Compiler Core (`jit/`)**: Executed natively by the Go daemon, compiling matrix sizes ($N$) into memory-resident machine code at runtime. It checks host capabilities to select the optimal path:
-    - **`AVX-512` JIT Path**: Vectorized 16-way integer strides when native AVX-512 is supported.
-    - **`AVX2` JIT Path**: Vectorized 8-way VEX-encoded SIMD utilizing memory-resident broadcasts (`vpbroadcastd`) to avoid EVEX instruction page collisions on non-AVX-512 x86_64 CPUs.
-    - **`Scalar` AMD64 JIT Path**: Standard pointer execution loops.
-    - **`ARM64/Other` Fallback**: Native Go reference model to maintain execution stability.
+
+- **Static AOT Assembly Emitters (`orchid/assembler.py`)**: Generates target-specific optimized assembly source code:
+  - **`x86_64` (AVX-512)**: 512-bit vector registers with active `prefetcht0` preloading.
+  - **`arm64` (NEON / SVE)**: NEON registers (`v0-v31`) with `prfm pldl1keep` software lookahead prefetching offsets.
+  - **`apple_amx` (Apple Silicon)**: Low-level matrix coprocessor wrapper via `amxinit`/`amxstop` instructions.
+- **Dynamic JIT Compiler Core (`jit/`)**: Executed natively by the Go daemon, compiling matrix sizes ($N$) into memory-resident machine code at runtime. It checks host capabilities to select the optimal path:
+  - **`AVX-512` JIT Path**: Vectorized 16-way integer strides when native AVX-512 is supported.
+  - **`AVX2` JIT Path**: Vectorized 8-way VEX-encoded SIMD utilizing memory-resident broadcasts (`vpbroadcastd`) to avoid EVEX instruction page collisions on non-AVX-512 x86_64 CPUs.
+  - **`Scalar` AMD64 JIT Path**: Standard pointer execution loops.
+  - **`ARM64/Other` Fallback**: Native Go reference model to maintain execution stability.
 
 ### 🔒 W^X Memory Security
+
 The JIT compiler strictly enforces **Write-XOR-Execute (W^X)** memory constraints. Page memory is allocated with write permission (`syscall.PROT_WRITE`), code is generated, and then the page is transitioned to read-execute (`syscall.PROT_EXEC`) via `syscall.Mprotect` before execution.
 
+### 🛡️ Decoupled Verification & Standalone Engine
+
+Project ORCHID is designed to be **fully standalone**; developers can run the core JIT compiler and parallel scheduling runtime completely independent of any blockchain or verification logic.
+
+To maintain raw execution performance, cryptographic proof generation is decoupled from the hot path. The codebase exports runtime execution statistics and memory pointers via a lightweight, low-overhead tracing interface. Developers can register their own custom verification layers or plug into **[Project VALKYRIE](https://github.com/DigitalServerHost/VALKYRIE)**, which is ORCHID's default recommended open-source ZK-proving and verification layer.
+
 ---
 
 ## 🏛️ Centralized Architectural Design & Blueprint
diff --git a/cmd/orchid-daemon/matmul_wrapper.go b/cmd/orchid-daemon/matmul_wrapper.go
index 1b37ac9..3a3590c 100644
--- a/cmd/orchid-daemon/matmul_wrapper.go
+++ b/cmd/orchid-daemon/matmul_wrapper.go
@@ -70,20 +70,28 @@ func median(values []float64) float64 {
 	return (values[n/2-1] + values[n/2]) / 2.0
 }
 
+/**
+ * @struct benchmarkConfig
+ * @brief Bundles the matrix buffers, flush buffer and compiled kernels passed to runBenchmarkPairs.
+ */
+type benchmarkConfig struct {
+	aPtr     unsafe.Pointer ///< Pointer to matrix A
+	bPtr     unsafe.Pointer ///< Pointer to matrix B
+	cfPtr    unsafe.Pointer ///< Pointer to flat output buffer
+	clPtr    unsafe.Pointer ///< Pointer to locality output buffer
+	flushPtr unsafe.Pointer ///< Pointer to cache flushing buffer space
+	kFlat    jit.Kernel     ///< Pre-compiled JIT flat kernel
+	kLoc     jit.Kernel     ///< Pre-compiled JIT locality kernel
+}
+
 /**
  * @brief Executes pairs of flat vs locality benchmarks to measure cache speedups.
  * 
  * @param repeats Number of benchmark iterations to perform.
- * @param aPtr Pointer to matrix A.
- * @param bPtr Pointer to matrix B.
- * @param cfPtr Pointer to flat output buffer.
- * @param clPtr Pointer to locality output buffer.
- * @param flushPtr Pointer to cache flushing buffer space.
- * @param kFlat Pre-compiled JIT flat kernel.
- * @param kLoc Pre-compiled JIT locality kernel.
+ * @param cfg Pointer to benchmarkConfig payload.
  * @return Speedup values slice and printed log lines slice.
  */
-func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Pointer, kFlat, kLoc jit.Kernel) ([]float64, []string) {
+func runBenchmarkPairs(repeats int, cfg *benchmarkConfig) ([]float64, []string) {
 	var speedups []float64
 	var timingLines []string
 
@@ -93,29 +101,29 @@ func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Po
 
 		if r%2 == 0 {
 			order = "flat-first"
-			C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes))
-			C.memset(cfPtr, 0, C.size_t(Bytes))
+			C.flush_cache_c((*C.uint8_t)(cfg.flushPtr), C.size_t(FlushBytes))
+			C.memset(cfg.cfPtr, 0, C.size_t(Bytes))
 			t0 := time.Now()
-			kFlat.Execute(aPtr, bPtr, cfPtr)
+			cfg.kFlat.Execute(cfg.aPtr, cfg.bPtr, cfg.cfPtr)
 			flatSec = time.Since(t0).Seconds()
 
-			C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes))
-			C.memset(clPtr, 0, C.size_t(Bytes))
+			C.flush_cache_c((*C.uint8_t)(cfg.flushPtr), C.size_t(FlushBytes))
+			C.memset(cfg.clPtr, 0, C.size_t(Bytes))
 			t0 = time.Now()
-			kLoc.Execute(aPtr, bPtr, clPtr)
+			cfg.kLoc.Execute(cfg.aPtr, cfg.bPtr, cfg.clPtr)
 			localSec = time.Since(t0).Seconds()
 		} else {
 			order = "locality-first"
-			C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes))
-			C.memset(clPtr, 0, C.size_t(Bytes))
+			C.flush_cache_c((*C.uint8_t)(cfg.flushPtr), C.size_t(FlushBytes))
+			C.memset(cfg.clPtr, 0, C.size_t(Bytes))
 			t0 := time.Now()
-			kLoc.Execute(aPtr, bPtr, clPtr)
+			cfg.kLoc.Execute(cfg.aPtr, cfg.bPtr, cfg.clPtr)
 			localSec = time.Since(t0).Seconds()
 
-			C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes))
-			C.memset(cfPtr, 0, C.size_t(Bytes))
+			C.flush_cache_c((*C.uint8_t)(cfg.flushPtr), C.size_t(FlushBytes))
+			C.memset(cfg.cfPtr, 0, C.size_t(Bytes))
 			t0 = time.Now()
-			kFlat.Execute(aPtr, bPtr, cfPtr)
+			cfg.kFlat.Execute(cfg.aPtr, cfg.bPtr, cfg.cfPtr)
 			flatSec = time.Since(t0).Seconds()
 		}
 
@@ -131,6 +139,12 @@ func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Po
 	return speedups, timingLines
 }
 
+type benchmarkTraceHook struct{} // No-op hook registered via jit.RegisterTraceHook for the trace-mode timing pass.
+
+func (b *benchmarkTraceHook) OnExecute(meta jit.ExecutionMetadata) {
+	// Intentionally empty: the trace-mode pass only measures the cost of dispatching the callback.
+}
+
 /**
  * @struct BenchmarkOutputs
  * @brief Groups together the benchmark output metrics and directory configurations.
@@ -145,6 +159,10 @@ type BenchmarkOutputs struct {
 	Median       float64
 	Max          float64
 	Mean         float64
+	TraceMin     float64
+	TraceMedian  float64
+	TraceMax     float64
+	TraceMean    float64
 }
 
 /**
@@ -181,12 +199,20 @@ func writeBenchmarkOutputs(cfg *BenchmarkOutputs) error {
 		return err
 	}
 
-	// 3. Write speedups.json
-	speedupMap := map[string]string{
-		"min":    fmt.Sprintf("%.3fx", cfg.Min),
-		"median": fmt.Sprintf("%.3fx", cfg.Median),
-		"max":    fmt.Sprintf("%.3fx", cfg.Max),
-		"mean":   fmt.Sprintf("%.3fx", cfg.Mean),
+	// 3. Write speedups.json (Standard vs Trace comparative format)
+	speedupMap := map[string]map[string]string{
+		"standard": {
+			"min":    fmt.Sprintf("%.3fx", cfg.Min),
+			"median": fmt.Sprintf("%.3fx", cfg.Median),
+			"max":    fmt.Sprintf("%.3fx", cfg.Max),
+			"mean":   fmt.Sprintf("%.3fx", cfg.Mean),
+		},
+		"trace": {
+			"min":    fmt.Sprintf("%.3fx", cfg.TraceMin),
+			"median": fmt.Sprintf("%.3fx", cfg.TraceMedian),
+			"max":    fmt.Sprintf("%.3fx", cfg.TraceMax),
+			"mean":   fmt.Sprintf("%.3fx", cfg.TraceMean),
+		},
 	}
 	speedupJSON, err := json.MarshalIndent(speedupMap, "", "  ")
 	if err != nil {
@@ -195,6 +221,46 @@ func writeBenchmarkOutputs(cfg *BenchmarkOutputs) error {
 	return os.WriteFile(filepath.Join(cfg.OutDir, "speedups.json"), append(speedupJSON, '\n'), 0644)
 }
 
+/**
+ * @brief Computes summary statistics from a speedup values slice.
+ *
+ * @param speedups Slice of floating-point speedups; may be empty.
+ * @return min, median, max, and mean speedups (all zero when speedups is empty).
+ */
+func computeStats(speedups []float64) (float64, float64, float64, float64) {
+	if len(speedups) == 0 {
+		return 0, 0, 0, 0 // nothing was measured; avoid indexing speedups[0]
+	}
+	minVal, maxVal, sumVal := speedups[0], speedups[0], 0.0
+	for _, v := range speedups {
+		if v < minVal {
+			minVal = v
+		}
+		if v > maxVal {
+			maxVal = v
+		}
+		sumVal += v
+	}
+	meanVal := sumVal / float64(len(speedups))
+	return minVal, median(speedups), maxVal, meanVal
+}
+
+/**
+ * @brief Checks element-wise equality of cf (flat) and cl (locality) results.
+ * @return error on a length mismatch or at the first differing index; else nil.
+ */
+func verifyEquivalence(cf, cl []int32) error {
+	if len(cf) != len(cl) {
+		return fmt.Errorf("MISMATCH: Verification failure length flat=%d locality=%d", len(cf), len(cl))
+	}
+	for i := range cf {
+		if cf[i] != cl[i] {
+			return fmt.Errorf("MISMATCH: Verification failure at index=%d flat=%d locality=%d", i, cf[i], cl[i])
+		}
+	}
+	return nil
+}
+
 /**
  * @brief Entry point for running the matrix cache-locality timing benchmark.
  * 
@@ -267,10 +333,8 @@ func RunLocalityBenchmark(repeats int, outDir string) (*LocalityResult, error) {
 	kLoc.Execute(aPtr, bPtr, clPtr)
 
 	// Verify equal outputs
-	for i := 0; i < Cells; i++ {
-		if cfSlice[i] != clSlice[i] {
-			return nil, fmt.Errorf("MISMATCH: Verification failure at index=%d flat=%d locality=%d", i, cfSlice[i], clSlice[i])
-		}
+	if err := verifyEquivalence(cfSlice, clSlice); err != nil {
+		return nil, err
 	}
 
 	// Calculate checksum of results
@@ -282,27 +346,42 @@ func RunLocalityBenchmark(repeats int, outDir string) (*LocalityResult, error) {
 	verifyMsg := fmt.Sprintf("VERIFY equal N=%d operations=%d cache_flush_bytes=%d", N, N*N*N, FlushBytes)
 	fmt.Println(verifyMsg)
 
-	// Collect timing pairs
-	speedups, timingLines := runBenchmarkPairs(repeats, aPtr, bPtr, cfPtr, clPtr, flushPtr, kFlat, kLoc)
+	benchCfg := &benchmarkConfig{
+		aPtr:     aPtr,
+		bPtr:     bPtr,
+		cfPtr:    cfPtr,
+		clPtr:    clPtr,
+		flushPtr: flushPtr,
+		kFlat:    kFlat,
+		kLoc:     kLoc,
+	}
+
+	// Collect standard timing pairs
+	speedups, timingLines := runBenchmarkPairs(repeats, benchCfg)
+
+	// Register trace hook for trace mode benchmarking
+	fmt.Println("\n--- ENABLING TRACE MODE ---")
+	jit.RegisterTraceHook(&benchmarkTraceHook{})
+
+	// Collect trace timing pairs
+	traceSpeedups, traceTimingLines := runBenchmarkPairs(repeats, benchCfg)
+
+	// Clean up trace hook registration
+	jit.RegisterTraceHook(nil)
+	fmt.Println("--- TRACE MODE DISABLED ---")
+	fmt.Println()
 
 	flushSinkMsg := fmt.Sprintf("FLUSH sink=%d", C.get_flush_sink())
 	fmt.Println(flushSinkMsg)
 
-	// Compute summary statistics
-	minVal := speedups[0]
-	maxVal := speedups[0]
-	sumVal := 0.0
-	for _, v := range speedups {
-		if v < minVal {
-			minVal = v
-		}
-		if v > maxVal {
-			maxVal = v
-		}
-		sumVal += v
-	}
-	meanVal := sumVal / float64(len(speedups))
-	medianVal := median(speedups)
+	// Compute statistics
+	minVal, medianVal, maxVal, meanVal := computeStats(speedups)
+	traceMinVal, traceMedianVal, traceMaxVal, traceMeanVal := computeStats(traceSpeedups)
+
+	// Merge timing lines for logging
+	allTimingLines := append([]string{"=== STANDARD MODE TIMINGS ==="}, timingLines...)
+	allTimingLines = append(allTimingLines, "=== TRACE MODE TIMINGS ===")
+	allTimingLines = append(allTimingLines, traceTimingLines...)
 
 	// Write output files
 	cfg := &BenchmarkOutputs{
@@ -310,11 +389,15 @@ func RunLocalityBenchmark(repeats int, outDir string) (*LocalityResult, error) {
 		TelemetryMsg: telemetryMsg,
 		VerifyMsg:    verifyMsg,
 		FlushSinkMsg: flushSinkMsg,
-		TimingLines:  timingLines,
+		TimingLines:  allTimingLines,
 		Min:          minVal,
 		Median:       medianVal,
 		Max:          maxVal,
 		Mean:         meanVal,
+		TraceMin:     traceMinVal,
+		TraceMedian:  traceMedianVal,
+		TraceMax:     traceMaxVal,
+		TraceMean:    traceMeanVal,
 	}
 	if err := writeBenchmarkOutputs(cfg); err != nil {
 		return nil, err
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index b255a1d..09b4404 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -145,6 +145,16 @@ To support real-time execution mesh demands without writing temporary files to d
 
 ---
 
+### 3.5. Computational Verification & Isolated Trust Plane (Project VALKYRIE)
+In high-performance decentralized systems, runtime performance optimization must not compromise computational integrity. ORCHID enforces a strict separation of concerns between raw execution and trust verification:
+
+*   **Zero Prover Bloat:** To preserve ORCHID's bare-metal execution speed and lightweight container footprints, zero-knowledge proof generation (ZK-SNARK/STARK) and consensus logic are completely excluded from the ORCHID repository.
+*   **Verifier-Agnostic Design:** ORCHID provides clean, low-overhead tracing interfaces (such as scheduling queues and execution parameter records). Developers can plug in custom verification hooks to capture inputs, outputs, and kernel metadata.
+*   **Recommended Verification Layer:** We recommend utilizing **[Project VALKYRIE](https://github.com/DigitalServerHost/VALKYRIE)**—RAMNET's dedicated, out-of-band verification and zero-knowledge proof generation system. VALKYRIE ingests ORCHID's execution traces and output buffers to compile polynomial constraints and issue verification proofs, keeping the hot execution loop unburdened.
+*   **Local Sanity Validation:** For local testing, the JIT benchmarks run element-by-element equivalence verification between baseline flat outputs and locality-optimized outputs to validate compiler index and register correctness before timing sweeps.
+
+---
+
 ## 🐳 4. Orchestration & Static Quality Control
 
 ORCHID integrates modern tooling to guarantee code health:
diff --git a/evidence/reproduced/speedups.json b/evidence/reproduced/speedups.json
index c847662..abaa596 100644
--- a/evidence/reproduced/speedups.json
+++ b/evidence/reproduced/speedups.json
@@ -1,6 +1,14 @@
 {
-  "max": "12.457x",
-  "mean": "11.128x",
-  "median": "11.530x",
-  "min": "8.964x"
+  "standard": {
+    "max": "12.945x",
+    "mean": "12.212x",
+    "median": "12.497x",
+    "min": "10.905x"
+  },
+  "trace": {
+    "max": "12.314x",
+    "mean": "10.850x",
+    "median": "10.621x",
+    "min": "9.749x"
+  }
 }
diff --git a/jit/jit.go b/jit/jit.go
index cf76c47..d1de561 100644
--- a/jit/jit.go
+++ b/jit/jit.go
@@ -24,6 +24,39 @@ type Kernel interface {
 	Free() error
 }
 
+/**
+ * @struct ExecutionMetadata
+ * @brief Describes one finished kernel Execute call, as handed to TraceHook.OnExecute.
+ */
+type ExecutionMetadata struct {
+	N        int            ///< Matrix size (N x N)
+	APtr     unsafe.Pointer ///< Pointer to input matrix A, as passed to Execute
+	BPtr     unsafe.Pointer ///< Pointer to input matrix B, as passed to Execute
+	CPtr     unsafe.Pointer ///< Pointer to output matrix C, as passed to Execute
+	Locality bool           ///< True if locality optimization was used
+}
+
+/**
+ * @interface TraceHook
+ * @brief Interface for registering execution tracing and verification auditing tools.
+ */
+type TraceHook interface {
+	// OnExecute is invoked synchronously from a kernel's Execute, once per call, after the computation has finished.
+	OnExecute(meta ExecutionMetadata)
+}
+
+// Global active trace hook registration; nil means tracing is disabled. Not guarded by any lock.
+var activeTraceHook TraceHook
+
+/**
+ * @brief Registers a global trace hook to capture JIT kernel execution details.
+ * @note Plain unsynchronized assignment: avoid calling it while kernels execute on other goroutines.
+ * @param hook The TraceHook to register; pass nil to disable tracing.
+ */
+func RegisterTraceHook(hook TraceHook) {
+	activeTraceHook = hook
+}
+
 /**
  * @brief Allocates memory using syscall.Mmap with read-write protections.
  * 
@@ -90,6 +123,45 @@ func (k *GoFallbackKernel) Free() error {
 	return nil
 }
 
+/**
+ * @brief Row-major matrix multiply in i-k-j loop order, so the innermost loop walks rows of b and c.
+ *
+ * @param n Size of the matrix (N x N).
+ * @param a Flat slice containing matrix A data.
+ * @param b Flat slice containing matrix B data.
+ * @param c Flat output slice; products are accumulated into it (+=), so existing contents are kept.
+ */
+func executeLocality(n int, a, b, c []int32) {
+	for row := 0; row < n; row++ {
+		for inner := 0; inner < n; inner++ {
+			scale := a[row*n+inner]
+			for col := 0; col < n; col++ {
+				c[row*n+col] += scale * b[inner*n+col]
+			}
+		}
+	}
+}
+
+/**
+ * @brief Row-major matrix multiply in the textbook i-j-k loop order (one dot product per output cell).
+ *
+ * @param n Size of the matrix (N x N).
+ * @param a Flat slice containing matrix A data.
+ * @param b Flat slice containing matrix B data.
+ * @param c Flat output slice; every cell is overwritten with its dot product.
+ */
+func executeFlat(n int, a, b, c []int32) {
+	for row := 0; row < n; row++ {
+		for col := 0; col < n; col++ {
+			acc := int32(0)
+			for inner := 0; inner < n; inner++ {
+				acc += a[row*n+inner] * b[inner*n+col]
+			}
+			c[row*n+col] = acc
+		}
+	}
+}
+
 /**
  * @brief Executes matrix multiplication using Go fallback loops.
  * 
@@ -105,23 +177,18 @@ func (k *GoFallbackKernel) Execute(a, b, c unsafe.Pointer) {
 	cSlice := (*[1 << 28]int32)(c)[:cells:cells]
 
 	if k.Locality {
-		for i := 0; i < n; i++ {
-			for kv := 0; kv < n; kv++ {
-				r := aSlice[i*n+kv]
-				for j := 0; j < n; j++ {
-					cSlice[i*n+j] += r * bSlice[kv*n+j]
-				}
-			}
-		}
+		executeLocality(n, aSlice, bSlice, cSlice)
 	} else {
-		for i := 0; i < n; i++ {
-			for j := 0; j < n; j++ {
-				var sum int32
-				for kv := 0; kv < n; kv++ {
-					sum += aSlice[i*n+kv] * bSlice[kv*n+j]
-				}
-				cSlice[i*n+j] = sum
-			}
-		}
+		executeFlat(n, aSlice, bSlice, cSlice)
+	}
+
+	if activeTraceHook != nil {
+		activeTraceHook.OnExecute(ExecutionMetadata{
+			N:        n,
+			APtr:     a,
+			BPtr:     b,
+			CPtr:     c,
+			Locality: k.Locality,
+		})
 	}
 }
diff --git a/jit/jit_amd64.go b/jit/jit_amd64.go
index ddbd567..ae60cfc 100644
--- a/jit/jit_amd64.go
+++ b/jit/jit_amd64.go
@@ -50,7 +50,9 @@ func hasAVX2() bool {
  * @brief Implements Kernel interface for memory-resident AMD64 machine code blocks.
  */
 type amd64Kernel struct {
-	code []byte ///< Slice holding the JIT-allocated and marked executable byte segment
+	code     []byte ///< Slice holding the JIT-allocated and marked executable byte segment
+	N        int    ///< Matrix size (N x N)
+	Locality bool   ///< True if locality optimization was used
 }
 
 /**
@@ -62,6 +64,15 @@ type amd64Kernel struct {
  */
 func (k *amd64Kernel) Execute(a, b, c unsafe.Pointer) {
 	callJIT(unsafe.Pointer(&k.code[0]), a, b, c)
+	// Snapshot the global hook into a local before testing it, so the nil check
+	// and the call use the same value even if RegisterTraceHook(nil) runs on
+	// another goroutine in between.
+	// NOTE: the read itself is still unsynchronized; register hooks before
+	// running kernels on multiple goroutines.
+	if hook := activeTraceHook; hook != nil {
+		meta := ExecutionMetadata{N: k.N, APtr: a, BPtr: b, CPtr: c, Locality: k.Locality}
+		hook.OnExecute(meta)
+	}
 }
 
 /**
@@ -140,7 +151,7 @@ func CompileFlat(n int) (Kernel, error) {
 		return nil, err
 	}
 
-	return &amd64Kernel{code: code}, nil
+	return &amd64Kernel{code: code, N: n, Locality: false}, nil
 }
 
 /**
@@ -212,7 +223,7 @@ func CompileLocality(n int) (Kernel, error) {
 			return nil, err
 		}
 
-		return &amd64Kernel{code: code}, nil
+		return &amd64Kernel{code: code, N: n, Locality: true}, nil
 	} else if hasAVX2() {
 		// Emit vectorized AVX2 kernel (8-way strides)
 		template := []byte{
@@ -271,7 +282,7 @@ func CompileLocality(n int) (Kernel, error) {
 			return nil, err
 		}
 
-		return &amd64Kernel{code: code}, nil
+		return &amd64Kernel{code: code, N: n, Locality: true}, nil
 	} else {
 		// Emit optimized scalar locality kernel
 		template := []byte{
@@ -336,7 +347,7 @@ func CompileLocality(n int) (Kernel, error) {
 			return nil, err
 		}
 
-		return &amd64Kernel{code: code}, nil
+		return &amd64Kernel{code: code, N: n, Locality: true}, nil
 	}
 }
 
diff --git a/jit/jit_test.go b/jit/jit_test.go
index 6c30f7d..81e0bb8 100644
--- a/jit/jit_test.go
+++ b/jit/jit_test.go
@@ -100,3 +100,61 @@ func TestJITCompilationTime(t *testing.T) {
 		t.Errorf("JIT compiler overhead exceeded performance threshold: %s", elapsed)
 	}
 }
+
+/**
+ * @struct mockTraceHook
+ * @brief Test double for TraceHook that remembers whether and with what metadata it was called.
+ */
+type mockTraceHook struct {
+	called bool              ///< Flag indicating if OnExecute was invoked
+	meta   ExecutionMetadata ///< Metadata captured during the execution callback
+}
+
+/**
+ * @brief Callback method that records the call and keeps a copy of its metadata.
+ *
+ * @param meta The runtime execution details.
+ */
+func (m *mockTraceHook) OnExecute(meta ExecutionMetadata) {
+	m.called = true
+	m.meta = meta
+}
+
+/**
+ * @brief Verifies that a registered trace hook receives N, Locality and all three buffer pointers.
+ *
+ * @param t Go testing state handle.
+ */
+func TestJITTraceHook(t *testing.T) {
+	hook := &mockTraceHook{}
+	RegisterTraceHook(hook)
+	defer RegisterTraceHook(nil)
+
+	n := 64
+	a, b, c := generateMatrices(n)
+	aPtr, bPtr, cPtr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&c[0])
+
+	k, err := CompileLocality(n)
+	if err != nil {
+		t.Fatalf("Failed to compile kernel: %v", err)
+	}
+	defer k.Free()
+
+	k.Execute(aPtr, bPtr, cPtr)
+	if !hook.called {
+		t.Fatalf("Expected trace hook to be called, but it was not")
+	}
+
+	if hook.meta.N != n {
+		t.Errorf("Expected N=%d in metadata, got %d", n, hook.meta.N)
+	}
+
+	if !hook.meta.Locality {
+		t.Errorf("Expected Locality=true in metadata, got false")
+	}
+
+	if hook.meta.APtr != aPtr || hook.meta.BPtr != bPtr || hook.meta.CPtr != cPtr {
+		t.Errorf("Expected A=%p B=%p C=%p in metadata, got A=%p B=%p C=%p", aPtr, bPtr, cPtr, hook.meta.APtr, hook.meta.BPtr, hook.meta.CPtr)
+	}
+}
+