From d210ee6edd2f0bf703189457ddc8014c53bc3791 Mon Sep 17 00:00:00 2001 From: westkevin12 Date: Fri, 5 Jun 2026 18:46:19 -0500 Subject: [PATCH] feat: integrate tracing hooks into benchmark suite and update output format to support comparative analysis between standard and trace-enabled modes. --- README.md | 53 ++++---- cmd/orchid-daemon/matmul_wrapper.go | 179 ++++++++++++++++++++-------- docs/ARCHITECTURE.md | 10 ++ evidence/reproduced/speedups.json | 16 ++- jit/jit.go | 101 +++++++++++++--- jit/jit_amd64.go | 21 +++- jit/jit_test.go | 58 +++++++++ 7 files changed, 344 insertions(+), 94 deletions(-) diff --git a/README.md b/README.md index 6029cd7..f52d2ba 100644 --- a/README.md +++ b/README.md @@ -29,42 +29,55 @@ Project **ORCHID** is the low-level micro-architectural execution core of the RA The absolute base foundation, research primitives, and original codebase layout can be found preserved on the legacy archive branch: πŸ‘‰ **[View the Baseline Concept Code (`tree/gatchimuchio-original`)](https://github.com/DigitalServerHost/ORCHID/tree/gatchimuchio-original)** + --- ## πŸ“Š Reproduced Locality Performance -Under identical, mathematically verified logical execution constraints (512x512 matrix size and double-triplicate verification), the locality-aligned memory mapping sweeps demonstrate exceptionally high performance improvements. Badges below are dynamically parsed from current timing sweeps: +Under identical, mathematically verified logical execution constraints (512x512 matrix size and double-triplicate verification), ORCHID executes in two timing configurations. Standard Mode prioritizes raw bare-metal machine code throughput, while Trace Mode instruments execution boundaries for out-of-band ZK verification (Project VALKYRIE). 
+ +| Metric | Standard Mode (Raw Execution) | Trace Mode (Verification Hook Active) | +| :--- | :--- | :--- | +| **Minimum Speedup** | ![Speedup Min](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.standard.min&label=Min%20Speedup&color=blue) | ![Trace Min](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.trace.min&label=Trace%20Min&color=blueviolet) | +| **Median Speedup** | ![Speedup Median](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.standard.median&label=Median%20Speedup&color=blue) | ![Trace Median](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.trace.median&label=Trace%20Median&color=blueviolet) | +| **Maximum Speedup** | ![Speedup Max](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.standard.max&label=Max%20Speedup&color=brightgreen) | ![Trace Max](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.trace.max&label=Trace%20Max&color=orange) | +| **Mean Speedup** | ![Speedup Mean](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.standard.mean&label=Mean%20Speedup&color=brightgreen) | ![Trace 
Mean](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.trace.mean&label=Trace%20Mean&color=orange) | + +### πŸ”€ Parallel Memory Scheduler (CADENCE Scheduler) -| Metric | Speedup | -| :------------------ | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **Minimum Speedup** | ![Speedup Min](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.min&label=Speedup%20Min&color=blue) | -| **Median Speedup** | ![Speedup Median](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.median&label=Speedup%20Median&color=blueviolet) | -| **Maximum Speedup** | ![Speedup Max](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.max&label=Speedup%20Max&color=brightgreen) | -| **Mean Speedup** | ![Speedup Mean](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fraw.githubusercontent.com%2FDigitalServerHost%2FORCHID%2Fmain%2Fevidence%2Freproduced%2Fspeedups.json&query=%24.mean&label=Speedup%20Mean&color=orange) | +The parallel role scheduler (`scheduler.go`) partitions memory operations into three distinct logical streams (B-read, C-read, A-write) using a simulated STREAM-Triad memory controller queue. 
Scheduling these operations onto three independent hardware memory banks achieves the absolute theoretical parallel saturation limit: -> [!NOTE] -> **Understanding the Speedup Profiles:** -> - **Physical Cache Locality (C Harness)**: The dynamic badges above measure the hardware execution speedup of cache-blocked locality-aligned loops (matrix multiplication) over flat baselines, yielding **3.0x - 3.4x** actual hardware speedups on warm cache lines. -> - **Parallel Memory Scheduler (Go Simulator)**: The scheduler unit tests (`TestBankedSchedulerTriad`) run a software-simulated queue model (STREAM-Triad) to measure bank serialization and parallel role routing. Because STREAM-Triad partitions requests into 3 distinct logical data streams (B-read, C-read, A-write), mapping them to 3 independent memory banks achieves a theoretical parallel speedup limit of exactly **3.0x** (which the Go scheduler hits at exactly **3.000x** cycle reduction). +![Scheduler Speedup](https://img.shields.io/badge/Scheduler%20Speedup-3.000x-brightgreen) + +* **Theoretical Maximum:** 3.0x cycle reduction due to perfect memory-role serialization elimination. +* **Reproduced Efficiency:** The Go scheduling model hits exactly **3.000x** parallel performance speedup. --- ## 🖥️ Platform Target Support & JIT Engine Project ORCHID features a **Heterogeneous Hardware Dispatch Plane** to scale execution guarantees across multiple architectures: -* **Static AOT Assembly Emitters (`orchid/assembler.py`)**: Generates target-specific optimized assembly source code: - - **`x86_64` (AVX-512)**: 512-bit vector registers with active `prefetcht0` preloading. - - **`arm64` (NEON / SVE)**: NEON registers (`v0-v31`) with `prfm pldl1keep` software lookahead prefetching offsets. - - **`apple_amx` (Apple Silicon)**: Low-level matrix coprocessor wrapper via `amxinit`/`amxstop` instructions. 
-* **Dynamic JIT Compiler Core (`jit/`)**: Executed natively by the Go daemon, compiling matrix sizes ($N$) into memory-resident machine code at runtime. It checks host capabilities to select the optimal path: - - **`AVX-512` JIT Path**: Vectorized 16-way integer strides when native AVX-512 is supported. - - **`AVX2` JIT Path**: Vectorized 8-way VEX-encoded SIMD utilizing memory-resident broadcasts (`vpbroadcastd`) to avoid EVEX instruction page collisions on non-AVX-512 x86_64 CPUs. - - **`Scalar` AMD64 JIT Path**: Standard pointer execution loops. - - **`ARM64/Other` Fallback**: Native Go reference model to maintain execution stability. + +- **Static AOT Assembly Emitters (`orchid/assembler.py`)**: Generates target-specific optimized assembly source code: + - **`x86_64` (AVX-512)**: 512-bit vector registers with active `prefetcht0` preloading. + - **`arm64` (NEON / SVE)**: NEON registers (`v0-v31`) with `prfm pldl1keep` software lookahead prefetching offsets. + - **`apple_amx` (Apple Silicon)**: Low-level matrix coprocessor wrapper via `amxinit`/`amxstop` instructions. +- **Dynamic JIT Compiler Core (`jit/`)**: Executed natively by the Go daemon, compiling matrix sizes ($N$) into memory-resident machine code at runtime. It checks host capabilities to select the optimal path: + - **`AVX-512` JIT Path**: Vectorized 16-way integer strides when native AVX-512 is supported. + - **`AVX2` JIT Path**: Vectorized 8-way VEX-encoded SIMD utilizing memory-resident broadcasts (`vpbroadcastd`) to avoid EVEX instruction page collisions on non-AVX-512 x86_64 CPUs. + - **`Scalar` AMD64 JIT Path**: Standard pointer execution loops. + - **`ARM64/Other` Fallback**: Native Go reference model to maintain execution stability. ### 🔒 W^X Memory Security + The JIT compiler strictly enforces **Write-XOR-Execute (W^X)** memory constraints. 
Page memory is allocated with write permission (`syscall.PROT_WRITE`), code is generated, and then the page is transitioned to read-execute (`syscall.PROT_EXEC`) via `syscall.Mprotect` before execution. +### πŸ›‘οΈ Decoupled Verification & Standalone Engine + +Project ORCHID is designed to be **fully standalone**; developers can run the core JIT compiler and parallel scheduling runtime completely independent of any blockchain or verification logic. + +To maintain raw execution performance, cryptographic proof generation is decoupled from the hot path. The codebase exports runtime execution statistics and memory pointers via a lightweight, zero-overhead tracing interface. Developers can register their own custom verification layers or plug into **[Project VALKYRIE](https://github.com/DigitalServerHost/VALKYRIE)**, which is ORCHID's default recommended open-source ZK-proving and verification layer. + --- ## πŸ›οΈ Centralized Architectural Design & Blueprint diff --git a/cmd/orchid-daemon/matmul_wrapper.go b/cmd/orchid-daemon/matmul_wrapper.go index 1b37ac9..3a3590c 100644 --- a/cmd/orchid-daemon/matmul_wrapper.go +++ b/cmd/orchid-daemon/matmul_wrapper.go @@ -70,20 +70,28 @@ func median(values []float64) float64 { return (values[n/2-1] + values[n/2]) / 2.0 } +/** + * @struct benchmarkConfig + * @brief Bundles variables and pointers passed to benchmark executors. + */ +type benchmarkConfig struct { + aPtr unsafe.Pointer + bPtr unsafe.Pointer + cfPtr unsafe.Pointer + clPtr unsafe.Pointer + flushPtr unsafe.Pointer + kFlat jit.Kernel + kLoc jit.Kernel +} + /** * @brief Executes pairs of flat vs locality benchmarks to measure cache speedups. * * @param repeats Number of benchmark iterations to perform. - * @param aPtr Pointer to matrix A. - * @param bPtr Pointer to matrix B. - * @param cfPtr Pointer to flat output buffer. - * @param clPtr Pointer to locality output buffer. - * @param flushPtr Pointer to cache flushing buffer space. 
- * @param kFlat Pre-compiled JIT flat kernel. - * @param kLoc Pre-compiled JIT locality kernel. + * @param cfg Pointer to benchmarkConfig payload. * @return Speedup values slice and printed log lines slice. */ -func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Pointer, kFlat, kLoc jit.Kernel) ([]float64, []string) { +func runBenchmarkPairs(repeats int, cfg *benchmarkConfig) ([]float64, []string) { var speedups []float64 var timingLines []string @@ -93,29 +101,29 @@ func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Po if r%2 == 0 { order = "flat-first" - C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes)) - C.memset(cfPtr, 0, C.size_t(Bytes)) + C.flush_cache_c((*C.uint8_t)(cfg.flushPtr), C.size_t(FlushBytes)) + C.memset(cfg.cfPtr, 0, C.size_t(Bytes)) t0 := time.Now() - kFlat.Execute(aPtr, bPtr, cfPtr) + cfg.kFlat.Execute(cfg.aPtr, cfg.bPtr, cfg.cfPtr) flatSec = time.Since(t0).Seconds() - C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes)) - C.memset(clPtr, 0, C.size_t(Bytes)) + C.flush_cache_c((*C.uint8_t)(cfg.flushPtr), C.size_t(FlushBytes)) + C.memset(cfg.clPtr, 0, C.size_t(Bytes)) t0 = time.Now() - kLoc.Execute(aPtr, bPtr, clPtr) + cfg.kLoc.Execute(cfg.aPtr, cfg.bPtr, cfg.clPtr) localSec = time.Since(t0).Seconds() } else { order = "locality-first" - C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes)) - C.memset(clPtr, 0, C.size_t(Bytes)) + C.flush_cache_c((*C.uint8_t)(cfg.flushPtr), C.size_t(FlushBytes)) + C.memset(cfg.clPtr, 0, C.size_t(Bytes)) t0 := time.Now() - kLoc.Execute(aPtr, bPtr, clPtr) + cfg.kLoc.Execute(cfg.aPtr, cfg.bPtr, cfg.clPtr) localSec = time.Since(t0).Seconds() - C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes)) - C.memset(cfPtr, 0, C.size_t(Bytes)) + C.flush_cache_c((*C.uint8_t)(cfg.flushPtr), C.size_t(FlushBytes)) + C.memset(cfg.cfPtr, 0, C.size_t(Bytes)) t0 = time.Now() - kFlat.Execute(aPtr, bPtr, cfPtr) + cfg.kFlat.Execute(cfg.aPtr, cfg.bPtr, 
cfg.cfPtr) flatSec = time.Since(t0).Seconds() } @@ -131,6 +139,12 @@ func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Po return speedups, timingLines } +type benchmarkTraceHook struct{} + +func (b *benchmarkTraceHook) OnExecute(meta jit.ExecutionMetadata) { + // This empty callback is intentional to measure trace callback dispatch overhead. +} + /** * @struct BenchmarkOutputs * @brief Groups together the benchmark output metrics and directory configurations. @@ -145,6 +159,10 @@ type BenchmarkOutputs struct { Median float64 Max float64 Mean float64 + TraceMin float64 + TraceMedian float64 + TraceMax float64 + TraceMean float64 } /** @@ -181,12 +199,20 @@ func writeBenchmarkOutputs(cfg *BenchmarkOutputs) error { return err } - // 3. Write speedups.json - speedupMap := map[string]string{ - "min": fmt.Sprintf("%.3fx", cfg.Min), - "median": fmt.Sprintf("%.3fx", cfg.Median), - "max": fmt.Sprintf("%.3fx", cfg.Max), - "mean": fmt.Sprintf("%.3fx", cfg.Mean), + // 3. Write speedups.json (Standard vs Trace comparative format) + speedupMap := map[string]map[string]string{ + "standard": { + "min": fmt.Sprintf("%.3fx", cfg.Min), + "median": fmt.Sprintf("%.3fx", cfg.Median), + "max": fmt.Sprintf("%.3fx", cfg.Max), + "mean": fmt.Sprintf("%.3fx", cfg.Mean), + }, + "trace": { + "min": fmt.Sprintf("%.3fx", cfg.TraceMin), + "median": fmt.Sprintf("%.3fx", cfg.TraceMedian), + "max": fmt.Sprintf("%.3fx", cfg.TraceMax), + "mean": fmt.Sprintf("%.3fx", cfg.TraceMean), + }, } speedupJSON, err := json.MarshalIndent(speedupMap, "", " ") if err != nil { @@ -195,6 +221,46 @@ func writeBenchmarkOutputs(cfg *BenchmarkOutputs) error { return os.WriteFile(filepath.Join(cfg.OutDir, "speedups.json"), append(speedupJSON, '\n'), 0644) } +/** + * @brief Computes summary statistics from a speedup values slice. + * + * @param speedups Slice of floating-point speedups. + * @return min, median, max, and mean speedups. 
+ */ +func computeStats(speedups []float64) (float64, float64, float64, float64) { + minVal := speedups[0] + maxVal := speedups[0] + sumVal := 0.0 + for _, v := range speedups { + if v < minVal { + minVal = v + } + if v > maxVal { + maxVal = v + } + sumVal += v + } + meanVal := sumVal / float64(len(speedups)) + medianVal := median(speedups) + return minVal, medianVal, maxVal, meanVal +} + +/** + * @brief Compares two result slices for structural equality. + * + * @param cf Reference flat result slice. + * @param cl Locality-optimized result slice. + * @return error if any value mismatch is found. + */ +func verifyEquivalence(cf, cl []int32) error { + for i := 0; i < len(cf); i++ { + if cf[i] != cl[i] { + return fmt.Errorf("MISMATCH: Verification failure at index=%d flat=%d locality=%d", i, cf[i], cl[i]) + } + } + return nil +} + /** * @brief Entry point for running the matrix cache-locality timing benchmark. * @@ -267,10 +333,8 @@ func RunLocalityBenchmark(repeats int, outDir string) (*LocalityResult, error) { kLoc.Execute(aPtr, bPtr, clPtr) // Verify equal outputs - for i := 0; i < Cells; i++ { - if cfSlice[i] != clSlice[i] { - return nil, fmt.Errorf("MISMATCH: Verification failure at index=%d flat=%d locality=%d", i, cfSlice[i], clSlice[i]) - } + if err := verifyEquivalence(cfSlice, clSlice); err != nil { + return nil, err } // Calculate checksum of results @@ -282,27 +346,42 @@ func RunLocalityBenchmark(repeats int, outDir string) (*LocalityResult, error) { verifyMsg := fmt.Sprintf("VERIFY equal N=%d operations=%d cache_flush_bytes=%d", N, N*N*N, FlushBytes) fmt.Println(verifyMsg) - // Collect timing pairs - speedups, timingLines := runBenchmarkPairs(repeats, aPtr, bPtr, cfPtr, clPtr, flushPtr, kFlat, kLoc) + benchCfg := &benchmarkConfig{ + aPtr: aPtr, + bPtr: bPtr, + cfPtr: cfPtr, + clPtr: clPtr, + flushPtr: flushPtr, + kFlat: kFlat, + kLoc: kLoc, + } + + // Collect standard timing pairs + speedups, timingLines := runBenchmarkPairs(repeats, benchCfg) + + // 
Register trace hook for trace mode benchmarking + fmt.Println("\n--- ENABLING TRACE MODE ---") + jit.RegisterTraceHook(&benchmarkTraceHook{}) + + // Collect trace timing pairs + traceSpeedups, traceTimingLines := runBenchmarkPairs(repeats, benchCfg) + + // Clean up trace hook registration + jit.RegisterTraceHook(nil) + fmt.Println("--- TRACE MODE DISABLED ---") + fmt.Println() flushSinkMsg := fmt.Sprintf("FLUSH sink=%d", C.get_flush_sink()) fmt.Println(flushSinkMsg) - // Compute summary statistics - minVal := speedups[0] - maxVal := speedups[0] - sumVal := 0.0 - for _, v := range speedups { - if v < minVal { - minVal = v - } - if v > maxVal { - maxVal = v - } - sumVal += v - } - meanVal := sumVal / float64(len(speedups)) - medianVal := median(speedups) + // Compute statistics + minVal, medianVal, maxVal, meanVal := computeStats(speedups) + traceMinVal, traceMedianVal, traceMaxVal, traceMeanVal := computeStats(traceSpeedups) + + // Merge timing lines for logging + allTimingLines := append([]string{"=== STANDARD MODE TIMINGS ==="}, timingLines...) + allTimingLines = append(allTimingLines, "=== TRACE MODE TIMINGS ===") + allTimingLines = append(allTimingLines, traceTimingLines...) // Write output files cfg := &BenchmarkOutputs{ @@ -310,11 +389,15 @@ func RunLocalityBenchmark(repeats int, outDir string) (*LocalityResult, error) { TelemetryMsg: telemetryMsg, VerifyMsg: verifyMsg, FlushSinkMsg: flushSinkMsg, - TimingLines: timingLines, + TimingLines: allTimingLines, Min: minVal, Median: medianVal, Max: maxVal, Mean: meanVal, + TraceMin: traceMinVal, + TraceMedian: traceMedianVal, + TraceMax: traceMaxVal, + TraceMean: traceMeanVal, } if err := writeBenchmarkOutputs(cfg); err != nil { return nil, err diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index b255a1d..09b4404 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -145,6 +145,16 @@ To support real-time execution mesh demands without writing temporary files to d --- +### 3.5. 
Computational Verification & Isolated Trust Plane (Project VALKYRIE) +In high-performance decentralized systems, runtime performance optimization must not compromise computational integrity. ORCHID enforces a strict separation of concerns between raw execution and trust verification: + +* **Zero Prover Bloat:** To preserve ORCHID's bare-metal execution speed and lightweight container footprints, zero-knowledge prover generation (ZK-SNARK/STARK) and consensus logic are completely excluded from the ORCHID repository. +* **Verifier-Agnostic Design:** ORCHID provides clean, low-overhead tracing interfaces (such as scheduling queues and execution parameter records). Developers can plug in custom verification hooks to capture inputs, outputs, and kernel metadata. +* **Recommended Verification Layer:** We recommend utilizing **[Project VALKYRIE](https://github.com/DigitalServerHost/VALKYRIE)**—RAMNET's dedicated, out-of-band verification and zero-knowledge proof generation system. VALKYRIE ingests ORCHID's execution traces and output buffers to compile polynomial constraints and issue verification proofs, keeping the hot execution loop unburdened. +* **Local Sanity Validation:** For local testing, the JIT benchmarks run element-by-element equivalence verification between baseline flat outputs and locality-optimized outputs to validate compiler index and register correctness before timing sweeps. + +--- + ## 🐳 4. 
Orchestration & Static Quality Control ORCHID integrates modern tooling to guarantee code health: diff --git a/evidence/reproduced/speedups.json b/evidence/reproduced/speedups.json index c847662..abaa596 100644 --- a/evidence/reproduced/speedups.json +++ b/evidence/reproduced/speedups.json @@ -1,6 +1,14 @@ { - "max": "12.457x", - "mean": "11.128x", - "median": "11.530x", - "min": "8.964x" + "standard": { + "max": "12.945x", + "mean": "12.212x", + "median": "12.497x", + "min": "10.905x" + }, + "trace": { + "max": "12.314x", + "mean": "10.850x", + "median": "10.621x", + "min": "9.749x" + } } diff --git a/jit/jit.go b/jit/jit.go index cf76c47..d1de561 100644 --- a/jit/jit.go +++ b/jit/jit.go @@ -24,6 +24,39 @@ type Kernel interface { Free() error } +/** + * @struct ExecutionMetadata + * @brief Holds runtime details of a completed JIT kernel execution for auditing. + */ +type ExecutionMetadata struct { + N int ///< Matrix size (N x N) + APtr unsafe.Pointer ///< Pointer to input matrix A + BPtr unsafe.Pointer ///< Pointer to input matrix B + CPtr unsafe.Pointer ///< Pointer to output matrix C + Locality bool ///< True if locality optimization was used +} + +/** + * @interface TraceHook + * @brief Interface for registering execution tracing and verification auditing tools. + */ +type TraceHook interface { + // OnExecute is invoked after a JIT kernel completes computation. + OnExecute(meta ExecutionMetadata) +} + +// Global active trace hook registration +var activeTraceHook TraceHook + +/** + * @brief Registers a global trace hook to capture JIT kernel execution details. + * + * @param hook The TraceHook to register. + */ +func RegisterTraceHook(hook TraceHook) { + activeTraceHook = hook +} + /** * @brief Allocates memory using syscall.Mmap with read-write protections. * @@ -90,6 +123,45 @@ func (k *GoFallbackKernel) Free() error { return nil } +/** + * @brief Performs a locality-optimized matrix multiplication traversal in Go. 
+ * + * @param n Size of the matrix (N x N). + * @param a Flat slice containing matrix A data. + * @param b Flat slice containing matrix B data. + * @param c Flat slice containing matrix C output data. + */ +func executeLocality(n int, a, b, c []int32) { + for i := 0; i < n; i++ { + for kv := 0; kv < n; kv++ { + r := a[i*n+kv] + for j := 0; j < n; j++ { + c[i*n+j] += r * b[kv*n+j] + } + } + } +} + +/** + * @brief Performs a flat triple-loop matrix multiplication traversal in Go. + * + * @param n Size of the matrix (N x N). + * @param a Flat slice containing matrix A data. + * @param b Flat slice containing matrix B data. + * @param c Flat slice containing matrix C output data. + */ +func executeFlat(n int, a, b, c []int32) { + for i := 0; i < n; i++ { + for j := 0; j < n; j++ { + var sum int32 + for kv := 0; kv < n; kv++ { + sum += a[i*n+kv] * b[kv*n+j] + } + c[i*n+j] = sum + } + } +} + /** * @brief Executes matrix multiplication using Go fallback loops. * @@ -105,23 +177,18 @@ func (k *GoFallbackKernel) Execute(a, b, c unsafe.Pointer) { cSlice := (*[1 << 28]int32)(c)[:cells:cells] if k.Locality { - for i := 0; i < n; i++ { - for kv := 0; kv < n; kv++ { - r := aSlice[i*n+kv] - for j := 0; j < n; j++ { - cSlice[i*n+j] += r * bSlice[kv*n+j] - } - } - } + executeLocality(n, aSlice, bSlice, cSlice) } else { - for i := 0; i < n; i++ { - for j := 0; j < n; j++ { - var sum int32 - for kv := 0; kv < n; kv++ { - sum += aSlice[i*n+kv] * bSlice[kv*n+j] - } - cSlice[i*n+j] = sum - } - } + executeFlat(n, aSlice, bSlice, cSlice) + } + + if activeTraceHook != nil { + activeTraceHook.OnExecute(ExecutionMetadata{ + N: n, + APtr: a, + BPtr: b, + CPtr: c, + Locality: k.Locality, + }) } } diff --git a/jit/jit_amd64.go b/jit/jit_amd64.go index ddbd567..ae60cfc 100644 --- a/jit/jit_amd64.go +++ b/jit/jit_amd64.go @@ -50,7 +50,9 @@ func hasAVX2() bool { * @brief Implements Kernel interface for memory-resident AMD64 machine code blocks. 
*/ type amd64Kernel struct { - code []byte ///< Slice holding the JIT-allocated and marked executable byte segment + code []byte ///< Slice holding the JIT-allocated and marked executable byte segment + N int ///< Matrix size (N x N) + Locality bool ///< True if locality optimization was used } /** @@ -62,6 +64,15 @@ type amd64Kernel struct { */ func (k *amd64Kernel) Execute(a, b, c unsafe.Pointer) { callJIT(unsafe.Pointer(&k.code[0]), a, b, c) + if activeTraceHook != nil { + activeTraceHook.OnExecute(ExecutionMetadata{ + N: k.N, + APtr: a, + BPtr: b, + CPtr: c, + Locality: k.Locality, + }) + } } /** @@ -140,7 +151,7 @@ func CompileFlat(n int) (Kernel, error) { return nil, err } - return &amd64Kernel{code: code}, nil + return &amd64Kernel{code: code, N: n, Locality: false}, nil } /** @@ -212,7 +223,7 @@ func CompileLocality(n int) (Kernel, error) { return nil, err } - return &amd64Kernel{code: code}, nil + return &amd64Kernel{code: code, N: n, Locality: true}, nil } else if hasAVX2() { // Emit vectorized AVX2 kernel (8-way strides) template := []byte{ @@ -271,7 +282,7 @@ func CompileLocality(n int) (Kernel, error) { return nil, err } - return &amd64Kernel{code: code}, nil + return &amd64Kernel{code: code, N: n, Locality: true}, nil } else { // Emit optimized scalar locality kernel template := []byte{ @@ -336,7 +347,7 @@ func CompileLocality(n int) (Kernel, error) { return nil, err } - return &amd64Kernel{code: code}, nil + return &amd64Kernel{code: code, N: n, Locality: true}, nil } } diff --git a/jit/jit_test.go b/jit/jit_test.go index 6c30f7d..81e0bb8 100644 --- a/jit/jit_test.go +++ b/jit/jit_test.go @@ -100,3 +100,61 @@ func TestJITCompilationTime(t *testing.T) { t.Errorf("JIT compiler overhead exceeded performance threshold: %s", elapsed) } } + +/** + * @struct mockTraceHook + * @brief Simple mock implementation of TraceHook to test verification routing. 
+ */ +type mockTraceHook struct { + called bool ///< Flag indicating if OnExecute was invoked + meta ExecutionMetadata ///< Metadata captured during the execution callback +} + +/** + * @brief Callback method to record JIT execution details. + * + * @param meta The runtime execution details. + */ +func (m *mockTraceHook) OnExecute(meta ExecutionMetadata) { + m.called = true + m.meta = meta +} + +/** + * @brief Verifies that registering a trace hook captures execute pointer data. + * + * @param t Go testing state handle. + */ +func TestJITTraceHook(t *testing.T) { + hook := &mockTraceHook{} + RegisterTraceHook(hook) + defer RegisterTraceHook(nil) + + n := 64 + a, b, c := generateMatrices(n) + + k, err := CompileLocality(n) + if err != nil { + t.Fatalf("Failed to compile kernel: %v", err) + } + defer k.Free() + + k.Execute(unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&c[0])) + + if !hook.called { + t.Fatalf("Expected trace hook to be called, but it was not") + } + + if hook.meta.N != n { + t.Errorf("Expected N=%d in metadata, got %d", n, hook.meta.N) + } + + if !hook.meta.Locality { + t.Errorf("Expected Locality=true in metadata, got false") + } + + if hook.meta.APtr != unsafe.Pointer(&a[0]) { + t.Errorf("Expected APtr=%p in metadata, got %p", unsafe.Pointer(&a[0]), hook.meta.APtr) + } +} +