DigitalServerHost · westkevin12 · Jun 5, 2026 · Jun 5, 2026
diff --git a/README.md b/README.md
@@ -49,15 +49,21 @@ Under identical, mathematically verified logical execution constraints (512x512
 
 ---
 
-## 🖥️ Platform Target Support
-
-Project ORCHID features a **Heterogeneous Hardware Dispatch Plane** to scale execution guarantees across multiple architectures. The assembler (`orchid/assembler.py`) dynamically auto-detects the host architecture (or accepts a target override parameter via `--target`) and emits optimized assembly targets:
-
-- **`x86_64` (AVX-512)**: Standard vectorized loop utilizing 512-bit vector registers with active `prefetcht0` hardware preloading.
-- **`arm64` (NEON / SVE)**: Vectorized execution using ARM64 NEON registers (`v0-v31`) with `prfm pldl1keep` software lookahead prefetching offsets.
-- **`apple_amx` (Apple Silicon)**: Low-level matrix coprocessor wrapper with custom `amxinit`/`amxstop` instructions (`.word` directives).
-
-At runtime, the benchmarking harness (`locality/fair_harness.c`) performs dynamic hardware capability telemetry (`CPUID` for x86-64, `getauxval(AT_HWCAP)` for ARM64 SVE/ASIMD on Linux) to dispatch execution to the optimal native assembly kernel.
+## 🖥️ Platform Target Support & JIT Engine
+
+Project ORCHID features a **Heterogeneous Hardware Dispatch Plane** to scale execution guarantees across multiple architectures:
+*   **Static AOT Assembly Emitters (`orchid/assembler.py`)**: Generates target-specific optimized assembly source code:
+    - **`x86_64` (AVX-512)**: 512-bit vector registers with active `prefetcht0` preloading.
+    - **`arm64` (NEON / SVE)**: NEON registers (`v0-v31`) with `prfm pldl1keep` software lookahead prefetching offsets.
+    - **`apple_amx` (Apple Silicon)**: Low-level matrix coprocessor wrapper via `amxinit`/`amxstop` instructions.
+*   **Dynamic JIT Compiler Core (`jit/`)**: Executed natively by the Go daemon, compiling matrix-multiplication kernels specialized for a given matrix size ($N$) into memory-resident machine code at runtime. It checks host capabilities to select the optimal path:
+    - **`AVX-512` JIT Path**: Vectorized 16-way integer strides when native AVX-512 is supported.
+    - **`AVX2` JIT Path**: Vectorized 8-way VEX-encoded SIMD utilizing memory-resident broadcasts (`vpbroadcastd`) to avoid EVEX instruction page collisions on non-AVX-512 x86_64 CPUs.
+    - **`Scalar` AMD64 JIT Path**: Standard pointer execution loops.
+    - **`ARM64/Other` Fallback**: Native Go reference model to maintain execution stability.
+
+### 🔒 W^X Memory Security
+The JIT compiler strictly enforces **Write-XOR-Execute (W^X)** memory constraints. Pages are first mapped read-write (`syscall.PROT_READ|syscall.PROT_WRITE`), machine code is emitted into them, and the mapping is then switched to read-execute (`syscall.PROT_READ|syscall.PROT_EXEC`) via `syscall.Mprotect` before execution.
 
 ---
 

diff --git a/cmd/orchid-daemon/matmul_wrapper.go b/cmd/orchid-daemon/matmul_wrapper.go
@@ -1,8 +1,8 @@
 /**
  * @file matmul_wrapper.go
- * @brief Go wrapper linking C/assembly matrix kernels and executing locality timing benchmarks.
+ * @brief Go wrapper linking JIT compilation and executing locality timing benchmarks.
  * 
- * Coordinate AVX-512/scalar dispatch execution, physical memory alignment allocations,
+ * Coordinate JIT execution, physical memory alignment allocations,
  * CPU cache flushes, statistical speedup analysis, and timing files creation.
  * 
  * Originator: Teppei Oohira (@gatchimuchio) / 大平鉄兵
@@ -17,10 +17,6 @@ package main
 #include <stdlib.h>
 #include <string.h>
 
-int has_avx512f(void);
-void matmul_flat(const int32_t *a, const int32_t *b, int32_t *c);
-void matmul_locality(const int32_t *a, const int32_t *b, int32_t *c);
-void matmul_locality_fallback(const int32_t *a, const int32_t *b, int32_t *c);
 void flush_cache_c(uint8_t *buf, size_t size);
 uint64_t get_flush_sink(void);
 */
@@ -33,6 +29,8 @@ import (
 	"sort"
 	"time"
 	"unsafe"
+
+	"ORCHID/jit"
 )
 
 const (
@@ -72,22 +70,6 @@ func median(values []float64) float64 {
 	return (values[n/2-1] + values[n/2]) / 2.0
 }
 
-/**
- * @brief Invokes either the AVX-512 assembly kernel or the scalar fallback kernel.
- * 
- * @param aPtr Pointer to input matrix A.
- * @param bPtr Pointer to input matrix B.
- * @param clPtr Pointer to output matrix C.
- * @param useAVX512 Flag indicating if AVX-512 should be executed.
- */
-func executeLocalityKernel(aPtr, bPtr, clPtr unsafe.Pointer, useAVX512 bool) {
-	if useAVX512 {
-		C.matmul_locality((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(clPtr))
-	} else {
-		C.matmul_locality_fallback((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(clPtr))
-	}
-}
-
 /**
  * @brief Executes pairs of flat vs locality benchmarks to measure cache speedups.
  * 
@@ -97,10 +79,11 @@ func executeLocalityKernel(aPtr, bPtr, clPtr unsafe.Pointer, useAVX512 bool) {
  * @param cfPtr Pointer to flat output buffer.
  * @param clPtr Pointer to locality output buffer.
  * @param flushPtr Pointer to cache flushing buffer space.
- * @param useAVX512 Flag for AVX-512 hardware support.
+ * @param kFlat Pre-compiled JIT flat kernel.
+ * @param kLoc Pre-compiled JIT locality kernel.
  * @return Speedup values slice and printed log lines slice.
  */
-func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Pointer, useAVX512 bool) ([]float64, []string) {
+func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Pointer, kFlat, kLoc jit.Kernel) ([]float64, []string) {
 	var speedups []float64
 	var timingLines []string
 
@@ -113,26 +96,26 @@ func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Po
 			C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes))
 			C.memset(cfPtr, 0, C.size_t(Bytes))
 			t0 := time.Now()
-			C.matmul_flat((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(cfPtr))
+			kFlat.Execute(aPtr, bPtr, cfPtr)
 			flatSec = time.Since(t0).Seconds()
 
 			C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes))
 			C.memset(clPtr, 0, C.size_t(Bytes))
 			t0 = time.Now()
-			executeLocalityKernel(aPtr, bPtr, clPtr, useAVX512)
+			kLoc.Execute(aPtr, bPtr, clPtr)
 			localSec = time.Since(t0).Seconds()
 		} else {
 			order = "locality-first"
 			C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes))
 			C.memset(clPtr, 0, C.size_t(Bytes))
 			t0 := time.Now()
-			executeLocalityKernel(aPtr, bPtr, clPtr, useAVX512)
+			kLoc.Execute(aPtr, bPtr, clPtr)
 			localSec = time.Since(t0).Seconds()
 
 			C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes))
 			C.memset(cfPtr, 0, C.size_t(Bytes))
 			t0 = time.Now()
-			C.matmul_flat((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(cfPtr))
+			kFlat.Execute(aPtr, bPtr, cfPtr)
 			flatSec = time.Since(t0).Seconds()
 		}
 
@@ -258,20 +241,30 @@ func RunLocalityBenchmark(repeats int, outDir string) (*LocalityResult, error) {
 		bSlice[i] = int32((uint32(i)*13 + 5) % 7) - 3
 	}
 
-	// Detect host AVX-512 capability at runtime
-	useAVX512 := C.has_avx512f() != 0
-	telemetryMsg := "HARDWARE TELEMETRY: AVX-512 not supported. Dispatching to optimized scalar fallback kernel."
-	if useAVX512 {
-		telemetryMsg = "HARDWARE TELEMETRY: Native AVX-512 support detected. Dispatching to assembly vector kernel."
+	// Compile the JIT kernels dynamically and measure compilation latency
+	tJitStart := time.Now()
+	kFlat, err := jit.CompileFlat(N)
+	if err != nil {
+		return nil, fmt.Errorf("failed to compile JIT flat kernel: %w", err)
+	}
+	defer kFlat.Free()
+
+	kLoc, err := jit.CompileLocality(N)
+	if err != nil {
+		return nil, fmt.Errorf("failed to compile JIT locality kernel: %w", err)
 	}
+	defer kLoc.Free()
+	jitElapsed := time.Since(tJitStart)
+
+	telemetryMsg := fmt.Sprintf("HARDWARE TELEMETRY: JIT compiled kernels in %s. Executing bare-metal blocks via W^X function pointers.", jitElapsed)
 	fmt.Println(telemetryMsg)
 
 	// Initial warm run & arithmetic validation check
 	C.memset(cfPtr, 0, C.size_t(Bytes))
 	C.memset(clPtr, 0, C.size_t(Bytes))
 
-	C.matmul_flat((*C.int32_t)(unsafe.Pointer(aPtr)), (*C.int32_t)(unsafe.Pointer(bPtr)), (*C.int32_t)(unsafe.Pointer(cfPtr)))
-	executeLocalityKernel(aPtr, bPtr, clPtr, useAVX512)
+	kFlat.Execute(aPtr, bPtr, cfPtr)
+	kLoc.Execute(aPtr, bPtr, clPtr)
 
 	// Verify equal outputs
 	for i := 0; i < Cells; i++ {
@@ -290,7 +283,7 @@ func RunLocalityBenchmark(repeats int, outDir string) (*LocalityResult, error) {
 	fmt.Println(verifyMsg)
 
 	// Collect timing pairs
-	speedups, timingLines := runBenchmarkPairs(repeats, aPtr, bPtr, cfPtr, clPtr, flushPtr, useAVX512)
+	speedups, timingLines := runBenchmarkPairs(repeats, aPtr, bPtr, cfPtr, clPtr, flushPtr, kFlat, kLoc)
 
 	flushSinkMsg := fmt.Sprintf("FLUSH sink=%d", C.get_flush_sink())
 	fmt.Println(flushSinkMsg)

diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
@@ -46,6 +46,13 @@ ORCHID/
 │   ├── build/               # Directory containing compiled object targets
 │   ├── fair_harness.c       # C11 Timing runner utilizing cache flushes
 │   └── matmul.plan          # Program parameter declaration configurations
+├── jit/                     # Just-In-Time (JIT) Dynamic Compilation Subsystem
+│   ├── jit.go               # Memory management, W^X page protection & Go fallbacks
+│   ├── jit_amd64.go         # AMD64 instruction emitters (AVX-512, AVX2, and scalar)
+│   ├── jit_amd64.s          # System V ABI pointer jump stubs & CPUID detection
+│   ├── jit_arm64.go         # ARM64 architecture portable fallback
+│   ├── jit_other.go         # Generic platform-independent fallback
+│   └── jit_test.go          # JIT math verification & compilation latency benchmark suite
 ├── orchid/                  # Packaged, Publishable Python SDK Core
 │   ├── __init__.py          # SDK Package version registration & exports
 │   ├── aggregator.py        # Locality results timing aggregator
@@ -127,6 +134,17 @@ The execution layer implements CADENCE routing using native Go concurrency primi
 
 ---
 
+### 3.4. JIT Compiler Subsystem (Dynamic Memory Compilation)
+To support real-time execution mesh demands without writing temporary files to disk or invoking external toolchains (GCC), ORCHID integrates a dynamic, memory-resident JIT compiler:
+*   **W^X Memory Security Model:** Strictly enforces Write-XOR-Execute (W^X) on JIT code pages. It maps read-write pages via `syscall.Mmap`, emits instructions into the segment, and then switches the page protection to read-execute via `syscall.Mprotect` before execution.
+*   **Three-Tier x86_64 Hardware Pathing:**
+    1. *AVX-512:* Vectorized 16-way integer strides when CPU capability checks succeed.
+    2. *AVX2:* Vectorized 8-way VEX-encoded SIMD utilizing memory-resident broadcasts (`vpbroadcastd`) to prevent instruction page collisions.
+    3. *Scalar:* Core x86_64 pointer instruction loops.
+*   **ABI Bridging:** Utilizes a custom assembly stub `callJIT` in `jit_amd64.s` to route Go parameter structs onto AMD64 ABI registers (`RDI`, `RSI`, `RDX`), achieving execution speeds matching pre-compiled C binaries with only microsecond-level emission overhead.
+
+---
+
 ## 🐳 4. Orchestration & Static Quality Control
 
 ORCHID integrates modern tooling to guarantee code health:

diff --git a/evidence/reproduced/speedups.json b/evidence/reproduced/speedups.json
@@ -1,6 +1,6 @@
 {
-  "min": "2.871x",
-  "median": "3.171x",
-  "max": "3.396x",
-  "mean": "3.176x"
-}
+  "max": "12.457x",
+  "mean": "11.128x",
+  "median": "11.530x",
+  "min": "8.964x"
+}
diff --git a/jit/jit.go b/jit/jit.go
@@ -0,0 +1,127 @@
+/**
+ * @file jit.go
+ * @brief Memory management and W^X memory page allocation wrappers for ORCHID JIT compiler.
+ * 
+ * License: GNU GPLv3
+ */
+
+package jit
+
+import (
+	"fmt"
+	"syscall"
+	"unsafe"
+)
+
+/**
+ * @interface Kernel
+ * @brief Executable matrix multiplication kernel; GoFallbackKernel in this file is one implementation.
+ */
+type Kernel interface {
+	// Execute runs the kernel with a and b as the input matrix buffers and c as the output buffer.
+	Execute(a, b, c unsafe.Pointer)
+	// Free releases any memory segment held by the kernel; implementations that hold none simply return nil.
+	Free() error
+}
+
+/**
+ * @brief Maps an anonymous, private memory region with read-write protections via syscall.Mmap.
+ *
+ * @param size The size of the memory segment to map, in bytes.
+ * @return The mapped byte slice, or a wrapped error if the mmap syscall failed.
+ */
+func mmapJIT(size int) ([]byte, error) {
+	data, err := syscall.Mmap(
+		-1, // no backing file descriptor: the mapping is anonymous
+		0,
+		size,
+		syscall.PROT_READ|syscall.PROT_WRITE, // readable and writable, not executable
+		syscall.MAP_ANON|syscall.MAP_PRIVATE,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("syscall mmap failed: %w", err)
+	}
+	return data, nil
+}
+
+/**
+ * @brief Switches a memory segment to read-execute (PROT_READ|PROT_EXEC); write permission is not requested.
+ *
+ * @param data The byte slice covering the memory segment whose protection is changed.
+ * @return nil on success, or a wrapped error if the mprotect syscall failed.
+ */
+func mprotectRX(data []byte) error {
+	err := syscall.Mprotect(data, syscall.PROT_READ|syscall.PROT_EXEC)
+	if err != nil {
+		return fmt.Errorf("syscall mprotect RX failed: %w", err)
+	}
+	return nil
+}
+
+/**
+ * @brief Unmaps a memory segment via syscall.Munmap.
+ *
+ * @param data The byte slice to unmap (presumably one returned by mmapJIT; verify against callers).
+ * @return nil on success, or a wrapped error if the munmap syscall failed.
+ */
+func munmapJIT(data []byte) error {
+	err := syscall.Munmap(data)
+	if err != nil {
+		return fmt.Errorf("syscall munmap failed: %w", err)
+	}
+	return nil
+}
+
+/**
+ * @struct GoFallbackKernel
+ * @brief Implements Kernel with plain Go loops over int32 matrices; it holds no mapped memory.
+ */
+type GoFallbackKernel struct {
+	N        int  ///< Matrix dimension; Execute reads each buffer as N*N int32 values
+	Locality bool ///< true selects the i-k-j loop order in Execute, false the i-j-k order
+}
+
+/**
+ * @brief No-op: the Go fallback kernel owns no mapped memory, so there is nothing to release.
+ *
+ * @return nil always.
+ */
+func (k *GoFallbackKernel) Free() error {
+	return nil
+}
+
+/**
+ * @brief Multiplies two row-major N x N int32 matrices using plain Go loops.
+ *
+ * @param a Pointer to matrix A (read as N*N int32 values).
+ * @param b Pointer to matrix B (read as N*N int32 values).
+ * @param c Pointer to output matrix C; the locality path adds into it, the flat path overwrites it.
+ */
+func (k *GoFallbackKernel) Execute(a, b, c unsafe.Pointer) {
+	n := k.N
+	cells := n * n
+	aSlice := (*[1 << 28]int32)(a)[:cells:cells] // view the raw pointer as a slice bounded to cells; NOTE(review): cells above 1<<28 would panic here
+	bSlice := (*[1 << 28]int32)(b)[:cells:cells]
+	cSlice := (*[1 << 28]int32)(c)[:cells:cells]
+
+	if k.Locality {
+		for i := 0; i < n; i++ { // i-k-j order: the innermost loop walks one row of B and one row of C contiguously
+			for kv := 0; kv < n; kv++ {
+				r := aSlice[i*n+kv]
+				for j := 0; j < n; j++ {
+					cSlice[i*n+j] += r * bSlice[kv*n+j] // accumulates, so C must start zeroed to yield A*B
+				}
+			}
+		}
+	} else {
+		for i := 0; i < n; i++ { // i-j-k order: the innermost loop steps through B with stride n
+			for j := 0; j < n; j++ {
+				var sum int32
+				for kv := 0; kv < n; kv++ {
+					sum += aSlice[i*n+kv] * bSlice[kv*n+j]
+				}
+				cSlice[i*n+j] = sum // overwrites whatever C held before
+			}
+		}
+	}
+}