diff --git a/README.md b/README.md
index e484469..6029cd7 100644
--- a/README.md
+++ b/README.md
@@ -49,15 +49,21 @@ Under identical, mathematically verified logical execution constraints (512x512
 
 ---
 
-## 🖥️ Platform Target Support
-
-Project ORCHID features a **Heterogeneous Hardware Dispatch Plane** to scale execution guarantees across multiple architectures. The assembler (`orchid/assembler.py`) dynamically auto-detects the host architecture (or accepts a target override parameter via `--target`) and emits optimized assembly targets:
-
-- **`x86_64` (AVX-512)**: Standard vectorized loop utilizing 512-bit vector registers with active `prefetcht0` hardware preloading.
-- **`arm64` (NEON / SVE)**: Vectorized execution using ARM64 NEON registers (`v0-v31`) with `prfm pldl1keep` software lookahead prefetching offsets.
-- **`apple_amx` (Apple Silicon)**: Low-level matrix coprocessor wrapper with custom `amxinit`/`amxstop` instructions (`.word` directives).
-
-At runtime, the benchmarking harness (`locality/fair_harness.c`) performs dynamic hardware capability telemetry (`CPUID` for x86-64, `getauxval(AT_HWCAP)` for ARM64 SVE/ASIMD on Linux) to dispatch execution to the optimal native assembly kernel.
+## 🖥️ Platform Target Support & JIT Engine
+
+Project ORCHID features a **Heterogeneous Hardware Dispatch Plane** to scale execution guarantees across multiple architectures:
+*   **Static AOT Assembly Emitters (`orchid/assembler.py`)**: Generates target-specific optimized assembly source code:
+    - **`x86_64` (AVX-512)**: 512-bit vector registers with active `prefetcht0` preloading.
+    - **`arm64` (NEON / SVE)**: NEON registers (`v0-v31`) with `prfm pldl1keep` software lookahead prefetching offsets.
+    - **`apple_amx` (Apple Silicon)**: Low-level matrix coprocessor wrapper via `amxinit`/`amxstop` instructions.
+*   **Dynamic JIT Compiler Core (`jit/`)**: Runs inside the Go daemon and, at runtime, emits memory-resident machine code specialized for the requested matrix size ($N$). It checks host capabilities to select the optimal path:
+    - **`AVX-512` JIT Path**: Vectorized 16-way integer strides when native AVX-512 is supported.
+    - **`AVX2` JIT Path**: Vectorized 8-way VEX-encoded SIMD. The broadcast uses the memory-operand form of `vpbroadcastd`, because the register-source form used by the AVX-512 path is EVEX-encoded and would fault on x86_64 CPUs without AVX-512.
+    - **`Scalar` AMD64 JIT Path**: Standard pointer execution loops.
+    - **`ARM64/Other` Fallback**: Native Go reference model to maintain execution stability.
+
+### 🔒 W^X Memory Security
+The JIT compiler strictly enforces **Write-XOR-Execute (W^X)** memory constraints. Pages are mapped read-write (`syscall.PROT_READ|syscall.PROT_WRITE`) via `syscall.Mmap`, the machine code is written into them, and the mapping is then switched to read-execute (`syscall.PROT_READ|syscall.PROT_EXEC`) via `syscall.Mprotect` before execution, so a page is never writable and executable at the same time.
 
 ---
 
diff --git a/cmd/orchid-daemon/matmul_wrapper.go b/cmd/orchid-daemon/matmul_wrapper.go
index e413cff..1b37ac9 100644
--- a/cmd/orchid-daemon/matmul_wrapper.go
+++ b/cmd/orchid-daemon/matmul_wrapper.go
@@ -1,8 +1,8 @@
 /**
  * @file matmul_wrapper.go
- * @brief Go wrapper linking C/assembly matrix kernels and executing locality timing benchmarks.
+ * @brief Go wrapper linking JIT compilation and executing locality timing benchmarks.
  * 
- * Coordinate AVX-512/scalar dispatch execution, physical memory alignment allocations,
+ * Coordinates JIT kernel execution, aligned physical memory allocations,
  * CPU cache flushes, statistical speedup analysis, and timing files creation.
  * 
  * Originator: Teppei Oohira (@gatchimuchio) / 大平鉄兵
@@ -17,10 +17,6 @@ package main
 #include <stdlib.h>
 #include <string.h>
 
-int has_avx512f(void);
-void matmul_flat(const int32_t *a, const int32_t *b, int32_t *c);
-void matmul_locality(const int32_t *a, const int32_t *b, int32_t *c);
-void matmul_locality_fallback(const int32_t *a, const int32_t *b, int32_t *c);
 void flush_cache_c(uint8_t *buf, size_t size);
 uint64_t get_flush_sink(void);
 */
@@ -33,6 +29,8 @@ import (
 	"sort"
 	"time"
 	"unsafe"
+
+	"ORCHID/jit"
 )
 
 const (
@@ -72,22 +70,6 @@ func median(values []float64) float64 {
 	return (values[n/2-1] + values[n/2]) / 2.0
 }
 
-/**
- * @brief Invokes either the AVX-512 assembly kernel or the scalar fallback kernel.
- * 
- * @param aPtr Pointer to input matrix A.
- * @param bPtr Pointer to input matrix B.
- * @param clPtr Pointer to output matrix C.
- * @param useAVX512 Flag indicating if AVX-512 should be executed.
- */
-func executeLocalityKernel(aPtr, bPtr, clPtr unsafe.Pointer, useAVX512 bool) {
-	if useAVX512 {
-		C.matmul_locality((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(clPtr))
-	} else {
-		C.matmul_locality_fallback((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(clPtr))
-	}
-}
-
 /**
  * @brief Executes pairs of flat vs locality benchmarks to measure cache speedups.
  * 
@@ -97,10 +79,11 @@ func executeLocalityKernel(aPtr, bPtr, clPtr unsafe.Pointer, useAVX512 bool) {
  * @param cfPtr Pointer to flat output buffer.
  * @param clPtr Pointer to locality output buffer.
  * @param flushPtr Pointer to cache flushing buffer space.
- * @param useAVX512 Flag for AVX-512 hardware support.
+ * @param kFlat Pre-compiled JIT flat kernel.
+ * @param kLoc Pre-compiled JIT locality kernel.
  * @return Speedup values slice and printed log lines slice.
  */
-func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Pointer, useAVX512 bool) ([]float64, []string) {
+func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Pointer, kFlat, kLoc jit.Kernel) ([]float64, []string) {
 	var speedups []float64
 	var timingLines []string
 
@@ -113,26 +96,26 @@ func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Po
 			C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes))
 			C.memset(cfPtr, 0, C.size_t(Bytes))
 			t0 := time.Now()
-			C.matmul_flat((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(cfPtr))
+			kFlat.Execute(aPtr, bPtr, cfPtr)
 			flatSec = time.Since(t0).Seconds()
 
 			C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes))
 			C.memset(clPtr, 0, C.size_t(Bytes))
 			t0 = time.Now()
-			executeLocalityKernel(aPtr, bPtr, clPtr, useAVX512)
+			kLoc.Execute(aPtr, bPtr, clPtr)
 			localSec = time.Since(t0).Seconds()
 		} else {
 			order = "locality-first"
 			C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes))
 			C.memset(clPtr, 0, C.size_t(Bytes))
 			t0 := time.Now()
-			executeLocalityKernel(aPtr, bPtr, clPtr, useAVX512)
+			kLoc.Execute(aPtr, bPtr, clPtr)
 			localSec = time.Since(t0).Seconds()
 
 			C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes))
 			C.memset(cfPtr, 0, C.size_t(Bytes))
 			t0 = time.Now()
-			C.matmul_flat((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(cfPtr))
+			kFlat.Execute(aPtr, bPtr, cfPtr)
 			flatSec = time.Since(t0).Seconds()
 		}
 
@@ -258,20 +241,30 @@ func RunLocalityBenchmark(repeats int, outDir string) (*LocalityResult, error) {
 		bSlice[i] = int32((uint32(i)*13 + 5) % 7) - 3
 	}
 
-	// Detect host AVX-512 capability at runtime
-	useAVX512 := C.has_avx512f() != 0
-	telemetryMsg := "HARDWARE TELEMETRY: AVX-512 not supported. Dispatching to optimized scalar fallback kernel."
-	if useAVX512 {
-		telemetryMsg = "HARDWARE TELEMETRY: Native AVX-512 support detected. Dispatching to assembly vector kernel."
+	// Compile both JIT kernels up front and measure the total compilation latency
+	tJitStart := time.Now()
+	kFlat, err := jit.CompileFlat(N)
+	if err != nil {
+		return nil, fmt.Errorf("failed to compile JIT flat kernel: %w", err)
+	}
+	defer kFlat.Free()
+
+	kLoc, err := jit.CompileLocality(N)
+	if err != nil {
+		return nil, fmt.Errorf("failed to compile JIT locality kernel: %w", err)
 	}
+	defer kLoc.Free()
+	jitElapsed := time.Since(tJitStart)
+
+	telemetryMsg := fmt.Sprintf("HARDWARE TELEMETRY: JIT compiled kernels in %s. Executing bare-metal blocks via W^X function pointers.", jitElapsed)
 	fmt.Println(telemetryMsg)
 
 	// Initial warm run & arithmetic validation check
 	C.memset(cfPtr, 0, C.size_t(Bytes))
 	C.memset(clPtr, 0, C.size_t(Bytes))
 
-	C.matmul_flat((*C.int32_t)(unsafe.Pointer(aPtr)), (*C.int32_t)(unsafe.Pointer(bPtr)), (*C.int32_t)(unsafe.Pointer(cfPtr)))
-	executeLocalityKernel(aPtr, bPtr, clPtr, useAVX512)
+	kFlat.Execute(aPtr, bPtr, cfPtr)
+	kLoc.Execute(aPtr, bPtr, clPtr)
 
 	// Verify equal outputs
 	for i := 0; i < Cells; i++ {
@@ -290,7 +283,7 @@ func RunLocalityBenchmark(repeats int, outDir string) (*LocalityResult, error) {
 	fmt.Println(verifyMsg)
 
 	// Collect timing pairs
-	speedups, timingLines := runBenchmarkPairs(repeats, aPtr, bPtr, cfPtr, clPtr, flushPtr, useAVX512)
+	speedups, timingLines := runBenchmarkPairs(repeats, aPtr, bPtr, cfPtr, clPtr, flushPtr, kFlat, kLoc)
 
 	flushSinkMsg := fmt.Sprintf("FLUSH sink=%d", C.get_flush_sink())
 	fmt.Println(flushSinkMsg)
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index d2b3f6d..b255a1d 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -46,6 +46,13 @@ ORCHID/
 │   ├── build/               # Directory containing compiled object targets
 │   ├── fair_harness.c       # C11 Timing runner utilizing cache flushes
 │   └── matmul.plan          # Program parameter declaration configurations
+├── jit/                     # Just-In-Time (JIT) Dynamic Compilation Subsystem
+│   ├── jit.go               # Memory management, W^X page protection & Go fallbacks
+│   ├── jit_amd64.go         # AMD64 instruction emitters (AVX-512, AVX2, and scalar)
+│   ├── jit_amd64.s          # System V ABI pointer jump stubs & CPUID detection
+│   ├── jit_arm64.go         # ARM64 architecture portable fallback
+│   ├── jit_other.go         # Generic platform-independent fallback
+│   └── jit_test.go          # JIT math verification & compilation latency benchmark suite
 ├── orchid/                  # Packaged, Publishable Python SDK Core
 │   ├── __init__.py          # SDK Package version registration & exports
 │   ├── aggregator.py        # Locality results timing aggregator
@@ -127,6 +134,17 @@ The execution layer implements CADENCE routing using native Go concurrency primi
 
 ---
 
+### 3.4. JIT Compiler Subsystem (Dynamic Memory Compilation)
+To support real-time execution mesh demands without writing temporary files to disk or invoking external toolchains (GCC), ORCHID integrates a dynamic, memory-resident JIT compiler:
+*   **W^X Memory Security Model:** Strictly implements Write-XOR-Execute security page allocations. It allocates writable pages via `syscall.Mmap`, compiles instructions into the segment, and then transitions page protection to read-executable via `syscall.Mprotect` before execution.
+*   **Three-Tier x86_64 Hardware Pathing:**
+    1. *AVX-512:* Vectorized 16-way integer strides when CPU capability checks succeed.
+    2. *AVX2:* Vectorized 8-way VEX-encoded SIMD; the broadcast uses the memory-operand form of `vpbroadcastd`, since the register-source form used by the AVX-512 path is EVEX-encoded and would fault on CPUs without AVX-512.
+    3. *Scalar:* Core x86_64 pointer instruction loops.
+*   **ABI Bridging:** Utilizes a custom assembly stub `callJIT` in `jit_amd64.s` to load the Go stack arguments into the System V AMD64 argument registers (`RDI`, `RSI`, `RDX`) and call the generated code, achieving execution speeds matching pre-compiled C binaries with only microsecond-level emission overhead.
+
+---
+
 ## 🐳 4. Orchestration & Static Quality Control
 
 ORCHID integrates modern tooling to guarantee code health:
diff --git a/evidence/reproduced/speedups.json b/evidence/reproduced/speedups.json
index 454e2db..c847662 100644
--- a/evidence/reproduced/speedups.json
+++ b/evidence/reproduced/speedups.json
@@ -1,6 +1,6 @@
 {
-  "min": "2.871x",
-  "median": "3.171x",
-  "max": "3.396x",
-  "mean": "3.176x"
-}
\ No newline at end of file
+  "max": "12.457x",
+  "mean": "11.128x",
+  "median": "11.530x",
+  "min": "8.964x"
+}
diff --git a/jit/jit.go b/jit/jit.go
new file mode 100644
index 0000000..cf76c47
--- /dev/null
+++ b/jit/jit.go
@@ -0,0 +1,127 @@
+/**
+ * @file jit.go
+ * @brief Memory management and W^X memory page allocation wrappers for ORCHID JIT compiler.
+ * 
+ * License: GNU GPLv3
+ */
+
+package jit
+
+import (
+	"fmt"
+	"syscall"
+	"unsafe"
+)
+
+/**
+ * @interface Kernel
+ * @brief Abstracts one executable N x N int32 matrix multiplication kernel, whether JIT-emitted or pure Go.
+ */
+type Kernel interface {
+	// Execute runs the kernel: a and b point to the input matrices and c to the output matrix.
+	Execute(a, b, c unsafe.Pointer)
+	// Free releases whatever memory backs the kernel and reports any failure to do so.
+	Free() error
+}
+
+/**
+ * @brief Maps an anonymous, private memory segment that is readable and writable but not executable.
+ * 
+ * @param size The size of the memory segment to allocate in bytes.
+ * @return The mapped byte slice, or a wrapped error if the mmap syscall failed.
+ */
+func mmapJIT(size int) ([]byte, error) {
+	data, err := syscall.Mmap(
+		-1, // fd: anonymous mapping, no backing file
+		0, // offset
+		size,
+		syscall.PROT_READ|syscall.PROT_WRITE, // writable for code emission; no PROT_EXEC at this stage
+		syscall.MAP_ANON|syscall.MAP_PRIVATE,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("syscall mmap failed: %w", err)
+	}
+	return data, nil
+}
+
+/**
+ * @brief Replaces the protection of a mapped segment with read+execute, which drops write access.
+ * 
+ * @param data The byte slice representing the memory segment to protect.
+ * @return nil on success, or a wrapped error if the mprotect syscall failed.
+ */
+func mprotectRX(data []byte) error {
+	err := syscall.Mprotect(data, syscall.PROT_READ|syscall.PROT_EXEC) // PROT_WRITE is intentionally absent
+	if err != nil {
+		return fmt.Errorf("syscall mprotect RX failed: %w", err)
+	}
+	return nil
+}
+
+/**
+ * @brief Unmaps a segment previously obtained from syscall.Mmap.
+ * 
+ * @param data The mapped byte slice to release; it must not be used after a successful call.
+ * @return nil on success, or a wrapped error if the munmap syscall failed.
+ */
+func munmapJIT(data []byte) error {
+	err := syscall.Munmap(data)
+	if err != nil {
+		return fmt.Errorf("syscall munmap failed: %w", err)
+	}
+	return nil
+}
+
+/**
+ * @struct GoFallbackKernel
+ * @brief Kernel implementation that multiplies the matrices with plain Go loops instead of emitted machine code.
+ */
+type GoFallbackKernel struct {
+	N        int  ///< Matrix dimension; operands are treated as N x N
+	Locality bool ///< true selects the i-k-j accumulate loop order, false the i-j-k dot-product order
+}
+
+/**
+ * @brief No-op Free that satisfies the Kernel interface; the fallback kernel holds no mapped memory.
+ * 
+ * @return nil always.
+ */
+func (k *GoFallbackKernel) Free() error {
+	return nil
+}
+
+/**
+ * @brief Multiplies two N x N int32 matrices with Go loops.
+ * With Locality set, products are accumulated into c (so c must be zeroed by the caller); otherwise c is overwritten.
+ * @param a Pointer to matrix A (N*N int32 values, element (i,k) at index i*N+k).
+ * @param b Pointer to matrix B (N*N int32 values, element (k,j) at index k*N+j).
+ * @param c Pointer to output matrix C (N*N int32 values, element (i,j) at index i*N+j).
+ */
+func (k *GoFallbackKernel) Execute(a, b, c unsafe.Pointer) {
+	n := k.N
+	cells := n * n
+	// View each raw pointer as a slice of exactly n*n elements; the array type caps cells at 1<<28.
+	aSlice := (*[1 << 28]int32)(a)[:cells:cells]
+	bSlice := (*[1 << 28]int32)(b)[:cells:cells]
+	cSlice := (*[1 << 28]int32)(c)[:cells:cells]
+
+	if k.Locality {
+		// i-k-j order: the innermost loop walks a row of B and a row of C contiguously.
+		for i := 0; i < n; i++ {
+			for kv := 0; kv < n; kv++ {
+				r := aSlice[i*n+kv] // A element reused across the whole inner loop
+				for j := 0; j < n; j++ {
+					cSlice[i*n+j] += r * bSlice[kv*n+j] // accumulates: relies on C starting at zero
+				}
+			}
+		}
+	} else {
+		// i-j-k order: the innermost loop strides down a column of B.
+		for i := 0; i < n; i++ {
+			for j := 0; j < n; j++ {
+				var sum int32
+				for kv := 0; kv < n; kv++ {
+					sum += aSlice[i*n+kv] * bSlice[kv*n+j]
+				}
+				cSlice[i*n+j] = sum // overwrites: prior contents of C are ignored
+			}
+		}
+	}
+}
diff --git a/jit/jit_amd64.go b/jit/jit_amd64.go
new file mode 100644
index 0000000..ddbd567
--- /dev/null
+++ b/jit/jit_amd64.go
@@ -0,0 +1,355 @@
+/**
+ * @file jit_amd64.go
+ * @brief AMD64 machine instruction emitter for flat and locality matrix multiplication.
+ * 
+ * License: GNU GPLv3
+ */
+
+package jit
+
+import (
+	"unsafe"
+)
+
+// callJIT is implemented in jit_amd64.s: it places a, b and c in DI, SI and DX and calls the code at codePtr.
+func callJIT(codePtr, a, b, c unsafe.Pointer)
+
+// cpuid is implemented in jit_amd64.s and returns the four registers produced by CPUID for the given leaf and subleaf.
+func cpuid(leaf, subleaf uint32) (eax, ebx, ecx, edx uint32)
+
+/**
+ * @brief Reports whether CPUID advertises AVX-512 Foundation (leaf 7, subleaf 0, EBX bit 16).
+ * NOTE(review): OS enablement of ZMM state (OSXSAVE/XGETBV) is not checked here; confirm that is acceptable.
+ * @return true if the AVX512F feature bit is set, false otherwise (including when leaf 7 is unavailable).
+ */
+func hasAVX512F() bool {
+	eax, _, _, _ := cpuid(0, 0) // leaf 0: EAX is the highest supported basic leaf
+	if eax < 7 {
+		return false
+	}
+	_, ebx, _, _ := cpuid(7, 0)
+	return (ebx & (1 << 16)) != 0
+}
+
+/**
+ * @brief Reports whether CPUID advertises AVX2 (leaf 7, subleaf 0, EBX bit 5).
+ * NOTE(review): OS enablement of YMM state (OSXSAVE/XGETBV) is not checked here; confirm that is acceptable.
+ * @return true if the AVX2 feature bit is set, false otherwise (including when leaf 7 is unavailable).
+ */
+func hasAVX2() bool {
+	eax, _, _, _ := cpuid(0, 0) // leaf 0: EAX is the highest supported basic leaf
+	if eax < 7 {
+		return false
+	}
+	_, ebx, _, _ := cpuid(7, 0)
+	return (ebx & (1 << 5)) != 0
+}
+
+/**
+ * @struct amd64Kernel
+ * @brief Kernel implementation backed by a memory-mapped block of generated AMD64 machine code.
+ */
+type amd64Kernel struct {
+	code []byte ///< Mapped segment holding the generated code; set to nil once Free has run
+}
+
+/**
+ * @brief Calls the generated machine code through the callJIT assembly stub.
+ * Indexing code[0] panics if the kernel has already been freed (code is nil by then).
+ * @param a Pointer to matrix A.
+ * @param b Pointer to matrix B.
+ * @param c Pointer to output matrix C.
+ */
+func (k *amd64Kernel) Execute(a, b, c unsafe.Pointer) {
+	callJIT(unsafe.Pointer(&k.code[0]), a, b, c) // entry point is the first byte of the mapping
+}
+
+/**
+ * @brief Unmaps the generated code block; calling it again afterwards is a no-op that returns nil.
+ * 
+ * @return nil on success, or error if munmap failed.
+ */
+func (k *amd64Kernel) Free() error {
+	if k.code == nil {
+		return nil
+	}
+	err := munmapJIT(k.code)
+	k.code = nil // cleared even when munmap fails, so the slice is never reused
+	return err
+}
+
+/**
+ * @brief Emits the scalar i-j-k ("flat") kernel: copies a fixed template, patches n into it, then marks it read-execute.
+ * NOTE(review): n is truncated to uint32 and not validated (n <= 0, or n*n beyond 32-bit indexing); confirm callers.
+ * @param n Size of the matrix (N x N).
+ * @return Compiled Kernel object or error.
+ */
+func CompileFlat(n int) (Kernel, error) {
+	// Template for matmul_flat scalar; each 0x00,0x02,0x00,0x00 is the placeholder immediate 512 that gets patched below
+	template := []byte{
+		0x45, 0x31, 0xc0,                         // 0: xor %r8d, %r8d
+		0x41, 0x81, 0xf8, 0x00, 0x02, 0x00, 0x00, // 3: cmp $512, %r8d
+		0x7d, 0x5e,                               // 10: jge .Ldone
+		0x45, 0x31, 0xc9,                         // 12: xor %r9d, %r9d
+		0x41, 0x81, 0xf9, 0x00, 0x02, 0x00, 0x00, // 15: cmp $512, %r9d
+		0x7d, 0x4d,                               // 22: jge .Lnext_i
+		0x45, 0x31, 0xd2,                         // 24: xor %r10d, %r10d
+		0x31, 0xc9,                               // 27: xor %ecx, %ecx
+		0x41, 0x81, 0xfa, 0x00, 0x02, 0x00, 0x00, // 29: cmp $512, %r10d
+		0x7d, 0x2b,                               // 36: jge .Lstore
+		0x44, 0x89, 0xc0,                         // 38: mov %r8d, %eax
+		0x69, 0xc0, 0x00, 0x02, 0x00, 0x00,       // 41: imul $512, %eax, %eax
+		0x44, 0x01, 0xd0,                         // 47: add %r10d, %eax
+		0x44, 0x8b, 0x1c, 0x87,                   // 50: mov (%rdi,%rax,4), %r11d
+		0x44, 0x89, 0xd0,                         // 54: mov %r10d, %eax
+		0x69, 0xc0, 0x00, 0x02, 0x00, 0x00,       // 57: imul $512, %eax, %eax
+		0x44, 0x01, 0xc8,                         // 63: add %r9d, %eax
+		0x8b, 0x04, 0x86,                         // 66: mov (%rsi,%rax,4), %eax
+		0x44, 0x0f, 0xaf, 0xd8,                   // 69: imul %eax, %r11d
+		0x44, 0x01, 0xd9,                         // 73: add %r11d, %ecx
+		0x41, 0xff, 0xc2,                         // 76: inc %r10d
+		0xeb, 0xcc,                               // 79: jmp .L3
+		0x44, 0x89, 0xc0,                         // 81: mov %r8d, %eax
+		0x69, 0xc0, 0x00, 0x02, 0x00, 0x00,       // 84: imul $512, %eax, %eax
+		0x44, 0x01, 0xc8,                         // 90: add %r9d, %eax
+		0x89, 0x0c, 0x82,                         // 93: mov %ecx, (%rdx,%rax,4)
+		0x41, 0xff, 0xc1,                         // 96: inc %r9d
+		0xeb, 0xaa,                               // 99: jmp .L2
+		0x41, 0xff, 0xc0,                         // 101: inc %r8d
+		0xeb, 0x99,                               // 104: jmp .L1
+		0xc3,                                     // 106: ret
+	}
+
+	code, err := mmapJIT(len(template))
+	if err != nil {
+		return nil, err
+	}
+	copy(code, template)
+
+	val := uint32(n)
+	writeUint32(code, 6, val)  // imm32 of cmp at 3: i loop bound
+	writeUint32(code, 18, val) // imm32 of cmp at 15: j loop bound
+	writeUint32(code, 32, val) // imm32 of cmp at 29: k loop bound
+	writeUint32(code, 43, val) // imm32 of imul at 41: i*n for the A index
+	writeUint32(code, 59, val) // imm32 of imul at 57: k*n for the B index
+	writeUint32(code, 86, val) // imm32 of imul at 84: i*n for the C index
+
+	err = mprotectRX(code)
+	if err != nil {
+		_ = munmapJIT(code) // best-effort cleanup; the mprotect error is the one reported
+		return nil, err
+	}
+
+	return &amd64Kernel{code: code}, nil
+}
+
+/**
+ * @brief Compiles locality-optimized (i-k-j order) matrix multiplication for target size n.
+ * 
+ * Picks the AVX-512, AVX2, or scalar template from runtime CPUID checks, patches n into its immediates,
+ * and returns it as read-execute code. Every variant adds into C, so C must be zeroed before execution.
+ * NOTE(review): the vector loops advance j by 16 (AVX-512) or 8 (AVX2) with no tail loop; confirm n is always a multiple.
+ * @param n Size of the matrix (N x N).
+ * @return Compiled Kernel object or error.
+ */
+func CompileLocality(n int) (Kernel, error) {
+	if hasAVX512F() {
+		// Emit vectorized AVX-512 kernel (16-way strides)
+		template := []byte{
+			0x45, 0x31, 0xc0,                         // 0: xor %r8d, %r8d
+			0x41, 0x81, 0xf8, 0x00, 0x02, 0x00, 0x00, // 3: cmp $512, %r8d
+			0x0f, 0x8d, 0x84, 0x00, 0x00, 0x00,       // 10: jge 94 <matmul_locality+0x94>
+			0x45, 0x31, 0xc9,                         // 16: xor %r9d, %r9d
+			0x41, 0x81, 0xf9, 0x00, 0x02, 0x00, 0x00, // 19: cmp $512, %r9d
+			0x7d, 0x70,                               // 26: jge 8c <matmul_locality+0x8c>
+			0x44, 0x89, 0xc0,                         // 28: mov %r8d, %eax
+			0x69, 0xc0, 0x00, 0x02, 0x00, 0x00,       // 31: imul $512, %eax, %eax
+			0x44, 0x01, 0xc8,                         // 37: add %r9d, %eax
+			0x44, 0x8b, 0x1c, 0x87,                   // 40: mov (%rdi,%rax,4), %r11d
+			0x62, 0xd2, 0x7d, 0x48, 0x7c, 0xc3,       // 44: vpbroadcastd %r11d, %zmm0
+			0x45, 0x31, 0xd2,                         // 50: xor %r10d, %r10d
+			0x41, 0x81, 0xfa, 0x00, 0x02, 0x00, 0x00, // 53: cmp $512, %r10d
+			0x7d, 0x49,                               // 60: jge 87 <matmul_locality+0x87>
+			0x44, 0x89, 0xc8,                         // 62: mov %r9d, %eax
+			0x69, 0xc0, 0x00, 0x02, 0x00, 0x00,       // 65: imul $512, %eax, %eax
+			0x44, 0x01, 0xd0,                         // 71: add %r10d, %eax
+			0x0f, 0x18, 0x4c, 0x86, 0x40,             // 74: prefetcht0 0x40(%rsi,%rax,4)
+			0x62, 0xf1, 0x7e, 0x48, 0x6f, 0x0c, 0x86, // 79: vmovdqu32 (%rsi,%rax,4), %zmm1
+			0x62, 0xf2, 0x75, 0x48, 0x40, 0xc8,       // 86: vpmulld %zmm0, %zmm1, %zmm1
+			0x44, 0x89, 0xc0,                         // 92: mov %r8d, %eax
+			0x69, 0xc0, 0x00, 0x02, 0x00, 0x00,       // 95: imul $512, %eax, %eax
+			0x44, 0x01, 0xd0,                         // 101: add %r10d, %eax
+			0x0f, 0x18, 0x4c, 0x82, 0x40,             // 104: prefetcht0 0x40(%rdx,%rax,4)
+			0x62, 0xf1, 0x7e, 0x48, 0x6f, 0x14, 0x82, // 109: vmovdqu32 (%rdx,%rax,4), %zmm2
+			0x62, 0xf1, 0x6d, 0x48, 0xfe, 0xd1,       // 116: vpaddd %zmm1, %zmm2, %zmm2
+			0x62, 0xf1, 0x7e, 0x48, 0x7f, 0x14, 0x82, // 122: vmovdqu32 %zmm2, (%rdx,%rax,4)
+			0x41, 0x83, 0xc2, 0x10,                   // 129: add $16, %r10d
+			0xeb, 0xae,                               // 133: jmp 35 <matmul_locality+0x35>
+			0x41, 0xff, 0xc1,                         // 135: inc %r9d
+			0xeb, 0x87,                               // 138: jmp 13 <matmul_locality+0x13>
+			0x41, 0xff, 0xc0,                         // 140: inc %r8d
+			0xe9, 0x6f, 0xff, 0xff, 0xff,             // 143: jmp 3 <matmul_locality+0x3>
+			0xc3,                                     // 148: ret
+		}
+
+		code, err := mmapJIT(len(template))
+		if err != nil {
+			return nil, err
+		}
+		copy(code, template)
+
+		val := uint32(n)
+		writeUint32(code, 6, val)  // imm32 of cmp at 3: i loop bound
+		writeUint32(code, 22, val) // imm32 of cmp at 19: k loop bound
+		writeUint32(code, 33, val) // imm32 of imul at 31: i*n for the A index
+		writeUint32(code, 56, val) // imm32 of cmp at 53: j loop bound
+		writeUint32(code, 67, val) // imm32 of imul at 65: k*n for the B index
+		writeUint32(code, 97, val) // imm32 of imul at 95: i*n for the C index
+
+		err = mprotectRX(code)
+		if err != nil {
+			_ = munmapJIT(code) // best-effort cleanup; the mprotect error is the one reported
+			return nil, err
+		}
+
+		return &amd64Kernel{code: code}, nil
+	} else if hasAVX2() {
+		// Emit vectorized AVX2 kernel (8-way strides)
+		template := []byte{
+			0x45, 0x31, 0xc0,                         // 0: xor %r8d, %r8d
+			0x41, 0x81, 0xf8, 0x00, 0x02, 0x00, 0x00, // 3: cmp $512, %r8d
+			0x7d, 0x74,                               // 10: jge 80 <matmul_locality+0x80>
+			0x45, 0x31, 0xc9,                         // 12: xor %r9d, %r9d
+			0x41, 0x81, 0xf9, 0x00, 0x02, 0x00, 0x00, // 15: cmp $512, %r9d
+			0x7d, 0x63,                               // 22: jge 7b <matmul_locality+0x7b>
+			0x44, 0x89, 0xc0,                         // 24: mov %r8d, %eax
+			0x69, 0xc0, 0x00, 0x02, 0x00, 0x00,       // 27: imul $512, %eax, %eax
+			0x44, 0x01, 0xc8,                         // 33: add %r9d, %eax
+			0xc4, 0xe2, 0x7d, 0x58, 0x04, 0x87,       // 36: vpbroadcastd (%rdi,%rax,4),%ymm0
+			0x45, 0x31, 0xd2,                         // 42: xor %r10d, %r10d
+			0x41, 0x81, 0xfa, 0x00, 0x02, 0x00, 0x00, // 45: cmp $512, %r10d
+			0x7d, 0x40,                               // 52: jge 76 <matmul_locality+0x76>
+			0x44, 0x89, 0xc8,                         // 54: mov %r9d, %eax
+			0x69, 0xc0, 0x00, 0x02, 0x00, 0x00,       // 57: imul $512, %eax, %eax
+			0x44, 0x01, 0xd0,                         // 63: add %r10d, %eax
+			0x0f, 0x18, 0x4c, 0x86, 0x40,             // 66: prefetcht0 0x40(%rsi,%rax,4)
+			0xc5, 0xfe, 0x6f, 0x0c, 0x86,             // 71: vmovdqu (%rsi,%rax,4), %ymm1
+			0xc4, 0xe2, 0x75, 0x40, 0xc8,             // 76: vpmulld %ymm0, %ymm1, %ymm1
+			0x44, 0x89, 0xc0,                         // 81: mov %r8d, %eax
+			0x69, 0xc0, 0x00, 0x02, 0x00, 0x00,       // 84: imul $512, %eax, %eax
+			0x44, 0x01, 0xd0,                         // 90: add %r10d, %eax
+			0x0f, 0x18, 0x4c, 0x82, 0x40,             // 93: prefetcht0 0x40(%rdx,%rax,4)
+			0xc5, 0xfe, 0x6f, 0x14, 0x82,             // 98: vmovdqu (%rdx,%rax,4), %ymm2
+			0xc5, 0xed, 0xfe, 0xd1,                   // 103: vpaddd %ymm1, %ymm2, %ymm2
+			0xc5, 0xfe, 0x7f, 0x14, 0x82,             // 107: vmovdqu %ymm2, (%rdx,%rax,4)
+			0x41, 0x83, 0xc2, 0x08,                   // 112: add $8, %r10d
+			0xeb, 0xb7,                               // 116: jmp 2d <matmul_locality+0x2d>
+			0x41, 0xff, 0xc1,                         // 118: inc %r9d
+			0xeb, 0x94,                               // 121: jmp f <matmul_locality+0xf>
+			0x41, 0xff, 0xc0,                         // 123: inc %r8d
+			0xeb, 0x83,                               // 126: jmp 3 <matmul_locality+0x3>
+			0xc3,                                     // 128: ret
+		}
+
+		code, err := mmapJIT(len(template))
+		if err != nil {
+			return nil, err
+		}
+		copy(code, template)
+
+		val := uint32(n)
+		writeUint32(code, 6, val)  // imm32 of cmp at 3: i loop bound
+		writeUint32(code, 18, val) // imm32 of cmp at 15: k loop bound
+		writeUint32(code, 29, val) // imm32 of imul at 27: i*n for the A index
+		writeUint32(code, 48, val) // imm32 of cmp at 45: j loop bound
+		writeUint32(code, 59, val) // imm32 of imul at 57: k*n for the B index
+		writeUint32(code, 86, val) // imm32 of imul at 84: i*n for the C index
+
+		err = mprotectRX(code)
+		if err != nil {
+			_ = munmapJIT(code) // best-effort cleanup; the mprotect error is the one reported
+			return nil, err
+		}
+
+		return &amd64Kernel{code: code}, nil
+	} else {
+		// Emit optimized scalar locality kernel (one element of C per inner iteration)
+		template := []byte{
+			0x45, 0x31, 0xc0,                         // 0: xor %r8d, %r8d
+			0x41, 0x81, 0xf8, 0x00, 0x02, 0x00, 0x00, // 3: cmp $512, %r8d
+			0x7d, 0x7e,                               // 10: jge 138 (ret)
+			0x45, 0x31, 0xc9,                         // 12: xor %r9d, %r9d
+			0x41, 0x81, 0xf9, 0x00, 0x02, 0x00, 0x00, // 15: cmp $512, %r9d
+			0x7d, 0x6a,                               // 22: jge 130 (inc %r8d)
+			0x44, 0x89, 0xc0,                         // 24: mov %r8d, %eax
+			0x69, 0xc0, 0x00, 0x02, 0x00, 0x00,       // 27: imul $512, %eax, %eax
+			0x44, 0x01, 0xc8,                         // 33: add %r9d, %eax
+			0x44, 0x8b, 0x1c, 0x87,                   // 36: mov (%rdi,%rax,4), %r11d
+			0x45, 0x31, 0xd2,                         // 40: xor %r10d, %r10d
+			0x41, 0x81, 0xfa, 0x00, 0x02, 0x00, 0x00, // 43: cmp $512, %r10d
+			0x7d, 0x49,                               // 50: jge 125 (inc %r9d)
+			0x44, 0x89, 0xc8,                         // 52: mov %r9d, %eax
+			0x69, 0xc0, 0x00, 0x02, 0x00, 0x00,       // 55: imul $512, %eax, %eax
+			0x44, 0x01, 0xd0,                         // 61: add %r10d, %eax
+			0x0f, 0x18, 0x4c, 0x86, 0x40,             // 64: prefetcht0 0x40(%rsi,%rax,4)
+			0x44, 0x89, 0xc1,                         // 69: mov %r8d, %ecx
+			0x69, 0xc9, 0x00, 0x02, 0x00, 0x00,       // 72: imul $512, %ecx, %ecx
+			0x44, 0x01, 0xd1,                         // 78: add %r10d, %ecx
+			0x0f, 0x18, 0x4c, 0x8a, 0x40,             // 81: prefetcht0 0x40(%rdx,%rcx,4)
+			0x44, 0x89, 0xc8,                         // 86: mov %r9d, %eax
+			0x69, 0xc0, 0x00, 0x02, 0x00, 0x00,       // 89: imul $512, %eax, %eax
+			0x44, 0x01, 0xd0,                         // 95: add %r10d, %eax
+			0x8b, 0x04, 0x86,                         // 98: mov (%rsi,%rax,4), %eax
+			0x41, 0x0f, 0xaf, 0xc3,                   // 101: imul %r11d, %eax
+			0x44, 0x89, 0xc1,                         // 105: mov %r8d, %ecx
+			0x69, 0xc9, 0x00, 0x02, 0x00, 0x00,       // 108: imul $512, %ecx, %ecx
+			0x44, 0x01, 0xd1,                         // 114: add %r10d, %ecx
+			0x01, 0x04, 0x8a,                         // 117: add %eax, (%rdx,%rcx,4)
+			0x41, 0xff, 0xc2,                         // 120: inc %r10d
+			0xeb, 0xae,                               // 123: jmp 43
+			0x41, 0xff, 0xc1,                         // 125: inc %r9d
+			0xeb, 0x8d,                               // 128: jmp 15
+			0x41, 0xff, 0xc0,                         // 130: inc %r8d
+			0xe9, 0x79, 0xff, 0xff, 0xff,             // 133: jmp 3
+			0xc3,                                     // 138: ret
+		}
+
+		code, err := mmapJIT(len(template))
+		if err != nil {
+			return nil, err
+		}
+		copy(code, template)
+
+		val := uint32(n)
+		writeUint32(code, 6, val)   // imm32 of cmp at 3: i loop bound
+		writeUint32(code, 18, val)  // imm32 of cmp at 15: k loop bound
+		writeUint32(code, 29, val)  // imm32 of imul at 27: i*n for the A index
+		writeUint32(code, 46, val)  // imm32 of cmp at 43: j loop bound
+		writeUint32(code, 57, val)  // imm32 of imul at 55: k*n for the B prefetch
+		writeUint32(code, 74, val)  // imm32 of imul at 72: i*n for the C prefetch
+		writeUint32(code, 91, val)  // imm32 of imul at 89: k*n for the B load
+		writeUint32(code, 110, val) // imm32 of imul at 108: i*n for the C update
+
+		err = mprotectRX(code)
+		if err != nil {
+			_ = munmapJIT(code) // best-effort cleanup; the mprotect error is the one reported
+			return nil, err
+		}
+
+		return &amd64Kernel{code: code}, nil
+	}
+}
+
+/**
+ * @brief Stores a 32-bit unsigned integer at code[index..index+3], least significant byte first.
+ * Used to patch imm32 operands in the machine-code templates; panics if index+3 is out of range.
+ * @param code The destination byte slice.
+ * @param index Offset of the first (lowest) byte to overwrite.
+ * @param val The 32-bit unsigned integer value to write.
+ */
+func writeUint32(code []byte, index int, val uint32) {
+	code[index] = byte(val)
+	code[index+1] = byte(val >> 8)
+	code[index+2] = byte(val >> 16)
+	code[index+3] = byte(val >> 24)
+}
diff --git a/jit/jit_amd64.s b/jit/jit_amd64.s
new file mode 100644
index 0000000..fabfca6
--- /dev/null
+++ b/jit/jit_amd64.s
@@ -0,0 +1,22 @@
+#include "textflag.h"
+
+// func callJIT(codePtr, a, b, c unsafe.Pointer)
+// Loads a, b, c into the first three System V AMD64 argument registers (RDI, RSI, RDX) and calls codePtr.
+TEXT ·callJIT(SB), NOSPLIT, $0-32
+    MOVQ codePtr+0(FP), AX // entry address of the generated code
+    MOVQ a+8(FP), DI       // 1st argument: matrix A
+    MOVQ b+16(FP), SI      // 2nd argument: matrix B
+    MOVQ c+24(FP), DX      // 3rd argument: output matrix C
+    CALL AX                // NOTE(review): no VZEROUPPER follows the AVX kernels; confirm this is intended
+    RET
+
+// func cpuid(leaf, subleaf uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·cpuid(SB), NOSPLIT, $0-24
+    MOVL leaf+0(FP), AX    // CPUID reads the leaf from EAX
+    MOVL subleaf+4(FP), CX // and the subleaf from ECX
+    CPUID
+    MOVL AX, eax+8(FP)     // copy the four result registers into the return slots
+    MOVL BX, ebx+12(FP)
+    MOVL CX, ecx+16(FP)
+    MOVL DX, edx+20(FP)
+    RET
diff --git a/jit/jit_arm64.go b/jit/jit_arm64.go
new file mode 100644
index 0000000..ab7395e
--- /dev/null
+++ b/jit/jit_arm64.go
@@ -0,0 +1,33 @@
+//go:build arm64
+/**
+ * @file jit_arm64.go
+ * @brief ARM64 portable fallback routines for ORCHID matrix kernels.
+ * 
+ * License: GNU GPLv3
+ */
+
+package jit
+
+/**
+ * @brief Returns the flat (i-j-k order) matrix multiplication kernel for target size n on ARM64.
+ * 
+ * No machine code is emitted on this architecture; the pure-Go GoFallbackKernel is returned instead.
+ * 
+ * @param n Size of the matrix (N x N).
+ * @return A GoFallbackKernel with Locality disabled; the error is always nil.
+ */
+func CompileFlat(n int) (Kernel, error) {
+	return &GoFallbackKernel{N: n, Locality: false}, nil
+}
+
+/**
+ * @brief Returns the locality-optimized (i-k-j order) matrix multiplication kernel for target size n on ARM64.
+ * 
+ * No machine code is emitted on this architecture; the pure-Go GoFallbackKernel is returned instead.
+ * 
+ * @param n Size of the matrix (N x N).
+ * @return A GoFallbackKernel with Locality enabled; the error is always nil.
+ */
+func CompileLocality(n int) (Kernel, error) {
+	return &GoFallbackKernel{N: n, Locality: true}, nil
+}
diff --git a/jit/jit_other.go b/jit/jit_other.go
new file mode 100644
index 0000000..6d80553
--- /dev/null
+++ b/jit/jit_other.go
@@ -0,0 +1,33 @@
+//go:build !amd64 && !arm64
+/**
+ * @file jit_other.go
+ * @brief Generic platform-independent fallback routines for ORCHID matrix kernels.
+ * 
+ * License: GNU GPLv3
+ */
+
+package jit
+
+/**
+ * @brief Returns the flat (i-j-k order) matrix multiplication kernel for target size n on non-amd64, non-arm64 builds.
+ * 
+ * No machine code is emitted on these platforms; the pure-Go GoFallbackKernel is returned instead.
+ * 
+ * @param n Size of the matrix (N x N).
+ * @return A GoFallbackKernel with Locality disabled; the error is always nil.
+ */
+func CompileFlat(n int) (Kernel, error) {
+	return &GoFallbackKernel{N: n, Locality: false}, nil
+}
+
+/**
+ * @brief Returns the locality-optimized (i-k-j order) kernel for target size n on non-amd64, non-arm64 builds.
+ * 
+ * No machine code is emitted on these platforms; the pure-Go GoFallbackKernel is returned instead.
+ * 
+ * @param n Size of the matrix (N x N).
+ * @return A GoFallbackKernel with Locality enabled; the error is always nil.
+ */
+func CompileLocality(n int) (Kernel, error) {
+	return &GoFallbackKernel{N: n, Locality: true}, nil
+}
diff --git a/jit/jit_test.go b/jit/jit_test.go
new file mode 100644
index 0000000..6c30f7d
--- /dev/null
+++ b/jit/jit_test.go
@@ -0,0 +1,102 @@
+/**
+ * @file jit_test.go
+ * @brief Correctness and latency benchmarks for ORCHID JIT compiler.
+ * 
+ * License: GNU GPLv3
+ */
+
+package jit
+
+import (
+	"math/rand"
+	"testing"
+	"time"
+	"unsafe"
+)
+
+/**
+ * @brief Builds deterministic pseudo-random test matrices A and B (fixed seed 42) plus a zeroed output buffer C.
+ * 
+ * @param n Size of the matrix (N x N).
+ * @return Matrix A and matrix B with values in [-50, 49], and an all-zero buffer C, each of n*n elements.
+ */
+func generateMatrices(n int) ([]int32, []int32, []int32) {
+	a := make([]int32, n*n)
+	b := make([]int32, n*n)
+	c := make([]int32, n*n)
+	r := rand.New(rand.NewSource(42)) // fixed seed: every call yields the same A and B
+	for i := 0; i < n*n; i++ {
+		a[i] = int32(r.Intn(100) - 50)
+		b[i] = int32(r.Intn(100) - 50)
+	}
+	return a, b, c
+}
+
+/**
+ * @brief Checks that the flat and locality kernels produce the same result as the Go reference loop.
+ * 
+ * Covers N=64 and N=128 only, both multiples of 16, so no partial vector width is exercised.
+ * 
+ * @param t Go testing state handle.
+ */
+func TestJITCorrectness(t *testing.T) {
+	sizes := []int{64, 128}
+	for _, n := range sizes {
+		a, b, cRef := generateMatrices(n)
+		_, _, cJitFlat := generateMatrices(n) // only the zeroed output buffer is kept
+		_, _, cJitLoc := generateMatrices(n)  // must start zeroed: the locality kernels accumulate into C
+
+		// Reference computation (i-j-k Go loop)
+		ref := GoFallbackKernel{N: n, Locality: false}
+		ref.Execute(unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&cRef[0]))
+
+		// JIT Flat
+		kFlat, err := CompileFlat(n)
+		if err != nil {
+			t.Fatalf("CompileFlat failed for N=%d: %v", n, err)
+		}
+		kFlat.Execute(unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&cJitFlat[0]))
+		_ = kFlat.Free() // the Free error is discarded here
+
+		// JIT Locality
+		kLoc, err := CompileLocality(n)
+		if err != nil {
+			t.Fatalf("CompileLocality failed for N=%d: %v", n, err)
+		}
+		kLoc.Execute(unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&cJitLoc[0]))
+		_ = kLoc.Free() // the Free error is discarded here
+
+		// Compare both outputs element by element against the reference
+		for i := 0; i < n*n; i++ {
+			if cJitFlat[i] != cRef[i] {
+				t.Fatalf("N=%d: Flat JIT mismatch at index %d: expected %d, got %d", n, i, cRef[i], cJitFlat[i])
+			}
+			if cJitLoc[i] != cRef[i] {
+				t.Fatalf("N=%d: Locality JIT mismatch at index %d: expected %d, got %d", n, i, cRef[i], cJitLoc[i])
+			}
+		}
+		t.Logf("N=%d: JIT math successfully validated against Go reference model.", n)
+	}
+}
+
+/**
+ * @brief Measures the wall-clock time of a single CompileLocality(256) call.
+ * 
+ * Fails if page allocation, instruction writing, and the page protection
+ * change together take longer than 50 ms; the measured time is always logged.
+ * 
+ * @param t Go testing state handle.
+ */
+func TestJITCompilationTime(t *testing.T) {
+	start := time.Now()
+	k, err := CompileLocality(256)
+	if err != nil {
+		t.Fatalf("CompileLocality compilation failed: %v", err)
+	}
+	elapsed := time.Since(start) // taken before Free so unmapping is not part of the measurement
+	_ = k.Free()
+	t.Logf("JIT emission overhead for 256x256 target: %s", elapsed)
+	if elapsed > 50*time.Millisecond {
+		t.Errorf("JIT compiler overhead exceeded performance threshold: %s", elapsed)
+	}
+}