diff --git a/README.md b/README.md index e484469..6029cd7 100644 --- a/README.md +++ b/README.md @@ -49,15 +49,21 @@ Under identical, mathematically verified logical execution constraints (512x512 --- -## 🖥️ Platform Target Support - -Project ORCHID features a **Heterogeneous Hardware Dispatch Plane** to scale execution guarantees across multiple architectures. The assembler (`orchid/assembler.py`) dynamically auto-detects the host architecture (or accepts a target override parameter via `--target`) and emits optimized assembly targets: - -- **`x86_64` (AVX-512)**: Standard vectorized loop utilizing 512-bit vector registers with active `prefetcht0` hardware preloading. -- **`arm64` (NEON / SVE)**: Vectorized execution using ARM64 NEON registers (`v0-v31`) with `prfm pldl1keep` software lookahead prefetching offsets. -- **`apple_amx` (Apple Silicon)**: Low-level matrix coprocessor wrapper with custom `amxinit`/`amxstop` instructions (`.word` directives). - -At runtime, the benchmarking harness (`locality/fair_harness.c`) performs dynamic hardware capability telemetry (`CPUID` for x86-64, `getauxval(AT_HWCAP)` for ARM64 SVE/ASIMD on Linux) to dispatch execution to the optimal native assembly kernel. +## 🖥️ Platform Target Support & JIT Engine + +Project ORCHID features a **Heterogeneous Hardware Dispatch Plane** to scale execution guarantees across multiple architectures: +* **Static AOT Assembly Emitters (`orchid/assembler.py`)**: Generates target-specific optimized assembly source code: + - **`x86_64` (AVX-512)**: 512-bit vector registers with active `prefetcht0` preloading. + - **`arm64` (NEON / SVE)**: NEON registers (`v0-v31`) with `prfm pldl1keep` software lookahead prefetching offsets. + - **`apple_amx` (Apple Silicon)**: Low-level matrix coprocessor wrapper via `amxinit`/`amxstop` instructions. +* **Dynamic JIT Compiler Core (`jit/`)**: Executed natively by the Go daemon, specializing each kernel for the requested matrix size ($N$) and emitting it as memory-resident machine code at runtime. 
It checks host capabilities to select the optimal path: + - **`AVX-512` JIT Path**: Vectorized 16-way integer strides when native AVX-512 is supported. + - **`AVX2` JIT Path**: Vectorized 8-way VEX-encoded SIMD that broadcasts each element directly from memory (`vpbroadcastd` with a memory operand), so that no EVEX-encoded instruction is emitted on x86_64 CPUs without AVX-512. + - **`Scalar` AMD64 JIT Path**: Standard pointer execution loops. + - **`ARM64/Other` Fallback**: Native Go reference model to maintain execution stability. + +### 🔒 W^X Memory Security +The JIT compiler strictly enforces **Write-XOR-Execute (W^X)** memory constraints. Page memory is mapped read-write (`syscall.PROT_READ|syscall.PROT_WRITE`), code is generated, and then the page is transitioned to read-execute (`syscall.PROT_READ|syscall.PROT_EXEC`) via `syscall.Mprotect` before execution. --- diff --git a/cmd/orchid-daemon/matmul_wrapper.go b/cmd/orchid-daemon/matmul_wrapper.go index e413cff..1b37ac9 100644 --- a/cmd/orchid-daemon/matmul_wrapper.go +++ b/cmd/orchid-daemon/matmul_wrapper.go @@ -1,8 +1,8 @@ /** * @file matmul_wrapper.go - * @brief Go wrapper linking C/assembly matrix kernels and executing locality timing benchmarks. + * @brief Go wrapper linking JIT compilation and executing locality timing benchmarks. * - * Coordinate AVX-512/scalar dispatch execution, physical memory alignment allocations, + * Coordinates JIT execution, physical memory alignment allocations, * CPU cache flushes, statistical speedup analysis, and timing files creation. 
* * Originator: Teppei Oohira (@gatchimuchio) / 倧平鉄兡 @@ -17,10 +17,6 @@ package main #include #include -int has_avx512f(void); -void matmul_flat(const int32_t *a, const int32_t *b, int32_t *c); -void matmul_locality(const int32_t *a, const int32_t *b, int32_t *c); -void matmul_locality_fallback(const int32_t *a, const int32_t *b, int32_t *c); void flush_cache_c(uint8_t *buf, size_t size); uint64_t get_flush_sink(void); */ @@ -33,6 +29,8 @@ import ( "sort" "time" "unsafe" + + "ORCHID/jit" ) const ( @@ -72,22 +70,6 @@ func median(values []float64) float64 { return (values[n/2-1] + values[n/2]) / 2.0 } -/** - * @brief Invokes either the AVX-512 assembly kernel or the scalar fallback kernel. - * - * @param aPtr Pointer to input matrix A. - * @param bPtr Pointer to input matrix B. - * @param clPtr Pointer to output matrix C. - * @param useAVX512 Flag indicating if AVX-512 should be executed. - */ -func executeLocalityKernel(aPtr, bPtr, clPtr unsafe.Pointer, useAVX512 bool) { - if useAVX512 { - C.matmul_locality((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(clPtr)) - } else { - C.matmul_locality_fallback((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(clPtr)) - } -} - /** * @brief Executes pairs of flat vs locality benchmarks to measure cache speedups. * @@ -97,10 +79,11 @@ func executeLocalityKernel(aPtr, bPtr, clPtr unsafe.Pointer, useAVX512 bool) { * @param cfPtr Pointer to flat output buffer. * @param clPtr Pointer to locality output buffer. * @param flushPtr Pointer to cache flushing buffer space. - * @param useAVX512 Flag for AVX-512 hardware support. + * @param kFlat Pre-compiled JIT flat kernel. + * @param kLoc Pre-compiled JIT locality kernel. * @return Speedup values slice and printed log lines slice. 
*/ -func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Pointer, useAVX512 bool) ([]float64, []string) { +func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Pointer, kFlat, kLoc jit.Kernel) ([]float64, []string) { var speedups []float64 var timingLines []string @@ -113,26 +96,26 @@ func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Po C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes)) C.memset(cfPtr, 0, C.size_t(Bytes)) t0 := time.Now() - C.matmul_flat((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(cfPtr)) + kFlat.Execute(aPtr, bPtr, cfPtr) flatSec = time.Since(t0).Seconds() C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes)) C.memset(clPtr, 0, C.size_t(Bytes)) t0 = time.Now() - executeLocalityKernel(aPtr, bPtr, clPtr, useAVX512) + kLoc.Execute(aPtr, bPtr, clPtr) localSec = time.Since(t0).Seconds() } else { order = "locality-first" C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes)) C.memset(clPtr, 0, C.size_t(Bytes)) t0 := time.Now() - executeLocalityKernel(aPtr, bPtr, clPtr, useAVX512) + kLoc.Execute(aPtr, bPtr, clPtr) localSec = time.Since(t0).Seconds() C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes)) C.memset(cfPtr, 0, C.size_t(Bytes)) t0 = time.Now() - C.matmul_flat((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(cfPtr)) + kFlat.Execute(aPtr, bPtr, cfPtr) flatSec = time.Since(t0).Seconds() } @@ -258,20 +241,30 @@ func RunLocalityBenchmark(repeats int, outDir string) (*LocalityResult, error) { bSlice[i] = int32((uint32(i)*13 + 5) % 7) - 3 } - // Detect host AVX-512 capability at runtime - useAVX512 := C.has_avx512f() != 0 - telemetryMsg := "HARDWARE TELEMETRY: AVX-512 not supported. Dispatching to optimized scalar fallback kernel." - if useAVX512 { - telemetryMsg = "HARDWARE TELEMETRY: Native AVX-512 support detected. Dispatching to assembly vector kernel." 
+ // Dynamic compile dynamic JIT kernels and measure compilation latency + tJitStart := time.Now() + kFlat, err := jit.CompileFlat(N) + if err != nil { + return nil, fmt.Errorf("failed to compile JIT flat kernel: %w", err) + } + defer kFlat.Free() + + kLoc, err := jit.CompileLocality(N) + if err != nil { + return nil, fmt.Errorf("failed to compile JIT locality kernel: %w", err) } + defer kLoc.Free() + jitElapsed := time.Since(tJitStart) + + telemetryMsg := fmt.Sprintf("HARDWARE TELEMETRY: JIT compiled kernels in %s. Executing bare-metal blocks via W^X function pointers.", jitElapsed) fmt.Println(telemetryMsg) // Initial warm run & arithmetic validation check C.memset(cfPtr, 0, C.size_t(Bytes)) C.memset(clPtr, 0, C.size_t(Bytes)) - C.matmul_flat((*C.int32_t)(unsafe.Pointer(aPtr)), (*C.int32_t)(unsafe.Pointer(bPtr)), (*C.int32_t)(unsafe.Pointer(cfPtr))) - executeLocalityKernel(aPtr, bPtr, clPtr, useAVX512) + kFlat.Execute(aPtr, bPtr, cfPtr) + kLoc.Execute(aPtr, bPtr, clPtr) // Verify equal outputs for i := 0; i < Cells; i++ { @@ -290,7 +283,7 @@ func RunLocalityBenchmark(repeats int, outDir string) (*LocalityResult, error) { fmt.Println(verifyMsg) // Collect timing pairs - speedups, timingLines := runBenchmarkPairs(repeats, aPtr, bPtr, cfPtr, clPtr, flushPtr, useAVX512) + speedups, timingLines := runBenchmarkPairs(repeats, aPtr, bPtr, cfPtr, clPtr, flushPtr, kFlat, kLoc) flushSinkMsg := fmt.Sprintf("FLUSH sink=%d", C.get_flush_sink()) fmt.Println(flushSinkMsg) diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index d2b3f6d..b255a1d 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -46,6 +46,13 @@ ORCHID/ β”‚ β”œβ”€β”€ build/ # Directory containing compiled object targets β”‚ β”œβ”€β”€ fair_harness.c # C11 Timing runner utilizing cache flushes β”‚ └── matmul.plan # Program parameter declaration configurations +β”œβ”€β”€ jit/ # Just-In-Time (JIT) Dynamic Compilation Subsystem +β”‚ β”œβ”€β”€ jit.go # Memory management, W^X page protection & Go 
fallbacks +│ ├── jit_amd64.go # AMD64 instruction emitters (AVX-512, AVX2, and scalar) +│ ├── jit_amd64.s # System V ABI pointer jump stubs & CPUID detection +│ ├── jit_arm64.go # ARM64 architecture portable fallback +│ ├── jit_other.go # Generic platform-independent fallback +│ └── jit_test.go # JIT math verification & compilation latency benchmark suite ├── orchid/ # Packaged, Publishable Python SDK Core │ ├── __init__.py # SDK Package version registration & exports │ ├── aggregator.py # Locality results timing aggregator @@ -127,6 +134,17 @@ The execution layer implements CADENCE routing using native Go concurrency primi --- +### 3.4. JIT Compiler Subsystem (Dynamic Memory Compilation) +To support real-time execution mesh demands without writing temporary files to disk or invoking external toolchains (GCC), ORCHID integrates a dynamic, memory-resident JIT compiler: +* **W^X Memory Security Model:** Strictly implements Write-XOR-Execute security page allocations. It allocates writable pages via `syscall.Mmap`, compiles instructions into the segment, and then transitions page protection to read-executable via `syscall.Mprotect` before execution. +* **Three-Tier x86_64 Hardware Pathing:** + 1. *AVX-512:* Vectorized 16-way integer strides when CPU capability checks succeed. + 2. *AVX2:* Vectorized 8-way VEX-encoded SIMD that broadcasts each element directly from memory (`vpbroadcastd` with a memory operand), because the general-purpose-register source form used by the AVX-512 path is EVEX-only. + 3. *Scalar:* Core x86_64 pointer instruction loops. +* **ABI Bridging:** Utilizes a custom assembly stub `callJIT` in `jit_amd64.s` to load the Go stack arguments into the System V AMD64 argument registers (`RDI`, `RSI`, `RDX`), achieving execution speeds matching pre-compiled C binaries with only microsecond-level emission overhead. + +--- + ## 🐳 4. 
Orchestration & Static Quality Control ORCHID integrates modern tooling to guarantee code health: diff --git a/evidence/reproduced/speedups.json b/evidence/reproduced/speedups.json index 454e2db..c847662 100644 --- a/evidence/reproduced/speedups.json +++ b/evidence/reproduced/speedups.json @@ -1,6 +1,6 @@ { - "min": "2.871x", - "median": "3.171x", - "max": "3.396x", - "mean": "3.176x" -} \ No newline at end of file + "max": "12.457x", + "mean": "11.128x", + "median": "11.530x", + "min": "8.964x" +} diff --git a/jit/jit.go b/jit/jit.go new file mode 100644 index 0000000..cf76c47 --- /dev/null +++ b/jit/jit.go @@ -0,0 +1,127 @@ +/** + * @file jit.go + * @brief Memory management and W^X memory page allocation wrappers for ORCHID JIT compiler. + * + * License: GNU GPLv3 + */ + +package jit + +import ( + "fmt" + "syscall" + "unsafe" +) + +/** + * @interface Kernel + * @brief Represents an executable JIT-compiled matrix multiplication block. + */ +type Kernel interface { + // Execute dispatches the compiled block using pointers to input/output buffers. + Execute(a, b, c unsafe.Pointer) + // Free releases the allocated memory segment. + Free() error +} + +/** + * @brief Allocates memory using syscall.Mmap with read-write protections. + * + * @param size The size of the memory segment to allocate in bytes. + * @return The allocated byte slice or an error. + */ +func mmapJIT(size int) ([]byte, error) { + data, err := syscall.Mmap( + -1, + 0, + size, + syscall.PROT_READ|syscall.PROT_WRITE, + syscall.MAP_ANON|syscall.MAP_PRIVATE, + ) + if err != nil { + return nil, fmt.Errorf("syscall mmap failed: %w", err) + } + return data, nil +} + +/** + * @brief Transitions the memory protections of a segment to read-execute. + * + * @param data The byte slice representing the memory segment to protect. + * @return nil on success, or error if syscall failed. 
/**
 * @brief Transitions the memory protections of a segment to read-execute.
 *
 * Write permission is dropped in the same call, so the segment is never
 * writable and executable at the same time.
 *
 * @param data The byte slice representing the memory segment to protect.
 * @return nil on success, or error if syscall failed.
 */
func mprotectRX(data []byte) error {
	if err := syscall.Mprotect(data, syscall.PROT_READ|syscall.PROT_EXEC); err != nil {
		return fmt.Errorf("syscall mprotect RX failed: %w", err)
	}
	return nil
}

/**
 * @brief Frees memory allocated using syscall.Mmap.
 *
 * @param data The byte slice to release.
 * @return nil on success, or error if munmap failed.
 */
func munmapJIT(data []byte) error {
	if err := syscall.Munmap(data); err != nil {
		return fmt.Errorf("syscall munmap failed: %w", err)
	}
	return nil
}

/**
 * @struct GoFallbackKernel
 * @brief Implements Kernel by executing a standard math calculation loop in Go.
 */
type GoFallbackKernel struct {
	N        int  ///< Size of the matrix (N x N)
	Locality bool ///< Flag indicating if locality-aware access loop order should be used
}

/**
 * @brief Releases the resources of the fallback kernel (noop, it owns none).
 *
 * @return nil always.
 */
func (k *GoFallbackKernel) Free() error {
	return nil
}

/**
 * @brief Executes matrix multiplication using Go fallback loops.
 *
 * All three buffers are interpreted as row-major N x N int32 matrices.
 * With Locality set the products are accumulated into C (C += A*B), so the
 * caller must zero C beforehand; otherwise every cell of C is overwritten.
 * A non-positive N is a no-op and never touches the pointers.
 *
 * @param a Pointer to matrix A.
 * @param b Pointer to matrix B.
 * @param c Pointer to output matrix C.
 */
func (k *GoFallbackKernel) Execute(a, b, c unsafe.Pointer) {
	n := k.N
	if n <= 0 {
		// Nothing to compute; returning early also keeps nil buffers legal.
		return
	}
	cells := n * n

	// unsafe.Slice sizes the views from the real element count instead of a
	// fixed-size array type, so large matrices are not capped at 2^28 cells.
	aSlice := unsafe.Slice((*int32)(a), cells)
	bSlice := unsafe.Slice((*int32)(b), cells)
	cSlice := unsafe.Slice((*int32)(c), cells)

	if k.Locality {
		// i-k-j order: the innermost loop walks one row of B and one row of C.
		for i := 0; i < n; i++ {
			aRow := aSlice[i*n : (i+1)*n]
			cRow := cSlice[i*n : (i+1)*n]
			for kv, r := range aRow {
				bRow := bSlice[kv*n : (kv+1)*n]
				for j, bv := range bRow {
					cRow[j] += r * bv
				}
			}
		}
		return
	}

	// i-j-k order: the innermost loop strides down a column of B.
	for i := 0; i < n; i++ {
		aRow := aSlice[i*n : (i+1)*n]
		for j := 0; j < n; j++ {
			var sum int32
			for kv, av := range aRow {
				sum += av * bSlice[kv*n+j]
			}
			cSlice[i*n+j] = sum
		}
	}
}
+ */ +type amd64Kernel struct { + code []byte ///< Slice holding the JIT-allocated and marked executable byte segment +} + +/** + * @brief Executes the JIT-compiled matrix multiplication kernel. + * + * @param a Pointer to matrix A. + * @param b Pointer to matrix B. + * @param c Pointer to output matrix C. + */ +func (k *amd64Kernel) Execute(a, b, c unsafe.Pointer) { + callJIT(unsafe.Pointer(&k.code[0]), a, b, c) +} + +/** + * @brief Deallocates the JIT-compiled executable memory block. + * + * @return nil on success, or error if munmap failed. + */ +func (k *amd64Kernel) Free() error { + if k.code == nil { + return nil + } + err := munmapJIT(k.code) + k.code = nil + return err +} + +/** + * @brief Compiles flat matrix multiplication for target size n. + * + * @param n Size of the matrix (N x N). + * @return Compiled Kernel object or error. + */ +func CompileFlat(n int) (Kernel, error) { + // Template for matmul_flat scalar + template := []byte{ + 0x45, 0x31, 0xc0, // 0: xor %r8d, %r8d + 0x41, 0x81, 0xf8, 0x00, 0x02, 0x00, 0x00, // 3: cmp $512, %r8d + 0x7d, 0x5e, // 10: jge .Ldone + 0x45, 0x31, 0xc9, // 12: xor %r9d, %r9d + 0x41, 0x81, 0xf9, 0x00, 0x02, 0x00, 0x00, // 15: cmp $512, %r9d + 0x7d, 0x4d, // 22: jge .Lnext_i + 0x45, 0x31, 0xd2, // 24: xor %r10d, %r10d + 0x31, 0xc9, // 27: xor %ecx, %ecx + 0x41, 0x81, 0xfa, 0x00, 0x02, 0x00, 0x00, // 29: cmp $512, %r10d + 0x7d, 0x2b, // 36: jge .Lstore + 0x44, 0x89, 0xc0, // 38: mov %r8d, %eax + 0x69, 0xc0, 0x00, 0x02, 0x00, 0x00, // 41: imul $512, %eax, %eax + 0x44, 0x01, 0xd0, // 47: add %r10d, %eax + 0x44, 0x8b, 0x1c, 0x87, // 50: mov (%rdi,%rax,4), %r11d + 0x44, 0x89, 0xd0, // 54: mov %r10d, %eax + 0x69, 0xc0, 0x00, 0x02, 0x00, 0x00, // 57: imul $512, %eax, %eax + 0x44, 0x01, 0xc8, // 63: add %r9d, %eax + 0x8b, 0x04, 0x86, // 66: mov (%rsi,%rax,4), %eax + 0x44, 0x0f, 0xaf, 0xd8, // 69: imul %eax, %r11d + 0x44, 0x01, 0xd9, // 73: add %r11d, %ecx + 0x41, 0xff, 0xc2, // 76: inc %r10d + 0xeb, 0xcc, // 79: jmp .L3 + 0x44, 
0x89, 0xc0, // 81: mov %r8d, %eax + 0x69, 0xc0, 0x00, 0x02, 0x00, 0x00, // 84: imul $512, %eax, %eax + 0x44, 0x01, 0xc8, // 90: add %r9d, %eax + 0x89, 0x0c, 0x82, // 93: mov %ecx, (%rdx,%rax,4) + 0x41, 0xff, 0xc1, // 96: inc %r9d + 0xeb, 0xaa, // 99: jmp .L2 + 0x41, 0xff, 0xc0, // 101: inc %r8d + 0xeb, 0x99, // 104: jmp .L1 + 0xc3, // 106: ret + } + + code, err := mmapJIT(len(template)) + if err != nil { + return nil, err + } + copy(code, template) + + val := uint32(n) + writeUint32(code, 6, val) + writeUint32(code, 18, val) + writeUint32(code, 32, val) + writeUint32(code, 43, val) + writeUint32(code, 59, val) + writeUint32(code, 86, val) + + err = mprotectRX(code) + if err != nil { + _ = munmapJIT(code) + return nil, err + } + + return &amd64Kernel{code: code}, nil +} + +/** + * @brief Compiles locality-optimized matrix multiplication for target size n. + * + * Decides whether to compile to AVX-512, AVX2, or scalar assembly pathways + * depending on runtime CPU detection. + * + * @param n Size of the matrix (N x N). + * @return Compiled Kernel object or error. 
+ */ +func CompileLocality(n int) (Kernel, error) { + if hasAVX512F() { + // Emit vectorized AVX-512 kernel (16-way strides) + template := []byte{ + 0x45, 0x31, 0xc0, // 0: xor %r8d, %r8d + 0x41, 0x81, 0xf8, 0x00, 0x02, 0x00, 0x00, // 3: cmp $512, %r8d + 0x0f, 0x8d, 0x84, 0x00, 0x00, 0x00, // 10: jge 94 + 0x45, 0x31, 0xc9, // 16: xor %r9d, %r9d + 0x41, 0x81, 0xf9, 0x00, 0x02, 0x00, 0x00, // 19: cmp $512, %r9d + 0x7d, 0x70, // 26: jge 8c + 0x44, 0x89, 0xc0, // 28: mov %r8d, %eax + 0x69, 0xc0, 0x00, 0x02, 0x00, 0x00, // 31: imul $512, %eax, %eax + 0x44, 0x01, 0xc8, // 37: add %r9d, %eax + 0x44, 0x8b, 0x1c, 0x87, // 40: mov (%rdi,%rax,4), %r11d + 0x62, 0xd2, 0x7d, 0x48, 0x7c, 0xc3, // 44: vpbroadcastd %r11d, %zmm0 + 0x45, 0x31, 0xd2, // 50: xor %r10d, %r10d + 0x41, 0x81, 0xfa, 0x00, 0x02, 0x00, 0x00, // 53: cmp $512, %r10d + 0x7d, 0x49, // 60: jge 87 + 0x44, 0x89, 0xc8, // 62: mov %r9d, %eax + 0x69, 0xc0, 0x00, 0x02, 0x00, 0x00, // 65: imul $512, %eax, %eax + 0x44, 0x01, 0xd0, // 71: add %r10d, %eax + 0x0f, 0x18, 0x4c, 0x86, 0x40, // 74: prefetcht0 0x40(%rsi,%rax,4) + 0x62, 0xf1, 0x7e, 0x48, 0x6f, 0x0c, 0x86, // 79: vmovdqu32 (%rsi,%rax,4), %zmm1 + 0x62, 0xf2, 0x75, 0x48, 0x40, 0xc8, // 86: vpmulld %zmm0, %zmm1, %zmm1 + 0x44, 0x89, 0xc0, // 92: mov %r8d, %eax + 0x69, 0xc0, 0x00, 0x02, 0x00, 0x00, // 95: imul $512, %eax, %eax + 0x44, 0x01, 0xd0, // 101: add %r10d, %eax + 0x0f, 0x18, 0x4c, 0x82, 0x40, // 104: prefetcht0 0x40(%rdx,%rax,4) + 0x62, 0xf1, 0x7e, 0x48, 0x6f, 0x14, 0x82, // 109: vmovdqu32 (%rdx,%rax,4), %zmm2 + 0x62, 0xf1, 0x6d, 0x48, 0xfe, 0xd1, // 116: vpaddd %zmm1, %zmm2, %zmm2 + 0x62, 0xf1, 0x7e, 0x48, 0x7f, 0x14, 0x82, // 122: vmovdqu32 %zmm2, (%rdx,%rax,4) + 0x41, 0x83, 0xc2, 0x10, // 129: add $16, %r10d + 0xeb, 0xae, // 133: jmp 35 + 0x41, 0xff, 0xc1, // 135: inc %r9d + 0xeb, 0x87, // 138: jmp 13 + 0x41, 0xff, 0xc0, // 140: inc %r8d + 0xe9, 0x6f, 0xff, 0xff, 0xff, // 143: jmp 3 + 0xc3, // 148: ret + } + + code, err := mmapJIT(len(template)) + if err != 
nil { + return nil, err + } + copy(code, template) + + val := uint32(n) + writeUint32(code, 6, val) + writeUint32(code, 22, val) + writeUint32(code, 33, val) + writeUint32(code, 56, val) + writeUint32(code, 67, val) + writeUint32(code, 97, val) + + err = mprotectRX(code) + if err != nil { + _ = munmapJIT(code) + return nil, err + } + + return &amd64Kernel{code: code}, nil + } else if hasAVX2() { + // Emit vectorized AVX2 kernel (8-way strides) + template := []byte{ + 0x45, 0x31, 0xc0, // 0: xor %r8d, %r8d + 0x41, 0x81, 0xf8, 0x00, 0x02, 0x00, 0x00, // 3: cmp $512, %r8d + 0x7d, 0x74, // 10: jge 80 + 0x45, 0x31, 0xc9, // 12: xor %r9d, %r9d + 0x41, 0x81, 0xf9, 0x00, 0x02, 0x00, 0x00, // 15: cmp $512, %r9d + 0x7d, 0x63, // 22: jge 7b + 0x44, 0x89, 0xc0, // 24: mov %r8d, %eax + 0x69, 0xc0, 0x00, 0x02, 0x00, 0x00, // 27: imul $512, %eax, %eax + 0x44, 0x01, 0xc8, // 33: add %r9d, %eax + 0xc4, 0xe2, 0x7d, 0x58, 0x04, 0x87, // 36: vpbroadcastd (%rdi,%rax,4),%ymm0 + 0x45, 0x31, 0xd2, // 42: xor %r10d, %r10d + 0x41, 0x81, 0xfa, 0x00, 0x02, 0x00, 0x00, // 45: cmp $512, %r10d + 0x7d, 0x40, // 52: jge 76 + 0x44, 0x89, 0xc8, // 54: mov %r9d, %eax + 0x69, 0xc0, 0x00, 0x02, 0x00, 0x00, // 57: imul $512, %eax, %eax + 0x44, 0x01, 0xd0, // 63: add %r10d, %eax + 0x0f, 0x18, 0x4c, 0x86, 0x40, // 66: prefetcht0 0x40(%rsi,%rax,4) + 0xc5, 0xfe, 0x6f, 0x0c, 0x86, // 71: vmovdqu (%rsi,%rax,4), %ymm1 + 0xc4, 0xe2, 0x75, 0x40, 0xc8, // 76: vpmulld %ymm0, %ymm1, %ymm1 + 0x44, 0x89, 0xc0, // 81: mov %r8d, %eax + 0x69, 0xc0, 0x00, 0x02, 0x00, 0x00, // 84: imul $512, %eax, %eax + 0x44, 0x01, 0xd0, // 90: add %r10d, %eax + 0x0f, 0x18, 0x4c, 0x82, 0x40, // 93: prefetcht0 0x40(%rdx,%rax,4) + 0xc5, 0xfe, 0x6f, 0x14, 0x82, // 98: vmovdqu (%rdx,%rax,4), %ymm2 + 0xc5, 0xed, 0xfe, 0xd1, // 103: vpaddd %ymm1, %ymm2, %ymm2 + 0xc5, 0xfe, 0x7f, 0x14, 0x82, // 107: vmovdqu %ymm2, (%rdx,%rax,4) + 0x41, 0x83, 0xc2, 0x08, // 112: add $8, %r10d + 0xeb, 0xb7, // 116: jmp 2d + 0x41, 0xff, 0xc1, // 118: inc %r9d + 
0xeb, 0x94, // 121: jmp f + 0x41, 0xff, 0xc0, // 123: inc %r8d + 0xeb, 0x83, // 126: jmp 3 + 0xc3, // 128: ret + } + + code, err := mmapJIT(len(template)) + if err != nil { + return nil, err + } + copy(code, template) + + val := uint32(n) + writeUint32(code, 6, val) + writeUint32(code, 18, val) + writeUint32(code, 29, val) + writeUint32(code, 48, val) + writeUint32(code, 59, val) + writeUint32(code, 86, val) + + err = mprotectRX(code) + if err != nil { + _ = munmapJIT(code) + return nil, err + } + + return &amd64Kernel{code: code}, nil + } else { + // Emit optimized scalar locality kernel + template := []byte{ + 0x45, 0x31, 0xc0, + 0x41, 0x81, 0xf8, 0x00, 0x02, 0x00, 0x00, + 0x7d, 0x7e, + 0x45, 0x31, 0xc9, + 0x41, 0x81, 0xf9, 0x00, 0x02, 0x00, 0x00, + 0x7d, 0x6a, + 0x44, 0x89, 0xc0, + 0x69, 0xc0, 0x00, 0x02, 0x00, 0x00, + 0x44, 0x01, 0xc8, + 0x44, 0x8b, 0x1c, 0x87, + 0x45, 0x31, 0xd2, + 0x41, 0x81, 0xfa, 0x00, 0x02, 0x00, 0x00, + 0x7d, 0x49, + 0x44, 0x89, 0xc8, + 0x69, 0xc0, 0x00, 0x02, 0x00, 0x00, + 0x44, 0x01, 0xd0, + 0x0f, 0x18, 0x4c, 0x86, 0x40, + 0x44, 0x89, 0xc1, + 0x69, 0xc9, 0x00, 0x02, 0x00, 0x00, + 0x44, 0x01, 0xd1, + 0x0f, 0x18, 0x4c, 0x8a, 0x40, + 0x44, 0x89, 0xc8, + 0x69, 0xc0, 0x00, 0x02, 0x00, 0x00, + 0x44, 0x01, 0xd0, + 0x8b, 0x04, 0x86, + 0x41, 0x0f, 0xaf, 0xc3, + 0x44, 0x89, 0xc1, + 0x69, 0xc9, 0x00, 0x02, 0x00, 0x00, + 0x44, 0x01, 0xd1, + 0x01, 0x04, 0x8a, + 0x41, 0xff, 0xc2, + 0xeb, 0xae, + 0x41, 0xff, 0xc1, + 0xeb, 0x8d, + 0x41, 0xff, 0xc0, + 0xe9, 0x79, 0xff, 0xff, 0xff, + 0xc3, + } + + code, err := mmapJIT(len(template)) + if err != nil { + return nil, err + } + copy(code, template) + + val := uint32(n) + writeUint32(code, 6, val) + writeUint32(code, 18, val) + writeUint32(code, 29, val) + writeUint32(code, 46, val) + writeUint32(code, 57, val) + writeUint32(code, 74, val) + writeUint32(code, 91, val) + writeUint32(code, 110, val) + + err = mprotectRX(code) + if err != nil { + _ = munmapJIT(code) + return nil, err + } + + return 
&amd64Kernel{code: code}, nil + } +} + +/** + * @brief Helper to write a 32-bit unsigned integer in little-endian order to a byte slice. + * + * @param code The destination byte slice. + * @param index Starting index offset inside the byte slice. + * @param val The 32-bit unsigned integer value to write. + */ +func writeUint32(code []byte, index int, val uint32) { + code[index] = byte(val) + code[index+1] = byte(val >> 8) + code[index+2] = byte(val >> 16) + code[index+3] = byte(val >> 24) +} diff --git a/jit/jit_amd64.s b/jit/jit_amd64.s new file mode 100644 index 0000000..fabfca6 --- /dev/null +++ b/jit/jit_amd64.s @@ -0,0 +1,22 @@ +#include "textflag.h" + +// func callJIT(codePtr, a, b, c unsafe.Pointer) +// System V AMD64 ABI expects arguments in: RDI, RSI, RDX +TEXT Β·callJIT(SB), NOSPLIT, $0-32 + MOVQ codePtr+0(FP), AX + MOVQ a+8(FP), DI + MOVQ b+16(FP), SI + MOVQ c+24(FP), DX + CALL AX + RET + +// func cpuid(leaf, subleaf uint32) (eax, ebx, ecx, edx uint32) +TEXT Β·cpuid(SB), NOSPLIT, $0-24 + MOVL leaf+0(FP), AX + MOVL subleaf+4(FP), CX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET diff --git a/jit/jit_arm64.go b/jit/jit_arm64.go new file mode 100644 index 0000000..ab7395e --- /dev/null +++ b/jit/jit_arm64.go @@ -0,0 +1,33 @@ +//go:build arm64 +/** + * @file jit_arm64.go + * @brief ARM64 portable fallback routines for ORCHID matrix kernels. + * + * License: GNU GPLv3 + */ + +package jit + +/** + * @brief Compiles flat matrix multiplication for target size n on ARM64. + * + * Falls back to Go reference model to maintain correctness. + * + * @param n Size of the matrix (N x N). + * @return Compiled Kernel fallback object or error. + */ +func CompileFlat(n int) (Kernel, error) { + return &GoFallbackKernel{N: n, Locality: false}, nil +} + +/** + * @brief Compiles locality-optimized matrix multiplication for target size n on ARM64. + * + * Falls back to Go reference model to maintain correctness. 
+ * + * @param n Size of the matrix (N x N). + * @return Compiled Kernel fallback object or error. + */ +func CompileLocality(n int) (Kernel, error) { + return &GoFallbackKernel{N: n, Locality: true}, nil +} diff --git a/jit/jit_other.go b/jit/jit_other.go new file mode 100644 index 0000000..6d80553 --- /dev/null +++ b/jit/jit_other.go @@ -0,0 +1,33 @@ +//go:build !amd64 && !arm64 +/** + * @file jit_other.go + * @brief Generic platform-independent fallback routines for ORCHID matrix kernels. + * + * License: GNU GPLv3 + */ + +package jit + +/** + * @brief Compiles flat matrix multiplication for target size n on other platforms. + * + * Falls back to Go reference model to maintain correctness. + * + * @param n Size of the matrix (N x N). + * @return Compiled Kernel fallback object or error. + */ +func CompileFlat(n int) (Kernel, error) { + return &GoFallbackKernel{N: n, Locality: false}, nil +} + +/** + * @brief Compiles locality-optimized matrix multiplication for target size n on other platforms. + * + * Falls back to Go reference model to maintain correctness. + * + * @param n Size of the matrix (N x N). + * @return Compiled Kernel fallback object or error. + */ +func CompileLocality(n int) (Kernel, error) { + return &GoFallbackKernel{N: n, Locality: true}, nil +} diff --git a/jit/jit_test.go b/jit/jit_test.go new file mode 100644 index 0000000..6c30f7d --- /dev/null +++ b/jit/jit_test.go @@ -0,0 +1,102 @@ +/** + * @file jit_test.go + * @brief Correctness and latency benchmarks for ORCHID JIT compiler. + * + * License: GNU GPLv3 + */ + +package jit + +import ( + "math/rand" + "testing" + "time" + "unsafe" +) + +/** + * @brief Generates random test matrices A and B, and allocates buffer C. + * + * @param n Size of the matrix (N x N). + * @return Slice holding matrix A, slice holding matrix B, and output buffer slice C. 
+ */ +func generateMatrices(n int) ([]int32, []int32, []int32) { + a := make([]int32, n*n) + b := make([]int32, n*n) + c := make([]int32, n*n) + r := rand.New(rand.NewSource(42)) + for i := 0; i < n*n; i++ { + a[i] = int32(r.Intn(100) - 50) + b[i] = int32(r.Intn(100) - 50) + } + return a, b, c +} + +/** + * @brief Validates mathematical parity of Flat and Locality JIT execution targets. + * + * Compiles and executes JIT kernels, verifying results against Go reference matrix operations. + * + * @param t Go testing state handle. + */ +func TestJITCorrectness(t *testing.T) { + sizes := []int{64, 128} + for _, n := range sizes { + a, b, cRef := generateMatrices(n) + _, _, cJitFlat := generateMatrices(n) + _, _, cJitLoc := generateMatrices(n) + + // Reference computation + ref := GoFallbackKernel{N: n, Locality: false} + ref.Execute(unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&cRef[0])) + + // JIT Flat + kFlat, err := CompileFlat(n) + if err != nil { + t.Fatalf("CompileFlat failed for N=%d: %v", n, err) + } + kFlat.Execute(unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&cJitFlat[0])) + _ = kFlat.Free() + + // JIT Locality + kLoc, err := CompileLocality(n) + if err != nil { + t.Fatalf("CompileLocality failed for N=%d: %v", n, err) + } + kLoc.Execute(unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&cJitLoc[0])) + _ = kLoc.Free() + + // Compare outputs + for i := 0; i < n*n; i++ { + if cJitFlat[i] != cRef[i] { + t.Fatalf("N=%d: Flat JIT mismatch at index %d: expected %d, got %d", n, i, cRef[i], cJitFlat[i]) + } + if cJitLoc[i] != cRef[i] { + t.Fatalf("N=%d: Locality JIT mismatch at index %d: expected %d, got %d", n, i, cRef[i], cJitLoc[i]) + } + } + t.Logf("N=%d: JIT math successfully validated against Go reference model.", n) + } +} + +/** + * @brief Benchmarks compilation overhead of the dynamic JIT compiler. 
+ * + * Ensures page allocations, instruction writing, and page transitions + * happen within acceptable microsecond limits. + * + * @param t Go testing state handle. + */ +func TestJITCompilationTime(t *testing.T) { + start := time.Now() + k, err := CompileLocality(256) + if err != nil { + t.Fatalf("CompileLocality compilation failed: %v", err) + } + elapsed := time.Since(start) + _ = k.Free() + t.Logf("JIT emission overhead for 256x256 target: %s", elapsed) + if elapsed > 50*time.Millisecond { + t.Errorf("JIT compiler overhead exceeded performance threshold: %s", elapsed) + } +}