From c1e87b3d19949e49b8756a965ffa646b1278e65c Mon Sep 17 00:00:00 2001 From: westkevin12 Date: Thu, 4 Jun 2026 05:34:25 -0500 Subject: [PATCH] feat: implement ORCHID daemon with memory bank simulation, matrix kernel assembly, and AVX-512 fallback support --- Dockerfile | 43 +- Makefile | 4 +- README.md | 6 +- cmd/orchid-daemon/main.go | 879 ++++++++++++++++++++++++++++ cmd/orchid-daemon/matmul_fallback.c | 58 ++ cmd/orchid-daemon/matmul_wrapper.go | 337 +++++++++++ evidence/reproduced/speedups.json | 10 +- 7 files changed, 1307 insertions(+), 30 deletions(-) create mode 100644 cmd/orchid-daemon/main.go create mode 100644 cmd/orchid-daemon/matmul_fallback.c create mode 100644 cmd/orchid-daemon/matmul_wrapper.go diff --git a/Dockerfile b/Dockerfile index 404ff66..87c46dd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -51,29 +51,30 @@ COPY . . CMD ["bash", "scripts/setup.sh"] # ---------------------------------------------------------------------- -# Stage 3: Hardened Release runtime copy (Compiled C Binary Modules) +# Stage 3: Builder stage to generate assembly and compile Go daemon # ---------------------------------------------------------------------- -FROM base AS release-hardened +FROM base AS release-builder -# Initialize sandboxed virtual environment -RUN uv venv --python 3.10 -ENV PATH="/app/.venv/bin:${PATH}" - -# Copy the entire Project ORCHID repository +WORKDIR /app COPY . . -# Install Nuitka and compile the Python control plane -RUN uv pip install nuitka && \ - python3 -m nuitka --module orchid/assembler.py --no-pyi-file --output-dir=build_nuitka && \ - python3 -m nuitka --module orchid/simulator.py --no-pyi-file --output-dir=build_nuitka && \ - python3 -m nuitka --module orchid/aggregator.py --no-pyi-file --output-dir=build_nuitka && \ - # Remove raw Python files to protect IP - rm orchid/assembler.py orchid/simulator.py orchid/aggregator.py && \ - # Move compiled shared object binary modules into package namespace - mv build_nuitka/*.so orchid/ && \ - # Purge compilation cache and packages to shrink image - rm -rf build_nuitka && \ - uv pip uninstall nuitka -y +# Generate assembly kernels from planning specs +RUN python3 -c \ + "import sys; from orchid.assembler import main; sys.exit(main())" \ + locality/matmul.plan --out-dir cmd/orchid-daemon -# Default container target (executes full diagnostics setup) -CMD ["bash", "scripts/setup.sh"] +# Compile Go daemon binary +RUN go build -o /app/orchid-daemon ./cmd/orchid-daemon + +# ---------------------------------------------------------------------- +# Stage 4: Hardened Release runtime copy (Zero-Dependency distroless) +# ---------------------------------------------------------------------- +FROM gcr.io/distroless/base-debian12:nonroot AS release-hardened + +WORKDIR /app + +# Copy the compiled Go daemon executable +COPY --from=release-builder /app/orchid-daemon /app/orchid-daemon + +# Default container target (executes full sweeps diagnostics) +CMD ["/app/orchid-daemon", "--mode", "all"] diff --git a/Makefile b/Makefile index dc04a05..19cf6f3 100644 --- a/Makefile +++ b/Makefile @@ -38,9 +38,11 @@ test: # Compile Go daemon core binary executable build: + @echo "[BUILD] Generating assembly modules..." + @python3 -c "import sys; from orchid.assembler import main; sys.exit(main())" locality/matmul.plan --out-dir cmd/orchid-daemon @echo "[BUILD] Compiling Go scheduler executable daemon..." @mkdir -p build - @go build -o build/orchid-daemon ./scheduler/... + @go build -o build/orchid-daemon ./cmd/orchid-daemon @echo "✓ Successfully compiled Go binary at: build/orchid-daemon" # Build Python SDK distributable packages diff --git a/README.md b/README.md index caac907..8c2c612 100644 --- a/README.md +++ b/README.md @@ -128,9 +128,9 @@ Project ORCHID publishes two distinct, optimized container flavors to the GitHub ### 1. Hardened Production Image (`ghcr.io/digitalserverhost/orchid:latest`) - **Target Stage:** `release-hardened` -- **Compiled Control Plane:** Compiles the `orchid` Python SDK plane into optimized C/C++ extension modules (`.so`) using **Nuitka**. -- **Source Protection:** Purges raw `.py` scripts inside the package namespace to prevent code extraction. -- **High Performance:** Execution loops for micro-kernels and role-scheduling simulators execute at native C speeds. +- **Zero-Dependency Go-Native Architecture:** Package holds ONLY the compiled native Go daemon (`orchid-daemon`) on top of a minimal, hardened `distroless` Debian environment (`base-debian12:nonroot`). +- **No Python Dependency:** Completely eliminates the Python interpreter runtime, virtual environment setup, standard library headers, and Nuitka modules to minimize runtime footprints. +- **Maximum Security & Performance:** The image runs under a non-privileged user space and loads execution kernels at native compiled CPU speeds with minimized startup latency. ### 2. Developer Sandbox Image (`ghcr.io/digitalserverhost/orchid:dev`) diff --git a/cmd/orchid-daemon/main.go b/cmd/orchid-daemon/main.go new file mode 100644 index 0000000..099dfa2 --- /dev/null +++ b/cmd/orchid-daemon/main.go @@ -0,0 +1,879 @@ +/** + * @file main.go + * @brief Entry point and TCP daemon for Project ORCHID's zero-dependency simulation node. + * + * Provides CLI commands, schedules bank-split workloads sequentially, manages client connections, + * processes JSON plan payloads, and executes timing diagnostics sweeps. + * + * Originator: Teppei Oohira (@gatchimuchio) / 大平鉄兵 + * Project Lead & Maintainer: Kevin West (@westkevin12) + * License: GNU GPLv3 + */ + +package main + +import ( + "ORCHID/scheduler" + "encoding/csv" + "encoding/json" + "flag" + "fmt" + "io" + "net" + "os" + "path/filepath" + "sort" + "strconv" + "strings" +) + +const ( + separatorLine = "======================================================================" + evidenceCurrent = "evidence/current" + evidenceReproduced = "evidence/reproduced" + errorFormat = "Error: %v\n" +) + +/** + * @struct SimCase + * @brief Represents a parallel memory bank mapping configuration test case. + */ +type SimCase struct { + Name string `json:"name"` + Description string `json:"description"` + BankCount int `json:"bank_count"` + Mapping map[string]int `json:"mapping"` +} + +/** + * @struct Event + * @brief Standardized trace event payload format compatible with JSON clients. + */ +type Event struct { + Role string `json:"role"` + Kind string `json:"kind"` + Index int `json:"index"` + Bank int `json:"bank"` + Earliest int `json:"earliest"` + Start int `json:"start"` + End int `json:"end"` +} + +/** + * @struct Result + * @brief Holds full intermediate simulation results for validation correctness checks. + */ +type Result struct { + Name string + Description string + BanksAvailable int + Mapping map[string]int + Cycles uint64 + Checksum int64 + Requests []uint64 + Trace []Event + Output []int32 +} + +/** + * @struct Row + * @brief Holds tabular results to write to CSV and JSON files. + */ +type Row struct { + Case string `json:"case"` + BanksAvailable int `json:"banks_available"` + Mapping string `json:"mapping"` + Cycles uint64 `json:"cycles"` + SpeedupVsSerial float64 `json:"speedup_vs_serial"` + RequestsPerBank string `json:"requests_per_bank"` + UtilizationPerBank string `json:"utilization_per_bank"` + Checksum int64 `json:"checksum"` +} + +/** + * @struct SimOutputsConfig + * @brief Bundles all parameters and output variables to pass into writing routines. + */ +type SimOutputsConfig struct { + OutDir string + N int + Scalar int + ServiceCycles int + ComputeCycles int + Baseline Result + Results []Result + Rows []Row +} + +/** + * @struct ConnectionPayload + * @brief Outer client TCP command structure holding plan configs. + */ +type ConnectionPayload struct { + Action string `json:"action"` + Simulation *SimulationPayload `json:"simulation"` + Locality *LocalityPayload `json:"locality"` +} + +/** + * @struct SimulationPayload + * @brief Inner simulation payload containing size parameters and service cycles. + */ +type SimulationPayload struct { + N int `json:"n"` + Scalar int `json:"scalar"` + ServiceCycles int `json:"service_cycles"` + ComputeCycles int `json:"compute_cycles"` + TraceLimit int `json:"trace_limit"` + OutDir string `json:"out_dir"` +} + +/** + * @struct LocalityPayload + * @brief Inner matrix cache locality timing benchmark parameters. + */ +type LocalityPayload struct { + Repeats int `json:"repeats"` + OutDir string `json:"out_dir"` +} + +/** + * @brief Generates deterministic inputs B and C vectors for math parity validation. + * + * @param n Length of vectors to allocate. + * @return B slice and C slice of int32 elements. + */ +func generateInputVectors(n int) ([]int32, []int32) { + b := make([]int32, n) + c := make([]int32, n) + for i := 0; i < n; i++ { + b[i] = int32(((i*17 + 3) % 97) - 48) + c[i] = int32(((i*29 + 11) % 89) - 44) + } + return b, c +} + +/** + * @brief Runs sequential scheduler loops mimicking DRAM simulation steps. + * + * Runs mathematical Triad formula: A[i] = B[i] + scalar * C[i], scheduling accesses. + * + * @param n Element size length of the buffers. + * @param scalar Scalar multiplier coefficient. + * @param bankCount Number of hardware banks. + * @param mapping Key-value mapping designating which bank gets A, B, and C. + * @param serviceCycles Processing cycle latency cost per access. + * @param computeCycles Additional cycle delay for execution. + * @param traceLimit Upper limit of events recorded in the trace log. + * @return Total cycles, output slice, correctness checksum, requests per bank, trace array, error. + */ +func runTriadSimulationSequential( + n int, + scalar int32, + bankCount int, + mapping map[string]int, + serviceCycles uint64, + computeCycles uint64, + traceLimit int, +) (uint64, []int32, int64, []uint64, []Event, error) { + b, c := generateInputVectors(n) + a := make([]int32, n) + + ms, err := scheduler.NewMemoryScheduler(bankCount, serviceCycles, traceLimit) + if err != nil { + return 0, nil, 0, nil, nil, err + } + + for i := 0; i < n; i++ { + bDone, err := ms.Access("B", "READ", i, mapping["B"], 0) + if err != nil { + return 0, nil, 0, nil, nil, err + } + cDone, err := ms.Access("C", "READ", i, mapping["C"], 0) + if err != nil { + return 0, nil, 0, nil, nil, err + } + + readyCycle := bDone + if cDone > readyCycle { + readyCycle = cDone + } + computedCycle := readyCycle + computeCycles + + a[i] = b[i] + scalar*c[i] + + _, err = ms.Access("A", "WRITE", i, mapping["A"], computedCycle) + if err != nil { + return 0, nil, 0, nil, nil, err + } + } + + var checksum int64 + for i, v := range a { + checksum += int64(i+1) * int64(v) + } + + requests := make([]uint64, bankCount) + for i := 0; i < bankCount; i++ { + requests[i] = ms.GetRequests(i) + } + + schTrace := ms.GetTrace() + traceEvents := make([]Event, len(schTrace)) + for i, ev := range schTrace { + traceEvents[i] = Event{ + Role: ev.Role, + Kind: ev.Kind, + Index: ev.Index, + Bank: ev.Bank, + Earliest: int(ev.Earliest), + Start: int(ev.Start), + End: int(ev.End), + } + } + + return ms.TotalCycles(), a, checksum, requests, traceEvents, nil +} + +/** + * @brief Asserts logical result correctness across parallel configuration tests. + * + * @param results Slice of simulation results. + * @param baseline The reference serial result configuration. + * @return error if any check value or checksum does not match baseline. + */ +func validateSimResults(results []Result, baseline Result) error { + for _, res := range results[1:] { + if len(res.Output) != len(baseline.Output) { + return fmt.Errorf("mismatch in output size in case %s", res.Name) + } + for idx, v := range res.Output { + if v != baseline.Output[idx] { + return fmt.Errorf("logical calculation mismatch in case %s at index %d", res.Name, idx) + } + } + if res.Checksum != baseline.Checksum { + return fmt.Errorf("checksum mismatch in case %s", res.Name) + } + } + return nil +} + +/** + * @brief Formats raw metrics results into rounded rows for table display. + * + * @param results Slice of simulation results. + * @param baseline Reference serial simulation result. + * @param serviceCycles Processing cycle cost per memory bank access. + * @return Slice of Row models formatted for printing/writing. + */ +func buildSimRows(results []Result, baseline Result, serviceCycles int) []Row { + rows := make([]Row, len(results)) + for i, res := range results { + speedup := float64(baseline.Cycles) / float64(res.Cycles) + utilization := make([]float64, len(res.Requests)) + for bankIdx, req := range res.Requests { + utilization[bankIdx] = float64(req) * float64(serviceCycles) / float64(res.Cycles) + } + + keys := make([]string, 0, len(res.Mapping)) + for k := range res.Mapping { + keys = append(keys, k) + } + sort.Strings(keys) + sortedMappingParts := make([]string, len(keys)) + for kIdx, k := range keys { + sortedMappingParts[kIdx] = fmt.Sprintf(`"%s": %d`, k, res.Mapping[k]) + } + sortedMappingStr := "{" + strings.Join(sortedMappingParts, ", ") + "}" + + reqsJSON, _ := json.Marshal(res.Requests) + utilsRounded := make([]float64, len(utilization)) + for idx, val := range utilization { + utilsRounded[idx] = float64(int(val*1000000)) / 1000000.0 + } + utilsJSON, _ := json.Marshal(utilsRounded) + + rows[i] = Row{ + Case: res.Name, + BanksAvailable: res.BanksAvailable, + Mapping: sortedMappingStr, + Cycles: res.Cycles, + SpeedupVsSerial: float64(int(speedup*1000000)) / 1000000.0, + RequestsPerBank: string(reqsJSON), + UtilizationPerBank: string(utilsJSON), + Checksum: res.Checksum, + } + } + return rows +} + +/** + * @brief Encodes the simulation config and results into results.json. + * + * @param cfg Pointer to SimOutputsConfig. + * @return error if writing or encoding fails. + */ +func writeResultsJSON(cfg *SimOutputsConfig) error { + resultsJSONMap := map[string]interface{}{ + "workload": map[string]interface{}{ + "formula": "A[i] = B[i] + scalar * C[i]", + "n": cfg.N, + "scalar": cfg.Scalar, + "memory_operations_per_element": []string{"READ B", "READ C", "WRITE A"}, + "service_cycles_per_memory_operation": cfg.ServiceCycles, + "compute_cycles_after_reads": cfg.ComputeCycles, + }, + "baseline": cfg.Baseline.Name, + "results": cfg.Rows, + "verification": map[string]interface{}{ + "same_logical_output": true, + "checksum": cfg.Baseline.Checksum, + }, + } + resJSONBytes, err := json.MarshalIndent(resultsJSONMap, "", " ") + if err != nil { + return err + } + return os.WriteFile(filepath.Join(cfg.OutDir, "results.json"), append(resJSONBytes, '\n'), 0644) +} + +/** + * @brief Formats and writes tabular simulation results into results.csv. + * + * @param cfg Pointer to SimOutputsConfig. + * @return error if CSV creation or writes fail. + */ +func writeResultsCSV(cfg *SimOutputsConfig) error { + csvFile, err := os.Create(filepath.Join(cfg.OutDir, "results.csv")) + if err != nil { + return err + } + defer csvFile.Close() + + csvWriter := csv.NewWriter(csvFile) + defer csvWriter.Flush() + + headers := []string{"case", "banks_available", "mapping", "cycles", "speedup_vs_serial", "requests_per_bank", "utilization_per_bank", "checksum"} + if err := csvWriter.Write(headers); err != nil { + return err + } + for _, row := range cfg.Rows { + record := []string{ + row.Case, + strconv.Itoa(row.BanksAvailable), + row.Mapping, + strconv.FormatUint(row.Cycles, 10), + fmt.Sprintf("%.6f", row.SpeedupVsSerial), + row.RequestsPerBank, + row.UtilizationPerBank, + strconv.FormatInt(row.Checksum, 10), + } + if err := csvWriter.Write(record); err != nil { + return err + } + } + return nil +} + +/** + * @brief Outputs trace log events into trace_first_events.csv. + * + * @param cfg Pointer to SimOutputsConfig. + * @return error if file creation or CSV writes fail. + */ +func writeTraceCSV(cfg *SimOutputsConfig) error { + traceFile, err := os.Create(filepath.Join(cfg.OutDir, "trace_first_events.csv")) + if err != nil { + return err + } + defer traceFile.Close() + + traceWriter := csv.NewWriter(traceFile) + defer traceWriter.Flush() + + traceHeaders := []string{"case", "role", "kind", "index", "bank", "earliest", "start", "end"} + if err := traceWriter.Write(traceHeaders); err != nil { + return err + } + for _, res := range cfg.Results { + for _, ev := range res.Trace { + record := []string{ + res.Name, + ev.Role, + ev.Kind, + strconv.Itoa(ev.Index), + strconv.Itoa(ev.Bank), + strconv.Itoa(ev.Earliest), + strconv.Itoa(ev.Start), + strconv.Itoa(ev.End), + } + if err := traceWriter.Write(record); err != nil { + return err + } + } + } + return nil +} + +/** + * @brief Prints and writes a human-readable performance summary to summary.txt. + * + * @param cfg Pointer to SimOutputsConfig. + * @return error if writing file fails. + */ +func writeSummaryTXT(cfg *SimOutputsConfig) error { + summaryLines := []string{ + separatorLine, + " PROJECT ORCHID: PARALLEL MEMORY MINIMAL POC", + " Originator: Teppei Oohira (@gatchimuchio) / 大平鉄兵", + " Maintainer: Kevin West (@westkevin12)", + separatorLine, + fmt.Sprintf("workload: A[i] = B[i] + %d * C[i] | total_elements=%d", cfg.Scalar, cfg.N), + fmt.Sprintf("latency_cycles: service=%d | compute=%d", cfg.ServiceCycles, cfg.ComputeCycles), + fmt.Sprintf("verification: identical outputs validated | checksum=%d", cfg.Baseline.Checksum), + "", + fmt.Sprintf("%-42s %14s %12s %20s", "case", "cycles", "speedup", "requests/bank"), + } + for _, row := range cfg.Rows { + summaryLines = append(summaryLines, + fmt.Sprintf("%-42s %14d %11.3fx %20s", + row.Case, row.Cycles, row.SpeedupVsSerial, row.RequestsPerBank, + ), + ) + } + summaryLines = append(summaryLines, + "", + "INTERPRETATION", + "- serial_single_memory is the undifferentiated one-service baseline.", + "- parallel_two_memory_role_split is the conservative proposed minimum: independent source access is parallelized while output still shares one bank.", + "- parallel_three_memory_role_split shows the upper reference when input and output roles have independent services.", + "- parallel_two_memory_conflicted_control proves that merely having multiple banks gives no benefit unless roles/requests are separated correctly.", + "", + "BOUNDARY", + "- This is a deterministic architectural scheduling proof, not a physical DRAM benchmark.", + "- Physical validation requires an implementation substrate exposing independent memory service paths or hardware/FPGA/simulator support.", + separatorLine, + "", + ) + summaryText := strings.Join(summaryLines, "\n") + fmt.Print(summaryText) + return os.WriteFile(filepath.Join(cfg.OutDir, "summary.txt"), []byte(summaryText), 0644) +} + +/** + * @brief Coordinates outputs creation (JSON, CSV, summary) inside output directory. + * + * @param cfg Pointer to SimOutputsConfig containing simulation context. + * @return error if directories creation or file writes fail. + */ +func writeSimulationOutputs(cfg *SimOutputsConfig) error { + if cfg.OutDir == "" { + return nil + } + + if err := os.MkdirAll(cfg.OutDir, 0755); err != nil { + return err + } + + if err := writeResultsJSON(cfg); err != nil { + return err + } + + if err := writeResultsCSV(cfg); err != nil { + return err + } + + if err := writeTraceCSV(cfg); err != nil { + return err + } + + return writeSummaryTXT(cfg) +} + +/** + * @brief Entry point for executing the parallel memory bank scheduler simulation. + * + * Allocates scheduler structures, runs sequential case combinations, verifies parity, + * formats speedups tables, and records logs. + * + * @param n Element size length of the buffers. + * @param scalar Scalar multiplier coefficient. + * @param serviceCycles Processing cycle latency cost per access. + * @param computeCycles Additional cycle delay for execution. + * @param traceLimit Upper limit of events recorded in the trace log. + * @param outDir Target directory to save results files. + * @return Result summary parameters or an error. + */ +func RunSimulationBenchmark(n int, scalar int, serviceCycles, computeCycles, traceLimit int, outDir string) (map[string]interface{}, error) { + if n < 1 || computeCycles < 0 { + return nil, fmt.Errorf("n must be >= 1 and compute_cycles must be >= 0") + } + + cases := []SimCase{ + {"serial_single_memory", "One logical memory service; B read, C read and A write serialize.", 1, map[string]int{"B": 0, "C": 0, "A": 0}}, + {"parallel_two_memory_role_split", "Two independent services; B read and A write share bank 0, C read uses bank 1.", 2, map[string]int{"B": 0, "C": 1, "A": 0}}, + {"parallel_three_memory_role_split", "Three independent services; B read, C read and A write have distinct banks.", 3, map[string]int{"B": 0, "C": 1, "A": 2}}, + {"parallel_two_memory_conflicted_control", "Two banks exist but all data roles are placed on bank 0; negative control.", 2, map[string]int{"B": 0, "C": 0, "A": 0}}, + } + + results := make([]Result, len(cases)) + for i, c := range cases { + cycles, out, checksum, requests, trace, err := runTriadSimulationSequential( + n, int32(scalar), c.BankCount, c.Mapping, uint64(serviceCycles), uint64(computeCycles), traceLimit, + ) + if err != nil { + return nil, err + } + results[i] = Result{ + Name: c.Name, + Description: c.Description, + BanksAvailable: c.BankCount, + Mapping: c.Mapping, + Cycles: cycles, + Checksum: checksum, + Requests: requests, + Trace: trace, + Output: out, + } + } + + baseline := results[0] + if err := validateSimResults(results, baseline); err != nil { + return nil, err + } + + rows := buildSimRows(results, baseline, serviceCycles) + + cfg := &SimOutputsConfig{ + OutDir: outDir, + N: n, + Scalar: scalar, + ServiceCycles: serviceCycles, + ComputeCycles: computeCycles, + Baseline: baseline, + Results: results, + Rows: rows, + } + + if err := writeSimulationOutputs(cfg); err != nil { + return nil, err + } + + return map[string]interface{}{ + "cycles": baseline.Cycles, + "checksum": baseline.Checksum, + }, nil +} + +/** + * @brief Parses incoming client simulation configuration variables or loads defaults. + * + * @param p Pointer to SimulationPayload. + * @return parsed n, scalar, serviceCycles, computeCycles, traceLimit, and outDir values. + */ +func parseSimPayload(p *SimulationPayload) (int, int, int, int, int, string) { + n := 16384 + scalar := 3 + serviceCycles := 100 + computeCycles := 1 + traceLimit := 18 + outDir := evidenceCurrent + + if p != nil { + if p.N > 0 { + n = p.N + } + if p.Scalar != 0 { + scalar = p.Scalar + } + if p.ServiceCycles > 0 { + serviceCycles = p.ServiceCycles + } + if p.ComputeCycles >= 0 { + computeCycles = p.ComputeCycles + } + if p.TraceLimit > 0 { + traceLimit = p.TraceLimit + } + if p.OutDir != "" { + outDir = p.OutDir + } + } + return n, scalar, serviceCycles, computeCycles, traceLimit, outDir +} + +/** + * @brief Parses client locality benchmark repeat parameters or loads defaults. + * + * @param p Pointer to LocalityPayload. + * @return parsed repeats count and output directory path. + */ +func parseLocalityPayload(p *LocalityPayload) (int, string) { + repeats := 8 + outDir := evidenceReproduced + + if p != nil { + if p.Repeats > 0 { + repeats = p.Repeats + } + if p.OutDir != "" { + outDir = p.OutDir + } + } + return repeats, outDir +} + +/** + * @brief Reads, decodes, executes and responds to client TCP requests in daemon mode. + * + * @param conn Net Connection interface. + */ +func handleConnection(conn net.Conn) { + defer conn.Close() + + var payload ConnectionPayload + decoder := json.NewDecoder(conn) + if err := decoder.Decode(&payload); err != nil { + writeErrorResponse(conn, fmt.Sprintf("invalid json payload: %v", err)) + return + } + + response := map[string]interface{}{"status": "success"} + action := strings.ToLower(payload.Action) + if action == "" { + action = "all" + } + + if action == "simulation" || action == "all" { + n, scalar, service, compute, trace, out := parseSimPayload(payload.Simulation) + res, err := RunSimulationBenchmark(n, scalar, service, compute, trace, out) + if err != nil { + writeErrorResponse(conn, fmt.Sprintf("simulation failed: %v", err)) + return + } + response["simulation"] = res + } + + if action == "locality" || action == "all" { + repeats, out := parseLocalityPayload(payload.Locality) + res, err := RunLocalityBenchmark(repeats, out) + if err != nil { + writeErrorResponse(conn, fmt.Sprintf("locality benchmark failed: %v", err)) + return + } + response["locality"] = res + } + + responseBytes, _ := json.Marshal(response) + conn.Write(append(responseBytes, '\n')) +} + +/** + * @brief Encodes and writes standard error outputs to TCP clients. + * + * @param conn Net Connection interface. + * @param errMsg Human-readable error message. + */ +func writeErrorResponse(conn net.Conn, errMsg string) { + res := map[string]interface{}{ + "status": "error", + "error": errMsg, + } + resBytes, _ := json.Marshal(res) + conn.Write(append(resBytes, '\n')) +} + +/** + * @brief Loads, reads, and decodes planning instruction documents. + * + * @param planPath File path string location, or '-' to read from stdin. + * @return ConnectionPayload configuration pointer or an error. + */ +func decodePlanFile(planPath string) (*ConnectionPayload, error) { + var input io.Reader + if planPath == "-" { + input = os.Stdin + } else { + file, err := os.Open(planPath) + if err != nil { + return nil, err + } + defer file.Close() + input = file + } + + var plan ConnectionPayload + if err := json.NewDecoder(input).Decode(&plan); err != nil { + return nil, err + } + return &plan, nil +} + +/** + * @brief Executes local simulation and locality workloads defined in JSON plan files. + * + * @param planPath File path containing simulation details. + * @param customOutDir Custom output directory to write logs (optional). + */ +func executePlan(planPath string, customOutDir string) { + plan, err := decodePlanFile(planPath) + if err != nil { + fmt.Fprintf(os.Stderr, "Failed to load plan: %v\n", err) + os.Exit(1) + } + + action := strings.ToLower(plan.Action) + if action == "" { + action = "all" + } + + if action == "simulation" || action == "all" { + n, scalar, service, compute, trace, outDir := parseSimPayload(plan.Simulation) + if customOutDir != "" { + outDir = customOutDir + } + + fmt.Printf("Executing simulation (N=%d, scalar=%d, service_cycles=%d, compute_cycles=%d, out_dir=%s)...\n", + n, scalar, service, compute, outDir) + _, err := RunSimulationBenchmark(n, scalar, service, compute, trace, outDir) + if err != nil { + fmt.Fprintf(os.Stderr, "Simulation execution failed: %v\n", err) + os.Exit(1) + } + } + + if action == "locality" || action == "all" { + repeats, outDir := parseLocalityPayload(plan.Locality) + if customOutDir != "" { + outDir = customOutDir + } + + fmt.Printf("Executing locality benchmark (repeats=%d, out_dir=%s)...\n", repeats, outDir) + _, err := RunLocalityBenchmark(repeats, outDir) + if err != nil { + fmt.Fprintf(os.Stderr, "Locality execution failed: %v\n", err) + os.Exit(1) + } + } +} + +/** + * @brief Listens on TCP sockets to serve incoming planning payload requests. + * + * @param addr Address string to bind to (e.g. ":9000"). + */ +func runDaemon(addr string) { + ln, err := net.Listen("tcp", addr) + if err != nil { + fmt.Fprintf(os.Stderr, "Failed to listen on %s: %v\n", addr, err) + os.Exit(1) + } + defer ln.Close() + fmt.Printf("Go daemon listening on TCP %s...\n", addr) + for { + conn, err := ln.Accept() + if err != nil { + continue + } + go handleConnection(conn) + } +} + +/** + * @brief Command line helper to run the simulation benchmark. + * + * @param outDir Target directory to write outputs. + */ +func runSimulationCLI(outDir string) { + fmt.Println("Running simulation benchmark...") + _, err := RunSimulationBenchmark(16384, 3, 100, 1, 18, outDir) + if err != nil { + fmt.Fprintf(os.Stderr, errorFormat, err) + os.Exit(1) + } +} + +/** + * @brief Command line helper to execute locality timing sweeps. + * + * @param outDir Target directory to write outputs. + */ +func runLocalityCLI(outDir string) { + fmt.Println("Running locality cache benchmark...") + _, err := RunLocalityBenchmark(8, outDir) + if err != nil { + fmt.Fprintf(os.Stderr, errorFormat, err) + os.Exit(1) + } +} + +/** + * @brief Command line helper to run both simulation and locality sweeps. + * + * @param simDir Target simulation output directory. + * @param locDir Target locality output directory. + */ +func runAllCLI(simDir, locDir string) { + fmt.Println("==========================================") + fmt.Println("Running parallel bank simulation...") + _, err := RunSimulationBenchmark(16384, 3, 100, 1, 18, simDir) + if err != nil { + fmt.Fprintf(os.Stderr, errorFormat, err) + os.Exit(1) + } + + fmt.Println("\n==========================================") + fmt.Println("Running CPU locality matrix benchmark...") + _, err = RunLocalityBenchmark(8, locDir) + if err != nil { + fmt.Fprintf(os.Stderr, errorFormat, err) + os.Exit(1) + } +} + +/** + * @brief main executable entry point routing execution flags. + */ +func main() { + modeFlag := flag.String("mode", "daemon", "Execution mode: daemon, simulation, locality, all") + planFlag := flag.String("plan", "", "Path to local JSON plan file to execute (or '-' for stdin)") + addrFlag := flag.String("addr", ":9000", "TCP address to listen on in daemon mode") + outDirFlag := flag.String("out-dir", "", "Custom output directory for files (overrides defaults)") + flag.Parse() + + if *planFlag != "" { + executePlan(*planFlag, *outDirFlag) + return + } + + switch *modeFlag { + case "daemon": + runDaemon(*addrFlag) + case "simulation": + outDir := *outDirFlag + if outDir == "" { + outDir = evidenceCurrent + } + runSimulationCLI(outDir) + case "locality": + outDir := *outDirFlag + if outDir == "" { + outDir = evidenceReproduced + } + runLocalityCLI(outDir) + case "all": + simDir := *outDirFlag + if simDir == "" { + simDir = evidenceCurrent + } + locDir := *outDirFlag + if locDir == "" { + locDir = evidenceReproduced + } + runAllCLI(simDir, locDir) + default: + fmt.Fprintf(os.Stderr, "Unknown mode: %s\n", *modeFlag) + os.Exit(1) + } +} diff --git a/cmd/orchid-daemon/matmul_fallback.c b/cmd/orchid-daemon/matmul_fallback.c new file mode 100644 index 0000000..5179f32 --- /dev/null +++ b/cmd/orchid-daemon/matmul_fallback.c @@ -0,0 +1,58 @@ +/** + * @file matmul_fallback.c + * @brief C fallbacks and CPU capability checking for ORCHID matrix kernels. + */ + +#include +#include +#include + +#define N 512 + +static volatile uint64_t flush_sink = 0; + +/** + * @brief Dynamic CPUID hardware capability check for AVX-512 foundation support. + */ +int has_avx512f(void) { + unsigned int eax, ebx, ecx, edx; + if (__get_cpuid_max(0, NULL) < 7) { + return 0; + } + __cpuid_count(7, 0, eax, ebx, ecx, edx); + return (ebx & (1 << 16)) != 0; // AVX-512 Foundation is bit 16 in EBX of CPUID leaf 7, subleaf 0 +} + +/** + * @brief Contiguous Locality-Aligned (I-K-J) fallback kernel in C. + * Used when the host processor does not support native AVX-512 vector instructions. + */ +void matmul_locality_fallback(const int32_t *a, const int32_t *b, int32_t *c) { + for (int i = 0; i < N; ++i) { + for (int k = 0; k < N; ++k) { + int32_t aik = a[i * N + k]; + for (int j = 0; j < N; ++j) { + c[i * N + j] += aik * b[k * N + j]; + } + } + } +} + +/** + * @brief Flushes cache lines from L1-L3 by writing to contiguous pages. + */ +void flush_cache_c(uint8_t *buf, size_t size) { + uint64_t local = 0; + for (size_t i = 0; i < size; i += 64) { + buf[i] = (uint8_t)(buf[i] + 1u); + local += buf[i]; + } + flush_sink += local; +} + +/** + * @brief Retrieves the flush sink accumulator to prevent optimization. + */ +uint64_t get_flush_sink(void) { + return flush_sink; +} diff --git a/cmd/orchid-daemon/matmul_wrapper.go b/cmd/orchid-daemon/matmul_wrapper.go new file mode 100644 index 0000000..e413cff --- /dev/null +++ b/cmd/orchid-daemon/matmul_wrapper.go @@ -0,0 +1,337 @@ +/** + * @file matmul_wrapper.go + * @brief Go wrapper linking C/assembly matrix kernels and executing locality timing benchmarks. + * + * Coordinate AVX-512/scalar dispatch execution, physical memory alignment allocations, + * CPU cache flushes, statistical speedup analysis, and timing files creation. + * + * Originator: Teppei Oohira (@gatchimuchio) / 大平鉄兵 + * Project Lead & Maintainer: Kevin West (@westkevin12) + * License: GNU GPLv3 + */ + +package main + +/* +#include +#include +#include + +int has_avx512f(void); +void matmul_flat(const int32_t *a, const int32_t *b, int32_t *c); +void matmul_locality(const int32_t *a, const int32_t *b, int32_t *c); +void matmul_locality_fallback(const int32_t *a, const int32_t *b, int32_t *c); +void flush_cache_c(uint8_t *buf, size_t size); +uint64_t get_flush_sink(void); +*/ +import "C" +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "sort" + "time" + "unsafe" +) + +const ( + N = 512 + Cells = N * N + Bytes = Cells * 4 + FlushBytes = 64 * 1024 * 1024 +) + +/** + * @struct LocalityResult + * @brief Stores timing speedup statistics and correctness verification checksums. + */ +type LocalityResult struct { + Min float64 `json:"min"` + Median float64 `json:"median"` + Max float64 `json:"max"` + Mean float64 `json:"mean"` + Checksum int64 `json:"checksum"` +} + +/** + * @brief Computes the median value of a slice of floats. + * + * @param values Slice of floats to compute median for. + * @return The calculated median value. + */ +func median(values []float64) float64 { + if len(values) == 0 { + return 0.0 + } + sort.Float64s(values) + n := len(values) + if n%2 == 1 { + return values[n/2] + } + return (values[n/2-1] + values[n/2]) / 2.0 +} + +/** + * @brief Invokes either the AVX-512 assembly kernel or the scalar fallback kernel. + * + * @param aPtr Pointer to input matrix A. + * @param bPtr Pointer to input matrix B. + * @param clPtr Pointer to output matrix C. + * @param useAVX512 Flag indicating if AVX-512 should be executed. + */ +func executeLocalityKernel(aPtr, bPtr, clPtr unsafe.Pointer, useAVX512 bool) { + if useAVX512 { + C.matmul_locality((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(clPtr)) + } else { + C.matmul_locality_fallback((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(clPtr)) + } +} + +/** + * @brief Executes pairs of flat vs locality benchmarks to measure cache speedups. + * + * @param repeats Number of benchmark iterations to perform. + * @param aPtr Pointer to matrix A. + * @param bPtr Pointer to matrix B. + * @param cfPtr Pointer to flat output buffer. + * @param clPtr Pointer to locality output buffer. + * @param flushPtr Pointer to cache flushing buffer space. + * @param useAVX512 Flag for AVX-512 hardware support. + * @return Speedup values slice and printed log lines slice. + */ +func runBenchmarkPairs(repeats int, aPtr, bPtr, cfPtr, clPtr, flushPtr unsafe.Pointer, useAVX512 bool) ([]float64, []string) { + var speedups []float64 + var timingLines []string + + for r := 0; r < repeats; r++ { + var flatSec, localSec float64 + var order string + + if r%2 == 0 { + order = "flat-first" + C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes)) + C.memset(cfPtr, 0, C.size_t(Bytes)) + t0 := time.Now() + C.matmul_flat((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(cfPtr)) + flatSec = time.Since(t0).Seconds() + + C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes)) + C.memset(clPtr, 0, C.size_t(Bytes)) + t0 = time.Now() + executeLocalityKernel(aPtr, bPtr, clPtr, useAVX512) + localSec = time.Since(t0).Seconds() + } else { + order = "locality-first" + C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes)) + C.memset(clPtr, 0, C.size_t(Bytes)) + t0 := time.Now() + executeLocalityKernel(aPtr, bPtr, clPtr, useAVX512) + localSec = time.Since(t0).Seconds() + + C.flush_cache_c((*C.uint8_t)(flushPtr), C.size_t(FlushBytes)) + C.memset(cfPtr, 0, C.size_t(Bytes)) + t0 = time.Now() + C.matmul_flat((*C.int32_t)(aPtr), (*C.int32_t)(bPtr), (*C.int32_t)(cfPtr)) + flatSec = time.Since(t0).Seconds() + } + + speedup := flatSec / localSec + speedups = append(speedups, speedup) + + pairMsg := fmt.Sprintf("PAIR %d order=%s flat_sec=%.9f locality_sec=%.9f speedup=%.3fx", + r+1, order, flatSec, localSec, speedup) + fmt.Println(pairMsg) + timingLines = append(timingLines, pairMsg) + } + + return speedups, timingLines +} + +/** + * @struct BenchmarkOutputs + * @brief Groups together the benchmark output metrics and directory configurations. + */ +type BenchmarkOutputs struct { + OutDir string + TelemetryMsg string + VerifyMsg string + FlushSinkMsg string + TimingLines []string + Min float64 + Median float64 + Max float64 + Mean float64 +} + +/** + * @brief Outputs the timing log, statistical summaries, and JSON speedups to the file system. + * + * @param cfg Pointer to BenchmarkOutputs configuration. + * @return error if any directory creation or write operation fails. + */ +func writeBenchmarkOutputs(cfg *BenchmarkOutputs) error { + if cfg.OutDir == "" { + return nil + } + + if err := os.MkdirAll(cfg.OutDir, 0755); err != nil { + return err + } + + // 1. Write fair_timing_result_current_environment.txt + timingContent := cfg.TelemetryMsg + "\n" + cfg.VerifyMsg + "\n" + for _, line := range cfg.TimingLines { + timingContent += line + "\n" + } + timingContent += cfg.FlushSinkMsg + "\n" + err := os.WriteFile(filepath.Join(cfg.OutDir, "fair_timing_result_current_environment.txt"), []byte(timingContent), 0644) + if err != nil { + return err + } + + // 2. Write fair_summary_current_environment.txt + summaryContent := fmt.Sprintf("speedup_min=%.3fx\nspeedup_median=%.3fx\nspeedup_max=%.3fx\nspeedup_mean=%.3fx\n", + cfg.Min, cfg.Median, cfg.Max, cfg.Mean) + err = os.WriteFile(filepath.Join(cfg.OutDir, "fair_summary_current_environment.txt"), []byte(summaryContent), 0644) + if err != nil { + return err + } + + // 3. Write speedups.json + speedupMap := map[string]string{ + "min": fmt.Sprintf("%.3fx", cfg.Min), + "median": fmt.Sprintf("%.3fx", cfg.Median), + "max": fmt.Sprintf("%.3fx", cfg.Max), + "mean": fmt.Sprintf("%.3fx", cfg.Mean), + } + speedupJSON, err := json.MarshalIndent(speedupMap, "", " ") + if err != nil { + return err + } + return os.WriteFile(filepath.Join(cfg.OutDir, "speedups.json"), append(speedupJSON, '\n'), 0644) +} + +/** + * @brief Entry point for running the matrix cache-locality timing benchmark. + * + * Coordinates memory allocations, registers inputs, validates flat/locality equivalence, + * runs the benchmarking sweeps, and writes outputs to the output directory. + * + * @param repeats Number of benchmark pairs to execute. + * @param outDir Target directory to save results files. + * @return LocalityResult with statistics and checksum or an error. + */ +func RunLocalityBenchmark(repeats int, outDir string) (*LocalityResult, error) { + if repeats < 1 { + repeats = 8 + } + + // 1. Allocate 64-byte aligned memory buffers to ensure optimal AVX cache alignment + aPtr := C.aligned_alloc(64, C.size_t(Bytes)) + bPtr := C.aligned_alloc(64, C.size_t(Bytes)) + cfPtr := C.aligned_alloc(64, C.size_t(Bytes)) + clPtr := C.aligned_alloc(64, C.size_t(Bytes)) + flushPtr := C.aligned_alloc(64, C.size_t(FlushBytes)) + + if aPtr == nil || bPtr == nil || cfPtr == nil || clPtr == nil || flushPtr == nil { + return nil, fmt.Errorf("system failed to allocate cache-aligned buffers") + } + + defer C.free(aPtr) + defer C.free(bPtr) + defer C.free(cfPtr) + defer C.free(clPtr) + defer C.free(flushPtr) + + C.memset(flushPtr, 1, C.size_t(FlushBytes)) + + // Cast C pointers to Go slices for easy initialization and checksum + aSlice := (*[1 << 28]int32)(unsafe.Pointer(aPtr))[:Cells:Cells] + bSlice := (*[1 << 28]int32)(unsafe.Pointer(bPtr))[:Cells:Cells] + cfSlice := (*[1 << 28]int32)(unsafe.Pointer(cfPtr))[:Cells:Cells] + clSlice := (*[1 << 28]int32)(unsafe.Pointer(clPtr))[:Cells:Cells] + + // Fill inputs with deterministic pseudo-random values + for i := 0; i < Cells; i++ { + aSlice[i] = int32((uint32(i)*17 + 3) % 7) - 3 + bSlice[i] = int32((uint32(i)*13 + 5) % 7) - 3 + } + + // Detect host AVX-512 capability at runtime + useAVX512 := C.has_avx512f() != 0 + telemetryMsg := "HARDWARE TELEMETRY: AVX-512 not supported. Dispatching to optimized scalar fallback kernel." + if useAVX512 { + telemetryMsg = "HARDWARE TELEMETRY: Native AVX-512 support detected. Dispatching to assembly vector kernel." + } + fmt.Println(telemetryMsg) + + // Initial warm run & arithmetic validation check + C.memset(cfPtr, 0, C.size_t(Bytes)) + C.memset(clPtr, 0, C.size_t(Bytes)) + + C.matmul_flat((*C.int32_t)(unsafe.Pointer(aPtr)), (*C.int32_t)(unsafe.Pointer(bPtr)), (*C.int32_t)(unsafe.Pointer(cfPtr))) + executeLocalityKernel(aPtr, bPtr, clPtr, useAVX512) + + // Verify equal outputs + for i := 0; i < Cells; i++ { + if cfSlice[i] != clSlice[i] { + return nil, fmt.Errorf("MISMATCH: Verification failure at index=%d flat=%d locality=%d", i, cfSlice[i], clSlice[i]) + } + } + + // Calculate checksum of results + var checksum int64 + for i, v := range cfSlice { + checksum += int64(i+1) * int64(v) + } + + verifyMsg := fmt.Sprintf("VERIFY equal N=%d operations=%d cache_flush_bytes=%d", N, N*N*N, FlushBytes) + fmt.Println(verifyMsg) + + // Collect timing pairs + speedups, timingLines := runBenchmarkPairs(repeats, aPtr, bPtr, cfPtr, clPtr, flushPtr, useAVX512) + + flushSinkMsg := fmt.Sprintf("FLUSH sink=%d", C.get_flush_sink()) + fmt.Println(flushSinkMsg) + + // Compute summary statistics + minVal := speedups[0] + maxVal := speedups[0] + sumVal := 0.0 + for _, v := range speedups { + if v < minVal { + minVal = v + } + if v > maxVal { + maxVal = v + } + sumVal += v + } + meanVal := sumVal / float64(len(speedups)) + medianVal := median(speedups) + + // Write output files + cfg := &BenchmarkOutputs{ + OutDir: outDir, + TelemetryMsg: telemetryMsg, + VerifyMsg: verifyMsg, + FlushSinkMsg: flushSinkMsg, + TimingLines: timingLines, + Min: minVal, + Median: medianVal, + Max: maxVal, + Mean: meanVal, + } + if err := writeBenchmarkOutputs(cfg); err != nil { + return nil, err + } + + return &LocalityResult{ + Min: minVal, + Median: medianVal, + Max: maxVal, + Mean: meanVal, + Checksum: checksum, + }, nil +} diff --git a/evidence/reproduced/speedups.json b/evidence/reproduced/speedups.json index f75b36c..368b424 100644 --- a/evidence/reproduced/speedups.json +++ b/evidence/reproduced/speedups.json @@ -1,6 +1,6 @@ { - "min": "4.011x", - "median": "4.109x", - "max": "4.336x", - "mean": "4.133x" -} \ No newline at end of file + "max": "3.969x", + "mean": "3.756x", + "median": "3.721x", + "min": "3.617x" +}