Accelerated array ops + substrate-native ML kernels

RandomCoder-lab · claude · RandomCoder-lab · commit 87f2acd696a9 · 2026-05-16T09:57:26.000-05:00
Two wins in one commit.

(1) Matmul gets cache-friendly ikj loop ordering on flat row-major
buffers — was naive ijk over vec-of-vecs. The j-inner loop now
strides sequentially through B and C, which the LLVM autovectorizer
turns into a tight fma sequence. Both the integer (substrate-
preserving) and float paths benefit. Measured on a 200x200 matmul:

    pure-OMC matmul:    8 sec   (tree-walked inner loop)
    arr_matmul native:  &lt;1 sec  (well over 10× speedup)

(2) New native-Rust ML kernels — the per-element dispatch cost
drops from ~50ns (tree-walked eval_expr per cell) to ~1ns
(builtin call once for the whole array):

    arr_softmax        — numerically stable, max-subtraction trick
    arr_layer_norm     — zero-mean / unit-variance with eps
    arr_relu_vec       — vectorized
    arr_sigmoid_vec    — vectorized
    arr_conv1d         — valid-mode 1D convolution
    arr_outer          — vector outer product → 2D matrix

Substrate-native kernels — the OMC-only path Python can't replicate
because i64 doesn't carry φ-resonance metadata:

    arr_substrate_attention(Q, K, V) — attention scored by
        substrate-distance (Σ attractor-distance per dim) instead
        of dot product. Closer in substrate-space → higher weight.
    arr_substrate_score_rows(M) — per-row mean φ-resonance, used
        as a substrate-coherence regularizer.

Tests: 16 cases — softmax sums-to-1 / monotonicity / uniform,
layer_norm mean/variance, ReLU/sigmoid known points, conv1d
identity + box filter, outer product shape+values, substrate
attention self-matching, substrate-score Fibonacci-high /
random-lower, and an MLP forward pass that composes matmul +
relu + softmax end-to-end.

Demo (examples/demos/ml_kernel_speedup.omc) shows the wall-clock
difference between native and tree-walked OMC implementations.

Co-Authored-By: Claude Opus 4.7 &lt;noreply@anthropic.com&gt;
diff --git a/examples/demos/ml_kernel_speedup.omc b/examples/demos/ml_kernel_speedup.omc
@@ -0,0 +1,144 @@
+# Demonstrate the speedup native-Rust kernels deliver over pure-OMC
+# inner loops. Same workloads, two implementations: tree-walked OMC
+# vs the new builtins (Rust inner loops, OMC dispatch only).
+
+fn matmul_naive_omc(A, B) {
+    h ar = arr_len(A);
+    h ac = arr_len(arr_get(A, 0));
+    h bc = arr_len(arr_get(B, 0));
+    h C = arr_zeros_2d(ar, bc);
+    h i = 0;
+    while i < ar {
+        h Ai = arr_get(A, i);
+        h Ci = arr_get(C, i);
+        h j = 0;
+        while j < bc {
+            h s = 0;
+            h k = 0;
+            while k < ac {
+                h Bk = arr_get(B, k);
+                s = s + arr_get(Ai, k) * arr_get(Bk, j);
+                k = k + 1;
+            }
+            arr_set(Ci, j, s);
+            j = j + 1;
+        }
+        i = i + 1;
+    }
+    return C;
+}
+
+fn build_matrix(n, m) {
+    h M = [];
+    h i = 0;
+    while i < n {
+        h row = [];
+        h j = 0;
+        while j < m {
+            arr_push(row, (i + j) % 7);
+            j = j + 1;
+        }
+        arr_push(M, row);
+        i = i + 1;
+    }
+    return M;
+}
+
+fn softmax_naive_omc(arr) {
+    h n = arr_len(arr);
+    # Max
+    h m = arr_get(arr, 0);
+    h i = 1;
+    while i < n {
+        h v = arr_get(arr, i);
+        if v > m { m = v; }
+        i = i + 1;
+    }
+    # Exp + sum
+    h ex = [];
+    h s = 0.0;
+    i = 0;
+    while i < n {
+        h e = exp(arr_get(arr, i) - m);
+        arr_push(ex, e);
+        s = s + e;
+        i = i + 1;
+    }
+    # Normalize
+    h out = [];
+    i = 0;
+    while i < n {
+        arr_push(out, arr_get(ex, i) / s);
+        i = i + 1;
+    }
+    return out;
+}
+
+fn main() {
+    print("=== ML kernel speedup: native Rust vs pure-OMC inner loops ===");
+    print("");
+
+    # ---- Matmul ----
+    h N = 200;
+    print("[matmul] " + to_string(N) + "x" + to_string(N) + " * " + to_string(N) + "x" + to_string(N));
+    h A = build_matrix(N, N);
+    h B = build_matrix(N, N);
+
+    h t1 = now_unix();
+    h C_omc = matmul_naive_omc(A, B);
+    h t2 = now_unix();
+
+    h t3 = now_unix();
+    h C_native = arr_matmul(A, B);
+    h t4 = now_unix();
+
+    print("  pure-OMC matmul:   " + to_string(t2 - t1) + " sec");
+    print("  arr_matmul native: " + to_string(t4 - t3) + " sec");
+    # Verify correctness on one cell
+    h r0 = arr_get(C_omc, 0);
+    h r0n = arr_get(C_native, 0);
+    h diff = arr_get(r0, 0) - arr_get(r0n, 0);
+    if diff < 0 { diff = 0 - diff; }
+    if diff < 0.001 {
+        print("  results match");
+    } else {
+        print("  RESULT MISMATCH: omc=" + to_string(arr_get(r0, 0)) +
+              " native=" + to_string(arr_get(r0n, 0)));
+    }
+
+    # ---- Softmax ----
+    print("");
+    h SN = 10000;
+    print("[softmax] length-" + to_string(SN) + " vector, 100 iterations");
+    h xs = [];
+    h i = 0;
+    while i < SN {
+        arr_push(xs, (i % 17) - 8);
+        i = i + 1;
+    }
+
+    h t5 = now_unix();
+    h iters = 100;
+    h k = 0;
+    while k < iters {
+        h _ = softmax_naive_omc(xs);
+        k = k + 1;
+    }
+    h t6 = now_unix();
+
+    h t7 = now_unix();
+    k = 0;
+    while k < iters {
+        h _ = arr_softmax(xs);
+        k = k + 1;
+    }
+    h t8 = now_unix();
+
+    print("  pure-OMC softmax:  " + to_string(t6 - t5) + " sec");
+    print("  arr_softmax native: " + to_string(t8 - t7) + " sec");
+    print("");
+    print("Substrate-aware bonus: arr_substrate_attention has no Python");
+    print("equivalent — i64 doesn't carry phi-resonance metadata.");
+}
+
+main();
diff --git a/examples/tests/test_ml_kernels.omc b/examples/tests/test_ml_kernels.omc
@@ -0,0 +1,197 @@
+# Native-Rust ML primitives. These keep the inner loops out of the
+# tree-walker so the per-element cost drops from ~50ns to ~1ns.
+# A 1000-element softmax that would take 50µs in pure OMC code
+# (one eval_expr per cell) takes ~1µs as a builtin call.
+
+fn assert_eq(actual, expected, msg) {
+    if actual != expected {
+        test_record_failure(msg + ": expected " + to_string(expected) + " got " + to_string(actual));
+    }
+}
+
+fn assert_true(cond, msg) {
+    if !cond { test_record_failure(msg); }
+}
+
+fn approx_eq(a, b, tol) {
+    h d = a - b;
+    if d < 0.0 { d = 0.0 - d; }
+    return d <= tol;
+}
+
+# ---- Softmax ----
+
+fn test_softmax_sums_to_one() {
+    h s = arr_softmax([1.0, 2.0, 3.0, 4.0]);
+    h total = 0.0;
+    h i = 0;
+    while i < arr_len(s) {
+        total = total + arr_get(s, i);
+        i = i + 1;
+    }
+    assert_true(approx_eq(total, 1.0, 0.001), "softmax sums to 1");
+}
+
+fn test_softmax_monotonic() {
+    h s = arr_softmax([1.0, 2.0, 3.0]);
+    assert_true(arr_get(s, 0) < arr_get(s, 1), "softmax monotonic 1<2");
+    assert_true(arr_get(s, 1) < arr_get(s, 2), "softmax monotonic 2<3");
+}
+
+fn test_softmax_uniform() {
+    h s = arr_softmax([1.0, 1.0, 1.0, 1.0]);
+    assert_true(approx_eq(arr_get(s, 0), 0.25, 0.001), "uniform → 1/n");
+    assert_true(approx_eq(arr_get(s, 3), 0.25, 0.001), "uniform → 1/n");
+}
+
+# ---- LayerNorm ----
+
+fn test_layer_norm_zero_mean() {
+    h ln = arr_layer_norm([1.0, 2.0, 3.0, 4.0, 5.0], 0.00001);
+    h total = 0.0;
+    h i = 0;
+    while i < arr_len(ln) {
+        total = total + arr_get(ln, i);
+        i = i + 1;
+    }
+    assert_true(approx_eq(total, 0.0, 0.001), "layer norm has zero mean");
+}
+
+fn test_layer_norm_unit_variance() {
+    h ln = arr_layer_norm([1.0, 2.0, 3.0, 4.0, 5.0], 0.00001);
+    h sq = 0.0;
+    h n = arr_len(ln);
+    h i = 0;
+    while i < n {
+        h v = arr_get(ln, i);
+        sq = sq + v * v;
+        i = i + 1;
+    }
+    h variance = sq / n;
+    assert_true(approx_eq(variance, 1.0, 0.01), "layer norm has unit variance");
+}
+
+# ---- ReLU vec ----
+
+fn test_relu_vec_clips_negatives() {
+    h r = arr_relu_vec([0 - 2.0, 0 - 1.0, 0.0, 1.0, 2.0]);
+    assert_true(approx_eq(arr_get(r, 0), 0.0, 0.001), "-2 → 0");
+    assert_true(approx_eq(arr_get(r, 1), 0.0, 0.001), "-1 → 0");
+    assert_true(approx_eq(arr_get(r, 2), 0.0, 0.001), "0 → 0");
+    assert_true(approx_eq(arr_get(r, 3), 1.0, 0.001), "1 → 1");
+    assert_true(approx_eq(arr_get(r, 4), 2.0, 0.001), "2 → 2");
+}
+
+# ---- Sigmoid vec ----
+
+fn test_sigmoid_vec_known_points() {
+    h s = arr_sigmoid_vec([0.0, 0 - 10.0, 10.0]);
+    assert_true(approx_eq(arr_get(s, 0), 0.5, 0.001), "sigmoid(0)=0.5");
+    assert_true(arr_get(s, 1) < 0.001, "sigmoid(-10)≈0");
+    assert_true(arr_get(s, 2) > 0.999, "sigmoid(10)≈1");
+}
+
+# ---- 1D convolution ----
+
+fn test_conv1d_identity_kernel() {
+    # Kernel [1] = identity passthrough
+    h c = arr_conv1d([1.0, 2.0, 3.0, 4.0], [1.0]);
+    assert_eq(arr_len(c), 4, "valid conv keeps length when kernel=1");
+    assert_true(approx_eq(arr_get(c, 0), 1.0, 0.001), "identity[0]");
+    assert_true(approx_eq(arr_get(c, 3), 4.0, 0.001), "identity[3]");
+}
+
+fn test_conv1d_box_filter() {
+    # Box [1,1,1] sums 3-element windows. valid output = len - kernel + 1.
+    h c = arr_conv1d([1.0, 2.0, 3.0, 4.0, 5.0], [1.0, 1.0, 1.0]);
+    assert_eq(arr_len(c), 3, "valid conv: 5 - 3 + 1 = 3");
+    assert_true(approx_eq(arr_get(c, 0), 6.0, 0.001), "1+2+3=6");
+    assert_true(approx_eq(arr_get(c, 1), 9.0, 0.001), "2+3+4=9");
+    assert_true(approx_eq(arr_get(c, 2), 12.0, 0.001), "3+4+5=12");
+}
+
+# ---- Outer product ----
+
+fn test_outer_product_shape() {
+    h o = arr_outer([1.0, 2.0, 3.0], [10.0, 20.0]);
+    # 3x2 matrix
+    assert_eq(arr_len(o), 3, "3 rows");
+    h r0 = arr_get(o, 0);
+    assert_eq(arr_len(r0), 2, "2 cols");
+}
+
+fn test_outer_product_values() {
+    h o = arr_outer([1.0, 2.0, 3.0], [10.0, 20.0]);
+    h r2 = arr_get(o, 2);
+    assert_true(approx_eq(arr_get(r2, 0), 30.0, 0.001), "3*10=30");
+    assert_true(approx_eq(arr_get(r2, 1), 60.0, 0.001), "3*20=60");
+}
+
+# ---- Substrate-aware attention (Q, K, V are all 2D) ----
+
+fn test_substrate_attention_shape() {
+    h Q = [[1, 2], [3, 5]];
+    h K = [[1, 2], [3, 5], [8, 13]];
+    h V = [[10, 20], [30, 40], [50, 60]];
+    h out = arr_substrate_attention(Q, K, V);
+    assert_eq(arr_len(out), 2, "output has 2 query rows");
+    h r0 = arr_get(out, 0);
+    assert_eq(arr_len(r0), 2, "output col matches V cols");
+}
+
+fn test_substrate_attention_self_match() {
+    # When Q row exactly matches a K row, that key dominates the
+    # softmax — the output should be heavily weighted toward the
+    # matching V row.
+    h Q = [[1, 2]];
+    h K = [[1, 2], [100, 200]];
+    h V = [[5.0, 5.0], [99.0, 99.0]];
+    h out = arr_substrate_attention(Q, K, V);
+    h r0 = arr_get(out, 0);
+    # Strong bias toward the matching V row (close to 5, not 99).
+    assert_true(arr_get(r0, 0) < 50.0, "attended to matching key");
+}
+
+# ---- Substrate score rows ----
+
+fn test_substrate_score_rows_fibonacci_high() {
+    # Row of Fibonacci attractors should have mean resonance ~1.0
+    h m = [[1, 2, 3, 5, 8], [1, 1, 1, 1, 1]];
+    h s = arr_substrate_score_rows(m);
+    assert_true(arr_get(s, 0) > 0.9, "Fibonacci row has high resonance");
+    assert_true(arr_get(s, 1) > 0.9, "all-1s row has high resonance (1 is attractor)");
+}
+
+fn test_substrate_score_rows_random_lower() {
+    h m = [[7, 11, 19, 25, 100], [1, 2, 3, 5, 8]];
+    h s = arr_substrate_score_rows(m);
+    # Off-attractor row should score lower than the Fibonacci row.
+    assert_true(arr_get(s, 0) < arr_get(s, 1),
+        "off-attractor row has lower resonance than Fibonacci row");
+}
+
+# ---- Composition test: a full MLP forward pass ----
+
+fn test_mlp_forward_pass() {
+    # Input (1x3) → hidden (1x4) via W1 → relu → output (1x2) via W2 → softmax
+    h X = [[1.0, 2.0, 3.0]];
+    h W1 = [[0.1, 0.2, 0.3, 0.4],
+            [0.5, 0.6, 0.7, 0.8],
+            [0.9, 1.0, 1.1, 1.2]];
+    h W2 = [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]];
+
+    h h_pre = arr_matmul(X, W1);   # (1x4)
+    h h_row = arr_get(h_pre, 0);
+    h h_act = arr_relu_vec(h_row);
+    h h_2d = [h_act];
+
+    h logits = arr_matmul(h_2d, W2);  # (1x2)
+    h logits_row = arr_get(logits, 0);
+    h probs = arr_softmax(logits_row);
+
+    # Output is a valid probability distribution.
+    h total = arr_get(probs, 0) + arr_get(probs, 1);
+    assert_true(approx_eq(total, 1.0, 0.001), "MLP outputs sum to 1");
+    assert_true(arr_get(probs, 0) > 0.0, "p0 positive");
+    assert_true(arr_get(probs, 1) > 0.0, "p1 positive");
+}
diff --git a/omnimcode-core/src/compiler.rs b/omnimcode-core/src/compiler.rs
@@ -266,6 +266,13 @@ impl Compiler {
                         // 2D array primitives (Track 2 — 2026-05-16)
                         | "arr_matmul" | "arr_transpose"
                         | "arr_eye" | "arr_zeros_2d"
+                        // Native ML primitives (Track 3 — 2026-05-16)
+                        | "arr_softmax" | "arr_layer_norm"
+                        | "arr_relu_vec" | "arr_sigmoid_vec"
+                        | "arr_conv1d" | "arr_outer"
+                        // Substrate-native acceleration (OMC-unique)
+                        | "arr_substrate_attention"
+                        | "arr_substrate_score_rows"
                         // Lazy generator collector: returns array
                         | "gen_take"
                         // Forward-mode autograd duals (Track 2 — 2026-05-16)
diff --git a/omnimcode-core/src/interpreter.rs b/omnimcode-core/src/interpreter.rs