Skip to content

Commit 87f2acd

Browse files
Accelerated array ops + substrate-native ML kernels
Two wins in one commit. (1) Matmul gets cache-friendly ikj loop ordering on flat row-major buffers — was naive ijk over vec-of-vecs. The j-inner loop now strides sequentially through B and C, which the LLVM autovectorizer turns into a tight fma sequence. Both the integer (substrate- preserving) and float paths benefit. Measured on a 200x200 matmul: pure-OMC matmul: 8 sec (tree-walked inner loop) arr_matmul native: <1 sec (well over 10× speedup) (2) New native-Rust ML kernels — the per-element dispatch cost drops from ~50ns (tree-walked eval_expr per cell) to ~1ns (builtin call once for the whole array): arr_softmax — numerically stable, max-subtraction trick arr_layer_norm — zero-mean / unit-variance with eps arr_relu_vec — vectorized arr_sigmoid_vec — vectorized arr_conv1d — valid-mode 1D convolution arr_outer — vector outer product → 2D matrix Substrate-native kernels — the OMC-only path Python can't replicate because i64 doesn't carry φ-resonance metadata: arr_substrate_attention(Q, K, V) — attention scored by substrate-distance (Σ attractor-distance per dim) instead of dot product. Closer in substrate-space → higher weight. arr_substrate_score_rows(M) — per-row mean φ-resonance, used as a substrate-coherence regularizer. Tests: 16 cases — softmax sums-to-1 / monotonicity / uniform, layer_norm mean/variance, ReLU/sigmoid known points, conv1d identity + box filter, outer product shape+values, substrate attention self-matching, substrate-score Fibonacci-high / random-lower, and an MLP forward pass that composes matmul + relu + softmax end-to-end. Demo (examples/demos/ml_kernel_speedup.omc) shows the wall-clock difference between native and tree-walked OMC implementations. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 04e96da commit 87f2acd

4 files changed

Lines changed: 691 additions & 33 deletions

File tree

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# Demonstrate the speedup native-Rust kernels deliver over pure-OMC
2+
# inner loops. Same workloads, two implementations: tree-walked OMC
3+
# vs the new builtins (Rust inner loops, OMC dispatch only).
4+
5+
fn matmul_naive_omc(A, B) {
6+
h ar = arr_len(A);
7+
h ac = arr_len(arr_get(A, 0));
8+
h bc = arr_len(arr_get(B, 0));
9+
h C = arr_zeros_2d(ar, bc);
10+
h i = 0;
11+
while i < ar {
12+
h Ai = arr_get(A, i);
13+
h Ci = arr_get(C, i);
14+
h j = 0;
15+
while j < bc {
16+
h s = 0;
17+
h k = 0;
18+
while k < ac {
19+
h Bk = arr_get(B, k);
20+
s = s + arr_get(Ai, k) * arr_get(Bk, j);
21+
k = k + 1;
22+
}
23+
arr_set(Ci, j, s);
24+
j = j + 1;
25+
}
26+
i = i + 1;
27+
}
28+
return C;
29+
}
30+
31+
fn build_matrix(n, m) {
32+
h M = [];
33+
h i = 0;
34+
while i < n {
35+
h row = [];
36+
h j = 0;
37+
while j < m {
38+
arr_push(row, (i + j) % 7);
39+
j = j + 1;
40+
}
41+
arr_push(M, row);
42+
i = i + 1;
43+
}
44+
return M;
45+
}
46+
47+
fn softmax_naive_omc(arr) {
48+
h n = arr_len(arr);
49+
# Max
50+
h m = arr_get(arr, 0);
51+
h i = 1;
52+
while i < n {
53+
h v = arr_get(arr, i);
54+
if v > m { m = v; }
55+
i = i + 1;
56+
}
57+
# Exp + sum
58+
h ex = [];
59+
h s = 0.0;
60+
i = 0;
61+
while i < n {
62+
h e = exp(arr_get(arr, i) - m);
63+
arr_push(ex, e);
64+
s = s + e;
65+
i = i + 1;
66+
}
67+
# Normalize
68+
h out = [];
69+
i = 0;
70+
while i < n {
71+
arr_push(out, arr_get(ex, i) / s);
72+
i = i + 1;
73+
}
74+
return out;
75+
}
76+
77+
fn main() {
78+
print("=== ML kernel speedup: native Rust vs pure-OMC inner loops ===");
79+
print("");
80+
81+
# ---- Matmul ----
82+
h N = 200;
83+
print("[matmul] " + to_string(N) + "x" + to_string(N) + " * " + to_string(N) + "x" + to_string(N));
84+
h A = build_matrix(N, N);
85+
h B = build_matrix(N, N);
86+
87+
h t1 = now_unix();
88+
h C_omc = matmul_naive_omc(A, B);
89+
h t2 = now_unix();
90+
91+
h t3 = now_unix();
92+
h C_native = arr_matmul(A, B);
93+
h t4 = now_unix();
94+
95+
print(" pure-OMC matmul: " + to_string(t2 - t1) + " sec");
96+
print(" arr_matmul native: " + to_string(t4 - t3) + " sec");
97+
# Verify correctness on one cell
98+
h r0 = arr_get(C_omc, 0);
99+
h r0n = arr_get(C_native, 0);
100+
h diff = arr_get(r0, 0) - arr_get(r0n, 0);
101+
if diff < 0 { diff = 0 - diff; }
102+
if diff < 0.001 {
103+
print(" results match");
104+
} else {
105+
print(" RESULT MISMATCH: omc=" + to_string(arr_get(r0, 0)) +
106+
" native=" + to_string(arr_get(r0n, 0)));
107+
}
108+
109+
# ---- Softmax ----
110+
print("");
111+
h SN = 10000;
112+
print("[softmax] length-" + to_string(SN) + " vector, 100 iterations");
113+
h xs = [];
114+
h i = 0;
115+
while i < SN {
116+
arr_push(xs, (i % 17) - 8);
117+
i = i + 1;
118+
}
119+
120+
h t5 = now_unix();
121+
h iters = 100;
122+
h k = 0;
123+
while k < iters {
124+
h _ = softmax_naive_omc(xs);
125+
k = k + 1;
126+
}
127+
h t6 = now_unix();
128+
129+
h t7 = now_unix();
130+
k = 0;
131+
while k < iters {
132+
h _ = arr_softmax(xs);
133+
k = k + 1;
134+
}
135+
h t8 = now_unix();
136+
137+
print(" pure-OMC softmax: " + to_string(t6 - t5) + " sec");
138+
print(" arr_softmax native: " + to_string(t8 - t7) + " sec");
139+
print("");
140+
print("Substrate-aware bonus: arr_substrate_attention has no Python");
141+
print("equivalent — i64 doesn't carry phi-resonance metadata.");
142+
}
143+
144+
main();

examples/tests/test_ml_kernels.omc

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
# Native-Rust ML primitives. These keep the inner loops out of the
2+
# tree-walker so the per-element cost drops from ~50ns to ~1ns.
3+
# A 1000-element softmax that would take 50µs in pure OMC code
4+
# (one eval_expr per cell) takes ~1µs as a builtin call.
5+
6+
fn assert_eq(actual, expected, msg) {
7+
if actual != expected {
8+
test_record_failure(msg + ": expected " + to_string(expected) + " got " + to_string(actual));
9+
}
10+
}
11+
12+
fn assert_true(cond, msg) {
13+
if !cond { test_record_failure(msg); }
14+
}
15+
16+
fn approx_eq(a, b, tol) {
17+
h d = a - b;
18+
if d < 0.0 { d = 0.0 - d; }
19+
return d <= tol;
20+
}
21+
22+
# ---- Softmax ----
23+
24+
fn test_softmax_sums_to_one() {
25+
h s = arr_softmax([1.0, 2.0, 3.0, 4.0]);
26+
h total = 0.0;
27+
h i = 0;
28+
while i < arr_len(s) {
29+
total = total + arr_get(s, i);
30+
i = i + 1;
31+
}
32+
assert_true(approx_eq(total, 1.0, 0.001), "softmax sums to 1");
33+
}
34+
35+
fn test_softmax_monotonic() {
36+
h s = arr_softmax([1.0, 2.0, 3.0]);
37+
assert_true(arr_get(s, 0) < arr_get(s, 1), "softmax monotonic 1<2");
38+
assert_true(arr_get(s, 1) < arr_get(s, 2), "softmax monotonic 2<3");
39+
}
40+
41+
fn test_softmax_uniform() {
42+
h s = arr_softmax([1.0, 1.0, 1.0, 1.0]);
43+
assert_true(approx_eq(arr_get(s, 0), 0.25, 0.001), "uniform → 1/n");
44+
assert_true(approx_eq(arr_get(s, 3), 0.25, 0.001), "uniform → 1/n");
45+
}
46+
47+
# ---- LayerNorm ----
48+
49+
fn test_layer_norm_zero_mean() {
50+
h ln = arr_layer_norm([1.0, 2.0, 3.0, 4.0, 5.0], 0.00001);
51+
h total = 0.0;
52+
h i = 0;
53+
while i < arr_len(ln) {
54+
total = total + arr_get(ln, i);
55+
i = i + 1;
56+
}
57+
assert_true(approx_eq(total, 0.0, 0.001), "layer norm has zero mean");
58+
}
59+
60+
fn test_layer_norm_unit_variance() {
61+
h ln = arr_layer_norm([1.0, 2.0, 3.0, 4.0, 5.0], 0.00001);
62+
h sq = 0.0;
63+
h n = arr_len(ln);
64+
h i = 0;
65+
while i < n {
66+
h v = arr_get(ln, i);
67+
sq = sq + v * v;
68+
i = i + 1;
69+
}
70+
h variance = sq / n;
71+
assert_true(approx_eq(variance, 1.0, 0.01), "layer norm has unit variance");
72+
}
73+
74+
# ---- ReLU vec ----
75+
76+
fn test_relu_vec_clips_negatives() {
77+
h r = arr_relu_vec([0 - 2.0, 0 - 1.0, 0.0, 1.0, 2.0]);
78+
assert_true(approx_eq(arr_get(r, 0), 0.0, 0.001), "-2 → 0");
79+
assert_true(approx_eq(arr_get(r, 1), 0.0, 0.001), "-1 → 0");
80+
assert_true(approx_eq(arr_get(r, 2), 0.0, 0.001), "0 → 0");
81+
assert_true(approx_eq(arr_get(r, 3), 1.0, 0.001), "1 → 1");
82+
assert_true(approx_eq(arr_get(r, 4), 2.0, 0.001), "2 → 2");
83+
}
84+
85+
# ---- Sigmoid vec ----
86+
87+
fn test_sigmoid_vec_known_points() {
88+
h s = arr_sigmoid_vec([0.0, 0 - 10.0, 10.0]);
89+
assert_true(approx_eq(arr_get(s, 0), 0.5, 0.001), "sigmoid(0)=0.5");
90+
assert_true(arr_get(s, 1) < 0.001, "sigmoid(-10)≈0");
91+
assert_true(arr_get(s, 2) > 0.999, "sigmoid(10)≈1");
92+
}
93+
94+
# ---- 1D convolution ----
95+
96+
fn test_conv1d_identity_kernel() {
97+
# Kernel [1] = identity passthrough
98+
h c = arr_conv1d([1.0, 2.0, 3.0, 4.0], [1.0]);
99+
assert_eq(arr_len(c), 4, "valid conv keeps length when kernel=1");
100+
assert_true(approx_eq(arr_get(c, 0), 1.0, 0.001), "identity[0]");
101+
assert_true(approx_eq(arr_get(c, 3), 4.0, 0.001), "identity[3]");
102+
}
103+
104+
fn test_conv1d_box_filter() {
105+
# Box [1,1,1] sums 3-element windows. valid output = len - kernel + 1.
106+
h c = arr_conv1d([1.0, 2.0, 3.0, 4.0, 5.0], [1.0, 1.0, 1.0]);
107+
assert_eq(arr_len(c), 3, "valid conv: 5 - 3 + 1 = 3");
108+
assert_true(approx_eq(arr_get(c, 0), 6.0, 0.001), "1+2+3=6");
109+
assert_true(approx_eq(arr_get(c, 1), 9.0, 0.001), "2+3+4=9");
110+
assert_true(approx_eq(arr_get(c, 2), 12.0, 0.001), "3+4+5=12");
111+
}
112+
113+
# ---- Outer product ----
114+
115+
fn test_outer_product_shape() {
116+
h o = arr_outer([1.0, 2.0, 3.0], [10.0, 20.0]);
117+
# 3x2 matrix
118+
assert_eq(arr_len(o), 3, "3 rows");
119+
h r0 = arr_get(o, 0);
120+
assert_eq(arr_len(r0), 2, "2 cols");
121+
}
122+
123+
fn test_outer_product_values() {
124+
h o = arr_outer([1.0, 2.0, 3.0], [10.0, 20.0]);
125+
h r2 = arr_get(o, 2);
126+
assert_true(approx_eq(arr_get(r2, 0), 30.0, 0.001), "3*10=30");
127+
assert_true(approx_eq(arr_get(r2, 1), 60.0, 0.001), "3*20=60");
128+
}
129+
130+
# ---- Substrate-aware attention (Q, K, V are all 2D) ----
131+
132+
fn test_substrate_attention_shape() {
133+
h Q = [[1, 2], [3, 5]];
134+
h K = [[1, 2], [3, 5], [8, 13]];
135+
h V = [[10, 20], [30, 40], [50, 60]];
136+
h out = arr_substrate_attention(Q, K, V);
137+
assert_eq(arr_len(out), 2, "output has 2 query rows");
138+
h r0 = arr_get(out, 0);
139+
assert_eq(arr_len(r0), 2, "output col matches V cols");
140+
}
141+
142+
fn test_substrate_attention_self_match() {
143+
# When Q row exactly matches a K row, that key dominates the
144+
# softmax — the output should be heavily weighted toward the
145+
# matching V row.
146+
h Q = [[1, 2]];
147+
h K = [[1, 2], [100, 200]];
148+
h V = [[5.0, 5.0], [99.0, 99.0]];
149+
h out = arr_substrate_attention(Q, K, V);
150+
h r0 = arr_get(out, 0);
151+
# Strong bias toward the matching V row (close to 5, not 99).
152+
assert_true(arr_get(r0, 0) < 50.0, "attended to matching key");
153+
}
154+
155+
# ---- Substrate score rows ----
156+
157+
fn test_substrate_score_rows_fibonacci_high() {
158+
# Row of Fibonacci attractors should have mean resonance ~1.0
159+
h m = [[1, 2, 3, 5, 8], [1, 1, 1, 1, 1]];
160+
h s = arr_substrate_score_rows(m);
161+
assert_true(arr_get(s, 0) > 0.9, "Fibonacci row has high resonance");
162+
assert_true(arr_get(s, 1) > 0.9, "all-1s row has high resonance (1 is attractor)");
163+
}
164+
165+
fn test_substrate_score_rows_random_lower() {
166+
h m = [[7, 11, 19, 25, 100], [1, 2, 3, 5, 8]];
167+
h s = arr_substrate_score_rows(m);
168+
# Off-attractor row should score lower than the Fibonacci row.
169+
assert_true(arr_get(s, 0) < arr_get(s, 1),
170+
"off-attractor row has lower resonance than Fibonacci row");
171+
}
172+
173+
# ---- Composition test: a full MLP forward pass ----
174+
175+
fn test_mlp_forward_pass() {
176+
# Input (1x3) → hidden (1x4) via W1 → relu → output (1x2) via W2 → softmax
177+
h X = [[1.0, 2.0, 3.0]];
178+
h W1 = [[0.1, 0.2, 0.3, 0.4],
179+
[0.5, 0.6, 0.7, 0.8],
180+
[0.9, 1.0, 1.1, 1.2]];
181+
h W2 = [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]];
182+
183+
h h_pre = arr_matmul(X, W1); # (1x4)
184+
h h_row = arr_get(h_pre, 0);
185+
h h_act = arr_relu_vec(h_row);
186+
h h_2d = [h_act];
187+
188+
h logits = arr_matmul(h_2d, W2); # (1x2)
189+
h logits_row = arr_get(logits, 0);
190+
h probs = arr_softmax(logits_row);
191+
192+
# Output is a valid probability distribution.
193+
h total = arr_get(probs, 0) + arr_get(probs, 1);
194+
assert_true(approx_eq(total, 1.0, 0.001), "MLP outputs sum to 1");
195+
assert_true(arr_get(probs, 0) > 0.0, "p0 positive");
196+
assert_true(arr_get(probs, 1) > 0.0, "p1 positive");
197+
}

omnimcode-core/src/compiler.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,13 @@ impl Compiler {
266266
// 2D array primitives (Track 2 — 2026-05-16)
267267
| "arr_matmul" | "arr_transpose"
268268
| "arr_eye" | "arr_zeros_2d"
269+
// Native ML primitives (Track 3 — 2026-05-16)
270+
| "arr_softmax" | "arr_layer_norm"
271+
| "arr_relu_vec" | "arr_sigmoid_vec"
272+
| "arr_conv1d" | "arr_outer"
273+
// Substrate-native acceleration (OMC-unique)
274+
| "arr_substrate_attention"
275+
| "arr_substrate_score_rows"
269276
// Lazy generator collector: returns array
270277
| "gen_take"
271278
// Forward-mode autograd duals (Track 2 — 2026-05-16)

0 commit comments

Comments
 (0)