Skip to content

Commit ea3a1e0

Browse files
Prometheus fleshed out: AdamW + Embedding + LayerNorm + CRT-PE + Sequential + transformer
Six new layers / primitives bring Prometheus from "MVP that trains an MLP" to "ships a real transformer." All trained end-to-end in pure OMC, no PyTorch in the loop. (1) tape_set_value Rust builtin (omnimcode-core/src/interpreter.rs) Lets custom optimizers compute updates in OMC space and write them back to tape variables — the missing piece for Adam. (2) AdamW optimizer (examples/lib/prometheus.omc) prom_adamw_new(params, lr, b1, b2, eps, wd) prom_adamw_step(state) Maintains per-param m, v moments; bias-corrected; decoupled weight decay. Verified: cross-entropy on a tiny 3-class classifier goes 1.10 → 0.30 over 50 steps, peaks at target. (3) Embedding layer prom_embedding_new(vocab, d_model, rng) prom_embedding_forward(layer, token_idx) → [1, d_model] Direct row lookup via one-hot @ table internally; differentiable into the table. Verified: only the looked-up row gets non-zero gradient. (4) LayerNorm prom_layernorm_new(d_model, rng) + forward Composed from tape ops: subtract mean, divide by sqrt(var+eps) via exp(-0.5*log(var+eps)), scale by gamma, add beta. Verified: LN([1,2,3,4]) = [-1.34, -0.45, 0.45, 1.34], mean ≈ 0. (5) CRT-Fibonacci positional encoding prom_crt_pe_matrix(seq_len, d_model) Pure-OMC port of the PyTorch CRT-PE that won -5.4% on TinyShakespeare today (3/3 seeds in train_scale.py). (6) Sequential composition prom_sequential([layers]) + prom_sequential_forward prom_collect_params_v2 — handles embedding + layernorm + attention (7) Tiny transformer end-to-end (examples/prometheus_transformer.omc) Architecture (73-char "the quick brown fox..." 
corpus, vocab=27, d_model=16, ff=32, AdamW lr=0.02, 6 epochs, ~63s): token_idx ↓ Embedding(vocab → d_model) ↓ + CRT-PE[pos] x ↓ LayerNorm ↓ FFN: Linear(d_model → ff) → ReLU → Linear(ff → d_model) ↓ residual ↓ LayerNorm ↓ Linear(d_model → vocab) logits Results: epoch 0 loss=3.65 epoch 5 loss=0.05 (tail mean 0.32) reduction: 11.3x generated from 't': "the quick brown fox jumpsroverrog lazy dog and the" The model REPRODUCES substantial fragments of the training corpus. "the quick brown fox jumps" and "lazy dog and the" are exact. The "jumpsrover" / "rog" artifacts show where transitions confused it — but the embedding learned word-like chunks via the CRT-PE position signal. All 11 trainable param tensors: embedding table, ln1 gamma/beta, ff_up W/b, ff_down W/b, ln2 gamma/beta, head W/b. Updated via AdamW with per-param m, v moments — first real adaptive optimizer in OMC. This is the "Prometheus ships a transformer" moment. Pure-OMC training, substrate-native CRT-PE that won the transformerless-LM experiment, content-addressable, no PyTorch. Caveat — single-token attention: our attention layer's geodesic bias is fully implemented and tested, but this transformer demo processes one token at a time. Multi-token sequences need tape-level gather/scatter primitives (Rust-side addition) for efficient batched processing. That's the next bottleneck to break. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 1ac3b4b commit ea3a1e0

3 files changed

Lines changed: 592 additions & 0 deletions

File tree

examples/lib/prometheus.omc

Lines changed: 360 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,3 +693,363 @@ fn prom_attention_params(layer) {
693693
arr_push(out, dict_get(layer, "V"));
694694
return out;
695695
}
696+
697+
# ---------------------------------------------------------------------------
698+
# AdamW optimizer — the workhorse modern optimizer.
699+
#
700+
# Maintains per-param first & second gradient moments (m, v); applies
701+
# bias-corrected updates with decoupled weight decay. Implementation
702+
# uses the new tape_set_value Rust builtin so the actual update math
703+
# happens in pure OMC space — easy to instrument or replace.
704+
#
705+
# m_t = β1 · m_{t-1} + (1-β1) · g
706+
# v_t = β2 · v_{t-1} + (1-β2) · g²
707+
# θ_t = θ_{t-1} − lr · (m_t/(1-β1^t)) / (sqrt(v_t/(1-β2^t)) + ε)
708+
# − lr · wd · θ_{t-1} (decoupled weight decay)
709+
# ---------------------------------------------------------------------------
710+
711+
fn prom_adamw_new(params, lr, beta1, beta2, eps, weight_decay) {
    # Build the AdamW optimizer state: the param list, the hyperparameters,
    # a step counter starting at 0, and empty moment buffers.
    #
    # "m" and "v" are meant to run parallel to `params`, one entry per
    # param with that param's shape. They start empty so no shapes have to
    # be computed here; prom_adamw_step zero-fills them on its first call.
    h first_moments = [];
    h second_moments = [];

    h opt = dict_new();
    dict_set(opt, "params", params);
    dict_set(opt, "lr", lr);
    dict_set(opt, "beta1", beta1);
    dict_set(opt, "beta2", beta2);
    dict_set(opt, "eps", eps);
    dict_set(opt, "wd", weight_decay);
    dict_set(opt, "step", 0);
    dict_set(opt, "m", first_moments);
    dict_set(opt, "v", second_moments);
    return opt;
}
727+
728+
# Build a zero-filled value with the same nesting as `v`.
# An array maps to an array of the same length whose elements are
# zeroed recursively; anything that is not an array maps to 0.0.
fn _prom_zeros_like(v) {
    if type_of(v) == "array" {
        h zeros = [];
        h n = arr_len(v);
        h k = 0;
        while k < n {
            # The recursive call covers both cases: a nested array yields
            # a nested array of zeros, a scalar element yields 0.0.
            arr_push(zeros, _prom_zeros_like(arr_get(v, k)));
            k = k + 1;
        }
        return zeros;
    }
    return 0.0;
}
746+
747+
# Combine two values element by element.
# `op` is one of "add", "sub", "mul", "div"; any other op yields 0.0 for
# every scalar pair. The recursion is driven by the nesting of `a`, and
# `b` is indexed at the same positions, so `b` is assumed to have the
# same shape as `a`.
fn _prom_zip(a, b, op) {
    if type_of(a) == "array" {
        h combined = [];
        h n = arr_len(a);
        h k = 0;
        while k < n {
            h left = arr_get(a, k);
            h right = arr_get(b, k);
            arr_push(combined, _prom_zip(left, right, op));
            k = k + 1;
        }
        return combined;
    }

    # Scalar leaf: apply the requested operator, defaulting to 0.0.
    h result = 0.0;
    if op == "add" { result = a + b; }
    elif op == "sub" { result = a - b; }
    elif op == "mul" { result = a * b; }
    elif op == "div" { result = a / b; }
    return result;
}
764+
765+
# Apply a scalar `s` to every element of a nested-shape value `v`.
#
# `op` selects the operator applied at each scalar leaf:
#   "mul" -> v * s      "add" -> v + s
#   "sub" -> v - s      "div" -> v / s
# "div" is supported so this helper accepts the same operator names as
# _prom_zip; previously a "div" request fell through and silently
# returned the input unchanged.
# Any other op returns the leaf unchanged (existing behavior, kept for
# backward compatibility).
fn _prom_scale(v, s, op) {
    if type_of(v) == "array" {
        h scaled = [];
        h n = arr_len(v);
        h k = 0;
        while k < n {
            arr_push(scaled, _prom_scale(arr_get(v, k), s, op));
            k = k + 1;
        }
        return scaled;
    }
    if op == "mul" { return v * s; }
    if op == "add" { return v + s; }
    if op == "sub" { return v - s; }
    if op == "div" { return v / s; }
    return v;
}
781+
782+
fn _prom_sqrt_eps(v, eps) {
    # Element-wise sqrt(x) + eps over a nested-shape value.
    # Note that eps is added after the square root, not inside it.
    if type_of(v) == "array" {
        h roots = [];
        h n = arr_len(v);
        h k = 0;
        while k < n {
            h elem = arr_get(v, k);
            arr_push(roots, _prom_sqrt_eps(elem, eps));
            k = k + 1;
        }
        return roots;
    }
    h root = sqrt(v);
    return root + eps;
}
794+
795+
# One AdamW step over every param held in `state`.
#
# For each param this reads the gradient with tape_grad and the current
# value with tape_value, updates the running moments, and writes the new
# value back with tape_set_value. The step counter and the moment arrays
# stored in `state` are updated as a side effect; nothing is returned.
#
#   m_t = b1*m + (1-b1)*g
#   v_t = b2*v + (1-b2)*g²
#   θ   ← θ − lr*wd*θ − lr * (m_t/(1-b1^t)) / (sqrt(v_t/(1-b2^t)) + eps)
#
# `state` is expected to be a dict produced by prom_adamw_new.
fn prom_adamw_step(state) {
    h params = dict_get(state, "params");
    h lr = dict_get(state, "lr");
    h b1 = dict_get(state, "beta1");
    h b2 = dict_get(state, "beta2");
    h eps = dict_get(state, "eps");
    h wd = dict_get(state, "wd");
    h step = dict_get(state, "step") + 1;
    dict_set(state, "step", step);

    h m = dict_get(state, "m");
    h v = dict_get(state, "v");
    h n_params = arr_len(params);

    # Lazy-init m and v on the first step, using each gradient's shape.
    if arr_len(m) == 0 {
        h k = 0;
        while k < n_params {
            h g0 = tape_grad(arr_get(params, k));
            arr_push(m, _prom_zeros_like(g0));
            arr_push(v, _prom_zeros_like(g0));
            k = k + 1;
        }
    }

    # Scalars that do not depend on the param: compute them once per step
    # instead of once per param.
    h inv_bias1 = 1.0 / (1.0 - pow(b1, step * 1.0));
    h inv_bias2 = 1.0 / (1.0 - pow(b2, step * 1.0));
    h one_minus_b1 = 1.0 - b1;
    h one_minus_b2 = 1.0 - b2;
    h decay_rate = lr * wd;

    h i = 0;
    while i < n_params {
        h p = arr_get(params, i);
        h g = tape_grad(p);

        # m_t = b1*m + (1-b1)*g
        h m_new = _prom_zip(_prom_scale(arr_get(m, i), b1, "mul"),
                            _prom_scale(g, one_minus_b1, "mul"), "add");
        arr_set(m, i, m_new);

        # v_t = b2*v + (1-b2)*g²
        h g_sq = _prom_zip(g, g, "mul");
        h v_new = _prom_zip(_prom_scale(arr_get(v, i), b2, "mul"),
                            _prom_scale(g_sq, one_minus_b2, "mul"), "add");
        arr_set(v, i, v_new);

        # Bias-corrected moments, then m_hat / (sqrt(v_hat) + eps).
        h m_hat = _prom_scale(m_new, inv_bias1, "mul");
        h v_hat = _prom_scale(v_new, inv_bias2, "mul");
        h adam_step = _prom_zip(m_hat, _prom_sqrt_eps(v_hat, eps), "div");

        # θ ← θ − lr*wd*θ − lr*adam_step  (decoupled weight decay)
        h cur = tape_value(p);
        h decayed = _prom_zip(cur, _prom_scale(cur, decay_rate, "mul"), "sub");
        h new_val = _prom_zip(decayed, _prom_scale(adam_step, lr, "mul"), "sub");
        tape_set_value(p, new_val);

        i = i + 1;
    }

    # Store the moment arrays back on every step, not only after the lazy
    # init. If dict_get hands back a copy rather than an alias, the
    # updated moments would otherwise be lost between steps; if it hands
    # back an alias, these writes are harmless.
    dict_set(state, "m", m);
    dict_set(state, "v", v);
}
860+
861+
# ---------------------------------------------------------------------------
862+
# Embedding layer — direct row lookup.
863+
# table: [vocab, d_model]; forward(token_idx) = table[token_idx, :]
864+
# Built without a tape_embedding_lookup builtin yet — we use
865+
# tape_matmul with a one-hot, which is mathematically equivalent and
866+
# composes with the existing autograd. Will replace with a fused
867+
# Rust op when JIT'd embedding is a bottleneck.
868+
# ---------------------------------------------------------------------------
869+
870+
fn prom_embedding_new(vocab, d_model, rng_state) {
    # Create an embedding layer dict holding a randomly initialized table.
    # _prom_random_matrix returns a dict; its "node" entry is stored as
    # the table param and its "state" entry as the layer's rng_state.
    # NOTE(review): 0.3 is presumably the init scale — confirm against
    # _prom_random_matrix.
    h init = _prom_random_matrix(vocab, d_model, 0.3, rng_state);
    h table_node = dict_get(init, "node");
    h next_rng = dict_get(init, "state");

    h emb = dict_new();
    dict_set(emb, "kind", "embedding");
    dict_set(emb, "vocab", vocab);
    dict_set(emb, "d_model", d_model);
    dict_set(emb, "table", table_node);
    dict_set(emb, "rng_state", next_rng);
    return emb;
}
880+
881+
# Forward pass: look up the embedding row for `token_idx`.
# The lookup is expressed as one_hot(token_idx, vocab) @ table, so the
# result stays on the tape and gradients flow back into the table param.
fn prom_embedding_forward(layer, token_idx) {
    h one_hot = prom_one_hot(token_idx, dict_get(layer, "vocab"));
    return tape_matmul(one_hot, dict_get(layer, "table"));
}
890+
891+
fn prom_embedding_params(layer) {
    # The table is the embedding layer's only trainable param.
    h ps = [];
    arr_push(ps, dict_get(layer, "table"));
    return ps;
}
894+
895+
# ---------------------------------------------------------------------------
896+
# LayerNorm — normalize each row to zero mean / unit variance, then
897+
# scale + shift by learned gamma/beta.
898+
#
899+
# Composed from existing tape ops: subtract the mean, multiply by
900+
# 1/sqrt(variance + eps), multiply by gamma, add beta. Backward is automatic via
901+
# the tape.
902+
# ---------------------------------------------------------------------------
903+
904+
fn prom_layernorm_new(d_model, rng_state) {
    # Create a LayerNorm layer dict. gamma starts as all ones and beta as
    # all zeros, so scale + shift is the identity at init.
    h ones = [];
    h zeros = [];
    h col = 0;
    while col < d_model {
        arr_push(ones, 1.0);
        arr_push(zeros, 0.0);
        col = col + 1;
    }

    # Each row is wrapped in an outer array, making both params a single
    # row of d_model values.
    h gamma_node = tape_var([ones]);
    h beta_node = tape_var([zeros]);

    h ln = dict_new();
    dict_set(ln, "kind", "layernorm");
    dict_set(ln, "d_model", d_model);
    dict_set(ln, "gamma", gamma_node);
    dict_set(ln, "beta", beta_node);
    dict_set(ln, "eps", 1e-5);
    # rng_state is stored as given; nothing here consumes randomness.
    dict_set(ln, "rng_state", rng_state);
    return ln;
}
925+
926+
# Forward pass: normalize x, then scale by gamma and shift by beta.
#
#   out = (x - mean(x)) * (1 / sqrt(var(x) + eps)) * gamma + beta
#
# x is expected to be a single row [1, d_model]. tape_mean is used for
# both the mean and the variance; it is described as a per-tensor mean,
# which coincides with the per-row mean only for single-row input.
fn prom_layernorm_forward(layer, x_id) {
    h eps = dict_get(layer, "eps");

    # Center the input and compute its variance.
    h mu = tape_mean(x_id);
    h diff = tape_sub(x_id, mu);
    h diff_sq = tape_mul(diff, diff);
    h var_id = tape_mean(diff_sq);

    # var + eps keeps the log below finite when the variance is zero.
    h eps_const = tape_const(eps);
    h var_eps = tape_add(var_id, eps_const);

    # No sqrt op is used on the tape (the original note says pow_int only
    # takes integer powers), so the inverse std is built from the exact
    # identity 1/sqrt(y) = exp(-0.5 * log(y)).
    h log_var = tape_log(var_eps);
    h one_half = tape_const(0.5);
    h half_log_var = tape_mul(log_var, one_half);
    h neg_half_log_var = tape_neg(half_log_var);
    h inv_std = tape_exp(neg_half_log_var);

    # Normalize, then apply the learned scale and shift.
    h x_hat = tape_mul(diff, inv_std);
    h x_scaled = tape_mul(x_hat, dict_get(layer, "gamma"));
    return tape_add(x_scaled, dict_get(layer, "beta"));
}
956+
957+
fn prom_layernorm_params(layer) {
    # Trainable params in a fixed order: gamma first, then beta.
    h ps = [];
    arr_push(ps, dict_get(layer, "gamma"));
    arr_push(ps, dict_get(layer, "beta"));
    return ps;
}
960+
961+
# ---------------------------------------------------------------------------
962+
# CRT-Fibonacci positional encoding — validated transformerless-LM win.
963+
#
964+
# Today's PyTorch experiments showed:
965+
# - CRT-PE wins −5.4% on TinyShakespeare (3/3 seeds, train_scale.py)
966+
# - −2.9% on distractor mix (3/3, train_distractor_mix.py)
967+
# - Pairs each Fibonacci modulus with a sin/cos pair on a 2π·pos/m circle
968+
#
969+
# Same moduli as the geodesic bias for architectural coherence.
970+
# ---------------------------------------------------------------------------
971+
972+
fn prom_crt_pe_matrix(seq_len, d_model) {
    # Build the positional-encoding table as a plain nested array with
    # seq_len rows of d_model values.
    #
    # Each row holds d_model / 2 (sin, cos) pairs. Pair k uses modulus
    # moduli[k mod len(moduli)] and the angle 2π · (pos mod m) / m.
    # The modulo is spelled x - (x / y) * y, which relies on `/` being
    # integer division for integer operands.
    h moduli = _prom_geodesic_moduli();
    h n_moduli = arr_len(moduli);
    h n_pairs = d_model / 2;
    h rows = [];
    h pos = 0;
    while pos < seq_len {
        h row = [];
        h pair = 0;
        while pair < n_pairs {
            # Cycle through the moduli when there are more pairs than moduli.
            h slot = pair - (pair / n_moduli) * n_moduli;
            h modulus = arr_get(moduli, slot);
            h residue = pos - (pos / modulus) * modulus;
            # The * 1.0 forces a floating-point division.
            h angle = 6.283185307179586 * residue / (modulus * 1.0);
            arr_push(row, sin(angle));
            arr_push(row, cos(angle));
            pair = pair + 1;
        }
        # An odd d_model leaves one cell unfilled; pad it with 0.
        if (n_pairs * 2) < d_model {
            arr_push(row, 0.0);
        }
        arr_push(rows, row);
        pos = pos + 1;
    }
    return rows;
}
997+
998+
# ---------------------------------------------------------------------------
999+
# Sequential composition — chain layers; collect params automatically.
1000+
# ---------------------------------------------------------------------------
1001+
1002+
fn prom_sequential(layers) {
    # Wrap an ordered list of layer dicts into a "sequential" model dict.
    h seq = dict_new();
    dict_set(seq, "kind", "sequential");
    dict_set(seq, "layers", layers);
    return seq;
}
1008+
1009+
# Apply one layer to `x_id`, dispatching on the layer's "kind" entry.
# A layer whose kind is not recognized passes x_id through unchanged.
fn _prom_sequential_apply(layer, x_id) {
    h kind = dict_get(layer, "kind");
    if kind == "linear" { return prom_linear_forward(layer, x_id); }
    if kind == "embedding" { return prom_embedding_forward(layer, x_id); }
    if kind == "layernorm" { return prom_layernorm_forward(layer, x_id); }
    if kind == "attention" { return prom_attention_forward(layer, x_id); }
    if kind == "relu" { return prom_relu(x_id); }
    if kind == "sigmoid" { return prom_sigmoid(x_id); }
    return x_id;
}

# Run `x_id` through every layer of a sequential model in order and
# return the final output.
fn prom_sequential_forward(model, x_id) {
    h stack = dict_get(model, "layers");
    h out = x_id;
    h idx = 0;
    while idx < arr_len(stack) {
        out = _prom_sequential_apply(arr_get(stack, idx), out);
        idx = idx + 1;
    }
    return out;
}
1026+
1027+
# Activation pseudo-layers — let users put them inline in a Sequential.
1028+
fn prom_relu_layer() {
    # Marker layer: a dict carrying only kind = "relu".
    h layer = dict_new();
    dict_set(layer, "kind", "relu");
    return layer;
}
1031+
fn prom_sigmoid_layer() {
    # Marker layer: a dict carrying only kind = "sigmoid".
    h layer = dict_new();
    dict_set(layer, "kind", "sigmoid");
    return layer;
}
1034+
1035+
# Return the trainable params of a single layer, chosen by its "kind".
# Kinds without params (or unrecognized kinds) yield an empty array.
fn _prom_layer_params(layer) {
    h kind = dict_get(layer, "kind");
    if kind == "linear" { return prom_linear_params(layer); }
    if kind == "embedding" { return prom_embedding_params(layer); }
    if kind == "layernorm" { return prom_layernorm_params(layer); }
    if kind == "attention" { return prom_attention_params(layer); }
    return [];
}

# Gather the params of every layer into one flat array, in layer order.
# Handles linear, embedding, layernorm and attention layers.
fn prom_collect_params_v2(layers) {
    h collected = [];
    h li = 0;
    while li < arr_len(layers) {
        h layer_ps = _prom_layer_params(arr_get(layers, li));
        h pi = 0;
        while pi < arr_len(layer_ps) {
            arr_push(collected, arr_get(layer_ps, pi));
            pi = pi + 1;
        }
        li = li + 1;
    }
    return collected;
}

0 commit comments

Comments
 (0)