From 9e893be77aee61c3ad3aa4ba7f602608fcd83d7f Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@gmail.com>
Date: Tue, 21 Oct 2025 13:32:35 +0000
Subject: [PATCH 1/7] Muon changes

---
 src/rules.jl | 109 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 64 insertions(+), 45 deletions(-)

diff --git a/src/rules.jl b/src/rules.jl
index 5e7efd5..5ea81e5 100644
--- a/src/rules.jl
+++ b/src/rules.jl
@@ -1,3 +1,4 @@
+using LinearAlgebra: mul!
 
 nonfirstdims(x) = prod(size(x)[2:end])
 
@@ -23,63 +24,81 @@ In nanoGPT speedrun experiments, Muon is used for the internal layer >2D weights
 `Optimisers.adjust!(optimiser_state, η::Real)` will adjust the fallback optimizer's `eta` to `η * (opt.eta / eta)`, and Muon's `eta` to `η`, preserving their ratio,
 but `Optimisers.adjust!(optimiser, eta = η)` will only adjust Muon's learning rate (allowing you to adjust the fallback optimizer's learning rate separately).
 """
-struct Muon <: AbstractRule
-    opt::AbstractRule
-    eta::Float64
-    mu::Float64
-    lambda::Float64
-    fallback::Function
+@kwdef struct Muon <: AbstractRule
+    eta = 0.02
+    mu = 0.95
+    lambda = 0.01
 end
 
-Muon(;opt = AdamW(eta = 0.0003, beta = (0.9,0.95), lambda = 0.01), eta = 0.02, mu = 0.95, lambda = 0.01, fallback = x -> false) = Muon(opt, eta, mu, lambda, fallback)
-
-function init(o::Muon, x::AbstractArray)
-  if nonfirstdims(x) == 1 || o.fallback(x)
-    return init(o.opt, x)
-  else
-    return zero(x)
-  end
+init(::Muon, x::AbstractArray) = zero(x)
+
+function apply!((; eta, mu, lambda)::Muon, state, x::AbstractArray{T}, dx) where T
+    η, μ, λ = T(eta), T(mu), T(lambda)
+    # momentum: m ← β m + (1-β) g
+    @.. state = μ * state + (1 - μ) * dx
+    # Nesterov update fed to NS5: U ← β m + (1-β) g
+    U = @. μ * state + (1 - μ) * dx
+    # orthogonalize
+    @.. U = U / ($norm(U) + T(1e-6))
+    Ot = newtonschulz5!!(U)
+    # post shape factor √max(1, r/c)
+    r, c... = size(x)
+    s = √max(1, T(r) / prod(c))
+    dx′ = @lazy η * (Ot * s + λ * x)   # decoupled WD, step will subtract dx′
+    return state, dx′
 end
 
-function apply!(o::Muon, state, x::AbstractArray{T}, dx) where T
-    if nonfirstdims(x) == 1 || o.fallback(x)
-      return apply!(o.opt, state, x, dx)
+const NS5_COEFFICIENTS = (; a = 3.4445f0, b = -4.7750f0, c = 2.0315f0)
+
+# Applies `X = a*X + b*X*X'*X + c*X*X'*X*X'*X` five times,
+# with two branches based on X*X' and X'*X respectively,
+# to minimize memory usage.
+function _newtonschulz5(X::AbstractMatrix{T}) where T
+    (; a, b, c) = map(T, NS5_COEFFICIENTS)
+    if size(X, 1) <= size(X, 2)
+        for _ in 1:5
+            A = X * X'
+            B = b * A + c * A * A
+            X = a * X + B * X
+        end
     else
-      η = T(o.eta); μ = T(o.mu); λ = T(o.lambda)
-      # momentum: m ← β m + (1-β) g
-      @.. state = μ * state + (one(T) - μ) * dx
-      # Nesterov update fed to NS5: U ← (1-β) g + β m
-      U = @. (one(T) - μ) * dx + μ * state
-      # orthogonalize + post shape factor √max(1, r/c)
-      Ot = _newton_schulz5(U)
-      r = size(x, 1); c = nonfirstdims(x)
-      s = T(sqrt(max(one(T), T(r) / T(c))))
-      dx′ = @lazy η * (Ot * s + λ * x)   # decoupled WD, step will subtract dx′
-      return state, dx′
+        for _ in 1:5
+            A = X' * X
+            B = b * A + c * A * A
+            X = a * X + X * B
+        end
     end
+    return X
 end
 
-function _inner_newton_schulz5(X::AbstractMatrix{T}) where T
-  a, b, c = (T(3.4445f0), T(-4.7750f0), T(2.0315f0))
-  for _ in 1:5
-    A = X * X'
-    B = b * A + c * A * A
-    X = a * X + B * X
-  end 
-  X
-end
-
-function _newton_schulz5(G::AbstractMatrix{T}) where T
-    X = G / (norm(G) + T(1e-7))
-    if size(G, 1) > size(G, 2)
-      return transpose(_inner_newton_schulz5(transpose(X)))
+# In-place version of _newtonschulz5 that uses
+# three buffers with a total size of 2n²+nm.
+function _newtonschulz5!(X::AbstractMatrix)
+    n = minimum(size(X))
+    A = similar(X, n, n)
+    B = similar(X, n, n)
+    x = similar(X) # mirror of X
+    (; a, b, c) = NS5_COEFFICIENTS
+    if size(X, 1) <= size(X, 2)
+        for _ in 1:5
+            mul!(A, X, X')
+            copyto!(B, A); mul!(B, A, A, c, b)
+            copyto!(x, X); mul!(X, B, x, true, a)
+        end
     else
-      return _inner_newton_schulz5(X)
+        for _ in 1:5
+            mul!(A, X', X)
+            copyto!(B, A); mul!(B, A, A, c, b)
+            copyto!(x, X); mul!(X, x, B, true, a)
+        end
     end
+    return X
 end
-_newton_schulz5(G::AbstractArray) = reshape(_newton_schulz5(reshape(G, size(G,1), :)), size(G))
 
-adjust(r::Muon, η::Real) = adjust(r, eta = η, opt = adjust(r.opt, eta = (r.opt.eta / r.eta) * η))
+newtonschulz5!!(X::AbstractMatrix) = maywrite(X) ? _newtonschulz5!(X) : _newtonschulz5(X)
+newtonschulz5!!(X::AbstractArray) = reshape(newtonschulz5!!(reshape(X, size(X, 1), :)), size(X))
+
+adjust(r::Muon, η::Real) = adjust(r, eta = η)
 
 """
     NormGrowthCap(τ = 1.01; ϵ = 1e-8, lb = 1e-7, throw = true, scale = true)

From 61c85f31ddb0d4b6f47c86aef9e96b33a1b4046a Mon Sep 17 00:00:00 2001
From: Anton Oresten <antonoresten@gmail.com>
Date: Tue, 4 Nov 2025 18:57:34 +0100
Subject: [PATCH 2/7] Add setup function; Experimental module

---
 Project.toml                        |  30 +++------
 src/CannotWaitForTheseOptimisers.jl |   4 +-
 src/Experimental/Experimental.jl    |  13 ++++
 src/Experimental/Muon.jl            |  58 ++++++++++++++++
 src/rules.jl                        | 100 ----------------------------
 src/setup.jl                        |  26 ++++++++
 test/Project.toml                   |  11 +++
 7 files changed, 119 insertions(+), 123 deletions(-)
 create mode 100644 src/Experimental/Experimental.jl
 create mode 100644 src/Experimental/Muon.jl
 create mode 100644 src/setup.jl
 create mode 100644 test/Project.toml

diff --git a/Project.toml b/Project.toml
index aec8d16..1bc566c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,34 +1,20 @@
 name = "CannotWaitForTheseOptimisers"
 uuid = "16124dda-d9fe-413b-a880-e3f4df3aa341"
-authors = ["murrellb <murrellb@gmail.com> and contributors"]
 version = "0.1.1"
+authors = ["murrellb <murrellb@gmail.com> and contributors"]
+
+[workspace]
+projects = ["test", "docs"]
 
 [deps]
+Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
+MatrixSign = "a25fa8c1-e8fe-40fb-8be2-d139e369b1d5"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 
 [compat]
-ChainRulesCore = "1"
-Functors = "0.5"
-LinearAlgebra = "1"
+Functors = "0.5.2"
+MatrixSign = "0.0.1"
 Optimisers = "0.4"
 Random = "1"
-StaticArrays = "1"
-Statistics = "1"
-Test = "1"
-Zygote = "0.6"
 julia = "1.10"
-
-[extras]
-Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
-Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
-Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
-
-[targets]
-test = ["Test", "ChainRulesCore", "Functors", "LinearAlgebra", "Optimisers", "StaticArrays", "Statistics", "Zygote"]
diff --git a/src/CannotWaitForTheseOptimisers.jl b/src/CannotWaitForTheseOptimisers.jl
index fa5bf9c..c9f2398 100644
--- a/src/CannotWaitForTheseOptimisers.jl
+++ b/src/CannotWaitForTheseOptimisers.jl
@@ -5,7 +5,9 @@ import Optimisers: OptimiserChain, AbstractRule, Leaf, adjust, adjust!, _adjust,
 
 include("rules.jl")
 include("adjust.jl")
+include("setup.jl")
+include("Experimental/Experimental.jl")
 
-export Muon, Apollo, NormGrowthCap, GradNormControl, AdaptiveGradNormControl
+export Apollo, NormGrowthCap, GradNormControl, AdaptiveGradNormControl
 
 end
diff --git a/src/Experimental/Experimental.jl b/src/Experimental/Experimental.jl
new file mode 100644
index 0000000..3f447da
--- /dev/null
+++ b/src/Experimental/Experimental.jl
@@ -0,0 +1,13 @@
+module Experimental
+# Groups the `dims`-aware Muon rule with two small exported helpers.
+include("Muon.jl")
+export Muon
+# Make Base's `IdSet` available to users of this module.
+using Base: IdSet
+export IdSet
+# findnodes(pred, x): every node returned by `fcollect(x)` for which `pred` is true.
+using Functors: fcollect
+findnodes(pred::Function, x) = filter(pred, fcollect(x))
+export findnodes
+
+end
diff --git a/src/Experimental/Muon.jl b/src/Experimental/Muon.jl
new file mode 100644
index 0000000..db4edf7
--- /dev/null
+++ b/src/Experimental/Muon.jl
@@ -0,0 +1,58 @@
+using MatrixSign: msign!
+using Optimisers: AbstractRule, @lazy, @..
+import Optimisers: init, apply!, adjust
+
+nonfirstdims(x, dims=ndims(x)) = prod(size(x)[2:dims])
+nonfirstdims(x, ::Nothing) = nonfirstdims(x)
+
+"""
+    Muon(; eta = 0.02, mu = 0.95, lambda = 0.01, dims = nothing)
+    Muon(eta, mu, lambda, dims)
+
+Muon - MomentUm Orthogonalized by Newton-schulz (https://github.com/KellerJordan/Muon)
+
+Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-processing step,
+in which each 2D parameter's update is replaced with the nearest orthogonal matrix using Newton-Schulz iteration.
+
+# Parameters
+- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights
+- Momentum (`μ == mu`): Controls the acceleration of gradient descent in the prominent direction
+- Weight decay (`λ == lambda`): Controls the strength of ``L_2`` regularisation.
+- Keyword `dims`: Dimensions to orthogonalize. If `nothing`, then trailing dimensions get flattened
+  into the second dimension. If `dims < ndims(x)`, remaining dimensions are orthogonalized independently.
+
+Note: Works best with large batch sizes and may not be suitable for fine-tuning.
+In nanoGPT speedrun experiments, Muon is used for the internal layer >2D weights, and AdamW is used for the 1D weights, embeddings, and heads.
+
+`Optimisers.adjust!(optimiser_state, η::Real)` sets Muon's `eta` to `η`. This rule has no fallback optimiser: it orthogonalizes every
+array it is given, so use another rule (e.g. `AdamW`) for 1D weights, embeddings, and heads by choosing the rule per array at setup.
+"""
+@kwdef struct Muon <: AbstractRule
+    eta = 0.02
+    mu = 0.95
+    lambda = 0.01
+    dims = nothing
+end
+
+init(::Muon, x::AbstractArray) = zero(x)
+
+function apply!(
+    (; eta, mu, lambda, dims)::Muon,
+    state, x::AbstractArray{T}, dx
+) where T
+    η, μ, λ = T(eta), T(mu), T(lambda)
+    # update momentum: m ← μ m + (1-μ) g
+    @.. state = μ * state + (1-μ) * dx
+    # Nesterov update fed to msign (a fresh array, so msign! never writes into `state`)
+    U = @. μ * state + (1-μ) * dx
+    # orthogonalize the rows × cols slices; any remaining dimensions form the third axis
+    O = msign!(reshape(U, size(U, 1), nonfirstdims(U, dims), :), steps=5, fused=3)
+    # post shape factor √max(1, rows/cols), read off the matrix view
+    s = √max(1, T(size(O, 1) / size(O, 2)))
+    # back to the parameter's shape; flattened dims could not broadcast against x
+    Ot = reshape(O, size(x))
+    dx′ = @lazy η * (Ot * s + λ * x)   # decoupled WD, step will subtract dx′
+    return state, dx′
+end
+
+adjust(r::Muon, η::Real) = adjust(r, eta = η)
diff --git a/src/rules.jl b/src/rules.jl
index 5ea81e5..c4e0cbd 100644
--- a/src/rules.jl
+++ b/src/rules.jl
@@ -1,105 +1,5 @@
-using LinearAlgebra: mul!
-
 nonfirstdims(x) = prod(size(x)[2:end])
 
-"""
-    Muon(opt = AdamW(eta = 0.0003, beta = (0.9,0.95), lambda = 0.01), η = 0.02, μ = 0.95, λ = 0.01, fallback = Returns(false))
-    Muon(; [opt, eta, mu, lambda, fallback])
-
-Muon - MomentUm Orthogonalized by Newton-schulz (https://github.com/KellerJordan/Muon)
-
-Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-processing step,
-in which each 2D parameter's update is replaced with the nearest orthogonal matrix using Newton-Schulz iteration.
-
-# Parameters
-- Fallback optimizer (`opt`): Optimizer to use for 1D parameters or when the `fallback` function returns true
-- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights
-- Momentum (`μ == mu`): Controls the acceleration of gradient descent in the prominent direction
-- Weight decay (`λ == lambda`): Controls the strength of ``L_2`` regularisation.
-- Fallback function (`fallback`): Function to control when, in addition to 1D arrays, the fallback optimizer should be used. Will be passed the parameter array and must return a boolean.
-
-Note: Works best with large batch sizes and may not be suitable for fine-tuning.
-In nanoGPT speedrun experiments, Muon is used for the internal layer >2D weights, and AdamW is used for the 1D weights, embeddings, and heads.
-
-`Optimisers.adjust!(optimiser_state, η::Real)` will adjust the fallback optimizer's `eta` to `η * (opt.eta / eta)`, and Muon's `eta` to `η`, preserving their ratio,
-but `Optimisers.adjust!(optimiser, eta = η)` will only adjust Muon's learning rate (allowing you to adjust the fallback optimizer's learning rate separately).
-"""
-@kwdef struct Muon <: AbstractRule
-    eta = 0.02
-    mu = 0.95
-    lambda = 0.01
-end
-
-init(::Muon, x::AbstractArray) = zero(x)
-
-function apply!((; eta, mu, lambda)::Muon, state, x::AbstractArray{T}, dx) where T
-    η, μ, λ = T(eta), T(mu), T(lambda)
-    # momentum: m ← β m + (1-β) g
-    @.. state = μ * state + (1 - μ) * dx
-    # Nesterov update fed to NS5: U ← β m + (1-β) g
-    U = @. μ * state + (1 - μ) * dx
-    # orthogonalize
-    @.. U = U / ($norm(U) + T(1e-6))
-    Ot = newtonschulz5!!(U)
-    # post shape factor √max(1, r/c)
-    r, c... = size(x)
-    s = √max(1, T(r) / prod(c))
-    dx′ = @lazy η * (Ot * s + λ * x)   # decoupled WD, step will subtract dx′
-    return state, dx′
-end
-
-const NS5_COEFFICIENTS = (; a = 3.4445f0, b = -4.7750f0, c = 2.0315f0)
-
-# Applies `X = a*X + b*X*X'*X + c*X*X'*X*X'*X` five times,
-# with two branches based on X*X' and X'*X respectively,
-# to minimize memory usage.
-function _newtonschulz5(X::AbstractMatrix{T}) where T
-    (; a, b, c) = map(T, NS5_COEFFICIENTS)
-    if size(X, 1) <= size(X, 2)
-        for _ in 1:5
-            A = X * X'
-            B = b * A + c * A * A
-            X = a * X + B * X
-        end
-    else
-        for _ in 1:5
-            A = X' * X
-            B = b * A + c * A * A
-            X = a * X + X * B
-        end
-    end
-    return X
-end
-
-# In-place version of _newtonschulz5 that uses
-# three buffers with a total size of 2n²+nm.
-function _newtonschulz5!(X::AbstractMatrix)
-    n = minimum(size(X))
-    A = similar(X, n, n)
-    B = similar(X, n, n)
-    x = similar(X) # mirror of X
-    (; a, b, c) = NS5_COEFFICIENTS
-    if size(X, 1) <= size(X, 2)
-        for _ in 1:5
-            mul!(A, X, X')
-            copyto!(B, A); mul!(B, A, A, c, b)
-            copyto!(x, X); mul!(X, B, x, true, a)
-        end
-    else
-        for _ in 1:5
-            mul!(A, X', X)
-            copyto!(B, A); mul!(B, A, A, c, b)
-            copyto!(x, X); mul!(X, x, B, true, a)
-        end
-    end
-    return X
-end
-
-newtonschulz5!!(X::AbstractMatrix) = maywrite(X) ? _newtonschulz5!(X) : _newtonschulz5(X)
-newtonschulz5!!(X::AbstractArray) = reshape(newtonschulz5!!(reshape(X, size(X, 1), :)), size(X))
-
-adjust(r::Muon, η::Real) = adjust(r, eta = η)
-
 """
     NormGrowthCap(τ = 1.01; ϵ = 1e-8, lb = 1e-7, throw = true, scale = true)
 
diff --git a/src/setup.jl b/src/setup.jl
new file mode 100644
index 0000000..af9313a
--- /dev/null
+++ b/src/setup.jl
@@ -0,0 +1,26 @@
+# See https://github.com/FluxML/Optimisers.jl/pull/204
+
+setup(rule::AbstractRule, model) = setup(Returns(rule), model)  # one rule for every leaf
+function setup(fun::Function, model)
+  seen = IdDict()  # identity-keyed record of the leaves created by _setup
+  state = _setup(fun, model; cache = seen)
+  isempty(seen) && @warn "setup found no trainable parameters in this model"
+  return state
+end
+
+# _setup is almost fmapstructure, but needs a _trainable_walk, and a cache which ignores numbers etc.
+function _setup(fun::Function, x; cache)
+  # An object already recorded in `cache` reuses the entry stored on its first visit.
+  haskey(cache, x) && return cache[x]
+  # Anything that is not numeric is a container: recurse into its trainable children.
+  isnumeric(x) || return mapvalue(xᵢ -> _setup(fun, xᵢ; cache), _trainable(x))
+  rule = fun(x)::AbstractRule
+  ℓ = Leaf(rule, init(rule, x))
+  if isbits(x)
+    # isbits values are not stored under their own key; this placeholder entry
+    # only keeps `cache` non-empty so that `setup` does not warn.
+    cache[nothing] = nothing
+    return ℓ
+  end
+  return cache[x] = ℓ
+end
\ No newline at end of file
diff --git a/test/Project.toml b/test/Project.toml
new file mode 100644
index 0000000..b1db398
--- /dev/null
+++ b/test/Project.toml
@@ -0,0 +1,11 @@
+[deps]
+CannotWaitForTheseOptimisers = "16124dda-d9fe-413b-a880-e3f4df3aa341"
+ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

From be4f62257d11bd3ae20b114bda15c2fac3f0b4c8 Mon Sep 17 00:00:00 2001
From: Anton Oresten <antonoresten@gmail.com>
Date: Tue, 4 Nov 2025 19:04:48 +0100
Subject: [PATCH 3/7] Add back old Muon

---
 src/rules.jl | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/src/rules.jl b/src/rules.jl
index c4e0cbd..5e7efd5 100644
--- a/src/rules.jl
+++ b/src/rules.jl
@@ -1,5 +1,86 @@
+
 nonfirstdims(x) = prod(size(x)[2:end])
 
+"""
+    Muon(opt = AdamW(eta = 0.0003, beta = (0.9,0.95), lambda = 0.01), η = 0.02, μ = 0.95, λ = 0.01, fallback = Returns(false))
+    Muon(; [opt, eta, mu, lambda, fallback])
+
+Muon - MomentUm Orthogonalized by Newton-schulz (https://github.com/KellerJordan/Muon)
+
+Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-processing step,
+in which each 2D parameter's update is replaced with the nearest orthogonal matrix using Newton-Schulz iteration.
+
+# Parameters
+- Fallback optimizer (`opt`): Optimizer to use for 1D parameters or when the `fallback` function returns true
+- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights
+- Momentum (`μ == mu`): Controls the acceleration of gradient descent in the prominent direction
+- Weight decay (`λ == lambda`): Controls the strength of ``L_2`` regularisation.
+- Fallback function (`fallback`): Function to control when, in addition to 1D arrays, the fallback optimizer should be used. Will be passed the parameter array and must return a boolean.
+
+Note: Works best with large batch sizes and may not be suitable for fine-tuning.
+In nanoGPT speedrun experiments, Muon is used for the internal layer >2D weights, and AdamW is used for the 1D weights, embeddings, and heads.
+
+`Optimisers.adjust!(optimiser_state, η::Real)` will adjust the fallback optimizer's `eta` to `η * (opt.eta / eta)`, and Muon's `eta` to `η`, preserving their ratio,
+but `Optimisers.adjust!(optimiser, eta = η)` will only adjust Muon's learning rate (allowing you to adjust the fallback optimizer's learning rate separately).
+"""
+struct Muon <: AbstractRule
+    opt::AbstractRule
+    eta::Float64
+    mu::Float64
+    lambda::Float64
+    fallback::Function
+end
+
+Muon(;opt = AdamW(eta = 0.0003, beta = (0.9,0.95), lambda = 0.01), eta = 0.02, mu = 0.95, lambda = 0.01, fallback = x -> false) = Muon(opt, eta, mu, lambda, fallback)
+
+function init(o::Muon, x::AbstractArray)
+  if nonfirstdims(x) == 1 || o.fallback(x)
+    return init(o.opt, x)
+  else
+    return zero(x)
+  end
+end
+
+function apply!(o::Muon, state, x::AbstractArray{T}, dx) where T
+    if nonfirstdims(x) == 1 || o.fallback(x)
+      return apply!(o.opt, state, x, dx)
+    else
+      η = T(o.eta); μ = T(o.mu); λ = T(o.lambda)
+      # momentum: m ← β m + (1-β) g
+      @.. state = μ * state + (one(T) - μ) * dx
+      # Nesterov update fed to NS5: U ← (1-β) g + β m
+      U = @. (one(T) - μ) * dx + μ * state
+      # orthogonalize + post shape factor √max(1, r/c)
+      Ot = _newton_schulz5(U)
+      r = size(x, 1); c = nonfirstdims(x)
+      s = T(sqrt(max(one(T), T(r) / T(c))))
+      dx′ = @lazy η * (Ot * s + λ * x)   # decoupled WD, step will subtract dx′
+      return state, dx′
+    end
+end
+
+function _inner_newton_schulz5(X::AbstractMatrix{T}) where T
+  a, b, c = (T(3.4445f0), T(-4.7750f0), T(2.0315f0))
+  for _ in 1:5
+    A = X * X'
+    B = b * A + c * A * A
+    X = a * X + B * X
+  end 
+  X
+end
+
+function _newton_schulz5(G::AbstractMatrix{T}) where T
+    X = G / (norm(G) + T(1e-7))
+    if size(G, 1) > size(G, 2)
+      return transpose(_inner_newton_schulz5(transpose(X)))
+    else
+      return _inner_newton_schulz5(X)
+    end
+end
+_newton_schulz5(G::AbstractArray) = reshape(_newton_schulz5(reshape(G, size(G,1), :)), size(G))
+
+adjust(r::Muon, η::Real) = adjust(r, eta = η, opt = adjust(r.opt, eta = (r.opt.eta / r.eta) * η))
+
 """
     NormGrowthCap(τ = 1.01; ϵ = 1e-8, lb = 1e-7, throw = true, scale = true)
 

From 52196c7f0a64d3e4be43c3800853faed56758f66 Mon Sep 17 00:00:00 2001
From: Anton Oresten <antonoresten@gmail.com>
Date: Tue, 4 Nov 2025 19:07:45 +0100
Subject: [PATCH 4/7] fix CI.yml

---
 .github/workflows/CI.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index d058286..764839d 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -37,6 +37,8 @@ jobs:
           version: ${{ matrix.version }}
           arch: ${{ matrix.arch }}
       - uses: julia-actions/cache@v2
+      - name: registry_add
+        run: julia -e 'using Pkg; Pkg.Registry.add(url="https://github.com/MurrellGroup/MurrellGroupRegistry")'
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
       - uses: julia-actions/julia-processcoverage@v1
@@ -62,6 +64,8 @@ jobs:
         shell: julia --project=docs --color=yes {0}
         run: |
           using Pkg
+          Pkg.Registry.add(url="https://github.com/JuliaRegistries/General")
+          Pkg.Registry.add(url="https://github.com/MurrellGroup/MurrellGroupRegistry")
           Pkg.develop(PackageSpec(path=pwd()))
           Pkg.instantiate()
       - uses: julia-actions/julia-buildpkg@v1

From 64136c19f3ad4edabf249931602f8576cec0e277 Mon Sep 17 00:00:00 2001
From: Anton Oresten <antonoresten@gmail.com>
Date: Tue, 4 Nov 2025 19:10:06 +0100
Subject: [PATCH 5/7] fix CI

---
 .github/workflows/CI.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 764839d..f416b02 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -38,7 +38,7 @@ jobs:
           arch: ${{ matrix.arch }}
       - uses: julia-actions/cache@v2
       - name: registry_add
-        run: julia -e 'using Pkg; Pkg.Registry.add(url="https://github.com/MurrellGroup/MurrellGroupRegistry")'
+        run: julia -e 'using Pkg; Pkg.Registry.add(Pkg.Registry.RegistrySpec(url="https://github.com/MurrellGroup/MurrellGroupRegistry"))'
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
       - uses: julia-actions/julia-processcoverage@v1
@@ -64,8 +64,8 @@ jobs:
         shell: julia --project=docs --color=yes {0}
         run: |
           using Pkg
-          Pkg.Registry.add(url="https://github.com/JuliaRegistries/General")
-          Pkg.Registry.add(url="https://github.com/MurrellGroup/MurrellGroupRegistry")
+          Pkg.Registry.add(Pkg.Registry.RegistrySpec(url="https://github.com/JuliaRegistries/General"))
+          Pkg.Registry.add(Pkg.Registry.RegistrySpec(url="https://github.com/MurrellGroup/MurrellGroupRegistry"))
           Pkg.develop(PackageSpec(path=pwd()))
           Pkg.instantiate()
       - uses: julia-actions/julia-buildpkg@v1

From 81d4c28bb3cb751ea8775899e7a5efaa6006e3f0 Mon Sep 17 00:00:00 2001
From: Anton Oresten <antonoresten@gmail.com>
Date: Tue, 4 Nov 2025 19:17:32 +0100
Subject: [PATCH 6/7] export Muon

---
 src/CannotWaitForTheseOptimisers.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CannotWaitForTheseOptimisers.jl b/src/CannotWaitForTheseOptimisers.jl
index c9f2398..8c01925 100644
--- a/src/CannotWaitForTheseOptimisers.jl
+++ b/src/CannotWaitForTheseOptimisers.jl
@@ -8,6 +8,6 @@ include("adjust.jl")
 include("setup.jl")
 include("Experimental/Experimental.jl")
 
-export Apollo, NormGrowthCap, GradNormControl, AdaptiveGradNormControl
+export Muon, Apollo, NormGrowthCap, GradNormControl, AdaptiveGradNormControl
 
 end

From cb0ffe85482f1c9161a2bb48ef73b2a0a08de892 Mon Sep 17 00:00:00 2001
From: Anton Oresten <antonoresten@gmail.com>
Date: Tue, 4 Nov 2025 19:22:22 +0100
Subject: [PATCH 7/7] add Experimental module to docs

---
 docs/src/index.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/src/index.md b/docs/src/index.md
index 17345db..7f028ab 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -12,3 +12,9 @@ Documentation for [CannotWaitForTheseOptimisers](https://github.com/MurrellGroup
 ```@autodocs
 Modules = [CannotWaitForTheseOptimisers]
 ```
+
+## Experimental
+
+```@autodocs
+Modules = [CannotWaitForTheseOptimisers.Experimental]
+```
\ No newline at end of file