From 9e893be77aee61c3ad3aa4ba7f602608fcd83d7f Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Tue, 21 Oct 2025 13:32:35 +0000 Subject: [PATCH 1/7] Muon changes --- src/rules.jl | 109 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 45 deletions(-) diff --git a/src/rules.jl b/src/rules.jl index 5e7efd5..5ea81e5 100644 --- a/src/rules.jl +++ b/src/rules.jl @@ -1,3 +1,4 @@ +using LinearAlgebra: mul! nonfirstdims(x) = prod(size(x)[2:end]) @@ -23,63 +24,81 @@ In nanoGPT speedrun experiments, Muon is used for the internal layer >2D weights `Optimisers.adjust!(optimiser_state, η::Real)` will adjust the fallback optimizer's `eta` to `η * (opt.eta / eta)`, and Muon's `eta` to `η`, preserving their ratio, but `Optimisers.adjust!(optimiser, eta = η)` will only adjust Muon's learning rate (allowing you to adjust the fallback optimizer's learning rate separately). """ -struct Muon <: AbstractRule - opt::AbstractRule - eta::Float64 - mu::Float64 - lambda::Float64 - fallback::Function +@kwdef struct Muon <: AbstractRule + eta = 0.02 + mu = 0.95 + lambda = 0.01 end -Muon(;opt = AdamW(eta = 0.0003, beta = (0.9,0.95), lambda = 0.01), eta = 0.02, mu = 0.95, lambda = 0.01, fallback = x -> false) = Muon(opt, eta, mu, lambda, fallback) - -function init(o::Muon, x::AbstractArray) - if nonfirstdims(x) == 1 || o.fallback(x) - return init(o.opt, x) - else - return zero(x) - end +init(::Muon, x::AbstractArray) = zero(x) + +function apply!((; eta, mu, lambda)::Muon, state, x::AbstractArray{T}, dx) where T + η, μ, λ = T(eta), T(mu), T(lambda) + # momentum: m ← β m + (1-β) g + @.. state = μ * state + (1 - μ) * dx + # Nesterov update fed to NS5: U ← β m + (1-β) g + U = @. μ * state + (1 - μ) * dx + # orthogonalize + @.. U = U / ($norm(U) + T(1e-6)) + Ot = newtonschulz5!!(U) + # post shape factor √max(1, r/c) + r, c... 
= size(x) + s = √max(1, T(r) / prod(c)) + dx′ = @lazy η * (Ot * s + λ * x) # decoupled WD, step will subtract dx′ + return state, dx′ end -function apply!(o::Muon, state, x::AbstractArray{T}, dx) where T - if nonfirstdims(x) == 1 || o.fallback(x) - return apply!(o.opt, state, x, dx) +const NS5_COEFFICIENTS = (; a = 3.4445f0, b = -4.7750f0, c = 2.0315f0) + +# Applies `X = a*X + b*X*X'*X + c*X*X'*X*X'*X` five times, +# with two branches based on X*X' and X'*X respectively, +# to minimize memory usage. +function _newtonschulz5(X::AbstractMatrix{T}) where T + (; a, b, c) = map(T, NS5_COEFFICIENTS) + if size(X, 1) <= size(X, 2) + for _ in 1:5 + A = X * X' + B = b * A + c * A * A + X = a * X + B * X + end else - η = T(o.eta); μ = T(o.mu); λ = T(o.lambda) - # momentum: m ← β m + (1-β) g - @.. state = μ * state + (one(T) - μ) * dx - # Nesterov update fed to NS5: U ← (1-β) g + β m - U = @. (one(T) - μ) * dx + μ * state - # orthogonalize + post shape factor √max(1, r/c) - Ot = _newton_schulz5(U) - r = size(x, 1); c = nonfirstdims(x) - s = T(sqrt(max(one(T), T(r) / T(c)))) - dx′ = @lazy η * (Ot * s + λ * x) # decoupled WD, step will subtract dx′ - return state, dx′ + for _ in 1:5 + A = X' * X + B = b * A + c * A * A + X = a * X + X * B + end end + return X end -function _inner_newton_schulz5(X::AbstractMatrix{T}) where T - a, b, c = (T(3.4445f0), T(-4.7750f0), T(2.0315f0)) - for _ in 1:5 - A = X * X' - B = b * A + c * A * A - X = a * X + B * X - end - X -end - -function _newton_schulz5(G::AbstractMatrix{T}) where T - X = G / (norm(G) + T(1e-7)) - if size(G, 1) > size(G, 2) - return transpose(_inner_newton_schulz5(transpose(X))) +# In-place version of _newtonschulz5 that uses +# three buffers with a total size of 2n²+nm. 
+function _newtonschulz5!(X::AbstractMatrix) + n = minimum(size(X)) + A = similar(X, n, n) + B = similar(X, n, n) + x = similar(X) # mirror of X + (; a, b, c) = NS5_COEFFICIENTS + if size(X, 1) <= size(X, 2) + for _ in 1:5 + mul!(A, X, X') + copyto!(B, A); mul!(B, A, A, c, b) + copyto!(x, X); mul!(X, B, x, true, a) + end else - return _inner_newton_schulz5(X) + for _ in 1:5 + mul!(A, X', X) + copyto!(B, A); mul!(B, A, A, c, b) + copyto!(x, X); mul!(X, x, B, true, a) + end end + return X end -_newton_schulz5(G::AbstractArray) = reshape(_newton_schulz5(reshape(G, size(G,1), :)), size(G)) -adjust(r::Muon, η::Real) = adjust(r, eta = η, opt = adjust(r.opt, eta = (r.opt.eta / r.eta) * η)) +newtonschulz5!!(X::AbstractMatrix) = maywrite(X) ? _newtonschulz5!(X) : _newtonschulz5(X) +newtonschulz5!!(X::AbstractArray) = reshape(newtonschulz5!!(reshape(X, size(X, 1), :)), size(X)) + +adjust(r::Muon, η::Real) = adjust(r, eta = η) """ NormGrowthCap(τ = 1.01; ϵ = 1e-8, lb = 1e-7, throw = true, scale = true) From 61c85f31ddb0d4b6f47c86aef9e96b33a1b4046a Mon Sep 17 00:00:00 2001 From: Anton Oresten Date: Tue, 4 Nov 2025 18:57:34 +0100 Subject: [PATCH 2/7] Add setup function; Experimental module --- Project.toml | 30 +++------ src/CannotWaitForTheseOptimisers.jl | 4 +- src/Experimental/Experimental.jl | 13 ++++ src/Experimental/Muon.jl | 58 ++++++++++++++++ src/rules.jl | 100 ---------------------------- src/setup.jl | 26 ++++++++ test/Project.toml | 11 +++ 7 files changed, 119 insertions(+), 123 deletions(-) create mode 100644 src/Experimental/Experimental.jl create mode 100644 src/Experimental/Muon.jl create mode 100644 src/setup.jl create mode 100644 test/Project.toml diff --git a/Project.toml b/Project.toml index aec8d16..1bc566c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,34 +1,20 @@ name = "CannotWaitForTheseOptimisers" uuid = "16124dda-d9fe-413b-a880-e3f4df3aa341" -authors = ["murrellb and contributors"] version = "0.1.1" +authors = ["murrellb and contributors"] + 
+[workspace] +projects = ["test", "docs"] [deps] +Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" +MatrixSign = "a25fa8c1-e8fe-40fb-8be2-d139e369b1d5" Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [compat] -ChainRulesCore = "1" -Functors = "0.5" -LinearAlgebra = "1" +Functors = "0.5.2" +MatrixSign = "0.0.1" Optimisers = "0.4" Random = "1" -StaticArrays = "1" -Statistics = "1" -Test = "1" -Zygote = "0.6" julia = "1.10" - -[extras] -Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" -Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" - -[targets] -test = ["Test", "ChainRulesCore", "Functors", "LinearAlgebra", "Optimisers", "StaticArrays", "Statistics", "Zygote"] diff --git a/src/CannotWaitForTheseOptimisers.jl b/src/CannotWaitForTheseOptimisers.jl index fa5bf9c..c9f2398 100644 --- a/src/CannotWaitForTheseOptimisers.jl +++ b/src/CannotWaitForTheseOptimisers.jl @@ -5,7 +5,9 @@ import Optimisers: OptimiserChain, AbstractRule, Leaf, adjust, adjust!, _adjust, include("rules.jl") include("adjust.jl") +include("setup.jl") +include("Experimental/Experimental.jl") -export Muon, Apollo, NormGrowthCap, GradNormControl, AdaptiveGradNormControl +export Apollo, NormGrowthCap, GradNormControl, AdaptiveGradNormControl end diff --git a/src/Experimental/Experimental.jl b/src/Experimental/Experimental.jl new file mode 100644 index 0000000..3f447da --- /dev/null +++ b/src/Experimental/Experimental.jl @@ -0,0 +1,13 @@ +module Experimental + +include("Muon.jl") +export Muon + +using Base: IdSet +export IdSet + +using Functors: fcollect +findnodes(pred::Function, x) = 
filter(pred, fcollect(x)) +export findnodes + +end diff --git a/src/Experimental/Muon.jl b/src/Experimental/Muon.jl new file mode 100644 index 0000000..db4edf7 --- /dev/null +++ b/src/Experimental/Muon.jl @@ -0,0 +1,58 @@ +using MatrixSign +using Optimisers: AbstractRule, @lazy, @.. +import Optimisers: init, apply!, adjust! + +nonfirstdims(x, dims=ndims(x)) = prod(size(x)[2:dims]) +nonfirstdims(x, ::Nothing) = nonfirstdims(x) + +""" + Muon(η = 0.02, μ = 0.95, λ = 0.01; dims = nothing) + Muon(; [eta, mu, lambda, dims]) + +Muon - MomentUm Orthogonalized by Newton-schulz (https://github.com/KellerJordan/Muon) + +Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-processing step, +in which each 2D parameter's update is replaced with the nearest orthogonal matrix using Newton-Schulz iteration. + +# Parameters +- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights +- Momentum (`μ == mu`): Controls the acceleration of gradient descent in the prominent direction +- Weight decay (`λ == lambda`): Controls the strength of ``L_2`` regularisation. +- Keyword `dims`: Dimensions to orthogonalize. If `nothing`, then trailing dimensions get flattened + into the second dimension. If `dims < ndims(x)`, remaining dimensions are orthogonalized independently. + +Note: Works best with large batch sizes and may not be suitable for fine-tuning. +In nanoGPT speedrun experiments, Muon is used for the internal layer >2D weights, and AdamW is used for the 1D weights, embeddings, and heads. + +`Optimisers.adjust!(optimiser_state, η::Real)` and `Optimisers.adjust!(optimiser, eta = η)` both set Muon's `eta` to `η`. +This rule has no built-in fallback optimizer, so the learning rate of any rule used alongside it (e.g. AdamW for the 1D weights) must be adjusted separately. 
+""" +@kwdef struct Muon <: AbstractRule + eta = 0.02 + mu = 0.95 + lambda = 0.01 + dims = nothing +end + +init(::Muon, x::AbstractArray) = zero(x) + +function apply!( + (; eta, mu, lambda, dims)::Muon, + state, x::AbstractArray{T}, dx +) where T + η, μ, λ = T(eta), T(mu), T(lambda) + # update momentum + @.. state = μ * state + (1-μ) * dx + # Nesterov update fed to msign + U = @. μ * state + (1-μ) * dx + # orthogonalize + Ot = msign!( + reshape(U, size(U, 1), nonfirstdims(U, dims), :), + steps=5, fused=3) + # post shape factor + s = √max(1, T(size(Ot, 1) / size(Ot, 2))) + dx′ = @lazy η * (Ot * s + λ * x) # decoupled WD, step will subtract dx′ + return state, dx′ +end + +adjust!(r::Muon, η::Real) = adjust!(r, eta = η) diff --git a/src/rules.jl b/src/rules.jl index 5ea81e5..c4e0cbd 100644 --- a/src/rules.jl +++ b/src/rules.jl @@ -1,105 +1,5 @@ -using LinearAlgebra: mul! - nonfirstdims(x) = prod(size(x)[2:end]) -""" - Muon(opt = AdamW(eta = 0.0003, beta = (0.9,0.95), lambda = 0.01), η = 0.02, μ = 0.95, λ = 0.01, fallback = Returns(false)) - Muon(; [opt, eta, mu, lambda, fallback]) - -Muon - MomentUm Orthogonalized by Newton-schulz (https://github.com/KellerJordan/Muon) - -Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-processing step, -in which each 2D parameter's update is replaced with the nearest orthogonal matrix using Newton-Schulz iteration. - -# Parameters -- Fallback optimizer (`opt`): Optimizer to use for 1D parameters or when the `fallback` function returns true -- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights -- Momentum (`μ == mu`): Controls the acceleration of gradient descent in the prominent direction -- Weight decay (`λ == lambda`): Controls the strength of ``L_2`` regularisation. -- Fallback function (`fallback`): Function to control when, in addition to 1D arrays, the fallback optimizer should be used. 
Will be passed the parameter array and must return a boolean. - -Note: Works best with large batch sizes and may not be suitable for fine-tuning. -In nanoGPT speedrun experiments, Muon is used for the internal layer >2D weights, and AdamW is used for the 1D weights, embeddings, and heads. - -`Optimisers.adjust!(optimiser_state, η::Real)` will adjust the fallback optimizer's `eta` to `η * (opt.eta / eta)`, and Muon's `eta` to `η`, preserving their ratio, -but `Optimisers.adjust!(optimiser, eta = η)` will only adjust Muon's learning rate (allowing you to adjust the fallback optimizer's learning rate separately). -""" -@kwdef struct Muon <: AbstractRule - eta = 0.02 - mu = 0.95 - lambda = 0.01 -end - -init(::Muon, x::AbstractArray) = zero(x) - -function apply!((; eta, mu, lambda)::Muon, state, x::AbstractArray{T}, dx) where T - η, μ, λ = T(eta), T(mu), T(lambda) - # momentum: m ← β m + (1-β) g - @.. state = μ * state + (1 - μ) * dx - # Nesterov update fed to NS5: U ← β m + (1-β) g - U = @. μ * state + (1 - μ) * dx - # orthogonalize - @.. U = U / ($norm(U) + T(1e-6)) - Ot = newtonschulz5!!(U) - # post shape factor √max(1, r/c) - r, c... = size(x) - s = √max(1, T(r) / prod(c)) - dx′ = @lazy η * (Ot * s + λ * x) # decoupled WD, step will subtract dx′ - return state, dx′ -end - -const NS5_COEFFICIENTS = (; a = 3.4445f0, b = -4.7750f0, c = 2.0315f0) - -# Applies `X = a*X + b*X*X'*X + c*X*X'*X*X'*X` five times, -# with two branches based on X*X' and X'*X respectively, -# to minimize memory usage. -function _newtonschulz5(X::AbstractMatrix{T}) where T - (; a, b, c) = map(T, NS5_COEFFICIENTS) - if size(X, 1) <= size(X, 2) - for _ in 1:5 - A = X * X' - B = b * A + c * A * A - X = a * X + B * X - end - else - for _ in 1:5 - A = X' * X - B = b * A + c * A * A - X = a * X + X * B - end - end - return X -end - -# In-place version of _newtonschulz5 that uses -# three buffers with a total size of 2n²+nm. 
-function _newtonschulz5!(X::AbstractMatrix) - n = minimum(size(X)) - A = similar(X, n, n) - B = similar(X, n, n) - x = similar(X) # mirror of X - (; a, b, c) = NS5_COEFFICIENTS - if size(X, 1) <= size(X, 2) - for _ in 1:5 - mul!(A, X, X') - copyto!(B, A); mul!(B, A, A, c, b) - copyto!(x, X); mul!(X, B, x, true, a) - end - else - for _ in 1:5 - mul!(A, X', X) - copyto!(B, A); mul!(B, A, A, c, b) - copyto!(x, X); mul!(X, x, B, true, a) - end - end - return X -end - -newtonschulz5!!(X::AbstractMatrix) = maywrite(X) ? _newtonschulz5!(X) : _newtonschulz5(X) -newtonschulz5!!(X::AbstractArray) = reshape(newtonschulz5!!(reshape(X, size(X, 1), :)), size(X)) - -adjust(r::Muon, η::Real) = adjust(r, eta = η) - """ NormGrowthCap(τ = 1.01; ϵ = 1e-8, lb = 1e-7, throw = true, scale = true) diff --git a/src/setup.jl b/src/setup.jl new file mode 100644 index 0000000..af9313a --- /dev/null +++ b/src/setup.jl @@ -0,0 +1,26 @@ +# See https://github.com/FluxML/Optimisers.jl/pull/204 + +setup(rule::AbstractRule, model) = setup(Returns(rule), model) +function setup(fun::Function, model) + cache = IdDict() + tree = _setup(fun, model; cache) + isempty(cache) && @warn "setup found no trainable parameters in this model" + tree +end + +# _setup is almost fmapstructure, but needs a _trainable_walk, and a cache which ignores numbers etc. 
+function _setup(fun::Function, x; cache) + haskey(cache, x) && return cache[x] + if isnumeric(x) + rule = fun(x)::AbstractRule + ℓ = Leaf(rule, init(rule, x)) + if isbits(x) + cache[nothing] = nothing # just to disable the warning + ℓ + else + cache[x] = ℓ + end + else + mapvalue(xᵢ -> _setup(fun, xᵢ; cache), _trainable(x)) + end +end \ No newline at end of file diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..b1db398 --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,11 @@ +[deps] +CannotWaitForTheseOptimisers = "16124dda-d9fe-413b-a880-e3f4df3aa341" +ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" From be4f62257d11bd3ae20b114bda15c2fac3f0b4c8 Mon Sep 17 00:00:00 2001 From: Anton Oresten Date: Tue, 4 Nov 2025 19:04:48 +0100 Subject: [PATCH 3/7] Add back old Muon --- src/rules.jl | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/src/rules.jl b/src/rules.jl index c4e0cbd..5e7efd5 100644 --- a/src/rules.jl +++ b/src/rules.jl @@ -1,5 +1,86 @@ + nonfirstdims(x) = prod(size(x)[2:end]) +""" + Muon(opt = AdamW(eta = 0.0003, beta = (0.9,0.95), lambda = 0.01), η = 0.02, μ = 0.95, λ = 0.01, fallback = Returns(false)) + Muon(; [opt, eta, mu, lambda, fallback]) + +Muon - MomentUm Orthogonalized by Newton-schulz (https://github.com/KellerJordan/Muon) + +Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-processing step, +in which each 2D parameter's update is replaced with the nearest orthogonal matrix using Newton-Schulz iteration. 
+ +# Parameters +- Fallback optimizer (`opt`): Optimizer to use for 1D parameters or when the `fallback` function returns true +- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights +- Momentum (`μ == mu`): Controls the acceleration of gradient descent in the prominent direction +- Weight decay (`λ == lambda`): Controls the strength of ``L_2`` regularisation. +- Fallback function (`fallback`): Function to control when, in addition to 1D arrays, the fallback optimizer should be used. Will be passed the parameter array and must return a boolean. + +Note: Works best with large batch sizes and may not be suitable for fine-tuning. +In nanoGPT speedrun experiments, Muon is used for the internal layer >2D weights, and AdamW is used for the 1D weights, embeddings, and heads. + +`Optimisers.adjust!(optimiser_state, η::Real)` will adjust the fallback optimizer's `eta` to `η * (opt.eta / eta)`, and Muon's `eta` to `η`, preserving their ratio, +but `Optimisers.adjust!(optimiser, eta = η)` will only adjust Muon's learning rate (allowing you to adjust the fallback optimizer's learning rate separately). +""" +struct Muon <: AbstractRule + opt::AbstractRule + eta::Float64 + mu::Float64 + lambda::Float64 + fallback::Function +end + +Muon(;opt = AdamW(eta = 0.0003, beta = (0.9,0.95), lambda = 0.01), eta = 0.02, mu = 0.95, lambda = 0.01, fallback = x -> false) = Muon(opt, eta, mu, lambda, fallback) + +function init(o::Muon, x::AbstractArray) + if nonfirstdims(x) == 1 || o.fallback(x) + return init(o.opt, x) + else + return zero(x) + end +end + +function apply!(o::Muon, state, x::AbstractArray{T}, dx) where T + if nonfirstdims(x) == 1 || o.fallback(x) + return apply!(o.opt, state, x, dx) + else + η = T(o.eta); μ = T(o.mu); λ = T(o.lambda) + # momentum: m ← β m + (1-β) g + @.. state = μ * state + (one(T) - μ) * dx + # Nesterov update fed to NS5: U ← (1-β) g + β m + U = @. 
(one(T) - μ) * dx + μ * state + # orthogonalize + post shape factor √max(1, r/c) + Ot = _newton_schulz5(U) + r = size(x, 1); c = nonfirstdims(x) + s = T(sqrt(max(one(T), T(r) / T(c)))) + dx′ = @lazy η * (Ot * s + λ * x) # decoupled WD, step will subtract dx′ + return state, dx′ + end +end + +function _inner_newton_schulz5(X::AbstractMatrix{T}) where T + a, b, c = (T(3.4445f0), T(-4.7750f0), T(2.0315f0)) + for _ in 1:5 + A = X * X' + B = b * A + c * A * A + X = a * X + B * X + end + X +end + +function _newton_schulz5(G::AbstractMatrix{T}) where T + X = G / (norm(G) + T(1e-7)) + if size(G, 1) > size(G, 2) + return transpose(_inner_newton_schulz5(transpose(X))) + else + return _inner_newton_schulz5(X) + end +end +_newton_schulz5(G::AbstractArray) = reshape(_newton_schulz5(reshape(G, size(G,1), :)), size(G)) + +adjust(r::Muon, η::Real) = adjust(r, eta = η, opt = adjust(r.opt, eta = (r.opt.eta / r.eta) * η)) + """ NormGrowthCap(τ = 1.01; ϵ = 1e-8, lb = 1e-7, throw = true, scale = true) From 52196c7f0a64d3e4be43c3800853faed56758f66 Mon Sep 17 00:00:00 2001 From: Anton Oresten Date: Tue, 4 Nov 2025 19:07:45 +0100 Subject: [PATCH 4/7] fix CI.yml --- .github/workflows/CI.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index d058286..764839d 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -37,6 +37,8 @@ jobs: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - uses: julia-actions/cache@v2 + - name: registry_add + run: julia -e 'using Pkg; Pkg.Registry.add(url="https://github.com/MurrellGroup/MurrellGroupRegistry")' - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 @@ -62,6 +64,8 @@ jobs: shell: julia --project=docs --color=yes {0} run: | using Pkg + Pkg.Registry.add(url="https://github.com/JuliaRegistries/General") + Pkg.Registry.add(url="https://github.com/MurrellGroup/MurrellGroupRegistry") 
Pkg.develop(PackageSpec(path=pwd())) Pkg.instantiate() - uses: julia-actions/julia-buildpkg@v1 From 64136c19f3ad4edabf249931602f8576cec0e277 Mon Sep 17 00:00:00 2001 From: Anton Oresten Date: Tue, 4 Nov 2025 19:10:06 +0100 Subject: [PATCH 5/7] fix CI --- .github/workflows/CI.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 764839d..f416b02 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -38,7 +38,7 @@ jobs: arch: ${{ matrix.arch }} - uses: julia-actions/cache@v2 - name: registry_add - run: julia -e 'using Pkg; Pkg.Registry.add(url="https://github.com/MurrellGroup/MurrellGroupRegistry")' + run: julia -e 'using Pkg; Pkg.Registry.add(Pkg.Registry.RegistrySpec(url="https://github.com/MurrellGroup/MurrellGroupRegistry"))' - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 @@ -64,8 +64,8 @@ jobs: shell: julia --project=docs --color=yes {0} run: | using Pkg - Pkg.Registry.add(url="https://github.com/JuliaRegistries/General") - Pkg.Registry.add(url="https://github.com/MurrellGroup/MurrellGroupRegistry") + Pkg.Registry.add(Pkg.Registry.RegistrySpec(url="https://github.com/JuliaRegistries/General")) + Pkg.Registry.add(Pkg.Registry.RegistrySpec(url="https://github.com/MurrellGroup/MurrellGroupRegistry")) Pkg.develop(PackageSpec(path=pwd())) Pkg.instantiate() - uses: julia-actions/julia-buildpkg@v1 From 81d4c28bb3cb751ea8775899e7a5efaa6006e3f0 Mon Sep 17 00:00:00 2001 From: Anton Oresten Date: Tue, 4 Nov 2025 19:17:32 +0100 Subject: [PATCH 6/7] export Muon --- src/CannotWaitForTheseOptimisers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CannotWaitForTheseOptimisers.jl b/src/CannotWaitForTheseOptimisers.jl index c9f2398..8c01925 100644 --- a/src/CannotWaitForTheseOptimisers.jl +++ b/src/CannotWaitForTheseOptimisers.jl @@ -8,6 +8,6 @@ include("adjust.jl") 
include("setup.jl") include("Experimental/Experimental.jl") -export Apollo, NormGrowthCap, GradNormControl, AdaptiveGradNormControl +export Muon, Apollo, NormGrowthCap, GradNormControl, AdaptiveGradNormControl end From cb0ffe85482f1c9161a2bb48ef73b2a0a08de892 Mon Sep 17 00:00:00 2001 From: Anton Oresten Date: Tue, 4 Nov 2025 19:22:22 +0100 Subject: [PATCH 7/7] add Experimental module to docs --- docs/src/index.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/src/index.md b/docs/src/index.md index 17345db..7f028ab 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -12,3 +12,9 @@ Documentation for [CannotWaitForTheseOptimisers](https://github.com/MurrellGroup ```@autodocs Modules = [CannotWaitForTheseOptimisers] ``` + +## Experimental + +```@autodocs +Modules = [CannotWaitForTheseOptimisers.Experimental] +``` \ No newline at end of file