diff --git a/Project.toml b/Project.toml index a4c76f8..f603359 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "ParallelParticleSwarms" uuid = "ab63da0c-63b4-40fa-a3b7-d2cba5be6419" -version = "1.0.0" authors = ["Utkarsh and contributors"] +version = "1.0.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" @@ -9,11 +9,14 @@ DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +LineSearch = "87fe0de2-c867-4266-b59a-2f0a94fc965b" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +Runic = "62bfec6d-59d7-401d-8490-b29ee721c001" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46" SimpleNonlinearSolve = "727e6d20-b764-4bd8-a329-72de5adea6c7" @@ -26,6 +29,7 @@ Enzyme = "0.13" ForwardDiff = "0.10, 1.3" JET = "0.9, 0.10, 0.11" KernelAbstractions = "0.9" +LineSearch = "0.1" Optimization = "4.1, 5.2" PrecompileTools = "1" QuasiMonteCarlo = "0.3" @@ -39,7 +43,6 @@ julia = "1.10" [extras] JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/hybrid.jl b/src/hybrid.jl index 60f80ed..2e31e18 100644 --- a/src/hybrid.jl +++ b/src/hybrid.jl @@ -1,69 +1,166 @@ -@kernel function simplebfgs_run!(nlprob, x0s, result, opt, maxiters, abstol, reltol) +using LinearAlgebra: dot +using KernelAbstractions +using SciMLBase +using Optimization +using LineSearch + +"Check that `θ` lies within a slack-expanded box around `[lb, ub]`." +@inline _in_safe_box(θ, ::Nothing, ::Nothing) = all(isfinite, θ) +@inline function _in_safe_box(θ::AbstractArray{T}, lb, ub) where {T} + all(isfinite, θ) || return false + w = ub .- lb + return all(θ .>= lb .- T(2) .* w) && all(θ .<= ub .+ T(2) .* w) +end + +"Sentinel gradient with huge magnitude used to reject out-of-box trial points." +@inline _huge_grad(θ::AbstractArray{T}) where {T} = map(_ -> T(1.0e15), θ) + +"L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4) over cyclic memory of length `M`." +@inline function _lbfgs_direction(g, S, Y, Rho, ::Val{M}, k) where {M} + T = eltype(g) + q = g + a = ntuple(_ -> zero(T), Val(M)) + for j in 0:(M - 1) + idx = k - j + if idx >= 1 + ii = mod1(idx, M) + aii = Rho[ii] * dot(S[ii], q) + a = Base.setindex(a, aii, ii) + q = q - aii * Y[ii] + end + end + γ = if k >= 1 + kk = mod1(k, M) + yy = sum(abs2, Y[kk]) + sy = dot(S[kk], Y[kk]) + ifelse(yy > T(1.0e-30) && sy > zero(T), sy / yy, one(T)) + else + one(T) + end + r = γ * q + for j in (M - 1):-1:0 + idx = k - j + if idx >= 1 + ii = mod1(idx, M) + β = Rho[ii] * dot(Y[ii], r) + r = r + (a[ii] - β) * S[ii] + end + end + return -r +end + +"Per-particle hand-written L-BFGS with `LineSearch.StrongWolfeLineSearch`. Avoids `SciMLBase.init`, whose `promote_u0` path uses dynamic dispatch (invalid GPU IR)." +@kernel function lbfgs_run!( + grad_f, p, x0s, result, ls_cache, ::Val{M}, maxiters + ) where {M} i = @index(Global, Linear) - nlcache = remake(nlprob; u0 = x0s[i]) - sol = solve(nlcache, opt; maxiters, abstol, reltol) - @inbounds result[i] = sol.u + x = x0s[i] + T = eltype(x) + g = grad_f(x, p) + + z = zero(typeof(x)) + S = ntuple(_ -> z, Val(M)) + Y = ntuple(_ -> z, Val(M)) + Rho = ntuple(_ -> zero(T), Val(M)) + k = 0 + active = all(isfinite, g) + + for _ in 1:maxiters + if active + dir = _lbfgs_direction(g, S, Y, Rho, Val(M), k) + if dot(g, dir) >= zero(T) + dir = -g + k = 0 + end + ls_sol = SciMLBase.solve!(ls_cache, x, dir) + α = T(ls_sol.step_size) + ok = ls_sol.retcode == ReturnCode.Success + if ok && isfinite(α) && α > zero(T) + xn = x + α * dir + gn = grad_f(xn, p) + if all(isfinite, gn) + s = xn - x + y = gn - g + sy = dot(s, y) + if sy > T(1.0e-10) && isfinite(one(T) / sy) + k += 1 + ii = mod1(k, M) + S = Base.setindex(S, s, ii) + Y = Base.setindex(Y, y, ii) + Rho = Base.setindex(Rho, one(T) / sy, ii) + else + k = 0 + end + x = xn + g = gn + else + active = false + end + else + active = false + end + end + end + @inbounds result[i] = x end +"Hybrid PSO solve: global PSO exploration followed by per-particle local L-BFGS refinement." function SciMLBase.solve!( cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...; - abstol = nothing, - reltol = nothing, - maxiters = 100, local_maxiters = 10, kwargs... - ) where { - Backend, LocalOpt <: Union{LBFGS, BFGS}, - } - pso_cache = cache.pso_cache - - sol_pso = solve!(pso_cache) - x0s = sol_pso.original + abstol = nothing, reltol = nothing, maxiters = 100, local_maxiters = 10, + linesearch = LineSearch.StrongWolfeLineSearch(), kwargs... + ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}} - backend = opt.backend + sol_pso = SciMLBase.solve!(cache.pso_cache; maxiters = maxiters) + orig_lb = cache.prob.lb + orig_ub = cache.prob.ub prob = remake(cache.prob, lb = nothing, ub = nothing) + x0s = sol_pso.original + raw_grad = instantiate_gradient(prob.f.f, prob.f.adtype) + grad_f = (θ, p) -> + _in_safe_box(θ, orig_lb, orig_ub) ? raw_grad(θ, p) : _huge_grad(θ) + result = cache.start_points copyto!(result, x0s) - ∇f = instantiate_gradient(prob.f.f, prob.f.adtype) - - kernel = simplebfgs_run!(backend) - nlprob = SimpleNonlinearSolve.ImmutableNonlinearProblem{false}(∇f, prob.u0, prob.p) + T = eltype(prob.u0) + D = length(prob.u0) + M_val = opt.local_opt isa LBFGS ? Val(min(opt.local_opt.threshold, D)) : Val(D) - nlalg = opt.local_opt isa LBFGS ? - SimpleLimitedMemoryBroyden(; - threshold = opt.local_opt.threshold, - linesearch = Val(true) - ) : SimpleBroyden(; linesearch = Val(true)) + # Construct the line-search cache directly: `LineSearch.init` would route + # through `SciMLBase.init`'s `promote_u0`, which dynamic-dispatches on GPU. + ls_cache = LineSearch.StaticStrongWolfeLineSearchCache( + grad_f, grad_f, prob.p, + T(linesearch.c1), T(linesearch.c2), + T(linesearch.α_init), T(linesearch.α_max), + linesearch.maxiters, linesearch.zoom_maxiters + ) t0 = time() + kernel = lbfgs_run!(opt.backend) kernel( - nlprob, - x0s, - result, - nlalg, - local_maxiters, - abstol, - reltol; + grad_f, prob.p, x0s, result, ls_cache, M_val, local_maxiters; ndrange = length(x0s) ) + KernelAbstractions.synchronize(opt.backend) - sol_bfgs = (x -> prob.f(x, prob.p)).(result) - sol_bfgs = (x -> isnan(x) ? convert(eltype(prob.u0), Inf) : x).(sol_bfgs) - - minobj, ind = findmin(sol_bfgs) - sol_u, - sol_obj = minobj > sol_pso.objective ? (sol_pso.u, sol_pso.objective) : - (view(result, ind), minobj) - t1 = time() - - # @show sol_pso.stats.time + Tobj = eltype(prob.u0) + result_fx = map(eachindex(result)) do i + x = result[i] + _in_safe_box(x, orig_lb, orig_ub) || return convert(Tobj, Inf) + v = prob.f(x, prob.p) + isnan(v) ? convert(Tobj, Inf) : v + end - solve_time = (t1 - t0) + sol_pso.stats.time + minobj, ind = findmin(result_fx) + best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[] + best_u, best_obj = minobj > best_obj ? (sol_pso.u, best_obj) : (result[ind], minobj) + solve_time = (time() - t0) + sol_pso.stats.time return SciMLBase.build_solution( - SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, - sol_u, sol_obj, + SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj; stats = Optimization.OptimizationStats(; time = solve_time) ) end