From cd517127536ba786ca52ea4b660f70adb20068f0 Mon Sep 17 00:00:00 2001 From: AdityaPandeyCN Date: Thu, 2 Apr 2026 16:04:21 +0530 Subject: [PATCH 1/9] rewrite L-BFGS with custom Strong Wolfe line search Signed-off-by: AdityaPandeyCN --- Project.toml | 2 +- src/hybrid.jl | 253 ++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 206 insertions(+), 49 deletions(-) diff --git a/Project.toml b/Project.toml index a4c76f8..05e182e 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" @@ -39,7 +40,6 @@ julia = "1.10" [extras] JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/hybrid.jl b/src/hybrid.jl index 60f80ed..8b0302d 100644 --- a/src/hybrid.jl +++ b/src/hybrid.jl @@ -1,69 +1,226 @@ -@kernel function simplebfgs_run!(nlprob, x0s, result, opt, maxiters, abstol, reltol) +using LinearAlgebra: norm, dot +using KernelAbstractions +using SciMLBase +using Optimization + +# Clamp x to interior of bounds and evaluate objective +@inline function _safe_eval(f, p, x, lb, ub) + T = eltype(x) + ε = T(1e-6) + xc = clamp.(x, lb .+ ε, ub .- ε) + xc = map(xi -> abs(xi) < ε ? ε : xi, xc) + v = f(xc, p) + ifelse(isfinite(v), v, T(Inf)) +end + +# Clamp x to interior of bounds and evaluate gradient +@inline function _safe_grad(grad_f, p, x, lb, ub) + T = eltype(x) + ε = T(1e-6) + xc = clamp.(x, lb .+ ε, ub .- ε) + xc = map(xi -> abs(xi) < ε ? ε : xi, xc) + g = grad_f(xc, p) + map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) +end + +# Cubic interpolation for line search; falls back to bisection if non-convex +@inline function _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi) + d1 = dϕ_lo + dϕ_hi - 3 * (ϕ_lo - ϕ_hi) / (a_lo - a_hi) + desc = d1 * d1 - dϕ_lo * dϕ_hi + # Use max to ensure non-negative argument to sqrt (avoids DomainError with ForwardDiff) + d2 = sqrt(max(desc, zero(desc))) + ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) +end + +# Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6) +@inline function _zoom(f, grad_f, p, x, dir, lb, ub, a_lo, a_hi, ϕ_0, dϕ_0, ϕ_lo, c1, c2) + T = eltype(x) + xn_out, ϕ_out, gn_out, ok_out = x, ϕ_0, _safe_grad(grad_f, p, x, lb, ub), false + done = false + for _ in 1:10 + if !done + a_j = _interpolate(a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), + dot(_safe_grad(grad_f, p, clamp.(x + a_lo * dir, lb, ub), lb, ub), dir), + dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir)) + a_j = clamp(a_j, min(a_lo, a_hi) + T(1e-3), max(a_lo, a_hi) - T(1e-3)) + xn_j = clamp.(x + a_j * dir, lb, ub) + ϕ_j = _safe_eval(f, p, xn_j, lb, ub) + if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo) + a_hi = a_j + else + gn_j = _safe_grad(grad_f, p, xn_j, lb, ub) + dϕ_j = dot(gn_j, dir) + if abs(dϕ_j) <= -c2 * dϕ_0 + xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true + else + if dϕ_j * (a_hi - a_lo) >= zero(T); a_hi = a_lo; end + a_lo, ϕ_lo = a_j, ϕ_j + end + end + end + end + if !done + xn_lo = clamp.(x + a_lo * dir, lb, ub) + xn_out, ϕ_out, gn_out = xn_lo, _safe_eval(f, p, xn_lo, lb, ub), _safe_grad(grad_f, p, xn_lo, lb, ub) + end + (xn_out, ϕ_out, gn_out, ok_out) +end + +# Strong Wolfe line search (Nocedal & Wright Algorithm 3.5) +@inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) + T = eltype(x) + c1, c2 = T(1e-4), T(0.9) + dϕ_0 = dot(g, dir) + xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false + if dϕ_0 < zero(T) + a_prev, a_i, ϕ_0, ϕ_prev, done = zero(T), one(T), fx, fx, false + for i in 1:10 + if !done + xn = clamp.(x + a_i * dir, lb, ub) + ϕ_i = _safe_eval(f, p, xn, lb, ub) + if (ϕ_i > ϕ_0 + c1 * a_i * dϕ_0) || (ϕ_i >= ϕ_prev && i > 1) + xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_prev, a_i, ϕ_0, dϕ_0, ϕ_prev, c1, c2)..., true + else + gn_i = _safe_grad(grad_f, p, xn, lb, ub) + dϕ_i = dot(gn_i, dir) + if abs(dϕ_i) <= -c2 * dϕ_0 + xn_out, ϕ_out, gn_out, ok_out, done = xn, ϕ_i, gn_i, true, true + elseif dϕ_i >= zero(T) + xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_i, a_prev, ϕ_0, dϕ_0, ϕ_i, c1, c2)..., true + else + a_prev, ϕ_prev, a_i = a_i, ϕ_i, a_i * T(2.0) + end + end + end + end + if !done + xn = clamp.(x + a_prev * dir, lb, ub) + xn_out, ϕ_out, gn_out, ok_out = xn, _safe_eval(f, p, xn, lb, ub), _safe_grad(grad_f, p, xn, lb, ub), true + end + end + (xn_out, ϕ_out, gn_out, ok_out) +end + +# L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4) +@inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M} + T = eltype(g) + q, a = g, ntuple(_ -> zero(T), Val(M)) + for j in 0:M-1 + idx = k - j + if idx >= 1 + ii = mod1(idx, M) + a = Base.setindex(a, Rho[ii] * dot(S[ii], q), ii) + q = q - a[ii] * Y[ii] + end + end + kk = mod1(k, M) + yy = sum(abs2, Y[kk]) + γ = ifelse(k >= 1 && yy > T(1e-30), dot(S[kk], Y[kk]) / yy, one(T)) + γ = ifelse(isfinite(γ) && γ > zero(T), γ, one(T)) + r = γ * q + for j in M-1:-1:0 + idx = k - j + if idx >= 1 + ii = mod1(idx, M) + r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii] + end + end + -r +end + +# Run L-BFGS independently on each starting point +@kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, lb, ub, maxiters, ::Val{M}) where {M} i = @index(Global, Linear) - nlcache = remake(nlprob; u0 = x0s[i]) - sol = solve(nlcache, opt; maxiters, abstol, reltol) - @inbounds result[i] = sol.u + x = clamp.(x0s[i], lb, ub) + T = eltype(x) + z = zero(typeof(x)) + S, Y = ntuple(_ -> z, Val(M)), ntuple(_ -> z, Val(M)) + Rho = ntuple(_ -> zero(T), Val(M)) + fx = _safe_eval(f, p, x, lb, ub) + g = _safe_grad(grad_f, p, x, lb, ub) + k, active = 0, isfinite(fx) && all(isfinite, g) + for _ in 1:maxiters + if active && norm(g) >= T(1e-7) + dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k) + if dot(g, dir) >= zero(T); dir, k = -g, 0; end + xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) + if !ok; xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0; end + if ok && isfinite(fn) && all(isfinite, gn) + s, y = xn - x, gn - g + sy = dot(s, y) + if isfinite(one(T) / sy) && sy > T(1e-10) + k += 1; ii = mod1(k, M) + S, Y, Rho = Base.setindex(S, s, ii), Base.setindex(Y, y, ii), Base.setindex(Rho, one(T) / sy, ii) + else + k = 0 + end + x, g, fx = xn, gn, fn + else + active = false + end + end + end + @inbounds result[i] = x end +# Main solve: runs PSO for global exploration, then L-BFGS for local refinement function SciMLBase.solve!( cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...; - abstol = nothing, - reltol = nothing, - maxiters = 100, local_maxiters = 10, kwargs... - ) where { - Backend, LocalOpt <: Union{LBFGS, BFGS}, - } - pso_cache = cache.pso_cache + abstol = nothing, reltol = nothing, maxiters = 100, + local_maxiters = 50, n_starts = 20, kwargs... + ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}} - sol_pso = solve!(pso_cache) - x0s = sol_pso.original + # Phase 1: Global search with PSO + sol_pso = SciMLBase.solve!(cache.pso_cache; maxiters = maxiters) - backend = opt.backend + prob = cache.prob + f_raw, p = prob.f.f, prob.p + lb = prob.lb === nothing ? convert.(eltype(prob.u0), -Inf) : prob.lb + ub = prob.ub === nothing ? convert.(eltype(prob.u0), Inf) : prob.ub - prob = remake(cache.prob, lb = nothing, ub = nothing) + best_u = sol_pso.u + best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[] - result = cache.start_points - copyto!(result, x0s) + _obj(x) = let v = prob.f(clamp.(x, lb, ub), p) + (isnan(v) || isinf(v)) ? convert(eltype(best_obj), Inf) : v + end - ∇f = instantiate_gradient(prob.f.f, prob.f.adtype) + # Build candidate pool: inject global best into particle positions + x0s = copy(sol_pso.original) + x0s[1] = best_u - kernel = simplebfgs_run!(backend) - nlprob = SimpleNonlinearSolve.ImmutableNonlinearProblem{false}(∇f, prob.u0, prob.p) + costs = map(_obj, x0s) + n = min(n_starts, length(x0s)) + pool = [x0s[j] for j in partialsortperm(Vector(costs), 1:n)] - nlalg = opt.local_opt isa LBFGS ? - SimpleLimitedMemoryBroyden(; - threshold = opt.local_opt.threshold, - linesearch = Val(true) - ) : SimpleBroyden(; linesearch = Val(true)) + D = length(first(pool)) + m_val = D > 20 ? Val(5) : Val(10) + grad_f = instantiate_gradient(f_raw, prob.f.adtype) t0 = time() - kernel( - nlprob, - x0s, - result, - nlalg, - local_maxiters, - abstol, - reltol; - ndrange = length(x0s) - ) - sol_bfgs = (x -> prob.f(x, prob.p)).(result) - sol_bfgs = (x -> isnan(x) ? convert(eltype(prob.u0), Inf) : x).(sol_bfgs) + result = similar(pool) + copyto!(result, pool) - minobj, ind = findmin(sol_bfgs) - sol_u, - sol_obj = minobj > sol_pso.objective ? (sol_pso.u, sol_pso.objective) : - (view(result, ind), minobj) - t1 = time() + # Phase 2: Local refinement with L-BFGS on top candidates + kernel = lbfgs_kernel!(opt.backend) + kernel(f_raw, grad_f, p, pool, result, lb, ub, local_maxiters, m_val; ndrange = n) + KernelAbstractions.synchronize(opt.backend) - # @show sol_pso.stats.time + # Select best result across all refined candidates + for j in 1:n + r = clamp.(result[j], lb, ub) + fval = _obj(r) + if fval < best_obj + best_obj = fval + best_u = r + end + end - solve_time = (t1 - t0) + sol_pso.stats.time + solve_time = (time() - t0) + sol_pso.stats.time - return SciMLBase.build_solution( - SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, - sol_u, sol_obj, + SciMLBase.build_solution( + SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj; stats = Optimization.OptimizationStats(; time = solve_time) ) -end +end \ No newline at end of file From e90ef08a0204c2896a215f0b5d5bda40b2b34cf0 Mon Sep 17 00:00:00 2001 From: AdityaPandeyCN Date: Thu, 2 Apr 2026 16:28:39 +0530 Subject: [PATCH 2/9] formatting Signed-off-by: AdityaPandeyCN --- src/hybrid.jl | 52 +++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/src/hybrid.jl b/src/hybrid.jl index 8b0302d..ca1b4eb 100644 --- a/src/hybrid.jl +++ b/src/hybrid.jl @@ -6,21 +6,21 @@ using Optimization # Clamp x to interior of bounds and evaluate objective @inline function _safe_eval(f, p, x, lb, ub) T = eltype(x) - ε = T(1e-6) + ε = T(1.0e-6) xc = clamp.(x, lb .+ ε, ub .- ε) xc = map(xi -> abs(xi) < ε ? ε : xi, xc) v = f(xc, p) - ifelse(isfinite(v), v, T(Inf)) + return ifelse(isfinite(v), v, T(Inf)) end # Clamp x to interior of bounds and evaluate gradient @inline function _safe_grad(grad_f, p, x, lb, ub) T = eltype(x) - ε = T(1e-6) + ε = T(1.0e-6) xc = clamp.(x, lb .+ ε, ub .- ε) xc = map(xi -> abs(xi) < ε ? ε : xi, xc) g = grad_f(xc, p) - map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) + return map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) end # Cubic interpolation for line search; falls back to bisection if non-convex @@ -29,7 +29,7 @@ end desc = d1 * d1 - dϕ_lo * dϕ_hi # Use max to ensure non-negative argument to sqrt (avoids DomainError with ForwardDiff) d2 = sqrt(max(desc, zero(desc))) - ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) + return ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) end # Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6) @@ -39,10 +39,12 @@ end done = false for _ in 1:10 if !done - a_j = _interpolate(a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), + a_j = _interpolate( + a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dot(_safe_grad(grad_f, p, clamp.(x + a_lo * dir, lb, ub), lb, ub), dir), - dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir)) - a_j = clamp(a_j, min(a_lo, a_hi) + T(1e-3), max(a_lo, a_hi) - T(1e-3)) + dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir) + ) + a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3)) xn_j = clamp.(x + a_j * dir, lb, ub) ϕ_j = _safe_eval(f, p, xn_j, lb, ub) if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo) @@ -53,7 +55,9 @@ end if abs(dϕ_j) <= -c2 * dϕ_0 xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true else - if dϕ_j * (a_hi - a_lo) >= zero(T); a_hi = a_lo; end + if dϕ_j * (a_hi - a_lo) >= zero(T) + a_hi = a_lo + end a_lo, ϕ_lo = a_j, ϕ_j end end @@ -63,13 +67,13 @@ end xn_lo = clamp.(x + a_lo * dir, lb, ub) xn_out, ϕ_out, gn_out = xn_lo, _safe_eval(f, p, xn_lo, lb, ub), _safe_grad(grad_f, p, xn_lo, lb, ub) end - (xn_out, ϕ_out, gn_out, ok_out) + return (xn_out, ϕ_out, gn_out, ok_out) end # Strong Wolfe line search (Nocedal & Wright Algorithm 3.5) @inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) T = eltype(x) - c1, c2 = T(1e-4), T(0.9) + c1, c2 = T(1.0e-4), T(0.9) dϕ_0 = dot(g, dir) xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false if dϕ_0 < zero(T) @@ -98,14 +102,14 @@ end xn_out, ϕ_out, gn_out, ok_out = xn, _safe_eval(f, p, xn, lb, ub), _safe_grad(grad_f, p, xn, lb, ub), true end end - (xn_out, ϕ_out, gn_out, ok_out) + return (xn_out, ϕ_out, gn_out, ok_out) end # L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4) @inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M} T = eltype(g) q, a = g, ntuple(_ -> zero(T), Val(M)) - for j in 0:M-1 + for j in 0:(M - 1) idx = k - j if idx >= 1 ii = mod1(idx, M) @@ -115,17 +119,17 @@ end end kk = mod1(k, M) yy = sum(abs2, Y[kk]) - γ = ifelse(k >= 1 && yy > T(1e-30), dot(S[kk], Y[kk]) / yy, one(T)) + γ = ifelse(k >= 1 && yy > T(1.0e-30), dot(S[kk], Y[kk]) / yy, one(T)) γ = ifelse(isfinite(γ) && γ > zero(T), γ, one(T)) r = γ * q - for j in M-1:-1:0 + for j in (M - 1):-1:0 idx = k - j if idx >= 1 ii = mod1(idx, M) r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii] end end - -r + return -r end # Run L-BFGS independently on each starting point @@ -140,15 +144,19 @@ end g = _safe_grad(grad_f, p, x, lb, ub) k, active = 0, isfinite(fx) && all(isfinite, g) for _ in 1:maxiters - if active && norm(g) >= T(1e-7) + if active && norm(g) >= T(1.0e-7) dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k) - if dot(g, dir) >= zero(T); dir, k = -g, 0; end + if dot(g, dir) >= zero(T) + dir, k = -g, 0 + end xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) - if !ok; xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0; end + if !ok + xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0 + end if ok && isfinite(fn) && all(isfinite, gn) s, y = xn - x, gn - g sy = dot(s, y) - if isfinite(one(T) / sy) && sy > T(1e-10) + if isfinite(one(T) / sy) && sy > T(1.0e-10) k += 1; ii = mod1(k, M) S, Y, Rho = Base.setindex(S, s, ii), Base.setindex(Y, y, ii), Base.setindex(Rho, one(T) / sy, ii) else @@ -219,8 +227,8 @@ function SciMLBase.solve!( solve_time = (time() - t0) + sol_pso.stats.time - SciMLBase.build_solution( + return SciMLBase.build_solution( SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj; stats = Optimization.OptimizationStats(; time = solve_time) ) -end \ No newline at end of file +end From c7a60f6e8c60a1d9a1fbffc59d3973880fdc285a Mon Sep 17 00:00:00 2001 From: AdityaPandeyCN Date: Thu, 2 Apr 2026 18:15:29 +0530 Subject: [PATCH 3/9] remove scalar indexing Signed-off-by: AdityaPandeyCN --- src/hybrid.jl | 106 +++++++++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 57 deletions(-) diff --git a/src/hybrid.jl b/src/hybrid.jl index ca1b4eb..3cb82e7 100644 --- a/src/hybrid.jl +++ b/src/hybrid.jl @@ -6,21 +6,21 @@ using Optimization # Clamp x to interior of bounds and evaluate objective @inline function _safe_eval(f, p, x, lb, ub) T = eltype(x) - ε = T(1.0e-6) + ε = T(1e-6) xc = clamp.(x, lb .+ ε, ub .- ε) xc = map(xi -> abs(xi) < ε ? ε : xi, xc) v = f(xc, p) - return ifelse(isfinite(v), v, T(Inf)) + ifelse(isfinite(v), v, T(Inf)) end # Clamp x to interior of bounds and evaluate gradient @inline function _safe_grad(grad_f, p, x, lb, ub) T = eltype(x) - ε = T(1.0e-6) + ε = T(1e-6) xc = clamp.(x, lb .+ ε, ub .- ε) xc = map(xi -> abs(xi) < ε ? ε : xi, xc) g = grad_f(xc, p) - return map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) + map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) end # Cubic interpolation for line search; falls back to bisection if non-convex @@ -29,7 +29,7 @@ end desc = d1 * d1 - dϕ_lo * dϕ_hi # Use max to ensure non-negative argument to sqrt (avoids DomainError with ForwardDiff) d2 = sqrt(max(desc, zero(desc))) - return ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) + ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) end # Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6) @@ -39,12 +39,10 @@ end done = false for _ in 1:10 if !done - a_j = _interpolate( - a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), + a_j = _interpolate(a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dot(_safe_grad(grad_f, p, clamp.(x + a_lo * dir, lb, ub), lb, ub), dir), - dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir) - ) - a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3)) + dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir)) + a_j = clamp(a_j, min(a_lo, a_hi) + T(1e-3), max(a_lo, a_hi) - T(1e-3)) xn_j = clamp.(x + a_j * dir, lb, ub) ϕ_j = _safe_eval(f, p, xn_j, lb, ub) if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo) @@ -55,9 +53,7 @@ end if abs(dϕ_j) <= -c2 * dϕ_0 xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true else - if dϕ_j * (a_hi - a_lo) >= zero(T) - a_hi = a_lo - end + if dϕ_j * (a_hi - a_lo) >= zero(T); a_hi = a_lo; end a_lo, ϕ_lo = a_j, ϕ_j end end @@ -67,16 +63,16 @@ end xn_lo = clamp.(x + a_lo * dir, lb, ub) xn_out, ϕ_out, gn_out = xn_lo, _safe_eval(f, p, xn_lo, lb, ub), _safe_grad(grad_f, p, xn_lo, lb, ub) end - return (xn_out, ϕ_out, gn_out, ok_out) + (xn_out, ϕ_out, gn_out, ok_out) end # Strong Wolfe line search (Nocedal & Wright Algorithm 3.5) @inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) T = eltype(x) - c1, c2 = T(1.0e-4), T(0.9) + c1, c2 = T(1e-4), T(0.9) dϕ_0 = dot(g, dir) xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false - if dϕ_0 < zero(T) + if dϕ_0 < zero(T) # valid descent direction a_prev, a_i, ϕ_0, ϕ_prev, done = zero(T), one(T), fx, fx, false for i in 1:10 if !done @@ -102,14 +98,15 @@ end xn_out, ϕ_out, gn_out, ok_out = xn, _safe_eval(f, p, xn, lb, ub), _safe_grad(grad_f, p, xn, lb, ub), true end end - return (xn_out, ϕ_out, gn_out, ok_out) + (xn_out, ϕ_out, gn_out, ok_out) end # L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4) @inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M} T = eltype(g) q, a = g, ntuple(_ -> zero(T), Val(M)) - for j in 0:(M - 1) + # First loop: newest to oldest + for j in 0:M-1 idx = k - j if idx >= 1 ii = mod1(idx, M) @@ -117,23 +114,26 @@ end q = q - a[ii] * Y[ii] end end + # Compute scaling factor γ kk = mod1(k, M) yy = sum(abs2, Y[kk]) - γ = ifelse(k >= 1 && yy > T(1.0e-30), dot(S[kk], Y[kk]) / yy, one(T)) + γ = ifelse(k >= 1 && yy > T(1e-30), dot(S[kk], Y[kk]) / yy, one(T)) γ = ifelse(isfinite(γ) && γ > zero(T), γ, one(T)) r = γ * q - for j in (M - 1):-1:0 + # Second loop: oldest to newest + for j in M-1:-1:0 idx = k - j if idx >= 1 ii = mod1(idx, M) r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii] end end - return -r + -r end # Run L-BFGS independently on each starting point -@kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, lb, ub, maxiters, ::Val{M}) where {M} +# History buffers stored as tuples for GPU compatibility +@kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, result_fx, lb, ub, maxiters, ::Val{M}) where {M} i = @index(Global, Linear) x = clamp.(x0s[i], lb, ub) T = eltype(x) @@ -144,19 +144,18 @@ end g = _safe_grad(grad_f, p, x, lb, ub) k, active = 0, isfinite(fx) && all(isfinite, g) for _ in 1:maxiters - if active && norm(g) >= T(1.0e-7) + if active && norm(g) >= T(1e-7) dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k) - if dot(g, dir) >= zero(T) - dir, k = -g, 0 - end + # Reset to steepest descent if direction is not descent + if dot(g, dir) >= zero(T); dir, k = -g, 0; end xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) - if !ok - xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0 - end + # Fall back to steepest descent if line search fails + if !ok; xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0; end if ok && isfinite(fn) && all(isfinite, gn) s, y = xn - x, gn - g sy = dot(s, y) - if isfinite(one(T) / sy) && sy > T(1.0e-10) + # Update history only if curvature condition holds + if isfinite(one(T) / sy) && sy > T(1e-10) k += 1; ii = mod1(k, M) S, Y, Rho = Base.setindex(S, s, ii), Base.setindex(Y, y, ii), Base.setindex(Rho, one(T) / sy, ii) else @@ -169,13 +168,14 @@ end end end @inbounds result[i] = x + @inbounds result_fx[i] = fx end # Main solve: runs PSO for global exploration, then L-BFGS for local refinement function SciMLBase.solve!( cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...; abstol = nothing, reltol = nothing, maxiters = 100, - local_maxiters = 50, n_starts = 20, kwargs... + local_maxiters = 50, kwargs... ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}} # Phase 1: Global search with PSO @@ -189,46 +189,38 @@ function SciMLBase.solve!( best_u = sol_pso.u best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[] - _obj(x) = let v = prob.f(clamp.(x, lb, ub), p) - (isnan(v) || isinf(v)) ? convert(eltype(best_obj), Inf) : v - end - - # Build candidate pool: inject global best into particle positions - x0s = copy(sol_pso.original) - x0s[1] = best_u + # Run L-BFGS on all particles + x0s = sol_pso.original + n = length(x0s) - costs = map(_obj, x0s) - n = min(n_starts, length(x0s)) - pool = [x0s[j] for j in partialsortperm(Vector(costs), 1:n)] - - D = length(first(pool)) + D = length(prob.u0) m_val = D > 20 ? Val(5) : Val(10) grad_f = instantiate_gradient(f_raw, prob.f.adtype) t0 = time() - result = similar(pool) - copyto!(result, pool) + result = similar(x0s) + copyto!(result, x0s) + result_fx = KernelAbstractions.allocate(opt.backend, typeof(best_obj), n) - # Phase 2: Local refinement with L-BFGS on top candidates + # Phase 2: Local refinement with L-BFGS on all candidates kernel = lbfgs_kernel!(opt.backend) - kernel(f_raw, grad_f, p, pool, result, lb, ub, local_maxiters, m_val; ndrange = n) + kernel(f_raw, grad_f, p, x0s, result, result_fx, lb, ub, local_maxiters, m_val; ndrange = n) KernelAbstractions.synchronize(opt.backend) - # Select best result across all refined candidates - for j in 1:n - r = clamp.(result[j], lb, ub) - fval = _obj(r) - if fval < best_obj - best_obj = fval - best_u = r - end + # Find best result + minobj, ind = findmin(result_fx) + + # Single bulk transfer for winning solution only + if minobj < best_obj + best_obj = minobj + best_u = Array(view(result, ind:ind))[1] end solve_time = (time() - t0) + sol_pso.stats.time - return SciMLBase.build_solution( + SciMLBase.build_solution( SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj; stats = Optimization.OptimizationStats(; time = solve_time) ) -end +end \ No newline at end of file From 1d3c3b146bef457c8a0ef9764aef778065772706 Mon Sep 17 00:00:00 2001 From: AdityaPandeyCN Date: Thu, 2 Apr 2026 18:18:05 +0530 Subject: [PATCH 4/9] formatting Signed-off-by: AdityaPandeyCN --- src/hybrid.jl | 54 +++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/src/hybrid.jl b/src/hybrid.jl index 3cb82e7..945306f 100644 --- a/src/hybrid.jl +++ b/src/hybrid.jl @@ -6,21 +6,21 @@ using Optimization # Clamp x to interior of bounds and evaluate objective @inline function _safe_eval(f, p, x, lb, ub) T = eltype(x) - ε = T(1e-6) + ε = T(1.0e-6) xc = clamp.(x, lb .+ ε, ub .- ε) xc = map(xi -> abs(xi) < ε ? ε : xi, xc) v = f(xc, p) - ifelse(isfinite(v), v, T(Inf)) + return ifelse(isfinite(v), v, T(Inf)) end # Clamp x to interior of bounds and evaluate gradient @inline function _safe_grad(grad_f, p, x, lb, ub) T = eltype(x) - ε = T(1e-6) + ε = T(1.0e-6) xc = clamp.(x, lb .+ ε, ub .- ε) xc = map(xi -> abs(xi) < ε ? ε : xi, xc) g = grad_f(xc, p) - map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) + return map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) end # Cubic interpolation for line search; falls back to bisection if non-convex @@ -29,7 +29,7 @@ end desc = d1 * d1 - dϕ_lo * dϕ_hi # Use max to ensure non-negative argument to sqrt (avoids DomainError with ForwardDiff) d2 = sqrt(max(desc, zero(desc))) - ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) + return ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) end # Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6) @@ -39,10 +39,12 @@ end done = false for _ in 1:10 if !done - a_j = _interpolate(a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), + a_j = _interpolate( + a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dot(_safe_grad(grad_f, p, clamp.(x + a_lo * dir, lb, ub), lb, ub), dir), - dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir)) - a_j = clamp(a_j, min(a_lo, a_hi) + T(1e-3), max(a_lo, a_hi) - T(1e-3)) + dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir) + ) + a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3)) xn_j = clamp.(x + a_j * dir, lb, ub) ϕ_j = _safe_eval(f, p, xn_j, lb, ub) if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo) @@ -53,7 +55,9 @@ end if abs(dϕ_j) <= -c2 * dϕ_0 xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true else - if dϕ_j * (a_hi - a_lo) >= zero(T); a_hi = a_lo; end + if dϕ_j * (a_hi - a_lo) >= zero(T) + a_hi = a_lo + end a_lo, ϕ_lo = a_j, ϕ_j end end @@ -63,13 +67,13 @@ end xn_lo = clamp.(x + a_lo * dir, lb, ub) xn_out, ϕ_out, gn_out = xn_lo, _safe_eval(f, p, xn_lo, lb, ub), _safe_grad(grad_f, p, xn_lo, lb, ub) end - (xn_out, ϕ_out, gn_out, ok_out) + return (xn_out, ϕ_out, gn_out, ok_out) end # Strong Wolfe line search (Nocedal & Wright Algorithm 3.5) @inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) T = eltype(x) - c1, c2 = T(1e-4), T(0.9) + c1, c2 = T(1.0e-4), T(0.9) dϕ_0 = dot(g, dir) xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false if dϕ_0 < zero(T) # valid descent direction @@ -98,7 +102,7 @@ end xn_out, ϕ_out, gn_out, ok_out = xn, _safe_eval(f, p, xn, lb, ub), _safe_grad(grad_f, p, xn, lb, ub), true end end - (xn_out, ϕ_out, gn_out, ok_out) + return (xn_out, ϕ_out, gn_out, ok_out) end # L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4) @@ -106,7 +110,7 @@ end T = eltype(g) q, a = g, ntuple(_ -> zero(T), Val(M)) # First loop: newest to oldest - for j in 0:M-1 + for j in 0:(M - 1) idx = k - j if idx >= 1 ii = mod1(idx, M) @@ -117,18 +121,18 @@ end # Compute scaling factor γ kk = mod1(k, M) yy = sum(abs2, Y[kk]) - γ = ifelse(k >= 1 && yy > T(1e-30), dot(S[kk], Y[kk]) / yy, one(T)) + γ = ifelse(k >= 1 && yy > T(1.0e-30), dot(S[kk], Y[kk]) / yy, one(T)) γ = ifelse(isfinite(γ) && γ > zero(T), γ, one(T)) r = γ * q # Second loop: oldest to newest - for j in M-1:-1:0 + for j in (M - 1):-1:0 idx = k - j if idx >= 1 ii = mod1(idx, M) r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii] end end - -r + return -r end # Run L-BFGS independently on each starting point @@ -144,18 +148,22 @@ end g = _safe_grad(grad_f, p, x, lb, ub) k, active = 0, isfinite(fx) && all(isfinite, g) for _ in 1:maxiters - if active && norm(g) >= T(1e-7) + if active && norm(g) >= T(1.0e-7) dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k) # Reset to steepest descent if direction is not descent - if dot(g, dir) >= zero(T); dir, k = -g, 0; end + if dot(g, dir) >= zero(T) + dir, k = -g, 0 + end xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) # Fall back to steepest descent if line search fails - if !ok; xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0; end + if !ok + xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0 + end if ok && isfinite(fn) && all(isfinite, gn) s, y = xn - x, gn - g sy = dot(s, y) # Update history only if curvature condition holds - if isfinite(one(T) / sy) && sy > T(1e-10) + if isfinite(one(T) / sy) && sy > T(1.0e-10) k += 1; ii = mod1(k, M) S, Y, Rho = Base.setindex(S, s, ii), Base.setindex(Y, y, ii), Base.setindex(Rho, one(T) / sy, ii) else @@ -208,7 +216,7 @@ function SciMLBase.solve!( kernel(f_raw, grad_f, p, x0s, result, result_fx, lb, ub, local_maxiters, m_val; ndrange = n) KernelAbstractions.synchronize(opt.backend) - # Find best result + # Find best result minobj, ind = findmin(result_fx) # Single bulk transfer for winning solution only @@ -219,8 +227,8 @@ function SciMLBase.solve!( solve_time = (time() - t0) + sol_pso.stats.time - SciMLBase.build_solution( + return SciMLBase.build_solution( SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj; stats = Optimization.OptimizationStats(; time = solve_time) ) -end \ No newline at end of file +end From b6b3443674685480c656f02f7f7cdb8977d9f0d9 Mon Sep 17 00:00:00 2001 From: AdityaPandeyCN Date: Thu, 2 Apr 2026 22:12:57 +0530 Subject: [PATCH 5/9] add comments Signed-off-by: AdityaPandeyCN --- Project.toml | 5 ++- src/hybrid.jl | 115 +++++++++++++++++++++----------------------------- 2 files changed, 51 insertions(+), 69 deletions(-) diff --git a/Project.toml b/Project.toml index 05e182e..7c97061 100644 --- a/Project.toml +++ b/Project.toml @@ -1,20 +1,21 @@ name = "ParallelParticleSwarms" uuid = "ab63da0c-63b4-40fa-a3b7-d2cba5be6419" -version = "1.0.0" authors = ["Utkarsh and contributors"] +version = "1.0.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +Runic = "62bfec6d-59d7-401d-8490-b29ee721c001" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46" SimpleNonlinearSolve = "727e6d20-b764-4bd8-a329-72de5adea6c7" diff --git a/src/hybrid.jl b/src/hybrid.jl index 945306f..8ba398c 100644 --- a/src/hybrid.jl +++ b/src/hybrid.jl @@ -3,69 +3,64 @@ using KernelAbstractions using SciMLBase using Optimization -# Clamp x to interior of bounds and evaluate objective -@inline function _safe_eval(f, p, x, lb, ub) - T = eltype(x) - ε = T(1.0e-6) - xc = clamp.(x, lb .+ ε, ub .- ε) - xc = map(xi -> abs(xi) < ε ? ε : xi, xc) - v = f(xc, p) - return ifelse(isfinite(v), v, T(Inf)) +# Clamp to bounds and evaluate; NaN firewall prevents poison through interpolation +@inline _eval(f, p, x, lb, ub) = +let v = f(clamp.(x, lb, ub), p) + ifelse(isfinite(v), v, eltype(x)(Inf)) end -# Clamp x to interior of bounds and evaluate gradient -@inline function _safe_grad(grad_f, p, x, lb, ub) - T = eltype(x) - ε = T(1.0e-6) - xc = clamp.(x, lb .+ ε, ub .- ε) - xc = map(xi -> abs(xi) < ε ? ε : xi, xc) - g = grad_f(xc, p) - return map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g) -end +# Clamp to bounds and evaluate gradient; zero out non-finite components +@inline _grad(grad_f, p, x, lb, ub) = + map(gi -> ifelse(isfinite(gi), gi, zero(gi)), grad_f(clamp.(x, lb, ub), p)) -# Cubic interpolation for line search; falls back to bisection if non-convex +# Cubic interpolation with bisection fallback @inline function _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi) d1 = dϕ_lo + dϕ_hi - 3 * (ϕ_lo - ϕ_hi) / (a_lo - a_hi) desc = d1 * d1 - dϕ_lo * dϕ_hi - # Use max to ensure non-negative argument to sqrt (avoids DomainError with ForwardDiff) d2 = sqrt(max(desc, zero(desc))) - return ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))) + return ifelse( + desc < 0, (a_lo + a_hi) / 2, + a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2)) + ) end # Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6) @inline function _zoom(f, grad_f, p, x, dir, lb, ub, a_lo, a_hi, ϕ_0, dϕ_0, ϕ_lo, c1, c2) T = eltype(x) - xn_out, ϕ_out, gn_out, ok_out = x, ϕ_0, _safe_grad(grad_f, p, x, lb, ub), false + xn_out, ϕ_out, gn_out, ok_out = x, ϕ_0, zero(typeof(x)), false + + # Cache endpoint values; updated only when endpoints shift + ϕ_hi = _eval(f, p, x + a_hi * dir, lb, ub) + dϕ_lo = dot(_grad(grad_f, p, x + a_lo * dir, lb, ub), dir) + dϕ_hi = dot(_grad(grad_f, p, x + a_hi * dir, lb, ub), dir) + done = false for _ in 1:10 if !done - a_j = _interpolate( - a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), - dot(_safe_grad(grad_f, p, clamp.(x + a_lo * dir, lb, ub), lb, ub), dir), - dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir) - ) + a_j = _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi) a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3)) xn_j = clamp.(x + a_j * dir, lb, ub) - ϕ_j = _safe_eval(f, p, xn_j, lb, ub) + ϕ_j = _eval(f, p, xn_j, lb, ub) if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo) - a_hi = a_j + a_hi, ϕ_hi = a_j, ϕ_j + dϕ_hi = dot(_grad(grad_f, p, xn_j, lb, ub), dir) else - gn_j = _safe_grad(grad_f, p, xn_j, lb, ub) + gn_j = _grad(grad_f, p, xn_j, lb, ub) dϕ_j = dot(gn_j, dir) if abs(dϕ_j) <= -c2 * dϕ_0 xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true else if dϕ_j * (a_hi - a_lo) >= zero(T) - a_hi = a_lo + a_hi, ϕ_hi, dϕ_hi = a_lo, ϕ_lo, dϕ_lo end - a_lo, ϕ_lo = a_j, ϕ_j + a_lo, ϕ_lo, dϕ_lo = a_j, ϕ_j, dϕ_j end end end end if !done - xn_lo = clamp.(x + a_lo * dir, lb, ub) - xn_out, ϕ_out, gn_out = xn_lo, _safe_eval(f, p, xn_lo, lb, ub), _safe_grad(grad_f, p, xn_lo, lb, ub) + xn = clamp.(x + a_lo * dir, lb, ub) + xn_out, ϕ_out, gn_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub) end return (xn_out, ϕ_out, gn_out, ok_out) end @@ -81,25 +76,25 @@ end for i in 1:10 if !done xn = clamp.(x + a_i * dir, lb, ub) - ϕ_i = _safe_eval(f, p, xn, lb, ub) + ϕ_i = _eval(f, p, xn, lb, ub) if (ϕ_i > ϕ_0 + c1 * a_i * dϕ_0) || (ϕ_i >= ϕ_prev && i > 1) xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_prev, a_i, ϕ_0, dϕ_0, ϕ_prev, c1, c2)..., true else - gn_i = _safe_grad(grad_f, p, xn, lb, ub) + gn_i = _grad(grad_f, p, xn, lb, ub) dϕ_i = dot(gn_i, dir) if abs(dϕ_i) <= -c2 * dϕ_0 xn_out, ϕ_out, gn_out, ok_out, done = xn, ϕ_i, gn_i, true, true elseif dϕ_i >= zero(T) xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_i, a_prev, ϕ_0, dϕ_0, ϕ_i, c1, c2)..., true else - a_prev, ϕ_prev, a_i = a_i, ϕ_i, a_i * T(2.0) + a_prev, ϕ_prev, a_i = a_i, ϕ_i, a_i * T(2) end end end end if !done xn = clamp.(x + a_prev * dir, lb, ub) - xn_out, ϕ_out, gn_out, ok_out = xn, _safe_eval(f, p, xn, lb, ub), _safe_grad(grad_f, p, xn, lb, ub), true + xn_out, ϕ_out, gn_out, ok_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub), true end end return (xn_out, ϕ_out, gn_out, ok_out) @@ -109,7 +104,6 @@ end @inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M} T = eltype(g) q, a = g, ntuple(_ -> zero(T), Val(M)) - # First loop: newest to oldest for j in 0:(M - 1) idx = k - j if idx >= 1 @@ -118,13 +112,11 @@ end q = q - a[ii] * Y[ii] end end - # Compute scaling factor γ kk = mod1(k, M) - yy = sum(abs2, Y[kk]) - γ = ifelse(k >= 1 && yy > T(1.0e-30), dot(S[kk], Y[kk]) / yy, one(T)) - γ = ifelse(isfinite(γ) && γ > zero(T), γ, one(T)) + sy, yy = dot(S[kk], Y[kk]), sum(abs2, Y[kk]) + γ = sy / yy + γ = ifelse(k >= 1 && yy > T(1.0e-30) && isfinite(γ) && γ > zero(T), γ, one(T)) r = γ * q - # Second loop: oldest to newest for j in (M - 1):-1:0 idx = k - j if idx >= 1 @@ -135,8 +127,7 @@ end return -r end -# Run L-BFGS independently on each starting point -# History buffers stored as tuples for GPU compatibility +# Run L-BFGS independently per starting point; tuples for GPU-compatible history buffers @kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, result_fx, lb, ub, maxiters, ::Val{M}) where {M} i = @index(Global, Linear) x = clamp.(x0s[i], lb, ub) @@ -144,28 +135,28 @@ end z = zero(typeof(x)) S, Y = ntuple(_ -> z, Val(M)), ntuple(_ -> z, Val(M)) Rho = ntuple(_ -> zero(T), Val(M)) - fx = _safe_eval(f, p, x, lb, ub) - g = _safe_grad(grad_f, p, x, lb, ub) + fx = _eval(f, p, x, lb, ub) + g = _grad(grad_f, p, x, lb, ub) k, active = 0, isfinite(fx) && all(isfinite, g) for _ in 1:maxiters if active && norm(g) >= T(1.0e-7) dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k) - # Reset to steepest descent if direction is not descent - if dot(g, dir) >= zero(T) + if dot(g, dir) >= zero(T) # reset to steepest descent dir, k = -g, 0 end xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) - # Fall back to steepest descent if line search fails - if !ok - xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0 + if !ok # fall back to steepest descent + xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub) + k = 0 end if ok && isfinite(fn) && all(isfinite, gn) s, y = xn - x, gn - g sy = dot(s, y) - # Update history only if curvature condition holds - if isfinite(one(T) / sy) && sy > T(1.0e-10) + if isfinite(one(T) / sy) && sy > T(1.0e-10) # curvature condition k += 1; ii = mod1(k, M) - S, Y, Rho = Base.setindex(S, s, ii), Base.setindex(Y, y, ii), Base.setindex(Rho, one(T) / sy, ii) + S = Base.setindex(S, s, ii) + Y = Base.setindex(Y, y, ii) + Rho = Base.setindex(Rho, one(T) / sy, ii) else k = 0 end @@ -179,14 +170,13 @@ end @inbounds result_fx[i] = fx end -# Main solve: runs PSO for global exploration, then L-BFGS for local refinement +# PSO for global exploration, then L-BFGS for local refinement on all particles function SciMLBase.solve!( cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...; abstol = nothing, reltol = nothing, maxiters = 100, local_maxiters = 50, kwargs... ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}} - # Phase 1: Global search with PSO sol_pso = SciMLBase.solve!(cache.pso_cache; maxiters = maxiters) prob = cache.prob @@ -197,36 +187,27 @@ function SciMLBase.solve!( best_u = sol_pso.u best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[] - # Run L-BFGS on all particles x0s = sol_pso.original n = length(x0s) - - D = length(prob.u0) - m_val = D > 20 ? Val(5) : Val(10) + m_val = length(prob.u0) > 20 ? Val(5) : Val(10) grad_f = instantiate_gradient(f_raw, prob.f.adtype) t0 = time() result = similar(x0s) - copyto!(result, x0s) result_fx = KernelAbstractions.allocate(opt.backend, typeof(best_obj), n) - # Phase 2: Local refinement with L-BFGS on all candidates kernel = lbfgs_kernel!(opt.backend) kernel(f_raw, grad_f, p, x0s, result, result_fx, lb, ub, local_maxiters, m_val; ndrange = n) KernelAbstractions.synchronize(opt.backend) - # Find best result minobj, ind = findmin(result_fx) - - # Single bulk transfer for winning solution only if minobj < best_obj best_obj = minobj best_u = Array(view(result, ind:ind))[1] end solve_time = (time() - t0) + sol_pso.stats.time - return SciMLBase.build_solution( SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj; stats = Optimization.OptimizationStats(; time = solve_time) From be7dc7500df71c361e44133d2a235a52ae62851b Mon Sep 17 00:00:00 2001 From: AdityaPandeyCN Date: Sun, 17 May 2026 21:50:09 +0530 Subject: [PATCH 6/9] use native line search Signed-off-by: AdityaPandeyCN --- src/hybrid.jl | 241 ++++++++++++-------------------------------------- 1 file changed, 56 insertions(+), 185 deletions(-) diff --git a/src/hybrid.jl b/src/hybrid.jl index 8ba398c..a0ea1fc 100644 --- a/src/hybrid.jl +++ b/src/hybrid.jl @@ -1,212 +1,83 @@ -using LinearAlgebra: norm, dot using KernelAbstractions using SciMLBase using Optimization - -# Clamp to bounds and evaluate; NaN firewall prevents poison through interpolation -@inline _eval(f, p, x, lb, ub) = -let v = f(clamp.(x, lb, ub), p) - ifelse(isfinite(v), v, eltype(x)(Inf)) -end - -# Clamp to bounds and evaluate gradient; zero out non-finite components -@inline _grad(grad_f, p, x, lb, ub) = - map(gi -> ifelse(isfinite(gi), gi, zero(gi)), grad_f(clamp.(x, lb, ub), p)) - -# Cubic interpolation with bisection fallback -@inline function _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi) - d1 = dϕ_lo + dϕ_hi - 3 * (ϕ_lo - ϕ_hi) / (a_lo - a_hi) - desc = d1 * d1 - dϕ_lo * dϕ_hi - d2 = sqrt(max(desc, zero(desc))) - return ifelse( - desc < 0, (a_lo + a_hi) / 2, - a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2)) - ) -end - -# Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6) -@inline function _zoom(f, grad_f, p, x, dir, lb, ub, a_lo, a_hi, ϕ_0, dϕ_0, ϕ_lo, c1, c2) - T = eltype(x) - xn_out, ϕ_out, gn_out, ok_out = x, ϕ_0, zero(typeof(x)), false - - # Cache endpoint values; updated only when endpoints shift - ϕ_hi = _eval(f, p, x + a_hi * dir, lb, ub) - dϕ_lo = dot(_grad(grad_f, p, x + a_lo * dir, lb, ub), dir) - dϕ_hi = dot(_grad(grad_f, p, x + a_hi * dir, lb, ub), dir) - - done = false - for _ in 1:10 - if !done - a_j = _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi) - a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3)) - xn_j = clamp.(x + a_j * dir, lb, ub) - ϕ_j = _eval(f, p, xn_j, lb, ub) - if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo) - a_hi, ϕ_hi = a_j, ϕ_j - dϕ_hi = dot(_grad(grad_f, p, xn_j, lb, ub), dir) - else - gn_j = _grad(grad_f, p, xn_j, lb, ub) - dϕ_j = dot(gn_j, dir) - if abs(dϕ_j) <= -c2 * dϕ_0 - xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true - else - if dϕ_j * (a_hi - a_lo) >= zero(T) - a_hi, ϕ_hi, dϕ_hi = a_lo, ϕ_lo, dϕ_lo - end - a_lo, ϕ_lo, dϕ_lo = a_j, ϕ_j, dϕ_j - end - end - end - end - if !done - xn = clamp.(x + a_lo * dir, lb, ub) - xn_out, ϕ_out, gn_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub) - end - return (xn_out, ϕ_out, gn_out, ok_out) -end - -# Strong Wolfe line search (Nocedal & Wright Algorithm 3.5) -@inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) - T = eltype(x) - c1, c2 = T(1.0e-4), T(0.9) - dϕ_0 = dot(g, dir) - xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false - if dϕ_0 < zero(T) # valid descent direction - a_prev, a_i, ϕ_0, ϕ_prev, done = zero(T), one(T), fx, fx, false - for i in 1:10 - if !done - xn = clamp.(x + a_i * dir, lb, ub) - ϕ_i = _eval(f, p, xn, lb, ub) - if (ϕ_i > ϕ_0 + c1 * a_i * dϕ_0) || (ϕ_i >= ϕ_prev && i > 1) - xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_prev, a_i, ϕ_0, dϕ_0, ϕ_prev, c1, c2)..., true - else - gn_i = _grad(grad_f, p, xn, lb, ub) - dϕ_i = dot(gn_i, dir) - if abs(dϕ_i) <= -c2 * dϕ_0 - xn_out, ϕ_out, gn_out, ok_out, done = xn, ϕ_i, gn_i, true, true - elseif dϕ_i >= zero(T) - xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_i, a_prev, ϕ_0, dϕ_0, ϕ_i, c1, c2)..., true - else - a_prev, ϕ_prev, a_i = a_i, ϕ_i, a_i * T(2) - end - end - end - end - if !done - xn = clamp.(x + a_prev * dir, lb, ub) - xn_out, ϕ_out, gn_out, ok_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub), true - end - end - return (xn_out, ϕ_out, gn_out, ok_out) +using LineSearch +using NonlinearSolveQuasiNewton + +# Safety box around the original feasible region. Local refinement may step +# modestly outside `[lb, ub]`, but past this margin we reject the trial point +# instead of evaluating f there — e.g. F19 Griewank-Rosenbrock overflows to Inf +# for |θ| ~ 1e10 in Float32 and then cos(Inf) throws DomainError. +@inline _in_safe_box(θ, ::Nothing, ::Nothing) = all(isfinite, θ) +@inline function _in_safe_box(θ::AbstractArray{T}, lb, ub) where {T} + all(isfinite, θ) || return false + w = ub .- lb + return all(θ .>= lb .- T(2) .* w) && all(θ .<= ub .+ T(2) .* w) end -# L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4) -@inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M} - T = eltype(g) - q, a = g, ntuple(_ -> zero(T), Val(M)) - for j in 0:(M - 1) - idx = k - j - if idx >= 1 - ii = mod1(idx, M) - a = Base.setindex(a, Rho[ii] * dot(S[ii], q), ii) - q = q - a[ii] * Y[ii] - end - end - kk = mod1(k, M) - sy, yy = dot(S[kk], Y[kk]), sum(abs2, Y[kk]) - γ = sy / yy - γ = ifelse(k >= 1 && yy > T(1.0e-30) && isfinite(γ) && γ > zero(T), γ, one(T)) - r = γ * q - for j in (M - 1):-1:0 - idx = k - j - if idx >= 1 - ii = mod1(idx, M) - r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii] - end - end - return -r -end +# Huge-magnitude gradient: makes the Strong Wolfe Armijo test fail so the line +# search rejects the trial step without us ever calling f at the bad point. +@inline _huge_grad(θ::AbstractArray{T}) where {T} = map(_ -> T(1.0e15), θ) -# Run L-BFGS independently per starting point; tuples for GPU-compatible history buffers -@kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, result_fx, lb, ub, maxiters, ::Val{M}) where {M} +@kernel function simplebfgs_run!(nlprob, x0s, result, opt, maxiters, abstol, reltol, grad_f) i = @index(Global, Linear) - x = clamp.(x0s[i], lb, ub) - T = eltype(x) - z = zero(typeof(x)) - S, Y = ntuple(_ -> z, Val(M)), ntuple(_ -> z, Val(M)) - Rho = ntuple(_ -> zero(T), Val(M)) - fx = _eval(f, p, x, lb, ub) - g = _grad(grad_f, p, x, lb, ub) - k, active = 0, isfinite(fx) && all(isfinite, g) - for _ in 1:maxiters - if active && norm(g) >= T(1.0e-7) - dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k) - if dot(g, dir) >= zero(T) # reset to steepest descent - dir, k = -g, 0 - end - xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub) - if !ok # fall back to steepest descent - xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub) - k = 0 - end - if ok && isfinite(fn) && all(isfinite, gn) - s, y = xn - x, gn - g - sy = dot(s, y) - if isfinite(one(T) / sy) && sy > T(1.0e-10) # curvature condition - k += 1; ii = mod1(k, M) - S = Base.setindex(S, s, ii) - Y = Base.setindex(Y, y, ii) - Rho = Base.setindex(Rho, one(T) / sy, ii) - else - k = 0 - end - x, g, fx = xn, gn, fn - else - active = false - end - end - end - @inbounds result[i] = x - @inbounds result_fx[i] = fx + nlcache = SciMLBase.init( + nlprob, opt; u0 = x0s[i], maxiters, abstol, reltol, grad_f, + kwargshandle = SciMLBase.KeywordArgSilent + ) + sol = SciMLBase.solve!(nlcache) + @inbounds result[i] = sol.u end -# PSO for global exploration, then L-BFGS for local refinement on all particles +# HybridPSO: global PSO exploration, then per-particle local BFGS refinement. function SciMLBase.solve!( cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...; - abstol = nothing, reltol = nothing, maxiters = 100, - local_maxiters = 50, kwargs... + abstol = nothing, reltol = nothing, maxiters = 100, local_maxiters = 10, + linesearch = LineSearch.StrongWolfeLineSearch(), kwargs... ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}} sol_pso = SciMLBase.solve!(cache.pso_cache; maxiters = maxiters) - prob = cache.prob - f_raw, p = prob.f.f, prob.p - lb = prob.lb === nothing ? convert.(eltype(prob.u0), -Inf) : prob.lb - ub = prob.ub === nothing ? convert.(eltype(prob.u0), Inf) : prob.ub - - best_u = sol_pso.u - best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[] + orig_lb = cache.prob.lb + orig_ub = cache.prob.ub + prob = remake(cache.prob, lb = nothing, ub = nothing) x0s = sol_pso.original - n = length(x0s) - m_val = length(prob.u0) > 20 ? Val(5) : Val(10) + raw_grad = instantiate_gradient(prob.f.f, prob.f.adtype) + grad_f = (θ, p) -> + _in_safe_box(θ, orig_lb, orig_ub) ? raw_grad(θ, p) : _huge_grad(θ) - grad_f = instantiate_gradient(f_raw, prob.f.adtype) - t0 = time() + result = cache.start_points + copyto!(result, x0s) - result = similar(x0s) - result_fx = KernelAbstractions.allocate(opt.backend, typeof(best_obj), n) + nlprob = SimpleNonlinearSolve.ImmutableNonlinearProblem{false}(grad_f, prob.u0, prob.p) + nlalg = opt.local_opt isa LBFGS ? + NonlinearSolveQuasiNewton.LimitedMemoryBroyden(; + threshold = opt.local_opt.threshold, + linesearch + ) : + NonlinearSolveQuasiNewton.Broyden(; linesearch) - kernel = lbfgs_kernel!(opt.backend) - kernel(f_raw, grad_f, p, x0s, result, result_fx, lb, ub, local_maxiters, m_val; ndrange = n) + t0 = time() + kernel = simplebfgs_run!(opt.backend) + kernel( + nlprob, x0s, result, nlalg, local_maxiters, abstol, reltol, grad_f; + ndrange = length(x0s) + ) KernelAbstractions.synchronize(opt.backend) - minobj, ind = findmin(result_fx) - if minobj < best_obj - best_obj = minobj - best_u = Array(view(result, ind:ind))[1] + Tobj = eltype(prob.u0) + result_fx = map(eachindex(result)) do i + x = result[i] + _in_safe_box(x, orig_lb, orig_ub) || return convert(Tobj, Inf) + v = prob.f(x, prob.p) + isnan(v) ? convert(Tobj, Inf) : v end + minobj, ind = findmin(result_fx) + best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[] + best_u, best_obj = minobj > best_obj ? (sol_pso.u, best_obj) : (result[ind], minobj) + solve_time = (time() - t0) + sol_pso.stats.time return SciMLBase.build_solution( SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj; From 1aa86658f8b88f98963aaef79bfdae5d815cc365 Mon Sep 17 00:00:00 2001 From: AdityaPandeyCN Date: Sun, 17 May 2026 22:34:51 +0530 Subject: [PATCH 7/9] update Project.toml Signed-off-by: AdityaPandeyCN --- Project.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Project.toml b/Project.toml index 7c97061..e8142c4 100644 --- a/Project.toml +++ b/Project.toml @@ -9,7 +9,9 @@ DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +LineSearch = "87fe0de2-c867-4266-b59a-2f0a94fc965b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +NonlinearSolveQuasiNewton = "9a2c21bd-3a47-402d-9113-8faf9a0ee114" Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b" @@ -28,6 +30,8 @@ Enzyme = "0.13" ForwardDiff = "0.10, 1.3" JET = "0.9, 0.10, 0.11" KernelAbstractions = "0.9" +LineSearch = "0.1" +NonlinearSolveQuasiNewton = "1" Optimization = "4.1, 5.2" PrecompileTools = "1" QuasiMonteCarlo = "0.3" From f6c5d1b015603936dee340516e746369d8acd712 Mon Sep 17 00:00:00 2001 From: AdityaPandeyCN Date: Sun, 17 May 2026 23:17:50 +0530 Subject: [PATCH 8/9] gpu-compat Signed-off-by: AdityaPandeyCN --- src/hybrid.jl | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/hybrid.jl b/src/hybrid.jl index a0ea1fc..122771a 100644 --- a/src/hybrid.jl +++ b/src/hybrid.jl @@ -4,10 +4,7 @@ using Optimization using LineSearch using NonlinearSolveQuasiNewton -# Safety box around the original feasible region. Local refinement may step -# modestly outside `[lb, ub]`, but past this margin we reject the trial point -# instead of evaluating f there — e.g. F19 Griewank-Rosenbrock overflows to Inf -# for |θ| ~ 1e10 in Float32 and then cos(Inf) throws DomainError. +"Check that `θ` lies within a slack-expanded box around `[lb, ub]`." @inline _in_safe_box(θ, ::Nothing, ::Nothing) = all(isfinite, θ) @inline function _in_safe_box(θ::AbstractArray{T}, lb, ub) where {T} all(isfinite, θ) || return false @@ -15,21 +12,27 @@ using NonlinearSolveQuasiNewton return all(θ .>= lb .- T(2) .* w) && all(θ .<= ub .+ T(2) .* w) end -# Huge-magnitude gradient: makes the Strong Wolfe Armijo test fail so the line -# search rejects the trial step without us ever calling f at the bad point. +"Sentinel gradient with huge magnitude used to reject out-of-box trial points." @inline _huge_grad(θ::AbstractArray{T}) where {T} = map(_ -> T(1.0e15), θ) -@kernel function simplebfgs_run!(nlprob, x0s, result, opt, maxiters, abstol, reltol, grad_f) +"Per-particle local quasi-Newton refinement kernel. The algorithm is built inside the kernel because `QuasiNewtonAlgorithm` is not isbits." +@kernel function simplebfgs_run!( + nlprob, x0s, result, linesearch, ::Val{Threshold}, ::Val{IsLBFGS}, + maxiters, abstol, reltol, grad_f + ) where {Threshold, IsLBFGS} i = @index(Global, Linear) + nlalg = IsLBFGS ? + NonlinearSolveQuasiNewton.LimitedMemoryBroyden(; threshold = Val(Threshold), linesearch) : + NonlinearSolveQuasiNewton.Broyden(; linesearch) nlcache = SciMLBase.init( - nlprob, opt; u0 = x0s[i], maxiters, abstol, reltol, grad_f, + nlprob, nlalg; u0 = x0s[i], maxiters, abstol, reltol, grad_f, kwargshandle = SciMLBase.KeywordArgSilent ) sol = SciMLBase.solve!(nlcache) @inbounds result[i] = sol.u end -# HybridPSO: global PSO exploration, then per-particle local BFGS refinement. +"Hybrid PSO solve: global PSO exploration followed by per-particle local quasi-Newton refinement." function SciMLBase.solve!( cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...; abstol = nothing, reltol = nothing, maxiters = 100, local_maxiters = 10, @@ -51,17 +54,14 @@ function SciMLBase.solve!( copyto!(result, x0s) nlprob = SimpleNonlinearSolve.ImmutableNonlinearProblem{false}(grad_f, prob.u0, prob.p) - nlalg = opt.local_opt isa LBFGS ? - NonlinearSolveQuasiNewton.LimitedMemoryBroyden(; - threshold = opt.local_opt.threshold, - linesearch - ) : - NonlinearSolveQuasiNewton.Broyden(; linesearch) + is_lbfgs_val = Val(opt.local_opt isa LBFGS) + threshold_val = opt.local_opt isa LBFGS ? Val(opt.local_opt.threshold) : Val(0) t0 = time() kernel = simplebfgs_run!(opt.backend) kernel( - nlprob, x0s, result, nlalg, local_maxiters, abstol, reltol, grad_f; + nlprob, x0s, result, linesearch, threshold_val, is_lbfgs_val, + local_maxiters, abstol, reltol, grad_f; ndrange = length(x0s) ) KernelAbstractions.synchronize(opt.backend) From c95cf6084a4f8f61d2d95920bd7f697bffb1a885 Mon Sep 17 00:00:00 2001 From: AdityaPandeyCN Date: Mon, 18 May 2026 08:23:20 +0530 Subject: [PATCH 9/9] custom lbfgs --- Project.toml | 2 - src/hybrid.jl | 124 +++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 102 insertions(+), 24 deletions(-) diff --git a/Project.toml b/Project.toml index e8142c4..f603359 100644 --- a/Project.toml +++ b/Project.toml @@ -11,7 +11,6 @@ ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LineSearch = "87fe0de2-c867-4266-b59a-2f0a94fc965b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -NonlinearSolveQuasiNewton = "9a2c21bd-3a47-402d-9113-8faf9a0ee114" Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b" @@ -31,7 +30,6 @@ ForwardDiff = "0.10, 1.3" JET = "0.9, 0.10, 0.11" KernelAbstractions = "0.9" LineSearch = "0.1" -NonlinearSolveQuasiNewton = "1" Optimization = "4.1, 5.2" PrecompileTools = "1" QuasiMonteCarlo = "0.3" diff --git a/src/hybrid.jl b/src/hybrid.jl index 122771a..2e31e18 100644 --- a/src/hybrid.jl +++ b/src/hybrid.jl @@ -1,8 +1,8 @@ +using LinearAlgebra: dot using KernelAbstractions using SciMLBase using Optimization using LineSearch -using NonlinearSolveQuasiNewton "Check that `θ` lies within a slack-expanded box around `[lb, ub]`." @inline _in_safe_box(θ, ::Nothing, ::Nothing) = all(isfinite, θ) @@ -15,24 +15,96 @@ end "Sentinel gradient with huge magnitude used to reject out-of-box trial points." @inline _huge_grad(θ::AbstractArray{T}) where {T} = map(_ -> T(1.0e15), θ) -"Per-particle local quasi-Newton refinement kernel. The algorithm is built inside the kernel because `QuasiNewtonAlgorithm` is not isbits." -@kernel function simplebfgs_run!( - nlprob, x0s, result, linesearch, ::Val{Threshold}, ::Val{IsLBFGS}, - maxiters, abstol, reltol, grad_f - ) where {Threshold, IsLBFGS} +"L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4) over cyclic memory of length `M`." +@inline function _lbfgs_direction(g, S, Y, Rho, ::Val{M}, k) where {M} + T = eltype(g) + q = g + a = ntuple(_ -> zero(T), Val(M)) + for j in 0:(M - 1) + idx = k - j + if idx >= 1 + ii = mod1(idx, M) + aii = Rho[ii] * dot(S[ii], q) + a = Base.setindex(a, aii, ii) + q = q - aii * Y[ii] + end + end + γ = if k >= 1 + kk = mod1(k, M) + yy = sum(abs2, Y[kk]) + sy = dot(S[kk], Y[kk]) + ifelse(yy > T(1.0e-30) && sy > zero(T), sy / yy, one(T)) + else + one(T) + end + r = γ * q + for j in (M - 1):-1:0 + idx = k - j + if idx >= 1 + ii = mod1(idx, M) + β = Rho[ii] * dot(Y[ii], r) + r = r + (a[ii] - β) * S[ii] + end + end + return -r +end + +"Per-particle hand-written L-BFGS with `LineSearch.StrongWolfeLineSearch`. Avoids `SciMLBase.init`, whose `promote_u0` path uses dynamic dispatch (invalid GPU IR)." +@kernel function lbfgs_run!( + grad_f, p, x0s, result, ls_cache, ::Val{M}, maxiters + ) where {M} i = @index(Global, Linear) - nlalg = IsLBFGS ? - NonlinearSolveQuasiNewton.LimitedMemoryBroyden(; threshold = Val(Threshold), linesearch) : - NonlinearSolveQuasiNewton.Broyden(; linesearch) - nlcache = SciMLBase.init( - nlprob, nlalg; u0 = x0s[i], maxiters, abstol, reltol, grad_f, - kwargshandle = SciMLBase.KeywordArgSilent - ) - sol = SciMLBase.solve!(nlcache) - @inbounds result[i] = sol.u + x = x0s[i] + T = eltype(x) + g = grad_f(x, p) + + z = zero(typeof(x)) + S = ntuple(_ -> z, Val(M)) + Y = ntuple(_ -> z, Val(M)) + Rho = ntuple(_ -> zero(T), Val(M)) + k = 0 + active = all(isfinite, g) + + for _ in 1:maxiters + if active + dir = _lbfgs_direction(g, S, Y, Rho, Val(M), k) + if dot(g, dir) >= zero(T) + dir = -g + k = 0 + end + ls_sol = SciMLBase.solve!(ls_cache, x, dir) + α = T(ls_sol.step_size) + ok = ls_sol.retcode == ReturnCode.Success + if ok && isfinite(α) && α > zero(T) + xn = x + α * dir + gn = grad_f(xn, p) + if all(isfinite, gn) + s = xn - x + y = gn - g + sy = dot(s, y) + if sy > T(1.0e-10) && isfinite(one(T) / sy) + k += 1 + ii = mod1(k, M) + S = Base.setindex(S, s, ii) + Y = Base.setindex(Y, y, ii) + Rho = Base.setindex(Rho, one(T) / sy, ii) + else + k = 0 + end + x = xn + g = gn + else + active = false + end + else + active = false + end + end + end + @inbounds result[i] = x end -"Hybrid PSO solve: global PSO exploration followed by per-particle local quasi-Newton refinement." +"Hybrid PSO solve: global PSO exploration followed by per-particle local L-BFGS refinement." function SciMLBase.solve!( cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...; abstol = nothing, reltol = nothing, maxiters = 100, local_maxiters = 10, @@ -53,15 +125,23 @@ function SciMLBase.solve!( result = cache.start_points copyto!(result, x0s) - nlprob = SimpleNonlinearSolve.ImmutableNonlinearProblem{false}(grad_f, prob.u0, prob.p) - is_lbfgs_val = Val(opt.local_opt isa LBFGS) - threshold_val = opt.local_opt isa LBFGS ? Val(opt.local_opt.threshold) : Val(0) + T = eltype(prob.u0) + D = length(prob.u0) + M_val = opt.local_opt isa LBFGS ? Val(min(opt.local_opt.threshold, D)) : Val(D) + + # Construct the line-search cache directly: `LineSearch.init` would route + # through `SciMLBase.init`'s `promote_u0`, which dynamic-dispatches on GPU. + ls_cache = LineSearch.StaticStrongWolfeLineSearchCache( + grad_f, grad_f, prob.p, + T(linesearch.c1), T(linesearch.c2), + T(linesearch.α_init), T(linesearch.α_max), + linesearch.maxiters, linesearch.zoom_maxiters + ) t0 = time() - kernel = simplebfgs_run!(opt.backend) + kernel = lbfgs_run!(opt.backend) kernel( - nlprob, x0s, result, linesearch, threshold_val, is_lbfgs_val, - local_maxiters, abstol, reltol, grad_f; + grad_f, prob.p, x0s, result, ls_cache, M_val, local_maxiters; ndrange = length(x0s) ) KernelAbstractions.synchronize(opt.backend)