From cd517127536ba786ca52ea4b660f70adb20068f0 Mon Sep 17 00:00:00 2001
From: AdityaPandeyCN <adityapand3y666@gmail.com>
Date: Thu, 2 Apr 2026 16:04:21 +0530
Subject: [PATCH 1/9] rewrite L-BFGS with custom Strong Wolfe line search

Signed-off-by: AdityaPandeyCN <adityapand3y666@gmail.com>
---
 Project.toml  |   2 +-
 src/hybrid.jl | 253 ++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 206 insertions(+), 49 deletions(-)

diff --git a/Project.toml b/Project.toml
index a4c76f8..05e182e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,6 +8,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
@@ -39,7 +40,6 @@ julia = "1.10"
 
 [extras]
 JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
diff --git a/src/hybrid.jl b/src/hybrid.jl
index 60f80ed..8b0302d 100644
--- a/src/hybrid.jl
+++ b/src/hybrid.jl
@@ -1,69 +1,226 @@
-@kernel function simplebfgs_run!(nlprob, x0s, result, opt, maxiters, abstol, reltol)
+using LinearAlgebra: norm, dot
+using KernelAbstractions
+using SciMLBase
+using Optimization
+
+# Clamp x to interior of bounds and evaluate objective
+@inline function _safe_eval(f, p, x, lb, ub)
+    T = eltype(x)
+    ε = T(1e-6)
+    xc = clamp.(x, lb .+ ε, ub .- ε)
+    xc = map(xi -> abs(xi) < ε ? ε : xi, xc)
+    v = f(xc, p)
+    ifelse(isfinite(v), v, T(Inf))
+end
+
+# Clamp x to interior of bounds and evaluate gradient
+@inline function _safe_grad(grad_f, p, x, lb, ub)
+    T = eltype(x)
+    ε = T(1e-6)
+    xc = clamp.(x, lb .+ ε, ub .- ε)
+    xc = map(xi -> abs(xi) < ε ? ε : xi, xc)
+    g = grad_f(xc, p)
+    map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g)
+end
+
+# Cubic interpolation for line search; falls back to bisection if non-convex
+@inline function _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi)
+    d1 = dϕ_lo + dϕ_hi - 3 * (ϕ_lo - ϕ_hi) / (a_lo - a_hi)
+    desc = d1 * d1 - dϕ_lo * dϕ_hi
+    # Use max to ensure non-negative argument to sqrt (avoids DomainError with ForwardDiff)
+    d2 = sqrt(max(desc, zero(desc)))
+    ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2)))
+end
+
+# Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6)
+@inline function _zoom(f, grad_f, p, x, dir, lb, ub, a_lo, a_hi, ϕ_0, dϕ_0, ϕ_lo, c1, c2)
+    T = eltype(x)
+    xn_out, ϕ_out, gn_out, ok_out = x, ϕ_0, _safe_grad(grad_f, p, x, lb, ub), false
+    done = false
+    for _ in 1:10
+        if !done
+            a_j = _interpolate(a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub),
+                dot(_safe_grad(grad_f, p, clamp.(x + a_lo * dir, lb, ub), lb, ub), dir),
+                dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir))
+            a_j = clamp(a_j, min(a_lo, a_hi) + T(1e-3), max(a_lo, a_hi) - T(1e-3))
+            xn_j = clamp.(x + a_j * dir, lb, ub)
+            ϕ_j = _safe_eval(f, p, xn_j, lb, ub)
+            if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo)
+                a_hi = a_j
+            else
+                gn_j = _safe_grad(grad_f, p, xn_j, lb, ub)
+                dϕ_j = dot(gn_j, dir)
+                if abs(dϕ_j) <= -c2 * dϕ_0
+                    xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true
+                else
+                    if dϕ_j * (a_hi - a_lo) >= zero(T); a_hi = a_lo; end
+                    a_lo, ϕ_lo = a_j, ϕ_j
+                end
+            end
+        end
+    end
+    if !done
+        xn_lo = clamp.(x + a_lo * dir, lb, ub)
+        xn_out, ϕ_out, gn_out = xn_lo, _safe_eval(f, p, xn_lo, lb, ub), _safe_grad(grad_f, p, xn_lo, lb, ub)
+    end
+    (xn_out, ϕ_out, gn_out, ok_out)
+end
+
+# Strong Wolfe line search (Nocedal & Wright Algorithm 3.5)
+@inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
+    T = eltype(x)
+    c1, c2 = T(1e-4), T(0.9)
+    dϕ_0 = dot(g, dir)
+    xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false
+    if dϕ_0 < zero(T)
+        a_prev, a_i, ϕ_0, ϕ_prev, done = zero(T), one(T), fx, fx, false
+        for i in 1:10
+            if !done
+                xn = clamp.(x + a_i * dir, lb, ub)
+                ϕ_i = _safe_eval(f, p, xn, lb, ub)
+                if (ϕ_i > ϕ_0 + c1 * a_i * dϕ_0) || (ϕ_i >= ϕ_prev && i > 1)
+                    xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_prev, a_i, ϕ_0, dϕ_0, ϕ_prev, c1, c2)..., true
+                else
+                    gn_i = _safe_grad(grad_f, p, xn, lb, ub)
+                    dϕ_i = dot(gn_i, dir)
+                    if abs(dϕ_i) <= -c2 * dϕ_0
+                        xn_out, ϕ_out, gn_out, ok_out, done = xn, ϕ_i, gn_i, true, true
+                    elseif dϕ_i >= zero(T)
+                        xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_i, a_prev, ϕ_0, dϕ_0, ϕ_i, c1, c2)..., true
+                    else
+                        a_prev, ϕ_prev, a_i = a_i, ϕ_i, a_i * T(2.0)
+                    end
+                end
+            end
+        end
+        if !done
+            xn = clamp.(x + a_prev * dir, lb, ub)
+            xn_out, ϕ_out, gn_out, ok_out = xn, _safe_eval(f, p, xn, lb, ub), _safe_grad(grad_f, p, xn, lb, ub), true
+        end
+    end
+    (xn_out, ϕ_out, gn_out, ok_out)
+end
+
+# L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4)
+@inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M}
+    T = eltype(g)
+    q, a = g, ntuple(_ -> zero(T), Val(M))
+    for j in 0:M-1
+        idx = k - j
+        if idx >= 1
+            ii = mod1(idx, M)
+            a = Base.setindex(a, Rho[ii] * dot(S[ii], q), ii)
+            q = q - a[ii] * Y[ii]
+        end
+    end
+    kk = mod1(k, M)
+    yy = sum(abs2, Y[kk])
+    γ = ifelse(k >= 1 && yy > T(1e-30), dot(S[kk], Y[kk]) / yy, one(T))
+    γ = ifelse(isfinite(γ) && γ > zero(T), γ, one(T))
+    r = γ * q
+    for j in M-1:-1:0
+        idx = k - j
+        if idx >= 1
+            ii = mod1(idx, M)
+            r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii]
+        end
+    end
+    -r
+end
+
+# Run L-BFGS independently on each starting point
+@kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, lb, ub, maxiters, ::Val{M}) where {M}
     i = @index(Global, Linear)
-    nlcache = remake(nlprob; u0 = x0s[i])
-    sol = solve(nlcache, opt; maxiters, abstol, reltol)
-    @inbounds result[i] = sol.u
+    x = clamp.(x0s[i], lb, ub)
+    T = eltype(x)
+    z = zero(typeof(x))
+    S, Y = ntuple(_ -> z, Val(M)), ntuple(_ -> z, Val(M))
+    Rho = ntuple(_ -> zero(T), Val(M))
+    fx = _safe_eval(f, p, x, lb, ub)
+    g = _safe_grad(grad_f, p, x, lb, ub)
+    k, active = 0, isfinite(fx) && all(isfinite, g)
+    for _ in 1:maxiters
+        if active && norm(g) >= T(1e-7)
+            dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k)
+            if dot(g, dir) >= zero(T); dir, k = -g, 0; end
+            xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
+            if !ok; xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0; end
+            if ok && isfinite(fn) && all(isfinite, gn)
+                s, y = xn - x, gn - g
+                sy = dot(s, y)
+                if isfinite(one(T) / sy) && sy > T(1e-10)
+                    k += 1; ii = mod1(k, M)
+                    S, Y, Rho = Base.setindex(S, s, ii), Base.setindex(Y, y, ii), Base.setindex(Rho, one(T) / sy, ii)
+                else
+                    k = 0
+                end
+                x, g, fx = xn, gn, fn
+            else
+                active = false
+            end
+        end
+    end
+    @inbounds result[i] = x
 end
 
+# Main solve: runs PSO for global exploration, then L-BFGS for local refinement
 function SciMLBase.solve!(
         cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...;
-        abstol = nothing,
-        reltol = nothing,
-        maxiters = 100, local_maxiters = 10, kwargs...
-    ) where {
-        Backend, LocalOpt <: Union{LBFGS, BFGS},
-    }
-    pso_cache = cache.pso_cache
+        abstol = nothing, reltol = nothing, maxiters = 100,
+        local_maxiters = 50, n_starts = 20, kwargs...
+    ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}}
 
-    sol_pso = solve!(pso_cache)
-    x0s = sol_pso.original
+    # Phase 1: Global search with PSO
+    sol_pso = SciMLBase.solve!(cache.pso_cache; maxiters = maxiters)
 
-    backend = opt.backend
+    prob = cache.prob
+    f_raw, p = prob.f.f, prob.p
+    lb = prob.lb === nothing ? convert.(eltype(prob.u0), -Inf) : prob.lb
+    ub = prob.ub === nothing ? convert.(eltype(prob.u0), Inf) : prob.ub
 
-    prob = remake(cache.prob, lb = nothing, ub = nothing)
+    best_u = sol_pso.u
+    best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[]
 
-    result = cache.start_points
-    copyto!(result, x0s)
+    _obj(x) = let v = prob.f(clamp.(x, lb, ub), p)
+        (isnan(v) || isinf(v)) ? convert(eltype(best_obj), Inf) : v
+    end
 
-    ∇f = instantiate_gradient(prob.f.f, prob.f.adtype)
+    # Build candidate pool: inject global best into particle positions
+    x0s = copy(sol_pso.original)
+    x0s[1] = best_u
 
-    kernel = simplebfgs_run!(backend)
-    nlprob = SimpleNonlinearSolve.ImmutableNonlinearProblem{false}(∇f, prob.u0, prob.p)
+    costs = map(_obj, x0s)
+    n = min(n_starts, length(x0s))
+    pool = [x0s[j] for j in partialsortperm(Vector(costs), 1:n)]
 
-    nlalg = opt.local_opt isa LBFGS ?
-        SimpleLimitedMemoryBroyden(;
-            threshold = opt.local_opt.threshold,
-            linesearch = Val(true)
-        ) : SimpleBroyden(; linesearch = Val(true))
+    D = length(first(pool))
+    m_val = D > 20 ? Val(5) : Val(10)
 
+    grad_f = instantiate_gradient(f_raw, prob.f.adtype)
     t0 = time()
-    kernel(
-        nlprob,
-        x0s,
-        result,
-        nlalg,
-        local_maxiters,
-        abstol,
-        reltol;
-        ndrange = length(x0s)
-    )
 
-    sol_bfgs = (x -> prob.f(x, prob.p)).(result)
-    sol_bfgs = (x -> isnan(x) ? convert(eltype(prob.u0), Inf) : x).(sol_bfgs)
+    result = similar(pool)
+    copyto!(result, pool)
 
-    minobj, ind = findmin(sol_bfgs)
-    sol_u,
-        sol_obj = minobj > sol_pso.objective ? (sol_pso.u, sol_pso.objective) :
-        (view(result, ind), minobj)
-    t1 = time()
+    # Phase 2: Local refinement with L-BFGS on top candidates
+    kernel = lbfgs_kernel!(opt.backend)
+    kernel(f_raw, grad_f, p, pool, result, lb, ub, local_maxiters, m_val; ndrange = n)
+    KernelAbstractions.synchronize(opt.backend)
 
-    # @show sol_pso.stats.time
+    # Select best result across all refined candidates
+    for j in 1:n
+        r = clamp.(result[j], lb, ub)
+        fval = _obj(r)
+        if fval < best_obj
+            best_obj = fval
+            best_u = r
+        end
+    end
 
-    solve_time = (t1 - t0) + sol_pso.stats.time
+    solve_time = (time() - t0) + sol_pso.stats.time
 
-    return SciMLBase.build_solution(
-        SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
-        sol_u, sol_obj,
+    SciMLBase.build_solution(
+        SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj;
         stats = Optimization.OptimizationStats(; time = solve_time)
     )
-end
+end
\ No newline at end of file

From e90ef08a0204c2896a215f0b5d5bda40b2b34cf0 Mon Sep 17 00:00:00 2001
From: AdityaPandeyCN <adityapand3y666@gmail.com>
Date: Thu, 2 Apr 2026 16:28:39 +0530
Subject: [PATCH 2/9] formatting

Signed-off-by: AdityaPandeyCN <adityapand3y666@gmail.com>
---
 src/hybrid.jl | 52 +++++++++++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 22 deletions(-)

diff --git a/src/hybrid.jl b/src/hybrid.jl
index 8b0302d..ca1b4eb 100644
--- a/src/hybrid.jl
+++ b/src/hybrid.jl
@@ -6,21 +6,21 @@ using Optimization
 # Clamp x to interior of bounds and evaluate objective
 @inline function _safe_eval(f, p, x, lb, ub)
     T = eltype(x)
-    ε = T(1e-6)
+    ε = T(1.0e-6)
     xc = clamp.(x, lb .+ ε, ub .- ε)
     xc = map(xi -> abs(xi) < ε ? ε : xi, xc)
     v = f(xc, p)
-    ifelse(isfinite(v), v, T(Inf))
+    return ifelse(isfinite(v), v, T(Inf))
 end
 
 # Clamp x to interior of bounds and evaluate gradient
 @inline function _safe_grad(grad_f, p, x, lb, ub)
     T = eltype(x)
-    ε = T(1e-6)
+    ε = T(1.0e-6)
     xc = clamp.(x, lb .+ ε, ub .- ε)
     xc = map(xi -> abs(xi) < ε ? ε : xi, xc)
     g = grad_f(xc, p)
-    map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g)
+    return map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g)
 end
 
 # Cubic interpolation for line search; falls back to bisection if non-convex
@@ -29,7 +29,7 @@ end
     desc = d1 * d1 - dϕ_lo * dϕ_hi
     # Use max to ensure non-negative argument to sqrt (avoids DomainError with ForwardDiff)
     d2 = sqrt(max(desc, zero(desc)))
-    ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2)))
+    return ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2)))
 end
 
 # Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6)
@@ -39,10 +39,12 @@ end
     done = false
     for _ in 1:10
         if !done
-            a_j = _interpolate(a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub),
+            a_j = _interpolate(
+                a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub),
                 dot(_safe_grad(grad_f, p, clamp.(x + a_lo * dir, lb, ub), lb, ub), dir),
-                dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir))
-            a_j = clamp(a_j, min(a_lo, a_hi) + T(1e-3), max(a_lo, a_hi) - T(1e-3))
+                dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir)
+            )
+            a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3))
             xn_j = clamp.(x + a_j * dir, lb, ub)
             ϕ_j = _safe_eval(f, p, xn_j, lb, ub)
             if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo)
@@ -53,7 +55,9 @@ end
                 if abs(dϕ_j) <= -c2 * dϕ_0
                     xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true
                 else
-                    if dϕ_j * (a_hi - a_lo) >= zero(T); a_hi = a_lo; end
+                    if dϕ_j * (a_hi - a_lo) >= zero(T)
+                        a_hi = a_lo
+                    end
                     a_lo, ϕ_lo = a_j, ϕ_j
                 end
             end
@@ -63,13 +67,13 @@ end
         xn_lo = clamp.(x + a_lo * dir, lb, ub)
         xn_out, ϕ_out, gn_out = xn_lo, _safe_eval(f, p, xn_lo, lb, ub), _safe_grad(grad_f, p, xn_lo, lb, ub)
     end
-    (xn_out, ϕ_out, gn_out, ok_out)
+    return (xn_out, ϕ_out, gn_out, ok_out)
 end
 
 # Strong Wolfe line search (Nocedal & Wright Algorithm 3.5)
 @inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
     T = eltype(x)
-    c1, c2 = T(1e-4), T(0.9)
+    c1, c2 = T(1.0e-4), T(0.9)
     dϕ_0 = dot(g, dir)
     xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false
     if dϕ_0 < zero(T)
@@ -98,14 +102,14 @@ end
             xn_out, ϕ_out, gn_out, ok_out = xn, _safe_eval(f, p, xn, lb, ub), _safe_grad(grad_f, p, xn, lb, ub), true
         end
     end
-    (xn_out, ϕ_out, gn_out, ok_out)
+    return (xn_out, ϕ_out, gn_out, ok_out)
 end
 
 # L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4)
 @inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M}
     T = eltype(g)
     q, a = g, ntuple(_ -> zero(T), Val(M))
-    for j in 0:M-1
+    for j in 0:(M - 1)
         idx = k - j
         if idx >= 1
             ii = mod1(idx, M)
@@ -115,17 +119,17 @@ end
     end
     kk = mod1(k, M)
     yy = sum(abs2, Y[kk])
-    γ = ifelse(k >= 1 && yy > T(1e-30), dot(S[kk], Y[kk]) / yy, one(T))
+    γ = ifelse(k >= 1 && yy > T(1.0e-30), dot(S[kk], Y[kk]) / yy, one(T))
     γ = ifelse(isfinite(γ) && γ > zero(T), γ, one(T))
     r = γ * q
-    for j in M-1:-1:0
+    for j in (M - 1):-1:0
         idx = k - j
         if idx >= 1
             ii = mod1(idx, M)
             r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii]
         end
     end
-    -r
+    return -r
 end
 
 # Run L-BFGS independently on each starting point
@@ -140,15 +144,19 @@ end
     g = _safe_grad(grad_f, p, x, lb, ub)
     k, active = 0, isfinite(fx) && all(isfinite, g)
     for _ in 1:maxiters
-        if active && norm(g) >= T(1e-7)
+        if active && norm(g) >= T(1.0e-7)
             dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k)
-            if dot(g, dir) >= zero(T); dir, k = -g, 0; end
+            if dot(g, dir) >= zero(T)
+                dir, k = -g, 0
+            end
             xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
-            if !ok; xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0; end
+            if !ok
+                xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0
+            end
             if ok && isfinite(fn) && all(isfinite, gn)
                 s, y = xn - x, gn - g
                 sy = dot(s, y)
-                if isfinite(one(T) / sy) && sy > T(1e-10)
+                if isfinite(one(T) / sy) && sy > T(1.0e-10)
                     k += 1; ii = mod1(k, M)
                     S, Y, Rho = Base.setindex(S, s, ii), Base.setindex(Y, y, ii), Base.setindex(Rho, one(T) / sy, ii)
                 else
@@ -219,8 +227,8 @@ function SciMLBase.solve!(
 
     solve_time = (time() - t0) + sol_pso.stats.time
 
-    SciMLBase.build_solution(
+    return SciMLBase.build_solution(
         SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj;
         stats = Optimization.OptimizationStats(; time = solve_time)
     )
-end
\ No newline at end of file
+end

From c7a60f6e8c60a1d9a1fbffc59d3973880fdc285a Mon Sep 17 00:00:00 2001
From: AdityaPandeyCN <adityapand3y666@gmail.com>
Date: Thu, 2 Apr 2026 18:15:29 +0530
Subject: [PATCH 3/9] remove scalar indexing

Signed-off-by: AdityaPandeyCN <adityapand3y666@gmail.com>
---
 src/hybrid.jl | 106 +++++++++++++++++++++++---------------------------
 1 file changed, 49 insertions(+), 57 deletions(-)

diff --git a/src/hybrid.jl b/src/hybrid.jl
index ca1b4eb..3cb82e7 100644
--- a/src/hybrid.jl
+++ b/src/hybrid.jl
@@ -6,21 +6,21 @@ using Optimization
 # Clamp x to interior of bounds and evaluate objective
 @inline function _safe_eval(f, p, x, lb, ub)
     T = eltype(x)
-    ε = T(1.0e-6)
+    ε = T(1e-6)
     xc = clamp.(x, lb .+ ε, ub .- ε)
     xc = map(xi -> abs(xi) < ε ? ε : xi, xc)
     v = f(xc, p)
-    return ifelse(isfinite(v), v, T(Inf))
+    ifelse(isfinite(v), v, T(Inf))
 end
 
 # Clamp x to interior of bounds and evaluate gradient
 @inline function _safe_grad(grad_f, p, x, lb, ub)
     T = eltype(x)
-    ε = T(1.0e-6)
+    ε = T(1e-6)
     xc = clamp.(x, lb .+ ε, ub .- ε)
     xc = map(xi -> abs(xi) < ε ? ε : xi, xc)
     g = grad_f(xc, p)
-    return map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g)
+    map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g)
 end
 
 # Cubic interpolation for line search; falls back to bisection if non-convex
@@ -29,7 +29,7 @@ end
     desc = d1 * d1 - dϕ_lo * dϕ_hi
     # Use max to ensure non-negative argument to sqrt (avoids DomainError with ForwardDiff)
     d2 = sqrt(max(desc, zero(desc)))
-    return ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2)))
+    ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2)))
 end
 
 # Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6)
@@ -39,12 +39,10 @@ end
     done = false
     for _ in 1:10
         if !done
-            a_j = _interpolate(
-                a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub),
+            a_j = _interpolate(a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub),
                 dot(_safe_grad(grad_f, p, clamp.(x + a_lo * dir, lb, ub), lb, ub), dir),
-                dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir)
-            )
-            a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3))
+                dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir))
+            a_j = clamp(a_j, min(a_lo, a_hi) + T(1e-3), max(a_lo, a_hi) - T(1e-3))
             xn_j = clamp.(x + a_j * dir, lb, ub)
             ϕ_j = _safe_eval(f, p, xn_j, lb, ub)
             if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo)
@@ -55,9 +53,7 @@ end
                 if abs(dϕ_j) <= -c2 * dϕ_0
                     xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true
                 else
-                    if dϕ_j * (a_hi - a_lo) >= zero(T)
-                        a_hi = a_lo
-                    end
+                    if dϕ_j * (a_hi - a_lo) >= zero(T); a_hi = a_lo; end
                     a_lo, ϕ_lo = a_j, ϕ_j
                 end
             end
@@ -67,16 +63,16 @@ end
         xn_lo = clamp.(x + a_lo * dir, lb, ub)
         xn_out, ϕ_out, gn_out = xn_lo, _safe_eval(f, p, xn_lo, lb, ub), _safe_grad(grad_f, p, xn_lo, lb, ub)
     end
-    return (xn_out, ϕ_out, gn_out, ok_out)
+    (xn_out, ϕ_out, gn_out, ok_out)
 end
 
 # Strong Wolfe line search (Nocedal & Wright Algorithm 3.5)
 @inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
     T = eltype(x)
-    c1, c2 = T(1.0e-4), T(0.9)
+    c1, c2 = T(1e-4), T(0.9)
     dϕ_0 = dot(g, dir)
     xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false
-    if dϕ_0 < zero(T)
+    if dϕ_0 < zero(T)  # valid descent direction
         a_prev, a_i, ϕ_0, ϕ_prev, done = zero(T), one(T), fx, fx, false
         for i in 1:10
             if !done
@@ -102,14 +98,15 @@ end
             xn_out, ϕ_out, gn_out, ok_out = xn, _safe_eval(f, p, xn, lb, ub), _safe_grad(grad_f, p, xn, lb, ub), true
         end
     end
-    return (xn_out, ϕ_out, gn_out, ok_out)
+    (xn_out, ϕ_out, gn_out, ok_out)
 end
 
 # L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4)
 @inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M}
     T = eltype(g)
     q, a = g, ntuple(_ -> zero(T), Val(M))
-    for j in 0:(M - 1)
+    # First loop: newest to oldest
+    for j in 0:M-1
         idx = k - j
         if idx >= 1
             ii = mod1(idx, M)
@@ -117,23 +114,26 @@ end
             q = q - a[ii] * Y[ii]
         end
     end
+    # Compute scaling factor γ
     kk = mod1(k, M)
     yy = sum(abs2, Y[kk])
-    γ = ifelse(k >= 1 && yy > T(1.0e-30), dot(S[kk], Y[kk]) / yy, one(T))
+    γ = ifelse(k >= 1 && yy > T(1e-30), dot(S[kk], Y[kk]) / yy, one(T))
     γ = ifelse(isfinite(γ) && γ > zero(T), γ, one(T))
     r = γ * q
-    for j in (M - 1):-1:0
+    # Second loop: oldest to newest
+    for j in M-1:-1:0
         idx = k - j
         if idx >= 1
             ii = mod1(idx, M)
             r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii]
         end
     end
-    return -r
+    -r
 end
 
 # Run L-BFGS independently on each starting point
-@kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, lb, ub, maxiters, ::Val{M}) where {M}
+# History buffers stored as tuples for GPU compatibility
+@kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, result_fx, lb, ub, maxiters, ::Val{M}) where {M}
     i = @index(Global, Linear)
     x = clamp.(x0s[i], lb, ub)
     T = eltype(x)
@@ -144,19 +144,18 @@ end
     g = _safe_grad(grad_f, p, x, lb, ub)
     k, active = 0, isfinite(fx) && all(isfinite, g)
     for _ in 1:maxiters
-        if active && norm(g) >= T(1.0e-7)
+        if active && norm(g) >= T(1e-7)
             dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k)
-            if dot(g, dir) >= zero(T)
-                dir, k = -g, 0
-            end
+            # Reset to steepest descent if direction is not descent
+            if dot(g, dir) >= zero(T); dir, k = -g, 0; end
             xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
-            if !ok
-                xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0
-            end
+            # Fall back to steepest descent if line search fails
+            if !ok; xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0; end
             if ok && isfinite(fn) && all(isfinite, gn)
                 s, y = xn - x, gn - g
                 sy = dot(s, y)
-                if isfinite(one(T) / sy) && sy > T(1.0e-10)
+                # Update history only if curvature condition holds
+                if isfinite(one(T) / sy) && sy > T(1e-10)
                     k += 1; ii = mod1(k, M)
                     S, Y, Rho = Base.setindex(S, s, ii), Base.setindex(Y, y, ii), Base.setindex(Rho, one(T) / sy, ii)
                 else
@@ -169,13 +168,14 @@ end
         end
     end
     @inbounds result[i] = x
+    @inbounds result_fx[i] = fx
 end
 
 # Main solve: runs PSO for global exploration, then L-BFGS for local refinement
 function SciMLBase.solve!(
         cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...;
         abstol = nothing, reltol = nothing, maxiters = 100,
-        local_maxiters = 50, n_starts = 20, kwargs...
+        local_maxiters = 50, kwargs...
     ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}}
 
     # Phase 1: Global search with PSO
@@ -189,46 +189,38 @@ function SciMLBase.solve!(
     best_u = sol_pso.u
     best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[]
 
-    _obj(x) = let v = prob.f(clamp.(x, lb, ub), p)
-        (isnan(v) || isinf(v)) ? convert(eltype(best_obj), Inf) : v
-    end
-
-    # Build candidate pool: inject global best into particle positions
-    x0s = copy(sol_pso.original)
-    x0s[1] = best_u
+    # Run L-BFGS on all particles
+    x0s = sol_pso.original
+    n = length(x0s)
 
-    costs = map(_obj, x0s)
-    n = min(n_starts, length(x0s))
-    pool = [x0s[j] for j in partialsortperm(Vector(costs), 1:n)]
-
-    D = length(first(pool))
+    D = length(prob.u0)
     m_val = D > 20 ? Val(5) : Val(10)
 
     grad_f = instantiate_gradient(f_raw, prob.f.adtype)
     t0 = time()
 
-    result = similar(pool)
-    copyto!(result, pool)
+    result = similar(x0s)
+    copyto!(result, x0s)
+    result_fx = KernelAbstractions.allocate(opt.backend, typeof(best_obj), n)
 
-    # Phase 2: Local refinement with L-BFGS on top candidates
+    # Phase 2: Local refinement with L-BFGS on all candidates
     kernel = lbfgs_kernel!(opt.backend)
-    kernel(f_raw, grad_f, p, pool, result, lb, ub, local_maxiters, m_val; ndrange = n)
+    kernel(f_raw, grad_f, p, x0s, result, result_fx, lb, ub, local_maxiters, m_val; ndrange = n)
     KernelAbstractions.synchronize(opt.backend)
 
-    # Select best result across all refined candidates
-    for j in 1:n
-        r = clamp.(result[j], lb, ub)
-        fval = _obj(r)
-        if fval < best_obj
-            best_obj = fval
-            best_u = r
-        end
+    # Find best result 
+    minobj, ind = findmin(result_fx)
+
+    # Single bulk transfer for winning solution only
+    if minobj < best_obj
+        best_obj = minobj
+        best_u = Array(view(result, ind:ind))[1]
     end
 
     solve_time = (time() - t0) + sol_pso.stats.time
 
-    return SciMLBase.build_solution(
+    SciMLBase.build_solution(
         SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj;
         stats = Optimization.OptimizationStats(; time = solve_time)
     )
-end
+end
\ No newline at end of file

From 1d3c3b146bef457c8a0ef9764aef778065772706 Mon Sep 17 00:00:00 2001
From: AdityaPandeyCN <adityapand3y666@gmail.com>
Date: Thu, 2 Apr 2026 18:18:05 +0530
Subject: [PATCH 4/9] formatting

Signed-off-by: AdityaPandeyCN <adityapand3y666@gmail.com>
---
 src/hybrid.jl | 54 +++++++++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 23 deletions(-)

diff --git a/src/hybrid.jl b/src/hybrid.jl
index 3cb82e7..945306f 100644
--- a/src/hybrid.jl
+++ b/src/hybrid.jl
@@ -6,21 +6,21 @@ using Optimization
 # Clamp x to interior of bounds and evaluate objective
 @inline function _safe_eval(f, p, x, lb, ub)
     T = eltype(x)
-    ε = T(1e-6)
+    ε = T(1.0e-6)
     xc = clamp.(x, lb .+ ε, ub .- ε)
     xc = map(xi -> abs(xi) < ε ? ε : xi, xc)
     v = f(xc, p)
-    ifelse(isfinite(v), v, T(Inf))
+    return ifelse(isfinite(v), v, T(Inf))
 end
 
 # Clamp x to interior of bounds and evaluate gradient
 @inline function _safe_grad(grad_f, p, x, lb, ub)
     T = eltype(x)
-    ε = T(1e-6)
+    ε = T(1.0e-6)
     xc = clamp.(x, lb .+ ε, ub .- ε)
     xc = map(xi -> abs(xi) < ε ? ε : xi, xc)
     g = grad_f(xc, p)
-    map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g)
+    return map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g)
 end
 
 # Cubic interpolation for line search; falls back to bisection if non-convex
@@ -29,7 +29,7 @@ end
     desc = d1 * d1 - dϕ_lo * dϕ_hi
     # Use max to ensure non-negative argument to sqrt (avoids DomainError with ForwardDiff)
     d2 = sqrt(max(desc, zero(desc)))
-    ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2)))
+    return ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2)))
 end
 
 # Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6)
@@ -39,10 +39,12 @@ end
     done = false
     for _ in 1:10
         if !done
-            a_j = _interpolate(a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub),
+            a_j = _interpolate(
+                a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub),
                 dot(_safe_grad(grad_f, p, clamp.(x + a_lo * dir, lb, ub), lb, ub), dir),
-                dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir))
-            a_j = clamp(a_j, min(a_lo, a_hi) + T(1e-3), max(a_lo, a_hi) - T(1e-3))
+                dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir)
+            )
+            a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3))
             xn_j = clamp.(x + a_j * dir, lb, ub)
             ϕ_j = _safe_eval(f, p, xn_j, lb, ub)
             if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo)
@@ -53,7 +55,9 @@ end
                 if abs(dϕ_j) <= -c2 * dϕ_0
                     xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true
                 else
-                    if dϕ_j * (a_hi - a_lo) >= zero(T); a_hi = a_lo; end
+                    if dϕ_j * (a_hi - a_lo) >= zero(T)
+                        a_hi = a_lo
+                    end
                     a_lo, ϕ_lo = a_j, ϕ_j
                 end
             end
@@ -63,13 +67,13 @@ end
         xn_lo = clamp.(x + a_lo * dir, lb, ub)
         xn_out, ϕ_out, gn_out = xn_lo, _safe_eval(f, p, xn_lo, lb, ub), _safe_grad(grad_f, p, xn_lo, lb, ub)
     end
-    (xn_out, ϕ_out, gn_out, ok_out)
+    return (xn_out, ϕ_out, gn_out, ok_out)
 end
 
 # Strong Wolfe line search (Nocedal & Wright Algorithm 3.5)
 @inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
     T = eltype(x)
-    c1, c2 = T(1e-4), T(0.9)
+    c1, c2 = T(1.0e-4), T(0.9)
     dϕ_0 = dot(g, dir)
     xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false
     if dϕ_0 < zero(T)  # valid descent direction
@@ -98,7 +102,7 @@ end
             xn_out, ϕ_out, gn_out, ok_out = xn, _safe_eval(f, p, xn, lb, ub), _safe_grad(grad_f, p, xn, lb, ub), true
         end
     end
-    (xn_out, ϕ_out, gn_out, ok_out)
+    return (xn_out, ϕ_out, gn_out, ok_out)
 end
 
 # L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4)
@@ -106,7 +110,7 @@ end
     T = eltype(g)
     q, a = g, ntuple(_ -> zero(T), Val(M))
     # First loop: newest to oldest
-    for j in 0:M-1
+    for j in 0:(M - 1)
         idx = k - j
         if idx >= 1
             ii = mod1(idx, M)
@@ -117,18 +121,18 @@ end
     # Compute scaling factor γ
     kk = mod1(k, M)
     yy = sum(abs2, Y[kk])
-    γ = ifelse(k >= 1 && yy > T(1e-30), dot(S[kk], Y[kk]) / yy, one(T))
+    γ = ifelse(k >= 1 && yy > T(1.0e-30), dot(S[kk], Y[kk]) / yy, one(T))
     γ = ifelse(isfinite(γ) && γ > zero(T), γ, one(T))
     r = γ * q
     # Second loop: oldest to newest
-    for j in M-1:-1:0
+    for j in (M - 1):-1:0
         idx = k - j
         if idx >= 1
             ii = mod1(idx, M)
             r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii]
         end
     end
-    -r
+    return -r
 end
 
 # Run L-BFGS independently on each starting point
@@ -144,18 +148,22 @@ end
     g = _safe_grad(grad_f, p, x, lb, ub)
     k, active = 0, isfinite(fx) && all(isfinite, g)
     for _ in 1:maxiters
-        if active && norm(g) >= T(1e-7)
+        if active && norm(g) >= T(1.0e-7)
             dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k)
             # Reset to steepest descent if direction is not descent
-            if dot(g, dir) >= zero(T); dir, k = -g, 0; end
+            if dot(g, dir) >= zero(T)
+                dir, k = -g, 0
+            end
             xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
             # Fall back to steepest descent if line search fails
-            if !ok; xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0; end
+            if !ok
+                xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0
+            end
             if ok && isfinite(fn) && all(isfinite, gn)
                 s, y = xn - x, gn - g
                 sy = dot(s, y)
                 # Update history only if curvature condition holds
-                if isfinite(one(T) / sy) && sy > T(1e-10)
+                if isfinite(one(T) / sy) && sy > T(1.0e-10)
                     k += 1; ii = mod1(k, M)
                     S, Y, Rho = Base.setindex(S, s, ii), Base.setindex(Y, y, ii), Base.setindex(Rho, one(T) / sy, ii)
                 else
@@ -208,7 +216,7 @@ function SciMLBase.solve!(
     kernel(f_raw, grad_f, p, x0s, result, result_fx, lb, ub, local_maxiters, m_val; ndrange = n)
     KernelAbstractions.synchronize(opt.backend)
 
-    # Find best result 
+    # Find best result
     minobj, ind = findmin(result_fx)
 
     # Single bulk transfer for winning solution only
@@ -219,8 +227,8 @@ function SciMLBase.solve!(
 
     solve_time = (time() - t0) + sol_pso.stats.time
 
-    SciMLBase.build_solution(
+    return SciMLBase.build_solution(
         SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj;
         stats = Optimization.OptimizationStats(; time = solve_time)
     )
-end
\ No newline at end of file
+end

From b6b3443674685480c656f02f7f7cdb8977d9f0d9 Mon Sep 17 00:00:00 2001
From: AdityaPandeyCN <adityapand3y666@gmail.com>
Date: Thu, 2 Apr 2026 22:12:57 +0530
Subject: [PATCH 5/9] add comments

Signed-off-by: AdityaPandeyCN <adityapand3y666@gmail.com>
---
 Project.toml  |   5 ++-
 src/hybrid.jl | 115 +++++++++++++++++++++-----------------------------
 2 files changed, 51 insertions(+), 69 deletions(-)

diff --git a/Project.toml b/Project.toml
index 05e182e..7c97061 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,20 +1,21 @@
 name = "ParallelParticleSwarms"
 uuid = "ab63da0c-63b4-40fa-a3b7-d2cba5be6419"
-version = "1.0.0"
 authors = ["Utkarsh <rajpututkarsh530@gmail.com> and contributors"]
+version = "1.0.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+Runic = "62bfec6d-59d7-401d-8490-b29ee721c001"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
 SimpleNonlinearSolve = "727e6d20-b764-4bd8-a329-72de5adea6c7"
diff --git a/src/hybrid.jl b/src/hybrid.jl
index 945306f..8ba398c 100644
--- a/src/hybrid.jl
+++ b/src/hybrid.jl
@@ -3,69 +3,64 @@ using KernelAbstractions
 using SciMLBase
 using Optimization
 
-# Clamp x to interior of bounds and evaluate objective
-@inline function _safe_eval(f, p, x, lb, ub)
-    T = eltype(x)
-    ε = T(1.0e-6)
-    xc = clamp.(x, lb .+ ε, ub .- ε)
-    xc = map(xi -> abs(xi) < ε ? ε : xi, xc)
-    v = f(xc, p)
-    return ifelse(isfinite(v), v, T(Inf))
+# Clamp to bounds and evaluate; NaN firewall prevents poison through interpolation
+@inline _eval(f, p, x, lb, ub) =
+let v = f(clamp.(x, lb, ub), p)
+    ifelse(isfinite(v), v, eltype(x)(Inf))
 end
 
-# Clamp x to interior of bounds and evaluate gradient
-@inline function _safe_grad(grad_f, p, x, lb, ub)
-    T = eltype(x)
-    ε = T(1.0e-6)
-    xc = clamp.(x, lb .+ ε, ub .- ε)
-    xc = map(xi -> abs(xi) < ε ? ε : xi, xc)
-    g = grad_f(xc, p)
-    return map(gi -> ifelse(isfinite(gi), gi, zero(gi)), g)
-end
+# Clamp to bounds and evaluate gradient; zero out non-finite components
+@inline _grad(grad_f, p, x, lb, ub) =
+    map(gi -> ifelse(isfinite(gi), gi, zero(gi)), grad_f(clamp.(x, lb, ub), p))
 
-# Cubic interpolation for line search; falls back to bisection if non-convex
+# Cubic interpolation with bisection fallback
 @inline function _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi)
     d1 = dϕ_lo + dϕ_hi - 3 * (ϕ_lo - ϕ_hi) / (a_lo - a_hi)
     desc = d1 * d1 - dϕ_lo * dϕ_hi
-    # Use max to ensure non-negative argument to sqrt (avoids DomainError with ForwardDiff)
     d2 = sqrt(max(desc, zero(desc)))
-    return ifelse(desc < 0, (a_lo + a_hi) / 2, a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2)))
+    return ifelse(
+        desc < 0, (a_lo + a_hi) / 2,
+        a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))
+    )
 end
 
 # Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6)
 @inline function _zoom(f, grad_f, p, x, dir, lb, ub, a_lo, a_hi, ϕ_0, dϕ_0, ϕ_lo, c1, c2)
     T = eltype(x)
-    xn_out, ϕ_out, gn_out, ok_out = x, ϕ_0, _safe_grad(grad_f, p, x, lb, ub), false
+    xn_out, ϕ_out, gn_out, ok_out = x, ϕ_0, zero(typeof(x)), false
+
+    # Cache endpoint values; updated only when endpoints shift
+    ϕ_hi = _eval(f, p, x + a_hi * dir, lb, ub)
+    dϕ_lo = dot(_grad(grad_f, p, x + a_lo * dir, lb, ub), dir)
+    dϕ_hi = dot(_grad(grad_f, p, x + a_hi * dir, lb, ub), dir)
+
     done = false
     for _ in 1:10
         if !done
-            a_j = _interpolate(
-                a_lo, a_hi, ϕ_lo, _safe_eval(f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub),
-                dot(_safe_grad(grad_f, p, clamp.(x + a_lo * dir, lb, ub), lb, ub), dir),
-                dot(_safe_grad(grad_f, p, clamp.(x + a_hi * dir, lb, ub), lb, ub), dir)
-            )
+            a_j = _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi)
             a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3))
             xn_j = clamp.(x + a_j * dir, lb, ub)
-            ϕ_j = _safe_eval(f, p, xn_j, lb, ub)
+            ϕ_j = _eval(f, p, xn_j, lb, ub)
             if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo)
-                a_hi = a_j
+                a_hi, ϕ_hi = a_j, ϕ_j
+                dϕ_hi = dot(_grad(grad_f, p, xn_j, lb, ub), dir)
             else
-                gn_j = _safe_grad(grad_f, p, xn_j, lb, ub)
+                gn_j = _grad(grad_f, p, xn_j, lb, ub)
                 dϕ_j = dot(gn_j, dir)
                 if abs(dϕ_j) <= -c2 * dϕ_0
                     xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true
                 else
                     if dϕ_j * (a_hi - a_lo) >= zero(T)
-                        a_hi = a_lo
+                        a_hi, ϕ_hi, dϕ_hi = a_lo, ϕ_lo, dϕ_lo
                     end
-                    a_lo, ϕ_lo = a_j, ϕ_j
+                    a_lo, ϕ_lo, dϕ_lo = a_j, ϕ_j, dϕ_j
                 end
             end
         end
     end
     if !done
-        xn_lo = clamp.(x + a_lo * dir, lb, ub)
-        xn_out, ϕ_out, gn_out = xn_lo, _safe_eval(f, p, xn_lo, lb, ub), _safe_grad(grad_f, p, xn_lo, lb, ub)
+        xn = clamp.(x + a_lo * dir, lb, ub)
+        xn_out, ϕ_out, gn_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub)
     end
     return (xn_out, ϕ_out, gn_out, ok_out)
 end
@@ -81,25 +76,25 @@ end
         for i in 1:10
             if !done
                 xn = clamp.(x + a_i * dir, lb, ub)
-                ϕ_i = _safe_eval(f, p, xn, lb, ub)
+                ϕ_i = _eval(f, p, xn, lb, ub)
                 if (ϕ_i > ϕ_0 + c1 * a_i * dϕ_0) || (ϕ_i >= ϕ_prev && i > 1)
                     xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_prev, a_i, ϕ_0, dϕ_0, ϕ_prev, c1, c2)..., true
                 else
-                    gn_i = _safe_grad(grad_f, p, xn, lb, ub)
+                    gn_i = _grad(grad_f, p, xn, lb, ub)
                     dϕ_i = dot(gn_i, dir)
                     if abs(dϕ_i) <= -c2 * dϕ_0
                         xn_out, ϕ_out, gn_out, ok_out, done = xn, ϕ_i, gn_i, true, true
                     elseif dϕ_i >= zero(T)
                         xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_i, a_prev, ϕ_0, dϕ_0, ϕ_i, c1, c2)..., true
                     else
-                        a_prev, ϕ_prev, a_i = a_i, ϕ_i, a_i * T(2.0)
+                        a_prev, ϕ_prev, a_i = a_i, ϕ_i, a_i * T(2)
                     end
                 end
             end
         end
         if !done
             xn = clamp.(x + a_prev * dir, lb, ub)
-            xn_out, ϕ_out, gn_out, ok_out = xn, _safe_eval(f, p, xn, lb, ub), _safe_grad(grad_f, p, xn, lb, ub), true
+            xn_out, ϕ_out, gn_out, ok_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub), true
         end
     end
     return (xn_out, ϕ_out, gn_out, ok_out)
@@ -109,7 +104,6 @@ end
 @inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M}
     T = eltype(g)
     q, a = g, ntuple(_ -> zero(T), Val(M))
-    # First loop: newest to oldest
     for j in 0:(M - 1)
         idx = k - j
         if idx >= 1
@@ -118,13 +112,11 @@ end
             q = q - a[ii] * Y[ii]
         end
     end
-    # Compute scaling factor γ
     kk = mod1(k, M)
-    yy = sum(abs2, Y[kk])
-    γ = ifelse(k >= 1 && yy > T(1.0e-30), dot(S[kk], Y[kk]) / yy, one(T))
-    γ = ifelse(isfinite(γ) && γ > zero(T), γ, one(T))
+    sy, yy = dot(S[kk], Y[kk]), sum(abs2, Y[kk])
+    γ = sy / yy
+    γ = ifelse(k >= 1 && yy > T(1.0e-30) && isfinite(γ) && γ > zero(T), γ, one(T))
     r = γ * q
-    # Second loop: oldest to newest
     for j in (M - 1):-1:0
         idx = k - j
         if idx >= 1
@@ -135,8 +127,7 @@ end
     return -r
 end
 
-# Run L-BFGS independently on each starting point
-# History buffers stored as tuples for GPU compatibility
+# Run L-BFGS independently per starting point; tuples for GPU-compatible history buffers
 @kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, result_fx, lb, ub, maxiters, ::Val{M}) where {M}
     i = @index(Global, Linear)
     x = clamp.(x0s[i], lb, ub)
@@ -144,28 +135,28 @@ end
     z = zero(typeof(x))
     S, Y = ntuple(_ -> z, Val(M)), ntuple(_ -> z, Val(M))
     Rho = ntuple(_ -> zero(T), Val(M))
-    fx = _safe_eval(f, p, x, lb, ub)
-    g = _safe_grad(grad_f, p, x, lb, ub)
+    fx = _eval(f, p, x, lb, ub)
+    g = _grad(grad_f, p, x, lb, ub)
     k, active = 0, isfinite(fx) && all(isfinite, g)
     for _ in 1:maxiters
         if active && norm(g) >= T(1.0e-7)
             dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k)
-            # Reset to steepest descent if direction is not descent
-            if dot(g, dir) >= zero(T)
+            if dot(g, dir) >= zero(T)  # reset to steepest descent
                 dir, k = -g, 0
             end
             xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
-            # Fall back to steepest descent if line search fails
-            if !ok
-                xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub); k = 0
+            if !ok  # fall back to steepest descent
+                xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub)
+                k = 0
             end
             if ok && isfinite(fn) && all(isfinite, gn)
                 s, y = xn - x, gn - g
                 sy = dot(s, y)
-                # Update history only if curvature condition holds
-                if isfinite(one(T) / sy) && sy > T(1.0e-10)
+                if isfinite(one(T) / sy) && sy > T(1.0e-10)  # curvature condition
                     k += 1; ii = mod1(k, M)
-                    S, Y, Rho = Base.setindex(S, s, ii), Base.setindex(Y, y, ii), Base.setindex(Rho, one(T) / sy, ii)
+                    S = Base.setindex(S, s, ii)
+                    Y = Base.setindex(Y, y, ii)
+                    Rho = Base.setindex(Rho, one(T) / sy, ii)
                 else
                     k = 0
                 end
@@ -179,14 +170,13 @@ end
     @inbounds result_fx[i] = fx
 end
 
-# Main solve: runs PSO for global exploration, then L-BFGS for local refinement
+# PSO for global exploration, then L-BFGS for local refinement on all particles
 function SciMLBase.solve!(
         cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...;
         abstol = nothing, reltol = nothing, maxiters = 100,
         local_maxiters = 50, kwargs...
     ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}}
 
-    # Phase 1: Global search with PSO
     sol_pso = SciMLBase.solve!(cache.pso_cache; maxiters = maxiters)
 
     prob = cache.prob
@@ -197,36 +187,27 @@ function SciMLBase.solve!(
     best_u = sol_pso.u
     best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[]
 
-    # Run L-BFGS on all particles
     x0s = sol_pso.original
     n = length(x0s)
-
-    D = length(prob.u0)
-    m_val = D > 20 ? Val(5) : Val(10)
+    m_val = length(prob.u0) > 20 ? Val(5) : Val(10)
 
     grad_f = instantiate_gradient(f_raw, prob.f.adtype)
     t0 = time()
 
     result = similar(x0s)
-    copyto!(result, x0s)
     result_fx = KernelAbstractions.allocate(opt.backend, typeof(best_obj), n)
 
-    # Phase 2: Local refinement with L-BFGS on all candidates
     kernel = lbfgs_kernel!(opt.backend)
     kernel(f_raw, grad_f, p, x0s, result, result_fx, lb, ub, local_maxiters, m_val; ndrange = n)
     KernelAbstractions.synchronize(opt.backend)
 
-    # Find best result
     minobj, ind = findmin(result_fx)
-
-    # Single bulk transfer for winning solution only
     if minobj < best_obj
         best_obj = minobj
         best_u = Array(view(result, ind:ind))[1]
     end
 
     solve_time = (time() - t0) + sol_pso.stats.time
-
     return SciMLBase.build_solution(
         SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj;
         stats = Optimization.OptimizationStats(; time = solve_time)

From be7dc7500df71c361e44133d2a235a52ae62851b Mon Sep 17 00:00:00 2001
From: AdityaPandeyCN <adityapand3y666@gmail.com>
Date: Sun, 17 May 2026 21:50:09 +0530
Subject: [PATCH 6/9] use native line search

Signed-off-by: AdityaPandeyCN <adityapand3y666@gmail.com>
---
 src/hybrid.jl | 241 ++++++++++++--------------------------------------
 1 file changed, 56 insertions(+), 185 deletions(-)

diff --git a/src/hybrid.jl b/src/hybrid.jl
index 8ba398c..a0ea1fc 100644
--- a/src/hybrid.jl
+++ b/src/hybrid.jl
@@ -1,212 +1,83 @@
-using LinearAlgebra: norm, dot
 using KernelAbstractions
 using SciMLBase
 using Optimization
-
-# Clamp to bounds and evaluate; NaN firewall prevents poison through interpolation
-@inline _eval(f, p, x, lb, ub) =
-let v = f(clamp.(x, lb, ub), p)
-    ifelse(isfinite(v), v, eltype(x)(Inf))
-end
-
-# Clamp to bounds and evaluate gradient; zero out non-finite components
-@inline _grad(grad_f, p, x, lb, ub) =
-    map(gi -> ifelse(isfinite(gi), gi, zero(gi)), grad_f(clamp.(x, lb, ub), p))
-
-# Cubic interpolation with bisection fallback
-@inline function _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi)
-    d1 = dϕ_lo + dϕ_hi - 3 * (ϕ_lo - ϕ_hi) / (a_lo - a_hi)
-    desc = d1 * d1 - dϕ_lo * dϕ_hi
-    d2 = sqrt(max(desc, zero(desc)))
-    return ifelse(
-        desc < 0, (a_lo + a_hi) / 2,
-        a_hi - (a_hi - a_lo) * ((dϕ_hi + d2 - d1) / (dϕ_hi - dϕ_lo + 2 * d2))
-    )
-end
-
-# Zoom phase of Strong Wolfe line search (Nocedal & Wright Algorithm 3.6)
-@inline function _zoom(f, grad_f, p, x, dir, lb, ub, a_lo, a_hi, ϕ_0, dϕ_0, ϕ_lo, c1, c2)
-    T = eltype(x)
-    xn_out, ϕ_out, gn_out, ok_out = x, ϕ_0, zero(typeof(x)), false
-
-    # Cache endpoint values; updated only when endpoints shift
-    ϕ_hi = _eval(f, p, x + a_hi * dir, lb, ub)
-    dϕ_lo = dot(_grad(grad_f, p, x + a_lo * dir, lb, ub), dir)
-    dϕ_hi = dot(_grad(grad_f, p, x + a_hi * dir, lb, ub), dir)
-
-    done = false
-    for _ in 1:10
-        if !done
-            a_j = _interpolate(a_lo, a_hi, ϕ_lo, ϕ_hi, dϕ_lo, dϕ_hi)
-            a_j = clamp(a_j, min(a_lo, a_hi) + T(1.0e-3), max(a_lo, a_hi) - T(1.0e-3))
-            xn_j = clamp.(x + a_j * dir, lb, ub)
-            ϕ_j = _eval(f, p, xn_j, lb, ub)
-            if (ϕ_j > ϕ_0 + c1 * a_j * dϕ_0) || (ϕ_j >= ϕ_lo)
-                a_hi, ϕ_hi = a_j, ϕ_j
-                dϕ_hi = dot(_grad(grad_f, p, xn_j, lb, ub), dir)
-            else
-                gn_j = _grad(grad_f, p, xn_j, lb, ub)
-                dϕ_j = dot(gn_j, dir)
-                if abs(dϕ_j) <= -c2 * dϕ_0
-                    xn_out, ϕ_out, gn_out, ok_out, done = xn_j, ϕ_j, gn_j, true, true
-                else
-                    if dϕ_j * (a_hi - a_lo) >= zero(T)
-                        a_hi, ϕ_hi, dϕ_hi = a_lo, ϕ_lo, dϕ_lo
-                    end
-                    a_lo, ϕ_lo, dϕ_lo = a_j, ϕ_j, dϕ_j
-                end
-            end
-        end
-    end
-    if !done
-        xn = clamp.(x + a_lo * dir, lb, ub)
-        xn_out, ϕ_out, gn_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub)
-    end
-    return (xn_out, ϕ_out, gn_out, ok_out)
-end
-
-# Strong Wolfe line search (Nocedal & Wright Algorithm 3.5)
-@inline function _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
-    T = eltype(x)
-    c1, c2 = T(1.0e-4), T(0.9)
-    dϕ_0 = dot(g, dir)
-    xn_out, ϕ_out, gn_out, ok_out = x, fx, g, false
-    if dϕ_0 < zero(T)  # valid descent direction
-        a_prev, a_i, ϕ_0, ϕ_prev, done = zero(T), one(T), fx, fx, false
-        for i in 1:10
-            if !done
-                xn = clamp.(x + a_i * dir, lb, ub)
-                ϕ_i = _eval(f, p, xn, lb, ub)
-                if (ϕ_i > ϕ_0 + c1 * a_i * dϕ_0) || (ϕ_i >= ϕ_prev && i > 1)
-                    xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_prev, a_i, ϕ_0, dϕ_0, ϕ_prev, c1, c2)..., true
-                else
-                    gn_i = _grad(grad_f, p, xn, lb, ub)
-                    dϕ_i = dot(gn_i, dir)
-                    if abs(dϕ_i) <= -c2 * dϕ_0
-                        xn_out, ϕ_out, gn_out, ok_out, done = xn, ϕ_i, gn_i, true, true
-                    elseif dϕ_i >= zero(T)
-                        xn_out, ϕ_out, gn_out, ok_out, done = _zoom(f, grad_f, p, x, dir, lb, ub, a_i, a_prev, ϕ_0, dϕ_0, ϕ_i, c1, c2)..., true
-                    else
-                        a_prev, ϕ_prev, a_i = a_i, ϕ_i, a_i * T(2)
-                    end
-                end
-            end
-        end
-        if !done
-            xn = clamp.(x + a_prev * dir, lb, ub)
-            xn_out, ϕ_out, gn_out, ok_out = xn, _eval(f, p, xn, lb, ub), _grad(grad_f, p, xn, lb, ub), true
-        end
-    end
-    return (xn_out, ϕ_out, gn_out, ok_out)
+using LineSearch
+using NonlinearSolveQuasiNewton
+
+# Safety box around the original feasible region. Local refinement may step
+# modestly outside `[lb, ub]`, but past this margin we reject the trial point
+# instead of evaluating f there — e.g. F19 Griewank-Rosenbrock overflows to Inf
+# for |θ| ~ 1e10 in Float32 and then cos(Inf) throws DomainError.
+@inline _in_safe_box(θ, ::Nothing, ::Nothing) = all(isfinite, θ)
+@inline function _in_safe_box(θ::AbstractArray{T}, lb, ub) where {T}
+    all(isfinite, θ) || return false
+    w = ub .- lb
+    return all(θ .>= lb .- T(2) .* w) && all(θ .<= ub .+ T(2) .* w)
 end
 
-# L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4)
-@inline function _lbfgs_dir(g, S, Y, Rho, ::Val{M}, k) where {M}
-    T = eltype(g)
-    q, a = g, ntuple(_ -> zero(T), Val(M))
-    for j in 0:(M - 1)
-        idx = k - j
-        if idx >= 1
-            ii = mod1(idx, M)
-            a = Base.setindex(a, Rho[ii] * dot(S[ii], q), ii)
-            q = q - a[ii] * Y[ii]
-        end
-    end
-    kk = mod1(k, M)
-    sy, yy = dot(S[kk], Y[kk]), sum(abs2, Y[kk])
-    γ = sy / yy
-    γ = ifelse(k >= 1 && yy > T(1.0e-30) && isfinite(γ) && γ > zero(T), γ, one(T))
-    r = γ * q
-    for j in (M - 1):-1:0
-        idx = k - j
-        if idx >= 1
-            ii = mod1(idx, M)
-            r = r + (a[ii] - Rho[ii] * dot(Y[ii], r)) * S[ii]
-        end
-    end
-    return -r
-end
+# Huge-magnitude gradient: makes the Strong Wolfe Armijo test fail so the line
+# search rejects the trial step without us ever calling f at the bad point.
+@inline _huge_grad(θ::AbstractArray{T}) where {T} = map(_ -> T(1.0e15), θ)
 
-# Run L-BFGS independently per starting point; tuples for GPU-compatible history buffers
-@kernel function lbfgs_kernel!(f, grad_f, p, x0s, result, result_fx, lb, ub, maxiters, ::Val{M}) where {M}
+@kernel function simplebfgs_run!(nlprob, x0s, result, opt, maxiters, abstol, reltol, grad_f)
     i = @index(Global, Linear)
-    x = clamp.(x0s[i], lb, ub)
-    T = eltype(x)
-    z = zero(typeof(x))
-    S, Y = ntuple(_ -> z, Val(M)), ntuple(_ -> z, Val(M))
-    Rho = ntuple(_ -> zero(T), Val(M))
-    fx = _eval(f, p, x, lb, ub)
-    g = _grad(grad_f, p, x, lb, ub)
-    k, active = 0, isfinite(fx) && all(isfinite, g)
-    for _ in 1:maxiters
-        if active && norm(g) >= T(1.0e-7)
-            dir = _lbfgs_dir(g, S, Y, Rho, Val(M), k)
-            if dot(g, dir) >= zero(T)  # reset to steepest descent
-                dir, k = -g, 0
-            end
-            xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, dir, lb, ub)
-            if !ok  # fall back to steepest descent
-                xn, fn, gn, ok = _strong_wolfe(f, grad_f, p, x, fx, g, -g, lb, ub)
-                k = 0
-            end
-            if ok && isfinite(fn) && all(isfinite, gn)
-                s, y = xn - x, gn - g
-                sy = dot(s, y)
-                if isfinite(one(T) / sy) && sy > T(1.0e-10)  # curvature condition
-                    k += 1; ii = mod1(k, M)
-                    S = Base.setindex(S, s, ii)
-                    Y = Base.setindex(Y, y, ii)
-                    Rho = Base.setindex(Rho, one(T) / sy, ii)
-                else
-                    k = 0
-                end
-                x, g, fx = xn, gn, fn
-            else
-                active = false
-            end
-        end
-    end
-    @inbounds result[i] = x
-    @inbounds result_fx[i] = fx
+    nlcache = SciMLBase.init(
+        nlprob, opt; u0 = x0s[i], maxiters, abstol, reltol, grad_f,
+        kwargshandle = SciMLBase.KeywordArgSilent
+    )
+    sol = SciMLBase.solve!(nlcache)
+    @inbounds result[i] = sol.u
 end
 
-# PSO for global exploration, then L-BFGS for local refinement on all particles
+# HybridPSO: global PSO exploration, then per-particle local BFGS refinement.
 function SciMLBase.solve!(
         cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...;
-        abstol = nothing, reltol = nothing, maxiters = 100,
-        local_maxiters = 50, kwargs...
+        abstol = nothing, reltol = nothing, maxiters = 100, local_maxiters = 10,
+        linesearch = LineSearch.StrongWolfeLineSearch(), kwargs...
     ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}}
 
     sol_pso = SciMLBase.solve!(cache.pso_cache; maxiters = maxiters)
 
-    prob = cache.prob
-    f_raw, p = prob.f.f, prob.p
-    lb = prob.lb === nothing ? convert.(eltype(prob.u0), -Inf) : prob.lb
-    ub = prob.ub === nothing ? convert.(eltype(prob.u0), Inf) : prob.ub
-
-    best_u = sol_pso.u
-    best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[]
+    orig_lb = cache.prob.lb
+    orig_ub = cache.prob.ub
+    prob = remake(cache.prob, lb = nothing, ub = nothing)
 
     x0s = sol_pso.original
-    n = length(x0s)
-    m_val = length(prob.u0) > 20 ? Val(5) : Val(10)
+    raw_grad = instantiate_gradient(prob.f.f, prob.f.adtype)
+    grad_f = (θ, p) ->
+        _in_safe_box(θ, orig_lb, orig_ub) ? raw_grad(θ, p) : _huge_grad(θ)
 
-    grad_f = instantiate_gradient(f_raw, prob.f.adtype)
-    t0 = time()
+    result = cache.start_points
+    copyto!(result, x0s)
 
-    result = similar(x0s)
-    result_fx = KernelAbstractions.allocate(opt.backend, typeof(best_obj), n)
+    nlprob = SimpleNonlinearSolve.ImmutableNonlinearProblem{false}(grad_f, prob.u0, prob.p)
+    nlalg = opt.local_opt isa LBFGS ?
+        NonlinearSolveQuasiNewton.LimitedMemoryBroyden(;
+            threshold = opt.local_opt.threshold,
+            linesearch
+        ) :
+        NonlinearSolveQuasiNewton.Broyden(; linesearch)
 
-    kernel = lbfgs_kernel!(opt.backend)
-    kernel(f_raw, grad_f, p, x0s, result, result_fx, lb, ub, local_maxiters, m_val; ndrange = n)
+    t0 = time()
+    kernel = simplebfgs_run!(opt.backend)
+    kernel(
+        nlprob, x0s, result, nlalg, local_maxiters, abstol, reltol, grad_f;
+        ndrange = length(x0s)
+    )
     KernelAbstractions.synchronize(opt.backend)
 
-    minobj, ind = findmin(result_fx)
-    if minobj < best_obj
-        best_obj = minobj
-        best_u = Array(view(result, ind:ind))[1]
+    Tobj = eltype(prob.u0)
+    result_fx = map(eachindex(result)) do i
+        x = result[i]
+        _in_safe_box(x, orig_lb, orig_ub) || return convert(Tobj, Inf)
+        v = prob.f(x, prob.p)
+        isnan(v) ? convert(Tobj, Inf) : v
     end
 
+    minobj, ind = findmin(result_fx)
+    best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[]
+    best_u, best_obj = minobj > best_obj ? (sol_pso.u, best_obj) : (result[ind], minobj)
+
     solve_time = (time() - t0) + sol_pso.stats.time
     return SciMLBase.build_solution(
         SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj;

From 1aa86658f8b88f98963aaef79bfdae5d815cc365 Mon Sep 17 00:00:00 2001
From: AdityaPandeyCN <adityapand3y666@gmail.com>
Date: Sun, 17 May 2026 22:34:51 +0530
Subject: [PATCH 7/9] update Project.toml

Signed-off-by: AdityaPandeyCN <adityapand3y666@gmail.com>
---
 Project.toml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Project.toml b/Project.toml
index 7c97061..e8142c4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -9,7 +9,9 @@ DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+LineSearch = "87fe0de2-c867-4266-b59a-2f0a94fc965b"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+NonlinearSolveQuasiNewton = "9a2c21bd-3a47-402d-9113-8faf9a0ee114"
 Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b"
@@ -28,6 +30,8 @@ Enzyme = "0.13"
 ForwardDiff = "0.10, 1.3"
 JET = "0.9, 0.10, 0.11"
 KernelAbstractions = "0.9"
+LineSearch = "0.1"
+NonlinearSolveQuasiNewton = "1"
 Optimization = "4.1, 5.2"
 PrecompileTools = "1"
 QuasiMonteCarlo = "0.3"

From f6c5d1b015603936dee340516e746369d8acd712 Mon Sep 17 00:00:00 2001
From: AdityaPandeyCN <adityapand3y666@gmail.com>
Date: Sun, 17 May 2026 23:17:50 +0530
Subject: [PATCH 8/9] gpu-compat

Signed-off-by: AdityaPandeyCN <adityapand3y666@gmail.com>
---
 src/hybrid.jl | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/hybrid.jl b/src/hybrid.jl
index a0ea1fc..122771a 100644
--- a/src/hybrid.jl
+++ b/src/hybrid.jl
@@ -4,10 +4,7 @@ using Optimization
 using LineSearch
 using NonlinearSolveQuasiNewton
 
-# Safety box around the original feasible region. Local refinement may step
-# modestly outside `[lb, ub]`, but past this margin we reject the trial point
-# instead of evaluating f there — e.g. F19 Griewank-Rosenbrock overflows to Inf
-# for |θ| ~ 1e10 in Float32 and then cos(Inf) throws DomainError.
+"Check that `θ` lies within a slack-expanded box around `[lb, ub]`."
 @inline _in_safe_box(θ, ::Nothing, ::Nothing) = all(isfinite, θ)
 @inline function _in_safe_box(θ::AbstractArray{T}, lb, ub) where {T}
     all(isfinite, θ) || return false
@@ -15,21 +12,27 @@ using NonlinearSolveQuasiNewton
     return all(θ .>= lb .- T(2) .* w) && all(θ .<= ub .+ T(2) .* w)
 end
 
-# Huge-magnitude gradient: makes the Strong Wolfe Armijo test fail so the line
-# search rejects the trial step without us ever calling f at the bad point.
+"Sentinel gradient with huge magnitude used to reject out-of-box trial points."
 @inline _huge_grad(θ::AbstractArray{T}) where {T} = map(_ -> T(1.0e15), θ)
 
-@kernel function simplebfgs_run!(nlprob, x0s, result, opt, maxiters, abstol, reltol, grad_f)
+"Per-particle local quasi-Newton refinement kernel. The algorithm is built inside the kernel because `QuasiNewtonAlgorithm` is not isbits."
+@kernel function simplebfgs_run!(
+        nlprob, x0s, result, linesearch, ::Val{Threshold}, ::Val{IsLBFGS},
+        maxiters, abstol, reltol, grad_f
+    ) where {Threshold, IsLBFGS}
     i = @index(Global, Linear)
+    nlalg = IsLBFGS ?
+        NonlinearSolveQuasiNewton.LimitedMemoryBroyden(; threshold = Val(Threshold), linesearch) :
+        NonlinearSolveQuasiNewton.Broyden(; linesearch)
     nlcache = SciMLBase.init(
-        nlprob, opt; u0 = x0s[i], maxiters, abstol, reltol, grad_f,
+        nlprob, nlalg; u0 = x0s[i], maxiters, abstol, reltol, grad_f,
         kwargshandle = SciMLBase.KeywordArgSilent
     )
     sol = SciMLBase.solve!(nlcache)
     @inbounds result[i] = sol.u
 end
 
-# HybridPSO: global PSO exploration, then per-particle local BFGS refinement.
+"Hybrid PSO solve: global PSO exploration followed by per-particle local quasi-Newton refinement."
 function SciMLBase.solve!(
         cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...;
         abstol = nothing, reltol = nothing, maxiters = 100, local_maxiters = 10,
@@ -51,17 +54,14 @@ function SciMLBase.solve!(
     copyto!(result, x0s)
 
     nlprob = SimpleNonlinearSolve.ImmutableNonlinearProblem{false}(grad_f, prob.u0, prob.p)
-    nlalg = opt.local_opt isa LBFGS ?
-        NonlinearSolveQuasiNewton.LimitedMemoryBroyden(;
-            threshold = opt.local_opt.threshold,
-            linesearch
-        ) :
-        NonlinearSolveQuasiNewton.Broyden(; linesearch)
+    is_lbfgs_val = Val(opt.local_opt isa LBFGS)
+    threshold_val = opt.local_opt isa LBFGS ? Val(opt.local_opt.threshold) : Val(0)
 
     t0 = time()
     kernel = simplebfgs_run!(opt.backend)
     kernel(
-        nlprob, x0s, result, nlalg, local_maxiters, abstol, reltol, grad_f;
+        nlprob, x0s, result, linesearch, threshold_val, is_lbfgs_val,
+        local_maxiters, abstol, reltol, grad_f;
         ndrange = length(x0s)
     )
     KernelAbstractions.synchronize(opt.backend)

From c95cf6084a4f8f61d2d95920bd7f697bffb1a885 Mon Sep 17 00:00:00 2001
From: AdityaPandeyCN <adityapand3y666@gmail.com>
Date: Mon, 18 May 2026 08:23:20 +0530
Subject: [PATCH 9/9] custom lbfgs

---
 Project.toml  |   2 -
 src/hybrid.jl | 124 +++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 102 insertions(+), 24 deletions(-)

diff --git a/Project.toml b/Project.toml
index e8142c4..f603359 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,7 +11,6 @@ ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LineSearch = "87fe0de2-c867-4266-b59a-2f0a94fc965b"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-NonlinearSolveQuasiNewton = "9a2c21bd-3a47-402d-9113-8faf9a0ee114"
 Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b"
@@ -31,7 +30,6 @@ ForwardDiff = "0.10, 1.3"
 JET = "0.9, 0.10, 0.11"
 KernelAbstractions = "0.9"
 LineSearch = "0.1"
-NonlinearSolveQuasiNewton = "1"
 Optimization = "4.1, 5.2"
 PrecompileTools = "1"
 QuasiMonteCarlo = "0.3"
diff --git a/src/hybrid.jl b/src/hybrid.jl
index 122771a..2e31e18 100644
--- a/src/hybrid.jl
+++ b/src/hybrid.jl
@@ -1,8 +1,8 @@
+using LinearAlgebra: dot
 using KernelAbstractions
 using SciMLBase
 using Optimization
 using LineSearch
-using NonlinearSolveQuasiNewton
 
 "Check that `θ` lies within a slack-expanded box around `[lb, ub]`."
 @inline _in_safe_box(θ, ::Nothing, ::Nothing) = all(isfinite, θ)
@@ -15,24 +15,96 @@ end
 "Sentinel gradient with huge magnitude used to reject out-of-box trial points."
 @inline _huge_grad(θ::AbstractArray{T}) where {T} = map(_ -> T(1.0e15), θ)
 
-"Per-particle local quasi-Newton refinement kernel. The algorithm is built inside the kernel because `QuasiNewtonAlgorithm` is not isbits."
-@kernel function simplebfgs_run!(
-        nlprob, x0s, result, linesearch, ::Val{Threshold}, ::Val{IsLBFGS},
-        maxiters, abstol, reltol, grad_f
-    ) where {Threshold, IsLBFGS}
+"L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4) over cyclic memory of length `M`."
+@inline function _lbfgs_direction(g, S, Y, Rho, ::Val{M}, k) where {M}
+    T = eltype(g)
+    q = g
+    a = ntuple(_ -> zero(T), Val(M))
+    for j in 0:(M - 1)
+        idx = k - j
+        if idx >= 1
+            ii = mod1(idx, M)
+            aii = Rho[ii] * dot(S[ii], q)
+            a = Base.setindex(a, aii, ii)
+            q = q - aii * Y[ii]
+        end
+    end
+    γ = if k >= 1
+        kk = mod1(k, M)
+        yy = sum(abs2, Y[kk])
+        sy = dot(S[kk], Y[kk])
+        ifelse(yy > T(1.0e-30) && sy > zero(T), sy / yy, one(T))
+    else
+        one(T)
+    end
+    r = γ * q
+    for j in (M - 1):-1:0
+        idx = k - j
+        if idx >= 1
+            ii = mod1(idx, M)
+            β = Rho[ii] * dot(Y[ii], r)
+            r = r + (a[ii] - β) * S[ii]
+        end
+    end
+    return -r
+end
+
+"Per-particle hand-written L-BFGS with `LineSearch.StrongWolfeLineSearch`. Avoids `SciMLBase.init`, whose `promote_u0` path uses dynamic dispatch (invalid GPU IR)."
+@kernel function lbfgs_run!(
+        grad_f, p, x0s, result, ls_cache, ::Val{M}, maxiters
+    ) where {M}
     i = @index(Global, Linear)
-    nlalg = IsLBFGS ?
-        NonlinearSolveQuasiNewton.LimitedMemoryBroyden(; threshold = Val(Threshold), linesearch) :
-        NonlinearSolveQuasiNewton.Broyden(; linesearch)
-    nlcache = SciMLBase.init(
-        nlprob, nlalg; u0 = x0s[i], maxiters, abstol, reltol, grad_f,
-        kwargshandle = SciMLBase.KeywordArgSilent
-    )
-    sol = SciMLBase.solve!(nlcache)
-    @inbounds result[i] = sol.u
+    x = x0s[i]
+    T = eltype(x)
+    g = grad_f(x, p)
+
+    z = zero(typeof(x))
+    S = ntuple(_ -> z, Val(M))
+    Y = ntuple(_ -> z, Val(M))
+    Rho = ntuple(_ -> zero(T), Val(M))
+    k = 0
+    active = all(isfinite, g)
+
+    for _ in 1:maxiters
+        if active
+            dir = _lbfgs_direction(g, S, Y, Rho, Val(M), k)
+            if dot(g, dir) >= zero(T)
+                dir = -g
+                k = 0
+            end
+            ls_sol = SciMLBase.solve!(ls_cache, x, dir)
+            α = T(ls_sol.step_size)
+            ok = ls_sol.retcode == ReturnCode.Success
+            if ok && isfinite(α) && α > zero(T)
+                xn = x + α * dir
+                gn = grad_f(xn, p)
+                if all(isfinite, gn)
+                    s = xn - x
+                    y = gn - g
+                    sy = dot(s, y)
+                    if sy > T(1.0e-10) && isfinite(one(T) / sy)
+                        k += 1
+                        ii = mod1(k, M)
+                        S = Base.setindex(S, s, ii)
+                        Y = Base.setindex(Y, y, ii)
+                        Rho = Base.setindex(Rho, one(T) / sy, ii)
+                    else
+                        k = 0
+                    end
+                    x = xn
+                    g = gn
+                else
+                    active = false
+                end
+            else
+                active = false
+            end
+        end
+    end
+    @inbounds result[i] = x
 end
 
-"Hybrid PSO solve: global PSO exploration followed by per-particle local quasi-Newton refinement."
+"Hybrid PSO solve: global PSO exploration followed by per-particle local L-BFGS refinement."
 function SciMLBase.solve!(
         cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...;
         abstol = nothing, reltol = nothing, maxiters = 100, local_maxiters = 10,
@@ -53,15 +125,23 @@ function SciMLBase.solve!(
     result = cache.start_points
     copyto!(result, x0s)
 
-    nlprob = SimpleNonlinearSolve.ImmutableNonlinearProblem{false}(grad_f, prob.u0, prob.p)
-    is_lbfgs_val = Val(opt.local_opt isa LBFGS)
-    threshold_val = opt.local_opt isa LBFGS ? Val(opt.local_opt.threshold) : Val(0)
+    T = eltype(prob.u0)
+    D = length(prob.u0)
+    M_val = opt.local_opt isa LBFGS ? Val(min(opt.local_opt.threshold, D)) : Val(D)
+
+    # Construct the line-search cache directly: `LineSearch.init` would route
+    # through `SciMLBase.init`'s `promote_u0`, which dynamic-dispatches on GPU.
+    ls_cache = LineSearch.StaticStrongWolfeLineSearchCache(
+        grad_f, grad_f, prob.p,
+        T(linesearch.c1), T(linesearch.c2),
+        T(linesearch.α_init), T(linesearch.α_max),
+        linesearch.maxiters, linesearch.zoom_maxiters
+    )
 
     t0 = time()
-    kernel = simplebfgs_run!(opt.backend)
+    kernel = lbfgs_run!(opt.backend)
     kernel(
-        nlprob, x0s, result, linesearch, threshold_val, is_lbfgs_val,
-        local_maxiters, abstol, reltol, grad_f;
+        grad_f, prob.p, x0s, result, ls_cache, M_val, local_maxiters;
         ndrange = length(x0s)
     )
     KernelAbstractions.synchronize(opt.backend)