diff --git a/Project.toml b/Project.toml
index a4c76f8..f603359 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "ParallelParticleSwarms"
 uuid = "ab63da0c-63b4-40fa-a3b7-d2cba5be6419"
-version = "1.0.0"
 authors = ["Utkarsh <rajpututkarsh530@gmail.com> and contributors"]
+version = "1.0.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -9,11 +9,14 @@ DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+LineSearch = "87fe0de2-c867-4266-b59a-2f0a94fc965b"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+Runic = "62bfec6d-59d7-401d-8490-b29ee721c001"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
 SimpleNonlinearSolve = "727e6d20-b764-4bd8-a329-72de5adea6c7"
@@ -26,6 +29,7 @@ Enzyme = "0.13"
 ForwardDiff = "0.10, 1.3"
 JET = "0.9, 0.10, 0.11"
 KernelAbstractions = "0.9"
+LineSearch = "0.1"
 Optimization = "4.1, 5.2"
 PrecompileTools = "1"
 QuasiMonteCarlo = "0.3"
@@ -39,7 +43,6 @@ julia = "1.10"
 
 [extras]
 JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
diff --git a/src/hybrid.jl b/src/hybrid.jl
index 60f80ed..2e31e18 100644
--- a/src/hybrid.jl
+++ b/src/hybrid.jl
@@ -1,69 +1,166 @@
-@kernel function simplebfgs_run!(nlprob, x0s, result, opt, maxiters, abstol, reltol)
+using LinearAlgebra: dot
+using KernelAbstractions
+using SciMLBase
+using Optimization
+using LineSearch
+
+"Check that `θ` lies within a slack-expanded box around `[lb, ub]`."
+@inline _in_safe_box(θ, ::Nothing, ::Nothing) = all(isfinite, θ)
+@inline function _in_safe_box(θ::AbstractArray{T}, lb, ub) where {T}
+    all(isfinite, θ) || return false
+    w = ub .- lb
+    return all(θ .>= lb .- T(2) .* w) && all(θ .<= ub .+ T(2) .* w)
+end
+
+"Sentinel gradient with huge magnitude used to reject out-of-box trial points."
+@inline _huge_grad(θ::AbstractArray{T}) where {T} = map(_ -> T(1.0e15), θ)
+
+"L-BFGS two-loop recursion (Nocedal & Wright Algorithm 7.4) over cyclic memory of length `M`."
+@inline function _lbfgs_direction(g, S, Y, Rho, ::Val{M}, k) where {M}
+    T = eltype(g)
+    q = g
+    a = ntuple(_ -> zero(T), Val(M))
+    for j in 0:(M - 1)
+        idx = k - j
+        if idx >= 1
+            ii = mod1(idx, M)
+            aii = Rho[ii] * dot(S[ii], q)
+            a = Base.setindex(a, aii, ii)
+            q = q - aii * Y[ii]
+        end
+    end
+    γ = if k >= 1
+        kk = mod1(k, M)
+        yy = sum(abs2, Y[kk])
+        sy = dot(S[kk], Y[kk])
+        ifelse(yy > T(1.0e-30) && sy > zero(T), sy / yy, one(T))
+    else
+        one(T)
+    end
+    r = γ * q
+    for j in (M - 1):-1:0
+        idx = k - j
+        if idx >= 1
+            ii = mod1(idx, M)
+            β = Rho[ii] * dot(Y[ii], r)
+            r = r + (a[ii] - β) * S[ii]
+        end
+    end
+    return -r
+end
+
+"Per-particle hand-written L-BFGS with `LineSearch.StrongWolfeLineSearch`. Avoids `SciMLBase.init`, whose `promote_u0` path uses dynamic dispatch (invalid GPU IR)."
+@kernel function lbfgs_run!(
+        grad_f, p, x0s, result, ls_cache, ::Val{M}, maxiters
+    ) where {M}
     i = @index(Global, Linear)
-    nlcache = remake(nlprob; u0 = x0s[i])
-    sol = solve(nlcache, opt; maxiters, abstol, reltol)
-    @inbounds result[i] = sol.u
+    x = x0s[i]
+    T = eltype(x)
+    g = grad_f(x, p)
+
+    z = zero(typeof(x))
+    S = ntuple(_ -> z, Val(M))
+    Y = ntuple(_ -> z, Val(M))
+    Rho = ntuple(_ -> zero(T), Val(M))
+    k = 0
+    active = all(isfinite, g)
+
+    for _ in 1:maxiters
+        if active
+            dir = _lbfgs_direction(g, S, Y, Rho, Val(M), k)
+            if dot(g, dir) >= zero(T)
+                dir = -g
+                k = 0
+            end
+            ls_sol = SciMLBase.solve!(ls_cache, x, dir)
+            α = T(ls_sol.step_size)
+            ok = ls_sol.retcode == ReturnCode.Success
+            if ok && isfinite(α) && α > zero(T)
+                xn = x + α * dir
+                gn = grad_f(xn, p)
+                if all(isfinite, gn)
+                    s = xn - x
+                    y = gn - g
+                    sy = dot(s, y)
+                    if sy > T(1.0e-10) && isfinite(one(T) / sy)
+                        k += 1
+                        ii = mod1(k, M)
+                        S = Base.setindex(S, s, ii)
+                        Y = Base.setindex(Y, y, ii)
+                        Rho = Base.setindex(Rho, one(T) / sy, ii)
+                    else
+                        k = 0
+                    end
+                    x = xn
+                    g = gn
+                else
+                    active = false
+                end
+            else
+                active = false
+            end
+        end
+    end
+    @inbounds result[i] = x
 end
 
+"Hybrid PSO solve: global PSO exploration followed by per-particle local L-BFGS refinement."
 function SciMLBase.solve!(
         cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...;
-        abstol = nothing,
-        reltol = nothing,
-        maxiters = 100, local_maxiters = 10, kwargs...
-    ) where {
-        Backend, LocalOpt <: Union{LBFGS, BFGS},
-    }
-    pso_cache = cache.pso_cache
-
-    sol_pso = solve!(pso_cache)
-    x0s = sol_pso.original
+        abstol = nothing, reltol = nothing, maxiters = 100, local_maxiters = 10,
+        linesearch = LineSearch.StrongWolfeLineSearch(), kwargs...
+    ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}}
 
-    backend = opt.backend
+    sol_pso = SciMLBase.solve!(cache.pso_cache; maxiters = maxiters)
 
+    orig_lb = cache.prob.lb
+    orig_ub = cache.prob.ub
     prob = remake(cache.prob, lb = nothing, ub = nothing)
 
+    x0s = sol_pso.original
+    raw_grad = instantiate_gradient(prob.f.f, prob.f.adtype)
+    grad_f = (θ, p) ->
+        _in_safe_box(θ, orig_lb, orig_ub) ? raw_grad(θ, p) : _huge_grad(θ)
+
     result = cache.start_points
     copyto!(result, x0s)
 
-    ∇f = instantiate_gradient(prob.f.f, prob.f.adtype)
-
-    kernel = simplebfgs_run!(backend)
-    nlprob = SimpleNonlinearSolve.ImmutableNonlinearProblem{false}(∇f, prob.u0, prob.p)
+    T = eltype(prob.u0)
+    D = length(prob.u0)
+    M_val = opt.local_opt isa LBFGS ? Val(min(opt.local_opt.threshold, D)) : Val(D)
 
-    nlalg = opt.local_opt isa LBFGS ?
-        SimpleLimitedMemoryBroyden(;
-            threshold = opt.local_opt.threshold,
-            linesearch = Val(true)
-        ) : SimpleBroyden(; linesearch = Val(true))
+    # Construct the line-search cache directly: `LineSearch.init` would route
+    # through `SciMLBase.init`'s `promote_u0`, which dynamic-dispatches on GPU.
+    ls_cache = LineSearch.StaticStrongWolfeLineSearchCache(
+        grad_f, grad_f, prob.p,
+        T(linesearch.c1), T(linesearch.c2),
+        T(linesearch.α_init), T(linesearch.α_max),
+        linesearch.maxiters, linesearch.zoom_maxiters
+    )
 
     t0 = time()
+    kernel = lbfgs_run!(opt.backend)
     kernel(
-        nlprob,
-        x0s,
-        result,
-        nlalg,
-        local_maxiters,
-        abstol,
-        reltol;
+        grad_f, prob.p, x0s, result, ls_cache, M_val, local_maxiters;
         ndrange = length(x0s)
     )
+    KernelAbstractions.synchronize(opt.backend)
 
-    sol_bfgs = (x -> prob.f(x, prob.p)).(result)
-    sol_bfgs = (x -> isnan(x) ? convert(eltype(prob.u0), Inf) : x).(sol_bfgs)
-
-    minobj, ind = findmin(sol_bfgs)
-    sol_u,
-        sol_obj = minobj > sol_pso.objective ? (sol_pso.u, sol_pso.objective) :
-        (view(result, ind), minobj)
-    t1 = time()
-
-    # @show sol_pso.stats.time
+    Tobj = eltype(prob.u0)
+    result_fx = map(eachindex(result)) do i
+        x = result[i]
+        _in_safe_box(x, orig_lb, orig_ub) || return convert(Tobj, Inf)
+        v = prob.f(x, prob.p)
+        isnan(v) ? convert(Tobj, Inf) : v
+    end
 
-    solve_time = (t1 - t0) + sol_pso.stats.time
+    minobj, ind = findmin(result_fx)
+    best_obj = sol_pso.objective isa Real ? sol_pso.objective : sol_pso.objective[]
+    best_u, best_obj = minobj > best_obj ? (sol_pso.u, best_obj) : (result[ind], minobj)
 
+    solve_time = (time() - t0) + sol_pso.stats.time
     return SciMLBase.build_solution(
-        SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
-        sol_u, sol_obj,
+        SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, best_u, best_obj;
         stats = Optimization.OptimizationStats(; time = solve_time)
     )
 end