From a5eca5d00489d5e7cf8cc416083d88211725da16 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Mon, 9 Mar 2026 15:44:15 +0000 Subject: [PATCH 01/20] fix reading field in python scope avoiding gpu-cpu sync --- genesis/engine/solvers/rigid/rigid_solver.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/genesis/engine/solvers/rigid/rigid_solver.py b/genesis/engine/solvers/rigid/rigid_solver.py index f6d32eb6f..f1a0fb94f 100644 --- a/genesis/engine/solvers/rigid/rigid_solver.py +++ b/genesis/engine/solvers/rigid/rigid_solver.py @@ -440,6 +440,9 @@ def _create_data_manager(self): self._errno = self.data_manager.errno self._rigid_global_info = self.data_manager.rigid_global_info + self._rigid_global_info._n_iterations = ( + self._options.iterations + ) # Python-native mirror to avoid CPU-GPU sync in Python-scope functions self._rigid_adjoint_cache = self.data_manager.rigid_adjoint_cache if self._use_hibernation: self.n_awake_dofs = self._rigid_global_info.n_awake_dofs From b786ecc868c146087f1e0cc940333a8e0bc40c1f Mon Sep 17 00:00:00 2001 From: Mingrui Date: Mon, 9 Mar 2026 16:29:05 +0000 Subject: [PATCH 02/20] update --- genesis/engine/solvers/rigid/rigid_solver.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/genesis/engine/solvers/rigid/rigid_solver.py b/genesis/engine/solvers/rigid/rigid_solver.py index f1a0fb94f..f6d32eb6f 100644 --- a/genesis/engine/solvers/rigid/rigid_solver.py +++ b/genesis/engine/solvers/rigid/rigid_solver.py @@ -440,9 +440,6 @@ def _create_data_manager(self): self._errno = self.data_manager.errno self._rigid_global_info = self.data_manager.rigid_global_info - self._rigid_global_info._n_iterations = ( - self._options.iterations - ) # Python-native mirror to avoid CPU-GPU sync in Python-scope functions self._rigid_adjoint_cache = self.data_manager.rigid_adjoint_cache if self._use_hibernation: self.n_awake_dofs = self._rigid_global_info.n_awake_dofs From 3f1737e4023d0f10cbf13dbbc2bc79290c5858d5 Mon Sep 17 00:00:00 2001 From: Mingrui Date: 
Mon, 9 Mar 2026 18:10:05 +0000 Subject: [PATCH 03/20] Replace sequential linesearch with parallel multi-kernel linesearch Replace the sequential per-env linesearch (_kernel_linesearch) with a parallel linesearch pipeline using 6 specialized kernels: - _kernel_parallel_linesearch_mv: mv = M @ search, ndrange(dof, env) - _kernel_parallel_linesearch_jv: jv = J @ search, ndrange(constraint, env) - _kernel_parallel_linesearch_p0: fused snorm/quad_gauss/eq_sum/p0_cost with shared memory reductions - _kernel_parallel_linesearch_eval: K=16 log-spaced candidates evaluated in parallel with shared memory argmin - _kernel_parallel_linesearch_apply_alpha_dofs: apply best alpha to qacc/Ma - _kernel_parallel_linesearch_apply_alpha_constraints: apply best alpha to Jaref Also includes decomposed update_constraint (3 kernels) in the iteration loop. Additional changes: - Add dofs_info to func_solve_body dispatch signature - Add _log_scale helper function to solver.py - Exclude requires_grad from decomposed path (parallel LS is sensitive to FP precision) - Update test_grad.py to pass dofs_info Co-Authored-By: Claude Opus 4.6 --- .../engine/solvers/rigid/constraint/solver.py | 9 + .../rigid/constraint/solver_breakdown.py | 484 ++++++++++++++++-- tests/test_grad.py | 1 + 3 files changed, 454 insertions(+), 40 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver.py b/genesis/engine/solvers/rigid/constraint/solver.py index 986a8c1d4..d0a42e2d4 100644 --- a/genesis/engine/solvers/rigid/constraint/solver.py +++ b/genesis/engine/solvers/rigid/constraint/solver.py @@ -219,6 +219,7 @@ def resolve(self): func_solve_body( self._solver.entities_info, + self._solver.dofs_info, self._solver.dofs_state, self.constraint_state, self._solver._rigid_global_info, @@ -2354,6 +2355,12 @@ def update_bracket_no_eval_local( return flag, p_alpha, p_cost, p_grad, p_hess, p_next_alpha +@qd.func +def _log_scale(min_value: gs.qd_float, max_value: gs.qd_float, num_values: qd.i32, i: qd.i32) -> 
gs.qd_float: + step = (qd.log(max_value) - qd.log(min_value)) / qd.max(1.0, gs.qd_float(num_values - 1)) + return qd.exp(qd.log(min_value) + gs.qd_float(i) * step) + + @qd.func def func_linesearch_and_apply_alpha( i_b, @@ -3105,6 +3112,7 @@ def func_solve_iter( ) def func_solve_body( entities_info: array_class.EntitiesInfo, + dofs_info: array_class.DofsInfo, dofs_state: array_class.DofsState, constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, @@ -3117,6 +3125,7 @@ def func_solve_body( @qd.kernel(fastcache=gs.use_fastcache) def func_solve_body_monolith( entities_info: array_class.EntitiesInfo, + dofs_info: array_class.DofsInfo, dofs_state: array_class.DofsState, constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index e5cb7f655..a05cb2242 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -1,62 +1,429 @@ -import quadrants as ti +import quadrants as qd import genesis as gs import genesis.utils.array_class as array_class from genesis.engine.solvers.rigid.constraint import solver +LS_PARALLEL_K = 16 +LS_PARALLEL_MIN_STEP = 1e-6 +LS_PARALLEL_N_REFINE = 1 # number of successive refinement passes in parallel linesearch +_P0_BLOCK = 32 +_JV_BLOCK = 32 -@ti.kernel(fastcache=gs.use_fastcache) -def _kernel_linesearch( + +@qd.kernel(fastcache=gs.use_fastcache) +def _kernel_parallel_linesearch_mv( + dofs_info: array_class.DofsInfo, entities_info: array_class.EntitiesInfo, + constraint_state: array_class.ConstraintState, + rigid_global_info: array_class.RigidGlobalInfo, + static_rigid_sim_config: qd.template(), +): + """Compute mv = M @ search, parallelized over (dof, env). 
+ + Uses per-dof entity lookup to find the entity block boundaries, giving n_dofs * B + threads (each computing a single ~6-element dot product) instead of n_entities * B + threads (each computing the full block matvec). + """ + n_dofs = constraint_state.search.shape[0] + _B = constraint_state.grad.shape[1] + + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) + for i_d1, i_b in qd.ndrange(n_dofs, _B): + if constraint_state.n_constraints[i_b] > 0: + I_d1 = [i_d1, i_b] if qd.static(static_rigid_sim_config.batch_dofs_info) else i_d1 + i_e = dofs_info.entity_idx[I_d1] + mv = gs.qd_float(0.0) + for i_d2 in range(entities_info.dof_start[i_e], entities_info.dof_end[i_e]): + mv = mv + rigid_global_info.mass_mat[i_d1, i_d2, i_b] * constraint_state.search[i_d2, i_b] + constraint_state.mv[i_d1, i_b] = mv + + +@qd.kernel(fastcache=gs.use_fastcache) +def _kernel_parallel_linesearch_jv( + constraint_state: array_class.ConstraintState, + static_rigid_sim_config: qd.template(), +): + """Compute jv = J @ search, parallelized over (constraint, env).""" + n_dofs = constraint_state.search.shape[0] + len_constraints = constraint_state.jac.shape[0] + _B = constraint_state.grad.shape[1] + + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) + for i_c, i_b in qd.ndrange(len_constraints, _B): + if i_c < constraint_state.n_constraints[i_b]: + jv = gs.qd_float(0.0) + if qd.static(static_rigid_sim_config.sparse_solve): + for i_d_ in range(constraint_state.jac_n_relevant_dofs[i_c, i_b]): + i_d = constraint_state.jac_relevant_dofs[i_c, i_d_, i_b] + jv = jv + constraint_state.jac[i_c, i_d, i_b] * constraint_state.search[i_d, i_b] + else: + for i_d in range(n_dofs): + jv = jv + constraint_state.jac[i_c, i_d, i_b] * constraint_state.search[i_d, i_b] + constraint_state.jv[i_c, i_b] = jv + + +@qd.kernel(fastcache=gs.use_fastcache) +def _kernel_parallel_linesearch_p0( dofs_state: array_class.DofsState, constraint_state: 
array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): + """Snorm check, quad_gauss, eq_sum, and p0_cost. T threads per env with shared memory reductions. + + Phase 1: Fused snorm + quad_gauss parallel reduction over n_dofs (Options A+B). + Phase 2: Parallel reduction over n_constraints for eq_sum and p0_cost. + """ _B = constraint_state.grad.shape[1] - ti.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) - for i_b in range(_B): + _T = qd.static(_P0_BLOCK) + + qd.loop_config(block_dim=_T) + for i_flat in range(_B * _T): + tid = i_flat % _T + i_b = i_flat // _T + + # 6 shared arrays for parallel reductions (reused across phases) + sh_snorm_sq = qd.simt.block.SharedArray((_T,), gs.qd_float) + sh_qg_grad = qd.simt.block.SharedArray((_T,), gs.qd_float) + sh_qg_hess = qd.simt.block.SharedArray((_T,), gs.qd_float) + sh_p0_cost = qd.simt.block.SharedArray((_T,), gs.qd_float) + sh_constraint_grad = qd.simt.block.SharedArray((_T,), gs.qd_float) + sh_constraint_hess = qd.simt.block.SharedArray((_T,), gs.qd_float) + + if constraint_state.n_constraints[i_b] > 0: + n_dofs = constraint_state.search.shape[0] + + # === Phase 1: Fused snorm + quad_gauss, parallel over n_dofs === + local_snorm_sq = gs.qd_float(0.0) + local_qg_grad = gs.qd_float(0.0) + local_qg_hess = gs.qd_float(0.0) + + i_d = tid + while i_d < n_dofs: + s = constraint_state.search[i_d, i_b] + local_snorm_sq += s * s + local_qg_grad += s * constraint_state.Ma[i_d, i_b] - s * dofs_state.force[i_d, i_b] + local_qg_hess += 0.5 * s * constraint_state.mv[i_d, i_b] + i_d += _T + + sh_snorm_sq[tid] = local_snorm_sq + sh_qg_grad[tid] = local_qg_grad + sh_qg_hess[tid] = local_qg_hess + + qd.simt.block.sync() + + # Tree reduction for 3 accumulators + stride = _T // 2 + while stride > 0: + if tid < stride: + sh_snorm_sq[tid] += sh_snorm_sq[tid + stride] + sh_qg_grad[tid] += 
sh_qg_grad[tid + stride] + sh_qg_hess[tid] += sh_qg_hess[tid + stride] + qd.simt.block.sync() + stride //= 2 + + # All threads read the reduced snorm + snorm = qd.sqrt(sh_snorm_sq[0]) + + if snorm < rigid_global_info.EPS[None]: + # Converged — only thread 0 writes + if tid == 0: + constraint_state.candidates[0, i_b] = 0.0 + constraint_state.candidates[1, i_b] = 0.0 + constraint_state.improved[i_b] = False + else: + # Thread 0 writes quad_gauss to global memory + if tid == 0: + constraint_state.improved[i_b] = True + constraint_state.quad_gauss[0, i_b] = constraint_state.gauss[i_b] + constraint_state.quad_gauss[1, i_b] = sh_qg_grad[0] + constraint_state.quad_gauss[2, i_b] = sh_qg_hess[0] + + # === Phase 2: Constraint cost, parallel over n_constraints === + ne = constraint_state.n_constraints_equality[i_b] + nef = ne + constraint_state.n_constraints_frictionloss[i_b] + n_con = constraint_state.n_constraints[i_b] + + local_eq_cost = gs.qd_float(0.0) + local_eq_grad = gs.qd_float(0.0) + local_eq_hess = gs.qd_float(0.0) + local_p0_cost = gs.qd_float(0.0) + local_constraint_grad = gs.qd_float(0.0) + local_constraint_hess = gs.qd_float(0.0) + + i_c = tid + while i_c < n_con: + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + qf_0 = D * (0.5 * Jaref_c * Jaref_c) + qf_1 = D * (jv_c * Jaref_c) + qf_2 = D * (0.5 * jv_c * jv_c) + + if i_c < ne: + # Equality: always active + local_eq_cost += qf_0 + local_eq_grad += qf_1 + local_eq_hess += qf_2 + local_p0_cost += qf_0 + local_constraint_grad += qf_1 + local_constraint_hess += qf_2 + elif i_c < nef: + # Friction: check linear regime at alpha=0 + f = constraint_state.efc_frictionloss[i_c, i_b] + r = constraint_state.diag[i_c, i_b] + rf = r * f + linear_neg = Jaref_c <= -rf + linear_pos = Jaref_c >= rf + if linear_neg or linear_pos: + qf_0 = linear_neg * f * (-0.5 * rf - Jaref_c) + linear_pos * f * (-0.5 * rf + Jaref_c) + qf_1 = linear_neg * (-f * jv_c) + 
linear_pos * (f * jv_c) + qf_2 = 0.0 + local_p0_cost += qf_0 + local_constraint_grad += qf_1 + local_constraint_hess += qf_2 + else: + # Contact: active if Jaref < 0 + active = Jaref_c < 0 + local_p0_cost += qf_0 * active + local_constraint_grad += qf_1 * active + local_constraint_hess += qf_2 * active + + i_c += _T + + # Reuse shared arrays for Phase 2 reduction + sh_snorm_sq[tid] = local_eq_cost + sh_qg_grad[tid] = local_eq_grad + sh_qg_hess[tid] = local_eq_hess + sh_p0_cost[tid] = local_p0_cost + sh_constraint_grad[tid] = local_constraint_grad + sh_constraint_hess[tid] = local_constraint_hess + + qd.simt.block.sync() + + # Tree reduction for 6 accumulators + stride = _T // 2 + while stride > 0: + if tid < stride: + sh_snorm_sq[tid] += sh_snorm_sq[tid + stride] + sh_qg_grad[tid] += sh_qg_grad[tid + stride] + sh_qg_hess[tid] += sh_qg_hess[tid + stride] + sh_p0_cost[tid] += sh_p0_cost[tid + stride] + sh_constraint_grad[tid] += sh_constraint_grad[tid + stride] + sh_constraint_hess[tid] += sh_constraint_hess[tid + stride] + qd.simt.block.sync() + stride //= 2 + + if tid == 0: + constraint_state.eq_sum[0, i_b] = sh_snorm_sq[0] + constraint_state.eq_sum[1, i_b] = sh_qg_grad[0] + constraint_state.eq_sum[2, i_b] = sh_qg_hess[0] + constraint_state.ls_it[i_b] = 1 + constraint_state.candidates[1, i_b] = constraint_state.gauss[i_b] + sh_p0_cost[0] + # Initialize best alpha, search range, and best-cost tracker for parallel linesearch + constraint_state.candidates[0, i_b] = 0.0 # default: no step + + # Use full Newton step (DOF + all constraints) as the range center. 
+ total_hess = 2.0 * (constraint_state.quad_gauss[2, i_b] + sh_constraint_hess[0]) + if total_hess > 0.0: + total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] + alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(LS_PARALLEL_MIN_STEP)) + constraint_state.candidates[2, i_b] = alpha_newton * 1e-2 + constraint_state.candidates[3, i_b] = alpha_newton * 1e2 + else: + constraint_state.candidates[2, i_b] = 1e-6 + constraint_state.candidates[3, i_b] = 1e2 + constraint_state.candidates[4, i_b] = gs.qd_float(1e30) # best cost across passes + + +@qd.kernel(fastcache=gs.use_fastcache) +def _kernel_parallel_linesearch_eval( + constraint_state: array_class.ConstraintState, + rigid_global_info: array_class.RigidGlobalInfo, + static_rigid_sim_config: qd.template(), +): + """Evaluate K candidate alphas in parallel per env, pick the best via reduction. + + Reads the search range from candidates[2] (lo) and candidates[3] (hi). + Writes narrowed range back to candidates[2,3] for successive refinement. 
+ """ + _B = constraint_state.grad.shape[1] + _K = qd.static(LS_PARALLEL_K) + + qd.loop_config(block_dim=_K) + for i_flat in range(_B * _K): + tid = i_flat % _K + i_b = i_flat // _K + + # Shared memory for argmin reduction + sh_cost = qd.simt.block.SharedArray((_K,), gs.qd_float) + sh_idx = qd.simt.block.SharedArray((_K,), qd.i32) + if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: - solver.func_linesearch_and_apply_alpha( - i_b, - entities_info=entities_info, - dofs_state=dofs_state, - rigid_global_info=rigid_global_info, - constraint_state=constraint_state, - static_rigid_sim_config=static_rigid_sim_config, + ne = constraint_state.n_constraints_equality[i_b] + nef = ne + constraint_state.n_constraints_frictionloss[i_b] + n_con = constraint_state.n_constraints[i_b] + + lo = constraint_state.candidates[2, i_b] + hi = constraint_state.candidates[3, i_b] + + # Generate log-spaced alpha within [lo, hi] + alpha = solver._log_scale(lo, hi, _K, tid) + + # Evaluate cost at this alpha + cost = ( + alpha * alpha * constraint_state.quad_gauss[2, i_b] + + alpha * constraint_state.quad_gauss[1, i_b] + + constraint_state.quad_gauss[0, i_b] ) + + # Equality constraints (always active) - use eq_sum precomputed during init + cost = ( + cost + + alpha * alpha * constraint_state.eq_sum[2, i_b] + + alpha * constraint_state.eq_sum[1, i_b] + + constraint_state.eq_sum[0, i_b] + ) + + # Friction constraints + for i_c in range(ne, nef): + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + f = constraint_state.efc_frictionloss[i_c, i_b] + r = constraint_state.diag[i_c, i_b] + x = Jaref_c + alpha * jv_c + rf = r * f + linear_neg = x <= -rf + linear_pos = x >= rf + if linear_neg or linear_pos: + cost = cost + linear_neg * f * (-0.5 * rf - Jaref_c - alpha * jv_c) + cost = cost + linear_pos * f * (-0.5 * rf + Jaref_c + alpha * jv_c) + else: + cost = cost + D * 0.5 * x * x + + # Contact 
constraints (active if x < 0) + for i_c in range(nef, n_con): + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + x = Jaref_c + alpha * jv_c + if x < 0: + cost += D * 0.5 * x * x + + sh_cost[tid] = cost + sh_idx[tid] = tid else: - constraint_state.improved[i_b] = False + sh_cost[tid] = gs.qd_float(1e30) + sh_idx[tid] = tid + + qd.simt.block.sync() + + # Tree reduction for argmin + stride = _K // 2 + while stride > 0: + if tid < stride: + if sh_cost[tid + stride] < sh_cost[tid]: + sh_cost[tid] = sh_cost[tid + stride] + sh_idx[tid] = sh_idx[tid + stride] + qd.simt.block.sync() + stride = stride // 2 + + # Thread 0: acceptance check, write result, and narrow range for next pass + if tid == 0: + if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: + p0_cost = constraint_state.candidates[1, i_b] + best_tid = sh_idx[0] + best_cost = sh_cost[0] + lo = constraint_state.candidates[2, i_b] + hi = constraint_state.candidates[3, i_b] + best_alpha = solver._log_scale(lo, hi, _K, best_tid) + + # Only update best alpha if this pass improved over ALL previous passes + best_cost_prev = constraint_state.candidates[4, i_b] + if best_cost < p0_cost and best_cost < best_cost_prev: + constraint_state.candidates[0, i_b] = best_alpha + constraint_state.candidates[4, i_b] = best_cost + + # Narrow range around accepted point for next refinement pass + lo_idx = qd.max(0, best_tid - 1) + hi_idx = qd.min(_K - 1, best_tid + 1) + constraint_state.candidates[2, i_b] = solver._log_scale(lo, hi, _K, lo_idx) + constraint_state.candidates[3, i_b] = solver._log_scale(lo, hi, _K, hi_idx) + else: + constraint_state.candidates[0, i_b] = 0.0 + + +@qd.kernel(fastcache=gs.use_fastcache) +def _kernel_parallel_linesearch_apply_alpha_dofs( + constraint_state: array_class.ConstraintState, + rigid_global_info: array_class.RigidGlobalInfo, + static_rigid_sim_config: qd.template(), +): + """Apply best alpha to qacc 
and Ma, parallelized over (dof, env).""" + n_dofs = constraint_state.qacc.shape[0] + _B = constraint_state.grad.shape[1] + + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) + for i_d, i_b in qd.ndrange(n_dofs, _B): + if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: + alpha = constraint_state.candidates[0, i_b] + if qd.abs(alpha) < rigid_global_info.EPS[None]: + if i_d == 0: + constraint_state.improved[i_b] = False + else: + constraint_state.qacc[i_d, i_b] += constraint_state.search[i_d, i_b] * alpha + constraint_state.Ma[i_d, i_b] += constraint_state.mv[i_d, i_b] * alpha -@ti.kernel(fastcache=gs.use_fastcache) +@qd.kernel(fastcache=gs.use_fastcache) +def _kernel_parallel_linesearch_apply_alpha_constraints( + constraint_state: array_class.ConstraintState, + static_rigid_sim_config: qd.template(), +): + """Apply best alpha to Jaref, parallelized over (constraint, env).""" + len_constraints = constraint_state.Jaref.shape[0] + _B = constraint_state.grad.shape[1] + + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) + for i_c, i_b in qd.ndrange(len_constraints, _B): + if i_c < constraint_state.n_constraints[i_b] and constraint_state.improved[i_b]: + alpha = constraint_state.candidates[0, i_b] + constraint_state.Jaref[i_c, i_b] += constraint_state.jv[i_c, i_b] * alpha + + +# ============================================== Shared iteration kernels ============================================== + + +@qd.kernel(fastcache=gs.use_fastcache) def _kernel_cg_only_save_prev_grad( constraint_state: array_class.ConstraintState, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): """Save prev_grad and prev_Mgrad (CG only)""" _B = constraint_state.grad.shape[1] - ti.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) for 
i_b in range(_B): if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: solver.func_save_prev_grad(i_b, constraint_state=constraint_state) -@ti.kernel(fastcache=gs.use_fastcache) +@qd.kernel(fastcache=gs.use_fastcache) def _kernel_update_constraint_forces( constraint_state: array_class.ConstraintState, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): """Compute active flags and efc_force, parallelized over (constraint, env).""" len_constraints = constraint_state.active.shape[0] _B = constraint_state.grad.shape[1] - for i_c, i_b in ti.ndrange(len_constraints, _B): + for i_c, i_b in qd.ndrange(len_constraints, _B): if i_c < constraint_state.n_constraints[i_b] and constraint_state.improved[i_b]: ne = constraint_state.n_constraints_equality[i_b] nef = ne + constraint_state.n_constraints_frictionloss[i_b] - if ti.static(static_rigid_sim_config.solver_type == gs.constraint_solver.Newton): + if qd.static(static_rigid_sim_config.solver_type == gs.constraint_solver.Newton): constraint_state.prev_active[i_c, i_b] = constraint_state.active[i_c, i_b] constraint_state.active[i_c, i_b] = True @@ -78,16 +445,16 @@ def _kernel_update_constraint_forces( ) -@ti.kernel(fastcache=gs.use_fastcache) +@qd.kernel(fastcache=gs.use_fastcache) def _kernel_update_constraint_qfrc( constraint_state: array_class.ConstraintState, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): """Compute qfrc_constraint = J^T @ efc_force, parallelized over (dof, env).""" n_dofs = constraint_state.qfrc_constraint.shape[0] _B = constraint_state.grad.shape[1] - for i_d, i_b in ti.ndrange(n_dofs, _B): + for i_d, i_b in qd.ndrange(n_dofs, _B): if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: n_con = constraint_state.n_constraints[i_b] qfrc = gs.qd_float(0.0) @@ -96,16 +463,16 @@ def _kernel_update_constraint_qfrc( constraint_state.qfrc_constraint[i_d, i_b] = qfrc 
-@ti.kernel(fastcache=gs.use_fastcache) +@qd.kernel(fastcache=gs.use_fastcache) def _kernel_update_constraint_cost( dofs_state: array_class.DofsState, constraint_state: array_class.ConstraintState, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): """Compute gauss and cost (reductions over dofs and constraints). One thread per env.""" _B = constraint_state.grad.shape[1] - ti.loop_config(block_dim=32) + qd.loop_config(block_dim=32) for i_b in range(_B): if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: n_dofs = constraint_state.qfrc_constraint.shape[0] @@ -149,15 +516,15 @@ def _kernel_update_constraint_cost( constraint_state.cost[i_b] = cost_i -@ti.kernel(fastcache=gs.use_fastcache) +@qd.kernel(fastcache=gs.use_fastcache) def _kernel_newton_only_nt_hessian( constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): """Step 4: Newton Hessian update (Newton only)""" solver.func_hessian_direct_tiled(constraint_state=constraint_state, rigid_global_info=rigid_global_info) - if ti.static(static_rigid_sim_config.enable_tiled_cholesky_hessian): + if qd.static(static_rigid_sim_config.enable_tiled_cholesky_hessian): solver.func_cholesky_factor_direct_tiled( constraint_state=constraint_state, rigid_global_info=rigid_global_info, @@ -165,7 +532,7 @@ def _kernel_newton_only_nt_hessian( ) else: _B = constraint_state.jac.shape[2] - ti.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) for i_b in range(_B): if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: solver.func_cholesky_factor_direct_batch( @@ -173,17 +540,17 @@ def _kernel_newton_only_nt_hessian( ) -@ti.kernel(fastcache=gs.use_fastcache) 
+@qd.kernel(fastcache=gs.use_fastcache) def _kernel_update_gradient( entities_info: array_class.EntitiesInfo, dofs_state: array_class.DofsState, constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): """Step 5: Update gradient""" _B = constraint_state.grad.shape[1] - ti.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) for i_b in range(_B): if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: solver.func_update_gradient_batch( @@ -196,15 +563,15 @@ def _kernel_update_gradient( ) -@ti.kernel(fastcache=gs.use_fastcache) +@qd.kernel(fastcache=gs.use_fastcache) def _kernel_update_search_direction( constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): """Step 6: Check convergence and update search direction""" _B = constraint_state.grad.shape[1] - ti.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) for i_b in range(_B): if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: solver.func_terminate_or_update_descent_batch( @@ -215,9 +582,19 @@ def _kernel_update_search_direction( ) -@solver.func_solve_body.register(is_compatible=lambda *args, **kwargs: gs.backend in {gs.cuda}) +# ============================================== Solve body dispatch ================================================ + + +@solver.func_solve_body.register( + is_compatible=lambda *args, **kwargs: ( + # Note: we do not use parallel linesearch for finite difference gradient validation, as it is highly + # sensitive to numerical 
precision and GPU float64 rounding errors can accumulate over many trials. + gs.backend in {gs.cuda} and not (args[5] if len(args) > 5 else kwargs["static_rigid_sim_config"]).requires_grad + ) +) def func_solve_decomposed( entities_info, + dofs_info, dofs_state, constraint_state, rigid_global_info, @@ -228,17 +605,43 @@ def func_solve_decomposed( Uses separate kernels for each solver step per iteration. This maximizes kernel granularity, potentially allowing better GPU scheduling - and more flexibility in execution, at the cost of more Python→C++ boundary crossings. + and more flexibility in execution, at the cost of more Python->C++ boundary crossings. """ # _n_iterations is a Python-native int to avoid CPU-GPU sync (vs rigid_global_info.iterations[None]) for _it in range(_n_iterations): - _kernel_linesearch( + _kernel_parallel_linesearch_mv( + dofs_info, entities_info, + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) + _kernel_parallel_linesearch_jv( + constraint_state, + static_rigid_sim_config, + ) + _kernel_parallel_linesearch_p0( dofs_state, constraint_state, rigid_global_info, static_rigid_sim_config, ) + # Successive refinement: each pass narrows the search range around the best alpha. 
+ for _refine in range(LS_PARALLEL_N_REFINE): + _kernel_parallel_linesearch_eval( + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) + _kernel_parallel_linesearch_apply_alpha_dofs( + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) + _kernel_parallel_linesearch_apply_alpha_constraints( + constraint_state, + static_rigid_sim_config, + ) if static_rigid_sim_config.solver_type == gs.constraint_solver.CG: _kernel_cg_only_save_prev_grad( constraint_state, @@ -257,6 +660,7 @@ def func_solve_decomposed( constraint_state, static_rigid_sim_config, ) + if static_rigid_sim_config.solver_type == gs.constraint_solver.Newton: _kernel_newton_only_nt_hessian( constraint_state, diff --git a/tests/test_grad.py b/tests/test_grad.py index b637065a2..3c49a6b7f 100644 --- a/tests/test_grad.py +++ b/tests/test_grad.py @@ -263,6 +263,7 @@ def constraint_solver_resolve(): ) func_solve_body( entities_info=rigid_solver.entities_info, + dofs_info=rigid_solver.dofs_info, dofs_state=rigid_solver.dofs_state, constraint_state=constraint_solver.constraint_state, rigid_global_info=rigid_solver._rigid_global_info, From ae95f8c14612226239d718dbf22b0c2118dfafec Mon Sep 17 00:00:00 2001 From: Mingrui Date: Tue, 10 Mar 2026 00:04:35 +0000 Subject: [PATCH 04/20] update --- .../engine/solvers/rigid/constraint/solver.py | 6 ---- .../rigid/constraint/solver_breakdown.py | 31 ++++++++++++++++--- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver.py b/genesis/engine/solvers/rigid/constraint/solver.py index d0a42e2d4..51f7269c9 100644 --- a/genesis/engine/solvers/rigid/constraint/solver.py +++ b/genesis/engine/solvers/rigid/constraint/solver.py @@ -2355,12 +2355,6 @@ def update_bracket_no_eval_local( return flag, p_alpha, p_cost, p_grad, p_hess, p_next_alpha -@qd.func -def _log_scale(min_value: gs.qd_float, max_value: gs.qd_float, num_values: qd.i32, i: qd.i32) -> gs.qd_float: - step = (qd.log(max_value) - 
qd.log(min_value)) / qd.max(1.0, gs.qd_float(num_values - 1)) - return qd.exp(qd.log(min_value) + gs.qd_float(i) * step) - - @qd.func def func_linesearch_and_apply_alpha( i_b, diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index a05cb2242..8eafdde45 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -4,9 +4,25 @@ import genesis.utils.array_class as array_class from genesis.engine.solvers.rigid.constraint import solver +# --- Parallel linesearch constants --- +# Number of candidate step sizes evaluated simultaneously per env. +# Each CUDA block processes one env with K threads, using shared memory for the argmin reduction. +# Similar to BLOCK_DIM in func_hessian_direct_tiled: determines parallelism and shared memory layout. LS_PARALLEL_K = 16 + +# Floor for the Newton step estimate used to center the log-spaced search range. +# When |grad/hess| is near-zero the search range [alpha*1e-2, alpha*1e2] would collapse; +# this clamp keeps the range meaningful. The value is well below typical linesearch tolerances +# (ls_tolerance * tolerance ~ 1e-2 * 1e-8 for double, ~ 1e-2 * 1e-5 for float) so it never +# masks a genuinely small optimal step. LS_PARALLEL_MIN_STEP = 1e-6 -LS_PARALLEL_N_REFINE = 1 # number of successive refinement passes in parallel linesearch + +# Number of successive refinement passes: after picking the best of K candidates the search +# range is narrowed around the winner and re-evaluated. 1 pass (K=16 candidates) already +# gives sufficient resolution; increase for tighter convergence at the cost of more kernels. +LS_PARALLEL_N_REFINE = 1 + +# Block sizes for shared-memory reductions in _kernel_parallel_linesearch_p0 and _jv. 
_P0_BLOCK = 32 _JV_BLOCK = 32 @@ -269,7 +285,8 @@ def _kernel_parallel_linesearch_eval( hi = constraint_state.candidates[3, i_b] # Generate log-spaced alpha within [lo, hi] - alpha = solver._log_scale(lo, hi, _K, tid) + _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, gs.qd_float(_K - 1)) + alpha = qd.exp(qd.log(lo) + gs.qd_float(tid) * _step) # Evaluate cost at this alpha cost = ( @@ -338,7 +355,9 @@ def _kernel_parallel_linesearch_eval( best_cost = sh_cost[0] lo = constraint_state.candidates[2, i_b] hi = constraint_state.candidates[3, i_b] - best_alpha = solver._log_scale(lo, hi, _K, best_tid) + # Recover best alpha from log-spaced grid + _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, gs.qd_float(_K - 1)) + best_alpha = qd.exp(qd.log(lo) + gs.qd_float(best_tid) * _step) # Only update best alpha if this pass improved over ALL previous passes best_cost_prev = constraint_state.candidates[4, i_b] @@ -349,8 +368,10 @@ def _kernel_parallel_linesearch_eval( # Narrow range around accepted point for next refinement pass lo_idx = qd.max(0, best_tid - 1) hi_idx = qd.min(_K - 1, best_tid + 1) - constraint_state.candidates[2, i_b] = solver._log_scale(lo, hi, _K, lo_idx) - constraint_state.candidates[3, i_b] = solver._log_scale(lo, hi, _K, hi_idx) + # Narrow search range to neighbors of best candidate + _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, gs.qd_float(_K - 1)) + constraint_state.candidates[2, i_b] = qd.exp(qd.log(lo) + gs.qd_float(lo_idx) * _step) + constraint_state.candidates[3, i_b] = qd.exp(qd.log(lo) + gs.qd_float(hi_idx) * _step) else: constraint_state.candidates[0, i_b] = 0.0 From a09bef867ec9f29ed3d63393e12d8089ba26cffa Mon Sep 17 00:00:00 2001 From: Mingrui Date: Fri, 13 Mar 2026 16:15:38 +0000 Subject: [PATCH 05/20] update cast --- .../solvers/rigid/constraint/solver_breakdown.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py 
b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 8eafdde45..ae2aa3316 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -285,8 +285,8 @@ def _kernel_parallel_linesearch_eval( hi = constraint_state.candidates[3, i_b] # Generate log-spaced alpha within [lo, hi] - _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, gs.qd_float(_K - 1)) - alpha = qd.exp(qd.log(lo) + gs.qd_float(tid) * _step) + _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) + alpha = qd.exp(qd.log(lo) + qd.cast(tid, gs.qd_float) * _step) # Evaluate cost at this alpha cost = ( @@ -356,8 +356,8 @@ def _kernel_parallel_linesearch_eval( lo = constraint_state.candidates[2, i_b] hi = constraint_state.candidates[3, i_b] # Recover best alpha from log-spaced grid - _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, gs.qd_float(_K - 1)) - best_alpha = qd.exp(qd.log(lo) + gs.qd_float(best_tid) * _step) + _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) + best_alpha = qd.exp(qd.log(lo) + qd.cast(best_tid, gs.qd_float) * _step) # Only update best alpha if this pass improved over ALL previous passes best_cost_prev = constraint_state.candidates[4, i_b] @@ -369,9 +369,9 @@ def _kernel_parallel_linesearch_eval( lo_idx = qd.max(0, best_tid - 1) hi_idx = qd.min(_K - 1, best_tid + 1) # Narrow search range to neighbors of best candidate - _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, gs.qd_float(_K - 1)) - constraint_state.candidates[2, i_b] = qd.exp(qd.log(lo) + gs.qd_float(lo_idx) * _step) - constraint_state.candidates[3, i_b] = qd.exp(qd.log(lo) + gs.qd_float(hi_idx) * _step) + _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) + constraint_state.candidates[2, i_b] = qd.exp(qd.log(lo) + qd.cast(lo_idx, gs.qd_float) * _step) + constraint_state.candidates[3, i_b] = qd.exp(qd.log(lo) + qd.cast(hi_idx, gs.qd_float) * _step) else: 
constraint_state.candidates[0, i_b] = 0.0 From 30f2cf00dfd0ebe8ec9acb102a227eebbd042296 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Thu, 19 Mar 2026 00:48:39 +0000 Subject: [PATCH 06/20] test --- genesis/engine/solvers/rigid/constraint/solver.py | 2 +- tests/test_rigid_benchmarks.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver.py b/genesis/engine/solvers/rigid/constraint/solver.py index 51f7269c9..bffd2475e 100644 --- a/genesis/engine/solvers/rigid/constraint/solver.py +++ b/genesis/engine/solvers/rigid/constraint/solver.py @@ -3115,7 +3115,7 @@ def func_solve_body( ) -> None: ... -@func_solve_body.register(is_compatible=lambda *args, **kwargs: True) +@func_solve_body.register(is_compatible=lambda *args, **kwargs: False) @qd.kernel(fastcache=gs.use_fastcache) def func_solve_body_monolith( entities_info: array_class.EntitiesInfo, diff --git a/tests/test_rigid_benchmarks.py b/tests/test_rigid_benchmarks.py index e6fcf1698..0fee77bdb 100644 --- a/tests/test_rigid_benchmarks.py +++ b/tests/test_rigid_benchmarks.py @@ -725,7 +725,7 @@ def run_benchmark(step_fn, *, n_envs, meta): @pytest.fixture(scope="session") -def stream_writers(printer_session, request): +def stream_writers(request): report_path = Path(request.config.getoption("--speed-test-filepath")) # Delete old unrelated worker-specific reports @@ -746,7 +746,7 @@ def stream_writers(printer_session, request): report_path.unlink() fd = open(report_path, "w") - yield (lambda msg: print(msg, file=fd, flush=True), printer_session) + yield (lambda msg: print(msg, file=fd, flush=True), print) fd.close() From 008716dfb96f5480fc5ebabe642931a2fe897af6 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Sat, 21 Mar 2026 14:18:47 +0000 Subject: [PATCH 07/20] update --- .../rigid/constraint/solver_breakdown.py | 71 ++++++++----------- 1 file changed, 31 insertions(+), 40 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py 
b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index ae2aa3316..86dd99b60 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -8,7 +8,7 @@ # Number of candidate step sizes evaluated simultaneously per env. # Each CUDA block processes one env with K threads, using shared memory for the argmin reduction. # Similar to BLOCK_DIM in func_hessian_direct_tiled: determines parallelism and shared memory layout. -LS_PARALLEL_K = 16 +LS_PARALLEL_K = 32 # Floor for the Newton step estimate used to center the log-spaced search range. # When |grad/hess| is near-zero the search range [alpha*1e-2, alpha*1e2] would collapse; @@ -20,7 +20,7 @@ # Number of successive refinement passes: after picking the best of K candidates the search # range is narrowed around the winner and re-evaluated. 1 pass (K=16 candidates) already # gives sufficient resolution; increase for tighter convergence at the cost of more kernels. -LS_PARALLEL_N_REFINE = 1 +LS_PARALLEL_N_REFINE = 3 # Block sizes for shared-memory reductions in _kernel_parallel_linesearch_p0 and _jv. 
_P0_BLOCK = 32 @@ -46,7 +46,7 @@ def _kernel_parallel_linesearch_mv( qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) for i_d1, i_b in qd.ndrange(n_dofs, _B): - if constraint_state.n_constraints[i_b] > 0: + if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: I_d1 = [i_d1, i_b] if qd.static(static_rigid_sim_config.batch_dofs_info) else i_d1 i_e = dofs_info.entity_idx[I_d1] mv = gs.qd_float(0.0) @@ -67,7 +67,7 @@ def _kernel_parallel_linesearch_jv( qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) for i_c, i_b in qd.ndrange(len_constraints, _B): - if i_c < constraint_state.n_constraints[i_b]: + if i_c < constraint_state.n_constraints[i_b] and constraint_state.improved[i_b]: jv = gs.qd_float(0.0) if qd.static(static_rigid_sim_config.sparse_solve): for i_d_ in range(constraint_state.jac_n_relevant_dofs[i_c, i_b]): @@ -107,7 +107,7 @@ def _kernel_parallel_linesearch_p0( sh_constraint_grad = qd.simt.block.SharedArray((_T,), gs.qd_float) sh_constraint_hess = qd.simt.block.SharedArray((_T,), gs.qd_float) - if constraint_state.n_constraints[i_b] > 0: + if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: n_dofs = constraint_state.search.shape[0] # === Phase 1: Fused snorm + quad_gauss, parallel over n_dofs === @@ -151,7 +151,6 @@ def _kernel_parallel_linesearch_p0( else: # Thread 0 writes quad_gauss to global memory if tid == 0: - constraint_state.improved[i_b] = True constraint_state.quad_gauss[0, i_b] = constraint_state.gauss[i_b] constraint_state.quad_gauss[1, i_b] = sh_qg_grad[0] constraint_state.quad_gauss[2, i_b] = sh_qg_hess[0] @@ -237,20 +236,25 @@ def _kernel_parallel_linesearch_p0( constraint_state.eq_sum[2, i_b] = sh_qg_hess[0] constraint_state.ls_it[i_b] = 1 constraint_state.candidates[1, i_b] = constraint_state.gauss[i_b] + sh_p0_cost[0] - # Initialize best alpha, search range, and best-cost tracker for parallel linesearch - 
constraint_state.candidates[0, i_b] = 0.0 # default: no step - - # Use full Newton step (DOF + all constraints) as the range center. + # Use full Newton step as initial best guess + search range center. total_hess = 2.0 * (constraint_state.quad_gauss[2, i_b] + sh_constraint_hess[0]) + total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] + p0_cost_val = constraint_state.gauss[i_b] + sh_p0_cost[0] + if total_hess > 0.0: - total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(LS_PARALLEL_MIN_STEP)) + # Initialize with Newton step as the best alpha so far. + # Its quadratic-approximated cost is p0 - grad²/(2*hess), always < p0. + newton_cost = p0_cost_val - 0.5 * total_grad * total_grad / total_hess + constraint_state.candidates[0, i_b] = alpha_newton + constraint_state.candidates[4, i_b] = newton_cost constraint_state.candidates[2, i_b] = alpha_newton * 1e-2 - constraint_state.candidates[3, i_b] = alpha_newton * 1e2 + constraint_state.candidates[3, i_b] = alpha_newton * 10.0 else: + constraint_state.candidates[0, i_b] = 0.0 + constraint_state.candidates[4, i_b] = gs.qd_float(1e30) constraint_state.candidates[2, i_b] = 1e-6 constraint_state.candidates[3, i_b] = 1e2 - constraint_state.candidates[4, i_b] = gs.qd_float(1e30) # best cost across passes @qd.kernel(fastcache=gs.use_fastcache) @@ -377,41 +381,32 @@ def _kernel_parallel_linesearch_eval( @qd.kernel(fastcache=gs.use_fastcache) -def _kernel_parallel_linesearch_apply_alpha_dofs( +def _kernel_parallel_linesearch_apply_alpha( constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, static_rigid_sim_config: qd.template(), ): - """Apply best alpha to qacc and Ma, parallelized over (dof, env).""" + """Apply best alpha to qacc, Ma, and Jaref. 
Fuses dof and constraint updates.""" n_dofs = constraint_state.qacc.shape[0] + len_constraints = constraint_state.Jaref.shape[0] _B = constraint_state.grad.shape[1] + n_items = qd.max(n_dofs, len_constraints) qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) - for i_d, i_b in qd.ndrange(n_dofs, _B): + for i, i_b in qd.ndrange(n_items, _B): if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: alpha = constraint_state.candidates[0, i_b] if qd.abs(alpha) < rigid_global_info.EPS[None]: - if i_d == 0: + if i == 0: constraint_state.improved[i_b] = False else: - constraint_state.qacc[i_d, i_b] += constraint_state.search[i_d, i_b] * alpha - constraint_state.Ma[i_d, i_b] += constraint_state.mv[i_d, i_b] * alpha - - -@qd.kernel(fastcache=gs.use_fastcache) -def _kernel_parallel_linesearch_apply_alpha_constraints( - constraint_state: array_class.ConstraintState, - static_rigid_sim_config: qd.template(), -): - """Apply best alpha to Jaref, parallelized over (constraint, env).""" - len_constraints = constraint_state.Jaref.shape[0] - _B = constraint_state.grad.shape[1] - - qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) - for i_c, i_b in qd.ndrange(len_constraints, _B): - if i_c < constraint_state.n_constraints[i_b] and constraint_state.improved[i_b]: - alpha = constraint_state.candidates[0, i_b] - constraint_state.Jaref[i_c, i_b] += constraint_state.jv[i_c, i_b] * alpha + # Apply to dofs + if i < n_dofs: + constraint_state.qacc[i, i_b] += constraint_state.search[i, i_b] * alpha + constraint_state.Ma[i, i_b] += constraint_state.mv[i, i_b] * alpha + # Apply to constraints + if i < constraint_state.n_constraints[i_b]: + constraint_state.Jaref[i, i_b] += constraint_state.jv[i, i_b] * alpha # ============================================== Shared iteration kernels ============================================== @@ -654,15 +649,11 @@ def func_solve_decomposed( rigid_global_info, 
static_rigid_sim_config, ) - _kernel_parallel_linesearch_apply_alpha_dofs( + _kernel_parallel_linesearch_apply_alpha( constraint_state, rigid_global_info, static_rigid_sim_config, ) - _kernel_parallel_linesearch_apply_alpha_constraints( - constraint_state, - static_rigid_sim_config, - ) if static_rigid_sim_config.solver_type == gs.constraint_solver.CG: _kernel_cg_only_save_prev_grad( constraint_state, From e7cd2763c11cab5ae650be533c4b3ac21abaa958 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Sat, 21 Mar 2026 15:18:50 +0000 Subject: [PATCH 08/20] update --- genesis/engine/solvers/rigid/constraint/solver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver.py b/genesis/engine/solvers/rigid/constraint/solver.py index bffd2475e..51f7269c9 100644 --- a/genesis/engine/solvers/rigid/constraint/solver.py +++ b/genesis/engine/solvers/rigid/constraint/solver.py @@ -3115,7 +3115,7 @@ def func_solve_body( ) -> None: ... -@func_solve_body.register(is_compatible=lambda *args, **kwargs: False) +@func_solve_body.register(is_compatible=lambda *args, **kwargs: True) @qd.kernel(fastcache=gs.use_fastcache) def func_solve_body_monolith( entities_info: array_class.EntitiesInfo, From 54d0b3d522f63ee62b36cac532a4374cf15cd4d5 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Sat, 21 Mar 2026 15:19:55 +0000 Subject: [PATCH 09/20] update --- tests/test_rigid_benchmarks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_rigid_benchmarks.py b/tests/test_rigid_benchmarks.py index 0fee77bdb..e6fcf1698 100644 --- a/tests/test_rigid_benchmarks.py +++ b/tests/test_rigid_benchmarks.py @@ -725,7 +725,7 @@ def run_benchmark(step_fn, *, n_envs, meta): @pytest.fixture(scope="session") -def stream_writers(request): +def stream_writers(printer_session, request): report_path = Path(request.config.getoption("--speed-test-filepath")) # Delete old unrelated worker-specific reports @@ -746,7 +746,7 @@ def 
stream_writers(request): report_path.unlink() fd = open(report_path, "w") - yield (lambda msg: print(msg, file=fd, flush=True), print) + yield (lambda msg: print(msg, file=fd, flush=True), printer_session) fd.close() From c0546258e73791bdcfa788dad186db65a002d189 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Sun, 22 Mar 2026 21:00:44 +0000 Subject: [PATCH 10/20] safe newton init --- .../rigid/constraint/solver_breakdown.py | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 86dd99b60..76214ebf2 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -93,6 +93,7 @@ def _kernel_parallel_linesearch_p0( """ _B = constraint_state.grad.shape[1] _T = qd.static(_P0_BLOCK) + _LS_PARALLEL_MIN_STEP = qd.static(LS_PARALLEL_MIN_STEP) qd.loop_config(block_dim=_T) for i_flat in range(_B * _T): @@ -236,25 +237,22 @@ def _kernel_parallel_linesearch_p0( constraint_state.eq_sum[2, i_b] = sh_qg_hess[0] constraint_state.ls_it[i_b] = 1 constraint_state.candidates[1, i_b] = constraint_state.gauss[i_b] + sh_p0_cost[0] - # Use full Newton step as initial best guess + search range center. - total_hess = 2.0 * (constraint_state.quad_gauss[2, i_b] + sh_constraint_hess[0]) - total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] - p0_cost_val = constraint_state.gauss[i_b] + sh_p0_cost[0] + # Initialize best alpha, search range, and best-cost tracker for parallel linesearch + constraint_state.candidates[0, i_b] = 0.0 # default: no step + # Use full Newton step (DOF + all constraints) as the range center. 
+ total_hess = 2.0 * (constraint_state.quad_gauss[2, i_b] + sh_constraint_hess[0]) if total_hess > 0.0: - alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(LS_PARALLEL_MIN_STEP)) - # Initialize with Newton step as the best alpha so far. - # Its quadratic-approximated cost is p0 - grad²/(2*hess), always < p0. - newton_cost = p0_cost_val - 0.5 * total_grad * total_grad / total_hess - constraint_state.candidates[0, i_b] = alpha_newton - constraint_state.candidates[4, i_b] = newton_cost + total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] + alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(_LS_PARALLEL_MIN_STEP)) constraint_state.candidates[2, i_b] = alpha_newton * 1e-2 constraint_state.candidates[3, i_b] = alpha_newton * 10.0 + constraint_state.candidates[5, i_b] = alpha_newton # exact Newton step for eval else: - constraint_state.candidates[0, i_b] = 0.0 - constraint_state.candidates[4, i_b] = gs.qd_float(1e30) constraint_state.candidates[2, i_b] = 1e-6 constraint_state.candidates[3, i_b] = 1e2 + constraint_state.candidates[5, i_b] = 0.0 + constraint_state.candidates[4, i_b] = gs.qd_float(1e30) # best cost across passes @qd.kernel(fastcache=gs.use_fastcache) @@ -288,9 +286,15 @@ def _kernel_parallel_linesearch_eval( lo = constraint_state.candidates[2, i_b] hi = constraint_state.candidates[3, i_b] - # Generate log-spaced alpha within [lo, hi] + # Generate log-spaced alpha within [lo, hi]. + # Thread 0 evaluates the exact Newton step instead of the grid point at lo. + # This gives the Newton alpha a fair cost-based comparison with grid candidates. 
_step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) alpha = qd.exp(qd.log(lo) + qd.cast(tid, gs.qd_float) * _step) + if tid == 0: + alpha_newton_val = constraint_state.candidates[5, i_b] + if alpha_newton_val > 0.0: + alpha = alpha_newton_val # Evaluate cost at this alpha cost = ( @@ -359,9 +363,12 @@ def _kernel_parallel_linesearch_eval( best_cost = sh_cost[0] lo = constraint_state.candidates[2, i_b] hi = constraint_state.candidates[3, i_b] - # Recover best alpha from log-spaced grid + # Recover best alpha: thread 0 used Newton alpha, others used grid _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) best_alpha = qd.exp(qd.log(lo) + qd.cast(best_tid, gs.qd_float) * _step) + # If thread 0 (Newton candidate) won, use the exact Newton alpha + if best_tid == 0 and constraint_state.candidates[5, i_b] > 0.0: + best_alpha = constraint_state.candidates[5, i_b] # Only update best alpha if this pass improved over ALL previous passes best_cost_prev = constraint_state.candidates[4, i_b] From 2e56fec6964244a7d0f98720a63b49bef3615b43 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Mon, 23 Mar 2026 00:48:14 +0000 Subject: [PATCH 11/20] grad check and newton correction --- .../rigid/constraint/solver_breakdown.py | 62 +++++++++++++++++-- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 76214ebf2..68b381f98 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -20,7 +20,7 @@ # Number of successive refinement passes: after picking the best of K candidates the search # range is narrowed around the winner and re-evaluated. 1 pass (K=16 candidates) already # gives sufficient resolution; increase for tighter convergence at the cost of more kernels. 
-LS_PARALLEL_N_REFINE = 3 +LS_PARALLEL_N_REFINE = 1 # Block sizes for shared-memory reductions in _kernel_parallel_linesearch_p0 and _jv. _P0_BLOCK = 32 @@ -93,7 +93,6 @@ def _kernel_parallel_linesearch_p0( """ _B = constraint_state.grad.shape[1] _T = qd.static(_P0_BLOCK) - _LS_PARALLEL_MIN_STEP = qd.static(LS_PARALLEL_MIN_STEP) qd.loop_config(block_dim=_T) for i_flat in range(_B * _T): @@ -244,7 +243,7 @@ def _kernel_parallel_linesearch_p0( total_hess = 2.0 * (constraint_state.quad_gauss[2, i_b] + sh_constraint_hess[0]) if total_hess > 0.0: total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] - alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(_LS_PARALLEL_MIN_STEP)) + alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(LS_PARALLEL_MIN_STEP)) constraint_state.candidates[2, i_b] = alpha_newton * 1e-2 constraint_state.candidates[3, i_b] = alpha_newton * 10.0 constraint_state.candidates[5, i_b] = alpha_newton # exact Newton step for eval @@ -253,6 +252,12 @@ def _kernel_parallel_linesearch_p0( constraint_state.candidates[3, i_b] = 1e2 constraint_state.candidates[5, i_b] = 0.0 constraint_state.candidates[4, i_b] = gs.qd_float(1e30) # best cost across passes + # Store gtol for gradient check on last eval pass + n_dofs_val = constraint_state.search.shape[0] + scale = rigid_global_info.meaninertia[i_b] * qd.max(1, n_dofs_val) + constraint_state.candidates[7, i_b] = ( + rigid_global_info.tolerance[None] * rigid_global_info.ls_tolerance[None] * snorm * scale + ) @qd.kernel(fastcache=gs.use_fastcache) @@ -260,11 +265,14 @@ def _kernel_parallel_linesearch_eval( constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, static_rigid_sim_config: qd.template(), + _is_last_pass: qd.template(), ): """Evaluate K candidate alphas in parallel per env, pick the best via reduction. Reads the search range from candidates[2] (lo) and candidates[3] (hi). 
Writes narrowed range back to candidates[2,3] for successive refinement. + On the last pass, computes analytical gradient at best_alpha and does one + Newton correction step if |grad| > gtol. """ _B = constraint_state.grad.shape[1] _K = qd.static(LS_PARALLEL_K) @@ -383,6 +391,51 @@ def _kernel_parallel_linesearch_eval( _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) constraint_state.candidates[2, i_b] = qd.exp(qd.log(lo) + qd.cast(lo_idx, gs.qd_float) * _step) constraint_state.candidates[3, i_b] = qd.exp(qd.log(lo) + qd.cast(hi_idx, gs.qd_float) * _step) + + # On last pass: analytical gradient check + Newton correction + if qd.static(_is_last_pass): + final_alpha = constraint_state.candidates[0, i_b] + if qd.abs(final_alpha) > rigid_global_info.EPS[None]: + gtol = constraint_state.candidates[7, i_b] + ne_g = constraint_state.n_constraints_equality[i_b] + nef_g = ne_g + constraint_state.n_constraints_frictionloss[i_b] + n_con_g = constraint_state.n_constraints[i_b] + + # Accumulate gradient and hessian coefficients at final_alpha + g1 = constraint_state.quad_gauss[1, i_b] + constraint_state.eq_sum[1, i_b] + g2 = constraint_state.quad_gauss[2, i_b] + constraint_state.eq_sum[2, i_b] + + for i_cg in range(ne_g, nef_g): + Jaref_c = constraint_state.Jaref[i_cg, i_b] + jv_c = constraint_state.jv[i_cg, i_b] + D = constraint_state.efc_D[i_cg, i_b] + f_val = constraint_state.efc_frictionloss[i_cg, i_b] + r_val = constraint_state.diag[i_cg, i_b] + x = Jaref_c + final_alpha * jv_c + rf = r_val * f_val + if x <= -rf or x >= rf: + g1 += (x <= -rf) * (-f_val * jv_c) + (x >= rf) * (f_val * jv_c) + else: + g1 += D * jv_c * Jaref_c + g2 += D * 0.5 * jv_c * jv_c + + for i_cg in range(nef_g, n_con_g): + Jaref_c = constraint_state.Jaref[i_cg, i_b] + jv_c = constraint_state.jv[i_cg, i_b] + D = constraint_state.efc_D[i_cg, i_b] + x = Jaref_c + final_alpha * jv_c + if x < 0: + g1 += D * jv_c * Jaref_c + g2 += D * 0.5 * jv_c * jv_c + + grad_val = 2.0 * 
final_alpha * g2 + g1 + hess_val = 2.0 * g2 + + # Newton correction if gradient is not near zero + if qd.abs(grad_val) > gtol and hess_val > rigid_global_info.EPS[None]: + alpha_corrected = final_alpha - grad_val / hess_val + if alpha_corrected > 0.0: + constraint_state.candidates[0, i_b] = alpha_corrected else: constraint_state.candidates[0, i_b] = 0.0 @@ -649,12 +702,13 @@ def func_solve_decomposed( rigid_global_info, static_rigid_sim_config, ) - # Successive refinement: each pass narrows the search range around the best alpha. + # Successive refinement: last pass includes gradient check + Newton correction. for _refine in range(LS_PARALLEL_N_REFINE): _kernel_parallel_linesearch_eval( constraint_state, rigid_global_info, static_rigid_sim_config, + _refine == LS_PARALLEL_N_REFINE - 1, ) _kernel_parallel_linesearch_apply_alpha( constraint_state, From c37fb44f1cd9f16ef851bc012040ce2be03bf456 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Mon, 23 Mar 2026 01:41:41 +0000 Subject: [PATCH 12/20] add more newton correction --- .../rigid/constraint/solver_breakdown.py | 148 ++++++++++-------- 1 file changed, 86 insertions(+), 62 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 68b381f98..6c6313de1 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -17,15 +17,64 @@ # masks a genuinely small optimal step. LS_PARALLEL_MIN_STEP = 1e-6 -# Number of successive refinement passes: after picking the best of K candidates the search -# range is narrowed around the winner and re-evaluated. 1 pass (K=16 candidates) already -# gives sufficient resolution; increase for tighter convergence at the cost of more kernels. -LS_PARALLEL_N_REFINE = 1 - # Block sizes for shared-memory reductions in _kernel_parallel_linesearch_p0 and _jv. 
_P0_BLOCK = 32 _JV_BLOCK = 32 +# Maximum allowed alpha (prevents divergence from degenerate Newton steps) +LS_ALPHA_MAX = 1e4 +# Maximum bisection iterations (2^8 = 256x precision improvement) +LS_BISECT_STEPS = 8 +# Range expansion: max steps and growth factor +LS_EXPANSION_STEPS = 6 +LS_EXPANSION_FACTOR = 4.0 + + +@qd.func +def _ls_compute_grad_hess( + alpha, + i_b, + constraint_state: array_class.ConstraintState, +): + """Compute analytical gradient and hessian of the linesearch cost at alpha. + + Accumulates quadratic coefficients with alpha-dependent constraint activation. + Returns (grad, hess). + """ + ne = constraint_state.n_constraints_equality[i_b] + nef = ne + constraint_state.n_constraints_frictionloss[i_b] + n_con = constraint_state.n_constraints[i_b] + + g1 = constraint_state.quad_gauss[1, i_b] + constraint_state.eq_sum[1, i_b] + g2 = constraint_state.quad_gauss[2, i_b] + constraint_state.eq_sum[2, i_b] + + for i_c in range(ne, nef): + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + f_val = constraint_state.efc_frictionloss[i_c, i_b] + r_val = constraint_state.diag[i_c, i_b] + x = Jaref_c + alpha * jv_c + rf = r_val * f_val + if x <= -rf or x >= rf: + g1 += (x <= -rf) * (-f_val * jv_c) + (x >= rf) * (f_val * jv_c) + else: + g1 += D * jv_c * Jaref_c + g2 += D * 0.5 * jv_c * jv_c + + for i_c in range(nef, n_con): + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + x = Jaref_c + alpha * jv_c + if x < 0: + g1 += D * jv_c * Jaref_c + g2 += D * 0.5 * jv_c * jv_c + + grad = 2.0 * alpha * g2 + g1 + hess = 2.0 * g2 + return grad, hess + @qd.kernel(fastcache=gs.use_fastcache) def _kernel_parallel_linesearch_mv( @@ -265,13 +314,12 @@ def _kernel_parallel_linesearch_eval( constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, static_rigid_sim_config: qd.template(), - 
_is_last_pass: qd.template(), ): - """Evaluate K candidate alphas in parallel per env, pick the best via reduction. + """Evaluate K candidate alphas, pick the best, then apply gradient-guided refinement. - Reads the search range from candidates[2] (lo) and candidates[3] (hi). - Writes narrowed range back to candidates[2,3] for successive refinement. - On the last pass, computes analytical gradient at best_alpha and does one + Phase 1: K threads evaluate log-spaced candidates (thread 0 = Newton alpha). + Phase 2: Argmin reduction to find lowest-cost candidate. + Phase 3: Thread 0 computes analytical gradient at best_alpha and does one Newton correction step if |grad| > gtol. """ _B = constraint_state.grad.shape[1] @@ -392,50 +440,28 @@ def _kernel_parallel_linesearch_eval( constraint_state.candidates[2, i_b] = qd.exp(qd.log(lo) + qd.cast(lo_idx, gs.qd_float) * _step) constraint_state.candidates[3, i_b] = qd.exp(qd.log(lo) + qd.cast(hi_idx, gs.qd_float) * _step) - # On last pass: analytical gradient check + Newton correction - if qd.static(_is_last_pass): - final_alpha = constraint_state.candidates[0, i_b] - if qd.abs(final_alpha) > rigid_global_info.EPS[None]: - gtol = constraint_state.candidates[7, i_b] - ne_g = constraint_state.n_constraints_equality[i_b] - nef_g = ne_g + constraint_state.n_constraints_frictionloss[i_b] - n_con_g = constraint_state.n_constraints[i_b] - - # Accumulate gradient and hessian coefficients at final_alpha - g1 = constraint_state.quad_gauss[1, i_b] + constraint_state.eq_sum[1, i_b] - g2 = constraint_state.quad_gauss[2, i_b] + constraint_state.eq_sum[2, i_b] - - for i_cg in range(ne_g, nef_g): - Jaref_c = constraint_state.Jaref[i_cg, i_b] - jv_c = constraint_state.jv[i_cg, i_b] - D = constraint_state.efc_D[i_cg, i_b] - f_val = constraint_state.efc_frictionloss[i_cg, i_b] - r_val = constraint_state.diag[i_cg, i_b] - x = Jaref_c + final_alpha * jv_c - rf = r_val * f_val - if x <= -rf or x >= rf: - g1 += (x <= -rf) * (-f_val * jv_c) + (x >= 
rf) * (f_val * jv_c) - else: - g1 += D * jv_c * Jaref_c - g2 += D * 0.5 * jv_c * jv_c - - for i_cg in range(nef_g, n_con_g): - Jaref_c = constraint_state.Jaref[i_cg, i_b] - jv_c = constraint_state.jv[i_cg, i_b] - D = constraint_state.efc_D[i_cg, i_b] - x = Jaref_c + final_alpha * jv_c - if x < 0: - g1 += D * jv_c * Jaref_c - g2 += D * 0.5 * jv_c * jv_c - - grad_val = 2.0 * final_alpha * g2 + g1 - hess_val = 2.0 * g2 - - # Newton correction if gradient is not near zero - if qd.abs(grad_val) > gtol and hess_val > rigid_global_info.EPS[None]: - alpha_corrected = final_alpha - grad_val / hess_val - if alpha_corrected > 0.0: - constraint_state.candidates[0, i_b] = alpha_corrected + # --- Gradient-guided refinement (thread 0) --- + final_alpha = constraint_state.candidates[0, i_b] + gtol = constraint_state.candidates[7, i_b] + + if qd.abs(final_alpha) < rigid_global_info.EPS[None]: + # Branch A: grid found no improvement. Check grad(0). + grad_0, hess_0 = _ls_compute_grad_hess(gs.qd_float(0.0), i_b, constraint_state) + if grad_0 < -gtol and hess_0 > rigid_global_info.EPS[None]: + # Cost descending at alpha=0 → Newton step from 0 + alpha_n = -grad_0 / hess_0 + alpha_n = qd.min(alpha_n, gs.qd_float(LS_ALPHA_MAX)) + if alpha_n > 0.0: + constraint_state.candidates[0, i_b] = alpha_n + else: + # Branch B: improvement found. Gradient check + Newton correction. + grad_val, hess_val = _ls_compute_grad_hess(final_alpha, i_b, constraint_state) + + if qd.abs(grad_val) > gtol and hess_val > rigid_global_info.EPS[None]: + alpha_corrected = final_alpha - grad_val / hess_val + alpha_corrected = qd.min(qd.max(alpha_corrected, gs.qd_float(0.0)), gs.qd_float(LS_ALPHA_MAX)) + if alpha_corrected > 0.0: + constraint_state.candidates[0, i_b] = alpha_corrected else: constraint_state.candidates[0, i_b] = 0.0 @@ -702,14 +728,12 @@ def func_solve_decomposed( rigid_global_info, static_rigid_sim_config, ) - # Successive refinement: last pass includes gradient check + Newton correction. 
- for _refine in range(LS_PARALLEL_N_REFINE): - _kernel_parallel_linesearch_eval( - constraint_state, - rigid_global_info, - static_rigid_sim_config, - _refine == LS_PARALLEL_N_REFINE - 1, - ) + # Grid search + gradient-guided Newton correction (single pass) + _kernel_parallel_linesearch_eval( + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) _kernel_parallel_linesearch_apply_alpha( constraint_state, rigid_global_info, From 2ade80c26010e2452bf2f0d297279ee8ea8eb278 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Mon, 23 Mar 2026 12:15:26 +0000 Subject: [PATCH 13/20] fix quadrants constant --- .../engine/solvers/rigid/constraint/solver_breakdown.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 6c6313de1..4fff250f2 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -142,6 +142,7 @@ def _kernel_parallel_linesearch_p0( """ _B = constraint_state.grad.shape[1] _T = qd.static(_P0_BLOCK) + _LS_PARALLEL_MIN_STEP = qd.static(LS_PARALLEL_MIN_STEP) qd.loop_config(block_dim=_T) for i_flat in range(_B * _T): @@ -292,7 +293,7 @@ total_hess = 2.0 * (constraint_state.quad_gauss[2, i_b] + sh_constraint_hess[0]) if total_hess > 0.0: total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] - alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(LS_PARALLEL_MIN_STEP)) + alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(_LS_PARALLEL_MIN_STEP)) constraint_state.candidates[2, i_b] = alpha_newton * 1e-2 constraint_state.candidates[3, i_b] = alpha_newton * 10.0 constraint_state.candidates[5, i_b] = alpha_newton # exact Newton step for eval @@ -324,6 +325,7 @@ def _kernel_parallel_linesearch_eval( """ _B = constraint_state.grad.shape[1] _K =
qd.static(LS_PARALLEL_K) + _LS_ALPHA_MAX = qd.static(LS_ALPHA_MAX) qd.loop_config(block_dim=_K) for i_flat in range(_B * _K): @@ -450,7 +452,7 @@ def _kernel_parallel_linesearch_eval( if grad_0 < -gtol and hess_0 > rigid_global_info.EPS[None]: # Cost descending at alpha=0 → Newton step from 0 alpha_n = -grad_0 / hess_0 - alpha_n = qd.min(alpha_n, gs.qd_float(LS_ALPHA_MAX)) + alpha_n = qd.min(alpha_n, gs.qd_float(_LS_ALPHA_MAX)) if alpha_n > 0.0: constraint_state.candidates[0, i_b] = alpha_n else: @@ -459,7 +461,7 @@ def _kernel_parallel_linesearch_eval( if qd.abs(grad_val) > gtol and hess_val > rigid_global_info.EPS[None]: alpha_corrected = final_alpha - grad_val / hess_val - alpha_corrected = qd.min(qd.max(alpha_corrected, gs.qd_float(0.0)), gs.qd_float(LS_ALPHA_MAX)) + alpha_corrected = qd.min(qd.max(alpha_corrected, gs.qd_float(0.0)), gs.qd_float(_LS_ALPHA_MAX)) if alpha_corrected > 0.0: constraint_state.candidates[0, i_b] = alpha_corrected else: From 42eb2e3bea92d577ba3b4e83698af38fd9f377f8 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Mon, 23 Mar 2026 15:00:45 +0000 Subject: [PATCH 14/20] re-enable re-benchmarking --- .../rigid/constraint/solver_breakdown.py | 110 +++--------------- 1 file changed, 15 insertions(+), 95 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 4fff250f2..76214ebf2 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -17,64 +17,15 @@ # masks a genuinely small optimal step. LS_PARALLEL_MIN_STEP = 1e-6 +# Number of successive refinement passes: after picking the best of K candidates the search +# range is narrowed around the winner and re-evaluated. 1 pass (K=16 candidates) already +# gives sufficient resolution; increase for tighter convergence at the cost of more kernels. 
+LS_PARALLEL_N_REFINE = 3 + # Block sizes for shared-memory reductions in _kernel_parallel_linesearch_p0 and _jv. _P0_BLOCK = 32 _JV_BLOCK = 32 -# Maximum allowed alpha (prevents divergence from degenerate Newton steps) -LS_ALPHA_MAX = 1e4 -# Maximum bisection iterations (2^8 = 256x precision improvement) -LS_BISECT_STEPS = 8 -# Range expansion: max steps and growth factor -LS_EXPANSION_STEPS = 6 -LS_EXPANSION_FACTOR = 4.0 - - -@qd.func -def _ls_compute_grad_hess( - alpha, - i_b, - constraint_state: array_class.ConstraintState, -): - """Compute analytical gradient and hessian of the linesearch cost at alpha. - - Accumulates quadratic coefficients with alpha-dependent constraint activation. - Returns (grad, hess). - """ - ne = constraint_state.n_constraints_equality[i_b] - nef = ne + constraint_state.n_constraints_frictionloss[i_b] - n_con = constraint_state.n_constraints[i_b] - - g1 = constraint_state.quad_gauss[1, i_b] + constraint_state.eq_sum[1, i_b] - g2 = constraint_state.quad_gauss[2, i_b] + constraint_state.eq_sum[2, i_b] - - for i_c in range(ne, nef): - Jaref_c = constraint_state.Jaref[i_c, i_b] - jv_c = constraint_state.jv[i_c, i_b] - D = constraint_state.efc_D[i_c, i_b] - f_val = constraint_state.efc_frictionloss[i_c, i_b] - r_val = constraint_state.diag[i_c, i_b] - x = Jaref_c + alpha * jv_c - rf = r_val * f_val - if x <= -rf or x >= rf: - g1 += (x <= -rf) * (-f_val * jv_c) + (x >= rf) * (f_val * jv_c) - else: - g1 += D * jv_c * Jaref_c - g2 += D * 0.5 * jv_c * jv_c - - for i_c in range(nef, n_con): - Jaref_c = constraint_state.Jaref[i_c, i_b] - jv_c = constraint_state.jv[i_c, i_b] - D = constraint_state.efc_D[i_c, i_b] - x = Jaref_c + alpha * jv_c - if x < 0: - g1 += D * jv_c * Jaref_c - g2 += D * 0.5 * jv_c * jv_c - - grad = 2.0 * alpha * g2 + g1 - hess = 2.0 * g2 - return grad, hess - @qd.kernel(fastcache=gs.use_fastcache) def _kernel_parallel_linesearch_mv( @@ -302,12 +253,6 @@ def _kernel_parallel_linesearch_p0( constraint_state.candidates[3, i_b] = 
1e2 constraint_state.candidates[5, i_b] = 0.0 constraint_state.candidates[4, i_b] = gs.qd_float(1e30) # best cost across passes - # Store gtol for gradient check on last eval pass - n_dofs_val = constraint_state.search.shape[0] - scale = rigid_global_info.meaninertia[i_b] * qd.max(1, n_dofs_val) - constraint_state.candidates[7, i_b] = ( - rigid_global_info.tolerance[None] * rigid_global_info.ls_tolerance[None] * snorm * scale - ) @qd.kernel(fastcache=gs.use_fastcache) @@ -316,16 +261,13 @@ def _kernel_parallel_linesearch_eval( rigid_global_info: array_class.RigidGlobalInfo, static_rigid_sim_config: qd.template(), ): - """Evaluate K candidate alphas, pick the best, then apply gradient-guided refinement. + """Evaluate K candidate alphas in parallel per env, pick the best via reduction. - Phase 1: K threads evaluate log-spaced candidates (thread 0 = Newton alpha). - Phase 2: Argmin reduction to find lowest-cost candidate. - Phase 3: Thread 0 computes analytical gradient at best_alpha and does one - Newton correction step if |grad| > gtol. + Reads the search range from candidates[2] (lo) and candidates[3] (hi). + Writes narrowed range back to candidates[2,3] for successive refinement. """ _B = constraint_state.grad.shape[1] _K = qd.static(LS_PARALLEL_K) - _LS_ALPHA_MAX = qd.static(LS_ALPHA_MAX) qd.loop_config(block_dim=_K) for i_flat in range(_B * _K): @@ -441,29 +383,6 @@ def _kernel_parallel_linesearch_eval( _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) constraint_state.candidates[2, i_b] = qd.exp(qd.log(lo) + qd.cast(lo_idx, gs.qd_float) * _step) constraint_state.candidates[3, i_b] = qd.exp(qd.log(lo) + qd.cast(hi_idx, gs.qd_float) * _step) - - # --- Gradient-guided refinement (thread 0) --- - final_alpha = constraint_state.candidates[0, i_b] - gtol = constraint_state.candidates[7, i_b] - - if qd.abs(final_alpha) < rigid_global_info.EPS[None]: - # Branch A: grid found no improvement. Check grad(0). 
- grad_0, hess_0 = _ls_compute_grad_hess(gs.qd_float(0.0), i_b, constraint_state) - if grad_0 < -gtol and hess_0 > rigid_global_info.EPS[None]: - # Cost descending at alpha=0 → Newton step from 0 - alpha_n = -grad_0 / hess_0 - alpha_n = qd.min(alpha_n, gs.qd_float(_LS_ALPHA_MAX)) - if alpha_n > 0.0: - constraint_state.candidates[0, i_b] = alpha_n - else: - # Branch B: improvement found. Gradient check + Newton correction. - grad_val, hess_val = _ls_compute_grad_hess(final_alpha, i_b, constraint_state) - - if qd.abs(grad_val) > gtol and hess_val > rigid_global_info.EPS[None]: - alpha_corrected = final_alpha - grad_val / hess_val - alpha_corrected = qd.min(qd.max(alpha_corrected, gs.qd_float(0.0)), gs.qd_float(_LS_ALPHA_MAX)) - if alpha_corrected > 0.0: - constraint_state.candidates[0, i_b] = alpha_corrected else: constraint_state.candidates[0, i_b] = 0.0 @@ -730,12 +649,13 @@ def func_solve_decomposed( rigid_global_info, static_rigid_sim_config, ) - # Grid search + gradient-guided Newton correction (single pass) - _kernel_parallel_linesearch_eval( - constraint_state, - rigid_global_info, - static_rigid_sim_config, - ) + # Successive refinement: each pass narrows the search range around the best alpha. 
+ for _refine in range(LS_PARALLEL_N_REFINE): + _kernel_parallel_linesearch_eval( + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) _kernel_parallel_linesearch_apply_alpha( constraint_state, rigid_global_info, From ff32f252bbb2056dee94e796e08bd3b4560f6f80 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Tue, 24 Mar 2026 20:08:31 +0000 Subject: [PATCH 15/20] update with new opt --- .../rigid/constraint/solver_breakdown.py | 552 +++++++++++++----- 1 file changed, 409 insertions(+), 143 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 76214ebf2..9a3e672f5 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -17,15 +17,83 @@ # masks a genuinely small optimal step. LS_PARALLEL_MIN_STEP = 1e-6 -# Number of successive refinement passes: after picking the best of K candidates the search -# range is narrowed around the winner and re-evaluated. 1 pass (K=16 candidates) already -# gives sufficient resolution; increase for tighter convergence at the cost of more kernels. -LS_PARALLEL_N_REFINE = 3 - # Block sizes for shared-memory reductions in _kernel_parallel_linesearch_p0 and _jv. _P0_BLOCK = 32 _JV_BLOCK = 32 +# Maximum bisection iterations for gradient-guided refinement after grid search. +LS_BISECT_STEPS = 12 + +# Number of alpha candidates evaluated via cooperative constraint reduction. +# Each candidate is evaluated by ALL K threads cooperating on the constraint sum, +# reducing per-thread work from O(n_constraints) to O(n_constraints/K). +LS_N_CANDIDATES = 8 + +# Maximum allowed alpha (prevents divergence from degenerate steps). +LS_ALPHA_MAX = 1e4 + + +@qd.func +def _ls_eval_cost_grad( + alpha, + i_b, + constraint_state: array_class.ConstraintState, +): + """Compute cost and analytical gradient at alpha (thread-0 only). 
+ + Follows the same quadratic-coefficient approach as func_ls_point_fn_opt in solver.py. + Reuses quad_gauss and eq_sum precomputed by the p0 kernel. + Returns (cost, grad). + """ + ne = constraint_state.n_constraints_equality[i_b] + nef = ne + constraint_state.n_constraints_frictionloss[i_b] + n_con = constraint_state.n_constraints[i_b] + + # Start from precomputed DOF + equality coefficients + qt_0 = constraint_state.quad_gauss[0, i_b] + constraint_state.eq_sum[0, i_b] + qt_1 = constraint_state.quad_gauss[1, i_b] + constraint_state.eq_sum[1, i_b] + qt_2 = constraint_state.quad_gauss[2, i_b] + constraint_state.eq_sum[2, i_b] + + # Friction constraints: accumulate activation-dependent quad coefficients + for i_c in range(ne, nef): + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + f_val = constraint_state.efc_frictionloss[i_c, i_b] + r_val = constraint_state.diag[i_c, i_b] + qf_0 = D * (0.5 * Jaref_c * Jaref_c) + qf_1 = D * (jv_c * Jaref_c) + qf_2 = D * (0.5 * jv_c * jv_c) + x = Jaref_c + alpha * jv_c + rf = r_val * f_val + linear_neg = x <= -rf + linear_pos = x >= rf + if linear_neg or linear_pos: + qf_0 = linear_neg * f_val * (-0.5 * rf - Jaref_c) + linear_pos * f_val * (-0.5 * rf + Jaref_c) + qf_1 = linear_neg * (-f_val * jv_c) + linear_pos * (f_val * jv_c) + qf_2 = 0.0 + qt_0 = qt_0 + qf_0 + qt_1 = qt_1 + qf_1 + qt_2 = qt_2 + qf_2 + + # Contact constraints: active when x < 0 + for i_c in range(nef, n_con): + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + x = Jaref_c + alpha * jv_c + active = x < 0 + qf_0 = D * (0.5 * Jaref_c * Jaref_c) + qf_1 = D * (jv_c * Jaref_c) + qf_2 = D * (0.5 * jv_c * jv_c) + qt_0 = qt_0 + qf_0 * active + qt_1 = qt_1 + qf_1 * active + qt_2 = qt_2 + qf_2 * active + + cost = alpha * alpha * qt_2 + alpha * qt_1 + qt_0 + grad = 2.0 * alpha * qt_2 + qt_1 + return cost, grad + 
@qd.kernel(fastcache=gs.use_fastcache) def _kernel_parallel_linesearch_mv( @@ -81,19 +149,22 @@ def _kernel_parallel_linesearch_jv( @qd.kernel(fastcache=gs.use_fastcache) def _kernel_parallel_linesearch_p0( + dofs_info: array_class.DofsInfo, + entities_info: array_class.EntitiesInfo, dofs_state: array_class.DofsState, constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, static_rigid_sim_config: qd.template(), ): - """Snorm check, quad_gauss, eq_sum, and p0_cost. T threads per env with shared memory reductions. + """Fused mv + jv + snorm + quad_gauss + eq_sum + p0_cost. - Phase 1: Fused snorm + quad_gauss parallel reduction over n_dofs (Options A+B). + Phase 0a: Compute mv = M @ search (cooperative over DOFs, 32 threads). + Phase 0b: Compute jv = J @ search (cooperative over constraints, 32 threads). + Phase 1: Fused snorm + quad_gauss parallel reduction over n_dofs. Phase 2: Parallel reduction over n_constraints for eq_sum and p0_cost. """ _B = constraint_state.grad.shape[1] _T = qd.static(_P0_BLOCK) - _LS_PARALLEL_MIN_STEP = qd.static(LS_PARALLEL_MIN_STEP) qd.loop_config(block_dim=_T) for i_flat in range(_B * _T): @@ -110,6 +181,34 @@ def _kernel_parallel_linesearch_p0( if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: n_dofs = constraint_state.search.shape[0] + n_con = constraint_state.n_constraints[i_b] + + # === Phase 0a: Compute mv = M @ search (cooperative over DOFs) === + i_d1 = tid + while i_d1 < n_dofs: + I_d1 = [i_d1, i_b] if qd.static(static_rigid_sim_config.batch_dofs_info) else i_d1 + i_e = dofs_info.entity_idx[I_d1] + mv_val = gs.qd_float(0.0) + for i_d2 in range(entities_info.dof_start[i_e], entities_info.dof_end[i_e]): + mv_val = mv_val + rigid_global_info.mass_mat[i_d1, i_d2, i_b] * constraint_state.search[i_d2, i_b] + constraint_state.mv[i_d1, i_b] = mv_val + i_d1 += _T + + # === Phase 0b: Compute jv = J @ search (cooperative over constraints) === + i_c = tid + while i_c < 
n_con: + jv_val = gs.qd_float(0.0) + if qd.static(static_rigid_sim_config.sparse_solve): + for i_d_ in range(constraint_state.jac_n_relevant_dofs[i_c, i_b]): + i_d = constraint_state.jac_relevant_dofs[i_c, i_d_, i_b] + jv_val = jv_val + constraint_state.jac[i_c, i_d, i_b] * constraint_state.search[i_d, i_b] + else: + for i_d in range(n_dofs): + jv_val = jv_val + constraint_state.jac[i_c, i_d, i_b] * constraint_state.search[i_d, i_b] + constraint_state.jv[i_c, i_b] = jv_val + i_c += _T + + qd.simt.block.sync() # Ensure mv and jv are written before Phase 1 reads them # === Phase 1: Fused snorm + quad_gauss, parallel over n_dofs === local_snorm_sq = gs.qd_float(0.0) @@ -244,7 +343,9 @@ def _kernel_parallel_linesearch_p0( total_hess = 2.0 * (constraint_state.quad_gauss[2, i_b] + sh_constraint_hess[0]) if total_hess > 0.0: total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] - alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(_LS_PARALLEL_MIN_STEP)) + alpha_newton = qd.max( + qd.abs(total_grad / total_hess), gs.qd_float(qd.static(LS_PARALLEL_MIN_STEP)) + ) constraint_state.candidates[2, i_b] = alpha_newton * 1e-2 constraint_state.candidates[3, i_b] = alpha_newton * 10.0 constraint_state.candidates[5, i_b] = alpha_newton # exact Newton step for eval @@ -253,6 +354,12 @@ def _kernel_parallel_linesearch_p0( constraint_state.candidates[3, i_b] = 1e2 constraint_state.candidates[5, i_b] = 0.0 constraint_state.candidates[4, i_b] = gs.qd_float(1e30) # best cost across passes + # Store gtol for gradient-guided bisection after grid search + n_dofs_val = constraint_state.search.shape[0] + scale = rigid_global_info.meaninertia[i_b] * qd.max(1, n_dofs_val) + constraint_state.candidates[7, i_b] = ( + rigid_global_info.tolerance[None] * rigid_global_info.ls_tolerance[None] * snorm * scale + ) @qd.kernel(fastcache=gs.use_fastcache) @@ -261,159 +368,234 @@ def _kernel_parallel_linesearch_eval( rigid_global_info: array_class.RigidGlobalInfo, 
static_rigid_sim_config: qd.template(), ): - """Evaluate K candidate alphas in parallel per env, pick the best via reduction. + """Evaluate alpha candidates via cooperative constraint reduction, then bisect. + + All K threads cooperate on each candidate: each thread reduces n_constraints/K + constraints, then a shared-memory tree reduction sums the partial costs. This is + O(n_candidates × n_constraints/K) per thread instead of O(K × n_constraints). - Reads the search range from candidates[2] (lo) and candidates[3] (hi). - Writes narrowed range back to candidates[2,3] for successive refinement. + Phase 1: Cooperatively evaluate N_CANDIDATES + Newton alpha, pick best via argmin. + Phase 2: Cooperatively evaluate analytical gradient at best, then bisect if needed. """ _B = constraint_state.grad.shape[1] _K = qd.static(LS_PARALLEL_K) + _NC = qd.static(LS_N_CANDIDATES) qd.loop_config(block_dim=_K) for i_flat in range(_B * _K): tid = i_flat % _K i_b = i_flat // _K - # Shared memory for argmin reduction - sh_cost = qd.simt.block.SharedArray((_K,), gs.qd_float) - sh_idx = qd.simt.block.SharedArray((_K,), qd.i32) + # Shared memory for reductions (reused across phases) + sh_val = qd.simt.block.SharedArray((_K,), gs.qd_float) + sh_val2 = qd.simt.block.SharedArray((_K,), gs.qd_float) + # Shared arrays for candidate costs and alphas (only _NC+1 used) + sh_cand_cost = qd.simt.block.SharedArray((_K,), gs.qd_float) + sh_cand_alpha = qd.simt.block.SharedArray((_K,), gs.qd_float) - if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: + active = constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b] + + if active: ne = constraint_state.n_constraints_equality[i_b] nef = ne + constraint_state.n_constraints_frictionloss[i_b] n_con = constraint_state.n_constraints[i_b] - lo = constraint_state.candidates[2, i_b] hi = constraint_state.candidates[3, i_b] - - # Generate log-spaced alpha within [lo, hi]. 
- # Thread 0 evaluates the exact Newton step instead of the grid point at lo. - # This gives the Newton alpha a fair cost-based comparison with grid candidates. - _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) - alpha = qd.exp(qd.log(lo) + qd.cast(tid, gs.qd_float) * _step) - if tid == 0: - alpha_newton_val = constraint_state.candidates[5, i_b] - if alpha_newton_val > 0.0: - alpha = alpha_newton_val - - # Evaluate cost at this alpha - cost = ( - alpha * alpha * constraint_state.quad_gauss[2, i_b] - + alpha * constraint_state.quad_gauss[1, i_b] - + constraint_state.quad_gauss[0, i_b] - ) - - # Equality constraints (always active) - use eq_sum precomputed during init - cost = ( - cost - + alpha * alpha * constraint_state.eq_sum[2, i_b] - + alpha * constraint_state.eq_sum[1, i_b] - + constraint_state.eq_sum[0, i_b] - ) - - # Friction constraints - for i_c in range(ne, nef): - Jaref_c = constraint_state.Jaref[i_c, i_b] - jv_c = constraint_state.jv[i_c, i_b] - D = constraint_state.efc_D[i_c, i_b] - f = constraint_state.efc_frictionloss[i_c, i_b] - r = constraint_state.diag[i_c, i_b] - x = Jaref_c + alpha * jv_c - rf = r * f - linear_neg = x <= -rf - linear_pos = x >= rf - if linear_neg or linear_pos: - cost = cost + linear_neg * f * (-0.5 * rf - Jaref_c - alpha * jv_c) - cost = cost + linear_pos * f * (-0.5 * rf + Jaref_c + alpha * jv_c) + p0_cost = constraint_state.candidates[1, i_b] + gtol = constraint_state.candidates[7, i_b] + + # Pre-compute log-space step for candidate generation + _log_lo = qd.log(lo) + _cand_step = (qd.log(hi) - _log_lo) / qd.max(1.0, qd.cast(_NC - 1, gs.qd_float)) + alpha_newton = constraint_state.candidates[5, i_b] + + # === Phase 1: Cooperative evaluation of N_CANDIDATES alphas === + # Evaluate each candidate sequentially; all K threads cooperate on constraint reduction. 
+ n_total_cands = _NC + 1 # +1 for Newton alpha + for cand_idx in range(n_total_cands): + # Generate alpha for this candidate + alpha_c = gs.qd_float(0.0) + if cand_idx < _NC: + alpha_c = qd.exp(_log_lo + qd.cast(cand_idx, gs.qd_float) * _cand_step) else: - cost = cost + D * 0.5 * x * x - - # Contact constraints (active if x < 0) - for i_c in range(nef, n_con): - Jaref_c = constraint_state.Jaref[i_c, i_b] - jv_c = constraint_state.jv[i_c, i_b] - D = constraint_state.efc_D[i_c, i_b] - x = Jaref_c + alpha * jv_c - if x < 0: - cost += D * 0.5 * x * x - - sh_cost[tid] = cost - sh_idx[tid] = tid - else: - sh_cost[tid] = gs.qd_float(1e30) - sh_idx[tid] = tid + alpha_c = alpha_newton # last candidate is Newton alpha + + # DOF + equality cost (O(1), same for all threads) + dof_eq_cost = ( + alpha_c * alpha_c * constraint_state.quad_gauss[2, i_b] + + alpha_c * constraint_state.quad_gauss[1, i_b] + + constraint_state.quad_gauss[0, i_b] + + alpha_c * alpha_c * constraint_state.eq_sum[2, i_b] + + alpha_c * constraint_state.eq_sum[1, i_b] + + constraint_state.eq_sum[0, i_b] + ) - qd.simt.block.sync() + # Cooperative constraint cost: each thread handles strided constraints + local_cost = gs.qd_float(0.0) + i_c = ne + tid # start from ne (skip equality, already in eq_sum) + while i_c < n_con: + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + x = Jaref_c + alpha_c * jv_c + if i_c < nef: + # Friction constraint + f_val = constraint_state.efc_frictionloss[i_c, i_b] + r_val = constraint_state.diag[i_c, i_b] + rf = r_val * f_val + linear_neg = x <= -rf + linear_pos = x >= rf + if linear_neg or linear_pos: + local_cost = local_cost + linear_neg * f_val * (-0.5 * rf - Jaref_c - alpha_c * jv_c) + local_cost = local_cost + linear_pos * f_val * (-0.5 * rf + Jaref_c + alpha_c * jv_c) + else: + local_cost = local_cost + D * 0.5 * x * x + else: + # Contact constraint (active if x < 0) + if x < 0: + local_cost = 
local_cost + D * 0.5 * x * x + i_c += _K - # Tree reduction for argmin - stride = _K // 2 - while stride > 0: - if tid < stride: - if sh_cost[tid + stride] < sh_cost[tid]: - sh_cost[tid] = sh_cost[tid + stride] - sh_idx[tid] = sh_idx[tid + stride] - qd.simt.block.sync() - stride = stride // 2 + # Tree reduction for constraint cost + sh_val[tid] = local_cost + qd.simt.block.sync() + stride = _K // 2 + while stride > 0: + if tid < stride: + sh_val[tid] += sh_val[tid + stride] + qd.simt.block.sync() + stride //= 2 - # Thread 0: acceptance check, write result, and narrow range for next pass - if tid == 0: - if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: - p0_cost = constraint_state.candidates[1, i_b] - best_tid = sh_idx[0] - best_cost = sh_cost[0] - lo = constraint_state.candidates[2, i_b] - hi = constraint_state.candidates[3, i_b] - # Recover best alpha: thread 0 used Newton alpha, others used grid - _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) - best_alpha = qd.exp(qd.log(lo) + qd.cast(best_tid, gs.qd_float) * _step) - # If thread 0 (Newton candidate) won, use the exact Newton alpha - if best_tid == 0 and constraint_state.candidates[5, i_b] > 0.0: - best_alpha = constraint_state.candidates[5, i_b] - - # Only update best alpha if this pass improved over ALL previous passes + # Thread 0 stores total cost for this candidate + if tid == 0: + total_cost = dof_eq_cost + sh_val[0] + sh_cand_cost[cand_idx] = total_cost + sh_cand_alpha[cand_idx] = alpha_c + qd.simt.block.sync() + + # === Phase 2: Find best candidate (thread 0) === + if tid == 0: + best_alpha = gs.qd_float(0.0) + best_cost = p0_cost best_cost_prev = constraint_state.candidates[4, i_b] - if best_cost < p0_cost and best_cost < best_cost_prev: - constraint_state.candidates[0, i_b] = best_alpha + for ci in range(n_total_cands): + c = sh_cand_cost[ci] + if c < best_cost and c < best_cost_prev: + best_cost = c + best_alpha = sh_cand_alpha[ci] + + 
constraint_state.candidates[0, i_b] = best_alpha + if best_alpha > 0.0: constraint_state.candidates[4, i_b] = best_cost + # Store best alpha for Phase 3 cooperative bisection + sh_cand_alpha[0] = best_alpha + qd.simt.block.sync() - # Narrow range around accepted point for next refinement pass - lo_idx = qd.max(0, best_tid - 1) - hi_idx = qd.min(_K - 1, best_tid + 1) - # Narrow search range to neighbors of best candidate - _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) - constraint_state.candidates[2, i_b] = qd.exp(qd.log(lo) + qd.cast(lo_idx, gs.qd_float) * _step) - constraint_state.candidates[3, i_b] = qd.exp(qd.log(lo) + qd.cast(hi_idx, gs.qd_float) * _step) - else: - constraint_state.candidates[0, i_b] = 0.0 + # === Phase 3: Cooperative gradient bisection === + best_alpha_shared = sh_cand_alpha[0] + if best_alpha_shared > 0.0: + # Cooperatively compute gradient at best_alpha + alpha_eval = best_alpha_shared + # Cooperative gradient: accumulate quad_total_1 and quad_total_2 + local_qt1 = gs.qd_float(0.0) + local_qt2 = gs.qd_float(0.0) + i_c = ne + tid + while i_c < n_con: + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + x = Jaref_c + alpha_eval * jv_c + if i_c < nef: + f_val = constraint_state.efc_frictionloss[i_c, i_b] + r_val = constraint_state.diag[i_c, i_b] + rf = r_val * f_val + linear_neg = x <= -rf + linear_pos = x >= rf + qf_1 = D * (jv_c * Jaref_c) + qf_2 = D * (0.5 * jv_c * jv_c) + if linear_neg or linear_pos: + qf_1 = linear_neg * (-f_val * jv_c) + linear_pos * (f_val * jv_c) + qf_2 = 0.0 + local_qt1 = local_qt1 + qf_1 + local_qt2 = local_qt2 + qf_2 + else: + act = x < 0 + local_qt1 = local_qt1 + D * (jv_c * Jaref_c) * act + local_qt2 = local_qt2 + D * (0.5 * jv_c * jv_c) * act + i_c += _K + + # Reduce qt1 and qt2 + sh_val[tid] = local_qt1 + sh_val2[tid] = local_qt2 + qd.simt.block.sync() + stride = _K // 2 + while stride > 0: + if tid < stride: + 
sh_val[tid] += sh_val[tid + stride] + sh_val2[tid] += sh_val2[tid + stride] + qd.simt.block.sync() + stride //= 2 -@qd.kernel(fastcache=gs.use_fastcache) -def _kernel_parallel_linesearch_apply_alpha( - constraint_state: array_class.ConstraintState, - rigid_global_info: array_class.RigidGlobalInfo, - static_rigid_sim_config: qd.template(), -): - """Apply best alpha to qacc, Ma, and Jaref. Fuses dof and constraint updates.""" - n_dofs = constraint_state.qacc.shape[0] - len_constraints = constraint_state.Jaref.shape[0] - _B = constraint_state.grad.shape[1] - n_items = qd.max(n_dofs, len_constraints) + if tid == 0: + qt1_total = constraint_state.quad_gauss[1, i_b] + constraint_state.eq_sum[1, i_b] + sh_val[0] + qt2_total = constraint_state.quad_gauss[2, i_b] + constraint_state.eq_sum[2, i_b] + sh_val2[0] + g_best = 2.0 * alpha_eval * qt2_total + qt1_total + + if qd.abs(g_best) > gtol: + # Need bisection — use thread-0 sequential bisection with _ls_eval_cost_grad + # (bisection is ~5 iterations, thread-0 cost is acceptable) + bis_a = alpha_eval * 0.5 + bis_b = alpha_eval + if g_best < 0.0: + bis_a = alpha_eval + bis_b = alpha_eval * 2.0 + + _, g_a = _ls_eval_cost_grad(bis_a, i_b, constraint_state) + _, g_b = _ls_eval_cost_grad(bis_b, i_b, constraint_state) + + if g_a < 0.0 and g_b > 0.0: + _N_BISECT = qd.static(LS_BISECT_STEPS) + for _bis_it in range(_N_BISECT): + mid_b = (bis_a + bis_b) * 0.5 + c_mid_b, g_mid_b = _ls_eval_cost_grad(mid_b, i_b, constraint_state) + if qd.abs(g_mid_b) < gtol or qd.abs(bis_b - bis_a) < rigid_global_info.EPS[None]: + break + if g_mid_b < 0.0: + bis_a = mid_b + else: + bis_b = mid_b + mid_b = (bis_a + bis_b) * 0.5 + c_mid_b, _ = _ls_eval_cost_grad(mid_b, i_b, constraint_state) + if c_mid_b < p0_cost and c_mid_b < constraint_state.candidates[4, i_b]: + constraint_state.candidates[0, i_b] = mid_b + constraint_state.candidates[4, i_b] = c_mid_b + else: + if tid == 0: + constraint_state.candidates[0, i_b] = 0.0 + qd.simt.block.sync() - 
qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) - for i, i_b in qd.ndrange(n_items, _B): - if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: - alpha = constraint_state.candidates[0, i_b] - if qd.abs(alpha) < rigid_global_info.EPS[None]: - if i == 0: + # === Phase 4: Cooperative apply alpha (fused, saves 1 kernel launch) === + qd.simt.block.sync() + if active: + n_dofs_apply = constraint_state.qacc.shape[0] + n_con_apply = constraint_state.n_constraints[i_b] + alpha_apply = constraint_state.candidates[0, i_b] + if qd.abs(alpha_apply) < rigid_global_info.EPS[None]: + if tid == 0: constraint_state.improved[i_b] = False else: - # Apply to dofs - if i < n_dofs: - constraint_state.qacc[i, i_b] += constraint_state.search[i, i_b] * alpha - constraint_state.Ma[i, i_b] += constraint_state.mv[i, i_b] * alpha - # Apply to constraints - if i < constraint_state.n_constraints[i_b]: - constraint_state.Jaref[i, i_b] += constraint_state.jv[i, i_b] * alpha + # Apply to dofs (strided over threads) + i_d = tid + while i_d < n_dofs_apply: + constraint_state.qacc[i_d, i_b] += constraint_state.search[i_d, i_b] * alpha_apply + constraint_state.Ma[i_d, i_b] += constraint_state.mv[i_d, i_b] * alpha_apply + i_d += _K + # Apply to constraints (strided over threads) + i_c = tid + while i_c < n_con_apply: + constraint_state.Jaref[i_c, i_b] += constraint_state.jv[i_c, i_b] * alpha_apply + i_c += _K # ============================================== Shared iteration kernels ============================================== @@ -605,6 +787,39 @@ def _kernel_update_search_direction( ) +# ============================================ Sequential linesearch ================================================ + + +@qd.kernel(fastcache=gs.use_fastcache) +def _kernel_linesearch( + entities_info: array_class.EntitiesInfo, + dofs_state: array_class.DofsState, + constraint_state: array_class.ConstraintState, + rigid_global_info: array_class.RigidGlobalInfo, 
+ static_rigid_sim_config: qd.template(), +): + """Sequential iterative linesearch (same as main branch). + + Each thread handles one env, using Newton-guided derivative linesearch. + Lower per-env parallelism but less total work than the K=32 grid search. + Better for scenes with many constraints per env (e.g. humanoid contact). + """ + _B = constraint_state.grad.shape[1] + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) + for i_b in range(_B): + if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: + solver.func_linesearch_and_apply_alpha( + i_b, + entities_info=entities_info, + dofs_state=dofs_state, + rigid_global_info=rigid_global_info, + constraint_state=constraint_state, + static_rigid_sim_config=static_rigid_sim_config, + ) + else: + constraint_state.improved[i_b] = False + + # ============================================== Solve body dispatch ================================================ @@ -632,31 +847,83 @@ def func_solve_decomposed( """ # _n_iterations is a Python-native int to avoid CPU-GPU sync (vs rigid_global_info.iterations[None]) for _it in range(_n_iterations): - _kernel_parallel_linesearch_mv( + # Fused: mv + jv + snorm + quad_gauss + eq_sum + p0_cost + _kernel_parallel_linesearch_p0( dofs_info, entities_info, + dofs_state, + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) + # Fused: grid search + bisection + apply alpha + _kernel_parallel_linesearch_eval( constraint_state, rigid_global_info, static_rigid_sim_config, ) - _kernel_parallel_linesearch_jv( + if static_rigid_sim_config.solver_type == gs.constraint_solver.CG: + _kernel_cg_only_save_prev_grad( + constraint_state, + static_rigid_sim_config, + ) + _kernel_update_constraint_forces( constraint_state, static_rigid_sim_config, ) - _kernel_parallel_linesearch_p0( + _kernel_update_constraint_qfrc( + constraint_state, + static_rigid_sim_config, + ) + _kernel_update_constraint_cost( dofs_state, 
constraint_state, - rigid_global_info, static_rigid_sim_config, ) - # Successive refinement: each pass narrows the search range around the best alpha. - for _refine in range(LS_PARALLEL_N_REFINE): - _kernel_parallel_linesearch_eval( + + if static_rigid_sim_config.solver_type == gs.constraint_solver.Newton: + _kernel_newton_only_nt_hessian( constraint_state, rigid_global_info, static_rigid_sim_config, ) - _kernel_parallel_linesearch_apply_alpha( + _kernel_update_gradient( + entities_info, + dofs_state, + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) + _kernel_update_search_direction( + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) + + +@solver.func_solve_body.register( + is_compatible=lambda *args, **kwargs: False # Disabled to benchmark parallel LS in isolation +) +def func_solve_decomposed_sequential( + entities_info, + dofs_info, + dofs_state, + constraint_state, + rigid_global_info, + static_rigid_sim_config, + _n_iterations, +): + """Decomposed solver with sequential iterative linesearch. + + Same kernel-per-step structure as func_solve_decomposed but uses the sequential + Newton-guided linesearch (1 thread per env) instead of the parallel grid search + (K threads per env). Better for scenes with many constraints per env where the + grid search does more total work than needed. 
+ """ + for _it in range(_n_iterations): + _kernel_linesearch( + entities_info, + dofs_state, constraint_state, rigid_global_info, static_rigid_sim_config, @@ -679,7 +946,6 @@ def func_solve_decomposed( constraint_state, static_rigid_sim_config, ) - if static_rigid_sim_config.solver_type == gs.constraint_solver.Newton: _kernel_newton_only_nt_hessian( constraint_state, From 6fa169f4f1caabe5e8669068cd488b644bc1b801 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Wed, 25 Mar 2026 11:14:54 +0000 Subject: [PATCH 16/20] with repeat after count --- genesis/engine/solvers/rigid/constraint/solver.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver.py b/genesis/engine/solvers/rigid/constraint/solver.py index 51f7269c9..00985b918 100644 --- a/genesis/engine/solvers/rigid/constraint/solver.py +++ b/genesis/engine/solvers/rigid/constraint/solver.py @@ -3102,7 +3102,11 @@ def func_solve_iter( @qd.perf_dispatch( - get_geometry_hash=lambda *args, **kwargs: (*args, frozendict(kwargs)), warmup=3, active=3, repeat_after_seconds=0.0 + get_geometry_hash=lambda *args, **kwargs: (*args, frozendict(kwargs)), + warmup=3, + active=3, + repeat_after_seconds=0, + repeat_after_count=3000, ) def func_solve_body( entities_info: array_class.EntitiesInfo, From 04266ba49192977e3b6a68aa1a4a4b8d69d184e9 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Wed, 25 Mar 2026 11:19:40 +0000 Subject: [PATCH 17/20] update n candidates --- genesis/engine/solvers/rigid/constraint/solver_breakdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 9a3e672f5..161e14eca 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -27,7 +27,7 @@ # Number of alpha candidates evaluated via cooperative constraint reduction. 
# Each candidate is evaluated by ALL K threads cooperating on the constraint sum, # reducing per-thread work from O(n_constraints) to O(n_constraints/K). -LS_N_CANDIDATES = 8 +LS_N_CANDIDATES = 6 # Maximum allowed alpha (prevents divergence from degenerate steps). LS_ALPHA_MAX = 1e4 From bd18461d2188dd8c8ac1ec267f26a97dd387578c Mon Sep 17 00:00:00 2001 From: Mingrui Date: Wed, 25 Mar 2026 20:53:04 +0000 Subject: [PATCH 18/20] remove the fallback for simpler code --- .../rigid/constraint/solver_breakdown.py | 65 ------------------- 1 file changed, 65 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 161e14eca..223cd374c 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -899,68 +899,3 @@ def func_solve_decomposed( rigid_global_info, static_rigid_sim_config, ) - - -@solver.func_solve_body.register( - is_compatible=lambda *args, **kwargs: False # Disabled to benchmark parallel LS in isolation -) -def func_solve_decomposed_sequential( - entities_info, - dofs_info, - dofs_state, - constraint_state, - rigid_global_info, - static_rigid_sim_config, - _n_iterations, -): - """Decomposed solver with sequential iterative linesearch. - - Same kernel-per-step structure as func_solve_decomposed but uses the sequential - Newton-guided linesearch (1 thread per env) instead of the parallel grid search - (K threads per env). Better for scenes with many constraints per env where the - grid search does more total work than needed. 
- """ - for _it in range(_n_iterations): - _kernel_linesearch( - entities_info, - dofs_state, - constraint_state, - rigid_global_info, - static_rigid_sim_config, - ) - if static_rigid_sim_config.solver_type == gs.constraint_solver.CG: - _kernel_cg_only_save_prev_grad( - constraint_state, - static_rigid_sim_config, - ) - _kernel_update_constraint_forces( - constraint_state, - static_rigid_sim_config, - ) - _kernel_update_constraint_qfrc( - constraint_state, - static_rigid_sim_config, - ) - _kernel_update_constraint_cost( - dofs_state, - constraint_state, - static_rigid_sim_config, - ) - if static_rigid_sim_config.solver_type == gs.constraint_solver.Newton: - _kernel_newton_only_nt_hessian( - constraint_state, - rigid_global_info, - static_rigid_sim_config, - ) - _kernel_update_gradient( - entities_info, - dofs_state, - constraint_state, - rigid_global_info, - static_rigid_sim_config, - ) - _kernel_update_search_direction( - constraint_state, - rigid_global_info, - static_rigid_sim_config, - ) From 75e213d59afdb14047c1a28090a43c7b099aeb0c Mon Sep 17 00:00:00 2001 From: Mingrui Date: Wed, 25 Mar 2026 23:09:36 +0000 Subject: [PATCH 19/20] add unexpected missing newton correction.. 
--- .../rigid/constraint/solver_breakdown.py | 65 +++++++++++-------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 223cd374c..f0fd9c0c4 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -543,33 +543,46 @@ def _kernel_parallel_linesearch_eval( g_best = 2.0 * alpha_eval * qt2_total + qt1_total if qd.abs(g_best) > gtol: - # Need bisection — use thread-0 sequential bisection with _ls_eval_cost_grad - # (bisection is ~5 iterations, thread-0 cost is acceptable) - bis_a = alpha_eval * 0.5 - bis_b = alpha_eval - if g_best < 0.0: - bis_a = alpha_eval - bis_b = alpha_eval * 2.0 - - _, g_a = _ls_eval_cost_grad(bis_a, i_b, constraint_state) - _, g_b = _ls_eval_cost_grad(bis_b, i_b, constraint_state) - - if g_a < 0.0 and g_b > 0.0: - _N_BISECT = qd.static(LS_BISECT_STEPS) - for _bis_it in range(_N_BISECT): + hess_best = 2.0 * qt2_total + newton_done = False + + # Try one Newton correction first (O(1) compute + 1 cost eval) + if hess_best > rigid_global_info.EPS[None]: + alpha_nc = alpha_eval - g_best / hess_best + if alpha_nc > 0.0: + c_nc, g_nc = _ls_eval_cost_grad(alpha_nc, i_b, constraint_state) + if c_nc < p0_cost and c_nc < constraint_state.candidates[4, i_b]: + constraint_state.candidates[0, i_b] = alpha_nc + constraint_state.candidates[4, i_b] = c_nc + newton_done = True + + # Fall back to bisection if Newton didn't converge + if not newton_done: + bis_a = alpha_eval * 0.5 + bis_b = alpha_eval + if g_best < 0.0: + bis_a = alpha_eval + bis_b = alpha_eval * 2.0 + + _, g_a = _ls_eval_cost_grad(bis_a, i_b, constraint_state) + _, g_b = _ls_eval_cost_grad(bis_b, i_b, constraint_state) + + if g_a < 0.0 and g_b > 0.0: + _N_BISECT = qd.static(LS_BISECT_STEPS) + for _bis_it in range(_N_BISECT): + mid_b = (bis_a + bis_b) * 0.5 + c_mid_b, g_mid_b = 
_ls_eval_cost_grad(mid_b, i_b, constraint_state) + if qd.abs(g_mid_b) < gtol or qd.abs(bis_b - bis_a) < rigid_global_info.EPS[None]: + break + if g_mid_b < 0.0: + bis_a = mid_b + else: + bis_b = mid_b mid_b = (bis_a + bis_b) * 0.5 - c_mid_b, g_mid_b = _ls_eval_cost_grad(mid_b, i_b, constraint_state) - if qd.abs(g_mid_b) < gtol or qd.abs(bis_b - bis_a) < rigid_global_info.EPS[None]: - break - if g_mid_b < 0.0: - bis_a = mid_b - else: - bis_b = mid_b - mid_b = (bis_a + bis_b) * 0.5 - c_mid_b, _ = _ls_eval_cost_grad(mid_b, i_b, constraint_state) - if c_mid_b < p0_cost and c_mid_b < constraint_state.candidates[4, i_b]: - constraint_state.candidates[0, i_b] = mid_b - constraint_state.candidates[4, i_b] = c_mid_b + c_mid_b, _ = _ls_eval_cost_grad(mid_b, i_b, constraint_state) + if c_mid_b < p0_cost and c_mid_b < constraint_state.candidates[4, i_b]: + constraint_state.candidates[0, i_b] = mid_b + constraint_state.candidates[4, i_b] = c_mid_b else: if tid == 0: constraint_state.candidates[0, i_b] = 0.0 From 8a75bfed9666f93dbc53a624c2f16ac5b4f8a831 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Thu, 26 Mar 2026 00:33:54 +0000 Subject: [PATCH 20/20] update comment --- genesis/engine/solvers/rigid/constraint/solver_breakdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index f0fd9c0c4..c3bfd1306 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -375,7 +375,7 @@ def _kernel_parallel_linesearch_eval( O(n_candidates × n_constraints/K) per thread instead of O(K × n_constraints). Phase 1: Cooperatively evaluate N_CANDIDATES + Newton alpha, pick best via argmin. - Phase 2: Cooperatively evaluate analytical gradient at best, then bisect if needed. 
+ Phase 2: Cooperatively evaluate analytical gradient at best, try one Newton correction first then bisect if needed. """ _B = constraint_state.grad.shape[1] _K = qd.static(LS_PARALLEL_K)